196 lines
5.8 KiB
Rust
196 lines
5.8 KiB
Rust
use crate::trunc_key_at0;
|
|
use xxhash_rust::xxh64::xxh64;
|
|
|
|
/// Hashing parameters of a table, as stored in its header.
#[derive(Debug, Clone, Copy)]
pub struct Settings {
    /// Seed handed to the xxh64 hash function when hashing keys
    /// (the header parser only keeps the lower 24 bits).
    pub seed: u32,
    /// Extra payload size of a chain entry, in units of 16 bytes;
    /// a chain entry occupies `16 * (1 + entsize)` bytes.
    pub entsize: u16,
    /// Right-shift applied to the key hash to select the second bloom-filter bit.
    pub blshift: u8,
}
|
|
|
|
#[derive(Clone, Copy, Debug)]
|
|
pub struct Header {
|
|
pub strtab_link: u32,
|
|
pub nbuckets: u32,
|
|
pub nchains: u32,
|
|
pub nblf: u16,
|
|
pub settings: Settings,
|
|
}
|
|
|
|
#[derive(Clone)]
|
|
pub struct Ref<'a> {
|
|
strtab: &'a [u8],
|
|
bloom: &'a [u8],
|
|
buckets: &'a [u8],
|
|
chains: &'a [u8],
|
|
settings: Settings,
|
|
}
|
|
|
|
/// Iterator over the chain entries of a table, created by `Ref::iter`.
#[derive(Clone)]
pub struct Iter<'a> {
    /// String table used to resolve entry names.
    strtab: &'a [u8],
    /// Chain bytes that have not been yielded yet.
    chains: &'a [u8],
}
|
|
|
|
/// Payload of a chain entry.
#[derive(Debug, Clone, Copy)]
pub struct Value<R> {
    /// Type tag of the entry.
    pub typ: u32,
    /// Remaining entry payload; its representation is chosen by the user of this type.
    pub rest: R,
}
|
|
|
|
impl Settings {
|
|
#[inline]
|
|
pub fn chain_entry_size(&self) -> usize {
|
|
16 * (1 + usize::from(self.entsize))
|
|
}
|
|
}
|
|
|
|
impl Header {
|
|
pub fn parse(data: &[u8]) -> Option<Self> {
|
|
if data.len() < 20 {
|
|
return None;
|
|
}
|
|
Some(Self {
|
|
strtab_link: u32::from_be_bytes(data[0..4].try_into().unwrap()),
|
|
nbuckets: u32::from_be_bytes(data[4..8].try_into().unwrap()),
|
|
nchains: u32::from_be_bytes(data[8..12].try_into().unwrap()),
|
|
nblf: u16::from_be_bytes(data[14..16].try_into().unwrap()),
|
|
settings: Settings {
|
|
entsize: u16::from_be_bytes(data[12..14].try_into().unwrap()),
|
|
blshift: data[16],
|
|
// to make this easier, we first fold the blshift into the seed,
|
|
// and immediately drop that after the conversion
|
|
seed: u32::from_be_bytes(data[16..20].try_into().unwrap()) & 0xffffff,
|
|
},
|
|
})
|
|
}
|
|
|
|
pub fn tabsize(&self) -> usize {
|
|
let tmp: usize = 4 + usize::try_from(self.nbuckets).unwrap() + 2 * usize::from(self.nblf);
|
|
4 * tmp + self.settings.chain_entry_size() * usize::try_from(self.nchains).unwrap()
|
|
}
|
|
}
|
|
|
|
/// Hash -> index conversion/transformation helper.
///
/// Treats a section of `items` bytes as `items / div` slots of `div` bytes each and
/// returns the byte offset of the slot selected by `h`.
///
/// # Panics
/// Panics if `div` is zero or if `items < div` (there would be no slot to select).
pub fn hash_trf(h: u64, items: usize, div: usize) -> usize {
    let slot_count = u64::try_from(items / div).unwrap();
    let slot = usize::try_from(h % slot_count).unwrap();
    div * slot
}
|
|
|
|
impl<'a> Value<&'a [u8]> {
|
|
pub fn parse(entry: &'a [u8]) -> Option<Self> {
|
|
if entry.len() < 16 {
|
|
return None;
|
|
}
|
|
Some(Self {
|
|
typ: u32::from_be_bytes(entry[12..16].try_into().unwrap()),
|
|
rest: &entry[16..],
|
|
})
|
|
}
|
|
}
|
|
|
|
impl<'a> Ref<'a> {
|
|
/// `location` should be the offset where the hash table is present (in units of 16 bytes)
|
|
pub fn parse(data: &'a [u8], location: u32) -> Option<Self> {
|
|
let alldata = data;
|
|
let uf = <usize as TryFrom<u32>>::try_from;
|
|
let offset = crate::decode_location(location);
|
|
let data = data.get(offset..)?;
|
|
|
|
let header = Header::parse(data)?;
|
|
let data = data.get(..header.tabsize())?;
|
|
|
|
let bloom_end: usize = 16 + 8 * usize::from(header.nblf);
|
|
let buckets_end = bloom_end + 4 * uf(header.nbuckets).unwrap();
|
|
let chains_end =
|
|
buckets_end + header.settings.chain_entry_size() * uf(header.nchains).unwrap();
|
|
assert_eq!(data.len(), chains_end);
|
|
|
|
Some(Ref {
|
|
settings: header.settings,
|
|
strtab: alldata.get(usize::try_from(header.strtab_link).ok()?..)?,
|
|
bloom: &data[16..bloom_end],
|
|
buckets: &data[bloom_end..buckets_end],
|
|
chains: &data[buckets_end..chains_end],
|
|
})
|
|
}
|
|
|
|
/// NOTE: the key is truncated after the first null byte
|
|
pub fn lookup(&self, key: &[u8]) -> Option<Value<&'a [u8]>> {
|
|
let key = trunc_key_at0(key);
|
|
let (h, blmask) = self.settings.translate_key(key);
|
|
|
|
// check bloom filter
|
|
let blsel = hash_trf(h / 64, self.bloom.len(), 8);
|
|
let blword = u64::from_be_bytes(self.bloom[blsel..blsel + 8].try_into().unwrap());
|
|
if (blword & blmask) != blmask {
|
|
return None;
|
|
}
|
|
|
|
// retrieve bucket/chain start index
|
|
let bkid = hash_trf(h, self.buckets.len(), 4);
|
|
let chain_start = usize::try_from(u32::from_be_bytes(
|
|
self.buckets[bkid..bkid + 4].try_into().unwrap(),
|
|
))
|
|
.ok()?;
|
|
|
|
for sel in self
|
|
.chains
|
|
.chunks(self.settings.chain_entry_size())
|
|
.take(chain_start)
|
|
{
|
|
assert!(sel.len() >= 16);
|
|
let e_hash = u64::from_be_bytes(sel[0..8].try_into().unwrap());
|
|
if (h | 1) == (e_hash | 1) {
|
|
let e_name_ix =
|
|
usize::try_from(u32::from_be_bytes(sel[8..12].try_into().unwrap())).ok()?;
|
|
let e_name = trunc_key_at0(self.strtab.get(e_name_ix..)?);
|
|
|
|
if e_name == key {
|
|
return Some(Value::parse(sel).unwrap());
|
|
}
|
|
}
|
|
|
|
if (e_hash & 1) == 0 {
|
|
break;
|
|
}
|
|
}
|
|
|
|
return None;
|
|
}
|
|
|
|
pub fn iter(&self) -> Iter<'a> {
|
|
Iter {
|
|
strtab: self.strtab,
|
|
chains: self.chains,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'a> Iterator for Iter<'a> {
|
|
type Item = (u64, &'a [u8], Value<&'a [u8]>);
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
if self.chains.len() < 32 {
|
|
return None;
|
|
}
|
|
let (i, j) = self.chains.split_at(32);
|
|
self.chains = j;
|
|
let e_name_ix = usize::try_from(u32::from_be_bytes(i[8..12].try_into().unwrap())).ok()?;
|
|
let e_name = trunc_key_at0(self.strtab.get(e_name_ix..)?);
|
|
Some((
|
|
u64::from_be_bytes(i[0..8].try_into().unwrap()),
|
|
e_name,
|
|
Value::parse(i).unwrap(),
|
|
))
|
|
}
|
|
}
|
|
|
|
impl Settings {
|
|
/// NOTE: the key must be truncated first using [`trunc_key_at0`]
|
|
pub fn translate_key(&self, key: &[u8]) -> (u64, u64) {
|
|
let h = xxh64(key, self.seed.into());
|
|
let blmask = (1 << (h % 64)) | (1 << ((h >> self.blshift) % 64));
|
|
(h, blmask)
|
|
}
|
|
}
|