split hash-table code into separate file
This commit is contained in:
parent
25dd03ce25
commit
b440739715
122
crates/yglnk-core/src/ht.rs
Normal file
122
crates/yglnk-core/src/ht.rs
Normal file
|
@ -0,0 +1,122 @@
|
|||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
/// Decoded form of the fixed 16-byte header that starts every hash table.
///
/// All multi-byte values are stored big-endian in the on-disk representation.
#[derive(Debug, Clone, Copy)]
pub struct HashTableHeader {
    /// Byte offset, relative to the start of the whole data, at which the string table begins.
    pub strtab_link: u32,
    /// Number of 32-bit entries in the bucket array.
    pub nbuckets: u32,
    /// Number of 64-bit words in the bloom filter.
    pub nblf: u16,
    /// Right-shift applied to a key hash to derive the second bloom-filter bit.
    pub blshift: u8,
    /// Seed handed to the hash function; only the low 40 bits are stored in the header.
    pub seed: u64,
}
|
||||
|
||||
/// Borrowed, ready-to-query view of a hash table and the string table it refers to.
///
/// Every slice borrows from the buffer handed to `HashTableRef::parse`.
pub struct HashTableRef<'a> {
    /// Seed used when hashing lookup keys.
    seed: u64,
    /// Right-shift used to derive the second bloom-filter bit from a key hash.
    blshift: u8,
    /// String table; entry names are NUL-terminated and addressed by byte offset.
    strtab: &'a [u8],
    /// Bloom filter, a sequence of big-endian 64-bit words.
    bloom: &'a [u8],
    /// Bucket array, a sequence of big-endian 32-bit chain indices.
    buckets: &'a [u8],
    /// Chain area, a sequence of 32-byte entries.
    chains: &'a [u8],
}
|
||||
|
||||
impl HashTableHeader {
|
||||
pub fn parse(data: &[u8]) -> Option<Self> {
|
||||
if data.len() < 16 {
|
||||
return None;
|
||||
}
|
||||
let mut seed = u64::from(data[11]) << 32;
|
||||
seed += u64::from(u32::from_be_bytes(data[12..16].try_into().unwrap()));
|
||||
Some(Self {
|
||||
strtab_link: u32::from_be_bytes(data[0..4].try_into().unwrap()),
|
||||
nbuckets: u32::from_be_bytes(data[4..8].try_into().unwrap()),
|
||||
nblf: u16::from_be_bytes(data[8..10].try_into().unwrap()),
|
||||
blshift: data[10],
|
||||
seed,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn tabsize(&self) -> usize {
|
||||
let tmp: usize = 4 + 2 * usize::try_from(self.nbuckets).unwrap() + usize::from(self.nblf);
|
||||
4 * tmp
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the prefix of `key` that precedes its first NUL byte.
///
/// When `key` contains no NUL byte the whole slice is returned unchanged.
pub fn trunc_key_at0(key: &[u8]) -> &[u8] {
    key.iter()
        .position(|&byte| byte == 0)
        .map_or(key, |nul_at| &key[..nul_at])
}
|
||||
|
||||
impl<'a> HashTableRef<'a> {
|
||||
/// `location` should be the offset where the hash table is present (in units of 16 bytes)
|
||||
pub fn parse(data: &'a [u8], location: u32, entsize: u16, entcount: u16) -> Option<Self> {
|
||||
let alldata = data;
|
||||
let uf = <usize as TryFrom<u32>>::try_from;
|
||||
let offset = 16 * uf(location).unwrap();
|
||||
let data = &data[offset..offset + 16 * usize::from(entsize) * usize::from(entcount)];
|
||||
|
||||
let header = HashTableHeader::parse(data)?;
|
||||
if data.len() < header.tabsize() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let bloom_end: usize = 16 + 8 * usize::from(header.nblf);
|
||||
let buckets_end = bloom_end + 4 * uf(header.nbuckets).unwrap();
|
||||
Some(HashTableRef {
|
||||
blshift: header.blshift,
|
||||
seed: header.seed,
|
||||
strtab: alldata.get(usize::try_from(header.strtab_link).ok()?..)?,
|
||||
bloom: &data[16..bloom_end],
|
||||
buckets: &data[bloom_end..buckets_end],
|
||||
chains: &data[buckets_end..],
|
||||
})
|
||||
}
|
||||
|
||||
/// NOTE: the key is truncated after the first null byte
|
||||
pub fn lookup(&self, key: &[u8]) -> Option<(u32, u64, u64)> {
|
||||
// hash -> index conversion/transformation helper
|
||||
fn htr(h: u64, items: usize, div: usize) -> usize {
|
||||
div * usize::try_from(h % u64::try_from(items / div).unwrap()).unwrap()
|
||||
}
|
||||
|
||||
let key = trunc_key_at0(key);
|
||||
let h = xxh64(key, self.seed);
|
||||
|
||||
// check bloom filter
|
||||
let blsel = htr(h / 64, self.bloom.len(), 8);
|
||||
let blword = u64::from_be_bytes(self.bloom[blsel..blsel + 8].try_into().unwrap());
|
||||
let blmask: u64 = (1 << (h % 64)) | (1 << ((h >> self.blshift) % 64));
|
||||
|
||||
if (blword & blmask) != blmask {
|
||||
return None;
|
||||
}
|
||||
|
||||
// retrieve bucket/chain start index
|
||||
let bkid = htr(h, self.buckets.len(), 4);
|
||||
let chain_start = usize::try_from(u32::from_be_bytes(
|
||||
self.buckets[bkid..bkid + 4].try_into().unwrap(),
|
||||
))
|
||||
.ok()?;
|
||||
|
||||
for sel in self.chains.chunks(32).take(chain_start) {
|
||||
let e_hash = u64::from_be_bytes(sel[0..8].try_into().unwrap());
|
||||
if (h | 1) == (e_hash | 1) {
|
||||
let e_name_ix =
|
||||
usize::try_from(u32::from_be_bytes(sel[8..12].try_into().unwrap())).ok()?;
|
||||
let e_name = trunc_key_at0(self.strtab.get(e_name_ix..)?);
|
||||
|
||||
if e_name == key {
|
||||
return Some((
|
||||
u32::from_be_bytes(sel[12..16].try_into().unwrap()),
|
||||
u64::from_be_bytes(sel[16..24].try_into().unwrap()),
|
||||
u64::from_be_bytes(sel[24..32].try_into().unwrap()),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if (e_hash & 1) == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return None;
|
||||
}
|
||||
}
|
|
@ -1,7 +1,10 @@
|
|||
#![no_std]
|
||||
#![forbid(unsafe_code)]
|
||||
|
||||
use int_enum::IntEnum;
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
mod ht;
|
||||
pub use ht::*;
|
||||
|
||||
#[derive(Clone, Copy, Debug, IntEnum)]
|
||||
#[repr(u32)]
|
||||
|
@ -66,124 +69,3 @@ pub fn linear_table_iter(
|
|||
rest: &i[16..],
|
||||
})
|
||||
}
|
||||
|
||||
/// Decoded form of the fixed 16-byte header that starts every hash table.
///
/// All multi-byte values are stored big-endian in the on-disk representation.
#[derive(Debug, Clone, Copy)]
pub struct HashTableHeader {
    /// Byte offset, relative to the start of the whole data, at which the string table begins.
    pub strtab_link: u32,
    /// Number of 32-bit entries in the bucket array.
    pub nbuckets: u32,
    /// Number of 64-bit words in the bloom filter.
    pub nblf: u16,
    /// Right-shift applied to a key hash to derive the second bloom-filter bit.
    pub blshift: u8,
    /// Seed handed to the hash function; only the low 40 bits are stored in the header.
    pub seed: u64,
}
|
||||
|
||||
/// Borrowed, ready-to-query view of a hash table and the string table it refers to.
///
/// Every slice borrows from the buffer handed to `HashTableRef::parse`.
pub struct HashTableRef<'a> {
    /// Seed used when hashing lookup keys.
    seed: u64,
    /// Right-shift used to derive the second bloom-filter bit from a key hash.
    blshift: u8,
    /// String table; entry names are NUL-terminated and addressed by byte offset.
    strtab: &'a [u8],
    /// Bloom filter, a sequence of big-endian 64-bit words.
    bloom: &'a [u8],
    /// Bucket array, a sequence of big-endian 32-bit chain indices.
    buckets: &'a [u8],
    /// Chain area, a sequence of 32-byte entries.
    chains: &'a [u8],
}
|
||||
|
||||
impl HashTableHeader {
|
||||
pub fn parse(data: &[u8]) -> Option<Self> {
|
||||
if data.len() < 16 {
|
||||
return None;
|
||||
}
|
||||
let mut seed = u64::from(data[11]) << 32;
|
||||
seed += u64::from(u32::from_be_bytes(data[12..16].try_into().unwrap()));
|
||||
Some(Self {
|
||||
strtab_link: u32::from_be_bytes(data[0..4].try_into().unwrap()),
|
||||
nbuckets: u32::from_be_bytes(data[4..8].try_into().unwrap()),
|
||||
nblf: u16::from_be_bytes(data[8..10].try_into().unwrap()),
|
||||
blshift: data[10],
|
||||
seed,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn tabsize(&self) -> usize {
|
||||
let tmp: usize = 4 + 2 * usize::try_from(self.nbuckets).unwrap() + usize::from(self.nblf);
|
||||
4 * tmp
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the prefix of `key` that precedes its first NUL byte.
///
/// When `key` contains no NUL byte the whole slice is returned unchanged.
pub fn trunc_key_at0(key: &[u8]) -> &[u8] {
    key.iter()
        .position(|&byte| byte == 0)
        .map_or(key, |nul_at| &key[..nul_at])
}
|
||||
|
||||
impl<'a> HashTableRef<'a> {
|
||||
/// `location` should be the offset where the hash table is present (in units of 16 bytes)
|
||||
pub fn parse(data: &'a [u8], location: u32, entsize: u16, entcount: u16) -> Option<Self> {
|
||||
let alldata = data;
|
||||
let uf = <usize as TryFrom<u32>>::try_from;
|
||||
let offset = 16 * uf(location).unwrap();
|
||||
let data = &data[offset..offset + 16 * usize::from(entsize) * usize::from(entcount)];
|
||||
|
||||
let header = HashTableHeader::parse(data)?;
|
||||
if data.len() < header.tabsize() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let bloom_end: usize = 16 + 8 * usize::from(header.nblf);
|
||||
let buckets_end = bloom_end + 4 * uf(header.nbuckets).unwrap();
|
||||
Some(HashTableRef {
|
||||
blshift: header.blshift,
|
||||
seed: header.seed,
|
||||
strtab: alldata.get(usize::try_from(header.strtab_link).ok()?..)?,
|
||||
bloom: &data[16..bloom_end],
|
||||
buckets: &data[bloom_end..buckets_end],
|
||||
chains: &data[buckets_end..],
|
||||
})
|
||||
}
|
||||
|
||||
/// NOTE: the key is truncated after the first null byte
|
||||
pub fn lookup(&self, key: &[u8]) -> Option<(u32, u64, u64)> {
|
||||
// hash -> index conversion/transformation helper
|
||||
fn htr(h: u64, items: usize, div: usize) -> usize {
|
||||
div * usize::try_from(h % u64::try_from(items / div).unwrap()).unwrap()
|
||||
}
|
||||
|
||||
let key = trunc_key_at0(key);
|
||||
let h = xxh64(key, self.seed);
|
||||
|
||||
// check bloom filter
|
||||
let blsel = htr(h / 64, self.bloom.len(), 8);
|
||||
let blword = u64::from_be_bytes(self.bloom[blsel..blsel + 8].try_into().unwrap());
|
||||
let blmask: u64 = (1 << (h % 64)) | (1 << ((h >> self.blshift) % 64));
|
||||
|
||||
if (blword & blmask) != blmask {
|
||||
return None;
|
||||
}
|
||||
|
||||
// retrieve bucket/chain start index
|
||||
let bkid = htr(h, self.buckets.len(), 4);
|
||||
let chain_start = usize::try_from(u32::from_be_bytes(
|
||||
self.buckets[bkid..bkid + 4].try_into().unwrap(),
|
||||
))
|
||||
.ok()?;
|
||||
|
||||
for sel in self.chains.chunks(32).take(chain_start) {
|
||||
let e_hash = u64::from_be_bytes(sel[0..8].try_into().unwrap());
|
||||
if (h | 1) == (e_hash | 1) {
|
||||
let e_name_ix =
|
||||
usize::try_from(u32::from_be_bytes(sel[8..12].try_into().unwrap())).ok()?;
|
||||
let e_name = trunc_key_at0(self.strtab.get(e_name_ix..)?);
|
||||
|
||||
if e_name == key {
|
||||
return Some((
|
||||
u32::from_be_bytes(sel[12..16].try_into().unwrap()),
|
||||
u64::from_be_bytes(sel[16..24].try_into().unwrap()),
|
||||
u64::from_be_bytes(sel[24..32].try_into().unwrap()),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if (e_hash & 1) == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue