hash tables: allow per-table custom entry size

This commit is contained in:
Alain Emilia Anna Zscheile 2023-01-09 20:03:10 +01:00
parent a3c3228c1a
commit aadedaa1a3
3 changed files with 56 additions and 46 deletions

View file

@ -3,7 +3,8 @@ use xxhash_rust::xxh64::xxh64;
#[derive(Clone, Copy, Debug)]
pub struct Settings {
pub seed: u64,
pub seed: u32,
pub entsize: u16,
pub blshift: u8,
}
@ -32,10 +33,16 @@ pub struct Iter<'a> {
}
#[derive(Clone, Copy, Debug)]
pub struct Value {
pub struct Value<R> {
pub typ: u32,
pub lhs: u64,
pub rhs: u64,
pub rest: R,
}
impl Settings {
    /// Returns the size in bytes of a single chain entry of this table.
    ///
    /// Every entry starts with one fixed 16-byte unit, followed by `entsize`
    /// additional 16-byte units of per-table payload.
    #[inline]
    pub fn chain_entry_size(&self) -> usize {
        // Entries are sized in whole 16-byte units.
        const UNIT: usize = 16;
        let extra_units = usize::from(self.entsize);
        UNIT + UNIT * extra_units
    }
}
impl Header {
@ -43,26 +50,24 @@ impl Header {
if data.len() < 20 {
return None;
}
let mut seed = u64::from(data[15]) << 32;
seed += u64::from(u32::from_be_bytes(data[16..20].try_into().unwrap()));
Some(Self {
strtab_link: u32::from_be_bytes(data[0..4].try_into().unwrap()),
nbuckets: u32::from_be_bytes(data[4..8].try_into().unwrap()),
nchains: u32::from_be_bytes(data[8..12].try_into().unwrap()),
nblf: u16::from_be_bytes(data[12..14].try_into().unwrap()),
nblf: u16::from_be_bytes(data[14..16].try_into().unwrap()),
settings: Settings {
blshift: data[14],
seed,
entsize: u16::from_be_bytes(data[12..14].try_into().unwrap()),
blshift: data[16],
// blshift (byte 16) and the 3-byte seed (bytes 17..20) are adjacent, so read all
// four bytes as one big-endian u32 and mask off the top byte (the blshift)
seed: u32::from_be_bytes(data[16..20].try_into().unwrap()) & 0xffffff,
},
})
}
pub fn tabsize(&self) -> usize {
let tmp: usize = 4
+ usize::try_from(self.nbuckets).unwrap()
+ 8 * usize::try_from(self.nchains).unwrap()
+ 2 * usize::from(self.nblf);
4 * tmp
let tmp: usize = 4 + usize::try_from(self.nbuckets).unwrap() + 2 * usize::from(self.nblf);
4 * tmp + self.settings.chain_entry_size() * usize::try_from(self.nchains).unwrap()
}
}
@ -71,15 +76,14 @@ pub fn hash_trf(h: u64, items: usize, div: usize) -> usize {
div * usize::try_from(h % u64::try_from(items / div).unwrap()).unwrap()
}
impl Value {
pub fn parse(entry: &[u8]) -> Option<Self> {
if entry.len() < 32 {
impl<'a> Value<&'a [u8]> {
pub fn parse(entry: &'a [u8]) -> Option<Self> {
if entry.len() < 16 {
return None;
}
Some(Self {
typ: u32::from_be_bytes(entry[12..16].try_into().unwrap()),
lhs: u64::from_be_bytes(entry[16..24].try_into().unwrap()),
rhs: u64::from_be_bytes(entry[24..32].try_into().unwrap()),
rest: &entry[16..],
})
}
}
@ -97,7 +101,10 @@ impl<'a> Ref<'a> {
let bloom_end: usize = 16 + 8 * usize::from(header.nblf);
let buckets_end = bloom_end + 4 * uf(header.nbuckets).unwrap();
let chains_end = buckets_end + 32 * uf(header.nchains).unwrap();
let chains_end =
buckets_end + header.settings.chain_entry_size() * uf(header.nchains).unwrap();
assert_eq!(data.len(), chains_end);
Some(Ref {
settings: header.settings,
strtab: alldata.get(usize::try_from(header.strtab_link).ok()?..)?,
@ -108,7 +115,7 @@ impl<'a> Ref<'a> {
}
/// NOTE: the key is truncated after the first null byte
pub fn lookup(&self, key: &[u8]) -> Option<Value> {
pub fn lookup(&self, key: &[u8]) -> Option<Value<&'a [u8]>> {
let key = trunc_key_at0(key);
let (h, blmask) = self.settings.translate_key(key);
@ -126,7 +133,12 @@ impl<'a> Ref<'a> {
))
.ok()?;
for sel in self.chains.chunks(32).take(chain_start) {
for sel in self
.chains
.chunks(self.settings.chain_entry_size())
.take(chain_start)
{
assert!(sel.len() >= 16);
let e_hash = u64::from_be_bytes(sel[0..8].try_into().unwrap());
if (h | 1) == (e_hash | 1) {
let e_name_ix =
@ -134,7 +146,7 @@ impl<'a> Ref<'a> {
let e_name = trunc_key_at0(self.strtab.get(e_name_ix..)?);
if e_name == key {
return Value::parse(sel);
return Some(Value::parse(sel).unwrap());
}
}
@ -155,7 +167,7 @@ impl<'a> Ref<'a> {
}
impl<'a> Iterator for Iter<'a> {
type Item = (u64, &'a [u8], Value);
type Item = (u64, &'a [u8], Value<&'a [u8]>);
fn next(&mut self) -> Option<Self::Item> {
if self.chains.len() < 32 {
@ -176,7 +188,7 @@ impl<'a> Iterator for Iter<'a> {
impl Settings {
/// NOTE: the key must be truncated first using [`trunc_key_at0`]
pub fn translate_key(&self, key: &[u8]) -> (u64, u64) {
let h = xxh64(key, self.seed);
let h = xxh64(key, self.seed.into());
let blmask = (1 << (h % 64)) | (1 << ((h >> self.blshift) % 64));
(h, blmask)
}

View file

@ -43,7 +43,12 @@ fn name_to_txt(name: &[u8]) -> String {
if let Ok(name) = core::str::from_utf8(name) {
format!("{:?}", name)
} else {
format!("{:?}", name)
"0x".chars()
.chain(
name.iter()
.flat_map(|i| format!("{:02x}", i).chars().collect::<Vec<_>>()),
)
.collect()
}
}
@ -64,31 +69,27 @@ fn main() {
.expect("unable to parse table header");
println!("location\tmeta\t\tname\t\ttyp\t\trest");
for i in ltr {
print!(
"{:08x}\t{:08x}\t{}\t{}\t",
println!(
"{:08x}\t{:08x}\t{}\t{}\t{}",
i.location,
i.meta,
name_to_txt(i.name),
typ_to_txt(i.typ),
name_to_txt(i.rest),
);
for j in i.rest {
print!("{:02x}", j);
}
println!("");
}
}
Command::Hash => {
let ht = yglnk_core::hash_table::Ref::parse(data, cli.location)
.expect("unable to parse table header");
println!("hash\tL\tR\tname\t\ttyp");
println!("hash\t\tname\t\ttyp\trest");
for (hash, name, value) in ht.iter() {
println!(
"{:016x}\t{:016x}\t{:016x}\t{}\t{}",
"{:016x}\t{}\t{}\t{}",
hash,
value.lhs,
value.rhs,
name_to_txt(name),
typ_to_txt(value.typ)
typ_to_txt(value.typ),
name_to_txt(value.rest),
);
}
}
@ -97,12 +98,9 @@ fn main() {
.expect("unable to parse table header");
match ht.lookup(key.as_bytes()) {
None => println!("(none)"),
Some(value) => println!(
"typ={} L={:016x} R={:016x}",
typ_to_txt(value.typ),
value.lhs,
value.rhs
),
Some(value) => {
println!("typ={} {}", typ_to_txt(value.typ), name_to_txt(value.rest),)
}
}
}
Command::X2dhc => unimplemented!(),

View file

@ -85,9 +85,9 @@ list entry := name[4b] type[4b] location[4b] meta[4b] rest[*]
A hash table using 64-bit xxHash. Chains are traversed in order; the lowest bit of the first 8b of a chain entry indicates whether another entry follows (0 = last entry), and the remaining bits contain the hash.
```
ht header := strtab_link[4b] nbuckets[4b] nchains[4b] nblf[2b] blshift[1b] seed[5b]
ht body := bloom[8b * nblf] buckets[4b * nbuckets] chains[32b * nchains]
chain entry := hash[8b] name[4b] type[4b] value[16b]
ht header := strtab_link[4b] nbuckets[4b] nchains[4b] entsize[2b] nblf[2b] blshift[1b] seed[3b]
ht body := bloom[8b * nblf] buckets[4b * nbuckets] chains[16b * (1 + entsize) * nchains]
chain entry := hash[8b] name[4b] type[4b] rest[16b * entsize]
```
## 2D "hilbert curve" tables