hash tables: allow per-table custom entry size

2023-01-09 20:03:10 +01:00 · 2023-01-09 20:03:10 +01:00 · aadedaa1a3
commit aadedaa1a3
parent a3c3228c1a
3 changed files with 56 additions and 46 deletions
--- a/crates/yglnk-core/src/hash_table.rs
+++ b/crates/yglnk-core/src/hash_table.rs
@ -3,7 +3,8 @@ use xxhash_rust::xxh64::xxh64;

 #[derive(Clone, Copy, Debug)]
 pub struct Settings {
-    pub seed: u64,
+    pub seed: u32,
+    pub entsize: u16,
    pub blshift: u8,
 }

@ -32,10 +33,16 @@ pub struct Iter<'a> {
 }

 #[derive(Clone, Copy, Debug)]
-pub struct Value {
+pub struct Value<R> {
    pub typ: u32,
-    pub lhs: u64,
-    pub rhs: u64,
+    pub rest: R,
+}
+
+impl Settings {
+    #[inline]
+    pub fn chain_entry_size(&self) -> usize {
+        16 * (1 + usize::from(self.entsize))
+    }
 }

 impl Header {
@ -43,26 +50,24 @@ impl Header {
        if data.len() < 20 {
            return None;
        }
-        let mut seed = u64::from(data[15]) << 32;
-        seed += u64::from(u32::from_be_bytes(data[16..20].try_into().unwrap()));
        Some(Self {
            strtab_link: u32::from_be_bytes(data[0..4].try_into().unwrap()),
            nbuckets: u32::from_be_bytes(data[4..8].try_into().unwrap()),
            nchains: u32::from_be_bytes(data[8..12].try_into().unwrap()),
-            nblf: u16::from_be_bytes(data[12..14].try_into().unwrap()),
+            nblf: u16::from_be_bytes(data[14..16].try_into().unwrap()),
            settings: Settings {
-                blshift: data[14],
-                seed,
+                entsize: u16::from_be_bytes(data[12..14].try_into().unwrap()),
+                blshift: data[16],
+                // the seed is only 3 bytes (17..20): read it as a big-endian u32 together with
+                // the preceding blshift byte (16), then mask that top byte off again
+                seed: u32::from_be_bytes(data[16..20].try_into().unwrap()) & 0xffffff,
            },
        })
    }

    pub fn tabsize(&self) -> usize {
-        let tmp: usize = 4
-            + usize::try_from(self.nbuckets).unwrap()
-            + 8 * usize::try_from(self.nchains).unwrap()
-            + 2 * usize::from(self.nblf);
-        4 * tmp
+        let tmp: usize = 4 + usize::try_from(self.nbuckets).unwrap() + 2 * usize::from(self.nblf);
+        4 * tmp + self.settings.chain_entry_size() * usize::try_from(self.nchains).unwrap()
    }
 }

@ -71,15 +76,14 @@ pub fn hash_trf(h: u64, items: usize, div: usize) -> usize {
    div * usize::try_from(h % u64::try_from(items / div).unwrap()).unwrap()
 }

-impl Value {
-    pub fn parse(entry: &[u8]) -> Option<Self> {
-        if entry.len() < 32 {
+impl<'a> Value<&'a [u8]> {
+    pub fn parse(entry: &'a [u8]) -> Option<Self> {
+        if entry.len() < 16 {
            return None;
        }
        Some(Self {
            typ: u32::from_be_bytes(entry[12..16].try_into().unwrap()),
-            lhs: u64::from_be_bytes(entry[16..24].try_into().unwrap()),
-            rhs: u64::from_be_bytes(entry[24..32].try_into().unwrap()),
+            rest: &entry[16..],
        })
    }
 }
@ -97,7 +101,10 @@ impl<'a> Ref<'a> {

        let bloom_end: usize = 16 + 8 * usize::from(header.nblf);
        let buckets_end = bloom_end + 4 * uf(header.nbuckets).unwrap();
-        let chains_end = buckets_end + 32 * uf(header.nchains).unwrap();
+        let chains_end =
+            buckets_end + header.settings.chain_entry_size() * uf(header.nchains).unwrap();
+        assert_eq!(data.len(), chains_end);
+
        Some(Ref {
            settings: header.settings,
            strtab: alldata.get(usize::try_from(header.strtab_link).ok()?..)?,
@ -108,7 +115,7 @@ impl<'a> Ref<'a> {
    }

    /// NOTE: the key is truncated after the first null byte
-    pub fn lookup(&self, key: &[u8]) -> Option<Value> {
+    pub fn lookup(&self, key: &[u8]) -> Option<Value<&'a [u8]>> {
        let key = trunc_key_at0(key);
        let (h, blmask) = self.settings.translate_key(key);

@ -126,7 +133,12 @@ impl<'a> Ref<'a> {
        ))
        .ok()?;

-        for sel in self.chains.chunks(32).take(chain_start) {
+        for sel in self
+            .chains
+            .chunks(self.settings.chain_entry_size())
+            .take(chain_start)
+        {
+            assert!(sel.len() >= 16);
            let e_hash = u64::from_be_bytes(sel[0..8].try_into().unwrap());
            if (h | 1) == (e_hash | 1) {
                let e_name_ix =
@ -134,7 +146,7 @@ impl<'a> Ref<'a> {
                let e_name = trunc_key_at0(self.strtab.get(e_name_ix..)?);

                if e_name == key {
-                    return Value::parse(sel);
+                    return Some(Value::parse(sel).unwrap());
                }
            }

@ -155,7 +167,7 @@ impl<'a> Ref<'a> {
 }

 impl<'a> Iterator for Iter<'a> {
-    type Item = (u64, &'a [u8], Value);
+    type Item = (u64, &'a [u8], Value<&'a [u8]>);

    fn next(&mut self) -> Option<Self::Item> {
        if self.chains.len() < 32 {
@ -176,7 +188,7 @@ impl<'a> Iterator for Iter<'a> {
 impl Settings {
    /// NOTE: the key must be truncated first using [`trunc_key_at0`]
    pub fn translate_key(&self, key: &[u8]) -> (u64, u64) {
-        let h = xxh64(key, self.seed);
+        let h = xxh64(key, self.seed.into());
        let blmask = (1 << (h % 64)) | (1 << ((h >> self.blshift) % 64));
        (h, blmask)
    }
--- a/crates/yglnk-ls/src/main.rs
+++ b/crates/yglnk-ls/src/main.rs
@ -43,7 +43,12 @@ fn name_to_txt(name: &[u8]) -> String {
    if let Ok(name) = core::str::from_utf8(name) {
        format!("{:?}", name)
    } else {
-        format!("{:?}", name)
+        "0x".chars()
+            .chain(
+                name.iter()
+                    .flat_map(|i| format!("{:02x}", i).chars().collect::<Vec<_>>()),
+            )
+            .collect()
    }
 }

@ -64,31 +69,27 @@ fn main() {
                .expect("unable to parse table header");
            println!("location\tmeta\t\tname\t\ttyp\t\trest");
            for i in ltr {
-                print!(
-                    "{:08x}\t{:08x}\t{}\t{}\t",
+                println!(
+                    "{:08x}\t{:08x}\t{}\t{}\t{}",
                    i.location,
                    i.meta,
                    name_to_txt(i.name),
                    typ_to_txt(i.typ),
+                    name_to_txt(i.rest),
                );
-                for j in i.rest {
-                    print!("{:02x}", j);
-                }
-                println!("");
            }
        }
        Command::Hash => {
            let ht = yglnk_core::hash_table::Ref::parse(data, cli.location)
                .expect("unable to parse table header");
-            println!("hash\tL\tR\tname\t\ttyp");
+            println!("hash\t\tname\t\ttyp\trest");
            for (hash, name, value) in ht.iter() {
                println!(
-                    "{:016x}\t{:016x}\t{:016x}\t{}\t{}",
+                    "{:016x}\t{}\t{}\t{}",
                    hash,
-                    value.lhs,
-                    value.rhs,
                    name_to_txt(name),
-                    typ_to_txt(value.typ)
+                    typ_to_txt(value.typ),
+                    name_to_txt(value.rest),
                );
            }
        }
@ -97,12 +98,9 @@ fn main() {
                .expect("unable to parse table header");
            match ht.lookup(key.as_bytes()) {
                None => println!("(none)"),
-                Some(value) => println!(
-                    "typ={} L={:016x} R={:016x}",
-                    typ_to_txt(value.typ),
-                    value.lhs,
-                    value.rhs
-                ),
+                Some(value) => {
+                    println!("typ={} {}", typ_to_txt(value.typ), name_to_txt(value.rest),)
+                }
            }
        }
        Command::X2dhc => unimplemented!(),
--- a/docs/index.gmi
+++ b/docs/index.gmi
@ -85,9 +85,9 @@ list entry  := name[4b] type[4b] location[4b] meta[4b] rest[*]
 A hash table, using 64bit-xxHash. chains are traversed in order, the last bit of the first 8b of a chain entry is used to indicate if another entry follows (0 = last entry), the rest contains the hash.

 ```
-ht header   := strtab_link[4b] nbuckets[4b] nchains[4b] nblf[2b] blshift[1b] seed[5b]
-ht body     := bloom[8b * nlbf] buckets[4b * nbuckets] chains[32b * nchains]
-chain entry := hash[8b] name[4b] type[4b] value[16b]
+ht header   := strtab_link[4b] nbuckets[4b] nchains[4b] entsize[2b] nblf[2b] blshift[1b] seed[3b]
+ht body     := bloom[8b * nblf] buckets[4b * nbuckets] chains[16b * (1 + entsize) * nchains]
+chain entry := hash[8b] name[4b] type[4b] rest[16b * entsize]
 ```

 ## 2D "hilbert curve" tables