hash_table: add serializer

This commit is contained in:
Alain Zscheile 2023-01-25 16:31:05 +01:00
parent 7d61a9b7fc
commit a40017fcf1
3 changed files with 194 additions and 13 deletions

View file

@ -19,3 +19,9 @@ default-features = false
[dependencies.xxhash-rust]
version = "0.8"
features = ["xxh64"]
[features]
alloc = []
[package.metadata.docs.rs]
features = ["alloc"]

View file

@ -8,6 +8,15 @@ pub struct Settings {
pub blshift: u8,
}
impl Settings {
    /// Hashes `key` with the configured seed and derives its bloom-filter mask.
    ///
    /// Returns `(h, blmask)`: `h` is the 64-bit hash of `key`, and `blmask` has
    /// (at most) two bits set, one selected by `h % 64` and one selected by
    /// `(h >> blshift) % 64`.
    ///
    /// NOTE: the key must be truncated first using [`trunc_key_at0`]
    pub fn translate_key(&self, key: &[u8]) -> (u64, u64) {
        let h = xxh64(key, self.seed.into());
        // `blshift` is an unrestricted `u8`, so it can be >= 64. A plain `>>`
        // would then panic in builds with overflow checks enabled and mask the
        // shift amount otherwise; `wrapping_shr` always masks, which makes the
        // result identical in every build profile and never panics.
        let shifted = h.wrapping_shr(u32::from(self.blshift));
        let blmask = (1u64 << (h % 64)) | (1u64 << (shifted % 64));
        (h, blmask)
    }
}
#[derive(Clone, Copy, Debug)]
pub struct Header {
pub strtab_link: u32,
@ -50,10 +59,11 @@ impl Settings {
impl Header {
pub fn parse(data: &[u8]) -> Option<Self> {
if data.len() < 20 {
return None;
}
Some(Self {
Some(Self::decode(data.get(0..20)?.try_into().unwrap()))
}
pub fn decode(data: [u8; 20]) -> Self {
Self {
strtab_link: u32::from_be_bytes(data[0..4].try_into().unwrap()),
nbuckets: u32::from_be_bytes(data[4..8].try_into().unwrap()),
nchains: u32::from_be_bytes(data[8..12].try_into().unwrap()),
@ -65,7 +75,22 @@ impl Header {
// and immediately drop that after the conversion
seed: u32::from_be_bytes(data[16..20].try_into().unwrap()) & 0xffffff,
},
})
}
}
/// Serializes the header into its 20-byte big-endian on-disk form.
///
/// Layout: `strtab_link` (4 bytes), `nbuckets` (4), `nchains` (4),
/// `entsize` (2), `nblf` (2), `blshift` (1), low three bytes of `seed` (3).
///
/// # Panics
///
/// Panics if the top byte of `settings.seed` is non-zero, because that byte
/// position is occupied by `blshift` in the encoded form.
pub fn encode(&self) -> [u8; 20] {
    let settings = &self.settings;
    // the seed only has 24 bits of room; refuse to silently drop the rest
    assert_eq!(settings.seed & 0xff000000, 0);

    let mut out = [0u8; 20];
    out[..4].copy_from_slice(&self.strtab_link.to_be_bytes());
    out[4..8].copy_from_slice(&self.nbuckets.to_be_bytes());
    out[8..12].copy_from_slice(&self.nchains.to_be_bytes());
    out[12..14].copy_from_slice(&settings.entsize.to_be_bytes());
    out[14..16].copy_from_slice(&self.nblf.to_be_bytes());
    // byte 16 holds blshift, bytes 17..20 hold the lower three seed bytes
    out[16] = settings.blshift;
    out[17..].copy_from_slice(&settings.seed.to_be_bytes()[1..]);
    out
}
pub fn tabsize(&self) -> usize {
@ -102,7 +127,7 @@ impl<'a> Ref<'a> {
let header = Header::parse(data)?;
let data = data.get(..header.tabsize())?;
let bloom_end: usize = 16 + 8 * usize::from(header.nblf);
let bloom_end: usize = 20 + 8 * usize::from(header.nblf);
let buckets_end = bloom_end + 4 * uf(header.nbuckets).unwrap();
let chains_end =
buckets_end + header.settings.chain_entry_size() * uf(header.nchains).unwrap();
@ -186,11 +211,141 @@ impl<'a> Iterator for Iter<'a> {
}
}
impl Settings {
/// Hashes `key` with the configured seed and derives its bloom-filter mask.
///
/// Returns `(h, blmask)`: `h` is the 64-bit hash of `key`, and `blmask` has
/// (at most) two bits set, one selected by `h % 64` and one selected by
/// `(h >> blshift) % 64`.
///
/// NOTE: the key must be truncated first using [`trunc_key_at0`]
pub fn translate_key(&self, key: &[u8]) -> (u64, u64) {
    let h = xxh64(key, self.seed.into());
    // `blshift` is an unrestricted `u8`, so it can be >= 64. A plain `>>`
    // would then panic in builds with overflow checks enabled and mask the
    // shift amount otherwise; `wrapping_shr` always masks, which makes the
    // result identical in every build profile and never panics.
    let shifted = h.wrapping_shr(u32::from(self.blshift));
    let blmask = (1u64 << (h % 64)) | (1u64 << (shifted % 64));
    (h, blmask)
}
/// A hash table entry that has not been serialized yet.
///
/// `R` is the type of the entry payload stored in `rest`.
#[derive(Clone, Copy, Debug)]
pub struct PreEntry<R> {
    /// Index into the string table at which the name of this entry starts.
    pub name_ix: u32,

    /// Type tag of the entry.
    pub typ: u32,

    /// Content (payload) of the entry.
    pub rest: R,
}
#[cfg(feature = "alloc")]
use alloc::{boxed::Box, vec, vec::Vec};
/// Builds the serialized form of a hash table from a stream of entries.
///
/// The output consists of, in order: the 20-byte encoded [`Header`], the bloom
/// filter (`nblf` big-endian `u64` words), the bucket array (`nbuckets`
/// big-endian `u32` values) and the chain entries (each
/// `settings.chain_entry_size()` bytes long).
///
/// Every chain entry starts with 16 fixed bytes (8-byte hash, 4-byte `name_ix`,
/// 4-byte `typ`, all big-endian) followed by the entry's `rest` payload; bytes
/// not covered by the payload stay zero. The lowest bit of the stored hash is
/// set for every entry except the last one of its chain, where it is cleared.
///
/// Returns `None` if `nbuckets` does not fit into `usize`, if the lower bound of
/// `data.size_hint()` already exceeds `u32::MAX`, or if the total number of
/// entries does not fit into `u32`.
///
/// # Panics
///
/// Panics if the chain entry size is smaller than 16, if any payload is longer
/// than the chain entry size minus 16, or if `Header::encode` rejects the seed.
/// Indexing `strtab`, `bloom` and `chains` can panic as well when an index is
/// out of range (e.g. a `name_ix` past the end of the string table).
///
/// NOTE(review): the type parameter `O` is not used by any parameter, bound or
/// the return type, so it cannot be inferred and callers have to name it
/// explicitly.
#[cfg(feature = "alloc")]
pub fn serialize<I, R, O>(
strtab: crate::StrtabDescriptorRef<'_>,
nbuckets: u32,
nblf: u16,
settings: Settings,
data: I,
) -> Option<Vec<u8>>
where
I: Iterator<Item = PreEntry<R>>,
R: Into<Box<[u8]>>,
{
let mut header = Header {
strtab_link: strtab.location,
nbuckets,
// chains get computed later
nchains: 0,
nblf,
settings,
};
// the bloom filter is accumulated as native u64 words and only converted to
// big-endian bytes when it is copied into the output
let mut bloom = vec![0u64; nblf.into()];
let nbku: usize = nbuckets.try_into().ok()?;
// cheap early rejection: if the iterator already promises more than u32::MAX
// items, the entry count can never fit into `nchains`
// (the check is skipped when u32::MAX itself does not fit into usize)
let (data_lb_count, _) = data.size_hint();
if usize::try_from(u32::MAX)
.map(|dub| data_lb_count > dub)
.unwrap_or(false)
{
// too much data, overflow
return None;
}
// one entry list per bucket; each entry additionally carries its hash
let mut chains = vec![Vec::<PreEntry<(u64, Box<[u8]>)>>::new(); nbku];
let actual_entsize = settings.chain_entry_size();
// 16 = 8 (hash) + 4 (name_ix) + 4 (typ), the fixed part written for every entry
assert!(actual_entsize >= 16);
for PreEntry { name_ix, typ, rest } in data {
// indexing the string table yields the name already truncated at its terminator
let name = &strtab[name_ix];
let rest: Box<[u8]> = rest.into();
// the payload has to fit behind the 16 fixed bytes of a chain entry
assert!(actual_entsize >= (16 + rest.len()));
let (h, blmask) = header.settings.translate_key(name);
// add to bloom filter; (div=1 because we use u64 entries and convert later)
// NOTE(review): `hash_trf` is defined outside this view; presumably it maps the
// hash to an index below the given length -- confirm, in particular for length 0
let blid = hash_trf(h / 64, bloom.len(), 1);
bloom[blid] |= blmask;
// add to chain
chains[hash_trf(h, nbku, 1)].push(PreEntry {
name_ix,
typ,
rest: (h, rest),
});
}
// the total entry count is only known now; bail out if it does not fit into u32
header.nchains = u32::try_from(chains.iter().map(|i| i.len()).sum::<usize>()).ok()?;
// zero-initialized output buffer of the size reported by the header
let mut ret = vec![0u8; header.tabsize()];
// copy header
ret[0..20].copy_from_slice(&header.encode());
// copy bloom filter
let bloom_end: usize = 20 + 8 * usize::from(header.nblf);
for (blin, blout) in bloom
.into_iter()
.zip(ret[20..bloom_end].chunks_exact_mut(8))
{
blout.copy_from_slice(&u64::to_be_bytes(blin));
}
// copy buckets and chains
let (buckets_out, chains_out) =
ret[bloom_end..].split_at_mut(4 * usize::try_from(header.nbuckets).unwrap());
// sanity check: the space left after the buckets is exactly the chain area
assert_eq!(
chains_out.len(),
actual_entsize * usize::try_from(header.nchains).unwrap()
);
let mut buckets_out = buckets_out.chunks_exact_mut(4);
// index (counted in entries, not bytes) of the next chain entry to be written
let mut chain_offset = 0u32;
for i in chains {
// every bucket stores the entry index at which its chain starts;
// this is written for empty buckets too, before they are skipped
buckets_out
.next()
.unwrap()
.copy_from_slice(&u32::to_be_bytes(chain_offset));
if i.is_empty() {
continue;
}
// position of the last entry in this chain
let ilenm1 = i.len() - 1;
for (
n,
PreEntry {
name_ix,
typ,
rest: (mut h, rest),
},
) in i.into_iter().enumerate()
{
// byte offset of this entry inside the chain area
let actual_offset = usize::try_from(chain_offset)
.unwrap()
.checked_mul(actual_entsize)
.unwrap();
let entry = &mut chains_out[actual_offset..actual_offset + actual_entsize];
chain_offset += 1;
// the lowest hash bit is repurposed as a marker: 1 = more entries follow in
// this chain, 0 = this is the last entry
h |= 1;
h = if n == ilenm1 {
// last chain element gets its last hash bit set to 0
h ^ 1
} else {
h
};
entry[0..8].copy_from_slice(&u64::to_be_bytes(h));
entry[8..12].copy_from_slice(&u32::to_be_bytes(name_ix));
entry[12..16].copy_from_slice(&u32::to_be_bytes(typ));
// payload follows the fixed part; any remaining bytes of the entry stay zero
entry[16..16 + rest.len()].copy_from_slice(&rest[..]);
}
}
// every bucket slot must have been consumed and every entry written
assert_eq!(buckets_out.next(), None);
assert_eq!(chain_offset, header.nchains);
Some(ret)
}

View file

@ -1,6 +1,9 @@
#![no_std]
#![forbid(unsafe_code)]
#[cfg(feature = "alloc")]
extern crate alloc;
pub use int_enum::{IntEnum, IntEnumError};
pub mod hash_table;
@ -79,6 +82,23 @@ pub fn trunc_key_at0(key: &[u8]) -> &[u8] {
.unwrap_or(key)
}
/// Converts a location given in 16-byte units into a byte offset.
///
/// Returns `None` if `location` does not fit into `usize` or if multiplying it
/// by 16 overflows `usize`.
#[inline]
pub fn decode_location(location: u32) -> Option<usize> {
    match usize::try_from(location) {
        Ok(units) => units.checked_mul(16),
        Err(_) => None,
    }
}
/// A reference to a string table, including its data and location
///
/// Indexing it with a `u32` byte offset yields the bytes starting at that
/// offset, truncated by `trunc_key_at0`.
pub struct StrtabDescriptorRef<'a> {
/// the bytes of the string table
pub data: &'a [u8],
/// as usual for yglnk, the location is specified in 16-byte units
pub location: u32,
}
impl core::ops::Index<u32> for StrtabDescriptorRef<'_> {
    type Output = [u8];

    /// Returns the string starting at byte offset `index` of the table data,
    /// truncated by [`trunc_key_at0`].
    ///
    /// # Panics
    ///
    /// Panics if `index` does not fit into `usize` or lies beyond the end of
    /// the table data.
    fn index(&self, index: u32) -> &[u8] {
        let start: usize = index.try_into().unwrap();
        let tail = &self.data[start..];
        trunc_key_at0(tail)
    }
}