hash_table: add serializer
This commit is contained in:
parent
7d61a9b7fc
commit
a40017fcf1
|
@ -19,3 +19,9 @@ default-features = false
|
|||
[dependencies.xxhash-rust]
|
||||
version = "0.8"
|
||||
features = ["xxh64"]
|
||||
|
||||
[features]
|
||||
alloc = []
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
features = ["alloc"]
|
||||
|
|
|
@ -8,6 +8,15 @@ pub struct Settings {
|
|||
pub blshift: u8,
|
||||
}
|
||||
|
||||
impl Settings {
|
||||
/// NOTE: the key must be truncated first using [`trunc_key_at0`]
|
||||
pub fn translate_key(&self, key: &[u8]) -> (u64, u64) {
|
||||
let h = xxh64(key, self.seed.into());
|
||||
let blmask = (1 << (h % 64)) | (1 << ((h >> self.blshift) % 64));
|
||||
(h, blmask)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct Header {
|
||||
pub strtab_link: u32,
|
||||
|
@ -50,10 +59,11 @@ impl Settings {
|
|||
|
||||
impl Header {
|
||||
pub fn parse(data: &[u8]) -> Option<Self> {
|
||||
if data.len() < 20 {
|
||||
return None;
|
||||
}
|
||||
Some(Self {
|
||||
Some(Self::decode(data.get(0..20)?.try_into().unwrap()))
|
||||
}
|
||||
|
||||
pub fn decode(data: [u8; 20]) -> Self {
|
||||
Self {
|
||||
strtab_link: u32::from_be_bytes(data[0..4].try_into().unwrap()),
|
||||
nbuckets: u32::from_be_bytes(data[4..8].try_into().unwrap()),
|
||||
nchains: u32::from_be_bytes(data[8..12].try_into().unwrap()),
|
||||
|
@ -65,7 +75,22 @@ impl Header {
|
|||
// and immediately drop that after the conversion
|
||||
seed: u32::from_be_bytes(data[16..20].try_into().unwrap()) & 0xffffff,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> [u8; 20] {
|
||||
let mut ret = [0u8; 20];
|
||||
ret[0..4].copy_from_slice(&u32::to_be_bytes(self.strtab_link));
|
||||
ret[4..8].copy_from_slice(&u32::to_be_bytes(self.nbuckets));
|
||||
ret[8..12].copy_from_slice(&u32::to_be_bytes(self.nchains));
|
||||
ret[12..14].copy_from_slice(&u16::to_be_bytes(self.settings.entsize));
|
||||
ret[14..16].copy_from_slice(&u16::to_be_bytes(self.nblf));
|
||||
// to make this easier, we first overwrite the blshift field with the seed,
|
||||
// then immediately fix it up
|
||||
assert_eq!(self.settings.seed & 0xff000000, 0);
|
||||
ret[16..20].copy_from_slice(&u32::to_be_bytes(self.settings.seed));
|
||||
ret[16] = self.settings.blshift;
|
||||
ret
|
||||
}
|
||||
|
||||
pub fn tabsize(&self) -> usize {
|
||||
|
@ -102,7 +127,7 @@ impl<'a> Ref<'a> {
|
|||
let header = Header::parse(data)?;
|
||||
let data = data.get(..header.tabsize())?;
|
||||
|
||||
let bloom_end: usize = 16 + 8 * usize::from(header.nblf);
|
||||
let bloom_end: usize = 20 + 8 * usize::from(header.nblf);
|
||||
let buckets_end = bloom_end + 4 * uf(header.nbuckets).unwrap();
|
||||
let chains_end =
|
||||
buckets_end + header.settings.chain_entry_size() * uf(header.nchains).unwrap();
|
||||
|
@ -186,11 +211,141 @@ impl<'a> Iterator for Iter<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
impl Settings {
|
||||
/// NOTE: the key must be truncated first using [`trunc_key_at0`]
|
||||
pub fn translate_key(&self, key: &[u8]) -> (u64, u64) {
|
||||
let h = xxh64(key, self.seed.into());
|
||||
let blmask = (1 << (h % 64)) | (1 << ((h >> self.blshift) % 64));
|
||||
(h, blmask)
|
||||
}
|
||||
/// A table entry as it is handed to [`serialize`], i.e. before its name has
/// been hashed.
///
/// `R` is the type of the entry-specific payload.
#[derive(Clone, Copy, Debug)]
pub struct PreEntry<R> {
    /// Index into the string table at which the entry's name starts.
    pub name_ix: u32,
    /// Type tag of the entry.
    pub typ: u32,
    /// Payload of the entry.
    pub rest: R,
}
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
use alloc::{boxed::Box, vec, vec::Vec};
|
||||
|
||||
// NOTE(review): the type parameter `O` is used neither in the signature nor in
// the body, so it cannot be inferred and callers have to spell it out. It is
// kept only to leave the generic parameter list unchanged.
/// Builds the serialized image of a hash table containing `data`.
///
/// The image consists of, in order:
/// * the 20-byte encoded [`Header`],
/// * `nblf` bloom filter words (big-endian `u64`),
/// * `nbuckets` bucket slots (big-endian `u32`), each holding the index of
///   the first chain entry belonging to that bucket,
/// * the chain entries, `settings.chain_entry_size()` bytes each: the name
///   hash (`u64`), `name_ix` (`u32`) and `typ` (`u32`), all big-endian,
///   followed by the payload; unused trailing bytes stay zero.
///
/// The lowest bit of a stored hash is a chain marker: it is set for every
/// entry of a bucket except the last one, where it is cleared.
///
/// Entry names are looked up in `strtab` via `name_ix`.
///
/// Returns `None` if
/// * `nbuckets` does not fit into `usize`,
/// * the number of entries does not fit into `u32`,
/// * `data` yields at least one entry while `nblf` or `nbuckets` is zero
///   (there is nowhere to put the entry).
///
/// # Panics
///
/// Panics if `settings.chain_entry_size()` is smaller than 16, if a payload
/// does not fit into the remaining `chain_entry_size() - 16` bytes, or if a
/// `name_ix` lies outside of `strtab`.
#[cfg(feature = "alloc")]
pub fn serialize<I, R, O>(
    strtab: crate::StrtabDescriptorRef<'_>,
    nbuckets: u32,
    nblf: u16,
    settings: Settings,
    data: I,
) -> Option<Vec<u8>>
where
    I: Iterator<Item = PreEntry<R>>,
    R: Into<Box<[u8]>>,
{
    let mut header = Header {
        strtab_link: strtab.location,
        nbuckets,
        // filled in once all entries have been distributed
        nchains: 0,
        nblf,
        settings,
    };

    let mut bloom = vec![0u64; nblf.into()];
    let nbku: usize = nbuckets.try_into().ok()?;

    // Cheap early exit: if the iterator already promises more entries than a
    // `u32` chain count can describe, there is no point in consuming it.
    if let Ok(limit) = usize::try_from(u32::MAX) {
        if data.size_hint().0 > limit {
            return None;
        }
    }

    let mut chains = vec![Vec::<PreEntry<(u64, Box<[u8]>)>>::new(); nbku];
    let actual_entsize = settings.chain_entry_size();
    assert!(actual_entsize >= 16);

    // Without bloom words or buckets there is no slot any entry could go to;
    // indexing the empty vectors below would panic unconditionally.
    let can_store_entries = !bloom.is_empty() && nbku != 0;

    for PreEntry { name_ix, typ, rest } in data {
        if !can_store_entries {
            return None;
        }

        let name = &strtab[name_ix];
        let rest: Box<[u8]> = rest.into();
        assert!(actual_entsize >= (16 + rest.len()));
        let (h, blmask) = header.settings.translate_key(name);

        // add to bloom filter; (div=1 because we use u64 entries and convert later)
        let blid = hash_trf(h / 64, bloom.len(), 1);
        bloom[blid] |= blmask;

        // add to chain
        chains[hash_trf(h, nbku, 1)].push(PreEntry {
            name_ix,
            typ,
            rest: (h, rest),
        });
    }

    let total_entries: usize = chains.iter().map(Vec::len).sum();
    header.nchains = u32::try_from(total_entries).ok()?;
    let mut ret = vec![0u8; header.tabsize()];

    // copy header
    ret[0..20].copy_from_slice(&header.encode());

    // copy bloom filter
    let bloom_end: usize = 20 + 8 * usize::from(header.nblf);
    for (word, out) in bloom
        .into_iter()
        .zip(ret[20..bloom_end].chunks_exact_mut(8))
    {
        out.copy_from_slice(&word.to_be_bytes());
    }

    // copy buckets and chains
    let (buckets_out, chains_out) = ret[bloom_end..].split_at_mut(4 * nbku);
    // `tabsize()` must have reserved exactly one slot per chain entry
    assert_eq!(chains_out.len(), actual_entsize * total_entries);
    let mut bucket_slots = buckets_out.chunks_exact_mut(4);
    let mut entry_slots = chains_out.chunks_exact_mut(actual_entsize);
    let mut chain_offset = 0u32;

    for chain in chains {
        // every bucket records where its chain starts, even if it is empty
        bucket_slots
            .next()
            .unwrap()
            .copy_from_slice(&chain_offset.to_be_bytes());

        let chain_len = chain.len();
        for (
            n,
            PreEntry {
                name_ix,
                typ,
                rest: (h, rest),
            },
        ) in chain.into_iter().enumerate()
        {
            let entry = entry_slots.next().unwrap();
            chain_offset += 1;

            // the lowest hash bit marks "more entries follow in this chain";
            // the last chain element gets it cleared
            let stored_hash = if n + 1 == chain_len { h & !1 } else { h | 1 };

            entry[0..8].copy_from_slice(&stored_hash.to_be_bytes());
            entry[8..12].copy_from_slice(&name_ix.to_be_bytes());
            entry[12..16].copy_from_slice(&typ.to_be_bytes());
            entry[16..16 + rest.len()].copy_from_slice(&rest[..]);
        }
    }

    assert!(bucket_slots.next().is_none());
    assert!(entry_slots.next().is_none());
    assert_eq!(chain_offset, header.nchains);

    Some(ret)
}
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
#![no_std]
|
||||
#![forbid(unsafe_code)]
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
extern crate alloc;
|
||||
|
||||
pub use int_enum::{IntEnum, IntEnumError};
|
||||
|
||||
pub mod hash_table;
|
||||
|
@ -79,6 +82,23 @@ pub fn trunc_key_at0(key: &[u8]) -> &[u8] {
|
|||
.unwrap_or(key)
|
||||
}
|
||||
|
||||
/// Converts a location given in 16-byte units into a byte offset.
///
/// Returns `None` if the location does not fit into `usize` or if the
/// multiplication by 16 overflows.
#[inline]
pub fn decode_location(location: u32) -> Option<usize> {
    match usize::try_from(location) {
        Ok(units) => units.checked_mul(16),
        Err(_) => None,
    }
}
|
||||
|
||||
/// A reference to a string table, including its data and location
///
/// The descriptor only borrows the table and is cheap to copy.
#[derive(Clone, Copy, Debug)]
pub struct StrtabDescriptorRef<'a> {
    /// raw bytes of the string table
    pub data: &'a [u8],

    /// as usual for yglnk, the location is specified in 16-byte units
    pub location: u32,
}
|
||||
|
||||
impl core::ops::Index<u32> for StrtabDescriptorRef<'_> {
|
||||
type Output = [u8];
|
||||
|
||||
fn index(&self, index: u32) -> &[u8] {
|
||||
trunc_key_at0(&self.data[index.try_into().unwrap()..])
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue