more modifications, minimal rust stuff

This commit is contained in:
Alain Emilia Anna Zscheile 2023-01-08 06:10:55 +01:00
parent 74ce57c0ed
commit 25dd03ce25
6 changed files with 387 additions and 45 deletions

5
.gitignore vendored Normal file
View file

@ -0,0 +1,5 @@
.#*
/build/
result
result-*
/target/

125
Cargo.lock generated Normal file
View file

@ -0,0 +1,125 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "int-enum"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cff87d3cc4b79b4559e3c75068d64247284aceb6a038bd4bb38387f3f164476d"
dependencies = [
"int-enum-impl",
]
[[package]]
name = "int-enum-impl"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df1f2f068675add1a3fc77f5f5ab2e29290c841ee34d151abc007bce902e5d34"
dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "once_cell"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66"
[[package]]
name = "proc-macro-crate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eda0fc3b0fb7c975631757e14d9049da17374063edb6ebbcbc54d880d4fe94e9"
dependencies = [
"once_cell",
"thiserror",
"toml",
]
[[package]]
name = "proc-macro2"
version = "1.0.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
dependencies = [
"proc-macro2",
]
[[package]]
name = "serde"
version = "1.0.152"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
[[package]]
name = "syn"
version = "1.0.107"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "toml"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1333c76748e868a4d9d1017b5ab53171dfd095f70c712fdb4653a406547f598f"
dependencies = [
"serde",
]
[[package]]
name = "unicode-ident"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
[[package]]
name = "xxhash-rust"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "735a71d46c4d68d71d4b24d03fdc2b98e38cea81730595801db779c04fe80d70"
[[package]]
name = "yglnk-core"
version = "0.1.0"
dependencies = [
"int-enum",
"xxhash-rust",
]

7
Cargo.toml Normal file
View file

@ -0,0 +1,7 @@
# Workspace root manifest: every directory matching `crates/*` is a member crate.
[workspace]
members = ["crates/*"]
# Release profile applied to all workspace members.
[profile.release]
# Split each crate into only two codegen units (fewer units trade build
# parallelism for more optimization opportunities).
codegen-units = 2
# Keep debug information in release builds.
debug = true
# Use "thin" link-time optimization.
lto = "thin"

View file

@ -0,0 +1,15 @@
# Manifest of the `yglnk-core` crate (a `#![no_std]` library).
[package]
name = "yglnk-core"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0 OR ISC"
[dependencies]
# Provides the `IntEnum` derive used for the on-disk tag enums.
# NOTE(review): default features are disabled, presumably to stay
# `no_std`-compatible -- confirm against the int-enum feature list.
[dependencies.int-enum]
version = "0.5"
default-features = false
# Provides `xxhash_rust::xxh64::xxh64`, the hash function used by the hash tables;
# only the `xxh64` feature is enabled.
[dependencies.xxhash-rust]
version = "0.8"
features = ["xxh64"]

View file

@ -0,0 +1,189 @@
#![no_std]
use int_enum::IntEnum;
use xxhash_rust::xxh64::xxh64;
/// File (sub)type tag, represented as a `u32`.
///
/// NOTE(review): presumably the value of the file header's `type[4b]` field --
/// confirm against the "File (sub)types" section of the format description.
///
/// Conversions from/to the raw integer are provided by the [`IntEnum`] derive.
/// The type is also comparable and hashable so that decoded tags can be matched
/// against expected values or used as map keys.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, IntEnum)]
#[repr(u32)]
#[rustfmt::skip]
pub enum FileType {
    None = 0x0000_0000,
    Text = 0x0000_0001,
}
/// Section type tag (the `type[4b]` field of a table entry), represented as a `u32`.
///
/// The variants mirror the `TT_*` constants of the format description.
/// Conversions from/to the raw integer are provided by the [`IntEnum`] derive.
/// The type is also comparable and hashable so that decoded tags can be matched
/// against expected values or used as map keys.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, IntEnum)]
#[repr(u32)]
#[rustfmt::skip]
pub enum Type {
    /// `TT_PLAIN_TEXT` (UTF-8).
    PlainText   = 0x0000_0000,
    /// `TT_NESTED_TEXT`.
    NestedText  = 0x0000_0001,
    /// `TT_STRING_TAB`.
    StringTable = 0x0000_0010,
    /// `TT_LINEAR_PLAIN_TAB`.
    LinearPlain = 0x0000_0012,
    /// `TT_HASH_PLAIN_TAB`.
    HashPlain   = 0x0000_0020,
    /// `TT_HASH_LINK_TAB`.
    HashLink    = 0x0000_0021,
    /// `TT_2DHC_PLAIN_TAB`.
    X2dhcPlain  = 0x0000_0030,
    /// `TT_2DHC_LINK_TAB`.
    X2dhcLink   = 0x0000_0031,
}
/// Nested-text subtype tag, represented as a `u16`.
///
/// NOTE(review): the name and the spec constant `0x010004 NTT_CODE` suggest these
/// are the `subtype[2b]` values valid for nesting type `0x01` -- confirm against
/// the "Nested Text" section of the format description.
///
/// Conversions from/to the raw integer are provided by the [`IntEnum`] derive.
/// The type is also comparable and hashable so that decoded tags can be matched
/// against expected values or used as map keys.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, IntEnum)]
#[repr(u16)]
#[rustfmt::skip]
pub enum Ntt01 {
    Div    = 0x0000,
    Group  = 0x0001,
    Header = 0x0002,
    Quote  = 0x0003,
    Code   = 0x0004,
}
/// One decoded entry of a linear table.
///
/// On-disk layout (all integers big-endian):
/// `name[4b] type[4b] location[4b] entsize[2b] entcount[2b] rest[*]`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LinearTableEntry<R> {
    /// Raw `name` field.
    pub name: u32,
    /// Raw `type` field.
    pub typ: u32,
    /// Raw `location` field.
    pub location: u32,
    /// Raw `entsize` field stored in the entry.
    pub entsize: u16,
    /// Raw `entcount` field stored in the entry.
    pub entcount: u16,
    /// Everything in the entry after the 16 fixed bytes.
    pub rest: R,
}

/// Decodes the entries of a linear table stored in `data`.
///
/// Every entry occupies `entsize * 16` bytes; at most `entcount` entries are
/// yielded. The fixed 16-byte prefix is decoded into the integer fields and
/// the remaining bytes of the entry are exposed as `rest`.
///
/// Only complete entries are yielded: if `data` ends in the middle of an
/// entry, that trailing fragment is ignored instead of causing a panic.
///
/// The iterator is bounded by `entcount` and the length of `data` only; it
/// does not itself stop at an all-zero entry.
///
/// # Panics
///
/// Panics if `entsize` is zero.
pub fn linear_table_iter(
    data: &[u8],
    entsize: u16,
    entcount: u16,
) -> impl Iterator<Item = LinearTableEntry<&'_ [u8]>> {
    assert!(entsize > 0);
    // `chunks_exact` guarantees every chunk is `entsize * 16 >= 16` bytes long,
    // so the fixed-offset reads below cannot go out of bounds.
    data.chunks_exact(usize::from(entsize) * 16)
        .take(entcount.into())
        .map(|raw| {
            let (fixed, rest) = raw.split_at(16);
            LinearTableEntry {
                name: u32::from_be_bytes([fixed[0], fixed[1], fixed[2], fixed[3]]),
                typ: u32::from_be_bytes([fixed[4], fixed[5], fixed[6], fixed[7]]),
                location: u32::from_be_bytes([fixed[8], fixed[9], fixed[10], fixed[11]]),
                entsize: u16::from_be_bytes([fixed[12], fixed[13]]),
                entcount: u16::from_be_bytes([fixed[14], fixed[15]]),
                rest,
            }
        })
}
/// Decoded 16-byte hash table header.
///
/// On-disk layout (all integers big-endian):
/// `strtab_link[4b] nbuckets[4b] nblf[2b] blshift[1b] seed[5b]`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct HashTableHeader {
    /// Link to the string table that holds the entry names.
    pub strtab_link: u32,
    /// Number of 4-byte bucket slots.
    pub nbuckets: u32,
    /// Number of 8-byte bloom filter words.
    pub nblf: u16,
    /// Shift applied to the hash to derive the second bloom filter bit.
    pub blshift: u8,
    /// Seed passed to the hash function (40 bits on disk).
    pub seed: u64,
}

impl HashTableHeader {
    /// Decodes a header from the first 16 bytes of `data`.
    ///
    /// Returns `None` if `data` is shorter than 16 bytes.
    pub fn parse(data: &[u8]) -> Option<Self> {
        let raw = data.get(..16)?;
        // The seed is a 5-byte big-endian integer occupying bytes 11..16.
        let seed_hi = u64::from(raw[11]) << 32;
        let seed_lo = u64::from(u32::from_be_bytes([raw[12], raw[13], raw[14], raw[15]]));
        Some(Self {
            strtab_link: u32::from_be_bytes([raw[0], raw[1], raw[2], raw[3]]),
            nbuckets: u32::from_be_bytes([raw[4], raw[5], raw[6], raw[7]]),
            nblf: u16::from_be_bytes([raw[8], raw[9]]),
            blshift: raw[10],
            seed: seed_hi | seed_lo,
        })
    }

    /// Size in bytes of the header plus the bloom filter and bucket arrays,
    /// i.e. the minimum length of a table with this header (chains excluded):
    /// `16 + 8 * nblf + 4 * nbuckets`.
    ///
    /// The result saturates at `usize::MAX` instead of overflowing, so a
    /// `data.len() < header.tabsize()` check rejects absurd headers.
    pub fn tabsize(&self) -> usize {
        let bloom_bytes = usize::from(self.nblf).saturating_mul(8);
        let bucket_bytes = usize::try_from(self.nbuckets)
            .unwrap_or(usize::MAX)
            .saturating_mul(4);
        bloom_bytes.saturating_add(bucket_bytes).saturating_add(16)
    }
}

/// Borrowed view of a parsed hash table, ready for lookups.
#[derive(Clone, Copy, Debug)]
pub struct HashTableRef<'a> {
    /// Seed passed to the hash function.
    seed: u64,
    /// String table bytes, starting at the header's `strtab_link`.
    strtab: &'a [u8],
    /// Bloom filter words (8 bytes each).
    bloom: &'a [u8],
    /// Bucket slots (4 bytes each).
    buckets: &'a [u8],
    /// Chain entries (32 bytes each).
    chains: &'a [u8],
    /// Shift applied to the hash to derive the second bloom filter bit.
    blshift: u8,
}
/// Returns the part of `key` that precedes its first NUL (`0`) byte, or all
/// of `key` when it contains no NUL byte.
pub fn trunc_key_at0(key: &[u8]) -> &[u8] {
    match key.iter().position(|&byte| byte == 0) {
        Some(nul_at) => &key[..nul_at],
        None => key,
    }
}
impl<'a> HashTableRef<'a> {
    /// Locates the hash table inside `data` and splits it into its parts.
    ///
    /// `location` should be the offset where the hash table is present (in units of 16 bytes);
    /// the table spans `16 * entsize * entcount` bytes from there.
    ///
    /// Returns `None` (instead of panicking) when the described range, the
    /// header, the bloom filter, the bucket array or the string table link
    /// does not fit inside `data`, or when an offset computation overflows.
    pub fn parse(data: &'a [u8], location: u32, entsize: u16, entcount: u16) -> Option<Self> {
        let alldata = data;
        let uf = |v: u32| usize::try_from(v).ok();

        let offset = uf(location)?.checked_mul(16)?;
        let length = usize::from(entsize)
            .checked_mul(usize::from(entcount))?
            .checked_mul(16)?;
        let data = alldata.get(offset..offset.checked_add(length)?)?;

        let header = HashTableHeader::parse(data)?;

        // Body layout after the 16-byte header: bloom[8b * nblf] buckets[4b * nbuckets] chains[..]
        let bloom_end = usize::from(header.nblf).checked_mul(8)?.checked_add(16)?;
        let buckets_end = uf(header.nbuckets)?.checked_mul(4)?.checked_add(bloom_end)?;

        Some(HashTableRef {
            blshift: header.blshift,
            seed: header.seed,
            // NOTE(review): `strtab_link` is used as a plain byte offset here, unlike
            // `location`, which is in units of 16 bytes -- confirm this is intended.
            strtab: alldata.get(uf(header.strtab_link)?..)?,
            bloom: data.get(16..bloom_end)?,
            buckets: data.get(bloom_end..buckets_end)?,
            chains: data.get(buckets_end..)?,
        })
    }

    /// Looks up `key` and returns the matching chain entry's `type` field and
    /// its 16-byte `value`, split into two big-endian `u64` halves.
    ///
    /// Returns `None` if the key is not present or if the table is malformed
    /// (e.g. no buckets, or a name index outside the string table).
    ///
    /// NOTE: the key is truncated after the first null byte
    pub fn lookup(&self, key: &[u8]) -> Option<(u32, u64, u64)> {
        // hash -> index conversion/transformation helper:
        // byte offset of slot `h % (items / div)`, or `None` when there are no slots
        fn htr(h: u64, items: usize, div: usize) -> Option<usize> {
            let slots = u64::try_from(items / div).ok().filter(|&n| n != 0)?;
            usize::try_from(h % slots).ok()?.checked_mul(div)
        }
        fn be_u32(bytes: &[u8]) -> Option<u32> {
            Some(u32::from_be_bytes(bytes.try_into().ok()?))
        }
        fn be_u64(bytes: &[u8]) -> Option<u64> {
            Some(u64::from_be_bytes(bytes.try_into().ok()?))
        }

        let key = trunc_key_at0(key);
        let h = xxh64(key, self.seed);

        // check bloom filter; a table without bloom words simply has no filter
        if let Some(blsel) = htr(h / 64, self.bloom.len(), 8) {
            let blword = be_u64(self.bloom.get(blsel..)?.get(..8)?)?;
            // `wrapping_shr` keeps an out-of-range `blshift` from overflowing the shift
            let second_bit = h.wrapping_shr(u32::from(self.blshift)) % 64;
            let blmask: u64 = (1 << (h % 64)) | (1 << second_bit);
            if (blword & blmask) != blmask {
                return None;
            }
        }

        // retrieve bucket/chain start index
        let bkid = htr(h, self.buckets.len(), 4)?;
        let chain_start =
            usize::try_from(be_u32(self.buckets.get(bkid..)?.get(..4)?)?).ok()?;

        // Walk the chain beginning at `chain_start`. `chunks_exact` drops a
        // truncated trailing entry, so every `sel` is exactly 32 bytes long.
        for sel in self.chains.chunks_exact(32).skip(chain_start) {
            let e_hash = be_u64(&sel[0..8])?;
            // the lowest bit of the stored hash is the "another entry follows" flag
            if (h | 1) == (e_hash | 1) {
                let e_name_ix = usize::try_from(be_u32(&sel[8..12])?).ok()?;
                let e_name = trunc_key_at0(self.strtab.get(e_name_ix..)?);
                if e_name == key {
                    return Some((
                        be_u32(&sel[12..16])?,
                        be_u64(&sel[16..24])?,
                        be_u64(&sel[24..32])?,
                    ));
                }
            }
            if (e_hash & 1) == 0 {
                break;
            }
        }
        None
    }
}

View file

@ -19,11 +19,11 @@ All integers are encoded as big-endian.
## Header
```
header := magic[4b] generator[4b] type[4b] version[4b] tstr_loc[4b] reserved[12b]
header := magic[4b] generator[4b] type[4b] version[4b]
tstr_loc[4b] entsize[2b] entcount[2b] reserved[8b]
```
The file magic at offset 0 is "YgLn" = 0x5967'4c6e. The header itself forms a section, and contains the types of the top-level sections. After the first 32 bytes follows the primary linear table.
The top level linear table has `entsize = 1`.
## File (sub)types
@ -37,43 +37,15 @@ The top level linear table has `entsize = 1`.
```
0x00000000 TT_PLAIN_TEXT (UTF-8)
0x00000001 TT_NESTED_TEXT
0x00000010 TT_STRING_TAB
0x00000020 TT_LINEAR_PLAIN_TAB
0x00000030 TT_HASH_PLAIN_TAB
0x00000031 TT_HASH_LINK_TAB
0x00000040 TT_2DHC_PLAIN_TAB
0x00000041 TT_2DHC_LINK_TAB
```
0x00000012 TT_LINEAR_PLAIN_TAB
## External link table (0x21, TT_HASH_LINK_TAB; 0x31, TT_2DHC_LINK_TAB)
0x00000020 TT_HASH_PLAIN_TAB
0x00000021 TT_HASH_LINK_TAB
A hash table or "hilbert curve" table (see corresponding sections). Used to reference external content and facilitate its lookup.
## Linear Tables
A simple list of entries. An entry containing all-zeros indicates the end of the list. The actual location decoded resides at `location << 4`, because names and sections are aligned to 16 byte boundaries. `rest` contains potentially additional data, also aligned to 16 bytes. `entsize * entcount << 4` is the length of each entry.
```
list entry := name[4b] type[4b] location[4b] entsize[2b] entcount[2b] rest[*]
```
## Hash tables
A hash table, using 64bit-xxHash. chains are traversed in order, the last bit of the first 8b of a chain entry is used to indicate if another entry follows (0 = last entry), the rest contains the hash.
```
ht header := strtab_link[8b] nbuckets[4b] nvals[4b] nblf[4b] blshift[4b]
ht body := bloom[4b * nlbf] buckets[4b * nbuckets] chains[32b * nvals]
chain entry := hash[8b] name[4b] type[4b] value[16b]
```
## 2D "hilbert curve" tables
An X-Y-indexable table, similar to the previous tables, stores "key-value" pairs, but the key this time is a 2 byte value, where the first byte is an x coordinate and the second byte is an y coordinate. The purpose is to increase locality between adjacent x values and adjacent y values. Only the first `xybits` are honored of each x and y value. the size of the table is then `16 << (2 * xybits)`.
```
2dt header := xybits[1b] reserved[15b]
2dt entry := meta[8b] location[8b]
0x00000030 TT_2DHC_PLAIN_TAB
0x00000031 TT_2DHC_LINK_TAB
```
## Nested Text
@ -96,6 +68,38 @@ nt nesting := typ[1b]!=0 subtype[2b] length[4b] elems[1b * length]
0x010004 NTT_CODE
```
## External link table (TT_*_LINK_TAB)
A hash table or alternatively "hilbert curve" table (see corresponding sections). Used to reference external content and facilitate its lookup.
## Linear Tables
A simple list of entries. An entry containing all-zeros indicates the end of the list. Because names and sections are aligned to 16 byte boundaries, `location` is stored in units of 16 bytes: the referenced data starts at byte offset `location << 4`. `rest` contains potentially additional data, also aligned to 16 bytes. `(entsize * entcount) << 4` is the length in bytes of the section an entry refers to.
```
list entry := name[4b] type[4b] location[4b] entsize[2b] entcount[2b] rest[*]
```
## Hash tables
A hash table using 64-bit xxHash. Chains are traversed in order; the lowest bit of the first 8b of a chain entry indicates whether another entry follows (0 = last entry), and the remaining bits contain the hash.
```
ht header := strtab_link[4b] nbuckets[4b] nblf[2b] blshift[1b] seed[5b]
ht body := bloom[8b * nblf] buckets[4b * nbuckets] chains[32b **]
chain entry := hash[8b] name[4b] type[4b] value[16b]
```
## 2D "hilbert curve" tables
An X-Y-indexable table which, similar to the previous tables, stores "key-value" pairs, but the key this time is a 2 byte value, where the first byte is an x coordinate and the second byte is a y coordinate. The purpose is to increase locality between adjacent x values and adjacent y values. Only the first `xybits` bits of each x and y value are honored. The size of the table is then `16 << (2 * xybits)`.
The reason this table doesn't allow more data per entry is that performance hinges heavily on the fact that entries are relatively small.
```
2dt header := xybits[1b] reserved[15b]
2dt entry := meta[8b] location[8b]
```
# Examples
## 01
@ -106,25 +110,22 @@ nt nesting := typ[1b]!=0 subtype[2b] length[4b] elems[1b * length]
magic generator type version
@ 0001
0000 0005 0000 0000 0000 0000 0000 0000
tstr_loc reserved
0000 0005 0001 0002 0000 0000 0000 0000
tstr_loc esiz ecnt reserved
@ 0002 linear table:
name type location esiz ecnt
0000 0001 0000 0010 0000 0005 0001 0001
0000 0001 0000 0010 0000 0004 0001 0001
.strtab strtab
0000 0009 0000 0001 0000 ???? 0001 0001
0000 0009 0000 0001 0000 0005 0001 0001
.text nt text
0000 0000 0000 0000 0000 0000 0000 0000
end-of-table marker
@ 0005 string table:
@ 0004 string table:
002e 7374 7274 6162 002e 7465 7874 0000
.strtab .text
@ 0006 text:
@ 0005 text:
0000 0d48 656c 6c6f 2057 6f72 6c64 210a
t len Hello World!
```