more modifications, minimal rust stuff
This commit is contained in:
parent
74ce57c0ed
commit
25dd03ce25
6 changed files with 387 additions and 45 deletions
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
.#*
|
||||
/build/
|
||||
result
|
||||
result-*
|
||||
/target/
|
125
Cargo.lock
generated
Normal file
125
Cargo.lock
generated
Normal file
|
@ -0,0 +1,125 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "int-enum"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cff87d3cc4b79b4559e3c75068d64247284aceb6a038bd4bb38387f3f164476d"
|
||||
dependencies = [
|
||||
"int-enum-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "int-enum-impl"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df1f2f068675add1a3fc77f5f5ab2e29290c841ee34d151abc007bce902e5d34"
|
||||
dependencies = [
|
||||
"proc-macro-crate",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-crate"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eda0fc3b0fb7c975631757e14d9049da17374063edb6ebbcbc54d880d4fe94e9"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"thiserror",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.49"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.152"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.107"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.5.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1333c76748e868a4d9d1017b5ab53171dfd095f70c712fdb4653a406547f598f"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
|
||||
|
||||
[[package]]
|
||||
name = "xxhash-rust"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "735a71d46c4d68d71d4b24d03fdc2b98e38cea81730595801db779c04fe80d70"
|
||||
|
||||
[[package]]
|
||||
name = "yglnk-core"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"int-enum",
|
||||
"xxhash-rust",
|
||||
]
|
7
Cargo.toml
Normal file
7
Cargo.toml
Normal file
|
@ -0,0 +1,7 @@
|
|||
[workspace]
|
||||
members = ["crates/*"]
|
||||
|
||||
[profile.release]
|
||||
codegen-units = 2
|
||||
debug = true
|
||||
lto = "thin"
|
15
crates/yglnk-core/Cargo.toml
Normal file
15
crates/yglnk-core/Cargo.toml
Normal file
|
@ -0,0 +1,15 @@
|
|||
[package]
|
||||
name = "yglnk-core"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0 OR ISC"
|
||||
|
||||
[dependencies]
|
||||
|
||||
[dependencies.int-enum]
|
||||
version = "0.5"
|
||||
default-features = false
|
||||
|
||||
[dependencies.xxhash-rust]
|
||||
version = "0.8"
|
||||
features = ["xxh64"]
|
189
crates/yglnk-core/src/lib.rs
Normal file
189
crates/yglnk-core/src/lib.rs
Normal file
|
@ -0,0 +1,189 @@
|
|||
#![no_std]
|
||||
|
||||
use int_enum::IntEnum;
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
#[derive(Clone, Copy, Debug, IntEnum)]
|
||||
#[repr(u32)]
|
||||
#[rustfmt::skip]
|
||||
pub enum FileType {
|
||||
None = 0x0000_0000,
|
||||
Text = 0x0000_0001,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, IntEnum)]
|
||||
#[repr(u32)]
|
||||
#[rustfmt::skip]
|
||||
pub enum Type {
|
||||
PlainText = 0x0000_0000,
|
||||
NestedText = 0x0000_0001,
|
||||
|
||||
StringTable = 0x0000_0010,
|
||||
LinearPlain = 0x0000_0012,
|
||||
|
||||
HashPlain = 0x0000_0020,
|
||||
HashLink = 0x0000_0021,
|
||||
|
||||
X2dhcPlain = 0x0000_0030,
|
||||
X2dhcLink = 0x0000_0031,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, IntEnum)]
|
||||
#[repr(u16)]
|
||||
#[rustfmt::skip]
|
||||
pub enum Ntt01 {
|
||||
Div = 0x0000,
|
||||
Group = 0x0001,
|
||||
Header = 0x0002,
|
||||
Quote = 0x0003,
|
||||
Code = 0x0004,
|
||||
}
|
||||
|
||||
/// One decoded entry of a linear table.
///
/// The first 16 bytes of an entry hold five big-endian integers; `R` holds
/// whatever follows them inside the entry.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LinearTableEntry<R> {
    /// Bytes `0..4`. NOTE(review): presumably an offset into the string
    /// table — confirm against the format description.
    pub name: u32,
    /// Bytes `4..8`: raw section type value.
    pub typ: u32,
    /// Bytes `8..12`: location of the referenced section (the format
    /// description counts it in units of 16 bytes).
    pub location: u32,
    /// Bytes `12..14`.
    pub entsize: u16,
    /// Bytes `14..16`.
    pub entcount: u16,
    /// Everything in the entry after the fixed 16-byte prefix.
    pub rest: R,
}

/// Iterates over the entries of a linear table stored at the start of `data`.
///
/// Every entry occupies `entsize * 16` bytes and at most `entcount` entries
/// are produced. Only complete entries are decoded: if `data` ends in the
/// middle of an entry, that trailing fragment is ignored instead of causing
/// an out-of-bounds panic.
///
/// The iterator does not stop at an all-zero entry; callers that rely on such
/// an end-of-table marker have to check for it themselves.
///
/// # Panics
///
/// Panics if `entsize` is zero.
pub fn linear_table_iter(
    data: &[u8],
    entsize: u16,
    entcount: u16,
) -> impl Iterator<Item = LinearTableEntry<&'_ [u8]>> {
    assert!(entsize > 0);
    // `chunks_exact` guarantees that every chunk is `entsize * 16 >= 16`
    // bytes long, so the fixed-offset reads below can never go out of bounds.
    data.chunks_exact(usize::from(entsize) * 16)
        .take(usize::from(entcount))
        .map(|raw| {
            let (fixed, rest) = raw.split_at(16);
            LinearTableEntry {
                name: u32::from_be_bytes([fixed[0], fixed[1], fixed[2], fixed[3]]),
                typ: u32::from_be_bytes([fixed[4], fixed[5], fixed[6], fixed[7]]),
                location: u32::from_be_bytes([fixed[8], fixed[9], fixed[10], fixed[11]]),
                entsize: u16::from_be_bytes([fixed[12], fixed[13]]),
                entcount: u16::from_be_bytes([fixed[14], fixed[15]]),
                rest,
            }
        })
}
|
||||
|
||||
/// Decoded 16-byte header of a hash table section.
///
/// All multi-byte fields are stored big-endian.
#[derive(Clone, Copy, Debug)]
pub struct HashTableHeader {
    /// Bytes `0..4`: link to the string table holding the entry names.
    pub strtab_link: u32,
    /// Bytes `4..8`: number of 4-byte bucket slots.
    pub nbuckets: u32,
    /// Bytes `8..10`: number of 8-byte bloom filter words.
    pub nblf: u16,
    /// Byte `10`: shift applied to the hash to derive the second bloom bit.
    pub blshift: u8,
    /// Bytes `11..16`: 40-bit seed passed to the hash function.
    pub seed: u64,
}

/// Borrowed, read-only view of a hash table and the string table it uses.
pub struct HashTableRef<'a> {
    /// Seed passed to the hash function.
    seed: u64,
    /// String data the chain entries' name offsets index into.
    strtab: &'a [u8],
    /// Bloom filter, a sequence of 8-byte big-endian words.
    bloom: &'a [u8],
    /// Bucket array, a sequence of 4-byte big-endian chain start indices.
    buckets: &'a [u8],
    /// Chain entries, 32 bytes each.
    chains: &'a [u8],
    /// Shift applied to the hash to derive the second bloom bit.
    blshift: u8,
}

impl HashTableHeader {
    /// Decodes the header from the first 16 bytes of `data`.
    ///
    /// Returns `None` if `data` is shorter than 16 bytes.
    pub fn parse(data: &[u8]) -> Option<Self> {
        let raw = data.get(..16)?;
        // The seed occupies the five bytes 11..16 (big-endian, 40 bit).
        let seed = raw[11..16]
            .iter()
            .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte));
        Some(Self {
            strtab_link: u32::from_be_bytes([raw[0], raw[1], raw[2], raw[3]]),
            nbuckets: u32::from_be_bytes([raw[4], raw[5], raw[6], raw[7]]),
            nblf: u16::from_be_bytes([raw[8], raw[9]]),
            blshift: raw[10],
            seed,
        })
    }

    /// Number of bytes taken up by the header, the bloom filter and the
    /// bucket array, i.e. the minimum size of a table with this header (the
    /// chain entries follow after that).
    ///
    /// Bloom words are 8 bytes and bucket slots 4 bytes wide, matching the
    /// layout `HashTableRef::parse` slices the table with. The computation
    /// saturates at `usize::MAX` instead of overflowing, so a comparison
    /// against a real buffer length fails cleanly for absurd header values.
    pub fn tabsize(&self) -> usize {
        let bloom_bytes = usize::from(self.nblf).saturating_mul(8);
        let bucket_bytes = usize::try_from(self.nbuckets)
            .unwrap_or(usize::MAX)
            .saturating_mul(4);
        16usize
            .saturating_add(bloom_bytes)
            .saturating_add(bucket_bytes)
    }
}
|
||||
|
||||
/// Returns the part of `key` that precedes its first zero byte.
///
/// If `key` contains no zero byte the whole slice is returned unchanged.
pub fn trunc_key_at0(key: &[u8]) -> &[u8] {
    match key.iter().position(|&byte| byte == 0) {
        Some(nul_at) => &key[..nul_at],
        None => key,
    }
}
|
||||
|
||||
impl<'a> HashTableRef<'a> {
|
||||
/// `location` should be the offset where the hash table is present (in units of 16 bytes)
|
||||
pub fn parse(data: &'a [u8], location: u32, entsize: u16, entcount: u16) -> Option<Self> {
|
||||
let alldata = data;
|
||||
let uf = <usize as TryFrom<u32>>::try_from;
|
||||
let offset = 16 * uf(location).unwrap();
|
||||
let data = &data[offset..offset + 16 * usize::from(entsize) * usize::from(entcount)];
|
||||
|
||||
let header = HashTableHeader::parse(data)?;
|
||||
if data.len() < header.tabsize() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let bloom_end: usize = 16 + 8 * usize::from(header.nblf);
|
||||
let buckets_end = bloom_end + 4 * uf(header.nbuckets).unwrap();
|
||||
Some(HashTableRef {
|
||||
blshift: header.blshift,
|
||||
seed: header.seed,
|
||||
strtab: alldata.get(usize::try_from(header.strtab_link).ok()?..)?,
|
||||
bloom: &data[16..bloom_end],
|
||||
buckets: &data[bloom_end..buckets_end],
|
||||
chains: &data[buckets_end..],
|
||||
})
|
||||
}
|
||||
|
||||
/// NOTE: the key is truncated after the first null byte
|
||||
pub fn lookup(&self, key: &[u8]) -> Option<(u32, u64, u64)> {
|
||||
// hash -> index conversion/transformation helper
|
||||
fn htr(h: u64, items: usize, div: usize) -> usize {
|
||||
div * usize::try_from(h % u64::try_from(items / div).unwrap()).unwrap()
|
||||
}
|
||||
|
||||
let key = trunc_key_at0(key);
|
||||
let h = xxh64(key, self.seed);
|
||||
|
||||
// check bloom filter
|
||||
let blsel = htr(h / 64, self.bloom.len(), 8);
|
||||
let blword = u64::from_be_bytes(self.bloom[blsel..blsel + 8].try_into().unwrap());
|
||||
let blmask: u64 = (1 << (h % 64)) | (1 << ((h >> self.blshift) % 64));
|
||||
|
||||
if (blword & blmask) != blmask {
|
||||
return None;
|
||||
}
|
||||
|
||||
// retrieve bucket/chain start index
|
||||
let bkid = htr(h, self.buckets.len(), 4);
|
||||
let chain_start = usize::try_from(u32::from_be_bytes(
|
||||
self.buckets[bkid..bkid + 4].try_into().unwrap(),
|
||||
))
|
||||
.ok()?;
|
||||
|
||||
for sel in self.chains.chunks(32).take(chain_start) {
|
||||
let e_hash = u64::from_be_bytes(sel[0..8].try_into().unwrap());
|
||||
if (h | 1) == (e_hash | 1) {
|
||||
let e_name_ix =
|
||||
usize::try_from(u32::from_be_bytes(sel[8..12].try_into().unwrap())).ok()?;
|
||||
let e_name = trunc_key_at0(self.strtab.get(e_name_ix..)?);
|
||||
|
||||
if e_name == key {
|
||||
return Some((
|
||||
u32::from_be_bytes(sel[12..16].try_into().unwrap()),
|
||||
u64::from_be_bytes(sel[16..24].try_into().unwrap()),
|
||||
u64::from_be_bytes(sel[24..32].try_into().unwrap()),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if (e_hash & 1) == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return None;
|
||||
}
|
||||
}
|
|
@ -19,11 +19,11 @@ All integers are encoded as big-endian.
|
|||
## Header
|
||||
|
||||
```
|
||||
header := magic[4b] generator[4b] type[4b] version[4b] tstr_loc[4b] reserved[12b]
|
||||
header := magic[4b] generator[4b] type[4b] version[4b]
|
||||
tstr_loc[4b] entsize[2b] entcount[2b] reserved[8b]
|
||||
```
|
||||
|
||||
The file magic at offset 0 is "YgLn" = 0x5967'4c6e. The header itself forms a section and contains the types of the top-level sections. The primary linear table follows after the first 32 bytes.
|
||||
The top level linear table has `entsize = 1`.
|
||||
|
||||
## File (sub)types
|
||||
|
||||
|
@ -37,43 +37,15 @@ The top level linear table has `entsize = 1`.
|
|||
```
|
||||
0x00000000 TT_PLAIN_TEXT (UTF-8)
|
||||
0x00000001 TT_NESTED_TEXT
|
||||
|
||||
0x00000010 TT_STRING_TAB
|
||||
0x00000020 TT_LINEAR_PLAIN_TAB
|
||||
0x00000030 TT_HASH_PLAIN_TAB
|
||||
0x00000031 TT_HASH_LINK_TAB
|
||||
0x00000040 TT_2DHC_PLAIN_TAB
|
||||
0x00000041 TT_2DHC_LINK_TAB
|
||||
```
|
||||
0x00000012 TT_LINEAR_PLAIN_TAB
|
||||
|
||||
## External link table (0x21, TT_HASH_LINK_TAB; 0x31, TT_2DHC_LINK_TAB)
|
||||
0x00000020 TT_HASH_PLAIN_TAB
|
||||
0x00000021 TT_HASH_LINK_TAB
|
||||
|
||||
A hash table or "hilbert curve" table (see corresponding sections). Used to reference external content and facilitate its lookup.
|
||||
|
||||
## Linear Tables
|
||||
|
||||
A simple list of entries. An entry containing all-zeros indicates the end of the list. The actual location decoded resides at `location << 4`, because names and sections are aligned to 16 byte boundaries. `rest` contains potentially additional data, also aligned to 16 bytes. `entsize * entcount << 4` is the length of each entry.
|
||||
|
||||
```
|
||||
list entry := name[4b] type[4b] location[4b] entsize[2b] entcount[2b] rest[*]
|
||||
```
|
||||
|
||||
## Hash tables
|
||||
|
||||
A hash table, using 64bit-xxHash. chains are traversed in order, the last bit of the first 8b of a chain entry is used to indicate if another entry follows (0 = last entry), the rest contains the hash.
|
||||
|
||||
```
|
||||
ht header := strtab_link[8b] nbuckets[4b] nvals[4b] nblf[4b] blshift[4b]
|
||||
ht body := bloom[4b * nlbf] buckets[4b * nbuckets] chains[32b * nvals]
|
||||
chain entry := hash[8b] name[4b] type[4b] value[16b]
|
||||
```
|
||||
|
||||
## 2D "hilbert curve" tables
|
||||
|
||||
An X-Y-indexable table, similar to the previous tables, stores "key-value" pairs, but the key this time is a 2 byte value, where the first byte is an x coordinate and the second byte is an y coordinate. The purpose is to increase locality between adjacent x values and adjacent y values. Only the first `xybits` are honored of each x and y value. the size of the table is then `16 << (2 * xybits)`.
|
||||
|
||||
```
|
||||
2dt header := xybits[1b] reserved[15b]
|
||||
2dt entry := meta[8b] location[8b]
|
||||
0x00000030 TT_2DHC_PLAIN_TAB
|
||||
0x00000031 TT_2DHC_LINK_TAB
|
||||
```
|
||||
|
||||
## Nested Text
|
||||
|
@ -96,6 +68,38 @@ nt nesting := typ[1b]!=0 subtype[2b] length[4b] elems[1b * length]
|
|||
0x010004 NTT_CODE
|
||||
```
|
||||
|
||||
## External link table (TT_*_LINK_TAB)
|
||||
|
||||
A hash table or alternatively "hilbert curve" table (see corresponding sections). Used to reference external content and facilitate its lookup.
|
||||
|
||||
## Linear Tables
|
||||
|
||||
A simple list of entries. An entry containing all zeros indicates the end of the list. Names and sections are aligned to 16-byte boundaries, so the actual byte offset of a section is `location << 4`. `rest` may contain additional data, also aligned to 16 bytes. `entsize * entcount << 4` is the length in bytes of the section an entry refers to.
|
||||
|
||||
```
|
||||
list entry := name[4b] type[4b] location[4b] entsize[2b] entcount[2b] rest[*]
|
||||
```
|
||||
|
||||
## Hash tables
|
||||
|
||||
A hash table using 64-bit xxHash. Chains are traversed in order; the least significant bit of the first 8 bytes of a chain entry indicates whether another entry follows (0 = last entry), and the remaining bits contain the hash.
|
||||
|
||||
```
|
||||
ht header := strtab_link[4b] nbuckets[4b] nblf[2b] blshift[1b] seed[5b]
|
||||
ht body := bloom[8b * nblf] buckets[4b * nbuckets] chains[32b **]
|
||||
chain entry := hash[8b] name[4b] type[4b] value[16b]
|
||||
```
|
||||
|
||||
## 2D "hilbert curve" tables
|
||||
|
||||
An X-Y-indexable table that, like the previous tables, stores "key-value" pairs, but here the key is a 2-byte value whose first byte is an x coordinate and whose second byte is a y coordinate. The purpose is to increase locality between adjacent x values and adjacent y values. Only the first `xybits` bits of each x and y value are honored. The size of the table is then `16 << (2 * xybits)`.
|
||||
The reason this table doesn't allow more data per entry is that performance hinges heavily on the fact that entries are relatively small.
|
||||
|
||||
```
|
||||
2dt header := xybits[1b] reserved[15b]
|
||||
2dt entry := meta[8b] location[8b]
|
||||
```
|
||||
|
||||
# Examples
|
||||
|
||||
## 01
|
||||
|
@ -106,25 +110,22 @@ nt nesting := typ[1b]!=0 subtype[2b] length[4b] elems[1b * length]
|
|||
magic generator type version
|
||||
|
||||
@ 0001
|
||||
0000 0005 0000 0000 0000 0000 0000 0000
|
||||
tstr_loc reserved
|
||||
0000 0005 0001 0002 0000 0000 0000 0000
|
||||
tstr_loc esiz ecnt reserved
|
||||
|
||||
@ 0002 linear table:
|
||||
name type location esiz ecnt
|
||||
0000 0001 0000 0010 0000 0005 0001 0001
|
||||
0000 0001 0000 0010 0000 0004 0001 0001
|
||||
.strtab strtab
|
||||
|
||||
0000 0009 0000 0001 0000 ???? 0001 0001
|
||||
0000 0009 0000 0001 0000 0005 0001 0001
|
||||
.text nt text
|
||||
|
||||
0000 0000 0000 0000 0000 0000 0000 0000
|
||||
end-of-table marker
|
||||
|
||||
@ 0005 string table:
|
||||
@ 0004 string table:
|
||||
002e 7374 7274 6162 002e 7465 7874 0000
|
||||
.strtab .text
|
||||
|
||||
@ 0006 text:
|
||||
@ 0005 text:
|
||||
0000 0d48 656c 6c6f 2057 6f72 6c64 210a
|
||||
t len Hello World!
|
||||
```
|
||||
|
|
Loading…
Reference in a new issue