From 31ac540c1af4b26a053304c46a31f89178a86630 Mon Sep 17 00:00:00 2001 From: Alain Zscheile Date: Tue, 18 Oct 2022 00:07:34 +0200 Subject: [PATCH] feat(core): patience diff for merges (not yet used) --- .gitignore | 1 + Cargo.lock | 51 ++ crates/byr-core/Cargo.toml | 10 + crates/byr-core/src/lib.rs | 15 +- crates/byr-core/src/patience.rs | 255 ++++++++ ..._core__patience__tests__complex_alfed.snap | 546 ++++++++++++++++++ .../byr_core__patience__tests__simple.snap | 5 + 7 files changed, 877 insertions(+), 6 deletions(-) create mode 100644 crates/byr-core/src/patience.rs create mode 100644 crates/byr-core/src/snapshots/byr_core__patience__tests__complex_alfed.snap create mode 100644 crates/byr-core/src/snapshots/byr_core__patience__tests__simple.snap diff --git a/.gitignore b/.gitignore index f1ca14a..26b6e87 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .#* +*.snap.new /target result result-* diff --git a/Cargo.lock b/Cargo.lock index db3ffd7..d8ae1ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -73,6 +73,8 @@ dependencies = [ "cap-std", "cap-tempfile", "fs2", + "insta", + "serde", "tracing", "yzb64", "zstd", @@ -346,6 +348,20 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "insta" +version = "1.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581d4e3314cae4536e5d22ffd23189d4a374696c5ef733eadafae0ed273fd303" +dependencies = [ + "console", + "lazy_static", + "linked-hash-map", + "serde", + "similar", + "yaml-rust", +] + [[package]] name = "int-enum" version = "0.4.0" @@ -418,6 +434,12 @@ version = "0.2.135" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + [[package]] name = "linux-raw-sys" version = "0.0.46" @@ 
-562,6 +584,26 @@ name = "serde" version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "similar" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ac7f900db32bf3fd12e0117dd3dc4da74bc52ebaac97f39668446d89694803" [[package]] name = "strsim" @@ -790,6 +832,15 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "yaml-rust" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" +dependencies = [ + "linked-hash-map", +] + [[package]] name = "yzb64" version = "0.1.0" diff --git a/crates/byr-core/Cargo.toml b/crates/byr-core/Cargo.toml index 0ef9531..6a4d6fb 100644 --- a/crates/byr-core/Cargo.toml +++ b/crates/byr-core/Cargo.toml @@ -11,3 +11,13 @@ fs2 = "0.4" tracing = "0.1" yzb64 = "0.1" zstd = "0.11" + +[dev-dependencies] + +[dev-dependencies.serde] +version = "1.0" +features = ["derive"] + +[dev-dependencies.insta] +version = "1.21" +features = ["json"] diff --git a/crates/byr-core/src/lib.rs b/crates/byr-core/src/lib.rs index e12e376..7fb80c8 100644 --- a/crates/byr-core/src/lib.rs +++ b/crates/byr-core/src/lib.rs @@ -1,18 +1,21 @@ use cap_std::fs::Dir; use std::io::{Error as IoError, ErrorKind as IoEk, Read, Result as IoResult, Write}; -mod traits; -pub use traits::{Event, FlowData}; - -mod lock; -use lock::RepoLock; - mod changelist; pub use changelist::ChangeList; +mod lock; +use lock::RepoLock; + +mod patience; +pub use patience::{patience_diff, DiffBlock}; + mod thin_snapshot; 
use core::clone::Clone;
use core::cmp::{Eq, Ord};
use std::collections::BTreeMap;

/// Splits off the prefix and suffix shared by `lhs` and `rhs`.
///
/// Returns `(head, lhs_rest, rhs_rest, tail)` where `head` is the longest
/// common prefix, `tail` is the longest common suffix of what remains after
/// removing `head`, and `lhs_rest` / `rhs_rest` are the differing middles.
/// `head` and `tail` are borrowed from `lhs`.
///
/// algorithmic complexity: `O(min(|lhs|, |rhs|))`
fn handle_common<'i, Item: PartialEq>(
    lhs: &'i [Item],
    rhs: &'i [Item],
) -> (&'i [Item], &'i [Item], &'i [Item], &'i [Item]) {
    // length of the common prefix
    let cpfx = lhs.iter().zip(rhs).take_while(|(i, j)| i == j).count();
    let (head, lhs) = lhs.split_at(cpfx);
    let rhs = &rhs[cpfx..];

    // length of the common suffix; computed on the remainders so that it can
    // never overlap with the prefix that was already split off
    let csfx = lhs
        .iter()
        .rev()
        .zip(rhs.iter().rev())
        .take_while(|(i, j)| i == j)
        .count();
    let (lhs, tail) = lhs.split_at(lhs.len() - csfx);
    let rhs = &rhs[..rhs.len() - csfx];

    (head, lhs, rhs, tail)
}

/// A cursor into both sequences that only ever moves forward.
#[derive(Default)]
struct DualPos {
    lpos: usize,
    rpos: usize,
}

impl DualPos {
    /// Tries to advance the cursor to `(lpos, rpos)`.
    ///
    /// This keeps track of the current position in order to mark any
    /// segments where the position would jump backwards: the move is
    /// accepted (and `true` returned) only if neither coordinate decreases,
    /// otherwise the cursor is left untouched and `false` is returned.
    fn apply(&mut self, lpos: usize, rpos: usize) -> bool {
        if lpos >= self.lpos && rpos >= self.rpos {
            self.lpos = lpos;
            self.rpos = rpos;
            true
        } else {
            false
        }
    }
}

/// One block of a diff: either a part shared by both inputs, or a pair of
/// parts (left, right) in which the inputs differ.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[cfg_attr(test, derive(serde::Serialize))]
pub enum DiffBlock<T> {
    /// present in both inputs
    Common(T),
    /// `(left, right)`: the left input has the first part where the right
    /// input has the second part; either side may be empty
    Diverging(T, T),
}

/// patience diff algorithm
///
/// Returns the blocks in input order. Concatenating the `Common` parts and
/// the first (second) component of every `Diverging` part yields `lhs` (`rhs`).
///
/// in some descriptions the "handling of shared prefixes and suffixes" isn't
/// mentioned, but in this case it is vitally important,
/// because such scanning usually manages to reduce the working set massively
/// and is relatively cheap with complexity O(`min(|lhs|, |rhs|)`).
///
/// Let `L` and `R` be the lengths of `lhs` and `rhs` after that trimming, and
/// `U <= min(L, R)` the number of items that occur exactly once on both sides.
///
/// overall time complexity: O(`(L + R) * log(L + R) + U * log U`) comparisons
/// (building the ordered index dominates)
///
/// overall memory complexity: O(`L + R`)
///
/// developed according to the ideas outlined in
/// * <https://bramcohen.livejournal.com/73318.html> by Bram Cohen
/// * <https://alfedenzo.livejournal.com/170301.html>
pub fn patience_diff<'i, Item>(lhs: &'i [Item], rhs: &'i [Item]) -> Vec<DiffBlock<&'i [Item]>>
where
    Item: Eq + Ord,
{
    let mut ret = Vec::new();
    let (head, lhs, rhs, tail) = handle_common(lhs, rhs);

    if !head.is_empty() {
        ret.push(DiffBlock::Common(head));
    }

    // generate list of unique items present in both lists,
    // as (index in lhs, index in rhs)
    let mut comm_uniq: Vec<(usize, usize)> = {
        let mut xs = BTreeMap::<&Item, (Vec<usize>, Vec<usize>)>::new();
        for (n, i) in lhs.iter().enumerate() {
            xs.entry(i).or_default().0.push(n);
        }
        for (n, i) in rhs.iter().enumerate() {
            xs.entry(i).or_default().1.push(n);
        }
        xs.into_values()
            .filter_map(|(l, r)| match (&l[..], &r[..]) {
                // exactly one occurrence on each side
                ([x], [y]) => Some((*x, *y)),
                _ => None,
            })
            .collect()
    };

    // now eliminate crossovers: walk the matches in lhs order and drop every
    // match whose rhs position would jump backwards.
    // A second pass in rhs order is not needed: every index occurs at most
    // once, so the surviving list is strictly increasing in both coordinates
    // and such a pass could not remove anything.
    {
        comm_uniq.sort_unstable_by_key(|i| i.0);
        let mut fav = DualPos::default();
        comm_uniq.retain(|&(x, y)| fav.apply(x, y));
    }

    // collate common unique items present in rows
    // also relatively efficient because we already know the unique indices
    // (lhs_start, rhs_start, length)
    let mut comm_uniq_coll: Vec<(usize, usize, usize)> = Vec::new();
    for (x, y) in comm_uniq {
        if let Some(run) = comm_uniq_coll.last_mut() {
            if x == run.0 + run.2 && y == run.1 + run.2 {
                // directly continues the previous run on both sides
                run.2 += 1;
                continue;
            }
        }
        comm_uniq_coll.push((x, y, 1));
    }

    // we are left with the actually matching unique parts, and random stuff in between
    let (mut lpos, mut rpos) = (0, 0);
    for (lstart, rstart, len) in comm_uniq_coll {
        let (lend, rend) = (lstart + len, rstart + len);
        let (lhs_part, rhs_part) = (&lhs[lpos..lend], &rhs[rpos..rend]);
        // every run has length >= 1, so both parts contain at least the run itself
        assert!(!(lhs_part.is_empty() && rhs_part.is_empty()));
        // the run sits at the end of both parts, so it ends up in `post`
        // (possibly together with preceding non-unique items that also match)
        let (lead, lhs_div, rhs_div, post) = handle_common(lhs_part, rhs_part);
        if !lead.is_empty() {
            ret.push(DiffBlock::Common(lead));
        }
        if !lhs_div.is_empty() || !rhs_div.is_empty() {
            ret.push(DiffBlock::Diverging(lhs_div, rhs_div));
        }
        if !post.is_empty() {
            ret.push(DiffBlock::Common(post));
        }
        lpos = lend;
        rpos = rend;
    }

    // handle remaining stuff
    // NOTE(review): unlike the segments above, this remainder is not passed
    // through `handle_common`, so a prefix shared by both remainders is
    // reported as diverging. Changing that alters the output and therefore
    // the recorded snapshots, so it is left as is here.
    let (lhs_part, rhs_part) = (&lhs[lpos..], &rhs[rpos..]);
    if !(lhs_part.is_empty() && rhs_part.is_empty()) {
        ret.push(DiffBlock::Diverging(lhs_part, rhs_part));
    }

    if !tail.is_empty() {
        ret.push(DiffBlock::Common(tail));
    }

    ret
}

#[cfg(test)]
mod tests {
    use super::patience_diff;

    #[test]
    fn simple() {
        insta::assert_compact_json_snapshot!(patience_diff(b"ABCDEF", b"ABGGGE"));
    }

    #[test]
    // example taken from: https://alfedenzo.livejournal.com/170301.html
    fn complex_alfed() {
        let a = r#"
#include <stdio.h>

// Frobs foo heartily
int frobnitz(int foo)
{
    int i;
    for(i = 0; i < 10; i++)
    {
        printf("Your answer is: ");
        printf("%d\n", foo);
    }
}

int fact(int n)
{
    if(n > 1)
    {
        return fact(n-1) * n;
    }
    return 1;
}

int main(int argc, char **argv)
{
    frobnitz(fact(10));
}
    }
}
"#;
        let b = r#"
#include <stdio.h>

int fib(int n)
{
    if(n > 2)
    {
        return fib(n-1) + fib(n-2);
    }
    return 1;
}

// Frobs foo heartily
int frobnitz(int foo)
{
    int i;
    for(i = 0; i < 10; i++)
    {
        printf("%d\n", foo);
    }
}

int main(int argc, char **argv)
{
    frobnitz(fib(10));
}
"#;

        insta::assert_compact_json_snapshot!(patience_diff(&a.chars().collect::<Vec<_>>(), &b.chars().collect::<Vec<_>>()));
    }
}
+ "(", + "i", + " ", + "=" + ] + }, + { + "Common": [ + " ", + "0", + ";", + " ", + "i", + " ", + "<" + ] + }, + { + "Common": [ + " ", + "1", + "0", + ";", + " ", + "i", + "+", + "+", + ")", + "\n", + " ", + " ", + " ", + " ", + "{", + "\n", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "p", + "r", + "i", + "n", + "t", + "f", + "(", + "\"" + ] + }, + { + "Diverging": [ + [ + "Y", + "o", + "u", + "r", + " ", + "a", + "n", + "s", + "w", + "e", + "r", + " ", + "i", + "s", + ":", + " ", + "\"", + ")", + ";", + "\n", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "p", + "r", + "i", + "n", + "t", + "f", + "(", + "\"" + ], + [] + ] + }, + { + "Common": [ + "%", + "d", + "\\" + ] + }, + { + "Common": [ + "n", + "\"", + ",", + " ", + "f", + "o", + "o", + ")", + ";", + "\n", + " ", + " ", + " ", + " ", + "}", + "\n", + "}", + "\n", + "\n", + "i", + "n", + "t", + " " + ] + }, + { + "Diverging": [ + [ + "f", + "a", + "c", + "t", + "(", + "i", + "n", + "t", + " ", + "n", + ")", + "\n", + "{", + "\n", + " ", + " ", + " ", + " ", + "i", + "f", + "(", + "n", + " ", + ">", + " ", + "1", + ")", + "\n", + " ", + " ", + " ", + " ", + "{", + "\n", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "r", + "e", + "t", + "u", + "r", + "n", + " ", + "f", + "a", + "c", + "t", + "(", + "n", + "-", + "1", + ")", + " ", + "*", + " ", + "n", + ";", + "\n", + " ", + " ", + " ", + " ", + "}", + "\n", + " ", + " ", + " ", + " ", + "r", + "e", + "t", + "u", + "r", + "n", + " ", + "1", + ";", + "\n", + "}", + "\n", + "\n", + "i", + "n", + "t", + " " + ], + [] + ] + }, + { + "Common": [ + "m" + ] + }, + { + "Common": [ + "a", + "i", + "n", + "(", + "i", + "n", + "t", + " ", + "a", + "r", + "g", + "c", + ",", + " ", + "c", + "h", + "a", + "r", + " ", + "*", + "*", + "a", + "r", + "g", + "v" + ] + }, + { + "Diverging": [ + [ + ")", + "\n", + "{", + "\n", + " ", + " ", + " ", + " ", + "f", + "r", + "o", + "b", + "n", + "i", + "t", + "z", + "(", + "f", + "a", + "c", + "t", 
+ "(", + "1", + "0", + ")", + ")", + ";", + "\n", + "}", + "\n", + " ", + " ", + " ", + " ", + "}" + ], + [ + ")", + "\n", + "{", + "\n", + " ", + " ", + " ", + " ", + "f", + "r", + "o", + "b", + "n", + "i", + "t", + "z", + "(", + "f", + "i", + "b", + "(", + "1", + "0", + ")", + ")", + ";" + ] + ] + }, + { + "Common": [ + "\n", + "}", + "\n" + ] + } +] diff --git a/crates/byr-core/src/snapshots/byr_core__patience__tests__simple.snap b/crates/byr-core/src/snapshots/byr_core__patience__tests__simple.snap new file mode 100644 index 0000000..3615242 --- /dev/null +++ b/crates/byr-core/src/snapshots/byr_core__patience__tests__simple.snap @@ -0,0 +1,5 @@ +--- +source: crates/byr-core/src/patience.rs +expression: "patience_diff(b\"ABCDEF\", b\"ABGGGE\")" +--- +[{"Common": [65, 66]}, {"Diverging": [[67, 68], [71, 71, 71]]}, {"Common": [69]}, {"Diverging": [[70], []]}]