feat(core): patience diff for merges (not yet used)

This commit is contained in:
Alain Zscheile 2022-10-18 00:07:34 +02:00
parent 0de72adc10
commit 31ac540c1a
7 changed files with 877 additions and 6 deletions

1
.gitignore vendored
View file

@ -1,4 +1,5 @@
.#*
*.snap.new
/target
result
result-*

51
Cargo.lock generated
View file

@ -73,6 +73,8 @@ dependencies = [
"cap-std",
"cap-tempfile",
"fs2",
"insta",
"serde",
"tracing",
"yzb64",
"zstd",
@ -346,6 +348,20 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "insta"
version = "1.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "581d4e3314cae4536e5d22ffd23189d4a374696c5ef733eadafae0ed273fd303"
dependencies = [
"console",
"lazy_static",
"linked-hash-map",
"serde",
"similar",
"yaml-rust",
]
[[package]]
name = "int-enum"
version = "0.4.0"
@ -418,6 +434,12 @@ version = "0.2.135"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c"
[[package]]
name = "linked-hash-map"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linux-raw-sys"
version = "0.0.46"
@ -562,6 +584,26 @@ name = "serde"
version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "similar"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62ac7f900db32bf3fd12e0117dd3dc4da74bc52ebaac97f39668446d89694803"
[[package]]
name = "strsim"
@ -790,6 +832,15 @@ dependencies = [
"windows-sys",
]
[[package]]
name = "yaml-rust"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85"
dependencies = [
"linked-hash-map",
]
[[package]]
name = "yzb64"
version = "0.1.0"

View file

@ -11,3 +11,13 @@ fs2 = "0.4"
tracing = "0.1"
yzb64 = "0.1"
zstd = "0.11"
[dev-dependencies]
[dev-dependencies.serde]
version = "1.0"
features = ["derive"]
[dev-dependencies.insta]
version = "1.21"
features = ["json"]

View file

@ -1,18 +1,21 @@
use cap_std::fs::Dir;
use std::io::{Error as IoError, ErrorKind as IoEk, Read, Result as IoResult, Write};
mod traits;
pub use traits::{Event, FlowData};
mod lock;
use lock::RepoLock;
mod changelist;
pub use changelist::ChangeList;
mod lock;
use lock::RepoLock;
mod patience;
pub use patience::{patience_diff, DiffBlock};
mod thin_snapshot;
pub use thin_snapshot::ThinSnapshot;
mod traits;
pub use traits::{Event, FlowData};
pub struct Repository(pub Dir);
impl Repository {

View file

@ -0,0 +1,255 @@
use core::clone::Clone;
use core::cmp::{Eq, Ord};
use std::collections::BTreeMap;
/// Split off the shared prefix and suffix of two slices.
///
/// Returns `(head, lhs_mid, rhs_mid, tail)` where `head`/`tail` are the
/// common prefix/suffix and `lhs_mid`/`rhs_mid` are the remaining middles.
/// Cost: `O(min(|lhs|, |rhs|))` element comparisons.
fn handle_common<'i, Item: PartialEq>(
    lhs: &'i [Item],
    rhs: &'i [Item],
) -> (&'i [Item], &'i [Item], &'i [Item], &'i [Item]) {
    // length of the common prefix
    let cpfx = lhs.iter().zip(rhs).take_while(|(a, b)| a == b).count();
    let (head, lhs) = lhs.split_at(cpfx);
    let rhs = &rhs[cpfx..];
    // length of the common suffix of what remains; the prefix has already
    // been removed, so the two scans can never overlap
    let csfx = lhs
        .iter()
        .rev()
        .zip(rhs.iter().rev())
        .take_while(|(a, b)| a == b)
        .count();
    let (lhs, tail) = lhs.split_at(lhs.len() - csfx);
    let rhs = &rhs[..rhs.len() - csfx];
    (head, lhs, rhs, tail)
}
/// Cursor over a pair of positions, one per input slice.
#[derive(Default)]
struct DualPos {
    lpos: usize,
    rpos: usize,
}

impl DualPos {
    /// Advance the cursor to `(lpos, rpos)` if doing so moves backwards on
    /// neither side; returns whether the position was accepted.  Used to
    /// drop match pairs that would cross over previously accepted ones.
    fn apply(&mut self, lpos: usize, rpos: usize) -> bool {
        let monotone = lpos >= self.lpos && rpos >= self.rpos;
        if monotone {
            *self = DualPos { lpos, rpos };
        }
        monotone
    }
}
/// One run of a diff result: either material shared by both inputs, or a
/// pair of diverging segments (either side of which may be empty).
/// `T` is typically a slice borrowed from the diffed inputs.
#[derive(Clone, Copy)]
#[cfg_attr(test, derive(serde::Serialize))]
pub enum DiffBlock<T> {
    /// Segment present identically in both inputs.
    Common(T),
    /// `(lhs segment, rhs segment)` that differ between the inputs.
    Diverging(T, T),
}
/// patience diff algorithm
///
/// in some descriptions the "handling of shared prefixes and suffixes" isn't
/// mentioned, but in this case it is vitally important,
/// because such scanning usually manages to reduce the working set massively
/// and is relatively cheap with complexity O(`min(|lhs|, |rhs|)`). Let `L = |lhs|; R = |rhs|`.
///
/// overall time complexity: O(`L + R + min(L, R) * log min(L, R) `)
///
/// overall memory complexity: O(`L + R + min(L, R)`)
///
/// developed according to the ideas outlined in
/// * https://bramcohen.livejournal.com/73318.html by Bram Cohen
/// * https://alfedenzo.livejournal.com/170301.html
pub fn patience_diff<'i, Item>(lhs: &'i [Item], rhs: &'i [Item]) -> Vec<DiffBlock<&'i [Item]>>
where
    Item: Eq + Ord,
{
    let mut ret = Vec::new();
    // strip the shared prefix/suffix first; this shrinks the working set
    // cheaply before any sorting happens
    let (head, lhs, rhs, tail) = handle_common(lhs, rhs);
    if !head.is_empty() {
        ret.push(DiffBlock::Common(head));
    }
    // generate the list of (lhs index, rhs index) pairs for items that occur
    // exactly once in BOTH lists — the anchors of patience diff
    let mut comm_uniq: Vec<(usize, usize)> = {
        let mut xs = BTreeMap::<&Item, (Vec<usize>, Vec<usize>)>::new();
        for (n, i) in lhs.iter().enumerate() {
            xs.entry(i).or_default().0.push(n);
        }
        for (n, i) in rhs.iter().enumerate() {
            xs.entry(i).or_default().1.push(n);
        }
        xs.into_iter()
            // keep only items unique on both sides
            .filter_map(|(_, v)| match (&v.0[..], &v.1[..]) {
                ([x], [y]) => Some((*x, *y)),
                _ => None,
            })
            .collect()
    };
    // now eliminate crossovers: after sorting by one side, greedily keep only
    // pairs that advance monotonically on the other side too; doing this once
    // per sort order leaves a consistent (non-crossing) set of anchors.
    // `Vec::retain` filters in place and visits elements in order, so the
    // stateful `DualPos` cursor sees them exactly as the old
    // take/filter/collect round-trip did, without reallocating.
    {
        comm_uniq.sort_unstable_by_key(|i| i.0);
        let mut fav = DualPos::default();
        comm_uniq.retain(|&(x, y)| fav.apply(x, y));
    }
    {
        comm_uniq.sort_unstable_by_key(|i| i.1);
        let mut fav = DualPos::default();
        comm_uniq.retain(|&(x, y)| fav.apply(x, y));
    }
    // collate adjacent unique anchors into runs of (lhs_start, rhs_start, length);
    // relatively efficient because we already know the unique indices
    let mut comm_uniq_it = comm_uniq.into_iter();
    let comm_uniq_coll: Vec<(usize, usize, usize)> = if let Some((x, y)) = comm_uniq_it.next() {
        let mut comm_uniq_coll = vec![(x, y, 1)];
        for (x, y) in comm_uniq_it {
            let l = comm_uniq_coll.last_mut().unwrap();
            if x == l.0 + l.2 && y == l.1 + l.2 {
                // directly continues the previous run on both sides
                l.2 += 1;
            } else {
                comm_uniq_coll.push((x, y, 1));
            }
        }
        comm_uniq_coll
    } else {
        Vec::new()
    };
    // we are left with the actually matching unique parts, and arbitrary
    // material in between; emit blocks run by run
    let (mut lpos, mut rpos) = (0, 0);
    for i in comm_uniq_coll {
        // the slice up to and including the anchor run; handle_common splits
        // off any shared lead-in and the run itself (as the common suffix)
        let (lhs_part, rhs_part) = (&lhs[lpos..i.0 + i.2], &rhs[rpos..i.1 + i.2]);
        // the run has length >= 1, so both parts can never be empty at once
        assert!(!(lhs_part.is_empty() && rhs_part.is_empty()));
        let (lead, lhs_div, rhs_div, post) = handle_common(lhs_part, rhs_part);
        if !lead.is_empty() {
            ret.push(DiffBlock::Common(lead));
        }
        if !lhs_div.is_empty() || !rhs_div.is_empty() {
            ret.push(DiffBlock::Diverging(lhs_div, rhs_div));
        }
        if !post.is_empty() {
            ret.push(DiffBlock::Common(post));
        }
        lpos = i.0 + i.2;
        rpos = i.1 + i.2;
    }
    // whatever trails the last anchor run diverges
    let (lhs_part, rhs_part) = (&lhs[lpos..], &rhs[rpos..]);
    if !(lhs_part.is_empty() && rhs_part.is_empty()) {
        ret.push(DiffBlock::Diverging(lhs_part, rhs_part));
    }
    if !tail.is_empty() {
        ret.push(DiffBlock::Common(tail));
    }
    ret
}
#[cfg(test)]
mod tests {
    use super::patience_diff;

    // Smoke test on a tiny byte-string pair; the expected block structure is
    // pinned by the insta snapshot file next to this module.
    #[test]
    fn simple() {
        insta::assert_compact_json_snapshot!(patience_diff(b"ABCDEF", b"ABGGGE"));
    }

    // Character-level diff of two C sources, checked against a snapshot.
    // NOTE(review): the fixture strings below are copied from a diff view
    // that appears to strip leading whitespace and blank lines — verify
    // against the committed file and its snapshot before relying on them.
    #[test]
    // example taken from: https://alfedenzo.livejournal.com/170301.html
    fn complex_alfed() {
        let a = r#"
#include <stdio.h>
// Frobs foo heartily
int frobnitz(int foo)
{
int i;
for(i = 0; i < 10; i++)
{
printf("Your answer is: ");
printf("%d\n", foo);
}
}
int fact(int n)
{
if(n > 1)
{
return fact(n-1) * n;
}
return 1;
}
int main(int argc, char **argv)
{
frobnitz(fact(10));
}
}
}
"#;
        let b = r#"
#include <stdio.h>
int fib(int n)
{
if(n > 2)
{
return fib(n-1) + fib(n-2);
}
return 1;
}
// Frobs foo heartily
int frobnitz(int foo)
{
int i;
for(i = 0; i < 10; i++)
{
printf("%d\n", foo);
}
}
int main(int argc, char **argv)
{
frobnitz(fib(10));
}
"#;
        insta::assert_compact_json_snapshot!(patience_diff(&a.chars().collect::<Vec<_>>(), &b.chars().collect::<Vec<_>>()));
    }
}

View file

@ -0,0 +1,546 @@
---
source: crates/byr-core/src/patience.rs
expression: "patience_diff(&a.chars().collect::<Vec<_>>(), &b.chars().collect::<Vec<_>>())"
---
[
{
"Common": [
"\n",
"#",
"i",
"n",
"c",
"l",
"u",
"d",
"e",
" ",
"<",
"s",
"t",
"d",
"i",
"o",
".",
"h",
">",
"\n",
"\n"
]
},
{
"Diverging": [
[],
[
"i",
"n",
"t",
" ",
"f",
"i",
"b",
"(",
"i",
"n",
"t",
" ",
"n",
")",
"\n",
"{",
"\n",
" ",
" ",
" ",
" ",
"i",
"f",
"(",
"n",
" ",
">",
" ",
"2",
")",
"\n",
" ",
" ",
" ",
" ",
"{",
"\n",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
"r",
"e",
"t",
"u",
"r",
"n",
" ",
"f",
"i",
"b",
"(",
"n",
"-",
"1",
")",
" ",
"+",
" ",
"f",
"i",
"b",
"(",
"n",
"-",
"2",
")",
";",
"\n",
" ",
" ",
" ",
" ",
"}",
"\n",
" ",
" ",
" ",
" ",
"r",
"e",
"t",
"u",
"r",
"n",
" ",
"1",
";",
"\n",
"}",
"\n",
"\n"
]
]
},
{
"Common": [
"/",
"/",
" ",
"F"
]
},
{
"Common": [
"r",
"o",
"b",
"s",
" ",
"f",
"o",
"o",
" ",
"h",
"e",
"a",
"r",
"t",
"i",
"l",
"y"
]
},
{
"Common": [
"\n",
"i",
"n",
"t",
" ",
"f",
"r",
"o",
"b",
"n",
"i",
"t",
"z",
"(",
"i",
"n",
"t",
" ",
"f",
"o",
"o",
")",
"\n",
"{",
"\n",
" ",
" ",
" ",
" ",
"i",
"n",
"t",
" ",
"i",
";",
"\n",
" ",
" ",
" ",
" ",
"f",
"o",
"r",
"(",
"i",
" ",
"="
]
},
{
"Common": [
" ",
"0",
";",
" ",
"i",
" ",
"<"
]
},
{
"Common": [
" ",
"1",
"0",
";",
" ",
"i",
"+",
"+",
")",
"\n",
" ",
" ",
" ",
" ",
"{",
"\n",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
"p",
"r",
"i",
"n",
"t",
"f",
"(",
"\""
]
},
{
"Diverging": [
[
"Y",
"o",
"u",
"r",
" ",
"a",
"n",
"s",
"w",
"e",
"r",
" ",
"i",
"s",
":",
" ",
"\"",
")",
";",
"\n",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
"p",
"r",
"i",
"n",
"t",
"f",
"(",
"\""
],
[]
]
},
{
"Common": [
"%",
"d",
"\\"
]
},
{
"Common": [
"n",
"\"",
",",
" ",
"f",
"o",
"o",
")",
";",
"\n",
" ",
" ",
" ",
" ",
"}",
"\n",
"}",
"\n",
"\n",
"i",
"n",
"t",
" "
]
},
{
"Diverging": [
[
"f",
"a",
"c",
"t",
"(",
"i",
"n",
"t",
" ",
"n",
")",
"\n",
"{",
"\n",
" ",
" ",
" ",
" ",
"i",
"f",
"(",
"n",
" ",
">",
" ",
"1",
")",
"\n",
" ",
" ",
" ",
" ",
"{",
"\n",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
"r",
"e",
"t",
"u",
"r",
"n",
" ",
"f",
"a",
"c",
"t",
"(",
"n",
"-",
"1",
")",
" ",
"*",
" ",
"n",
";",
"\n",
" ",
" ",
" ",
" ",
"}",
"\n",
" ",
" ",
" ",
" ",
"r",
"e",
"t",
"u",
"r",
"n",
" ",
"1",
";",
"\n",
"}",
"\n",
"\n",
"i",
"n",
"t",
" "
],
[]
]
},
{
"Common": [
"m"
]
},
{
"Common": [
"a",
"i",
"n",
"(",
"i",
"n",
"t",
" ",
"a",
"r",
"g",
"c",
",",
" ",
"c",
"h",
"a",
"r",
" ",
"*",
"*",
"a",
"r",
"g",
"v"
]
},
{
"Diverging": [
[
")",
"\n",
"{",
"\n",
" ",
" ",
" ",
" ",
"f",
"r",
"o",
"b",
"n",
"i",
"t",
"z",
"(",
"f",
"a",
"c",
"t",
"(",
"1",
"0",
")",
")",
";",
"\n",
"}",
"\n",
" ",
" ",
" ",
" ",
"}"
],
[
")",
"\n",
"{",
"\n",
" ",
" ",
" ",
" ",
"f",
"r",
"o",
"b",
"n",
"i",
"t",
"z",
"(",
"f",
"i",
"b",
"(",
"1",
"0",
")",
")",
";"
]
]
},
{
"Common": [
"\n",
"}",
"\n"
]
}
]

View file

@ -0,0 +1,5 @@
---
source: crates/byr-core/src/patience.rs
expression: "patience_diff(b\"ABCDEF\", b\"ABGGGE\")"
---
[{"Common": [65, 66]}, {"Diverging": [[67, 68], [71, 71, 71]]}, {"Common": [69]}, {"Diverging": [[70], []]}]