256 lines
6.4 KiB
Rust
256 lines
6.4 KiB
Rust
use core::clone::Clone;
|
|
use core::cmp::{Eq, Ord};
|
|
use std::collections::BTreeMap;
|
|
|
|
/// Strip any shared prefix and suffix from a pair of slices.
///
/// Returns `(prefix, lhs_mid, rhs_mid, suffix)`, where `prefix` and
/// `suffix` are the runs common to both inputs (possibly empty) and
/// `lhs_mid`/`rhs_mid` are whatever remains of each side in between.
///
/// algorithmic complexity: `O(min(|lhs|, |rhs|))`
fn handle_common<'i, Item: PartialEq>(
    mut lhs: &'i [Item],
    mut rhs: &'i [Item],
) -> (&'i [Item], &'i [Item], &'i [Item], &'i [Item]) {
    // length of the run of equal items at the front of both slices
    let prefix_len = lhs
        .iter()
        .zip(rhs.iter())
        .take_while(|(a, b)| a == b)
        .count();

    let mut head: &[Item] = &[];
    if prefix_len > 0 {
        head = &lhs[..prefix_len];
        lhs = &lhs[prefix_len..];
        rhs = &rhs[prefix_len..];
    }

    // the suffix is scanned only after the prefix has been removed,
    // so the two runs can never overlap or double-count an item
    let suffix_len = lhs
        .iter()
        .rev()
        .zip(rhs.iter().rev())
        .take_while(|(a, b)| a == b)
        .count();

    let mut tail: &[Item] = &[];
    if suffix_len > 0 {
        tail = &lhs[lhs.len() - suffix_len..];
        lhs = &lhs[..lhs.len() - suffix_len];
        rhs = &rhs[..rhs.len() - suffix_len];
    }

    (head, lhs, rhs, tail)
}
|
|
|
|
/// Monotonic cursor over a pair of index sequences, used to reject
/// "crossover" matches whose positions jump backwards on either side.
#[derive(Default)]
struct DualPos {
    lpos: usize,
    rpos: usize,
}

impl DualPos {
    /// Try to advance both tracked positions to `(lpos, rpos)`.
    ///
    /// Succeeds (returning `true`) only when neither coordinate moves
    /// backwards; on failure the stored positions are left untouched,
    /// which marks the offending segment as a crossover.
    fn apply(&mut self, lpos: usize, rpos: usize) -> bool {
        let forward = self.lpos <= lpos && self.rpos <= rpos;
        if forward {
            *self = DualPos { lpos, rpos };
        }
        forward
    }
}
|
|
|
|
/// One hunk of a diff: either a run shared by both inputs or a pair of
/// divergent runs (lhs side, rhs side; either side may be empty).
///
/// `Debug`/`PartialEq`/`Eq` are derived conditionally on `T`, so callers
/// can print and assert on diff results without extra glue.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[cfg_attr(test, derive(serde::Serialize))]
pub enum DiffBlock<T> {
    /// segment that occurs in both inputs
    Common(T),
    /// `(lhs_only, rhs_only)` — the two sides differ here
    Diverging(T, T),
}
|
|
|
|
/// patience diff algorithm
|
|
///
|
|
/// in some descriptions the "handling of shared prefixes and suffixes" isn't
|
|
/// mentioned, but in this case it is vitally important,
|
|
/// because such scanning usually manages to reduce the working set massively
|
|
/// and is relatively cheap with complexity O(`min(|lhs|, |rhs|)`). Let `L = |lhs|; R = |rhs|`.
|
|
///
|
|
/// overall time complexity: O(`L + R + min(L, R) * log min(L, R) `)
|
|
///
|
|
/// overall memory complexity: O(`L + R + min(L, R)`)
|
|
///
|
|
/// developed according to the ideas outlined in
|
|
/// * https://bramcohen.livejournal.com/73318.html by Bram Cohen
|
|
/// * https://alfedenzo.livejournal.com/170301.html
|
|
pub fn patience_diff<'i, Item>(lhs: &'i [Item], rhs: &'i [Item]) -> Vec<DiffBlock<&'i [Item]>>
|
|
where
|
|
Item: Eq + Ord,
|
|
{
|
|
let mut ret = Vec::new();
|
|
let (head, lhs, rhs, tail) = handle_common(lhs, rhs);
|
|
|
|
if !head.is_empty() {
|
|
ret.push(DiffBlock::Common(head));
|
|
}
|
|
|
|
// generate list of unique items present in both lists
|
|
let mut comm_uniq: Vec<(usize, usize)> = {
|
|
let mut xs = BTreeMap::<&Item, (Vec<usize>, Vec<usize>)>::new();
|
|
for (n, i) in lhs.iter().enumerate() {
|
|
xs.entry(i).or_default().0.push(n);
|
|
}
|
|
for (n, i) in rhs.iter().enumerate() {
|
|
xs.entry(i).or_default().1.push(n);
|
|
}
|
|
xs.into_iter()
|
|
.flat_map(|(_, v)| match (&v.0[..], &v.1[..]) {
|
|
([x], [y]) => Some((*x, *y)),
|
|
_ => None,
|
|
}).collect()
|
|
};
|
|
|
|
|
|
// now eliminate crossover's
|
|
// NOTE: replace take.filter.collect by drain_filter once stable
|
|
{
|
|
comm_uniq.sort_unstable_by_key(|i| i.0);
|
|
let mut fav = DualPos::default();
|
|
comm_uniq = core::mem::take(&mut comm_uniq)
|
|
.into_iter()
|
|
.filter(|&(x, y)| fav.apply(x, y))
|
|
.collect();
|
|
}
|
|
{
|
|
comm_uniq.sort_unstable_by_key(|i| i.1);
|
|
let mut fav = DualPos::default();
|
|
comm_uniq = core::mem::take(&mut comm_uniq)
|
|
.into_iter()
|
|
.filter(|&(x, y)| fav.apply(x, y))
|
|
.collect();
|
|
}
|
|
|
|
// collate common unique items present in rows
|
|
// also relatively efficient because we already know the unique indices
|
|
// (lhs_start, rhs_start, length)
|
|
let mut comm_uniq_it = comm_uniq.into_iter();
|
|
let comm_uniq_coll: Vec<(usize, usize, usize)> = if let Some((x, y)) = comm_uniq_it.next() {
|
|
let mut comm_uniq_coll = vec![(x, y, 1)];
|
|
for (x, y) in comm_uniq_it {
|
|
let l = comm_uniq_coll.last_mut().unwrap();
|
|
if x == l.0 + l.2 && y == l.1 + l.2 {
|
|
l.2 += 1;
|
|
} else {
|
|
comm_uniq_coll.push((x, y, 1));
|
|
}
|
|
}
|
|
comm_uniq_coll
|
|
} else {
|
|
Vec::new()
|
|
};
|
|
|
|
// we are left with the actually matching unique parts, and random stuff inbetween
|
|
let (mut lpos, mut rpos) = (0, 0);
|
|
for i in comm_uniq_coll {
|
|
let (lhs_part, rhs_part) = (&lhs[lpos..i.0 + i.2], &rhs[rpos..i.1 + i.2]);
|
|
assert!(!(lhs_part.is_empty() && rhs_part.is_empty()));
|
|
let (lead, lhs_div, rhs_div, post) = handle_common(lhs_part, rhs_part);
|
|
if !lead.is_empty() {
|
|
ret.push(DiffBlock::Common(lead));
|
|
}
|
|
if !lhs_div.is_empty() || !rhs_div.is_empty() {
|
|
ret.push(DiffBlock::Diverging(lhs_div, rhs_div));
|
|
}
|
|
if !post.is_empty() {
|
|
ret.push(DiffBlock::Common(post));
|
|
}
|
|
lpos = i.0 + i.2;
|
|
rpos = i.1 + i.2;
|
|
}
|
|
|
|
// handle remaining stuff
|
|
let (lhs_part, rhs_part) = (&lhs[lpos..], &rhs[rpos..]);
|
|
if !(lhs_part.is_empty() && rhs_part.is_empty()) {
|
|
ret.push(DiffBlock::Diverging(lhs_part, rhs_part));
|
|
}
|
|
|
|
if !tail.is_empty() {
|
|
ret.push(DiffBlock::Common(tail));
|
|
}
|
|
|
|
ret
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::patience_diff;

    // smoke test over byte slices: shared prefix "AB", diverging middle,
    // common "E", trailing lhs-only divergence — pinned via insta snapshot
    #[test]
    fn simple() {
        insta::assert_compact_json_snapshot!(patience_diff(b"ABCDEF", b"ABGGGE"));
    }

    #[test]
    // example taken from: https://alfedenzo.livejournal.com/170301.html
    fn complex_alfed() {
        // NOTE(review): the inputs are diffed per *character* (see the
        // `.chars()` collection below), not per line as in the article —
        // confirm that is intended.
        // NOTE(review): fixture `a` ends with two extra unmatched `}`
        // lines compared to the linked example — looks like a paste
        // artifact; the snapshot pins the current behavior either way.
        let a = r#"
#include <stdio.h>

// Frobs foo heartily
int frobnitz(int foo)
{
int i;
for(i = 0; i < 10; i++)
{
printf("Your answer is: ");
printf("%d\n", foo);
}
}

int fact(int n)
{
if(n > 1)
{
return fact(n-1) * n;
}
return 1;
}

int main(int argc, char **argv)
{
frobnitz(fact(10));
}
}
}
"#;
        let b = r#"
#include <stdio.h>

int fib(int n)
{
if(n > 2)
{
return fib(n-1) + fib(n-2);
}
return 1;
}

// Frobs foo heartily
int frobnitz(int foo)
{
int i;
for(i = 0; i < 10; i++)
{
printf("%d\n", foo);
}
}

int main(int argc, char **argv)
{
frobnitz(fib(10));
}
"#;

        insta::assert_compact_json_snapshot!(patience_diff(&a.chars().collect::<Vec<_>>(), &b.chars().collect::<Vec<_>>()));
    }
}
|