// byr/crates/byr-core/src/patience.rs
// 2022-10-18 00:07:54 +02:00
//
// 256 lines
// 6.4 KiB
// Rust

use core::clone::Clone;
use core::cmp::{Eq, Ord};
use std::collections::BTreeMap;
/// Split off the longest shared prefix and suffix of two slices.
///
/// Returns `(prefix, lhs_middle, rhs_middle, suffix)`; the shared
/// `prefix`/`suffix` slices are borrowed from `lhs`.
/// algorithmic complexity: `O(min(|lhs|, |rhs|))`
fn handle_common<'i, Item: PartialEq>(
    lhs: &'i [Item],
    rhs: &'i [Item],
) -> (&'i [Item], &'i [Item], &'i [Item], &'i [Item]) {
    // length of the common prefix
    let pfx = lhs.iter().zip(rhs).take_while(|(a, b)| a == b).count();
    let (lmid, rmid) = (&lhs[pfx..], &rhs[pfx..]);
    // length of the common suffix of what remains after the prefix;
    // scanning the remainders (not the full inputs) keeps the two
    // regions from overlapping
    let sfx = lmid
        .iter()
        .rev()
        .zip(rmid.iter().rev())
        .take_while(|(a, b)| a == b)
        .count();
    (
        &lhs[..pfx],
        &lmid[..lmid.len() - sfx],
        &rmid[..rmid.len() - sfx],
        &lmid[lmid.len() - sfx..],
    )
}
/// Tracks the furthest accepted position in both sequences, used to
/// reject match pairs whose position jumps backwards (crossovers).
#[derive(Default)]
struct DualPos {
    // furthest accepted lhs index
    lpos: usize,
    // furthest accepted rhs index
    rpos: usize,
}
impl DualPos {
    /// Try to advance to `(lpos, rpos)`. The step is accepted only if it
    /// is non-decreasing on BOTH sides; a rejected step leaves the
    /// tracked position untouched. Returns whether the step was accepted.
    fn apply(&mut self, lpos: usize, rpos: usize) -> bool {
        let forward = lpos >= self.lpos && rpos >= self.rpos;
        if forward {
            *self = DualPos { lpos, rpos };
        }
        forward
    }
}
/// One output segment of a diff: either a run shared by both inputs or a
/// pair of diverging runs.
#[derive(Clone, Copy)]
#[cfg_attr(test, derive(serde::Serialize))]
pub enum DiffBlock<T> {
/// run present identically in both inputs
Common(T),
/// diverging runs as `(lhs part, rhs part)`; one side may be empty
Diverging(T, T),
}
/// patience diff algorithm
///
/// in some descriptions the "handling of shared prefixes and suffixes" isn't
/// mentioned, but in this case it is vitally important,
/// because such scanning usually manages to reduce the working set massively
/// and is relatively cheap with complexity O(`min(|lhs|, |rhs|)`). Let `L = |lhs|; R = |rhs|`.
///
/// overall time complexity: O(`L + R + min(L, R) * log min(L, R) `)
///
/// overall memory complexity: O(`L + R + min(L, R)`)
///
/// developed according to the ideas outlined in
/// * https://bramcohen.livejournal.com/73318.html by Bram Cohen
/// * https://alfedenzo.livejournal.com/170301.html
pub fn patience_diff<'i, Item>(lhs: &'i [Item], rhs: &'i [Item]) -> Vec<DiffBlock<&'i [Item]>>
where
    Item: Eq + Ord,
{
    let mut ret = Vec::new();
    // strip the shared prefix/suffix first; this usually shrinks the
    // working set massively before the expensive part runs
    let (head, lhs, rhs, tail) = handle_common(lhs, rhs);
    if !head.is_empty() {
        ret.push(DiffBlock::Common(head));
    }
    // generate list of items occurring exactly once in BOTH lists,
    // as (lhs_index, rhs_index) pairs
    let mut comm_uniq: Vec<(usize, usize)> = {
        let mut xs = BTreeMap::<&Item, (Vec<usize>, Vec<usize>)>::new();
        for (n, i) in lhs.iter().enumerate() {
            xs.entry(i).or_default().0.push(n);
        }
        for (n, i) in rhs.iter().enumerate() {
            xs.entry(i).or_default().1.push(n);
        }
        xs.into_iter()
            .filter_map(|(_, v)| match (&v.0[..], &v.1[..]) {
                // keep only items unique on both sides
                ([x], [y]) => Some((*x, *y)),
                _ => None,
            })
            .collect()
    };
    // now eliminate crossovers: walk the pairs in lhs order and greedily keep
    // only steps that are monotone on both sides. Because every surviving
    // index is unique, the result is strictly increasing in BOTH coordinates,
    // so a second, rhs-sorted pass would accept every pair and is omitted.
    // NOTE(review): classic patience diff computes the longest increasing
    // subsequence here; this greedy filter may keep a shorter chain, which
    // yields a correct but possibly non-minimal diff.
    comm_uniq.sort_unstable_by_key(|i| i.0);
    {
        let mut fav = DualPos::default();
        // in-place equivalent of the old take().filter().collect() dance
        comm_uniq.retain(|&(x, y)| fav.apply(x, y));
    }
    // collate adjacent matches into runs of (lhs_start, rhs_start, length);
    // relatively efficient because we already know the unique indices
    let mut comm_uniq_it = comm_uniq.into_iter();
    let comm_uniq_coll: Vec<(usize, usize, usize)> = if let Some((x, y)) = comm_uniq_it.next() {
        let mut comm_uniq_coll = vec![(x, y, 1)];
        for (x, y) in comm_uniq_it {
            let l = comm_uniq_coll.last_mut().unwrap();
            if x == l.0 + l.2 && y == l.1 + l.2 {
                // directly continues the previous run
                l.2 += 1;
            } else {
                comm_uniq_coll.push((x, y, 1));
            }
        }
        comm_uniq_coll
    } else {
        Vec::new()
    };
    // we are left with the actually matching unique parts, and random stuff
    // inbetween: for each run, diff the gap leading up to (and including) it
    let (mut lpos, mut rpos) = (0, 0);
    for i in comm_uniq_coll {
        let (lhs_part, rhs_part) = (&lhs[lpos..i.0 + i.2], &rhs[rpos..i.1 + i.2]);
        // every run has length >= 1, so both parts can never be empty at once
        assert!(!(lhs_part.is_empty() && rhs_part.is_empty()));
        let (lead, lhs_div, rhs_div, post) = handle_common(lhs_part, rhs_part);
        if !lead.is_empty() {
            ret.push(DiffBlock::Common(lead));
        }
        if !lhs_div.is_empty() || !rhs_div.is_empty() {
            ret.push(DiffBlock::Diverging(lhs_div, rhs_div));
        }
        if !post.is_empty() {
            // `post` covers at least the matched unique run itself
            ret.push(DiffBlock::Common(post));
        }
        lpos = i.0 + i.2;
        rpos = i.1 + i.2;
    }
    // handle remaining stuff after the last matched run
    let (lhs_part, rhs_part) = (&lhs[lpos..], &rhs[rpos..]);
    if !(lhs_part.is_empty() && rhs_part.is_empty()) {
        ret.push(DiffBlock::Diverging(lhs_part, rhs_part));
    }
    if !tail.is_empty() {
        ret.push(DiffBlock::Common(tail));
    }
    ret
}
#[cfg(test)]
mod tests {
use super::patience_diff;
// snapshot tests via `insta`: run `cargo insta review` to inspect/accept
// changes to the serialized DiffBlock output
#[test]
fn simple() {
insta::assert_compact_json_snapshot!(patience_diff(b"ABCDEF", b"ABGGGE"));
}
#[test]
// example taken from: https://alfedenzo.livejournal.com/170301.html
fn complex_alfed() {
// NOTE(review): fixture `a` ends with two extra `}` lines compared to the
// linked example — presumably deliberate noise for the diff; confirm before
// "fixing" it, since changing it would invalidate the snapshot.
let a = r#"
#include <stdio.h>
// Frobs foo heartily
int frobnitz(int foo)
{
int i;
for(i = 0; i < 10; i++)
{
printf("Your answer is: ");
printf("%d\n", foo);
}
}
int fact(int n)
{
if(n > 1)
{
return fact(n-1) * n;
}
return 1;
}
int main(int argc, char **argv)
{
frobnitz(fact(10));
}
}
}
"#;
let b = r#"
#include <stdio.h>
int fib(int n)
{
if(n > 2)
{
return fib(n-1) + fib(n-2);
}
return 1;
}
// Frobs foo heartily
int frobnitz(int foo)
{
int i;
for(i = 0; i < 10; i++)
{
printf("%d\n", foo);
}
}
int main(int argc, char **argv)
{
frobnitz(fib(10));
}
"#;
// diff at char granularity (the b"..." test above diffs at byte granularity)
insta::assert_compact_json_snapshot!(patience_diff(&a.chars().collect::<Vec<_>>(), &b.chars().collect::<Vec<_>>()));
}
}