// byr/crates/byr-core/src/patience.rs
// 2022-10-18 00:07:54 +02:00
//
// 256 lines
// 6.4 KiB
// Rust

use core::clone::Clone;
use core::cmp::{Eq, Ord};
use std::collections::BTreeMap;
/// Split off the longest shared prefix and suffix of two slices.
///
/// Returns `(prefix, lhs_middle, rhs_middle, suffix)`; the shared
/// `prefix`/`suffix` slices are borrowed from `lhs`.
/// algorithmic complexity: `O(min(|lhs|, |rhs|))`
fn handle_common<'i, Item: PartialEq>(
    lhs: &'i [Item],
    rhs: &'i [Item],
) -> (&'i [Item], &'i [Item], &'i [Item], &'i [Item]) {
    // length of the common prefix
    let pfx = lhs.iter().zip(rhs).take_while(|(a, b)| a == b).count();
    let (lmid, rmid) = (&lhs[pfx..], &rhs[pfx..]);
    // length of the common suffix of what remains after the prefix;
    // scanning the remainders (not the full inputs) keeps the two
    // regions from overlapping
    let sfx = lmid
        .iter()
        .rev()
        .zip(rmid.iter().rev())
        .take_while(|(a, b)| a == b)
        .count();
    (
        &lhs[..pfx],
        &lmid[..lmid.len() - sfx],
        &rmid[..rmid.len() - sfx],
        &lmid[lmid.len() - sfx..],
    )
}
/// Tracks the furthest accepted position in both sequences, used to
/// reject match pairs whose position jumps backwards (crossovers).
#[derive(Default)]
struct DualPos {
    // furthest accepted lhs index
    lpos: usize,
    // furthest accepted rhs index
    rpos: usize,
}
impl DualPos {
    /// Try to advance to `(lpos, rpos)`. The step is accepted only if it
    /// is non-decreasing on BOTH sides; a rejected step leaves the
    /// tracked position untouched. Returns whether the step was accepted.
    fn apply(&mut self, lpos: usize, rpos: usize) -> bool {
        let forward = lpos >= self.lpos && rpos >= self.rpos;
        if forward {
            *self = DualPos { lpos, rpos };
        }
        forward
    }
}
/// One output segment of a diff: either a run shared by both inputs or a
/// pair of diverging runs.
#[derive(Clone, Copy)]
#[cfg_attr(test, derive(serde::Serialize))]
pub enum DiffBlock<T> {
/// run present identically in both inputs
Common(T),
/// diverging runs as `(lhs part, rhs part)`; one side may be empty
Diverging(T, T),
}
/// patience diff algorithm
///
/// in some descriptions the "handling of shared prefixes and suffixes" isn't
/// mentioned, but in this case it is vitally important,
/// because such scanning usually manages to reduce the working set massively
/// and is relatively cheap with complexity O(`min(|lhs|, |rhs|)`). Let `L = |lhs|; R = |rhs|`.
///
/// overall time complexity: O(`L + R + min(L, R) * log min(L, R) `)
///
/// overall memory complexity: O(`L + R + min(L, R)`)
///
/// developed according to the ideas outlined in
/// * https://bramcohen.livejournal.com/73318.html by Bram Cohen
/// * https://alfedenzo.livejournal.com/170301.html
pub fn patience_diff<'i, Item>(lhs: &'i [Item], rhs: &'i [Item]) -> Vec<DiffBlock<&'i [Item]>>
where
    Item: Eq + Ord,
{
    let mut ret = Vec::new();
    // strip the shared prefix/suffix first; this usually shrinks the
    // working set massively before the expensive part runs
    let (head, lhs, rhs, tail) = handle_common(lhs, rhs);
    if !head.is_empty() {
        ret.push(DiffBlock::Common(head));
    }
    // generate list of items occurring exactly once in BOTH lists,
    // as (lhs_index, rhs_index) pairs
    let mut comm_uniq: Vec<(usize, usize)> = {
        let mut xs = BTreeMap::<&Item, (Vec<usize>, Vec<usize>)>::new();
        for (n, i) in lhs.iter().enumerate() {
            xs.entry(i).or_default().0.push(n);
        }
        for (n, i) in rhs.iter().enumerate() {
            xs.entry(i).or_default().1.push(n);
        }
        xs.into_iter()
            .filter_map(|(_, v)| match (&v.0[..], &v.1[..]) {
                // keep only items unique on both sides
                ([x], [y]) => Some((*x, *y)),
                _ => None,
            })
            .collect()
    };
    // now eliminate crossovers: walk the pairs in lhs order and greedily keep
    // only steps that are monotone on both sides. Because every surviving
    // index is unique, the result is strictly increasing in BOTH coordinates,
    // so a second, rhs-sorted pass would accept every pair and is omitted.
    // NOTE(review): classic patience diff computes the longest increasing
    // subsequence here; this greedy filter may keep a shorter chain, which
    // yields a correct but possibly non-minimal diff.
    comm_uniq.sort_unstable_by_key(|i| i.0);
    {
        let mut fav = DualPos::default();
        // in-place equivalent of the old take().filter().collect() dance
        comm_uniq.retain(|&(x, y)| fav.apply(x, y));
    }
    // collate adjacent matches into runs of (lhs_start, rhs_start, length);
    // relatively efficient because we already know the unique indices
    let mut comm_uniq_it = comm_uniq.into_iter();
    let comm_uniq_coll: Vec<(usize, usize, usize)> = if let Some((x, y)) = comm_uniq_it.next() {
        let mut comm_uniq_coll = vec![(x, y, 1)];
        for (x, y) in comm_uniq_it {
            let l = comm_uniq_coll.last_mut().unwrap();
            if x == l.0 + l.2 && y == l.1 + l.2 {
                // directly continues the previous run
                l.2 += 1;
            } else {
                comm_uniq_coll.push((x, y, 1));
            }
        }
        comm_uniq_coll
    } else {
        Vec::new()
    };
    // we are left with the actually matching unique parts, and random stuff
    // inbetween: for each run, diff the gap leading up to (and including) it
    let (mut lpos, mut rpos) = (0, 0);
    for i in comm_uniq_coll {
        let (lhs_part, rhs_part) = (&lhs[lpos..i.0 + i.2], &rhs[rpos..i.1 + i.2]);
        // every run has length >= 1, so both parts can never be empty at once
        assert!(!(lhs_part.is_empty() && rhs_part.is_empty()));
        let (lead, lhs_div, rhs_div, post) = handle_common(lhs_part, rhs_part);
        if !lead.is_empty() {
            ret.push(DiffBlock::Common(lead));
        }
        if !lhs_div.is_empty() || !rhs_div.is_empty() {
            ret.push(DiffBlock::Diverging(lhs_div, rhs_div));
        }
        if !post.is_empty() {
            // `post` covers at least the matched unique run itself
            ret.push(DiffBlock::Common(post));
        }
        lpos = i.0 + i.2;
        rpos = i.1 + i.2;
    }
    // handle remaining stuff after the last matched run
    let (lhs_part, rhs_part) = (&lhs[lpos..], &rhs[rpos..]);
    if !(lhs_part.is_empty() && rhs_part.is_empty()) {
        ret.push(DiffBlock::Diverging(lhs_part, rhs_part));
    }
    if !tail.is_empty() {
        ret.push(DiffBlock::Common(tail));
    }
    ret
}
#[cfg(test)]
mod tests {
use super::patience_diff;
// snapshot tests via `insta`: run `cargo insta review` to inspect/accept
// changes to the serialized DiffBlock output
#[test]
fn simple() {
insta::assert_compact_json_snapshot!(patience_diff(b"ABCDEF", b"ABGGGE"));
}
#[test]
// example taken from: https://alfedenzo.livejournal.com/170301.html
fn complex_alfed() {
// NOTE(review): fixture `a` ends with two extra `}` lines compared to the
// linked example — presumably deliberate noise for the diff; confirm before
// "fixing" it, since changing it would invalidate the snapshot.
let a = r#"
#include <stdio.h>
// Frobs foo heartily
int frobnitz(int foo)
{
int i;
for(i = 0; i < 10; i++)
{
printf("Your answer is: ");
printf("%d\n", foo);
}
}
int fact(int n)
{
if(n > 1)
{
return fact(n-1) * n;
}
return 1;
}
int main(int argc, char **argv)
{
frobnitz(fact(10));
}
}
}
"#;
let b = r#"
#include <stdio.h>
int fib(int n)
{
if(n > 2)
{
return fib(n-1) + fib(n-2);
}
return 1;
}
// Frobs foo heartily
int frobnitz(int foo)
{
int i;
for(i = 0; i < 10; i++)
{
printf("%d\n", foo);
}
}
int main(int argc, char **argv)
{
frobnitz(fib(10));
}
"#;
// diff at char granularity (the b"..." test above diffs at byte granularity)
insta::assert_compact_json_snapshot!(patience_diff(&a.chars().collect::<Vec<_>>(), &b.chars().collect::<Vec<_>>()));
}
}