diff --git a/Cargo.lock b/Cargo.lock index b593fa99e29..bd526b0380e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1519,6 +1519,7 @@ dependencies = [ name = "gix-diff" version = "0.37.0" dependencies = [ + "bstr", "document-features", "getrandom", "gix-hash 0.13.1", diff --git a/gix-diff/Cargo.toml b/gix-diff/Cargo.toml index b51eaaaa7da..589b3c34b2d 100644 --- a/gix-diff/Cargo.toml +++ b/gix-diff/Cargo.toml @@ -12,7 +12,7 @@ autotests = false [features] default = ["blob"] -## Enable diffing of blobs using imara-diff. +## Enable diffing of blobs using imara-diff, which also allows for a generic rewrite tracking implementation. blob = ["dep:imara-diff"] ## Data structures implement `serde::Serialize` and `serde::Deserialize`. serde = ["dep:serde", "gix-hash/serde", "gix-object/serde"] @@ -25,10 +25,12 @@ doctest = false [dependencies] gix-hash = { version = "^0.13.1", path = "../gix-hash" } gix-object = { version = "^0.38.0", path = "../gix-object" } + thiserror = "1.0.32" imara-diff = { version = "0.1.3", optional = true } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"]} getrandom = { version = "0.2.8", optional = true, default-features = false, features = ["js"] } +bstr = { version = "1.5.0", default-features = false } document-features = { version = "0.2.0", optional = true } diff --git a/gix-diff/src/blob.rs b/gix-diff/src/blob.rs index 27c1a131724..7b2a082bd1e 100644 --- a/gix-diff/src/blob.rs +++ b/gix-diff/src/blob.rs @@ -1,3 +1,18 @@ //! For using text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff), //! maintained by [Pascal Kuthe](https://github.com/pascalkuthe). +//! +//! +/// Information about the diff performed to detect similarity. +#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] +pub struct DiffLineStats { + /// The amount of lines to remove from the source to get to the destination. 
+ pub removals: u32, + /// The amount of lines to add to the source to get to the destination. + pub insertions: u32, + /// The amount of lines of the previous state, in the source. + pub before: u32, + /// The amount of lines of the new state, in the destination. + pub after: u32, +} + pub use imara_diff::*; diff --git a/gix-diff/src/lib.rs b/gix-diff/src/lib.rs index 6d94a75919f..b3a61b2b97b 100644 --- a/gix-diff/src/lib.rs +++ b/gix-diff/src/lib.rs @@ -8,6 +8,34 @@ cfg_attr(doc, doc = ::document_features::document_features!()) #![deny(missing_docs, rust_2018_idioms)] #![forbid(unsafe_code)] +/// A structure to capture how to perform rename and copy tracking, used by the [rewrites::Tracker]. +#[derive(Debug, Copy, Clone, PartialEq)] +#[cfg(feature = "blob")] +pub struct Rewrites { + /// If `Some(…)`, also find copies. `None` is the default which does not try to detect copies at all. + /// + /// Note that this is an even more expensive operation than detecting renames stemming from additions and deletions + /// as the resulting set to search through is usually larger. + pub copies: Option, + /// The similarity needed for files to be considered renamed, as a fraction where `1.0` means identical, defaulting to `Some(0.5)`. + /// This field is similar to `git diff -M50%`. + /// + /// If `None`, files are only considered equal if their content matches 100%. + /// Note that values greater than 1.0 have the same effect as 1.0, i.e. only exact matches are considered. + pub percentage: Option, + /// The amount of files to consider for fuzzy rename or copy tracking. Defaults to 1000, meaning that only 1000*1000 + /// combinations can be tested for fuzzy matches, i.e. the ones that try to find matches by comparing similarity. + /// If 0, there is no limit. + /// + /// If the limit would not be enough to test the entire set of combinations, the algorithm will trade in precision and not + /// run the fuzzy version of identity tests at all. That way results are never partial.
+ pub limit: usize, +} + +/// Contains a [Tracker](rewrites::Tracker) to detect rewrites. +#[cfg(feature = "blob")] +pub mod rewrites; + /// pub mod tree; diff --git a/gix-diff/src/rewrites/mod.rs b/gix-diff/src/rewrites/mod.rs new file mode 100644 index 00000000000..8af13165f6f --- /dev/null +++ b/gix-diff/src/rewrites/mod.rs @@ -0,0 +1,77 @@ +use crate::Rewrites; + +/// Types related to the rename tracker for renames, rewrites and copies. +pub mod tracker; + +/// A type to retain state related to an ongoing tracking operation: it collects the changes that are of interest +/// so that, at a later stage, the ones that seem to be renames or copies can be computed from them. +pub struct Tracker { + /// The tracked items thus far, which will be used to determine renames/copies and rewrites later. + items: Vec>, + /// A single buffer holding the paths of all items back to back, to reduce the amount of allocations. + path_backing: Vec, + /// A buffer for use when fetching objects for similarity tests. + buf1: Vec, + /// Another buffer for use when fetching objects for similarity tests. + buf2: Vec, + /// How to track copies and/or rewrites. + rewrites: Rewrites, + /// The diff algorithm to use when checking for similarity. + diff_algo: crate::blob::Algorithm, +} + +/// Determine in which set of files to search for copies. +#[derive(Default, Debug, Copy, Clone, Eq, PartialEq)] +pub enum CopySource { + /// Find copies from the set of modified files only. + #[default] + FromSetOfModifiedFiles, + /// Find copies from the set of modified files, as well as all files known to the source (i.e. previous state of the tree). + /// + /// This can be an expensive operation as, in the worst case, each added file is compared to each file in the set, so the work grows with the product of both counts. + FromSetOfModifiedFilesAndAllSources, +} + +/// Under which circumstances we consider a file to be a copy. +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct Copies { + /// The set of files to search when finding the source of copies.
+ pub source: CopySource, + /// Equivalent to [`Rewrites::percentage`], but used for copy tracking. + /// + /// Being a separate value, it allows to combine similarity-based rename tracking with cheaper, exact copy tracking. + pub percentage: Option, +} + +impl Default for Copies { + fn default() -> Self { + Copies { + source: CopySource::default(), + percentage: Some(0.5), + } + } +} + +/// Information collected while handling rewrites of files which may be tracked. +#[derive(Default, Clone, Copy, Debug, PartialEq)] +pub struct Outcome { + /// The options used to guide the rewrite tracking. Either fully provided by the caller or retrieved from git configuration. + pub options: Rewrites, + /// The amount of similarity checks, i.e. blob diffs, that have been conducted to find renamed files and potentially copies. + pub num_similarity_checks: usize, + /// Set to the amount of worst-case rename permutations (sources times destinations) we didn't search as our limit didn't allow it. + pub num_similarity_checks_skipped_for_rename_tracking_due_to_limit: usize, + /// Set to the amount of worst-case copy permutations (sources times destinations) we didn't search as our limit didn't allow it. + pub num_similarity_checks_skipped_for_copy_tracking_due_to_limit: usize, +} + +/// The default settings for rewrites according to the git configuration defaults. +impl Default for Rewrites { + fn default() -> Self { + Rewrites { + copies: None, + percentage: Some(0.5), + limit: 1000, + } + } +} diff --git a/gix-diff/src/rewrites/tracker.rs b/gix-diff/src/rewrites/tracker.rs new file mode 100644 index 00000000000..09d3c724608 --- /dev/null +++ b/gix-diff/src/rewrites/tracker.rs @@ -0,0 +1,488 @@ +use std::ops::Range; + +use gix_object::tree::{EntryKind, EntryMode}; + +use crate::blob::DiffLineStats; +use crate::rewrites::{CopySource, Outcome}; +use crate::{rewrites::Tracker, Rewrites}; +use bstr::BStr; +use gix_object::FindExt; + +/// The kind of a change. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)] +pub enum ChangeKind { + /// The change represents the *deletion* of an item.
+ Deletion, + /// The change represents the *modification* of an item. + Modification, + /// The change represents the *addition* of an item. + Addition, +} + +/// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`]. +pub trait Change: Clone { + /// Return the hash of this change for identification. + fn id(&self) -> &gix_hash::oid; + /// Return the kind of this change. + fn kind(&self) -> ChangeKind; + /// Return more information about the kind of entry affected by this change. + fn entry_mode(&self) -> EntryMode; + /// Return the id of the change along with its mode. + fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode); +} + +/// A tracked change along with the state needed to relate it to other items, by identity or by similarity. +pub(crate) struct Item { + /// The underlying raw change. + change: T, + /// The range into the backing for paths that holds the location of this item. + path: Range, + /// If true, this item was already emitted, i.e. seen by the caller. + emitted: bool, +} + +impl Item { + fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr { // resolve the stored range against `backing` to obtain the path + backing[self.path.clone()].as_ref() + } + fn entry_mode_compatible(&self, mode: EntryMode) -> bool { // blobs (executable or not) pair with blobs, links only with links + use EntryKind::*; + matches!( + (mode.kind(), self.change.entry_mode().kind()), + (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) + ) + } + + fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool { // rename sources: not yet emitted deletions; copy sources: modifications, emitted or not + self.entry_mode_compatible(dest_item_mode) + && match kind { + visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion), + visit::SourceKind::Copy => { + matches!(self.change.kind(), ChangeKind::Modification) + } + } + } +} + +/// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()). +pub mod visit { + use crate::blob::DiffLineStats; + use bstr::BStr; + use gix_object::tree::EntryMode; + + /// The source of a rewrite, rename or copy.
+ pub struct Source<'a> { + /// The kind of entry. + pub entry_mode: EntryMode, + /// The hash of the state of the source as seen in the object database. + pub id: gix_hash::ObjectId, + /// Further specify what kind of source this is. + pub kind: SourceKind, + /// The repository-relative location of this entry. + pub location: &'a BStr, + /// Set if the source was found by similarity rather than by identity: the line statistics of the diff that turns this source into the destination. + pub diff: Option, + } + + /// Further identify the kind of [Source]. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum SourceKind { + /// This is the source of an entry that was renamed, as `source` was renamed to `destination`. + Rename, + /// This is the source of a copy, as `source` was copied into `destination`. + Copy, + } + + /// A change along with a location. + pub struct Destination<'a, T> { + /// The change at the given `location`. + pub change: T, + /// The repository-relative location of this destination. + pub location: &'a BStr, + } +} + +/// +pub mod emit { + /// The error returned by [Tracker::emit()](super::Tracker::emit()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Could not find blob for similarity checking")] + FindExistingBlob(#[from] gix_object::find::existing_object::Error), + #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")] + GetItemsForExhaustiveCopyDetection(#[source] Box), + } +} + +/// Lifecycle +impl Tracker { + /// Create a new instance with `rewrites` configuration, and the `diff_algo` to use when performing + /// similarity checking. + pub fn new(rewrites: Rewrites, diff_algo: crate::blob::Algorithm) -> Self { + Tracker { + items: vec![], + path_backing: vec![], + buf1: Vec::new(), + buf2: Vec::new(), + rewrites, + diff_algo, + } + } +} + +/// Build state and find matches. +impl Tracker { + /// Track `change` at `location` and return `None`, or refuse the push and hand `change` back as `Some(change)` if that information isn't needed for what we have to track.
+ pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option { + if !change.entry_mode().is_blob_or_symlink() { + return Some(change); + } + let keep = match (self.rewrites.copies, change.kind()) { + (Some(_find_copies), _) => true, + (None, ChangeKind::Modification { .. }) => false, // modifications only ever serve as sources of copies + (None, _) => true, + }; + + if !keep { + return Some(change); + } + + let start = self.path_backing.len(); + self.path_backing.extend_from_slice(location); + self.items.push(Item { + path: start..self.path_backing.len(), + change, + emitted: false, + }); + None + } + + /// Can only be called once effectively as it alters its own state. + /// + /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's + /// the destination of a copy or rename, or with `None` for source if no relation to other + /// items in the tracked set exists. + /// + /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database. + /// Worktree filters and diff conversions are not applied to that data yet (see the TODO in `find_match()`). + /// + /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source + /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push` + /// will panic if `change` is refused by `try_push_change()`, i.e. if its entry is neither a blob nor a symlink, and it's valid to not call `push` at all.
+ pub fn emit( + &mut self, + mut cb: impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, + objects: &dyn gix_object::Find, + mut push_source_tree: PushSourceTreeFn, + ) -> Result + where + PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>, + E: std::error::Error + Send + Sync + 'static, + { + fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { // ties between equal ids are broken by the offsets of the path in the backing, not by the path itself + a.change + .id() + .cmp(b.change.id()) + .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end))) + } + self.items.sort_by(by_id_and_location); // `find_match()` looks up identical ids with `partition_point()`, which needs this order + + let mut out = Outcome { + options: self.rewrites, + ..Default::default() + }; + out = self.match_pairs_of_kind( // renames first: pair additions with deletions + visit::SourceKind::Rename, + &mut cb, + self.rewrites.percentage, + out, + objects, + )?; + + if let Some(copies) = self.rewrites.copies { + out = self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + + match copies.source { + CopySource::FromSetOfModifiedFiles => {} + CopySource::FromSetOfModifiedFilesAndAllSources => { + push_source_tree(&mut |change, location| { + assert!( + self.try_push_change(change, location).is_none(), + "we must accept every change" + ); + // make sure these aren't viable to be emitted anymore.
+ self.items.last_mut().expect("just pushed").emitted = true; + }) + .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?; + self.items.sort_by(by_id_and_location); // restore the order by id after items were appended + + out = + self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + } + } + } + + self.items + .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing))); // whatever is left is passed to `cb` without a source, ordered by path + for item in self.items.drain(..).filter(|item| !item.emitted) { + if cb( + visit::Destination { + location: item.location(&self.path_backing), + change: item.change, + }, + None, + ) == crate::tree::visit::Action::Cancel + { + break; + } + } + Ok(out) + } +} + +impl Tracker { + fn match_pairs_of_kind( + &mut self, + kind: visit::SourceKind, + cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, + percentage: Option, + mut out: Outcome, + objects: &dyn gix_object::Find, + ) -> Result { + // We try to cheaply reduce the set of possibilities first by matching identical ids, before possibly looking more exhaustively by similarity. + let needs_second_pass = !needs_exact_match(percentage); + if self.match_pairs(cb, None /* by identity */, kind, &mut out, objects)?
== crate::tree::visit::Action::Cancel + { + return Ok(out); + } + if needs_second_pass { + let is_limited = if self.rewrites.limit == 0 { // a limit of 0 disables the limit + false + } else if let Some(permutations) = permutations_over_limit(&self.items, self.rewrites.limit, kind) { + match kind { + visit::SourceKind::Rename => { + out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; + } + visit::SourceKind::Copy => { + out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; + } + } + true + } else { + false + }; + if !is_limited { + self.match_pairs(cb, percentage, kind, &mut out, objects)?; // NOTE(review): the returned `Action` is dropped, so a `Cancel` during the fuzzy pass isn't acted upon here - confirm that's intended + } + } + Ok(out) + } + + fn match_pairs( + &mut self, + cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, + percentage: Option, + kind: visit::SourceKind, + stats: &mut Outcome, + objects: &dyn gix_object::Find, + ) -> Result { + // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()` + let mut dest_ofs = 0; + while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { + (!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item)) + }) { + dest_idx += dest_ofs; // turn the index into the sub-slice back into one into `self.items` + dest_ofs = dest_idx + 1; + let src = find_match( + &self.items, + dest, + dest_idx, + percentage.map(|p| (p, self.diff_algo)), + kind, + stats, + objects, + &mut self.buf1, + &mut self.buf2, + )?
+ .map(|(src_idx, src, diff)| { + let (id, entry_mode) = src.change.id_and_entry_mode(); + let id = id.to_owned(); + let location = src.location(&self.path_backing); + ( + visit::Source { + entry_mode, + id, + kind, + location, + diff, + }, + src_idx, + ) + }); + if src.is_none() { + continue; + } + let location = dest.location(&self.path_backing); + let change = dest.change.clone(); + let dest = visit::Destination { change, location }; + self.items[dest_idx].emitted = true; + if let Some(src_idx) = src.as_ref().map(|t| t.1) { + self.items[src_idx].emitted = true; + } + if cb(dest, src.map(|t| t.0)) == crate::tree::visit::Action::Cancel { + return Ok(crate::tree::visit::Action::Cancel); + } + } + Ok(crate::tree::visit::Action::Continue) + } +} + +fn permutations_over_limit(items: &[Item], limit: usize, kind: visit::SourceKind) -> Option { // `Some(sources * destinations)` if that product exceeds `limit * limit`, `None` otherwise + let (sources, destinations) = items + .iter() + .filter(|item| match kind { + visit::SourceKind::Rename => !item.emitted, + visit::SourceKind::Copy => true, + }) + .fold((0usize, 0usize), |(mut src, mut dest), item| { + match item.change.kind() { + ChangeKind::Addition => { + dest += 1; + } + ChangeKind::Deletion => { + if kind == visit::SourceKind::Rename { + src += 1 + } + } + ChangeKind::Modification => { + if kind == visit::SourceKind::Copy { + src += 1 + } + } + } + (src, dest) + }); + let permutations = sources.saturating_mul(destinations); // saturate: `limit` is user-provided, and overflow would panic in debug builds or wrap in release builds + (permutations > limit.saturating_mul(limit)).then_some(permutations) +} + +fn needs_exact_match(percentage: Option) -> bool { // no percentage, or one of 100% and more, leaves identity as the only possible match + percentage.map_or(true, |p| p >= 1.0) +} + +/// A match as `(src_idx, src, diff stats if found by similarity)`. +type SourceTuple<'a, T> = (usize, &'a Item, Option); + +/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`. +/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity. +/// We also ignore emitted items entirely.
+/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition (renames), or +/// modifications otherwise (copies). +/// Note that the caller always tries to find by identity first even if a percentage is given as it's much faster and may reduce the set +/// of items to be searched. +#[allow(clippy::too_many_arguments)] +fn find_match<'a, T: Change>( + items: &'a [Item], + item: &Item, + item_idx: usize, + percentage: Option<(f32, crate::blob::Algorithm)>, + kind: visit::SourceKind, + stats: &mut Outcome, + objects: &dyn gix_object::Find, + buf1: &mut Vec, + buf2: &mut Vec, +) -> Result>, emit::Error> { + let (item_id, item_mode) = item.change.id_and_entry_mode(); + if needs_exact_match(percentage.map(|t| t.0)) || item_mode.is_link() { + let first_idx = items.partition_point(|a| a.change.id() < item_id); // `items` is sorted by id, so items with the same id form one contiguous run + let range = match items.get(first_idx..).map(|candidates| { + let end = candidates + .iter() + .position(|a| a.change.id() != item_id) + .map_or(items.len(), |idx| first_idx + idx); // without a different id following, the run ends with the *entire* slice, not with the length of the sub-slice + first_idx..end + }) { + Some(range) => range, + None => return Ok(None), + }; + if range.is_empty() { + return Ok(None); + } + let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| { + src_idx += range.start; + (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None)) + }); + if let Some(src) = res { + return Ok(Some(src)); + } + } else { + let new = objects.find_blob(item_id, buf1)?; + let (percentage, algo) = percentage.expect("it's set to something below 1.0 and we assured this"); + debug_assert!( + matches!(item.change.entry_mode().kind(), EntryKind::Blob | EntryKind::BlobExecutable), + // executable blobs get here as well, only links are handled by the branch above + "symlinks are matched exactly, and trees aren't used here" + ); + for (can_idx, src) in items + .iter() + .enumerate() + .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) + { + let old = objects.find_blob(src.change.id(), buf2)?; + // TODO: make sure we get attribute handling/worktree
conversion and binary skips and filters right here. + let tokens = crate::blob::intern::InternedInput::new( + crate::blob::sources::byte_lines_with_terminator(old.data), + crate::blob::sources::byte_lines_with_terminator(new.data), + ); + let counts = crate::blob::diff( + algo, + &tokens, + crate::blob::sink::Counter::new(diff::Statistics { + removed_bytes: 0, + input: &tokens, + }), + ); + let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32; // the share of bytes of `old` that were kept, relative to the larger of both blobs + stats.num_similarity_checks += 1; + if similarity >= percentage { + return Ok(Some(( + can_idx, + src, + DiffLineStats { + removals: counts.removals, + insertions: counts.insertions, + before: tokens.before.len().try_into().expect("interner handles only u32"), + after: tokens.after.len().try_into().expect("interner handles only u32"), + } + .into(), + ))); + } + } + } + Ok(None) +} + +mod diff { + use std::ops::Range; + + pub struct Statistics<'a, 'data> { // a sink which sums up the bytes of all tokens (lines) removed from the `before` side + pub removed_bytes: usize, + pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>, + } + + impl<'a, 'data> crate::blob::Sink for Statistics<'a, 'data> { + type Out = usize; + + fn process_change(&mut self, before: Range, _after: Range) { // called once per changed hunk, hence the bytes must be accumulated + self.removed_bytes += self.input.before[before.start as usize..before.end as usize] + .iter() + .map(|token| self.input.interner[*token].len()) + .sum::<usize>(); + } + + fn finish(self) -> Self::Out { + self.removed_bytes + } + } +} diff --git a/gix-diff/src/tree/visit.rs b/gix-diff/src/tree/visit.rs index 82e38931dc2..c279ed90888 100644 --- a/gix-diff/src/tree/visit.rs +++ b/gix-diff/src/tree/visit.rs @@ -92,6 +92,46 @@ pub trait Visit { fn visit(&mut self, change: Change) -> Action; } +#[cfg(feature = "blob")] +mod change_impls { + use crate::rewrites::tracker::ChangeKind; + use crate::tree::visit::Change; + use gix_hash::oid; + use gix_object::tree::EntryMode; + + impl crate::rewrites::tracker::Change for crate::tree::visit::Change { + fn id(&self) -> &oid { + match self { +
Change::Addition { oid, .. } | Change::Deletion { oid, .. } | Change::Modification { oid, .. } => oid, + } + } + + fn kind(&self) -> ChangeKind { // each variant of the tree change maps to the tracker's kind of the same name + match self { + Change::Addition { .. } => ChangeKind::Addition, + Change::Deletion { .. } => ChangeKind::Deletion, + Change::Modification { .. } => ChangeKind::Modification, + } + } + + fn entry_mode(&self) -> EntryMode { + match self { + Change::Addition { entry_mode, .. } + | Change::Deletion { entry_mode, .. } + | Change::Modification { entry_mode, .. } => *entry_mode, + } + } + + fn id_and_entry_mode(&self) -> (&oid, EntryMode) { // same values as `id()` and `entry_mode()`, obtained with a single match + match self { + Change::Addition { entry_mode, oid, .. } + | Change::Deletion { entry_mode, oid, .. } + | Change::Modification { entry_mode, oid, .. } => (oid, *entry_mode), + } + } + } +} + #[cfg(test)] mod tests { use super::*;