Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

various improvements #1785

Merged
merged 3 commits into from
Jan 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions gix-diff/src/blob/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ pub mod pipeline;
///
pub mod platform;

pub mod unified_diff;
pub use unified_diff::_impl::UnifiedDiff;

/// Information about the diff performed to detect similarity.
#[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)]
pub struct DiffLineStats {
Expand Down
255 changes: 255 additions & 0 deletions gix-diff/src/blob/unified_diff.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
//! Facilities to produce the unified diff format.
//!
//! Originally based on <https://github.com/pascalkuthe/imara-diff/pull/14>.

/// Defines the size of the context printed before and after each change.
///
/// Similar to the `-U` option in git diff or gnu-diff. If the context overlaps
/// with previous or next change, the context gets reduced accordingly.
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, Ord, PartialOrd)]
pub struct ContextSize {
/// Defines the size of the context printed before and after each change.
symmetrical: u32,
}

impl Default for ContextSize {
fn default() -> Self {
ContextSize::symmetrical(3)
}
}

/// Instantiation
impl ContextSize {
/// Create a symmetrical context with `n` lines before and after a changed hunk.
pub fn symmetrical(n: u32) -> Self {
ContextSize { symmetrical: n }
}
}

/// A utility trait for use in [`UnifiedDiff`](super::UnifiedDiff).
pub trait ConsumeHunk {
/// The item this instance produces after consuming all hunks.
type Out;

/// Consume a single `hunk` in unified diff format, that would be prefixed with `header`.
/// Note that all newlines are added.
///
/// Note that the [`UnifiedDiff`](super::UnifiedDiff) sink will wrap its output in an [`std::io::Result`].
/// After this method returned its first error, it will not be called anymore.
///
/// The following is hunk-related information and the same that is used in the `header`.
/// * `before_hunk_start` is the 1-based first line of this hunk in the old file.
/// * `before_hunk_len` the amount of lines of this hunk in the old file.
/// * `after_hunk_start` is the 1-based first line of this hunk in the new file.
/// * `after_hunk_len` the amount of lines of this hunk in the new file.
fn consume_hunk(
&mut self,
before_hunk_start: u32,
before_hunk_len: u32,
after_hunk_start: u32,
after_hunk_len: u32,
header: &str,
hunk: &[u8],
) -> std::io::Result<()>;
/// Called after the last hunk is consumed to produce an output.
fn finish(self) -> Self::Out;
}

pub(super) mod _impl {
use super::{ConsumeHunk, ContextSize};
use bstr::{ByteSlice, ByteVec};
use imara_diff::{intern, Sink};
use intern::{InternedInput, Interner, Token};
use std::hash::Hash;
use std::io::ErrorKind;
use std::ops::Range;

/// A [`Sink`] that creates a textual diff in the format typically output by git or `gnu-diff` if the `-u` option is used,
/// and passes it in full to a consumer.
pub struct UnifiedDiff<'a, T, D>
where
T: Hash + Eq + AsRef<[u8]>,
D: ConsumeHunk,
{
before: &'a [Token],
after: &'a [Token],
interner: &'a Interner<T>,

pos: u32,
before_hunk_start: u32,
after_hunk_start: u32,
before_hunk_len: u32,
after_hunk_len: u32,
/// Symmetrical context before and after the changed hunk.
ctx_size: u32,

buffer: Vec<u8>,
header_buf: String,
delegate: D,
newline: &'a str,

err: Option<std::io::Error>,
}

impl<'a, T, D> UnifiedDiff<'a, T, D>
where
T: Hash + Eq + AsRef<[u8]>,
D: ConsumeHunk,
{
/// Create a new instance to create unified diff using the lines in `input`,
/// which also must be used when running the diff algorithm.
/// `context_size` is the amount of lines around each hunk which will be passed
///to `consume_hunk`.
///
/// `consume_hunk` is called for each hunk in unified-diff format, as created from each line separated by `newline_separator`,
pub fn new(
input: &'a InternedInput<T>,
consume_hunk: D,
newline_separator: &'a str,
context_size: ContextSize,
) -> Self {
Self {
before_hunk_start: 0,
after_hunk_start: 0,
before_hunk_len: 0,
after_hunk_len: 0,
buffer: Vec::with_capacity(8),
header_buf: String::new(),
delegate: consume_hunk,
interner: &input.interner,
before: &input.before,
after: &input.after,
pos: 0,
ctx_size: context_size.symmetrical,
newline: newline_separator,

err: None,
}
}

fn print_tokens(&mut self, tokens: &[Token], prefix: char) {
for &token in tokens {
self.buffer.push_char(prefix);
self.buffer.push_str(&self.interner[token]);
self.buffer.push_str(self.newline.as_bytes());
}
}

fn flush(&mut self) -> std::io::Result<()> {
if self.before_hunk_len == 0 && self.after_hunk_len == 0 {
return Ok(());
}

let end = (self.pos + self.ctx_size).min(self.before.len() as u32);
self.update_pos(end, end);

self.header_buf.clear();

std::fmt::Write::write_fmt(
&mut self.header_buf,
format_args!(
"@@ -{},{} +{},{} @@{nl}",
self.before_hunk_start + 1,
self.before_hunk_len,
self.after_hunk_start + 1,
self.after_hunk_len,
nl = self.newline
),
)
.map_err(|err| std::io::Error::new(ErrorKind::Other, err))?;
self.delegate.consume_hunk(
self.before_hunk_start + 1,
self.before_hunk_len,
self.after_hunk_start + 1,
self.after_hunk_len,
&self.header_buf,
&self.buffer,
)?;
self.buffer.clear();
self.before_hunk_len = 0;
self.after_hunk_len = 0;
Ok(())
}

fn update_pos(&mut self, print_to: u32, move_to: u32) {
self.print_tokens(&self.before[self.pos as usize..print_to as usize], ' ');
let len = print_to - self.pos;
self.pos = move_to;
self.before_hunk_len += len;
self.after_hunk_len += len;
}
}

impl<T, D> Sink for UnifiedDiff<'_, T, D>
where
T: Hash + Eq + AsRef<[u8]>,
D: ConsumeHunk,
{
type Out = std::io::Result<D::Out>;

fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
if self.err.is_some() {
return;
}
if ((self.pos == 0) && (before.start - self.pos > self.ctx_size))
|| (before.start - self.pos > 2 * self.ctx_size)
{
if let Err(err) = self.flush() {
self.err = Some(err);
return;
}
self.pos = before.start - self.ctx_size;
self.before_hunk_start = self.pos;
self.after_hunk_start = after.start - self.ctx_size;
}
self.update_pos(before.start, before.end);
self.before_hunk_len += before.end - before.start;
self.after_hunk_len += after.end - after.start;
self.print_tokens(&self.before[before.start as usize..before.end as usize], '-');
self.print_tokens(&self.after[after.start as usize..after.end as usize], '+');
}

fn finish(mut self) -> Self::Out {
if let Err(err) = self.flush() {
self.err = Some(err);
}
if let Some(err) = self.err {
return Err(err);
}
Ok(self.delegate.finish())
}
}

/// An implementation that fails if the input isn't UTF-8.
impl ConsumeHunk for String {
type Out = Self;

fn consume_hunk(&mut self, _: u32, _: u32, _: u32, _: u32, header: &str, hunk: &[u8]) -> std::io::Result<()> {
self.push_str(header);
self.push_str(
hunk.to_str()
.map_err(|err| std::io::Error::new(ErrorKind::Other, err))?,
);
Ok(())
}

fn finish(self) -> Self::Out {
self
}
}

/// An implementation that writes hunks into a byte buffer.
impl ConsumeHunk for Vec<u8> {
type Out = Self;

fn consume_hunk(&mut self, _: u32, _: u32, _: u32, _: u32, header: &str, hunk: &[u8]) -> std::io::Result<()> {
self.push_str(header);
self.push_str(hunk);
Ok(())
}

fn finish(self) -> Self::Out {
self
}
}
}
1 change: 1 addition & 0 deletions gix-diff/tests/diff/blob/mod.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub(crate) mod pipeline;
mod platform;
mod unified_diff;
Loading
Loading