Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ sem diff --format json
"moved": 0,
"renamed": 0,
"reordered": 0,
"binary": 0,
"orphan": 0,
"total": 3
},
Expand All @@ -362,7 +363,8 @@ sem diff --format json
"oldEndLine": null,
"filePath": "src/auth.ts"
}
]
],
"binaryChanges": []
}
```

Expand Down
140 changes: 118 additions & 22 deletions crates/sem-cli/src/commands/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use sem_core::git::bridge::GitBridge;
use sem_core::git::jj::maybe_resolve_ref;
use sem_core::git::types::{DiffScope, FileChange, FileStatus};
use sem_core::model::change::ChangeType;
use sem_core::parser::differ::{compute_semantic_diff, DiffResult};
use sem_core::parser::differ::{collect_binary_file_changes, compute_semantic_diff, DiffResult};
use sem_core::parser::plugins::code::languages::get_language_config;
use sem_core::parser::registry::{detect_ext_from_content, ParserRegistry};

Expand Down Expand Up @@ -232,6 +232,27 @@ fn normalize_trailing_output_format(opts: &mut DiffOptions) {
opts.args = args;
}

fn bytes_look_binary(bytes: &[u8], complete: bool) -> bool {
if bytes.iter().any(|byte| *byte == 0) {
return true;
}

match std::str::from_utf8(bytes) {
Ok(_) => false,
Err(error) => complete || error.error_len().is_some(),
}
}

fn read_file_compare_content(path: &Path) -> Result<Option<String>, std::io::Error> {
let bytes = std::fs::read(path)?;
if bytes_look_binary(&bytes, true) {
Ok(None)
} else {
let content = String::from_utf8(bytes)
.map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))?;
Ok(Some(content))
}
}

fn parse_args(args: Vec<String>, cwd: &str) -> ParsedArgs {
let (refs, pathspecs) = split_on_separator(args);
Expand Down Expand Up @@ -600,17 +621,31 @@ fn parse_unified_diff(patch: &str, cwd: &str) -> Result<Vec<FileChange>, PatchPa
Ok(entries
.into_iter()
.map(|e| {
let before_content = e.old_sha.as_deref().and_then(&git_show);
let mut after_content = e.new_sha.as_deref().and_then(&git_show);
let is_binary = e.has_binary_marker || e.has_git_binary_patch;
let mut has_binary_content = is_binary;
let before_content = if is_binary {
None
} else {
e.old_sha.as_deref().and_then(&git_show)
};
let mut after_content = if is_binary {
None
} else {
e.new_sha.as_deref().and_then(&git_show)
};

// Fallback: if git show fails for the new SHA (e.g. unstaged working
// tree changes where the blob doesn't exist yet), read from disk.
if after_content.is_none() && e.new_sha.is_some() {
if !is_binary && after_content.is_none() && e.new_sha.is_some() {
let file = Path::new(cwd).join(&e.file_path);
after_content = std::fs::read_to_string(&file).ok();
match read_file_compare_content(&file) {
Ok(Some(content)) => after_content = Some(content),
Ok(None) => has_binary_content = true,
Err(_) => {}
}
}

if before_content.is_none() && after_content.is_none() {
if !has_binary_content && before_content.is_none() && after_content.is_none() {
eprintln!(
"\x1b[33mwarning:\x1b[0m could not resolve contents for \x1b[1m{}\x1b[0m. \
Try running from inside the repo, or use \x1b[1m-C /path/to/repo\x1b[0m.",
Expand Down Expand Up @@ -828,11 +863,11 @@ pub fn diff_command(mut opts: DiffOptions) {
}
}

let content_a = std::fs::read_to_string(&path_a).unwrap_or_else(|e| {
let content_a = read_file_compare_content(&path_a).unwrap_or_else(|e| {
eprintln!("\x1b[31mError reading {}: {e}\x1b[0m", path_a.display());
process::exit(1);
});
let content_b = std::fs::read_to_string(&path_b).unwrap_or_else(|e| {
let content_b = read_file_compare_content(&path_b).unwrap_or_else(|e| {
eprintln!("\x1b[31mError reading {}: {e}\x1b[0m", path_b.display());
process::exit(1);
});
Expand All @@ -842,16 +877,34 @@ pub fn diff_command(mut opts: DiffOptions) {
.clone()
.or_else(|| label.clone())
.unwrap_or_else(|| after.clone());
let registry = super::create_registry(&opts.cwd);
let (changes, language_mismatch) =
file_compare_changes(before, &target_label, content_a, content_b, &registry);
if let Some((language_a, language_b)) = language_mismatch {
eprintln!(
"warning: comparing files with different languages: {} ({}) and {} ({}); rendering as delete/add",
before, language_a, after, language_b
if content_a.is_none() || content_b.is_none() {
// A None side is binary or otherwise not UTF-8; emit a file-level
// change so the diff pipeline reports it as a binary change.
let change = FileChange {
file_path: target_label,
old_file_path: None,
status: FileStatus::Modified,
before_content: content_a,
after_content: content_b,
};
(vec![change], false)
} else {
let registry = super::create_registry(&opts.cwd);
let (changes, language_mismatch) = file_compare_changes(
before,
&target_label,
content_a.unwrap(),
content_b.unwrap(),
&registry,
);
if let Some((language_a, language_b)) = language_mismatch {
eprintln!(
"warning: comparing files with different languages: {} ({}) and {} ({}); rendering as delete/add",
before, language_a, after, language_b
);
}
(changes, false)
}
(changes, false)
} else if opts.patch {
// Read unified diff from stdin and parse it
let mut input = String::new();
Expand Down Expand Up @@ -1026,7 +1079,7 @@ fn run_diff_pipeline(
if file_changes.is_empty() {
match opts.format {
OutputFormat::Json => {
println!("{{\"summary\":{{\"fileCount\":0,\"added\":0,\"modified\":0,\"deleted\":0,\"moved\":0,\"renamed\":0,\"reordered\":0,\"orphan\":0,\"total\":0}},\"changes\":[]}}");
println!("{{\"summary\":{{\"fileCount\":0,\"added\":0,\"modified\":0,\"deleted\":0,\"moved\":0,\"renamed\":0,\"reordered\":0,\"binary\":0,\"orphan\":0,\"total\":0}},\"changes\":[],\"binaryChanges\":[]}}");
}
_ => {
println!("\x1b[2mNo semantic changes detected.\x1b[0m");
Expand All @@ -1040,6 +1093,7 @@ fn run_diff_pipeline(
let registry_ms = t2.elapsed().as_secs_f64() * 1000.0;

let t3 = Instant::now();
let binary_changes = collect_binary_file_changes(&file_changes);
let mut result = compute_semantic_diff(&file_changes, &registry, None, None);
let parse_diff_ms = t3.elapsed().as_secs_f64() * 1000.0;

Expand All @@ -1049,14 +1103,16 @@ fn run_diff_pipeline(
}

// Record lifetime stats (best-effort)
let _ = SemLifetimeStats::load().record_diff(&result).save();
let _ = SemLifetimeStats::load()
.record_diff(&result, binary_changes.len())
.save();

let t4 = Instant::now();
let output = match opts.format {
OutputFormat::Json => format_json(&result),
OutputFormat::Markdown => format_markdown(&result, opts.verbose),
OutputFormat::Plain => format_plain(&result),
OutputFormat::Terminal => format_terminal(&result, opts.verbose),
OutputFormat::Json => format_json(&result, &binary_changes),
OutputFormat::Markdown => format_markdown(&result, &binary_changes, opts.verbose),
OutputFormat::Plain => format_plain(&result, &binary_changes),
OutputFormat::Terminal => format_terminal(&result, &binary_changes, opts.verbose),
};
let format_ms = t4.elapsed().as_secs_f64() * 1000.0;

Expand Down Expand Up @@ -1085,6 +1141,7 @@ fn run_diff_pipeline(
+ result.moved_count
+ result.renamed_count
+ result.reordered_count
+ binary_changes.len()
);
eprintln!("\x1b[2m─────────────────────────────────────────────\x1b[0m");
}
Expand Down Expand Up @@ -1365,4 +1422,43 @@ mod tests {
);
}
}

#[test]
fn file_compare_binary_detection_handles_nuls_and_invalid_utf8() {
assert!(bytes_look_binary(b"\0png", true));
assert!(bytes_look_binary(&[0xff, 0xfe], true));
assert!(!bytes_look_binary("plain text".as_bytes(), true));
}

#[test]
fn file_compare_binary_detection_ignores_partial_utf8_at_scan_boundary() {
assert!(!bytes_look_binary(&[0xe2, 0x82], false));
assert!(bytes_look_binary(&[0xe2, 0x82], true));
}

#[test]
fn read_file_compare_content_preserves_text_side_for_mixed_binary_compare() {
let unique = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos();
let dir = std::env::temp_dir().join(format!(
"sem-file-compare-test-{}-{unique}",
std::process::id()
));
std::fs::create_dir(&dir).unwrap();

let text_path = dir.join("text.txt");
let binary_path = dir.join("image.bin");
std::fs::write(&text_path, "plain text\n").unwrap();
std::fs::write(&binary_path, b"\0binary").unwrap();

assert_eq!(
read_file_compare_content(&text_path).unwrap().as_deref(),
Some("plain text\n")
);
assert!(read_file_compare_content(&binary_path).unwrap().is_none());

std::fs::remove_dir_all(dir).unwrap();
}
}
45 changes: 39 additions & 6 deletions crates/sem-cli/src/formatters/json.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
use sem_core::parser::differ::DiffResult;
use sem_core::parser::differ::{BinaryFileChange, DiffResult};

pub fn format_json(result: &DiffResult) -> String {
sem_core::format::json::format_diff_json(result)
pub fn format_json(result: &DiffResult, binary_changes: &[BinaryFileChange]) -> String {
sem_core::format::json::format_diff_json_with_binary_changes(result, binary_changes)
}

#[cfg(test)]
mod tests {
use super::*;
use sem_core::git::types::{FileChange, FileStatus};
use sem_core::model::change::SemanticChange;
use sem_core::parser::differ::compute_semantic_diff;
use sem_core::parser::differ::{compute_semantic_diff, BinaryFileChange};
use sem_core::parser::plugins::create_default_registry;

fn modified_file(path: &str, before: &str, after: &str) -> FileChange {
Expand All @@ -36,7 +36,7 @@ mod tests {
None,
);

let output: serde_json::Value = serde_json::from_str(&format_json(&result)).unwrap();
let output: serde_json::Value = serde_json::from_str(&format_json(&result, &[])).unwrap();
let summary = &output["summary"];
let bucket_total = summary["added"].as_u64().unwrap()
+ summary["modified"].as_u64().unwrap()
Expand Down Expand Up @@ -83,12 +83,45 @@ mod tests {
total_entities_after: 1,
};

let output: serde_json::Value = serde_json::from_str(&format_json(&result)).unwrap();
let output: serde_json::Value = serde_json::from_str(&format_json(&result, &[])).unwrap();
let change = &output["changes"][0];

assert_eq!(change["startLine"], 7);
assert_eq!(change["endLine"], 9);
assert_eq!(change["oldStartLine"], 3);
assert_eq!(change["oldEndLine"], 5);
}

#[test]
fn json_includes_binary_changes_in_summary_and_binary_changes() {
let result = DiffResult {
changes: Vec::new(),
file_count: 0,
added_count: 0,
modified_count: 0,
deleted_count: 0,
moved_count: 0,
renamed_count: 0,
reordered_count: 0,
orphan_count: 0,
total_entities_before: 0,
total_entities_after: 0,
};
let binary_changes = vec![BinaryFileChange {
file_path: "pic.png".to_string(),
status: FileStatus::Modified,
old_file_path: None,
}];

let value: serde_json::Value =
serde_json::from_str(&format_json(&result, &binary_changes)).unwrap();

assert_eq!(value["summary"]["fileCount"], 1);
assert_eq!(value["summary"]["binary"], 1);
assert_eq!(value["summary"]["total"], 1);
assert_eq!(value["changes"].as_array().unwrap().len(), 0);
assert_eq!(value["binaryChanges"][0]["changeType"], "binary");
assert_eq!(value["binaryChanges"][0]["filePath"], "pic.png");
assert_eq!(value["binaryChanges"][0]["fileStatus"], "modified");
}
}
39 changes: 31 additions & 8 deletions crates/sem-cli/src/formatters/markdown.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,48 @@
use super::orphan_summary_parts;
use sem_core::model::change::ChangeType;
use sem_core::parser::differ::DiffResult;
use sem_core::parser::differ::{BinaryFileChange, DiffResult};
use similar::{ChangeTag, TextDiff};
use std::collections::BTreeMap;

pub fn format_markdown(result: &DiffResult, verbose: bool) -> String {
if result.changes.is_empty() {
use super::{binary_display_name, file_count, has_reportable_changes};

pub fn format_markdown(
result: &DiffResult,
binary_changes: &[BinaryFileChange],
verbose: bool,
) -> String {
if !has_reportable_changes(result, binary_changes) {
return "No semantic changes detected.".to_string();
}

let mut lines: Vec<String> = Vec::new();

// Group changes by file (BTreeMap for sorted output)
let mut by_file: BTreeMap<&str, Vec<usize>> = BTreeMap::new();
let mut by_file: BTreeMap<&str, (Vec<usize>, Vec<usize>)> = BTreeMap::new();
for (i, change) in result.changes.iter().enumerate() {
by_file.entry(&change.file_path).or_default().push(i);
by_file.entry(&change.file_path).or_default().0.push(i);
}
for (i, change) in binary_changes.iter().enumerate() {
by_file.entry(&change.file_path).or_default().1.push(i);
}

for (file_path, indices) in &by_file {
for (file_path, (indices, binary_indices)) in &by_file {
lines.push(format!("### {file_path}"));
lines.push(String::new());
lines.push("| Status | Type | Name |".to_string());
lines.push("|--------|------|------|".to_string());

let mut post_table: Vec<String> = Vec::new();

for &idx in binary_indices {
let change = &binary_changes[idx];
lines.push(format!(
"| B | file | {} `[binary {}]` |",
binary_display_name(change),
change.status,
));
}

for &idx in indices {
let change = &result.changes[idx];
let status = match change.change_type {
Expand Down Expand Up @@ -181,7 +199,12 @@ pub fn format_markdown(result: &DiffResult, verbose: bool) -> String {
if result.reordered_count > 0 {
parts.push(format!("{} reordered", result.reordered_count));
}
let files_label = if result.file_count == 1 {
if !binary_changes.is_empty() {
parts.push(format!("{} binary", binary_changes.len()));
}

let reported_file_count = file_count(result, binary_changes);
let files_label = if reported_file_count == 1 {
"file"
} else {
"files"
Expand All @@ -196,7 +219,7 @@ pub fn format_markdown(result: &DiffResult, verbose: bool) -> String {
lines.push(format!(
"**Summary:** {} across {} {files_label}{}",
parts.join(", "),
result.file_count,
reported_file_count,
orphan_suffix,
));

Expand Down
Loading
Loading