diff --git a/README.md b/README.md index 1fbe061..a62b335 100644 --- a/README.md +++ b/README.md @@ -347,6 +347,7 @@ sem diff --format json "moved": 0, "renamed": 0, "reordered": 0, + "binary": 0, "orphan": 0, "total": 3 }, @@ -362,7 +363,8 @@ sem diff --format json "oldEndLine": null, "filePath": "src/auth.ts" } - ] + ], + "binaryChanges": [] } ``` diff --git a/crates/sem-cli/src/commands/diff.rs b/crates/sem-cli/src/commands/diff.rs index e352a5f..cf158a2 100644 --- a/crates/sem-cli/src/commands/diff.rs +++ b/crates/sem-cli/src/commands/diff.rs @@ -8,7 +8,7 @@ use sem_core::git::bridge::GitBridge; use sem_core::git::jj::maybe_resolve_ref; use sem_core::git::types::{DiffScope, FileChange, FileStatus}; use sem_core::model::change::ChangeType; -use sem_core::parser::differ::{compute_semantic_diff, DiffResult}; +use sem_core::parser::differ::{collect_binary_file_changes, compute_semantic_diff, DiffResult}; use sem_core::parser::plugins::code::languages::get_language_config; use sem_core::parser::registry::{detect_ext_from_content, ParserRegistry}; @@ -232,6 +232,27 @@ fn normalize_trailing_output_format(opts: &mut DiffOptions) { opts.args = args; } +fn bytes_look_binary(bytes: &[u8], complete: bool) -> bool { + if bytes.iter().any(|byte| *byte == 0) { + return true; + } + + match std::str::from_utf8(bytes) { + Ok(_) => false, + Err(error) => complete || error.error_len().is_some(), + } +} + +fn read_file_compare_content(path: &Path) -> Result, std::io::Error> { + let bytes = std::fs::read(path)?; + if bytes_look_binary(&bytes, true) { + Ok(None) + } else { + let content = String::from_utf8(bytes) + .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))?; + Ok(Some(content)) + } +} fn parse_args(args: Vec, cwd: &str) -> ParsedArgs { let (refs, pathspecs) = split_on_separator(args); @@ -600,17 +621,31 @@ fn parse_unified_diff(patch: &str, cwd: &str) -> Result, PatchPa Ok(entries .into_iter() .map(|e| { - let before_content = e.old_sha.as_deref().and_then(&git_show); - let mut after_content = e.new_sha.as_deref().and_then(&git_show); + let is_binary = e.has_binary_marker || e.has_git_binary_patch; + let mut has_binary_content = is_binary; + let before_content = if is_binary { + None + } else { + e.old_sha.as_deref().and_then(&git_show) + }; + let mut after_content = if is_binary { + None + } else { + e.new_sha.as_deref().and_then(&git_show) + }; // Fallback: if git show fails for the new SHA (e.g. unstaged working // tree changes where the blob doesn't exist yet), read from disk. - if after_content.is_none() && e.new_sha.is_some() { + if !is_binary && after_content.is_none() && e.new_sha.is_some() { let file = Path::new(cwd).join(&e.file_path); - after_content = std::fs::read_to_string(&file).ok(); + match read_file_compare_content(&file) { + Ok(Some(content)) => after_content = Some(content), + Ok(None) => has_binary_content = true, + Err(_) => {} + } } - if before_content.is_none() && after_content.is_none() { + if !has_binary_content && before_content.is_none() && after_content.is_none() { eprintln!( "\x1b[33mwarning:\x1b[0m could not resolve contents for \x1b[1m{}\x1b[0m. \ Try running from inside the repo, or use \x1b[1m-C /path/to/repo\x1b[0m.", @@ -828,11 +863,11 @@ pub fn diff_command(mut opts: DiffOptions) { } } - let content_a = std::fs::read_to_string(&path_a).unwrap_or_else(|e| { + let content_a = read_file_compare_content(&path_a).unwrap_or_else(|e| { eprintln!("\x1b[31mError reading {}: {e}\x1b[0m", path_a.display()); process::exit(1); }); - let content_b = std::fs::read_to_string(&path_b).unwrap_or_else(|e| { + let content_b = read_file_compare_content(&path_b).unwrap_or_else(|e| { eprintln!("\x1b[31mError reading {}: {e}\x1b[0m", path_b.display()); process::exit(1); }); @@ -842,16 +877,34 @@ pub fn diff_command(mut opts: DiffOptions) { .clone() .or_else(|| label.clone()) .unwrap_or_else(|| after.clone()); - let registry = super::create_registry(&opts.cwd); - let (changes, language_mismatch) = - file_compare_changes(before, &target_label, content_a, content_b, ®istry); - if let Some((language_a, language_b)) = language_mismatch { - eprintln!( - "warning: comparing files with different languages: {} ({}) and {} ({}); rendering as delete/add", - before, language_a, after, language_b + if content_a.is_none() || content_b.is_none() { + // A None side is binary or otherwise not UTF-8; emit a file-level + // change so the diff pipeline reports it as a binary change. + let change = FileChange { + file_path: target_label, + old_file_path: None, + status: FileStatus::Modified, + before_content: content_a, + after_content: content_b, + }; + (vec![change], false) + } else { + let registry = super::create_registry(&opts.cwd); + let (changes, language_mismatch) = file_compare_changes( + before, + &target_label, + content_a.unwrap(), + content_b.unwrap(), + ®istry, ); + if let Some((language_a, language_b)) = language_mismatch { + eprintln!( + "warning: comparing files with different languages: {} ({}) and {} ({}); rendering as delete/add", + before, language_a, after, language_b + ); + } + (changes, false) } - (changes, false) } else if opts.patch { // Read unified diff from stdin and parse it let mut input = String::new(); @@ -1026,7 +1079,7 @@ fn run_diff_pipeline( if file_changes.is_empty() { match opts.format { OutputFormat::Json => { - println!("{{\"summary\":{{\"fileCount\":0,\"added\":0,\"modified\":0,\"deleted\":0,\"moved\":0,\"renamed\":0,\"reordered\":0,\"orphan\":0,\"total\":0}},\"changes\":[]}}"); + println!("{{\"summary\":{{\"fileCount\":0,\"added\":0,\"modified\":0,\"deleted\":0,\"moved\":0,\"renamed\":0,\"reordered\":0,\"binary\":0,\"orphan\":0,\"total\":0}},\"changes\":[],\"binaryChanges\":[]}}"); } _ => { println!("\x1b[2mNo semantic changes detected.\x1b[0m"); @@ -1040,6 +1093,7 @@ fn run_diff_pipeline( let registry_ms = t2.elapsed().as_secs_f64() * 1000.0; let t3 = Instant::now(); + let binary_changes = collect_binary_file_changes(&file_changes); let mut result = compute_semantic_diff(&file_changes, ®istry, None, None); let parse_diff_ms = t3.elapsed().as_secs_f64() * 1000.0; @@ -1049,14 +1103,16 @@ fn run_diff_pipeline( } // Record lifetime stats (best-effort) - let _ = SemLifetimeStats::load().record_diff(&result).save(); + let _ = SemLifetimeStats::load() + .record_diff(&result, binary_changes.len()) + .save(); let t4 = Instant::now(); let output = match opts.format { - OutputFormat::Json => format_json(&result), - OutputFormat::Markdown => format_markdown(&result, opts.verbose), - OutputFormat::Plain => format_plain(&result), - OutputFormat::Terminal => format_terminal(&result, opts.verbose), + OutputFormat::Json => format_json(&result, &binary_changes), + OutputFormat::Markdown => format_markdown(&result, &binary_changes, opts.verbose), + OutputFormat::Plain => format_plain(&result, &binary_changes), + OutputFormat::Terminal => format_terminal(&result, &binary_changes, opts.verbose), }; let format_ms = t4.elapsed().as_secs_f64() * 1000.0; @@ -1085,6 +1141,7 @@ fn run_diff_pipeline( + result.moved_count + result.renamed_count + result.reordered_count + + binary_changes.len() ); eprintln!("\x1b[2m─────────────────────────────────────────────\x1b[0m"); } @@ -1365,4 +1422,43 @@ mod tests { ); } } + + #[test] + fn file_compare_binary_detection_handles_nuls_and_invalid_utf8() { + assert!(bytes_look_binary(b"\0png", true)); + assert!(bytes_look_binary(&[0xff, 0xfe], true)); + assert!(!bytes_look_binary("plain text".as_bytes(), true)); + } + + #[test] + fn file_compare_binary_detection_ignores_partial_utf8_at_scan_boundary() { + assert!(!bytes_look_binary(&[0xe2, 0x82], false)); + assert!(bytes_look_binary(&[0xe2, 0x82], true)); + } + + #[test] + fn read_file_compare_content_preserves_text_side_for_mixed_binary_compare() { + let unique = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos(); + let dir = std::env::temp_dir().join(format!( + "sem-file-compare-test-{}-{unique}", + std::process::id() + )); + std::fs::create_dir(&dir).unwrap(); + + let text_path = dir.join("text.txt"); + let binary_path = dir.join("image.bin"); + std::fs::write(&text_path, "plain text\n").unwrap(); + std::fs::write(&binary_path, b"\0binary").unwrap(); + + assert_eq!( + read_file_compare_content(&text_path).unwrap().as_deref(), + Some("plain text\n") + ); + assert!(read_file_compare_content(&binary_path).unwrap().is_none()); + + std::fs::remove_dir_all(dir).unwrap(); + } } diff --git a/crates/sem-cli/src/formatters/json.rs b/crates/sem-cli/src/formatters/json.rs index cb0f036..92b2d95 100644 --- a/crates/sem-cli/src/formatters/json.rs +++ b/crates/sem-cli/src/formatters/json.rs @@ -1,7 +1,7 @@ -use sem_core::parser::differ::DiffResult; +use sem_core::parser::differ::{BinaryFileChange, DiffResult}; -pub fn format_json(result: &DiffResult) -> String { - sem_core::format::json::format_diff_json(result) +pub fn format_json(result: &DiffResult, binary_changes: &[BinaryFileChange]) -> String { + sem_core::format::json::format_diff_json_with_binary_changes(result, binary_changes) } #[cfg(test)] @@ -9,7 +9,7 @@ mod tests { use super::*; use sem_core::git::types::{FileChange, FileStatus}; use sem_core::model::change::SemanticChange; - use sem_core::parser::differ::compute_semantic_diff; + use sem_core::parser::differ::{compute_semantic_diff, BinaryFileChange}; use sem_core::parser::plugins::create_default_registry; fn modified_file(path: &str, before: &str, after: &str) -> FileChange { @@ -36,7 +36,7 @@ mod tests { None, ); - let output: serde_json::Value = serde_json::from_str(&format_json(&result)).unwrap(); + let output: serde_json::Value = serde_json::from_str(&format_json(&result, &[])).unwrap(); let summary = &output["summary"]; let bucket_total = summary["added"].as_u64().unwrap() + summary["modified"].as_u64().unwrap() @@ -83,7 +83,7 @@ mod tests { total_entities_after: 1, }; - let output: serde_json::Value = serde_json::from_str(&format_json(&result)).unwrap(); + let output: serde_json::Value = serde_json::from_str(&format_json(&result, &[])).unwrap(); let change = &output["changes"][0]; assert_eq!(change["startLine"], 7); @@ -91,4 +91,37 @@ mod tests { assert_eq!(change["oldStartLine"], 3); assert_eq!(change["oldEndLine"], 5); } + + #[test] + fn json_includes_binary_changes_in_summary_and_binary_changes() { + let result = DiffResult { + changes: Vec::new(), + file_count: 0, + added_count: 0, + modified_count: 0, + deleted_count: 0, + moved_count: 0, + renamed_count: 0, + reordered_count: 0, + orphan_count: 0, + total_entities_before: 0, + total_entities_after: 0, + }; + let binary_changes = vec![BinaryFileChange { + file_path: "pic.png".to_string(), + status: FileStatus::Modified, + old_file_path: None, + }]; + + let value: serde_json::Value = + serde_json::from_str(&format_json(&result, &binary_changes)).unwrap(); + + assert_eq!(value["summary"]["fileCount"], 1); + assert_eq!(value["summary"]["binary"], 1); + assert_eq!(value["summary"]["total"], 1); + assert_eq!(value["changes"].as_array().unwrap().len(), 0); + assert_eq!(value["binaryChanges"][0]["changeType"], "binary"); + assert_eq!(value["binaryChanges"][0]["filePath"], "pic.png"); + assert_eq!(value["binaryChanges"][0]["fileStatus"], "modified"); + } } diff --git a/crates/sem-cli/src/formatters/markdown.rs b/crates/sem-cli/src/formatters/markdown.rs index ad7945b..4bf22bf 100644 --- a/crates/sem-cli/src/formatters/markdown.rs +++ b/crates/sem-cli/src/formatters/markdown.rs @@ -1,23 +1,32 @@ use super::orphan_summary_parts; use sem_core::model::change::ChangeType; -use sem_core::parser::differ::DiffResult; +use sem_core::parser::differ::{BinaryFileChange, DiffResult}; use similar::{ChangeTag, TextDiff}; use std::collections::BTreeMap; -pub fn format_markdown(result: &DiffResult, verbose: bool) -> String { - if result.changes.is_empty() { +use super::{binary_display_name, file_count, has_reportable_changes}; + +pub fn format_markdown( + result: &DiffResult, + binary_changes: &[BinaryFileChange], + verbose: bool, +) -> String { + if !has_reportable_changes(result, binary_changes) { return "No semantic changes detected.".to_string(); } let mut lines: Vec = Vec::new(); // Group changes by file (BTreeMap for sorted output) - let mut by_file: BTreeMap<&str, Vec> = BTreeMap::new(); + let mut by_file: BTreeMap<&str, (Vec, Vec)> = BTreeMap::new(); for (i, change) in result.changes.iter().enumerate() { - by_file.entry(&change.file_path).or_default().push(i); + by_file.entry(&change.file_path).or_default().0.push(i); + } + for (i, change) in binary_changes.iter().enumerate() { + by_file.entry(&change.file_path).or_default().1.push(i); } - for (file_path, indices) in &by_file { + for (file_path, (indices, binary_indices)) in &by_file { lines.push(format!("### {file_path}")); lines.push(String::new()); lines.push("| Status | Type | Name |".to_string()); @@ -25,6 +34,15 @@ pub fn format_markdown(result: &DiffResult, verbose: bool) -> String { let mut post_table: Vec = Vec::new(); + for &idx in binary_indices { + let change = &binary_changes[idx]; + lines.push(format!( + "| B | file | {} `[binary {}]` |", + binary_display_name(change), + change.status, + )); + } + for &idx in indices { let change = &result.changes[idx]; let status = match change.change_type { @@ -181,7 +199,12 @@ pub fn format_markdown(result: &DiffResult, verbose: bool) -> String { if result.reordered_count > 0 { parts.push(format!("{} reordered", result.reordered_count)); } - let files_label = if result.file_count == 1 { + if !binary_changes.is_empty() { + parts.push(format!("{} binary", binary_changes.len())); + } + + let reported_file_count = file_count(result, binary_changes); + let files_label = if reported_file_count == 1 { "file" } else { "files" @@ -196,7 +219,7 @@ pub fn format_markdown(result: &DiffResult, verbose: bool) -> String { lines.push(format!( "**Summary:** {} across {} {files_label}{}", parts.join(", "), - result.file_count, + reported_file_count, orphan_suffix, )); diff --git a/crates/sem-cli/src/formatters/mod.rs b/crates/sem-cli/src/formatters/mod.rs index e77ff77..ce15b54 100644 --- a/crates/sem-cli/src/formatters/mod.rs +++ b/crates/sem-cli/src/formatters/mod.rs @@ -1,11 +1,38 @@ use sem_core::model::change::ChangeType; -use sem_core::parser::differ::DiffResult; +use sem_core::parser::differ::{BinaryFileChange, DiffResult}; pub mod json; pub mod markdown; pub mod plain; pub mod terminal; +pub(crate) fn binary_display_name(change: &BinaryFileChange) -> String { + match change.old_file_path.as_deref() { + Some(old_path) if old_path != change.file_path => { + format!("{old_path} -> {}", change.file_path) + } + _ => change.file_path.clone(), + } +} + +pub(crate) fn has_reportable_changes( + result: &DiffResult, + binary_changes: &[BinaryFileChange], +) -> bool { + !result.changes.is_empty() || !binary_changes.is_empty() +} + +pub(crate) fn file_count(result: &DiffResult, binary_changes: &[BinaryFileChange]) -> usize { + result.file_count + binary_changes.len() +} + +pub(crate) fn total_change_count( + result: &DiffResult, + binary_changes: &[BinaryFileChange], +) -> usize { + result.changes.len() + binary_changes.len() +} + pub(crate) fn orphan_summary_parts(result: &DiffResult) -> Vec { let mut added = 0; let mut modified = 0; diff --git a/crates/sem-cli/src/formatters/plain.rs b/crates/sem-cli/src/formatters/plain.rs index a7abcee..466bb21 100644 --- a/crates/sem-cli/src/formatters/plain.rs +++ b/crates/sem-cli/src/formatters/plain.rs @@ -1,25 +1,44 @@ use super::orphan_summary_parts; use colored::Colorize; use sem_core::model::change::ChangeType; -use sem_core::parser::differ::DiffResult; +use sem_core::parser::differ::{BinaryFileChange, DiffResult}; use std::collections::BTreeMap; -pub fn format_plain(result: &DiffResult) -> String { - if result.changes.is_empty() { +use super::{binary_display_name, file_count, has_reportable_changes}; + +pub fn format_plain(result: &DiffResult, binary_changes: &[BinaryFileChange]) -> String { + if !has_reportable_changes(result, binary_changes) { return "No semantic changes detected.".to_string(); } let mut lines: Vec = Vec::new(); // Group changes by file (BTreeMap for sorted output) - let mut by_file: BTreeMap<&str, Vec> = BTreeMap::new(); + let mut by_file: BTreeMap<&str, (Vec, Vec)> = BTreeMap::new(); for (i, change) in result.changes.iter().enumerate() { - by_file.entry(&change.file_path).or_default().push(i); + by_file.entry(&change.file_path).or_default().0.push(i); + } + for (i, change) in binary_changes.iter().enumerate() { + by_file.entry(&change.file_path).or_default().1.push(i); } - for (file_path, indices) in &by_file { + for (file_path, (indices, binary_indices)) in &by_file { lines.push(file_path.bold().to_string()); + for &idx in binary_indices { + let change = &binary_changes[idx]; + let letter = "B".yellow().to_string(); + let type_label = format!("{:<12}", "file"); + let name_display = binary_display_name(change); + lines.push(format!( + " {} {}{} {}", + letter, + type_label.dimmed(), + name_display, + format!("[binary {}]", change.status).yellow(), + )); + } + for &idx in indices { let change = &result.changes[idx]; let letter = match change.change_type { @@ -99,7 +118,12 @@ pub fn format_plain(result: &DiffResult) -> String { .to_string(), ); } - let files_label = if result.file_count == 1 { + if !binary_changes.is_empty() { + parts.push(format!("{} binary", binary_changes.len()).yellow().to_string()); + } + + let reported_file_count = file_count(result, binary_changes); + let files_label = if reported_file_count == 1 { "file" } else { "files" @@ -115,7 +139,7 @@ pub fn format_plain(result: &DiffResult) -> String { lines.push(format!( "{} across {} {files_label}{}", parts.join(", "), - result.file_count, + reported_file_count, orphan_suffix, )); diff --git a/crates/sem-cli/src/formatters/terminal.rs b/crates/sem-cli/src/formatters/terminal.rs index 5817037..c011b10 100644 --- a/crates/sem-cli/src/formatters/terminal.rs +++ b/crates/sem-cli/src/formatters/terminal.rs @@ -1,10 +1,12 @@ use super::orphan_summary_parts; use colored::Colorize; use sem_core::model::change::ChangeType; -use sem_core::parser::differ::DiffResult; +use sem_core::parser::differ::{BinaryFileChange, DiffResult}; use similar::{ChangeTag, TextDiff}; use std::collections::BTreeMap; +use super::{binary_display_name, file_count, has_reportable_changes}; + /// Runs word-level diff on two lines and returns (delete_line, insert_line) /// with changed words highlighted (strikethrough+red / bold+green). fn render_inline_diff(old_line: &str, new_line: &str) -> (String, String) { @@ -31,22 +33,30 @@ fn render_inline_diff(old_line: &str, new_line: &str) -> (String, String) { (del, ins) } -pub fn format_terminal(result: &DiffResult, verbose: bool) -> String { - if result.changes.is_empty() { +pub fn format_terminal( + result: &DiffResult, + binary_changes: &[BinaryFileChange], + verbose: bool, +) -> String { + if !has_reportable_changes(result, binary_changes) { return "No semantic changes detected.".dimmed().to_string(); } let mut lines: Vec = Vec::new(); // Group changes by file (BTreeMap for sorted output) - let mut by_file: BTreeMap<&str, Vec> = BTreeMap::new(); + let mut by_file: BTreeMap<&str, (Vec, Vec)> = BTreeMap::new(); for (i, change) in result.changes.iter().enumerate() { - by_file.entry(&change.file_path).or_default().push(i); + by_file.entry(&change.file_path).or_default().0.push(i); + } + for (i, change) in binary_changes.iter().enumerate() { + by_file.entry(&change.file_path).or_default().1.push(i); } - for (file_path, indices) in &by_file { + for (file_path, (indices, binary_indices)) in &by_file { // Skip files where all changes are orphans in non-verbose mode if !verbose + && binary_indices.is_empty() && indices .iter() .all(|&i| result.changes[i].entity_type == "orphan") @@ -63,6 +73,23 @@ pub fn format_terminal(result: &DiffResult, verbose: bool) -> String { ); lines.push("│".dimmed().to_string()); + for &idx in binary_indices { + let change = &binary_changes[idx]; + let symbol = "■".yellow().to_string(); + let tag = format!("[binary {}]", change.status).yellow().to_string(); + let type_label = format!("{:<10}", "file"); + let name_label = format!("{:<25}", binary_display_name(change)); + + lines.push(format!( + "{} {} {} {} {}", + "│".dimmed(), + symbol, + type_label.dimmed(), + name_label.bold(), + tag, + )); + } + for &idx in indices { let change = &result.changes[idx]; @@ -304,7 +331,12 @@ pub fn format_terminal(result: &DiffResult, verbose: bool) -> String { .to_string(), ); } - let files_label = if result.file_count == 1 { + if !binary_changes.is_empty() { + parts.push(format!("{} binary", binary_changes.len()).yellow().to_string()); + } + + let reported_file_count = file_count(result, binary_changes); + let files_label = if reported_file_count == 1 { "file" } else { "files" @@ -321,7 +353,7 @@ pub fn format_terminal(result: &DiffResult, verbose: bool) -> String { lines.push(format!( "Summary: {} across {} {files_label}{}", parts.join(", "), - result.file_count, + reported_file_count, orphan_suffix, )); @@ -334,7 +366,8 @@ pub fn format_terminal(result: &DiffResult, verbose: bool) -> String { + result.deleted_count + result.moved_count + result.renamed_count - + result.reordered_count; + + result.reordered_count + + binary_changes.len(); if entities_analyzed > changes_detected { let noise = entities_analyzed - changes_detected; lines.push( diff --git a/crates/sem-cli/src/stats.rs b/crates/sem-cli/src/stats.rs index e9d6a10..3c8f26b 100644 --- a/crates/sem-cli/src/stats.rs +++ b/crates/sem-cli/src/stats.rs @@ -101,7 +101,7 @@ impl SemLifetimeStats { .unwrap_or_default() } - pub fn record_diff(mut self, result: &DiffResult) -> Self { + pub fn record_diff(mut self, result: &DiffResult, binary_count: usize) -> Self { let now = now_iso(); if self.first_run.is_none() { self.first_run = Some(now.clone()); @@ -122,7 +122,8 @@ impl SemLifetimeStats { + result.deleted_count + result.moved_count + result.renamed_count - + result.reordered_count) as u64; + + result.reordered_count + + binary_count) as u64; self.total_changes_detected += changes; self.noise_filtered += entities_analyzed.saturating_sub(changes); diff --git a/crates/sem-core/src/format/json.rs b/crates/sem-core/src/format/json.rs index ce8c400..d4ef977 100644 --- a/crates/sem-core/src/format/json.rs +++ b/crates/sem-core/src/format/json.rs @@ -1,7 +1,22 @@ -use crate::parser::differ::DiffResult; +use crate::parser::differ::{BinaryFileChange, DiffResult}; use serde_json::{json, Value}; pub fn diff_json_value(result: &DiffResult) -> Value { + diff_json_value_inner(result, &[], false) +} + +pub fn diff_json_value_with_binary_changes( + result: &DiffResult, + binary_changes: &[BinaryFileChange], +) -> Value { + diff_json_value_inner(result, binary_changes, true) +} + +fn diff_json_value_inner( + result: &DiffResult, + binary_changes: &[BinaryFileChange], + include_binary_changes: bool, +) -> Value { let changes: Vec = result .changes .iter() @@ -28,19 +43,50 @@ pub fn diff_json_value(result: &DiffResult) -> Value { }) .collect(); + if !include_binary_changes { + return json!({ + "summary": { + "fileCount": result.file_count, + "added": result.added_count, + "modified": result.modified_count, + "deleted": result.deleted_count, + "moved": result.moved_count, + "renamed": result.renamed_count, + "reordered": result.reordered_count, + "orphan": result.orphan_count, + "total": result.changes.len(), + }, + "changes": changes, + }); + } + + let binary_changes_json: Vec = binary_changes + .iter() + .map(|c| { + json!({ + "changeType": "binary", + "filePath": c.file_path, + "oldFilePath": c.old_file_path, + "fileStatus": c.status, + }) + }) + .collect(); + json!({ "summary": { - "fileCount": result.file_count, + "fileCount": result.file_count + binary_changes.len(), "added": result.added_count, "modified": result.modified_count, "deleted": result.deleted_count, "moved": result.moved_count, "renamed": result.renamed_count, "reordered": result.reordered_count, + "binary": binary_changes.len(), "orphan": result.orphan_count, - "total": result.changes.len(), + "total": result.changes.len() + binary_changes.len(), }, "changes": changes, + "binaryChanges": binary_changes_json, }) } @@ -48,13 +94,21 @@ pub fn format_diff_json(result: &DiffResult) -> String { serde_json::to_string(&diff_json_value(result)).unwrap_or_default() } +pub fn format_diff_json_with_binary_changes( + result: &DiffResult, + binary_changes: &[BinaryFileChange], +) -> String { + serde_json::to_string(&diff_json_value_with_binary_changes(result, binary_changes)) + .unwrap_or_default() +} + #[cfg(test)] mod tests { use super::*; use crate::model::change::{ChangeType, SemanticChange}; #[test] - fn diff_json_value_matches_cli_envelope() { + fn diff_json_value_preserves_public_schema() { let result = DiffResult { changes: vec![SemanticChange { id: "internal-change-id".to_string(), @@ -129,4 +183,53 @@ mod tests { }) ); } + + #[test] + fn diff_json_value_with_binary_changes_matches_cli_envelope() { + let result = DiffResult { + changes: Vec::new(), + file_count: 0, + added_count: 0, + modified_count: 0, + deleted_count: 0, + moved_count: 0, + renamed_count: 0, + reordered_count: 0, + orphan_count: 0, + total_entities_before: 0, + total_entities_after: 0, + }; + let binary_changes = vec![BinaryFileChange { + file_path: "pic.png".to_string(), + status: crate::git::types::FileStatus::Modified, + old_file_path: None, + }]; + + let value = diff_json_value_with_binary_changes(&result, &binary_changes); + + assert_eq!( + value, + json!({ + "summary": { + "fileCount": 1, + "added": 0, + "modified": 0, + "deleted": 0, + "moved": 0, + "renamed": 0, + "reordered": 0, + "binary": 1, + "orphan": 0, + "total": 1, + }, + "changes": [], + "binaryChanges": [{ + "changeType": "binary", + "filePath": "pic.png", + "oldFilePath": null, + "fileStatus": "modified", + }], + }) + ); + } } diff --git a/crates/sem-core/src/git/bridge.rs b/crates/sem-core/src/git/bridge.rs index 1ff3e22..ec85ff8 100644 --- a/crates/sem-core/src/git/bridge.rs +++ b/crates/sem-core/src/git/bridge.rs @@ -487,6 +487,17 @@ impl GitBridge { files } + fn bytes_look_binary(bytes: &[u8], complete: bool) -> bool { + if bytes.iter().any(|byte| *byte == 0) { + return true; + } + + match std::str::from_utf8(bytes) { + Ok(_) => false, + Err(error) => complete || error.error_len().is_some(), + } + } + fn populate_contents( &self, files: &mut [FileChange], @@ -605,14 +616,22 @@ impl GitBridge { fn read_blob_from_tree(&self, tree: &git2::Tree, file_path: &str) -> Option { let entry = tree.get_path(Path::new(file_path)).ok()?; let blob = self.repo.find_blob(entry.id()).ok()?; - std::str::from_utf8(blob.content()) + let bytes = blob.content(); + if blob.is_binary() || Self::bytes_look_binary(bytes, true) { + return None; + } + std::str::from_utf8(bytes) .ok() .map(|s| Self::normalize_line_endings(s.to_string())) } fn read_working_file(&self, file_path: &str) -> Option { let full_path = self.repo_root.join(file_path); - fs::read_to_string(full_path) + let bytes = fs::read(full_path).ok()?; + if Self::bytes_look_binary(&bytes, true) { + return None; + } + String::from_utf8(bytes) .ok() .map(Self::normalize_line_endings) } @@ -621,7 +640,11 @@ impl GitBridge { let index = self.repo.index().ok()?; let entry = index.get_path(Path::new(file_path), 0)?; let blob = self.repo.find_blob(entry.id).ok()?; - std::str::from_utf8(blob.content()) + let bytes = blob.content(); + if blob.is_binary() || Self::bytes_look_binary(bytes, true) { + return None; + } + std::str::from_utf8(bytes) .ok() .map(|s| Self::normalize_line_endings(s.to_string())) } @@ -1192,7 +1215,7 @@ fn normalize_lexical(path: &Path) -> PathBuf { mod tests { use super::*; use crate::model::change::ChangeType; - use crate::parser::differ::compute_semantic_diff; + use crate::parser::differ::{collect_binary_file_changes, compute_semantic_diff}; use crate::parser::plugins::create_default_registry; use git2::{ErrorClass, Oid, Repository, Signature}; use tempfile::TempDir; @@ -1220,6 +1243,34 @@ mod tests { } } + fn commit_binary_file( + repo: &Repository, + file_path: &str, + contents: &[u8], + message: &str, + ) -> Oid { + fs::write(repo.workdir().unwrap().join(file_path), contents).unwrap(); + + let mut index = repo.index().unwrap(); + index.add_path(Path::new(file_path)).unwrap(); + index.write().unwrap(); + + let tree_id = index.write_tree().unwrap(); + let tree = repo.find_tree(tree_id).unwrap(); + let sig = Signature::now("Test User", "test@example.com").unwrap(); + + match repo.head() { + Ok(head) => { + let parent = repo.find_commit(head.target().unwrap()).unwrap(); + repo.commit(Some("HEAD"), &sig, &sig, message, &tree, &[&parent]) + .unwrap() + } + Err(_) => repo + .commit(Some("HEAD"), &sig, &sig, message, &tree, &[]) + .unwrap(), + } + } + #[test] fn porcelain_blame_reports_uncommitted_lines() { let temp = TempDir::new().unwrap(); @@ -1416,6 +1467,81 @@ mod tests { assert!(message.contains("is outside repository")); } + #[test] + fn working_binary_modification_is_reported_as_binary_change() { + let temp = TempDir::new().unwrap(); + let repo = Repository::init(temp.path()).unwrap(); + + commit_binary_file(&repo, "pic.png", b"\0png-v1\0", "init"); + fs::write(temp.path().join("pic.png"), b"\0png-v2\0extra").unwrap(); + + let bridge = GitBridge::open(temp.path()).unwrap(); + let files = bridge.get_changed_files(&DiffScope::Working, &[]).unwrap(); + + assert_eq!(files.len(), 1); + assert_eq!(files[0].file_path, "pic.png"); + assert_eq!(files[0].status, FileStatus::Modified); + assert!(files[0].before_content.is_none()); + assert!(files[0].after_content.is_none()); + + let binary_changes = collect_binary_file_changes(&files); + let registry = create_default_registry(); + let result = compute_semantic_diff(&files, ®istry, None, None); + + assert!(result.changes.is_empty()); + assert_eq!(result.file_count, 0); + assert_eq!(binary_changes.len(), 1); + assert_eq!(binary_changes[0].file_path, "pic.png"); + assert_eq!(binary_changes[0].status, FileStatus::Modified); + } + + #[test] + fn staged_binary_add_and_delete_are_reported_as_binary_changes() { + let temp = TempDir::new().unwrap(); + let repo = Repository::init(temp.path()).unwrap(); + + fs::write(temp.path().join("added.png"), b"\0added-binary\0").unwrap(); + let mut index = repo.index().unwrap(); + index.add_path(Path::new("added.png")).unwrap(); + index.write().unwrap(); + + let bridge = GitBridge::open(temp.path()).unwrap(); + let added_files = bridge.get_changed_files(&DiffScope::Staged, &[]).unwrap(); + assert_eq!(added_files.len(), 1); + assert_eq!(added_files[0].file_path, "added.png"); + assert_eq!(added_files[0].status, FileStatus::Added); + assert!(added_files[0].before_content.is_none()); + assert!(added_files[0].after_content.is_none()); + let added_binary_changes = collect_binary_file_changes(&added_files); + assert_eq!(added_binary_changes.len(), 1); + assert_eq!(added_binary_changes[0].file_path, "added.png"); + + let temp = TempDir::new().unwrap(); + let repo = Repository::init(temp.path()).unwrap(); + commit_binary_file(&repo, "deleted.png", b"\0deleted-binary\0", "init"); + fs::remove_file(temp.path().join("deleted.png")).unwrap(); + let mut index = repo.index().unwrap(); + index.remove_path(Path::new("deleted.png")).unwrap(); + index.write().unwrap(); + + let bridge = GitBridge::open(temp.path()).unwrap(); + let deleted_files = bridge.get_changed_files(&DiffScope::Staged, &[]).unwrap(); + assert_eq!(deleted_files.len(), 1); + assert_eq!(deleted_files[0].file_path, "deleted.png"); + assert_eq!(deleted_files[0].status, FileStatus::Deleted); + assert!(deleted_files[0].before_content.is_none()); + assert!(deleted_files[0].after_content.is_none()); + let deleted_binary_changes = collect_binary_file_changes(&deleted_files); + assert_eq!(deleted_binary_changes.len(), 1); + assert_eq!(deleted_binary_changes[0].file_path, "deleted.png"); + } + + #[test] + fn partial_utf8_boundary_is_not_treated_as_binary() { + assert!(!GitBridge::bytes_look_binary(&[0xe2, 0x82], false)); + assert!(GitBridge::bytes_look_binary(&[0xe2, 0x82], true)); + } + #[test] fn staged_file_rename_is_reported_as_single_rename_with_old_contents() { let temp = TempDir::new().unwrap(); diff --git a/crates/sem-core/src/git/types.rs b/crates/sem-core/src/git/types.rs index bf394b0..12723d8 100644 --- a/crates/sem-core/src/git/types.rs +++ b/crates/sem-core/src/git/types.rs @@ -19,6 +19,17 @@ pub enum FileStatus { Renamed, } +impl std::fmt::Display for FileStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + FileStatus::Added => write!(f, "added"), + FileStatus::Modified => write!(f, "modified"), + FileStatus::Deleted => write!(f, "deleted"), + FileStatus::Renamed => write!(f, "renamed"), + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase", deny_unknown_fields)] pub struct FileChange { diff --git a/crates/sem-core/src/parser/differ.rs b/crates/sem-core/src/parser/differ.rs index 3fb2f0a..8ad97a3 100644 --- a/crates/sem-core/src/parser/differ.rs +++ b/crates/sem-core/src/parser/differ.rs @@ -2,7 +2,7 @@ use rayon::prelude::*; use serde::Serialize; -use crate::git::types::FileChange; +use crate::git::types::{FileChange, FileStatus}; macro_rules! maybe_par_iter { ($slice:expr) => {{ @@ -38,6 +38,42 @@ pub struct DiffResult { pub total_entities_after: usize, } +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct BinaryFileChange { + pub file_path: String, + pub status: FileStatus, + pub old_file_path: Option, +} + +impl From<&FileChange> for BinaryFileChange { + fn from(file: &FileChange) -> Self { + Self { + file_path: file.file_path.clone(), + status: file.status.clone(), + old_file_path: file.old_file_path.clone(), + } + } +} + +pub fn collect_binary_file_changes(file_changes: &[FileChange]) -> Vec { + file_changes + .iter() + .filter(|file| lacks_diffable_content(file)) + .map(BinaryFileChange::from) + .collect() +} + +fn lacks_diffable_content(file: &FileChange) -> bool { + match &file.status { + FileStatus::Added => file.after_content.is_none(), + FileStatus::Deleted => file.before_content.is_none(), + FileStatus::Modified | FileStatus::Renamed => { + file.before_content.is_none() || file.after_content.is_none() + } + } +} + pub fn compute_semantic_diff( file_changes: &[FileChange], registry: &ParserRegistry, @@ -47,6 +83,7 @@ pub fn compute_semantic_diff( // Process files in parallel: each file's entity extraction and matching is independent let per_file_changes: Vec<(String, Vec, usize, usize)> = maybe_par_iter!(file_changes) + .filter(|file| !lacks_diffable_content(file)) .filter_map(|file| { let content_hint = file .after_content diff --git a/crates/sem-mcp/src/server.rs b/crates/sem-mcp/src/server.rs index 15a6ed4..1e84b5c 100644 --- a/crates/sem-mcp/src/server.rs +++ b/crates/sem-mcp/src/server.rs @@ -10,11 +10,11 @@ use rmcp::handler::server::router::tool::ToolRouter; use rmcp::handler::server::wrapper::Parameters; use rmcp::model::{CallToolResult, Content, ServerCapabilities, ServerInfo}; use rmcp::{tool, tool_handler, tool_router, ServerHandler}; -use sem_core::format::json::format_diff_json; +use sem_core::format::json::format_diff_json_with_binary_changes; use sem_core::git::bridge::GitBridge; use sem_core::git::types::{BlameLineInfo, CommitInfo, DiffScope}; use sem_core::model::entity::SemanticEntity; -use sem_core::parser::differ::compute_semantic_diff; +use sem_core::parser::differ::{collect_binary_file_changes, compute_semantic_diff}; use sem_core::parser::graph::EntityGraph; use sem_core::parser::plugins::create_default_registry; use sem_core::parser::registry::ParserRegistry; @@ -501,11 +501,12 @@ impl SemServer { Err(err) => return Ok(tool_error(err.to_string())), }; + let binary_changes = collect_binary_file_changes(&file_changes); let diff_result = compute_semantic_diff(&file_changes, &self.registry, None, None); Ok(CallToolResult::success(vec![Content::text( - format_diff_json(&diff_result), + format_diff_json_with_binary_changes(&diff_result, &binary_changes), )])) }