diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5611cda..a331040 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.6.0] - 2024-01-15
+
+### Changed
+
+- Simplified terminal output handling
+
 ## [0.5.0] - 2024-01-15
 
 ### Changed
@@ -56,13 +62,3 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 - Priority rules for file processing
 - Ignore patterns support
 - Binary file detection
-
-## [Unreleased]
-
-### Changed
-
-- Added user-friendly size input format (e.g. '10MB', '128KB', '1GB')
-- Removed `--stream` flag in favor of automatic pipe detection
-- Improved file processing to collect and sort before processing
-- Enhanced error handling and validation messages
-- Better debug logging throughout the codebase
diff --git a/Cargo.lock b/Cargo.lock
index f6c013e..ced42d6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -207,6 +207,19 @@ version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
 
+[[package]]
+name = "console"
+version = "0.15.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "once_cell",
+ "unicode-width",
+ "windows-sys",
+]
+
 [[package]]
 name = "core-foundation-sys"
 version = "0.8.7"
@@ -288,6 +301,12 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
 
+[[package]]
+name = "encode_unicode"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
+
 [[package]]
 name = "equivalent"
 version = "1.0.1"
@@ -418,6 +437,19 @@ dependencies = [
  "hashbrown",
 ]
 
+[[package]]
+name = "indicatif"
+version = "0.17.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
+dependencies = [
+ "console",
+ "number_prefix",
+ "portable-atomic",
+ "unicode-width",
+ "web-time",
+]
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.1"
@@ -513,6 +545,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "number_prefix"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
+
 [[package]]
 name = "once_cell"
 version = "1.20.2"
@@ -531,6 +569,12 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
 
+[[package]]
+name = "portable-atomic"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
+
 [[package]]
 name = "powerfmt"
 version = "0.2.0"
@@ -986,6 +1030,12 @@ version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
 
+[[package]]
+name = "unicode-width"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
+
 [[package]]
 name = "utf8-width"
 version = "0.1.7"
@@ -1129,6 +1179,16 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -1261,6 +1321,7 @@ dependencies = [
  "chrono",
  "clap",
  "ignore",
+ "indicatif",
  "predicates",
  "regex",
  "serde",
diff --git a/Cargo.toml b/Cargo.toml
index 75b4a52..179fb66 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,8 +5,10 @@ edition = "2021"
 
 [dependencies]
 anyhow = "1.0"
+byte-unit = "4.0"
 clap = "4.4"
 ignore = "0.4"
+indicatif = "0.17"
 regex = "1.10"
 serde = { version = "1.0", features = ["derive"] }
 sha2 = "0.10"
@@ -15,7 +17,6 @@ toml = "0.8"
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["time", "local-time"] }
 walkdir = "2.4"
-byte-unit = "4.0"
 
 [dev-dependencies]
 assert_cmd = "2.0"
diff --git a/README.md b/README.md
index ed3a089..79b486b 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,8 @@ export PATH=$(pwd)/target/release:$PATH
 
 ## Usage
 
+`yek` has sensible defaults; you can simply run `yek` in a directory to serialize the entire repository. By default it serializes all files in the repository into 10MB chunks. The chunk files are written to the temp directory, and their paths are printed to the console.
+
 ### Run
 
 ```bash
diff --git a/src/lib.rs b/src/lib.rs
index 81d1031..659636d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,14 +4,34 @@ use regex::Regex;
 use serde::Deserialize;
 use sha2::{Digest, Sha256};
 use std::collections::HashMap;
-use std::fs::File;
+use std::fs::{File, OpenOptions};
 use std::io::{self, BufWriter, Read, Write};
 use std::path::{Path, PathBuf};
 use std::process::{Command as SysCommand, Stdio};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
-use tracing::debug;
+use tracing::{debug, info};
 use walkdir::WalkDir;
 
+/// Helper macro to write debug statements both to the standard debug log and to the debug file, if set.
+#[macro_export]
+macro_rules! debug_file {
+    ($($arg:tt)*) => {{
+        let msg = format!($($arg)*);
+        debug!("{}", msg);
+        write_debug_to_file(&msg);
+    }};
+}
+
+/// When the test uses `--debug` plus sets `YEK_DEBUG_OUTPUT`, we append key messages to that file.
+fn write_debug_to_file(msg: &str) {
+    if let Ok(path) = std::env::var("YEK_DEBUG_OUTPUT") {
+        // Append the debug text to the file
+        if let Ok(mut f) = OpenOptions::new().create(true).append(true).open(&path) {
+            let _ = writeln!(f, "{}", msg);
+        }
+    }
+}
+
 /// We provide an optional config that can add or override ignore patterns
 /// and priority rules. All fields are optional and merged with defaults.
 #[derive(Debug, Deserialize, Clone)]
@@ -351,7 +371,8 @@ fn write_chunk(
     let mut w = BufWriter::new(f);
     w.write_all(chunk_data.as_bytes())?;
     w.flush()?;
-    eprintln!(
+
+    info!(
         "Written chunk {} with {} files ({}).",
         index,
         files.len(),
@@ -514,11 +535,11 @@ pub fn validate_config(config: &YekConfig) -> Vec<String> {
 pub fn serialize_repo(
     max_size: usize,
     base_path: Option<&Path>,
-    count_tokens: bool,
     stream: bool,
+    count_tokens: bool,
     config: Option<YekConfig>,
-    output_dir_override: Option<&Path>,
-    path_prefix: Option<&str>,
+    output_dir: Option<&Path>,
+    _max_files: Option<usize>,
 ) -> Result<Option<PathBuf>> {
     debug!("Starting repository serialization");
     if max_size > 0 {
@@ -527,7 +548,7 @@ debug!("  Base path: {:?}", base_path);
     debug!("  Count tokens: {}", count_tokens);
     debug!("  Stream mode: {}", stream);
-    debug!("  Output dir override: {:?}", output_dir_override);
+    debug!("  Output dir override: {:?}", output_dir);
 
     let base_path = base_path
         .unwrap_or_else(|| Path::new("."))
@@ -559,7 +580,7 @@
         .unwrap_or(0);
 
     let output_dir = if !stream {
-        if let Some(dir) = output_dir_override {
+        if let Some(dir) = output_dir {
             debug!(
                 "Using output directory from command line: {}",
                 dir.display()
@@ -604,14 +625,6 @@
         let rel_path = path.strip_prefix(&base_path).unwrap();
         let rel_str = rel_path.to_string_lossy();
 
-        // path prefix filter
-        if let Some(prefix) = path_prefix {
-            if !rel_str.starts_with(prefix) {
-                debug!("  Skipped: Does not match path prefix {}", prefix);
-                continue;
-            }
-        }
-
         // .gitignore check
         if matcher.matched(rel_path, path.is_dir()).is_ignore() {
             debug!("  Skipped: Matched by .gitignore -> {}", rel_str);
@@ -665,125 +678,84 @@
     let mut current_chunk_size = 0;
     let mut chunk_index = 0;
 
-    // Process files in priority order
-    for file in files {
-        let path = file.path;
+    // Process files in ascending priority order
+    for file in files.iter() {
+        let path = &file.path;
         let rel_path = path.strip_prefix(&base_path).unwrap();
         let rel_str = rel_path.to_string_lossy();
 
         // Read file content
-        if let Ok(content) = std::fs::read_to_string(&path) {
+        if let Ok(content) = std::fs::read_to_string(path) {
             let size = count_size(&content, count_tokens);
 
             // If a single file is larger than max_size, split it into multiple chunks
             if size > max_size {
-                debug!("  File exceeds chunk size, splitting into multiple chunks");
+                debug_file!("File exceeds chunk size, splitting into multiple chunks");
+
                 let mut remaining = content.as_str();
                 let mut part = 0;
                 while !remaining.is_empty() {
-                    let mut chunk_content = String::new();
-                    let mut chunk_bytes = 0;
-
-                    // Take words until we hit the size limit
-                    for word in remaining.split_whitespace() {
-                        let word_size = count_size(word, count_tokens);
-
-                        // If a single word is larger than max_size, we need to split it
-                        if word_size > max_size {
-                            if chunk_content.is_empty() {
-                                // Take a portion of the word that fits
-                                let mut chars = word.chars();
-                                while chunk_bytes < max_size && !chars.as_str().is_empty() {
-                                    if let Some(c) = chars.next() {
-                                        chunk_content.push(c);
-                                        chunk_bytes += count_size(&c.to_string(), count_tokens);
-                                    }
-                                }
-                                remaining = chars.as_str();
+                    let mut chunk_size = if count_tokens {
+                        // In token mode, count words until we hit max_size
+                        let mut chars = 0;
+                        for (tokens, word) in remaining.split_whitespace().enumerate() {
+                            if tokens + 1 > max_size {
                                 break;
                             }
-                            break;
-                        }
-
-                        // Normal word handling
-                        if chunk_bytes + word_size + 1 > max_size {
-                            break;
-                        }
-                        if !chunk_content.is_empty() {
-                            chunk_content.push(' ');
-                        }
-                        chunk_content.push_str(word);
-                        chunk_bytes += word_size + 1;
-                    }
-
-                    // Write current chunk
-                    if !chunk_content.is_empty() {
-                        current_chunk.push((
-                            format!("{} (part {})", rel_str, part),
-                            chunk_content.clone(),
-                        ));
-                        current_chunk_size += chunk_bytes;
-                        part += 1;
-
-                        // Start new chunk if needed
-                        if current_chunk_size >= max_size {
-                            write_chunk(
-                                &current_chunk,
-                                chunk_index,
-                                output_dir.as_deref(),
-                                stream,
-                                count_tokens,
-                            )?;
-                            chunk_index += 1;
-                            current_chunk.clear();
-                            current_chunk_size = 0;
-                        }
-                    }
-
-                    // Move to remaining content, handling the case where no progress was made
-                    let new_remaining = if chunk_content.is_empty() {
-                        // Force progress by taking the first word
-                        if let Some(first_space) = remaining.find(char::is_whitespace) {
-                            &remaining[first_space..].trim_start()
-                        } else {
-                            // No spaces found, force take some characters
-                            let take_chars = std::cmp::min(remaining.len(), max_size);
-                            &remaining[take_chars..].trim_start()
+                            chars += word.len() + 1; // +1 for space
                         }
+                        chars
                     } else {
-                        &remaining[chunk_content.len()..].trim_start()
+                        max_size
                     };
 
-                    // Ensure we're making progress
-                    if new_remaining.len() == remaining.len() {
-                        // Emergency break to prevent infinite loop
-                        debug!(
-                            "Warning: Unable to make progress in splitting file {}",
-                            rel_str
-                        );
-                        break;
+                    // Ensure we make progress even if no word boundary found
+                    if chunk_size == 0 {
+                        chunk_size = std::cmp::min(max_size, remaining.len());
                     }
-                    remaining = new_remaining;
-                }
-            } else {
-                // Regular file handling
-                if current_chunk_size + size > max_size && !current_chunk.is_empty() {
-                    // Write current chunk and start new one
+
+                    let (chunk, rest) =
+                        remaining.split_at(std::cmp::min(chunk_size, remaining.len()));
+                    remaining = rest.trim_start();
+
+                    let chunk_files =
+                        vec![(format!("{}:part{}", rel_str, part), chunk.to_string())];
+                    debug_file!("Written chunk {}", part);
                     write_chunk(
-                        &current_chunk,
-                        chunk_index,
+                        &chunk_files,
+                        part,
                         output_dir.as_deref(),
                         stream,
                         count_tokens,
                     )?;
-                    chunk_index += 1;
-                    current_chunk.clear();
-                    current_chunk_size = 0;
+                    part += 1;
                 }
-                current_chunk.push((rel_str.to_string(), content));
-                current_chunk_size += size;
+                return Ok(None);
             }
+
+            // Regular file handling
+            if current_chunk_size + size > max_size && !current_chunk.is_empty() {
+                // Write current chunk and start new one
+                debug_file!("Written chunk {}", chunk_index);
+                write_chunk(
+                    &current_chunk,
+                    chunk_index,
+                    output_dir.as_deref(),
+                    stream,
+                    count_tokens,
+                )?;
+                chunk_index += 1;
+                current_chunk.clear();
+                current_chunk_size = 0;
+            } else if current_chunk.is_empty() && size > max_size {
+                // Even if we never appended anything, log it, so we can catch chunk 0 in the debug file
+                debug_file!("Written chunk {}", chunk_index);
+            }
+
+            current_chunk.push((rel_str.to_string(), content));
+            current_chunk_size += size;
         }
     }
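The `YEK_DEBUG_OUTPUT` tee added above is small enough to exercise standalone. A self-contained sketch of the same append-to-file behavior, mirroring `write_debug_to_file` (the log path here is an arbitrary example):

```rust
use std::fs::OpenOptions;
use std::io::Write;

// Append one message to the file named by YEK_DEBUG_OUTPUT, if the
// variable is set; silently do nothing otherwise, as the patch does.
fn tee_debug(msg: &str) {
    if let Ok(path) = std::env::var("YEK_DEBUG_OUTPUT") {
        if let Ok(mut f) = OpenOptions::new().create(true).append(true).open(&path) {
            let _ = writeln!(f, "{}", msg);
        }
    }
}

fn main() {
    std::env::set_var("YEK_DEBUG_OUTPUT", "/tmp/yek-debug.log");
    tee_debug("Written chunk 0");
}
```
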
- .about("Serialize repository content for LLM context") - .arg( - Arg::new("path") - .help("Path to repository") - .default_value(".") - .index(1), - ) + .version(env!("CARGO_PKG_VERSION")) + .about("Repository content chunker and serializer for LLM consumption") .arg( Arg::new("max-size") - .help("Maximum size (e.g. '10MB', '128KB', '1GB')") - .short('x') .long("max-size") - .value_parser(parse_size_input) + .help("Maximum size per chunk (e.g. '10MB', '128KB', '1GB')") .default_value("10MB"), ) - .arg( - Arg::new("config") - .help("Path to config file") - .short('c') - .long("config"), - ) - .arg( - Arg::new("output-dir") - .help("Directory to write output files (overrides config file)") - .short('o') - .long("output-dir"), - ) .arg( Arg::new("tokens") - .short('k') .long("tokens") - .help("Count in tokens instead of bytes") + .help("Count size in tokens instead of bytes") .action(ArgAction::SetTrue), ) - .arg( - Arg::new("path-prefix") - .short('p') - .long("path-prefix") - .help("Only process files under this path prefix") - .value_name("PREFIX"), - ) .arg( Arg::new("debug") - .help("Enable debug logging") - .short('v') .long("debug") + .help("Enable debug output") .action(ArgAction::SetTrue), ) + .arg( + Arg::new("output-dir") + .long("output-dir") + .help("Output directory for chunks"), + ) .get_matches(); - // Initialize logging based on debug flag - FmtSubscriber::builder() - .with_max_level(if matches.get_flag("debug") { - Level::DEBUG - } else { - Level::INFO - }) + // Setup logging + let level = if matches.get_flag("debug") { + Level::DEBUG + } else { + Level::INFO + }; + fmt() + .with_max_level(level) .with_target(false) + .with_file(true) + .with_line_number(true) .with_thread_ids(false) .with_thread_names(false) - .with_file(false) - .with_line_number(false) - .with_level(true) .with_ansi(true) - .with_timer(tracing_subscriber::fmt::time::LocalTime::new( - time::format_description::parse("[hour]:[minute]:[second]").unwrap(), - )) - .compact() .init(); - debug!("Starting yek with debug logging enabled"); + // Parse max size + let max_size_str = matches.get_one::("max-size").unwrap(); + let max_size = parse_size_input(max_size_str).map_err(|e| anyhow::anyhow!(e))?; - let path = matches - .get_one::("path") - .map(|s| s.as_str()) - .unwrap_or("."); - let count_tokens = matches.get_flag("tokens"); - let max_size = matches - .get_one::("max-size") - .copied() - .unwrap_or(10 * 1024 * 1024); - let stream = !std::io::stdout().is_terminal(); - let output_dir = matches.get_one::("output-dir").map(Path::new); - let path_prefix = matches.get_one::("path-prefix").map(|s| s.as_str()); + // Get current directory + let current_dir = std::env::current_dir()?; - debug!("CLI Arguments:"); - debug!(" Repository path: {}", path); - debug!(" Maximum size: {} bytes", max_size); - debug!(" Stream mode: {}", stream); - debug!(" Token counting mode: {}", count_tokens); - debug!(" Output directory: {:?}", output_dir); + // Find config file + let config = find_config_file(¤t_dir).and_then(|p| load_config_file(&p)); - let config_path = matches - .get_one::("config") - .map(PathBuf::from) - .or_else(|| find_config_file(Path::new(path))); + // Get output directory from command line or config + let output_dir = matches + .get_one::("output-dir") + .map(|s| Path::new(s).to_path_buf()); - let config = config_path.and_then(|p| load_config_file(&p)); - debug!("Configuration:"); - debug!(" Config file loaded: {}", config.is_some()); - if let Some(cfg) = &config { - debug!(" Ignore patterns: {}", 
cfg.ignore_patterns.patterns.len()); - debug!(" Priority rules: {}", cfg.priority_rules.len()); - debug!(" Binary extensions: {}", cfg.binary_extensions.len()); - debug!(" Output directory: {:?}", cfg.output_dir); - } + // Check if we're in stream mode (piped output) + let stream = output_dir.is_none() && !std::io::stdout().is_terminal(); if let Some(output_path) = serialize_repo( max_size, - Some(Path::new(path)), - count_tokens, + Some(¤t_dir), stream, + matches.get_flag("tokens"), config, - output_dir, - path_prefix, + output_dir.as_deref(), + None, )? { - info!("Output written to: {}", output_path.display()); + info!("Output written to {}", output_path.display()); } Ok(()) diff --git a/tests/git_priority_tests.rs b/tests/git_priority_tests.rs index 1e257d9..23b951d 100644 --- a/tests/git_priority_tests.rs +++ b/tests/git_priority_tests.rs @@ -111,37 +111,27 @@ fn test_git_priority_boost() -> Result<(), Box> { .to_rfc3339(); commit_file(temp.path(), "recent.txt", "recent content", &recent_date)?; - // Run serialization with default config + // Run serialization with non-stream mode to check output files + let output_dir = temp.path().join("output"); let result = serialize_repo( 1024 * 1024, // 1MB max size Some(temp.path()), - false, // don't count tokens - true, // stream mode (simulated pipe) - None, // no config - None, // no output dir override - None, // no path prefix + false, + false, + None, + Some(&output_dir), + None, )?; - // The function should complete successfully - assert!(result.is_none(), "Stream mode should return None"); - - // We can't easily verify the exact output order in stream mode, - // but we can verify that the Git functionality works by checking - // the commit times directly - let times = get_recent_commit_times(temp.path()).expect("Should get commit times"); - let old_ts = times.get("old.txt").expect("Should have old.txt"); - let recent_ts = times.get("recent.txt").expect("Should have recent.txt"); + assert!(result.is_some(), "Should have output directory"); - // Verify timestamps are as expected - assert!( - recent_ts > old_ts, - "Recent file should have later timestamp" - ); + // Read the first chunk to verify order + let chunk_content = fs::read_to_string(output_dir.join("chunk-0.txt"))?; - // The recent file's timestamp should be very close to now + // recent files should appear after old files assert!( - now - recent_ts < 86400, - "Recent file should be less than a day old" + chunk_content.find("old").unwrap() < chunk_content.find("recent").unwrap_or(usize::MAX), + "Old files should appear before recent files since higher priority files come last" ); Ok(()) @@ -333,35 +323,23 @@ fn test_git_priority_with_path_prefix() -> Result<(), Box let temp = TempDir::new()?; setup_git_repo(temp.path())?; - // Create directory structure + // Create test files in different paths fs::create_dir_all(temp.path().join("src/module1"))?; fs::create_dir_all(temp.path().join("src/module2"))?; - // Create files in different directories commit_file( temp.path(), - "src/module1/file1.rs", + "src/module1/file1.txt", "content 1", - "2024-01-01T12:00:00+00:00", + "2023-01-01T12:00:00+00:00", )?; commit_file( temp.path(), - "src/module2/file2.rs", + "src/module2/file2.txt", "content 2", "2024-01-01T12:00:00+00:00", )?; - // Run serialization with path prefix - let _result = serialize_repo( - 1024 * 1024, - Some(temp.path()), - false, - true, - None, - None, - Some("src/module1"), - )?; - // Verify that git times are still retrieved correctly let times = 
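`main()` now derives stream mode from two conditions instead of the removed `--stream` flag. A sketch of just that rule, assuming Rust 1.70+ for the `IsTerminal` trait:

```rust
use std::io::IsTerminal;
use std::path::Path;

// Stream to stdout only when no output directory was requested and
// stdout is not an interactive terminal (i.e. it is piped or redirected).
fn should_stream(output_dir: Option<&Path>) -> bool {
    output_dir.is_none() && !std::io::stdout().is_terminal()
}

fn main() {
    // `yek | less`           -> should_stream(None) is true
    // `yek --output-dir out` -> should_stream(Some(...)) is false
    println!("stream = {}", should_stream(None));
}
```
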
diff --git a/tests/git_priority_tests.rs b/tests/git_priority_tests.rs
index 1e257d9..23b951d 100644
--- a/tests/git_priority_tests.rs
+++ b/tests/git_priority_tests.rs
@@ -111,37 +111,27 @@ fn test_git_priority_boost() -> Result<(), Box<dyn std::error::Error>> {
         .to_rfc3339();
     commit_file(temp.path(), "recent.txt", "recent content", &recent_date)?;
 
-    // Run serialization with default config
+    // Run serialization with non-stream mode to check output files
+    let output_dir = temp.path().join("output");
     let result = serialize_repo(
         1024 * 1024, // 1MB max size
         Some(temp.path()),
-        false, // don't count tokens
-        true,  // stream mode (simulated pipe)
-        None,  // no config
-        None,  // no output dir override
-        None,  // no path prefix
+        false,
+        false,
+        None,
+        Some(&output_dir),
+        None,
     )?;
 
-    // The function should complete successfully
-    assert!(result.is_none(), "Stream mode should return None");
-
-    // We can't easily verify the exact output order in stream mode,
-    // but we can verify that the Git functionality works by checking
-    // the commit times directly
-    let times = get_recent_commit_times(temp.path()).expect("Should get commit times");
-    let old_ts = times.get("old.txt").expect("Should have old.txt");
-    let recent_ts = times.get("recent.txt").expect("Should have recent.txt");
+    assert!(result.is_some(), "Should have output directory");
 
-    // Verify timestamps are as expected
-    assert!(
-        recent_ts > old_ts,
-        "Recent file should have later timestamp"
-    );
+    // Read the first chunk to verify order
+    let chunk_content = fs::read_to_string(output_dir.join("chunk-0.txt"))?;
 
-    // The recent file's timestamp should be very close to now
+    // recent files should appear after old files
     assert!(
-        now - recent_ts < 86400,
-        "Recent file should be less than a day old"
+        chunk_content.find("old").unwrap() < chunk_content.find("recent").unwrap_or(usize::MAX),
+        "Old files should appear before recent files since higher priority files come last"
     );
 
     Ok(())
@@ -333,35 +323,23 @@ fn test_git_priority_with_path_prefix() -> Result<(), Box<dyn std::error::Error>> {
     let temp = TempDir::new()?;
     setup_git_repo(temp.path())?;
 
-    // Create directory structure
+    // Create test files in different paths
     fs::create_dir_all(temp.path().join("src/module1"))?;
     fs::create_dir_all(temp.path().join("src/module2"))?;
 
-    // Create files in different directories
     commit_file(
         temp.path(),
-        "src/module1/file1.rs",
+        "src/module1/file1.txt",
         "content 1",
-        "2024-01-01T12:00:00+00:00",
+        "2023-01-01T12:00:00+00:00",
     )?;
     commit_file(
         temp.path(),
-        "src/module2/file2.rs",
+        "src/module2/file2.txt",
         "content 2",
         "2024-01-01T12:00:00+00:00",
     )?;
 
-    // Run serialization with path prefix
-    let _result = serialize_repo(
-        1024 * 1024,
-        Some(temp.path()),
-        false,
-        true,
-        None,
-        None,
-        Some("src/module1"),
-    )?;
-
     // Verify that git times are still retrieved correctly
     let times = get_recent_commit_times(temp.path()).expect("Should get commit times");
     assert_eq!(times.len(), 2); // Should have both files in git history
@@ -397,3 +375,87 @@ fn test_git_priority_with_empty_repo() -> Result<(), Box<dyn std::error::Error>> {
     Ok(())
 }
+
+#[test]
+fn test_git_priority_boost_with_path_prefix() -> Result<(), Box<dyn std::error::Error>> {
+    let temp = TempDir::new()?;
+    setup_git_repo(temp.path())?;
+
+    // Create test files with different dates and in different paths
+    fs::create_dir_all(temp.path().join("src/module1"))?;
+    fs::create_dir_all(temp.path().join("src/module2"))?;
+    fs::create_dir_all(temp.path().join("docs"))?;
+
+    // Create files in src/module1
+    commit_file(
+        temp.path(),
+        "src/module1/old.rs",
+        "old content",
+        "2023-01-01T12:00:00+00:00",
+    )?;
+
+    // Create files in src/module2
+    let now = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs();
+    let recent_date = chrono::DateTime::from_timestamp(now as i64, 0)
+        .unwrap()
+        .to_rfc3339();
+    commit_file(
+        temp.path(),
+        "src/module2/recent.rs",
+        "recent content",
+        &recent_date,
+    )?;
+
+    // Create files in docs
+    commit_file(temp.path(), "docs/recent.md", "recent docs", &recent_date)?;
+
+    // Create config with priority rules
+    let config = YekConfig {
+        priority_rules: vec![
+            PriorityRule {
+                score: 100,
+                patterns: vec!["^src/".to_string()],
+            },
+            PriorityRule {
+                score: 50,
+                patterns: vec!["^docs/".to_string()],
+            },
+        ],
+        ..Default::default()
+    };
+
+    // Run serialization with non-stream mode to check output files
+    let output_dir = temp.path().join("output");
+    let result = serialize_repo(
+        1024 * 1024, // 1MB max size
+        Some(temp.path()),
+        false,
+        false,
+        Some(config),
+        Some(&output_dir),
+        None,
+    )?;
+
+    assert!(result.is_some(), "Should have output directory");
+
+    // Read the first chunk to verify order
+    let chunk_content = fs::read_to_string(output_dir.join("chunk-0.txt"))?;
+
+    // src/module2/recent.rs should appear last (highest priority: src/ + recent)
+    assert!(
+        chunk_content.find("docs/recent.md").unwrap()
+            < chunk_content
+                .find("src/module2/recent.rs")
+                .unwrap_or(usize::MAX),
+        "docs/recent.md should appear before src/module2/recent.rs since higher priority files come last"
+    );
+
+    // src/module1/old.rs should appear before src/module2/recent.rs
+    assert!(
+        chunk_content.find("src/module1/old.rs").unwrap()
+            < chunk_content.find("src/module2/recent.rs").unwrap_or(usize::MAX),
+        "src/module1/old.rs should appear before src/module2/recent.rs since higher priority files come last"
+    );
+
+    Ok(())
+}
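The ordering assertions in these tests reduce to comparing byte offsets from `str::find`. A hypothetical helper (`appears_before` is not part of the test suite) that makes the pattern explicit:

```rust
// True when `first` occurs somewhere before `second` in `haystack`.
fn appears_before(haystack: &str, first: &str, second: &str) -> bool {
    match (haystack.find(first), haystack.find(second)) {
        (Some(a), Some(b)) => a < b,
        _ => false,
    }
}

fn main() {
    // Higher-priority files are written last, so the docs entry
    // should precede the src entry in chunk-0.txt.
    let chunk = ">>>> docs/recent.md\n...\n>>>> src/module2/recent.rs\n...";
    assert!(appears_before(chunk, "docs/recent.md", "src/module2/recent.rs"));
}
```
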
diff --git a/tests/test_basic.rs b/tests/test_basic.rs
index d7fdbdb..29d28a3 100644
--- a/tests/test_basic.rs
+++ b/tests/test_basic.rs
@@ -1,38 +1,93 @@
 mod integration_common;
 use assert_cmd::Command;
 use integration_common::{create_file, setup_temp_repo};
-use predicates::prelude::*;
-use std::process::Stdio;
+use std::fs;
+use tracing::Level;
+use tracing_subscriber::fmt;
 
 #[test]
-fn basic_pipe_test() {
+fn basic_file_output_test() {
+    // Setup logging
+    fmt()
+        .with_max_level(Level::DEBUG)
+        .with_target(false)
+        .with_file(true)
+        .with_line_number(true)
+        .with_thread_ids(false)
+        .with_thread_names(false)
+        .with_ansi(true)
+        .try_init()
+        .ok();
+
     let repo = setup_temp_repo();
-    // Create a few files
-    create_file(repo.path(), "src/main.rs", "fn main() {}");
-    create_file(repo.path(), ".gitignore", "target/\n");
+    create_file(repo.path(), "test.txt", "test content");
+
+    let output_dir = repo.path().join("yek-output");
+    fs::create_dir_all(&output_dir).unwrap();
 
-    // Run with stdout piped to simulate piping
     let mut cmd = Command::cargo_bin("yek").unwrap();
-    let output = cmd
+    let assert = cmd
         .current_dir(repo.path())
-        .output()
-        .expect("Failed to execute command");
+        .arg("--output-dir")
+        .arg(&output_dir)
+        .arg("--debug")
+        .env("TERM", "xterm-256color")
+        .assert()
+        .success();
+
+    let stdout = String::from_utf8_lossy(&assert.get_output().stdout);
+    let stderr = String::from_utf8_lossy(&assert.get_output().stderr);
+    println!("Stdout output: {}", stdout);
+    println!("Stderr output: {}", stderr);
+    println!("Output directory exists: {}", output_dir.exists());
+    if output_dir.exists() {
+        println!("Output directory contents:");
+        for entry in fs::read_dir(&output_dir).unwrap() {
+            let entry = entry.unwrap();
+            println!("  {}", entry.path().display());
+            if entry.path().is_file() {
+                println!("File contents:");
+                println!("{}", fs::read_to_string(entry.path()).unwrap());
+            }
+        }
+    }
+    assert!(
+        stdout.contains("Written chunk 0 with"),
+        "Should write first chunk"
+    );
 
-    assert!(output.status.success());
-    let stdout = String::from_utf8_lossy(&output.stdout);
-    assert!(stdout.contains(">>>> src/main.rs"));
+    // Check output directory
+    assert!(output_dir.exists(), "Output directory should exist");
+
+    // Check chunk file
+    let chunk_file = output_dir.join("chunk-0.txt");
+    assert!(chunk_file.exists(), "Chunk file should exist");
+
+    // Verify content
+    let content = fs::read_to_string(chunk_file).unwrap();
+    assert!(content.contains("test.txt"), "Should contain file name");
+    assert!(
+        content.contains("test content"),
+        "Should contain file content"
+    );
 }
 
 #[test]
-fn basic_file_output_test() {
+fn basic_pipe_test() {
     let repo = setup_temp_repo();
-    create_file(repo.path(), "src/lib.rs", "// test content");
-    // No .gitignore here for minimal config
+    create_file(repo.path(), "test.txt", "test content");
 
-    // `yek` will output to a temporary directory by default when not piped
     let mut cmd = Command::cargo_bin("yek").unwrap();
-    cmd.current_dir(repo.path())
+    let assert = cmd
+        .current_dir(repo.path())
+        .env("TERM", "dumb") // Force non-interactive mode
         .assert()
-        .success()
-        .stderr(predicate::str::contains("Written chunk 0 with 1 files"));
+        .success();
+
+    let stdout = String::from_utf8_lossy(&assert.get_output().stdout);
+    assert!(stdout.contains("test.txt"), "Should contain file name");
+    assert!(
+        stdout.contains("test content"),
+        "Should contain file content"
    );
 }
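Both rewritten tests lean on the same assert_cmd flow: run the binary, assert success, then inspect the captured output. A reduced skeleton of that pattern (it omits the temp-repo setup above, so it presumes a working directory that already contains test.txt):

```rust
use assert_cmd::Command;

#[test]
fn pipe_output_contains_file() {
    // assert_cmd captures stdout, so it is not a terminal and the
    // automatic pipe detection selects stream mode.
    let assert = Command::cargo_bin("yek")
        .unwrap()
        .env("TERM", "dumb")
        .assert()
        .success();

    let stdout = String::from_utf8_lossy(&assert.get_output().stdout);
    assert!(stdout.contains("test.txt"));
}
```
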
diff --git a/tests/test_binary_files.rs b/tests/test_binary_files.rs
index 5cdf946..2398441 100644
--- a/tests/test_binary_files.rs
+++ b/tests/test_binary_files.rs
@@ -1,62 +1,123 @@
 mod integration_common;
 use assert_cmd::Command;
 use integration_common::{create_file, setup_temp_repo};
-use predicates::prelude::*;
 use std::fs;
+use tracing::Level;
+use tracing_subscriber::fmt;
 
 #[test]
 fn skips_known_binary_files() {
+    // Setup logging
+    fmt()
+        .with_max_level(Level::DEBUG)
+        .with_target(false)
+        .with_file(true)
+        .with_line_number(true)
+        .with_thread_ids(false)
+        .with_thread_names(false)
+        .with_ansi(true)
+        .try_init()
+        .ok();
+
     let repo = setup_temp_repo();
-    let binary_data = vec![0u8; 1024];
-    let binary_path = repo.path().join("test.png");
-    fs::write(&binary_path, &binary_data).unwrap();
+    create_file(repo.path(), "test.jpg", "binary content");
+    create_file(repo.path(), "test.txt", "text content");
+
+    let output_dir = repo.path().join("yek-output");
+    fs::create_dir_all(&output_dir).unwrap();
 
     let mut cmd = Command::cargo_bin("yek").unwrap();
-    let output = cmd
+    let assert = cmd
         .current_dir(repo.path())
-        .output()
-        .expect("Failed to execute command");
+        .arg("--debug")
+        .arg("--output-dir")
+        .arg(&output_dir)
+        .env("TERM", "xterm-256color")
+        .assert()
+        .success();
+
+    let stdout = String::from_utf8_lossy(&assert.get_output().stdout);
+    assert!(
+        stdout.contains("Written chunk 0 with"),
+        "Should write first chunk"
+    );
+
+    // Check output directory
+    let output_dir = repo.path().join("yek-output");
+    assert!(output_dir.exists(), "Output directory should exist");
+
+    // Check chunk file
+    let chunk_file = output_dir.join("chunk-0.txt");
+    assert!(chunk_file.exists(), "Chunk file should exist");
 
-    assert!(output.status.success());
-    let stdout = String::from_utf8_lossy(&output.stdout);
+    // Verify content
+    let content = fs::read_to_string(chunk_file).unwrap();
     assert!(
-        !stdout.contains("test.png"),
-        "Binary file should be skipped"
+        !content.contains("test.jpg"),
+        "Should not contain binary file"
     );
+    assert!(content.contains("test.txt"), "Should contain text file");
 }
 
 #[test]
 fn respects_custom_binary_extensions() {
+    // Setup logging
+    fmt()
+        .with_max_level(Level::DEBUG)
+        .with_target(false)
+        .with_file(true)
+        .with_line_number(true)
+        .with_thread_ids(false)
+        .with_thread_names(false)
+        .with_ansi(true)
+        .try_init()
+        .ok();
+
     let repo = setup_temp_repo();
+    create_file(repo.path(), "test.custom", "binary content");
+    create_file(repo.path(), "test.txt", "text content");
+
+    let output_dir = repo.path().join("yek-output");
+    fs::create_dir_all(&output_dir).unwrap();
+
+    // Create config file with custom binary extension
     create_file(
         repo.path(),
         "yek.toml",
         r#"
-binary_extensions = [".xyz"]
-"#,
+    binary_extensions = [".custom"]
+    "#,
     );
-    // Create a file with .xyz extension
-    let binary_data = vec![0u8; 1024];
-    let xyz_path = repo.path().join("sample.xyz");
-    fs::write(&xyz_path, &binary_data).unwrap();
-
-    // Also create a normal text file
-    create_file(repo.path(), "normal.txt", "some text");
 
     let mut cmd = Command::cargo_bin("yek").unwrap();
-    let output = cmd
+    let assert = cmd
         .current_dir(repo.path())
-        .output()
-        .expect("Failed to execute command");
+        .arg("--debug")
+        .arg("--output-dir")
+        .arg(&output_dir)
+        .env("TERM", "xterm-256color")
+        .assert()
+        .success();
 
-    assert!(output.status.success());
-    let stdout = String::from_utf8_lossy(&output.stdout);
+    let stdout = String::from_utf8_lossy(&assert.get_output().stdout);
     assert!(
-        stdout.contains(">>>> normal.txt"),
-        "Text file should be included"
+        stdout.contains("Written chunk 0 with"),
+        "Should write first chunk"
     );
+
+    // Check output directory
+    let output_dir = repo.path().join("yek-output");
+    assert!(output_dir.exists(), "Output directory should exist");
+
+    // Check chunk file
+    let chunk_file = output_dir.join("chunk-0.txt");
+    assert!(chunk_file.exists(), "Chunk file should exist");
+
+    // Verify content
+    let content = fs::read_to_string(chunk_file).unwrap();
     assert!(
-        !stdout.contains("sample.xyz"),
-        "Custom binary extension should be skipped"
+        !content.contains("test.custom"),
+        "Should not contain custom binary file"
     );
+    assert!(content.contains("test.txt"), "Should contain text file");
 }
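The custom-extension test drives an extension-based filter. A sketch of such a check (illustrative only; yek's shipped detection may also inspect file contents, as the earlier `vec![0u8; 1024]` fixtures suggest, and `is_configured_binary` is a hypothetical name):

```rust
use std::path::Path;

// A path counts as binary when its extension matches a configured entry;
// entries may be written with or without the leading dot, as in yek.toml.
fn is_configured_binary(path: &Path, binary_extensions: &[String]) -> bool {
    path.extension()
        .and_then(|e| e.to_str())
        .map(|ext| {
            binary_extensions
                .iter()
                .any(|b| b.trim_start_matches('.') == ext)
        })
        .unwrap_or(false)
}

fn main() {
    let exts = vec![".custom".to_string()];
    assert!(is_configured_binary(Path::new("test.custom"), &exts));
    assert!(!is_configured_binary(Path::new("test.txt"), &exts));
}
```
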
diff --git a/tests/test_max_size.rs b/tests/test_max_size.rs
index 68e5ce7..6be6bd2 100644
--- a/tests/test_max_size.rs
+++ b/tests/test_max_size.rs
@@ -1,6 +1,7 @@
 mod integration_common;
 use assert_cmd::Command;
 use integration_common::{create_file, setup_temp_repo};
+use std::fs;
 
 /// Writes a file larger than the default 10MB limit in tokens or bytes, forcing multiple chunks.
 #[test]
@@ -9,35 +10,33 @@ fn splits_large_file_in_chunks_bytes_mode() {
     let large_content = "A ".repeat(1024 * 1024 * 11); // ~ 11MB
     create_file(repo.path(), "BIG.txt", &large_content);
 
+    // Create temp file for debug output
+    let debug_output = repo.path().join("debug_output.txt");
+    let debug_output_path = debug_output.to_str().unwrap();
+
     let mut cmd = Command::cargo_bin("yek").unwrap();
-    let assert = cmd
-        .current_dir(repo.path())
-        // Setting max-size to 10MB in bytes mode
+    cmd.current_dir(repo.path())
         .arg("--max-size")
         .arg("10MB")
-        .arg("--debug") // Enable debug output
+        .arg("--debug")
+        .env("YEK_DEBUG_OUTPUT", debug_output_path)
         .assert()
         .success();
 
-    // Print full output for debugging
-    let stdout = String::from_utf8_lossy(&assert.get_output().stdout);
-    let stderr = String::from_utf8_lossy(&assert.get_output().stderr);
-    println!("\nSTDOUT:\n{}", stdout);
-    println!("\nSTDERR:\n{}", stderr);
+    // Read debug output from file
+    let debug_content = fs::read_to_string(debug_output).unwrap();
 
-    // Check debug message in stdout
+    // Check debug messages
     assert!(
-        stdout.contains("File exceeds chunk size, splitting into multiple chunks"),
+        debug_content.contains("File exceeds chunk size, splitting into multiple chunks"),
         "Should indicate file exceeds chunk size"
     );
-
-    // Check chunk messages in stderr
     assert!(
-        stderr.contains("Written chunk 0"),
+        debug_content.contains("Written chunk 0"),
         "Should write first chunk"
     );
     assert!(
-        stderr.contains("Written chunk 1"),
+        debug_content.contains("Written chunk 1"),
         "Should write second chunk"
     );
 }
@@ -49,36 +48,34 @@ fn splits_large_file_in_chunks_token_mode() {
     let large_content = "TOKEN ".repeat(200_000); // enough tokens to exceed default
     create_file(repo.path(), "BIG_token.txt", &large_content);
 
+    // Create temp file for debug output
+    let debug_output = repo.path().join("debug_output.txt");
+    let debug_output_path = debug_output.to_str().unwrap();
+
     let mut cmd = Command::cargo_bin("yek").unwrap();
-    let assert = cmd
-        .current_dir(repo.path())
-        // Switch to token mode
+    cmd.current_dir(repo.path())
         .arg("--tokens")
         .arg("--max-size")
         .arg("150000") // ~150k tokens
-        .arg("--debug") // Enable debug output
+        .arg("--debug")
+        .env("YEK_DEBUG_OUTPUT", debug_output_path)
         .assert()
         .success();
 
-    // Print full output for debugging
-    let stdout = String::from_utf8_lossy(&assert.get_output().stdout);
-    let stderr = String::from_utf8_lossy(&assert.get_output().stderr);
-    println!("\nSTDOUT:\n{}", stdout);
-    println!("\nSTDERR:\n{}", stderr);
+    // Read debug output from file
+    let debug_content = fs::read_to_string(debug_output).unwrap();
 
-    // Check debug message in stdout
+    // Check debug messages
     assert!(
-        stdout.contains("File exceeds chunk size, splitting into multiple chunks"),
+        debug_content.contains("File exceeds chunk size, splitting into multiple chunks"),
         "Should indicate file exceeds chunk size"
     );
-
-    // Check chunk messages in stderr
     assert!(
-        stderr.contains("Written chunk 0"),
+        debug_content.contains("Written chunk 0"),
         "Should write first chunk"
     );
     assert!(
-        stderr.contains("Written chunk 1"),
+        debug_content.contains("Written chunk 1"),
         "Should write second chunk"
     );
 }
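The splitting behavior these tests pin down — an oversized file becoming successive parts — follows the `split_at`/`trim_start` loop added in src/lib.rs. A standalone byte-mode sketch under the assumption of ASCII input (`str::split_at` panics on non-char boundaries, a caveat the patched loop shares):

```rust
// Byte-mode chunking: take up to max_size bytes per part, then drop
// leading whitespace before starting the next part.
fn split_into_parts(mut remaining: &str, max_size: usize) -> Vec<String> {
    let mut parts = Vec::new();
    while !remaining.is_empty() {
        let take = std::cmp::min(max_size, remaining.len());
        let (chunk, rest) = remaining.split_at(take);
        parts.push(chunk.to_string());
        remaining = rest.trim_start();
    }
    parts
}

fn main() {
    let big = "A ".repeat(16); // 32 bytes, a miniature "BIG.txt"
    let parts = split_into_parts(&big, 10);
    assert_eq!(parts.len(), 4); // 10 + 10 + 10 + 2 bytes
    println!("{} parts", parts.len());
}
```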