From 9343ffe13246fe524a280f85bbca78807e41ec3e Mon Sep 17 00:00:00 2001 From: Mohsen Azimi Date: Sun, 19 Jan 2025 10:32:01 +0700 Subject: [PATCH 1/7] feat: add performance test --- tests/test_perf.rs | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tests/test_perf.rs diff --git a/tests/test_perf.rs b/tests/test_perf.rs new file mode 100644 index 0000000..fca904b --- /dev/null +++ b/tests/test_perf.rs @@ -0,0 +1,42 @@ +use std::fs; +use std::path::Path; +use std::time::Instant; +use yek::serialize_repo; + +#[test] +fn test_serialization_performance() { + // Create test data directory + let test_dir = "test_perf_data"; + fs::create_dir_all(test_dir).unwrap(); + + // Create test files of different sizes + let sizes = vec![1024, 1024 * 1024, 10 * 1024 * 1024]; // 1KB, 1MB, 10MB + + for size in sizes { + let filename = format!("{}/file_{}_bytes.txt", test_dir, size); + let data = vec![b'a'; size]; + fs::write(&filename, &data).unwrap(); + + // Measure serialization time + let start = Instant::now(); + serialize_repo( + size, // max_size + Some(Path::new(test_dir)), // base_path + false, // stream + false, // count_tokens + None, // config + Some(Path::new("perf_output")), // output_dir + None, // max_files + ) + .unwrap(); + let duration = start.elapsed(); + + println!("Serializing {}B took: {:?}", size, duration); + + // Cleanup + fs::remove_dir_all("perf_output").unwrap(); + } + + // Final cleanup + fs::remove_dir_all(test_dir).unwrap(); +} From d74b60a1b431911e97b6cc9ee2ea400905414b28 Mon Sep 17 00:00:00 2001 From: Mohsen Azimi Date: Sun, 19 Jan 2025 11:21:08 +0700 Subject: [PATCH 2/7] feat: parallel execution for better perf --- Cargo.lock | 282 +++++++++++++++++++++++++++++++- Cargo.toml | 7 + README.md | 72 +++------ benches/serialization.rs | 50 ++++++ src/lib.rs | 337 +++++++++++++++++++++------------------ src/parallel.rs | 337 +++++++++++++++++++++++++++++++++++++++ tests/test_perf.rs | 96 ++++++++--- 7 files changed, 950 insertions(+), 231 deletions(-) create mode 100644 benches/serialization.rs create mode 100644 src/parallel.rs diff --git a/Cargo.lock b/Cargo.lock index 11702f8..7184df0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anstream" version = "0.6.18" @@ -62,7 +68,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -72,7 +78,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -145,6 +151,12 @@ dependencies = [ "utf8-width", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cc" version = "1.2.9" @@ -174,6 +186,33 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" 
+dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "clap" version = "4.5.26" @@ -217,7 +256,7 @@ dependencies = [ "libc", "once_cell", "unicode-width", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -235,6 +274,64 @@ dependencies = [ "libc", ] +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -254,12 +351,27 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-common" version = "0.1.6" @@ -301,6 +413,12 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + [[package]] name = "encode_unicode" version = "1.0.0" @@ -330,7 +448,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -382,12 +500,34 @@ 
dependencies = [ "regex-syntax", ] +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + [[package]] name = "hashbrown" version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "iana-time-zone" version = "0.1.61" @@ -450,12 +590,32 @@ dependencies = [ "web-time", ] +[[package]] +name = "is-terminal" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" +dependencies = [ + "hermit-abi 0.4.0", + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.14" @@ -536,6 +696,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi 0.3.9", + "libc", +] + [[package]] name = "num_threads" version = "0.1.7" @@ -557,6 +727,12 @@ version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +[[package]] +name = "oorandom" +version = "11.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" + [[package]] name = "overload" version = "0.1.1" @@ -569,6 +745,34 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + [[package]] name = "portable-atomic" version = "1.10.0" @@ -629,6 +833,26 @@ dependencies = [ 
"proc-macro2", ] +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "regex" version = "1.11.1" @@ -668,7 +892,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -721,6 +945,18 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_json" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "336a0c23cf42a38d9eaa7cd22c7040d04e1228a19a933890805ffd00a16437d2" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + [[package]] name = "serde_spanned" version = "0.6.8" @@ -868,7 +1104,7 @@ dependencies = [ "getrandom", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -920,6 +1156,16 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "toml" version = "0.8.19" @@ -1179,6 +1425,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "web-time" version = "1.1.0" @@ -1211,7 +1467,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1229,6 +1485,15 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.59.0" @@ -1320,8 +1585,11 @@ dependencies = [ "byte-unit", "chrono", "clap", + "criterion", + "crossbeam", "ignore", "indicatif", + "num_cpus", "predicates", "regex", "serde", diff --git a/Cargo.toml b/Cargo.toml index e734cd9..c991bff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,8 +7,10 @@ edition = "2021" anyhow = "1.0" byte-unit = "4.0" clap = "4.4" +crossbeam = "0.8" ignore = "0.4" indicatif = "0.17" +num_cpus = "1.15" regex = "1.10" serde = { version = "1.0", features = ["derive"] } sha2 = "0.10" @@ -23,6 +25,11 @@ assert_cmd = "2.0" chrono = "0.4" predicates = "3.0" tempfile = "3.9" +criterion = "0.5" + +[[bench]] +name = "serialization" +harness = false [profile.release] opt-level = 3 diff --git a/README.md b/README.md index 4a998cb..9ccea3c 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ A fast Rust based tool to read text-based files in a repository or directory, ch - Supports processing multiple directories in a single command. - Configurable via a `yek.toml` file. 
+Yek ([يک](https://fa.wikipedia.org/wiki/۱)) means "One" in Farsi/Persian. + ## Installation ### Via Homebrew (recommended for macOS) @@ -58,31 +60,43 @@ export PATH=$(pwd)/target/release:$PATH ### Examples -Process current directory: +Process current directory and write to temp directory: ```bash yek ``` -Process specific directories: +Pipe output to clipboard (macOS): ```bash -yek src/ tests/ +yek src/ | pbcopy ``` -Process multiple repositories: +Cap the max size to 128K tokens and only process the `src` directory: ```bash -yek ~/code/project1 ~/code/project2 +yek --max-size 128000 --tokens src/ ``` -Pipe output to clipboard: +Cap the max size to 100KB and only process the `src` directory, writing to a specific directory: ```bash -yek src/ | pbcopy +yek --max-size 100KB --output-dir /tmp/yek src/ +``` + +Process multiple directories: + +```bash +yek src/ tests/ ``` -### Run +Process multiple repositories: + +```bash +yek ~/code/project1 ~/code/project2 +``` + +### Help ```bash yek --help @@ -103,44 +117,6 @@ Options: -V, --version Print version ``` -## Examples - -- Serialize entire repository into chunks of 10MB (default): - -```bash -yek -``` - -- Split repository into chunks of 128MB: - -```bash -yek --max-size 128MB -``` - -- Split into chunks by token count instead of bytes: - -```bash -yek --tokens --max-size 128000 -``` - -- Serialize only files under a specific path: - -```bash -yek src/app -``` - -- Process multiple directories: - -```bash -yek src/app src/lib -``` - -- Stream output to another command: - -```bash -yek | pbcopy -``` - ## Configuration File You can place a file called `yek.toml` at your project root or pass a custom path via `--config`. The configuration file allows you to: @@ -150,7 +126,9 @@ You can place a file called `yek.toml` at your project root or pass a custom pat 3. Add additional binary file extensions to ignore (extends the built-in list) 4. Configure Git-based priority boost -Example configuration: +### Example `yek.toml` + +This is optional, you can configure the `yek.toml` file at the root of your project. 
```toml # Add patterns to ignore (in addition to .gitignore) diff --git a/benches/serialization.rs b/benches/serialization.rs new file mode 100644 index 0000000..315db18 --- /dev/null +++ b/benches/serialization.rs @@ -0,0 +1,50 @@ +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use std::fs; +use std::path::Path; +use tempfile::TempDir; +use yek::serialize_repo; + +fn create_test_data(dir: &Path, size: usize) { + let filename = dir.join(format!("file_{}_bytes.txt", size)); + let data = vec![b'a'; size]; + fs::write(&filename, &data).unwrap(); +} + +fn bench_serialization(c: &mut Criterion) { + let mut group = c.benchmark_group("serialization"); + group.sample_size(10); // Number of samples to collect + + // Test different file sizes + let sizes = vec![ + 1024, // 1KB + 1024 * 1024, // 1MB + 10 * 1024 * 1024, // 10MB + ]; + + for size in sizes { + // Create a new temporary directory for each benchmark + let temp_dir = TempDir::new().unwrap(); + let output_dir = temp_dir.path().join("output"); + create_test_data(temp_dir.path(), size); + + group.bench_with_input(BenchmarkId::new("file_size", size), &size, |b, &size| { + b.iter(|| { + serialize_repo( + black_box(size), + Some(temp_dir.path()), + false, + false, + None, + Some(&output_dir), + None, + ) + .unwrap(); + fs::remove_dir_all(&output_dir).unwrap(); + }); + }); + } + group.finish(); +} + +criterion_group!(benches, bench_serialization); +criterion_main!(benches); diff --git a/src/lib.rs b/src/lib.rs index 0b6e5e5..20a8861 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,8 @@ use std::path::{Path, PathBuf}; use std::process::{Command as SysCommand, Stdio}; use tracing::{debug, info}; use walkdir::WalkDir; +mod parallel; +use parallel::process_files_parallel; /// Helper macro to write debug statements both to standard debug log and to debug file if set. #[macro_export] @@ -192,7 +194,11 @@ fn build_final_config(cfg: Option) -> FinalConfig { merged_ignore.push(reg); } } - // Merge or add new priority rules + // Clear default priority rules if user provides their own + if !user_cfg.priority_rules.is_empty() { + merged_priority.clear(); + } + // Add user priority rules for user_rule in user_cfg.priority_rules { if user_rule.patterns.is_empty() { continue; @@ -348,14 +354,15 @@ pub fn get_file_priority( _ignore_pats: &[Regex], prio_list: &[PriorityPattern], ) -> i32 { - for prio in prio_list { + // Loop from highest score → lowest + for prio in prio_list.iter().rev() { for pat in &prio.patterns { if pat.is_match(rel_str) { return prio.score; } } } - 40 // fallback + 0 // fallback if nothing matches - lower than any user-defined priority } /// Get the commit time of the most recent change to each file. 
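[Editorial aside: the hunk above reorders `get_file_priority` — rules are now scanned from the highest score down, and the fallback drops from 40 to 0. A minimal sketch of those semantics, assuming `prio_list` stays sorted by ascending score as the `.rev()` loop implies; the rule patterns and scores below are made up for illustration:]

```rust
use regex::Regex;

struct PriorityPattern {
    score: i32,
    patterns: Vec<Regex>,
}

/// Highest-scored rule wins; unmatched files fall back to 0.
fn priority(rel_path: &str, prio_list: &[PriorityPattern]) -> i32 {
    // prio_list is assumed sorted ascending by score, so iterate in reverse.
    for prio in prio_list.iter().rev() {
        if prio.patterns.iter().any(|p| p.is_match(rel_path)) {
            return prio.score;
        }
    }
    0
}

fn main() {
    // Hypothetical rules: *.rs scores 10, anything under src/ scores 100.
    let rules = vec![
        PriorityPattern { score: 10, patterns: vec![Regex::new(r"\.rs$").unwrap()] },
        PriorityPattern { score: 100, patterns: vec![Regex::new(r"^src/").unwrap()] },
    ];
    // "src/lib.rs" matches both rules; the reverse scan returns 100, not 10.
    assert_eq!(priority("src/lib.rs", &rules), 100);
    // Unmatched files rank below anything a user explicitly prioritized.
    assert_eq!(priority("assets/logo.png", &rules), 0);
}
```

[Returning 0 for unmatched files is what guarantees they sort below every user-defined rule.]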
@@ -531,165 +538,188 @@ pub fn serialize_repo( None }; - // Collect files with their priorities - let mut files: Vec = Vec::new(); - let mut total_size = 0; - let mut current_chunk = 0; - let mut current_chunk_files = Vec::new(); - - // Walk directory tree - for entry in WalkDir::new(base_path) - .follow_links(true) - .into_iter() - .filter_map(|e| e.ok()) - { - let path = entry.path(); - if !path.is_file() { - continue; - } - - // Get path relative to base - let rel_path = path.strip_prefix(base_path).unwrap_or(path); - let rel_str = rel_path.to_string_lossy(); - - // Normalize path separators to forward slashes for consistent pattern matching - #[cfg(windows)] - let rel_str = rel_str.replace('\\', "/"); - - // Skip if matched by gitignore - #[cfg(windows)] - let gitignore_path = rel_path - .to_str() - .map(|s| s.replace('\\', "/")) - .map(PathBuf::from) - .unwrap_or(rel_path.to_path_buf()); - #[cfg(not(windows))] - let gitignore_path = rel_path.to_path_buf(); - - if gitignore.matched(&gitignore_path, false).is_ignore() { - debug!("Skipping {} - matched by gitignore", rel_str); - continue; - } + if stream { + // For streaming, we still use the old single-threaded approach + let mut files: Vec = Vec::new(); + let mut total_size = 0; + let mut current_chunk = 0; + let mut current_chunk_files = Vec::new(); + + // Walk directory tree + for entry in WalkDir::new(base_path) + .follow_links(true) + .into_iter() + .filter_map(|e| e.ok()) + { + let path = entry.path(); + if !path.is_file() { + continue; + } - // Skip if matched by our ignore patterns - let mut skip = false; - #[cfg(windows)] - let pattern_path = rel_str.replace('\\', "/"); - #[cfg(not(windows))] - let pattern_path = rel_str.to_string(); + // Get path relative to base + let rel_path = path.strip_prefix(base_path).unwrap_or(path); + let rel_str = rel_path.to_string_lossy(); + + // Normalize path separators to forward slashes for consistent pattern matching + #[cfg(windows)] + let rel_str = rel_str.replace('\\', "/"); + + // Skip if matched by gitignore + #[cfg(windows)] + let gitignore_path = rel_path + .to_str() + .map(|s| s.replace('\\', "/")) + .map(PathBuf::from) + .unwrap_or(rel_path.to_path_buf()); + #[cfg(not(windows))] + let gitignore_path = rel_path.to_path_buf(); + + if gitignore.matched(&gitignore_path, false).is_ignore() { + debug!("Skipping {} - matched by gitignore", rel_str); + continue; + } - for pat in &final_config.ignore_patterns { - if pat.is_match(&pattern_path) { - debug!("Skipping {} - matched ignore pattern", rel_str); - skip = true; - break; + // Skip if matched by our ignore patterns + let mut skip = false; + #[cfg(windows)] + let pattern_path = rel_str.replace('\\', "/"); + #[cfg(not(windows))] + let pattern_path = rel_str.to_string(); + + for pat in &final_config.ignore_patterns { + if pat.is_match(&pattern_path) { + debug!("Skipping {} - matched ignore pattern", rel_str); + skip = true; + break; + } + } + if skip { + continue; } - } - if skip { - continue; - } - // Calculate priority score - let mut priority = get_file_priority( - &pattern_path, - &final_config.ignore_patterns, - &final_config.priority_list, - ); + // Calculate priority score + let mut priority = get_file_priority( + &pattern_path, + &final_config.ignore_patterns, + &final_config.priority_list, + ); - // Apply rank-based boost if available - if let Some(ref boost_map) = recentness_boost { - if let Some(boost) = boost_map.get(&pattern_path) { - priority += *boost; + // Apply rank-based boost if available + if let Some(ref boost_map) = 
recentness_boost { + if let Some(boost) = boost_map.get(&pattern_path) { + priority += *boost; + } } - } - files.push(FileEntry { - path: path.to_path_buf(), - priority, - }); - } + files.push(FileEntry { + path: path.to_path_buf(), + priority, + }); + } - // Sort files by priority (ascending) so higher priority files come last - files.sort_by(|a, b| a.priority.cmp(&b.priority)); + // Sort files by priority (ascending) so higher priority files come last + files.sort_by(|a, b| a.priority.cmp(&b.priority)); - // Process files in sorted order - for file in files { - let path = file.path; - let rel_path = path.strip_prefix(base_path).unwrap_or(&path); - let rel_str = rel_path.to_string_lossy(); + // Process files in sorted order + for file in files { + let path = file.path; + let rel_path = path.strip_prefix(base_path).unwrap_or(&path); + let rel_str = rel_path.to_string_lossy(); - // Skip binary files - if let Some(ref cfg) = config { - if !is_text_file(&path, &cfg.binary_extensions) { + // Skip binary files + if let Some(ref cfg) = config { + if !is_text_file(&path, &cfg.binary_extensions) { + debug!("Skipping binary file: {}", rel_str); + continue; + } + } else if !is_text_file(&path, &[]) { debug!("Skipping binary file: {}", rel_str); continue; } - } else if !is_text_file(&path, &[]) { - debug!("Skipping binary file: {}", rel_str); - continue; - } - // Read file content - let content = match fs::read_to_string(&path) { - Ok(c) => c, - Err(e) => { - debug!("Failed to read {}: {}", rel_str, e); + // Read file content + let content = match fs::read_to_string(&path) { + Ok(c) => c, + Err(e) => { + debug!("Failed to read {}: {}", rel_str, e); + continue; + } + }; + + let size = count_size(&content, count_tokens); + if size == 0 { + debug!("Skipping empty file: {}", rel_str); continue; } - }; - let size = count_size(&content, count_tokens); - if size == 0 { - debug!("Skipping empty file: {}", rel_str); - continue; - } - - // If a single file is larger than max_size, split it into multiple chunks - if size > max_size { - debug_file!("File exceeds chunk size, splitting into multiple chunks"); - let mut remaining = content.as_str(); - let mut part = 0; - - while !remaining.is_empty() { - let mut chunk_size = if count_tokens { - // In token mode, count words until we hit max_size - let mut chars = 0; - for (tokens, word) in remaining.split_whitespace().enumerate() { - if tokens + 1 > max_size { - break; + // If a single file is larger than max_size, split it into multiple chunks + if size > max_size { + debug_file!("File exceeds chunk size, splitting into multiple chunks"); + let mut remaining = content.as_str(); + let mut part = 0; + + while !remaining.is_empty() { + let mut chunk_size = if count_tokens { + // In token mode, count words until we hit max_size + let mut chars = 0; + for (tokens, word) in remaining.split_whitespace().enumerate() { + if tokens + 1 > max_size { + break; + } + chars += word.len() + 1; // +1 for space } - chars += word.len() + 1; // +1 for space + chars + } else { + max_size + }; + + // Ensure we make progress even if no word boundary found + if chunk_size == 0 { + chunk_size = std::cmp::min(max_size, remaining.len()); } - chars - } else { - max_size - }; - - // Ensure we make progress even if no word boundary found - if chunk_size == 0 { - chunk_size = std::cmp::min(max_size, remaining.len()); - } - let (chunk, rest) = remaining.split_at(std::cmp::min(chunk_size, remaining.len())); - remaining = rest.trim_start(); + let (chunk, rest) = + 
remaining.split_at(std::cmp::min(chunk_size, remaining.len())); + remaining = rest.trim_start(); + + let chunk_files = + vec![(format!("{}:part{}", rel_str, part), chunk.to_string())]; + debug_file!("Written chunk {}", part); + write_chunk( + &chunk_files, + part, + output_dir.as_deref(), + stream, + count_tokens, + )?; + part += 1; + } + continue; + } - let chunk_files = vec![(format!("{}:part{}", rel_str, part), chunk.to_string())]; - debug_file!("Written chunk {}", part); + // Check if adding this file would exceed chunk size + if total_size + size > max_size && !current_chunk_files.is_empty() { + // Write current chunk write_chunk( - &chunk_files, - part, + ¤t_chunk_files, + current_chunk, output_dir.as_deref(), stream, count_tokens, )?; - part += 1; + debug_file!("Written chunk {}", current_chunk); + current_chunk += 1; + current_chunk_files.clear(); + total_size = 0; } - continue; + + // Add file to current chunk + current_chunk_files.push((rel_str.to_string(), content)); + total_size += size; } - // Check if adding this file would exceed chunk size - if total_size + size > max_size && !current_chunk_files.is_empty() { - // Write current chunk + // Write final chunk if any files remain + if !current_chunk_files.is_empty() { write_chunk( ¤t_chunk_files, current_chunk, @@ -698,32 +728,23 @@ pub fn serialize_repo( count_tokens, )?; debug_file!("Written chunk {}", current_chunk); - current_chunk += 1; - current_chunk_files.clear(); - total_size = 0; } - // Add file to current chunk - current_chunk_files.push((rel_str.to_string(), content)); - total_size += size; - } - - // Write final chunk if any files remain - if !current_chunk_files.is_empty() { - write_chunk( - ¤t_chunk_files, - current_chunk, - output_dir.as_deref(), - stream, - count_tokens, - )?; - debug_file!("Written chunk {}", current_chunk); - } - - if stream { Ok(None) + } else if let Some(out_dir) = output_dir { + // Use parallel processing for non-streaming mode + process_files_parallel( + base_path, + max_size, + &out_dir, + config.as_ref(), + &final_config.ignore_patterns, + &final_config.priority_list, + recentness_boost.as_ref(), + )?; + Ok(Some(out_dir)) } else { - Ok(output_dir) + Ok(None) } } @@ -802,7 +823,7 @@ fn compute_recentness_boost( return HashMap::new(); } - // Sort by ascending commit time + // Sort by ascending commit time => first is oldest let mut sorted: Vec<(&String, &u64)> = commit_times.iter().collect(); sorted.sort_by_key(|(_, t)| **t); @@ -819,8 +840,8 @@ fn compute_recentness_boost( let mut result = HashMap::new(); for (i, (path, _time)) in sorted.iter().enumerate() { - let rank = i as f64 / last_index; // 0.0..1.0 - let boost = (rank * max_boost as f64).round() as i32; + let rank = i as f64 / last_index; // 0.0..1.0 (older files get lower rank) + let boost = (rank * max_boost as f64).round() as i32; // Newer files get higher boost result.insert((*path).clone(), boost); } result diff --git a/src/parallel.rs b/src/parallel.rs new file mode 100644 index 0000000..e110de7 --- /dev/null +++ b/src/parallel.rs @@ -0,0 +1,337 @@ +use crate::is_text_file; +use crate::{get_file_priority, get_recent_commit_times, PriorityPattern, YekConfig}; +use anyhow::Result; +use crossbeam::channel::{bounded, Receiver, Sender}; +use ignore::{gitignore::GitignoreBuilder, WalkBuilder}; +use num_cpus; +use regex::Regex; +use std::collections::HashMap; +use std::fs; +use std::io::{BufReader, Read}; +use std::path::{Path, PathBuf}; +use std::thread; +use std::time::SystemTime; +use tracing::{debug, info}; + +/// Represents a 
chunk of text read from one file +#[derive(Debug)] +pub struct FileChunk { + pub priority: i32, + pub file_index: usize, + pub part_index: usize, + pub rel_path: String, + pub content: String, +} + +/// File entry with priority for sorting +#[derive(Debug, Clone)] +struct FileEntry { + path: PathBuf, + priority: i32, + file_index: usize, +} + +/// Reads a file and determines if it's likely binary by checking for null bytes +fn is_likely_binary(path: &Path) -> Result { + let f = fs::File::open(path)?; + let mut reader = BufReader::new(f); + let mut buf = [0; 4096]; + let n = reader.read(&mut buf)?; + Ok(buf[..n].contains(&0)) +} + +/// Reads and chunks a single file, sending chunks through the channel +fn read_and_send_chunks( + file_entry: FileEntry, + base_path: &Path, + tx: &Sender, + max_size: usize, +) -> Result<()> { + // Skip if binary + if is_likely_binary(&file_entry.path)? { + return Ok(()); + } + + // Read file content + let content = fs::read_to_string(&file_entry.path)?; + if content.is_empty() { + return Ok(()); + } + + // Get relative path for display + let rel_path = file_entry + .path + .strip_prefix(base_path) + .unwrap_or(&file_entry.path) + .to_string_lossy() + .into_owned(); + + // If smaller than max_size, send as single chunk + if content.len() <= max_size { + let chunk = FileChunk { + priority: file_entry.priority, + file_index: file_entry.file_index, + part_index: 0, + rel_path, + content, + }; + tx.send(chunk).ok(); + return Ok(()); + } + + // Otherwise split into chunks + let mut start = 0; + let mut part_index = 0; + let bytes = content.as_bytes(); + + while start < bytes.len() { + let end = (start + max_size).min(bytes.len()); + let slice = &bytes[start..end]; + let chunk_str = String::from_utf8_lossy(slice).into_owned(); + + let chunk = FileChunk { + priority: file_entry.priority, + file_index: file_entry.file_index, + part_index, + rel_path: rel_path.clone(), + content: chunk_str, + }; + + tx.send(chunk).ok(); + start = end; + part_index += 1; + } + + Ok(()) +} + +/// Main parallel processing function that coordinates workers and aggregator +pub fn process_files_parallel( + base_dir: &Path, + max_size: usize, + output_dir: &Path, + config: Option<&YekConfig>, + ignore_patterns: &[Regex], + priority_list: &[PriorityPattern], + recentness_boost: Option<&HashMap>, +) -> Result<()> { + // Create output directory + fs::create_dir_all(output_dir)?; + + // Collect and sort files by priority + let files = collect_files( + base_dir, + config, + ignore_patterns, + priority_list, + recentness_boost, + )?; + if files.is_empty() { + return Ok(()); + } + + // Create channels for worker→aggregator communication + let (tx, rx) = bounded(256); + + // Spawn aggregator thread + let output_dir = output_dir.to_path_buf(); + let aggregator_handle = thread::spawn(move || aggregator_loop(rx, output_dir)); + + // Spawn worker threads + let num_threads = num_cpus::get(); + let chunk_size = (files.len() + num_threads - 1) / num_threads; + let mut handles = Vec::new(); + + for chunk in files.chunks(chunk_size) { + let chunk_files = chunk.to_vec(); + let sender = tx.clone(); + let base_path = base_dir.to_path_buf(); + + let handle = thread::spawn(move || -> Result<()> { + for file_entry in chunk_files { + read_and_send_chunks(file_entry, &base_path, &sender, max_size)?; + } + Ok(()) + }); + handles.push(handle); + } + + // Drop original sender + drop(tx); + + // Wait for workers + for handle in handles { + handle.join().unwrap()?; + } + + // Wait for aggregator + 
aggregator_handle.join().unwrap()?; + + Ok(()) +} + +/// Collects files from directory respecting .gitignore and sorts by priority +fn collect_files( + base_dir: &Path, + config: Option<&YekConfig>, + ignore_patterns: &[Regex], + priority_list: &[PriorityPattern], + recentness_boost: Option<&HashMap>, +) -> Result> { + // Build gitignore matcher + let mut builder = GitignoreBuilder::new(base_dir); + let gitignore_path = base_dir.join(".gitignore"); + if gitignore_path.exists() { + builder.add(&gitignore_path); + } + let gitignore = builder + .build() + .unwrap_or_else(|_| GitignoreBuilder::new(base_dir).build().unwrap()); + + let mut builder = WalkBuilder::new(base_dir); + builder.follow_links(false).standard_filters(true); + + let mut results = Vec::new(); + let mut file_index = 0; + + for entry in builder.build() { + if let Ok(entry) = entry { + if entry.file_type().map_or(false, |ft| ft.is_file()) { + let path = entry.path().to_path_buf(); + let rel_path = path.strip_prefix(base_dir).unwrap_or(&path); + let rel_str = rel_path.to_string_lossy(); + + // Skip if matched by gitignore + #[cfg(windows)] + let gitignore_path = rel_path + .to_str() + .map(|s| s.replace('\\', "/")) + .map(PathBuf::from) + .unwrap_or(rel_path.to_path_buf()); + #[cfg(not(windows))] + let gitignore_path = rel_path.to_path_buf(); + + if gitignore.matched(&gitignore_path, false).is_ignore() { + debug!("Skipping {} - matched by gitignore", rel_str); + continue; + } + + // Skip if matched by our ignore patterns + let mut skip = false; + for pat in ignore_patterns { + if pat.is_match(&rel_str) { + debug!("Skipping {} - matched ignore pattern", rel_str); + skip = true; + break; + } + } + if skip { + continue; + } + + // Skip binary files + if let Some(ref cfg) = config { + if !is_text_file(&path, &cfg.binary_extensions) { + debug!("Skipping binary file: {}", rel_str); + continue; + } + } else if !is_text_file(&path, &[]) { + debug!("Skipping binary file: {}", rel_str); + continue; + } + + // Calculate priority score + let mut priority = get_file_priority(&rel_str, ignore_patterns, priority_list); + + // Apply git recentness boost + if let Some(boost_map) = recentness_boost { + if let Some(boost) = boost_map.get(&rel_str.to_string()) { + priority += *boost; + } + } + + results.push(FileEntry { + path, + priority, + file_index, + }); + file_index += 1; + } + } + } + + // Sort by priority (ascending) so higher priority files come last + results.sort_by(|a, b| { + // First sort by priority (ascending) + let p = a.priority.cmp(&b.priority); + if p != std::cmp::Ordering::Equal { + return p; + } + // If priorities are equal, sort by Git boost (ascending) + if let Some(boost_map) = recentness_boost { + let a_boost = boost_map + .get(&a.path.to_string_lossy().to_string()) + .unwrap_or(&0); + let b_boost = boost_map + .get(&b.path.to_string_lossy().to_string()) + .unwrap_or(&0); + return a_boost.cmp(b_boost); // Lower boost (older files) come first + } + std::cmp::Ordering::Equal + }); + Ok(results) +} + +/// Receives chunks from workers and writes them to files +fn aggregator_loop(rx: Receiver, output_dir: PathBuf) -> Result<()> { + // Create output directory + fs::create_dir_all(&output_dir)?; + + // Collect all chunks + let mut all_chunks = Vec::new(); + while let Ok(chunk) = rx.recv() { + all_chunks.push(chunk); + } + + // Sort chunks by priority (ascending), then file_index, then part_index + // so that higher priority files come last + all_chunks.sort_by(|a, b| { + let p = a.priority.cmp(&b.priority); + if p != 
std::cmp::Ordering::Equal { + return p; + } + // Then sort by file_index + let f = a.file_index.cmp(&b.file_index); + if f != std::cmp::Ordering::Equal { + return f; + } + // Finally sort by part_index + a.part_index.cmp(&b.part_index) + }); + + // Write sorted chunks + let mut current_chunk = String::new(); + let mut current_chunk_index = 0; + + for chunk in all_chunks { + let mut content = String::new(); + content.push_str(&format!(">>>> {}\n", chunk.rel_path)); + content.push_str(&chunk.content); + content.push_str("\n\n"); + + current_chunk.push_str(&content); + } + + // Write the final chunk + if !current_chunk.is_empty() { + let out_path = output_dir.join(format!("chunk-{}.txt", current_chunk_index)); + fs::write(&out_path, ¤t_chunk)?; + info!( + "Written chunk {} with {} lines.", + current_chunk_index, + current_chunk.lines().count() + ); + } + + Ok(()) +} diff --git a/tests/test_perf.rs b/tests/test_perf.rs index fca904b..4d5f040 100644 --- a/tests/test_perf.rs +++ b/tests/test_perf.rs @@ -1,10 +1,39 @@ use std::fs; use std::path::Path; -use std::time::Instant; +use std::time::{Duration, Instant}; use yek::serialize_repo; +struct PerfStats { + min: Duration, + max: Duration, + avg: Duration, + total_runs: usize, +} + +impl PerfStats { + fn new() -> Self { + PerfStats { + min: Duration::from_secs(u64::MAX), + max: Duration::from_secs(0), + avg: Duration::from_secs(0), + total_runs: 0, + } + } + + fn update(&mut self, duration: Duration) { + self.min = self.min.min(duration); + self.max = self.max.max(duration); + self.total_runs += 1; + // Compute running average + self.avg = (self.avg * (self.total_runs - 1) as u32 + duration) / self.total_runs as u32; + } +} + #[test] fn test_serialization_performance() { + const WARMUP_RUNS: usize = 2; + const BENCH_RUNS: usize = 5; + // Create test data directory let test_dir = "test_perf_data"; fs::create_dir_all(test_dir).unwrap(); @@ -12,29 +41,58 @@ fn test_serialization_performance() { // Create test files of different sizes let sizes = vec![1024, 1024 * 1024, 10 * 1024 * 1024]; // 1KB, 1MB, 10MB + println!("\nPerformance Benchmark Results:"); + println!("------------------------------"); + for size in sizes { let filename = format!("{}/file_{}_bytes.txt", test_dir, size); let data = vec![b'a'; size]; fs::write(&filename, &data).unwrap(); - // Measure serialization time - let start = Instant::now(); - serialize_repo( - size, // max_size - Some(Path::new(test_dir)), // base_path - false, // stream - false, // count_tokens - None, // config - Some(Path::new("perf_output")), // output_dir - None, // max_files - ) - .unwrap(); - let duration = start.elapsed(); - - println!("Serializing {}B took: {:?}", size, duration); - - // Cleanup - fs::remove_dir_all("perf_output").unwrap(); + // Warmup runs + println!("\nFile size: {}B", size); + println!("Warmup runs..."); + for _ in 0..WARMUP_RUNS { + serialize_repo( + size, + Some(Path::new(test_dir)), + false, + false, + None, + Some(Path::new("perf_output")), + None, + ) + .unwrap(); + fs::remove_dir_all("perf_output").unwrap(); + } + + // Benchmark runs + let mut stats = PerfStats::new(); + println!("Benchmark runs..."); + + for run in 1..=BENCH_RUNS { + let start = Instant::now(); + serialize_repo( + size, + Some(Path::new(test_dir)), + false, + false, + None, + Some(Path::new("perf_output")), + None, + ) + .unwrap(); + let duration = start.elapsed(); + stats.update(duration); + + println!(" Run {}: {:?}", run, duration); + fs::remove_dir_all("perf_output").unwrap(); + } + + println!("\nStats for 
{}B:", size); + println!(" Min: {:?}", stats.min); + println!(" Max: {:?}", stats.max); + println!(" Avg: {:?}", stats.avg); } // Final cleanup From a3951ca32506bc2916be3e539adae4f02ec66fd3 Mon Sep 17 00:00:00 2001 From: Mohsen Azimi Date: Sun, 19 Jan 2025 11:23:51 +0700 Subject: [PATCH 3/7] style: fix linting issues in parallel.rs --- src/parallel.rs | 124 ++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 68 deletions(-) diff --git a/src/parallel.rs b/src/parallel.rs index e110de7..ac4bbba 100644 --- a/src/parallel.rs +++ b/src/parallel.rs @@ -1,16 +1,15 @@ use crate::is_text_file; -use crate::{get_file_priority, get_recent_commit_times, PriorityPattern, YekConfig}; +use crate::{get_file_priority, PriorityPattern, YekConfig}; use anyhow::Result; use crossbeam::channel::{bounded, Receiver, Sender}; use ignore::{gitignore::GitignoreBuilder, WalkBuilder}; -use num_cpus; +use num_cpus::get; use regex::Regex; use std::collections::HashMap; use std::fs; use std::io::{BufReader, Read}; use std::path::{Path, PathBuf}; use std::thread; -use std::time::SystemTime; use tracing::{debug, info}; /// Represents a chunk of text read from one file @@ -138,8 +137,8 @@ pub fn process_files_parallel( let aggregator_handle = thread::spawn(move || aggregator_loop(rx, output_dir)); // Spawn worker threads - let num_threads = num_cpus::get(); - let chunk_size = (files.len() + num_threads - 1) / num_threads; + let num_threads = get(); + let chunk_size = files.len().div_ceil(num_threads); let mut handles = Vec::new(); for chunk in files.chunks(chunk_size) { @@ -194,69 +193,67 @@ fn collect_files( let mut results = Vec::new(); let mut file_index = 0; - for entry in builder.build() { - if let Ok(entry) = entry { - if entry.file_type().map_or(false, |ft| ft.is_file()) { - let path = entry.path().to_path_buf(); - let rel_path = path.strip_prefix(base_dir).unwrap_or(&path); - let rel_str = rel_path.to_string_lossy(); - - // Skip if matched by gitignore - #[cfg(windows)] - let gitignore_path = rel_path - .to_str() - .map(|s| s.replace('\\', "/")) - .map(PathBuf::from) - .unwrap_or(rel_path.to_path_buf()); - #[cfg(not(windows))] - let gitignore_path = rel_path.to_path_buf(); - - if gitignore.matched(&gitignore_path, false).is_ignore() { - debug!("Skipping {} - matched by gitignore", rel_str); - continue; - } + for entry in builder.build().flatten() { + if entry.file_type().is_some_and(|ft| ft.is_file()) { + let path = entry.path().to_path_buf(); + let rel_path = path.strip_prefix(base_dir).unwrap_or(&path); + let rel_str = rel_path.to_string_lossy(); + + // Skip if matched by gitignore + #[cfg(windows)] + let gitignore_path = rel_path + .to_str() + .map(|s| s.replace('\\', "/")) + .map(PathBuf::from) + .unwrap_or(rel_path.to_path_buf()); + #[cfg(not(windows))] + let gitignore_path = rel_path.to_path_buf(); + + if gitignore.matched(&gitignore_path, false).is_ignore() { + debug!("Skipping {} - matched by gitignore", rel_str); + continue; + } - // Skip if matched by our ignore patterns - let mut skip = false; - for pat in ignore_patterns { - if pat.is_match(&rel_str) { - debug!("Skipping {} - matched ignore pattern", rel_str); - skip = true; - break; - } - } - if skip { - continue; + // Skip if matched by our ignore patterns + let mut skip = false; + for pat in ignore_patterns { + if pat.is_match(&rel_str) { + debug!("Skipping {} - matched ignore pattern", rel_str); + skip = true; + break; } + } + if skip { + continue; + } - // Skip binary files - if let Some(ref cfg) = config { - if 
!is_text_file(&path, &cfg.binary_extensions) { - debug!("Skipping binary file: {}", rel_str); - continue; - } - } else if !is_text_file(&path, &[]) { + // Skip binary files + if let Some(cfg) = config { + if !is_text_file(&path, &cfg.binary_extensions) { debug!("Skipping binary file: {}", rel_str); continue; } + } else if !is_text_file(&path, &[]) { + debug!("Skipping binary file: {}", rel_str); + continue; + } - // Calculate priority score - let mut priority = get_file_priority(&rel_str, ignore_patterns, priority_list); + // Calculate priority score + let mut priority = get_file_priority(&rel_str, ignore_patterns, priority_list); - // Apply git recentness boost - if let Some(boost_map) = recentness_boost { - if let Some(boost) = boost_map.get(&rel_str.to_string()) { - priority += *boost; - } + // Apply git recentness boost + if let Some(boost_map) = recentness_boost { + if let Some(boost) = boost_map.get(&rel_str.to_string()) { + priority += *boost; } - - results.push(FileEntry { - path, - priority, - file_index, - }); - file_index += 1; } + + results.push(FileEntry { + path, + priority, + file_index, + }); + file_index += 1; } } @@ -284,45 +281,36 @@ fn collect_files( /// Receives chunks from workers and writes them to files fn aggregator_loop(rx: Receiver, output_dir: PathBuf) -> Result<()> { - // Create output directory fs::create_dir_all(&output_dir)?; - // Collect all chunks let mut all_chunks = Vec::new(); while let Ok(chunk) = rx.recv() { all_chunks.push(chunk); } - // Sort chunks by priority (ascending), then file_index, then part_index - // so that higher priority files come last all_chunks.sort_by(|a, b| { let p = a.priority.cmp(&b.priority); if p != std::cmp::Ordering::Equal { return p; } - // Then sort by file_index let f = a.file_index.cmp(&b.file_index); if f != std::cmp::Ordering::Equal { return f; } - // Finally sort by part_index a.part_index.cmp(&b.part_index) }); - // Write sorted chunks let mut current_chunk = String::new(); - let mut current_chunk_index = 0; + let current_chunk_index = 0; for chunk in all_chunks { let mut content = String::new(); content.push_str(&format!(">>>> {}\n", chunk.rel_path)); content.push_str(&chunk.content); content.push_str("\n\n"); - current_chunk.push_str(&content); } - // Write the final chunk if !current_chunk.is_empty() { let out_path = output_dir.join(format!("chunk-{}.txt", current_chunk_index)); fs::write(&out_path, ¤t_chunk)?; From 10d0102cdd10394891cecb868f181dd6c8f93514 Mon Sep 17 00:00:00 2001 From: Mohsen Azimi Date: Sun, 19 Jan 2025 11:34:52 +0700 Subject: [PATCH 4/7] feat: add benchmark regression test with 5% threshold --- .github/workflows/ci.yml | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a7eb9cd..8763755 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -119,6 +119,46 @@ jobs: path: ${{ matrix.asset_name }} if-no-files-found: error + benchmark: + name: Benchmark + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - uses: Swatinem/rust-cache@v2 + with: + cache-on-failure: true + + - name: Build benchmarks on target branch + run: | + git fetch origin ${{ github.base_ref }} + git checkout ${{ github.base_ref }} + cargo bench --bench serialization --no-run + + - name: Run benchmark on target branch + run: cargo bench --bench serialization -- 
--save-baseline main + + - name: Build benchmarks on PR branch + run: | + git checkout ${{ github.head_ref }} + cargo bench --bench serialization --no-run + + - name: Compare benchmarks + run: cargo bench --bench serialization -- --baseline main --significance-threshold 5 + + - name: Upload benchmark results + uses: actions/upload-artifact@v3 + with: + name: criterion-results + path: target/criterion/ + if-no-files-found: error + release: name: Release needs: [test, lint, build] From f0a970f7083893cd186e127578f7d3dd81493038 Mon Sep 17 00:00:00 2001 From: Mohsen Azimi Date: Sun, 19 Jan 2025 11:44:25 +0700 Subject: [PATCH 5/7] feat: add comprehensive benchmarks for serialization --- Cargo.lock | 67 ++++++++ Cargo.toml | 1 + benches/serialization.rs | 326 +++++++++++++++++++++++++++++++++++---- 3 files changed, 366 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7184df0..173bab5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -151,6 +151,12 @@ dependencies = [ "utf8-width", ] +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cast" version = "0.3.0" @@ -785,6 +791,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + [[package]] name = "predicates" version = "3.1.3" @@ -833,6 +848,36 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + [[package]] name = "rayon" version = "1.10.0" @@ -1591,6 +1636,7 @@ dependencies = [ "indicatif", "num_cpus", "predicates", + "rand", "regex", "serde", "sha2", @@ -1601,3 +1647,24 @@ dependencies = [ "tracing-subscriber", "walkdir", ] + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index c991bff..b0d4c2f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ chrono = "0.4" predicates = "3.0" tempfile = "3.9" criterion = "0.5" +rand = "0.8" [[bench]] name = "serialization" diff --git a/benches/serialization.rs 
b/benches/serialization.rs index 315db18..0b76c40 100644 --- a/benches/serialization.rs +++ b/benches/serialization.rs @@ -1,36 +1,151 @@ -use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; -use std::fs; +use criterion::{ + black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, Throughput, +}; +use rand::{distributions::Alphanumeric, Rng}; +use std::fs::{self, File}; +use std::io::Write; use std::path::Path; use tempfile::TempDir; -use yek::serialize_repo; +use yek::{serialize_repo, YekConfig}; -fn create_test_data(dir: &Path, size: usize) { - let filename = dir.join(format!("file_{}_bytes.txt", size)); +/// Creates a text file of a specified size in bytes. +fn create_test_data_bytes(dir: &Path, size: usize, file_name: &str) { + let filename = dir.join(file_name); let data = vec![b'a'; size]; - fs::write(&filename, &data).unwrap(); + fs::write(&filename, &data).expect("Unable to write test data"); } -fn bench_serialization(c: &mut Criterion) { - let mut group = c.benchmark_group("serialization"); - group.sample_size(10); // Number of samples to collect +/// Creates a file with a specified approximate number of tokens. +fn create_test_data_tokens(dir: &Path, tokens: usize, file_name: &str) { + let filename = dir.join(file_name); + // Each "token" is a short random word followed by a space + let mut rng = rand::thread_rng(); + let mut file = File::create(&filename).expect("Unable to create file"); - // Test different file sizes - let sizes = vec![ - 1024, // 1KB - 1024 * 1024, // 1MB - 10 * 1024 * 1024, // 10MB - ]; + for _ in 0..tokens { + let word: String = (0..4).map(|_| rng.sample(Alphanumeric) as char).collect(); + write!(file, "{} ", word).expect("Unable to write token"); + } + file.flush().unwrap(); +} + +/// Creates multiple files of given sizes in a single directory. +fn create_multiple_files(dir: &Path, sizes: &[usize], prefix: &str) { + for (i, &size) in sizes.iter().enumerate() { + let file_name = format!("{}_{}.txt", prefix, i); + create_test_data_bytes(dir, size, &file_name); + } +} + +/// Creates multiple files with a given token count each. 
+fn create_multiple_token_files(dir: &Path, tokens: &[usize], prefix: &str) { + for (i, &token_count) in tokens.iter().enumerate() { + let file_name = format!("{}_{}.txt", prefix, i); + create_test_data_tokens(dir, token_count, &file_name); + } +} + +fn single_small_file_byte_mode(c: &mut Criterion) { + let mut group = c.benchmark_group("SingleFile_ByteMode"); + let temp_dir = TempDir::new().unwrap(); + + let size = 10 * 1024; // 10 KB + create_test_data_bytes(temp_dir.path(), size, "small_file.txt"); + + let output_dir = temp_dir.path().join("output"); - for size in sizes { - // Create a new temporary directory for each benchmark - let temp_dir = TempDir::new().unwrap(); - let output_dir = temp_dir.path().join("output"); - create_test_data(temp_dir.path(), size); + group.throughput(Throughput::Bytes(size as u64)); + group.bench_function("single_small_file", |b| { + b.iter(|| { + serialize_repo( + black_box(1024 * 1024), // 1MB chunk + Some(temp_dir.path()), + false, + false, + None, + Some(&output_dir), + None, + ) + .unwrap(); + fs::remove_dir_all(&output_dir).ok(); + }); + }); + group.finish(); +} + +fn single_large_file_byte_mode(c: &mut Criterion) { + let mut group = c.benchmark_group("SingleFile_ByteMode_Large"); + let temp_dir = TempDir::new().unwrap(); + + let size = 10 * 1024 * 1024; // 10 MB + create_test_data_bytes(temp_dir.path(), size, "large_file.txt"); + + let output_dir = temp_dir.path().join("output"); + + group.throughput(Throughput::Bytes(size as u64)); + group.bench_function("single_large_file", |b| { + b.iter(|| { + serialize_repo( + black_box(5 * 1024 * 1024), // 5MB chunk, forces splits + Some(temp_dir.path()), + false, + false, + None, + Some(&output_dir), + None, + ) + .unwrap(); + fs::remove_dir_all(&output_dir).ok(); + }); + }); + group.finish(); +} - group.bench_with_input(BenchmarkId::new("file_size", size), &size, |b, &size| { - b.iter(|| { +fn single_large_file_token_mode(c: &mut Criterion) { + let mut group = c.benchmark_group("SingleFile_TokenMode_Large"); + let temp_dir = TempDir::new().unwrap(); + + let token_count = 200_000; + create_test_data_tokens(temp_dir.path(), token_count, "large_tokens.txt"); + + let output_dir = temp_dir.path().join("output"); + + group.throughput(Throughput::Elements(token_count as u64)); + group.bench_function("single_large_token_file", |b| { + b.iter(|| { + serialize_repo( + black_box(50_000), // 50k tokens + Some(temp_dir.path()), + false, + true, // token-based + None, + Some(&output_dir), + None, + ) + .unwrap(); + fs::remove_dir_all(&output_dir).ok(); + }); + }); + group.finish(); +} + +fn multiple_small_files(c: &mut Criterion) { + let mut group = c.benchmark_group("MultipleFiles_Small"); + group.sample_size(10); // we can tune the sample size as needed + + group.bench_function("multiple_small_files", |b| { + b.iter_batched( + || { + let temp_dir = TempDir::new().unwrap(); + // Create a set of small files + let sizes = vec![1024; 50]; // 50 files of 1KB each + create_multiple_files(temp_dir.path(), &sizes, "small"); + let output_dir = temp_dir.path().join("output"); + (temp_dir, output_dir) + }, + |(temp_dir, output_dir)| { serialize_repo( - black_box(size), + black_box(10 * 1024), // 10KB chunk Some(temp_dir.path()), false, false, @@ -39,12 +154,167 @@ fn bench_serialization(c: &mut Criterion) { None, ) .unwrap(); - fs::remove_dir_all(&output_dir).unwrap(); - }); - }); - } + fs::remove_dir_all(&output_dir).ok(); + }, + BatchSize::SmallInput, + ); + }); + group.finish(); +} + +fn multiple_medium_files(c: &mut Criterion) 
{ + let mut group = c.benchmark_group("MultipleFiles_Medium"); + + group.bench_function("multiple_medium_files", |b| { + b.iter_batched( + || { + let temp_dir = TempDir::new().unwrap(); + // Create 20 files with sizes from 100KB to 500KB + let sizes = (100..=500) + .step_by(20) + .map(|kb| kb * 1024) + .collect::>(); + create_multiple_files(temp_dir.path(), &sizes, "medium"); + let output_dir = temp_dir.path().join("output"); + (temp_dir, output_dir) + }, + |(temp_dir, output_dir)| { + serialize_repo( + black_box(512 * 1024), // 512KB chunk + Some(temp_dir.path()), + false, + false, + None, + Some(&output_dir), + None, + ) + .unwrap(); + fs::remove_dir_all(&output_dir).ok(); + }, + BatchSize::SmallInput, + ); + }); + group.finish(); +} + +fn multiple_large_files(c: &mut Criterion) { + let mut group = c.benchmark_group("MultipleFiles_Large"); + + group.bench_function("multiple_large_files", |b| { + b.iter_batched( + || { + let temp_dir = TempDir::new().unwrap(); + // Create 5 large files, each ~ 5 MB + let sizes = vec![5_242_880; 5]; // ~5 MB x 5 + create_multiple_files(temp_dir.path(), &sizes, "large"); + let output_dir = temp_dir.path().join("output"); + (temp_dir, output_dir) + }, + |(temp_dir, output_dir)| { + serialize_repo( + black_box(2 * 1024 * 1024), // 2 MB chunk to force splits + Some(temp_dir.path()), + false, + false, + None, + Some(&output_dir), + None, + ) + .unwrap(); + fs::remove_dir_all(&output_dir).ok(); + }, + BatchSize::SmallInput, + ); + }); + group.finish(); +} + +fn multiple_token_files(c: &mut Criterion) { + let mut group = c.benchmark_group("MultipleFiles_TokenMode"); + + group.bench_function("multiple_token_files", |b| { + b.iter_batched( + || { + let temp_dir = TempDir::new().unwrap(); + // Create 10 files with 10k tokens each + let tokens = vec![10_000; 10]; + create_multiple_token_files(temp_dir.path(), &tokens, "token"); + let output_dir = temp_dir.path().join("output"); + (temp_dir, output_dir) + }, + |(temp_dir, output_dir)| { + serialize_repo( + black_box(5_000), // 5k tokens chunk + Some(temp_dir.path()), + false, + true, // token-based + None, + Some(&output_dir), + None, + ) + .unwrap(); + fs::remove_dir_all(&output_dir).ok(); + }, + BatchSize::SmallInput, + ); + }); + group.finish(); +} + +/// Demonstrates using a custom config (e.g. extra ignores or priority rules). 
+fn custom_config_test(c: &mut Criterion) {
+    let mut group = c.benchmark_group("CustomConfig");
+    let mut config = YekConfig::default();
+    // Add a silly priority rule for *.foo
+    config.priority_rules.push(yek::PriorityRule {
+        score: 500,
+        patterns: vec!["\\.foo$".into()],
+    });
+
+    group.bench_function("custom_config_test", |b| {
+        b.iter_batched(
+            || {
+                let temp_dir = TempDir::new().unwrap();
+                // Create several files, some .foo, some .bar
+                let files = &[
+                    (50_000, "file1.foo"),
+                    (70_000, "file2.bar"),
+                    (90_000, "file3.foo"),
+                ];
+                for &(size, name) in files {
+                    create_test_data_bytes(temp_dir.path(), size, name);
+                }
+                let output_dir = temp_dir.path().join("output");
+                (temp_dir, output_dir)
+            },
+            |(temp_dir, output_dir)| {
+                serialize_repo(
+                    black_box(128_000),
+                    Some(temp_dir.path()),
+                    false,
+                    false,
+                    Some(config.clone()),
+                    Some(&output_dir),
+                    None,
+                )
+                .unwrap();
+                fs::remove_dir_all(&output_dir).ok();
+            },
+            BatchSize::SmallInput,
+        );
+    });
     group.finish();
 }
 
-criterion_group!(benches, bench_serialization);
+criterion_group!(
+    benches,
+    single_small_file_byte_mode,
+    single_large_file_byte_mode,
+    single_large_file_token_mode,
+    multiple_small_files,
+    multiple_medium_files,
+    multiple_large_files,
+    multiple_token_files,
+    custom_config_test,
+);
 criterion_main!(benches);

From 2cd09e66b22f9efab2fe8e99fdbe8e3473f4b1db Mon Sep 17 00:00:00 2001
From: Mohsen Azimi
Date: Sun, 19 Jan 2025 11:51:55 +0700
Subject: [PATCH 6/7] git: undo parallel execution in test branch

---
 src/lib.rs      | 337 +++++++++++++++++++++++-------------------
 src/parallel.rs | 325 ----------------------------------------
 2 files changed, 158 insertions(+), 504 deletions(-)
 delete mode 100644 src/parallel.rs

diff --git a/src/lib.rs b/src/lib.rs
index 20a8861..0b6e5e5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,8 +9,6 @@ use std::path::{Path, PathBuf};
 use std::process::{Command as SysCommand, Stdio};
 use tracing::{debug, info};
 use walkdir::WalkDir;
-mod parallel;
-use parallel::process_files_parallel;
 
 /// Helper macro to write debug statements both to standard debug log and to debug file if set.
 #[macro_export]
@@ -194,11 +192,7 @@ fn build_final_config(cfg: Option<YekConfig>) -> FinalConfig {
             merged_ignore.push(reg);
         }
     }
-    // Clear default priority rules if user provides their own
-    if !user_cfg.priority_rules.is_empty() {
-        merged_priority.clear();
-    }
-    // Add user priority rules
+    // Merge or add new priority rules
     for user_rule in user_cfg.priority_rules {
         if user_rule.patterns.is_empty() {
             continue;
@@ -354,15 +348,14 @@ pub fn get_file_priority(
     _ignore_pats: &[Regex],
     prio_list: &[PriorityPattern],
 ) -> i32 {
-    // Loop from highest score → lowest
-    for prio in prio_list.iter().rev() {
+    for prio in prio_list {
         for pat in &prio.patterns {
             if pat.is_match(rel_str) {
                 return prio.score;
             }
         }
     }
-    0 // fallback if nothing matches - lower than any user-defined priority
+    40 // fallback
 }
 
 /// Get the commit time of the most recent change to each file.
@@ -538,188 +531,165 @@ pub fn serialize_repo(
         None
     };
 
-    if stream {
-        // For streaming, we still use the old single-threaded approach
-        let mut files: Vec<FileEntry> = Vec::new();
-        let mut total_size = 0;
-        let mut current_chunk = 0;
-        let mut current_chunk_files = Vec::new();
-
-        // Walk directory tree
-        for entry in WalkDir::new(base_path)
-            .follow_links(true)
-            .into_iter()
-            .filter_map(|e| e.ok())
-        {
-            let path = entry.path();
-            if !path.is_file() {
-                continue;
-            }
+    // Collect files with their priorities
+    let mut files: Vec<FileEntry> = Vec::new();
+    let mut total_size = 0;
+    let mut current_chunk = 0;
+    let mut current_chunk_files = Vec::new();
+
+    // Walk directory tree
+    for entry in WalkDir::new(base_path)
+        .follow_links(true)
+        .into_iter()
+        .filter_map(|e| e.ok())
+    {
+        let path = entry.path();
+        if !path.is_file() {
+            continue;
+        }
 
-            // Get path relative to base
-            let rel_path = path.strip_prefix(base_path).unwrap_or(path);
-            let rel_str = rel_path.to_string_lossy();
-
-            // Normalize path separators to forward slashes for consistent pattern matching
-            #[cfg(windows)]
-            let rel_str = rel_str.replace('\\', "/");
-
-            // Skip if matched by gitignore
-            #[cfg(windows)]
-            let gitignore_path = rel_path
-                .to_str()
-                .map(|s| s.replace('\\', "/"))
-                .map(PathBuf::from)
-                .unwrap_or(rel_path.to_path_buf());
-            #[cfg(not(windows))]
-            let gitignore_path = rel_path.to_path_buf();
-
-            if gitignore.matched(&gitignore_path, false).is_ignore() {
-                debug!("Skipping {} - matched by gitignore", rel_str);
-                continue;
-            }
+        // Get path relative to base
+        let rel_path = path.strip_prefix(base_path).unwrap_or(path);
+        let rel_str = rel_path.to_string_lossy();
 
-            // Skip if matched by our ignore patterns
-            let mut skip = false;
-            #[cfg(windows)]
-            let pattern_path = rel_str.replace('\\', "/");
-            #[cfg(not(windows))]
-            let pattern_path = rel_str.to_string();
-
-            for pat in &final_config.ignore_patterns {
-                if pat.is_match(&pattern_path) {
-                    debug!("Skipping {} - matched ignore pattern", rel_str);
-                    skip = true;
-                    break;
-                }
-            }
-            if skip {
-                continue;
-            }
+        // Normalize path separators to forward slashes for consistent pattern matching
+        #[cfg(windows)]
+        let rel_str = rel_str.replace('\\', "/");
 
-            // Calculate priority score
-            let mut priority = get_file_priority(
-                &pattern_path,
-                &final_config.ignore_patterns,
-                &final_config.priority_list,
-            );
+        // Skip if matched by gitignore
+        #[cfg(windows)]
+        let gitignore_path = rel_path
+            .to_str()
+            .map(|s| s.replace('\\', "/"))
+            .map(PathBuf::from)
+            .unwrap_or(rel_path.to_path_buf());
+        #[cfg(not(windows))]
+        let gitignore_path = rel_path.to_path_buf();
 
-            // Apply rank-based boost if available
-            if let Some(ref boost_map) = recentness_boost {
-                if let Some(boost) = boost_map.get(&pattern_path) {
-                    priority += *boost;
-                }
+        if gitignore.matched(&gitignore_path, false).is_ignore() {
+            debug!("Skipping {} - matched by gitignore", rel_str);
+            continue;
+        }
+
+        // Skip if matched by our ignore patterns
+        let mut skip = false;
+        #[cfg(windows)]
+        let pattern_path = rel_str.replace('\\', "/");
+        #[cfg(not(windows))]
+        let pattern_path = rel_str.to_string();
+
+        for pat in &final_config.ignore_patterns {
+            if pat.is_match(&pattern_path) {
+                debug!("Skipping {} - matched ignore pattern", rel_str);
+                skip = true;
+                break;
             }
+        }
+        if skip {
+            continue;
+        }
 
-            files.push(FileEntry {
-                path: path.to_path_buf(),
-                priority,
-            });
+        // Calculate priority score
+        let mut priority = get_file_priority(
+            &pattern_path,
+            &final_config.ignore_patterns,
+            &final_config.priority_list,
+        );
+ + // Apply rank-based boost if available + if let Some(ref boost_map) = recentness_boost { + if let Some(boost) = boost_map.get(&pattern_path) { + priority += *boost; + } } - // Sort files by priority (ascending) so higher priority files come last - files.sort_by(|a, b| a.priority.cmp(&b.priority)); + files.push(FileEntry { + path: path.to_path_buf(), + priority, + }); + } - // Process files in sorted order - for file in files { - let path = file.path; - let rel_path = path.strip_prefix(base_path).unwrap_or(&path); - let rel_str = rel_path.to_string_lossy(); + // Sort files by priority (ascending) so higher priority files come last + files.sort_by(|a, b| a.priority.cmp(&b.priority)); - // Skip binary files - if let Some(ref cfg) = config { - if !is_text_file(&path, &cfg.binary_extensions) { - debug!("Skipping binary file: {}", rel_str); - continue; - } - } else if !is_text_file(&path, &[]) { + // Process files in sorted order + for file in files { + let path = file.path; + let rel_path = path.strip_prefix(base_path).unwrap_or(&path); + let rel_str = rel_path.to_string_lossy(); + + // Skip binary files + if let Some(ref cfg) = config { + if !is_text_file(&path, &cfg.binary_extensions) { debug!("Skipping binary file: {}", rel_str); continue; } + } else if !is_text_file(&path, &[]) { + debug!("Skipping binary file: {}", rel_str); + continue; + } - // Read file content - let content = match fs::read_to_string(&path) { - Ok(c) => c, - Err(e) => { - debug!("Failed to read {}: {}", rel_str, e); - continue; - } - }; - - let size = count_size(&content, count_tokens); - if size == 0 { - debug!("Skipping empty file: {}", rel_str); + // Read file content + let content = match fs::read_to_string(&path) { + Ok(c) => c, + Err(e) => { + debug!("Failed to read {}: {}", rel_str, e); continue; } + }; - // If a single file is larger than max_size, split it into multiple chunks - if size > max_size { - debug_file!("File exceeds chunk size, splitting into multiple chunks"); - let mut remaining = content.as_str(); - let mut part = 0; - - while !remaining.is_empty() { - let mut chunk_size = if count_tokens { - // In token mode, count words until we hit max_size - let mut chars = 0; - for (tokens, word) in remaining.split_whitespace().enumerate() { - if tokens + 1 > max_size { - break; - } - chars += word.len() + 1; // +1 for space + let size = count_size(&content, count_tokens); + if size == 0 { + debug!("Skipping empty file: {}", rel_str); + continue; + } + + // If a single file is larger than max_size, split it into multiple chunks + if size > max_size { + debug_file!("File exceeds chunk size, splitting into multiple chunks"); + let mut remaining = content.as_str(); + let mut part = 0; + + while !remaining.is_empty() { + let mut chunk_size = if count_tokens { + // In token mode, count words until we hit max_size + let mut chars = 0; + for (tokens, word) in remaining.split_whitespace().enumerate() { + if tokens + 1 > max_size { + break; } - chars - } else { - max_size - }; - - // Ensure we make progress even if no word boundary found - if chunk_size == 0 { - chunk_size = std::cmp::min(max_size, remaining.len()); + chars += word.len() + 1; // +1 for space } - - let (chunk, rest) = - remaining.split_at(std::cmp::min(chunk_size, remaining.len())); - remaining = rest.trim_start(); - - let chunk_files = - vec![(format!("{}:part{}", rel_str, part), chunk.to_string())]; - debug_file!("Written chunk {}", part); - write_chunk( - &chunk_files, - part, - output_dir.as_deref(), - stream, - count_tokens, - )?; - part += 1; 
+                chars
+            } else {
+                max_size
+            };
+
+            // Ensure we make progress even if no word boundary found
+            if chunk_size == 0 {
+                chunk_size = std::cmp::min(max_size, remaining.len());
             }
-                continue;
-            }
 
-            // Check if adding this file would exceed chunk size
-            if total_size + size > max_size && !current_chunk_files.is_empty() {
-                // Write current chunk
+            let (chunk, rest) = remaining.split_at(std::cmp::min(chunk_size, remaining.len()));
+            remaining = rest.trim_start();
+
+            let chunk_files = vec![(format!("{}:part{}", rel_str, part), chunk.to_string())];
+            debug_file!("Written chunk {}", part);
                 write_chunk(
-                    &current_chunk_files,
-                    current_chunk,
+                    &chunk_files,
+                    part,
                     output_dir.as_deref(),
                     stream,
                     count_tokens,
                 )?;
-                debug_file!("Written chunk {}", current_chunk);
-                current_chunk += 1;
-                current_chunk_files.clear();
-                total_size = 0;
+                part += 1;
             }
-
-            // Add file to current chunk
-            current_chunk_files.push((rel_str.to_string(), content));
-            total_size += size;
+            continue;
         }
 
-        // Write final chunk if any files remain
-        if !current_chunk_files.is_empty() {
+        // Check if adding this file would exceed chunk size
+        if total_size + size > max_size && !current_chunk_files.is_empty() {
+            // Write current chunk
             write_chunk(
                 &current_chunk_files,
                 current_chunk,
@@ -728,23 +698,32 @@ pub fn serialize_repo(
                 count_tokens,
             )?;
             debug_file!("Written chunk {}", current_chunk);
+            current_chunk += 1;
+            current_chunk_files.clear();
+            total_size = 0;
         }
-        Ok(None)
-    } else if let Some(out_dir) = output_dir {
-        // Use parallel processing for non-streaming mode
-        process_files_parallel(
-            base_path,
-            max_size,
-            &out_dir,
-            config.as_ref(),
-            &final_config.ignore_patterns,
-            &final_config.priority_list,
-            recentness_boost.as_ref(),
+        // Add file to current chunk
+        current_chunk_files.push((rel_str.to_string(), content));
+        total_size += size;
+    }
+
+    // Write final chunk if any files remain
+    if !current_chunk_files.is_empty() {
+        write_chunk(
+            &current_chunk_files,
+            current_chunk,
+            output_dir.as_deref(),
+            stream,
+            count_tokens,
         )?;
-        Ok(Some(out_dir))
-    } else {
+        debug_file!("Written chunk {}", current_chunk);
+    }
+
+    if stream {
         Ok(None)
+    } else {
+        Ok(output_dir)
     }
 }
 
@@ -823,7 +802,7 @@ fn compute_recentness_boost(
         return HashMap::new();
     }
 
-    // Sort by ascending commit time => first is oldest
+    // Sort by ascending commit time
     let mut sorted: Vec<(&String, &u64)> = commit_times.iter().collect();
     sorted.sort_by_key(|(_, t)| **t);
 
@@ -840,8 +819,8 @@ fn compute_recentness_boost(
     let mut result = HashMap::new();
     for (i, (path, _time)) in sorted.iter().enumerate() {
-        let rank = i as f64 / last_index; // 0.0..1.0 (older files get lower rank)
-        let boost = (rank * max_boost as f64).round() as i32; // Newer files get higher boost
+        let rank = i as f64 / last_index; // 0.0..1.0
+        let boost = (rank * max_boost as f64).round() as i32;
         result.insert((*path).clone(), boost);
     }
     result
diff --git a/src/parallel.rs b/src/parallel.rs
deleted file mode 100644
index ac4bbba..0000000
--- a/src/parallel.rs
+++ /dev/null
@@ -1,325 +0,0 @@
-use crate::is_text_file;
-use crate::{get_file_priority, PriorityPattern, YekConfig};
-use anyhow::Result;
-use crossbeam::channel::{bounded, Receiver, Sender};
-use ignore::{gitignore::GitignoreBuilder, WalkBuilder};
-use num_cpus::get;
-use regex::Regex;
-use std::collections::HashMap;
-use std::fs;
-use std::io::{BufReader, Read};
-use std::path::{Path, PathBuf};
-use std::thread;
-use tracing::{debug, info};
-
-/// Represents a chunk of text read from one file
-#[derive(Debug)]
-pub struct FileChunk {
-    pub priority: i32,
-    pub file_index: usize,
-    pub part_index: usize,
-    pub rel_path: String,
-    pub content: String,
-}
-
-/// File entry with priority for sorting
-#[derive(Debug, Clone)]
-struct FileEntry {
-    path: PathBuf,
-    priority: i32,
-    file_index: usize,
-}
-
-/// Reads a file and determines if it's likely binary by checking for null bytes
-fn is_likely_binary(path: &Path) -> Result<bool> {
-    let f = fs::File::open(path)?;
-    let mut reader = BufReader::new(f);
-    let mut buf = [0; 4096];
-    let n = reader.read(&mut buf)?;
-    Ok(buf[..n].contains(&0))
-}
-
-/// Reads and chunks a single file, sending chunks through the channel
-fn read_and_send_chunks(
-    file_entry: FileEntry,
-    base_path: &Path,
-    tx: &Sender<FileChunk>,
-    max_size: usize,
-) -> Result<()> {
-    // Skip if binary
-    if is_likely_binary(&file_entry.path)? {
-        return Ok(());
-    }
-
-    // Read file content
-    let content = fs::read_to_string(&file_entry.path)?;
-    if content.is_empty() {
-        return Ok(());
-    }
-
-    // Get relative path for display
-    let rel_path = file_entry
-        .path
-        .strip_prefix(base_path)
-        .unwrap_or(&file_entry.path)
-        .to_string_lossy()
-        .into_owned();
-
-    // If smaller than max_size, send as single chunk
-    if content.len() <= max_size {
-        let chunk = FileChunk {
-            priority: file_entry.priority,
-            file_index: file_entry.file_index,
-            part_index: 0,
-            rel_path,
-            content,
-        };
-        tx.send(chunk).ok();
-        return Ok(());
-    }
-
-    // Otherwise split into chunks
-    let mut start = 0;
-    let mut part_index = 0;
-    let bytes = content.as_bytes();
-
-    while start < bytes.len() {
-        let end = (start + max_size).min(bytes.len());
-        let slice = &bytes[start..end];
-        let chunk_str = String::from_utf8_lossy(slice).into_owned();
-
-        let chunk = FileChunk {
-            priority: file_entry.priority,
-            file_index: file_entry.file_index,
-            part_index,
-            rel_path: rel_path.clone(),
-            content: chunk_str,
-        };
-
-        tx.send(chunk).ok();
-        start = end;
-        part_index += 1;
-    }
-
-    Ok(())
-}
-
-/// Main parallel processing function that coordinates workers and aggregator
-pub fn process_files_parallel(
-    base_dir: &Path,
-    max_size: usize,
-    output_dir: &Path,
-    config: Option<&YekConfig>,
-    ignore_patterns: &[Regex],
-    priority_list: &[PriorityPattern],
-    recentness_boost: Option<&HashMap<String, i32>>,
-) -> Result<()> {
-    // Create output directory
-    fs::create_dir_all(output_dir)?;
-
-    // Collect and sort files by priority
-    let files = collect_files(
-        base_dir,
-        config,
-        ignore_patterns,
-        priority_list,
-        recentness_boost,
-    )?;
-    if files.is_empty() {
-        return Ok(());
-    }
-
-    // Create channels for worker→aggregator communication
-    let (tx, rx) = bounded(256);
-
-    // Spawn aggregator thread
-    let output_dir = output_dir.to_path_buf();
-    let aggregator_handle = thread::spawn(move || aggregator_loop(rx, output_dir));
-
-    // Spawn worker threads
-    let num_threads = get();
-    let chunk_size = files.len().div_ceil(num_threads);
-    let mut handles = Vec::new();
-
-    for chunk in files.chunks(chunk_size) {
-        let chunk_files = chunk.to_vec();
-        let sender = tx.clone();
-        let base_path = base_dir.to_path_buf();
-
-        let handle = thread::spawn(move || -> Result<()> {
-            for file_entry in chunk_files {
-                read_and_send_chunks(file_entry, &base_path, &sender, max_size)?;
-            }
-            Ok(())
-        });
-        handles.push(handle);
-    }
-
-    // Drop original sender
-    drop(tx);
-
-    // Wait for workers
-    for handle in handles {
-        handle.join().unwrap()?;
-    }
-
-    // Wait for aggregator
-    aggregator_handle.join().unwrap()?;
-
-    Ok(())
-}
-
-/// Collects files from directory respecting .gitignore and sorts by priority
-fn collect_files(
-    base_dir: &Path,
-    config: Option<&YekConfig>,
-    ignore_patterns: &[Regex],
-    priority_list: &[PriorityPattern],
-    recentness_boost: Option<&HashMap<String, i32>>,
-) -> Result<Vec<FileEntry>> {
-    // Build gitignore matcher
-    let mut builder = GitignoreBuilder::new(base_dir);
-    let gitignore_path = base_dir.join(".gitignore");
-    if gitignore_path.exists() {
-        builder.add(&gitignore_path);
-    }
-    let gitignore = builder
-        .build()
-        .unwrap_or_else(|_| GitignoreBuilder::new(base_dir).build().unwrap());
-
-    let mut builder = WalkBuilder::new(base_dir);
-    builder.follow_links(false).standard_filters(true);
-
-    let mut results = Vec::new();
-    let mut file_index = 0;
-
-    for entry in builder.build().flatten() {
-        if entry.file_type().is_some_and(|ft| ft.is_file()) {
-            let path = entry.path().to_path_buf();
-            let rel_path = path.strip_prefix(base_dir).unwrap_or(&path);
-            let rel_str = rel_path.to_string_lossy();
-
-            // Skip if matched by gitignore
-            #[cfg(windows)]
-            let gitignore_path = rel_path
-                .to_str()
-                .map(|s| s.replace('\\', "/"))
-                .map(PathBuf::from)
-                .unwrap_or(rel_path.to_path_buf());
-            #[cfg(not(windows))]
-            let gitignore_path = rel_path.to_path_buf();
-
-            if gitignore.matched(&gitignore_path, false).is_ignore() {
-                debug!("Skipping {} - matched by gitignore", rel_str);
-                continue;
-            }
-
-            // Skip if matched by our ignore patterns
-            let mut skip = false;
-            for pat in ignore_patterns {
-                if pat.is_match(&rel_str) {
-                    debug!("Skipping {} - matched ignore pattern", rel_str);
-                    skip = true;
-                    break;
-                }
-            }
-            if skip {
-                continue;
-            }
-
-            // Skip binary files
-            if let Some(cfg) = config {
-                if !is_text_file(&path, &cfg.binary_extensions) {
-                    debug!("Skipping binary file: {}", rel_str);
-                    continue;
-                }
-            } else if !is_text_file(&path, &[]) {
-                debug!("Skipping binary file: {}", rel_str);
-                continue;
-            }
-
-            // Calculate priority score
-            let mut priority = get_file_priority(&rel_str, ignore_patterns, priority_list);
-
-            // Apply git recentness boost
-            if let Some(boost_map) = recentness_boost {
-                if let Some(boost) = boost_map.get(&rel_str.to_string()) {
-                    priority += *boost;
-                }
-            }
-
-            results.push(FileEntry {
-                path,
-                priority,
-                file_index,
-            });
-            file_index += 1;
-        }
-    }
-
-    // Sort by priority (ascending) so higher priority files come last
-    results.sort_by(|a, b| {
-        // First sort by priority (ascending)
-        let p = a.priority.cmp(&b.priority);
-        if p != std::cmp::Ordering::Equal {
-            return p;
-        }
-        // If priorities are equal, sort by Git boost (ascending)
-        if let Some(boost_map) = recentness_boost {
-            let a_boost = boost_map
-                .get(&a.path.to_string_lossy().to_string())
-                .unwrap_or(&0);
-            let b_boost = boost_map
-                .get(&b.path.to_string_lossy().to_string())
-                .unwrap_or(&0);
-            return a_boost.cmp(b_boost); // Lower boost (older files) come first
-        }
-        std::cmp::Ordering::Equal
-    });
-    Ok(results)
-}
-
-/// Receives chunks from workers and writes them to files
-fn aggregator_loop(rx: Receiver<FileChunk>, output_dir: PathBuf) -> Result<()> {
-    fs::create_dir_all(&output_dir)?;
-
-    let mut all_chunks = Vec::new();
-    while let Ok(chunk) = rx.recv() {
-        all_chunks.push(chunk);
-    }
-
-    all_chunks.sort_by(|a, b| {
-        let p = a.priority.cmp(&b.priority);
-        if p != std::cmp::Ordering::Equal {
-            return p;
-        }
-        let f = a.file_index.cmp(&b.file_index);
-        if f != std::cmp::Ordering::Equal {
-            return f;
-        }
-        a.part_index.cmp(&b.part_index)
-    });
-
-    let mut current_chunk = String::new();
-    let current_chunk_index = 0;
-
-    for chunk in all_chunks {
-        let mut content = String::new();
-        content.push_str(&format!(">>>> {}\n", chunk.rel_path));
-        content.push_str(&chunk.content);
-        content.push_str("\n\n");
-        current_chunk.push_str(&content);
-    }
-
-    if !current_chunk.is_empty() {
-        let out_path = output_dir.join(format!("chunk-{}.txt", current_chunk_index));
-        fs::write(&out_path, &current_chunk)?;
-        info!(
-            "Written chunk {} with {} lines.",
-            current_chunk_index,
-            current_chunk.lines().count()
-        );
-    }
-
-    Ok(())
-}

From 22cdad81260b2ce11a84da4109df02baeae5e10e Mon Sep 17 00:00:00 2001
From: Mohsen Azimi
Date: Sun, 19 Jan 2025 11:59:50 +0700
Subject: [PATCH 7/7] style: use tempfile::tempdir() for performance tests

---
 tests/test_perf.rs | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tests/test_perf.rs b/tests/test_perf.rs
index 4d5f040..14e4d25 100644
--- a/tests/test_perf.rs
+++ b/tests/test_perf.rs
@@ -1,6 +1,6 @@
 use std::fs;
-use std::path::Path;
 use std::time::{Duration, Instant};
+use tempfile::TempDir;
 use yek::serialize_repo;
 
 struct PerfStats {
@@ -34,9 +34,9 @@ fn test_serialization_performance() {
     const WARMUP_RUNS: usize = 2;
     const BENCH_RUNS: usize = 5;
 
-    // Create test data directory
-    let test_dir = "test_perf_data";
-    fs::create_dir_all(test_dir).unwrap();
+    // Create temporary test directory that will be automatically cleaned up
+    let test_dir = TempDir::new().unwrap();
+    let output_dir = TempDir::new().unwrap();
 
     // Create test files of different sizes
     let sizes = vec![1024, 1024 * 1024, 10 * 1024 * 1024]; // 1KB, 1MB, 10MB
@@ -45,7 +45,7 @@ fn test_serialization_performance() {
     println!("------------------------------");
 
     for size in sizes {
-        let filename = format!("{}/file_{}_bytes.txt", test_dir, size);
+        let filename = test_dir.path().join(format!("file_{}_bytes.txt", size));
         let data = vec![b'a'; size];
         fs::write(&filename, &data).unwrap();
 
@@ -55,15 +55,16 @@ fn test_serialization_performance() {
         for _ in 0..WARMUP_RUNS {
             serialize_repo(
                 size,
-                Some(Path::new(test_dir)),
+                Some(test_dir.path()),
                 false,
                 false,
                 None,
-                Some(Path::new("perf_output")),
+                Some(output_dir.path()),
                 None,
             )
             .unwrap();
-            fs::remove_dir_all("perf_output").unwrap();
+            fs::remove_dir_all(output_dir.path()).unwrap();
+            fs::create_dir_all(output_dir.path()).unwrap();
         }
 
         // Benchmark runs
@@ -74,11 +75,11 @@ fn test_serialization_performance() {
             let start = Instant::now();
             serialize_repo(
                 size,
-                Some(Path::new(test_dir)),
+                Some(test_dir.path()),
                 false,
                 false,
                 None,
-                Some(Path::new("perf_output")),
+                Some(output_dir.path()),
                 None,
             )
             .unwrap();
@@ -86,7 +87,8 @@ fn test_serialization_performance() {
             stats.update(duration);
             println!("  Run {}: {:?}", run, duration);
 
-            fs::remove_dir_all("perf_output").unwrap();
+            fs::remove_dir_all(output_dir.path()).unwrap();
+            fs::create_dir_all(output_dir.path()).unwrap();
         }
 
         println!("\nStats for {}B:", size);
@@ -95,6 +97,5 @@ fn test_serialization_performance() {
         println!("  Avg: {:?}", stats.avg);
     }
 
-    // Final cleanup
-    fs::remove_dir_all(test_dir).unwrap();
+    // TempDir will automatically clean up when dropped
 }
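Editor's note on running the perf work this series leaves behind: the entry points are the Criterion benches in benches/serialization.rs and the timing test in tests/test_perf.rs. A minimal sketch of how to drive them, assuming Cargo.toml registers the bench target under the file's default name ("serialization") with harness = false as Criterion requires, and that the test binary keeps the default name "test_perf":

    # timing test from PATCH 7/7, with its println! output visible
    cargo test --release --test test_perf -- --nocapture

    # all Criterion groups, or a single group filtered by name
    cargo bench --bench serialization
    cargo bench --bench serialization -- MultipleFiles_Small

cargo bench builds with optimizations by default; the plain test needs --release so its byte- and token-mode timings stay comparable to the bench numbers.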