diff --git a/Cargo.lock b/Cargo.lock index 6f207b9dfc8..3c2403dd895 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1652,7 +1652,7 @@ dependencies = [ [[package]] name = "gix-fsck" -version = "0.0.0" +version = "0.1.0" dependencies = [ "gix-hash 0.13.1", "gix-hashtable 0.4.0", diff --git a/crate-status.md b/crate-status.md index dfe112c77ef..e9983498d5d 100644 --- a/crate-status.md +++ b/crate-status.md @@ -776,17 +776,21 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README. * [x] [validate][tagname-validation] tag names ### gix-fsck -* [x] validate connectivity when provided a specific commit as a starting point -* [ ] validate connectivity of all `refs` in the index +* [x] validate connectivity and find missing objects starting from… + - [x] commits + - [ ] tags + - [ ] tree-cache in the `index` or any entry within +* [ ] validate object hashes during connectivity traversal * [ ] progress reporting and interruptability -* [ ] identify objects that exist but are not reference by any reference nodes (e.g. `refs` or a provided commit) -* [ ] add support for various options - * [ ] write dangling objects to the `.git/log-found` directory structure - * [ ] `strict` mode, to check for tree objects with `g+w` permissions - * [ ] consider reflog entries as reference nodes/heads - * [ ] when reporting reachable objects, include _how_ they are reachable - * [ ] which reference node(s) made them reachable - * [ ] parent commit(s) +* [ ] skipList to exclude objects which are known to be broken +* [ ] validate blob hashes (connectivity check +* [ ] identify objects that exist but are not reachable (i.e. what remains after a full graph traversal from all valid starting points) +* [ ] write dangling objects to the `.git/log-found` directory structure +* [ ] `strict` mode, to check for tree objects with `g+w` permissions +* [ ] consider reflog entries from `ref` starting points +* [ ] when reporting reachable objects, provide the path through which they are reachable, i.e. ref-log@{3} -> commit -> tree -> path-in-tree +* [ ] limit search to ODB without alternates (default is equivalent to `git fsck --full` due to ODB implementation) +* [ ] all individual [checks available in `git fsck`](https://git-scm.com/docs/git-fsck#_fsck_messages) (*too many to print here*) ### gix-ref * [ ] Prepare code for arrival of longer hashes like Sha256. It's part of the [V2 proposal][reftable-v2] but should work for loose refs as well. diff --git a/etc/check-package-size.sh b/etc/check-package-size.sh index f4b5940cc2a..54e591d2b92 100755 --- a/etc/check-package-size.sh +++ b/etc/check-package-size.sh @@ -15,6 +15,7 @@ function indent () { } echo "in root: gitoxide CLI" +(enter gix-fsck && indent cargo diet -n --package-size-limit 10KB) (enter gix-actor && indent cargo diet -n --package-size-limit 10KB) (enter gix-archive && indent cargo diet -n --package-size-limit 10KB) (enter gix-worktree-stream && indent cargo diet -n --package-size-limit 40KB) diff --git a/gitoxide-core/Cargo.toml b/gitoxide-core/Cargo.toml index aec7996081b..32b017ba766 100644 --- a/gitoxide-core/Cargo.toml +++ b/gitoxide-core/Cargo.toml @@ -49,7 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false } gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] } gix-status = { version = "^0.2.0", path = "../gix-status" } -gix-fsck = { version = "^0.0.0", path = "../gix-fsck" } +gix-fsck = { version = "^0.1.0", path = "../gix-fsck" } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } anyhow = "1.0.42" thiserror = "1.0.34" diff --git a/gitoxide-core/src/repository/fsck.rs b/gitoxide-core/src/repository/fsck.rs index 1628e62adb7..6a156602a62 100644 --- a/gitoxide-core/src/repository/fsck.rs +++ b/gitoxide-core/src/repository/fsck.rs @@ -1,10 +1,11 @@ -use std::io::{BufWriter, Write}; - use anyhow::Context; use gix::{objs::Kind, ObjectId}; -pub fn connectivity(mut repo: gix::Repository, spec: Option, out: impl std::io::Write) -> anyhow::Result<()> { - let mut out = BufWriter::with_capacity(64 * 1024, out); +pub fn connectivity( + mut repo: gix::Repository, + spec: Option, + mut out: impl std::io::Write, +) -> anyhow::Result<()> { let spec = spec.unwrap_or("HEAD".into()); repo.object_cache_size_if_unset(4 * 1024 * 1024); @@ -22,19 +23,18 @@ pub fn connectivity(mut repo: gix::Repository, spec: Option, out: impl s .ancestors() .all()?; - let missing_cb = |oid: &ObjectId, kind: Kind| { + let on_missing = |oid: &ObjectId, kind: Kind| { writeln!(out, "{oid}: {kind}").expect("failed to write output"); }; - let mut conn = gix_fsck::ConnectivityCheck::new(&repo.objects, missing_cb); + let mut check = gix_fsck::Connectivity::new(&repo.objects, on_missing); // Walk all commits, checking each one for connectivity for commit in commits { let commit = commit?; - conn.check_commit(&commit.id); - for parent in commit.parent_ids { - conn.check_commit(&parent); - } + check.check_commit(&commit.id); + // Note that we leave parent-iteration to the commits iterator, as it will + // correctly handle shallow repositories which are expected to have the commits + // along the shallow boundary missing. } - Ok(()) } diff --git a/gix-fsck/Changelog.md b/gix-fsck/CHANGELOG.md similarity index 100% rename from gix-fsck/Changelog.md rename to gix-fsck/CHANGELOG.md diff --git a/gix-fsck/Cargo.toml b/gix-fsck/Cargo.toml index 172c8cdb35f..4c55db3e647 100644 --- a/gix-fsck/Cargo.toml +++ b/gix-fsck/Cargo.toml @@ -1,12 +1,12 @@ [package] name = "gix-fsck" -version = "0.0.0" +version = "0.1.0" repository = "https://github.com/Byron/gitoxide" authors = ["Cameron Esfahani ", "Sebastian Thiel "] license = "MIT OR Apache-2.0" description = "Verifies the connectivity and validity of objects in the database" edition = "2021" -include = ["src/**/*", "LICENSE-*", "CHANGELOG.md"] +include = ["src/**/*", "LICENSE-*"] rust-version = "1.65" [lib] diff --git a/gix-fsck/src/lib.rs b/gix-fsck/src/lib.rs index 6583697fc40..f542522a68a 100644 --- a/gix-fsck/src/lib.rs +++ b/gix-fsck/src/lib.rs @@ -1,127 +1,161 @@ //! A library for performing object database integrity and connectivity checks -#![deny(rust_2018_idioms)] +#![deny(rust_2018_idioms, unsafe_code, missing_docs)] use gix_hash::ObjectId; use gix_hashtable::HashSet; use gix_object::{tree::EntryMode, Exists, FindExt, Kind}; +use std::cell::RefCell; +use std::ops::{Deref, DerefMut}; -pub struct ConnectivityCheck<'a, T, F> +/// Perform a connectivity check. +pub struct Connectivity where T: FindExt + Exists, F: FnMut(&ObjectId, Kind), { /// ODB handle to use for the check - db: &'a T, + db: T, /// Closure to invoke when a missing object is encountered missing_cb: F, /// Set of Object IDs already (or about to be) scanned during the check oid_set: HashSet, - /// Single buffer for decoding objects from the ODB - /// This is slightly faster than allocating throughout the connectivity check (and reduces the memory requirements) - buf: Vec, + /// A free-list of buffers for recursive tree decoding. + free_list: FreeList, } -impl<'a, T, F> ConnectivityCheck<'a, T, F> +impl Connectivity where T: FindExt + Exists, F: FnMut(&ObjectId, Kind), { - /// Instantiate a connectivity check - pub fn new(db: &'a T, missing_cb: F) -> ConnectivityCheck<'a, T, F> { - ConnectivityCheck { + /// Instantiate a connectivity check. + pub fn new(db: T, missing_cb: F) -> Connectivity { + Connectivity { db, missing_cb, oid_set: HashSet::default(), - buf: Vec::new(), + free_list: Default::default(), } } - /// Run the connectivity check on the provided commit object ID - /// - This will walk the trees and blobs referenced by the commit and verify they exist in the ODB - /// - Any objects previously encountered by this [`ConnectivityCheck`] instance will be skipped - /// - Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb` - /// - Missing commits or trees will currently result in panic + /// Run the connectivity check on the provided commit `oid`. + /// + /// ### Algorithm + /// + /// Walk the trees and blobs referenced by the commit and verify they exist in the ODB. + /// Any objects previously encountered by this instance will be skipped silently. + /// Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb`. + /// Missing commits or trees will cause an error to be returned. /// - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?) - pub fn check_commit(&mut self, oid: &ObjectId) { + pub fn check_commit(&mut self, oid: &ObjectId) -> Result<(), gix_object::find::existing_object::Error> { // Attempt to insert the commit ID in the set, and if already present, return immediately if !self.oid_set.insert(*oid) { - return; + return Ok(()); } // Obtain the commit's tree ID let tree_id = { - let commit = self.db.find_commit(oid, &mut self.buf).expect("failed to find commit"); + let mut buf = self.free_list.buf(); + let commit = self.db.find_commit(oid, &mut buf)?; commit.tree() }; - // Attempt to insert the tree ID in the set, and if already present, return immediately if self.oid_set.insert(tree_id) { - self.check_tree(&tree_id); + check_tree( + &tree_id, + &self.db, + &mut self.free_list, + &mut self.missing_cb, + &mut self.oid_set, + ); } + + Ok(()) } +} - fn check_tree(&mut self, oid: &ObjectId) { - let tree = match self.db.find_tree(oid, &mut self.buf) { - Ok(tree) => tree, - Err(_) => { - // Tree is missing, so invoke `missing_cb` - (self.missing_cb)(oid, Kind::Tree); - return; - } - }; +#[derive(Default)] +struct FreeList(RefCell>>); - // Keeping separate sets for trees and blobs for now... - // This is about a wash when compared to using a HashMap - struct TreeEntries { - trees: HashSet, - blobs: HashSet, - } +impl FreeList { + fn buf(&self) -> ReturnToFreeListOnDrop<'_> { + let buf = self.0.borrow_mut().pop().unwrap_or_default(); + ReturnToFreeListOnDrop { buf, list: &self.0 } + } +} - // Build up a set of trees and a set of blobs - let entries: TreeEntries = { - let mut entries = TreeEntries { - trees: HashSet::default(), - blobs: HashSet::default(), - }; - - // For each entry in the tree - for entry_ref in tree.entries.iter() { - match entry_ref.mode { - EntryMode::Tree => { - let tree_id = entry_ref.oid.to_owned(); - // Check if the tree has already been encountered - if self.oid_set.insert(tree_id) { - entries.trees.insert(tree_id); - } - } - EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => { - let blob_id = entry_ref.oid.to_owned(); - // Check if the blob has already been encountered - if self.oid_set.insert(blob_id) { - entries.blobs.insert(blob_id); - } - } - EntryMode::Commit => { - // This implies a submodule (OID is the commit hash of the submodule) - // Skip it as it's not in this repository! - } - } - } - entries - }; +struct ReturnToFreeListOnDrop<'a> { + list: &'a RefCell>>, + buf: Vec, +} - for tree_id in entries.trees.iter() { - self.check_tree(tree_id); - } - for blob_id in entries.blobs.iter() { - self.check_blob(blob_id); +impl Drop for ReturnToFreeListOnDrop<'_> { + fn drop(&mut self) { + if !self.buf.is_empty() { + self.list.borrow_mut().push(std::mem::take(&mut self.buf)); } } +} + +impl Deref for ReturnToFreeListOnDrop<'_> { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.buf + } +} + +impl DerefMut for ReturnToFreeListOnDrop<'_> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.buf + } +} + +fn check_blob(db: impl Exists, oid: &ObjectId, mut missing_cb: F) +where + F: FnMut(&ObjectId, Kind), +{ + // Check if the blob is missing from the ODB + if !db.exists(oid) { + // Blob is missing, so invoke `missing_cb` + missing_cb(oid, Kind::Blob); + } +} - fn check_blob(&mut self, oid: &ObjectId) { - // Check if the blob is missing from the ODB - if !self.db.exists(oid) { - // Blob is missing, so invoke `missing_cb` - (self.missing_cb)(oid, Kind::Blob); +fn check_tree( + oid: &ObjectId, + db: &(impl FindExt + Exists), + list: &FreeList, + missing_cb: &mut F, + oid_set: &mut HashSet, +) where + F: FnMut(&ObjectId, Kind), +{ + let mut buf = list.buf(); + let Ok(tree) = db.find_tree(oid, &mut buf) else { + missing_cb(oid, Kind::Tree); + return; + }; + + // Build up a set of trees and a set of blobs + // For each entry in the tree + for entry_ref in tree.entries.iter() { + match entry_ref.mode { + EntryMode::Tree => { + let tree_id = entry_ref.oid.to_owned(); + if oid_set.insert(tree_id) { + check_tree(&tree_id, &*db, list, &mut *missing_cb, oid_set); + } + } + EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => { + let blob_id = entry_ref.oid.to_owned(); + if oid_set.insert(blob_id) { + check_blob(&*db, &blob_id, &mut *missing_cb); + } + } + EntryMode::Commit => { + // This implies a submodule (OID is the commit hash of the submodule) + // Skip it as it's not in this repository! + } } } } diff --git a/gix-fsck/tests/connectivity/mod.rs b/gix-fsck/tests/connectivity/mod.rs index 4625fbf681e..120ae699f6b 100644 --- a/gix-fsck/tests/connectivity/mod.rs +++ b/gix-fsck/tests/connectivity/mod.rs @@ -1,6 +1,4 @@ -use std::ops::Deref; - -use gix_fsck::ConnectivityCheck; +use gix_fsck::Connectivity; use gix_hash::ObjectId; use gix_hashtable::HashMap; use gix_object::Kind; @@ -15,21 +13,20 @@ fn check_missing<'a>(repo_name: &str, commits: impl IntoIterator = HashMap::default(); - let callback = |oid: &ObjectId, kind: Kind| { + let record_missing_and_assert_no_duplicate = |oid: &ObjectId, kind: Kind| { missing.try_insert(*oid, kind).expect("no duplicate oid"); }; - let mut check = ConnectivityCheck::new(&db, callback); + let mut check = Connectivity::new(db, record_missing_and_assert_no_duplicate); for commit in commits.into_iter() { - check.check_commit(commit); + check.check_commit(commit).expect("commit is present") } - missing } @@ -38,22 +35,19 @@ fn hex_to_ids<'a>(hex_ids: impl IntoIterator) -> Vec { } fn hex_to_objects<'a>(hex_ids: impl IntoIterator, kind: Kind) -> HashMap { - hex_to_ids(hex_ids) - .into_iter() - .map(|id| (id, kind)) - .collect::>() + hex_to_ids(hex_ids).into_iter().map(|id| (id, kind)).collect() } // Get a `&Vec() -> &'a Vec { +fn all_commits() -> &'static [ObjectId] { static ALL_COMMITS: Lazy> = Lazy::new(|| { - hex_to_ids(vec![ + hex_to_ids([ "5d18db2e2aabadf7b914435ef34f2faf8b4546dd", "3a3dfaa55a515f3fb3a25751107bbb523af6a1b0", "734c926856a328d1168ffd7088532e0d1ad19bbe", ]) }); - ALL_COMMITS.deref() + &ALL_COMMITS } #[test] @@ -65,7 +59,7 @@ fn no_missing() { #[test] fn missing_blobs() { // The "blobless" repo is cloned with `--filter=blob:none`, and is missing one blob - let expected = hex_to_objects(vec!["c18147dc648481eeb65dc5e66628429a64843327"], Kind::Blob); + let expected = hex_to_objects(["c18147dc648481eeb65dc5e66628429a64843327"], Kind::Blob); assert_eq!(check_missing("blobless", all_commits()), expected); } @@ -74,7 +68,7 @@ fn missing_trees() { // The "treeless" repo is cloned with `--filter=tree:0`, and is missing two trees // NOTE: This repo is also missing a blob, but we have no way of knowing that, as the tree referencing it is missing let expected = hex_to_objects( - vec![ + [ "9561cfbae43c5e2accdfcd423378588dd10d827f", "fc264b3b6875a46e9031483aeb9994a1b897ffd3", ], diff --git a/gix-fsck/tests/fsck.rs b/gix-fsck/tests/fsck.rs index f12ebfebc25..2d14407cccc 100644 --- a/gix-fsck/tests/fsck.rs +++ b/gix-fsck/tests/fsck.rs @@ -1,5 +1,4 @@ use gix_hash::ObjectId; -pub use gix_testtools::Result; pub fn hex_to_id(hex: &str) -> ObjectId { ObjectId::from_hex(hex.as_bytes()).expect("40 bytes hex") diff --git a/src/plumbing/main.rs b/src/plumbing/main.rs index 670ab1b391c..2905c841fbd 100644 --- a/src/plumbing/main.rs +++ b/src/plumbing/main.rs @@ -1076,7 +1076,7 @@ pub fn main() -> Result<()> { fsck::Subcommands::Connectivity { spec } => prepare_and_run( "fsck-connectivity", trace, - verbose, + auto_verbose, progress, progress_keep_open, None, diff --git a/src/plumbing/options/mod.rs b/src/plumbing/options/mod.rs index 1eb84ea8400..2e998966579 100644 --- a/src/plumbing/options/mod.rs +++ b/src/plumbing/options/mod.rs @@ -87,7 +87,7 @@ pub enum Subcommands { /// Interact with the object database. #[clap(subcommand)] Odb(odb::Subcommands), - /// Perform integrity checks on the repository. + /// Perform (some) integrity checks on the repository. #[clap(subcommand)] Fsck(fsck::Subcommands), /// Interact with tree objects. @@ -495,7 +495,7 @@ pub mod odb { pub mod fsck { #[derive(Debug, clap::Subcommand)] pub enum Subcommands { - /// Perform a connectivity check on the repository. + /// Perform a (partial) connectivity check on the repository. Connectivity { /// A revspec to start the connectivity check from. spec: Option,