Skip to content

Commit

Permalink
Merge branch 'feat_basic_connectivity_check'
Browse files Browse the repository at this point in the history
  • Loading branch information
Byron committed Nov 10, 2023
2 parents 7227410 + 7ab5c76 commit 1f9aca5
Show file tree
Hide file tree
Showing 19 changed files with 347 additions and 2 deletions.
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ members = [
"gix-archive",
"gix-worktree-stream",
"gix-revwalk",
"gix-fsck",

"tests/tools",

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ is usable to some extent.
* [gix-tui](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tui)
* [gix-tix](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tix)
* [gix-bundle](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-bundle)
* [gix-fsck](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-fsck)

### Stress Testing
* [x] Verify huge packs
Expand Down
17 changes: 17 additions & 0 deletions crate-status.md
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,23 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README.
* [x] validate submodule names
* [x] [validate][tagname-validation] tag names

### gix-fsck
* [x] validate connectivity and find missing objects starting from…
- [x] commits
- [ ] tags
- [ ] tree-cache in the `index` or any entry within
* [ ] validate object hashes during connectivity traversal
* [ ] progress reporting and interruptability
* [ ] skipList to exclude objects which are known to be broken
* [ ] validate blob hashes (connectivity check)
* [ ] identify objects that exist but are not reachable (i.e. what remains after a full graph traversal from all valid starting points)
* [ ] write dangling objects to the `.git/lost-found` directory structure
* [ ] `strict` mode, to check for tree objects with `g+w` permissions
* [ ] consider reflog entries from `ref` starting points
* [ ] when reporting reachable objects, provide the path through which they are reachable, i.e. ref-log@{3} -> commit -> tree -> path-in-tree
* [ ] limit search to ODB without alternates (default is equivalent to `git fsck --full` due to ODB implementation)
* [ ] all individual [checks available in `git fsck`](https://git-scm.com/docs/git-fsck#_fsck_messages) (*too many to print here*)

### gix-ref
* [ ] Prepare code for arrival of longer hashes like Sha256. It's part of the [V2 proposal][reftable-v2] but should work for loose refs as well.
* **Stores**
Expand Down
1 change: 1 addition & 0 deletions etc/check-package-size.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ function indent () {
}

echo "in root: gitoxide CLI"
(enter gix-fsck && indent cargo diet -n --package-size-limit 10KB)
(enter gix-actor && indent cargo diet -n --package-size-limit 10KB)
(enter gix-archive && indent cargo diet -n --package-size-limit 10KB)
(enter gix-worktree-stream && indent cargo diet -n --package-size-limit 40KB)
Expand Down
1 change: 1 addition & 0 deletions gitoxide-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p
gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false }
gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] }
gix-status = { version = "^0.2.0", path = "../gix-status" }
gix-fsck = { version = "^0.1.0", path = "../gix-fsck" }
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
anyhow = "1.0.42"
thiserror = "1.0.34"
Expand Down
36 changes: 36 additions & 0 deletions gitoxide-core/src/repository/fsck.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use anyhow::Context;
use gix::{objs::Kind, ObjectId};

pub fn function(mut repo: gix::Repository, spec: Option<String>, mut out: impl std::io::Write) -> anyhow::Result<()> {
let spec = spec.unwrap_or("HEAD".into());

repo.object_cache_size_if_unset(4 * 1024 * 1024);
// We expect to be finding a bunch of non-existent objects here - never refresh the ODB
repo.objects.refresh_never();

let id = repo
.rev_parse_single(spec.as_str())
.context("Only single revisions are supported")?;
let commits: gix::revision::Walk<'_> = id
.object()?
.peel_to_kind(gix::object::Kind::Commit)
.context("Need commitish as starting point")?
.id()
.ancestors()
.all()?;

let on_missing = |oid: &ObjectId, kind: Kind| {
writeln!(out, "{oid}: {kind}").expect("failed to write output");
};

let mut check = gix_fsck::Connectivity::new(&repo.objects, on_missing);
// Walk all commits, checking each one for connectivity
for commit in commits {
let commit = commit?;
check.check_commit(&commit.id)?;
// Note that we leave parent-iteration to the commits iterator, as it will
// correctly handle shallow repositories which are expected to have the commits
// along the shallow boundary missing.
}
Ok(())
}
2 changes: 2 additions & 0 deletions gitoxide-core/src/repository/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ pub use clone::function::clone;
pub use fetch::function::fetch;

pub mod commitgraph;
mod fsck;
pub use fsck::function as fsck;
pub mod index;
pub mod mailmap;
pub mod odb;
Expand Down
6 changes: 6 additions & 0 deletions gix-fsck/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
22 changes: 22 additions & 0 deletions gix-fsck/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[package]
name = "gix-fsck"
version = "0.1.0"
repository = "https://github.com/Byron/gitoxide"
authors = ["Cameron Esfahani <[email protected]>", "Sebastian Thiel <[email protected]>"]
license = "MIT OR Apache-2.0"
description = "Verifies the connectivity and validity of objects in the database"
edition = "2021"
include = ["src/**/*", "LICENSE-*"]
rust-version = "1.65"

[lib]
doctest = false

[dependencies]
gix-hash = { version = "^0.13.1", path = "../gix-hash" }
gix-hashtable = { version = "^0.4.0", path = "../gix-hashtable" }
gix-object = { version = "^0.38.0", path = "../gix-object" }

[dev-dependencies]
gix-odb = { path = "../gix-odb" }
gix-testtools = { path = "../tests/tools"}
1 change: 1 addition & 0 deletions gix-fsck/LICENSE-APACHE
1 change: 1 addition & 0 deletions gix-fsck/LICENSE-MIT
107 changes: 107 additions & 0 deletions gix-fsck/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
//! A library for performing object database integrity and connectivity checks
#![deny(rust_2018_idioms, unsafe_code, missing_docs)]

use gix_hash::ObjectId;
use gix_hashtable::HashSet;
use gix_object::{tree::EntryMode, Exists, FindExt, Kind};
use std::collections::VecDeque;

/// Perform a connectivity check.
pub struct Connectivity<T, F>
where
T: FindExt + Exists,
F: FnMut(&ObjectId, Kind),
{
/// ODB handle to use for the check
db: T,
/// Closure to invoke when a missing object is encountered
missing_cb: F,
/// Set of Object IDs already (or about to be) scanned during the check
seen: HashSet,
/// A buffer to keep a single object at a time.
buf: Vec<u8>,
}

impl<T, F> Connectivity<T, F>
where
    T: FindExt + Exists,
    F: FnMut(&ObjectId, Kind),
{
    /// Instantiate a connectivity check that looks up objects in `db` and calls `missing_cb`
    /// with the id and kind of each object found to be missing.
    pub fn new(db: T, missing_cb: F) -> Connectivity<T, F> {
        Connectivity {
            db,
            missing_cb,
            seen: HashSet::default(),
            buf: Default::default(),
        }
    }

    /// Run the connectivity check on the provided commit `oid`.
    ///
    /// ### Algorithm
    ///
    /// Walk the trees and blobs referenced by the commit, including all nested trees, and verify
    /// they exist in the ODB.
    /// Any objects previously encountered by this instance will be skipped silently.
    /// Any referenced trees or blobs that are not present in the ODB will result in a call to the `missing_cb`.
    /// A commit that cannot be read from the ODB will cause an error to be returned.
    /// - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?)
    pub fn check_commit(&mut self, oid: &ObjectId) -> Result<(), gix_object::find::existing_object::Error> {
        // Attempt to insert the commit ID in the set, and if already present, return immediately
        if !self.seen.insert(*oid) {
            return Ok(());
        }
        // Obtain the commit's tree ID
        let tree_id = {
            let commit = self.db.find_commit(oid, &mut self.buf)?;
            commit.tree()
        };

        // Invariant: an id is marked as `seen` exactly when it is queued, so each queued tree is
        // checked exactly once. Re-testing `seen` on dequeue would skip every sub-tree, as
        // `check_tree()` has already marked them when queueing.
        let mut tree_ids = VecDeque::new();
        if self.seen.insert(tree_id) {
            tree_ids.push_back(tree_id);
        }
        while let Some(tree_id) = tree_ids.pop_front() {
            self.check_tree(&tree_id, &mut tree_ids);
        }

        Ok(())
    }

    /// Blobs are checked right away, trees are stored in `tree_ids` for the parent to iterate them, and only
    /// if they have not been `seen` yet.
    ///
    /// NOTE: any failure to obtain the tree `oid` is reported as a missing tree, no matter the cause.
    fn check_tree(&mut self, oid: &ObjectId, tree_ids: &mut VecDeque<ObjectId>) {
        let Ok(tree) = self.db.find_tree(oid, &mut self.buf) else {
            (self.missing_cb)(oid, Kind::Tree);
            return;
        };

        for entry_ref in tree.entries.iter() {
            match entry_ref.mode {
                EntryMode::Tree => {
                    let tree_id = entry_ref.oid.to_owned();
                    // Mark as seen when queueing to keep duplicates out of the queue.
                    if self.seen.insert(tree_id) {
                        tree_ids.push_back(tree_id);
                    }
                }
                EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
                    let blob_id = entry_ref.oid.to_owned();
                    if self.seen.insert(blob_id) {
                        check_blob(&self.db, &blob_id, &mut self.missing_cb);
                    }
                }
                EntryMode::Commit => {
                    // Skip submodules as it's not in this repository!
                }
            }
        }
    }
}

/// Call `missing_cb` with `oid` and [`Kind::Blob`] unless `db` reports that `oid` exists.
fn check_blob<F>(db: impl Exists, oid: &ObjectId, mut missing_cb: F)
where
    F: FnMut(&ObjectId, Kind),
{
    let is_present = db.exists(oid);
    if is_present {
        return;
    }
    missing_cb(oid, Kind::Blob);
}
78 changes: 78 additions & 0 deletions gix-fsck/tests/connectivity/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
use gix_fsck::Connectivity;
use gix_hash::ObjectId;
use gix_hashtable::HashMap;
use gix_object::Kind;
use gix_testtools::once_cell::sync::Lazy;

use crate::hex_to_id;

/// Run a connectivity check for each of `commits` against the object database of the fixture
/// repository `repo_name`, returning every object reported as missing along with its kind.
///
/// Panics if the fixture is unavailable, a commit can't be checked, or an object is reported twice.
fn check_missing<'a>(repo_name: &str, commits: impl IntoIterator<Item = &'a ObjectId>) -> HashMap<ObjectId, Kind> {
    let objects_dir = gix_testtools::scripted_fixture_read_only("make_test_repos.sh")
        .expect("fixture path")
        .join(repo_name)
        .join(".git")
        .join("objects");
    let mut odb = gix_odb::at(objects_dir).expect("valid odb");
    odb.refresh_never();

    let mut reported: HashMap<ObjectId, Kind> = HashMap::default();
    let mut connectivity = Connectivity::new(odb, |oid: &ObjectId, kind: Kind| {
        // Each missing object must be reported only once.
        reported.try_insert(*oid, kind).expect("no duplicate oid");
    });
    for commit_id in commits {
        connectivity.check_commit(commit_id).expect("commit is present");
    }
    reported
}

fn hex_to_ids<'a>(hex_ids: impl IntoIterator<Item = &'a str>) -> Vec<ObjectId> {
hex_ids.into_iter().map(hex_to_id).collect()
}

fn hex_to_objects<'a>(hex_ids: impl IntoIterator<Item = &'a str>, kind: Kind) -> HashMap<ObjectId, Kind> {
hex_to_ids(hex_ids).into_iter().map(|id| (id, kind)).collect()
}

// Get the ids of all commits in the test fixture repository as a slice, parsed once on first use.
fn all_commits() -> &'static [ObjectId] {
    const COMMIT_HEXES: [&str; 3] = [
        "5d18db2e2aabadf7b914435ef34f2faf8b4546dd",
        "3a3dfaa55a515f3fb3a25751107bbb523af6a1b0",
        "734c926856a328d1168ffd7088532e0d1ad19bbe",
    ];
    static COMMIT_IDS: Lazy<Vec<ObjectId>> = Lazy::new(|| hex_to_ids(COMMIT_HEXES));
    COMMIT_IDS.as_slice()
}

#[test]
fn no_missing() {
    // Nothing may be reported for "base": it is the original repository with every object present.
    let missing = check_missing("base", all_commits());
    assert_eq!(missing, HashMap::default());
}

#[test]
fn missing_blobs() {
    // "blobless" was cloned with `--filter=blob:none`, which leaves exactly one blob missing.
    let missing_blob_ids = ["c18147dc648481eeb65dc5e66628429a64843327"];
    let expected = hex_to_objects(missing_blob_ids, Kind::Blob);
    let actual = check_missing("blobless", all_commits());
    assert_eq!(actual, expected);
}

#[test]
fn missing_trees() {
    // "treeless" was cloned with `--filter=tree:0`, which leaves two trees missing.
    // NOTE: a blob is missing from this repo as well, but that can't be detected because the
    //       tree referencing it is missing, too.
    let missing_tree_ids = [
        "9561cfbae43c5e2accdfcd423378588dd10d827f",
        "fc264b3b6875a46e9031483aeb9994a1b897ffd3",
    ];
    let expected = hex_to_objects(missing_tree_ids, Kind::Tree);
    let actual = check_missing("treeless", all_commits());
    assert_eq!(actual, expected);
}
1 change: 1 addition & 0 deletions gix-fsck/tests/fixtures/generated-archives/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
make_test_repos.tar.xz
32 changes: 32 additions & 0 deletions gix-fsck/tests/fixtures/make_test_repos.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Creates the fixture repositories for the connectivity tests:
#   base     - a repository with three commits and all of its objects
#   blobless - a clone of `base` made with `--filter=blob:none`
#   treeless - a clone of `base` made with `--filter=tree:0`
#
# NOTE: tests/connectivity/mod.rs hard-codes object ids from these repositories,
#       so the file contents, file names and commit messages below must be kept exactly as they are.
set -x
set -euo pipefail

# We override the global config with our own local one (see below)
export GIT_CONFIG_GLOBAL="$PWD/.gitconfig"

# We need to be able to do partial clones, so enable it
# - needs to be present in the persistent gitconfig, as a clone with `--no-local`
#   presumably reads the setting from there rather than from the cloning command - TODO confirm
git config --global uploadpack.allowFilter true

# First build out a base repository
git init base
(
cd base

# commit 1: adds blob-1
echo "blob 1" > blob-1
git add -A
git commit -m "commit 1"
# commit 2: adds blob-2 next to blob-1
echo "blob-2" > blob-2
git add -A
git commit -m "commit 2"
# commit 3: removes blob-1 again, leaving only blob-2
git rm blob-1
git add -A
git commit -m "commit 3"
)

# Blobless clone
git clone --no-local --no-hardlinks --filter=blob:none ./base blobless

# Treeless (and blobless) clone
git clone --no-local --no-hardlinks --filter=tree:0 ./base treeless
7 changes: 7 additions & 0 deletions gix-fsck/tests/fsck.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
use gix_hash::ObjectId;

/// Parse the hexadecimal hash `hex` into an `ObjectId`, panicking if it isn't valid.
pub fn hex_to_id(hex: &str) -> ObjectId {
    let hex_bytes = hex.as_bytes();
    let parsed = ObjectId::from_hex(hex_bytes);
    parsed.expect("40 bytes hex")
}

mod connectivity;
Loading

0 comments on commit 1f9aca5

Please sign in to comment.