Skip to content

Commit 4ccd9ae

Browse files
committed
perf(cargo-package): match certain path prefix with pathspec
`check_repo_state` checks the entire git repo status. This is usually fine if you have only a few packages in a workspace. For huge monorepos, it may hit performance issues. For example, on awslabs/aws-sdk-rust@2cbd34d the workspace has roughly 434 members to publish. `git ls-files` reported 204379 files in this Git repository. That means git may need to check the status of all files 434 times. That would be `204379 * 434 = 88,700,486` checks! Moreover, the current algorithm is finding the intersection of `PathSource::list_files` and `git status`. It is an `O(n^2)` check. Let's assume files are evenly distributed into each package, so roughly 470 files per package. If we're unlucky enough to have some dirty files, say 100 files, we will have to do `470 * 100 = 47,000` path comparisons. Even worse, because we `git status` everything in the repo, we'll have to do it for all members, even when those dirty files are not part of the current package in question. So it becomes `470 * 100 * 434 = 20,398,000`! Instead of comparing with the status of the entire repository, this patch uses the magic pathspec[1] to tell git to report only paths that match a certain path prefix. This wouldn't help the `O(n^2)` algorithm, but at least it won't check dirty files outside the current package. [1]: https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec
1 parent b4f204e commit 4ccd9ae

File tree

1 file changed

+16
-4
lines changed

1 file changed

+16
-4
lines changed

src/cargo/ops/cargo_package.rs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -805,7 +805,7 @@ fn check_repo_state(
805805
return Ok(Some(VcsInfo { git, path_in_vcs }));
806806

807807
fn git(
808-
_pkg: &Package,
808+
pkg: &Package,
809809
src_files: &[PathBuf],
810810
repo: &git2::Repository,
811811
opts: &PackageOpts<'_>,
@@ -816,7 +816,8 @@ fn check_repo_state(
816816
// - ignored (in case the user has an `include` directive that
817817
// conflicts with .gitignore).
818818
let mut dirty_files = Vec::new();
819-
collect_statuses(repo, &mut dirty_files)?;
819+
let pathspec = relative_pathspec(repo, pkg.root());
820+
collect_statuses(repo, &[pathspec.as_str()], &mut dirty_files)?;
820821
// Include each submodule so that the error message can provide
821822
// specifically *which* files in a submodule are modified.
822823
status_submodules(repo, &mut dirty_files)?;
@@ -858,16 +859,27 @@ fn check_repo_state(
858859
}
859860
}
860861

862+
/// Use pathspec so git only matches a certain path prefix
863+
fn relative_pathspec(repo: &git2::Repository, pkg_root: &Path) -> String {
864+
let workdir = repo.workdir().unwrap();
865+
let relpath = pkg_root.strip_prefix(workdir).unwrap_or(Path::new(""));
866+
// to unix separators
867+
relpath.to_str().unwrap().replace('\\', "/")
868+
}
869+
861870
// Helper to collect dirty statuses for a single repo.
862871
fn collect_statuses(
863872
repo: &git2::Repository,
873+
pathspecs: &[&str],
864874
dirty_files: &mut Vec<PathBuf>,
865875
) -> CargoResult<()> {
866876
let mut status_opts = git2::StatusOptions::new();
867877
// Exclude submodules, as they are being handled manually by recursing
868878
// into each one so that details about specific files can be
869879
// retrieved.
870-
status_opts
880+
pathspecs
881+
.iter()
882+
.fold(&mut status_opts, git2::StatusOptions::pathspec)
871883
.exclude_submodules(true)
872884
.include_ignored(true)
873885
.include_untracked(true);
@@ -902,7 +914,7 @@ fn check_repo_state(
902914
// If its files are required, then the verification step should fail.
903915
if let Ok(sub_repo) = submodule.open() {
904916
status_submodules(&sub_repo, dirty_files)?;
905-
collect_statuses(&sub_repo, dirty_files)?;
917+
collect_statuses(&sub_repo, &[], dirty_files)?;
906918
}
907919
}
908920
Ok(())

0 commit comments

Comments
 (0)