Skip to content

fix(vendor)!: direct extraction for registry sources #15514

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 21, 2025
79 changes: 39 additions & 40 deletions src/cargo/ops/vendor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,14 @@ fn sync(
opts: &VendorOptions<'_>,
) -> CargoResult<VendorConfig> {
let dry_run = false;
let canonical_destination = try_canonicalize(opts.destination);
let canonical_destination = canonical_destination.as_deref().unwrap_or(opts.destination);
let dest_dir_already_exists = canonical_destination.exists();
let vendor_dir = try_canonicalize(opts.destination);
let vendor_dir = vendor_dir.as_deref().unwrap_or(opts.destination);
let vendor_dir_already_exists = vendor_dir.exists();

paths::create_dir_all(&canonical_destination)?;
paths::create_dir_all(&vendor_dir)?;
let mut to_remove = HashSet::new();
if !opts.no_delete {
for entry in canonical_destination.read_dir()? {
for entry in vendor_dir.read_dir()? {
let entry = entry?;
if !entry
.file_name()
Expand Down Expand Up @@ -247,7 +247,7 @@ fn sync(
};

sources.insert(id.source_id());
let dst = canonical_destination.join(&dst_name);
let dst = vendor_dir.join(&dst_name);
to_remove.remove(&dst);
let cksum = dst.join(".cargo-checksum.json");
// Registries are the only immutable sources,
Expand All @@ -265,14 +265,14 @@ fn sync(
let _ = fs::remove_dir_all(&dst);
let pathsource = PathSource::new(src, id.source_id(), gctx);
let paths = pathsource.list_files(pkg)?;
let mut map = BTreeMap::new();
cp_sources(pkg, src, &paths, &dst, &mut map, &mut tmp_buf, gctx)
let mut file_cksums = BTreeMap::new();
cp_sources(pkg, src, &paths, &dst, &mut file_cksums, &mut tmp_buf, gctx)
.with_context(|| format!("failed to copy over vendored sources for: {}", id))?;

// Finally, emit the metadata about this package
let json = serde_json::json!({
"package": checksums.get(id),
"files": map,
"files": file_cksums,
});

paths::write(&cksum, json.to_string())?;
Expand Down Expand Up @@ -347,9 +347,9 @@ fn sync(
directory: opts.destination.to_string_lossy().replace("\\", "/"),
},
);
} else if !dest_dir_already_exists {
} else if !vendor_dir_already_exists {
// Nothing to vendor. Remove the destination dir we've just created.
paths::remove_dir(canonical_destination)?;
paths::remove_dir(vendor_dir)?;
}

Ok(VendorConfig { source: config })
Expand All @@ -368,26 +368,9 @@ fn cp_sources(
let p = p.as_ref();
let relative = p.strip_prefix(&src).unwrap();

match relative.to_str() {
// Skip git config files as they're not relevant to builds most of
// the time and if we respect them (e.g. in git) then it'll
// probably mess with the checksums when a vendor dir is checked
// into someone else's source control
Some(".gitattributes" | ".gitignore" | ".git") => continue,

// Temporary Cargo files
Some(".cargo-ok") => continue,

// Skip patch-style orig/rej files. Published crates on crates.io
// have `Cargo.toml.orig` which we don't want to use here and
// otherwise these are rarely used as part of the build process.
Some(filename) => {
if filename.ends_with(".orig") || filename.ends_with(".rej") {
continue;
}
}
_ => {}
};
if !vendor_this(relative) {
continue;
}

// Join pathname components individually to make sure that the joined
// path uses the correct directory separators everywhere, since
Expand Down Expand Up @@ -417,7 +400,7 @@ fn cp_sources(
&dst,
&mut dst_opts,
&mut contents.as_bytes(),
"Generated Cargo.toml",
Path::new("Generated Cargo.toml"),
tmp_buf,
)?
} else {
Expand All @@ -430,13 +413,7 @@ fn cp_sources(
.with_context(|| format!("failed to stat {:?}", p))?;
dst_opts.mode(src_metadata.mode());
}
copy_and_checksum(
&dst,
&mut dst_opts,
&mut src,
&p.display().to_string(),
tmp_buf,
)?
copy_and_checksum(&dst, &mut dst_opts, &mut src, &p, tmp_buf)?
};

cksums.insert(relative.to_str().unwrap().replace("\\", "/"), cksum);
Expand Down Expand Up @@ -562,7 +539,7 @@ fn copy_and_checksum<T: Read>(
dst_path: &Path,
dst_opts: &mut OpenOptions,
contents: &mut T,
contents_path: &str,
contents_path: &Path,
buf: &mut [u8],
) -> CargoResult<String> {
let mut dst = dst_opts
Expand All @@ -584,3 +561,25 @@ fn copy_and_checksum<T: Read>(
.with_context(|| format!("failed to write to {:?}", dst_path))?;
}
}

/// Filters files we want to vendor.
///
/// `relative` is a path relative to the package root.
fn vendor_this(relative: &Path) -> bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we filter out .cargo_vcs_info.json and explicitly bring it back in?

Also, we're closing #11000 without having a test for that case

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it rearranged.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commit didn't get into the merge. It said it was disabled.

image

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was confusing. I approved your updates.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

match relative.to_str() {
// Skip git config files as they're not relevant to builds most of
// the time and if we respect them (e.g. in git) then it'll
// probably mess with the checksums when a vendor dir is checked
// into someone else's source control
Some(".gitattributes" | ".gitignore" | ".git") => false,

// Temporary Cargo files
Some(".cargo-ok") => false,

// Skip patch-style orig/rej files. Published crates on crates.io
// have `Cargo.toml.orig` which we don't want to use here and
// otherwise these are rarely used as part of the build process.
Some(p) if p.ends_with(".orig") || p.ends_with(".rej") => false,
_ => true,
}
}
115 changes: 61 additions & 54 deletions src/cargo/sources/registry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -630,60 +630,8 @@ impl<'gctx> RegistrySource<'gctx> {
Err(e) => anyhow::bail!("unable to read .cargo-ok file at {path:?}: {e}"),
}
dst.create_dir()?;
let mut tar = {
let size_limit = max_unpack_size(self.gctx, tarball.metadata()?.len());
let gz = GzDecoder::new(tarball);
let gz = LimitErrorReader::new(gz, size_limit);
let mut tar = Archive::new(gz);
set_mask(&mut tar);
tar
};
let mut bytes_written = 0;
let prefix = unpack_dir.file_name().unwrap();
let parent = unpack_dir.parent().unwrap();
for entry in tar.entries()? {
let mut entry = entry.context("failed to iterate over archive")?;
let entry_path = entry
.path()
.context("failed to read entry path")?
.into_owned();

// We're going to unpack this tarball into the global source
// directory, but we want to make sure that it doesn't accidentally
// (or maliciously) overwrite source code from other crates. Cargo
// itself should never generate a tarball that hits this error, and
// crates.io should also block uploads with these sorts of tarballs,
// but be extra sure by adding a check here as well.
if !entry_path.starts_with(prefix) {
anyhow::bail!(
"invalid tarball downloaded, contains \
a file at {:?} which isn't under {:?}",
entry_path,
prefix
)
}
// Prevent unpacking the lockfile from the crate itself.
if entry_path
.file_name()
.map_or(false, |p| p == PACKAGE_SOURCE_LOCK)
{
continue;
}
// Unpacking failed
bytes_written += entry.size();
let mut result = entry.unpack_in(parent).map_err(anyhow::Error::from);
if cfg!(windows) && restricted_names::is_windows_reserved_path(&entry_path) {
result = result.with_context(|| {
format!(
"`{}` appears to contain a reserved Windows path, \
it cannot be extracted on Windows",
entry_path.display()
)
});
}
result
.with_context(|| format!("failed to unpack entry at `{}`", entry_path.display()))?;
}

let bytes_written = unpack(self.gctx, tarball, unpack_dir)?;

// Now that we've finished unpacking, create and write to the lock file to indicate that
// unpacking was successful.
Expand Down Expand Up @@ -1046,3 +994,62 @@ fn set_mask<R: Read>(tar: &mut Archive<R>) {
#[cfg(unix)]
tar.set_mask(crate::util::get_umask());
}

/// Unpack a tarball with zip bomb and overwrite protections.
fn unpack(gctx: &GlobalContext, tarball: &File, unpack_dir: &Path) -> CargoResult<u64> {
let mut tar = {
let size_limit = max_unpack_size(gctx, tarball.metadata()?.len());
let gz = GzDecoder::new(tarball);
let gz = LimitErrorReader::new(gz, size_limit);
let mut tar = Archive::new(gz);
set_mask(&mut tar);
tar
};
let mut bytes_written = 0;
let prefix = unpack_dir.file_name().unwrap();
let parent = unpack_dir.parent().unwrap();
for entry in tar.entries()? {
let mut entry = entry.context("failed to iterate over archive")?;
let entry_path = entry
.path()
.context("failed to read entry path")?
.into_owned();

// We're going to unpack this tarball into the global source
// directory, but we want to make sure that it doesn't accidentally
// (or maliciously) overwrite source code from other crates. Cargo
// itself should never generate a tarball that hits this error, and
// crates.io should also block uploads with these sorts of tarballs,
// but be extra sure by adding a check here as well.
if !entry_path.starts_with(prefix) {
anyhow::bail!(
"invalid tarball downloaded, contains \
a file at {:?} which isn't under {:?}",
entry_path,
prefix
)
}
// Prevent unpacking the lockfile from the crate itself.
if entry_path
.file_name()
.map_or(false, |p| p == PACKAGE_SOURCE_LOCK)
{
continue;
}
// Unpacking failed
bytes_written += entry.size();
let mut result = entry.unpack_in(parent).map_err(anyhow::Error::from);
if cfg!(windows) && restricted_names::is_windows_reserved_path(&entry_path) {
result = result.with_context(|| {
format!(
"`{}` appears to contain a reserved Windows path, \
it cannot be extracted on Windows",
entry_path.display()
)
});
}
result.with_context(|| format!("failed to unpack entry at `{}`", entry_path.display()))?;
}

Ok(bytes_written)
}