From fd3aecd230e8a1a0b1428bd23d0a6eea46085848 Mon Sep 17 00:00:00 2001 From: UebelAndre Date: Wed, 19 Feb 2025 07:17:09 +0100 Subject: [PATCH 1/2] feat: Add support for new index cargo hash implementation. This also adds `dirs::local_path_and_canonical_url_with_hash_kind()` and `SparseIndex::with_path_and_hash_kind()` to control which hash is used. --- Cargo.toml | 1 + src/dirs.rs | 201 +++++++++++++++++++++++++++++++++++------------ src/git/impl_.rs | 18 ++++- src/lib.rs | 3 +- src/sparse.rs | 29 +++++-- 5 files changed, 191 insertions(+), 61 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7f311233..05084506 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,7 @@ http = { version = "1", optional = true } memchr = "2.5.0" rayon = { version = "1.7.0", optional = true } rustc-hash = "2.0.0" +rustc-stable-hash = "0.1.1" semver = "1.0.17" serde = { version = "1.0.160", features = ["rc"] } serde_derive = "1.0.160" diff --git a/src/dirs.rs b/src/dirs.rs index 6c4b1b26..f1f2ff62 100644 --- a/src/dirs.rs +++ b/src/dirs.rs @@ -9,7 +9,16 @@ pub fn local_path_and_canonical_url( url: &str, cargo_home: Option<&std::path::Path>, ) -> Result<(std::path::PathBuf, String), Error> { - let (dir_name, canonical_url) = url_to_local_dir(url)?; + local_path_and_canonical_url_with_hash_kind(url, cargo_home, &DEFAULT_HASHER_KIND) +} + +/// Like [`local_path_and_canonical_url`] but accepts [`HashKind`] for determining the crate index path. +pub fn local_path_and_canonical_url_with_hash_kind( + url: &str, + cargo_home: Option<&std::path::Path>, + hash_kind: &HashKind, +) -> Result<(std::path::PathBuf, String), Error> { + let (dir_name, canonical_url) = url_to_local_dir(url, hash_kind)?; let mut path = match cargo_home { Some(path) => path.to_owned(), @@ -70,39 +79,31 @@ pub(crate) fn crate_name_to_relative_path(crate_name: &str, separator: Option Result<(String, String), Error> { - fn to_hex(num: u64) -> String { - const CHARS: &[u8] = b"0123456789abcdef"; - - let bytes = &[ - num as u8, - (num >> 8) as u8, - (num >> 16) as u8, - (num >> 24) as u8, - (num >> 32) as u8, - (num >> 40) as u8, - (num >> 48) as u8, - (num >> 56) as u8, - ]; +/// Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/crates/cargo-util-schemas/src/core/source_kind.rs#L5 +type SourceKind = u64; +const SOURCE_KIND_REGISTRY: SourceKind = 2; +const SOURCE_KIND_SPASE_REGISTRY: SourceKind = 3; - let mut output = vec![0u8; 16]; +/// Determine the crate registry hashing strategy for locating local crate indexes. +pub enum HashKind { + /// Use the new hashing behavior introduced in Rust `1.85.0`. + Stable, - let mut ind = 0; - - for &byte in bytes { - output[ind] = CHARS[(byte >> 4) as usize]; - output[ind + 1] = CHARS[(byte & 0xf) as usize]; - - ind += 2; - } + /// Use a hashing strategy that matches Cargo versions less than `1.85.0` + Legacy, +} - String::from_utf8(output).expect("valid utf-8 hex string") - } +// For now, this acts as a centralized place to change the default. Ideally +// this would be compiled conditionally based on the version of rustc as +// a nice approximation of when consumers will be using the associated hash +// implementation but this behavior is not yet stable: https://github.com/rust-lang/rust/issues/64796 +pub(crate) const DEFAULT_HASHER_KIND: HashKind = HashKind::Legacy; +/// Converts a full url, eg https://github.com/rust-lang/crates.io-index, into +/// the root directory name where cargo itself will fetch it on disk +fn url_to_local_dir(url: &str, hash_kind: &HashKind) -> Result<(String, String), Error> { #[allow(deprecated)] - fn hash_u64(url: &str, registry_kind: u64) -> u64 { + fn legacy_hash_u64(url: &str, registry_kind: u64) -> u64 { use std::hash::{Hash, Hasher, SipHasher}; let mut hasher = SipHasher::new_with_keys(0, 0); @@ -113,8 +114,32 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> { hasher.finish() } - // SourceKind::Registry - let mut registry_kind = 2; + // Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/src/cargo/util/hasher.rs#L6 + fn stable_hash_u64(url: &str, registry_kind: u64) -> u64 { + use rustc_stable_hash::StableSipHasher128 as StableHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = StableHasher::new(); + + // Type has an impact in the `rustc_stable_hasher`. + (registry_kind as isize).hash(&mut hasher); + + url.hash(&mut hasher); + + Hasher::finish(&hasher) + } + + // Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/src/cargo/util/hex.rs#L6 + fn to_hex(num: u64) -> String { + hex::encode(num.to_le_bytes()) + } + + let hash_u64 = match hash_kind { + HashKind::Stable => stable_hash_u64, + HashKind::Legacy => legacy_hash_u64, + }; + + let mut registry_kind = SOURCE_KIND_REGISTRY; // Ensure we have a registry or bare url let (url, scheme_ind) = { @@ -124,7 +149,7 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> { let scheme_str = &url[..scheme_ind]; if scheme_str.starts_with("sparse+http") { - registry_kind = 3; + registry_kind = SOURCE_KIND_SPASE_REGISTRY; (url, scheme_ind) } else if let Some(ind) = scheme_str.find('+') { if &scheme_str[..ind] != "registry" { @@ -147,7 +172,7 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> { // trim port let host = host.split(':').next().unwrap(); - let (ident, url) = if registry_kind == 2 { + let (ident, url) = if registry_kind == SOURCE_KIND_REGISTRY { // cargo special cases github.com for reasons, so do the same let mut canonical = if host == "github.com" { url.to_lowercase() @@ -155,25 +180,67 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> { url.to_owned() }; - // Chop off any query params/fragments - if let Some(hash) = canonical.rfind('#') { - canonical.truncate(hash); - } + let ident = match hash_kind { + HashKind::Stable => { + // Locate the the first instance of params/fragments. + let mut params_index = { + let question = canonical.find('?'); + let hash = canonical.rfind('#'); + + question.zip(hash).map(|(q, h)| q.min(h)).or(question).or(hash) + }; + + // Attempt to trim `.git` from the end of url paths. + canonical = if let Some(idx) = params_index { + let base_url = &canonical[..idx]; + let params = &canonical[idx..]; + + if let Some(sanitized) = base_url.strip_suffix(".git") { + params_index = Some(idx - 4); + format!("{}{}", sanitized, params) + } else { + canonical + } + } else { + if canonical.ends_with(".git") { + canonical.truncate(canonical.len() - 4); + } + canonical + }; + + let ident = to_hex(hash_u64(&canonical, registry_kind)); + + // Strip params + if let Some(idx) = params_index { + canonical.truncate(canonical.len() - (canonical.len() - idx)); + } + + ident + } + HashKind::Legacy => { + // Chop off any query params/fragments + if let Some(hash) = canonical.rfind('#') { + canonical.truncate(hash); + } - if let Some(query) = canonical.rfind('?') { - canonical.truncate(query); - } + if let Some(query) = canonical.rfind('?') { + canonical.truncate(query); + } - let ident = to_hex(hash_u64(&canonical, registry_kind)); + if canonical.ends_with('/') { + canonical.pop(); + } - if canonical.ends_with('/') { - canonical.pop(); - } + let ident = to_hex(hash_u64(&canonical, registry_kind)); - if canonical.contains("github.com/") && canonical.ends_with(".git") { - // Only GitHub (crates.io) repositories have their .git suffix truncated - canonical.truncate(canonical.len() - 4); - } + // Only GitHub (crates.io) repositories have their .git suffix truncated + if canonical.contains("github.com/") && canonical.ends_with(".git") { + canonical.truncate(canonical.len() - 4); + } + + ident + } + }; (ident, canonical) } else { @@ -185,25 +252,45 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> { #[cfg(test)] mod test { + use crate::dirs::HashKind; #[test] fn http_index_url_matches_cargo() { use crate::sparse::URL; assert_eq!( - super::url_to_local_dir(URL).unwrap(), + super::url_to_local_dir(URL, &HashKind::Legacy).unwrap(), ("index.crates.io-6f17d22bba15001f".to_owned(), URL.to_owned(),) ); + assert_eq!( + super::url_to_local_dir(URL, &HashKind::Stable).unwrap(), + ("index.crates.io-1949cf8c6b5b557f".to_owned(), URL.to_owned(),) + ); // I've confirmed this also works with a custom registry, unfortunately // that one includes a secret key as part of the url which would allow // anyone to publish to the registry, so uhh...here's a fake one instead assert_eq!( - super::url_to_local_dir("https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git").unwrap(), + super::url_to_local_dir( + "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git", + &HashKind::Legacy + ) + .unwrap(), ( "dl.cloudsmith.io-ff79e51ddd2b38fd".to_owned(), "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git".to_owned() ) ); + assert_eq!( + super::url_to_local_dir( + "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git", + &HashKind::Stable + ) + .unwrap(), + ( + "dl.cloudsmith.io-5e6de3fada793d05".to_owned(), + "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index".to_owned() + ) + ); } #[test] @@ -211,15 +298,25 @@ mod test { fn git_url_matches_cargo() { use crate::git::URL; assert_eq!( - crate::dirs::url_to_local_dir(URL).unwrap(), + crate::dirs::url_to_local_dir(URL, &HashKind::Legacy).unwrap(), ("github.com-1ecc6299db9ec823".to_owned(), URL.to_owned()) ); + assert_eq!( + crate::dirs::url_to_local_dir(URL, &HashKind::Stable).unwrap(), + ("github.com-25cdd57fae9f0462".to_owned(), URL.to_owned()) + ); // Ensure we actually strip off the irrelevant parts of a url, note that // the .git suffix is not part of the canonical url, but *is* used when hashing assert_eq!( - crate::dirs::url_to_local_dir(&format!("registry+{}.git?one=1&two=2#fragment", URL)).unwrap(), + crate::dirs::url_to_local_dir(&format!("registry+{}.git?one=1&two=2#fragment", URL), &HashKind::Legacy) + .unwrap(), ("github.com-c786010fb7ef2e6e".to_owned(), URL.to_owned()) ); + assert_eq!( + crate::dirs::url_to_local_dir(&format!("registry+{}.git?one=1&two=2#fragment", URL), &HashKind::Stable) + .unwrap(), + ("github.com-e78ed0bbfe5f35d7".to_owned(), URL.to_owned()) + ); } } diff --git a/src/git/impl_.rs b/src/git/impl_.rs index 5823d0de..e6ee3c8d 100644 --- a/src/git/impl_.rs +++ b/src/git/impl_.rs @@ -1,5 +1,7 @@ use crate::dedupe::DedupeContext; -use crate::dirs::{crate_name_to_relative_path, local_path_and_canonical_url}; +use crate::dirs::{ + crate_name_to_relative_path, local_path_and_canonical_url_with_hash_kind, HashKind, DEFAULT_HASHER_KIND, +}; use crate::error::GixError; use crate::git::{changes, config, URL}; use crate::{path_max_byte_len, Crate, Error, GitIndex, IndexConfig}; @@ -92,7 +94,12 @@ impl GitIndex { /// Concurrent invocations may fail if the index needs to be cloned. To prevent that, /// use synchronization mechanisms like mutexes or file locks as needed by the application. pub fn from_url(url: &str) -> Result { - let (path, canonical_url) = local_path_and_canonical_url(url, None)?; + Self::from_url_with_hash_kind(url, &DEFAULT_HASHER_KIND) + } + + /// Like [`Self::from_url`], but accepts an explicit [`HashKind`] for determining the crates index path. + pub fn from_url_with_hash_kind(url: &str, hash_kind: &HashKind) -> Result { + let (path, canonical_url) = local_path_and_canonical_url_with_hash_kind(url, None, hash_kind)?; Ok( Self::from_path_and_url(path, canonical_url, Mode::CloneUrlToPathIfRepoMissing)? .expect("repo present after possibly cloning it"), @@ -101,7 +108,12 @@ impl GitIndex { /// Like [`Self::from_url()`], but read-only without auto-cloning the index at `url`. pub fn try_from_url(url: &str) -> Result, Error> { - let (path, canonical_url) = local_path_and_canonical_url(url, None)?; + Self::try_from_url_with_hash_kind(url, &DEFAULT_HASHER_KIND) + } + + /// Like [`Self::try_from_url`], but accepts an explicit [`HashKind`] for determining the crates index path. + pub fn try_from_url_with_hash_kind(url: &str, hash_kind: &HashKind) -> Result, Error> { + let (path, canonical_url) = local_path_and_canonical_url_with_hash_kind(url, None, hash_kind)?; Self::from_path_and_url(path, canonical_url, Mode::ReadOnly) } diff --git a/src/lib.rs b/src/lib.rs index a7af90c7..951370c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -138,7 +138,7 @@ pub use config::IndexConfig; mod dedupe; mod dirs; -pub use dirs::local_path_and_canonical_url; +pub use dirs::{local_path_and_canonical_url, local_path_and_canonical_url_with_hash_kind, HashKind}; /// Re-exports in case you want to inspect specific error details pub mod error; @@ -156,6 +156,7 @@ pub use error::Error; /// /// [reqwest]: https://github.com/frewsxcv/rust-crates-index/blob/HEAD/examples/sparse_http_reqwest.rs /// [ureq]: https://github.com/frewsxcv/rust-crates-index/blob/HEAD/examples/sparse_http_ureq.rs +#[derive(Debug)] pub struct SparseIndex { path: PathBuf, url: String, diff --git a/src/sparse.rs b/src/sparse.rs index 20a5b9da..abe07d6a 100644 --- a/src/sparse.rs +++ b/src/sparse.rs @@ -1,8 +1,10 @@ use std::io; use std::path::{Path, PathBuf}; -use crate::dirs::crate_name_to_relative_path; -use crate::{dirs::local_path_and_canonical_url, path_max_byte_len, Crate, Error, IndexConfig, SparseIndex}; +use crate::dirs::{ + crate_name_to_relative_path, local_path_and_canonical_url_with_hash_kind, HashKind, DEFAULT_HASHER_KIND, +}; +use crate::{path_max_byte_len, Crate, Error, IndexConfig, SparseIndex}; /// The default URL of the crates.io HTTP index, see [`SparseIndex::from_url`] and [`SparseIndex::new_cargo_default`] pub const URL: &str = "sparse+https://index.crates.io/"; @@ -15,7 +17,13 @@ impl SparseIndex { /// Note this function takes the `CARGO_HOME` environment variable into account #[inline] pub fn from_url(url: &str) -> Result { - Self::with_path(home::cargo_home()?, url) + Self::from_url_with_hash_kind(url, &DEFAULT_HASHER_KIND) + } + + /// Like [`Self::from_url`] but accepts an explicit [`HashKind`] for determining the crates index path. + #[inline] + pub fn from_url_with_hash_kind(url: &str, hash_kind: &HashKind) -> Result { + Self::with_path_and_hash_kind(home::cargo_home()?, url, hash_kind) } /// Creates an index for the default crates.io registry, using the same @@ -33,6 +41,16 @@ impl SparseIndex { /// at the specified location #[inline] pub fn with_path(cargo_home: impl AsRef, url: impl AsRef) -> Result { + Self::with_path_and_hash_kind(cargo_home, url, &DEFAULT_HASHER_KIND) + } + + /// Like [`Self::with_path`] but accepts an explicit [`HashKind`] for determining the crates index path. + #[inline] + pub fn with_path_and_hash_kind( + cargo_home: impl AsRef, + url: impl AsRef, + hash_kind: &HashKind, + ) -> Result { let url = url.as_ref(); // It is required to have the sparse+ scheme modifier for sparse urls as // they are part of the short ident hash calculation done by cargo @@ -40,7 +58,7 @@ impl SparseIndex { return Err(Error::Url(url.to_owned())); } - let (path, url) = local_path_and_canonical_url(url, Some(cargo_home.as_ref()))?; + let (path, url) = local_path_and_canonical_url_with_hash_kind(url, Some(cargo_home.as_ref()), hash_kind)?; Ok(Self::at_path(path, url)) } @@ -70,7 +88,8 @@ impl SparseIndex { .cache_path(name) .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "bad name"))?; - let cache_bytes = std::fs::read(cache_path)?; + let cache_bytes = std::fs::read(&cache_path) + .map_err(|e| io::Error::new(e.kind(), format!("{}: `{}`", e.to_string(), cache_path.display())))?; Ok(Crate::from_cache_slice(&cache_bytes, None)?) } From bdd79190deb2b461c61ba4d5ec49757392d95422 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 19 Feb 2025 07:20:23 +0100 Subject: [PATCH 2/2] thanks clippy --- src/lib.rs | 4 ++-- src/sparse.rs | 2 +- src/types.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 951370c3..38ed949a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -130,7 +130,7 @@ pub struct GitIndex { pub(crate) head_commit: gix::ObjectId, } -/// +/// The Git based index implementation pub mod git; mod config; @@ -162,7 +162,7 @@ pub struct SparseIndex { url: String, } -/// +/// The sparse index implementation. pub mod sparse; /// The matching `http` types for use in the [`sparse`] API. #[cfg(feature = "sparse")] diff --git a/src/sparse.rs b/src/sparse.rs index abe07d6a..454361c0 100644 --- a/src/sparse.rs +++ b/src/sparse.rs @@ -89,7 +89,7 @@ impl SparseIndex { .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "bad name"))?; let cache_bytes = std::fs::read(&cache_path) - .map_err(|e| io::Error::new(e.kind(), format!("{}: `{}`", e.to_string(), cache_path.display())))?; + .map_err(|e| io::Error::new(e.kind(), format!("{}: `{}`", e, cache_path.display())))?; Ok(Crate::from_cache_slice(&cache_bytes, None)?) } diff --git a/src/types.rs b/src/types.rs index 40bc254e..ae3db790 100644 --- a/src/types.rs +++ b/src/types.rs @@ -298,7 +298,7 @@ impl Crate { /// 1. There is no entry for this crate /// 2. The entry was created with an older version than the one specified /// 3. The entry is a newer version than what can be read, would only - /// happen if a future version of cargo changed the format of the cache entries + /// happen if a future version of cargo changed the format of the cache entries /// 4. The cache entry is malformed somehow #[inline(never)] pub(crate) fn from_cache_slice(bytes: &[u8], index_version: Option<&str>) -> io::Result {