Skip to content

Commit

Permalink
Merge pull request #184 from UebelAndre/stable_hash
Browse files Browse the repository at this point in the history
Add support for new index hash implementation.
  • Loading branch information
Byron authored Feb 19, 2025
2 parents 973f2e5 + bdd7919 commit 1f3b4b0
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 64 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ http = { version = "1", optional = true }
memchr = "2.5.0"
rayon = { version = "1.7.0", optional = true }
rustc-hash = "2.0.0"
rustc-stable-hash = "0.1.1"
semver = "1.0.17"
serde = { version = "1.0.160", features = ["rc"] }
serde_derive = "1.0.160"
Expand Down
201 changes: 149 additions & 52 deletions src/dirs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@ pub fn local_path_and_canonical_url(
url: &str,
cargo_home: Option<&std::path::Path>,
) -> Result<(std::path::PathBuf, String), Error> {
let (dir_name, canonical_url) = url_to_local_dir(url)?;
local_path_and_canonical_url_with_hash_kind(url, cargo_home, &DEFAULT_HASHER_KIND)
}

/// Like [`local_path_and_canonical_url`] but accepts [`HashKind`] for determining the crate index path.
pub fn local_path_and_canonical_url_with_hash_kind(
url: &str,
cargo_home: Option<&std::path::Path>,
hash_kind: &HashKind,
) -> Result<(std::path::PathBuf, String), Error> {
let (dir_name, canonical_url) = url_to_local_dir(url, hash_kind)?;

let mut path = match cargo_home {
Some(path) => path.to_owned(),
Expand Down Expand Up @@ -70,39 +79,31 @@ pub(crate) fn crate_name_to_relative_path(crate_name: &str, separator: Option<ch
Some(rel_path)
}

/// Converts a full url, eg https://github.com/rust-lang/crates.io-index, into
/// the root directory name where cargo itself will fetch it on disk
fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {
fn to_hex(num: u64) -> String {
const CHARS: &[u8] = b"0123456789abcdef";

let bytes = &[
num as u8,
(num >> 8) as u8,
(num >> 16) as u8,
(num >> 24) as u8,
(num >> 32) as u8,
(num >> 40) as u8,
(num >> 48) as u8,
(num >> 56) as u8,
];
/// Numeric stand-in for cargo's source kind; the value is fed into the hash that names the local index directory.
///
/// Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/crates/cargo-util-schemas/src/core/source_kind.rs#L5
type SourceKind = u64;
/// Kind used by default, i.e. for every url whose scheme does not start with `sparse+http`.
const SOURCE_KIND_REGISTRY: SourceKind = 2;
/// Kind used when the url scheme starts with `sparse+http`.
// NOTE(review): `SPASE` looks like a typo for `SPARSE`; a rename must also update the use site in `url_to_local_dir`.
const SOURCE_KIND_SPASE_REGISTRY: SourceKind = 3;

let mut output = vec![0u8; 16];
/// Determine the crate registry hashing strategy for locating local crate indexes.
pub enum HashKind {
/// Use the new hashing behavior introduced in Rust `1.85.0`.
Stable,

let mut ind = 0;

for &byte in bytes {
output[ind] = CHARS[(byte >> 4) as usize];
output[ind + 1] = CHARS[(byte & 0xf) as usize];

ind += 2;
}
/// Use a hashing strategy that matches Cargo versions less than `1.85.0`
Legacy,
}

String::from_utf8(output).expect("valid utf-8 hex string")
}
// For now, this acts as a centralized place to change the default. Ideally
// this would be compiled conditionally based on the version of rustc, as
// a nice approximation of when consumers will be using the associated hash
// implementation, but this behavior is not yet stable: https://github.com/rust-lang/rust/issues/64796
/// The [`HashKind`] used by the entry points that do not take an explicit hash kind.
pub(crate) const DEFAULT_HASHER_KIND: HashKind = HashKind::Legacy;

/// Converts a full url, eg https://github.com/rust-lang/crates.io-index, into
/// the root directory name where cargo itself will fetch it on disk
fn url_to_local_dir(url: &str, hash_kind: &HashKind) -> Result<(String, String), Error> {
#[allow(deprecated)]
fn hash_u64(url: &str, registry_kind: u64) -> u64 {
fn legacy_hash_u64(url: &str, registry_kind: u64) -> u64 {
use std::hash::{Hash, Hasher, SipHasher};

let mut hasher = SipHasher::new_with_keys(0, 0);
Expand All @@ -113,8 +114,32 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {
hasher.finish()
}

// SourceKind::Registry
let mut registry_kind = 2;
// Hashes the registry kind followed by the url with the stable SipHasher128.
// Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/src/cargo/util/hasher.rs#L6
fn stable_hash_u64(url: &str, registry_kind: u64) -> u64 {
    use rustc_stable_hash::StableSipHasher128;
    use std::hash::{Hash, Hasher};

    // The integer type that gets hashed influences the result of the
    // `rustc_stable_hash` hasher, hence the kind is fed in as an `isize`.
    let kind = registry_kind as isize;

    let mut state = StableSipHasher128::new();
    // Order matters: kind first, url second.
    kind.hash(&mut state);
    url.hash(&mut state);

    // Explicitly call the `std::hash::Hasher` trait method, which yields the `u64` digest.
    Hasher::finish(&state)
}

// Hex-encodes the little-endian bytes of `num`.
// Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/src/cargo/util/hex.rs#L6
fn to_hex(num: u64) -> String {
    let little_endian_bytes = num.to_le_bytes();
    hex::encode(little_endian_bytes)
}

let hash_u64 = match hash_kind {
HashKind::Stable => stable_hash_u64,
HashKind::Legacy => legacy_hash_u64,
};

let mut registry_kind = SOURCE_KIND_REGISTRY;

// Ensure we have a registry or bare url
let (url, scheme_ind) = {
Expand All @@ -124,7 +149,7 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {

let scheme_str = &url[..scheme_ind];
if scheme_str.starts_with("sparse+http") {
registry_kind = 3;
registry_kind = SOURCE_KIND_SPASE_REGISTRY;
(url, scheme_ind)
} else if let Some(ind) = scheme_str.find('+') {
if &scheme_str[..ind] != "registry" {
Expand All @@ -147,33 +172,75 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {
// trim port
let host = host.split(':').next().unwrap();

let (ident, url) = if registry_kind == 2 {
let (ident, url) = if registry_kind == SOURCE_KIND_REGISTRY {
// cargo special cases github.com for reasons, so do the same
let mut canonical = if host == "github.com" {
url.to_lowercase()
} else {
url.to_owned()
};

// Chop off any query params/fragments
if let Some(hash) = canonical.rfind('#') {
canonical.truncate(hash);
}
let ident = match hash_kind {
HashKind::Stable => {
// Locate the first instance of params/fragments.
let mut params_index = {
let question = canonical.find('?');
let hash = canonical.rfind('#');

question.zip(hash).map(|(q, h)| q.min(h)).or(question).or(hash)
};

// Attempt to trim `.git` from the end of url paths.
canonical = if let Some(idx) = params_index {
let base_url = &canonical[..idx];
let params = &canonical[idx..];

if let Some(sanitized) = base_url.strip_suffix(".git") {
params_index = Some(idx - 4);
format!("{}{}", sanitized, params)
} else {
canonical
}
} else {
if canonical.ends_with(".git") {
canonical.truncate(canonical.len() - 4);
}
canonical
};

let ident = to_hex(hash_u64(&canonical, registry_kind));

// Strip params
if let Some(idx) = params_index {
canonical.truncate(canonical.len() - (canonical.len() - idx));
}

ident
}
HashKind::Legacy => {
// Chop off any query params/fragments
if let Some(hash) = canonical.rfind('#') {
canonical.truncate(hash);
}

if let Some(query) = canonical.rfind('?') {
canonical.truncate(query);
}
if let Some(query) = canonical.rfind('?') {
canonical.truncate(query);
}

let ident = to_hex(hash_u64(&canonical, registry_kind));
if canonical.ends_with('/') {
canonical.pop();
}

if canonical.ends_with('/') {
canonical.pop();
}
let ident = to_hex(hash_u64(&canonical, registry_kind));

if canonical.contains("github.com/") && canonical.ends_with(".git") {
// Only GitHub (crates.io) repositories have their .git suffix truncated
canonical.truncate(canonical.len() - 4);
}
// Only GitHub (crates.io) repositories have their .git suffix truncated
if canonical.contains("github.com/") && canonical.ends_with(".git") {
canonical.truncate(canonical.len() - 4);
}

ident
}
};

(ident, canonical)
} else {
Expand All @@ -185,41 +252,71 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {

#[cfg(test)]
mod test {
use crate::dirs::HashKind;

#[test]
fn http_index_url_matches_cargo() {
use crate::sparse::URL;
assert_eq!(
super::url_to_local_dir(URL).unwrap(),
super::url_to_local_dir(URL, &HashKind::Legacy).unwrap(),
("index.crates.io-6f17d22bba15001f".to_owned(), URL.to_owned(),)
);
assert_eq!(
super::url_to_local_dir(URL, &HashKind::Stable).unwrap(),
("index.crates.io-1949cf8c6b5b557f".to_owned(), URL.to_owned(),)
);

// I've confirmed this also works with a custom registry, unfortunately
// that one includes a secret key as part of the url which would allow
// anyone to publish to the registry, so uhh...here's a fake one instead
assert_eq!(
super::url_to_local_dir("https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git").unwrap(),
super::url_to_local_dir(
"https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git",
&HashKind::Legacy
)
.unwrap(),
(
"dl.cloudsmith.io-ff79e51ddd2b38fd".to_owned(),
"https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git".to_owned()
)
);
assert_eq!(
super::url_to_local_dir(
"https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git",
&HashKind::Stable
)
.unwrap(),
(
"dl.cloudsmith.io-5e6de3fada793d05".to_owned(),
"https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index".to_owned()
)
);
}

#[test]
#[cfg(feature = "git")]
fn git_url_matches_cargo() {
use crate::git::URL;
assert_eq!(
crate::dirs::url_to_local_dir(URL).unwrap(),
crate::dirs::url_to_local_dir(URL, &HashKind::Legacy).unwrap(),
("github.com-1ecc6299db9ec823".to_owned(), URL.to_owned())
);
assert_eq!(
crate::dirs::url_to_local_dir(URL, &HashKind::Stable).unwrap(),
("github.com-25cdd57fae9f0462".to_owned(), URL.to_owned())
);

// Ensure we actually strip off the irrelevant parts of a url, note that
// the .git suffix is not part of the canonical url, but *is* used when hashing
assert_eq!(
crate::dirs::url_to_local_dir(&format!("registry+{}.git?one=1&two=2#fragment", URL)).unwrap(),
crate::dirs::url_to_local_dir(&format!("registry+{}.git?one=1&two=2#fragment", URL), &HashKind::Legacy)
.unwrap(),
("github.com-c786010fb7ef2e6e".to_owned(), URL.to_owned())
);
assert_eq!(
crate::dirs::url_to_local_dir(&format!("registry+{}.git?one=1&two=2#fragment", URL), &HashKind::Stable)
.unwrap(),
("github.com-e78ed0bbfe5f35d7".to_owned(), URL.to_owned())
);
}
}
18 changes: 15 additions & 3 deletions src/git/impl_.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use crate::dedupe::DedupeContext;
use crate::dirs::{crate_name_to_relative_path, local_path_and_canonical_url};
use crate::dirs::{
crate_name_to_relative_path, local_path_and_canonical_url_with_hash_kind, HashKind, DEFAULT_HASHER_KIND,
};
use crate::error::GixError;
use crate::git::{changes, config, URL};
use crate::{path_max_byte_len, Crate, Error, GitIndex, IndexConfig};
Expand Down Expand Up @@ -92,7 +94,12 @@ impl GitIndex {
/// Concurrent invocations may fail if the index needs to be cloned. To prevent that,
/// use synchronization mechanisms like mutexes or file locks as needed by the application.
pub fn from_url(url: &str) -> Result<Self, Error> {
let (path, canonical_url) = local_path_and_canonical_url(url, None)?;
Self::from_url_with_hash_kind(url, &DEFAULT_HASHER_KIND)
}

/// Like [`Self::from_url`], but accepts an explicit [`HashKind`] for determining the crates index path.
pub fn from_url_with_hash_kind(url: &str, hash_kind: &HashKind) -> Result<Self, Error> {
let (path, canonical_url) = local_path_and_canonical_url_with_hash_kind(url, None, hash_kind)?;
Ok(
Self::from_path_and_url(path, canonical_url, Mode::CloneUrlToPathIfRepoMissing)?
.expect("repo present after possibly cloning it"),
Expand All @@ -101,7 +108,12 @@ impl GitIndex {

/// Like [`Self::from_url()`], but read-only without auto-cloning the index at `url`.
pub fn try_from_url(url: &str) -> Result<Option<Self>, Error> {
let (path, canonical_url) = local_path_and_canonical_url(url, None)?;
Self::try_from_url_with_hash_kind(url, &DEFAULT_HASHER_KIND)
}

/// Variant of [`Self::try_from_url`] that takes an explicit [`HashKind`], which determines the
/// local crates index path derived from `url`. The index is opened in read-only mode.
pub fn try_from_url_with_hash_kind(url: &str, hash_kind: &HashKind) -> Result<Option<Self>, Error> {
    // No explicit cargo home is supplied when resolving the path.
    let resolved = local_path_and_canonical_url_with_hash_kind(url, None, hash_kind)?;
    let (index_path, canonical_url) = resolved;
    Self::from_path_and_url(index_path, canonical_url, Mode::ReadOnly)
}

Expand Down
7 changes: 4 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,15 @@ pub struct GitIndex {
pub(crate) head_commit: gix::ObjectId,
}

///
/// The Git based index implementation
pub mod git;

mod config;
pub use config::IndexConfig;

mod dedupe;
mod dirs;
pub use dirs::local_path_and_canonical_url;
pub use dirs::{local_path_and_canonical_url, local_path_and_canonical_url_with_hash_kind, HashKind};

/// Re-exports in case you want to inspect specific error details
pub mod error;
Expand All @@ -156,12 +156,13 @@ pub use error::Error;
///
/// [reqwest]: https://github.com/frewsxcv/rust-crates-index/blob/HEAD/examples/sparse_http_reqwest.rs
/// [ureq]: https://github.com/frewsxcv/rust-crates-index/blob/HEAD/examples/sparse_http_ureq.rs
#[derive(Debug)]
pub struct SparseIndex {
path: PathBuf,
url: String,
}

///
/// The sparse index implementation.
pub mod sparse;
/// The matching `http` types for use in the [`sparse`] API.
#[cfg(feature = "sparse")]
Expand Down
Loading

0 comments on commit 1f3b4b0

Please sign in to comment.