Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for new index hash implementation. #184

Merged
merged 2 commits into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ http = { version = "1", optional = true }
memchr = "2.5.0"
rayon = { version = "1.7.0", optional = true }
rustc-hash = "2.0.0"
rustc-stable-hash = "0.1.1"
semver = "1.0.17"
serde = { version = "1.0.160", features = ["rc"] }
serde_derive = "1.0.160"
Expand Down
201 changes: 149 additions & 52 deletions src/dirs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@ pub fn local_path_and_canonical_url(
url: &str,
cargo_home: Option<&std::path::Path>,
) -> Result<(std::path::PathBuf, String), Error> {
let (dir_name, canonical_url) = url_to_local_dir(url)?;
local_path_and_canonical_url_with_hash_kind(url, cargo_home, &DEFAULT_HASHER_KIND)
}

/// Like [`local_path_and_canonical_url`] but accepts [`HashKind`] for determining the crate index path.
pub fn local_path_and_canonical_url_with_hash_kind(
url: &str,
cargo_home: Option<&std::path::Path>,
hash_kind: &HashKind,
) -> Result<(std::path::PathBuf, String), Error> {
let (dir_name, canonical_url) = url_to_local_dir(url, hash_kind)?;

let mut path = match cargo_home {
Some(path) => path.to_owned(),
Expand Down Expand Up @@ -70,39 +79,31 @@ pub(crate) fn crate_name_to_relative_path(crate_name: &str, separator: Option<ch
Some(rel_path)
}

/// Converts a full url, eg https://github.com/rust-lang/crates.io-index, into
/// the root directory name where cargo itself will fetch it on disk
fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {
fn to_hex(num: u64) -> String {
const CHARS: &[u8] = b"0123456789abcdef";

let bytes = &[
num as u8,
(num >> 8) as u8,
(num >> 16) as u8,
(num >> 24) as u8,
(num >> 32) as u8,
(num >> 40) as u8,
(num >> 48) as u8,
(num >> 56) as u8,
];
/// Numeric tag for the kind of source; it is passed to the hash function together with the URL
/// when computing the local index directory name.
///
/// Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/crates/cargo-util-schemas/src/core/source_kind.rs#L5
type SourceKind = u64;
/// Tag for a (non-sparse) registry source; this is the initial `registry_kind`.
const SOURCE_KIND_REGISTRY: SourceKind = 2;
// NOTE(review): `SPASE` looks like a misspelling of `SPARSE`; renaming it requires updating
// every use site in the same change.
/// Tag selected when the URL scheme starts with `sparse+http`.
const SOURCE_KIND_SPASE_REGISTRY: SourceKind = 3;

let mut output = vec![0u8; 16];
/// Determine the crate registry hashing strategy for locating local crate indexes.
pub enum HashKind {
/// Use the new hashing behavior introduced in Rust `1.85.0`.
Stable,

let mut ind = 0;

for &byte in bytes {
output[ind] = CHARS[(byte >> 4) as usize];
output[ind + 1] = CHARS[(byte & 0xf) as usize];

ind += 2;
}
/// Use the hashing strategy of Cargo versions older than `1.85.0`.
Legacy,
}

String::from_utf8(output).expect("valid utf-8 hex string")
}
// Single, centralized place to change the default hashing strategy.
//
// Ideally the default would be chosen at compile time from the version of
// rustc, as a reasonable approximation of which hash implementation consumers
// will be using, but that capability is not yet stable:
// https://github.com/rust-lang/rust/issues/64796
pub(crate) const DEFAULT_HASHER_KIND: HashKind = HashKind::Legacy;

/// Converts a full url, eg https://github.com/rust-lang/crates.io-index, into
/// the root directory name where cargo itself will fetch it on disk
fn url_to_local_dir(url: &str, hash_kind: &HashKind) -> Result<(String, String), Error> {
#[allow(deprecated)]
fn hash_u64(url: &str, registry_kind: u64) -> u64 {
fn legacy_hash_u64(url: &str, registry_kind: u64) -> u64 {
use std::hash::{Hash, Hasher, SipHasher};

let mut hasher = SipHasher::new_with_keys(0, 0);
Expand All @@ -113,8 +114,32 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {
hasher.finish()
}

// SourceKind::Registry
let mut registry_kind = 2;
// Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/src/cargo/util/hasher.rs#L6
//
// Hashes `registry_kind` and then `url` (in that order) with the
// `StableSipHasher128` hasher from `rustc_stable_hash`, returning the `u64`
// produced by `std::hash::Hasher::finish`.
fn stable_hash_u64(url: &str, registry_kind: u64) -> u64 {
use rustc_stable_hash::StableSipHasher128 as StableHasher;
use std::hash::{Hash, Hasher};

let mut hasher = StableHasher::new();

// The integer type that is hashed has an impact on the result in
// `rustc_stable_hash`, so the kind is deliberately hashed as an `isize`.
// NOTE(review): presumably this mirrors how Cargo hashes its source-kind
// discriminant — confirm against the linked Cargo source.
(registry_kind as isize).hash(&mut hasher);

url.hash(&mut hasher);

// The trait method is called by explicit path.
// NOTE(review): presumably to avoid resolving to an inherent `finish` on
// the stable hasher — confirm against the `rustc_stable_hash` API.
Hasher::finish(&hasher)
}

// Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/src/cargo/util/hex.rs#L6
//
// Renders `num` as a hex string, encoding its bytes in little-endian order.
fn to_hex(num: u64) -> String {
    let le_bytes = num.to_le_bytes();
    hex::encode(le_bytes)
}

let hash_u64 = match hash_kind {
HashKind::Stable => stable_hash_u64,
HashKind::Legacy => legacy_hash_u64,
};

let mut registry_kind = SOURCE_KIND_REGISTRY;

// Ensure we have a registry or bare url
let (url, scheme_ind) = {
Expand All @@ -124,7 +149,7 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {

let scheme_str = &url[..scheme_ind];
if scheme_str.starts_with("sparse+http") {
registry_kind = 3;
registry_kind = SOURCE_KIND_SPASE_REGISTRY;
(url, scheme_ind)
} else if let Some(ind) = scheme_str.find('+') {
if &scheme_str[..ind] != "registry" {
Expand All @@ -147,33 +172,75 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {
// trim port
let host = host.split(':').next().unwrap();

let (ident, url) = if registry_kind == 2 {
let (ident, url) = if registry_kind == SOURCE_KIND_REGISTRY {
// cargo special cases github.com for reasons, so do the same
let mut canonical = if host == "github.com" {
url.to_lowercase()
} else {
url.to_owned()
};

// Chop off any query params/fragments
if let Some(hash) = canonical.rfind('#') {
canonical.truncate(hash);
}
let ident = match hash_kind {
HashKind::Stable => {
// Locate the first instance of params/fragments.
let mut params_index = {
let question = canonical.find('?');
let hash = canonical.rfind('#');

question.zip(hash).map(|(q, h)| q.min(h)).or(question).or(hash)
};

// Attempt to trim `.git` from the end of url paths.
canonical = if let Some(idx) = params_index {
let base_url = &canonical[..idx];
let params = &canonical[idx..];

if let Some(sanitized) = base_url.strip_suffix(".git") {
params_index = Some(idx - 4);
format!("{}{}", sanitized, params)
} else {
canonical
}
} else {
if canonical.ends_with(".git") {
canonical.truncate(canonical.len() - 4);
}
canonical
};

let ident = to_hex(hash_u64(&canonical, registry_kind));

// Strip params
if let Some(idx) = params_index {
canonical.truncate(canonical.len() - (canonical.len() - idx));
}

ident
}
HashKind::Legacy => {
// Chop off any query params/fragments
if let Some(hash) = canonical.rfind('#') {
canonical.truncate(hash);
}

if let Some(query) = canonical.rfind('?') {
canonical.truncate(query);
}
if let Some(query) = canonical.rfind('?') {
canonical.truncate(query);
}

let ident = to_hex(hash_u64(&canonical, registry_kind));
if canonical.ends_with('/') {
canonical.pop();
}

if canonical.ends_with('/') {
canonical.pop();
}
let ident = to_hex(hash_u64(&canonical, registry_kind));

if canonical.contains("github.com/") && canonical.ends_with(".git") {
// Only GitHub (crates.io) repositories have their .git suffix truncated
canonical.truncate(canonical.len() - 4);
}
// Only GitHub (crates.io) repositories have their .git suffix truncated
if canonical.contains("github.com/") && canonical.ends_with(".git") {
canonical.truncate(canonical.len() - 4);
}

ident
}
};

(ident, canonical)
} else {
Expand All @@ -185,41 +252,71 @@ fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {

#[cfg(test)]
mod test {
use crate::dirs::HashKind;

#[test]
fn http_index_url_matches_cargo() {
use crate::sparse::URL;
assert_eq!(
super::url_to_local_dir(URL).unwrap(),
super::url_to_local_dir(URL, &HashKind::Legacy).unwrap(),
("index.crates.io-6f17d22bba15001f".to_owned(), URL.to_owned(),)
);
assert_eq!(
super::url_to_local_dir(URL, &HashKind::Stable).unwrap(),
("index.crates.io-1949cf8c6b5b557f".to_owned(), URL.to_owned(),)
);

// I've confirmed this also works with a custom registry, unfortunately
// that one includes a secret key as part of the url which would allow
// anyone to publish to the registry, so uhh...here's a fake one instead
assert_eq!(
super::url_to_local_dir("https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git").unwrap(),
super::url_to_local_dir(
"https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git",
&HashKind::Legacy
)
.unwrap(),
(
"dl.cloudsmith.io-ff79e51ddd2b38fd".to_owned(),
"https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git".to_owned()
)
);
assert_eq!(
super::url_to_local_dir(
"https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git",
&HashKind::Stable
)
.unwrap(),
(
"dl.cloudsmith.io-5e6de3fada793d05".to_owned(),
"https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index".to_owned()
)
);
}

#[test]
#[cfg(feature = "git")]
fn git_url_matches_cargo() {
use crate::git::URL;
assert_eq!(
crate::dirs::url_to_local_dir(URL).unwrap(),
crate::dirs::url_to_local_dir(URL, &HashKind::Legacy).unwrap(),
("github.com-1ecc6299db9ec823".to_owned(), URL.to_owned())
);
assert_eq!(
crate::dirs::url_to_local_dir(URL, &HashKind::Stable).unwrap(),
("github.com-25cdd57fae9f0462".to_owned(), URL.to_owned())
);

// Ensure we actually strip off the irrelevant parts of a url, note that
// the .git suffix is not part of the canonical url, but *is* used when hashing
assert_eq!(
crate::dirs::url_to_local_dir(&format!("registry+{}.git?one=1&two=2#fragment", URL)).unwrap(),
crate::dirs::url_to_local_dir(&format!("registry+{}.git?one=1&two=2#fragment", URL), &HashKind::Legacy)
.unwrap(),
("github.com-c786010fb7ef2e6e".to_owned(), URL.to_owned())
);
assert_eq!(
crate::dirs::url_to_local_dir(&format!("registry+{}.git?one=1&two=2#fragment", URL), &HashKind::Stable)
.unwrap(),
("github.com-e78ed0bbfe5f35d7".to_owned(), URL.to_owned())
);
}
}
18 changes: 15 additions & 3 deletions src/git/impl_.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use crate::dedupe::DedupeContext;
use crate::dirs::{crate_name_to_relative_path, local_path_and_canonical_url};
use crate::dirs::{
crate_name_to_relative_path, local_path_and_canonical_url_with_hash_kind, HashKind, DEFAULT_HASHER_KIND,
};
use crate::error::GixError;
use crate::git::{changes, config, URL};
use crate::{path_max_byte_len, Crate, Error, GitIndex, IndexConfig};
Expand Down Expand Up @@ -92,7 +94,12 @@ impl GitIndex {
/// Concurrent invocations may fail if the index needs to be cloned. To prevent that,
/// use synchronization mechanisms like mutexes or file locks as needed by the application.
pub fn from_url(url: &str) -> Result<Self, Error> {
let (path, canonical_url) = local_path_and_canonical_url(url, None)?;
Self::from_url_with_hash_kind(url, &DEFAULT_HASHER_KIND)
}

/// Like [`Self::from_url`], but accepts an explicit [`HashKind`] for determining the crates index path.
pub fn from_url_with_hash_kind(url: &str, hash_kind: &HashKind) -> Result<Self, Error> {
let (path, canonical_url) = local_path_and_canonical_url_with_hash_kind(url, None, hash_kind)?;
Ok(
Self::from_path_and_url(path, canonical_url, Mode::CloneUrlToPathIfRepoMissing)?
.expect("repo present after possibly cloning it"),
Expand All @@ -101,7 +108,12 @@ impl GitIndex {

/// Like [`Self::from_url()`], but read-only without auto-cloning the index at `url`.
pub fn try_from_url(url: &str) -> Result<Option<Self>, Error> {
let (path, canonical_url) = local_path_and_canonical_url(url, None)?;
Self::try_from_url_with_hash_kind(url, &DEFAULT_HASHER_KIND)
}

/// Read-only variant of index lookup by `url` that lets the caller choose the [`HashKind`]
/// used to derive the local crates index path; otherwise behaves like [`Self::try_from_url`].
pub fn try_from_url_with_hash_kind(url: &str, hash_kind: &HashKind) -> Result<Option<Self>, Error> {
    // Resolve the on-disk location and canonical URL first, then open the index without cloning.
    local_path_and_canonical_url_with_hash_kind(url, None, hash_kind).and_then(|(index_path, index_url)| {
        Self::from_path_and_url(index_path, index_url, Mode::ReadOnly)
    })
}

Expand Down
7 changes: 4 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,15 @@ pub struct GitIndex {
pub(crate) head_commit: gix::ObjectId,
}

///
/// The Git based index implementation
pub mod git;

mod config;
pub use config::IndexConfig;

mod dedupe;
mod dirs;
pub use dirs::local_path_and_canonical_url;
pub use dirs::{local_path_and_canonical_url, local_path_and_canonical_url_with_hash_kind, HashKind};

/// Re-exports in case you want to inspect specific error details
pub mod error;
Expand All @@ -156,12 +156,13 @@ pub use error::Error;
///
/// [reqwest]: https://github.com/frewsxcv/rust-crates-index/blob/HEAD/examples/sparse_http_reqwest.rs
/// [ureq]: https://github.com/frewsxcv/rust-crates-index/blob/HEAD/examples/sparse_http_ureq.rs
#[derive(Debug)]
pub struct SparseIndex {
path: PathBuf,
url: String,
}

///
/// The sparse index implementation.
pub mod sparse;
/// The matching `http` types for use in the [`sparse`] API.
#[cfg(feature = "sparse")]
Expand Down
Loading
Loading