From 64ee3ee30715c2e037fd8e60a0f062ba9e0ef20d Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 2 Jun 2026 17:50:55 -0500 Subject: [PATCH 1/4] Add manifest auxiliary artifact verification Signed-off-by: Nelson Spence --- CHANGELOG.md | 6 + README.md | 7 +- docs/INDEX_PROVENANCE.md | 11 ++ ordvec-manifest/README.md | 30 ++- ordvec-manifest/src/lib.rs | 308 ++++++++++++++++++++++++++++++ ordvec-manifest/src/main.rs | 4 + ordvec-manifest/src/sqlite.rs | 101 +++++++++- ordvec-manifest/tests/manifest.rs | 278 ++++++++++++++++++++++++--- 8 files changed, 701 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8dc7591..1a159ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 manifest JSON size, row-identity JSONL line length, row count, duplicate-tracking memory, report issue count, and SQLite cached report size. +### Added + +- Added named auxiliary artifact verification to `ordvec-manifest`, including + required/optional sidecar states, path/size/SHA-256 checks, deterministic + report entries, and SQLite cache invalidation for declared sidecar bytes. + ## 0.3.0 - 2026-05-29 ### Added diff --git a/README.md b/README.md index 45569c6..bf1ff1e 100644 --- a/README.md +++ b/README.md @@ -272,9 +272,10 @@ structurally valid file can still be untrusted. If an index file crosses a trust boundary (network transfer, shared storage), verify it before loading. The full GitHub checkout includes a publish=false sidecar CLI, `ordvec-manifest`, that binds an index file to a JSON manifest by SHA-256, -header metadata, row identity, and attestation shape checks. It does not sign -artifacts, manage keys, or decide deployment trust policy. No in-format crypto -is shipped because it would add key management the library can't own. See +header metadata, row identity, named auxiliary sidecars, and attestation shape +checks. 
It does not sign artifacts, manage keys, or decide deployment trust +policy. No in-format crypto is shipped because it would add key management the +library can't own. See [`docs/PERSISTED_FORMAT.md`](https://github.com/Fieldnote-Echo/ordvec/blob/main/docs/PERSISTED_FORMAT.md), [`docs/INDEX_PROVENANCE.md`](https://github.com/Fieldnote-Echo/ordvec/blob/main/docs/INDEX_PROVENANCE.md), and [`THREAT_MODEL.md`](https://github.com/Fieldnote-Echo/ordvec/blob/main/THREAT_MODEL.md) diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index 692eca7..5d275f2 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -67,12 +67,23 @@ The manifest verifier checks: - row identity, either explicit `row_id_identity` or a strict JSONL row map whose `row_id` equals the zero-based line number and whose `db_id` is non-empty, NUL-free, and unique by default; +- declared auxiliary artifacts, checking each caller-named sidecar's path, + SHA-256 digest, and byte length under the same default path policy as the + primary index artifact; - optional `calibration` profile references, checking profile identity, path/hash integrity, encoder identity, and ordinalization compatibility; - attestation **shape** only: predicate type, builder id when present, and at least one subject SHA-256 matching the artifact when attestations are supplied. +Auxiliary artifacts are for application-owned sidecars such as metadata, +secondary indexes, or stores that a caller intends to load together with the +ordvec index. The verifier does not interpret those bytes; it only reports +whether declared required members were verified, whether optional members were +present or absent, and whether any declared member failed path, size, or digest +checks. Callers should load sidecars only after the relevant declaration is +verified. + When present, `calibration` binds an index artifact to a hashed ordinal profile used to interpret overlap, bucket, sign, or rank evidence under a calibrated null. 
The verifier checks profile identity, path/hash integrity, encoder diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index 3a4f791..ed4c465 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -2,11 +2,11 @@ Repo-local, publish=false sidecar verifier for ordvec index manifests. -It verifies index bytes, probed header metadata, row identity, optional -calibration profile references, and attestation shape before a caller loads an -ordvec index. It does not sign artifacts, manage keys, call networks, mutate -index files, decide deployment trust policy, compute calibration statistics, or -change the C ABI. +It verifies index bytes, probed header metadata, row identity, named auxiliary +artifacts, optional calibration profile references, and attestation shape before +a caller loads an ordvec index. It does not sign artifacts, manage keys, call +networks, mutate index files, decide deployment trust policy, compute +calibration statistics, or change the C ABI. ```sh cargo run -p ordvec-manifest -- create \ @@ -67,11 +67,21 @@ otherwise be reported. These limits bound metadata parsing and report/cache growth; hashing an index or calibration profile is still proportional to the artifact bytes being verified. +Manifests may declare `auxiliary_artifacts` for caller-owned sidecars that +should be integrity-checked with the same path policy as the primary index. +Each entry has a stable `name`, relative `path`, lowercase SHA-256 digest, +`file_size_bytes`, and a `required` flag that defaults to `true`. Required +members fail verification when missing, tampered, size-mismatched, or rejected +by path policy. Optional members must pass the same checks when present and +are reported as `optional_absent` with a stable reason code when absent. The +verifier checks bytes only; application semantics remain with the caller. 
+ With `--features sqlite`, the `sqlite verify` and `sqlite activate` subcommands add a local cache/audit log plus one active-manifest pointer. This is not a full named registry. `sqlite verify --use-cache` reuses only reports whose -manifest, verification options, artifact bytes, row-identity bytes, and -calibration profile bytes still match; otherwise it runs fresh verification and -stores a new report. `sqlite activate --force` writes the active pointer even -when verification fails, emits a `sqlite_activation_forced` warning in JSON -output, and exits zero because it did mutate activation state. +manifest, verification options, artifact bytes, row-identity bytes, +calibration profile bytes, and declared auxiliary artifact states/bytes still +match; otherwise it runs fresh verification and stores a new report. +`sqlite activate --force` writes the active pointer even when verification +fails, emits a `sqlite_activation_forced` warning in JSON output, and exits zero +because it did mutate activation state. 
diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index abf1a10..7bde22b 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -235,6 +235,7 @@ pub fn verify_manifest(document: &ManifestDocument, options: VerifyOptions) -> V } } + verify_auxiliary_artifacts(document, &options, &mut report); verify_row_identity(document, &options, &mut report); verify_calibration(document, &options, &mut report); verify_attestations(&document.manifest, &mut report); @@ -337,6 +338,8 @@ fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationRe } } + validate_auxiliary_artifact_shape(manifest, report); + validate_optional_non_empty( "embedding_model_revision_empty", "embedding.model_revision must be non-empty when present", @@ -415,6 +418,39 @@ fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationRe } } +fn validate_auxiliary_artifact_shape(manifest: &IndexManifest, report: &mut VerificationReport) { + let mut names = HashSet::new(); + for artifact in &manifest.auxiliary_artifacts { + let name = artifact.name.trim(); + if name.is_empty() { + report.error( + "auxiliary_artifact_name_empty", + "auxiliary artifact name must be non-empty", + ); + } else if !names.insert(name.to_string()) { + report.error( + "auxiliary_artifact_name_duplicate", + format!("auxiliary artifact name {name:?} is duplicated"), + ); + } + + if artifact.path.trim().is_empty() { + report.error( + "auxiliary_artifact_path_empty", + format!("auxiliary artifact {name:?} path must be non-empty"), + ); + } + if !is_sha256_hex(&artifact.sha256) { + report.error( + "auxiliary_artifact_sha256_invalid", + format!( + "auxiliary artifact {name:?} sha256 must be a lowercase 64-character hex SHA-256 digest" + ), + ); + } + } +} + fn validate_optional_non_empty( code: &str, message: &str, @@ -1161,6 +1197,233 @@ fn expected_profile_shape( } } +fn verify_auxiliary_artifacts( + document: &ManifestDocument, + options: &VerifyOptions, + report: 
&mut VerificationReport, +) { + for artifact in auxiliary_artifacts_in_report_order(&document.manifest) { + let mut entry = AuxiliaryArtifactReport { + name: artifact.name.clone(), + manifest_path: artifact.path.clone(), + required: artifact.required, + state: AuxiliaryArtifactState::Failed, + reason_code: None, + canonical_path: None, + sha256: None, + size_bytes: None, + }; + + if artifact.path.trim().is_empty() { + mark_auxiliary_artifact_failed(&mut entry, "auxiliary_artifact_path_empty"); + report.auxiliary_artifacts.push(entry); + continue; + } + + match resolve_auxiliary_artifact_path(artifact, &document.base_dir, options, report) { + AuxiliaryPathResolution::Resolved(resolved) => { + entry.canonical_path = Some(path_to_display(&resolved.canonical_path)); + match sha256_file(&resolved.resolved_path) { + Ok(hash) => { + entry.sha256 = Some(hash.sha256.clone()); + entry.size_bytes = Some(hash.size_bytes); + if !hex_digest_eq(&hash.sha256, &artifact.sha256) { + mark_auxiliary_artifact_failed( + &mut entry, + "auxiliary_artifact_sha256_mismatch", + ); + report.error( + "auxiliary_artifact_sha256_mismatch", + format!( + "auxiliary artifact {:?} SHA-256 was {}, manifest declares {}", + artifact.name, hash.sha256, artifact.sha256 + ), + ); + } + if hash.size_bytes != artifact.file_size_bytes { + mark_auxiliary_artifact_failed( + &mut entry, + "auxiliary_artifact_file_size_mismatch", + ); + report.error( + "auxiliary_artifact_file_size_mismatch", + format!( + "auxiliary artifact {:?} size was {}, manifest declares {}", + artifact.name, hash.size_bytes, artifact.file_size_bytes + ), + ); + } + if entry.reason_code.is_none() { + entry.state = AuxiliaryArtifactState::Verified; + } + } + Err(err) => { + mark_auxiliary_artifact_failed( + &mut entry, + "auxiliary_artifact_hash_failed", + ); + report.error( + "auxiliary_artifact_hash_failed", + format!( + "failed to hash auxiliary artifact {:?}: {err}", + artifact.name + ), + ); + } + } + } + 
AuxiliaryPathResolution::OptionalAbsent => { + entry.state = AuxiliaryArtifactState::OptionalAbsent; + entry.reason_code = Some("auxiliary_artifact_optional_absent".to_string()); + } + AuxiliaryPathResolution::MissingRequired => { + entry.state = AuxiliaryArtifactState::MissingRequired; + entry.reason_code = Some("auxiliary_artifact_missing_required".to_string()); + } + AuxiliaryPathResolution::Failed(code) => { + entry.state = AuxiliaryArtifactState::Failed; + entry.reason_code = Some(code); + } + } + + report.auxiliary_artifacts.push(entry); + } +} + +fn auxiliary_artifacts_in_report_order(manifest: &IndexManifest) -> Vec<&AuxiliaryArtifact> { + let mut artifacts: Vec<_> = manifest.auxiliary_artifacts.iter().collect(); + artifacts.sort_by(|left, right| { + left.name + .cmp(&right.name) + .then_with(|| left.path.cmp(&right.path)) + .then_with(|| left.required.cmp(&right.required)) + }); + artifacts +} + +enum AuxiliaryPathResolution { + Resolved(ResolvedPath), + OptionalAbsent, + MissingRequired, + Failed(String), +} + +fn resolve_auxiliary_artifact_path( + artifact: &AuxiliaryArtifact, + base_dir: &Path, + options: &VerifyOptions, + report: &mut VerificationReport, +) -> AuxiliaryPathResolution { + let path = Path::new(&artifact.path); + if path.is_absolute() && !options.allow_absolute_paths { + report.error( + "auxiliary_artifact_absolute_path_rejected", + format!( + "absolute auxiliary artifact path {} for {:?} is rejected by default", + path.display(), + artifact.name + ), + ); + return AuxiliaryPathResolution::Failed( + "auxiliary_artifact_absolute_path_rejected".to_string(), + ); + } + + let base_canonical = match fs::canonicalize(base_dir) { + Ok(path) => path, + Err(err) => { + report.error( + "auxiliary_artifact_base_dir_unavailable", + format!( + "failed to canonicalize base_dir {} for auxiliary artifact {:?}: {err}", + base_dir.display(), + artifact.name + ), + ); + return AuxiliaryPathResolution::Failed( + 
"auxiliary_artifact_base_dir_unavailable".to_string(), + ); + } + }; + + if !path.is_absolute() && !options.allow_path_escape && has_lexical_escape(path) { + report.error( + "auxiliary_artifact_path_escape_rejected", + format!( + "relative auxiliary artifact path {} for {:?} escapes the manifest base", + path.display(), + artifact.name + ), + ); + return AuxiliaryPathResolution::Failed( + "auxiliary_artifact_path_escape_rejected".to_string(), + ); + } + + let resolved_path = if path.is_absolute() { + path.to_path_buf() + } else { + base_dir.join(path) + }; + let canonical_path = match fs::canonicalize(&resolved_path) { + Ok(path) => path, + Err(err) if err.kind() == io::ErrorKind::NotFound && !artifact.required => { + return AuxiliaryPathResolution::OptionalAbsent; + } + Err(err) if err.kind() == io::ErrorKind::NotFound => { + report.error( + "auxiliary_artifact_missing_required", + format!( + "required auxiliary artifact {:?} is missing at {}", + artifact.name, + resolved_path.display() + ), + ); + return AuxiliaryPathResolution::MissingRequired; + } + Err(err) => { + report.error( + "auxiliary_artifact_path_unavailable", + format!( + "failed to canonicalize auxiliary artifact {:?} at {}: {err}", + artifact.name, + resolved_path.display() + ), + ); + return AuxiliaryPathResolution::Failed( + "auxiliary_artifact_path_unavailable".to_string(), + ); + } + }; + + if !options.allow_path_escape && !canonical_path.starts_with(&base_canonical) { + report.error( + "auxiliary_artifact_path_escape_rejected", + format!( + "canonical auxiliary artifact path {} for {:?} is outside manifest base {}", + canonical_path.display(), + artifact.name, + base_canonical.display() + ), + ); + return AuxiliaryPathResolution::Failed( + "auxiliary_artifact_path_escape_rejected".to_string(), + ); + } + + AuxiliaryPathResolution::Resolved(ResolvedPath { + resolved_path, + canonical_path, + }) +} + +fn mark_auxiliary_artifact_failed(entry: &mut AuxiliaryArtifactReport, code: &str) { + 
entry.state = AuxiliaryArtifactState::Failed; + if entry.reason_code.is_none() { + entry.reason_code = Some(code.to_string()); + } +} + fn verify_attestations(manifest: &IndexManifest, report: &mut VerificationReport) { if manifest.attestations.is_empty() { report @@ -1348,6 +1611,14 @@ fn has_lexical_escape(path: &Path) -> bool { false } +fn default_required() -> bool { + true +} + +fn is_true(value: &bool) -> bool { + *value +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct IndexManifest { @@ -1355,6 +1626,8 @@ pub struct IndexManifest { pub manifest_id: String, pub created_at: String, pub artifact: Artifact, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub auxiliary_artifacts: Vec, pub embedding: Embedding, #[serde(default, skip_serializing_if = "Option::is_none")] pub calibration: Option, @@ -1381,6 +1654,17 @@ pub struct Artifact { pub file_size_bytes: u64, } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct AuxiliaryArtifact { + pub name: String, + pub path: String, + pub sha256: String, + pub file_size_bytes: u64, + #[serde(default = "default_required", skip_serializing_if = "is_true")] + pub required: bool, +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct Embedding { @@ -1605,6 +1889,7 @@ pub struct VerificationReport { pub checked_at: String, pub manifest_id: Option, pub artifact: ArtifactReport, + pub auxiliary_artifacts: Vec, pub row_identity: RowIdentityReport, pub calibration: CalibrationReport, pub attestation_shape_checks: Vec, @@ -1620,6 +1905,7 @@ impl VerificationReport { checked_at: Utc::now().to_rfc3339_opts(SecondsFormat::Nanos, true), manifest_id, artifact: ArtifactReport::default(), + auxiliary_artifacts: Vec::new(), row_identity: RowIdentityReport::default(), calibration: CalibrationReport::default(), attestation_shape_checks: Vec::new(), @@ -1644,6 +1930,27 @@ pub struct ArtifactReport { pub 
metadata: Option, } +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct AuxiliaryArtifactReport { + pub name: String, + pub manifest_path: String, + pub required: bool, + pub state: AuxiliaryArtifactState, + pub reason_code: Option, + pub canonical_path: Option, + pub sha256: Option, + pub size_bytes: Option, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum AuxiliaryArtifactState { + Verified, + OptionalAbsent, + MissingRequired, + Failed, +} + #[derive(Clone, Debug, Default, Serialize, Deserialize)] pub struct RowIdentityReport { pub kind: Option, @@ -1910,6 +2217,7 @@ pub fn create_manifest_for_index_with_options( manifest_id: format!("urn:uuid:{}", Uuid::new_v4()), created_at: Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true), artifact, + auxiliary_artifacts: Vec::new(), embedding: Embedding { model: embedding_model.into(), dim: metadata.dim, diff --git a/ordvec-manifest/src/main.rs b/ordvec-manifest/src/main.rs index cdf29ab..415d319 100644 --- a/ordvec-manifest/src/main.rs +++ b/ordvec-manifest/src/main.rs @@ -206,6 +206,10 @@ fn run() -> Result { println!("manifest_id: {}", document.manifest.manifest_id); println!("schema_version: {}", document.manifest.schema_version); println!("artifact: {}", document.manifest.artifact.path); + println!( + "auxiliary_artifacts: {}", + document.manifest.auxiliary_artifacts.len() + ); println!("row_identity: {}", row_identity_label(&document)); println!("calibration: {}", calibration_label(&document)); } diff --git a/ordvec-manifest/src/sqlite.rs b/ordvec-manifest/src/sqlite.rs index 890ac96..1fbca3c 100644 --- a/ordvec-manifest/src/sqlite.rs +++ b/ordvec-manifest/src/sqlite.rs @@ -1,7 +1,7 @@ use crate::{ - resolve_existing_path, sha256_file, sha256_file_bounded, validate_jsonl_rows, verify_manifest, - ManifestDocument, ManifestError, ReportIssue, ResourceLimits, RowIdentity, VerificationReport, - VerifyOptions, + resolve_existing_path, 
sha256_file, sha256_file_bounded, validate_jsonl_rows, + verify_auxiliary_artifacts, verify_manifest, AuxiliaryArtifactState, ManifestDocument, + ManifestError, ReportIssue, ResourceLimits, RowIdentity, VerificationReport, VerifyOptions, }; use chrono::{SecondsFormat, Utc}; use rusqlite::{params, Connection, OptionalExtension}; @@ -114,6 +114,7 @@ fn init(conn: &Connection) -> Result<(), ManifestError> { artifact_sha256 TEXT, row_identity_sha256 TEXT, calibration_profile_sha256 TEXT, + auxiliary_artifacts_sha256 TEXT, report_json TEXT NOT NULL ); INSERT INTO verification_reports( @@ -137,6 +138,7 @@ fn init(conn: &Connection) -> Result<(), ManifestError> { artifact_sha256 TEXT, row_identity_sha256 TEXT, calibration_profile_sha256 TEXT, + auxiliary_artifacts_sha256 TEXT, report_json TEXT NOT NULL ); CREATE INDEX IF NOT EXISTS verification_reports_cache_idx @@ -147,6 +149,7 @@ fn init(conn: &Connection) -> Result<(), ManifestError> { artifact_sha256, row_identity_sha256, calibration_profile_sha256, + auxiliary_artifacts_sha256, report_id ); CREATE TABLE IF NOT EXISTS active_manifest( @@ -192,8 +195,9 @@ fn store_report( artifact_sha256, row_identity_sha256, calibration_profile_sha256, + auxiliary_artifacts_sha256, report_json - ) VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)", + ) VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)", params![ document.manifest.manifest_id, manifest_path.display().to_string(), @@ -204,6 +208,7 @@ fn store_report( cache_key.map(|key| key.artifact_sha256.as_str()), cache_key.and_then(|key| key.row_identity_sha256.as_deref()), cache_key.and_then(|key| key.calibration_profile_sha256.as_deref()), + cache_key.and_then(|key| key.auxiliary_artifacts_sha256.as_deref()), report_json, ], ) @@ -234,6 +239,10 @@ fn load_cached_report( (calibration_profile_sha256 IS NULL AND ?6 IS NULL) OR calibration_profile_sha256 = ?6 ) + AND ( + (auxiliary_artifacts_sha256 IS NULL AND ?7 IS NULL) + OR auxiliary_artifacts_sha256 = ?7 + ) ORDER BY report_id 
DESC LIMIT 1", params![ @@ -243,6 +252,7 @@ fn load_cached_report( cache_key.artifact_sha256.as_str(), cache_key.row_identity_sha256.as_deref(), cache_key.calibration_profile_sha256.as_deref(), + cache_key.auxiliary_artifacts_sha256.as_deref(), ], |row| Ok((row.get(0)?, row.get(1)?)), ) @@ -280,6 +290,7 @@ struct CacheKey { artifact_sha256: String, row_identity_sha256: Option, calibration_profile_sha256: Option, + auxiliary_artifacts_sha256: Option, } #[derive(Serialize)] @@ -379,6 +390,7 @@ fn current_cache_key( } }; let calibration_profile_sha256 = current_calibration_profile_sha256(document, options)?; + let auxiliary_artifacts_sha256 = current_auxiliary_artifacts_sha256(document, options)?; Ok(Some(CacheKey { manifest_sha256, @@ -386,6 +398,7 @@ fn current_cache_key( artifact_sha256, row_identity_sha256, calibration_profile_sha256, + auxiliary_artifacts_sha256, })) } @@ -432,15 +445,90 @@ fn cache_key_from_report( } else { None }; + let auxiliary_artifacts_sha256 = auxiliary_artifacts_sha256_from_report(document, report)?; Ok(Some(CacheKey { manifest_sha256, options_sha256, artifact_sha256, row_identity_sha256, calibration_profile_sha256, + auxiliary_artifacts_sha256, })) } +fn current_auxiliary_artifacts_sha256( + document: &ManifestDocument, + options: &VerifyOptions, +) -> Result, ManifestError> { + if document.manifest.auxiliary_artifacts.is_empty() { + return Ok(None); + } + let mut report = VerificationReport::new(None); + verify_auxiliary_artifacts(document, options, &mut report); + if !report.errors.is_empty() { + return Ok(None); + } + auxiliary_artifacts_sha256_from_report(document, &report) +} + +fn auxiliary_artifacts_sha256_from_report( + document: &ManifestDocument, + report: &VerificationReport, +) -> Result, ManifestError> { + if document.manifest.auxiliary_artifacts.is_empty() { + return Ok(None); + } + if report.auxiliary_artifacts.len() != document.manifest.auxiliary_artifacts.len() { + return Ok(None); + } + + let mut entries = 
Vec::with_capacity(report.auxiliary_artifacts.len()); + for entry in &report.auxiliary_artifacts { + match entry.state { + AuxiliaryArtifactState::Verified => { + let (Some(sha256), Some(size_bytes)) = (entry.sha256.as_ref(), entry.size_bytes) + else { + return Ok(None); + }; + entries.push(AuxiliaryArtifactCacheEntry { + name: entry.name.clone(), + path: entry.manifest_path.clone(), + required: entry.required, + state: "verified", + sha256: Some(sha256.clone()), + size_bytes: Some(size_bytes), + }); + } + AuxiliaryArtifactState::OptionalAbsent => { + entries.push(AuxiliaryArtifactCacheEntry { + name: entry.name.clone(), + path: entry.manifest_path.clone(), + required: entry.required, + state: "optional_absent", + sha256: None, + size_bytes: None, + }); + } + AuxiliaryArtifactState::MissingRequired | AuxiliaryArtifactState::Failed => { + return Ok(None); + } + } + } + + let json = serde_json::to_vec(&entries)?; + Ok(Some(sha256_bytes(&json))) +} + +#[derive(Serialize)] +struct AuxiliaryArtifactCacheEntry { + name: String, + path: String, + required: bool, + state: &'static str, + sha256: Option, + size_bytes: Option, +} + fn current_calibration_profile_sha256( document: &ManifestDocument, options: &VerifyOptions, @@ -503,7 +591,10 @@ fn verification_reports_needs_migration(conn: &Connection) -> Result ManifestError { diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index 278d5f8..2ed78c3 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -2,9 +2,10 @@ use ordvec::{Bitmap, Rank, RankQuant, SignBitmap}; use ordvec_manifest::{ create_manifest_for_index, create_manifest_for_index_with_options, load_manifest_file, load_manifest_file_with_options, sha256_file, verify_index_manifest, verify_manifest_with_base, - CalibrationOrdinalization, CalibrationProfileRef, CreateManifestOptions, CreateRowIdentity, - EncoderSpec, ManifestIndexParams, NullModelSpec, ProfileArtifactRef, ProfileParameterization, - 
ResourceLimits, RowIdentity, VerifyOptions, CALIBRATION_SCHEMA_VERSION, + AuxiliaryArtifact, AuxiliaryArtifactState, CalibrationOrdinalization, CalibrationProfileRef, + CreateManifestOptions, CreateRowIdentity, EncoderSpec, ManifestIndexParams, NullModelSpec, + ProfileArtifactRef, ProfileParameterization, ResourceLimits, RowIdentity, VerifyOptions, + CALIBRATION_SCHEMA_VERSION, }; use serde_json::json; use std::fs; @@ -102,6 +103,21 @@ fn write_profile(path: &Path, size_bytes: usize) -> ordvec_manifest::FileHash { sha256_file(path).unwrap() } +fn auxiliary_artifact( + name: &str, + path: &str, + hash: ordvec_manifest::FileHash, + required: bool, +) -> AuxiliaryArtifact { + AuxiliaryArtifact { + name: name.to_string(), + path: path.to_string(), + sha256: hash.sha256, + file_size_bytes: hash.size_bytes, + required, + } +} + fn uniform_calibration( manifest: &ordvec_manifest::IndexManifest, ordinalization: CalibrationOrdinalization, @@ -1381,6 +1397,121 @@ fn jsonl_row_identity_is_strict_and_duplicate_ids_need_opt_in() { .any(|issue| issue.code == "row_identity_row_id_mismatch")); } +#[test] +fn auxiliary_artifacts_verify_and_report_deterministically() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + fs::write(temp.path().join("zeta.bin"), b"zeta").unwrap(); + fs::write(temp.path().join("alpha.bin"), b"alpha").unwrap(); + let zeta_hash = sha256_file(temp.path().join("zeta.bin")).unwrap(); + let alpha_hash = sha256_file(temp.path().join("alpha.bin")).unwrap(); + + manifest.auxiliary_artifacts = vec![ + auxiliary_artifact("zeta", "zeta.bin", zeta_hash, true), + AuxiliaryArtifact { + name: "optional-model".to_string(), + path: "missing-model.json".to_string(), + sha256: "0".repeat(64), + file_size_bytes: 0, + required: false, + }, + auxiliary_artifact("alpha", "alpha.bin", alpha_hash, true), + ]; + + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + 
assert!(report.ok, "{:?}", report.errors); + assert_eq!( + report + .auxiliary_artifacts + .iter() + .map(|entry| entry.name.as_str()) + .collect::>(), + ["alpha", "optional-model", "zeta"] + ); + assert_eq!( + report.auxiliary_artifacts[0].state, + AuxiliaryArtifactState::Verified + ); + assert_eq!( + report.auxiliary_artifacts[1].state, + AuxiliaryArtifactState::OptionalAbsent + ); + assert_eq!( + report.auxiliary_artifacts[1].reason_code.as_deref(), + Some("auxiliary_artifact_optional_absent") + ); + assert_eq!( + report.auxiliary_artifacts[2].state, + AuxiliaryArtifactState::Verified + ); +} + +#[test] +fn auxiliary_artifacts_fail_closed_on_tamper_missing_and_path_escape() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + let outside = root.path().join("outside.bin"); + fs::write(&outside, b"outside").unwrap(); + fs::write(temp.path().join("tampered.bin"), b"original").unwrap(); + fs::write(temp.path().join("wrong-size.bin"), b"size").unwrap(); + let tampered_hash = sha256_file(temp.path().join("tampered.bin")).unwrap(); + let wrong_size_hash = sha256_file(temp.path().join("wrong-size.bin")).unwrap(); + fs::write(temp.path().join("tampered.bin"), b"changed").unwrap(); + + manifest.auxiliary_artifacts = vec![ + AuxiliaryArtifact { + name: "missing".to_string(), + path: "missing.bin".to_string(), + sha256: "0".repeat(64), + file_size_bytes: 0, + required: true, + }, + auxiliary_artifact("tampered", "tampered.bin", tampered_hash, true), + AuxiliaryArtifact { + name: "wrong-size".to_string(), + path: "wrong-size.bin".to_string(), + sha256: wrong_size_hash.sha256, + file_size_bytes: wrong_size_hash.size_bytes + 1, + required: true, + }, + AuxiliaryArtifact { + name: "escape".to_string(), + path: "../outside.bin".to_string(), + sha256: sha256_file(outside).unwrap().sha256, + file_size_bytes: 7, + required: true, + }, + ]; + + let report = verify_manifest_with_base(manifest, temp.path(), 
VerifyOptions::default()); + assert!(!report.ok); + let codes = error_codes(&report); + assert!(codes.contains(&"auxiliary_artifact_missing_required")); + assert!(codes.contains(&"auxiliary_artifact_sha256_mismatch")); + assert!(codes.contains(&"auxiliary_artifact_file_size_mismatch")); + assert!(codes.contains(&"auxiliary_artifact_path_escape_rejected")); +} + +#[test] +fn auxiliary_artifact_schema_rejects_unknown_fields_and_duplicate_names() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + fs::write(temp.path().join("sidecar.bin"), b"sidecar").unwrap(); + let sidecar_hash = sha256_file(temp.path().join("sidecar.bin")).unwrap(); + + manifest.auxiliary_artifacts = vec![ + auxiliary_artifact("duplicate", "sidecar.bin", sidecar_hash.clone(), true), + auxiliary_artifact("duplicate", "sidecar.bin", sidecar_hash, false), + ]; + let report = verify_manifest_with_base(manifest.clone(), temp.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"auxiliary_artifact_name_duplicate")); + + let mut value = serde_json::to_value(&manifest).unwrap(); + value["auxiliary_artifacts"][0]["unexpected"] = json!(true); + let parsed = serde_json::from_value::(value); + assert!(parsed.is_err()); +} + #[test] fn attestation_shape_requires_matching_subject_sha256() { let root = tempfile::tempdir().unwrap(); @@ -1841,6 +1972,124 @@ fn sqlite_cache_key_includes_calibration_profile_bytes() { assert!(error_codes(&cached).contains(&"calibration_profile_sha256_mismatch")); } +#[cfg(feature = "sqlite")] +#[test] +fn sqlite_cache_key_includes_auxiliary_artifact_bytes() { + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + let sidecar_path = 
temp.path().join("sidecar.json"); + fs::write(&sidecar_path, b"{\"version\":1}\n").unwrap(); + let sidecar_hash = sha256_file(&sidecar_path).unwrap(); + manifest.auxiliary_artifacts = vec![auxiliary_artifact( + "sidecar", + "sidecar.json", + sidecar_hash, + true, + )]; + fs::write( + &manifest_path, + serde_json::to_string_pretty(&manifest).unwrap(), + ) + .unwrap(); + let document = load_manifest_file(&manifest_path).unwrap(); + let db = temp.path().join("registry.sqlite"); + + let report = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(report.ok, "{:?}", report.errors); + + fs::write(&sidecar_path, b"{\"version\":2}\n").unwrap(); + let cached = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!( + !cached.ok, + "auxiliary artifact drift must force fresh verification" + ); + assert!(error_codes(&cached).contains(&"auxiliary_artifact_sha256_mismatch")); +} + +#[cfg(feature = "sqlite")] +#[test] +fn sqlite_cache_key_distinguishes_optional_auxiliary_absent_and_present() { + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + let optional_path = temp.path().join("optional.json"); + fs::write(&optional_path, b"{\"enabled\":true}\n").unwrap(); + let optional_hash = sha256_file(&optional_path).unwrap(); + fs::remove_file(&optional_path).unwrap(); + manifest.auxiliary_artifacts = vec![auxiliary_artifact( + "optional", + "optional.json", + optional_hash, + false, + )]; + fs::write( + &manifest_path, + serde_json::to_string_pretty(&manifest).unwrap(), + ) + .unwrap(); + let document = load_manifest_file(&manifest_path).unwrap(); + let db = 
temp.path().join("registry.sqlite"); + + let report = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(report.ok, "{:?}", report.errors); + + assert_eq!( + report.auxiliary_artifacts[0].state, + AuxiliaryArtifactState::OptionalAbsent + ); + + fs::write(&optional_path, b"{\"enabled\":true}\n").unwrap(); + let present = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(present.ok, "{:?}", present.errors); + assert_eq!( + present.auxiliary_artifacts[0].state, + AuxiliaryArtifactState::Verified + ); +} + #[cfg(feature = "sqlite")] #[test] fn sqlite_cache_key_includes_limits_and_bounds_cached_report_size() { @@ -1891,29 +2140,6 @@ fn sqlite_cache_key_includes_limits_and_bounds_cached_report_size() { assert!(cached.ok, "{:?}", cached.errors); let conn = Connection::open(&db).unwrap(); - let count: i64 = conn - .query_row("SELECT COUNT(*) FROM verification_reports", [], |row| { - row.get(0) - }) - .unwrap(); - assert_eq!(count, 1, "same limits should reuse the cached report"); - - let options_b = VerifyOptions { - limits: ResourceLimits { - max_report_issues: 18, - ..ResourceLimits::default() - }, - ..VerifyOptions::default() - }; - let report = ordvec_manifest::sqlite::verify_with_registry( - &db, - &document, - &manifest_path, - options_b, - true, - ) - .unwrap(); - assert!(report.ok, "{:?}", report.errors); let count: i64 = conn .query_row("SELECT COUNT(*) FROM verification_reports", [], |row| { row.get(0) From 2f0ddc1b3945ec67aa8776d90fbf460bc20f090f Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Wed, 3 Jun 2026 09:46:46 -0500 Subject: [PATCH 2/4] Bound auxiliary artifact verification state Signed-off-by: Nelson Spence --- CHANGELOG.md | 3 +- ordvec-manifest/README.md | 9 ++- ordvec-manifest/src/lib.rs | 53 ++++++++++++- ordvec-manifest/src/main.rs | 5 ++ 
ordvec-manifest/src/sqlite.rs | 42 ++++------ ordvec-manifest/tests/manifest.rs | 127 ++++++++++++++++++++++++++++++ 6 files changed, 205 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a159ef..7269dd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added bounded parser/report defaults to `ordvec-manifest` verification for manifest JSON size, row-identity JSONL line length, row count, - duplicate-tracking memory, report issue count, and SQLite cached report size. + duplicate-tracking memory, auxiliary artifact declaration count, report issue + count, and SQLite cached report size. ### Added diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index ed4c465..768639f 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -36,6 +36,8 @@ Stable limit codes are part of the contract: (`row_identity_row_count_limit_exceeded`); - row-identity duplicate-tracking `db_id` bytes: 64 MiB (`row_identity_duplicate_tracking_limit_exceeded`); +- auxiliary artifact declarations: 1,024 + (`auxiliary_artifact_count_limit_exceeded`); - collected report issues: 1,024, after which a `verification_report_issue_limit_exceeded` issue is emitted; - SQLite cached report JSON: 4 MiB (`sqlite_cached_report_too_large`). @@ -43,9 +45,9 @@ Stable limit codes are part of the contract: The CLI exposes matching override flags on `inspect`, `verify`, `create`, `sqlite verify`, and `sqlite activate`: `--max-manifest-bytes`, `--max-row-map-line-bytes`, `--max-row-map-rows`, -`--max-row-map-tracked-id-bytes`, `--max-report-issues`, and -`--max-cached-report-bytes`. Library callers can override the same ceilings via -`VerifyOptions::limits`. +`--max-row-map-tracked-id-bytes`, `--max-auxiliary-artifacts`, +`--max-report-issues`, and `--max-cached-report-bytes`. Library callers can +override the same ceilings via `VerifyOptions::limits`. 
Stable limit codes: @@ -55,6 +57,7 @@ Stable limit codes: | row-identity JSONL line bytes | `row_identity_line_too_large` | `row_identity_line_too_large` | | row-identity JSONL rows | `row_identity_row_count_limit_exceeded` | `row_identity_row_count_limit_exceeded` | | row-identity duplicate-tracking `db_id` bytes | `row_identity_duplicate_tracking_limit_exceeded` | `row_identity_duplicate_tracking_limit_exceeded` | +| auxiliary artifact declarations | `auxiliary_artifact_count_limit_exceeded` | n/a | | collected verification report issues | `verification_report_issue_limit_exceeded` | n/a | | SQLite cached report JSON bytes | n/a | `sqlite_cached_report_too_large` | diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index 7bde22b..b77b885 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -18,6 +18,7 @@ pub const DEFAULT_MAX_MANIFEST_BYTES: u64 = 1024 * 1024; pub const DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES: usize = 64 * 1024; pub const DEFAULT_MAX_ROW_IDENTITY_ROWS: usize = 10_000_000; pub const DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES: usize = 64 * 1024 * 1024; +pub const DEFAULT_MAX_AUXILIARY_ARTIFACTS: usize = 1024; pub const DEFAULT_MAX_REPORT_ISSUES: usize = 1024; pub const DEFAULT_MAX_CACHED_REPORT_BYTES: u64 = 4 * 1024 * 1024; @@ -174,7 +175,7 @@ pub fn verify_index_manifest( pub fn verify_manifest(document: &ManifestDocument, options: VerifyOptions) -> VerificationReport { let mut report = VerificationReport::new(Some(document.manifest.manifest_id.clone())); - validate_manifest_shape(&document.manifest, &mut report); + validate_manifest_shape(&document.manifest, &options.limits, &mut report); let artifact_display_path = document.manifest.artifact.path.clone(); report.artifact.manifest_path = Some(artifact_display_path.clone()); @@ -245,7 +246,11 @@ pub fn verify_manifest(document: &ManifestDocument, options: VerifyOptions) -> V report } -fn validate_manifest_shape(manifest: &IndexManifest, report: &mut 
VerificationReport) { +fn validate_manifest_shape( + manifest: &IndexManifest, + limits: &ResourceLimits, + report: &mut VerificationReport, +) { if manifest.schema_version != SCHEMA_VERSION { report.error( "schema_version_unsupported", @@ -338,7 +343,7 @@ fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationRe } } - validate_auxiliary_artifact_shape(manifest, report); + validate_auxiliary_artifact_shape(manifest, limits, report); validate_optional_non_empty( "embedding_model_revision_empty", @@ -418,7 +423,14 @@ fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationRe } } -fn validate_auxiliary_artifact_shape(manifest: &IndexManifest, report: &mut VerificationReport) { +fn validate_auxiliary_artifact_shape( + manifest: &IndexManifest, + limits: &ResourceLimits, + report: &mut VerificationReport, +) { + if !check_auxiliary_artifact_count(manifest, limits, report) { + return; + } let mut names = HashSet::new(); for artifact in &manifest.auxiliary_artifacts { let name = artifact.name.trim(); @@ -1202,6 +1214,9 @@ fn verify_auxiliary_artifacts( options: &VerifyOptions, report: &mut VerificationReport, ) { + if !check_auxiliary_artifact_count(&document.manifest, &options.limits, report) { + return; + } for artifact in auxiliary_artifacts_in_report_order(&document.manifest) { let mut entry = AuxiliaryArtifactReport { name: artifact.name.clone(), @@ -1290,6 +1305,33 @@ fn verify_auxiliary_artifacts( } } +fn check_auxiliary_artifact_count( + manifest: &IndexManifest, + limits: &ResourceLimits, + report: &mut VerificationReport, +) -> bool { + let count = manifest.auxiliary_artifacts.len(); + if count <= limits.max_auxiliary_artifacts { + return true; + } + if !report + .errors + .iter() + .any(|issue| issue.code == "auxiliary_artifact_count_limit_exceeded") + { + push_report_issue_bounded( + &mut report.errors, + limits, + "auxiliary_artifact_count_limit_exceeded", + format!( + "auxiliary_artifacts has {count} entries, 
exceeding max_auxiliary_artifacts={}", + limits.max_auxiliary_artifacts + ), + ); + } + false +} + fn auxiliary_artifacts_in_report_order(manifest: &IndexManifest) -> Vec<&AuxiliaryArtifact> { let mut artifacts: Vec<_> = manifest.auxiliary_artifacts.iter().collect(); artifacts.sort_by(|left, right| { @@ -1499,6 +1541,7 @@ pub struct ResourceLimits { pub max_row_identity_jsonl_line_bytes: usize, pub max_row_identity_rows: usize, pub max_row_identity_tracked_db_id_bytes: usize, + pub max_auxiliary_artifacts: usize, pub max_report_issues: usize, pub max_cached_report_bytes: u64, } @@ -1510,6 +1553,7 @@ impl Default for ResourceLimits { max_row_identity_jsonl_line_bytes: DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES, max_row_identity_rows: DEFAULT_MAX_ROW_IDENTITY_ROWS, max_row_identity_tracked_db_id_bytes: DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES, + max_auxiliary_artifacts: DEFAULT_MAX_AUXILIARY_ARTIFACTS, max_report_issues: DEFAULT_MAX_REPORT_ISSUES, max_cached_report_bytes: DEFAULT_MAX_CACHED_REPORT_BYTES, } @@ -1889,6 +1933,7 @@ pub struct VerificationReport { pub checked_at: String, pub manifest_id: Option, pub artifact: ArtifactReport, + #[serde(default)] pub auxiliary_artifacts: Vec, pub row_identity: RowIdentityReport, pub calibration: CalibrationReport, diff --git a/ordvec-manifest/src/main.rs b/ordvec-manifest/src/main.rs index 415d319..36057b1 100644 --- a/ordvec-manifest/src/main.rs +++ b/ordvec-manifest/src/main.rs @@ -131,6 +131,8 @@ struct LimitArgs { #[arg(long)] max_row_map_tracked_id_bytes: Option, #[arg(long)] + max_auxiliary_artifacts: Option, + #[arg(long)] max_report_issues: Option, #[arg(long)] max_cached_report_bytes: Option, @@ -151,6 +153,9 @@ impl LimitArgs { if let Some(value) = self.max_row_map_tracked_id_bytes { limits.max_row_identity_tracked_db_id_bytes = value; } + if let Some(value) = self.max_auxiliary_artifacts { + limits.max_auxiliary_artifacts = value; + } if let Some(value) = self.max_report_issues { limits.max_report_issues = 
value; } diff --git a/ordvec-manifest/src/sqlite.rs b/ordvec-manifest/src/sqlite.rs index 1fbca3c..9df053c 100644 --- a/ordvec-manifest/src/sqlite.rs +++ b/ordvec-manifest/src/sqlite.rs @@ -465,9 +465,6 @@ fn current_auxiliary_artifacts_sha256( } let mut report = VerificationReport::new(None); verify_auxiliary_artifacts(document, options, &mut report); - if !report.errors.is_empty() { - return Ok(None); - } auxiliary_artifacts_sha256_from_report(document, &report) } @@ -484,35 +481,27 @@ fn auxiliary_artifacts_sha256_from_report( let mut entries = Vec::with_capacity(report.auxiliary_artifacts.len()); for entry in &report.auxiliary_artifacts { - match entry.state { + let state = match entry.state { AuxiliaryArtifactState::Verified => { let (Some(sha256), Some(size_bytes)) = (entry.sha256.as_ref(), entry.size_bytes) else { return Ok(None); }; - entries.push(AuxiliaryArtifactCacheEntry { - name: entry.name.clone(), - path: entry.manifest_path.clone(), - required: entry.required, - state: "verified", - sha256: Some(sha256.clone()), - size_bytes: Some(size_bytes), - }); + ("verified", Some(sha256.clone()), Some(size_bytes)) } - AuxiliaryArtifactState::OptionalAbsent => { - entries.push(AuxiliaryArtifactCacheEntry { - name: entry.name.clone(), - path: entry.manifest_path.clone(), - required: entry.required, - state: "optional_absent", - sha256: None, - size_bytes: None, - }); - } - AuxiliaryArtifactState::MissingRequired | AuxiliaryArtifactState::Failed => { - return Ok(None); - } - } + AuxiliaryArtifactState::OptionalAbsent => ("optional_absent", None, None), + AuxiliaryArtifactState::MissingRequired => ("missing_required", None, None), + AuxiliaryArtifactState::Failed => ("failed", entry.sha256.clone(), entry.size_bytes), + }; + entries.push(AuxiliaryArtifactCacheEntry { + name: entry.name.clone(), + path: entry.manifest_path.clone(), + required: entry.required, + state: state.0, + reason_code: entry.reason_code.clone(), + sha256: state.1, + size_bytes: state.2, + }); 
} let json = serde_json::to_vec(&entries)?; @@ -525,6 +514,7 @@ struct AuxiliaryArtifactCacheEntry { path: String, required: bool, state: &'static str, + reason_code: Option, sha256: Option, size_bytes: Option, } diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index 2ed78c3..8503cf2 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -1512,6 +1512,46 @@ fn auxiliary_artifact_schema_rejects_unknown_fields_and_duplicate_names() { assert!(parsed.is_err()); } +#[test] +fn auxiliary_artifact_count_limit_is_enforced_before_verification() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + fs::write(temp.path().join("a.bin"), b"a").unwrap(); + fs::write(temp.path().join("b.bin"), b"b").unwrap(); + let a_hash = sha256_file(temp.path().join("a.bin")).unwrap(); + let b_hash = sha256_file(temp.path().join("b.bin")).unwrap(); + manifest.auxiliary_artifacts = vec![ + auxiliary_artifact("a", "a.bin", a_hash, true), + auxiliary_artifact("b", "b.bin", b_hash, true), + ]; + + let report = verify_manifest_with_base( + manifest, + temp.path(), + VerifyOptions { + limits: ResourceLimits { + max_auxiliary_artifacts: 1, + ..ResourceLimits::default() + }, + ..VerifyOptions::default() + }, + ); + assert!(error_codes(&report).contains(&"auxiliary_artifact_count_limit_exceeded")); + assert!(report.auxiliary_artifacts.is_empty()); +} + +#[test] +fn verification_report_deserializes_missing_auxiliary_artifacts_field() { + let root = tempfile::tempdir().unwrap(); + let (temp, manifest, _manifest_path) = identity_manifest(root.path()); + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + let mut value = serde_json::to_value(&report).unwrap(); + value.as_object_mut().unwrap().remove("auxiliary_artifacts"); + + let parsed: ordvec_manifest::VerificationReport = serde_json::from_value(value).unwrap(); + 
assert!(parsed.auxiliary_artifacts.is_empty()); +} + #[test] fn attestation_shape_requires_matching_subject_sha256() { let root = tempfile::tempdir().unwrap(); @@ -2028,6 +2068,70 @@ fn sqlite_cache_key_includes_auxiliary_artifact_bytes() { assert!(error_codes(&cached).contains(&"auxiliary_artifact_sha256_mismatch")); } +#[cfg(feature = "sqlite")] +#[test] +fn sqlite_cache_key_includes_failed_auxiliary_artifact_observed_bytes() { + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + let sidecar_path = temp.path().join("sidecar.json"); + fs::write(&sidecar_path, b"{\"version\":1}\n").unwrap(); + let expected_hash = sha256_file(&sidecar_path).unwrap(); + manifest.auxiliary_artifacts = vec![auxiliary_artifact( + "sidecar", + "sidecar.json", + expected_hash, + true, + )]; + fs::write( + &manifest_path, + serde_json::to_string_pretty(&manifest).unwrap(), + ) + .unwrap(); + let document = load_manifest_file(&manifest_path).unwrap(); + let db = temp.path().join("registry.sqlite"); + + fs::write(&sidecar_path, b"{\"version\":2}\n").unwrap(); + let first_observed = sha256_file(&sidecar_path).unwrap(); + let report = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(!report.ok); + assert_eq!( + report.auxiliary_artifacts[0].sha256.as_deref(), + Some(first_observed.sha256.as_str()) + ); + + fs::write(&sidecar_path, b"{\"version\":3}\n").unwrap(); + let second_observed = sha256_file(&sidecar_path).unwrap(); + let cached = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(!cached.ok); + assert_eq!( + cached.auxiliary_artifacts[0].sha256.as_deref(), + 
Some(second_observed.sha256.as_str()) + ); + assert_ne!(first_observed.sha256, second_observed.sha256); +} + #[cfg(feature = "sqlite")] #[test] fn sqlite_cache_key_distinguishes_optional_auxiliary_absent_and_present() { @@ -2140,6 +2244,29 @@ fn sqlite_cache_key_includes_limits_and_bounds_cached_report_size() { assert!(cached.ok, "{:?}", cached.errors); let conn = Connection::open(&db).unwrap(); + let count: i64 = conn + .query_row("SELECT COUNT(*) FROM verification_reports", [], |row| { + row.get(0) + }) + .unwrap(); + assert_eq!(count, 1, "same limits should reuse the cached report"); + + let options_b = VerifyOptions { + limits: ResourceLimits { + max_report_issues: 18, + ..ResourceLimits::default() + }, + ..VerifyOptions::default() + }; + let report = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + options_b, + true, + ) + .unwrap(); + assert!(report.ok, "{:?}", report.errors); let count: i64 = conn .query_row("SELECT COUNT(*) FROM verification_reports", [], |row| { row.get(0) From ca62d244b28160063d6441de3d66e4d8a8ab98ff Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Wed, 3 Jun 2026 12:11:24 -0500 Subject: [PATCH 3/4] Bound auxiliary artifact byte hashing Signed-off-by: Nelson Spence --- CHANGELOG.md | 4 +-- docs/INDEX_PROVENANCE.md | 8 ++--- ordvec-manifest/README.md | 8 +++-- ordvec-manifest/src/lib.rs | 26 ++++++++++------ ordvec-manifest/src/main.rs | 5 ++++ ordvec-manifest/tests/manifest.rs | 50 +++++++++++++++++++++++++++++++ 6 files changed, 84 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7269dd2..5129f5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,8 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added bounded parser/report defaults to `ordvec-manifest` verification for manifest JSON size, row-identity JSONL line length, row count, - duplicate-tracking memory, auxiliary artifact declaration count, report issue - count, and 
SQLite cached report size. + duplicate-tracking memory, auxiliary artifact declaration count and bytes, + report issue count, and SQLite cached report size. ### Added diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index 5d275f2..4563347 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -68,8 +68,8 @@ The manifest verifier checks: whose `row_id` equals the zero-based line number and whose `db_id` is non-empty, NUL-free, and unique by default; - declared auxiliary artifacts, checking each caller-named sidecar's path, - SHA-256 digest, and byte length under the same default path policy as the - primary index artifact; + SHA-256 digest, byte length, and configured byte ceiling under the same + default path policy as the primary index artifact; - optional `calibration` profile references, checking profile identity, path/hash integrity, encoder identity, and ordinalization compatibility; - attestation **shape** only: predicate type, builder id when present, and at @@ -81,8 +81,8 @@ secondary indexes, or stores that a caller intends to load together with the ordvec index. The verifier does not interpret those bytes; it only reports whether declared required members were verified, whether optional members were present or absent, and whether any declared member failed path, size, or digest -checks. Callers should load sidecars only after the relevant declaration is -verified. +checks or exceeded the configured auxiliary artifact byte limit. Callers should +load sidecars only after the relevant declaration is verified. 
When present, `calibration` binds an index artifact to a hashed ordinal profile used to interpret overlap, bucket, sign, or rank evidence under a calibrated diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index 768639f..82b4d9d 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -38,6 +38,8 @@ Stable limit codes are part of the contract: (`row_identity_duplicate_tracking_limit_exceeded`); - auxiliary artifact declarations: 1,024 (`auxiliary_artifact_count_limit_exceeded`); +- auxiliary artifact bytes per declared file: 64 MiB + (`auxiliary_artifact_file_too_large`); - collected report issues: 1,024, after which a `verification_report_issue_limit_exceeded` issue is emitted; - SQLite cached report JSON: 4 MiB (`sqlite_cached_report_too_large`). @@ -46,8 +48,9 @@ The CLI exposes matching override flags on `inspect`, `verify`, `create`, `sqlite verify`, and `sqlite activate`: `--max-manifest-bytes`, `--max-row-map-line-bytes`, `--max-row-map-rows`, `--max-row-map-tracked-id-bytes`, `--max-auxiliary-artifacts`, -`--max-report-issues`, and `--max-cached-report-bytes`. Library callers can -override the same ceilings via `VerifyOptions::limits`. +`--max-auxiliary-artifact-bytes`, `--max-report-issues`, and +`--max-cached-report-bytes`. Library callers can override the same ceilings via +`VerifyOptions::limits`. 
Stable limit codes: @@ -58,6 +61,7 @@ Stable limit codes: | row-identity JSONL rows | `row_identity_row_count_limit_exceeded` | `row_identity_row_count_limit_exceeded` | | row-identity duplicate-tracking `db_id` bytes | `row_identity_duplicate_tracking_limit_exceeded` | `row_identity_duplicate_tracking_limit_exceeded` | | auxiliary artifact declarations | `auxiliary_artifact_count_limit_exceeded` | n/a | +| auxiliary artifact bytes per declared file | `auxiliary_artifact_file_too_large` | n/a | | collected verification report issues | `verification_report_issue_limit_exceeded` | n/a | | SQLite cached report JSON bytes | n/a | `sqlite_cached_report_too_large` | diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index b77b885..a2ba5e7 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -19,6 +19,7 @@ pub const DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES: usize = 64 * 1024; pub const DEFAULT_MAX_ROW_IDENTITY_ROWS: usize = 10_000_000; pub const DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES: usize = 64 * 1024 * 1024; pub const DEFAULT_MAX_AUXILIARY_ARTIFACTS: usize = 1024; +pub const DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES: u64 = 64 * 1024 * 1024; pub const DEFAULT_MAX_REPORT_ISSUES: usize = 1024; pub const DEFAULT_MAX_CACHED_REPORT_BYTES: u64 = 4 * 1024 * 1024; @@ -1238,7 +1239,12 @@ fn verify_auxiliary_artifacts( match resolve_auxiliary_artifact_path(artifact, &document.base_dir, options, report) { AuxiliaryPathResolution::Resolved(resolved) => { entry.canonical_path = Some(path_to_display(&resolved.canonical_path)); - match sha256_file(&resolved.resolved_path) { + match sha256_file_bounded( + &resolved.resolved_path, + options.limits.max_auxiliary_artifact_bytes, + "auxiliary_artifact_file_too_large", + "auxiliary artifact", + ) { Ok(hash) => { entry.sha256 = Some(hash.sha256.clone()); entry.size_bytes = Some(hash.size_bytes); @@ -1273,17 +1279,17 @@ fn verify_auxiliary_artifacts( } } Err(err) => { - mark_auxiliary_artifact_failed( 
- &mut entry, - "auxiliary_artifact_hash_failed", - ); - report.error( - "auxiliary_artifact_hash_failed", + let code = err.code().unwrap_or("auxiliary_artifact_hash_failed"); + mark_auxiliary_artifact_failed(&mut entry, code); + let message = if err.code().is_some() { + err.to_string() + } else { format!( "failed to hash auxiliary artifact {:?}: {err}", artifact.name - ), - ); + ) + }; + report.error(code, message); } } } @@ -1542,6 +1548,7 @@ pub struct ResourceLimits { pub max_row_identity_rows: usize, pub max_row_identity_tracked_db_id_bytes: usize, pub max_auxiliary_artifacts: usize, + pub max_auxiliary_artifact_bytes: u64, pub max_report_issues: usize, pub max_cached_report_bytes: u64, } @@ -1554,6 +1561,7 @@ impl Default for ResourceLimits { max_row_identity_rows: DEFAULT_MAX_ROW_IDENTITY_ROWS, max_row_identity_tracked_db_id_bytes: DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES, max_auxiliary_artifacts: DEFAULT_MAX_AUXILIARY_ARTIFACTS, + max_auxiliary_artifact_bytes: DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES, max_report_issues: DEFAULT_MAX_REPORT_ISSUES, max_cached_report_bytes: DEFAULT_MAX_CACHED_REPORT_BYTES, } diff --git a/ordvec-manifest/src/main.rs b/ordvec-manifest/src/main.rs index 36057b1..182b03b 100644 --- a/ordvec-manifest/src/main.rs +++ b/ordvec-manifest/src/main.rs @@ -133,6 +133,8 @@ struct LimitArgs { #[arg(long)] max_auxiliary_artifacts: Option, #[arg(long)] + max_auxiliary_artifact_bytes: Option, + #[arg(long)] max_report_issues: Option, #[arg(long)] max_cached_report_bytes: Option, @@ -156,6 +158,9 @@ impl LimitArgs { if let Some(value) = self.max_auxiliary_artifacts { limits.max_auxiliary_artifacts = value; } + if let Some(value) = self.max_auxiliary_artifact_bytes { + limits.max_auxiliary_artifact_bytes = value; + } if let Some(value) = self.max_report_issues { limits.max_report_issues = value; } diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index 8503cf2..add9f14 100644 --- 
a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -1540,6 +1540,56 @@ fn auxiliary_artifact_count_limit_is_enforced_before_verification() { assert!(report.auxiliary_artifacts.is_empty()); } +#[test] +fn auxiliary_artifact_byte_limit_is_enforced_before_hashing() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + let sidecar = temp.path().join("sidecar.bin"); + fs::write(&sidecar, b"sidecar").unwrap(); + let sidecar_hash = sha256_file(&sidecar).unwrap(); + manifest.auxiliary_artifacts = vec![auxiliary_artifact( + "sidecar", + "sidecar.bin", + sidecar_hash.clone(), + true, + )]; + + let report = verify_manifest_with_base( + manifest.clone(), + temp.path(), + VerifyOptions { + limits: ResourceLimits { + max_auxiliary_artifact_bytes: sidecar_hash.size_bytes - 1, + ..ResourceLimits::default() + }, + ..VerifyOptions::default() + }, + ); + assert!(error_codes(&report).contains(&"auxiliary_artifact_file_too_large")); + assert_eq!(report.auxiliary_artifacts[0].sha256, None); + assert_eq!( + report.auxiliary_artifacts[0].reason_code.as_deref(), + Some("auxiliary_artifact_file_too_large") + ); + + let report = verify_manifest_with_base( + manifest, + temp.path(), + VerifyOptions { + limits: ResourceLimits { + max_auxiliary_artifact_bytes: sidecar_hash.size_bytes, + ..ResourceLimits::default() + }, + ..VerifyOptions::default() + }, + ); + assert!(report.ok, "{:?}", report.errors); + assert_eq!( + report.auxiliary_artifacts[0].sha256.as_deref(), + Some(sidecar_hash.sha256.as_str()) + ); +} + #[test] fn verification_report_deserializes_missing_auxiliary_artifacts_field() { let root = tempfile::tempdir().unwrap(); From 6ad6d764c1ba2d5526c4385aee448aa4218713ee Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Wed, 3 Jun 2026 12:29:18 -0500 Subject: [PATCH 4/4] Improve auxiliary artifact report auditability Signed-off-by: Nelson Spence --- ordvec-manifest/README.md | 139 
++++++++++++++++++++++++++++ ordvec-manifest/src/lib.rs | 149 ++++++++++++++++++++---------- ordvec-manifest/tests/manifest.rs | 42 ++++++++- 3 files changed, 281 insertions(+), 49 deletions(-) diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index 82b4d9d..e198dc4 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -83,6 +83,145 @@ by path policy. Optional members are reported as verified when present or as `optional_absent` with a stable reason code when absent. The verifier checks bytes only; application semantics remain with the caller. +The unified JSON report carries per-sidecar audit fields. A successful +auxiliary artifact verification includes the manifest path, resolved/canonical +paths, declared digest/length, and observed digest/length: + +```json +{ + "ok": true, + "checked_at": "2026-06-03T17:20:00Z", + "manifest_id": "urn:uuid:11111111-1111-4111-8111-111111111111", + "artifact": { + "manifest_path": "index.tvrq", + "observed_path": "index.tvrq", + "canonical_path": "/srv/index/index.tvrq", + "sha256": "1111111111111111111111111111111111111111111111111111111111111111", + "size_bytes": 4096, + "metadata": null + }, + "auxiliary_artifacts": [ + { + "name": "ordgrep.sidecar", + "manifest_path": "ordgrep.sidecar.json", + "resolved_path": "/srv/index/ordgrep.sidecar.json", + "canonical_path": "/srv/index/ordgrep.sidecar.json", + "expected_sha256": "2222222222222222222222222222222222222222222222222222222222222222", + "expected_size_bytes": 128, + "required": true, + "state": "verified", + "reason_code": null, + "sha256": "2222222222222222222222222222222222222222222222222222222222222222", + "size_bytes": 128 + } + ], + "row_identity": { + "kind": "row_id_identity", + "manifest_path": null, + "canonical_path": null, + "sha256": null, + "row_count": 1024, + "validated_rows": 1024 + }, + "calibration": { + "present": false, + "schema_version": null, + "profile_id": null, + "calibrated_for_model": null, + "ordinalization": 
null, + "null_model": null, + "profile_manifest_path": null, + "profile_canonical_path": null, + "profile_sha256": null, + "profile_size_bytes": null + }, + "attestation_shape_checks": [], + "errors": [], + "warnings": [], + "skipped_checks": [] +} +``` + +A tampered or missing sidecar fails closed while preserving declared fields for +audit logging. Observed digest/length fields are present when bytes could be +read and absent when the file is missing: + +```json +{ + "ok": false, + "checked_at": "2026-06-03T17:21:00Z", + "manifest_id": "urn:uuid:11111111-1111-4111-8111-111111111111", + "artifact": { + "manifest_path": "index.tvrq", + "observed_path": "index.tvrq", + "canonical_path": "/srv/index/index.tvrq", + "sha256": "1111111111111111111111111111111111111111111111111111111111111111", + "size_bytes": 4096, + "metadata": null + }, + "auxiliary_artifacts": [ + { + "name": "ordgrep.sidecar", + "manifest_path": "ordgrep.sidecar.json", + "resolved_path": "/srv/index/ordgrep.sidecar.json", + "canonical_path": "/srv/index/ordgrep.sidecar.json", + "expected_sha256": "2222222222222222222222222222222222222222222222222222222222222222", + "expected_size_bytes": 128, + "required": true, + "state": "failed", + "reason_code": "auxiliary_artifact_sha256_mismatch", + "sha256": "3333333333333333333333333333333333333333333333333333333333333333", + "size_bytes": 128 + }, + { + "name": "required-model-card", + "manifest_path": "model-card.json", + "resolved_path": "/srv/index/model-card.json", + "expected_sha256": "4444444444444444444444444444444444444444444444444444444444444444", + "expected_size_bytes": 2048, + "required": true, + "state": "missing_required", + "reason_code": "auxiliary_artifact_missing_required", + "sha256": null, + "size_bytes": null + } + ], + "row_identity": { + "kind": "row_id_identity", + "manifest_path": null, + "canonical_path": null, + "sha256": null, + "row_count": 1024, + "validated_rows": 1024 + }, + "calibration": { + "present": false, + 
"schema_version": null, + "profile_id": null, + "calibrated_for_model": null, + "ordinalization": null, + "null_model": null, + "profile_manifest_path": null, + "profile_canonical_path": null, + "profile_sha256": null, + "profile_size_bytes": null + }, + "attestation_shape_checks": [], + "errors": [ + { + "code": "auxiliary_artifact_sha256_mismatch", + "message": "auxiliary artifact \"ordgrep.sidecar\" SHA-256 was 3333333333333333333333333333333333333333333333333333333333333333, manifest declares 2222222222222222222222222222222222222222222222222222222222222222" + }, + { + "code": "auxiliary_artifact_missing_required", + "message": "required auxiliary artifact \"required-model-card\" is missing at /srv/index/model-card.json" + } + ], + "warnings": [], + "skipped_checks": [] +} +``` + With `--features sqlite`, the `sqlite verify` and `sqlite activate` subcommands add a local cache/audit log plus one active-manifest pointer. This is not a full named registry. `sqlite verify --use-cache` reuses only reports whose diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index a2ba5e7..83a8a4a 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -1218,17 +1218,40 @@ fn verify_auxiliary_artifacts( if !check_auxiliary_artifact_count(&document.manifest, &options.limits, report) { return; } - for artifact in auxiliary_artifacts_in_report_order(&document.manifest) { - let mut entry = AuxiliaryArtifactReport { - name: artifact.name.clone(), - manifest_path: artifact.path.clone(), - required: artifact.required, - state: AuxiliaryArtifactState::Failed, - reason_code: None, - canonical_path: None, - sha256: None, - size_bytes: None, - }; + let artifacts = auxiliary_artifacts_in_report_order(&document.manifest); + let base_canonical = if options.allow_path_escape { + None + } else { + match fs::canonicalize(&document.base_dir) { + Ok(path) => Some(path), + Err(err) => { + for artifact in artifacts { + let mut entry = 
auxiliary_artifact_report_entry(artifact, &document.base_dir); + if artifact.path.trim().is_empty() { + mark_auxiliary_artifact_failed(&mut entry, "auxiliary_artifact_path_empty"); + } else { + report.error( + "auxiliary_artifact_base_dir_unavailable", + format!( + "failed to canonicalize base_dir {} for auxiliary artifact {:?}: {err}", + document.base_dir.display(), + artifact.name + ), + ); + mark_auxiliary_artifact_failed( + &mut entry, + "auxiliary_artifact_base_dir_unavailable", + ); + } + report.auxiliary_artifacts.push(entry); + } + return; + } + } + }; + + for artifact in artifacts { + let mut entry = auxiliary_artifact_report_entry(artifact, &document.base_dir); if artifact.path.trim().is_empty() { mark_auxiliary_artifact_failed(&mut entry, "auxiliary_artifact_path_empty"); @@ -1236,7 +1259,13 @@ fn verify_auxiliary_artifacts( continue; } - match resolve_auxiliary_artifact_path(artifact, &document.base_dir, options, report) { + match resolve_auxiliary_artifact_path( + artifact, + &document.base_dir, + base_canonical.as_deref(), + options, + report, + ) { AuxiliaryPathResolution::Resolved(resolved) => { entry.canonical_path = Some(path_to_display(&resolved.canonical_path)); match sha256_file_bounded( @@ -1311,6 +1340,32 @@ fn verify_auxiliary_artifacts( } } +fn auxiliary_artifact_report_entry( + artifact: &AuxiliaryArtifact, + base_dir: &Path, +) -> AuxiliaryArtifactReport { + let resolved_path = if artifact.path.trim().is_empty() { + None + } else { + Some(path_to_display(&auxiliary_artifact_resolved_path( + artifact, base_dir, + ))) + }; + AuxiliaryArtifactReport { + name: artifact.name.clone(), + manifest_path: artifact.path.clone(), + resolved_path, + canonical_path: None, + expected_sha256: Some(artifact.sha256.clone()), + expected_size_bytes: Some(artifact.file_size_bytes), + required: artifact.required, + state: AuxiliaryArtifactState::Failed, + reason_code: None, + sha256: None, + size_bytes: None, + } +} + fn check_auxiliary_artifact_count( 
manifest: &IndexManifest, limits: &ResourceLimits, @@ -1359,6 +1414,7 @@ enum AuxiliaryPathResolution { fn resolve_auxiliary_artifact_path( artifact: &AuxiliaryArtifact, base_dir: &Path, + base_canonical: Option<&Path>, options: &VerifyOptions, report: &mut VerificationReport, ) -> AuxiliaryPathResolution { @@ -1377,23 +1433,6 @@ fn resolve_auxiliary_artifact_path( ); } - let base_canonical = match fs::canonicalize(base_dir) { - Ok(path) => path, - Err(err) => { - report.error( - "auxiliary_artifact_base_dir_unavailable", - format!( - "failed to canonicalize base_dir {} for auxiliary artifact {:?}: {err}", - base_dir.display(), - artifact.name - ), - ); - return AuxiliaryPathResolution::Failed( - "auxiliary_artifact_base_dir_unavailable".to_string(), - ); - } - }; - if !path.is_absolute() && !options.allow_path_escape && has_lexical_escape(path) { report.error( "auxiliary_artifact_path_escape_rejected", @@ -1408,11 +1447,7 @@ fn resolve_auxiliary_artifact_path( ); } - let resolved_path = if path.is_absolute() { - path.to_path_buf() - } else { - base_dir.join(path) - }; + let resolved_path = auxiliary_artifact_resolved_path(artifact, base_dir); let canonical_path = match fs::canonicalize(&resolved_path) { Ok(path) => path, Err(err) if err.kind() == io::ErrorKind::NotFound && !artifact.required => { @@ -1444,19 +1479,21 @@ fn resolve_auxiliary_artifact_path( } }; - if !options.allow_path_escape && !canonical_path.starts_with(&base_canonical) { - report.error( - "auxiliary_artifact_path_escape_rejected", - format!( - "canonical auxiliary artifact path {} for {:?} is outside manifest base {}", - canonical_path.display(), - artifact.name, - base_canonical.display() - ), - ); - return AuxiliaryPathResolution::Failed( - "auxiliary_artifact_path_escape_rejected".to_string(), - ); + if let Some(base_canonical) = base_canonical { + if !canonical_path.starts_with(base_canonical) { + report.error( + "auxiliary_artifact_path_escape_rejected", + format!( + "canonical auxiliary 
artifact path {} for {:?} is outside manifest base {}", + canonical_path.display(), + artifact.name, + base_canonical.display() + ), + ); + return AuxiliaryPathResolution::Failed( + "auxiliary_artifact_path_escape_rejected".to_string(), + ); + } } AuxiliaryPathResolution::Resolved(ResolvedPath { @@ -1465,6 +1502,15 @@ fn resolve_auxiliary_artifact_path( }) } +fn auxiliary_artifact_resolved_path(artifact: &AuxiliaryArtifact, base_dir: &Path) -> PathBuf { + let path = Path::new(&artifact.path); + if path.is_absolute() { + path.to_path_buf() + } else { + base_dir.join(path) + } +} + fn mark_auxiliary_artifact_failed(entry: &mut AuxiliaryArtifactReport, code: &str) { entry.state = AuxiliaryArtifactState::Failed; if entry.reason_code.is_none() { @@ -1987,10 +2033,17 @@ pub struct ArtifactReport { pub struct AuxiliaryArtifactReport { pub name: String, pub manifest_path: String, + #[serde(default)] + pub resolved_path: Option, + #[serde(default)] + pub canonical_path: Option, + #[serde(default)] + pub expected_sha256: Option, + #[serde(default)] + pub expected_size_bytes: Option, pub required: bool, pub state: AuxiliaryArtifactState, pub reason_code: Option, - pub canonical_path: Option, pub sha256: Option, pub size_bytes: Option, } diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index add9f14..1218edd 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -1415,7 +1415,7 @@ fn auxiliary_artifacts_verify_and_report_deterministically() { file_size_bytes: 0, required: false, }, - auxiliary_artifact("alpha", "alpha.bin", alpha_hash, true), + auxiliary_artifact("alpha", "alpha.bin", alpha_hash.clone(), true), ]; let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); @@ -1432,6 +1432,20 @@ fn auxiliary_artifacts_verify_and_report_deterministically() { report.auxiliary_artifacts[0].state, AuxiliaryArtifactState::Verified ); + 
assert_eq!(report.auxiliary_artifacts[0].manifest_path, "alpha.bin"); + assert!(report.auxiliary_artifacts[0] + .resolved_path + .as_deref() + .unwrap() + .ends_with("alpha.bin")); + assert_eq!( + report.auxiliary_artifacts[0].expected_sha256.as_deref(), + Some(alpha_hash.sha256.as_str()) + ); + assert_eq!( + report.auxiliary_artifacts[0].expected_size_bytes, + Some(alpha_hash.size_bytes) + ); assert_eq!( report.auxiliary_artifacts[1].state, AuxiliaryArtifactState::OptionalAbsent @@ -1440,6 +1454,16 @@ fn auxiliary_artifacts_verify_and_report_deterministically() { report.auxiliary_artifacts[1].reason_code.as_deref(), Some("auxiliary_artifact_optional_absent") ); + assert_eq!( + report.auxiliary_artifacts[1].expected_sha256.as_deref(), + Some("0000000000000000000000000000000000000000000000000000000000000000") + ); + assert_eq!(report.auxiliary_artifacts[1].expected_size_bytes, Some(0)); + assert!(report.auxiliary_artifacts[1] + .resolved_path + .as_deref() + .unwrap() + .ends_with("missing-model.json")); assert_eq!( report.auxiliary_artifacts[2].state, AuxiliaryArtifactState::Verified @@ -1490,6 +1514,22 @@ fn auxiliary_artifacts_fail_closed_on_tamper_missing_and_path_escape() { assert!(codes.contains(&"auxiliary_artifact_sha256_mismatch")); assert!(codes.contains(&"auxiliary_artifact_file_size_mismatch")); assert!(codes.contains(&"auxiliary_artifact_path_escape_rejected")); + let missing = report + .auxiliary_artifacts + .iter() + .find(|entry| entry.name == "missing") + .unwrap(); + assert_eq!(missing.state, AuxiliaryArtifactState::MissingRequired); + assert_eq!( + missing.expected_sha256.as_deref(), + Some("0000000000000000000000000000000000000000000000000000000000000000") + ); + assert_eq!(missing.expected_size_bytes, Some(0)); + assert!(missing + .resolved_path + .as_deref() + .unwrap() + .ends_with("missing.bin")); } #[test]