From 1c92da3a459bbb3f8f29f7965e2e7bee7ec59b07 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 15 Oct 2025 01:36:12 +0000 Subject: [PATCH] Add two repair tests to agent-antagonist Discussing possible causes for the zeroes appearing at the beginning of an extent file (#1788), one theory that came up was that the Extent was being repaired, and those zeroes came from reading the Extent from the repair API, not from the disk itself: only _one_ Region's Extents were bad, not all of them. In order to test this theory, add two new tests, both of which will contact an Agent to get a Region's details, then use the repair API to read Extent data files and either look for a block of zeroes at the beginning or write that to a temporary file and use the new `Extent::validate` routine. --- Cargo.lock | 3 + Cargo.toml | 1 + agent-antagonist/Cargo.toml | 7 +- agent-antagonist/src/main.rs | 249 ++++++++++++++++++++++++++++++++++- downstairs/src/lib.rs | 2 + 5 files changed, 259 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a751005cd..a6fee6e7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -58,13 +58,16 @@ name = "agent-antagonist" version = "0.1.0" dependencies = [ "anyhow", + "camino-tempfile", "clap", "crucible-agent-client", "crucible-common", + "crucible-downstairs", "crucible-workspace-hack", "futures", "futures-core", "rand 0.9.2", + "repair-client", "reqwest", "serde", "signal-hook", diff --git a/Cargo.toml b/Cargo.toml index f9851f8c0..65b963065 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ bincode = "1.3" byte-unit = "5.1.6" bytes = { version = "1", features = ["serde"] } camino = { version = "1.1", features = ["serde1"] } +camino-tempfile = "1.4.1" cfg-if = { version = "1" } chrono = { version = "0.4", features = [ "serde" ] } clap = { version = "4.5", features = ["derive", "env", "wrap_help"] } diff --git a/agent-antagonist/Cargo.toml b/agent-antagonist/Cargo.toml index 41096ca77..ddb50998c 100644 --- a/agent-antagonist/Cargo.toml +++ b/agent-antagonist/Cargo.toml @@ -7,19 +7,22 @@ edition = "2021" [dependencies] anyhow.workspace = true +camino-tempfile.workspace = true clap.workspace = true crucible-agent-client.workspace = true crucible-common.workspace = true +crucible-downstairs.workspace = true crucible-workspace-hack.workspace = true futures-core.workspace = true futures.workspace = true rand.workspace = true +repair-client.workspace = true reqwest.workspace = true serde.workspace = true signal-hook-tokio.workspace = true signal-hook.workspace = true -slog.workspace = true -slog-term.workspace = true slog-bunyan.workspace = true +slog-term.workspace = true +slog.workspace = true tokio.workspace = true uuid.workspace = true diff --git a/agent-antagonist/src/main.rs b/agent-antagonist/src/main.rs index a33eeb29e..b0513432b 100644 --- a/agent-antagonist/src/main.rs +++ b/agent-antagonist/src/main.rs @@ -2,12 +2,15 @@ use anyhow::bail; use anyhow::Result; use clap::Parser; use crucible_common::build_logger; +use crucible_common::mkdir_for_file; use futures::StreamExt; use rand::random; use signal_hook::consts::signal::*; use signal_hook_tokio::Signals; use slog::Logger; use slog::{info, warn}; +use std::fs::File; +use std::io::Write; use std::net::SocketAddr; use std::process::Command; use std::sync::{ @@ -16,8 +19,17 @@ use std::sync::{ }; use uuid::Uuid; +use rand::prelude::*; +use repair_client::types::FileType; +use repair_client::Client as RepairClient; + +use crucible_common::ExtentId; +use crucible_common::RegionDefinition; +use crucible_downstairs::extent_path; +use crucible_downstairs::Extent; + use crucible_agent_client::{ - types::{CreateRegion, RegionId, State as RegionState}, + types::{CreateRegion, Region, RegionId, State as RegionState}, Client as CrucibleAgentClient, }; @@ -60,6 +72,56 @@ enum Args { #[clap(short, long)] tasks: usize, }, + + /// In a loop: pick a random region, read a random extent using the repair + /// API, and validate that it does not begin with zeroes. Note that this + /// will show a false positive if that extent's beginning has never been + /// written to. + AgentRepairTest { + /// Address of the crucible agent + #[clap(short, long)] + agent: SocketAddr, + + /// Allow reading extents from a read-write downstairs that may not be + /// closed. + #[clap(short, long)] + allow_read_write: bool, + + /// How many bytes to check if they're all zeroes + #[clap(long, default_value_t = 32768)] + verify_size: usize, + + /// Use Extent::validate + #[clap(long)] + validate: bool, + }, + + /// In a loop, pick a random extent for a region, read that extent using the + /// repair API, and validate it does not begin with zeroes. Note that this + /// will show a false positive if that extent's beginning has never been + /// written to. + RepairTest { + /// Address of the crucible agent + #[clap(short, long)] + agent: SocketAddr, + + /// Region id + #[clap(short, long)] + region_id: Uuid, + + /// Allow reading extents from a read-write downstairs that may not be + /// closed. + #[clap(short, long)] + allow_read_write: bool, + + /// How many bytes to check if they're all zeroes + #[clap(long, default_value_t = 32768)] + verify_size: usize, + + /// Use Extent::validate + #[clap(long)] + validate: bool, + }, } fn command(log: &Logger, bin: &'static str, args: &[&str]) -> Result { @@ -718,6 +780,99 @@ async fn signal_thread(log: Logger, stop_flag: Arc) { } } +async fn repair_test( + client: &reqwest::Client, + region_id: Uuid, + region_addr: SocketAddr, + verify_size: usize, + validate: bool, +) -> Result<()> { + let mut rng = rand::rng(); + + // Build a repair client + let repair_client = RepairClient::new_with_client( + &format!("http://{}", region_addr), + client.clone(), + ); + + // get a random extent + let def: RegionDefinition = + repair_client.get_region_info().await?.into_inner(); + let eid = rng.random_range(0..def.extent_count()); + + let mut stream: repair_client::ByteStream = repair_client + .get_extent_file(eid, FileType::Data) + .await? + .into_inner(); + + if validate { + // Stream the whole extent to a tmp file, then use Extent::validate + + let log = build_logger(); + + let mut tmpdir = camino_tempfile::tempdir().unwrap(); + let base_path = tmpdir.path(); + info!(log, "base path is {base_path}"); + let extent_path = + extent_path(base_path.as_std_path(), ExtentId::from(eid)); + info!(log, "extent path is {extent_path:?}"); + + mkdir_for_file(&extent_path)?; + + let mut file = File::create(&extent_path)?; + + while let Some(chunk) = stream.next().await { + let chunk = chunk?; + file.write_all(&chunk)?; + } + + drop(file); + + info!(log, "opening extent {eid} at {base_path}"); + + let extent = Extent::open( + base_path.as_std_path(), + &def, + ExtentId::from(eid), + true, // read-only + &log, + )?; + + if let Err(e) = extent.validate() { + tmpdir.disable_cleanup(true); + return Err(e.into()); + } + } else { + // Read just the beginning and look for a string of zeroes + let blank: Vec = vec![0u8; verify_size]; + let mut start = vec![1u8; verify_size]; + let mut i = 0; + + while let Some(chunk) = stream.next().await { + let chunk = chunk?; + if i >= verify_size { + break; + } + let n = std::cmp::min(chunk.len(), verify_size - i); + start[i..(i + n)].copy_from_slice(&chunk[0..n]); + i += n; + } + + if i < verify_size { + bail!("region {} extent {} truncated read!", region_id, eid); + } + + if start == blank { + eprintln!( + "region {} extent {} returned with a blank beginning!", + region_id, eid, + ); + } + } + + Ok(()) +} + #[allow( clippy::disallowed_macros, reason = "using `#[tokio::main]` in tests is fine" @@ -838,6 +993,98 @@ async fn main() -> Result<()> { info!(log, "All tasks have ended"); info!(log, "Summary: {:?}", result_summary); } + + Args::AgentRepairTest { + agent, + allow_read_write, + verify_size, + validate, + } => { + let mut rng = rand::rng(); + + let client = get_client(&agent); + + let regions: Vec = client.region_list().await?.into_inner(); + + println!("{} total regions", regions.len()); + + let regions: Vec = regions + .into_iter() + .filter(|r| r.state == RegionState::Created) + .filter(|r| r.read_only || allow_read_write) + .collect(); + + println!("{} candidate regions", regions.len()); + + let dur = std::time::Duration::from_secs(60); + + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .build() + .unwrap(); + + loop { + let region: &_ = regions.choose(&mut rng).unwrap(); + + let mut region_addr: SocketAddr = agent; + region_addr.set_port( + region.port_number + crucible_common::REPAIR_PORT_OFFSET, + ); + + repair_test( + &client, + region.id.0.parse().unwrap(), + region_addr, + verify_size, + validate, + ) + .await?; + } + } + + Args::RepairTest { + agent, + region_id, + allow_read_write, + verify_size, + validate, + } => { + let client = get_client(&agent); + + let region: Region = client + .region_get(&RegionId(region_id.to_string())) + .await? + .into_inner(); + + if !allow_read_write && !region.read_only { + eprintln!("region is read-write"); + return Ok(()); + } + + let mut region_addr: SocketAddr = agent; + region_addr.set_port( + region.port_number + crucible_common::REPAIR_PORT_OFFSET, + ); + + let dur = std::time::Duration::from_secs(60); + + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .build() + .unwrap(); + + loop { + repair_test( + &client, + region_id, + region_addr, + verify_size, + validate, + ) + .await?; + } + } } + Ok(()) } diff --git a/downstairs/src/lib.rs b/downstairs/src/lib.rs index e7d692ca3..cfb8c38ec 100644 --- a/downstairs/src/lib.rs +++ b/downstairs/src/lib.rs @@ -54,6 +54,8 @@ use region::Region; pub use admin::run_dropshot; pub use dump::{dump_region, verify_region}; pub use dynamometer::*; +pub use extent::extent_path; +pub use extent::Extent; pub use stats::{DsCountStat, DsStatOuter}; /// Single IO operation