From 6d3b503c6dc86af09661a098b3e159156bc70cd9 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Fri, 17 Oct 2025 10:49:17 -0700
Subject: [PATCH 1/3] [support bundle] Refactor into tasks

---
 .../tasks/support_bundle_collector.rs         | 682 ++++++++++++------
 1 file changed, 457 insertions(+), 225 deletions(-)

diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs
index 8dc13e7ab4..ee2224edba 100644
--- a/nexus/src/app/background/tasks/support_bundle_collector.rs
+++ b/nexus/src/app/background/tasks/support_bundle_collector.rs
@@ -59,12 +59,11 @@ use std::future::Future;
 use std::io::Write;
 use std::num::NonZeroU64;
 use std::sync::Arc;
-use std::sync::atomic::{AtomicUsize, Ordering};
 use tokio::io::AsyncReadExt;
 use tokio::io::AsyncSeekExt;
 use tokio::io::AsyncWriteExt;
 use tokio::io::SeekFrom;
-use tokio_util::task::AbortOnDropHandle;
+use tokio::sync::OnceCell;
 use tufaceous_artifact::ArtifactHash;
 use uuid::Uuid;
 use zip::ZipArchive;
@@ -428,8 +427,6 @@ impl SupportBundleCollector {
             request: request.clone(),
             bundle: bundle.clone(),
             transfer_chunk_size: request.transfer_chunk_size,
-            host_ereports_collected: AtomicUsize::new(0),
-            sp_ereports_collected: AtomicUsize::new(0),
         });
 
         let authz_bundle = authz_support_bundle_from_id(bundle.id.into());
@@ -475,8 +472,60 @@ struct BundleCollection {
     request: BundleRequest,
     bundle: SupportBundle,
     transfer_chunk_size: NonZeroU64,
-    host_ereports_collected: AtomicUsize,
-    sp_ereports_collected: AtomicUsize,
+}
+
+type CollectionStepFn = Box<
+    dyn for<'b> FnOnce(
+            &'b Arc<BundleCollection>,
+            &'b Utf8Path,
+        )
+            -> BoxFuture<'b, anyhow::Result<CollectionStepOutput>>
+        + Send,
+>;
+
+enum CollectionStepOutput {
+    HostEreports(SupportBundleEreportStatus),
+    SpEreports(SupportBundleEreportStatus),
+    SavingSpDumps { listed_sps: bool },
+    // NOTE: The ditinction between this and "Spawn" is pretty artificial -
+    // it's just to preserve a part of the report which says "we tried to
+    // list in-service sleds".
+    //
+    // If we changed the collection report, this could easily be combined
+    // with the "Spawn" variant.
+    SpawnSleds { extra_steps: Vec<(&'static str, CollectionStepFn)> },
+    Spawn { extra_steps: Vec<(&'static str, CollectionStepFn)> },
+    None,
+}
+
+impl CollectionStepOutput {
+    // Updates the collection report based on the output of a collection step,
+    // and possibly extends the set of all steps to be executed.
+    fn process(
+        self,
+        report: &mut SupportBundleCollectionReport,
+        steps: &mut Vec<(&'static str, CollectionStepFn)>,
+    ) {
+        match self {
+            CollectionStepOutput::HostEreports(status) => {
+                report.host_ereports = status;
+            }
+            CollectionStepOutput::SpEreports(status) => {
+                report.sp_ereports = status;
+            }
+            CollectionStepOutput::SavingSpDumps { listed_sps } => {
+                report.listed_sps = listed_sps;
+            }
+            CollectionStepOutput::SpawnSleds { extra_steps } => {
+                report.listed_in_service_sleds = true;
+                steps.extend(extra_steps);
+            }
+            CollectionStepOutput::Spawn { extra_steps } => {
+                steps.extend(extra_steps);
+            }
+            CollectionStepOutput::None => (),
+        }
+    }
 }
 
 impl BundleCollection {
@@ -656,37 +705,72 @@ impl BundleCollection {
         Ok(())
     }
 
-    // Perform the work of collecting the support bundle into a temporary directory
-    //
-    // - "dir" is a directory where data can be stored.
-    // - "bundle" is metadata about the bundle being collected.
-    //
-    // If a partial bundle can be collected, it should be returned as
-    // an Ok(SupportBundleCollectionReport). Any failures from this function
-    // will prevent the support bundle from being collected altogether.
-    //
-    // NOTE: The background task infrastructure will periodically check to see
-    // if the bundle has been cancelled by a user while it is being collected.
-    // If that happens, this function will be CANCELLED at an await point.
-    //
-    // As a result, it is important that this function be implemented as
-    // cancel-safe.
-    async fn collect_bundle_as_file(
+    async fn run_collect_bundle_steps(
         self: &Arc<Self>,
-        dir: &Utf8TempDir,
-    ) -> anyhow::Result<SupportBundleCollectionReport> {
-        let log = &self.log;
-
-        info!(&log, "Collecting bundle as local file");
-
+        output: &Utf8TempDir,
+        mut steps: Vec<(&'static str, CollectionStepFn)>,
+    ) -> SupportBundleCollectionReport {
         let mut report =
             SupportBundleCollectionReport::new(self.bundle.id.into());
 
-        tokio::fs::write(
-            dir.path().join("bundle_id.txt"),
-            self.bundle.id.to_string(),
-        )
-        .await?;
+        const MAX_CONCURRENT_STEPS: usize = 16;
+        let mut tasks =
+            ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS);
+
+        loop {
+            // Process all the currently-planned steps
+            while let Some((step_name, step)) = steps.pop() {
+                let previous_result = tasks.spawn({
+                    let collection = self.clone();
+                    let dir = output.path().to_path_buf();
+                    async move {
+                        debug!(collection.log, "Running step"; "name" => &step_name);
+                        step(&collection, dir.as_path()).await.inspect_err(|err| {
+                            warn!(
+                                collection.log,
+                                "Step failed";
+                                "name" => &step_name,
+                                InlineErrorChain::new(err.as_ref()),
+                            );
+                        })
+                    }
+                }).await;
+
+                if let Some(Ok(output)) = previous_result {
+                    output.process(&mut report, &mut steps);
+                };
+            }
+
+            // If we've run out of tasks to spawn, join all the existing steps.
+            while let Some(previous_result) = tasks.join_next().await {
+                if let Ok(output) = previous_result {
+                    output.process(&mut report, &mut steps);
+                };
+            }
+
+            // Executing steps may create additional steps, as follow-up work.
+            //
+            // Only finish if we've exhausted all possible steps and joined all spawned work.
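+            // (Reaching this check means `tasks` has already been fully
+            // joined, so an empty `steps` list implies no spawned or pending
+            // work remains either.)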
+            if steps.is_empty() {
+                return report;
+            }
+        }
+    }
 
+    async fn collect_bundle_id(
+        &self,
+        dir: &Utf8Path,
+    ) -> anyhow::Result<CollectionStepOutput> {
+        tokio::fs::write(dir.join("bundle_id.txt"), self.bundle.id.to_string())
+            .await?;
+
+        Ok(CollectionStepOutput::None)
+    }
+
+    async fn collect_reconfigurator_state(
+        &self,
+        dir: &Utf8Path,
+    ) -> anyhow::Result<CollectionStepOutput> {
         // Collect reconfigurator state
         const NMAX_BLUEPRINTS: usize = 300;
 
         match reconfigurator_state_load(
@@ -697,7 +781,7 @@ impl BundleCollection {
             .await
         {
             Ok(state) => {
-                let file_path = dir.path().join("reconfigurator_state.json");
+                let file_path = dir.join("reconfigurator_state.json");
                 let file = std::fs::OpenOptions::new()
                     .create(true)
                     .write(true)
@@ -713,7 +797,7 @@ impl BundleCollection {
                     },
                 )?;
                 info!(
-                    log,
+                    self.log,
                     "Support bundle: collected reconfigurator state";
                     "target_blueprint" => ?state.target_blueprint,
                     "num_blueprints" => state.blueprints.len(),
@@ -722,152 +806,322 @@ impl BundleCollection {
            }
             Err(err) => {
                 warn!(
-                    log,
+                    self.log,
                     "Support bundle: failed to collect reconfigurator state";
                     "err" => ?err,
                 );
             }
-        }
+        };
+
+        Ok(CollectionStepOutput::None)
+    }
 
-        let ereport_collection = if let Some(ref ereport_filters) =
-            self.request.ereport_query
+    async fn collect_host_ereports(
+        self: &Arc<Self>,
+        dir: &Utf8Path,
+    ) -> anyhow::Result<CollectionStepOutput> {
+        let Some(ref ereport_filters) = self.request.ereport_query else {
+            debug!(self.log, "Support bundle: ereports not requested");
+            return Ok(CollectionStepOutput::None);
+        };
+        let ereports_dir = dir.join("ereports");
+        let status = match self
+            .save_host_ereports(ereport_filters.clone(), ereports_dir.clone())
+            .await
         {
-            // If ereports are to be included in the bundle, have someone go do
-            // that in the background while we're gathering up other stuff. Note
-            // that the `JoinHandle`s for these tasks are wrapped in
-            // `AbortOnDropHandle`s for cancellation correctness; this ensures
-            // that if collecting the bundle is cancelled and this future is
-            // dropped, the tasks that we've spawned to collect ereports are
-            // aborted as well.
-            let dir = dir.path().join("ereports");
-            let host = AbortOnDropHandle::new(tokio::spawn(
-                self.clone().collect_host_ereports(
-                    ereport_filters.clone(),
-                    dir.clone(),
-                ),
-            ));
-            let sp = AbortOnDropHandle::new(tokio::spawn(
-                self.clone().collect_sp_ereports(ereport_filters.clone(), dir),
-            ));
-            Some((host, sp))
-        } else {
-            debug!(log, "Support bundle: ereports not requested");
-            None
+            Ok(n_collected) => {
+                SupportBundleEreportStatus::Collected { n_collected }
+            }
+            Err((n_collected, err)) => {
+                warn!(
+                    &self.log,
+                    "Support bundle: host ereport collection failed \
+                     ({n_collected} collected successfully)";
+                    InlineErrorChain::new(err.as_ref()),
+                );
+
+                SupportBundleEreportStatus::Failed {
+                    n_collected,
+                    error: err.to_string(),
+                }
+            }
         };
 
-        let all_sleds = self
-            .datastore
-            .sled_list_all_batched(&self.opctx, SledFilter::InService)
-            .await;
+        Ok(CollectionStepOutput::HostEreports(status))
+    }
 
-        if let Ok(mgs_client) = self.create_mgs_client().await {
-            if let Err(e) = write_sled_info(
-                &self.log,
-                &mgs_client,
-                all_sleds.as_deref().ok(),
-                dir.path(),
-            )
+    async fn collect_sp_ereports(
+        self: &Arc<Self>,
+        dir: &Utf8Path,
+    ) -> anyhow::Result<CollectionStepOutput> {
+        let Some(ref ereport_filters) = self.request.ereport_query else {
+            debug!(self.log, "Support bundle: ereports not requested");
+            return Ok(CollectionStepOutput::None);
+        };
+        let ereports_dir = dir.join("ereports");
+        let status = match self
+            .save_sp_ereports(ereport_filters.clone(), ereports_dir.clone())
             .await
-            {
-                error!(log, "Failed to write sled_info.json"; "error" => InlineErrorChain::new(e.as_ref()));
+        {
+            Ok(n_collected) => {
+                SupportBundleEreportStatus::Collected { n_collected }
             }
+            Err((n_collected, err)) => {
+                warn!(
+                    &self.log,
+                    "Support bundle: sp ereport collection failed \
+                     ({n_collected} collected successfully)";
+                    InlineErrorChain::new(err.as_ref()),
+                );
 
-            let sp_dumps_dir = dir.path().join("sp_task_dumps");
-            tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(
-                || {
-                    format!(
-                        "Failed to create SP task dump directory {sp_dumps_dir}"
-                    )
-                },
-            )?;
+                SupportBundleEreportStatus::Failed {
+                    n_collected,
+                    error: err.to_string(),
+                }
+            }
+        };
 
-            if let Err(e) =
-                save_all_sp_dumps(log, &mgs_client, &sp_dumps_dir).await
-            {
-                error!(log, "Failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref()));
-            } else {
-                report.listed_sps = true;
-            };
-        } else {
-            warn!(log, "No MGS client, skipping SP task dump collection");
-        }
+        Ok(CollectionStepOutput::SpEreports(status))
+    }
 
-        if let Ok(all_sleds) = all_sleds {
-            report.listed_in_service_sleds = true;
+    async fn get_or_initialize_mgs_client<'a>(
+        &self,
+        mgs_client: &'a OnceCell<Arc<Option<MgsClient>>>,
+    ) -> &'a Arc<Option<MgsClient>> {
+        mgs_client
+            .get_or_init(|| async {
+                Arc::new(self.create_mgs_client().await.ok())
+            })
+            .await
+    }
 
-            const MAX_CONCURRENT_SLED_REQUESTS: usize = 16;
-            const FAILURE_MESSAGE: &str =
-                "Failed to fully collect support bundle info from sled";
-            let mut set = ParallelTaskSet::new_with_parallelism(
-                MAX_CONCURRENT_SLED_REQUESTS,
-            );
+    async fn get_or_initialize_all_sleds<'a>(
+        &self,
+        all_sleds: &'a OnceCell<Arc<Option<Vec<Sled>>>>,
+    ) -> &'a Arc<Option<Vec<Sled>>> {
+        all_sleds
+            .get_or_init(|| async {
+                Arc::new(
+                    self.datastore
+                        .sled_list_all_batched(
+                            &self.opctx,
+                            SledFilter::InService,
+                        )
+                        .await
+                        .ok(),
+                )
+            })
+            .await
+    }
+
+    async fn collect_sled_cubby_info(
+        &self,
+        all_sleds: &OnceCell<Arc<Option<Vec<Sled>>>>,
+        mgs_client: &OnceCell<Arc<Option<MgsClient>>>,
+        dir: &Utf8Path,
+    ) -> anyhow::Result<CollectionStepOutput> {
+        let Some(mgs_client) =
+            &**self.get_or_initialize_mgs_client(mgs_client).await
+        else {
+            warn!(
+                self.log,
+                "No MGS client, skipping sled cubby info collection"
+            );
+            return Ok(CollectionStepOutput::None);
+        };
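+        // The Nexus sled list below is best-effort: even without it,
+        // `write_sled_cubby_info` can still build a useful cubby-to-serial
+        // mapping from MGS data alone.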
+        let nexus_sleds = self
+            .get_or_initialize_all_sleds(all_sleds)
+            .await
+            .as_deref()
+            .unwrap_or_default();
+
+        write_sled_cubby_info(&self.log, mgs_client, nexus_sleds, dir).await?;
+
+        Ok(CollectionStepOutput::None)
+    }
+
+    async fn spawn_sp_dump_collection(
+        &self,
+        mgs_client: &OnceCell<Arc<Option<MgsClient>>>,
+        dir: &Utf8Path,
+    ) -> anyhow::Result<CollectionStepOutput> {
+        let Some(mgs_client) =
+            &**self.get_or_initialize_mgs_client(mgs_client).await
+        else {
+            warn!(self.log, "No MGS client, skipping SP task dump collection");
+            return Ok(CollectionStepOutput::None);
+        };
 
-            for sled in all_sleds {
-                let prev_result = set
-                    .spawn({
-                        let collection: Arc<BundleCollection> = self.clone();
-                        let dir = dir.path().to_path_buf();
+        let sp_dumps_dir = dir.join("sp_task_dumps");
+        tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| {
+            format!("Failed to create SP task dump directory {sp_dumps_dir}")
+        })?;
+
+        let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![];
+        for sp in get_available_sps(&mgs_client).await? {
+            extra_steps.push((
+                "sp dump",
+                Box::new({
+                    let mgs_client = mgs_client.clone();
+                    move |collection, dir| {
                         async move {
-                            collection.collect_data_from_sled(&sled, &dir).await
+                            collection
+                                .collect_sp_dump(&mgs_client, sp, dir)
+                                .await
                         }
-                    })
-                    .await;
-                if let Some(Err(err)) = prev_result {
-                    warn!(&self.log, "{FAILURE_MESSAGE}"; "err" => ?err);
-                }
-            }
-            while let Some(result) = set.join_next().await {
-                if let Err(err) = result {
-                    warn!(&self.log, "{FAILURE_MESSAGE}"; "err" => ?err);
-                }
-            }
+                        .boxed()
+                    }
+                }),
+            ));
         }
 
-        if let Some((host, sp)) = ereport_collection {
-            let (host, sp) = tokio::join!(host, sp);
-            const TASK_FAILURE_MSG: &str = "task failed";
-            let n_collected =
-                self.host_ereports_collected.load(Ordering::Acquire);
-            report.host_ereports = match host
-                .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}"))
-                .and_then(|x| x)
-            {
-                Ok(_) => SupportBundleEreportStatus::Collected { n_collected },
-                Err(err) => {
-                    warn!(
-                        &self.log,
-                        "Support bundle: host ereport collection failed \
-                         ({n_collected} collected successfully)";
-                        "err" => ?err,
-                    );
-                    SupportBundleEreportStatus::Failed {
-                        n_collected,
-                        error: err.to_string(),
-                    }
-                }
-            };
-            let n_collected =
-                self.sp_ereports_collected.load(Ordering::Acquire);
-            report.sp_ereports = match sp
-                .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}"))
-                .and_then(|x| x)
-            {
-                Ok(_) => SupportBundleEreportStatus::Collected { n_collected },
-                Err(err) => {
-                    warn!(
-                        &self.log,
-                        "Support bundle: SP ereport collection failed \
-                         ({n_collected} collected successfully)";
-                        "err" => ?err,
-                    );
-                    SupportBundleEreportStatus::Failed {
-                        n_collected,
-                        error: err.to_string(),
-                    }
-                }
-            };
+        Ok(CollectionStepOutput::Spawn { extra_steps })
+    }
+
+    async fn collect_sp_dump(
+        &self,
+        mgs_client: &MgsClient,
+        sp: SpIdentifier,
+        dir: &Utf8Path,
+    ) -> anyhow::Result<CollectionStepOutput> {
+        save_sp_dumps(mgs_client, sp, dir)
+            .await
+            .with_context(|| format!("SP {} {}", sp.type_, sp.slot))?;
+
+        Ok(CollectionStepOutput::SavingSpDumps { listed_sps: true })
+    }
+
+    // Perform the work of collecting the support bundle into a temporary directory
+    //
+    // - "dir" is a directory where data can be stored.
+    // - "bundle" is metadata about the bundle being collected.
+    //
+    // If a partial bundle can be collected, it should be returned as
+    // an Ok(SupportBundleCollectionReport). Any failures from this function
+    // will prevent the support bundle from being collected altogether.
+    //
+    // NOTE: The background task infrastructure will periodically check to see
+    // if the bundle has been cancelled by a user while it is being collected.
+    // If that happens, this function will be CANCELLED at an await point.
+    //
+    // As a result, it is important that this function be implemented as
+    // cancel-safe.
+    async fn collect_bundle_as_file(
+        self: &Arc<Self>,
+        dir: &Utf8TempDir,
+    ) -> anyhow::Result<SupportBundleCollectionReport> {
+        let log = &self.log;
+
+        info!(&log, "Collecting bundle as local file");
+
+        // Shared, lazy, fallible initialization for sleds
+        let all_sleds: OnceCell<Arc<Option<Vec<Sled>>>> = OnceCell::new();
+        // Shared, lazy, fallible initialization for MGS client
+        let mgs_client: OnceCell<Arc<Option<MgsClient>>> = OnceCell::new();
+
+        let steps: Vec<(&str, CollectionStepFn)> = vec![
+            (
+                "bundle id",
+                Box::new(|collection, dir| {
+                    collection.collect_bundle_id(dir).boxed()
+                }),
+            ),
+            (
+                "reconfigurator state",
+                Box::new(|collection, dir| {
+                    collection.collect_reconfigurator_state(dir).boxed()
+                }),
+            ),
+            (
+                "host ereports",
+                Box::new(|collection, dir| {
+                    collection.collect_host_ereports(dir).boxed()
+                }),
+            ),
+            (
+                "sp ereports",
+                Box::new(|collection, dir| {
+                    collection.collect_sp_ereports(dir).boxed()
+                }),
+            ),
+            (
+                "sled cubby info",
+                Box::new({
+                    let all_sleds = all_sleds.clone();
+                    let mgs_client = mgs_client.clone();
+                    move |collection, dir| {
+                        async move {
+                            collection
+                                .collect_sled_cubby_info(
+                                    &all_sleds,
+                                    &mgs_client,
+                                    dir,
+                                )
+                                .await
+                        }
+                        .boxed()
+                    }
+                }),
+            ),
+            (
+                "spawn steps to query all sp dumps",
+                Box::new({
+                    let mgs_client = mgs_client.clone();
+                    move |collection, dir| {
+                        async move {
+                            collection
+                                .spawn_sp_dump_collection(&mgs_client, dir)
+                                .await
+                        }
+                        .boxed()
+                    }
+                }),
+            ),
+            (
+                "spawn steps to query all sleds",
+                Box::new({
+                    let all_sleds = all_sleds.clone();
+                    move |collection, _| {
+                        async move {
+                            collection.spawn_query_all_sleds(&all_sleds).await
+                        }
+                        .boxed()
+                    }
+                }),
+            ),
+        ];
+
+        Ok(self.run_collect_bundle_steps(dir, steps).await)
+    }
+
+    async fn spawn_query_all_sleds(
+        &self,
+        all_sleds: &OnceCell<Arc<Option<Vec<Sled>>>>,
+    ) -> anyhow::Result<CollectionStepOutput> {
+        let Some(all_sleds) =
+            self.get_or_initialize_all_sleds(all_sleds).await.as_deref()
+        else {
+            warn!(self.log, "Could not read list of sleds");
+            return Ok(CollectionStepOutput::None);
+        };
+
+        let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![];
+        for sled in all_sleds {
+            extra_steps.push((
+                "sled data",
+                Box::new({
+                    let sled = sled.clone();
+                    move |collection, dir| {
+                        async move {
+                            collection.collect_data_from_sled(&sled, dir).await
+                        }
+                        .boxed()
+                    }
+                }),
+            ));
         }
-        Ok(report)
+
+        return Ok(CollectionStepOutput::SpawnSleds { extra_steps });
     }
 
     // Collect data from a sled, storing it into a directory that will
@@ -880,7 +1134,7 @@ impl BundleCollection {
         &self,
         sled: &nexus_db_model::Sled,
         dir: &Utf8Path,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<CollectionStepOutput> {
         let log = &self.log;
         info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id());
         let sled_path = dir
@@ -893,7 +1147,7 @@ impl BundleCollection {
             .await?;
 
         if self.request.skip_sled_info {
-            return Ok(());
+            return Ok(CollectionStepOutput::None);
         }
 
         let Ok(sled_client) = nexus_networking::sled_client(
@@ -909,7 +1163,7 @@ impl BundleCollection {
                 "Could not contact sled",
             )
            .await?;
-            return Ok(());
+            return Ok(CollectionStepOutput::None);
        };
 
         // NB: As new sled-diagnostic commands are added they should
@@ -1014,14 +1268,15 @@ impl BundleCollection {
                 error!(&self.log, "failed to write logs output: {e}");
             }
         }
-        return Ok(());
+        return Ok(CollectionStepOutput::None);
     }
 
-    async fn collect_sp_ereports(
-        self: Arc<Self>,
+    async fn save_host_ereports(
+        self: &Arc<Self>,
         filters: EreportFilters,
         dir: Utf8PathBuf,
-    ) -> anyhow::Result<()> {
+    ) -> Result<usize, (usize, anyhow::Error)> {
+        let mut reports = 0;
         let mut paginator = Paginator::new(
             datastore::SQL_BATCH_SIZE,
             dropshot::PaginationOrder::Ascending,
@@ -1029,40 +1284,50 @@ impl BundleCollection {
         while let Some(p) = paginator.next() {
             let ereports = self
                 .datastore
-                .sp_ereports_fetch_matching(
+                .host_ereports_fetch_matching(
                     &self.opctx,
                     &filters,
                     &p.current_pagparams(),
                 )
                 .await
                 .map_err(|e| {
-                    e.internal_context("failed to query for SP ereports")
+                    (
+                        reports,
+                        e.internal_context(
+                            "failed to query for host OS ereports",
+                        )
+                        .into(),
+                    )
                 })?;
             paginator = p.found_batch(&ereports, &|ereport| {
                 (ereport.restart_id.into_untyped_uuid(), ereport.ena)
             });
             let n_ereports = ereports.len();
             for ereport in ereports {
-                write_ereport(ereport.into(), &dir).await?;
-                self.sp_ereports_collected.fetch_add(1, Ordering::Release);
+                write_ereport(ereport.into(), &dir)
+                    .await
+                    .map_err(|e| (reports, e))?;
+                reports += 1;
             }
-            debug!(self.log, "Support bundle: added {n_ereports} SP ereports");
+            debug!(
+                self.log,
+                "Support bundle: added {n_ereports} host OS ereports"
+            );
         }
         info!(
             self.log,
-            "Support bundle: collected {} total SP ereports",
-            self.sp_ereports_collected.load(Ordering::Relaxed)
+            "Support bundle: collected {} total host ereports", reports
         );
-        Ok(())
+        Ok(reports)
     }
 
-    async fn collect_host_ereports(
-        self: Arc<Self>,
+    async fn save_sp_ereports(
+        self: &Arc<Self>,
         filters: EreportFilters,
         dir: Utf8PathBuf,
-    ) -> anyhow::Result<()> {
+    ) -> Result<usize, (usize, anyhow::Error)> {
+        let mut reports = 0;
         let mut paginator = Paginator::new(
             datastore::SQL_BATCH_SIZE,
             dropshot::PaginationOrder::Ascending,
@@ -1070,35 +1335,37 @@ impl BundleCollection {
         while let Some(p) = paginator.next() {
             let ereports = self
                 .datastore
-                .host_ereports_fetch_matching(
+                .sp_ereports_fetch_matching(
                     &self.opctx,
                     &filters,
                     &p.current_pagparams(),
                 )
                 .await
                 .map_err(|e| {
-                    e.internal_context("failed to query for host OS ereports")
+                    (
+                        reports,
+                        e.internal_context("failed to query for SP ereports")
+                            .into(),
+                    )
                 })?;
             paginator = p.found_batch(&ereports, &|ereport| {
                 (ereport.restart_id.into_untyped_uuid(), ereport.ena)
             });
             let n_ereports = ereports.len();
             for ereport in ereports {
-                write_ereport(ereport.into(), &dir).await?;
-                self.host_ereports_collected.fetch_add(1, Ordering::Release);
+                write_ereport(ereport.into(), &dir)
+                    .await
+                    .map_err(|e| (reports, e))?;
+                reports += 1;
             }
-            debug!(
-                self.log,
-                "Support bundle: added {n_ereports} host OS ereports"
-            );
+            debug!(self.log, "Support bundle: added {n_ereports} SP ereports");
         }
         info!(
             self.log,
-            "Support bundle: collected {} total host ereports",
-            self.host_ereports_collected.load(Ordering::Relaxed)
+            "Support bundle: collected {} total SP ereports", reports
         );
-        Ok(())
+        Ok(reports)
     }
 
     async fn create_mgs_client(&self) -> anyhow::Result<MgsClient> {
@@ -1396,40 +1663,6 @@ where
     Ok(())
 }
 
-/// Collect task dumps from all SPs via MGS and save them to a directory.
-async fn save_all_sp_dumps(
-    log: &slog::Logger,
-    mgs_client: &MgsClient,
-    sp_dumps_dir: &Utf8Path,
-) -> anyhow::Result<()> {
-    let available_sps = get_available_sps(&mgs_client).await?;
-
-    let mut tasks = ParallelTaskSet::new();
-    for sp in available_sps {
-        let mgs_client = mgs_client.clone();
-        let sp_dumps_dir = sp_dumps_dir.to_owned();
-
-        tasks
-            .spawn(async move {
-                save_sp_dumps(mgs_client, sp, sp_dumps_dir)
-                    .await
-                    .with_context(|| format!("SP {} {}", sp.type_, sp.slot))
-            })
-            .await;
-    }
-    for result in tasks.join_all().await {
-        if let Err(e) = result {
-            error!(
-                log,
-                "failed to capture task dumps";
-                "error" => InlineErrorChain::new(e.as_ref())
-            );
-        }
-    }
-
-    Ok(())
-}
-
 /// Use MGS ignition info to find active SPs.
 async fn get_available_sps(
     mgs_client: &MgsClient,
@@ -1455,9 +1688,9 @@ async fn get_available_sps(
 
 /// Fetch and save task dumps from a single SP.
 async fn save_sp_dumps(
-    mgs_client: MgsClient,
+    mgs_client: &MgsClient,
     sp: SpIdentifier,
-    sp_dumps_dir: Utf8PathBuf,
+    sp_dumps_dir: &Utf8Path,
 ) -> anyhow::Result<()> {
     let dump_count = mgs_client
         .sp_task_dump_count(&sp.type_, sp.slot)
@@ -1488,10 +1721,10 @@ async fn save_sp_dumps(
 
 /// Write a file with a JSON mapping of sled serial numbers to cubby and UUIDs for easier
 /// identification of sleds present in a bundle.
-async fn write_sled_info(
+async fn write_sled_cubby_info(
     log: &slog::Logger,
     mgs_client: &MgsClient,
-    nexus_sleds: Option<&[Sled]>,
+    nexus_sleds: &[Sled],
     dir: &Utf8Path,
 ) -> anyhow::Result<()> {
     #[derive(Serialize)]
@@ -1506,7 +1739,6 @@ async fn write_sled_cubby_info(
 
     // We can still get a useful mapping of cubby to serial using just the data from MGS.
     let mut nexus_map: BTreeMap<_, _> = nexus_sleds
-        .unwrap_or_default()
         .into_iter()
         .map(|sled| (sled.serial_number(), sled))
         .collect();

From 70f0bf16c03fbc1cb6109e732f553976b3e5f7ef Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Mon, 27 Oct 2025 14:33:49 -0700
Subject: [PATCH 2/3] review feedback

---
 .../tasks/support_bundle_collector.rs         | 38 ++++++++++++-----
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs
index f064dac263..fbd191ab5b 100644
--- a/nexus/src/app/background/tasks/support_bundle_collector.rs
+++ b/nexus/src/app/background/tasks/support_bundle_collector.rs
@@ -474,6 +474,14 @@ struct BundleCollection {
     transfer_chunk_size: NonZeroU64,
 }
 
+// This type describes a single step in the Support Bundle collection.
+//
+// - All steps have access to the "BundleCollection", which includes
+//   tools for actually acquiring data.
+// - All steps have access to an output directory where they can store
+//   serialized data to a file.
+// - Finally, all steps can emit a "CollectionStepOutput", which can either
+//   update the collection report, or generate more steps.
 type CollectionStepFn = Box<
     dyn for<'b> FnOnce(
             &'b Arc<BundleCollection>,
             &'b Utf8Path,
         )
@@ -487,7 +495,7 @@ enum CollectionStepOutput {
     HostEreports(SupportBundleEreportStatus),
     SpEreports(SupportBundleEreportStatus),
     SavingSpDumps { listed_sps: bool },
-    // NOTE: The ditinction between this and "Spawn" is pretty artificial -
+    // NOTE: The distinction between this and "Spawn" is pretty artificial -
     // it's just to preserve a part of the report which says "we tried to
     // list in-service sleds".
     //
     // If we changed the collection report, this could easily be combined
     // with the "Spawn" variant.
@@ -789,7 +797,7 @@ impl BundleCollection {
         let mut tasks =
             ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS);
 
-        loop {
+        'run: loop {
             // Process all the currently-planned steps
             while let Some((step_name, step)) = steps.pop() {
                 let previous_result = tasks.spawn({
@@ -813,11 +821,16 @@ impl BundleCollection {
                 };
             }
 
-            // If we've run out of tasks to spawn, join all the existing steps.
+            // If we've run out of tasks to spawn, join any of the existing steps.
             while let Some(previous_result) = tasks.join_next().await {
                 if let Ok(output) = previous_result {
                     output.process(&mut report, &mut steps);
                 };
+
+                // As soon as any task completes, see if we can spawn more work
+                // immediately. This ensures that the ParallelTaskSet is
+                // saturated as much as it can be.
+                continue 'run;
             }
 
             // Executing steps may create additional steps, as follow-up work.
@@ -1056,17 +1069,16 @@ impl BundleCollection {
         sp: SpIdentifier,
         dir: &Utf8Path,
     ) -> anyhow::Result<CollectionStepOutput> {
-        save_sp_dumps(mgs_client, sp, dir)
-            .await
-            .with_context(|| format!("SP {} {}", sp.type_, sp.slot))?;
+        save_sp_dumps(mgs_client, sp, dir).await.with_context(|| {
+            format!("failed to save SP dump from: {} {}", sp.type_, sp.slot)
+        })?;
 
         Ok(CollectionStepOutput::SavingSpDumps { listed_sps: true })
     }
 
     // Perform the work of collecting the support bundle into a temporary directory
     //
-    // - "dir" is a directory where data can be stored.
-    // - "bundle" is metadata about the bundle being collected.
+    // "dir" is an output directory where data can be stored.
     //
     // If a partial bundle can be collected, it should be returned as
     // an Ok(SupportBundleCollectionReport). Any failures from this function
     // will prevent the support bundle from being collected altogether.
@@ -1078,6 +1090,10 @@ impl BundleCollection {
     //
     // NOTE: The background task infrastructure will periodically check to see
     // if the bundle has been cancelled by a user while it is being collected.
     // If that happens, this function will be CANCELLED at an await point.
     //
     // As a result, it is important that this function be implemented as
     // cancel-safe.
+    //
+    // The "steps" used within this function - passed to
+    // [`Self::run_collect_bundle_steps`] - are run on a [`ParallelTaskSet`],
+    // which automatically aborts tasks when it is dropped.
     async fn collect_bundle_as_file(
         self: &Arc<Self>,
         dir: &Utf8TempDir,
@@ -1091,7 +1107,7 @@ impl BundleCollection {
         // Shared, lazy, fallible initialization for MGS client
         let mgs_client: OnceCell<Arc<Option<MgsClient>>> = OnceCell::new();
 
-        let steps: Vec<(&str, CollectionStepFn)> = vec![
+        let steps: Vec<(&'static str, CollectionStepFn)> = vec![
             (
                 "bundle id",
                 Box::new(|collection, dir| {
@@ -1771,7 +1787,9 @@ async fn save_sp_dumps(
         .into_inner();
 
     let output_dir = sp_dumps_dir.join(format!("{}_{}", sp.type_, sp.slot));
-    tokio::fs::create_dir_all(&output_dir).await?;
+    tokio::fs::create_dir_all(&output_dir).await.with_context(|| {
+        format!("Failed to create output directory {output_dir}")
+    })?;
 
     for i in 0..dump_count {
         let task_dump = mgs_client

From 8c76fc9b3b3f90691f2c5615754ab77da9f52973 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Mon, 27 Oct 2025 16:46:10 -0700
Subject: [PATCH 3/3] clippy

---
 .../src/app/background/tasks/support_bundle_collector.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs
index fbd191ab5b..c0ae5e7193 100644
--- a/nexus/src/app/background/tasks/support_bundle_collector.rs
+++ b/nexus/src/app/background/tasks/support_bundle_collector.rs
@@ -797,7 +797,7 @@ impl BundleCollection {
         let mut tasks =
             ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS);
 
-        'run: loop {
+        loop {
             // Process all the currently-planned steps
             while let Some((step_name, step)) = steps.pop() {
                 let previous_result = tasks.spawn({
@@ -821,8 +821,9 @@ impl BundleCollection {
                 };
             }
 
-            // If we've run out of tasks to spawn, join any of the existing steps.
-            while let Some(previous_result) = tasks.join_next().await {
+            // If we've run out of tasks to spawn, join any of the previously
+            // spawned tasks, if any exist.
+            if let Some(previous_result) = tasks.join_next().await {
                 if let Ok(output) = previous_result {
                     output.process(&mut report, &mut steps);
                 };
@@ -830,7 +831,7 @@ impl BundleCollection {
                 // As soon as any task completes, see if we can spawn more work
                 // immediately. This ensures that the ParallelTaskSet is
                 // saturated as much as it can be.
-                continue 'run;
+                continue;
             }
 
             // Executing steps may create additional steps, as follow-up work.