diff --git a/sled-agent/config-reconciler/src/dump_setup_task.rs b/sled-agent/config-reconciler/src/dump_setup_task.rs
new file mode 100644
index 00000000000..4a721e21f4e
--- /dev/null
+++ b/sled-agent/config-reconciler/src/dump_setup_task.rs
@@ -0,0 +1,127 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Long-running tokio task responsible for updating the dump device setup in
+//! response to changes in available disks.
+
+use crate::InternalDisksReceiver;
+use crate::dump_setup::DumpSetup;
+use sled_storage::config::MountConfig;
+use sled_storage::disk::Disk;
+use slog::Logger;
+use slog::error;
+use std::collections::HashSet;
+use std::ops::Deref;
+use std::sync::Arc;
+use tokio::sync::watch;
+
+/// Spawns a detached tokio task that runs a `DumpSetupTask`.
+///
+/// The `JoinHandle` returned by `tokio::spawn` is discarded; the task exits on
+/// its own once either input channel is closed.
+pub(crate) fn spawn(
+    internal_disks_rx: InternalDisksReceiver,
+    external_disks_rx: watch::Receiver<HashSet<Disk>>,
+    mount_config: Arc<MountConfig>,
+    base_log: &Logger,
+) {
+    tokio::spawn(
+        DumpSetupTask::new(
+            internal_disks_rx,
+            external_disks_rx,
+            mount_config,
+            base_log,
+        )
+        .run(),
+    );
+}
+
+/// State owned by the dump-setup task: its two input channels, the
+/// `DumpSetup` it drives, and the disk set it last acted on.
+struct DumpSetupTask {
+    // Input channels on which we receive updates about disk changes.
+    internal_disks_rx: InternalDisksReceiver,
+    external_disks_rx: watch::Receiver<HashSet<Disk>>,
+
+    // Invokes dumpadm(8) and savecore(8) when new disks are encountered
+    dump_setup: DumpSetup,
+
+    // Set of internal + external disks we most recently passed to `dump_setup`.
+    last_disks_used: HashSet<Disk>,
+
+    log: Logger,
+}
+
+impl DumpSetupTask {
+    /// Builds the task state. `last_disks_used` starts empty, and the task's
+    /// logger is a child of `base_log` tagged `component => DumpSetupTask`.
+    fn new(
+        internal_disks_rx: InternalDisksReceiver,
+        external_disks_rx: watch::Receiver<HashSet<Disk>>,
+        mount_config: Arc<MountConfig>,
+        base_log: &Logger,
+    ) -> Self {
+        Self {
+            internal_disks_rx,
+            external_disks_rx,
+            dump_setup: DumpSetup::new(base_log, mount_config),
+            last_disks_used: HashSet::new(),
+            log: base_log.new(slog::o!("component" => "DumpSetupTask")),
+        }
+    }
+
+    /// Main loop: reconcile the dump setup against the current disks, then
+    /// wait until either input channel reports a change. Returns (ending the
+    /// task) when either channel is closed.
+    async fn run(mut self) {
+        loop {
+            self.update_setup_if_needed().await;
+
+            // Wait for changes on either input channel. Exit if either channel
+            // is closed, which should never happen in production.
+            tokio::select! {
+                // Cancel-safe per docs on `changed()`
+                res = self.internal_disks_rx.changed() => {
+                    if res.is_err() {
+                        error!(
+                            self.log,
+                            "internal disks channel closed: exiting task"
+                        );
+                        return;
+                    }
+                }
+
+                // Cancel-safe per docs on `changed()`
+                res = self.external_disks_rx.changed() => {
+                    if res.is_err() {
+                        error!(
+                            self.log,
+                            "external disks channel closed: exiting task"
+                        );
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
+    /// Snapshots both channels into one set and, only if that set differs from
+    /// `last_disks_used`, passes it to `DumpSetup::update_dumpdev_setup` and
+    /// records it as the new `last_disks_used`.
+    async fn update_setup_if_needed(&mut self) {
+        // Combine internal and external disks.
+        let disks_avail = self
+            .internal_disks_rx
+            .borrow_and_update_raw_disks()
+            .iter()
+            .map(|d| d.deref().clone())
+            .chain(self.external_disks_rx.borrow_and_update().iter().cloned())
+            .collect::<HashSet<_>>();
+
+        if disks_avail != self.last_disks_used {
+            self.dump_setup.update_dumpdev_setup(disks_avail.iter()).await;
+            self.last_disks_used = disks_avail;
+        }
+    }
+}
diff --git a/sled-agent/config-reconciler/src/handle.rs b/sled-agent/config-reconciler/src/handle.rs
index cdae6a14dac..7c5d07879ae 100644
--- a/sled-agent/config-reconciler/src/handle.rs
+++ b/sled-agent/config-reconciler/src/handle.rs
@@ -3,7 +3,6 @@
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
use camino::Utf8PathBuf; -use illumos_utils::dladm::EtherstubVnic; use illumos_utils::zpool::PathInPool; use key_manager::StorageKeyRequester; use nexus_sled_agent_shared::inventory::InventoryDataset; @@ -14,10 +13,12 @@ use omicron_common::disk::DatasetName; use omicron_common::disk::DiskIdentity; use sled_agent_api::ArtifactConfig; use sled_storage::config::MountConfig; +use sled_storage::disk::Disk; use sled_storage::manager::NestedDatasetConfig; use sled_storage::manager::NestedDatasetListOptions; use sled_storage::manager::NestedDatasetLocation; use slog::Logger; +use std::collections::HashSet; use std::sync::Arc; use std::sync::OnceLock; use tokio::sync::watch; @@ -45,6 +46,7 @@ use crate::SledAgentFacilities; use crate::TimeSyncStatus; use crate::dataset_serialization_task::DatasetTaskHandle; use crate::dataset_serialization_task::NestedDatasetMountError; +use crate::dump_setup_task; use crate::internal_disks::InternalDisksReceiver; use crate::ledger::LedgerTaskHandle; use crate::raw_disks; @@ -68,6 +70,7 @@ pub struct ConfigReconcilerSpawnToken { time_sync_config: TimeSyncConfig, reconciler_result_tx: watch::Sender, currently_managed_zpools_tx: watch::Sender>, + external_disks_tx: watch::Sender>, ledger_task_log: Logger, reconciler_task_log: Logger, } @@ -111,6 +114,16 @@ impl ConfigReconcilerHandle { base_log, ); + // Spawn the task that manages dump devices. 
+ let (external_disks_tx, external_disks_rx) = + watch::channel(HashSet::new()); + dump_setup_task::spawn( + internal_disks_rx.clone(), + external_disks_rx, + Arc::clone(&mount_config), + base_log, + ); + let (reconciler_result_tx, reconciler_result_rx) = watch::channel(ReconcilerResult::default()); let (currently_managed_zpools_tx, currently_managed_zpools_rx) = @@ -142,6 +155,7 @@ impl ConfigReconcilerHandle { time_sync_config, reconciler_result_tx, currently_managed_zpools_tx, + external_disks_tx, ledger_task_log: base_log .new(slog::o!("component" => "SledConfigLedgerTask")), reconciler_task_log: base_log @@ -164,7 +178,6 @@ impl ConfigReconcilerHandle { U: SledAgentArtifactStore, >( &self, - underlay_vnic: EtherstubVnic, sled_agent_facilities: T, sled_agent_artifact_store: U, token: ConfigReconcilerSpawnToken, @@ -174,6 +187,7 @@ impl ConfigReconcilerHandle { time_sync_config, reconciler_result_tx, currently_managed_zpools_tx, + external_disks_tx, ledger_task_log, reconciler_task_log, } = token; @@ -198,12 +212,13 @@ impl ConfigReconcilerHandle { } reconciler_task::spawn( + Arc::clone(self.internal_disks_rx.mount_config()), key_requester, time_sync_config, - underlay_vnic, current_config_rx, reconciler_result_tx, currently_managed_zpools_tx, + external_disks_tx, sled_agent_facilities, reconciler_task_log, ); diff --git a/sled-agent/config-reconciler/src/internal_disks.rs b/sled-agent/config-reconciler/src/internal_disks.rs index ca04f30b2ac..e2630aa4d1b 100644 --- a/sled-agent/config-reconciler/src/internal_disks.rs +++ b/sled-agent/config-reconciler/src/internal_disks.rs @@ -154,6 +154,10 @@ impl InternalDisksReceiver { ) } + pub(crate) fn mount_config(&self) -> &Arc { + &self.mount_config + } + fn spawn_with_disk_adopter( mount_config: Arc, raw_disks_rx: watch::Receiver>, diff --git a/sled-agent/config-reconciler/src/lib.rs b/sled-agent/config-reconciler/src/lib.rs index e42c994adca..d5f6a67d1a4 100644 --- a/sled-agent/config-reconciler/src/lib.rs +++ 
b/sled-agent/config-reconciler/src/lib.rs @@ -51,6 +51,7 @@ mod dataset_serialization_task; mod disks_common; +mod dump_setup_task; mod handle; mod internal_disks; mod ledger; diff --git a/sled-agent/config-reconciler/src/reconciler_task.rs b/sled-agent/config-reconciler/src/reconciler_task.rs index 5f082dada1c..3c91c60e674 100644 --- a/sled-agent/config-reconciler/src/reconciler_task.rs +++ b/sled-agent/config-reconciler/src/reconciler_task.rs @@ -6,11 +6,13 @@ use chrono::DateTime; use chrono::Utc; -use illumos_utils::dladm::EtherstubVnic; use illumos_utils::zpool::PathInPool; use key_manager::StorageKeyRequester; use nexus_sled_agent_shared::inventory::OmicronSledConfig; +use sled_storage::config::MountConfig; +use sled_storage::disk::Disk; use slog::Logger; +use std::collections::HashSet; use std::sync::Arc; use std::time::Duration; use std::time::Instant; @@ -23,6 +25,8 @@ use crate::sled_agent_facilities::SledAgentFacilities; mod external_disks; mod zones; +use self::external_disks::ExternalDisks; + pub use self::external_disks::CurrentlyManagedZpools; pub use self::external_disks::CurrentlyManagedZpoolsReceiver; pub use self::zones::TimeSyncError; @@ -30,23 +34,29 @@ pub use self::zones::TimeSyncStatus; #[allow(clippy::too_many_arguments)] pub(crate) fn spawn( + mount_config: Arc, key_requester: StorageKeyRequester, time_sync_config: TimeSyncConfig, - underlay_vnic: EtherstubVnic, current_config_rx: watch::Receiver, reconciler_result_tx: watch::Sender, currently_managed_zpools_tx: watch::Sender>, + external_disks_tx: watch::Sender>, sled_agent_facilities: T, log: Logger, ) { + let external_disks = ExternalDisks::new( + mount_config, + currently_managed_zpools_tx, + external_disks_tx, + ); + tokio::spawn( ReconcilerTask { key_requester, time_sync_config, - underlay_vnic, current_config_rx, reconciler_result_tx, - currently_managed_zpools_tx, + external_disks, sled_agent_facilities, log, } @@ -123,16 +133,11 @@ struct LatestReconcilerTaskResultInner { struct 
ReconcilerTask { key_requester: StorageKeyRequester, time_sync_config: TimeSyncConfig, - underlay_vnic: EtherstubVnic, current_config_rx: watch::Receiver, reconciler_result_tx: watch::Sender, - currently_managed_zpools_tx: watch::Sender>, + external_disks: ExternalDisks, sled_agent_facilities: T, log: Logger, - // TODO where do we want to do dump setup? Needs both internal and external - // disks. Maybe this task, or maybe a task just for dump setup? - // Invokes dumpadm(8) and savecore(8) when new disks are encountered - // dump_setup: DumpSetup, } impl ReconcilerTask { diff --git a/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs b/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs index ecb0000f187..5aa425b24f0 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs @@ -27,6 +27,7 @@ use slog::info; use slog::warn; use slog_error_chain::InlineErrorChain; use std::collections::BTreeSet; +use std::collections::HashSet; use std::future::Future; use std::sync::Arc; use tokio::sync::watch; @@ -163,36 +164,59 @@ impl CurrentlyManagedZpoolsReceiver { pub(super) struct ExternalDisks { disks: IdMap, mount_config: Arc, + + // Output channel for the set of zpools we're managing. Used by sled-agent + // generally to decide when to _stop_ something (e.g., stopping instances + // that were running on a zpool that's no longer available). currently_managed_zpools_tx: watch::Sender>, + + // Output channel for the raw disks we're managing. This is only consumed + // within this crate by `DumpSetupTask` (for managing dump devices). 
+ external_disks_tx: watch::Sender>, } impl ExternalDisks { pub(super) fn new( mount_config: Arc, currently_managed_zpools_tx: watch::Sender>, + external_disks_tx: watch::Sender>, ) -> Self { Self { disks: IdMap::default(), mount_config, currently_managed_zpools_tx, + external_disks_tx, } } - fn update_currently_managed_zpools(&self) { - let new_zpools = self + fn update_output_watch_channels(&self) { + let current_disks = self .disks .iter() .filter_map(|disk| match &disk.state { - DiskState::Managed(disk) => Some(*disk.zpool_name()), + DiskState::Managed(disk) => Some(disk.clone()), DiskState::FailedToManage(_) => None, }) + .collect::>(); + let current_zpools = current_disks + .iter() + .map(|disk| *disk.zpool_name()) .collect::>(); + self.external_disks_tx.send_if_modified(|disks| { + if *disks == current_disks { + false + } else { + *disks = current_disks; + true + } + }); + self.currently_managed_zpools_tx.send_if_modified(|zpools| { - if zpools.0 == new_zpools { + if zpools.0 == current_zpools { false } else { - *zpools = Arc::new(CurrentlyManagedZpools(new_zpools)); + *zpools = Arc::new(CurrentlyManagedZpools(current_zpools)); true } }); @@ -253,7 +277,7 @@ impl ExternalDisks { // can save a bit of work by skipping it in the common case of "no disks // were removed".) 
if !disk_ids_to_remove.is_empty() || marked_disk_not_found { - self.update_currently_managed_zpools(); + self.update_output_watch_channels(); } } @@ -340,7 +364,7 @@ impl ExternalDisks { self.disks.insert(disk_state); } - self.update_currently_managed_zpools(); + self.update_output_watch_channels(); } async fn try_ensure_disk_managed( @@ -662,9 +686,11 @@ mod tests { let logctx = dev::test_setup_log("internal_disks_are_rejected"); let (currently_managed_zpools_tx, _rx) = watch::channel(Arc::default()); + let (external_disks_tx, _rx) = watch::channel(HashSet::default()); let mut external_disks = ExternalDisks::new( nonexistent_mount_config(), currently_managed_zpools_tx, + external_disks_tx, ); // There should be no disks to start. @@ -772,9 +798,11 @@ mod tests { let logctx = dev::test_setup_log("fail_if_disk_not_present"); let (currently_managed_zpools_tx, _rx) = watch::channel(Arc::default()); + let (external_disks_tx, _rx) = watch::channel(HashSet::default()); let mut external_disks = ExternalDisks::new( nonexistent_mount_config(), currently_managed_zpools_tx, + external_disks_tx, ); // There should be no disks to start. @@ -857,9 +885,11 @@ mod tests { let logctx = dev::test_setup_log("firmware_updates_are_propagated"); let (currently_managed_zpools_tx, _rx) = watch::channel(Arc::default()); + let (external_disks_tx, _rx) = watch::channel(HashSet::default()); let mut external_disks = ExternalDisks::new( nonexistent_mount_config(), currently_managed_zpools_tx, + external_disks_tx, ); // There should be no disks to start. @@ -977,9 +1007,11 @@ mod tests { let logctx = dev::test_setup_log("remove_disks_not_in_config"); let (currently_managed_zpools_tx, _rx) = watch::channel(Arc::default()); + let (external_disks_tx, _rx) = watch::channel(HashSet::default()); let mut external_disks = ExternalDisks::new( nonexistent_mount_config(), currently_managed_zpools_tx, + external_disks_tx, ); // There should be no disks to start.