diff --git a/dev-tools/omdb/src/bin/omdb/nexus/chicken_switches.rs b/dev-tools/omdb/src/bin/omdb/nexus/chicken_switches.rs
index 65b39031241..d261789c4b0 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus/chicken_switches.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus/chicken_switches.rs
@@ -6,6 +6,7 @@
 use crate::Omdb;
 use crate::check_allow_destructive::DestructiveOperationToken;
+use clap::ArgAction;
 use clap::Args;
 use clap::Subcommand;
 use http::StatusCode;
@@ -33,6 +34,7 @@ pub enum ChickenSwitchesCommands {
 
 #[derive(Debug, Clone, Args)]
 pub struct ChickenSwitchesSetArgs {
+    #[clap(long, action=ArgAction::Set)]
     planner_enabled: bool,
 }
 
@@ -100,7 +102,13 @@ async fn chicken_switches_show(
             println!(" modified time: {time_modified}");
             println!(" planner enabled: {planner_enabled}");
         }
-        Err(err) => eprintln!("error: {:#}", err),
+        Err(err) => {
+            if err.status() == Some(StatusCode::NOT_FOUND) {
+                println!("No chicken switches enabled");
+            } else {
+                eprintln!("error: {:#}", err)
+            }
+        }
     }
 
     Ok(())
diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out
index ed51880588f..846c7d5700b 100644
--- a/dev-tools/omdb/tests/env.out
+++ b/dev-tools/omdb/tests/env.out
@@ -56,6 +56,10 @@ task: "blueprint_rendezvous"
     owned rendezvous tables that other subsystems consume
 
 
+task: "chicken_switches_watcher"
+    watch db for chicken switch changes
+
+
 task: "crdb_node_id_collector"
     Collects node IDs of running CockroachDB zones
 
@@ -260,6 +264,10 @@ task: "blueprint_rendezvous"
     owned rendezvous tables that other subsystems consume
 
 
+task: "chicken_switches_watcher"
+    watch db for chicken switch changes
+
+
 task: "crdb_node_id_collector"
     Collects node IDs of running CockroachDB zones
 
@@ -451,6 +459,10 @@ task: "blueprint_rendezvous"
     owned rendezvous tables that other subsystems consume
 
 
+task: "chicken_switches_watcher"
+    watch db for chicken switch changes
+
+
 task: "crdb_node_id_collector"
     Collects node IDs of running CockroachDB zones
 
diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out
index 8a92d094ec8..7e05abead4c 100644
--- a/dev-tools/omdb/tests/successes.out
+++ b/dev-tools/omdb/tests/successes.out
@@ -268,6 +268,10 @@ task: "blueprint_rendezvous"
     owned rendezvous tables that other subsystems consume
 
 
+task: "chicken_switches_watcher"
+    watch db for chicken switch changes
+
+
 task: "crdb_node_id_collector"
     Collects node IDs of running CockroachDB zones
 
@@ -543,6 +547,13 @@ task: "blueprint_rendezvous"
     started at (s ago) and ran for ms
     last completion reported error: no blueprint
 
+task: "chicken_switches_watcher"
+  configured period: every s
+  currently executing: no
+  last completed activation: , triggered by a periodic timer firing
+    started at (s ago) and ran for ms
+warning: unknown background task: "chicken_switches_watcher" (don't know how to interpret details: Object {"chicken_switches_updated": Bool(false)})
+
 task: "crdb_node_id_collector"
   configured period: every m
   currently executing: no
@@ -1083,6 +1094,13 @@ task: "blueprint_rendezvous"
     started at (s ago) and ran for ms
     last completion reported error: no blueprint
 
+task: "chicken_switches_watcher"
+  configured period: every s
+  currently executing: no
+  last completed activation: , triggered by a periodic timer firing
+    started at (s ago) and ran for ms
+warning: unknown background task: "chicken_switches_watcher" (don't know how to interpret details: Object {"chicken_switches_updated": Bool(false)})
+
 task: "crdb_node_id_collector"
   configured period: every m
   currently executing: no
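
Note on the `--planner-enabled` argument added to `ChickenSwitchesSetArgs` above: with clap's derive API, a bare `bool` field normally becomes a presence-only flag, so `action = ArgAction::Set` is what makes the option take an explicit `true`/`false` value on the command line. A minimal standalone sketch of that pattern (the `SetArgs` struct and binary here are illustrative, not part of omdb):

    // Hypothetical standalone example of the ArgAction::Set pattern above.
    use clap::{ArgAction, Parser};

    #[derive(Debug, Parser)]
    struct SetArgs {
        /// Requires an explicit value: `--planner-enabled true` or
        /// `--planner-enabled false`.
        #[clap(long, action = ArgAction::Set)]
        planner_enabled: bool,
    }

    fn main() {
        let args = SetArgs::parse();
        println!("planner_enabled = {}", args.planner_enabled);
    }

The same flag is what the updated documentation below refers to when it mentions enabling the planner through `omdb nexus chicken-switches`.
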
diff --git a/docs/reconfigurator.adoc b/docs/reconfigurator.adoc
index 9aa3fd1d2f9..ea838a06d6f 100644
--- a/docs/reconfigurator.adoc
+++ b/docs/reconfigurator.adoc
@@ -175,7 +175,7 @@ We're being cautious about rolling out that kind of automation. Instead, today,
 `omdb` uses the Nexus internal API to do these things. Since this can only be done using `omdb`, Reconfigurator can really only be used by Oxide engineering and support, not customers.
 
-The planner background task is currently disabled by default, but can be enabled by setting the Nexus configuration option `blueprints.disable_planner = false`. To get to the long term vision where the system is doing all this on its own in response to operator input, we'll need to get confidence that continually executing the planner will have no ill effects on working systems. This might involve more operational experience with it, more safeties, and tools for pausing execution, previewing what it _would_ do, etc.
+The planner background task is currently disabled by default, but can be enabled via `omdb nexus chicken-switches --planner-enabled`. To get to the long term vision where the system is doing all this on its own in response to operator input, we'll need to get confidence that continually executing the planner will have no ill effects on working systems. This might involve more operational experience with it, more safeties, and tools for pausing execution, previewing what it _would_ do, etc.
 
 == Design patterns
diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs
index f08881f0970..a2d34816ca2 100644
--- a/nexus-config/src/nexus_config.rs
+++ b/nexus-config/src/nexus_config.rs
@@ -594,9 +594,6 @@ pub struct PhantomDiskConfig {
 #[serde_as]
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
 pub struct BlueprintTasksConfig {
-    /// background planner chicken switch
-    pub disable_planner: bool,
-
     /// period (in seconds) for periodic activations of the background task that
     /// reads the latest target blueprint from the database
     #[serde_as(as = "DurationSeconds")]
     pub period_secs_load: Duration,
@@ -622,6 +619,11 @@ pub struct BlueprintTasksConfig {
     /// collects the node IDs of CockroachDB zones
     #[serde_as(as = "DurationSeconds")]
     pub period_secs_collect_crdb_node_ids: Duration,
+
+    /// period (in seconds) for periodic activations of the background task that
+    /// reads chicken switches from the database
+    #[serde_as(as = "DurationSeconds")]
+    pub period_secs_load_chicken_switches: Duration,
 }
 
 #[serde_as]
@@ -1079,12 +1081,12 @@ mod test {
             physical_disk_adoption.period_secs = 30
             decommissioned_disk_cleaner.period_secs = 30
             phantom_disks.period_secs = 30
-            blueprints.disable_planner = true
             blueprints.period_secs_load = 10
             blueprints.period_secs_plan = 60
             blueprints.period_secs_execute = 60
             blueprints.period_secs_rendezvous = 300
             blueprints.period_secs_collect_crdb_node_ids = 180
+            blueprints.period_secs_load_chicken_switches= 5
             sync_service_zone_nat.period_secs = 30
             switch_port_settings_manager.period_secs = 30
             region_replacement.period_secs = 30
@@ -1247,13 +1249,14 @@ mod test {
                     period_secs: Duration::from_secs(30),
                 },
                 blueprints: BlueprintTasksConfig {
-                    disable_planner: true,
                     period_secs_load: Duration::from_secs(10),
                     period_secs_plan: Duration::from_secs(60),
                     period_secs_execute: Duration::from_secs(60),
                     period_secs_collect_crdb_node_ids: Duration::from_secs(180),
                     period_secs_rendezvous: Duration::from_secs(300),
+                    period_secs_load_chicken_switches:
+                        Duration::from_secs(5)
                 },
                 sync_service_zone_nat: SyncServiceZoneNatConfig {
                     period_secs: Duration::from_secs(30)
@@ -1396,12 +1399,12 @@ mod test {
             physical_disk_adoption.period_secs = 30
             decommissioned_disk_cleaner.period_secs = 30
             phantom_disks.period_secs = 30
-            blueprints.disable_planner = true
             blueprints.period_secs_load = 10
             blueprints.period_secs_plan = 60
             blueprints.period_secs_execute = 60
             blueprints.period_secs_rendezvous = 300
             blueprints.period_secs_collect_crdb_node_ids = 180
+            blueprints.period_secs_load_chicken_switches= 5
             sync_service_zone_nat.period_secs = 30
             switch_port_settings_manager.period_secs = 30
             region_replacement.period_secs = 30
@@ -1424,6 +1427,7 @@ mod test {
             alert_dispatcher.period_secs = 42
             webhook_deliverator.period_secs = 43
             sp_ereport_ingester.period_secs = 44
+
             [default_region_allocation_strategy]
             type = "random"
             "##,
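
The new `period_secs_load_chicken_switches` knob uses the same `serde_with` mechanism as the other blueprint periods: the TOML side is a plain integer count of seconds, and `DurationSeconds` turns it into a `std::time::Duration` during deserialization. A reduced, standalone sketch of that conversion (the struct and field names are illustrative, not the real `BlueprintTasksConfig`):

    // Standalone sketch: how `period_secs_load_chicken_switches = 5` becomes
    // a Duration via serde_with. Not the real Nexus config type.
    use serde::Deserialize;
    use serde_with::{serde_as, DurationSeconds};
    use std::time::Duration;

    #[serde_as]
    #[derive(Debug, Deserialize)]
    struct BlueprintPeriods {
        #[serde_as(as = "DurationSeconds<u64>")]
        period_secs_load_chicken_switches: Duration,
    }

    fn main() {
        let parsed: BlueprintPeriods =
            toml::from_str("period_secs_load_chicken_switches = 5").unwrap();
        assert_eq!(
            parsed.period_secs_load_chicken_switches,
            Duration::from_secs(5)
        );
    }
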
diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs
index 328f703e1a9..5e15186b0d6 100644
--- a/nexus/background-task-interface/src/init.rs
+++ b/nexus/background-task-interface/src/init.rs
@@ -48,6 +48,7 @@ pub struct BackgroundTasks {
     pub task_alert_dispatcher: Activator,
     pub task_webhook_deliverator: Activator,
     pub task_sp_ereport_ingester: Activator,
+    pub task_chicken_switches_loader: Activator,
 
     // Handles to activate background tasks that do not get used by Nexus
     // at-large. These background tasks are implementation details as far as
diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml
index 5a3f201b99f..76d70e9f514 100644
--- a/nexus/examples/config-second.toml
+++ b/nexus/examples/config-second.toml
@@ -118,12 +118,12 @@ phantom_disks.period_secs = 30
 physical_disk_adoption.period_secs = 30
 support_bundle_collector.period_secs = 30
 decommissioned_disk_cleaner.period_secs = 60
-blueprints.disable_planner = true
 blueprints.period_secs_load = 10
 blueprints.period_secs_plan = 60
 blueprints.period_secs_execute = 60
 blueprints.period_secs_rendezvous = 300
 blueprints.period_secs_collect_crdb_node_ids = 180
+blueprints.period_secs_load_chicken_switches = 5
 sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30
diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml
index 43e2ba421f0..cc9e885b8fa 100644
--- a/nexus/examples/config.toml
+++ b/nexus/examples/config.toml
@@ -104,12 +104,12 @@ phantom_disks.period_secs = 30
 physical_disk_adoption.period_secs = 30
 support_bundle_collector.period_secs = 30
 decommissioned_disk_cleaner.period_secs = 60
-blueprints.disable_planner = true
 blueprints.period_secs_load = 10
 blueprints.period_secs_plan = 60
 blueprints.period_secs_execute = 60
 blueprints.period_secs_rendezvous = 300
 blueprints.period_secs_collect_crdb_node_ids = 180
+blueprints.period_secs_load_chicken_switches = 5
 sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs
index 25e52804c0c..d4860b0b1c5 100644
--- a/nexus/src/app/background/init.rs
+++ b/nexus/src/app/background/init.rs
@@ -96,6 +96,7 @@ use super::tasks::blueprint_execution;
 use super::tasks::blueprint_load;
 use super::tasks::blueprint_planner;
 use super::tasks::blueprint_rendezvous;
+use super::tasks::chicken_switches::ChickenSwitchesLoader;
 use super::tasks::crdb_node_id_collector;
 use super::tasks::decommissioned_disk_cleaner;
 use super::tasks::dns_config;
@@ -230,6 +231,7 @@ impl BackgroundTasksInitializer {
             task_alert_dispatcher: Activator::new(),
             task_webhook_deliverator: Activator::new(),
             task_sp_ereport_ingester: Activator::new(),
+            task_chicken_switches_loader: Activator::new(),
 
             task_internal_dns_propagation: Activator::new(),
             task_external_dns_propagation: Activator::new(),
@@ -306,6 +308,7 @@ impl BackgroundTasksInitializer {
             task_alert_dispatcher,
             task_webhook_deliverator,
             task_sp_ereport_ingester,
+            task_chicken_switches_loader,
             // Add new background tasks here. Be sure to use this binding in a
             // call to `Driver::register()` below. That's what actually wires
             // up the Activator to the corresponding background task.
@@ -476,13 +479,26 @@ impl BackgroundTasksInitializer {
             inventory_watcher
         };
 
+        let chicken_switches_loader =
+            ChickenSwitchesLoader::new(datastore.clone());
+        let chicken_switches_watcher = chicken_switches_loader.watcher();
+        driver.register(TaskDefinition {
+            name: "chicken_switches_watcher",
+            description: "watch db for chicken switch changes",
+            period: config.blueprints.period_secs_load_chicken_switches,
+            task_impl: Box::new(chicken_switches_loader),
+            opctx: opctx.child(BTreeMap::new()),
+            watchers: vec![],
+            activator: task_chicken_switches_loader,
+        });
+
         // Background task: blueprint planner
         //
         // Replans on inventory collection and changes to the current
         // target blueprint.
         let blueprint_planner = blueprint_planner::BlueprintPlanner::new(
             datastore.clone(),
-            config.blueprints.disable_planner,
+            chicken_switches_watcher.clone(),
             inventory_watcher.clone(),
             rx_blueprint.clone(),
         );
@@ -496,6 +512,7 @@ impl BackgroundTasksInitializer {
             watchers: vec![
                 Box::new(inventory_watcher.clone()),
                 Box::new(rx_blueprint.clone()),
+                Box::new(chicken_switches_watcher),
             ],
             activator: task_blueprint_planner,
         });
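
The registration above follows the existing watch-channel pattern in this initializer: the loader task owns the `watch::Sender`, `watcher()` hands out cheap `Receiver` clones, and listing the receiver in the planner's `watchers` vec is what re-activates the planner when the switches change. A reduced sketch of that producer/consumer relationship using plain tokio (the `Switches` struct stands in for `ReconfiguratorChickenSwitches`; the task structure is illustrative, not the Nexus driver):

    // Reduced sketch: one task publishes switch changes over a watch channel,
    // the other wakes only when the value actually changes.
    use tokio::sync::watch;

    #[derive(Clone, Debug, Default, PartialEq, Eq)]
    struct Switches {
        planner_enabled: bool,
    }

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = watch::channel(Switches::default());

        // "Planner" side: wait for a change, then read the latest value.
        let planner = tokio::spawn(async move {
            rx.changed().await.unwrap();
            assert!(rx.borrow_and_update().planner_enabled);
        });

        // "Loader" side: publish only if something actually changed, which
        // mirrors the send_if_modified call in ChickenSwitchesLoader below.
        let updated = tx.send_if_modified(|s| {
            if !s.planner_enabled {
                s.planner_enabled = true;
                true
            } else {
                false
            }
        });
        assert!(updated);

        planner.await.unwrap();
    }
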
diff --git a/nexus/src/app/background/tasks/blueprint_planner.rs b/nexus/src/app/background/tasks/blueprint_planner.rs
index 4e0b0e3fb1a..cff311899b1 100644
--- a/nexus/src/app/background/tasks/blueprint_planner.rs
+++ b/nexus/src/app/background/tasks/blueprint_planner.rs
@@ -12,6 +12,7 @@ use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
 use nexus_reconfigurator_planning::planner::Planner;
 use nexus_reconfigurator_preparation::PlanningInputFromDb;
+use nexus_types::deployment::ReconfiguratorChickenSwitches;
 use nexus_types::deployment::{Blueprint, BlueprintTarget};
 use nexus_types::internal_api::background::BlueprintPlannerStatus;
 use omicron_common::api::external::LookupType;
@@ -24,7 +25,7 @@ use tokio::sync::watch::{self, Receiver, Sender};
 /// Background task that runs the update planner.
 pub struct BlueprintPlanner {
     datastore: Arc<DataStore>,
-    disabled: bool,
+    rx_chicken_switches: Receiver<ReconfiguratorChickenSwitches>,
     rx_inventory: Receiver<Option<CollectionUuid>>,
     rx_blueprint: Receiver<Option<Arc<(BlueprintTarget, Blueprint)>>>,
     tx_blueprint: Sender<Option<Arc<(BlueprintTarget, Blueprint)>>>,
 }
@@ -33,12 +34,18 @@ impl BlueprintPlanner {
     pub fn new(
         datastore: Arc<DataStore>,
-        disabled: bool,
+        rx_chicken_switches: Receiver<ReconfiguratorChickenSwitches>,
         rx_inventory: Receiver<Option<CollectionUuid>>,
         rx_blueprint: Receiver<Option<Arc<(BlueprintTarget, Blueprint)>>>,
     ) -> Self {
         let (tx_blueprint, _) = watch::channel(None);
-        Self { datastore, disabled, rx_inventory, rx_blueprint, tx_blueprint }
+        Self {
+            datastore,
+            rx_chicken_switches,
+            rx_inventory,
+            rx_blueprint,
+            tx_blueprint,
+        }
     }
 
     pub fn watcher(
         &self,
     ) -> Receiver<Option<Arc<(BlueprintTarget, Blueprint)>>> {
@@ -51,7 +58,8 @@ impl BlueprintPlanner {
     /// If it is different from the current target blueprint,
     /// save it and make it the current target.
     pub async fn plan(&mut self, opctx: &OpContext) -> BlueprintPlannerStatus {
-        if self.disabled {
+        let switches = self.rx_chicken_switches.borrow_and_update().clone();
+        if !switches.planner_enabled {
             debug!(&opctx.log, "blueprint planning disabled, doing nothing");
             return BlueprintPlannerStatus::Disabled;
         }
@@ -251,6 +259,7 @@ mod test {
     use super::*;
     use crate::app::background::tasks::blueprint_load::TargetBlueprintLoader;
     use crate::app::background::tasks::inventory_collection::InventoryCollector;
+    use nexus_inventory::now_db_precision;
     use nexus_test_utils_macros::nexus_test;
 
     type ControlPlaneTestContext =
@@ -291,10 +300,18 @@ mod test {
         let rx_collector = collector.watcher();
         collector.activate(&opctx).await;
 
+        // Enable the planner
+        let (_tx, chicken_switches_collector_rx) =
+            watch::channel(ReconfiguratorChickenSwitches {
+                version: 1,
+                planner_enabled: true,
+                time_modified: now_db_precision(),
+            });
+
         // Finally, spin up the planner background task.
         let mut planner = BlueprintPlanner::new(
             datastore.clone(),
-            false,
+            chicken_switches_collector_rx,
             rx_collector,
             rx_loader.clone(),
         );
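
One detail worth calling out in the `plan()` change above: the switches are cloned out of `borrow_and_update()` into a plain local before any `.await`, because the returned guard borrows the channel and should not be held across suspension points. A standalone illustration of that shape (the `Switches` type and `plan_once` function are stand-ins, not the real planner):

    // Standalone sketch of the borrow-then-clone pattern used by plan().
    use tokio::sync::watch;

    #[derive(Clone, Debug, Default)]
    struct Switches {
        planner_enabled: bool,
    }

    async fn plan_once(rx: &mut watch::Receiver<Switches>) -> &'static str {
        // Clone while the guard is alive; drop it before doing async work.
        let switches = rx.borrow_and_update().clone();
        if !switches.planner_enabled {
            return "disabled";
        }
        tokio::task::yield_now().await; // stand-in for real planning work
        "planned"
    }

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = watch::channel(Switches::default());
        assert_eq!(plan_once(&mut rx).await, "disabled");
        tx.send_modify(|s| s.planner_enabled = true);
        assert_eq!(plan_once(&mut rx).await, "planned");
    }

The test above keeps the sender bound as `_tx` for a related reason: dropping it would close the channel, and holding it open is what would let a test flip planning on or off at runtime.
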
diff --git a/nexus/src/app/background/tasks/chicken_switches.rs b/nexus/src/app/background/tasks/chicken_switches.rs
new file mode 100644
index 00000000000..0461cde7304
--- /dev/null
+++ b/nexus/src/app/background/tasks/chicken_switches.rs
@@ -0,0 +1,120 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Runtime configuration for reconfigurator
+
+use crate::app::background::BackgroundTask;
+use anyhow::Context;
+use futures::FutureExt;
+use futures::future::BoxFuture;
+use nexus_auth::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_types::deployment::ReconfiguratorChickenSwitches;
+use serde_json::json;
+use std::sync::Arc;
+use tokio::sync::watch;
+
+/// Background task that tracks reconfigurator chicken switches from the DB
+pub struct ChickenSwitchesLoader {
+    datastore: Arc<DataStore>,
+    tx: watch::Sender<ReconfiguratorChickenSwitches>,
+    rx: watch::Receiver<ReconfiguratorChickenSwitches>,
+}
+
+impl ChickenSwitchesLoader {
+    pub fn new(datastore: Arc<DataStore>) -> Self {
+        let (tx, rx) = watch::channel(ReconfiguratorChickenSwitches::default());
+        Self { datastore, tx, rx }
+    }
+
+    pub fn watcher(&self) -> watch::Receiver<ReconfiguratorChickenSwitches> {
+        self.rx.clone()
+    }
+}
+
+impl BackgroundTask for ChickenSwitchesLoader {
+    fn activate<'a>(
+        &'a mut self,
+        opctx: &'a OpContext,
+    ) -> BoxFuture<'a, serde_json::Value> {
+        async {
+            match self
+                .datastore
+                .reconfigurator_chicken_switches_get_latest(opctx)
+                .await
+                .context("failed to load chicken switches")
+            {
+                Err(error) => {
+                    let message = format!("{:#}", error);
+                    warn!(opctx.log, "chicken switches load failed";
+                        "error" => message.clone());
+                    json!({ "error": message })
+                }
+                Ok(switches) => {
+                    let switches = switches.unwrap_or_default();
+                    let updated = self.tx.send_if_modified(|s| {
+                        if *s != switches {
+                            *s = switches;
+                            return true;
+                        }
+                        false
+                    });
+                    debug!(opctx.log, "chicken switches load complete");
+                    json!({ "chicken_switches_updated": updated })
+                }
+            }
+        }
+        .boxed()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use nexus_test_utils_macros::nexus_test;
+    use nexus_types::deployment::ReconfiguratorChickenSwitchesParam;
+
+    type ControlPlaneTestContext =
+        nexus_test_utils::ControlPlaneTestContext<crate::Server>;
+
+    #[nexus_test(server = crate::Server)]
+    async fn test_basic(cptestctx: &ControlPlaneTestContext) {
+        let nexus = &cptestctx.server.server_context().nexus;
+        let datastore = nexus.datastore();
+        let opctx = OpContext::for_tests(
+            cptestctx.logctx.log.clone(),
+            datastore.clone(),
+        );
+
+        let mut task = ChickenSwitchesLoader::new(datastore.clone());
+        let out = task.activate(&opctx).await;
+        assert_eq!(out["chicken_switches_updated"], false);
+        let switches = ReconfiguratorChickenSwitchesParam {
+            version: 1,
+            planner_enabled: true,
+        };
+        datastore
+            .reconfigurator_chicken_switches_insert_latest_version(
+                &opctx, switches,
+            )
+            .await
+            .unwrap();
+        let out = task.activate(&opctx).await;
+        assert_eq!(out["chicken_switches_updated"], true);
+        let out = task.activate(&opctx).await;
+        assert_eq!(out["chicken_switches_updated"], false);
+        let switches = ReconfiguratorChickenSwitchesParam {
+            version: 2,
+            planner_enabled: false,
+        };
+        datastore
+            .reconfigurator_chicken_switches_insert_latest_version(
+                &opctx, switches,
+            )
+            .await
+            .unwrap();
+        let out = task.activate(&opctx).await;
+        assert_eq!(out["chicken_switches_updated"], true);
+    }
+}
diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs
index 4b2c604482c..3ae34c845bb 100644
--- a/nexus/src/app/background/tasks/mod.rs
+++ b/nexus/src/app/background/tasks/mod.rs
@@ -11,6 +11,7 @@ pub mod blueprint_execution;
 pub mod blueprint_load;
 pub mod blueprint_planner;
 pub mod blueprint_rendezvous;
+pub mod chicken_switches;
 pub mod crdb_node_id_collector;
 pub mod decommissioned_disk_cleaner;
 pub mod dns_config;
diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml
index 8e20850a190..162c6e781bd 100644
--- a/nexus/tests/config.test.toml
+++ b/nexus/tests/config.test.toml
@@ -110,12 +110,12 @@ support_bundle_collector.period_secs = 999999
 decommissioned_disk_cleaner.period_secs = 60
 # Disable disk decommissioning cleanup to avoid interfering with tests.
 decommissioned_disk_cleaner.disable = true
-blueprints.disable_planner = true
 blueprints.period_secs_load = 100
 blueprints.period_secs_plan = 600
 blueprints.period_secs_execute = 600
 blueprints.period_secs_rendezvous = 600
 blueprints.period_secs_collect_crdb_node_ids = 600
+blueprints.period_secs_load_chicken_switches = 5
 sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 60
diff --git a/nexus/types/src/deployment/chicken_switches.rs b/nexus/types/src/deployment/chicken_switches.rs
index 84e2aa4c67a..d54e2d4d004 100644
--- a/nexus/types/src/deployment/chicken_switches.rs
+++ b/nexus/types/src/deployment/chicken_switches.rs
@@ -4,7 +4,7 @@
 
 //! Runtime configuration for reconfigurator
 //!
-use chrono::{DateTime, Utc};
+use chrono::{DateTime, TimeZone, Utc};
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 
@@ -22,3 +22,13 @@
     pub planner_enabled: bool,
     pub time_modified: DateTime<Utc>,
 }
+
+impl Default for ReconfiguratorChickenSwitches {
+    fn default() -> Self {
+        Self {
+            version: 0,
+            planner_enabled: false,
+            time_modified: Utc.with_ymd_and_hms(1970, 1, 1, 0, 1, 1).unwrap(),
+        }
+    }
+}
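
The `Default` impl above gives the loader's watch channel a safe starting value: version 0, planner disabled, and a fixed sentinel timestamp shortly after the Unix epoch. Until the background task reads a real row from the database, consumers therefore see planning as switched off, and the first activation against an empty table reports `chicken_switches_updated: false` because `unwrap_or_default()` yields this same value. A small sketch of what that default guarantees (assuming the impl above):

    // Assumes the Default impl above; shows the planner-off starting state.
    use chrono::{TimeZone, Utc};
    use nexus_types::deployment::ReconfiguratorChickenSwitches;

    fn main() {
        let switches = ReconfiguratorChickenSwitches::default();
        assert_eq!(switches.version, 0);
        assert!(!switches.planner_enabled);
        assert_eq!(
            switches.time_modified,
            Utc.with_ymd_and_hms(1970, 1, 1, 0, 1, 1).unwrap()
        );
    }
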
diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml
index 312e161fd6e..a2bc7352cf4 100644
--- a/smf/nexus/multi-sled/config-partial.toml
+++ b/smf/nexus/multi-sled/config-partial.toml
@@ -52,12 +52,12 @@ phantom_disks.period_secs = 30
 physical_disk_adoption.period_secs = 30
 support_bundle_collector.period_secs = 30
 decommissioned_disk_cleaner.period_secs = 60
-blueprints.disable_planner = true
 blueprints.period_secs_load = 10
 blueprints.period_secs_plan = 60
 blueprints.period_secs_execute = 60
 blueprints.period_secs_rendezvous = 300
 blueprints.period_secs_collect_crdb_node_ids = 180
+blueprints.period_secs_load_chicken_switches = 5
 sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index fc2d145561d..b6a29f28a82 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -52,12 +52,12 @@ phantom_disks.period_secs = 30
 physical_disk_adoption.period_secs = 30
 support_bundle_collector.period_secs = 30
 decommissioned_disk_cleaner.period_secs = 60
-blueprints.disable_planner = true
 blueprints.period_secs_load = 10
 blueprints.period_secs_plan = 60
 blueprints.period_secs_execute = 60
 blueprints.period_secs_rendezvous = 300
 blueprints.period_secs_collect_crdb_node_ids = 180
+blueprints.period_secs_load_chicken_switches = 5
 sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30