sled-agent: Fix races when starting switch zone in a4x2 (#9297)

jgallagher · web-flow · commit 2a97eedd0b8c · 2025-11-05T11:52:40.000-05:00
Yesterday @internet-diglett and I were looking at some weird a4x2 failures where sled-agent successfully started the switch zone but failed to configure uplinks within it. This PR fixes a race condition and a subsequent logic bug which together were causing that failure. I'm not sure if it's possible to hit this race condition on real hardware. I tried going over a real sled-agent startup log to figure out if we just happened to start up the switch zone "fast enough", or if something in the real startup path was (implicitly) blocked on that setup being done. I _think_ it's the latter but don't have great confidence in that; this is based on comparing timestamps of logs and things that appear backed up waiting on mutexes held during the whole switch zone setup process. All of this is pretty gnarly; we have multiple issues discussing the need for some rework here anyway, but this is yet another spot fix to unblock active work. --- Race condition: If we get our underlay info while we're still starting up the switch zone, we don't inform the task doing that startup about it, and therefore it doesn't attempt to configure uplinks. In the "swapping out the request" path, we actually had `Some(underlay_info)`, but were discarding it: it's not stored in `request` or `new_request` - we only passed it as an argument to `start_switch_zone`. This is fixed by moving the `underlay_info` into `request` instead of passing it as a function argument. Now when we swap out the request, the task running to perform initialization has access to the `underlay_info` and will attempt to configure uplinks.
--- Logic bug: Once we fixed the above, we saw the "ensure switch zone uplinks" worker stop after a single attempt, as though it had been told to. This is because inside `try_initialize_switch_zone()` itself, the last thing it does before returning is change the state from `::Initializing` to `::Running`, _with no `worker` task_: https://github.com/oxidecomputer/omicron/blob/d743754bb4be24228e9e042ce5262c242d4fd079/sled-agent/src/services.rs#L4006-L4010 : This causes `exit_tx` to be dropped, which causes `ensure_switch_zone_uplinks_configured_loop()` to bail out after a single attempt, as we see in the logs above. This is fixed by moving the worker task into the `::Running` state instead of dropping it. (The `::Running` state can have a non-`None` worker if we reconfigure the switch zone, so the supporting code already expects this to be present sometimes, and knows to stop the task when appropriate.) This bug was _mostly_ introduced by the above (not fully correct!) change to fix the race condition: prior to that change, the `::Initializing` state never had the `underlay_info` in it anyway, so `ensure_switch_zone_uplinks_configured_loop()` wouldn't have even been called.
diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs
@@ -443,6 +443,7 @@ struct SwitchZoneConfig {
     id: Uuid,
     addresses: Vec<Ipv6Addr>,
     services: Vec<SwitchService>,
+    underlay_info: Option<UnderlayInfo>,
 }
 
 /// Describes one of several services that may be deployed in a switch zone
@@ -2444,7 +2445,7 @@ impl ServiceManager {
         bootstrap_name_and_address: Option<(String, Ipv6Addr)>,
         device_names: &[String],
     ) -> Result<RunningZone, Error> {
-        let SwitchZoneConfig { id, services, addresses } = config;
+        let SwitchZoneConfig { id, services, addresses, .. } = config;
 
         let disabled_dns_client_service =
             ServiceBuilder::new("network/dns/client")
@@ -3265,16 +3266,14 @@ impl ServiceManager {
         };
         addresses.push(Ipv6Addr::LOCALHOST);
 
-        let request =
-            SwitchZoneConfig { id: Uuid::new_v4(), addresses, services };
-
-        self.ensure_switch_zone(
-            Some(request),
-            filesystems,
-            data_links,
+        let request = SwitchZoneConfig {
+            id: Uuid::new_v4(),
+            addresses,
+            services,
             underlay_info,
-        )
-        .await?;
+        };
+
+        self.ensure_switch_zone(Some(request), filesystems, data_links).await?;
 
         Ok(())
     }
@@ -3494,8 +3493,6 @@ impl ServiceManager {
             vec![],
             // data_links=
             vec![],
-            // underlay_info=
-            None,
         )
         .await
     }
@@ -3514,7 +3511,6 @@ impl ServiceManager {
         request: SwitchZoneConfig,
         filesystems: Vec<zone::Fs>,
         data_links: Vec<String>,
-        underlay_info: Option<UnderlayInfo>,
     ) {
         let (exit_tx, exit_rx) = oneshot::channel();
         *zone = SwitchZoneState::Initializing {
@@ -3524,8 +3520,7 @@ impl ServiceManager {
             worker: Some(Task {
                 exit_tx,
                 initializer: tokio::task::spawn(async move {
-                    self.initialize_switch_zone_loop(underlay_info, exit_rx)
-                        .await
+                    self.initialize_switch_zone_loop(exit_rx).await
                 }),
             }),
         };
@@ -3537,7 +3532,6 @@ impl ServiceManager {
         request: Option<SwitchZoneConfig>,
         filesystems: Vec<zone::Fs>,
         data_links: Vec<String>,
-        underlay_info: Option<UnderlayInfo>,
     ) -> Result<(), Error> {
         let log = &self.inner.log;
 
@@ -3552,7 +3546,6 @@ impl ServiceManager {
                     request,
                     filesystems,
                     data_links,
-                    underlay_info,
                 );
             }
             (
@@ -3918,7 +3911,7 @@ impl ServiceManager {
 
                 // We also need to ensure any uplinks are configured. Spawn a
                 // task that goes into an infinite retry loop until it succeeds.
-                if let Some(underlay_info) = underlay_info {
+                if let Some(underlay_info) = request.underlay_info.clone() {
                     if let Some(old_worker) = worker.take() {
                         old_worker.stop().await;
                     }
@@ -3978,15 +3971,15 @@ impl ServiceManager {
     async fn try_initialize_switch_zone(
         &self,
         sled_zone: &mut SwitchZoneState,
-    ) -> Result<(), Error> {
+    ) -> Result<Option<UnderlayInfo>, Error> {
         let SwitchZoneState::Initializing {
             request,
             filesystems,
             data_links,
-            ..
-        } = &*sled_zone
+            worker,
+        } = sled_zone
         else {
-            return Ok(());
+            return Ok(None);
         };
 
         // The switch zone must use the ramdisk in order to receive requests
@@ -4003,19 +3996,28 @@ impl ServiceManager {
         let zone = self
             .initialize_zone(zone_args, zone_root_path, filesystems, data_links)
             .await?;
+        let underlay_info = request.underlay_info.clone();
+
+        // Even though we've initialized the zone, the `worker` task may still
+        // be running to configure uplinks. If we drop `worker` now it will
+        // cause that task to exit before it gets a chance to do so. This is all
+        // very unsatisfying and needs some serious rework:
+        // https://github.com/oxidecomputer/omicron/issues/8970 and
+        // https://github.com/oxidecomputer/omicron/issues/9182 are strongly
+        // related.
+        let worker = worker.take();
         *sled_zone = SwitchZoneState::Running {
             request: request.clone(),
             zone: Box::new(zone),
-            worker: None,
+            worker,
         };
-        Ok(())
+        Ok(underlay_info)
     }
 
     // Body of a tokio task responsible for running until the switch zone is
     // inititalized, or it has been told to stop.
     async fn initialize_switch_zone_loop(
         &self,
-        underlay_info: Option<UnderlayInfo>,
         mut exit_rx: oneshot::Receiver<()>,
     ) {
         // We don't really expect failures trying to initialize the switch zone
@@ -4025,13 +4027,25 @@ impl ServiceManager {
 
         // First, go into a loop to bring up the switch zone; retry until we
         // succeed or are told to give up via `exit_rx`.
-        loop {
+        let underlay_info = loop {
             {
                 let mut sled_zone = self.inner.switch_zone.lock().await;
                 match self.try_initialize_switch_zone(&mut sled_zone).await {
-                    Ok(()) => {
-                        info!(self.inner.log, "initialized switch zone");
-                        break;
+                    Ok(None) => {
+                        info!(
+                            self.inner.log,
+                            "initialized switch zone \
+                             (no underlay info available yet)",
+                        );
+                        return;
+                    }
+                    Ok(Some(underlay_info)) => {
+                        info!(
+                            self.inner.log,
+                            "initialized switch zone (underlay info \
+                             available: will attempt uplink configuration)",
+                        );
+                        break underlay_info;
                     }
                     Err(e) => {
                         warn!(
@@ -4060,17 +4074,15 @@ impl ServiceManager {
                     continue;
                 }
             };
-        }
+        };
 
-        // Then, if we have underlay info, go into a loop trying to configure
-        // our uplinks. As above, retry until we succeed or are told to stop.
-        if let Some(underlay_info) = underlay_info {
-            self.ensure_switch_zone_uplinks_configured_loop(
-                &underlay_info,
-                exit_rx,
-            )
-            .await;
-        }
+        // Then go into a loop trying to configure our uplinks. As above, retry
+        // until we succeed or are told to stop.
+        self.ensure_switch_zone_uplinks_configured_loop(
+            &underlay_info,
+            exit_rx,
+        )
+        .await;
     }
 }