Skip to content

Commit fd9cedf

Browse files
committed
Rework locking to avoid holding instance-map lock for a long time
Rather than making the instance-map lock async, and holding it for the duration of the zone bundle operation, we keep it sync. Instead, clone the instance itself, which is pretty cheap since it's in an `Arc`. This keeps the code simpler and should improve overall throughput. Also makes a small change to the zone bundle filename for zone-specific commands, to ensure they don't collide.
1 parent a1ef798 commit fd9cedf

File tree

3 files changed

+49
-65
lines changed

3 files changed

+49
-65
lines changed

sled-agent/src/instance.rs

+13-31
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ use crate::nexus::NexusClientWithResolver;
1313
use crate::params::{
1414
InstanceHardware, InstanceMigrationSourceParams,
1515
InstanceMigrationTargetParams, InstanceStateRequested, VpcFirewallRule,
16-
ZoneBundleCause,
1716
};
1817
use crate::profile::*;
1918
use crate::storage_manager::StorageResources;
@@ -35,6 +34,7 @@ use omicron_common::api::internal::shared::{
3534
};
3635
use omicron_common::backoff;
3736
//use propolis_client::generated::DiskRequest;
37+
use crate::params::ZoneBundleCause;
3838
use crate::params::ZoneBundleMetadata;
3939
use crate::zone_bundle;
4040
use crate::zone_bundle::BundleError;
@@ -155,7 +155,9 @@ fn fmri_name() -> String {
155155
format!("{}:default", service_name())
156156
}
157157

158-
fn propolis_zone_name(id: &Uuid) -> String {
158+
/// Return the expected name of a Propolis zone managing an instace with the
159+
/// provided ID.
160+
pub fn propolis_zone_name(id: &Uuid) -> String {
159161
format!("{}{}", PROPOLIS_ZONE_PREFIX, id)
160162
}
161163

@@ -589,20 +591,6 @@ impl InstanceInner {
589591
}
590592
}
591593

592-
/// A result asking an instance to generate a zone bundle.
593-
#[derive(Debug)]
594-
pub enum InstanceZoneBundleResult {
595-
/// A zone bundle was operation was performed on the requested zone.
596-
Ok(Result<ZoneBundleMetadata, BundleError>),
597-
598-
/// A request to create a bundle for a zone whose name does not match this
599-
/// one's Propolis zone.
600-
NotMe,
601-
602-
/// The zone is not currently running.
603-
NotRunning,
604-
}
605-
606594
/// A reference to a single instance running a running Propolis server.
607595
///
608596
/// Cloning this object clones the reference - it does not create another
@@ -675,22 +663,15 @@ impl Instance {
675663
Ok(Instance { inner })
676664
}
677665

678-
/// Create bundle from a zone matching the provided name.
679-
///
680-
/// This returns an [`InstanceZoneBundleResult`] to indicate whether the
681-
/// bundle could be correctly taken. See that enum for details.
666+
/// Create bundle from an instance zone.
682667
pub async fn request_zone_bundle(
683668
&self,
684-
name: &str,
685-
) -> InstanceZoneBundleResult {
669+
) -> Result<ZoneBundleMetadata, BundleError> {
686670
let inner = self.inner.lock().await;
687-
let zone_name = propolis_zone_name(inner.propolis_id());
688-
if zone_name != name {
689-
return InstanceZoneBundleResult::NotMe;
690-
}
671+
let name = propolis_zone_name(inner.propolis_id());
691672
match &*inner {
692673
InstanceInner { running_state: None, .. } => {
693-
InstanceZoneBundleResult::NotRunning
674+
Err(BundleError::Unavailable { name })
694675
}
695676
InstanceInner {
696677
ref log,
@@ -699,11 +680,12 @@ impl Instance {
699680
} => {
700681
let context = inner
701682
.storage
702-
.zone_bundle_context(name, ZoneBundleCause::ExplicitRequest)
683+
.zone_bundle_context(
684+
&name,
685+
ZoneBundleCause::ExplicitRequest,
686+
)
703687
.await;
704-
InstanceZoneBundleResult::Ok(
705-
zone_bundle::create(log, running_zone, &context).await,
706-
)
688+
zone_bundle::create(log, running_zone, &context).await
707689
}
708690
}
709691
}

sled-agent/src/instance_manager.rs

+29-32
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44

55
//! API for controlling multiple instances on a sled.
66
7+
use crate::instance::propolis_zone_name;
78
use crate::instance::Instance;
8-
use crate::instance::InstanceZoneBundleResult;
99
use crate::nexus::NexusClientWithResolver;
1010
use crate::params::ZoneBundleMetadata;
1111
use crate::params::{
@@ -23,7 +23,6 @@ use omicron_common::api::internal::nexus::InstanceRuntimeState;
2323
use slog::Logger;
2424
use std::collections::BTreeMap;
2525
use std::sync::{Arc, Mutex};
26-
use tokio::sync::Mutex as TokioMutex;
2726
use uuid::Uuid;
2827

2928
#[derive(thiserror::Error, Debug)]
@@ -58,7 +57,7 @@ struct InstanceManagerInternal {
5857
// instance, we could avoid the methods within "instance.rs" that panic
5958
// if the Propolis client hasn't been initialized.
6059
/// A mapping from a Sled Agent "Instance ID" to ("Propolis ID", [Instance]).
61-
instances: TokioMutex<BTreeMap<Uuid, (Uuid, Instance)>>,
60+
instances: Mutex<BTreeMap<Uuid, (Uuid, Instance)>>,
6261

6362
vnic_allocator: VnicAllocator<Etherstub>,
6463
port_manager: PortManager,
@@ -86,7 +85,7 @@ impl InstanceManager {
8685

8786
// no reservoir size set on startup
8887
reservoir_size: Mutex::new(ByteCount::from_kibibytes_u32(0)),
89-
instances: TokioMutex::new(BTreeMap::new()),
88+
instances: Mutex::new(BTreeMap::new()),
9089
vnic_allocator: VnicAllocator::new("Instance", etherstub),
9190
port_manager,
9291
storage,
@@ -176,7 +175,7 @@ impl InstanceManager {
176175
);
177176

178177
let instance = {
179-
let mut instances = self.inner.instances.lock().await;
178+
let mut instances = self.inner.instances.lock().unwrap();
180179
if let Some((existing_propolis_id, existing_instance)) =
181180
instances.get(&instance_id)
182181
{
@@ -233,7 +232,7 @@ impl InstanceManager {
233232
instance_id: Uuid,
234233
) -> Result<InstanceUnregisterResponse, Error> {
235234
let instance = {
236-
let instances = self.inner.instances.lock().await;
235+
let instances = self.inner.instances.lock().unwrap();
237236
let instance = instances.get(&instance_id);
238237
if let Some((_, instance)) = instance {
239238
instance.clone()
@@ -257,7 +256,7 @@ impl InstanceManager {
257256
target: InstanceStateRequested,
258257
) -> Result<InstancePutStateResponse, Error> {
259258
let instance = {
260-
let instances = self.inner.instances.lock().await;
259+
let instances = self.inner.instances.lock().unwrap();
261260
let instance = instances.get(&instance_id);
262261

263262
if let Some((_, instance)) = instance {
@@ -300,7 +299,7 @@ impl InstanceManager {
300299
.inner
301300
.instances
302301
.lock()
303-
.await
302+
.unwrap()
304303
.get(&instance_id)
305304
.ok_or_else(|| Error::NoSuchInstance(instance_id))?
306305
.clone();
@@ -315,7 +314,7 @@ impl InstanceManager {
315314
snapshot_id: Uuid,
316315
) -> Result<(), Error> {
317316
let instance = {
318-
let instances = self.inner.instances.lock().await;
317+
let instances = self.inner.instances.lock().unwrap();
319318
let (_, instance) = instances
320319
.get(&instance_id)
321320
.ok_or(Error::NoSuchInstance(instance_id))?;
@@ -333,20 +332,26 @@ impl InstanceManager {
333332
&self,
334333
name: &str,
335334
) -> Result<ZoneBundleMetadata, BundleError> {
336-
for (_propolis_id, instance) in
337-
self.inner.instances.lock().await.values()
338-
{
339-
match instance.request_zone_bundle(name).await {
340-
InstanceZoneBundleResult::NotMe => continue,
341-
InstanceZoneBundleResult::NotRunning => {
342-
return Err(BundleError::Unavailable {
343-
name: name.to_string(),
344-
});
345-
}
346-
InstanceZoneBundleResult::Ok(result) => return result,
347-
}
348-
}
349-
Err(BundleError::NoSuchZone { name: name.to_string() })
335+
// We need to find the instance and take its lock, but:
336+
//
337+
// 1. The instance-map lock is sync, and
338+
// 2. we don't want to hold the instance-map lock for the entire
339+
// bundling duration.
340+
//
341+
// Instead, we cheaply clone the instance through its `Arc` around the
342+
// `InstanceInner`, which is ultimately what we want.
343+
let Some((_propolis_id, instance)) = self
344+
.inner
345+
.instances
346+
.lock()
347+
.unwrap()
348+
.values()
349+
.find(|(propolis_id, _instance)| name == propolis_zone_name(propolis_id))
350+
.cloned()
351+
else {
352+
return Err(BundleError::NoSuchZone { name: name.to_string() });
353+
};
354+
instance.request_zone_bundle().await
350355
}
351356
}
352357

@@ -368,15 +373,7 @@ impl InstanceTicket {
368373
/// themselves after stopping.
369374
pub fn terminate(&mut self) {
370375
if let Some(inner) = self.inner.take() {
371-
// NOTE: We cannot call methods like `Mutex::blocking_lock()` from
372-
// an asynchronous context. We wrap this in a call to
373-
// `block_in_place()`. This does block the current thread (and any
374-
// concurrent tasks in the same task), but that's not a distinction
375-
// from the previous implementation which called
376-
// `std::sync::Mutex::lock()` here.
377-
tokio::task::block_in_place(|| {
378-
inner.instances.blocking_lock().remove(&self.id);
379-
});
376+
inner.instances.lock().unwrap().remove(&self.id);
380377
}
381378
}
382379
}

sled-agent/src/zone_bundle.rs

+7-2
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ pub async fn create(
214214
}
215215

216216
// Run any caller-requested zone-specific commands.
217-
for cmd in context.zone_specific_commands.iter() {
217+
for (i, cmd) in context.zone_specific_commands.iter().enumerate() {
218218
if cmd.is_empty() {
219219
continue;
220220
}
@@ -229,7 +229,12 @@ pub async fn create(
229229
Err(e) => format!("{}", e),
230230
};
231231
let contents = format!("Command: {:?}\n{}", cmd, output).into_bytes();
232-
if let Err(e) = insert_data(&mut builder, &cmd[0], &contents) {
232+
233+
// We'll insert the index into the filename as well, since it's
234+
// plausible that users will run multiple executions of the same
235+
// command.
236+
let filename = format!("zone-specific-{}-{}", i, &cmd[0]);
237+
if let Err(e) = insert_data(&mut builder, &filename, &contents) {
233238
error!(
234239
log,
235240
"failed to save zone bundle command output";

0 commit comments

Comments
 (0)