Skip to content

Commit a617f0f

Browse files
authored
sled-agent: move instance configuration generation to Nexus (#8002)
One of the determinations in RFD 505 is that Nexus should be the component that's in charge of determining how to configure a VM given a set of database records describing its instance (the Instance itself, its attached Disks and NetworkInterfaces, etc.). To summarize the rationale in the RFD, the hope is that this will promote two nice properties: - **Local reasoning about virtual platforms**: All the logic that translates instance descriptions into VM specs now lives in a single module in Nexus. In past iterations of the code, Nexus transformed database records into an intermediate sled-agent type, and sled-agent would transform those into Propolis API types, which Propolis would then use to fill in virtual hardware details. Understanding where a VM's configuration came from required the reader to look at all these components; now all the relevant logic lives in Nexus. - **Serviceability**: Putting type transformations and platform policies into sled-agent and Propolis makes them marginally more painful to update, since updating these components requires the system to migrate VMs and reboot sleds. Putting the virtual platform policy in Nexus will make it much less expensive to update in the future. To achieve this: - Move sled-agent's virtual platform logic (added in #7211) into a new Nexus module. Sled-agent needs to hold onto a bit of logic to insert OPTE port names into instance specs before sending those specs to Propolis; this needs to live in the agent since it selects the relevant object names. - Update the sled-agent instance registration API to take a Propolis instance spec as a parameter (and rework some other types to distinguish a bit more clearly between "Propolis VM configuration" and "sled-agent objects that need to be created to support this VM"). Tested by booting a VM in a dev cluster, booting a comparable VM on rack2, and comparing their instance specs (as returned by Propolis's `/instance/spec` API) to make sure they specified the same components with the same configuration.
1 parent bac3385 commit a617f0f

File tree

15 files changed

+1908
-835
lines changed

15 files changed

+1908
-835
lines changed

Cargo.lock

Lines changed: 72 additions & 132 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -387,10 +387,10 @@ crossterm = { version = "0.28.1", features = ["event-stream"] }
387387
# NOTE: if you change the pinned revision of the `crucible` dependencies, you
388388
# must also update the references in package-manifest.toml to match the new
389389
# revision.
390-
crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "da3cf198a0e000bb89efc3a1c77d7ba09340a600" }
391-
crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = "da3cf198a0e000bb89efc3a1c77d7ba09340a600" }
392-
crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "da3cf198a0e000bb89efc3a1c77d7ba09340a600" }
393-
crucible-common = { git = "https://github.com/oxidecomputer/crucible", rev = "da3cf198a0e000bb89efc3a1c77d7ba09340a600" }
390+
crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "45801597f410685015ac2704d044919a41e3ff75" }
391+
crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = "45801597f410685015ac2704d044919a41e3ff75" }
392+
crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "45801597f410685015ac2704d044919a41e3ff75" }
393+
crucible-common = { git = "https://github.com/oxidecomputer/crucible", rev = "45801597f410685015ac2704d044919a41e3ff75" }
394394
# NOTE: See above!
395395
csv = "1.3.1"
396396
curve25519-dalek = "4"
@@ -604,10 +604,10 @@ progenitor-client = "0.9.1"
604604
# NOTE: if you change the pinned revision of the `bhyve_api` and propolis
605605
# dependencies, you must also update the references in package-manifest.toml to
606606
# match the new revision.
607-
bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "e5c85d84b0a51803caffb335a1063612edb02f6d" }
608-
propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "e5c85d84b0a51803caffb335a1063612edb02f6d" }
609-
propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "e5c85d84b0a51803caffb335a1063612edb02f6d" }
610-
propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "e5c85d84b0a51803caffb335a1063612edb02f6d" }
607+
bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "060a204d91e401a368c700a09d24510b7cd2b0e4" }
608+
propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "060a204d91e401a368c700a09d24510b7cd2b0e4" }
609+
propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "060a204d91e401a368c700a09d24510b7cd2b0e4" }
610+
propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "060a204d91e401a368c700a09d24510b7cd2b0e4" }
611611
# NOTE: see above!
612612
proptest = "1.6.0"
613613
qorb = "0.3.1"

clients/sled-agent-client/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,6 @@ reqwest = { workspace = true, features = [ "json", "rustls-tls", "stream" ] }
2323
schemars.workspace = true
2424
serde.workspace = true
2525
serde_json.workspace = true
26+
sled-agent-types.workspace = true
2627
slog.workspace = true
2728
uuid.workspace = true

clients/sled-agent-client/src/lib.rs

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ progenitor::generate_api!(
5656
DiskVariant = omicron_common::disk::DiskVariant,
5757
ExternalIpGatewayMap = omicron_common::api::internal::shared::ExternalIpGatewayMap,
5858
Generation = omicron_common::api::external::Generation,
59+
Hostname = omicron_common::api::external::Hostname,
5960
ImportExportPolicy = omicron_common::api::external::ImportExportPolicy,
6061
Inventory = nexus_sled_agent_shared::inventory::Inventory,
6162
InventoryDisk = nexus_sled_agent_shared::inventory::InventoryDisk,
@@ -117,14 +118,6 @@ impl From<omicron_common::api::internal::nexus::VmmState> for types::VmmState {
117118
}
118119
}
119120

120-
impl From<omicron_common::api::external::InstanceCpuCount>
121-
for types::InstanceCpuCount
122-
{
123-
fn from(s: omicron_common::api::external::InstanceCpuCount) -> Self {
124-
Self(s.0)
125-
}
126-
}
127-
128121
impl From<types::VmmState> for omicron_common::api::internal::nexus::VmmState {
129122
fn from(s: types::VmmState) -> Self {
130123
use omicron_common::api::internal::nexus::VmmState as Output;
@@ -188,14 +181,6 @@ impl From<types::MigrationState>
188181
}
189182
}
190183

191-
impl From<types::InstanceCpuCount>
192-
for omicron_common::api::external::InstanceCpuCount
193-
{
194-
fn from(s: types::InstanceCpuCount) -> Self {
195-
Self(s.0)
196-
}
197-
}
198-
199184
impl From<omicron_common::api::internal::nexus::DiskRuntimeState>
200185
for types::DiskRuntimeState
201186
{

nexus/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ serde_urlencoded.workspace = true
9292
serde_with.workspace = true
9393
sha2.workspace = true
9494
sled-agent-client.workspace = true
95+
sled-agent-types.workspace = true
9596
slog.workspace = true
9697
slog-async.workspace = true
9798
slog-dtrace.workspace = true
@@ -162,7 +163,6 @@ pretty_assertions.workspace = true
162163
rcgen.workspace = true
163164
regex.workspace = true
164165
similar-asserts.workspace = true
165-
sled-agent-types.workspace = true
166166
sp-sim.workspace = true
167167
rustls.workspace = true
168168
subprocess.workspace = true

nexus/db-model/src/instance_cpu_count.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,3 @@ where
5252
.map_err(|e| e.into())
5353
}
5454
}
55-
56-
impl From<InstanceCpuCount> for sled_agent_client::types::InstanceCpuCount {
57-
fn from(i: InstanceCpuCount) -> Self {
58-
Self(i.0.0)
59-
}
60-
}

nexus/src/app/instance.rs

Lines changed: 16 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ use super::MAX_VCPU_PER_INSTANCE;
1414
use super::MIN_MEMORY_BYTES_PER_INSTANCE;
1515
use crate::app::sagas;
1616
use crate::app::sagas::NexusSaga;
17-
use crate::cidata::InstanceCiData;
1817
use crate::external_api::params;
1918
use cancel_safe_futures::prelude::*;
2019
use futures::future::Fuse;
@@ -41,6 +40,7 @@ use omicron_common::api::external::CreateResult;
4140
use omicron_common::api::external::DataPageParams;
4241
use omicron_common::api::external::DeleteResult;
4342
use omicron_common::api::external::Error;
43+
use omicron_common::api::external::Hostname;
4444
use omicron_common::api::external::InstanceCpuCount;
4545
use omicron_common::api::external::InstanceState;
4646
use omicron_common::api::external::InternalContext;
@@ -64,9 +64,7 @@ use propolis_client::support::tungstenite::protocol::frame::coding::CloseCode;
6464
use sagas::instance_common::ExternalIpAttach;
6565
use sagas::instance_start;
6666
use sagas::instance_update;
67-
use sled_agent_client::types::InstanceBootSettings;
6867
use sled_agent_client::types::InstanceMigrationTargetParams;
69-
use sled_agent_client::types::InstanceProperties;
7068
use sled_agent_client::types::VmmPutStateBody;
7169
use std::matches;
7270
use std::net::SocketAddr;
@@ -1080,7 +1078,7 @@ impl super::Nexus {
10801078
// TODO-cleanup: This can be removed when we are confident that no
10811079
// instances exist prior to the addition of strict hostname validation
10821080
// in the API.
1083-
let Ok(hostname) = db_instance.hostname.parse() else {
1081+
let Ok(hostname) = db_instance.hostname.parse::<Hostname>() else {
10841082
let msg = format!(
10851083
"The instance hostname '{}' is no longer valid. \
10861084
To access the data on its disks, this instance \
@@ -1107,57 +1105,6 @@ impl super::Nexus {
11071105
)
11081106
.await?;
11091107

1110-
let mut disk_reqs = vec![];
1111-
for disk in &disks {
1112-
// Disks that are attached to an instance should always have a slot
1113-
// assignment, but if for some reason this one doesn't, return an
1114-
// error instead of taking down the whole process.
1115-
let slot = match disk.slot {
1116-
Some(s) => s,
1117-
None => {
1118-
error!(self.log, "attached disk has no PCI slot assignment";
1119-
"disk_id" => %disk.id(),
1120-
"disk_name" => disk.name().to_string(),
1121-
"instance_id" => ?disk.runtime_state.attach_instance_id);
1122-
1123-
return Err(Error::internal_error(&format!(
1124-
"disk {} is attached but has no PCI slot assignment",
1125-
disk.id()
1126-
))
1127-
.into());
1128-
}
1129-
};
1130-
1131-
let volume = self
1132-
.db_datastore
1133-
.volume_checkout(
1134-
disk.volume_id(),
1135-
match operation {
1136-
InstanceRegisterReason::Start { vmm_id } =>
1137-
db::datastore::VolumeCheckoutReason::InstanceStart { vmm_id },
1138-
InstanceRegisterReason::Migrate { vmm_id, target_vmm_id } =>
1139-
db::datastore::VolumeCheckoutReason::InstanceMigrate { vmm_id, target_vmm_id },
1140-
}
1141-
)
1142-
.await?;
1143-
1144-
disk_reqs.push(sled_agent_client::types::InstanceDisk {
1145-
disk_id: disk.id(),
1146-
name: disk.name().to_string(),
1147-
slot: slot.0,
1148-
read_only: false,
1149-
vcr_json: volume.data().to_owned(),
1150-
});
1151-
}
1152-
1153-
// The routines that maintain an instance's boot options are supposed to
1154-
// guarantee that the boot disk ID, if present, is the ID of an attached
1155-
// disk. If this invariant isn't upheld, Propolis will catch the failure
1156-
// when it processes its received VM configuration.
1157-
let boot_settings = db_instance
1158-
.boot_disk_id
1159-
.map(|id| InstanceBootSettings { order: vec![id] });
1160-
11611108
let nics = self
11621109
.db_datastore
11631110
.derive_guest_network_interface_info(&opctx, &authz_instance)
@@ -1271,28 +1218,25 @@ impl super::Nexus {
12711218
.unwrap(),
12721219
}),
12731220
)
1274-
.await?
1275-
.into_iter();
1221+
.await?;
12761222

1277-
let ssh_keys: Vec<String> =
1278-
ssh_keys.map(|ssh_key| ssh_key.public_key).collect();
1223+
let vmm_spec = self
1224+
.generate_vmm_spec(
1225+
&operation,
1226+
db_instance,
1227+
&disks,
1228+
&nics,
1229+
&ssh_keys,
1230+
)
1231+
.await?;
12791232

12801233
let metadata = sled_agent_client::types::InstanceMetadata {
12811234
silo_id: authz_silo.id(),
12821235
project_id: authz_project.id(),
12831236
};
12841237

1285-
// Ask the sled agent to begin the state change. Then update the
1286-
// database to reflect the new intermediate state. If this update is
1287-
// not the newest one, that's fine. That might just mean the sled agent
1288-
// beat us to it.
1289-
1290-
let instance_hardware = sled_agent_client::types::InstanceHardware {
1291-
properties: InstanceProperties {
1292-
ncpus: db_instance.ncpus.into(),
1293-
memory: db_instance.memory.into(),
1294-
hostname,
1295-
},
1238+
let local_config = sled_agent_client::types::InstanceSledLocalConfig {
1239+
hostname,
12961240
nics,
12971241
source_nat,
12981242
ephemeral_ip,
@@ -1304,12 +1248,6 @@ impl super::Nexus {
13041248
host_domain: None,
13051249
search_domains: Vec::new(),
13061250
},
1307-
disks: disk_reqs,
1308-
boot_settings,
1309-
cloud_init_bytes: Some(base64::Engine::encode(
1310-
&base64::engine::general_purpose::STANDARD,
1311-
db_instance.generate_cidata(&ssh_keys)?,
1312-
)),
13131251
};
13141252

13151253
let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id());
@@ -1338,7 +1276,8 @@ impl super::Nexus {
13381276
.vmm_register(
13391277
propolis_id,
13401278
&sled_agent_client::types::InstanceEnsureBody {
1341-
hardware: instance_hardware,
1279+
vmm_spec,
1280+
local_config,
13421281
migration_id: db_instance.runtime().migration_id,
13431282
vmm_runtime,
13441283
instance_id,

0 commit comments

Comments
 (0)