Skip to content

Commit d07b544

Browse files
authored
Introduce sled-agent-config-reconciler skeleton (#8063)
This PR adds a new `sled-agent-config-reconciler` crate. The majority of its public API is present here, but most of the guts of it are stubbed out via `unimplemented!()`. This is the first PR of many in a big chunk of work, and I'm not sure how best to divide it, but my current plan is:

1. This PR
2. A draft PR that shows how this public API is integrated into sled-agent
3. A series of (relatively) smaller PRs that flesh out all the `unimplemented!()` bits inside the reconciler
4. (Optional) PRs to finish any lingering integration work not covered by 2

This PR is safe to land on `main`: the only changes to sled-agent proper are that the `dump_setup` module has moved to this crate (with some very minor changes that don't affect behavior), so sled-agent calls it from this crate instead of from its own submodule. The rest of the crate with all the `unimplemented!()` paths is not called by anyone.

The PRs in group 3 will similarly be safe to land: they will only flesh out the `unimplemented!()`-but-not-called code paths. PR 2 (which I'll open shortly) will not pass tests until all the PRs from group 3 have landed, but I'd like to open them in this order to get eyes on that integration work, because it may inform changes we want to make to the config-reconciler crate's API (which would be easier now than after landing everything in group 3).
1 parent 4e64efa commit d07b544

19 files changed

+1778
-51
lines changed

Cargo.lock

+35
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+3
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ members = [
124124
"sled-agent",
125125
"sled-agent/api",
126126
"sled-agent/bootstrap-agent-api",
127+
"sled-agent/config-reconciler",
127128
"sled-agent/repo-depot-api",
128129
"sled-agent/types",
129130
"sled-diagnostics",
@@ -275,6 +276,7 @@ default-members = [
275276
"sled-agent",
276277
"sled-agent/api",
277278
"sled-agent/bootstrap-agent-api",
279+
"sled-agent/config-reconciler",
278280
"sled-agent/repo-depot-api",
279281
"sled-agent/types",
280282
"sled-diagnostics",
@@ -662,6 +664,7 @@ similar-asserts = "1.7.0"
662664
sled = "=0.34.7"
663665
sled-agent-api = { path = "sled-agent/api" }
664666
sled-agent-client = { path = "clients/sled-agent-client" }
667+
sled-agent-config-reconciler = { path = "sled-agent/config-reconciler" }
665668
sled-agent-types = { path = "sled-agent/types" }
666669
sled-diagnostics = { path = "sled-diagnostics" }
667670
sled-hardware = { path = "sled-hardware" }

sled-agent/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ sha2.workspace = true
8383
sha3.workspace = true
8484
sled-agent-api.workspace = true
8585
sled-agent-client.workspace = true
86+
sled-agent-config-reconciler.workspace = true
8687
sled-agent-types.workspace = true
8788
sled-diagnostics.workspace = true
8889
sled-hardware.workspace = true
+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
[package]
2+
name = "sled-agent-config-reconciler"
3+
version = "0.1.0"
4+
edition = "2021"
5+
license = "MPL-2.0"
6+
7+
[lints]
8+
workspace = true
9+
10+
[dependencies]
11+
anyhow.workspace = true
12+
async-trait.workspace = true
13+
camino.workspace = true
14+
camino-tempfile.workspace = true
15+
chrono.workspace = true
16+
derive_more.workspace = true
17+
dropshot.workspace = true
18+
glob.workspace = true
19+
id-map.workspace = true
20+
illumos-utils.workspace = true
21+
key-manager.workspace = true
22+
nexus-sled-agent-shared.workspace = true
23+
omicron-common.workspace = true
24+
omicron-uuid-kinds.workspace = true
25+
sled-agent-api.workspace = true
26+
sled-agent-types.workspace = true
27+
sled-hardware.workspace = true
28+
sled-storage.workspace = true
29+
slog.workspace = true
30+
slog-error-chain.workspace = true
31+
thiserror.workspace = true
32+
tokio.workspace = true
33+
tufaceous-artifact.workspace = true
34+
zone.workspace = true
35+
omicron-workspace-hack.workspace = true
36+
37+
[dev-dependencies]
38+
omicron-test-utils.workspace = true
39+
proptest.workspace = true
40+
test-strategy.workspace = true
41+
42+
[features]
43+
testing = []
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! Many of the ZFS operations sled-agent performs are not atomic, because they
6+
//! involve multiple lower-level ZFS operations. This module implements a tokio
7+
//! task that serializes a set of operations to ensure no two operations could
8+
//! be executing concurrently.
9+
//!
10+
//! It uses the common pattern of "a task with a mpsc channel to send requests,
11+
//! using oneshot channels to send responses".
12+
13+
use camino::Utf8PathBuf;
14+
use id_map::IdMap;
15+
use id_map::IdMappable;
16+
use nexus_sled_agent_shared::inventory::InventoryDataset;
17+
use omicron_common::disk::DatasetConfig;
18+
use omicron_common::zpool_name::ZpoolName;
19+
use omicron_uuid_kinds::DatasetUuid;
20+
use sled_storage::config::MountConfig;
21+
use sled_storage::manager::NestedDatasetConfig;
22+
use sled_storage::manager::NestedDatasetListOptions;
23+
use sled_storage::manager::NestedDatasetLocation;
24+
use slog::Logger;
25+
use slog::warn;
26+
use std::collections::BTreeSet;
27+
use std::sync::Arc;
28+
use tokio::sync::mpsc;
29+
30+
/// Errors returned by the request methods on `DatasetTaskHandle`.
///
/// These describe failures to get a request serviced by the dataset task,
/// as opposed to failures of the dataset operation itself.
#[derive(Debug, thiserror::Error)]
31+
pub enum DatasetTaskError {
32+
/// Dataset operations cannot be performed yet; per the error message, the
/// task is still waiting for the key manager.
#[error("cannot perform dataset operations: waiting for key manager")]
33+
WaitingForKeyManager,
34+
/// The dataset task cannot service new requests right now.
// NOTE(review): presumably produced when the bounded request channel is
// full; confirm once the request methods are implemented.
#[error("dataset task busy; cannot service new requests")]
35+
Busy,
36+
/// The dataset task has exited. The error message classifies this as an
/// internal error.
#[error("internal error: dataset task exited!")]
37+
Exited,
38+
}
39+
40+
/// Successful return value of `DatasetTaskHandle::datasets_ensure`.
///
/// Wraps an `IdMap` of per-dataset results; entries are keyed by
/// `DatasetUuid` (see the `IdMappable` impl for
/// `SingleDatasetEnsureResult`).
#[derive(Debug)]
41+
pub(crate) struct DatasetEnsureResult(IdMap<SingleDatasetEnsureResult>);
42+
43+
/// One entry of a `DatasetEnsureResult`: a dataset config paired with a
/// `DatasetState`.
#[derive(Debug, Clone)]
44+
struct SingleDatasetEnsureResult {
45+
// Dataset configuration; its `id` field is this entry's map key.
config: DatasetConfig,
46+
// State recorded for this dataset.
// NOTE(review): presumably the outcome of the most recent ensure attempt;
// confirm once `datasets_ensure` is implemented.
state: DatasetState,
47+
}
48+
49+
// Allows `SingleDatasetEnsureResult` to be stored in an `IdMap`, keyed by
// the dataset's UUID.
impl IdMappable for SingleDatasetEnsureResult {
50+
type Id = DatasetUuid;
51+
52+
// Returns the `id` of the contained dataset config.
fn id(&self) -> Self::Id {
53+
self.config.id
54+
}
55+
}
56+
57+
/// State recorded for a single dataset in a `SingleDatasetEnsureResult`.
///
// NOTE(review): nothing in this file constructs or inspects these variants
// yet; the per-variant notes below are inferred from the variant names and
// should be confirmed when the implementation lands.
#[derive(Debug, Clone)]
58+
enum DatasetState {
59+
// Presumably: the dataset is mounted (the success case).
Mounted,
60+
// Presumably: mounting the dataset failed.
FailedToMount, // TODO add error
61+
// Presumably: a dataset was found with a different UUID than configured.
// TODO confirm whether the payload is the expected or the observed UUID.
UuidMismatch(DatasetUuid),
62+
// Presumably: the zpool for this dataset was not found.
ZpoolNotFound,
63+
// Presumably: this dataset's parent dataset is absent from the config.
ParentMissingFromConfig,
64+
// Presumably: this dataset's parent dataset failed to mount.
ParentFailedToMount,
65+
}
66+
67+
/// Handle used to send requests to the dataset task.
///
/// Wraps the sending half of the `mpsc` channel whose receiving half is
/// owned by the `DatasetTask` spawned in `spawn_dataset_task`.
#[derive(Debug)]
68+
pub(crate) struct DatasetTaskHandle(mpsc::Sender<DatasetTaskRequest>);
69+
70+
impl DatasetTaskHandle {
71+
/// Spawns the dataset task with `tokio::spawn` and returns a handle
/// connected to it.
///
/// The handle and the task are joined by a bounded `mpsc` channel with a
/// capacity of 16 requests. The task's logger is a child of `base_log`
/// tagged with `"component" => "DatasetTask"`. The `JoinHandle` returned
/// by `tokio::spawn` is dropped, so the task runs detached.
///
// NOTE(review): `tokio::spawn` requires a running tokio runtime; callers
// must invoke this from within one.
pub fn spawn_dataset_task(
72+
mount_config: Arc<MountConfig>,
73+
base_log: &Logger,
74+
) -> Self {
75+
// We don't expect too many concurrent requests to this task, and want
76+
// to detect "the task is wedged" pretty quickly. Common operations:
77+
//
78+
// 1. Reconciler wants to ensure datasets (at most 1 at a time)
79+
// 2. Inventory requests from Nexus (likely at most 3 at a time)
80+
// 3. Support bundle operations (unlikely to be multiple concurrently)
81+
//
82+
// so we'll pick a number that allows all of those plus a little
83+
// overhead.
84+
let (request_tx, request_rx) = mpsc::channel(16);
85+
86+
tokio::spawn(
87+
DatasetTask {
88+
mount_config,
89+
request_rx,
90+
log: base_log.new(slog::o!("component" => "DatasetTask")),
91+
}
92+
.run(),
93+
);
94+
95+
Self(request_tx)
96+
}
97+
98+
/// Requests that the given dataset configs be ensured, returning a
/// per-dataset result map.
///
/// Not yet implemented: the body is `unimplemented!()`, so awaiting the
/// returned future panics. Both arguments are currently unused.
pub async fn datasets_ensure(
99+
&self,
100+
_dataset_configs: IdMap<DatasetConfig>,
101+
_zpools: BTreeSet<ZpoolName>,
102+
) -> Result<DatasetEnsureResult, DatasetTaskError> {
103+
unimplemented!()
104+
}
105+
106+
/// Requests an inventory of datasets as a list of `InventoryDataset`.
///
/// Not yet implemented: the body is `unimplemented!()`, so awaiting the
/// returned future panics. `_zpools` is currently unused.
pub async fn inventory(
107+
&self,
108+
_zpools: BTreeSet<ZpoolName>,
109+
) -> Result<Vec<InventoryDataset>, DatasetTaskError> {
110+
unimplemented!()
111+
}
112+
113+
/// Requests that a nested dataset be mounted, returning a path.
///
// NOTE(review): presumably the returned path is the dataset's mountpoint;
// confirm once implemented.
///
/// Not yet implemented: the body is `unimplemented!()`, so awaiting the
/// returned future panics.
pub async fn nested_dataset_ensure_mounted(
114+
&self,
115+
_dataset: NestedDatasetLocation,
116+
) -> Result<Utf8PathBuf, DatasetTaskError> {
117+
unimplemented!()
118+
}
119+
120+
/// Requests that a nested dataset described by `_config` be ensured.
///
/// Not yet implemented: the body is `unimplemented!()`, so awaiting the
/// returned future panics.
pub async fn nested_dataset_ensure(
121+
&self,
122+
_config: NestedDatasetConfig,
123+
) -> Result<(), DatasetTaskError> {
124+
unimplemented!()
125+
}
126+
127+
/// Requests destruction of the nested dataset at `_name`.
///
/// Not yet implemented: the body is `unimplemented!()`, so awaiting the
/// returned future panics.
pub async fn nested_dataset_destroy(
128+
&self,
129+
_name: NestedDatasetLocation,
130+
) -> Result<(), DatasetTaskError> {
131+
unimplemented!()
132+
}
133+
134+
/// Requests a listing of nested datasets for `_name`, returning their
/// configs.
///
// NOTE(review): how `_options` affects the listing is not visible here;
// see `NestedDatasetListOptions` in `sled_storage::manager`.
///
/// Not yet implemented: the body is `unimplemented!()`, so awaiting the
/// returned future panics.
pub async fn nested_dataset_list(
135+
&self,
136+
_name: NestedDatasetLocation,
137+
_options: NestedDatasetListOptions,
138+
) -> Result<Vec<NestedDatasetConfig>, DatasetTaskError> {
139+
unimplemented!()
140+
}
141+
}
142+
143+
/// State owned by the spawned dataset task, which receives requests one at
/// a time from `DatasetTaskHandle`s.
struct DatasetTask {
144+
// Mount configuration supplied to `spawn_dataset_task`; not yet read by
// any code visible in this file.
mount_config: Arc<MountConfig>,
145+
// Receiving half of the request channel; the sending half lives in
// `DatasetTaskHandle`.
request_rx: mpsc::Receiver<DatasetTaskRequest>,
146+
// Logger tagged with "component" => "DatasetTask".
log: Logger,
147+
}
148+
149+
impl DatasetTask {
150+
/// Main loop of the dataset task.
///
/// Receives requests from the channel and handles them strictly one at a
/// time: each `handle_request` call is awaited to completion before the
/// next request is received. When `recv()` returns `None`, logs a
/// warning that all request handles are closed and returns, ending the
/// task.
async fn run(mut self) {
151+
while let Some(req) = self.request_rx.recv().await {
152+
self.handle_request(req).await;
153+
}
154+
warn!(self.log, "all request handles closed; exiting dataset task");
155+
}
156+
157+
/// Handles a single request.
///
/// Not yet implemented: the body is `unimplemented!()`. Because
/// `DatasetTaskRequest` currently has no variants, no request value can
/// be constructed, so this cannot actually be reached yet.
async fn handle_request(&mut self, _req: DatasetTaskRequest) {
158+
unimplemented!()
159+
}
160+
}
161+
162+
/// Requests sent from a `DatasetTaskHandle` to the `DatasetTask`.
///
/// Currently has no variants, so no value of this type can be constructed.
enum DatasetTaskRequest {}

0 commit comments

Comments
 (0)