Skip to content

Commit 9b1867b

Browse files
authored
Adding automatic bundle on zone death (#3829)
- Moves zone bundle code to free functions in its own module, out of the `ServiceManager` itself. - Adds handling of Propolis zones, by reworking the locking around the instance manager. - Adds sled-agent endpoint for listing all zone bundles, even those not corresponding to an existing zone. - Adds a "cause" to the zone bundle metadata, indicating why it was created. - Some QoL improvements to `zone-bundle`, allowing listing bundles from zones matching a filter (or all), along with parseable output. - Improves robustness of extracting `GATEWAY_MAC` from the ARP entries for the provided `GATEWAY_IP`, and adds warning if the proxy-arp entries are not provided. - Extracts log files which may have been archived to a U.2 as well as the M.2-local log files. - Adds basic mechanism for running zone-specific commands. Not used yet.
1 parent 28a6504 commit 9b1867b

17 files changed

+1422
-591
lines changed

docs/how-to-run.adoc

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ The rest of these instructions assume that you're building and running Omicron o
172172
The Sled Agent supports operation on both:
173173

174174
* a Gimlet (i.e., real Oxide hardware), and
175-
* an ordinary PC that's been set up to look like a Gimlet using the `./tools_create_virtual_hardware.sh` script.
175+
* an ordinary PC that's been set up to look like a Gimlet using the `./tools/create_virtual_hardware.sh` script.
176176

177177
This script also sets up a "softnpu" zone to implement Boundary Services. SoftNPU simulates the Tofino device that's used in real systems. Just like Tofino, it can implement sled-to-sled networking, but that's beyond the scope of this doc.
178178

@@ -373,7 +373,9 @@ $ dig recovery.sys.oxide.test @192.168.1.20 +short
373373
192.168.1.21
374374
----
375375

376-
Where did 192.168.1.20 come from? That's the external address of the external DNS server. We knew that only because it's the first address in the "internal services" IP pool in config-rss.toml.
376+
Where did 192.168.1.20 come from? That's the external address of the external
377+
DNS server. We knew that because it's listed in the `external_dns_ips` entry of
378+
the `config-rss.toml` file we're using.
377379

378380
Having looked this up, the easiest thing will be to use `http://192.168.1.21` for your URL (replacing with `https` if you used a certificate, and replacing that IP if needed). If you've set up networking right, you should be able to reach this from your web browser. You may have to instruct the browser to accept a self-signed TLS certificate. See also <<_connecting_securely_with_tls_using_the_cli>>.
379381

@@ -392,12 +394,19 @@ An IP pool is needed to provide external connectivity to Instances. The address
392394

393395
[source,console]
394396
----
395-
$ oxide api /v1/system/ip-pools/default/ranges/add --method POST --input - <<EOF
396-
{
397-
"first": "192.168.1.31",
398-
"last": "192.168.1.40"
397+
$ oxide ip-pool range add --pool default --first 192.168.1.31 --last 192.168.1.40
398+
success
399+
IpPoolRange {
400+
id: 4a61e65a-d96d-4c56-9cfd-dc1e44d9e99b,
401+
ip_pool_id: 1b1289a7-cefe-4a7e-a8c9-d93330846301,
402+
range: V4(
403+
Ipv4Range {
404+
first: 192.168.1.31,
405+
last: 192.168.1.40,
406+
},
407+
),
408+
time_created: 2023-08-02T16:31:43.679785Z,
399409
}
400-
EOF
401410
----
402411

403412
With SoftNPU you will generally also need to configure Proxy ARP. Below, `IP_POOL_START` and `IP_POOL_END` are the first and last addresses you used in the previous command:
@@ -435,11 +444,6 @@ $ oxide api /v1/images?project=myproj --method POST --input - <<EOF
435444
{
436445
"name": "alpine",
437446
"description": "boot from propolis zone blob!",
438-
"block_size": 512,
439-
"distribution": {
440-
"name": "alpine",
441-
"version": "propolis-blob"
442-
},
443447
"os": "linux",
444448
"version": "1",
445449
"source": {
@@ -457,22 +461,21 @@ $ oxide api /v1/images --method POST --input - <<EOF
457461
{
458462
"name": "crucible-tester-sparse",
459463
"description": "boot from a url!",
460-
"block_size": 512,
461-
"distribution": {
462-
"name": "debian",
463-
"version": "9"
464-
},
464+
"os": "debian",
465+
"version": "9",
465466
"source": {
466467
"type": "url",
467-
"url": "http://[fd00:1122:3344:101::15]/crucible-tester-sparse.img"
468+
"url": "http://[fd00:1122:3344:101::15]/crucible-tester-sparse.img",
469+
"block_size": 512
468470
}
469471
}
470472
EOF
471473
----
472474

473475
=== Provision an instance using the CLI
474476

475-
You'll need the id `$IMAGE_ID` of the image you just created.
477+
You'll need the id `$IMAGE_ID` of the image you just created. You can fetch that
478+
with `oxide image view --image $IMAGE_NAME`.
476479

477480
Now, create a Disk from that Image. The disk size must be a multiple of 1 GiB and at least as large as the image size. The example below creates a disk using the image made from the alpine ISO that ships with propolis, and sets the size to the next 1 GiB multiple of the original alpine source:
478481

illumos-utils/src/running_zone.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -933,11 +933,10 @@ impl RunningZone {
933933

934934
/// Return the names of the Oxide SMF services this zone is intended to run.
935935
pub fn service_names(&self) -> Result<Vec<String>, ServiceError> {
936-
const NEEDLES: [&str; 2] = ["/oxide", "/system/illumos"];
937936
let output = self.run_cmd(&["svcs", "-H", "-o", "fmri"])?;
938937
Ok(output
939938
.lines()
940-
.filter(|line| NEEDLES.iter().any(|needle| line.contains(needle)))
939+
.filter(|line| is_oxide_smf_log_file(line))
941940
.map(|line| line.trim().to_string())
942941
.collect())
943942
}
@@ -1191,3 +1190,11 @@ impl InstalledZone {
11911190
path
11921191
}
11931192
}
1193+
1194+
/// Return true if the named file appears to be a log file for an Oxide SMF
1195+
/// service, i.e., if `name` contains `/oxide` or `/system/illumos` anywhere.
1196+
pub fn is_oxide_smf_log_file(name: impl AsRef<str>) -> bool {
1197+
const SMF_SERVICE_PREFIXES: [&str; 2] = ["/oxide", "/system/illumos"];
1198+
let name = name.as_ref();
1199+
SMF_SERVICE_PREFIXES.iter().any(|needle| name.contains(needle))
1200+
}

openapi/sled-agent.json

Lines changed: 115 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,34 @@
1010
"version": "0.0.1"
1111
},
1212
"paths": {
13+
"/all-zone-bundles": {
14+
"get": {
15+
"summary": "List all zone bundles that exist, even for now-deleted zones.",
16+
"operationId": "zone_bundle_list_all",
17+
"responses": {
18+
"200": {
19+
"description": "successful operation",
20+
"content": {
21+
"application/json": {
22+
"schema": {
23+
"title": "Array_of_ZoneBundleMetadata",
24+
"type": "array",
25+
"items": {
26+
"$ref": "#/components/schemas/ZoneBundleMetadata"
27+
}
28+
}
29+
}
30+
}
31+
},
32+
"4XX": {
33+
"$ref": "#/components/responses/Error"
34+
},
35+
"5XX": {
36+
"$ref": "#/components/responses/Error"
37+
}
38+
}
39+
}
40+
},
1341
"/cockroachdb": {
1442
"post": {
1543
"summary": "Initializes a CockroachDB cluster",
@@ -528,7 +556,7 @@
528556
},
529557
"/zones/{zone_name}/bundles": {
530558
"get": {
531-
"summary": "List the zone bundles that are current available for a zone.",
559+
"summary": "List the zone bundles that are available for a running zone.",
532560
"operationId": "zone_bundle_list",
533561
"parameters": [
534562
{
@@ -639,6 +667,42 @@
639667
"$ref": "#/components/responses/Error"
640668
}
641669
}
670+
},
671+
"delete": {
672+
"summary": "Delete a zone bundle.",
673+
"operationId": "zone_bundle_delete",
674+
"parameters": [
675+
{
676+
"in": "path",
677+
"name": "bundle_id",
678+
"description": "The ID for this bundle itself.",
679+
"required": true,
680+
"schema": {
681+
"type": "string",
682+
"format": "uuid"
683+
}
684+
},
685+
{
686+
"in": "path",
687+
"name": "zone_name",
688+
"description": "The name of the zone this bundle is derived from.",
689+
"required": true,
690+
"schema": {
691+
"type": "string"
692+
}
693+
}
694+
],
695+
"responses": {
696+
"204": {
697+
"description": "successful deletion"
698+
},
699+
"4XX": {
700+
"$ref": "#/components/responses/Error"
701+
},
702+
"5XX": {
703+
"$ref": "#/components/responses/Error"
704+
}
705+
}
642706
}
643707
},
644708
"/zpools": {
@@ -2654,6 +2718,39 @@
26542718
"vni"
26552719
]
26562720
},
2721+
"ZoneBundleCause": {
2722+
"description": "The reason or cause for a zone bundle, i.e., why it was created.",
2723+
"oneOf": [
2724+
{
2725+
"description": "Generated in response to an explicit request to the sled agent.",
2726+
"type": "string",
2727+
"enum": [
2728+
"explicit_request"
2729+
]
2730+
},
2731+
{
2732+
"description": "A zone bundle taken when a sled agent finds a zone that it does not expect to be running.",
2733+
"type": "string",
2734+
"enum": [
2735+
"unexpected_zone"
2736+
]
2737+
},
2738+
{
2739+
"description": "An instance zone was terminated.",
2740+
"type": "string",
2741+
"enum": [
2742+
"terminated_instance"
2743+
]
2744+
},
2745+
{
2746+
"description": "Some other, unspecified reason.",
2747+
"type": "string",
2748+
"enum": [
2749+
"other"
2750+
]
2751+
}
2752+
]
2753+
},
26572754
"ZoneBundleId": {
26582755
"description": "An identifier for a zone bundle.",
26592756
"type": "object",
@@ -2677,6 +2774,14 @@
26772774
"description": "Metadata about a zone bundle.",
26782775
"type": "object",
26792776
"properties": {
2777+
"cause": {
2778+
"description": "The reason or cause a bundle was created.",
2779+
"allOf": [
2780+
{
2781+
"$ref": "#/components/schemas/ZoneBundleCause"
2782+
}
2783+
]
2784+
},
26802785
"id": {
26812786
"description": "Identifier for this zone bundle",
26822787
"allOf": [
@@ -2689,11 +2794,19 @@
26892794
"description": "The time at which this zone bundle was created.",
26902795
"type": "string",
26912796
"format": "date-time"
2797+
},
2798+
"version": {
2799+
"description": "A version number for this zone bundle.",
2800+
"type": "integer",
2801+
"format": "uint8",
2802+
"minimum": 0
26922803
}
26932804
},
26942805
"required": [
2806+
"cause",
26952807
"id",
2696-
"time_created"
2808+
"time_created",
2809+
"version"
26972810
]
26982811
},
26992812
"ZoneType": {

schema/zone-bundle-metadata.json

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,20 @@
44
"description": "Metadata about a zone bundle.",
55
"type": "object",
66
"required": [
7+
"cause",
78
"id",
8-
"time_created"
9+
"time_created",
10+
"version"
911
],
1012
"properties": {
13+
"cause": {
14+
"description": "The reason or cause a bundle was created.",
15+
"allOf": [
16+
{
17+
"$ref": "#/definitions/ZoneBundleCause"
18+
}
19+
]
20+
},
1121
"id": {
1222
"description": "Identifier for this zone bundle",
1323
"allOf": [
@@ -20,9 +30,48 @@
2030
"description": "The time at which this zone bundle was created.",
2131
"type": "string",
2232
"format": "date-time"
33+
},
34+
"version": {
35+
"description": "A version number for this zone bundle.",
36+
"type": "integer",
37+
"format": "uint8",
38+
"minimum": 0.0
2339
}
2440
},
2541
"definitions": {
42+
"ZoneBundleCause": {
43+
"description": "The reason or cause for a zone bundle, i.e., why it was created.",
44+
"oneOf": [
45+
{
46+
"description": "Generated in response to an explicit request to the sled agent.",
47+
"type": "string",
48+
"enum": [
49+
"explicit_request"
50+
]
51+
},
52+
{
53+
"description": "A zone bundle taken when a sled agent finds a zone that it does not expect to be running.",
54+
"type": "string",
55+
"enum": [
56+
"unexpected_zone"
57+
]
58+
},
59+
{
60+
"description": "An instance zone was terminated.",
61+
"type": "string",
62+
"enum": [
63+
"terminated_instance"
64+
]
65+
},
66+
{
67+
"description": "Some other, unspecified reason.",
68+
"type": "string",
69+
"enum": [
70+
"other"
71+
]
72+
}
73+
]
74+
},
2675
"ZoneBundleId": {
2776
"description": "An identifier for a zone bundle.",
2877
"type": "object",

0 commit comments

Comments
 (0)