Skip to content

Commit 70a9518

Browse files
committed
[support bundle] Add health checks to support bundles
1 parent f2aae61 commit 70a9518

File tree

9 files changed

+189
-2
lines changed

9 files changed

+189
-2
lines changed

nexus/src/app/background/tasks/support_bundle_collector.rs

+6
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,12 @@ impl BundleCollection<'_> {
662662
sled_client.support_zpool_info(),
663663
)
664664
.boxed(),
665+
save_diag_cmd_output_or_error(
666+
&sled_path,
667+
"health-check",
668+
sled_client.support_health_check(),
669+
)
670+
.boxed(),
665671
])
666672
// Currently we execute up to 10 commands concurrently which
667673
// might be doing their own concurrent work, for example

openapi/sled-agent.json

+27
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,33 @@
659659
}
660660
}
661661
},
662+
"/support/health-check": {
663+
"get": {
664+
"operationId": "support_health_check",
665+
"responses": {
666+
"200": {
667+
"description": "successful operation",
668+
"content": {
669+
"application/json": {
670+
"schema": {
671+
"title": "Array_of_SledDiagnosticsQueryOutput",
672+
"type": "array",
673+
"items": {
674+
"$ref": "#/components/schemas/SledDiagnosticsQueryOutput"
675+
}
676+
}
677+
}
678+
}
679+
},
680+
"4XX": {
681+
"$ref": "#/components/responses/Error"
682+
},
683+
"5XX": {
684+
"$ref": "#/components/responses/Error"
685+
}
686+
}
687+
}
688+
},
662689
"/support/ipadm-info": {
663690
"get": {
664691
"operationId": "support_ipadm_info",

sled-agent/api/src/lib.rs

+8
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,14 @@ pub trait SledAgentApi {
673673
request_context: RequestContext<Self::Context>,
674674
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError>;
675675

676+
// GET /support/health-check: returns one `SledDiagnosticsQueryOutput` per
// sled health-check command, as a list.
// NOTE(review): kept as a plain `//` comment on purpose. Doc comments (`///`)
// on endpoint declarations presumably flow into the generated OpenAPI
// document, which would then need regenerating -- confirm before promoting.
#[endpoint {
677+
method = GET,
678+
path = "/support/health-check",
679+
}]
680+
async fn support_health_check(
681+
request_context: RequestContext<Self::Context>,
682+
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>;
683+
676684
/// This endpoint returns a list of known zones on a sled that have service
677685
/// logs that can be collected into a support bundle.
678686
#[endpoint {

sled-agent/src/http_entrypoints.rs

+14
Original file line numberDiff line numberDiff line change
@@ -1036,6 +1036,20 @@ impl SledAgentApi for SledAgentImpl {
10361036
Ok(HttpResponseOk(res.get_output()))
10371037
}
10381038

1039+
async fn support_health_check(
1040+
request_context: RequestContext<Self::Context>,
1041+
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
1042+
{
1043+
let sa = request_context.context();
1044+
Ok(HttpResponseOk(
1045+
sa.support_health_check()
1046+
.await
1047+
.into_iter()
1048+
.map(|cmd| cmd.get_output())
1049+
.collect::<Vec<_>>(),
1050+
))
1051+
}
1052+
10391053
async fn support_logs(
10401054
request_context: RequestContext<Self::Context>,
10411055
) -> Result<HttpResponseOk<Vec<String>>, HttpError> {

sled-agent/src/sim/http_entrypoints.rs

+7
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,13 @@ impl SledAgentApi for SledAgentSimImpl {
745745
method_unimplemented()
746746
}
747747

748+
// Simulated sled agent: the health-check endpoint is not implemented here, so
// the request is answered with whatever `method_unimplemented()` produces.
async fn support_health_check(
749+
_request_context: RequestContext<Self::Context>,
750+
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
751+
{
752+
method_unimplemented()
753+
}
754+
748755
async fn support_logs(
749756
_request_context: RequestContext<Self::Context>,
750757
) -> Result<HttpResponseOk<Vec<String>>, HttpError> {

sled-agent/src/sled_agent.rs

+8-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ use sled_agent_types::zone_bundle::{
6767
BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod,
6868
PriorityOrder, StorageLimit, ZoneBundleMetadata,
6969
};
70-
use sled_diagnostics::{SledDiagnosticsCmdError, SledDiagnosticsCmdOutput};
70+
use sled_diagnostics::SledDiagnosticsCmdError;
71+
use sled_diagnostics::SledDiagnosticsCmdOutput;
7172
use sled_hardware::{HardwareManager, MemoryReservations, underlay};
7273
use sled_hardware_types::Baseboard;
7374
use sled_hardware_types::underlay::BootstrapInterface;
@@ -1474,6 +1475,12 @@ impl SledAgent {
14741475
) -> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
14751476
sled_diagnostics::zpool_info().await
14761477
}
1478+
1479+
/// Runs the sled health checks by delegating to
/// `sled_diagnostics::health_check`.
///
/// Returns one `Result` per health-check command; a failed command shows up
/// as an `Err` element rather than failing the whole call.
pub(crate) async fn support_health_check(
1480+
&self,
1481+
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
1482+
sled_diagnostics::health_check().await
1483+
}
14771484
}
14781485

14791486
#[derive(From, thiserror::Error, Debug)]

sled-diagnostics/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,4 @@ zip = { workspace = true, features = ["zstd"] }
2929
omicron-common.workspace = true
3030
omicron-test-utils.workspace = true
3131
omicron-uuid-kinds.workspace = true
32-
sled-storage = { workspace = true, features = ["testing"] }
32+
sled-storage = { workspace = true, features = ["testing"] }

sled-diagnostics/src/lib.rs

+22
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,25 @@ pub async fn zpool_info()
146146
-> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
147147
execute_command_with_timeout(zpool_status(), DEFAULT_TIMEOUT).await
148148
}
149+
150+
pub async fn health_check()
151+
-> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
152+
[
153+
uptime(),
154+
kstat_low_page(),
155+
svcs_show_disabled(),
156+
count_disks(),
157+
zfs_list_unmounted(),
158+
count_crucibles(),
159+
identify_datasets_close_to_quota(),
160+
identify_datasets_with_less_than_300_gib_avail(),
161+
dimm_check(),
162+
]
163+
.into_iter()
164+
.map(|c| async move {
165+
execute_command_with_timeout(c, DEFAULT_TIMEOUT).await
166+
})
167+
.collect::<FuturesUnordered<_>>()
168+
.collect::<Vec<Result<_, _>>>()
169+
.await
170+
}

sled-diagnostics/src/queries.rs

+96
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,14 @@ use crate::contract_stub::ContractError;
2222

2323
const DLADM: &str = "/usr/sbin/dladm";
2424
const IPADM: &str = "/usr/sbin/ipadm";
25+
const KSTAT: &str = "/usr/bin/kstat";
2526
const NVMEADM: &str = "/usr/sbin/nvmeadm";
2627
const PFEXEC: &str = "/usr/bin/pfexec";
2728
const PFILES: &str = "/usr/bin/pfiles";
2829
const PSTACK: &str = "/usr/bin/pstack";
2930
const PARGS: &str = "/usr/bin/pargs";
31+
const SVCS: &str = "/usr/bin/svcs";
32+
const UPTIME: &str = "/usr/bin/uptime";
3033
const ZFS: &str = "/usr/sbin/zfs";
3134
const ZONEADM: &str = "/usr/sbin/zoneadm";
3235
const ZPOOL: &str = "/usr/sbin/zpool";
@@ -263,6 +266,99 @@ pub fn pfiles_process(pid: i32) -> Command {
263266
cmd
264267
}
265268

269+
pub fn uptime() -> Command {
270+
let mut cmd = std::process::Command::new(UPTIME);
271+
cmd.env_clear();
272+
cmd
273+
}
274+
275+
pub fn kstat_low_page() -> Command {
276+
let mut cmd = std::process::Command::new(PFEXEC);
277+
cmd.env_clear().arg(KSTAT).arg("-p").arg("unix::system_pages:low_mem_scan");
278+
cmd
279+
}
280+
281+
pub fn svcs_show_disabled() -> Command {
282+
let mut cmd = std::process::Command::new(PFEXEC);
283+
cmd.env_clear().arg(SVCS).arg("-xZ");
284+
cmd
285+
}
286+
287+
/// Builds a `bash -c` command that checks whether `diskinfo` lists exactly
/// 12 physical disks.
///
/// The script prints an `OK:` line when the count is 12 and a `WARN:` line
/// otherwise. The command runs with a cleared environment and is only
/// constructed here, not executed.
pub fn count_disks() -> Command {
    let mut cmd = std::process::Command::new("bash");
    // The trailing `\` on each literal line joins the script into ONE shell
    // line. Without it the literal contains raw newlines, and bash rejects a
    // line that begins with `&&` or `||` as a syntax error.
    cmd.env_clear().args([
        "-c",
        "(pfexec diskinfo -pH | tee | wc -l | xargs | grep -x '12' > /dev/null) \
            && echo 'OK: All expected disks found' \
            || echo 'WARN: Unexpected number of physical disks (expected 12)'",
    ]);
    cmd
}
297+
298+
/// Builds a `bash -c` command that looks for `oxp` datasets whose `mounted`
/// column is not `yes`.
///
/// Matching datasets are printed followed by a `WARN:` line; if none match,
/// an `OK:` line is printed instead. The command runs with a cleared
/// environment and is only constructed here, not executed.
pub fn zfs_list_unmounted() -> Command {
    let mut cmd = std::process::Command::new("bash");
    // The trailing `\` on each literal line joins the script into ONE shell
    // line. Without it the literal contains raw newlines, and bash rejects a
    // line that begins with `&&` or `||` as a syntax error. The `yes$`
    // pattern is quoted so the `$` anchor is never subject to shell parsing.
    cmd.env_clear().args([
        "-c",
        "pfexec zfs list -r -o name,mounted | grep oxp | grep -v 'yes$' \
            && echo 'WARN: Found unmounted dataset(s)' \
            || echo 'OK: No unmounted datasets'",
    ]);
    cmd
}
308+
309+
/// Builds a `bash -c` command that checks whether `zoneadm list` shows
/// exactly 10 zones whose name contains `crucible` but not `pantry`.
///
/// The script prints an `OK:` line when the count is 10 and a `WARN:` line
/// otherwise. The command runs with a cleared environment and is only
/// constructed here, not executed.
pub fn count_crucibles() -> Command {
    let mut cmd = std::process::Command::new("bash");
    // The trailing `\` on each literal line joins the script into ONE shell
    // line. Without it the literal contains raw newlines, and bash rejects a
    // line that begins with `&&` or `||` as a syntax error.
    cmd.env_clear().args([
        "-c",
        "(zoneadm list | grep crucible | grep -v pantry | tee | wc -l | xargs | grep -x '10' > /dev/null) \
            && echo 'OK: 10 Crucibles found' \
            || echo 'WARN: Unexpected number of crucible zones (expected 10)'",
    ]);
    cmd
}
320+
321+
/// Builds a `bash -c` command that lists `oxp`/`oxi` datasets whose usage is
/// at least 80% of a non-zero quota.
///
/// Lines containing `none` or `crucible` are excluded before the check.
/// Matching datasets are printed followed by a `WARN:` line; if none match,
/// an `OK:` line is printed instead. The command runs with a cleared
/// environment and is only constructed here, not executed.
pub fn identify_datasets_close_to_quota() -> Command {
    let mut cmd = std::process::Command::new("bash");
    // The trailing `\` on each literal line joins the script into ONE shell
    // line. Without it the literal contains raw newlines, and bash rejects a
    // line that begins with `&&` or `||` as a syntax error.
    cmd.env_clear().args([
        "-c",
        "zfs list -Hp -o used,quota,name,avail,mountpoint | \
            egrep 'oxp|oxi' | \
            egrep -v 'none|crucible' | \
            awk '$2 > 0 && $1 / $2 >= 0.8 { any=1; print } END { exit !any }' \
            && echo 'WARN: Found near-quota datasets' \
            || echo 'OK: No near-quota datasets found'",
    ]);
    cmd
}
335+
336+
/// Builds a `bash -c` command that lists `oxp`/`oxi` datasets whose `avail`
/// column is below 300 GiB (`300 * 1024^3`).
///
/// Lines containing `none` or `crucible` are excluded before the check.
/// Matching datasets are printed followed by a `WARN:` line; if none match,
/// an `OK:` line is printed instead. The command runs with a cleared
/// environment and is only constructed here, not executed.
pub fn identify_datasets_with_less_than_300_gib_avail() -> Command {
    let mut cmd = std::process::Command::new("bash");
    // The trailing `\` on each literal line joins the script into ONE shell
    // line. Without it the literal contains raw newlines, and bash rejects a
    // line that begins with `&&` or `||` as a syntax error.
    cmd.env_clear().args([
        "-c",
        "zfs list -Hp -o used,quota,name,avail,mountpoint | \
            egrep 'oxp|oxi' | \
            egrep -v 'none|crucible' | \
            awk '$4 < (300 * (1024^3)) { any=1; print } END { exit !any }' \
            && echo 'WARN: Found low-space datasets' \
            || echo 'OK: No low-space datasets found'",
    ]);
    cmd
}
349+
350+
/// Builds a `bash -c` command that checks the output of `prtconf -m` against
/// two expected values, `1036271` and `2084847`.
///
/// Any output line matching neither value is printed followed by a `WARN:`
/// line; otherwise an `OK:` line is printed. The command runs with a cleared
/// environment and is only constructed here, not executed.
///
/// NOTE(review): the two constants are presumably the memory totals of the
/// supported DIMM configurations as reported by `prtconf -m` -- confirm.
pub fn dimm_check() -> Command {
    let mut cmd = std::process::Command::new("bash");
    // The trailing `\` on each literal line joins the script into ONE shell
    // line. Without it the literal contains raw newlines, and bash rejects a
    // line that begins with `&&` or `||` as a syntax error.
    cmd.env_clear().args([
        "-c",
        "prtconf -m | \
            grep -v -e 1036271 -e 2084847 \
            && echo 'WARN: Unexpected quantity of system memory' \
            || echo 'OK: Found expected quantity of system memory'",
    ]);
    cmd
}
361+
266362
pub fn zfs_list() -> Command {
267363
let mut cmd = std::process::Command::new(PFEXEC);
268364
cmd.env_clear()

0 commit comments

Comments
 (0)