Skip to content

[support bundle] Add health checks to support bundles #8102

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 59 commits into from
May 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
07dd473
[nexus] Put support bundles in internal API too
smklein Apr 11, 2025
c079c3f
[omdb] Basic commands to access support bundles
smklein Apr 14, 2025
df47341
Updated output
smklein Apr 14, 2025
219d284
Merge branch 'main' into sb-internal-api
smklein Apr 14, 2025
e39785a
Merge branch 'sb-internal-api' into omdb-sb
smklein Apr 14, 2025
12f461b
[nexus] Make it 'more default' for Debug datasets to exist in test en…
smklein Apr 15, 2025
1af91c6
test patching
smklein Apr 15, 2025
8969fbe
Merge branch 'main' into sb-internal-api
smklein Apr 15, 2025
3dfd8ab
Merge branch 'sb-internal-api' into omdb-sb
smklein Apr 15, 2025
07de40c
Merge branch 'omicron-dev-disk-test' into sb-internal-api
smklein Apr 15, 2025
4bf9d9a
Merge branch 'sb-internal-api' into omdb-sb
smklein Apr 15, 2025
73b4975
Try compiling
smklein Apr 15, 2025
c3876e2
Merge branch 'omicron-dev-disk-test' into sb-internal-api
smklein Apr 15, 2025
600a537
Merge branch 'sb-internal-api' into omdb-sb
smklein Apr 15, 2025
8022e12
Use internal opctx
smklein Apr 15, 2025
47819c0
Patching tests more
smklein Apr 15, 2025
78af872
Merge branch 'omicron-dev-disk-test' into sb-internal-api
smklein Apr 15, 2025
192e255
Merge branch 'sb-internal-api' into omdb-sb
smklein Apr 15, 2025
6374a7b
Don't inject newlines
smklein Apr 16, 2025
c28a398
Continuing to iterate on TUI
smklein Apr 23, 2025
ab14729
Merge branch 'main' into omicron-dev-disk-test
smklein Apr 23, 2025
ceedbc3
Merge branch 'omicron-dev-disk-test' into sb-internal-api
smklein Apr 23, 2025
8278c09
Merge branch 'sb-internal-api' into omdb-sb
smklein Apr 23, 2025
9070654
Merge branch 'omdb-sb' into omdb-sb-polish
smklein Apr 23, 2025
820556f
Shift to move faster, fix dirs, wrapping
smklein Apr 23, 2025
0a457dd
Enable inspection of local files
smklein Apr 24, 2025
7fe5002
Fmt
smklein Apr 24, 2025
7fb1168
Merge branch 'main' into omicron-dev-disk-test
smklein Apr 25, 2025
36d2d04
Make datasets private, add helpers to access them
smklein Apr 25, 2025
bcfbb51
Merge branch 'omicron-dev-disk-test' into sb-internal-api
smklein Apr 25, 2025
b21b525
feedback
smklein Apr 25, 2025
b9b94d5
Merge branch 'sb-internal-api' into omdb-sb
smklein Apr 25, 2025
53c0a76
feedback, less utf8
smklein Apr 25, 2025
8f75399
Merge branch 'omdb-sb' into omdb-sb-polish
smklein Apr 25, 2025
44bd6a5
Better support for binary files
smklein Apr 25, 2025
9597433
Merge branch 'main' into omicron-dev-disk-test
smklein Apr 28, 2025
d2d0c76
Merge branch 'omicron-dev-disk-test' into sb-internal-api
smklein Apr 28, 2025
c8d9546
Merge branch 'sb-internal-api' into omdb-sb
smklein Apr 28, 2025
2790d9a
expectorate
smklein Apr 28, 2025
1a218b8
Merge branch 'omdb-sb' into omdb-sb-polish
smklein Apr 28, 2025
b73b6ff
Better support for waiting for collection to finish
smklein Apr 29, 2025
82ca320
Refactoring TUI into support-bundle-reader-lib
smklein Apr 29, 2025
995ca46
More private interface
smklein Apr 29, 2025
6ad1a31
starting to buffer and stream more properly
smklein Apr 30, 2025
865467a
Less unwrapping, more cleanup
smklein Apr 30, 2025
cc94a11
Merge branch 'main' into omdb-sb-polish
smklein Apr 30, 2025
f2aae61
[omdb] Add command to download entire support bundle
smklein Apr 30, 2025
70a9518
[support bundle] Add health checks to support bundles
smklein May 6, 2025
bf76518
Merge branch 'main' into omdb-sb-polish
smklein May 7, 2025
12d00d8
feedback
smklein May 7, 2025
3ac4d4c
Merge branch 'omdb-sb-polish' into omdb-sb-download-whole-thing
smklein May 7, 2025
6e57da6
Merge branch 'omdb-sb-download-whole-thing' into sb-health-check
smklein May 7, 2025
b2522d1
feedback
smklein May 7, 2025
3946a67
Merge branch 'main' into omdb-sb-polish
smklein May 7, 2025
531ad3a
Merge branch 'omdb-sb-polish' into omdb-sb-download-whole-thing
smklein May 7, 2025
160bdb0
Merge branch 'omdb-sb-download-whole-thing' into sb-health-check
smklein May 7, 2025
4538d70
Merge branch 'main' into omdb-sb-download-whole-thing
smklein May 8, 2025
63ef90c
Merge branch 'omdb-sb-download-whole-thing' into sb-health-check
smklein May 8, 2025
5e3cf04
avoid linebreaks
smklein May 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions nexus/src/app/background/tasks/support_bundle_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,12 @@ impl BundleCollection<'_> {
sled_client.support_zpool_info(),
)
.boxed(),
save_diag_cmd_output_or_error(
&sled_path,
"health-check",
sled_client.support_health_check(),
)
.boxed(),
])
// Currently we execute up to 10 commands concurrently which
// might be doing their own concurrent work, for example
Expand Down
27 changes: 27 additions & 0 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,33 @@
}
}
},
"/support/health-check": {
"get": {
"operationId": "support_health_check",
"responses": {
"200": {
"description": "successful operation",
"content": {
"application/json": {
"schema": {
"title": "Array_of_SledDiagnosticsQueryOutput",
"type": "array",
"items": {
"$ref": "#/components/schemas/SledDiagnosticsQueryOutput"
}
}
}
}
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/support/ipadm-info": {
"get": {
"operationId": "support_ipadm_info",
Expand Down
8 changes: 8 additions & 0 deletions sled-agent/api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,14 @@ pub trait SledAgentApi {
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError>;

// Declares the `GET /support/health-check` endpoint. A successful response
// carries a list of `SledDiagnosticsQueryOutput` values.
// NOTE(review): written as plain `//` comments on the assumption that `///`
// doc comments on endpoints are copied into the generated OpenAPI document
// (openapi/sled-agent.json), which currently has no description for this
// operation — confirm before promoting these to doc comments.
#[endpoint {
method = GET,
path = "/support/health-check",
}]
async fn support_health_check(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>;

/// This endpoint returns a list of known zones on a sled that have service
/// logs that can be collected into a support bundle.
#[endpoint {
Expand Down
14 changes: 14 additions & 0 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,20 @@ impl SledAgentApi for SledAgentImpl {
Ok(HttpResponseOk(res.get_output()))
}

/// Handles `support_health_check` for the real sled agent.
///
/// Asks the sled agent (taken from the request context) to run its health
/// checks, converts every per-command result with `get_output()`, and
/// returns the converted list in an `HttpResponseOk`. This handler itself
/// never produces an `Err`.
async fn support_health_check(
    request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
{
    let results = request_context.context().support_health_check().await;
    let outputs: Vec<_> =
        results.into_iter().map(|result| result.get_output()).collect();
    Ok(HttpResponseOk(outputs))
}

async fn support_logs(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<Vec<String>>, HttpError> {
Expand Down
7 changes: 7 additions & 0 deletions sled-agent/src/sim/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,13 @@ impl SledAgentApi for SledAgentSimImpl {
method_unimplemented()
}

/// Health-check endpoint for the simulated sled agent.
///
/// The request context is ignored and the call is answered entirely by
/// `method_unimplemented()`.
// NOTE(review): presumably `method_unimplemented()` yields an error response
// telling the client the operation is unsupported — confirm against its
// definition.
async fn support_health_check(
_request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
{
method_unimplemented()
}

async fn support_logs(
_request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<Vec<String>>, HttpError> {
Expand Down
9 changes: 8 additions & 1 deletion sled-agent/src/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ use sled_agent_types::zone_bundle::{
BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod,
PriorityOrder, StorageLimit, ZoneBundleMetadata,
};
use sled_diagnostics::{SledDiagnosticsCmdError, SledDiagnosticsCmdOutput};
use sled_diagnostics::SledDiagnosticsCmdError;
use sled_diagnostics::SledDiagnosticsCmdOutput;
use sled_hardware::{HardwareManager, MemoryReservations, underlay};
use sled_hardware_types::Baseboard;
use sled_hardware_types::underlay::BootstrapInterface;
Expand Down Expand Up @@ -1473,6 +1474,12 @@ impl SledAgent {
) -> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
sled_diagnostics::zpool_info().await
}

/// Runs the sled health checks by delegating to
/// `sled_diagnostics::health_check()`.
///
/// The results are returned unchanged: a vector of `Result`s, each holding
/// either a command's output or the error it produced.
pub(crate) async fn support_health_check(
&self,
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
sled_diagnostics::health_check().await
}
}

#[derive(From, thiserror::Error, Debug)]
Expand Down
2 changes: 1 addition & 1 deletion sled-diagnostics/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ zip = { workspace = true, features = ["zstd"] }
omicron-common.workspace = true
omicron-test-utils.workspace = true
omicron-uuid-kinds.workspace = true
sled-storage = { workspace = true, features = ["testing"] }
sled-storage = { workspace = true, features = ["testing"] }
22 changes: 22 additions & 0 deletions sled-diagnostics/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,25 @@ pub async fn zpool_info()
-> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
execute_command_with_timeout(zpool_status(), DEFAULT_TIMEOUT).await
}

/// Runs every health-check query and gathers the outcome of each one.
///
/// Each command is executed through `execute_command_with_timeout` with
/// `DEFAULT_TIMEOUT`, and all of them are driven together via a
/// `FuturesUnordered`. The returned vector holds one entry per command,
/// whether it succeeded or failed. Entries are yielded as the individual
/// commands complete, so their order need not match the list below.
pub async fn health_check()
-> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
    let commands = [
        uptime(),
        kstat_low_page(),
        svcs_enabled_but_not_running(),
        count_disks(),
        zfs_list_unmounted(),
        count_crucibles(),
        identify_datasets_close_to_quota(),
        identify_datasets_with_less_than_300_gib_avail(),
        dimm_check(),
    ];

    let in_flight: FuturesUnordered<_> = commands
        .into_iter()
        .map(|command| async move {
            execute_command_with_timeout(command, DEFAULT_TIMEOUT).await
        })
        .collect();

    in_flight.collect::<Vec<_>>().await
}
96 changes: 96 additions & 0 deletions sled-diagnostics/src/queries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@ use crate::contract_stub::ContractError;

// Absolute paths of the system executables referenced by the command
// builders in this file.
const DLADM: &str = "/usr/sbin/dladm";
const IPADM: &str = "/usr/sbin/ipadm";
const KSTAT: &str = "/usr/bin/kstat";
const NVMEADM: &str = "/usr/sbin/nvmeadm";
const PFEXEC: &str = "/usr/bin/pfexec";
const PFILES: &str = "/usr/bin/pfiles";
const PSTACK: &str = "/usr/bin/pstack";
const PARGS: &str = "/usr/bin/pargs";
const SVCS: &str = "/usr/bin/svcs";
const UPTIME: &str = "/usr/bin/uptime";
const ZFS: &str = "/usr/sbin/zfs";
const ZONEADM: &str = "/usr/sbin/zoneadm";
const ZPOOL: &str = "/usr/sbin/zpool";
Expand Down Expand Up @@ -263,6 +266,99 @@ pub fn pfiles_process(pid: i32) -> Command {
cmd
}

/// Builds the command that runs the `UPTIME` executable with no arguments
/// and a cleared environment.
pub fn uptime() -> Command {
    let mut uptime_cmd = Command::new(UPTIME);
    uptime_cmd.env_clear();
    uptime_cmd
}

/// Builds the command `pfexec kstat -p unix::system_pages:low_mem_scan`,
/// run with a cleared environment.
pub fn kstat_low_page() -> Command {
    let mut kstat_cmd = Command::new(PFEXEC);
    kstat_cmd.env_clear();
    kstat_cmd.args([KSTAT, "-p", "unix::system_pages:low_mem_scan"]);
    kstat_cmd
}

/// Builds the command `pfexec svcs -xZ`, run with a cleared environment.
///
/// As the function name indicates, the output is used to spot services that
/// are enabled but not running.
pub fn svcs_enabled_but_not_running() -> Command {
    let mut svcs_cmd = Command::new(PFEXEC);
    svcs_cmd.env_clear();
    svcs_cmd.args([SVCS, "-xZ"]);
    svcs_cmd
}

/// Builds a `bash -c` command that checks the number of physical disks.
///
/// The script counts the lines printed by `pfexec diskinfo -pH` and prints
/// an `OK:` line when the count is exactly 12, or a `WARN:` line otherwise.
/// The command runs with a cleared environment.
pub fn count_disks() -> Command {
    // The pieces are joined verbatim; each one carries its own trailing space.
    const SCRIPT: &str = concat!(
        "(pfexec diskinfo -pH | tee | wc -l | xargs | grep -x '12' > /dev/null) ",
        "&& echo 'OK: All expected disks found' ",
        "|| echo 'WARN: Unexpected number of physical disks (expected 12)'",
    );

    let mut shell = Command::new("bash");
    shell.env_clear();
    shell.arg("-c").arg(SCRIPT);
    shell
}

/// Builds a `bash -c` command that looks for unmounted ZFS datasets.
///
/// The script lists the `name` and `mounted` columns of
/// `pfexec zfs list -r`, keeps the lines containing `oxp`, and drops those
/// ending in `yes`. Any remaining lines are printed followed by a `WARN:`
/// line; if none remain, an `OK:` line is printed instead. The command runs
/// with a cleared environment.
pub fn zfs_list_unmounted() -> Command {
    let script = concat!(
        "pfexec zfs list -r -o name,mounted | grep oxp | grep -v yes$ ",
        "&& echo 'WARN: Found unmounted dataset(s)' ",
        "|| echo 'OK: No unmounted datasets'",
    );

    let mut shell = Command::new("bash");
    shell.env_clear();
    shell.args(["-c", script]);
    shell
}

/// Builds a `bash -c` command that checks the number of crucible zones.
///
/// The script counts the lines of `zoneadm list` that contain `crucible`
/// but not `pantry`, and prints an `OK:` line when the count is exactly 10,
/// or a `WARN:` line otherwise. The command runs with a cleared environment.
pub fn count_crucibles() -> Command {
    // The pieces are joined verbatim; each one carries its own trailing space.
    const SCRIPT: &str = concat!(
        "(zoneadm list | grep crucible | grep -v pantry | tee | wc -l | xargs | grep -x '10' > /dev/null) ",
        "&& echo 'OK: 10 Crucibles found' ",
        "|| echo 'WARN: Unexpected number of crucible zones (expected 10)'",
    );

    let mut shell = Command::new("bash");
    shell.env_clear();
    shell.arg("-c").arg(SCRIPT);
    shell
}

/// Builds a `bash -c` command that reports datasets close to their quota.
///
/// The script reads `zfs list -Hp -o used,quota,name,avail,mountpoint`,
/// keeps lines matching `oxp|oxi`, and drops lines matching
/// `none|crucible`. It then prints every remaining line whose quota (column
/// 2) is positive and whose used/quota ratio (column 1 / column 2) is at
/// least 0.8. A `WARN:` line follows if any line was printed, otherwise an
/// `OK:` line. The command runs with a cleared environment.
pub fn identify_datasets_close_to_quota() -> Command {
    let script = concat!(
        "zfs list -Hp -o used,quota,name,avail,mountpoint | ",
        "egrep 'oxp|oxi' | ",
        "egrep -v 'none|crucible' | ",
        "awk '$2 > 0 && $1 / $2 >= 0.8 { any=1; print } END { exit !any }' ",
        "&& echo 'WARN: Found near-quota datasets' ",
        "|| echo 'OK: No near-quota datasets found'",
    );

    let mut shell = Command::new("bash");
    shell.env_clear();
    shell.args(["-c", script]);
    shell
}

/// Builds a `bash -c` command that reports datasets with little free space.
///
/// The script reads `zfs list -Hp -o used,quota,name,avail,mountpoint`,
/// keeps lines matching `oxp|oxi`, and drops lines matching
/// `none|crucible`. It then prints every remaining line whose `avail` value
/// (column 4) is below `300 * 1024^3`. A `WARN:` line follows if any line
/// was printed, otherwise an `OK:` line. The command runs with a cleared
/// environment.
pub fn identify_datasets_with_less_than_300_gib_avail() -> Command {
    // The pieces are joined verbatim; each one carries its own trailing space.
    const SCRIPT: &str = concat!(
        "zfs list -Hp -o used,quota,name,avail,mountpoint | ",
        "egrep 'oxp|oxi' | ",
        "egrep -v 'none|crucible' | ",
        "awk '$4 < (300 * (1024^3)) { any=1; print } END { exit !any }' ",
        "&& echo 'WARN: Found low-space datasets' ",
        "|| echo 'OK: No low-space datasets found'",
    );

    let mut shell = Command::new("bash");
    shell.env_clear();
    shell.arg("-c").arg(SCRIPT);
    shell
}

/// Builds a `bash -c` command that checks the reported system memory.
///
/// The script filters the output of `prtconf -m`, discarding lines that
/// contain `1036271` or `2084847`. Any remaining lines are printed followed
/// by a `WARN:` line; if nothing remains, an `OK:` line is printed instead.
/// The command runs with a cleared environment.
pub fn dimm_check() -> Command {
    let script = concat!(
        "prtconf -m | ",
        "grep -v -e 1036271 -e 2084847 ",
        "&& echo 'WARN: Unexpected quantity of system memory' ",
        "|| echo 'OK: Found expected quantity of system memory'",
    );

    let mut shell = Command::new("bash");
    shell.env_clear();
    shell.args(["-c", script]);
    shell
}

pub fn zfs_list() -> Command {
let mut cmd = std::process::Command::new(PFEXEC);
cmd.env_clear()
Expand Down
Loading