From cc9fb875922f367750487c70fb454f37a71b8263 Mon Sep 17 00:00:00 2001
From: Min Badar <badmin@amazon.com>
Date: Fri, 15 May 2026 22:55:19 -0700
Subject: [PATCH] feat(sagemaker-ai): add HyperPod debugging skills

Add new skills for diagnosing and troubleshooting HyperPod clusters:
- hyperpod-cluster-debugger: cluster-wide diagnostics
- hyperpod-nccl: NCCL failure diagnosis
- hyperpod-node-debugger: per-node issue triage
- hyperpod-performance-debugger: performance bottleneck analysis
- hyperpod-slurm-debugger: Slurm scheduler issues

Also updates hyperpod-ssm, hyperpod-version-checker, and
hyperpod-issue-report with related improvements.

Updates README with new skill documentation.
---
 plugins/sagemaker-ai/README.md                |   45 +-
 .../skills/hyperpod-cluster-debugger/SKILL.md |  198 ++
 .../references/capacity-planning.md           |  124 +
 .../references/cloudformation-errors.md       |   84 +
 .../references/cluster-diagnostics-detail.md  |  463 +++
 .../references/cluster-operations.md          |  270 ++
 .../references/iam-permissions.md             |   40 +
 .../references/lifecycle-scripts.md           |  111 +
 .../scripts/diagnose-cluster.sh               | 1621 +++++++++++
 .../references/troubleshooting.md             |    2 +-
 .../skills/hyperpod-nccl/SKILL.md             |  187 ++
 .../references/debugging-guide.md             | 1011 +++++++
 .../references/error-patterns-quick-ref.md    |   47 +
 .../hyperpod-nccl/references/operations.md    |  393 +++
 .../references/performance-testing.md         |  247 ++
 .../hyperpod-nccl/scripts/nccl-diagnose.sh    | 2563 +++++++++++++++++
 .../skills/hyperpod-node-debugger/SKILL.md    |  269 ++
 .../references/node-diagnostics-detail.md     | 1074 +++++++
 .../references/node-issue-catalog.md          |  141 +
 .../scripts/check-efa-sg.sh                   |  355 +++
 .../scripts/check-node-reachability.sh        |  389 +++
 .../scripts/check-vpc-config.sh               |  508 ++++
 .../scripts/triage-cluster.sh                 | 1258 ++++++++
 .../hyperpod-performance-debugger/SKILL.md    |  185 ++
 .../references/perf-details.md                |  202 ++
 .../scripts/perf-snapshot.sh                  |  666 +++++
 .../skills/hyperpod-slurm-debugger/SKILL.md   |  243 ++
 .../references/slurm-details.md               |  318 ++
 .../scripts/slurm-diagnose.sh                 |  802 ++++++
 .../sagemaker-ai/skills/hyperpod-ssm/SKILL.md |   14 +-
 .../skills/hyperpod-ssm/scripts/ssm-exec.sh   |   28 +-
 .../scripts/hyperpod_check_versions.sh        |   10 +-
 32 files changed, 13840 insertions(+), 28 deletions(-)
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/SKILL.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/capacity-planning.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cloudformation-errors.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-operations.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/iam-permissions.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/lifecycle-scripts.md
 create mode 100755 plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/scripts/diagnose-cluster.sh
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-nccl/SKILL.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-nccl/references/debugging-guide.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-nccl/references/error-patterns-quick-ref.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-nccl/references/operations.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-nccl/references/performance-testing.md
 create mode 100755 plugins/sagemaker-ai/skills/hyperpod-nccl/scripts/nccl-diagnose.sh
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-node-debugger/SKILL.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-node-debugger/references/node-diagnostics-detail.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-node-debugger/references/node-issue-catalog.md
 create mode 100755 plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-efa-sg.sh
 create mode 100755 plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-node-reachability.sh
 create mode 100755 plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-vpc-config.sh
 create mode 100755 plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/triage-cluster.sh
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-performance-debugger/SKILL.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-performance-debugger/references/perf-details.md
 create mode 100755 plugins/sagemaker-ai/skills/hyperpod-performance-debugger/scripts/perf-snapshot.sh
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/SKILL.md
 create mode 100644 plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/references/slurm-details.md
 create mode 100755 plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/scripts/slurm-diagnose.sh

diff --git a/plugins/sagemaker-ai/README.md b/plugins/sagemaker-ai/README.md
index 764821ff..c867fcb5 100644
--- a/plugins/sagemaker-ai/README.md
+++ b/plugins/sagemaker-ai/README.md
@@ -3,24 +3,29 @@
 This plugin brings deep AWS AI/ML expertise directly into your coding assistant, covering the surface area of [Amazon SageMaker AI](https://aws.amazon.com/sagemaker/ai/); currently, skills are provided to assist with the following capability areas:
 
 - **Model Customization** — End-to-end guided workflows for fine-tuning foundation models, from use case definition through data preparation, training, evaluation, and deployment on Amazon SageMaker AI.
-- **HyperPod Cluster Operations** — Remote command execution on nodes via SSM, version checking, and diagnostic reporting for SageMaker HyperPod training clusters.
+- **HyperPod Cluster Operations** — Remote command execution on nodes via SSM, version checking, diagnostic reporting, and deep debugging for SageMaker HyperPod training clusters.
 
 ## Agent Skills
 
-| #  | Skill                      | Description                                                                                                              | Documentation                                        |
-| -- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------- |
-| 1  | `planning`                 | Builds a dynamic, step-by-step plan tailored to your intents                                                             | [SKILL.md](skills/planning/SKILL.md)                 |
-| 2  | `directory-management`     | Manages project directory setup, artifact organization, and plan association for new or existing projects                | [SKILL.md](skills/directory-management/SKILL.md)     |
-| 3  | `use-case-specification`   | Guided, conversational process to define your model customization use case goals, key stakeholders, and success criteria | [SKILL.md](skills/use-case-specification/SKILL.md)   |
-| 4  | `dataset-evaluation`       | Dataset quality validation, format detection, and data requirements analysis                                             | [SKILL.md](skills/dataset-evaluation/SKILL.md)       |
-| 5  | `dataset-transformation`   | Dataset format conversion and preparation for SageMaker-compatible training formats                                      | [SKILL.md](skills/dataset-transformation/SKILL.md)   |
-| 6  | `finetuning-setup`         | Fine-tuning technique selection (SFT, DPO, RLVR, etc.) and base model selection                                          | [SKILL.md](skills/finetuning-setup/SKILL.md)         |
-| 7  | `finetuning`               | Hyperparameter configuration and training job execution                                                                  | [SKILL.md](skills/finetuning/SKILL.md)               |
-| 8  | `model-evaluation`         | Evaluation design, benchmark selection, LLM-as-a-judge, and model comparison                                             | [SKILL.md](skills/model-evaluation/SKILL.md)         |
-| 9  | `model-deployment`         | Deployment configuration and endpoint setup (SageMaker or Bedrock)                                                       | [SKILL.md](skills/model-deployment/SKILL.md)         |
-| 10 | `hyperpod-ssm`             | Remote command execution and file transfer on HyperPod cluster nodes via SSM                                             | [SKILL.md](skills/hyperpod-ssm/SKILL.md)             |
-| 11 | `hyperpod-version-checker` | Check and compare software component versions across HyperPod cluster nodes                                              | [SKILL.md](skills/hyperpod-version-checker/SKILL.md) |
-| 12 | `hyperpod-issue-report`    | Generate diagnostic reports for HyperPod troubleshooting and support cases                                               | [SKILL.md](skills/hyperpod-issue-report/SKILL.md)    |
+| #  | Skill                           | Description                                                                                                              | Documentation                                             |
+| -- | ------------------------------- | ------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------- |
+| 1  | `planning`                      | Builds a dynamic, step-by-step plan tailored to your intents                                                             | [SKILL.md](skills/planning/SKILL.md)                      |
+| 2  | `directory-management`          | Manages project directory setup, artifact organization, and plan association for new or existing projects                | [SKILL.md](skills/directory-management/SKILL.md)          |
+| 3  | `use-case-specification`        | Guided, conversational process to define your model customization use case goals, key stakeholders, and success criteria | [SKILL.md](skills/use-case-specification/SKILL.md)        |
+| 4  | `dataset-evaluation`            | Dataset quality validation, format detection, and data requirements analysis                                             | [SKILL.md](skills/dataset-evaluation/SKILL.md)            |
+| 5  | `dataset-transformation`        | Dataset format conversion and preparation for SageMaker-compatible training formats                                      | [SKILL.md](skills/dataset-transformation/SKILL.md)        |
+| 6  | `finetuning-setup`              | Fine-tuning technique selection (SFT, DPO, RLVR, etc.) and base model selection                                          | [SKILL.md](skills/finetuning-setup/SKILL.md)              |
+| 7  | `finetuning`                    | Hyperparameter configuration and training job execution                                                                  | [SKILL.md](skills/finetuning/SKILL.md)                    |
+| 8  | `model-evaluation`              | Evaluation design, benchmark selection, LLM-as-a-judge, and model comparison                                             | [SKILL.md](skills/model-evaluation/SKILL.md)              |
+| 9  | `model-deployment`              | Deployment configuration and endpoint setup (SageMaker or Bedrock)                                                       | [SKILL.md](skills/model-deployment/SKILL.md)              |
+| 10 | `hyperpod-ssm`                  | Remote command execution and file transfer on HyperPod cluster nodes via SSM                                             | [SKILL.md](skills/hyperpod-ssm/SKILL.md)                  |
+| 11 | `hyperpod-version-checker`      | Check and compare software component versions across HyperPod cluster nodes                                              | [SKILL.md](skills/hyperpod-version-checker/SKILL.md)      |
+| 12 | `hyperpod-issue-report`         | Generate diagnostic reports for HyperPod troubleshooting and support cases                                               | [SKILL.md](skills/hyperpod-issue-report/SKILL.md)         |
+| 13 | `hyperpod-cluster-debugger`     | Diagnose cluster-wide HyperPod problems — creation failures, EFA health, lifecycle scripts, capacity                     | [SKILL.md](skills/hyperpod-cluster-debugger/SKILL.md)     |
+| 14 | `hyperpod-nccl`                 | Diagnose NCCL failures — training hangs, AllReduce timeouts, EFA errors, rendezvous failures                             | [SKILL.md](skills/hyperpod-nccl/SKILL.md)                 |
+| 15 | `hyperpod-node-debugger`        | Diagnose per-node issues — GPU hardware, EFA, disk/memory pressure, container runtime                                    | [SKILL.md](skills/hyperpod-node-debugger/SKILL.md)        |
+| 16 | `hyperpod-performance-debugger` | Diagnose performance issues — uneven NCCL bandwidth, filesystem throughput, straggler nodes                              | [SKILL.md](skills/hyperpod-performance-debugger/SKILL.md) |
+| 17 | `hyperpod-slurm-debugger`       | Diagnose Slurm scheduler issues — nodes stuck down/drain, jobs pending, GRES miscounts, auto-resume                      | [SKILL.md](skills/hyperpod-slurm-debugger/SKILL.md)       |
 
 ## MCP Servers
 
@@ -99,12 +104,22 @@ The HyperPod skills provide operational tooling for Amazon SageMaker HyperPod AI
 - **`hyperpod-ssm`** — Run commands and transfer files on cluster nodes via AWS Systems Manager (SSM), without needing direct SSH access.
 - **`hyperpod-version-checker`** — Check and compare software component versions (drivers, libraries, frameworks) across cluster nodes to identify drift or incompatibilities.
 - **`hyperpod-issue-report`** — Generate comprehensive issue reports that collect system state, logs, and configuration details for troubleshooting or support case submission.
+- **`hyperpod-cluster-debugger`** — Diagnose cluster-wide problems including creation/deployment failures, EFA health checks, lifecycle script errors, and capacity issues.
+- **`hyperpod-nccl`** — Diagnose NCCL failures and training-pod issues such as AllReduce timeouts, EFA/libfabric errors, rendezvous failures, and container OOM.
+- **`hyperpod-node-debugger`** — Diagnose per-node issues including GPU hardware faults (XID, ECC, NVLink), EFA, disk/memory pressure, and container runtime problems.
+- **`hyperpod-performance-debugger`** — Diagnose performance bottlenecks such as uneven NCCL bandwidth across nodes, filesystem throughput issues, and straggler nodes.
+- **`hyperpod-slurm-debugger`** — Diagnose Slurm scheduler and node-daemon issues including nodes stuck in down/drain, jobs pending, GRES miscounts, and auto-resume failures.
 
 ### Examples
 
 - "Check the GPU memory usage on all nodes in my HyperPod cluster using SSM"
 - "Check driver versions on my HyperPod cluster"
 - "Generate an issue report for my HyperPod cluster"
+- "My HyperPod cluster creation failed, help me debug it"
+- "Training is hanging with NCCL timeout errors"
+- "A node in my cluster is unhealthy, diagnose it"
+- "My training is slower than expected across nodes"
+- "Slurm jobs are stuck pending even though nodes show idle"
 
 ## Supported Environments
 
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/SKILL.md
new file mode 100644
index 00000000..bba49c5a
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/SKILL.md
@@ -0,0 +1,198 @@
+---
+name: hyperpod-cluster-debugger
+description: Diagnose and remediate cluster-wide HyperPod (EKS or Slurm) problems — creation / deployment failures (CloudFormation, EFA health check, lifecycle scripts, capacity), EKS access, node replacement, CloudFormation nested-stack errors, post-maintenance rollback state, dangling nodes, autoscaler conflicts. Includes `--validate` pre-flight. Read-only.
+metadata:
+  version: "0.0.1"
+---
+
+# HyperPod Cluster Debugger
+
+**Operating policy.** Run read-only diagnostics yourself. Never run a command that changes cluster, node, or workload state — present each one as a **Suggested command (run this yourself)** block and wait for the customer to run it. Escalation order, least to most destructive: **investigate → reboot → replace** (replace destroys root + secondary volumes; not supported on Slurm controller nodes).
+
+**Before any state-changing CLI: ask if it's IaC-managed.** HyperPod clusters, SGs, EKS access entries, and IAM are usually provisioned via CloudFormation / CDK / Terraform. If yes, the fix belongs in IaC — a manual CLI change causes configuration drift, and the next deploy reverts it. Use the CLI only when IaC is unavailable (locked out, predates IaC, mid-review).
+
+`scripts/diagnose-cluster.sh` is read-only: it collects state via AWS APIs (and SSM for Slurm controller health) and prints each issue as `[FAIL] ... → references/<file>.md § <section>`.
+
+| Reference                                                                 | Open when                                                           |
+| ------------------------------------------------------------------------- | ------------------------------------------------------------------- |
+| [cluster-diagnostics-detail.md](references/cluster-diagnostics-detail.md) | Per-finding remediation runbook (§ A–L)                             |
+| [cluster-operations.md](references/cluster-operations.md)                 | Operational deep-dives (EFA SG, EKS access, SSM, Slurm, filesystem) |
+| [cloudformation-errors.md](references/cloudformation-errors.md)           | § H needs the full per-resource CFN error catalog                   |
+| [capacity-planning.md](references/capacity-planning.md)                   | § B or `--validate` flags capacity / subnet sizing                  |
+| [lifecycle-scripts.md](references/lifecycle-scripts.md)                   | § C points at a specific lifecycle failure                          |
+| [iam-permissions.md](references/iam-permissions.md)                       | Full IAM policy for the diagnostic                                  |
+
+---
+
+## Workflow
+
+1. Collect HyperPod cluster name (not EKS name), region, exact error string.
+2. Run `scripts/diagnose-cluster.sh` (or `--validate` for pre-create).
+3. For every `[FAIL]` line, `Read` the referenced section.
+4. Present finding, root cause, and the Suggested-command block verbatim. Wait for customer approval.
+5. Re-run the diagnostic to confirm.
+
+---
+
+## Step 1: Run diagnostics
+
+```bash
+# Diagnose an existing cluster:
+bash scripts/diagnose-cluster.sh --cluster <CLUSTER_NAME_OR_ARN> --region <REGION>
+
+# Pre-flight (no cluster needed) — validates SGs, subnets, IAM, VPC endpoints,
+# optionally S3 lifecycle scripts and per-AZ capacity:
+bash scripts/diagnose-cluster.sh --validate --region <REGION> \
+  --sg-ids <sg-1,sg-2> --subnet-ids <sub-1,sub-2> [--iam-role <role-arn>] \
+  [--s3-uri s3://<BUCKET>/path/] [--instance-type ml.p5.48xlarge]
+```
+
+Pass `--instance-type` when the target instance type is known — enables the per-AZ capacity check (warns if none of the provided subnets are in an AZ that offers that type, which causes insufficient-capacity failures at creation time).
+
+Tags: `[PASS]` · `[FAIL]` (counted, has `→ references/...` pointer) · `[WARN]` · `[INFO]`. Priorities: **P0** blocks operation · **P1** degraded · **P2** informational.
+
+---
+
+## Step 2: Match signal → section
+
+**Error messages / events:**
+
+| Signal                                                                       | Section                                                        |
+| ---------------------------------------------------------------------------- | -------------------------------------------------------------- |
+| `"EFA health checks did not run successfully"` (public-doc verbatim signal)  | **[A: EFA Health Checks](#a-efa-health-checks)**               |
+| Insufficient-capacity or AZ-mismatch failure at creation                     | **[B: Capacity & AZ](#b-capacity--az)**                        |
+| Lifecycle-script failure or timeout during provisioning                      | **[C: Lifecycle Scripts](#c-lifecycle-scripts)**               |
+| kubectl auth error (server asks for credentials / no API group list)         | **[D: EKS Access](#d-eks-access--kubectl)**                    |
+| `InService` but not all instances visible                                    | **[E: Cluster Provisioning](#e-cluster-provisioning)**         |
+| `"Target is not connected"` / SSM errors                                     | **[F: SSM Connectivity](#f-ssm-connectivity)**                 |
+| Node replacement not happening / `batch-replace` not working                 | **[G: Node Replacement](#g-node-replacement)**                 |
+| `"Embedded stack failed"` / any CloudFormation error                         | **[H: CloudFormation Errors](#h-cloudformation-errors)**       |
+| `UpdateClusterSoftware` failed or cluster in post-maintenance rollback state | **[J: AMI & Cluster Updates](#j-ami--cluster-updates)**        |
+| Dangling / orphaned nodes in EKS vs `list-cluster-nodes`                     | **[K: Dangling Nodes & Cleanup](#k-dangling-nodes--cleanup)**  |
+| Cluster Autoscaler breaks after HyperPod attached                            | **[L: Autoscaler Compatibility](#l-autoscaler-compatibility)** |
+| Slow I/O, FSx throughput saturated                                           | [cluster-operations.md § 9](references/cluster-operations.md)  |
+| Slurm node name → instance ID lookup                                         | **[I: Utilities](#i-utilities)**                               |
+
+---
+
+## A: EFA Health Checks
+
+SG missing self-reference. Add inbound + outbound self-ref to every SG on the cluster, plus least-privilege egress for the AWS APIs the node needs (HTTPS 443 to S3 / ECR / SageMaker / SSM / STS / CloudWatch Logs — via VPC-endpoint prefix-lists when possible). Full procedure: [cluster-diagnostics-detail.md § A](references/cluster-diagnostics-detail.md#a-efa-health-checks).
+
+## B: Capacity & AZ
+
+Instance type unavailable in the requested AZ. Verify with `describe-instance-type-offerings`, then change AZ, use Flexible Training Plans, or request ODCR. Full: [§ B](references/cluster-diagnostics-detail.md#b-capacity--az) · strategy: [capacity-planning.md](references/capacity-planning.md).
+
+## C: Lifecycle Scripts
+
+Script failed or timed out during provisioning. Read CloudWatch under `/aws/sagemaker/Clusters/<name>/<id>` — common causes: missing S3 VPC endpoint, IAM gap, CRLF line endings, instance-group name mismatch. Full: [§ C](references/cluster-diagnostics-detail.md#c-lifecycle-scripts) · layout: [lifecycle-scripts.md](references/lifecycle-scripts.md).
+
+## D: EKS Access / kubectl
+
+IAM identity not in EKS access entries. Verify with `sts get-caller-identity`, create an access entry with admin policy, update kubeconfig. Full: [§ D](references/cluster-diagnostics-detail.md#d-eks-access--kubectl).
+
+## E: Cluster Provisioning
+
+`InService` without all instances is expected under Continuous Provisioning — failures surface as events, not cluster errors. For stuck `Creating`/`Updating`/`Deleting`: check CFN nested stacks (§ H), IAM, capacity, events; if stuck `Deleting` check VPC ENI dependencies. Full: [§ E](references/cluster-diagnostics-detail.md#e-cluster-provisioning).
+
+## F: SSM Connectivity
+
+`Target is not connected`: use `sagemaker-cluster:<CLUSTER_ID>_<GROUP>-<INSTANCE_ID>` format (not raw EC2 ID), install session-manager-plugin, confirm node `Running`. Check IAM + VPC endpoints on timeouts. Full: [§ F](references/cluster-diagnostics-detail.md#f-ssm-connectivity).
+
+## G: Node Replacement
+
+Auto-repair: confirm `NodeRecovery=Automatic`, check Health Monitoring Agent (HMA) logs + node labels / Slurm reason, confirm capacity. Manual: reboot first, replace only if reboot fails. Replace requires the cluster to have been patched via `UpdateClusterSoftware` at least once and cannot target a Slurm controller node. Full: [§ G](references/cluster-diagnostics-detail.md#g-node-replacement).
+
+## H: CloudFormation Errors
+
+`Embedded stack failed` hides the real error. Drill into nested stacks via Events tab (filter Failed) until you reach a non-stack resource. CLI: ``describe-stack-events --query 'StackEvents[?ResourceStatus==`CREATE_FAILED`]'``. Also covers SLR creation failures and permission-boundary denials. Full: [§ H](references/cluster-diagnostics-detail.md#h-cloudformation-errors) · catalog: [cloudformation-errors.md](references/cloudformation-errors.md).
+
+## I: Utilities
+
+Map Slurm node names (`ip-10-x-y-z`) to HyperPod instance IDs via `list-cluster-nodes` or on-node `/opt/ml/config/resource_config.json`. Full: [§ I](references/cluster-diagnostics-detail.md#i-utilities).
+
+## J: AMI & Cluster Updates
+
+`UpdateClusterSoftware` fails and rolls back, or the cluster stays in a post-maintenance rollback state. Common causes: lifecycle script incompatible with new AMI, HMA version too old, insufficient rolling-update capacity. If the cluster has active nodes, collect diagnostics and escalate rather than delete-and-recreate. Full: [§ J](references/cluster-diagnostics-detail.md#j-ami--cluster-updates).
+
+## K: Dangling Nodes & Cleanup
+
+Nodes in `kubectl get nodes` but not in `list-cluster-nodes` (ghost EKS nodes), or the inverse (HyperPod nodes that never registered kubelet). Script flags both. Full: [§ K](references/cluster-diagnostics-detail.md#k-dangling-nodes--cleanup).
+
+## L: Autoscaler Compatibility
+
+Cluster Autoscaler errors on HyperPod provider IDs and breaks autoscaling for all node groups. No officially endorsed workaround — escalate to AWS Support. Karpenter does not conflict with HyperPod nodes by default. Full: [§ L](references/cluster-diagnostics-detail.md#l-autoscaler-compatibility).
+
+---
+
+## Prerequisites
+
+- `aws` CLI v2.13+ authenticated to the cluster's account
+- `jq`, `python3`, `bash` 4.2+
+- `kubectl` authenticated to the EKS cluster (EKS checks skipped if absent)
+- `session-manager-plugin` (Slurm controller health checks only)
+
+IAM policy: [references/iam-permissions.md](references/iam-permissions.md).
+
+## Defaults
+
+- **Region** — required: pass `--region` or set `$AWS_DEFAULT_REGION`.
+- **Mode** — `--cluster <NAME>` (diagnose) or `--validate` (pre-create).
+- **Event window** — up to 500 most recent events (5 × 100, paginated).
+- **Colors** — auto-disabled on non-TTY; `--no-color` to force off.
+
+## Error handling
+
+| Failure                                             | Script                                                     | Tell the customer                                     |
+| --------------------------------------------------- | ---------------------------------------------------------- | ----------------------------------------------------- |
+| `aws sts get-caller-identity` fails                 | Exit 1                                                     | "Fix AWS credentials and rerun."                      |
+| Cluster not found                                   | Exit 1 after listing region's clusters                     | "Confirm HyperPod cluster name (not EKS) and region." |
+| `sagemaker:*` / `ec2:*` / `eks:*` / `logs:*` denied | Warn, add `Missing IAM permission for <API>`, continue     | "Grant the listed IAM action and rerun."              |
+| `kubectl` absent or unauthenticated                 | Skip EKS checks (access entries, add-ons, aws-auth, nodes) | "Install/authenticate kubectl."                       |
+| `session-manager-plugin` absent (Slurm)             | Skip Slurm controller probe                                | "Install session-manager-plugin."                     |
+| SSM throttled / times out (180s)                    | Retry with backoff; warn and continue if still failing     | "Rerun later — script is idempotent."                 |
+| CloudWatch log group not found                      | Skip CloudWatch check                                      | "CloudWatch not configured on this cluster."          |
+
+Exit codes: `0` no critical failures · `1` one or more critical failures (cluster not found, fatal prerequisite missing, or any `[FAIL]` in diagnose or `--validate` mode). `[WARN]` lines do not affect the exit code.
+
+## Skill delegation
+
+| Need                            | Use                        |
+| ------------------------------- | -------------------------- |
+| Shell on nodes                  | `hyperpod-ssm`             |
+| Version comparison across nodes | `hyperpod-version-checker` |
+
+## Escalate to AWS Support
+
+Escalate when:
+
+1. EFA health checks fail despite correct SG rules.
+2. Capacity errors persist despite a valid Flexible Training Plan / ODCR.
+3. Node replacement fails repeatedly without clear events / log signal.
+4. Cluster stuck in a non-terminal state (`Creating`, `Updating`, or a post-maintenance rollback state) for an extended period.
+5. CloudFormation root-cause is an internal service error.
+
+### Before opening the case
+
+Run these commands and attach the output. Goal: AWS Support has everything at case open.
+
+```bash
+# 1. Cluster identity + status (confirms region, ARN, orchestrator, instance groups)
+aws sagemaker describe-cluster --cluster-name <CLUSTER> --region <REGION>
+
+# 2. Full cluster-level diagnostic bundle
+bash scripts/diagnose-cluster.sh --cluster <CLUSTER> --region <REGION> > diag.txt
+
+# 3. Per-node log/config bundle to S3 (delegates to hyperpod-issue-report skill)
+#    See skills/hyperpod-issue-report/SKILL.md for the exact invocation.
+```
+
+### Include in the case
+
+- Cluster name + ARN (or `ClusterId` suffix) and AWS region
+- `ClusterStatus` + `FailureMessage` from `describe-cluster`
+- Timestamp window (UTC start / end) of the failure
+- Exact error strings observed (copy verbatim from events / logs / console)
+- Affected instance IDs / `NodeLogicalId`s / instance group names
+- `diag.txt` from step 2 above
+- S3 URI of the `hyperpod-issue-report` bundle from step 3
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/capacity-planning.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/capacity-planning.md
new file mode 100644
index 00000000..0c19592d
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/capacity-planning.md
@@ -0,0 +1,124 @@
+# Capacity Planning
+
+Companion to [SKILL.md](../SKILL.md) § B and `--validate`. Capacity errors are one of the most common creation failures.
+
+---
+
+## Capacity options
+
+### On-demand
+
+Fine for small instance types and short experiments. **Not guaranteed** for large GPU types (p4d, p5, p5e, trn1, trn2). No physical-proximity guarantees — sub-optimal for distributed training.
+
+```bash
+# Which AZs have this instance type. The EC2 API uses bare instance-type
+# names, so strip the SageMaker `ml.` prefix before filtering.
+aws ec2 describe-instance-type-offerings \
+  --location-type availability-zone \
+  --filters "Name=instance-type,Values=p5.48xlarge" \
+  --region us-west-2 \
+  --query 'InstanceTypeOfferings[*].Location' --output table
+```
+
+### Flexible Training Plans
+
+Guaranteed capacity for a reserved period, discounted pricing, co-located instances. Requires advance planning.
+
+```bash
+aws sagemaker list-training-plans \
+  --filters Name=Status,Value=Active \
+  --region <REGION> \
+  --query 'TrainingPlanSummaries[*].{Name:TrainingPlanName,Type:InstanceType,Count:TotalInstanceCount,AZ:AvailabilityZone,Status:Status,Start:StartTime,End:EndTime}' \
+  --output table
+```
+
+Use in cluster config:
+
+```bash
+aws sagemaker create-cluster \
+  --cluster-name my-cluster \
+  --instance-groups '[{
+    "InstanceGroupName": "gpu-workers",
+    "InstanceType": "ml.p5.48xlarge",
+    "InstanceCount": 4,
+    "ExecutionRole": "arn:aws:iam::<ACCT>:role/HyperPodRole",
+    "TrainingPlanArn": "arn:aws:sagemaker:<REGION>:<ACCT>:training-plan/<PLAN_NAME>",
+    "LifeCycleConfig": {"SourceS3Uri": "s3://sagemaker-lifecycle-<guid>/", "OnCreate": "on_create.sh"}
+  }]' \
+  --vpc-config '{"SecurityGroupIds":["sg-xxx"],"Subnets":["subnet-xxx"]}' \
+  --region <REGION>
+```
+
+**Critical:** the subnet must be in the **same AZ** as the training plan's `AvailabilityZone`.
+
+### Reserved capacity (via account team)
+
+For large or long-term capacity. Contact the AWS account team — customized placement and pricing, longer lead time.
+
+---
+
+## AZ selection
+
+Instance-type availability varies by AZ, and AZ names (`us-west-2a`) map to different physical zones per account. When coordinating with AWS Support or the account team about reserved capacity, use **AZ IDs** (`usw2-az1`), not AZ names — they're consistent across accounts.
+
+```bash
+# AZ name → ID:
+aws ec2 describe-availability-zones --region <REGION> \
+  --query 'AvailabilityZones[*].{Name:ZoneName,ID:ZoneId,State:State}' --output table
+
+# Your subnet's AZ:
+aws ec2 describe-subnets --subnet-ids <SUBNET> --region <REGION> \
+  --query 'Subnets[0].{AZ:AvailabilityZone,AZ_ID:AvailabilityZoneId}'
+
+# Instance-type offerings by AZ-ID:
+aws ec2 describe-instance-type-offerings \
+  --location-type availability-zone-id \
+  --filters "Name=instance-type,Values=<TYPE>" \
+  --region <REGION> \
+  --query 'InstanceTypeOfferings[*].Location'
+```
+
+If your subnet's AZ doesn't appear in the offerings list, create a new subnet in an AZ that does.
+
+---
+
+## Service quotas
+
+Check `ml.<type> for cluster usage` quotas before creating a cluster. EKS on HyperPod also consumes ENIs and subnet IPs — size subnets generously; CIDRs cannot be changed after creation.
+
+```bash
+# SageMaker HyperPod quotas:
+aws service-quotas list-service-quotas \
+  --service-code sagemaker --region <REGION> \
+  --query 'Quotas[?contains(QuotaName,`cluster`) || contains(QuotaName,`HyperPod`)].{Name:QuotaName,Value:Value,Code:QuotaCode}' \
+  --output table
+
+# Subnet free IPs:
+aws ec2 describe-subnets --subnet-ids <SUBNET> --region <REGION> \
+  --query 'Subnets[0].{CIDR:CidrBlock,FreeIPs:AvailableIpAddressCount}'
+```
+
+Request quota increases proactively — processing time varies by quota and region.
+
+---
+
+## Troubleshooting
+
+### `Insufficient capacity`
+
+1. Check which AZs have the instance type (commands above)
+2. Verify your subnet is in one of those AZs
+3. If no AZ has capacity: try a different region/type or contact account team
+4. Using a Training Plan: verify `TrainingPlanArn` and that the subnet AZ matches the plan AZ
+
+### `No subnets in the capacity AZ`
+
+Cluster specifies subnets, but none are in the AZ where AWS has capacity. Create a subnet in that AZ and add it to the cluster config.
+
+### Stuck in `Creating` with no events
+
+Likely waiting for capacity. Check `list-cluster-events`; if no events after >1 hour, contact AWS Support.
+
+### Partial provisioning
+
+Capacity was available for some instances but not all. With `NodeProvisioningMode=Continuous` the cluster keeps retrying. Check events for the failing instance group; consider reducing `InstanceCount` or using `MinInstanceCount` for elastic scaling.
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cloudformation-errors.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cloudformation-errors.md
new file mode 100644
index 00000000..d22a0a13
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cloudformation-errors.md
@@ -0,0 +1,84 @@
+# CloudFormation Error Reference
+
+Deep-dive companion to [SKILL.md](../SKILL.md) § H. HyperPod console deployments create nested CloudFormation stacks; the root-cause error is typically in a nested stack's leaf resource.
+
+---
+
+## Navigate to the real failure
+
+1. CloudFormation console → correct region → find the failed HyperPod stack (`CREATE_FAILED` or `ROLLBACK_COMPLETE`)
+2. **Events tab** → filter by `CREATE_FAILED` → note the earliest failure
+3. **Resources tab** → find `AWS::CloudFormation::Stack` entries with `CREATE_FAILED`
+4. Click the Physical ID → opens the nested stack
+5. Repeat until you reach a stack with only leaf resources
+6. The **Status reason** on the failed leaf resource is the root cause
+
+CLI alternative (per stack — nested stacks need to be iterated):
+
+```bash
+aws cloudformation describe-stack-events --stack-name <STACK> --region <REGION> \
+  --query 'StackEvents[?ResourceStatus==`CREATE_FAILED`].{Time:Timestamp,Resource:LogicalResourceId,Type:ResourceType,Reason:ResourceStatusReason}' \
+  --output table
+```
+
+---
+
+## Resource error catalog
+
+### AWS::SageMaker::Cluster
+
+| Status reason                                      | Root cause                             | Fix                                                                 |
+| -------------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------- |
+| `Insufficient capacity in the Availability Zone`   | No on-demand instances available in AZ | Different AZ, Flexible Training Plans, or reserved capacity         |
+| `No subnets in the capacity AZ`                    | Cluster subnet not in capacity AZ      | Create subnet in the AZ where instances are available               |
+| `EFA health checks did not run successfully`       | SG missing self-referencing rules      | Add inbound + outbound self-ref rules (protocol: All, source: self) |
+| `Lifecycle scripts did not run successfully`       | Script error, S3 access, or timeout    | Check CloudWatch: `/aws/sagemaker/Clusters/<name>/<id>`             |
+| `The security group 'sg-xxx' does not exist`       | Wrong SG ID or different region        | Verify SG exists in same region and VPC                             |
+| `The subnet 'subnet-xxx' does not exist`           | Wrong subnet ID or different region    | Verify subnet exists in same region                                 |
+| `You are not authorized to perform this operation` | Execution role missing permissions     | Add required SageMaker + VPC permissions to the execution role      |
+
+### AWS::IAM::Role
+
+| Status reason                             | Root cause                                                           | Fix                                                          |
+| ----------------------------------------- | -------------------------------------------------------------------- | ------------------------------------------------------------ |
+| `Cannot exceed quota for PoliciesPerRole` | Managed-policy-per-role quota reached (default 10; can be increased) | Consolidate into inline policies or request a quota increase |
+| `Invalid principal in policy`             | Wrong service in trust policy                                        | Use `"Service": "sagemaker.amazonaws.com"` in trust policy   |
+| `MalformedPolicyDocument`                 | JSON syntax error                                                    | Validate JSON; check trailing commas and quotes              |
+| `EntityAlreadyExists`                     | Role name already taken                                              | Use unique name or import existing role                      |
+
+### AWS::EC2::VPC / Subnet / SecurityGroup
+
+| Status reason                                        | Root cause                                                       | Fix                                                      |
+| ---------------------------------------------------- | ---------------------------------------------------------------- | -------------------------------------------------------- |
+| `The CIDR 'x.x.x.x/y' conflicts with another subnet` | Overlapping CIDR in same VPC                                     | Use non-overlapping CIDR blocks                          |
+| `InvalidPermission.Duplicate`                        | SG rule already exists                                           | Treat as success (template idempotency)                  |
+| `RulesPerSecurityGroupLimitExceeded`                 | Per-SG rule quota reached (default 60 per direction; adjustable) | Consolidate with CIDR ranges or request a quota increase |
+
+### AWS::FSx::FileSystem
+
+| Status reason                                   | Root cause                          | Fix                                        |
+| ----------------------------------------------- | ----------------------------------- | ------------------------------------------ |
+| `The subnet is not in a supported AZ`           | FSx Lustre not available in that AZ | Use a subnet in an AZ that supports Lustre |
+| `The security group does not belong to the VPC` | SG and subnet in different VPCs     | Move SG or subnet to same VPC              |
+
+### Custom::Resource / AWS::Lambda::Function
+
+Lambda-backed custom resources fail with the underlying Lambda error. Find the function name in the Resources tab, then:
+
+```bash
+aws logs tail /aws/lambda/<FUNCTION_NAME> --region <REGION> --since 1h
+```
+
+---
+
+## Rolled-back stacks
+
+When a stack rolls back, CloudFormation deletes what it created. List the rolled-back and deleted stacks:
+
+```bash
+aws cloudformation list-stacks \
+  --stack-status-filter ROLLBACK_COMPLETE DELETE_COMPLETE \
+  --region <REGION> \
+  --query 'StackSummaries[?contains(StackName,`HyperPod`) || contains(StackName,`hyperpod`)].{Name:StackName,Status:StackStatus,Time:CreationTime}' \
+  --output table
+```
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md
new file mode 100644
index 00000000..aad0a1dc
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md
@@ -0,0 +1,463 @@
+# Cluster Diagnostics — Detailed Procedures
+
+Full diagnostic and fix procedures for each section referenced from [SKILL.md](../SKILL.md).
+
+---
+
+## A: EFA Health Checks
+
+**Signals:** `"EFA health checks did not run successfully. Ensure that your VPC and security groups are properly configured before attempting to create a new cluster."`
+
+**Root cause:** Security group missing self-referencing rules — a common cluster-creation failure.
+
+### Diagnose
+
+```bash
+bash scripts/diagnose-cluster.sh --cluster <CLUSTER> --region <REGION>
+
+# Or directly:
+SG=$(aws sagemaker describe-cluster --cluster-name <CLUSTER> --region <REGION> \
+  --query 'VpcConfig.SecurityGroupIds[0]' --output text)
+aws ec2 describe-security-groups --group-ids $SG --region <REGION> \
+  --query 'SecurityGroups[0].{Inbound:IpPermissions,Outbound:IpPermissionsEgress}' \
+  --output json
+```
+
+Look for self-referencing rules where source/destination is the SG itself.
+
+### Fix — apply to every SG on the cluster
+
+Customer-run. Apply the two self-ref rules to each SG in `describe-cluster → VpcConfig.SecurityGroupIds`, then add **least-privilege egress** for the AWS APIs the node needs to reach. Idempotent: `InvalidPermission.Duplicate` = already exists, treat as success.
+
+```bash
+SG=<security-group-id>
+REGION=<region>
+
+# Inbound self-ref (inter-node communication, EFA)
+aws ec2 authorize-security-group-ingress --group-id $SG --region $REGION \
+  --ip-permissions '[{"IpProtocol":"-1","UserIdGroupPairs":[{"GroupId":"'"$SG"'"}]}]'
+
+# Outbound self-ref (EFA RDMA)
+aws ec2 authorize-security-group-egress --group-id $SG --region $REGION \
+  --ip-permissions '[{"IpProtocol":"-1","UserIdGroupPairs":[{"GroupId":"'"$SG"'"}]}]'
+```
+
+**Egress for AWS APIs.** The node needs HTTPS (443) outbound to reach the AWS services HyperPod uses: S3 (lifecycle scripts), ECR (container images), SageMaker (HyperPod control plane), SSM / SSMMessages / EC2Messages (Session Manager), STS, and CloudWatch Logs. The narrowest practical rule is **TCP 443 to the VPC-endpoint prefix-lists** for those services (`com.amazonaws.<region>.<service>` resolves to a `pl-XXXXXXXX` ID via `aws ec2 describe-prefix-lists`), referenced in `authorize-security-group-egress --ip-permissions` as `PrefixListIds`. See the AWS docs on [VPC endpoint prefix lists](https://docs.aws.amazon.com/vpc/latest/privatelink/vpce-gateway.html#vpc-endpoints-security) for the exact CLI shape. `aws ec2 describe-vpc-endpoints` lists which services the cluster VPC already has endpoints for.
+
+Self-ref opens all ports between instances in this SG (intended for intra-cluster EFA). For multi-SG clusters see [cluster-operations.md § 1](cluster-operations.md#1-efa-security-group-multi-sg-clusters).
+
+---
+
+## B: Capacity & AZ
+
+**Signals:** `"We currently do not have sufficient capacity in the Availability Zone you requested"` (public doc); also seen: subnets not in the AZ where capacity is available.
+
+```bash
+aws ec2 describe-instance-type-offerings \
+  --location-type availability-zone \
+  --filters "Name=instance-type,Values=<INSTANCE_TYPE>" \
+  --region <REGION> \
+  --query 'InstanceTypeOfferings[*].Location' --output table
+```
+
+Fix: add subnet in an AZ where the type is available, or use Flexible Training Plans / ODCR. Full strategy: [capacity-planning.md](capacity-planning.md).
+
+---
+
+## C: Lifecycle Scripts
+
+**Signals:** cluster-creation event indicates lifecycle script execution error or timeout; creation fails during provisioning.
+
+```bash
+CLUSTER_ID=$(aws sagemaker describe-cluster --cluster-name <CLUSTER> --region <REGION> \
+  --query 'ClusterArn' --output text | cut -d/ -f2)
+LOG_GROUP="/aws/sagemaker/Clusters/<CLUSTER>/${CLUSTER_ID}"
+
+aws logs describe-log-streams --log-group-name "$LOG_GROUP" --region <REGION> \
+  --query 'logStreams[?starts_with(logStreamName,`LifecycleConfig`)].logStreamName' --output table
+
+aws logs get-log-events --log-group-name "$LOG_GROUP" \
+  --log-stream-name "LifecycleConfig/<group-name>/<instance-id>" \
+  --region <REGION> --query 'events[*].message' --output text
+```
+
+| Log error                                | Fix                                                         |
+| ---------------------------------------- | ----------------------------------------------------------- |
+| `Connect timeout on endpoint URL: s3://` | Add S3 Gateway VPC endpoint to subnet route table           |
+| `AccessDenied` on S3                     | Add `s3:GetObject` + `s3:ListBucket` to execution role      |
+| Script never exits / timeout             | Add `set -euo pipefail`; test locally; add network timeouts |
+| `ASCII text, with CRLF line terminators` | `dos2unix script.sh` before uploading                       |
+| `provisioning_parameters.json` mismatch  | Instance group names must match between config and API call |
+
+Full S3 layout, node-type detection, and on-node debug: [lifecycle-scripts.md](lifecycle-scripts.md).
+
+---
+
+## D: EKS Access / kubectl
+
+**Signals:** `"couldn't get current server API group list: the server has asked for the client to provide credentials"`, `kubectl get nodes` fails or returns nothing.
+
+```bash
+# Your identity
+aws sts get-caller-identity
+
+# EKS cluster behind the HyperPod cluster
+EKS_ARN=$(aws sagemaker describe-cluster --cluster-name <HYPERPOD> --region <REGION> \
+  --query 'Orchestrator.Eks.ClusterArn' --output text)
+EKS_NAME=$(echo $EKS_ARN | awk -F'/' '{print $NF}')
+
+# Existing access entries
+aws eks list-access-entries --cluster-name $EKS_NAME --region <REGION>
+
+# Auth mode
+aws eks describe-cluster --name $EKS_NAME --region <REGION> \
+  --query 'cluster.accessConfig.authenticationMode' --output text
+```
+
+### Suggested command — grant yourself EKS access (run this yourself)
+
+**Preconditions:** `$MY_ARN` is the IAM **role ARN**, not the assumed-role session ARN. EKS auth mode is `API` or `API_AND_CONFIG_MAP`.
+
+**Command:**
+
+```bash
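+# get-caller-identity returns the STS session ARN for assumed roles; rewrite it to the IAM role ARN (re-add any role path by hand).
+MY_ARN=$(aws sts get-caller-identity --query 'Arn' --output text | sed -E 's|^arn:([^:]+):sts::([0-9]+):assumed-role/([^/]+)/.*$|arn:\1:iam::\2:role/\3|')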
+aws eks create-access-entry \
+  --cluster-name $EKS_NAME --region <REGION> --principal-arn $MY_ARN
+
+aws eks associate-access-policy \
+  --cluster-name $EKS_NAME --region <REGION> --principal-arn $MY_ARN \
+  --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \
+  --access-scope '{"type": "cluster"}'
+
+aws eks update-kubeconfig --name $EKS_NAME --region <REGION>
+kubectl get nodes
+```
+
+**Blast radius:** `AmazonEKSClusterAdminPolicy` grants cluster-wide admin on the EKS cluster — use a narrower policy (`AmazonEKSEditPolicy` / `AmazonEKSViewPolicy` + namespace scope) for day-to-day operators. `update-kubeconfig` overwrites the current `kubectl` context.
+
+If the EKS cluster's auth mode is `CONFIG_MAP` only, access entries are not available. Switching auth mode is a cluster-level, administrator-level change — review the EKS access-entries documentation before proceeding and coordinate with anyone who depends on the existing `aws-auth` ConfigMap.
+
+---
+
+## E: Cluster Provisioning
+
+**Signals:** Cluster `InService` but instances not visible, `kubectl get nodes` returns nothing, `list-cluster-nodes` shows fewer nodes than expected.
+
+With **Continuous Provisioning**, the cluster goes `InService` before all instances are created. Instance creation is asynchronous; failures appear as events.
+
+```bash
+aws sagemaker describe-cluster --cluster-name <CLUSTER> --region <REGION> \
+  --query '{Status:ClusterStatus,Groups:InstanceGroups[*].{Name:InstanceGroupName,Count:CurrentCount,Target:TargetCount,Status:Status}}' \
+  --output table
+
+aws sagemaker list-cluster-events --cluster-name <CLUSTER> --region <REGION> \
+  --query 'ClusterEventSummaries[*].{Time:EventTime,Type:EventType,Message:Message}' \
+  --output table
+
+aws sagemaker list-cluster-nodes --cluster-name <CLUSTER> --region <REGION> \
+  --query 'ClusterNodeSummaries[*].{ID:InstanceId,Group:InstanceGroupName,Status:InstanceStatus.Status}' \
+  --output table
+```
+
+| Observation                                               | Cause                               | Action                                |
+| --------------------------------------------------------- | ----------------------------------- | ------------------------------------- |
+| `CurrentCount < TargetCount`, events show provisioning    | Continuous provisioning in progress | Wait; monitor events                  |
+| Events: `"Insufficient capacity"`                         | No capacity in AZ                   | See **[B](#b-capacity--az)**          |
+| Events: lifecycle script failure                          | Script error                        | See **[C](#c-lifecycle-scripts)**     |
+| Events: `"EFA health checks"`                             | SG misconfiguration                 | See **[A](#a-efa-health-checks)**     |
+| Nodes in `list-cluster-nodes` but not `kubectl get nodes` | EKS registration issue              | Check lifecycle logs, kubelet via SSM |
+
+See [cluster-operations.md § 5](cluster-operations.md#5-continuous-provisioning-eks-only).
+
+---
+
+## F: SSM Connectivity
+
+**Signals:** `"Target is not connected"`, SSM session fails.
+
+> **For interactive shell or repeated SSM access, use the [`hyperpod-ssm`](../../hyperpod-ssm/SKILL.md) skill** — it wraps the cluster-ID derivation, target-format construction, and session start. No standalone diagnostic commands are listed in this section; `hyperpod-ssm` is the right tool for actually working on nodes.
+
+---
+
+## G: Node Replacement
+
+### G.1: Auto-replacement not triggering
+
+Diagnose (read-only):
+
+```bash
+# Is NodeRecovery enabled?
+aws sagemaker describe-cluster --cluster-name <CLUSTER> --region <REGION> \
+  --query 'NodeRecovery' --output text
+
+# Replacement activity
+aws sagemaker list-cluster-events --cluster-name <CLUSTER> --region <REGION> \
+  --query 'ClusterEventSummaries[?contains(Message,`replace`) || contains(Message,`reboot`) || contains(Message,`hardware`) || contains(Message,`recovery`)]' \
+  --output table
+
+# Health-monitoring-agent logs (pattern: SagemakerHealthMonitoringAgent/<group>/<instance>)
+CLUSTER_ID=$(aws sagemaker describe-cluster --cluster-name <CLUSTER> --region <REGION> \
+  --query 'ClusterArn' --output text | cut -d/ -f2)
+aws logs describe-log-streams \
+  --log-group-name "/aws/sagemaker/Clusters/<CLUSTER>/${CLUSTER_ID}" \
+  --region <REGION> \
+  --query 'logStreams[?starts_with(logStreamName,`SagemakerHealthMonitoringAgent`)].logStreamName' \
+  --output table
+
+# EKS node health labels — the sagemaker.amazonaws.com/node-health-status
+# label on each node indicates the action HyperPod has decided on.
+kubectl get nodes --show-labels
+kubectl describe node <NODE>
+
+sinfo -o "%N %T %30E"
+```
+
+**Common blockers:** `NodeRecovery=None`, health agent hasn't detected (wait for next cycle), lifecycle script failing on new instance (same log group, `LifecycleConfig/...` stream), no capacity (see [B](#b-capacity--az)), cluster not `InService`.
+
+### Suggested command — enable NodeRecovery (run this yourself)
+
+> **Destructive — replaces the whole `InstanceGroups` list.** Any group omitted from the payload is deleted; any field drift (instance type, count, lifecycle config) is applied as-is. Re-run `describe-cluster` first and copy every existing field into the payload below before adding `NodeRecovery=Automatic`. If unsure, use the SageMaker console — it preserves existing fields by default. Agent: never execute this command on the customer's behalf; present it for the customer to run.
+
+**Preconditions:** `NodeRecovery=None` confirmed above. **Derive every field for every instance group from the current `describe-cluster` output** — `update-cluster` replaces the whole `InstanceGroups` list; any field drift is applied as-is.
+
+**Command:**
+
+```bash
+aws sagemaker update-cluster --cluster-name <CLUSTER> --region <REGION> \
+  --instance-groups '[{"InstanceGroupName":"<G>","InstanceType":"<TYPE>",
+    "InstanceCount":<N>,
+    "LifeCycleConfig":{"SourceS3Uri":"<URI>","OnCreate":"<SCRIPT>"},
+    "ExecutionRole":"<ROLE>",
+    "OnStartDeepHealthChecks":["InstanceStress","InstanceConnectivity"]}]' \
+  --node-recovery Automatic
+```
+
+**Blast radius:** any instance group omitted from the list is deleted; any field drift (instance type, count, lifecycle config) is applied as-is. If unsure, use the console, which preserves existing fields by default.
+
+### G.2: Manual replacement
+
+Diagnose (read-only):
+
+```bash
+aws sagemaker list-cluster-nodes --cluster-name <CLUSTER> --region <REGION> \
+  --query 'ClusterNodeSummaries[*].{ID:InstanceId,Group:InstanceGroupName,Status:InstanceStatus.Status}' \
+  --output table
+
+aws sagemaker describe-cluster --cluster-name <CLUSTER> --region <REGION> \
+  --query 'ClusterStatus' --output text
+```
+
+### Suggested command — reboot (run this yourself)
+
+**Preconditions:** `<INSTANCE_ID>` belongs to the cluster (confirmed from `list-cluster-nodes` above); workload can tolerate a restart; on Slurm clusters, rebooting will not disrupt critical cluster operations (per the API doc). `NodeIds` batch size: 1-25 per call.
+
+**Command:**
+
+```bash
+aws sagemaker batch-reboot-cluster-nodes --cluster-name <CLUSTER> --region <REGION> \
+  --node-ids '["<INSTANCE_ID>"]'
+
+aws sagemaker list-cluster-events --cluster-name <CLUSTER> --region <REGION> \
+  --query 'ClusterEventSummaries[0:5].{Time:EventTime,Message:Message}' --output table
+```
+
+**Blast radius:** soft recovery via EC2 `RebootInstances` — preserves instance identity, root volume, and secondary volumes. Training processes on the node are interrupted.
+
+### Suggested command — replace (run this yourself, only if reboot did not clear the fault)
+
+**Preconditions:**
+
+- Reboot attempted first and did not clear the fault.
+- Hardware fault confirmed (uncorrectable ECC, GPU-bus errors, EFA hardware failure); not a software / config issue.
+- Data on root + secondary volumes is backed up — per the API doc: "Replacing nodes destroys all instance volumes, including both root and secondary volumes. All data stored on these volumes will be permanently lost and cannot be recovered."
+- Cluster has been patched via `UpdateClusterSoftware` — per the API doc: "If you want to invoke this API on an existing cluster, you'll first need to patch the cluster by running the UpdateClusterSoftware API."
+- Target is **NOT** a Slurm controller — per the API doc: "For SageMaker HyperPod clusters using the Slurm workload manager, you cannot replace instances that are configured as Slurm controller nodes."
+- `NodeIds` batch size: 1-25 per call (API limit).
+
+**Command:**
+
+```bash
+aws sagemaker batch-replace-cluster-nodes --cluster-name <CLUSTER> --region <REGION> \
+  --node-ids '["<INSTANCE_ID>"]'
+```
+
+**Blast radius:** destroys root + secondary volumes on the replaced instance (permanent data loss). New hardware is provisioned with the same AMI and instance configuration.
+
+**Karpenter note** (per the HyperPod EKS manual-recovery doc): on Karpenter-managed clusters, `BatchReplaceClusterNodes` terminates the node but does **not** guarantee a replacement — Karpenter only creates a new node if pending pods cannot be rescheduled onto remaining capacity. Per-workload configuration (pod anti-affinity, resource requests) can force a new node.
+
+---
+
+## H: CloudFormation Errors
+
+**Signals:** `"Embedded stack failed"`, `CREATE_FAILED` / `ROLLBACK_COMPLETE`, generic console error.
+
+### Navigate to root cause
+
+1. CloudFormation console → correct region
+2. Find the failed HyperPod stack
+3. **Events** tab → filter by `CREATE_FAILED` (earliest failure is the real one; later ones are cascades)
+4. If error is `"Embedded stack failed"`, open **Resources** → find `AWS::CloudFormation::Stack` with `CREATE_FAILED`
+5. Click Physical ID → opens the nested stack
+6. Repeat until you reach a non-stack leaf resource
+7. The **Status reason** on the leaf is the actionable error
+
+CLI alternative:
+
+```bash
+aws cloudformation describe-stack-events --stack-name <STACK> --region <REGION> \
+  --query 'StackEvents[?ResourceStatus==`CREATE_FAILED`]'
+```
+
+For Custom::Resource failures, find the Lambda function name and check its logs.
+
+| Failed resource type          | Common errors                                      |
+| ----------------------------- | -------------------------------------------------- |
+| `AWS::SageMaker::Cluster`     | Capacity, subnet, SG, lifecycle script             |
+| `AWS::IAM::Role`              | Permissions, trust relationship                    |
+| `AWS::IAM::ServiceLinkedRole` | SLR creation denied — see below                    |
+| `AWS::Lambda::Function`       | Execution error, timeout                           |
+| `AWS::EC2::VPC`               | CIDR conflict, quota                               |
+| `Custom::Resource`            | Lambda-backed error — check Lambda CloudWatch logs |
+
+Full resource-by-resource catalog: [cloudformation-errors.md](cloudformation-errors.md).
+
+### Service-linked role (SLR)
+
+SageMaker HyperPod uses the SLR `AWSServiceRoleForSageMakerHyperPod` (which has the `AmazonSageMakerHyperPodServiceRolePolicy` managed policy attached). It is **created automatically** on first cluster creation — you do not need to pre-create it. If cluster creation fails with an SLR error, the cause is almost always an SCP or permission boundary blocking `iam:CreateServiceLinkedRole` for the caller.
+
+```bash
+# Verify the SLR exists in the account
+aws iam get-role --role-name AWSServiceRoleForSageMakerHyperPod
+```
+
+If `iam:CreateServiceLinkedRole` is denied by an SCP, have an account admin either:
+
+- Grant the permission to the caller and retry cluster creation, or
+- Request the SCP be adjusted to allow the specific SLR creation.
+
+### Permission boundary denials
+
+Even when a role's inline policy grants a permission, an attached permission boundary can deny it.
+
+```bash
+ROLE_NAME=$(aws sagemaker describe-cluster --cluster-name <C> --region <R> \
+  --query 'InstanceGroups[0].ExecutionRole' --output text | awk -F/ '{print $NF}')
+aws iam get-role --role-name "$ROLE_NAME" --query 'Role.PermissionsBoundary'
+```
+
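+If `PermissionsBoundary` is non-null, inspect the boundary policy — an action must be allowed by both the boundary and the role's own policies, so anything the boundary omits or denies is blocked regardless of grants.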
+
+### Cluster in `Failed` terminal state
+
+`ClusterStatus=Failed` cannot be updated. Options:
+
+1. Collect diagnostics (`diagnose-cluster.sh` + CFN events above)
+2. Fix root cause (usually IAM / VPC / SG)
+3. `aws sagemaker delete-cluster` and recreate
+
+Deletion is destructive — migrate active workloads first.
+
+### Multi-AZ and EFA
+
+EFA is intra-AZ only. Cross-AZ collectives fall back to TCP. For EFA-accelerated training, keep all training instance groups in a single AZ. Use `aws ec2 describe-instance-type-offerings` to pick an AZ that offers the instance type.
+
+### Service quotas
+
+Check SageMaker HyperPod, EC2 EFA, and VPC quotas before creation — see [capacity-planning.md § service quotas](capacity-planning.md#service-quotas). Quota increases take 1-3 business days.
+
+---
+
+## I: Utilities
+
+### Slurm node name → instance ID
+
+Slurm nodes use IP-named hostnames (`ip-10-1-123-45`). Quick lookup:
+
+```bash
+# Works from anywhere
+aws sagemaker list-cluster-nodes --cluster-name <CLUSTER> --region <REGION> \
+  --query 'ClusterNodeSummaries[*].{ID:InstanceId,DNS:PrivateDnsHostname,Group:InstanceGroupName}' \
+  --output table
+
+# On head node
+IP=$(echo "ip-10-1-123-45" | sed 's/ip-//; s/-/./g')
+sudo cat /opt/ml/config/resource_config.json | jq | grep -A 3 "$IP"
+```
+
+For bulk lookups, `list-cluster-nodes` output can be piped to `jq` to produce a CSV of node → instance ID (there are also community scripts in public AWS sample repositories).
+
+---
+
+## J: AMI & Cluster Updates
+
+`UpdateClusterSoftware` fails and rolls back, or the cluster remains in a post-maintenance rollback state. Common causes: lifecycle script incompatible with new AMI, insufficient capacity during rolling update, IAM gaps.
+
+```bash
+aws sagemaker list-cluster-events --cluster-name <NAME> --region <REGION> \
+  --query 'ClusterEventSummaries[?contains(Message, `Update`) || contains(Message, `Rollback`)]'
+
+aws sagemaker describe-cluster --cluster-name <NAME> --region <REGION> \
+  --query '{Status:ClusterStatus,FailureMsg:FailureMessage}'
+
+# Per-instance-group lifecycle logs on the nodes that were rolled over:
+aws logs describe-log-streams \
+  --log-group-name "/aws/sagemaker/Clusters/<NAME>/<CLUSTER_ID>" \
+  --region <REGION>
+```
+
+### Decisions
+
+| Symptom                                            | Likely cause                                             | Action                                                                                 |
+| -------------------------------------------------- | -------------------------------------------------------- | -------------------------------------------------------------------------------------- |
+| Rollback on new AMI                                | Lifecycle script failed on new AMI                       | Fix the script (test on one instance group), retry `UpdateClusterSoftware`             |
+| Cluster stays in a post-maintenance rollback state | Cluster-state machine requires service-side intervention | Collect diagnostics and escalate; do not delete and recreate if there are active nodes |
+| Insufficient capacity mid-update                   | No rolling-update capacity                               | Pause the update; use Flexible Training Plans / ODCR; retry                            |
+| Large-fleet migration                              | Rolling update is high-risk at scale                     | Blue/green: new instance group on the new AMI, drain old, validate, delete old         |
+
+---
+
+## K: Dangling Nodes & Cleanup
+
+After a failed scale-up or rollback, EKS may show nodes that HyperPod no longer manages ("dangling"). The inverse — HyperPod nodes not registered in EKS — usually means kubelet or bootstrap failed.
+
+```bash
+kubectl get nodes -l sagemaker.amazonaws.com/compute-type=hyperpod \
+  -o jsonpath='{range .items[*]}{.spec.providerID}{"\n"}{end}' \
+  | sed 's|.*/||' | sort > /tmp/eks-nodes.txt
+
+aws sagemaker list-cluster-nodes --cluster-name <NAME> --region <REGION> \
+  --query 'ClusterNodeSummaries[*].InstanceId' --output text \
+  | tr '\t' '\n' | sort > /tmp/hp-nodes.txt
+
+# EKS-only (dangling) — registered in EKS but not in HyperPod
+comm -23 /tmp/eks-nodes.txt /tmp/hp-nodes.txt
+
+# HyperPod-only (kubelet never registered) — in HyperPod but not in EKS
+comm -13 /tmp/eks-nodes.txt /tmp/hp-nodes.txt
+```
+
+### Remediation
+
+### Fix — delete a dangling EKS node
+
+Customer-run. Only delete when the EKS node has no matching HyperPod instance (confirmed by `comm` above) AND the EC2 instance is terminated — confirm with the first command below.
+
+```bash
+aws ec2 describe-instances --instance-ids <IID> --region <REGION> \
+  --query 'Reservations[0].Instances[0].State.Name'
+kubectl delete node <NODE_NAME>
+```
+
+If the EC2 instance is still running and registered, kubelet re-registers the node — the delete is a no-op with transient scheduling churn.
+
+**Orphaned HyperPod node (not in EKS):** kubelet never registered. Triage with `hyperpod-node-debugger` — common causes are instance IAM role misconfigured, VPC endpoints missing, or lifecycle script failure.
+
+---
+
+## L: Autoscaler Compatibility
+
+Cluster Autoscaler (CAS) in the same EKS cluster can fail to parse HyperPod node provider IDs, which can break autoscaling for every node group in the cluster — not only HyperPod. Diagnose via CAS logs: look for node-info parse errors tied to HyperPod-managed nodes. If hit, escalate to AWS Support; do not apply untested CAS flags.
+
+Karpenter does not manage HyperPod nodes directly and should not conflict. If Karpenter is attempting to disrupt HyperPod training pods, the standard Karpenter annotation `karpenter.sh/do-not-disrupt: "true"` on the pod prevents disruption (see the Karpenter upstream documentation for current annotation syntax).
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-operations.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-operations.md
new file mode 100644
index 00000000..bab2e5a6
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/cluster-operations.md
@@ -0,0 +1,270 @@
+# Cluster Operations Reference
+
+Operational deep-dives for the hyperpod-cluster-debugger skill. See SKILL.md for the workflow entry points.
+
+---
+
+## 1. EFA Security Group (multi-SG clusters)
+
+The EFA health check runs during instance provisioning, **before** lifecycle scripts execute. If it fails, lifecycle scripts never run and CloudWatch lifecycle logs are empty — the cluster event will say `"EFA health checks did not run successfully"`.
+
+When a cluster uses multiple security groups, **all** SGs must have the self-referencing rules. Check each:
+
+```bash
+for SG in $(aws sagemaker describe-cluster --cluster-name <C> --region <R> \
+  --query 'VpcConfig.SecurityGroupIds[]' --output text); do
+  echo "=== $SG ==="
+  aws ec2 describe-security-groups --group-ids $SG --region <R> \
+    --query 'SecurityGroups[0].{In:IpPermissions,Out:IpPermissionsEgress}'
+done
+```
+
+Fix commands are in [cluster-diagnostics-detail.md § A](cluster-diagnostics-detail.md#a-efa-health-checks).
+
+---
+
+## 2. Capacity
+
+See [capacity-planning.md](capacity-planning.md).
+
+---
+
+## 3. Lifecycle scripts
+
+See [lifecycle-scripts.md](lifecycle-scripts.md).
+
+---
+
+## 4. EKS access control
+
+### Authentication modes
+
+Access entries require `API` or `API_AND_CONFIG_MAP`. If the cluster is on `CONFIG_MAP` only, `aws eks list-access-entries` returns nothing useful; verify the mode with `aws eks describe-cluster --name <EKS_CLUSTER> --query 'cluster.accessConfig.authenticationMode'` and consult the EKS access-entries documentation for the switching procedure.
+
+### Access policies (EKS-native)
+
+| Policy                        | Scope        | Use case                       |
+| ----------------------------- | ------------ | ------------------------------ |
+| `AmazonEKSClusterAdminPolicy` | Cluster-wide | Full admin (debugging)         |
+| `AmazonEKSAdminPolicy`        | Namespace    | Namespace admin (multi-tenant) |
+| `AmazonEKSEditPolicy`         | Namespace    | Read/write workloads           |
+| `AmazonEKSViewPolicy`         | Namespace    | Read-only                      |
+
+### Troubleshooting kubectl auth
+
+```bash
+aws sts get-caller-identity            # your identity
+kubectl config current-context         # which cluster kubeconfig points at
+kubectl cluster-info                   # API server reachable?
+```
+
+If using an assumed role: **access entries reference the IAM role ARN, not the assumed-role session ARN.**
+
+- Role ARN: `arn:aws:iam::123456789012:role/MyRole`
+- Session ARN: `arn:aws:sts::123456789012:assumed-role/MyRole/session-name`
+
+---
+
+## 5. Continuous Provisioning (EKS only)
+
+The cluster transitions to `InService` once the control plane is ready; instances are created asynchronously and failures are reported as events, not cluster failures. Failed instances can be individually replaced.
+
+```bash
+# Poll instance creation:
+watch -n 30 "aws sagemaker describe-cluster --cluster-name <C> --region <R> \
+  --query 'InstanceGroups[*].{Name:InstanceGroupName,Current:CurrentCount,Target:TargetCount}' --output table"
+
+# Poll cluster events:
+watch -n 30 "aws sagemaker list-cluster-events --cluster-name <C> --region <R> \
+  --query 'ClusterEventSummaries[0:5].{Time:EventTime,Msg:Description}' --output table"
+```
+
+### Nodes in `list-cluster-nodes` but not in `kubectl get nodes`
+
+1. Check lifecycle script logs — the lifecycle script is what registers the node with EKS
+2. Verify the EKS endpoint is reachable from worker subnets
+3. Check kubelet on the node via SSM
+4. Verify the node's IAM role has `AmazonEKSWorkerNodePolicy`
+
+> Cluster events are emitted for HyperPod EKS. For HyperPod Slurm, events are not yet surfaced — use CloudWatch logs and `list-cluster-nodes` instead.
+
+---
+
+## 6. SSM target format
+
+See the `hyperpod-ssm` skill's `SKILL.md` for the target format (`sagemaker-cluster:<CLUSTER_ID>_<GROUP>-<INSTANCE_ID>`), prerequisites, and manual-command examples. HyperPod requires `start-session` — not `send-command` against raw instance IDs.
+
+---
+
+## 7. Node replacement (batch APIs)
+
+Full Suggested-command blocks with preconditions + blast radius are in [cluster-diagnostics-detail.md § G.2](cluster-diagnostics-detail.md#g2-manual-replacement). Summary:
+
+- Cluster must be `InService`
+- Batch limit: **1-25 node IDs per call** for both `batch-reboot-cluster-nodes` and `batch-replace-cluster-nodes`
+- `batch-replace-cluster-nodes` destroys root + secondary volumes and is not supported on Slurm controller nodes — back up first
+- Monitor with `list-cluster-events` after the call
+- Prefer batch APIs over legacy paths (Slurm reason fields, K8s labels)
+
+---
+
+## 8. Slurm — controller operations
+
+The per-node Slurm operations (resuming a single node, fixing a single Slurm state) live in the `hyperpod-node-debugger` skill. This section is controller-level only.
+
+### Diagnose controller health (via SSM on the controller)
+
+```bash
+scontrol ping                                     # slurmctld responsive?
+systemctl status slurmctld                        # service state
+systemctl is-active munge && systemctl status munge   # auth daemon (required)
+systemctl is-active slurmdbd                      # accounting DB (if used)
+```
+
+### slurmctld down
+
+```bash
+journalctl -u slurmctld --since "1 hour ago" --no-pager | tail -100
+tail -200 /var/log/slurm/slurmctld.log
+```
+
+Common causes and fixes:
+
+- **OOM on controller**: restart the service; investigate the job scale that triggered it.
+- **Munge auth failure** (`Invalid authentication credential`): munge key mismatch. Re-sync `/etc/munge/munge.key` to every node, restart munge + slurmctld.
+- **Accounting DB unreachable** (slurmdbd + MariaDB / RDS): check network path and credentials. slurmctld won't start if accounting is required but unreachable.
+- **Config error in `slurm.conf`**: `slurmctld -D -vvv` (foreground) prints the parse error. Roll back to the last known-good config.
+
+### Fix — restart slurmctld
+
+Customer-run on the Slurm controller (via SSM) after the root cause is diagnosed. Running jobs, pending queue, and node states are preserved; caches and resource calculations reset. Brief scheduler pause.
+
+```bash
+sudo systemctl restart slurmctld
+scontrol ping   # expect "Slurmctld(primary) is UP"
+```
+
+If `slurm.conf` is broken the service will not return — roll back the config first.
+
+### munge inactive
+
+Diagnose:
+
+```bash
+systemctl status munge
+sudo ls -l /etc/munge/munge.key   # expect munge:munge, mode 0400
+sudo md5sum /etc/munge/munge.key   # must match on controller + every compute node
+```
+
+### Fix — start munge
+
+Customer-run. Safe when `munge` is inactive and the key file is present and matches other nodes.
+
+```bash
+sudo systemctl start munge
+```
+
+If md5 mismatches another node, jobs will still fail auth — re-distribute the controller's key cluster-wide and restart munge on every node.
+
+### Stuck jobs (PENDING / COMPLETING / CONFIGURING)
+
+```bash
+squeue -o "%i %j %T %R %N" --noheader | grep -iE "COMPLETING|CONFIGURING|PENDING"
+scontrol show job <JOBID>
+scancel <JOBID>               # if safe to cancel
+```
+
+Common reason codes:
+
+- `(Resources)` — waiting for free nodes. Check `sinfo -o "%P %a %l %D %T"`.
+- `(AssocGrpNodeLimit)` / `(QOSMaxJobsPerUserLimit)` — quota-related. `sacctmgr show assoc`.
+- `(NodeDown)` — partition has no healthy nodes. Use the `hyperpod-node-debugger` skill.
+- `(BeginTime)` — scheduled for a future start time.
+
+Restarting slurmctld to clear stuck-job symptoms uses the same Suggested-command block as above (§ Fix — restart slurmctld).
+
+### Verify after remediation
+
+```bash
+scontrol ping                                   # "Slurmctld(primary) is UP"
+sinfo                                            # no "down*" or "drain" states
+systemctl is-active slurmctld munge
+scontrol show config | grep StateSaveLocation   # must be persistent + writable
+```
+
+---
+
+## 9. Filesystem performance
+
+Symptom: training bottlenecked by data loading, checkpoint save / load, or slow executable / script loading.
+
+### Diagnose on the node
+
+```bash
+mount | grep -E "fsx|nfs|lustre|ebs|nvme"
+df -hT
+iostat -x 1 5                 # per-device throughput / IOPS / utilization
+
+# FSx for Lustre:
+lfs df -h                     # per-OST utilization (uneven = hotspot)
+lfs getstripe <path>          # striping config; wider = more parallelism
+
+# FSx for OpenZFS / NFS:
+nfsstat -m                    # per-mount retransmissions / wait times
+nfsiostat 5                   # ops/s, throughput, RTT
+
+# EBS:
+lsblk -o NAME,TYPE,SIZE,MOUNTPOINT
+```
+
+### CloudWatch (from your workstation)
+
+```bash
+# FSx for Lustre throughput saturation (`date -d` below is GNU syntax; on macOS use `date -u -v-1H +%Y-%m-%dT%H:%M:%SZ`):
+aws cloudwatch get-metric-statistics \
+  --namespace AWS/FSx --metric-name DataReadBytes \
+  --dimensions Name=FileSystemId,Value=<FSxId> \
+  --statistics Sum --period 300 \
+  --start-time "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%SZ)" \
+  --end-time   "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+  --region <REGION>
+# Also: DataWriteBytes, FreeDataStorageCapacity, MetadataOperations
+
+# EBS throughput / IOPS:
+aws cloudwatch get-metric-statistics \
+  --namespace AWS/EBS --metric-name VolumeReadOps \
+  --dimensions Name=VolumeId,Value=<vol-id> \
+  --statistics Sum --period 60 \
+  --start-time "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%SZ)" \
+  --end-time   "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+  --region <REGION>
+# Also: VolumeWriteOps, VolumeReadBytes, VolumeWriteBytes, BurstBalance
+```
+
+### Interpret
+
+| Signal                                              | Interpretation                         | Action                                                                           |
+| --------------------------------------------------- | -------------------------------------- | -------------------------------------------------------------------------------- |
+| FSx Lustre `DataReadBytes` sustained at the ceiling | Throughput ceiling hit                 | Increase throughput-per-TiB or grow storage (throughput scales with size)        |
+| FSx Lustre metadata ops saturated                   | Small-file workload on Lustre          | Move small-file traffic to FSx for OpenZFS; keep Lustre for large sequential I/O |
+| FSx OpenZFS `TotalIOps` near provisioned IOPS       | IOPS ceiling hit                       | Increase provisioned IOPS                                                        |
+| EBS `BurstBalance` draining to 0 on `gp2`           | Baseline IOPS insufficient             | Migrate to `gp3` or `io2` with provisioned IOPS / throughput                     |
+| `iostat %util` > 90% on a mount device              | Local device saturated                 | If NVMe instance store: at hardware ceiling, change data layout                  |
+| Slow only at checkpoint time                        | Write amplification (many small files) | Consolidate checkpoints; rank-0 writer patterns                                  |
+
+### Choose the right filesystem
+
+| Workload                                                         | Best fit                                |
+| ---------------------------------------------------------------- | --------------------------------------- |
+| Large sequential reads (datasets >> 1 MiB), many-reader training | FSx for Lustre                          |
+| Small-file / metadata-heavy / mixed random I/O                   | FSx for OpenZFS                         |
+| Single-instance scratch                                          | EBS `gp3` or `io2`                      |
+| Highest per-GPU throughput, ephemeral                            | NVMe instance store (`/opt/dlami/nvme`) |
+
+For HyperPod Slurm, the default lifecycle script supports FSx for OpenZFS for `/home` — evaluate it if home is on Lustre and you see metadata-op saturation.
+
+### Verify after remediation
+
+- CloudWatch: throughput / IOPS climbs past the old flat-line
+- Training step time drops; data-loading fraction of step time drops
+- `iostat %util` stays below 80% under sustained load
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/iam-permissions.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/iam-permissions.md
new file mode 100644
index 00000000..ec5c8076
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/iam-permissions.md
@@ -0,0 +1,40 @@
+# IAM Permissions Required
+
+Read-only diagnostic:
+
+```json
+{
+  "Action": [
+    "sagemaker:DescribeCluster",
+    "sagemaker:ListClusterNodes",
+    "sagemaker:ListClusterEvents",
+    "sagemaker:ListClusters",
+    "ec2:DescribeSecurityGroups",
+    "ec2:DescribeSubnets",
+    "ec2:DescribeVpcs",
+    "ec2:DescribeVpcEndpoints",
+    "ec2:DescribeInstances",
+    "ec2:DescribeInstanceTypeOfferings",
+    "eks:DescribeCluster",
+    "eks:ListAccessEntries",
+    "eks:ListAddons",
+    "eks:DescribeAddon",
+    "iam:GetRole",
+    "iam:ListAttachedRolePolicies",
+    "s3:ListBucket",
+    "s3:GetObject",
+    "logs:DescribeLogGroups",
+    "logs:DescribeLogStreams",
+    "logs:GetLogEvents",
+    "cloudformation:DescribeStackEvents",
+    "cloudformation:DescribeStacks",
+    "servicequotas:ListServiceQuotas",
+    "ssm:StartSession",
+    "ssm:TerminateSession"
+  ]
+}
+```
+
+> SSM on HyperPod uses `start-session` with `sagemaker-cluster:<cluster-id>_<group>-<iid>` targets — not `send-command` against plain instance IDs. Grant `ssm:StartSession` / `ssm:TerminateSession`.
+
+For remediations the operator runs, add the matching write permission (e.g. `ec2:AuthorizeSecurityGroupIngress`, `eks:CreateAccessEntry`).
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/lifecycle-scripts.md b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/lifecycle-scripts.md
new file mode 100644
index 00000000..b1c7346c
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/references/lifecycle-scripts.md
@@ -0,0 +1,111 @@
+# Lifecycle Script Reference
+
+Companion to [SKILL.md](../SKILL.md) § C and [cluster-operations.md § 3](cluster-operations.md). Lifecycle scripts run on each node during provisioning. A failure here blocks the node — and often the entire cluster — from reaching `InService`.
+
+---
+
+## Layout
+
+Default AWS-published lifecycle scripts (commonly called "base-config") handle provisioning for Slurm and EKS. Before deep debugging, compare the customer's in-use scripts against the latest published version — upstream fixes often resolve the failure.
+
+### Slurm entry point (typical base-config layout)
+
+`on_create.sh` → `lifecycle_script.py` for orchestration (detects node type from `/opt/ml/config/resource_config.json` and runs per-type steps). Controller nodes provision first; compute / login nodes wait for the controller to write `slurm.conf` to shared storage. Customer-forked pipelines may differ — read `on_create.sh` on the affected node to confirm.
+
+**Controller failure cascades to all compute nodes** — if the controller's lifecycle script fails, compute nodes cannot find `slurm.conf` and also fail.
+
+### EKS entry point
+
+`on_create.sh` → `on_create_main.sh` (configures containerd storage, kubelet, FSx client, EFA).
+
+### S3 URI validation
+
+- `SourceS3Uri` starts with `s3://`
+- `OnCreate` filename matches an S3 key in that prefix
+- Execution role has `s3:GetObject` and `s3:ListBucket` on the bucket
+
+---
+
+## Common errors
+
+### S3 access
+
+Timeout reaching S3 from the lifecycle script (e.g. `Connect timeout on endpoint URL: "https://<bucket>.s3.<region>.amazonaws.com/..."`) → no S3 VPC endpoint (and no NAT route); the node cannot reach S3 from a private subnet.
+
+### Fix — add an S3 Gateway endpoint
+
+Customer-run. Gateway endpoint type is free; Interface endpoints are billed per-hour.
+
+```bash
+aws ec2 create-vpc-endpoint \
+  --vpc-id <VPC_ID> \
+  --service-name com.amazonaws.<REGION>.s3 \
+  --route-table-ids <ROUTE_TABLE_ID> \
+  --vpc-endpoint-type Gateway
+```
+
+**Caution:** routes S3 traffic for every resource using the listed route tables through the VPC endpoint. Can break workloads that rely on going to S3 via public DNS + NAT with custom endpoint policies. Review the VPC's default endpoint policy (or set `--policy-document`) before creating.
+
+`AccessDenied` / `403 Forbidden` on `GetObject` — add `s3:GetObject` + `s3:ListBucket` on the lifecycle bucket to the execution role.
+
+### Script execution
+
+| Symptom                                     | Cause                                                   | Fix                                                          |
+| ------------------------------------------- | ------------------------------------------------------- | ------------------------------------------------------------ |
+| `No such file or directory` on entry script | `OnCreate` name doesn't match S3 key                    | `aws s3 ls s3://<BUCKET>/ \| grep on_create` to verify       |
+| `\r: command not found` / CRLF terminators  | Edited on Windows                                       | `dos2unix on_create.sh` or `sed -i 's/\r$//' on_create.sh`   |
+| Script hangs (lifecycle timeout)            | Blocking op, infinite loop, waiting for absent resource | Add `set -euo pipefail`, add network timeouts                |
+| `provisioning_parameters.json` KeyError     | Instance group name mismatch                            | `InstanceGroupName` in API call must match group key in JSON |
+
+### Slurm
+
+Compute nodes fail because `slurm.conf` is not found — the controller's lifecycle script failed. Fix the controller first.
+
+`slurmctld: error ...` — check `/var/log/slurm/slurmctld.log` (the path set by `SlurmctldLogFile` in `slurm.conf`) on the controller via SSM. Common causes: wrong `SlurmctldHost`, partition/node definition errors, missing MUNGE key.
+
+### FSx
+
+`mount.lustre: ... Connection timed out` — FSx in different VPC/AZ, or SG doesn't allow Lustre traffic. FSx and HyperPod nodes must share a VPC; SG must allow TCP 988 and 1018-1023 between nodes and FSx. Verify FSx is `AVAILABLE`.
+
+---
+
+## Reading logs
+
+### CloudWatch (from workstation)
+
+```bash
+CLUSTER_ID=$(aws sagemaker describe-cluster --cluster-name <NAME> --region <R> \
+  --query 'ClusterArn' --output text | cut -d/ -f2)
+LOG_GROUP="/aws/sagemaker/Clusters/<NAME>/${CLUSTER_ID}"
+
+# List lifecycle log streams:
+aws logs describe-log-streams \
+  --log-group-name "$LOG_GROUP" --region <R> \
+  --query 'logStreams[?starts_with(logStreamName,`LifecycleConfig`)].{Stream:logStreamName,LastEvent:lastEventTimestamp}' \
+  --output table
+
+# Read a specific stream:
+aws logs get-log-events \
+  --log-group-name "$LOG_GROUP" \
+  --log-stream-name "LifecycleConfig/<GROUP>/<INSTANCE_ID>" \
+  --region <R> --limit 100 \
+  --query 'events[*].message' --output text
+```
+
+### On-node (via SSM)
+
+```bash
+cat /var/log/provision/provisioning.log      # full provisioning log
+cat /opt/ml/config/resource_config.json      # node topology
+cat /opt/slurm/etc/slurm.conf                # Slurm config (if generated)
+cat /opt/ml/metadata/resource-metadata.json  # node metadata
+```
+
+### Test locally
+
+```bash
+file on_create.sh         # must not say "with CRLF line terminators"
+head -1 on_create.sh      # must start with #!/bin/bash
+bash -n on_create.sh      # syntax check
+shellcheck on_create.sh   # optional lint
+```
diff --git a/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/scripts/diagnose-cluster.sh b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/scripts/diagnose-cluster.sh
new file mode 100755
index 00000000..7cc87635
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-cluster-debugger/scripts/diagnose-cluster.sh
@@ -0,0 +1,1621 @@
+#!/usr/bin/env bash
+# diagnose-cluster.sh — read-only HyperPod cluster-level diagnostic.
+# See SKILL.md and references/cluster-diagnostics-detail.md for remediation.
+#
+# Exit codes:
+#   0  No critical (P0/P1) failures; P2 warnings are informational-only.
+#   1  One or more critical failures, or a fatal prerequisite.
+#   2  Invalid argument.
+
+set -euo pipefail
+
+for cmd in aws jq python3; do   # hard prerequisites: abort with exit 1 if any is missing
+  command -v "$cmd" &>/dev/null || {
+    echo "ERROR: '$cmd' is required but not found. Install it and retry." >&2
+    exit 1
+  }
+done
+# unbuffer is only needed if the Slurm-controller SSM probe runs
+# (activated when the cluster's orchestrator is Slurm). Warn at startup
+# but don't exit — EKS-only users shouldn't be blocked.
+if ! command -v unbuffer &>/dev/null; then
+  echo "WARN: 'unbuffer' not found. Required for the Slurm-controller SSM probe." >&2
+  echo "      Install via 'yum install expect' / 'apt install expect' / 'brew install expect'." >&2
+  echo "      EKS diagnostics will continue; Slurm-controller-only checks will be skipped." >&2
+fi
+
+CLUSTER=""                        # set by --cluster (name or ARN)
+REGION="${AWS_DEFAULT_REGION:-}"  # overridden by --region
+USE_COLOR=true                    # cleared by --no-color
+VALIDATE_MODE=false               # --validate switches to pre-flight mode
+VALIDATE_SG_IDS=""
+VALIDATE_SUBNET_IDS=""
+VALIDATE_IAM_ROLE=""
+VALIDATE_S3_URI=""
+VALIDATE_INSTANCE_TYPE=""
+
+while [[ $# -gt 0 ]]; do   # argument errors go to stderr and exit 2 (see header)
+  case "$1" in
+    --cluster)    [[ $# -lt 2 ]] && { echo "ERROR: --cluster needs a value" >&2; exit 2; }
+                  [[ ! "$2" =~ ^(arn:aws[a-z-]*:sagemaker:[a-z0-9-]+:[0-9]{12}:cluster/[a-z0-9]{12}|[a-zA-Z0-9]([-a-zA-Z0-9]{0,62}))$ ]] && { echo "ERROR: --cluster must be a valid HyperPod cluster name or ARN (got '$2')" >&2; exit 2; }
+                  CLUSTER="$2"; shift 2 ;;
+    --region)     [[ $# -lt 2 ]] && { echo "ERROR: --region needs a value" >&2; exit 2; }
+                  [[ ! "$2" =~ ^[a-z]{2}(-[a-z]+)+-[0-9]+$ ]] && { echo "ERROR: --region must be a valid AWS region (got '$2')" >&2; exit 2; }
+                  REGION="$2"; shift 2 ;;   # regex also accepts multi-part names such as us-gov-west-1
+    --sg-ids)        [[ $# -lt 2 ]] && { echo "ERROR: --sg-ids needs a value" >&2;        exit 2; }; VALIDATE_SG_IDS="$2";        shift 2 ;;
+    --subnet-ids)    [[ $# -lt 2 ]] && { echo "ERROR: --subnet-ids needs a value" >&2;    exit 2; }; VALIDATE_SUBNET_IDS="$2";    shift 2 ;;
+    --iam-role)      [[ $# -lt 2 ]] && { echo "ERROR: --iam-role needs a value" >&2;      exit 2; }; VALIDATE_IAM_ROLE="$2";      shift 2 ;;
+    --s3-uri)        [[ $# -lt 2 ]] && { echo "ERROR: --s3-uri needs a value" >&2;        exit 2; }; VALIDATE_S3_URI="$2";        shift 2 ;;
+    --instance-type) [[ $# -lt 2 ]] && { echo "ERROR: --instance-type needs a value" >&2; exit 2; }; VALIDATE_INSTANCE_TYPE="$2"; shift 2 ;;
+    --no-color)   USE_COLOR=false;           shift ;;
+    --validate)   VALIDATE_MODE=true;        shift ;;
+    -h|--help)
+      cat <<'EOF'
+Usage: diagnose-cluster.sh --cluster <name-or-arn> --region <region> [--no-color]
+       diagnose-cluster.sh --validate --region <region> \
+         --sg-ids <sg-1,sg-2> --subnet-ids <sub-1,sub-2> [--iam-role <role-arn>] \
+         [--s3-uri s3://bucket/path/] [--instance-type ml.p5.48xlarge]
+
+Read-only diagnostic for HyperPod cluster-level issues: provisioning, access,
+node replacement, VPC/SG, EKS config + add-ons, SSM, CloudWatch logs. Each
+[FAIL] line in the summary includes a pointer of the form
+  "→ references/cluster-diagnostics-detail.md § <section>"
+so the hyperpod-cluster-debugger skill can look up the remediation runbook.
+
+The script never modifies cluster state and never prints remediation commands.
+
+Modes:
+  (default)   Diagnose an existing cluster.
+  --validate  Pre-flight config validation (validates SGs / subnets / IAM /
+              VPC endpoints / optional S3 lifecycle scripts / optional per-AZ
+              instance-type capacity before creating a cluster; no cluster
+              needed).
+
+See references/cluster-diagnostics-detail.md for full remediation runbooks.
+See references/capacity-planning.md, lifecycle-scripts.md, cloudformation-errors.md
+for deep-dive companions to sections B / C / H.
+EOF
+      exit 0
+      ;;
+    *) echo "Unknown argument: $1" >&2; exit 2 ;;
+  esac
+done
+
+if [[ -z "$REGION" ]]; then
+  echo "ERROR: --region is required (or set AWS_DEFAULT_REGION before running)." >&2
+  exit 2
+fi
+
+if ! "$VALIDATE_MODE" && [[ -z "$CLUSTER" ]]; then   # --cluster is mandatory outside --validate mode
+  echo "Usage: $0 --cluster <name-or-arn> --region <region>" >&2; exit 2   # usage error: exit 2, as documented in the header
+fi
+
+if [[ ! -t 1 || "${TERM:-}" == "dumb" ]]; then   # no colour when stdout is not a terminal or TERM is dumb
+  USE_COLOR=false
+fi
+if [[ "$USE_COLOR" != true ]]; then
+  RED=''; GREEN=''; YELLOW=''; CYAN=''; BOLD=''; NC=''
+else
+  RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
+  CYAN='\033[0;36m'; BOLD='\033[1m'; NC='\033[0m'
+fi
+
+CALLER_IDENTITY=$(aws sts get-caller-identity --output json 2>&1) || {
+  echo -e "${RED}ERROR: AWS credentials not configured or expired.${NC}"
+  echo "$CALLER_IDENTITY"
+  echo ""
+  echo "→ references/cluster-diagnostics-detail.md § D (EKS Access / kubectl) for credential setup"
+  exit 1
+}
+CALLER_ARN=$(echo "$CALLER_IDENTITY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('Arn','unknown'))" 2>/dev/null || echo "unknown")
+
+CRITICAL_FAILURES=0
+WARNINGS=0
+ISSUES_FOUND=()
+
+pass()    { echo -e "  ${GREEN}[PASS]${NC}  $1${2:+ — $2}"; }
+fail()    { CRITICAL_FAILURES=$((CRITICAL_FAILURES+1)); echo -e "  ${RED}[FAIL]${NC}  $1${2:+ — $2}"; }
+warn()    { WARNINGS=$((WARNINGS+1)); echo -e "  ${YELLOW}[WARN]${NC}  $1${2:+ — $2}"; }
+info()    { echo -e "  ${CYAN}[INFO]${NC}  $1${2:+ — $2}"; }
+header()  { echo ""; echo -e "${BOLD}--- $1 ---${NC}"; }
+section() { echo ""; echo -e "${BOLD}=== $1 ===${NC}"; }
+
+add_issue() {
+  local priority="${2:-P1}"
+  ISSUES_FOUND+=("${priority}|$1")
+}
+
+_CD_TEMP_FILES=()
+trap '[[ ${#_CD_TEMP_FILES[@]} -gt 0 ]] && rm -f "${_CD_TEMP_FILES[@]}" 2>/dev/null || true' EXIT
+
+# Run a shell command on a HyperPod instance via SSM. Payload is base64-encoded
+# so shell metacharacters in the command are safely passed through argv.
+ssm_run_on_node() {
+  local iid="$1" grp="$2" cmd="$3"
+  [[ -z "$iid" || -z "$grp" || -z "$cmd" ]] && return 1
+  [[ ! "$iid" =~ ^i-[0-9a-f]{8,17}$ ]] && return 1
+  [[ -z "${CLUSTER_ID:-}" ]] && return 1
+  [[ ! "$grp" =~ ^[A-Za-z0-9._-]+$ ]] && return 1
+
+  local target="sagemaker-cluster:${CLUSTER_ID}_${grp}-${iid}"
+  local tmp; tmp=$(mktemp 2>/dev/null) || return 1
+  chmod 600 "$tmp" 2>/dev/null || true
+  _CD_TEMP_FILES+=("$tmp")
+  local cmd_b64
+  cmd_b64=$(printf '%s' "$cmd" | base64 | tr -d '\n') || return 1
+  local remote="bash -c \"echo $cmd_b64 | base64 -d | bash\""
+  python3 -c "import json,sys; print(json.dumps({'command':[sys.argv[1]]}))" "$remote" > "$tmp" || return 1
+
+  # unbuffer avoids the session-manager-plugin "Cannot perform start session:
+  # EOF" race. Only required on Slurm clusters (controller probe); guard at
+  # call site so EKS-only users aren't blocked if unbuffer is absent.
+  local _ssm_wrap=""
+  command -v unbuffer >/dev/null 2>&1 && _ssm_wrap="unbuffer"
+
+  local attempt=0 out rc
+  while (( attempt < 5 )); do
+    out=$($_ssm_wrap timeout 180 aws ssm start-session \
+      --target "$target" \
+      --document-name AWS-StartNonInteractiveCommand \
+      --parameters "file://$tmp" \
+      --region "$REGION" 2>&1)
+    rc=$?
+    # Retry transient SSM transport errors (rc=0 with EOF/plugin/timeout in stdout).
+    if (( rc == 0 )) && ! echo "$out" | grep -qiE "Cannot perform start session|EOF$|SessionManagerPlugin is not found|ERROR: Unable to|i/o timeout"; then
+      # Strip SSM session banners and the echoed base64 command line.
+      echo "$out" | grep -vE '^(Starting session with SessionId:|Exiting session with sessionId:|\s*$)' \
+                  | grep -vE "^(bash -c \"echo [A-Za-z0-9+/=]+ \| base64 -d \| bash\"|echo '[A-Za-z0-9+/=]+'|[A-Za-z0-9+/=]{40,}={0,2})[[:space:]]*\|?[[:space:]]*base64?[[:space:]]*-?d?[[:space:]]*\|?[[:space:]]*bash\"?\$" || true
+      return 0
+    fi
+    if echo "$out" | grep -qiE "ThrottlingException|RequestLimitExceeded|InternalFailure|InternalError|ServiceUnavailable|TooManyUpdates|Cannot perform start session|EOF$|SessionManagerPlugin is not found|i/o timeout"; then
+      attempt=$((attempt + 1))
+      sleep $((attempt * 3))
+      continue
+    fi
+    echo "$out" >&2
+    return 1
+  done
+  return 1
+}
+
+# Check SG self-referencing rules. Reads describe-security-groups JSON from stdin; prints LEVEL:check:message lines (PASS/FAIL/WARN), a two-field SKIP:message line when the JSON lists no group, or an empty line if the parser fails.
+check_sg_self_ref() {
+  local sg_id="$1"
+  SG_CHECK_ID="$sg_id" python3 -c "
+import sys, json, os
+sg_id = os.environ['SG_CHECK_ID']
+sgs = json.load(sys.stdin).get('SecurityGroups', [])
+if not sgs:
+    print(f'SKIP:Could not describe {sg_id}')
+    sys.exit(0)
+sg = sgs[0]
+inbound_self = any(
+    any(p.get('GroupId') == sg_id for p in r.get('UserIdGroupPairs', []))
+    for r in sg.get('IpPermissions', [])
+)
+outbound_self = any(
+    any(p.get('GroupId') == sg_id for p in r.get('UserIdGroupPairs', []))
+    for r in sg.get('IpPermissionsEgress', [])
+)
+outbound_all = any(
+    any(r2.get('CidrIp') == '0.0.0.0/0' for r2 in r.get('IpRanges', []))
+    for r in sg.get('IpPermissionsEgress', [])
+)
+if inbound_self:  print(f'PASS:inbound:SG {sg_id}: Inbound self-ref present')
+else:             print(f'FAIL:inbound:SG {sg_id}: Inbound self-ref MISSING — required for inter-node communication')
+if outbound_self: print(f'PASS:outbound:SG {sg_id}: Outbound self-ref present')
+else:             print(f'FAIL:outbound:SG {sg_id}: Outbound self-ref MISSING — required for EFA RDMA traffic')
+if outbound_all:  print(f'PASS:internet:SG {sg_id}: Outbound 0.0.0.0/0 present')
+else:             print(f'WARN:internet:SG {sg_id}: Outbound 0.0.0.0/0 missing — may be needed for AWS API calls')
+" 2>/dev/null || echo ""
+}
+
+# AWS API wrapper: aws_check LABEL CMD... runs CMD, echoes its combined stdout+stderr and returns its exit status; on an IAM denial it warns and returns 1.
+aws_check() {
+  local api_label="$1"; shift
+  local result rc=0
+  result=$("$@" 2>&1) || rc=$?
+  if [[ $rc -ne 0 ]]; then
+    if echo "$result" | grep -qiE "AccessDenied|UnauthorizedOperation|not authorized|AuthorizationError"; then
+      # Callers capture stdout via $(...) and discard it on failure, so the warning must go to stderr to be seen.
+      warn "$api_label" "IAM permission denied — results may be incomplete" >&2
+      add_issue "Missing IAM permission for $api_label → references/cluster-diagnostics-detail.md § D (EKS Access / kubectl)" "P1"  # NOTE(review): not recorded when the caller runs aws_check inside $(...) (subshell)
+      echo ""
+      return 1
+    fi
+    echo "$result"
+    return "$rc"
+  fi
+  echo "$result"
+}
+
+if "$VALIDATE_MODE"; then
+  section "HyperPod Pre-Creation Validation"
+  echo -e "Region:  ${BOLD}${REGION}${NC}"
+  echo -e "Caller:  ${BOLD}${CALLER_ARN}${NC}"
+  echo -e "Time:    $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+
+  if [[ -n "$VALIDATE_SG_IDS" ]]; then
+    header "V1. Security Group Rules"
+    for SG in $(echo "$VALIDATE_SG_IDS" | tr ',' ' '); do
+      SG_JSON=$(aws_check "describe-sg-$SG" aws ec2 describe-security-groups \
+        --group-ids "$SG" --region "$REGION" --output json) || continue
+
+      _SG_CHECK_OUT=$(echo "$SG_JSON" | check_sg_self_ref "$SG")
+      while IFS=: read -r level check msg; do
+        [[ -z "$level" ]] && continue
+        case "$level" in
+          PASS) pass "$msg" ;;
+          FAIL)
+            fail "$msg"
+            add_issue "SG $SG missing $check self-ref → references/cluster-diagnostics-detail.md § A (EFA Health Checks)" "P0"
+            ;;
+          WARN) warn "$msg" ;;
+        esac
+      done <<< "$_SG_CHECK_OUT"
+    done
+  fi
+
+  if [[ -n "$VALIDATE_SUBNET_IDS" ]]; then
+    header "V2. Subnet Configuration"
+    IFS=',' read -ra _subnet_ids <<< "$VALIDATE_SUBNET_IDS"
+    SUB_JSON=$(aws_check "describe-subnets" aws ec2 describe-subnets \
+      --subnet-ids "${_subnet_ids[@]}" \
+      --region "$REGION" --output json) || SUB_JSON='{"Subnets":[]}'
+    # Feed the loop by process substitution, not a pipe: a piped while-loop runs in a subshell, so fail/add_issue results would never reach the summary.
+    while IFS=: read -r tag rest; do
+      case "$tag" in
+        SUBNET)
+          IFS=: read -r sid _vpc az free status <<< "$rest"
+          if [[ "$status" == "LOW" ]]; then
+            warn "Subnet $sid (AZ=$az) — only $free IPs available"
+          else
+            pass "Subnet $sid" "AZ=$az FreeIPs=$free"
+          fi
+          ;;
+        VPC_COUNT)
+          if [[ "$rest" -gt 1 ]]; then
+            fail "Subnets are in DIFFERENT VPCs — all must be in same VPC"
+            add_issue "Subnets in different VPCs → references/cluster-diagnostics-detail.md § B (Capacity & AZ)" "P0"
+          else
+            pass "All subnets in same VPC"
+          fi
+          ;;
+        AZ_COUNT)
+          info "Subnets span $rest availability zone(s)"
+          ;;
+      esac
+    done < <(echo "$SUB_JSON" | python3 -c "
+import sys, json
+subnets = json.load(sys.stdin).get('Subnets', [])
+vpcs = set()
+azs = set()
+for s in subnets:
+    sid = s.get('SubnetId', '?')
+    vpc = s.get('VpcId', '?')
+    az = s.get('AvailabilityZone', '?')
+    free = s.get('AvailableIpAddressCount', 0)
+    vpcs.add(vpc)
+    azs.add(az)
+    status = 'LOW' if free < 10 else 'OK'
+    print(f'SUBNET:{sid}:{vpc}:{az}:{free}:{status}')
+print(f'VPC_COUNT:{len(vpcs)}')
+print(f'AZ_COUNT:{len(azs)}')
+" 2>/dev/null)
+  fi
+
+  if [[ -n "$VALIDATE_IAM_ROLE" ]]; then
+    header "V3. IAM Execution Role"
+    ROLE_NAME=$(echo "$VALIDATE_IAM_ROLE" | awk -F/ '{print $NF}')
+    ROLE_INFO=$(aws_check "get-role" aws iam get-role --role-name "$ROLE_NAME" --output json) || ROLE_INFO=""
+    if [[ -n "$ROLE_INFO" ]]; then
+      pass "IAM role exists" "$ROLE_NAME"
+      TRUST_SM=$(echo "$ROLE_INFO" | python3 -c "
+import sys,json
+doc=json.load(sys.stdin).get('Role',{}).get('AssumeRolePolicyDocument',{})
+stmts=doc.get('Statement',[])
+for s in (stmts if isinstance(stmts,list) else [stmts]):  # a lone statement may be an object, not a list
+    p=s.get('Principal',{})
+    svc=p.get('Service',[]) if isinstance(p,dict) else []  # Principal may be the bare string '*'
+    if s.get('Effect')=='Allow' and 'sagemaker.amazonaws.com' in ([svc] if isinstance(svc,str) else svc):
+        print('true')
+        break
+else:
+    print('false')
+" 2>/dev/null)
+      if [[ "$TRUST_SM" == "true" ]]; then
+        pass "Trust policy" "allows sagemaker.amazonaws.com"
+      else
+        fail "Trust policy" "missing sagemaker.amazonaws.com — cluster creation will fail"
+        add_issue "IAM execution role trust policy missing sagemaker.amazonaws.com → references/cluster-diagnostics-detail.md § H (CloudFormation Errors / SLR)" "P0"
+      fi
+      # A sentinel separates "listing failed" from "no managed policies attached"; an empty list must still run the checks below.
+      POLICIES=$(aws_check "list-attached-role-policies-$ROLE_NAME" \
+        aws iam list-attached-role-policies --role-name "$ROLE_NAME" \
+        --query 'AttachedPolicies[*].PolicyArn' --output text) || POLICIES="__LIST_FAILED__"
+      if [[ "$POLICIES" != "__LIST_FAILED__" ]]; then
+        if echo "$POLICIES" | grep -q "AmazonSageMakerClusterInstanceRolePolicy"; then
+          pass "Managed policy" "AmazonSageMakerClusterInstanceRolePolicy attached"
+        else
+          warn "Managed policy" "AmazonSageMakerClusterInstanceRolePolicy not attached — cluster bootstrap will fail"
+          add_issue "IAM execution role missing AmazonSageMakerClusterInstanceRolePolicy → references/cluster-diagnostics-detail.md § H (CloudFormation Errors / SLR)" "P0"
+        fi
+        if echo "$POLICIES" | grep -q "AmazonSSMManagedInstanceCore"; then
+          pass "Managed policy" "AmazonSSMManagedInstanceCore attached (SSM access)"
+        else
+          warn "Managed policy" "AmazonSSMManagedInstanceCore not attached — SSM node access will not work"
+          add_issue "IAM execution role missing AmazonSSMManagedInstanceCore → references/cluster-diagnostics-detail.md § F (SSM Connectivity)" "P1"
+        fi
+      fi
+    else
+      fail "IAM role" "cannot find role '$ROLE_NAME'"
+      add_issue "IAM execution role not found → references/cluster-diagnostics-detail.md § H (CloudFormation Errors / SLR)" "P0"
+    fi
+  fi
+
+  if [[ -n "$VALIDATE_SUBNET_IDS" ]]; then
+    header "V4. VPC Endpoints"
+    FIRST_SUBNET=$(echo "$VALIDATE_SUBNET_IDS" | cut -d, -f1)
+    VPC_FOR_EP=$(aws ec2 describe-subnets --subnet-ids "$FIRST_SUBNET" \
+      --region "$REGION" --query 'Subnets[0].VpcId' --output text 2>/dev/null || echo "")
+    if [[ -n "$VPC_FOR_EP" && "$VPC_FOR_EP" != "None" ]]; then
+      ENDPOINTS=$(aws ec2 describe-vpc-endpoints \
+        --filters "Name=vpc-id,Values=$VPC_FOR_EP" \
+        --region "$REGION" \
+        --query "VpcEndpoints[?State==\`available\`].ServiceName" \
+        --output text 2>/dev/null || echo "")
+      for SVC in s3 ssm ssmmessages ec2messages; do
+        if echo "$ENDPOINTS" | grep -qE "(^|[.])${SVC}($|[[:space:]])"; then
+          pass "VPC endpoint: $SVC"
+        else
+          warn "VPC endpoint: $SVC" "not found — needed for private VPC clusters"
+          add_issue "Missing VPC endpoint for $SVC → references/cluster-diagnostics-detail.md § C (Lifecycle Scripts)" "P2"
+        fi
+      done
+    fi
+  fi
+
+  if [[ -n "$VALIDATE_INSTANCE_TYPE" && -n "$VALIDATE_SUBNET_IDS" ]]; then
+    header "V5. Instance-Type Capacity per AZ"
+    # EC2 API takes the bare type, not the ml. prefix.
+    EC2_TYPE="${VALIDATE_INSTANCE_TYPE#ml.}"
+
+    AZ_OFFERINGS=$(aws_check "describe-instance-type-offerings-$EC2_TYPE" \
+      aws ec2 describe-instance-type-offerings \
+      --location-type availability-zone \
+      --filters "Name=instance-type,Values=${EC2_TYPE}" \
+      --region "$REGION" \
+      --query 'InstanceTypeOfferings[*].Location' --output text) || AZ_OFFERINGS=""
+
+    if [[ -z "$AZ_OFFERINGS" ]]; then
+      fail "Instance type $VALIDATE_INSTANCE_TYPE" "not offered in region $REGION"
+      add_issue "$VALIDATE_INSTANCE_TYPE is not offered in any AZ in $REGION → references/capacity-planning.md" "P0"
+    else
+      info "$VALIDATE_INSTANCE_TYPE available in AZ(s): $AZ_OFFERINGS"
+
+      IFS=',' read -ra _subnet_ids <<< "$VALIDATE_SUBNET_IDS"
+      SUB_AZ_JSON=$(aws_check "describe-subnets-validate" aws ec2 describe-subnets \
+        --subnet-ids "${_subnet_ids[@]}" \
+        --region "$REGION" \
+        --query 'Subnets[*].{SubnetId:SubnetId,AZ:AvailabilityZone}' --output json) || SUB_AZ_JSON="[]"
+
+      MATCHED=0
+      while IFS=$'\t' read -r sid az; do
+        [[ -z "$sid" ]] && continue
+        if echo "$AZ_OFFERINGS" | tr '\t' '\n' | grep -qx "$az"; then
+          pass "Subnet $sid (AZ=$az)" "$VALIDATE_INSTANCE_TYPE is available"
+          MATCHED=$((MATCHED+1))
+        else
+          fail "Subnet $sid (AZ=$az)" "$VALIDATE_INSTANCE_TYPE NOT offered here"
+          add_issue "Subnet $sid AZ=$az does not offer $VALIDATE_INSTANCE_TYPE → references/capacity-planning.md" "P0"
+        fi
+      done < <(echo "$SUB_AZ_JSON" | python3 -c "
+import sys, json
+for s in json.load(sys.stdin):
+    print(f\"{s.get('SubnetId','')}\t{s.get('AZ','')}\")
+" 2>/dev/null)
+
+      if [[ $MATCHED -eq 0 ]]; then
+        warn "No provided subnet is in an AZ that offers $VALIDATE_INSTANCE_TYPE — cluster creation will fail with Insufficient capacity / No subnets in the capacity AZ"
+      fi
+    fi
+  fi
+
+  if [[ -n "$VALIDATE_S3_URI" ]]; then
+    header "V6. S3 Lifecycle Scripts"
+    if [[ ! "$VALIDATE_S3_URI" =~ ^s3:// ]]; then
+      fail "S3 URI" "must start with s3:// (got '$VALIDATE_S3_URI')"
+      add_issue "S3 URI is not a valid s3:// URI → references/lifecycle-scripts.md" "P0"
+    else
+      S3_URI_NORM="${VALIDATE_S3_URI%/}/"
+      info "S3 URI: $S3_URI_NORM"
+
+      S3_LIST=$(aws_check "s3-ls-$S3_URI_NORM" \
+        aws s3 ls "$S3_URI_NORM" --region "$REGION") || S3_LIST=""
+
+      if [[ -z "$S3_LIST" ]]; then
+        fail "S3 access" "cannot list $S3_URI_NORM — bucket missing, permissions denied, or empty prefix"
+        add_issue "S3 URI not accessible or empty: $S3_URI_NORM → references/lifecycle-scripts.md" "P0"
+      else
+        pass "S3 access" "prefix is listable"
+        # Match each key name exactly (last column of the listing) so look-alikes such as on_create.sh.bak do not count.
+        if echo "$S3_LIST" | grep -qE '[[:space:]]on_create\.sh$'; then
+          pass "on_create.sh" "entry script present"
+          # CRLF is detected with grep rather than file(1), which may be missing on minimal hosts and would then report a false pass.
+          TMPFILE=$(mktemp)
+          if aws s3 cp "${S3_URI_NORM}on_create.sh" "$TMPFILE" \
+               --region "$REGION" --only-show-errors 2>/dev/null; then
+            if LC_ALL=C grep -q $'\r$' "$TMPFILE"; then
+              fail "on_create.sh" "has Windows CRLF line endings — will fail on Linux"
+              add_issue "on_create.sh has CRLF line endings → references/lifecycle-scripts.md" "P0"
+            else
+              pass "on_create.sh" "Unix line endings"
+            fi
+            if head -1 "$TMPFILE" | grep -q "^#!"; then
+              pass "on_create.sh" "shebang present"
+            else
+              warn "on_create.sh" "missing shebang (#!/bin/bash)"
+              add_issue "on_create.sh missing shebang → references/lifecycle-scripts.md" "P1"
+            fi
+          else
+            warn "on_create.sh" "could not download for inspection"
+          fi
+          rm -f "$TMPFILE"
+        else
+          fail "on_create.sh" "entry script NOT FOUND at $S3_URI_NORM — cluster creation will fail"
+          add_issue "Missing on_create.sh at $S3_URI_NORM → references/lifecycle-scripts.md" "P0"
+        fi
+
+        if   echo "$S3_LIST" | grep -qE '[[:space:]]lifecycle_script\.py$'; then
+          pass "Orchestrator script" "lifecycle_script.py present (Slurm)"
+        elif echo "$S3_LIST" | grep -qE '[[:space:]]on_create_main\.sh$'; then
+          pass "Orchestrator script" "on_create_main.sh present (EKS)"
+        else
+          warn "Orchestrator script" "neither lifecycle_script.py (Slurm) nor on_create_main.sh (EKS) found at $S3_URI_NORM"
+          add_issue "Missing orchestrator-specific lifecycle script at $S3_URI_NORM → references/lifecycle-scripts.md" "P1"
+        fi
+      fi
+    fi
+  fi
+
+  echo ""
+  echo -e "${BOLD}========================================${NC}"
+  echo -e "${BOLD}       VALIDATION SUMMARY               ${NC}"
+  echo -e "${BOLD}========================================${NC}"
+  echo ""
+  echo -e "  Results: ${RED}${CRITICAL_FAILURES} critical${NC} | ${YELLOW}${WARNINGS} warnings${NC}"
+  echo -e "  Mode:    READ-ONLY (no changes made; each [FAIL] points to a references section)"
+  echo ""
+  if [[ ${#ISSUES_FOUND[@]} -gt 0 ]]; then
+    echo -e "${BOLD}  Issues:${NC}"
+    for priority in P0 P1 P2; do
+      for issue in "${ISSUES_FOUND[@]}"; do
+        if [[ "$issue" == "${priority}|"* ]]; then
+          desc="${issue#*|}"
+          case "$priority" in
+            P0) echo -e "    ${RED}[${priority}]${NC} $desc" ;;
+            P1) echo -e "    ${YELLOW}[${priority}]${NC} $desc" ;;
+            P2) echo -e "    [${priority}] $desc" ;;
+          esac
+        fi
+      done
+    done
+    echo ""
+  fi
+  if [[ $CRITICAL_FAILURES -eq 0 ]]; then
+    echo -e "  ${GREEN}${BOLD}Pre-flight validation passed. Safe to create cluster.${NC}"
+  else
+    echo -e "  ${RED}${BOLD}Fix P0 issues above before creating the cluster.${NC}"
+  fi
+  echo ""
+  exit "$([[ $CRITICAL_FAILURES -eq 0 ]] && echo 0 || echo 1)"
+fi
+
+section "HyperPod Cluster Diagnostics (read-only)"
+echo -e "Cluster: ${BOLD}${CLUSTER}${NC}"
+echo -e "Region:  ${BOLD}${REGION}${NC}"
+echo -e "Time:    $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+echo -e "${CYAN}   No cluster state will be modified. Each issue line below includes a${NC}"
+echo -e "${CYAN}   pointer to references/cluster-diagnostics-detail.md for remediation.${NC}"
+
+header "1. Cluster Identity & Status"
+
+CLUSTER_JSON=$(aws sagemaker describe-cluster \
+  --cluster-name "$CLUSTER" \
+  --region "$REGION" \
+  --cli-read-timeout 30 \
+  --output json 2>&1) || {
+  echo -e "${RED}ERROR: Could not describe cluster '$CLUSTER' in region '$REGION'${NC}"
+  echo "$CLUSTER_JSON" | head -3
+  echo ""
+  if echo "$CLUSTER_JSON" | grep -qiE "ResourceNotFound|Cluster with name .* not found"; then
+    echo "Available clusters in $REGION:"
+    aws sagemaker list-clusters --region "$REGION" \
+      --query 'ClusterSummaries[*].{Name:ClusterName,Status:ClusterStatus}' \
+      --output table 2>/dev/null || echo "  (unable to list clusters — check IAM)"
+  else
+    echo "Verify:"
+    echo "  1. Cluster name is correct (use: aws sagemaker list-clusters --region $REGION)"
+    echo "  2. Region is correct"
+    echo "  3. IAM permissions include sagemaker:DescribeCluster"
+  fi
+  exit 1
+}
+
+CLUSTER_ARN=$(echo "$CLUSTER_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin).get('ClusterArn',''))" 2>/dev/null)
+CLUSTER_ID=$(echo "$CLUSTER_ARN" | awk -F'/' '{print $NF}')
+if [[ -z "$CLUSTER_ID" ]]; then
+  echo "ERROR: Could not extract cluster ID from ARN '$CLUSTER_ARN'. Verify the cluster name/ARN."
+  exit 1
+fi
+CLUSTER_STATUS=$(echo "$CLUSTER_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin).get('ClusterStatus','unknown'))" 2>/dev/null)
+ORCHESTRATOR=$(echo "$CLUSTER_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); o=d.get('Orchestrator',{}); print('EKS' if 'Eks' in o else 'Slurm')" 2>/dev/null)
+NODE_RECOVERY=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+# Prefer cluster-level NodeRecovery (the API's canonical location); fall back to
+# per-InstanceGroup only when top-level is absent. Reading only per-group yields
+# 'Unknown' on every cluster because the field is null at group level when set
+# cluster-wide.
+top=d.get('NodeRecovery')
+if top:
+    print(top)
+else:
+    groups=d.get('InstanceGroups',[])
+    recoveries={g.get('NodeRecovery') for g in groups if g.get('NodeRecovery')}
+    print(','.join(sorted(recoveries)) if recoveries else 'Unknown')
+" 2>/dev/null || echo "Unknown")
+
+info "ARN:          $CLUSTER_ARN"
+info "Cluster ID:   $CLUSTER_ID"
+info "Status:       $CLUSTER_STATUS"
+info "Orchestrator: $ORCHESTRATOR"
+info "NodeRecovery: $NODE_RECOVERY"
+
+# Flag auto-recovery disabled regardless of orchestrator.
+if [[ "$NODE_RECOVERY" == *"None"* && "$NODE_RECOVERY" == *"Automatic"* ]]; then
+  warn "NodeRecovery" "mixed settings — some instance groups have recovery disabled"
+  add_issue "NodeRecovery disabled on some instance groups → references/cluster-diagnostics-detail.md § G (Node Replacement)" "P2"
+elif [[ "$NODE_RECOVERY" == *"None"* ]]; then
+  warn "NodeRecovery" "disabled on all instance groups — auto-replacement won't trigger"
+  add_issue "NodeRecovery disabled → references/cluster-diagnostics-detail.md § G (Node Replacement)" "P2"
+fi
+
+CREATION_TIME=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+ct=d.get('CreationTime','')
+print(ct if ct else '')
+" 2>/dev/null || echo "")
+
+LAST_MODIFIED_TIME=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+lm=d.get('LastModifiedTime','')
+print(lm if lm else '')
+" 2>/dev/null || echo "")
+
+STUCK_THRESHOLD_SECONDS=3600
+# is_stuck TIMESTAMP: prints "true" if TIMESTAMP (ISO-8601 with any UTC offset, or epoch seconds) is older than STUCK_THRESHOLD_SECONDS, else "false".
+is_stuck() {
+  local creation_time="$1"
+  if [[ -z "$creation_time" ]]; then echo "false"; return; fi
+  CREATION_TS="$creation_time" THRESHOLD="$STUCK_THRESHOLD_SECONDS" python3 -c "
+import os, re
+from datetime import datetime, timezone
+ct = os.environ['CREATION_TS'].strip()
+threshold = int(os.environ['THRESHOLD'])
+try:
+    is_epoch = re.fullmatch('[0-9]+([.][0-9]*)?', ct)  # AWS CLI v1 prints timestamps as epoch seconds
+    iso = re.sub('([.][0-9]{6})[0-9]+', lambda m: m.group(1), (ct[:-1] + '+00:00') if ct.endswith('Z') else ct)  # cap fraction at 6 digits, keep the offset
+    created = datetime.fromtimestamp(float(ct), timezone.utc) if is_epoch else datetime.fromisoformat(iso)
+    if created.tzinfo is None: created = created.replace(tzinfo=timezone.utc)  # assume UTC only when no offset was given
+    print('true' if (datetime.now(timezone.utc) - created).total_seconds() > threshold else 'false')
+except (ValueError, TypeError, OverflowError, OSError):
+    # Unparseable timestamp — assume not stuck rather than abort the whole run.
+    print('false')
+" 2>/dev/null || echo "false"
+}
+
+case "$CLUSTER_STATUS" in
+  InService)    pass "Cluster status" "InService" ;;
+  Creating)
+    STUCK=$(is_stuck "$CREATION_TIME")
+    if [[ "$STUCK" == "true" ]]; then
+      fail "Cluster status" "Creating for over 1 hour — likely stuck"
+      add_issue "Cluster stuck in Creating > 1hr → references/cluster-diagnostics-detail.md § E (Cluster Provisioning), § H (CloudFormation)" "P0"
+    else
+      warn "Cluster status" "Creating — cluster is still being provisioned"
+      add_issue "Cluster still creating → references/cluster-diagnostics-detail.md § E (Cluster Provisioning)" "P1"
+    fi ;;
+  Updating)
+    STUCK=$(is_stuck "${LAST_MODIFIED_TIME:-$CREATION_TIME}")
+    if [[ "$STUCK" == "true" ]]; then
+      fail "Cluster status" "Updating — check if operation is stuck"
+      add_issue "Cluster may be stuck Updating → references/cluster-diagnostics-detail.md § E (Cluster Provisioning), § H (CloudFormation)" "P1"
+    else
+      warn "Cluster status" "Updating — cluster operation in progress"
+    fi ;;
+  Failed)       fail "Cluster status" "Failed — check events and CloudFormation"; add_issue "Cluster FAILED → references/cluster-diagnostics-detail.md § E (Cluster Provisioning), § H (CloudFormation)" "P0" ;;
+  Deleting)
+    STUCK=$(is_stuck "${LAST_MODIFIED_TIME:-$CREATION_TIME}")
+    if [[ "$STUCK" == "true" ]]; then
+      warn "Cluster status" "Deleting for extended time — may be blocked by VPC ENI dependencies"
+      add_issue "Cluster stuck Deleting → references/cluster-diagnostics-detail.md § E (Cluster Provisioning)" "P1"
+    else
+      warn "Cluster status" "Deleting"
+    fi ;;
+  RollingBack)  warn "Cluster status" "RollingBack — update is being rolled back"; add_issue "Cluster RollingBack → references/cluster-diagnostics-detail.md § J (AMI & Cluster Updates)" "P1" ;;
+  *RollbackFailed*|*MaintenanceFailed*)
+    fail "Cluster status" "$CLUSTER_STATUS — cluster is stuck in a non-recoverable state"
+    add_issue "Cluster stuck in $CLUSTER_STATUS → references/cluster-diagnostics-detail.md § J (AMI & Cluster Updates)" "P0" ;;
+  *)            warn "Cluster status" "$CLUSTER_STATUS" ;;
+esac
+
+EKS_NAME=""
+if [[ "$ORCHESTRATOR" == "EKS" ]]; then
+  EKS_NAME=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+arn=d.get('Orchestrator',{}).get('Eks',{}).get('ClusterArn','')
+print(arn.split('/')[-1] if arn else '')
+" 2>/dev/null || echo "")
+  if [[ -n "$EKS_NAME" ]]; then
+    info "EKS Cluster:  $EKS_NAME"
+  fi
+fi
+
+header "2. Instance Groups & Node Health"
+
+echo "$CLUSTER_JSON" | python3 -c "
+import sys, json
+d = json.load(sys.stdin)
+groups = d.get('InstanceGroups', [])
+if not groups:
+    print('  No instance groups found')
+else:
+    for g in groups:
+        name = g.get('InstanceGroupName', '?')
+        itype = g.get('InstanceType', '?')
+        target = g.get('TargetCount', 0)
+        current = g.get('CurrentCount', 0)
+        status = g.get('Status', g.get('InstanceGroupStatus', '?'))
+        threads = g.get('ThreadsPerCore', '?')
+        # TargetStateCount is the count the service is working toward when a
+        # resize is in flight; print when it differs from TargetCount.
+        tstate = g.get('TargetStateCount', None)
+        # Note: NodeRecovery is a cluster-level field in the DescribeCluster
+        # response, not per-group; shown on the cluster header line above.
+        print(f'  {name}: type={itype} target={target} current={current} status={status} threads/core={threads}')
+        if tstate is not None and tstate != target:
+            print(f'    TargetStateCount={tstate} (resize in progress)')
+        if current < target:
+            print(f'    Current count ({current}) < target ({target}) — instances may still be provisioning or failed')
+" 2>/dev/null
+
+# Fetch every node with paginated list-cluster-nodes — the default page is small and large clusters would silently truncate, breaking dangling-node reconciliation below.
+# Prints {"ClusterNodeSummaries":[...]} on stdout; on an IAM denial prints __AUTH_DENIED__ and returns 1.
+fetch_all_cluster_nodes_cd() {
+  local merged='[]' token='' page_json combined i=0 rc
+  local max_pages=200  # 200 × 100 = 20 000 nodes, supports 7k+ clusters
+  while (( i < max_pages )); do
+    if [[ -n "$token" ]]; then
+      page_json=$(aws sagemaker list-cluster-nodes \
+        --cluster-name "$CLUSTER" --region "$REGION" \
+        --max-results 100 --next-token "$token" \
+        --cli-read-timeout 30 --output json 2>&1) && rc=0 || rc=$?
+    else
+      page_json=$(aws sagemaker list-cluster-nodes \
+        --cluster-name "$CLUSTER" --region "$REGION" \
+        --max-results 100 \
+        --cli-read-timeout 30 --output json 2>&1) && rc=0 || rc=$?
+    fi
+    if (( rc != 0 )) && echo "$page_json" | grep -qiE "AccessDenied|not authorized|UnauthorizedAccess"; then
+      echo "__AUTH_DENIED__"
+      return 1
+    fi
+    # The denial check above must run before this break: the CLI exits non-zero on AccessDenied. Other failures stop paging and keep the pages merged so far.
+    if (( rc != 0 )); then break; fi
+    # Merge via stdin (NUL-delimited blobs) instead of argv — argv is size-capped (ARG_MAX), which would silently truncate large clusters.
+    combined=$(printf '%s\0%s' "$merged" "$page_json" | python3 -c "
+import sys, json
+blob = sys.stdin.buffer.read()
+try:
+    a, b = blob.split(b'\0', 1)
+    merged = json.loads(a)
+    page = json.loads(b)
+except (json.JSONDecodeError, ValueError):
+    sys.exit(2)
+merged.extend(page.get('ClusterNodeSummaries', []))
+print(json.dumps(merged))
+print(page.get('NextToken', ''))
+" 2>/dev/null) || break
+    merged=$(printf '%s\n' "$combined" | sed -n '1p')
+    token=$(printf '%s\n'  "$combined" | sed -n '2p')
+    i=$((i+1))
+    [[ -z "$token" ]] && break
+  done
+  if (( i == max_pages )) && [[ -n "$token" ]]; then
+    # Surface truncation via a marker file — this function runs inside $(...)
+    # (command substitution subshell), so add_issue would be lost. The parent
+    # shell checks for the marker after the call returns.
+    echo "WARN: list-cluster-nodes truncated at ${max_pages} pages (~$((max_pages*100)) nodes). Diagnostic sample is incomplete for very large clusters." >&2
+    : > "${_NODE_TRUNC_MARKER:-/dev/null}" 2>/dev/null || true
+  fi
+  printf '%s' "$merged" | python3 -c "
+import sys, json
+try:
+    print(json.dumps({'ClusterNodeSummaries': json.loads(sys.stdin.read())}))
+except json.JSONDecodeError:
+    print('{\"ClusterNodeSummaries\":[]}')
+" 2>/dev/null || echo '{"ClusterNodeSummaries":[]}'
+}
+
+_NODE_TRUNC_MARKER=$(mktemp 2>/dev/null) && _CD_TEMP_FILES+=("$_NODE_TRUNC_MARKER") || _NODE_TRUNC_MARKER=""
+export _NODE_TRUNC_MARKER
+rm -f "$_NODE_TRUNC_MARKER" 2>/dev/null || true
+# The function returns 1 on IAM denial; '|| true' keeps that status from aborting the run so the sentinel check below can report it.
+NODE_LIST=$(fetch_all_cluster_nodes_cd) || true
+if [[ "$NODE_LIST" == "__AUTH_DENIED__" ]]; then
+  warn "list-cluster-nodes" "IAM permission denied — add sagemaker:ListClusterNodes to your role"
+  add_issue "Missing IAM permission for sagemaker:ListClusterNodes → references/cluster-diagnostics-detail.md § D (EKS Access / kubectl)" "P1"
+  NODE_LIST='{"ClusterNodeSummaries":[]}'
+fi
+
+# Parent-shell follow-up for the truncation marker set inside the subshell.
+if [[ -n "$_NODE_TRUNC_MARKER" && -e "$_NODE_TRUNC_MARKER" ]]; then
+  add_issue "Node list truncated at 200 pages (~20000 nodes); diagnostic sample incomplete → references/cluster-diagnostics-detail.md § E (Cluster Provisioning)" "P2"
+fi
+
+TOTAL_NODES=$(echo "$NODE_LIST" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('ClusterNodeSummaries',[])))" 2>/dev/null || echo 0)
+info "Total nodes reported: $TOTAL_NODES"
+
+UNHEALTHY_NODES=$(echo "$NODE_LIST" | python3 -c "
+import sys, json
+nodes = json.load(sys.stdin).get('ClusterNodeSummaries', [])
+unhealthy = [n for n in nodes if n.get('InstanceStatus', {}).get('Status', '') not in ('Running', 'Pending')]
+if unhealthy:
+    for n in unhealthy:
+        nid = n.get('InstanceId', '?')
+        group = n.get('InstanceGroupName', '?')
+        status = n.get('InstanceStatus', {}).get('Status', '?')
+        msg = n.get('InstanceStatus', {}).get('Message', '')
+        print(f'  {nid} ({group}): {status} {msg}')
+    print(f'UNHEALTHY_COUNT={len(unhealthy)}')
+else:
+    print('UNHEALTHY_COUNT=0')
+" 2>/dev/null || echo "UNHEALTHY_COUNT=0")
+
+UNHEALTHY_COUNT=$(echo "$UNHEALTHY_NODES" | grep "^UNHEALTHY_COUNT=" | cut -d= -f2)
+[[ -z "$UNHEALTHY_COUNT" ]] && UNHEALTHY_COUNT=0
+echo "$UNHEALTHY_NODES" | grep -v "^UNHEALTHY_COUNT=" || true
+
+if [[ "$UNHEALTHY_COUNT" -gt 0 ]]; then
+  warn "Node health" "$UNHEALTHY_COUNT unhealthy node(s)"
+  add_issue "$UNHEALTHY_COUNT unhealthy node(s) → references/cluster-diagnostics-detail.md § G (Node Replacement); delegate to hyperpod-node-debugger" "P1"
+
+  echo "$NODE_LIST" | python3 -c "
+import sys, json
+from collections import defaultdict
+nodes = json.load(sys.stdin).get('ClusterNodeSummaries', [])
+groups = defaultdict(lambda: {'total': 0, 'unhealthy': 0})
+for n in nodes:
+    g = n.get('InstanceGroupName', 'unknown')
+    groups[g]['total'] += 1
+    st = n.get('InstanceStatus', {}).get('Status', '')
+    if st not in ('Running', 'Pending', ''):
+        groups[g]['unhealthy'] += 1
+for g, c in groups.items():
+    if c['unhealthy'] > 0:
+        pct = int(c['unhealthy'] / c['total'] * 100) if c['total'] > 0 else 0
+        print(f'  [WARN] Group {g}: {c[\"unhealthy\"]}/{c[\"total\"]} unhealthy ({pct}%)')
+" 2>/dev/null
+
+elif [[ "$TOTAL_NODES" -eq 0 && "$CLUSTER_STATUS" == "InService" ]]; then
+  warn "Node health" "Cluster InService but 0 nodes reported"
+  add_issue "Cluster InService but no nodes → references/cluster-diagnostics-detail.md § E (Cluster Provisioning)" "P1"
+else
+  pass "Node health" "$TOTAL_NODES node(s), $UNHEALTHY_COUNT unhealthy"
+fi
+
+header "3. Cluster Events (Recent)"
+
+# Paginate up to 5 pages (500 events) so the event scan covers incident windows
+# longer than the default page. Long-lived clusters with rolling replacements
+# regularly generate >100 events. Prints {"ClusterEventSummaries":[...]}, or __AUTH_DENIED__ (return 1) on IAM denial.
+fetch_cluster_events_cd() {
+  local merged='[]' token='' page_json combined i=0 denied=0 rc
+  while (( i < 5 )); do
+    if [[ -n "$token" ]]; then
+      page_json=$(aws sagemaker list-cluster-events \
+        --cluster-name "$CLUSTER" --region "$REGION" \
+        --max-results 100 --next-token "$token" \
+        --cli-read-timeout 30 --output json 2>&1) && rc=0 || rc=$?
+    else
+      page_json=$(aws sagemaker list-cluster-events \
+        --cluster-name "$CLUSTER" --region "$REGION" \
+        --max-results 100 \
+        --cli-read-timeout 30 --output json 2>&1) && rc=0 || rc=$?
+    fi
+    if (( rc != 0 )) && echo "$page_json" | grep -qi "AccessDenied\|not authorized"; then
+      denied=1
+      break
+    fi
+    # The denial check above must run before this break: the CLI exits non-zero on AccessDenied.
+    if (( rc != 0 )); then break; fi
+    # Pass the JSON through stdin (NUL-delimited), not argv: argument size is capped and 500 merged events can exceed it.
+    combined=$(printf '%s\0%s' "$merged" "$page_json" | python3 -c "
+import sys, json
+try:
+    prev, page = (json.loads(x) for x in sys.stdin.buffer.read().split(b'\0', 1))
+except (json.JSONDecodeError, ValueError):
+    # Malformed page response — stop paginating; caller falls through on break.
+    sys.exit(2)
+prev.extend(page.get('ClusterEventSummaries', []))
+print(json.dumps(prev))
+print(page.get('NextToken',''))
+" 2>/dev/null) || break
+    merged=$(printf '%s\n' "$combined" | sed -n '1p')
+    token=$(printf '%s\n'  "$combined" | sed -n '2p')
+    i=$((i+1))
+    [[ -z "$token" ]] && break
+  done
+  if (( denied )); then
+    echo "__AUTH_DENIED__"
+    return 1
+  fi
+  printf '%s' "$merged" | python3 -c "import sys, json; print(json.dumps({'ClusterEventSummaries': json.loads(sys.stdin.read())}))" \
+    2>/dev/null || echo '{"ClusterEventSummaries":[]}'
+}
+
+# The function returns 1 on IAM denial; '|| true' keeps that status from aborting the run so the sentinel check below can report it.
+EVENTS_JSON=$(fetch_cluster_events_cd) || true
+if [[ "$EVENTS_JSON" == "__AUTH_DENIED__" ]]; then
+  warn "list-cluster-events" "IAM permission denied — add sagemaker:ListClusterEvents to your role"
+  EVENTS_JSON='{"ClusterEventSummaries":[]}'
+fi
+
+EVENT_COUNT=$(echo "$EVENTS_JSON" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('ClusterEventSummaries',[])))" 2>/dev/null || echo 0)
+
+if [[ "$EVENT_COUNT" -eq 0 ]]; then
+  info "No cluster events found"
+  if [[ "$ORCHESTRATOR" == "Slurm" ]]; then
+    info "(Cluster events may not be available for HyperPod Slurm clusters)"
+  fi
+else
+  echo "$EVENTS_JSON" | python3 -c "
+import sys, json
+events = json.load(sys.stdin).get('ClusterEventSummaries', [])
+
+# Issue pattern mapping
+ISSUE_PATTERNS = {
+    'EFA health checks': 'EFA health check failure → references/cluster-diagnostics-detail.md § A',
+    'Insufficient capacity': 'Capacity error → references/cluster-diagnostics-detail.md § B',
+    'No subnets in the capacity': 'AZ/subnet mismatch → references/cluster-diagnostics-detail.md § B',
+    'Lifecycle scripts did not run': 'Lifecycle script failure → references/cluster-diagnostics-detail.md § C',
+    'Lifecycle scripts execution timed out': 'Lifecycle script timeout → references/cluster-diagnostics-detail.md § C',
+    'network misconfiguration': 'Network misconfiguration → references/cluster-diagnostics-detail.md § A + § B',
+    'hardware failure': 'Hardware failure → delegate to node-debugger',
+    'Failed to provision': 'Provisioning failure → references/cluster-diagnostics-detail.md § B or § E',
+    'replace': 'Node replacement activity → references/cluster-diagnostics-detail.md § G',
+    'reboot': 'Node reboot activity → references/cluster-diagnostics-detail.md § G',
+}
+# Null-safe field reads. NOTE(review): ClusterEventSummary appears to expose Description/ResourceType — confirm against the API; Message/EventType are still tried first.
+for e in events[:20]:
+    ts = str(e.get('EventTime', '?'))[:19]
+    etype = e.get('EventType') or e.get('ResourceType') or '?'
+    full = str(e.get('Message') or e.get('Description') or '')
+    msg = full[:120] or '?'
+    print(f'  [{ts}] {etype}: {msg}')
+    msg_lower = full.lower()
+    for pattern, hint in ISSUE_PATTERNS.items():
+        if pattern.lower() in msg_lower:
+            print(f'    [ISSUE] {hint}')
+            break
+" 2>/dev/null
+fi
+
+header "4. VPC & Security Group Configuration"
+
+SUBNET_IDS=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+print(' '.join(d.get('VpcConfig',{}).get('Subnets',[])))
+" 2>/dev/null || echo "")
+
+SG_IDS=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+print(' '.join(d.get('VpcConfig',{}).get('SecurityGroupIds',[])))
+" 2>/dev/null || echo "")
+
+if [[ -z "$SUBNET_IDS" ]]; then
+  warn "VpcConfig" "No VpcConfig found in cluster"
+else
+  info "Subnets: $SUBNET_IDS"
+  info "Security Groups: $SG_IDS"
+
+  IFS=' ' read -ra _subnet_ids_arr <<< "$SUBNET_IDS"
+  SUBNET_JSON=$(aws ec2 describe-subnets \
+    --subnet-ids "${_subnet_ids_arr[@]}" \
+    --region "$REGION" \
+    --cli-read-timeout 30 \
+    --output json 2>&1) || {
+    SUB_ERR="$SUBNET_JSON"
+    if echo "$SUB_ERR" | grep -qi "AccessDenied\|UnauthorizedOperation\|not authorized"; then
+      warn "describe-subnets" "IAM permission denied — add ec2:DescribeSubnets to your role"
+    fi
+    SUBNET_JSON='{"Subnets":[]}'
+  }
+
+  _SUBNET_CHECK=$(echo "$SUBNET_JSON" | python3 -c "
+import sys, json
+subnets = json.load(sys.stdin).get('Subnets', [])
+vpcs = set()
+for s in subnets:
+    sid = s.get('SubnetId', '?')
+    vpc = s.get('VpcId', '?')
+    az = s.get('AvailabilityZone', '?')
+    free = s.get('AvailableIpAddressCount', 0)
+    flag = ' LOW IPs' if free < 10 else ''
+    print(f'  {sid}: VPC={vpc} AZ={az} FreeIPs={free}{flag}')
+    vpcs.add(vpc)
+if len(vpcs) > 1:
+    print('MULTI_VPC=true')
+    print('VPC_LIST=' + ','.join(vpcs))
+else:
+    print('MULTI_VPC=false')
+    v = vpcs.pop() if vpcs else '?'
+    print('VPC_ID=' + v)
+" 2>/dev/null || echo "")
+
+  while IFS= read -r line; do
+    if [[ "$line" == "MULTI_VPC=true" ]]; then
+      fail "Subnet VPC alignment" "Subnets are in DIFFERENT VPCs — all must be in the same VPC"
+      add_issue "Subnets in different VPCs → references/cluster-diagnostics-detail.md § B (Capacity & AZ)" "P0"
+    fi
+    if [[ "$line" != MULTI_VPC=* && "$line" != VPC_ID=* && "$line" != VPC_LIST=* ]]; then
+      echo "$line"
+    fi
+  done <<< "$_SUBNET_CHECK"
+
+  # SG self-referencing rules are an EFA requirement.
+  # shellcheck disable=SC2086  # intentional word-split on space-separated SG IDs
+  for SG in $SG_IDS; do
+    SG_RESULT=$(aws ec2 describe-security-groups \
+      --group-ids "$SG" \
+      --region "$REGION" \
+      --cli-read-timeout 30 \
+      --output json 2>&1) || true
+    if echo "$SG_RESULT" | grep -qiE "AccessDenied|UnauthorizedOperation"; then
+      warn "describe-security-groups" "IAM permission denied for $SG — SG check skipped"
+      continue
+    fi
+    SG_JSON="${SG_RESULT}"
+    [[ "$SG_JSON" == "{"* ]] || SG_JSON='{"SecurityGroups":[]}'  # anything that is not a JSON object is a CLI error; the word "error" inside a valid reply is not
+
+    _SG_CHECK=$(echo "$SG_JSON" | check_sg_self_ref "$SG")
+
+    while IFS= read -r line; do
+      [[ -z "$line" ]] && continue
+      level=$(echo "$line" | cut -d: -f1)
+      msg=$(echo "$line" | cut -d: -f2-)
+      case "$level" in
+        PASS) pass "$msg" ;;
+        FAIL) fail "$msg"
+              if echo "$msg" | grep -q "Inbound self-ref MISSING"; then
+                add_issue "Security group $SG inbound self-ref MISSING → references/cluster-diagnostics-detail.md § A (EFA Health Checks)" "P0"
+              elif echo "$msg" | grep -q "Outbound self-ref MISSING"; then
+                add_issue "Security group $SG outbound self-ref MISSING → references/cluster-diagnostics-detail.md § A (EFA Health Checks)" "P0"
+              elif echo "$msg" | grep -q "Outbound 0.0.0.0/0 missing"; then
+                add_issue "Security group $SG outbound 0.0.0.0/0 MISSING → references/cluster-diagnostics-detail.md § A (EFA Health Checks)" "P0"
+              else
+                add_issue "Security group $SG rule missing → references/cluster-diagnostics-detail.md § A (EFA Health Checks)" "P0"
+              fi
+              ;;
+        WARN) warn "$msg" ;;
+        SKIP) info "$msg" ;;
+      esac
+    done <<< "$_SG_CHECK"
+  done
+fi
+
+header "4b. Instance Quotas"
+
+INSTANCE_TYPES=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+types=set(g.get('InstanceType','') for g in d.get('InstanceGroups',[]))
+print(' '.join(t for t in types if t))
+" 2>/dev/null || echo "")
+
+if [[ -n "$INSTANCE_TYPES" ]]; then
+  # One paginated list-service-quotas call, cached across all instance types.
+  # The API is account/region rate-limited and throttles if called per-type.
+  QUOTA_ALL=""
+  QUOTA_ERR=""
+  _next=""
+  for _pg in 1 2 3 4 5; do
+    if [[ -n "$_next" ]]; then
+      _raw=$(aws service-quotas list-service-quotas \
+        --service-code sagemaker --region "$REGION" \
+        --cli-read-timeout 15 --starting-token "$_next" \
+        --output json 2>&1 || true)
+    else
+      _raw=$(aws service-quotas list-service-quotas \
+        --service-code sagemaker --region "$REGION" \
+        --cli-read-timeout 15 \
+        --output json 2>&1 || true)
+    fi
+    # Order matters: test for specific errors first, then fall through to
+    # generic "not JSON" check, so throttled responses don't get misclassified.
+    if echo "$_raw" | grep -qiE "AccessDenied|UnauthorizedOperation"; then
+      QUOTA_ERR="denied"; break
+    elif echo "$_raw" | grep -qiE "TooManyRequestsException|ThrottlingException|RequestLimitExceeded|exceeded the rate"; then
+      QUOTA_ERR="throttled"; break
+    elif ! echo "$_raw" | head -c 1 | grep -q '{'; then
+      QUOTA_ERR="api-error"; break
+    fi
+    _pg_quotas=$(echo "$_raw" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps(d.get('Quotas',[])))" 2>/dev/null || echo "[]")
+    if [[ "$_pg_quotas" != "[]" ]]; then
+      if [[ -z "$QUOTA_ALL" ]]; then
+        QUOTA_ALL="$_pg_quotas"
+      else
+        QUOTA_ALL=$(printf '%s\n%s\n' "$QUOTA_ALL" "$_pg_quotas" | python3 -c "import sys,json; a=json.loads(sys.stdin.readline()); b=json.loads(sys.stdin.readline()); print(json.dumps(a+b))")
+      fi
+    fi
+    _next=$(echo "$_raw" | python3 -c "import sys,json; print(json.load(sys.stdin).get('NextToken','') or '')" 2>/dev/null || echo "")
+    [[ -z "$_next" ]] && break
+  done
+
+  case "$QUOTA_ERR" in
+    denied)    warn "list-service-quotas" "IAM permission denied — quota check skipped" ;;
+    throttled) warn "list-service-quotas" "Throttled — quota check skipped (retry later)" ;;
+    api-error) warn "list-service-quotas" "API call failed — quota check skipped" ;;
+  esac
+
+  if [[ -n "$QUOTA_ALL" && -z "$QUOTA_ERR" ]]; then
+    for ITYPE in $INSTANCE_TYPES; do
+      QUOTA_VAL=$(printf '%s' "$QUOTA_ALL" | python3 -c "
+import sys, json
+quotas = json.load(sys.stdin)  # fed on stdin: the full quota list can exceed the per-argument size limit of exec
+itype = sys.argv[1]
+# Per-type quotas appear to be named like: ml.x.y for cluster usage (verify); the HyperPod match is kept as well
+matches = [q for q in quotas if itype in q.get('QuotaName','') and any(k in q.get('QuotaName','').lower() for k in ('cluster usage', 'hyperpod'))]
+if matches:
+    q = matches[0]
+    print(f\"{q.get('QuotaName','?')}: {int(q.get('Value',0))}\")
+else:
+    print('NOT_FOUND')
+" "$ITYPE" 2>/dev/null || echo "NOT_FOUND")
+      if [[ "$QUOTA_VAL" == "NOT_FOUND" ]]; then
+        info "Quota for $ITYPE: not found in the SageMaker quota list (check Service Quotas console)"
+      else
+        info "Quota: $QUOTA_VAL"
+      fi
+    done
+  fi
+else
+  info "No instance types found in cluster config"
+fi
+
+if [[ "$ORCHESTRATOR" == "EKS" && -n "$EKS_NAME" ]]; then
+  header "5. EKS Configuration"
+
+  EKS_AUTH=$(aws eks describe-cluster \
+    --name "$EKS_NAME" \
+    --region "$REGION" \
+    --query 'cluster.accessConfig.authenticationMode' \
+    --output text 2>/dev/null || echo "unknown")
+
+  if [[ "$EKS_AUTH" == "CONFIG_MAP" ]]; then
+    warn "EKS auth mode" "CONFIG_MAP-only — access entries require API or API_AND_CONFIG_MAP"
+    add_issue "EKS auth mode is CONFIG_MAP — access entries unavailable until switched (see EKS access-entries docs) → references/cluster-diagnostics-detail.md § D (EKS Access / kubectl)" "P2"
+  elif [[ "$EKS_AUTH" == "API" || "$EKS_AUTH" == "API_AND_CONFIG_MAP" ]]; then
+    pass "EKS auth mode" "$EKS_AUTH"
+  else
+    warn "EKS auth mode" "Could not determine ($EKS_AUTH)"
+  fi
+
+  # Check access entries for current identity. AWS CLI paginates JSON output by
+  # token, so paginate explicitly to handle accounts with many principals.
+  info "Current IAM identity: $CALLER_ARN"
+
+  fetch_all_access_entries() {  # prints one JSON array of accessEntries gathered over at most 20 pages
+    local merged='[]' token='' page_json combined i=0
+    while (( i < 20 )); do
+      if [[ -n "$token" ]]; then
+        page_json=$(aws eks list-access-entries --cluster-name "$EKS_NAME" --region "$REGION" \
+          --next-token "$token" --output json 2>/dev/null) || break
+      else
+        page_json=$(aws eks list-access-entries --cluster-name "$EKS_NAME" --region "$REGION" \
+          --output json 2>/dev/null) || break
+      fi
+      combined=$(python3 -c "
+import sys, json
+prev = json.loads(sys.argv[1])
+page = json.loads(sys.argv[2])
+prev.extend(page.get('accessEntries', []))
+print(json.dumps(prev))
+print(page.get('nextToken',''))
+" "$merged" "$page_json" 2>/dev/null) || break
+      # The helper prints two lines: the merged array, then the next-page token.
+      merged=$(printf '%s\n' "$combined" | sed -n '1p')
+      # An empty token line means there are no further pages.
+      token=$(printf '%s\n'  "$combined" | sed -n '2p')
+      i=$((i+1))
+      [[ -z "$token" ]] && break
+    done
+    echo "$merged"  # a failed CLI call or parse breaks out early, so this may be a partial (or empty) list
+  }
+  ACCESS_ENTRIES=$(fetch_all_access_entries)
+  [[ -z "$ACCESS_ENTRIES" ]] && ACCESS_ENTRIES='[]'
+
+  ENTRY_COUNT=$(echo "$ACCESS_ENTRIES" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo 0)
+  info "Access entries: $ENTRY_COUNT configured"
+
+  # Strip session name for role-based ARNs
+  CALLER_BASE=$(echo "$CALLER_ARN" | python3 -c "
+import sys
+arn = sys.stdin.read().strip()
+# Convert assumed-role ARN to role ARN for matching
+# arn:aws:sts::ACCOUNT:assumed-role/ROLE/SESSION -> arn:aws:iam::ACCOUNT:role/ROLE
+if ':assumed-role/' in arn:
+    parts = arn.split(':')
+    role_path = parts[-1].replace('assumed-role/', 'role/')
+    role_path = '/'.join(role_path.split('/')[:2])  # remove session name
+    parts[-1] = role_path
+    parts[2] = 'iam'
+    parts[3] = ''  # IAM ARNs have no region
+    print(':'.join(parts))
+else:
+    print(arn)
+" 2>/dev/null || echo "$CALLER_ARN")
+  # An entry matches on the exact ARN, or (roles only) on account + role name, because assumed-role ARNs omit the IAM path (e.g. SSO roles).
+  HAS_ACCESS=$(echo "$ACCESS_ENTRIES" | CALLER_BASE_ENV="$CALLER_BASE" python3 -c "
+import sys, json, os
+caller = os.environ['CALLER_BASE_ENV']
+head, sep, role = caller.partition(':role/')
+found = any(e == caller or (sep and e.startswith(head + sep) and e.rsplit('/', 1)[-1] == role) for e in map(str, json.load(sys.stdin)))
+print('true' if found else 'false')
+" 2>/dev/null || echo "false")
+
+  if [[ "$HAS_ACCESS" == "true" ]]; then
+    pass "EKS access entry" "current identity has an access entry"
+  else
+    warn "EKS access entry" "current identity ($CALLER_BASE) may not have an access entry — kubectl may fail"
+    add_issue "Current IAM identity may lack EKS access → references/cluster-diagnostics-detail.md § D (EKS Access / kubectl)" "P1"
+  fi
+
+  if command -v kubectl &>/dev/null; then
+    KUBECTL_TEST=$(kubectl cluster-info 2>&1 || true)
+    if echo "$KUBECTL_TEST" | grep -q "Kubernetes control plane\|running at"; then
+      pass "kubectl connectivity" "can reach EKS API server"
+
+      if kubectl get namespace aws-hyperpod &>/dev/null 2>&1; then
+        pass "aws-hyperpod namespace" "exists"
+      else
+        warn "aws-hyperpod namespace" "missing → references/cluster-diagnostics-detail.md § D (EKS Access / kubectl)"
+      fi
+
+      # Node count. Note: `wc -l` never fails; avoid `|| echo 0` which would produce "0\n0".
+      K8S_NODE_COUNT=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
+      K8S_NODE_COUNT=${K8S_NODE_COUNT:-0}
+      info "Kubernetes nodes visible: $K8S_NODE_COUNT"
+
+      if [[ "$K8S_NODE_COUNT" -eq 0 && "$TOTAL_NODES" -gt 0 ]]; then
+        warn "K8s nodes" "0 K8s nodes but $TOTAL_NODES HyperPod nodes — nodes may not have registered with EKS"
+        add_issue "Nodes not visible in kubectl → references/cluster-diagnostics-detail.md § E (Cluster Provisioning)" "P1"
+      fi
+
+      HEALTH_LABELS=$(kubectl get nodes -o custom-columns='NODE:.metadata.name,HEALTH:.metadata.labels.sagemaker\.amazonaws\.com/node-health-status' --no-headers 2>/dev/null || true)
+      if [[ -n "$HEALTH_LABELS" ]]; then  # the filter below anchors on the column boundary: a bare case-insensitive "Schedulable$" also matches "Unschedulable"
+        UNHEALTHY_K8S=$(echo "$HEALTH_LABELS" | grep -v "<none>" | grep -viE "[[:space:]]Schedulable[[:space:]]*$" || true)
+        if [[ -n "$UNHEALTHY_K8S" ]]; then
+          warn "EKS node health labels" "non-schedulable nodes detected:"
+          echo "$UNHEALTHY_K8S" | while IFS= read -r line; do info "  $line"; done
+          add_issue "EKS nodes with health issues → delegate to hyperpod-node-debugger skill; references/cluster-diagnostics-detail.md § G (Node Replacement)" "P1"
+        else
+          pass "EKS node health labels" "all nodes schedulable"
+        fi
+      fi
+
+      # Dangling node detection — nodes visible in EKS but not in HyperPod list
+      # (or vice versa), e.g. after failed scale-up, rollback, or orphaned kubelet registrations.
+      # The instance ID is read from the tail of providerID, so a prefixed last segment still parses.
+      if [[ "$K8S_NODE_COUNT" -gt 0 && "$TOTAL_NODES" -gt 0 ]]; then
+        HP_INSTANCES=$(echo "$NODE_LIST" | python3 -c "
+import sys,json
+nodes=json.load(sys.stdin).get('ClusterNodeSummaries',[])
+for n in nodes:
+    iid=n.get('InstanceId','')
+    if iid: print(iid)
+" 2>/dev/null | sort -u)
+        EKS_INSTANCES=$(kubectl get nodes -l sagemaker.amazonaws.com/compute-type=hyperpod \
+          -o jsonpath='{range .items[*]}{.spec.providerID}{"\n"}{end}' 2>/dev/null \
+          | grep -oE 'i-[0-9a-f]+$' | sort -u || true)
+        if [[ -n "$HP_INSTANCES" && -n "$EKS_INSTANCES" ]]; then
+          DANGLING=$(comm -13 <(echo "$HP_INSTANCES") <(echo "$EKS_INSTANCES"))
+          ORPHANED=$(comm -23 <(echo "$HP_INSTANCES") <(echo "$EKS_INSTANCES"))
+          if [[ -n "$DANGLING" ]]; then
+            warn "Dangling nodes" "visible in EKS but not in HyperPod ($(echo "$DANGLING" | wc -l))"
+            echo "$DANGLING" | head -5 | while IFS= read -r iid; do info "  EKS-only: $iid"; done
+            add_issue "Dangling EKS nodes (present in kubectl, absent from list-cluster-nodes) → references/cluster-diagnostics-detail.md § K (Dangling Nodes & Cleanup)" "P1"
+          fi
+          if [[ -n "$ORPHANED" ]]; then
+            warn "Orphaned HyperPod nodes" "visible in HyperPod but not in EKS ($(echo "$ORPHANED" | wc -l))"
+            echo "$ORPHANED" | head -5 | while IFS= read -r iid; do info "  HyperPod-only: $iid"; done
+            add_issue "HyperPod nodes not registered in EKS → references/cluster-diagnostics-detail.md § E (Cluster Provisioning); delegate to hyperpod-node-debugger" "P1"
+          fi
+          [[ -z "$DANGLING" && -z "$ORPHANED" ]] && pass "Node reconciliation" "EKS and HyperPod views match"
+        fi
+      fi
+
+      # EKS add-on health — VPC CNI, CoreDNS, kube-proxy failures break pod networking.
+      # Add-on count is small in practice (<10) so a single page of 100 is always sufficient.
+      if [[ -n "$EKS_NAME" ]]; then
+        ADDON_JSON=$(aws eks list-addons --cluster-name "$EKS_NAME" --region "$REGION" \
+          --max-results 100 --output json 2>/dev/null || echo '{"addons":[]}')
+        ADDON_NAMES=$(echo "$ADDON_JSON" | python3 -c "
+import sys,json
+print('\n'.join(json.load(sys.stdin).get('addons',[])))
+" 2>/dev/null)
+        DEGRADED_ADDONS=""
+        while IFS= read -r addon; do
+          [[ -z "$addon" ]] && continue
+          A_STATUS=$(aws eks describe-addon --cluster-name "$EKS_NAME" --addon-name "$addon" \
+            --region "$REGION" --query 'addon.status' --output text 2>/dev/null || echo "UNKNOWN")
+          if [[ "$A_STATUS" != "ACTIVE" && "$A_STATUS" != "UPDATING" ]]; then
+            DEGRADED_ADDONS+="$addon($A_STATUS) "
+          fi
+        done <<< "$ADDON_NAMES"
+        if [[ -n "$DEGRADED_ADDONS" ]]; then
+          warn "EKS add-ons" "not ACTIVE: $DEGRADED_ADDONS"
+          add_issue "EKS add-on(s) degraded: $DEGRADED_ADDONS → references/cluster-diagnostics-detail.md § D (EKS Access / kubectl)" "P1"
+        else
+          [[ -n "$ADDON_NAMES" ]] && pass "EKS add-ons" "$(echo "$ADDON_NAMES" | wc -l) add-on(s) ACTIVE"
+        fi
+      fi
+
+      # aws-auth ConfigMap legacy check — deprecated but still load-bearing if cluster auth mode
+      # is API_AND_CONFIG_MAP or CONFIG_MAP. Misconfigured entries here can shadow access entries.
+      if [[ -n "$EKS_NAME" ]]; then
+        # Reuse the mode already fetched for the auth-mode check in this section; no second describe-cluster call.
+        AUTH_MODE="$EKS_AUTH"
+        if [[ "$AUTH_MODE" == "CONFIG_MAP" || "$AUTH_MODE" == "API_AND_CONFIG_MAP" ]]; then
+          if kubectl -n kube-system get configmap aws-auth >/dev/null 2>&1; then
+            AUTH_ENTRIES=$(kubectl -n kube-system get configmap aws-auth -o jsonpath='{.data.mapRoles}' 2>/dev/null | grep -c "rolearn" || true)
+            AUTH_ENTRIES=${AUTH_ENTRIES:-0}  # counts rolearn keys (one per mapRoles entry) rather than raw YAML lines
+            info "aws-auth ConfigMap: $AUTH_ENTRIES mapRoles entries (auth mode: $AUTH_MODE)"
+            if [[ "$AUTH_MODE" == "API_AND_CONFIG_MAP" ]]; then
+              warn "aws-auth ConfigMap" "both ConfigMap and access entries in use — ConfigMap entries can shadow access entries; recommend migrating to API-only mode"
+            fi
+          fi
+        fi
+      fi
+    else
+      warn "kubectl connectivity" "cannot reach EKS API — check kubeconfig and access entries"
+      add_issue "kubectl cannot reach EKS → references/cluster-diagnostics-detail.md § D (EKS Access / kubectl)" "P1"
+    fi
+  else
+    info "kubectl not installed — skipping Kubernetes checks"
+  fi
+else
+  header "5. Slurm Checks"
+  info "Orchestrator: Slurm"
+
+  # Warn/issue emitted in section 1; this branch is the PASS-only confirmation.
+  if [[ "$NODE_RECOVERY" == *"Automatic"* ]] && [[ "$NODE_RECOVERY" != *"None"* ]]; then
+    pass "NodeRecovery" "enabled on all instance groups"
+  fi
+
+  if command -v session-manager-plugin &>/dev/null && [[ -n "$CLUSTER_ID" ]]; then
+    header "5b. Slurm Controller Health (via SSM)"
+    HEAD_NODE_ID=$(echo "$NODE_LIST" | python3 -c "
+import sys,json
+nodes=json.load(sys.stdin).get('ClusterNodeSummaries',[])
+for n in nodes:
+    g=n.get('InstanceGroupName','').lower()
+    if any(x in g for x in ['controller','head','master','login']):
+        print(n.get('InstanceId',''))
+        break
+else:
+    if nodes:
+        print(nodes[0].get('InstanceId',''))
+" 2>/dev/null || echo "")
+
+    if [[ -n "$HEAD_NODE_ID" ]]; then
+      HEAD_GROUP=$(echo "$NODE_LIST" | HEAD_NODE_ID_ENV="$HEAD_NODE_ID" python3 -c "
+import sys,json,os
+target_id = os.environ['HEAD_NODE_ID_ENV']
+nodes=json.load(sys.stdin).get('ClusterNodeSummaries',[])
+for n in nodes:
+    if n.get('InstanceId','') == target_id:
+        print(n.get('InstanceGroupName',''))
+        break
+" 2>/dev/null || echo "")
+      if [[ -z "$HEAD_GROUP" ]]; then
+        warn "Controller node" "could not resolve instance-group name — SSM check skipped"
+        HEAD_NODE_ID=""
+      fi
+    fi
+    if [[ -n "$HEAD_NODE_ID" ]]; then
+      SSM_TARGET="sagemaker-cluster:${CLUSTER_ID}_${HEAD_GROUP}-${HEAD_NODE_ID}"
+      info "Controller node: $HEAD_NODE_ID ($HEAD_GROUP)"
+      info "SSM target: $SSM_TARGET"
+
+      _slurm_nonce=$(date +%s%N 2>/dev/null || echo "$RANDOM")
+      # Validate nonce is numeric to prevent injection in remote command
+      if [[ ! "$_slurm_nonce" =~ ^[0-9]+$ ]]; then
+        _slurm_nonce="$$"
+      fi
+      SLURM_SH=$(cat <<EOF
+scontrol show config >/dev/null 2>&1
+if [ \$? -eq 0 ]; then echo SLURM_OK_${_slurm_nonce}; else echo SLURM_DOWN_${_slurm_nonce}; fi
+echo NODES_START_${_slurm_nonce}
+sinfo -o '%N %T %30E' --noheader 2>/dev/null | head -20
+echo NODES_END_${_slurm_nonce}
+echo JOBS_START_${_slurm_nonce}
+squeue -o '%i %j %T %R' --noheader 2>/dev/null | grep -iE 'COMPLETING|CONFIGURING|PENDING' | head -10 || true
+echo JOBS_END_${_slurm_nonce}
+echo MUNGE_${_slurm_nonce}
+systemctl is-active munge 2>/dev/null || echo munge_inactive
+echo END_${_slurm_nonce}
+EOF
+)
+      STDOUT=$(ssm_run_on_node "$HEAD_NODE_ID" "$HEAD_GROUP" "$SLURM_SH" || echo "")
+
+      if [[ -n "$STDOUT" ]]; then
+        if echo "$STDOUT" | grep -q "SLURM_OK_${_slurm_nonce}"; then
+          pass "slurmctld" "responsive"
+        elif echo "$STDOUT" | grep -q "SLURM_DOWN_${_slurm_nonce}"; then
+          fail "slurmctld" "not responding — all Slurm operations blocked"
+          add_issue "slurmctld down on controller → references/cluster-operations.md § 8 Slurm — controller operations" "P0"
+        fi
+
+        SLURM_DOWN_NODES=$(echo "$STDOUT" | sed -n "/^NODES_START_${_slurm_nonce}\$/,/^NODES_END_${_slurm_nonce}\$/p" | grep -v "^NODES_" | grep -iE "down|drain|fail" || true)
+        if [[ -n "$SLURM_DOWN_NODES" ]]; then
+          warn "Slurm nodes with issues:"
+          echo "$SLURM_DOWN_NODES" | while IFS= read -r line; do info "  $line"; done
+          S_DOWN_COUNT=$(echo "$SLURM_DOWN_NODES" | grep -c . ; :)
+          S_DOWN_COUNT=${S_DOWN_COUNT:-0}
+          add_issue "$S_DOWN_COUNT Slurm node(s) down/drained → references/cluster-diagnostics-detail.md § G (Node Replacement); delegate to hyperpod-node-debugger" "P1"
+        elif echo "$STDOUT" | grep -q "SLURM_OK_${_slurm_nonce}"; then  # an empty sinfo listing from a dead controller is not evidence of healthy nodes
+          pass "Slurm nodes" "all idle/alloc/mixed"
+        fi
+
+        STUCK_JOBS=$(echo "$STDOUT" | sed -n "/^JOBS_START_${_slurm_nonce}\$/,/^JOBS_END_${_slurm_nonce}\$/p" | grep -v "^JOBS_" || true)
+        if [[ -n "$STUCK_JOBS" ]]; then
+          warn "Stuck Slurm jobs detected:"
+          echo "$STUCK_JOBS" | while IFS= read -r line; do info "  $line"; done
+          add_issue "Stuck Slurm jobs → references/cluster-operations.md § 8 Slurm — controller operations" "P1"
+        fi
+
+        if echo "$STDOUT" | sed -n "/^MUNGE_${_slurm_nonce}\$/,/^END_${_slurm_nonce}\$/p" | grep -q "munge_inactive"; then
+          fail "munge" "authentication service not running — Slurm auth will fail"
+          add_issue "munge service inactive on controller → references/cluster-operations.md § 8 Slurm — controller operations" "P0"
+        fi
+      else
+        info "Could not get output from SSM on controller — check ssm:StartSession permission, session-manager-plugin, or node reachability"
+      fi
+    else
+      info "Could not identify controller node from node list"
+    fi
+  else
+    info "SSM plugin not available — Slurm checks require SSM access to controller"
+    info "Install SSM plugin to enable Slurm health checks"
+  fi
+fi
+
+header "6. SSM Readiness"
+
+if command -v session-manager-plugin &>/dev/null; then
+  if SSM_VERSION=$(session-manager-plugin --version 2>/dev/null); then
+    pass "SSM plugin installed" "version: $SSM_VERSION"
+  else
+    warn "SSM plugin" "installed but --version failed — plugin may be corrupt"
+    add_issue "SSM plugin installed but broken → references/cluster-diagnostics-detail.md § F (SSM Connectivity)" "P1"
+  fi
+else
+  warn "SSM plugin" "not installed — required for node access (install session-manager-plugin)"
+  add_issue "SSM plugin not installed → references/cluster-diagnostics-detail.md § F (SSM Connectivity)" "P2"
+fi
+
+if [[ -n "$CLUSTER_ID" && "$TOTAL_NODES" -gt 0 ]]; then
+  FIRST_NODE=$(echo "$NODE_LIST" | python3 -c "
+import sys, json
+nodes = json.load(sys.stdin).get('ClusterNodeSummaries', [])
+if nodes:
+    n = nodes[0]
+    nid = n.get('InstanceId', '?')
+    group = n.get('InstanceGroupName', '?')
+    print(f'{group}-{nid}')
+" 2>/dev/null || echo "")
+
+  if [[ -n "$FIRST_NODE" ]]; then
+    info "SSM target format: sagemaker-cluster:${CLUSTER_ID}_${FIRST_NODE}"
+    info "To connect: aws ssm start-session --target sagemaker-cluster:${CLUSTER_ID}_${FIRST_NODE} --region $REGION"
+  fi
+fi
+
+if [[ -n "$SUBNET_IDS" ]]; then
+  header "6b. VPC Endpoints"
+
+  FIRST_SUBNET=$(echo "$SUBNET_IDS" | awk '{print $1}')
+  VPC_FOR_ENDPOINTS=$(aws ec2 describe-subnets \
+    --subnet-ids "$FIRST_SUBNET" \
+    --region "$REGION" \
+    --cli-read-timeout 15 \
+    --query 'Subnets[0].VpcId' \
+    --output text 2>/dev/null || echo "")
+
+  if [[ -n "$VPC_FOR_ENDPOINTS" && "$VPC_FOR_ENDPOINTS" != "None" ]]; then
+    EP_RESULT=$(aws ec2 describe-vpc-endpoints \
+      --filters "Name=vpc-id,Values=$VPC_FOR_ENDPOINTS" \
+      --region "$REGION" \
+      --cli-read-timeout 15 \
+      --query "VpcEndpoints[?State==\`available\`].ServiceName" \
+      --output text 2>&1) || true
+    EP_SERVICES="s3 ssm ssmmessages ec2messages"
+    if echo "$EP_RESULT" | grep -qiE "AccessDenied|UnauthorizedOperation"; then
+      warn "describe-vpc-endpoints" "IAM permission denied — VPC endpoint check skipped"
+      EP_RESULT=""
+      EP_SERVICES=""
+    fi
+    ENDPOINTS="${EP_RESULT}"
+    # s3 → lifecycle-script download from S3; ssm/ssmmessages/ec2messages → SSM connectivity (§ F). An empty list means the check was skipped.
+    for SVC in $EP_SERVICES; do
+      if echo "$ENDPOINTS" | grep -qE "(^|[.])${SVC}($|[[:space:]])"; then
+        pass "VPC endpoint: $SVC"
+      else
+        warn "VPC endpoint: $SVC" "not found — required only if the cluster subnet has no NAT/IGW path out"
+        case "$SVC" in
+          s3)   add_issue "VPC endpoint not found for s3 → references/cluster-diagnostics-detail.md § C (Lifecycle Scripts)" "P2" ;;
+          ssm|ssmmessages|ec2messages)
+                add_issue "VPC endpoint not found for $SVC → references/cluster-diagnostics-detail.md § F (SSM Connectivity)" "P2" ;;
+        esac
+      fi
+    done
+  else
+    info "Could not determine VPC ID for endpoint check"
+  fi
+fi
+
+header "7. CloudWatch Logs"
+
+if [[ -n "$CLUSTER_ID" ]]; then
+  # CW log groups follow /aws/sagemaker/Clusters/<CLUSTER_NAME>/<CLUSTER_ID>,
+  # where <CLUSTER_NAME> is the human-readable name (not the ARN short-id).
+  CLUSTER_NAME_FOR_LOGS=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys, json
+try:
+    d = json.load(sys.stdin)
+    n = d.get('ClusterName', '')
+    print(n if n else '')
+except Exception:
+    print('')
+" 2>/dev/null)
+  # Fall back to the value the caller supplied, unless it looks like an ARN.
+  if [[ -z "$CLUSTER_NAME_FOR_LOGS" ]]; then
+    if [[ "$CLUSTER" == arn:aws:* ]]; then
+      CLUSTER_NAME_FOR_LOGS="$CLUSTER_ID"  # best-effort; will probe the prefix below
+    else
+      CLUSTER_NAME_FOR_LOGS="$CLUSTER"
+    fi
+  fi
+
+  LOG_GROUP="/aws/sagemaker/Clusters/${CLUSTER_NAME_FOR_LOGS}/${CLUSTER_ID}"
+
+  LOG_RESULT=$(aws logs describe-log-groups \
+    --log-group-name-prefix "$LOG_GROUP" \
+    --region "$REGION" \
+    --query 'logGroups[0].logGroupName' --output text 2>&1) || true
+  if echo "$LOG_RESULT" | grep -qiE "AccessDenied|UnauthorizedOperation"; then
+    warn "describe-log-groups" "IAM permission denied — CloudWatch log check skipped"
+    LOG_RESULT="None"
+  fi
+  # stderr is merged into LOG_RESULT, so only a value that starts with the queried prefix counts as a found group.
+  if [[ "$LOG_RESULT" == "$LOG_GROUP"* ]]; then LOG_EXISTS="$LOG_RESULT"; else LOG_EXISTS="None"; fi
+
+  if [[ "$LOG_EXISTS" != "None" && -n "$LOG_EXISTS" ]]; then
+    pass "CloudWatch log group" "$LOG_GROUP"
+
+    # Use the server-side prefix filter; clusters with hundreds of nodes have
+    # hundreds of streams and the default first-page result truncates.
+    count_log_streams_by_prefix() {  # $1 = log-stream name prefix; prints the number of stream names collected in $LOG_GROUP
+      local prefix="$1"
+      local merged='[]' token='' page_json combined i=0
+      while (( i < 20 )); do  # at most 20 pages of 50 streams each
+        if [[ -n "$token" ]]; then
+          page_json=$(aws logs describe-log-streams \
+            --log-group-name "$LOG_GROUP" --region "$REGION" \
+            --log-stream-name-prefix "$prefix" --limit 50 --next-token "$token" \
+            --output json 2>/dev/null) || break
+        else
+          page_json=$(aws logs describe-log-streams \
+            --log-group-name "$LOG_GROUP" --region "$REGION" \
+            --log-stream-name-prefix "$prefix" --limit 50 \
+            --output json 2>/dev/null) || break
+        fi
+        combined=$(python3 -c "
+import sys, json
+prev = json.loads(sys.argv[1])
+page = json.loads(sys.argv[2])
+prev.extend(s.get('logStreamName','') for s in page.get('logStreams', []))
+print(json.dumps(prev))
+print(page.get('nextToken',''))
+" "$merged" "$page_json" 2>/dev/null) || break
+        # The helper prints two lines: the accumulated name list, then the next-page token.
+        merged=$(printf '%s\n' "$combined" | sed -n '1p')
+        # An empty token line means there are no further pages.
+        token=$(printf '%s\n'  "$combined" | sed -n '2p')
+        i=$((i+1))
+        [[ -z "$token" ]] && break
+      done
+      echo "$merged" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo 0  # a failed call breaks out early, so the count may be partial
+    }
+
+    LC_COUNT=$(count_log_streams_by_prefix "LifecycleConfig")
+    HM_COUNT=$(count_log_streams_by_prefix "SagemakerHealthMonitoringAgent")
+
+    info "Lifecycle log streams: $LC_COUNT"
+    info "Health monitoring log streams: $HM_COUNT"
+
+    if [[ "$LC_COUNT" -eq 0 && "$CLUSTER_STATUS" != "Creating" ]]; then
+      warn "Lifecycle logs" "no lifecycle log streams found — scripts may not have run"
+    fi
+  else
+    warn "CloudWatch log group" "not found: $LOG_GROUP"
+    info "Logs may not be available if cluster creation failed early"
+    info "Check IAM execution role has CloudWatch Logs write permissions"
+    add_issue "CloudWatch log group not found → references/cluster-diagnostics-detail.md § C (Lifecycle Scripts)" "P2"
+  fi
+fi
+
+echo ""
+echo -e "${BOLD}========================================${NC}"
+echo -e "${BOLD}          DIAGNOSTIC SUMMARY            ${NC}"
+echo -e "${BOLD}========================================${NC}"
+echo ""
+
+echo -e "  Cluster:  ${BOLD}${CLUSTER}${NC} (${ORCHESTRATOR})"
+echo -e "  Status:   ${CLUSTER_STATUS}"
+echo -e "  Results:  ${RED}${CRITICAL_FAILURES} critical${NC} | ${YELLOW}${WARNINGS} warnings${NC}"
+echo -e "  Mode:     READ-ONLY (no changes made; each [FAIL] points to a references section)"
+echo ""
+
+if [[ ${#ISSUES_FOUND[@]} -gt 0 ]]; then
+  echo -e "${BOLD}  Issues Found (prioritized):${NC}"
+  for priority in P0 P1 P2; do
+    has_priority=false
+    for issue in "${ISSUES_FOUND[@]}"; do
+      if [[ "$issue" == "${priority}|"* ]]; then
+        if ! "$has_priority"; then
+          case "$priority" in
+            P0) echo -e "    ${RED}${BOLD}[$priority — Fix Immediately]${NC}" ;;
+            P1) echo -e "    ${YELLOW}${BOLD}[$priority — Fix Soon]${NC}" ;;
+            P2) echo -e "    ${BOLD}[$priority — Informational]${NC}" ;;
+          esac
+          has_priority=true
+        fi
+        echo -e "      → ${issue#*|}"
+      fi
+    done
+  done
+  echo ""
+fi
+
+if [[ $CRITICAL_FAILURES -eq 0 && $WARNINGS -eq 0 ]]; then
+  echo -e "  ${GREEN}${BOLD}All cluster-level checks passed.${NC}"
+  echo "  If issues persist, try:"
+  echo "    - hyperpod-node-debugger skill for per-node issues"
+  echo "    - hyperpod-nccl skill for NCCL/training issues"
+elif [[ $CRITICAL_FAILURES -eq 0 ]]; then
+  echo -e "  ${YELLOW}${BOLD}No critical issues, but $WARNINGS warning(s) found.${NC}"
+  echo "  Review [WARN] items above."
+else
+  echo -e "  ${RED}${BOLD}$CRITICAL_FAILURES critical issue(s) found.${NC}"
+  echo "  Fix [FAIL] items above. See SKILL.md for detailed resolution steps."
+fi
+echo ""
+
+exit "$([[ $CRITICAL_FAILURES -eq 0 ]] && echo 0 || echo 1)"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md
index 9ab32540..1033f02a 100755
--- a/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md
+++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md
@@ -8,7 +8,7 @@
 | `kubectl must be configured for EKS clusters` | kubectl missing or wrong context                              | Run `aws eks update-kubeconfig --name <eks-cluster-name> --region <region>`. Get the EKS cluster name from `aws sagemaker describe-cluster` output (`Orchestrator.Eks.ClusterArn`) |
 | Cluster name from ARN not found               | ARN contains cluster ID, not name                             | Pass the full ARN to `--cluster` instead of extracting the ID portion. Alternatively, use `aws sagemaker list-clusters` to find the cluster name                                   |
 | No instance reports in S3                     | Node IAM role missing S3 permissions                          | Add `s3:GetObject`/`s3:PutObject` to node role for the report bucket                                                                                                               |
-| SSM connectivity failed                       | SSM agent down, missing IAM, or network                       | Check `systemctl status amazon-ssm-agent`, verify `AmazonSSMManagedInstanceCore` policy                                                                                            |
+| SSM connectivity failed                       | SSM agent down, missing IAM, or network                       | Check `systemctl status amazon-ssm-agent`                                                                                                                                          |
 | "Failed to detect shell prompt"               | Custom SSM session config (custom `.bashrc`, SSM preferences) | Not compatible without modifying prompt detection; use manual SSM sessions as workaround                                                                                           |
 | SSM throttling                                | Too many concurrent sessions                                  | Reduce `--max-workers`; automatic retry handles transient throttling                                                                                                               |
 | Nodes unresponsive                            | Node completely down                                          | Noted in report; other nodes' diagnostics may reveal pattern                                                                                                                       |
diff --git a/plugins/sagemaker-ai/skills/hyperpod-nccl/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-nccl/SKILL.md
new file mode 100644
index 00000000..01c066df
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-nccl/SKILL.md
@@ -0,0 +1,187 @@
+---
+name: hyperpod-nccl
+description: Diagnose NCCL failures and adjacent training-pod failures on HyperPod GPU clusters (EKS or Slurm) — training hangs, AllReduce / collective-op timeouts, EFA or libfabric errors, rendezvous failures, EFA TCP fallback, /dev/shm or memlock issues, NCCL version mismatch across pods, container OOM / exit-137 / OOMKilled, GPU OOM (CUDA out of memory), CrashLoopBackOff / Pending pods, MASTER_ADDR DNS, NetworkPolicy blocking. Not for single-node hardware faults (→ hyperpod-node-debugger § G) or cluster-creation EFA / SSM failures (→ hyperpod-cluster-debugger § A / § F).
+metadata:
+  version: "0.0.1"
+---
+
+# HyperPod NCCL Debugger
+
+**Operating policy.** Run read-only diagnostics yourself. Never run a command that changes cluster, node, or workload state — present each one as a **Suggested command (run this yourself)** block and wait for the customer. Destructive order: **investigate → reboot → replace** (replace destroys root + secondary volumes; not supported on Slurm controller nodes). Never discard training state on speculation.
+
+Diagnose NCCL failures on SageMaker HyperPod (EKS and Slurm). `scripts/nccl-diagnose.sh` reads state via AWS APIs, kubectl, and SSM, then prints each issue as `[FAIL] ... → references/<file>.md § <section>`. Read-only.
+
+**Signal sourcing:** `list-cluster-events` carries infrastructure-level state only (lifecycle, bootstrap, EFA health check, capacity, replacement, reboot, AMI rollback). It does **not** carry NCCL timeouts, GPU XID/ECC, or per-pod training signals — those come from pod logs, CloudWatch training streams, on-node SSM probes, and NCCL env audit. "No events" on a training-time NCCL issue is expected, not a clean bill of health.
+
+---
+
+## Workflow
+
+1. Collect cluster name, region, namespace/job (EKS), exact NCCL error string.
+2. Run the diagnostic (always — the output drives everything else).
+3. For every `[FAIL]` line, `Read` the referenced section.
+4. Present finding, root cause, and the Suggested-command block with concrete values (instance IDs, SG IDs, namespaces) filled in from the script output. Wait for customer approval.
+5. Re-run the diagnostic to confirm.
+
+If a finding has no matching section, report it as a bug — do not invent a fix.
+
+## Step 1: Authenticate kubectl (EKS)
+
+```bash
+EKS_ARN=$(aws sagemaker describe-cluster --cluster-name <HYPERPOD-NAME> --region <REGION> \
+  --query 'Orchestrator.Eks.ClusterArn' --output text)
+EKS_NAME=$(echo "$EKS_ARN" | awk -F'/' '{print $NF}')
+aws eks update-kubeconfig --name "$EKS_NAME" --region <REGION>
+kubectl get nodes
+```
+
+## Step 2: Run the diagnostic
+
+```bash
+# Basic:
+bash scripts/nccl-diagnose.sh --cluster <HYPERPOD-NAME> --region <REGION>
+
+# Scope to an EKS job/namespace:
+bash scripts/nccl-diagnose.sh --cluster <NAME> --region <REGION> --namespace <NS> --job <JOB>
+
+# Force orchestrator:
+bash scripts/nccl-diagnose.sh --cluster <NAME> --region <REGION> --orchestrator slurm
+
+# Larger hardware sample (default 3):
+bash scripts/nccl-diagnose.sh --cluster <NAME> --region <REGION> --sample-nodes 10
+
+# Specific node only:
+bash scripts/nccl-diagnose.sh --cluster <NAME> --region <REGION> --node i-0abc123def456
+```
+
+Tags: `[PASS]` · `[FAIL]` (counted in `Issues Found`, has reference pointer) · `[WARN]` · `[INFO]`. Priorities: **P0** blocks training · **P1** degraded · **P2** informational.
+
+---
+
+## Remediation index
+
+Each `[FAIL]` line in the script already points directly at the right section. This table is a lookup for manual triage.
+
+| Finding                                    | Section                                                                                             |
+| ------------------------------------------ | --------------------------------------------------------------------------------------------------- |
+| SG missing inbound/outbound self-reference | [operations.md § 8](references/operations.md)                                                       |
+| Blocking NetworkPolicy / allow-all missing | [operations.md § 8](references/operations.md)                                                       |
+| Slurm node DOWN / DRAINING / RemoveIPC     | [operations.md § 7](references/operations.md)                                                       |
+| GPU XID / SYSTEM_ERROR / hardware fault    | [hyperpod-node-debugger § F / § G](../hyperpod-node-debugger/references/node-diagnostics-detail.md) |
+| GPU row-remap / DCGM Fail / silent NaNs    | [hyperpod-node-debugger § G.1.a/b](../hyperpod-node-debugger/references/node-diagnostics-detail.md) |
+| NCCL timeout / rendezvous / straggler      | [debugging-guide.md § 1](references/debugging-guide.md)                                             |
+| EFA configuration / not used               | [debugging-guide.md § 6](references/debugging-guide.md)                                             |
+| EFA TCP fallback (`NET/OFI Using TCP`)     | [debugging-guide.md § 13](references/debugging-guide.md)                                            |
+| NCCL version mismatch across pods          | [debugging-guide.md § 10](references/debugging-guide.md)                                            |
+| Container OOM (pod killed, exit 137)       | [debugging-guide.md § 4](references/debugging-guide.md)                                             |
+| GPU OOM (`CUDA out of memory`)             | [debugging-guide.md § 11](references/debugging-guide.md)                                            |
+| RDMA memlock / `/dev/shm` too small        | [debugging-guide.md § 17](references/debugging-guide.md)                                            |
+| MASTER_ADDR DNS / headless Service         | [debugging-guide.md § 12](references/debugging-guide.md)                                            |
+| NVLS / PXN / topology tuning               | [debugging-guide.md § 19](references/debugging-guide.md)                                            |
+| Any NCCL / EFA / rendezvous log pattern    | [error-patterns-quick-ref.md](references/error-patterns-quick-ref.md)                               |
+| Performance / nccl-tests / bandwidth       | [performance-testing.md](references/performance-testing.md)                                         |
+
+---
+
+## Prerequisites
+
+- `aws` CLI v2.13+ authenticated (`aws sts get-caller-identity`)
+- `jq`, `python3`, `bash` 4.2+
+- `unbuffer` (from the `expect` package: `yum install expect` / `apt install expect`)
+- `kubectl` authenticated to the EKS cluster (K8s checks skipped if absent)
+- `session-manager-plugin` for on-node hardware checks
+
+## Defaults
+
+- **Region** — required: pass `--region` or set `$AWS_DEFAULT_REGION`.
+- **Orchestrator** — auto-detected; override with `--orchestrator eks|slurm`.
+- **Namespace / job (EKS)** — all namespaces; scope with `--namespace <NS> --job <JOB>`.
+- **Hardware sampling** — 3 nodes over SSM (capped at 50). `--node <ID>` for a specific node. Node probes run **serially** (180 s per node): `--sample-nodes 10` can take ~30 min.
+- **CloudWatch window** — last 2 hours.
+- **Colors** — auto-disabled on non-TTY or `TERM=dumb`.
+
+## Error handling
+
+| Failure                             | Script                                                | Tell the customer                                                         |
+| ----------------------------------- | ----------------------------------------------------- | ------------------------------------------------------------------------- |
+| `aws sts get-caller-identity` fails | Exit 1 with the AWS error                             | "Fix AWS credentials and rerun."                                          |
+| `describe-cluster` AccessDenied     | Warn, add `Missing IAM for sagemaker:DescribeCluster` | "Grant `sagemaker:DescribeCluster` (operations.md § 2)."                  |
+| Cluster not found                   | Exit 1 after listing region's clusters                | "Confirm HyperPod cluster name and region."                               |
+| `kubectl` absent / unauthenticated  | Warn, skip K8s checks                                 | "`aws eks update-kubeconfig --name <EKS> --region <R>`."                  |
+| SSM plugin absent                   | Warn, skip on-node hardware checks                    | "Install session-manager-plugin."                                         |
+| SSM times out (180s)                | Partial output, mark node unreachable                 | "Rerun with `--node <ID> --sample-nodes 1`; check SSM agent on the node." |
+| CloudWatch log group not found      | Skip CloudWatch scan                                  | "Enable CloudWatch on the cluster (operations.md § 4)."                   |
+| Cluster events API throttled        | Warn, continue with partial data                      | "Rerun later — script is idempotent."                                     |
+
+Exit codes: `0` diagnostic complete · `1` fatal prerequisite missing or cluster unreachable.
+
+## IAM permissions
+
+Full policy + RBAC in [operations.md § 2](references/operations.md#2-iam). SSM on HyperPod uses `start-session` against `sagemaker-cluster:<cluster-id>_<group>-<iid>` targets — grant `ssm:StartSession` / `ssm:TerminateSession`, not `ssm:SendCommand`.
+
+## Scale strategy
+
+| Scope           | Method                                   | Coverage                 |
+| --------------- | ---------------------------------------- | ------------------------ |
+| All nodes       | `sagemaker:ListClusterNodes` (paginated) | 100% nodes               |
+| All K8s objects | `kubectl`                                | 100% pods/nodes/policies |
+| Hardware        | SSM `--sample-nodes N` (default 3)       | Sampled                  |
+| Node logs       | CloudWatch                               | 100% nodes               |
+
+**Large clusters:** the PyTorch NCCL backend defaults to a 10-minute collective-op timeout (per the PyTorch distributed docs). Large clusters routinely exceed that on first rendezvous; raise it via `torch.distributed.init_process_group(timeout=timedelta(seconds=<N>))`. HyperPod support has also observed NCCL topology-graph-search hangs on 256+ node clusters when `memlock` is `unlimited`; using a large fixed memlock (e.g. `8388608`) in pod `securityContext` or `/etc/security/limits.conf` has cleared these in field cases. This memlock pattern is a field observation, not AWS- or NCCL-documented behavior.
+
+For **FSDP**, **DeepSpeed**, or **Megatron-LM** tuning: [debugging-guide.md § 18](references/debugging-guide.md).
+
+## Skill delegation
+
+| Need                                                                   | Use                                                          |
+| ---------------------------------------------------------------------- | ------------------------------------------------------------ |
+| Cluster creation / deployment failures                                 | `hyperpod-cluster-debugger` (§ A / B / C / H + `--validate`) |
+| Post-deployment cluster-wide management                                | `hyperpod-cluster-debugger`                                  |
+| Per-node issues (disk, lifecycle, hardware)                            | `hyperpod-node-debugger`                                     |
+| Trainium/Inferentia collective-comm (AWS Neuron Collectives, not NCCL) | `hyperpod-node-debugger` § G.2                               |
+| Shell on nodes                                                         | `hyperpod-ssm`                                               |
+| Version comparison across nodes                                        | `hyperpod-version-checker`                                   |
+| Diagnostic bundle for AWS Support                                      | `hyperpod-issue-report`                                      |
+| MFU / performance degradation                                          | `hyperpod-performance-debugger`                              |
+
+## Escalate to AWS Support
+
+Escalate when:
+
+1. All SG rules correct, EFA verified on-node, but NCCL still times out.
+2. Hardware checks pass on all nodes but AllReduce still hangs.
+3. `Issues Found: 0` but training still fails.
+4. GPU XID errors persist after node replacement.
+5. Collective-op timeout raised and memlock workaround applied but large-cluster rendezvous still hangs.
+
+### Before opening the case
+
+```bash
+# 1. Cluster identity + status
+aws sagemaker describe-cluster --cluster-name <C> --region <R>
+
+# 2. Full NCCL diagnostic (sample more nodes for escalation)
+bash scripts/nccl-diagnose.sh --cluster <C> --region <R> --sample-nodes 10 > nccl-diag.txt
+
+# 3. Per-node log/config bundle to S3 (delegates to hyperpod-issue-report)
+#    See skills/hyperpod-issue-report/SKILL.md for the exact invocation.
+```
+
+### Include in the case
+
+- Cluster name + ARN and AWS region
+- Orchestrator (EKS or Slurm) and EKS cluster name / Slurm controller node
+- Timestamp window (UTC start / end) of the failure
+- Exact NCCL / libfabric error strings (copy verbatim from pod logs or journalctl)
+- Affected instance IDs / node names / pod names / namespace / job name
+- `nccl-diag.txt` from step 2 above
+- S3 URI of the `hyperpod-issue-report` bundle from step 3
+- NCCL env vars in effect (`printenv | grep -E '^NCCL|^FI_|^TORCH_'` from one pod)
+
+## References
+
+- [error-patterns-quick-ref.md](references/error-patterns-quick-ref.md) — log pattern → code → fix table
+- [debugging-guide.md](references/debugging-guide.md) — per-scenario procedures (21 sections incl. NVLS/PXN/topology)
+- [performance-testing.md](references/performance-testing.md) — nccl-tests, bandwidth thresholds, straggler detection
+- [operations.md](references/operations.md) — IAM, SSM format, CloudWatch, env-var reference, node labels, Slurm ops, remediations
diff --git a/plugins/sagemaker-ai/skills/hyperpod-nccl/references/debugging-guide.md b/plugins/sagemaker-ai/skills/hyperpod-nccl/references/debugging-guide.md
new file mode 100644
index 00000000..00505ce6
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-nccl/references/debugging-guide.md
@@ -0,0 +1,1011 @@
+# NCCL HyperPod — Detailed Debugging Guide
+
+Detailed procedures for each failure type. See `SKILL.md` for the quick reference.
+
+## Table of Contents
+
+| #  | Section                                                                                                        | Key Symptoms                                        |
+| -- | -------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |
+| 1  | [NCCL Timeout / Rendezvous Hang](#1-nccl-timeout--rendezvous-hang)                                             | Training hangs, AllReduce stuck, rendezvous timeout |
+| 2  | [Security Group Self-Reference Rules](#2-security-group-self-reference-rules)                                  | NCCL always times out, new cluster                  |
+| 3  | [NCCL_SOCKET_IFNAME — Interface Selection](#3-nccl_socket_ifname--interface-selection)                         | Wrong NIC, binding to eth0 instead of EFA           |
+| 4  | [Container OOM (exit code 137)](#4-container-oom--pod-killed-mid-training-exit-code-137)                       | OOMKilled, exit code 137                            |
+| 5  | [Wrong Results — Gradient Sync](#5-wrong-results--gradient-sync-issues)                                        | Loss not converging, inconsistent results           |
+| 6  | [EFA Configuration](#6-efa-configuration)                                                                      | EFA not working, slow training, FI_PROVIDER         |
+| 7  | [Node Hardware Failures](#7-node-hardware-failures)                                                            | XID errors, ECC, NVLink errors                      |
+| 8  | [Slurm-Specific Procedures](#8-slurm-specific-procedures)                                                      | Slurm batch script, node management, RemoveIPC      |
+| 9  | [NCCL RAS — Live Job Health](#9-nccl-ras--live-job-health)                                                     | Live health query, straggler detection              |
+| 10 | [NCCL Version Mismatch](#10-nccl-version-mismatch-nccl-function-not-found)                                     | `NCCL function not found`, mixed images             |
+| 11 | [GPU OOM — CUDA out of memory](#11-gpu-oom--cuda-out-of-memory--cudamalloc-failed)                             | `cudaMalloc failed`, VRAM exhausted                 |
+| 12 | [DNS Resolution Failure](#12-dns-resolution-failure-name-or-service-not-known)                                 | `Name or service not known`, headless service       |
+| 13 | [EFA TCP Fallback](#13-efa-tcp-fallback-netofi-using-tcp)                                                      | `NET/OFI Using TCP`, 10x slower                     |
+| 14 | [GPU P2P Access Blocked (ACS)](#14-gpu-p2p-access-blocked-acsiommu)                                            | P2P not supported, intra-node slow                  |
+| 15 | [Stale Shared Memory](#15-stale-shared-memory-unlink-shared-memory)                                            | `/dev/shm/nccl-*` errors, RemoveIPC                 |
+| 16 | [Host Firewall Blocking NCCL](#16-host-firewall-blocking-nccl-iptablesnftables)                                | iptables DROP/REJECT                                |
+| 17 | [RDMA Memory Registration Failure](#17-rdma-memory-registration-failure-ibv_reg_mr-failed)                     | `ibv_reg_mr failed`, memlock                        |
+| 18 | [Distributed Training Frameworks](#18-distributed-training-frameworks--nccl-tuning)                            | FSDP, DeepSpeed, Megatron-LM tuning                 |
+| 19 | [Advanced NCCL Tuning](#19-advanced-nccl-tuning-nvls-pxn-topology-cross-nic)                                   | NVLS, PXN, topology, cross-NIC                      |
+| 20 | [Pending / CrashLoopBackOff / Init-Container Failures](#20-pending--crashloopbackoff--init-container-failures) | Pods stuck Pending, init containers failing         |
+| 21 | [GPU Row-Remap / DCGM Health](#21-gpu-row-remap--dcgm-health-marginal-memory-silent-degrader)                  | Silent NaNs, pending row-remap, DCGM false-Pass     |
+
+---
+
+## 1. NCCL Timeout / Rendezvous Hang
+
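+**Always start minimal:** Reproduce with 2 ranks and `torch.ones(100)` before debugging full training. The script below uses the `gloo` backend with CPU tensors, so it validates rendezvous (`MASTER_ADDR` / `MASTER_PORT` reachability) and a basic AllReduce independently of NCCL and EFA; once it passes, switch the backend to `nccl` and move the tensor to a GPU to exercise NCCL itself.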
+
+```python
+import os, torch, torch.distributed as dist, datetime
+rank = int(os.environ.get('RANK', 0))
+world_size = int(os.environ.get('WORLD_SIZE', 2))
+master = os.environ.get('MASTER_ADDR', 'localhost')
+port  = os.environ.get('MASTER_PORT', '29500')
+dist.init_process_group('gloo',
+    init_method=f'tcp://{master}:{port}',
+    world_size=world_size, rank=rank,
+    timeout=datetime.timedelta(seconds=120))
+t = torch.ones(100) * rank
+dist.all_reduce(t, op=dist.ReduceOp.SUM)
+expected = sum(range(world_size))
+assert t[0].item() == expected, f"Got {t[0].item()}, expected {expected}"
+print(f"[Rank {rank}] [PASS] AllReduce PASSED", flush=True)
+dist.destroy_process_group()
+```
+
+**Debug env vars:**
+
+```bash
+export NCCL_DEBUG=INFO              # verbose NCCL output
+export NCCL_DEBUG_SUBSYS=ALL        # all subsystems
+export TORCH_DISTRIBUTED_DEBUG=DETAIL
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1    # surface NCCL timeouts as exceptions
+export NCCL_DEBUG_FILE=/tmp/nccl_rank${RANK}.log
+# Extend PyTorch collective timeout in training code:
+#   dist.init_process_group("nccl", timeout=timedelta(seconds=1800))
+```
+
+**Dump call stack of hung process:**
+
+```bash
+# Inside the pod (EKS). sh -c makes pgrep run in the pod, not locally; [p]ython avoids matching the sh wrapper itself:
+kubectl exec -n <ns> <pod> -- pip install py-spy -q
+kubectl exec -n <ns> <pod> -- sh -c 'py-spy dump --pid "$(pgrep -f "[p]ython" | head -1)"'
+
+# On the node via SSM (both orchestrators):
+aws ssm start-session --target sagemaker-cluster:<CLUSTER_ID>_<GROUP>-<INSTANCE_ID>
+# On node:
+py-spy dump --pid $(pgrep -f python | head -1)
+py-spy record -o /tmp/profile.svg --pid <PID> --duration 30
+```
+
+**Root cause matrix:**
+
+| Timeout fires when            | Root cause                                                   | Fix                                                                                                                    |
+| ----------------------------- | ------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------- |
+| Before init completes         | SG missing self-ref / NetworkPolicy                          | Fix SG or remove blocking NetworkPolicy                                                                                |
+| Before init completes         | Wrong MASTER_ADDR / DNS failure                              | Fix headless service; use `<job>-0.<svc>.<ns>.svc.cluster.local`                                                       |
+| Before init completes         | WORLD_SIZE > actual pods                                     | Match WORLD_SIZE to `spec.completions`                                                                                 |
+| After init, during AllReduce  | One rank crashed (OOM/CUDA)                                  | Check pod logs for exit code 137                                                                                       |
+| After init, during AllReduce  | Straggler node (slow NIC)                                    | Run nccl-tests, drain slow node                                                                                        |
+| On large cluster (128+ nodes) | PyTorch collective timeout too low (default 10 min for NCCL) | Raise via `init_process_group(timeout=timedelta(seconds=<N>))`; `nodes*5+600` is a starting heuristic, not a guarantee |
+
+**Slurm MASTER_ADDR setup** (no headless service needed — Slurm resolves hostnames natively):
+
+```bash
+# In your sbatch script:
+export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -1)
+export MASTER_PORT=29500
+# Verify DNS works from all nodes:
+srun --overlap bash -c "nslookup $MASTER_ADDR"
+```
+
+**For 100+ node clusters — prioritized fix order:**
+
+1. Extend the PyTorch collective timeout (default: 10 min for NCCL, per the PyTorch distributed docs). Example starting value: `init_process_group(timeout=timedelta(seconds=<N>))` where `N` is tuned from your observed step time. `nodes*5+600` is a starting heuristic only.
+2. Check `memlock` — see Section 17 (field-observed workaround for topology-search hangs on 256+ node clusters).
+3. Run straggler detection — see `references/performance-testing.md` pairwise bandwidth test.
+4. Check for NCCL version drift after rolling node replacements — see Section 10.
+
+---
+
+## 2. Security Group Self-Reference Rules
+
+Commands and verification are in [operations.md § 8](operations.md#8-nccl-specific-remediations). Without inbound + outbound self-reference on the cluster SG, NCCL rendezvous and EFA RDMA traffic are dropped.
+
+---
+
+## 3. NCCL_SOCKET_IFNAME — Interface Selection
+
+**On EFA nodes (p4d/p5), always set explicitly:**
+
+```bash
+# Correct for EFA nodes — exclude non-VPC interfaces:
+export NCCL_SOCKET_IFNAME=^lo,docker,efa,veth,virbr
+
+# Find the correct VPC interface name:
+ip -br addr show | grep -vE "^lo|docker|br-|virbr|veth|efa" | grep UP | awk '{print $1}'
+```
+
+**Validate the setting works (leaves at least one interface):**
+
+```bash
+# After setting NCCL_SOCKET_IFNAME=^a,b,... verify it leaves interfaces (NCCL matches entries as name prefixes, hence the ^ anchor):
+PATTERN="${NCCL_SOCKET_IFNAME#^}"
+ip -br addr show | grep UP | awk '{print $1}' | \
+  grep -vE "^($(echo "$PATTERN" | tr ',' '|'))"
+# Must show at least one interface (e.g., ens5)
+```
+
+**Also set matching MPI variable:**
+
+```bash
+export OMPI_MCA_btl_tcp_if_include=ens5   # match your VPC ENI
+# OR:
+export OMPI_MCA_btl_tcp_if_exclude=lo,docker0,virbr0
+```
+
+---
+
+## 4. Container OOM — Pod Killed Mid-Training (exit code 137)
+
+**Symptom:** Pod status = OOMKilled, exit code 137. The Linux kernel killed the process due to cgroup memory limit.
+This is different from GPU OOM (see section 11).
+
+**Detect:**
+
+```bash
+# EKS: check container termination reason
+kubectl describe pod <POD> -n <NS> | grep -A5 "Last State:"
+# Shows: Reason: OOMKilled, Exit Code: 137
+
+# On node via SSM:
+dmesg | grep -i "oom\|killed process" | tail -10
+free -h
+```
+
+**Fix options (in order of impact):**
+
+```python
+# 1. Gradient checkpointing (most impact, slower backward pass)
+model.gradient_checkpointing_enable()
+
+# 2. FSDP (shard model across all GPUs in job)
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+model = FSDP(model, device_id=torch.cuda.current_device())
+
+# 3. Mixed precision (halve activation memory)
+from torch.cuda.amp import autocast, GradScaler
+scaler = GradScaler()
+with autocast():
+    loss = model(inputs)
+
+# 4. Reduce batch size
+batch_size = batch_size // 2  # halve until OOM resolves
+```
+
+```yaml
+# Increase K8s memory limits:
+resources:
+  limits:
+    memory: "64Gi"   # increase as needed
+    nvidia.com/gpu: "8"
+```
+
+---
+
+## 5. Wrong Results — Gradient Sync Issues
+
+**Verify AllReduce is actually happening:**
+
+```python
+def check_allreduce_consistency(tensor, name, rank, world_size):
+    """Verify all ranks have same values after AllReduce."""
+    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+    results = [None] * world_size
+    dist.all_gather_object(results, tensor.sum().item())
+    if rank == 0:
+        if len(set(round(r, 4) for r in results)) > 1:
+            print(f"[FAIL] INCONSISTENT '{name}': {results}", flush=True)
+        else:
+            print(f"[PASS] CONSISTENT '{name}': {results[0]:.4f}", flush=True)
+```
+
+**Check FSDP/DTensor placements:**
+
+```python
+from torch.distributed.tensor import DTensor
+for name, param in model.named_parameters():
+    if isinstance(param, DTensor):
+        print(f"[Rank {dist.get_rank()}] {name}: placements={param.placements}")
+    else:
+        print(f"[Rank {dist.get_rank()}] {name}: NOT sharded (unexpected for FSDP)")
+```
+
+**Print from all ranks in order (debugging):**
+
+```python
+def print_all_ranks(msg):
+    for r in range(dist.get_world_size()):
+        if dist.get_rank() == r:
+            print(f"[Rank {r}] {msg}", flush=True)
+        dist.barrier()
+```
+
+---
+
+## 6. EFA Configuration
+
+**Required for full performance on p4d/p5:**
+
+```bash
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1     # GPU Direct RDMA
+export NCCL_SOCKET_IFNAME=^lo,docker,efa,veth
+export NCCL_PROTO=Simple            # large-message protocol (valid: LL, LL128, Simple)
+# Collective timeout is a PyTorch arg — set via init_process_group(timeout=timedelta(seconds=1800))
+```
+
+**K8s pod spec for EFA:**
+
+```yaml
+resources:
+  limits:
+    vpc.amazonaws.com/efa: <N>   # match EFA device count for the instance type
+  requests:
+    vpc.amazonaws.com/efa: <N>
+```
+
+### Suggested command — install EFA K8s device plugin (run this yourself)
+
+**Preconditions:** EKS orchestrator with GPU nodes (p4d / p5 / p5e / p5en / p6); node AMI already has EFA kernel modules (verify `fi_info -p efa` returns endpoints on one node); cluster admin has approved installing a daemonset into `kube-system`. If EFA is already allocated to pods (pod `limits.vpc.amazonaws.com/efa > 0`), the plugin is already installed — skip.
+
+**Command:**
+
+```bash
+helm repo add eks https://aws.github.io/eks-charts
+helm install aws-efa-k8s-device-plugin --namespace kube-system \
+  eks/aws-efa-k8s-device-plugin
+```
+
+**Blast radius:** installs a daemonset in the `kube-system` namespace (one pod per node) that advertises `vpc.amazonaws.com/efa` as a schedulable resource. It is not undone automatically — rollback requires an explicit `helm uninstall aws-efa-k8s-device-plugin --namespace kube-system`. Interacts with every GPU-scheduling pod; misconfiguration can starve pods of EFA resources.
+
+**Verify EFA on node:**
+
+```bash
+fi_info -p efa                              # lists EFA endpoints
+cat /opt/amazon/efa_installed_packages      # EFA installer version
+lsmod | grep efa                            # kernel module loaded
+ls /dev/infiniband/uverbs*                  # device files exist
+nvidia-smi nvlink --status                  # NVLink (p4d/p5)
+```
+
+---
+
+## 7. Node Hardware Failures
+
+NCCL errors caused by GPU / EFA hardware faults (Xid errors, ECC, NVLink, off-bus) are diagnosed and remediated in the node-debugger skill: [hyperpod-node-debugger § G (GPU/Accelerator)](../../hyperpod-node-debugger/references/node-diagnostics-detail.md#g-gpuaccelerator) and [§ F (Hardware / Auto-Repair)](../../hyperpod-node-debugger/references/node-diagnostics-detail.md#f-hardware--auto-repair).
+
+Get the instance ID from a K8s node name:
+
+```bash
+kubectl get node <NODE_NAME> -o jsonpath='{.spec.providerID}' | cut -d'/' -f5
+```
+
+### Suggested command — drain before reboot/replace (EKS) (run this yourself)
+
+**Preconditions:** hardware fault confirmed on `<NODE_NAME>` (XID/ECC/NVLink/off-bus — see `hyperpod-node-debugger § G`); customer accepts that pods using `emptyDir` volumes on this node will lose that data when evicted; drain is preparation for `batch-reboot-cluster-nodes` (try first) or `batch-replace-cluster-nodes` — not a fix on its own. See [hyperpod-cluster-debugger § G.2](../../hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md#g2-manual-replacement).
+
+**Command:**
+
+```bash
+kubectl cordon <NODE_NAME>
+kubectl drain <NODE_NAME> --ignore-daemonsets --delete-emptydir-data
+```
+
+**Blast radius:** `--delete-emptydir-data` discards `emptyDir` scratch on this node (training caches, ephemeral checkpoints not persisted to PVC/`/opt/sagemaker`); pods are rescheduled elsewhere if capacity exists, otherwise stay Pending. Drain is reversible (`kubectl uncordon`) only if you decide not to proceed with reboot/replace.
+
+---
+
+## 8. Slurm-Specific Procedures
+
+**NCCL batch script template:**
+
+```bash
+#!/bin/bash
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --gpus-per-node=8
+#SBATCH --job-name=nccl-training
+
+# EFA settings (p4d/p5):
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1
+export NCCL_SOCKET_IFNAME=^lo,docker,efa,veth
+export NCCL_DEBUG=WARN
+# Set the PyTorch collective timeout in training code, not via env:
+#   dist.init_process_group("nccl", timeout=timedelta(seconds=1800))
+
+# Rendezvous (torchrun manages RANK/WORLD_SIZE automatically):
+export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -1)
+export MASTER_PORT=29500
+
+srun torchrun \
+  --nnodes=$SLURM_NNODES \
+  --nproc_per_node=8 \
+  --rdzv_backend=c10d \
+  --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+  train.py
+```
+
+Slurm node management and the `RemoveIPC=no` requirement are in [operations.md § 7](operations.md#7-slurm--nccl-specific-operations).
+
+---
+
+## 9. NCCL RAS — Live Job Health
+
+NCCL's RAS (Reliability, Availability, Serviceability) subsystem lets you query the state of a running NCCL job without attaching a debugger. Per the NCCL env-var reference, RAS is available since NCCL 2.24 and is enabled by default (`NCCL_RAS_ENABLE=1`); the listen address is configured via `NCCL_RAS_ADDR`. Confirm the actual port your build uses (it can be overridden by env or NCCL config) before assuming the example port number below.
+
+```bash
+# Find the RAS port for the running NCCL process (configurable via NCCL_RAS_ADDR):
+#   - Check the env of the training process:
+#       cat /proc/$(pgrep -f python | head -1)/environ | tr '\0' '\n' | grep NCCL_RAS_ADDR
+#   - Or check what's listening locally:
+#       ss -ltnp | grep -i nccl
+
+# Example (replace <PORT> with the actual RAS port for your build):
+echo "verbose status" | nc -w 3 localhost <PORT>
+
+# With the ncclras binary:
+ncclras -v
+ncclras -f json | python3 -m json.tool
+ncclras -m
+
+# Inside a K8s pod:
+kubectl exec -n <NS> <POD> -- sh -c "echo 'verbose status' | nc -w 3 localhost <PORT>"
+```
+
+**Interpret status:**
+
+- `RUNNING OK` — all ranks alive, progressing normally
+- `MISMATCH` — some ranks behind → possible straggler
+- `INCOMPLETE` — missing rank data → one rank unresponsive
+- `DEAD` / `PEER_DEAD` — a rank process is confirmed dead → this is the rank that hung the collective
+
+---
+
+## 10. NCCL Version Mismatch (`NCCL function not found`)
+
+**Symptom:** `NCCL function not found` or `Incompatible NCCL version` at job startup.
+**Cause:** Different NCCL builds across nodes — mixed container images or manual installs.
+
+**Diagnose:**
+
+```bash
+# Check NCCL version per running pod:
+for pod in $(kubectl get pods -n <NS> -l job-name=<JOB> --no-headers | awk '{print $1}'); do
+    echo -n "$pod: "
+    kubectl exec -n <NS> "$pod" -- \
+        python3 -c "import torch; print(torch.cuda.nccl.version())" 2>/dev/null \
+        || echo "unavailable"
+done
+
+# Check via library file:
+kubectl exec -n <NS> <POD> -- \
+    find /usr/local/cuda/lib64 /usr/lib -name "libnccl.so*" 2>/dev/null | head -3
+
+# Check CUDA driver version per node:
+kubectl get nodes -o custom-columns=\
+'NAME:.metadata.name,DRIVER:.metadata.labels.nvidia\.com/cuda\.driver-version' \
+2>/dev/null || kubectl get nodes -o wide
+```
+
+**Fix:**
+
+```bash
+# All pods in a job MUST use identical container images.
+# Verify your job spec uses the same image for all replicas:
+kubectl get pod -n <NS> -l job-name=<JOB> \
+    -o jsonpath='{range .items[*]}{.metadata.name}: {.spec.containers[0].image}{"\n"}{end}'
+# Every line must show the same image:tag
+
+# If different, update your job spec to pin every replica to the same image:
+# spec.template.spec.containers[0].image: <AWS DLC image URI from your region's DLC account>
+# e.g. an AWS Deep Learning Container pytorch-training image tagged for your CUDA + Python + OS combo
+```
+
+**Common cause on HyperPod:** Rolling node replacement installs a new AMI with a different NCCL version while old nodes are still in the cluster. Use lifecycle scripts to pin NCCL versions.
+
+---
+
+## 11. GPU OOM — `CUDA out of memory` / `cudaMalloc failed`
+
+**Symptom:** `CUDA out of memory`, `cudaMalloc failed`, or `RuntimeError: CUDA error: out of memory`.
+This is GPU VRAM exhaustion — distinct from container OOMKill (section 4).
+The process does NOT get killed by the kernel; PyTorch raises a Python exception.
+
+**Diagnose:**
+
+```bash
+# Check GPU memory usage on all GPUs:
+kubectl exec -n <NS> <POD> -- \
+    nvidia-smi --query-gpu=index,name,memory.used,memory.total,utilization.gpu \
+    --format=csv,noheader
+
+# In training script — add before suspected OOM:
+import torch
+for i in range(torch.cuda.device_count()):
+    used = torch.cuda.memory_allocated(i) / 1e9
+    reserved = torch.cuda.memory_reserved(i) / 1e9
+    total = torch.cuda.get_device_properties(i).total_memory / 1e9
+    print(f"GPU {i}: allocated={used:.1f}GB reserved={reserved:.1f}GB total={total:.1f}GB")
+    print(torch.cuda.memory_summary(i))
+```
+
+**Fix options (in order of impact):**
+
+```python
+# 1. Gradient checkpointing — trade compute for memory (most impactful)
+model.gradient_checkpointing_enable()
+
+# 2. ZeRO optimizer — shard optimizer states across ranks (DeepSpeed)
+# In deepspeed config:
+# "zero_optimization": {"stage": 3}   # ZeRO-3: shards params, grads, optimizer states
+
+# 3. FSDP — shard model weights across all GPUs
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+model = FSDP(model)
+
+# 4. Mixed precision — halve activation memory
+import torch
+with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+    loss = model(inputs)
+
+# 5. Reduce batch size — simplest fix
+batch_size = batch_size // 2
+
+# 6. Clear cache between steps (if fragmentation is the issue)
+torch.cuda.empty_cache()
+```
+
+**Memory fragmentation fix:**
+
+```python
+# If OOM happens after many steps (fragmentation):
+import gc
+gc.collect()
+torch.cuda.empty_cache()
+# Or: set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+```
+
+---
+
+## 12. DNS Resolution Failure (`Name or service not known`)
+
+**Symptom:** `Name or service not known`, `getaddrinfo failed`, or rendezvous hangs forever.
+**Cause:** MASTER_ADDR hostname cannot be resolved. Common on EKS when no headless Service is in place to give pods a stable DNS name.
+
+**Diagnose:**
+
+```bash
+# Check DNS from inside a pod (sh -c so $MASTER_ADDR expands in the pod, not in your local shell):
+kubectl exec -n <NS> <POD> -- sh -c 'nslookup "$MASTER_ADDR"'
+kubectl exec -n <NS> <POD> -- sh -c 'getent hosts "$MASTER_ADDR"'
+
+# Check if headless service exists:
+kubectl get svc -n <NS> -o wide | grep None
+# Should show: ClusterIP: None with selector matching training pods
+
+# Check CoreDNS is healthy:
+kubectl get pods -n kube-system -l k8s-app=kube-dns
+kubectl logs -n kube-system -l k8s-app=kube-dns --tail=20
+```
+
+**Fix:**
+
+```yaml
+# Create headless service for training job DNS:
+apiVersion: v1
+kind: Service
+metadata:
+  name: my-training-svc
+  namespace: <NS>
+spec:
+  clusterIP: None
+  selector:
+    app: my-training-job   # must match training pod labels
+  ports:
+  - port: 29500
+    name: nccl-rendezvous
+```
+
+```bash
+# Set MASTER_ADDR using the service DNS:
+export MASTER_ADDR="<job-name>-0.<service-name>.<namespace>.svc.cluster.local"
+```
+
+---
+
+## 13. EFA TCP Fallback (`NET/OFI Using TCP`)
+
+**Symptom:** In NCCL_DEBUG=INFO output, you see `NET/OFI Using TCP` instead of `NET/OFI Using EFA`.
+Training runs but at 10-100x lower bandwidth than expected.
+
+**Diagnose:**
+
+```bash
+# Check if EFA device plugin is installed:
+kubectl get daemonset -A | grep -i efa
+
+# Check if pod requests EFA:
+kubectl get pod <POD> -n <NS> -o jsonpath='{.spec.containers[0].resources.limits}'
+# Must include: vpc.amazonaws.com/efa
+
+# Check EFA env vars:
+kubectl exec -n <NS> <POD> -- env | grep FI_
+
+# Check on node via SSM:
+fi_info -p efa  # Must list EFA endpoints
+```
+
+**Fix checklist:**
+
+1. Install the EFA K8s device plugin — see the Suggested-command block earlier in this file (§ EFA device plugin).
+2. Request EFA in pod spec:
+
+   ```yaml
+   resources:
+     limits:
+       vpc.amazonaws.com/efa: <N>   # match EFA device count for the instance type
+   ```
+
+3. Set EFA env vars in the pod:
+
+   ```bash
+   export FI_PROVIDER=efa
+   export FI_EFA_USE_DEVICE_RDMA=1
+   export NCCL_SOCKET_IFNAME=^lo,docker,efa,veth
+   ```
+
+4. Ensure the `aws-ofi-nccl` plugin is in the container image (`find /opt/amazon -name "libnccl-net.so" 2>/dev/null`).
+
+---
+
+## 14. GPU P2P Access Blocked (ACS/IOMMU)
+
+**Symptom:** `NCCL WARN P2P not supported between dev X and dev Y` or `peer access is not supported`.
+Intra-node AllReduce is 10-50x slower because GPU Direct P2P transfers are blocked by PCI ACS.
+
+**Diagnose:**
+
+```bash
+# Check ACS on node via SSM:
+lspci -vvv 2>/dev/null | grep -A20 "PCI bridge" | grep "ACSCtl:"
+# If "SrcValid+" appears → ACS is enabled → P2P blocked
+
+# Check IOMMU:
+dmesg | grep -i iommu
+grep -oE "(intel_iommu|amd_iommu|iommu)=[^ ]+" /proc/cmdline
+
+# Check P2P topology:
+nvidia-smi topo -m
+# NV# = NVLink (fast), PIX/PXB/PHB = PCIe (slow)
+```
+
+### Suggested command — disable ACS on NVIDIA GPU bridges (last resort; run this yourself)
+
+**Preconditions:** P2P GPU traffic confirmed to fall back to CPU hops via `nvidia-smi topo -m`; GPU peer-to-peer blocked by PCIe ACS (`ACSCtl: SrcValid+` observed via `lspci -vvv`); confirmed the node is single-tenant (training workload only); you have reviewed that this weakens IOMMU isolation for the affected PCI bridges. Do NOT apply to multi-tenant or security-sensitive hosts.
+
+**Command:**
+
+```bash
+# Disable ACS on NVIDIA GPU upstream bridges only — scoping to 10de: avoids
+# weakening IOMMU isolation on unrelated PCI devices.
+for BDF in $(lspci -D -d 10de: | awk '{print $1}'); do
+  sudo setpci -s "$BDF" ECAP_ACS+0x6.w=0000 2>/dev/null
+done
+
+# For persistence, add the same NVIDIA-only scope to the lifecycle script:
+echo 'for BDF in $(lspci -D -d 10de: | awk "{print \$1}"); do setpci -s $BDF ECAP_ACS+0x6.w=0000 2>/dev/null; done' \
+  >> /opt/ml/scripts/on_create.sh
+```
+
+**Blast radius:** host-wide PCIe change for every NVIDIA GPU bridge on the node — takes effect immediately and lasts until the next reboot. IOMMU isolation for those bridges is reduced, which is acceptable on a dedicated training host but NOT acceptable on multi-tenant hosts. To roll back, reboot the node: this restores the default ACS state unless the lifecycle-script change was made, in which case remove that line from `/opt/ml/scripts/on_create.sh` as well.
+
+---
+
+## 15. Stale Shared Memory (`unlink shared memory`)
+
+**Symptom:** `unlink shared memory /dev/shm/nccl-* failed: No such file` or new training job
+fails with `File exists` on /dev/shm/nccl-* files left by a previous crash.
+
+**Cause:** Either systemd `RemoveIPC=yes` (default on RHEL/Amazon Linux) deletes NCCL shm
+mid-training, or a crashed training process left orphaned shm files.
+
+**Diagnose:**
+
+```bash
+# Check on node:
+ls -la /dev/shm/nccl-*
+grep RemoveIPC /etc/systemd/logind.conf
+```
+
+### Suggested command — clean stale shm and disable RemoveIPC (run this yourself)
+
+**Preconditions:** no NCCL training job is currently running on this node (`pgrep -af 'python.*torchrun|mpirun'` returns empty — a plain `ps aux | grep` would always match its own `grep` process); `RemoveIPC=yes` confirmed in `/etc/systemd/logind.conf`; brief `systemd-logind` restart is acceptable on this node.
+
+**Command:**
+
+```bash
+# 1. Clean up stale files
+rm -f /dev/shm/nccl-*
+
+# 2. Prevent systemd from deleting shm mid-training
+echo "RemoveIPC=no" | sudo tee -a /etc/systemd/logind.conf >/dev/null
+sudo systemctl restart systemd-logind
+
+# 3. For persistence across replacements, add to the lifecycle script:
+echo 'echo "RemoveIPC=no" >> /etc/systemd/logind.conf && systemctl restart systemd-logind' \
+  >> /opt/ml/scripts/on_create.sh
+```
+
+**Blast radius:** `rm -f /dev/shm/nccl-*` silently destroys any active NCCL shared-memory segments — running a collective at the same time will fail. `RemoveIPC=no` is a persistent systemd change; the `systemctl restart` logs out anyone in a systemd user session. Lifecycle-script edit persists across node replacements.
+
+---
+
+## 16. Host Firewall Blocking NCCL (iptables/nftables)
+
+**Symptom:** NCCL timeout even though SG rules and NetworkPolicy are correct.
+Root cause: host-level iptables or nftables DROP/REJECT rules blocking NCCL ports.
+
+**Diagnose:**
+
+```bash
+# On node via SSM:
+iptables -L -n | grep -E "DROP|REJECT"
+nft list ruleset 2>/dev/null | grep -E "drop|reject"
+```
+
+### Suggested command — adjust host firewall to allow NCCL traffic (run this yourself)
+
+**Preconditions:** identified a specific iptables/nftables rule blocking NCCL traffic via `iptables -L -n --line-numbers`; confirmed the rule is **not** managed by `kube-proxy` (those typically appear in the `KUBE-*` chains — never delete those) or the VPC CNI; customer has approved either deleting the specific rule or adding an explicit ACCEPT rule for NCCL ports.
+
+**Command (preferred — add explicit allow rather than touch existing rules):**
+
+```bash
+# Allow NCCL rendezvous port range:
+iptables -I INPUT -p tcp --dport 29400:29500 -j ACCEPT
+# Allow the NCCL RAS port if RAS is enabled and used (read your NCCL_RAS_ADDR setting):
+# iptables -I INPUT -p tcp --dport <NCCL_RAS_PORT> -j ACCEPT
+```
+
+**Command (alternative — delete a specific custom rule by line number):**
+
+```bash
+iptables -L -n --line-numbers   # confirm the line number first
+iptables -D INPUT <rule_number>
+```
+
+**Blast radius:** `iptables -I INPUT ... -j ACCEPT` adds a rule at the top of the INPUT chain — host-wide effect, cleared on reboot unless persisted via `iptables-save`. Deleting a rule by line number is precise but irreversible without the original rule definition; capture `iptables-save` first if you may need to roll back. Never run `iptables -F` on an EKS worker — it flushes `kube-proxy`'s service rules and VPC CNI NetworkPolicy enforcement, breaking pod networking cluster-wide.
+
+---
+
+## 17. RDMA Memory Registration Failure (`ibv_reg_mr failed`)
+
+**Symptom:** `NCCL WARN Call to ibv_reg_mr failed` followed by EFA falling back to TCP — training continues but at 10-100x lower bandwidth.
+
+**Cause:** The Linux `memlock` limit prevents the EFA driver from pinning memory for RDMA DMA transfers. With `memlock=0` or very low values, EFA cannot register any memory buffers.
+
+**Diagnose:**
+
+```bash
+# Check current memlock limit:
+ulimit -l
+# Should be: unlimited or ≥8388608 (8GB in KB)
+# If 0 or 64 → FAIL
+
+# Check on the actual node via SSM:
+aws ssm start-session --target sagemaker-cluster:<CLUSTER_ID>_<GROUP>-<INSTANCE_ID>
+# On node:
+ulimit -l
+cat /proc/$(pgrep -f python | head -1)/limits | grep "Max locked"
+
+# In NCCL debug output (NCCL_DEBUG=INFO):
+# "NCCL WARN Call to ibv_reg_mr failed, got error (12)" → errno 12 = ENOMEM (memlock)
+```
+
+### Suggested command — raise memlock for EFA RDMA (run this yourself)
+
+**Preconditions:** `ulimit -l` confirmed at 0 / 64 / very low on the affected node; `Call to ibv_reg_mr failed` confirmed in NCCL/EFA logs; customer accepts a session/login change (immediate path) or a persistent change to `/etc/security/limits.conf` (permanent path); for K8s pods the change must be applied in the pod spec, not on the node.
+
+**Command — immediate (session only, lost on logout):**
+
+```bash
+ulimit -l 8388608       # 8 GB in KB
+```
+
+**Command — permanent (system-wide):**
+
+```bash
+echo "* soft memlock 8388608" >> /etc/security/limits.conf
+echo "* hard memlock 8388608" >> /etc/security/limits.conf
+# Requires re-login to take effect.
+
+# For Slurm:
+echo "ulimit -l 8388608" >> /etc/slurm/prolog.sh
+```
+
+**Pod spec (K8s) — required for containerized training:**
+
+```yaml
+securityContext:
+  capabilities:
+    add: ["IPC_LOCK"]
+# A high memlock limit on the host is not visible inside the container without
+# IPC_LOCK; without this capability, the pod still hits memlock=0 / very low.
+```
+
+**Blast radius:** session ulimit affects only the current login shell. `/etc/security/limits.conf` change persists across reboots and applies to **every** user who logs in afterwards. Slurm prolog change applies to every job step launched after the edit. K8s pod-spec change is per-pod. For HyperPod, replication across replacement nodes requires baking the limits.conf change into the lifecycle script.
+
+**Note — field observation on large clusters (not NCCL- or AWS-documented):** HyperPod support has seen NCCL topology-graph-search failures on 256+ node clusters when `memlock` is set to `unlimited`. Using a large fixed value (e.g. `8388608`) instead of `unlimited` has cleared these in field cases. If you hit this, engage AWS Support with the NCCL topology-search failure output.
+
+**Verify fix worked:**
+
+```bash
+# After fix, NCCL_DEBUG=INFO should show:
+# "NCCL INFO NET/OFI Using EFA RDMA" (not TCP fallback)
+# No more "ibv_reg_mr failed" warnings
+
+# Check effective bandwidth after fix:
+/opt/nccl-tests/build/all_reduce_perf -b 1G -e 8G -f 2 -g 1
+# Should match expected algbw for your instance type
+```
+
+---
+
+## 18. Distributed Training Frameworks — NCCL Tuning
+
+NCCL issues often surface differently depending on the distributed training framework. Framework-specific guidance:
+
+### FSDP (Fully Sharded Data Parallel — PyTorch native)
+
+**Common NCCL issues with FSDP:**
+
+| Symptom                                      | Cause                               | Fix                                                                            |
+| -------------------------------------------- | ----------------------------------- | ------------------------------------------------------------------------------ |
+| Hang at `_init_intra_and_inter_node_groups`  | NCCL can't form process groups      | Check `MASTER_ADDR`, `MASTER_PORT`, firewall rules, and headless service (EKS) |
+| OOM during FSDP wrapping                     | All-gather materializes full params | Use `sharding_strategy=FULL_SHARD`, enable `cpu_offload` if needed             |
+| Slow FSDP training vs DDP                    | Excessive all-gather/reduce-scatter | Tune `limit_all_gathers=True`, enable `forward_prefetch=True`                  |
+| `NCCL watchdog timeout` during checkpointing | Distributed checkpoint blocks NCCL  | Use `StateDictType.SHARDED_STATE_DICT` for async checkpoint save               |
+
+**Recommended NCCL env vars for FSDP on HyperPod:**
+
+```bash
+export NCCL_SOCKET_IFNAME=^lo,docker
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1
+export NCCL_ALGO=Ring           # Ring is generally better for FSDP all-gather patterns
+export NCCL_PROTO=Simple        # Simple protocol for large-message FSDP comms
+# FSDP checkpoint can be slow at scale — extend the PyTorch collective timeout:
+#   dist.init_process_group("nccl", timeout=timedelta(seconds=1800))
+```
+
+### DeepSpeed
+
+**Common NCCL issues with DeepSpeed:**
+
+| Symptom                                       | Cause                                 | Fix                                                                                                                                                                                                                                        |
+| --------------------------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `RuntimeError: NCCL communicator was aborted` | Timeout during ZeRO all-gather        | Extend PyTorch collective timeout via `init_process_group(timeout=...)`; check for straggler nodes                                                                                                                                         |
+| OOM with ZeRO Stage 3                         | Parameter partitioning + NCCL buffers | Reduce `stage3_max_live_parameters`, enable `offload_optimizer`                                                                                                                                                                            |
+| Slow DeepSpeed init on 100+ nodes             | Sequential NCCL group creation        | Set `TORCH_NCCL_ASYNC_ERROR_HANDLING=1` (the older `NCCL_ASYNC_ERROR_HANDLING` was renamed to the `TORCH_NCCL_*` namespace in recent PyTorch; check your PyTorch's `torch.distributed` env-var docs); increase `init_timeout` in ds_config |
+| `ncclInternalError` with pipeline parallelism | Cross-node P2P fails                  | Ensure `NCCL_P2P_LEVEL=NVL` for intra-node, check EFA for inter-node                                                                                                                                                                       |
+
+**DeepSpeed config tuning for HyperPod:**
+
+```json
+{
+  "comms_config": {
+    "comms_backend": "nccl",
+    "timeout": 1800
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "stage3_max_live_parameters": 1e8,
+    "stage3_prefetch_bucket_size": 5e7,
+    "reduce_bucket_size": 5e8
+  }
+}
+```
+
+### Megatron-LM
+
+**Common NCCL issues with Megatron-LM:**
+
+| Symptom                                     | Cause                                           | Fix                                                                                                                                             |
+| ------------------------------------------- | ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
+| Hang at `initialize_model_parallel`         | NCCL group creation fails across nodes          | Verify world size = TP \* PP \* DP, check network connectivity                                                                                  |
+| Slow tensor-parallel matmul                 | NCCL all-reduce on small tensors is inefficient | Increase TP group size to stay intra-node (TP ≤ GPUs/node)                                                                                      |
+| Pipeline bubble > 40%                       | PP schedule inefficiency                        | Reduce PP stages, increase micro-batches, try interleaved schedule                                                                              |
+| `ncclGroupEnd failed` during 3D parallelism | Too many simultaneous NCCL groups               | Cap NCCL channel count for memory-constrained setups — use `NCCL_MAX_CTAS=2` (replaces the older `NCCL_MAX_NCHANNELS`, deprecated in NCCL 2.17) |
+
+**Megatron-LM parallelism mapping for HyperPod:**
+
+```
+Rule of thumb:
+  TP (tensor parallel) = within a single node (8 GPUs on p5)
+  PP (pipeline parallel) = across nodes (minimizes cross-node comms volume)
+  DP (data parallel) = remaining nodes
+
+  World size = TP × PP × DP
+  Example: 32 p5.48xlarge (256 GPUs)
+    TP=8, PP=4, DP=8 → 8×4×8 = 256
+```
+
+---
+
+## 19. Advanced NCCL Tuning (NVLS, PXN, Topology, Cross-NIC)
+
+### NVLS — NVLink SHARP (GPU-to-GPU hardware offload)
+
+NVLS is NVIDIA's in-network aggregation over NVLink. Per the NCCL env-var reference, `NCCL_NVLS_ENABLE` defaults to `2` (since NCCL 2.17), meaning NVLS is enabled when supported. It speeds up small-message AllReduce on H100/H200 nodes but **requires matching driver and container versions** — driver/container mismatch is a common cause of NVLS-related hangs in field cases.
+
+**Symptoms:**
+
+- Hang inside `ncclAllReduce` on p5/p5e/p5en
+- `NCCL INFO ... NVLS ... failed`
+- Fine on 1 node, hang on 2+ nodes
+
+**Diagnosis:**
+
+```bash
+# Check NCCL version (container side)
+python3 -c "import torch; print(torch.cuda.nccl.version())"
+# Check driver version (node side, via SSM)
+nvidia-smi --query-gpu=driver_version --format=csv
+```
+
+**Mitigations:**
+
+1. Disable NVLS temporarily to isolate:
+
+   ```bash
+   export NCCL_NVLS_ENABLE=0
+   ```
+
+2. Pin NCCL version across all pods/jobs (match container image digest, not tag).
+3. Upgrade the NVIDIA driver on the AMI via `UpdateClusterSoftware` if the container expects a newer driver.
+
+### PXN — PCI × NVLink (p5.48xlarge optimal config)
+
+PXN (PCI × NVLink) lets a GPU send inter-node traffic over NVLink through an intermediate peer GPU on the same node, so that it can reach a NIC that is not local to it and maximize NIC utilization. The documented PXN env var is `NCCL_P2P_PXN_LEVEL` (since NCCL 2.12), which controls PXN usage for send/receive — default is `2` (always use PXN); set `0` to disable. There are also `NCCL_PXN_DISABLE` and `NCCL_PXN_C2C` knobs; consult the NCCL env-var reference for the version in use.
+
+`NCCL_CROSS_NIC` defaults to `2` (per the NCCL docs: "Try to use the same NIC for the same ring/tree, but still allow for the use of different NICs if it would result in a better performance") — leave at default unless you've measured a regression.
+
+```bash
+# Tuning knobs — measure before/after with nccl-tests:
+export NCCL_P2P_PXN_LEVEL=2     # default; 0 disables PXN
+
+# Channel count: NCCL_MIN_NCHANNELS / NCCL_MAX_NCHANNELS were deprecated in
+# NCCL 2.17 in favor of NCCL_MIN_CTAS / NCCL_MAX_CTAS (per NCCL env-var docs).
+# Both names still work on recent versions.
+export NCCL_MIN_CTAS=4
+```
+
+If these cause regressions on smaller jobs (< 16 nodes), unset and re-measure with the defaults.
+
+### NCCL_TOPO_FILE — Custom Topology
+
+NCCL auto-discovers topology on p-family instances and usually picks the right plan. Use a custom topology file only when:
+
+- Running in containers that hide the PCIe topology from NCCL
+- Using an instance type NCCL doesn't recognize
+- Debugging suboptimal ring/tree selection
+
+To export the topology NCCL sees for manual inspection:
+
+```bash
+export NCCL_TOPO_DUMP_FILE=/tmp/nccl-topo.xml
+# Run any NCCL op (e.g., all_reduce_perf), then inspect /tmp/nccl-topo.xml
+```
+
+Do **not** ship a hand-edited topology file unless you've confirmed the default is wrong — this is an advanced-user escape hatch.
+
+### NCCL_SOCKET_FAMILY — IPv4 Forcing
+
+Dual-stack environments (IPv6 enabled on the VPC but IPv4 intended for NCCL) can cause silent TCP fallback. Force IPv4:
+
+```bash
+export NCCL_SOCKET_FAMILY=AF_INET
+```
+
+### Mixed instance families
+
+Mixing different P-family generations in a single NCCL communicator (e.g. p4d + p5) is risky — the topology and EFA adapter counts differ, which can cause NCCL algorithm-selection issues. If you need to do this, measure carefully with nccl-tests first; otherwise launch separate jobs per instance family.
+
+### NCCL_COLLNET_ENABLE on EFA
+
+`NCCL_COLLNET_ENABLE=1` enables NVIDIA's Collective Network (CollNet) protocol, used with SHARP on InfiniBand fabrics. EFA is not InfiniBand and does not provide a SHARP-compatible CollNet provider, so leaving CollNet enabled on EFA can lead to wasted init time or fallback. If a job script sets `NCCL_COLLNET_ENABLE=1`, set it to `0` for HyperPod EFA clusters:
+
+```bash
+export NCCL_COLLNET_ENABLE=0
+```
+
+### Instance family EFA counts (reference)
+
+Counts from authoritative AWS sources where available. Always confirm live with `ls /dev/infiniband/uverbs* | wc -l` on the node — instance counts vary across firmware revisions.
+
+| Instance type | Expected EFA count |
+| ------------- | ------------------ |
+| p5.48xlarge   | 32                 |
+| p5e.48xlarge  | 32                 |
+| p5en.48xlarge | 16                 |
+| p4d.24xlarge  | 4                  |
+
+For other EFA-supported types (p4de, p5.4xlarge, trn1, trn1n, trn2, etc.), check the current EC2 instance-types doc rather than hard-coding a value here. Mismatch with the live count → EFA driver not loaded, or a subset of NICs didn't attach at boot. Reboot via `batch-reboot-cluster-nodes` first; replace if reboot doesn't recover.
+
+---
+
+## 20. Pending / CrashLoopBackOff / Init-Container Failures
+
+Pod lifecycle failures surface as `Pending`, `CrashLoopBackOff`, or stuck in an init container. These are NOT NCCL bugs per se — they block the NCCL job from starting. Diagnose in this order:
+
+### Pending pods
+
+```bash
+# Why is it pending?
+kubectl describe pod <POD> -n <NS> | sed -n '/Events:/,$p' | head -40
+```
+
+Common reasons and where to fix:
+
+| Event message                                                                  | Root cause                                                                           | Where to fix                                                                                                    |
+| ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------- |
+| `0/N nodes are available: N Insufficient <resource>`                           | Not enough CPU/mem/GPU free                                                          | Wait for other jobs, or scale the cluster                                                                       |
+| `0/N nodes are available: N node(s) didn't match Pod's node affinity/selector` | Affinity/selector too strict                                                         | Fix `nodeSelector` / `nodeAffinity` in the pod spec                                                             |
+| `0/N nodes are available: N node(s) had untolerated taint`                     | Taints on HyperPod nodes (check `kubectl describe node <N>` for the exact taint key) | Add matching `tolerations` to the pod spec                                                                      |
+| `failed to create pod sandbox: ... CNI`                                        | VPC CNI problem                                                                      | Delegate to `hyperpod-node-debugger` § O                                                                        |
+| `MountVolume.SetUp failed for volume`                                          | PVC binding issue                                                                    | Check PVC status, StorageClass, EBS/FSx availability                                                            |
+| `ImagePullBackOff` / `ErrImagePull`                                            | Container image pull failed                                                          | Check ECR pull permissions on the node role; check image URI; confirm VPC endpoint for ECR if in private subnet |
+| (no events; just stuck)                                                        | Scheduler starved or no matching pool                                                | `kubectl get events -A --sort-by=.lastTimestamp \| tail -50` for cluster-wide scheduler state                   |
+
+### CrashLoopBackOff
+
+```bash
+kubectl logs <POD> -n <NS> --previous | tail -100   # logs from the crashed container
+kubectl describe pod <POD> -n <NS>                   # last termination state + exit code
+```
+
+Map the exit code to the guide section:
+
+| Exit code       | Meaning                                       | Section                                                |
+| --------------- | --------------------------------------------- | ------------------------------------------------------ |
+| 137 (OOMKilled) | Container OOM                                 | § 4 Container OOM                                      |
+| 143 (SIGTERM)   | Liveness probe failed or graceful termination | Check liveness probe; check preceding SIGTERM in logs  |
+| 139 (SIGSEGV)   | Segfault — often CUDA / driver mismatch       | § 10 NCCL Version Mismatch                             |
+| 1 / 2 / other   | Application error                             | Read `kubectl logs --previous` for the app-level error |
+
+### Stuck in init container
+
+```bash
+kubectl get pod <POD> -n <NS> -o jsonpath='{.status.initContainerStatuses}' | python3 -m json.tool
+kubectl logs <POD> -n <NS> -c <INIT_CONTAINER_NAME>
+```
+
+Common init-container failures:
+
+- Fetching model weights from S3 — check IAM, VPC endpoint, bucket policy.
+- Downloading dataset — DNS / network / auth.
+- Running a `chown`/`chmod` on a large volume — timeout.
+- Waiting for another pod (headless service / init-container-as-gate pattern) — the dependency pod never became Ready.
+
+### Remediation is always customer-driven
+
+None of these states have a one-command fix. Walk the customer through the diagnosis above, identify the specific cause, then apply the targeted fix. Do not `kubectl delete` pods without understanding why.
+
+---
+
+## 21. GPU Row-Remap / DCGM Health (Marginal Memory Silent Degrader)
+
+When NCCL aborts or training accuracy regresses without matching Xid/ECC counts — sporadic NaNs, intermittent AllReduce hangs, DCGM default `medium,memtest` passes but a GPU is silently returning bad data — the cause is usually a pending row-remap or a marginal GPU that DCGM's combined-run is masking.
+
+Diagnosis procedure, remap state table, DCGM split-run workaround, and escalation bundle (`nvidia-bug-report.sh` + `/var/log/nvidia-dcgm/`) are in the node-debugger skill: [hyperpod-node-debugger § G.1.a/b](../../hyperpod-node-debugger/references/node-diagnostics-detail.md#g1-nvidia-p4dp5g5g6).
diff --git a/plugins/sagemaker-ai/skills/hyperpod-nccl/references/error-patterns-quick-ref.md b/plugins/sagemaker-ai/skills/hyperpod-nccl/references/error-patterns-quick-ref.md
new file mode 100644
index 00000000..0f8a5f11
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-nccl/references/error-patterns-quick-ref.md
@@ -0,0 +1,47 @@
+# NCCL Error Pattern Reference
+
+Quick-lookup table of NCCL log patterns → code → root cause → fix. Used by the diagnostic script to map log lines to a remediation section in `debugging-guide.md`.
+
+| Log pattern                                | Code                    | Root cause                        | Fix                                                                                                                        |
+| ------------------------------------------ | ----------------------- | --------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
+| **Rendezvous / connection**                |                         |                                   |                                                                                                                            |
+| `Timeout waiting for`                      | `TIMEOUT_RENDEZVOUS`    | Peers not joining init            | SG self-ref, NetworkPolicy, MASTER_ADDR                                                                                    |
+| `Connection refused`                       | `CONN_REFUSED`          | Rank-0 not listening              | Fix MASTER_ADDR + headless service                                                                                         |
+| `Address already in use`                   | `PORT_CONFLICT`         | Port 29500 bound                  | Change MASTER_PORT to 29501                                                                                                |
+| `NCCL WARN Connect to`                     | `CONNECT_FAIL`          | NCCL peer blocked                 | SG self-ref + NetworkPolicy                                                                                                |
+| `network is unreachable`                   | `NET_UNREACHABLE`       | No route to MASTER_ADDR           | DNS + VPC routing + SG                                                                                                     |
+| `Error in Store` / `DistStoreError`        | `STORE_ERR`             | c10d rendezvous timeout           | Fix network first                                                                                                          |
+| `RendezvousConnectionError`                | `RDZV_CONN_ERR`         | Elastic rendezvous failed         | MASTER_ADDR DNS + SG                                                                                                       |
+| `RendezvousTimeout`                        | `RDZV_TIMEOUT`          | Elastic rendezvous timed out      | Peers not reachable                                                                                                        |
+| `Name or service not known`                | `DNS_FAIL`              | DNS resolution failed             | Create headless service                                                                                                    |
+| `getaddrinfo failed`                       | `DNS_FAIL`              | DNS resolution failed             | CoreDNS + headless service                                                                                                 |
+| **Runtime / AllReduce**                    |                         |                                   |                                                                                                                            |
+| `Watchdog timeout`                         | `WATCHDOG_TIMEOUT`      | AllReduce timed out               | Extend PyTorch `init_process_group(timeout=...)`; find straggler                                                           |
+| `unhandled system error`                   | `SYSTEM_ERROR`          | GPU/EFA hardware                  | SSM: dmesg XID errors; reboot node                                                                                         |
+| `unhandled cuda error`                     | `CUDA_ERROR`            | CUDA runtime error                | GPU driver crash or hardware fault                                                                                         |
+| `peer access is not supported`             | `P2P_FAIL`              | GPU P2P blocked by ACS/IOMMU      | Disable ACS; check IOMMU                                                                                                   |
+| `NCCL WARN Cuda failure`                   | `CUDA_ERROR`            | CUDA failure inside NCCL          | GPU hardware or driver                                                                                                     |
+| `Call to ncclCommAbort`                    | `NCCL_COMM_ABORT`       | Communicator aborted              | Check for straggler or hardware fault                                                                                      |
+| **EFA / libfabric**                        |                         |                                   |                                                                                                                            |
+| `fi_getinfo failed`                        | `EFA_INIT_FAIL`         | EFA not available                 | Fix EFA; use gloo on non-EFA                                                                                               |
+| `NCCL_OFI_RDMA`                            | `OFI_ERROR`             | aws-ofi-nccl broken               | Check plugin + EFA version                                                                                                 |
+| `Call to ibv_reg_mr failed`                | `RDMA_REG_FAIL`         | memlock=0 blocks EFA RDMA         | `ulimit -l 8388608`                                                                                                        |
+| `NET/OFI Using TCP`                        | `EFA_TCP_FALLBACK`      | Fell back to TCP                  | Fix EFA device plugin + env                                                                                                |
+| `Failed to load NCCL`                      | `NCCL_LOAD_FAIL`        | libnccl.so missing                | Check LD_LIBRARY_PATH                                                                                                      |
+| `libnccl-net.so`                           | `OFI_LOAD_FAIL`         | OFI plugin missing                | Install aws-ofi-nccl                                                                                                       |
+| **OOM / resource limits**                  |                         |                                   |                                                                                                                            |
+| `OOMKilled`                                | `OOM_KILL`              | Pod out of memory                 | Reduce batch size; increase limits                                                                                         |
+| `CUDA out of memory` / `cudaMalloc failed` | `CUDA_OOM`              | GPU VRAM exhausted                | Reduce batch size, enable ZeRO                                                                                             |
+| `failed to extend /dev/shm` / `Bus error`  | `SHM_FULL`              | /dev/shm too small                | emptyDir medium:Memory 10Gi                                                                                                |
+| `ENOMEM`                                   | `ENOMEM`                | Memory alloc/registration failure | Check memlock + GPU memory                                                                                                 |
+| **Version / config**                       |                         |                                   |                                                                                                                            |
+| `NCCL function not found`                  | `NCCL_VERSION_MISMATCH` | Mixed NCCL versions               | Use identical container images                                                                                             |
+| `Incompatible NCCL version`                | `NCCL_VERSION_MISMATCH` | Mixed NCCL versions               | Use identical container images                                                                                             |
+| `Could not find interface`                 | `IFACE_NOT_FOUND`       | Bad NCCL_SOCKET_IFNAME            | Set `^lo,docker,efa,veth,virbr`                                                                                            |
+| `world_size mismatch`                      | `WORLD_SIZE_MISMATCH`   | WORLD_SIZE ≠ ranks                | WORLD_SIZE = pods × GPUs/pod                                                                                               |
+| `doesn't have NCCL built in`               | `NCCL_NOT_BUILT`        | PyTorch without NCCL              | Use AWS DLC image                                                                                                          |
+| `CUDA_VISIBLE_DEVICES`                     | `CUDA_VIS_DEV`          | GPUs hidden from training         | Remove CUDA_VISIBLE_DEVICES                                                                                                |
+| `invalid alignment`                        | `CUDA_ALIGN_ERR`        | CUDA alignment error              | Check driver/NCCL version compat                                                                                           |
+| **Stale state / topology**                 |                         |                                   |                                                                                                                            |
+| `unlink shared memory`                     | `SHM_STALE`             | Stale /dev/shm/nccl-* files       | Set `RemoveIPC=no`; clean up                                                                                               |
+| `MNNVL topology`                           | `MNNVL_TOPO_FAIL`       | NCCL topology search failure      | Try fixed memlock (e.g. `ulimit -l 8388608`) — field-observed workaround, not NCCL-documented; see debugging-guide.md § 17 |
diff --git a/plugins/sagemaker-ai/skills/hyperpod-nccl/references/operations.md b/plugins/sagemaker-ai/skills/hyperpod-nccl/references/operations.md
new file mode 100644
index 00000000..9a3424f0
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-nccl/references/operations.md
@@ -0,0 +1,393 @@
+# NCCL Operations Reference
+
+Operational procedures and lookup tables for the NCCL skill.
+
+---
+
+## 1. Getting cluster names
+
+The HyperPod cluster name ≠ the EKS cluster name.
+
+```bash
+# List HyperPod clusters:
+aws sagemaker list-clusters --region <REGION> \
+  --query 'ClusterSummaries[*].[ClusterName,ClusterStatus,CreationTime]' --output table
+
+# EKS cluster behind a HyperPod cluster:
+EKS_ARN=$(aws sagemaker describe-cluster \
+  --cluster-name <HYPERPOD-NAME> --region <REGION> \
+  --query 'Orchestrator.Eks.ClusterArn' --output text)
+EKS_NAME=$(echo "$EKS_ARN" | awk -F'/' '{print $NF}')
+
+aws eks update-kubeconfig --name "$EKS_NAME" --region <REGION>
+```
+
+---
+
+## 2. IAM
+
+### Read-only diagnostic
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [{
+    "Sid": "NCCLSkillReadOnly",
+    "Effect": "Allow",
+    "Action": [
+      "sagemaker:DescribeCluster",
+      "sagemaker:ListClusters",
+      "sagemaker:ListClusterNodes",
+      "sagemaker:ListClusterEvents",
+      "ec2:DescribeSecurityGroups",
+      "ec2:DescribeVpcs",
+      "ec2:DescribeSubnets",
+      "ec2:DescribeInstances",
+      "logs:DescribeLogGroups",
+      "logs:DescribeLogStreams",
+      "logs:FilterLogEvents",
+      "logs:GetLogEvents",
+      "ssm:StartSession",
+      "ssm:DescribeSessions",
+      "ssm:TerminateSession"
+    ],
+    "Resource": "*"
+  }]
+}
+```
+
+### Per-remediation permissions
+
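+Grant each action only if the operator will run the corresponding command (`eks:DescribeCluster` is the exception: `aws eks update-kubeconfig` is a prerequisite for every kubectl-based check, not a fix):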
+
+| Suggested command                                   | Required action                                |
+| --------------------------------------------------- | ---------------------------------------------- |
+| `aws ec2 authorize-security-group-{ingress,egress}` | `ec2:AuthorizeSecurityGroupIngress` / `Egress` |
+| `aws sagemaker batch-reboot-cluster-nodes`          | `sagemaker:BatchRebootClusterNodes`            |
+| `aws sagemaker batch-replace-cluster-nodes`         | `sagemaker:BatchReplaceClusterNodes`           |
+| `aws eks update-kubeconfig`                         | `eks:DescribeCluster`                          |
+| `kubectl delete/create networkpolicy`               | EKS access entry + RBAC on `networkpolicies`   |
+
+### kubectl RBAC (EKS read — write only if operator applies a fix)
+
+```yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: nccl-skill-read
+rules:
+- apiGroups: [""]
+  resources: ["nodes", "pods", "pods/log", "namespaces", "services"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources: ["pods/exec"]
+  verbs: ["create"]
+- apiGroups: ["networking.k8s.io"]
+  resources: ["networkpolicies"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: ["apps"]
+  resources: ["daemonsets"]
+  verbs: ["get", "list"]
+- apiGroups: ["batch"]
+  resources: ["jobs"]
+  verbs: ["get", "list"]
+```
+
+If the operator deletes/creates a NetworkPolicy, grant `delete`/`create` on `networkpolicies` scoped to the training namespace.
+
+---
+
+## 3. SSM target format (HyperPod)
+
+```
+sagemaker-cluster:<CLUSTER_ID>_<INSTANCE_GROUP>-<INSTANCE_ID>
+```
+
+`CLUSTER_ID` is the ARN suffix — not the cluster name. Full connect procedure is in the node-debugger skill (`references/node-diagnostics-detail.md § K`). `send-command` against a bare instance ID will fail with `ValidationException` — HyperPod's managed fleet requires `start-session` with the prefixed target.
+
+---
+
+## 4. CloudWatch — NCCL log collection
+
+NCCL logs are not collected by HyperPod by default. Add this to the lifecycle script so logs ship to the same log group as lifecycle/health-monitoring logs:
+
+```bash
+# Amazon Linux: yum install -y amazon-cloudwatch-agent
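+# Ubuntu:       apt-get install -y amazon-cloudwatch-agent
+# CLUSTER_NAME and CLUSTER_ID must be set before this runs; the unquoted EOF lets the shell expand them ({instance_id} is left for the agent).
+cat > /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json <<EOF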
+{
+  "logs": {
+    "logs_collected": {
+      "files": {
+        "collect_list": [
+          {"file_path": "/var/log/nccl.log",
+           "log_group_name": "/aws/sagemaker/Clusters/${CLUSTER_NAME}/${CLUSTER_ID}",
+           "log_stream_name": "{instance_id}/nccl"},
+          {"file_path": "/var/log/training/*.log",
+           "log_group_name": "/aws/sagemaker/Clusters/${CLUSTER_NAME}/${CLUSTER_ID}",
+           "log_stream_name": "{instance_id}/training"}
+        ]
+      }
+    }
+  }
+}
+EOF
+
+/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl \
+  -a fetch-config -m ec2 \
+  -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s
+```
+
+### Query NCCL errors
+
+```bash
+CLUSTER_ID=$(aws sagemaker describe-cluster --cluster-name <NAME> --region <R> \
+  --query 'ClusterArn' --output text | awk -F'/' '{print $NF}')
+
+aws logs filter-log-events \
+  --log-group-name "/aws/sagemaker/Clusters/<NAME>/${CLUSTER_ID}" \
+  --filter-pattern '"NCCL WARN"' \
+  --start-time $(($(date +%s) - 7200))000 \
+  --region <R> \
+  --query 'events[*].[timestamp,logStreamName,message]' --output table
+```
+
+---
+
+## 5. NCCL environment variable reference
+
+### Required
+
+| Variable      | Value                        | Purpose             |
+| ------------- | ---------------------------- | ------------------- |
+| `MASTER_ADDR` | IP or hostname of rank-0 pod | Rendezvous endpoint |
+| `MASTER_PORT` | `29500`                      | Rendezvous port     |
+| `WORLD_SIZE`  | `pods × GPUs_per_pod`        | Total process count |
+| `RANK`        | `0` to `WORLD_SIZE-1`        | Global rank         |
+| `LOCAL_RANK`  | `0` to `GPUs_per_pod-1`      | Local rank          |
+
+### EFA (p4d / p5 / p3dn)
+
+| Variable                 | Value                                     | Purpose                                |
+| ------------------------ | ----------------------------------------- | -------------------------------------- |
+| `NCCL_SOCKET_IFNAME`     | `^lo,docker,efa,veth,virbr`               | Exclude non-VPC interfaces             |
+| `FI_PROVIDER`            | `efa`                                     | Use EFA libfabric provider             |
+| `FI_EFA_USE_DEVICE_RDMA` | `1`                                       | Enable EFA RDMA (required for full bw) |
+| `FI_EFA_FORK_SAFE`       | `1`                                       | Required with Python multiprocessing   |
+| `NCCL_NET_PLUGIN`        | `/opt/amazon/ofi-nccl/lib/libnccl-net.so` | Explicit OFI plugin path               |
+
+### Collective-op timeout (PyTorch)
+
+`NCCL_TIMEOUT` is **not** a standard NCCL or PyTorch env var — some launchers (DeepSpeed, AWS samples) wrap it, but setting it alone has no effect in pure PyTorch. Control the collective timeout via `init_process_group` and the `TORCH_*` env vars:
+
+```python
+# In training code — replaces any NCCL_TIMEOUT env var:
+import datetime, torch.distributed as dist
+dist.init_process_group("nccl", timeout=datetime.timedelta(seconds=1800))
+```
+
+```bash
+# Surfaces hangs as Python exceptions instead of silent waits:
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
+export TORCH_NCCL_BLOCKING_WAIT=1   # debug only — has perf cost at scale
+```
+
+### Performance tuning
+
+| Variable                  | Value                 | Purpose                                                                                                                                              |
+| ------------------------- | --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `NCCL_DEBUG`              | `WARN`                | Production-safe logging. `INFO` / `TRACE` add runtime overhead; enable only for debug                                                                |
+| `NCCL_BUFFSIZE`           | bytes (power-of-2)    | Collective-op buffer size. NCCL default is `4194304` (4 MiB). Tune only after baseline measurement, and align to the NCCL user guide recommendations |
+| `NCCL_P2P_LEVEL`          | `NVL` / `PIX` / other | `NVL` = P2P only over NVLink; `PIX` = same PCI switch. See the NCCL user guide for the full LOC/NVL/PIX/PXB/PHB/SYS ladder                           |
+| `TORCH_DISTRIBUTED_DEBUG` | `DETAIL`              | PyTorch detailed distributed debug (dev only)                                                                                                        |
+| `NCCL_CUMEM_HOST_ENABLE`  | `0` / `1`             | Default flipped to `1` in NCCL 2.24 when CUDA driver ≥ 12.6 and runtime ≥ 12.2; set `0` to work around NUMA cuMem issues on older stacks             |
+| `NCCL_IB_DISABLE`         | `1`                   | Disable InfiniBand verbs; forces IP-socket transport on non-IB/non-EFA clusters                                                                      |
+
+### EFA network-card counts per instance type
+
+Used to populate `vpc.amazonaws.com/efa` requests in K8s pod specs. The canonical EC2 EFA doc enumerates which types support EFA but doesn't always state the per-instance card count; counts below are taken from authoritative AWS sources where available. Always count with `ls /dev/infiniband/uverbs* | wc -l` on a live node and adjust if your build differs.
+
+| Instance type        | EFA adapters | Aggregate bandwidth |
+| -------------------- | ------------ | ------------------- |
+| `p4d.24xlarge`       | 4            | 400 Gbps            |
+| `p5.48xlarge`        | 32           | 3200 Gbps           |
+| `p5e.48xlarge`       | 32           | 3200 Gbps           |
+| `p5en.48xlarge`      | 16           | 3200 Gbps           |
+| `p6-b200.48xlarge`   | 8            | 3200 Gbps           |
+| `p6-b300.48xlarge`   | 17           | 6400 Gbps           |
+| `p6e-gb200.36xlarge` | 17           | 1600 Gbps EFA       |
+
+For other types in the EFA-supported list (e.g. `p4de.24xlarge`, `p5.4xlarge`, `trn1.32xlarge`, `trn1n.32xlarge`, `trn2.48xlarge`) — check the current EC2 instance-types doc and confirm with `ls /dev/infiniband/uverbs* | wc -l` on the node before pinning a value.
+
+### K8s pod spec (EFA-enabled)
+
+```yaml
+env:
+- { name: MASTER_ADDR,            value: "my-job-svc.my-ns.svc.cluster.local" }
+- { name: MASTER_PORT,            value: "29500" }
+- { name: WORLD_SIZE,             value: "16" }        # 2 nodes × 8 GPUs
+- { name: NCCL_SOCKET_IFNAME,     value: "^lo,docker,efa,veth,virbr" }
+- { name: FI_PROVIDER,            value: "efa" }
+- { name: FI_EFA_USE_DEVICE_RDMA, value: "1" }
+- { name: FI_EFA_FORK_SAFE,       value: "1" }
+- { name: NCCL_DEBUG,             value: "WARN" }
+# Set PyTorch collective timeout via init_process_group(timeout=1800s) in training code
+# (NCCL_TIMEOUT env var is a non-standard convention — not read by NCCL or PyTorch directly)
+resources:
+  limits:
+    nvidia.com/gpu: 8
+    vpc.amazonaws.com/efa: <N>   # match the EFA-adapter count for the instance type (table above)
+  requests:
+    nvidia.com/gpu: 8
+    vpc.amazonaws.com/efa: <N>
+volumes:        # pod-level field (spec.volumes)
+- { name: dshm, emptyDir: { medium: Memory, sizeLimit: "10Gi" } }
+volumeMounts:   # container-level field, alongside env/resources
+- { name: dshm, mountPath: /dev/shm }
+```
+
+---
+
+## 6. HyperPod node health labels (EKS)
+
+| Label                                              | Value                              | Meaning                                                                            |
+| -------------------------------------------------- | ---------------------------------- | ---------------------------------------------------------------------------------- |
+| `sagemaker.amazonaws.com/node-health-status`       | `Schedulable`                      | Healthy, accepts pods                                                              |
+|                                                    | `Unschedulable`                    | Node is running deep health checks (~2 h stress test); not available for workloads |
+|                                                    | `UnschedulablePendingReplacement`  | Failed health check — will be replaced                                             |
+|                                                    | `UnschedulablePendingReboot`       | Rebooting to re-run checks                                                         |
+| `sagemaker.amazonaws.com/deep-health-check-status` | `Passed` / `Failed` / `InProgress` | Deep-health-check outcome                                                          |
+| `sagemaker.amazonaws.com/fault-types`              | (value)                            | High-level fault category (plural label key)                                       |
+| `sagemaker.amazonaws.com/fault-reasons`            | (value)                            | Detailed fault reason (plural label key)                                           |
+
+HMA also writes a `sagemaker.amazonaws.com/fault-details` annotation on the node with the full JSON (`timestamp`, `type`, `reason`, `message`) — see the node-debugger skill § F.
+
+**NodeRecovery modes** (cluster-level setting): `Automatic` (replace failed nodes) or `None` (manual). Toggle via `update-cluster` — fetch the current cluster spec first (`describe-cluster`), change only the top-level `NodeRecovery` field, push back.
+
+---
+
+## 7. Slurm — NCCL-specific operations
+
+Diagnose (read-only):
+
+```bash
+sinfo -o "%10N %10T %10C %30E" --noheader
+squeue -o "%10i %20j %8T %12R %N" --noheader
+scontrol show node <NODE> | grep Reason
+```
+
+### Suggested command — resume a DRAINING node (run this yourself)
+
+**Preconditions:** the original drain reason no longer applies (the underlying issue — straggler bandwidth, hardware fault, RemoveIPC, etc. — has been investigated and resolved); the customer accepts that pending jobs may schedule onto this node immediately; you are running on the Slurm controller via SSM.
+
+**Command:**
+
+```bash
+scontrol update nodename=<NODE> state=resume
+```
+
+**Blast radius:** node returns to the idle pool. Reversible by setting `state=drain` again. If the original cause is unfixed, the node will likely re-fail; resume only after a clean diagnostic.
+
+### Suggested command — disable RemoveIPC for NCCL persistence (run this yourself)
+
+**Preconditions:** NCCL job is terminating with "unlink shared memory" or `/dev/shm/nccl-*` disappearing mid-training; confirmed that `RemoveIPC=yes` is set in `/etc/systemd/logind.conf`; node is quiescent or a brief `systemd-logind` restart is acceptable.
+
+**Command:**
+
+```bash
+grep RemoveIPC /etc/systemd/logind.conf   # diagnose
+echo "RemoveIPC=no" | sudo tee -a /etc/systemd/logind.conf >/dev/null
+sudo systemctl restart systemd-logind
+```
+
+**Blast radius:** persistent change to the node's systemd configuration — logs out anyone in a systemd user session during the restart. Change survives reboot. For new nodes, add the same commands to the lifecycle script so the setting persists across replacements.
+
+### Slurm prolog for NCCL env
+
+```bash
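+#!/bin/bash
+# /etc/slurm/task_prolog.sh — register as TaskProlog= in slurm.conf. Slurm adds each "export NAME=value" line printed on stdout to the task environment; exports made inside a Prolog= script never reach the job.
+echo "export NCCL_SOCKET_IFNAME=^lo,docker,efa,veth,virbr"
+echo "export FI_PROVIDER=efa"
+echo "export FI_EFA_USE_DEVICE_RDMA=1"
+# Collective timeout is set in training code: init_process_group(timeout=timedelta(seconds=1800))
+# TaskProlog runs as the job user, so keep this root-only step in the root-run Prolog= script: mount -o remount,size=10G /dev/shm 2>/dev/null || true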
+```
+
+---
+
+## 8. NCCL-specific remediations
+
+### Security group self-reference
+
+**Detected when:** `[FAIL] SG sg-xxx missing inbound/outbound self-reference` — NCCL rendezvous or EFA RDMA blocked.
+
+**Root cause:** EFA requires the SG to reference itself with `AllTraffic (-1)` on both ingress and egress. Without this, NCCL packets between nodes are dropped.
+
+### Suggested command — apply self-ref to every cluster SG (run this yourself)
+
+**Preconditions:** the rule check (e.g. `nccl-diagnose.sh` Check 4 or `hyperpod-node-debugger`'s `check-efa-sg.sh`) reports `[FAIL]` on inbound or outbound self-ref for `<SG>`; `<SG>` is one of the security groups attached to the HyperPod cluster (`describe-cluster → VpcConfig.SecurityGroupIds`); apply once **per SG** if multiple are attached; for IaC-managed SGs, see the operating-policy IaC note before running directly. Per the HyperPod prerequisites doc, do **not** add a `0.0.0.0/0` outbound rule on the EFA SG.
+
+**Command:**
+
+```bash
+# Inbound self-ref (NCCL rendezvous)
+aws ec2 authorize-security-group-ingress --group-id <SG> --region <R> \
+  --ip-permissions '[{"IpProtocol":"-1","UserIdGroupPairs":[{"GroupId":"<SG>"}]}]'
+
+# Outbound self-ref (EFA RDMA)
+aws ec2 authorize-security-group-egress --group-id <SG> --region <R> \
+  --ip-permissions '[{"IpProtocol":"-1","UserIdGroupPairs":[{"GroupId":"<SG>"}]}]'
+```
+
+**Blast radius:** opens all protocols between instances that share this SG (intended scope for intra-cluster EFA / NCCL). Safe to re-run: the call is rejected with `InvalidPermission.Duplicate` when the rule already exists, and nothing is changed. Reversible with `revoke-security-group-ingress`/`revoke-security-group-egress` using the same `--ip-permissions` payload.
+
+### NetworkPolicy blocking NCCL
+
+**Detected when:** `[WARN] NetworkPolicies found in <ns>` + a `[FAIL]` indicating blocked inter-pod NCCL traffic.
+
+**Before deleting any NetworkPolicy, read it** — it may be intentional tenant isolation or compliance-required. Confirm with the customer.
+
+```bash
+kubectl get networkpolicy -n <NS> -o yaml
+```
+
+Allow-all intra-namespace policy for NCCL training namespaces:
+
+```yaml
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-nccl-intranamespace
+  namespace: <NS>
+spec:
+  podSelector: {}
+  policyTypes: ["Ingress", "Egress"]
+  ingress:
+    - from:
+        - namespaceSelector:
+            matchLabels: { kubernetes.io/metadata.name: <NS> }
+  egress:
+    - to:
+        - namespaceSelector:
+            matchLabels: { kubernetes.io/metadata.name: <NS> }
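+    - ports:  # DNS to any destination; TCP 53 is needed for truncated/large responses
+        [{ port: 53, protocol: UDP }, { port: 53, protocol: TCP }]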
+```
+
+### Suggested command — delete a blocking NetworkPolicy (run this yourself)
+
+**Preconditions:** the policy has been read (`kubectl get networkpolicy <NAME> -n <NS> -o yaml`) and confirmed not to be intentional tenant isolation or compliance-required; customer has explicitly approved removal; a replacement allow-list policy (if needed) is already applied.
+
+**Command:**
+
+```bash
+kubectl delete networkpolicy <NAME> -n <NS>
+```
+
+**Blast radius:** changes default-deny traffic rules for every pod matched by the policy's `podSelector` in namespace `<NS>`. Cannot be reverted by a single command — the original YAML must be re-applied. Misdiagnosis can expose production traffic.
+
+### Node reboot / replacement for GPU faults
+
+Ordering and commands are in node-debugger: [references/node-diagnostics-detail.md § F](../../hyperpod-node-debugger/references/node-diagnostics-detail.md). Reboot first (clears transient GPU/EFA faults, preserves data); replace only if reboot doesn't clear the fault.
diff --git a/plugins/sagemaker-ai/skills/hyperpod-nccl/references/performance-testing.md b/plugins/sagemaker-ai/skills/hyperpod-nccl/references/performance-testing.md
new file mode 100644
index 00000000..13751d6e
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-nccl/references/performance-testing.md
@@ -0,0 +1,247 @@
+# NCCL Performance Testing & Straggler Detection
+
+Measure NCCL bandwidth and identify slow nodes.
+
+---
+
+## Install nccl-tests (once per cluster)
+
+```bash
+# On each compute node (add to lifecycle script for persistence). Source: NVIDIA nccl-tests.
+cd /opt && git clone <nccl-tests-source> nccl-tests
+cd /opt/nccl-tests
+make MPI=1 MPI_HOME=/usr/local/mpi NCCL_HOME=/usr/local/nccl CUDA_HOME=/usr/local/cuda
+# Binary: /opt/nccl-tests/build/all_reduce_perf
+```
+
+---
+
+## Single-Node Baseline Test
+
+Run first to confirm the node itself is healthy before multi-node tests.
+
+```bash
+# Single-GPU test (quick sanity check):
+/opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
+
+# All-GPU test (p4d: 8 GPUs, p5: 8 GPUs):
+/opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 8
+
+# Expected output column headers:
+# size  count  type  redop  root  time  algbw  busbw  error  time  algbw  busbw
+```
+
+**How to identify stragglers:** there is no single published GB/s threshold that applies across EFA generations, NCCL versions, and test message sizes. Run `all_reduce_perf` on **every** node against a known-good peer and compare the `busbw` (bus bandwidth) column. The outliers in the bottom quartile at the same message size are the stragglers. For reference workflow and exact test command, see the AWS EC2 EFA + NCCL getting-started doc. Also compare against the results of a recent known-good run on the same instance type and NCCL version — hardware generations differ widely and a static table rots quickly.
+
+---
+
+## Multi-Node AllReduce Test
+
+```bash
+# With MPI (from head node; <HOSTFILE> is an MPI hostfile, one "hostname slots=N" line per node):
+mpirun -np <TOTAL_RANKS> \
+  --hostfile <HOSTFILE> \
+  -N <RANKS_PER_NODE> \
+  -x FI_PROVIDER=efa \
+  -x FI_EFA_USE_DEVICE_RDMA=1 \
+  -x NCCL_SOCKET_IFNAME=^lo,docker,efa,veth \
+  -x NCCL_DEBUG=WARN \
+  /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
+
+# With Slurm:
+srun --nodes=4 --ntasks-per-node=8 \
+  /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
+
+# With kubectl (EKS, 2 nodes, 8 GPUs each):
+# Deploy as a K8s Job with 2 pods, each requesting 8 GPUs.
+# Use mpirun inside the container, or the Kubeflow MPI Operator.
+kubectl exec -n <NS> <POD> -- mpirun -np 16 -N 8 \
+  --hostfile <HOSTFILE> \
+  -x FI_PROVIDER=efa -x FI_EFA_USE_DEVICE_RDMA=1 \
+  /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
+```
+
+---
+
+## Pairwise Bandwidth Test (identify slow pairs)
+
+```bash
+# Test each node pair individually to find the outlier:
+# From node A → node B:
+fi_ping -p efa -I 100 <NODE_B_IP>
+
+# From node B → node A:
+fi_ping -p efa -I 100 <NODE_A_IP>
+
+# Automate across all pairs (run on head node):
+for node in $(scontrol show hostnames $SLURM_JOB_NODELIST); do
+    echo -n "Testing $node: "
+    fi_ping -p efa -I 10 "$node" 2>/dev/null | tail -1 || echo "FAILED"
+done
+```
+
+**Interpreting fi_ping output:**
+
+- Normal: < 5 microseconds latency, consistent
+- Straggler: > 50 microseconds, or high variance across runs
+
+---
+
+## NCCL_DEBUG_FILE Analysis
+
+```bash
+# Enable per-rank debug files:
+export NCCL_DEBUG=INFO
+export NCCL_DEBUG_FILE=/tmp/nccl_rank${RANK}.log
+
+# After training (or timeout), check which rank was slow:
+# Look for the last "AllReduce" timestamp before the timeout:
+grep -h "AllReduce\|ring\|timeout" /tmp/nccl_rank*.log | sort -k1,1 | tail -30
+
+# Compare timestamps across ranks — the one furthest behind is the straggler:
+for f in /tmp/nccl_rank*.log; do
+    echo -n "$f: last line timestamp = "
+    tail -1 "$f" | awk '{print $1, $2}'
+done
+```
+
+---
+
+## Collective-op timeout scaling
+
+PyTorch's `init_process_group` default timeout for NCCL is **10 minutes (600 s)**. Too low for large clusters — a slow rank or straggler can blow past 10 min during warm-up or a large all-gather.
+
+Scale up via the `timeout` argument (NOT via a `NCCL_TIMEOUT` env var — that is not a standard NCCL or PyTorch variable):
+
+```python
+import datetime
+import os
+import torch.distributed as dist
+# nodes * 5 + 600 is a simple heuristic — tune against your actual step time:
+nodes = int(os.environ.get("WORLD_SIZE", "1")) // 8   # GPUs per node
+timeout_s = nodes * 5 + 600
+
+dist.init_process_group(
+    backend="nccl",
+    timeout=datetime.timedelta(seconds=timeout_s),
+)
+```
+
+Field-observed starting points (not AWS- or PyTorch-prescribed; tune from your actual step time and slowest collective):
+
+| Cluster size  | Starting point                   |
+| ------------- | -------------------------------- |
+| 2–16 GPUs     | 600 s (PyTorch default for NCCL) |
+| 17–64 GPUs    | 1200 s                           |
+| 65–256 GPUs   | 1800 s                           |
+| 257–1024 GPUs | 3600 s                           |
+| 1024+ GPUs    | 7200 s                           |
+
+To surface hangs as Python exceptions instead of silently waiting, also set:
+
+```bash
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
+export TORCH_NCCL_BLOCKING_WAIT=1   # for debugging; has a perf cost at scale
+```
+
+---
+
+## NCCL_DEBUG=INFO Performance Impact
+
+**Never leave `NCCL_DEBUG=INFO` in production.** The NCCL env-var reference describes `TRACE` as printing "replayable trace information on every call" but does not publish overhead percentages. Field experience on HyperPod is:
+
+| Setting                     | Notes                                                                   |
+| --------------------------- | ----------------------------------------------------------------------- |
+| `NCCL_DEBUG=WARN` (default) | Negligible overhead                                                     |
+| `NCCL_DEBUG=INFO`           | Measurable runtime overhead and verbose logs — disable in production    |
+| `NCCL_DEBUG=TRACE`          | Per-call trace; very large log volume, only for short debugging windows |
+
+Use `INFO` / `TRACE` only for debugging, then set back to `WARN`. Measure your own overhead before and after if it matters for the workload.
+
+---
+
+## EFA Performance Settings
+
+```bash
+# Full EFA performance configuration:
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1    # GPU Direct RDMA
+export NCCL_PROTO=Simple           # large-message protocol (valid: LL, LL128, Simple)
+export NCCL_SOCKET_IFNAME=^lo,docker,efa,veth
+# Collective timeout goes in training code: init_process_group(timeout=timedelta(seconds=1800))
+
+# Optional tuning for very large jobs:
+export FI_EFA_FORK_SAFE=1            # safe for multiprocessing
+export FI_EFA_ENABLE_SHM_TRANSFER=1  # intra-node shared memory
+
+# Do NOT set in production:
+# NCCL_DEBUG=INFO  (verbose; runtime overhead — disable in production)
+# CUDA_LAUNCH_BLOCKING=1  (disables GPU/CPU overlap, very slow)
+```
+
+---
+
+## Straggler Node — Detection and Replacement
+
+### Detection workflow
+
+1. **Run nccl-tests** across all nodes — compare `busbw` values at the same message size
+2. **Check nvidia-smi nvlink -e** for NVLink error counters
+3. **Check dmesg** for XID errors, hardware failures
+4. **Compare fi_ping latency** pairwise — outlier has degraded EFA port
+
+### Replacement workflow
+
+Diagnose (read-only):
+
+```bash
+# Identify the bad node's instance ID:
+kubectl get node <NODE_NAME> -o jsonpath='{.spec.providerID}' | cut -d'/' -f5
+# OR for Slurm — list-cluster-nodes does NOT return PrivateDnsHostname (only describe-cluster-node does).
+# Two-step: list candidate IDs, then describe each one until DNS matches the Slurm name.
+SLURM_NODE="<SLURM_NODE_NAME>"
+for IID in $(aws sagemaker list-cluster-nodes --cluster-name <C> --region <R> \
+               --query 'ClusterNodeSummaries[?InstanceStatus.Status==`Running`].InstanceId' --output text); do
+  DNS=$(aws sagemaker describe-cluster-node --cluster-name <C> --region <R> --node-id "$IID" \
+          --query 'NodeDetails.PrivateDnsHostname' --output text 2>/dev/null)
+  case "$DNS" in "$SLURM_NODE."*) echo "$SLURM_NODE → $IID"; break ;; esac
+done
+```
+
+### Suggested command — drain the straggler node before reboot/replace (run this yourself)
+
+**Preconditions:** straggler behavior confirmed across **multiple** nccl-tests runs (single-run outliers can be transient — don't drain on one bad sample); customer accepts that pods using `emptyDir` volumes on this node will lose that data when evicted (EKS path); on Slurm, customer accepts that no new jobs will be scheduled to the node until `state=resume` runs after recovery; drain is preparation for reboot/replace, not a fix on its own.
+
+**Command:**
+
+```bash
+# EKS — cordon prevents new pods; drain evicts existing pods (emptyDir data lost).
+kubectl cordon <NODE_NAME>
+kubectl drain <NODE_NAME> --ignore-daemonsets --delete-emptydir-data
+
+# Slurm — on the controller via SSM; running jobs continue until they finish.
+scontrol update nodename=<NODE> state=drain reason="low-bandwidth-$(date +%Y%m%d)"
+```
+
+**Blast radius:** EKS — `--delete-emptydir-data` discards `emptyDir` scratch on this node; pods are rescheduled elsewhere if capacity exists, otherwise stay Pending. Slurm — running jobs finish on the node; pending jobs route around it. Drain is reversible (`kubectl uncordon` / `scontrol update state=resume`) only if you decide not to proceed with reboot/replace.
+
+See [hyperpod-cluster-debugger § G.2](../../hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md#g2-manual-replacement) for the reboot-before-replace ordering.
+
+### Suggested command — replace the node (run this yourself, only after reboot did not clear the fault)
+
+**Preconditions:** reboot was tried first and did not clear the fault (see [hyperpod-cluster-debugger § G.2](../../hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md#g2-manual-replacement)). Data on root + secondary volumes is backed up. Not supported on Slurm controller nodes. `NodeIds` batch: 1-25 per call.
+
+**Command:**
+
+```bash
+aws sagemaker batch-replace-cluster-nodes \
+  --cluster-name <C> --region <R> \
+  --node-ids '["<INSTANCE_ID>"]'
+
+# Monitor replacement completion (read-only):
+watch -n 10 "aws sagemaker list-cluster-nodes --cluster-name <C> --region <R> \
+  --query 'ClusterNodeSummaries[*].{ID:InstanceId,State:InstanceStatus.Status}' \
+  --output table"
+```
+
+**Blast radius:** destroys root + secondary volumes on the replaced instance — all data permanently lost. New hardware is provisioned with the same AMI.
diff --git a/plugins/sagemaker-ai/skills/hyperpod-nccl/scripts/nccl-diagnose.sh b/plugins/sagemaker-ai/skills/hyperpod-nccl/scripts/nccl-diagnose.sh
new file mode 100755
index 00000000..cc67bf2f
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-nccl/scripts/nccl-diagnose.sh
@@ -0,0 +1,2563 @@
+#!/usr/bin/env bash
+# nccl-diagnose.sh — read-only NCCL diagnostic for SageMaker HyperPod.
+# Supports both EKS and Slurm orchestrators (auto-detected).
+# Hardware checks run on cluster nodes via SSM, not locally.
+#
+# This script never modifies cluster state. It collects diagnostic signals and
+# attaches a reference pointer (→ references/<file>.md § <section>) to each
+# finding. The calling skill (hyperpod-nccl) reads this output alongside the
+# referenced sections to guide the user through the remediation.
+#
+# USAGE:
+#   bash nccl-diagnose.sh [OPTIONS]
+#
+# OPTIONS:
+#   --cluster       <name>        HyperPod cluster name (required)
+#   --region        <region>      AWS region (required)
+#   --orchestrator  <eks|slurm>   Force orchestrator (default: auto-detect)
+#   --namespace     <ns>          [EKS] K8s namespace to scope (default: all)
+#   --job           <job-name>    [EKS] Specific job to diagnose
+#   --node          <instance-id> Specific node instance ID for SSM checks
+#   --sample-nodes  <N>           How many nodes to SSM into (default: 3)
+#   --verbose                     Show extra debug output
+#   --no-color                    Disable ANSI colors (also auto-off when not a TTY)
+#   --help                        Show this help
+#
+# ARCHITECTURE:
+#   LOCAL checks (run on this machine):
+#     - AWS API calls: cluster status, SG rules, cluster events, node list
+#     - kubectl calls: K8s node readiness, pod status, logs, NetworkPolicies
+#   ON-NODE checks (run via SSM on actual cluster compute nodes):
+#     - GPU health (nvidia-smi, XID errors, NVLink)
+#     - EFA / libfabric availability
+#     - NCCL library presence
+#     - Network interfaces and MTU
+#     - Memory / /dev/shm / memlock limits
+#     - Active training processes
+#     - dmesg hardware errors
+#   SCALE strategy for 100s of nodes:
+#     - AWS API checks cover ALL nodes cheaply via list-cluster-nodes
+#     - K8s checks cover ALL nodes cheaply via kubectl
+#     - SSM hardware checks sample --sample-nodes (default: 3) compute nodes
+#     - CloudWatch log analysis covers ALL nodes at scale (no per-node SSM needed)
+#
+# EXAMPLES:
+#   bash nccl-diagnose.sh --cluster my-cluster --region us-east-1
+#   bash nccl-diagnose.sh --cluster my-cluster --region us-east-1 \
+#       --namespace nccl-test --job my-job --sample-nodes 5
+#   bash nccl-diagnose.sh --cluster my-cluster --region us-east-1 \
+#       --node i-0123456789abcdef0
+#
+# EXIT CODES:
+#   0  No critical (P0/P1) issues; P2 informational findings are allowed.
+#   1  One or more critical issues, or a fatal prerequisite is missing.
+#   2  Invalid argument.
+
+set -euo pipefail
+
+_TEMP_FILES=()
+cleanup() {
+    # Only call rm when something was registered: expanding an empty array trips set -u on older bash (4.2 on AL2).
+    if (( ${#_TEMP_FILES[@]} > 0 )); then rm -f "${_TEMP_FILES[@]}" 2>/dev/null || true; fi
+}
+trap cleanup EXIT
+
+# Auto-disable colors when stdout is not a TTY or TERM=dumb (agent-piped output).
+if [ -t 1 ] && [ "${TERM:-}" != "dumb" ]; then
+    RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'   # stored as literal text; expanded by `echo -e` in the helpers below
+    BLUE='\033[0;34m'; BOLD='\033[1m'; RESET='\033[0m'
+else
+    RED=''; GREEN=''; YELLOW=''; BLUE=''; BOLD=''; RESET=''   # empty strings make the helpers print plain text
+fi
+
+info()    { echo -e "${BLUE}[INFO]${RESET} $*"; }     # informational line on stdout
+success() { echo -e "${GREEN}[PASS]${RESET} $*"; }    # passing check on stdout
+warn()    { echo -e "${YELLOW}[WARN]${RESET} $*"; }   # warning on stdout (not stderr)
+error()   { echo -e "${RED}[FAIL]${RESET} $*"; }      # failed check on stdout (not stderr); does not exit
+header()  { echo -e "\n${BOLD}${BLUE}═══════════════════════════════════════════════${RESET}"   # banner: rule, title, rule
+            echo -e "${BOLD}${BLUE}  $*${RESET}"
+            echo -e "${BOLD}${BLUE}═══════════════════════════════════════════════${RESET}"; }
+section() { echo -e "\n${BOLD}-- $* --${RESET}"; }    # sub-heading inside a check
+debug()   { $VERBOSE && echo -e "[DEBUG] $*" >&2 || true; }   # stderr, only when VERBOSE is true; always returns 0
+
+CLUSTER_NAME=""                      # set by --cluster (name or ARN)
+REGION="${AWS_DEFAULT_REGION:-}"     # set by --region; falls back to AWS_DEFAULT_REGION
+ORCHESTRATOR=""                      # eks|slurm; set by --orchestrator or by detect_orchestrator
+NAMESPACE=""                         # set by --namespace
+JOB_NAME=""                          # set by --job
+NODE_ID=""                           # set by --node (EC2 instance ID)
+SAMPLE_NODES=3                       # set by --sample-nodes; validated and capped at 50 after parsing
+VERBOSE=false                        # set by --verbose; read by debug()
+ISSUES_FOUND=0                       # incremented by the checks when they record a finding
+ISSUE_DETAILS=()                     # entries have the form "<priority>|<message>"
+add_issue_detail() {                 # $1 = message, $2 = priority (defaults to P1)
+    local priority="${2:-P1}"
+    ISSUE_DETAILS+=("${priority}|$1")
+}
+K8S_CONNECTED=false                  # set by check_prerequisites once kubectl access is verified
+SSM_CLUSTER_ID=""                    # NOTE(review): populated outside this section — presumably the SSM target cluster ID; confirm
+SSM_NODES=()                         # NOTE(review): populated outside this section — presumably the nodes sampled via SSM; confirm
+
+usage() {
+    # Print the header comment from "# USAGE:" up to the first blank line (POSIX sed only), then exit $1: 0 for --help, 2 for bad args.
+    sed -n -e '/^# USAGE:/,/^$/!d' -e '/^$/q' -e 's/^# \{0,1\}//p' "$0"
+    exit "${1:-0}"
+}
+
+while [[ $# -gt 0 ]]; do   # Parse CLI flags; each value is validated before it is stored, and bad input exits 2.
+    case "$1" in
+        --cluster)       [[ $# -lt 2 ]] && { error "--cluster needs a value"; exit 2; }
+                         [[ ! "$2" =~ ^(arn:aws[a-z-]*:sagemaker:[a-z0-9-]+:[0-9]{12}:cluster/[a-z0-9]{12}|[a-zA-Z0-9]([-a-zA-Z0-9]{0,62}))$ ]] && { error "--cluster must be a valid HyperPod cluster name or ARN (got '$2')"; exit 2; }
+                         CLUSTER_NAME="$2"; shift 2 ;;
+        --region)        [[ $# -lt 2 ]] && { error "--region needs a value"; exit 2; }
+                         [[ ! "$2" =~ ^[a-z]{2}(-[a-z]+)+-[0-9]+$ ]] && { error "--region must be a valid AWS region (got '$2')"; exit 2; }   # (-[a-z]+)+ also admits multi-part names such as us-gov-west-1
+                         REGION="$2"; shift 2 ;;
+        --orchestrator)  [[ $# -lt 2 ]] && { error "--orchestrator needs a value"; exit 2; }
+                         [[ "$2" != "eks" && "$2" != "slurm" ]] && { error "--orchestrator must be 'eks' or 'slurm' (got '$2')"; exit 2; }
+                         ORCHESTRATOR="$2"; shift 2 ;;
+        --namespace)     [[ $# -lt 2 ]] && { error "--namespace needs a value"; exit 2; }
+                         [[ ! "$2" =~ ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ ]] && { error "--namespace must be a valid K8s namespace (got '$2')"; exit 2; }
+                         NAMESPACE="$2"; shift 2 ;;
+        --job)           [[ $# -lt 2 ]] && { error "--job needs a value"; exit 2; }
+                         [[ ! "$2" =~ ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ ]] && { error "--job must be a valid K8s name (got '$2')"; exit 2; }
+                         JOB_NAME="$2"; shift 2 ;;
+        --node)          [[ $# -lt 2 ]] && { error "--node needs a value"; exit 2; }
+                         [[ ! "$2" =~ ^i-[0-9a-f]{8,17}$ ]] && { error "--node must be an EC2 instance ID (got '$2')"; exit 2; }
+                         NODE_ID="$2"; shift 2 ;;
+        --sample-nodes)  [[ $# -lt 2 ]] && { error "--sample-nodes needs a value"; exit 2; }; SAMPLE_NODES="$2"; shift 2 ;;
+        --verbose)       VERBOSE=true; shift ;;
+        --no-color)      RED=''; GREEN=''; YELLOW=''; BLUE=''; BOLD=''; RESET=''; shift ;;
+        --help|-h)       usage 0 ;;
+        *) echo "Unknown option: $1" >&2; usage 2 ;;
+    esac
+done
+
+[[ -z "$CLUSTER_NAME" ]] && { error "Missing required: --cluster"; exit 2; }
+[[ -z "$REGION" ]] && { error "--region is required (or set AWS_DEFAULT_REGION before running)"; exit 2; }
+# Argument problems exit 2 ("Invalid argument" in the EXIT CODES header); 1 is reserved for diagnostic findings and missing prerequisites.
+if ! [[ "$SAMPLE_NODES" =~ ^[0-9]+$ ]] || [[ "$SAMPLE_NODES" -lt 1 ]]; then
+    error "--sample-nodes must be a positive integer (got: '$SAMPLE_NODES')"
+    exit 2
+fi
+if [[ "$SAMPLE_NODES" -gt 50 ]]; then
+    warn "--sample-nodes=$SAMPLE_NODES is very high (max recommended: 50). Capping at 50."
+    SAMPLE_NODES=50
+fi
+
+# Paginate a sagemaker list-* call (reads globals CLUSTER_NAME and REGION). Usage:
+#   sagemaker_list_paginated list-cluster-nodes ClusterNodeSummaries [extra args...]
+# Prints {"<SummaryKey>": [...]} on stdout. Caps at 20 000 items; emits a stderr
+# warning if truncated. A failed call (e.g. AccessDenied) ends paging; a first-page failure yields [].
+sagemaker_list_paginated() {
+    local api="$1" summary_key="$2"
+    shift 2
+    local merged='[]' token='' page_json combined i=0
+    local max_pages=200
+    while (( i < max_pages )); do
+        local page_args=(--cluster-name "$CLUSTER_NAME" --region "$REGION" \
+                         --max-results 100 --cli-read-timeout 30 --output json "$@")
+        # NextToken is opaque: pass it through verbatim as one quoted argv element.
+        # Pattern-checking it could silently truncate the listing on a valid token.
+        if [[ -n "$token" ]]; then
+            page_args+=(--next-token "$token")
+        fi
+        # Keep stderr out of the captured JSON so CLI warnings cannot break parsing.
+        # Any API failure (AccessDenied included) exits non-zero and ends pagination
+        # with whatever was merged so far. A successful page is deliberately NOT
+        # grepped for error keywords: node status messages and event descriptions
+        # can legitimately contain text such as "AccessDenied", and dropping that
+        # page would hide exactly the failing resources being diagnosed.
+        page_json=$(aws sagemaker "$api" "${page_args[@]}" 2>/dev/null) || break
+        # Merge via stdin (NUL-delimited) to avoid ARG_MAX truncation at ~500
+        # entries. summary_key stays in argv since it's small.
+        combined=$(printf '%s\0%s' "$merged" "$page_json" | python3 -c "
+import sys, json
+blob = sys.stdin.buffer.read()
+try:
+    a, b = blob.split(b'\0', 1)
+    prev = json.loads(a)
+    page = json.loads(b)
+except (json.JSONDecodeError, ValueError):
+    sys.exit(2)
+prev.extend(page.get(sys.argv[1]) or [])
+print(json.dumps(prev))
+print(page.get('NextToken') or '')
+" "$summary_key" 2>/dev/null) || break
+        merged=$(printf '%s\n' "$combined" | sed -n '1p')
+        token=$(printf '%s\n'  "$combined" | sed -n '2p')
+        i=$((i+1))
+        [[ -z "$token" ]] && break
+    done
+    if (( i == max_pages )) && [[ -n "$token" ]]; then
+        echo "WARN: sagemaker_list_paginated($api): truncated at ${max_pages} pages (~$((max_pages*100)) items). Result may be incomplete for very large clusters." >&2
+    fi
+    printf '%s' "$merged" | python3 -c "
+import sys, json
+try:
+    print(json.dumps({sys.argv[1]: json.loads(sys.stdin.read())}))
+except json.JSONDecodeError:
+    print('{\"%s\":[]}' % sys.argv[1])
+" "$summary_key" 2>/dev/null || echo "{\"$summary_key\":[]}"
+}
+
+detect_orchestrator() {
+    # Sets the global ORCHESTRATOR (eks|slurm): --orchestrator wins, then describe-cluster, then local CLI probes.
+    if [[ -n "$ORCHESTRATOR" ]]; then
+        info "Orchestrator forced: $ORCHESTRATOR"; return
+    fi
+
+    header "Detecting Orchestrator Type"
+    local orch_type
+    orch_type=$(aws sagemaker describe-cluster \
+        --cluster-name "$CLUSTER_NAME" --region "$REGION" \
+        --query 'Orchestrator' --output text 2>/dev/null || echo "")
+    if echo "$orch_type" | grep -qiE "eks|kubernetes"; then
+        ORCHESTRATOR="eks"
+    elif echo "$orch_type" | grep -qi "slurm"; then
+        ORCHESTRATOR="slurm"
+    elif kubectl cluster-info &>/dev/null; then
+        ORCHESTRATOR="eks"; info "Auto-detected: EKS (kubectl responds)"
+    elif command -v sinfo &>/dev/null && sinfo &>/dev/null; then
+        ORCHESTRATOR="slurm"; info "Auto-detected: Slurm (sinfo responds)"
+    elif command -v squeue &>/dev/null; then
+        ORCHESTRATOR="slurm"; info "Auto-detected: Slurm (squeue found)"
+    else
+        ORCHESTRATOR="eks"
+        warn "Could not auto-detect orchestrator — defaulting to 'eks'"
+        warn "Override with: --orchestrator slurm"
+    fi
+    success "Orchestrator: $(printf '%s' "$ORCHESTRATOR" | tr '[:lower:]' '[:upper:]')"   # tr instead of ${var^^}: also works on bash 3.2
+}
+
+check_prerequisites() {
+    header "Checking Prerequisites"
+    # Verifies required CLIs, AWS credentials and (EKS only) kubectl access; sets K8S_CONNECTED. Exits 1 if a tool or credentials are missing.
+    local missing=()
+    local tool_path tool t
+    for tool in aws jq python3 unbuffer; do
+        if tool_path=$(command -v "$tool" 2>/dev/null) && [[ -n "$tool_path" ]]; then
+            success "$tool: $tool_path"
+        else
+            error "$tool NOT found — required"
+            missing+=("$tool")
+        fi
+    done
+
+    if [[ "$ORCHESTRATOR" == "eks" ]]; then
+        if tool_path=$(command -v kubectl 2>/dev/null) && [[ -n "$tool_path" ]]; then
+            success "kubectl: $tool_path"
+        else
+            error "kubectl NOT found — required for EKS"
+            missing+=("kubectl")
+        fi
+    elif [[ "$ORCHESTRATOR" == "slurm" ]]; then
+        local slurm_ok=false
+        for t in sinfo squeue scontrol; do
+            command -v "$t" &>/dev/null && { success "$t found (Slurm CLI OK)"; slurm_ok=true; break; }
+        done
+        $slurm_ok || warn "Slurm CLI not found locally — will use SSM for Slurm commands"
+    fi
+
+    if [[ ${#missing[@]} -gt 0 ]]; then
+        error "Install: ${missing[*]}"
+        # unbuffer ships in the `expect` package.
+        if printf '%s\n' "${missing[@]}" | grep -qx unbuffer; then
+            error "  unbuffer: 'yum install expect' / 'apt install expect' / 'brew install expect'"
+        fi
+        exit 1
+    fi
+
+    if aws sts get-caller-identity --region "$REGION" &>/dev/null; then
+        local id
+        id=$(aws sts get-caller-identity --region "$REGION" --query 'Arn' --output text)
+        success "AWS credentials: $id"
+    else
+        error "AWS credentials invalid or expired"; exit 1
+    fi
+
+    # Inspect both stdout (node list) and stderr (error message). Auth patterns are
+    # tested before transport patterns because kubectl prefixes both kinds with
+    # "Unable to connect to the server"; the more specific auth keywords must win.
+    if [[ "$ORCHESTRATOR" == "eks" ]]; then
+        local kubectl_out kubectl_err tmpfile
+        tmpfile=$(mktemp "${TMPDIR:-/tmp}/kubectl-check.XXXXXX")   # X's must be last: BSD mktemp does not randomize a template with a suffix
+        _TEMP_FILES+=("$tmpfile")
+        kubectl_out=$(kubectl get nodes --no-headers 2>"$tmpfile" || true)
+        kubectl_err=$(cat "$tmpfile" 2>/dev/null || echo "")
+        rm -f "$tmpfile"
+
+        debug "kubectl stdout: '$kubectl_out'"
+        debug "kubectl stderr: '$kubectl_err'"
+
+        if echo "$kubectl_err" | grep -qiE \
+            "Unauthorized|forbidden|You must be logged in|getting credentials|certificate|no configuration"; then
+            error "kubectl NOT authenticated to EKS cluster"
+            error "  $(echo "$kubectl_err" | head -1)"
+            warn  "  K8s checks (2, 2b, 5, 5b, 6, 7, 9) will be SKIPPED"
+            K8S_CONNECTED=false
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "kubectl not authenticated to EKS cluster → references/operations.md § 3 SSM target format (HyperPod)" "P1"
+        elif echo "$kubectl_err" | grep -qiE \
+            "Unable to connect|connection refused|server.*refused|no such host|dial tcp|i/o timeout|context deadline exceeded|EOF"; then
+            error "kubectl cannot reach EKS API server → references/operations.md § 1 Getting cluster names (kubeconfig setup)"
+            error "  $(echo "$kubectl_err" | head -1)"
+            warn  "  K8s checks (2, 2b, 5, 5b, 6, 7, 9) will be SKIPPED — check VPN/network connectivity"
+            K8S_CONNECTED=false
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "kubectl cannot reach EKS API server → references/operations.md § 1 Getting cluster names (kubeconfig setup)" "P1"
+        elif [[ -z "$kubectl_out" ]] && [[ -z "$kubectl_err" || "$kubectl_err" == *"No resources found"* ]]; then
+            warn "kubectl returned no output — kubeconfig may point to wrong cluster"
+            warn "  → references/operations.md § 1 Getting cluster names"
+            K8S_CONNECTED=true   # Allow K8s checks — cluster may simply have no nodes yet
+        elif [[ -n "$kubectl_err" && -z "$kubectl_out" ]]; then
+            # stderr with no node list and no recognized pattern: treat as a kubectl failure.
+            error "kubectl error: $(echo "$kubectl_err" | head -1)"
+            warn  "  K8s checks (2, 2b, 5, 5b, 6, 7, 9) will be SKIPPED"
+            K8S_CONNECTED=false
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "kubectl error — K8s checks skipped → references/operations.md § 1 Getting cluster names (kubeconfig setup)" "P1"
+        else
+            local node_count
+            node_count=$(echo "$kubectl_out" | wc -l | tr -d ' ')
+            success "kubectl authenticated — $node_count node(s) visible"
+            K8S_CONNECTED=true
+        fi
+    fi
+}
+
+check_cluster_health() {
+    header "Check 1: HyperPod Cluster Health"
+    # Reports cluster status, NodeRecovery mode, per-status node counts and whether any GPU/EFA instance group exists.
+    local cluster_json
+    cluster_json=$(aws sagemaker describe-cluster \
+        --cluster-name "$CLUSTER_NAME" --region "$REGION" \
+        --output json 2>&1) || {
+        if echo "$cluster_json" | grep -qiE "ResourceNotFound|Cluster with name .* not found|ValidationException"; then
+            error "Cluster '$CLUSTER_NAME' not found in region '$REGION'"
+            echo "$cluster_json" | head -3
+            echo ""
+            echo "Available clusters in $REGION:"
+            aws sagemaker list-clusters --region "$REGION" \
+                --query 'ClusterSummaries[*].{Name:ClusterName,Status:ClusterStatus}' \
+                --output table 2>/dev/null || echo "  (unable to list clusters — check IAM)"
+            exit 1
+        fi
+        if echo "$cluster_json" | grep -qiE "AccessDenied|UnauthorizedOperation"; then
+            warn "Permission denied: sagemaker:DescribeCluster — check IAM policy"
+        fi
+        cluster_json="{}"
+    }
+
+    local cluster_state
+    cluster_state=$(echo "$cluster_json" | python3 -c \
+        "import sys,json; print(json.load(sys.stdin).get('ClusterStatus','UNKNOWN'))" 2>/dev/null \
+        || echo "UNKNOWN")
+
+    case "$cluster_state" in
+        InService)
+            success "Cluster status: $cluster_state" ;;
+        UNKNOWN|None|"")
+            warn "Cluster status: could not retrieve"
+            warn "  Ensure --cluster is the HyperPod cluster name and IAM has sagemaker:DescribeCluster" ;;
+        Creating|Updating|RollingBack|SystemUpdating)
+            warn "Cluster status: $cluster_state (operation in progress — NCCL checks may be partial)"
+            add_issue_detail "Cluster in transient state $cluster_state — rerun after it completes → hyperpod-cluster-debugger skill if it stays stuck" "P2" ;;
+        Deleting|DeleteFailed)
+            error "Cluster status: $cluster_state (cluster is being torn down)"
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "Cluster is ${cluster_state} → hyperpod-cluster-debugger skill" "P0" ;;
+        Failed|ClusterMaintenanceRollbackFailed)
+            error "Cluster status: $cluster_state (expected: InService)"
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "Cluster status ${cluster_state} → hyperpod-cluster-debugger skill" "P0" ;;
+        *)
+            warn "Cluster status: $cluster_state (unrecognized state)"
+            add_issue_detail "Unrecognized cluster state '${cluster_state}' → hyperpod-cluster-debugger skill" "P1" ;;
+    esac
+
+    # NodeRecovery — affects whether failed nodes are auto-replaced.
+    # Prefer top-level NodeRecovery (the canonical location); InstanceGroups[*].NodeRecovery
+    # is null when cluster-level setting is applied, so per-group-only reads always return 'Unknown'.
+    local node_recovery
+    node_recovery=$(echo "$cluster_json" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+top=d.get('NodeRecovery')
+if top:
+    print(top)
+else:
+    igs = d.get('InstanceGroups',[])
+    modes = sorted({ig.get('NodeRecovery') for ig in igs if ig.get('NodeRecovery')})
+    print(','.join(modes) if modes else 'Unknown')
+" 2>/dev/null || echo "Unknown")
+
+    if echo "$node_recovery" | grep -q "Automatic"; then
+        success "NodeRecovery: $node_recovery (auto-repair enabled)"
+    elif echo "$node_recovery" | grep -qi "^Unknown$"; then
+        info "NodeRecovery: could not retrieve (needs sagemaker:DescribeCluster)"
+    elif echo "$node_recovery" | grep -qi "^None$"; then
+        warn "NodeRecovery: None — failed nodes won't auto-replace → references/operations.md § 6 HyperPod node health labels (EKS)"
+        ISSUES_FOUND=$((ISSUES_FOUND + 1))
+        add_issue_detail "NodeRecovery disabled (set to 'None') — failed nodes won't auto-replace → references/operations.md § 6 HyperPod node health labels (EKS)" "P2"
+    else
+        warn "NodeRecovery: $node_recovery — failed nodes won't auto-replace → references/operations.md § 6 HyperPod node health labels (EKS)"
+    fi
+
+    # All instance groups — count nodes per group, surface any unhealthy count.
+    # Paginated because clusters >50 nodes would otherwise be diagnosed on a partial sample.
+    local node_summary
+    node_summary=$(sagemaker_list_paginated list-cluster-nodes ClusterNodeSummaries)
+
+    local node_output
+    node_output=$(echo "$node_summary" | python3 -c "
+import sys,json
+nodes = json.load(sys.stdin).get('ClusterNodeSummaries',[])
+total = len(nodes)
+by_status = {}
+for n in nodes:
+    s = n.get('InstanceStatus',{}).get('Status','Unknown')
+    by_status[s] = by_status.get(s,0) + 1
+print(f'  Total nodes: {total}')
+for s,c in sorted(by_status.items()):
+    tag = '[PASS]' if s == 'Running' else ('[INFO]' if s == 'Pending' else '[FAIL]')  # Pending is not counted as failed below
+    print(f'  {tag} {s}: {c}')
+failed = [n for n in nodes if n.get('InstanceStatus',{}).get('Status') not in ('Running','Pending')]
+for n in failed[:10]:
+    msg = n.get('InstanceStatus',{}).get('Message') or ''
+    print(f'    -> {n.get(\"InstanceId\",\"?\")} ({n.get(\"InstanceGroupName\",\"?\")}): {msg[:120]}')
+print(f'FAILED_COUNT={len(failed)}')
+" 2>/dev/null || echo "FAILED_COUNT=0")
+
+    local fc
+    fc=$(echo "$node_output" | grep "^FAILED_COUNT=" | cut -d= -f2 || echo 0)
+    # `|| true` on grep — no-match returns 1 and pipefail aborts the function.
+    echo "$node_output" | { grep -v "^FAILED_COUNT=" || true; } | while IFS= read -r line; do
+        if echo "$line" | grep -q "\[FAIL\]"; then
+            error "$line"
+        else
+            echo "$line"
+        fi
+    done
+    if [[ "${fc:-0}" -gt 0 ]]; then
+        ISSUES_FOUND=$((ISSUES_FOUND + 1))
+        add_issue_detail "${fc} node(s) in failed/non-Running state → hyperpod-node-debugger skill" "P1"
+    fi
+
+    # Pre-flight: if the cluster has no GPU/EFA-capable instance groups, NCCL
+    # diagnostics don't apply — exit clearly instead of emitting mixed INFO/SKIP.
+    local gpu_groups
+    gpu_groups=$(echo "$cluster_json" | python3 -c "
+import sys, json, re
+d = json.load(sys.stdin)
+igs = d.get('InstanceGroups', [])
+gpu_efa_re = re.compile(r'^ml\.(p4d|p4de|p5|p5e|p5en|p6|trn1|trn2|g5\.48xlarge|g6\.48xlarge|g6e\.48xlarge)', re.I)
+matches = [ig.get('InstanceGroupName','?') + ':' + ig.get('InstanceType','?')
+           for ig in igs if gpu_efa_re.match(ig.get('InstanceType',''))]
+print('|'.join(matches))
+" 2>/dev/null || echo "")
+    if [[ -z "$gpu_groups" ]] && [[ "$(echo "$cluster_json" | python3 -c 'import sys,json; print(len(json.load(sys.stdin).get("InstanceGroups",[])))' 2>/dev/null)" -gt 0 ]]; then
+        warn "No GPU/EFA-capable instance groups in this cluster — NCCL is not applicable"
+        warn "  NCCL is only meaningful on multi-GPU instances with EFA (p4d/p4de/p5/p5e/p5en/p6/trn1/trn2/g5.48xlarge/g6.48xlarge/g6e.48xlarge)"
+        warn "  The rest of the diagnostic will still run, but most checks will return INFO/SKIP on CPU-only fleets"
+    fi
+}
+
+check_cluster_events() {
+    header "Check 3: Cluster Events (infrastructure signals)"
+    local events_json events infra_events count lines
+    # Cluster events describe infrastructure-level state only (lifecycle,
+    # bootstrap, EFA health-check, capacity, replacement, reboot, software
+    # update). NCCL / GPU / training-level signals are not carried here; they
+    # come from pod logs, CloudWatch, and on-node probes (checks 6–8).
+    # ListClusterEvents returns an array under `Events` whose entries carry
+    # EventId / ClusterArn / ClusterName / InstanceGroupName / ResourceType /
+    # EventTime / Description (verified live; no Severity field).
+    events_json=$(sagemaker_list_paginated list-cluster-events Events)
+    # Reduce every event to the Time / Grp / Msg fields used below.
+    events=$(echo "$events_json" | python3 -c "
+import sys, json
+proj = []
+for ev in json.load(sys.stdin).get('Events', []):
+    grp = ev.get('InstanceGroupName','') or ev.get('ResourceType','')
+    proj.append({'Time': ev.get('EventTime',''), 'Grp': grp, 'Msg': ev.get('Description','') or ''})
+print(json.dumps(proj))
+" 2>/dev/null || echo "[]")
+    # Keep events whose message contains a keyword; print at most 20 of them.
+    infra_events=$(echo "$events" | python3 -c "
+import sys, json
+events = json.load(sys.stdin)
+# Substrings of real HyperPod event messages that could block or degrade distributed training.
+keywords = (
+    'efa health checks',         # 'EFA health checks did not run successfully'
+    'lifecycle script',          # 'Lifecycle scripts did not run successfully' / 'execution timed out'
+    'bootstrap failed',          # 'Instance bootstrap failed likely because of customer network misconfiguration'
+    'network misconfiguration',  # appears in bootstrap-failed events
+    'insufficient capacity',     # 'Insufficient capacity' / 'No subnets in the capacity AZ'
+    'failed to provision',       # provisioning events
+    'hardware failure',          # rare; surfaces via events when SMHP detects
+    'replacement',               # node replacement activity
+    'reboot',                    # node reboot activity
+    'rollback',                  # AMI upgrade rollback
+)
+hits = []
+for ev in events:
+    text = ev.get('Msg','').lower()
+    if any(kw in text for kw in keywords):
+        hits.append(ev)
+for ev in hits[:20]:
+    print(f\"[{ev.get('Grp','?')}] {str(ev.get('Time','?'))[:19]} | {ev.get('Msg','?')[:140]}\")
+print(f'COUNT={len(hits)}')
+" 2>/dev/null || echo "COUNT=0")
+
+    count=$(grep "^COUNT=" <<< "$infra_events" | cut -d= -f2 || echo 0)
+    lines=$(grep -v "^COUNT=" <<< "$infra_events" || true)
+
+    if [[ -n "$lines" && "${count:-0}" -ne 0 ]]; then
+        warn "Infrastructure events potentially affecting NCCL (last 100):"
+        echo "$lines" | while IFS= read -r line; do
+            if grep -qiE "error|fail|timeout|rollback" <<< "$line"; then
+                error "  $line"
+            else
+                warn "  $line"
+            fi
+        done
+        ISSUES_FOUND=$((ISSUES_FOUND + 1))
+        add_issue_detail "Infrastructure-level events found — review and cross-reference with cluster-debugger if root-cause is cluster-wide → references/debugging-guide.md (match event text to section)" "P1"
+    else
+        success "No infrastructure events that would block NCCL"
+        if [[ "$ORCHESTRATOR" == "slurm" ]]; then
+            info "(Cluster events may not be populated for HyperPod Slurm clusters — rely on pod-/job-log checks instead.)"
+        fi
+    fi
+}
+
+check_security_groups() {
+    # Checks each cluster security group for the self-referencing inbound and
+    # outbound rules EFA/NCCL need; bumps ISSUES_FOUND per missing rule.
+    header "Check 4: Security Group Rules (EFA / NCCL Communication)"
+
+    local cluster_json sgs subnets sg sg_json self_in self_out all_out
+    local -a sg_list
+    cluster_json=$(aws sagemaker describe-cluster \
+        --cluster-name "$CLUSTER_NAME" --region "$REGION" \
+        --output json 2>/dev/null || echo "{}")
+
+    # DescribeCluster.VpcConfig returns SecurityGroupIds + Subnets (not SubnetIds).
+    # VpcId is not on VpcConfig; derive from a subnet if needed downstream.
+    sgs=$(echo "$cluster_json" | python3 -c "import sys,json; print(','.join(json.load(sys.stdin).get('VpcConfig',{}).get('SecurityGroupIds',[])))" 2>/dev/null || echo "")
+    subnets=$(echo "$cluster_json" | python3 -c "import sys,json; print(','.join(json.load(sys.stdin).get('VpcConfig',{}).get('Subnets',[])))" 2>/dev/null || echo "")
+
+    info "SGs: ${sgs:-none}  |  Subnets: ${subnets:-none}"
+
+    if [[ -z "$sgs" ]]; then
+        warn "No security groups in cluster VPC config — cannot verify NCCL rules"
+        warn "  (DescribeCluster may need sagemaker:DescribeCluster permission)"
+        return
+    fi
+
+    IFS=',' read -ra sg_list <<< "$sgs"
+    for sg in "${sg_list[@]}"; do
+        [[ -z "$sg" ]] && continue
+        section "SG: $sg"
+
+        # A failed describe leaves the rules unknown, not missing: warn and skip
+        # this group instead of reporting false P0 findings against it.
+        sg_json=$(aws ec2 describe-security-groups \
+            --group-ids "$sg" --region "$REGION" \
+            --query 'SecurityGroups[0]' --output json 2>&1) || {
+            if grep -qiE "AccessDenied|UnauthorizedOperation" <<< "$sg_json"; then
+                warn "Permission denied: ec2:DescribeSecurityGroups — check IAM policy"
+            fi
+            warn "  Could not describe $sg — self-reference rules NOT verified"
+            continue
+        }
+
+        read -r self_in self_out all_out < <(echo "$sg_json" | python3 -c "
+import sys,json
+sg=json.load(sys.stdin)
+gid=sg.get('GroupId','')
+def has_self(rules):
+    return any(any(p.get('GroupId')==gid for p in r.get('UserIdGroupPairs',[])) for r in rules)
+def has_all_out(rules):
+    return any(r.get('IpProtocol')=='-1' and any(x.get('CidrIp')=='0.0.0.0/0' for x in r.get('IpRanges',[])) for r in rules)
+print('YES' if has_self(sg.get('IpPermissions',[])) else 'NO',
+      'YES' if has_self(sg.get('IpPermissionsEgress',[])) else 'NO',
+      'YES' if has_all_out(sg.get('IpPermissionsEgress',[])) else 'NO')
+" 2>/dev/null || echo "UNKNOWN UNKNOWN UNKNOWN")
+        # Unparseable describe output is likewise unknown rather than a missing rule.
+        if [[ "$self_in" == "UNKNOWN" ]]; then
+            warn "  Could not parse rules for $sg — self-reference rules NOT verified"
+            continue
+        fi
+
+        if [[ "$self_in" == "YES" ]]; then
+            success "  Inbound self-reference: PRESENT (inter-node communication OK)"
+        else
+            error "  Inbound self-reference: MISSING — NCCL inter-node comm WILL FAIL"
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "SG $sg missing inbound self-referencing rule → references/operations.md § 8 NCCL-specific remediations (Security group self-reference)" "P0"
+        fi
+
+        if [[ "$self_out" == "YES" ]]; then
+            success "  Outbound self-reference: PRESENT (EFA traffic OK)"
+        else
+            error "  Outbound self-reference: MISSING — EFA traffic WILL FAIL"
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "SG $sg missing outbound self-referencing rule → references/operations.md § 8 NCCL-specific remediations (Security group self-reference)" "P0"
+        fi
+
+        if [[ "$all_out" == "YES" ]]; then
+            success "  Outbound 0.0.0.0/0: PRESENT (API/internet OK)"
+        else
+            warn    "  Outbound 0.0.0.0/0: MISSING — may block SageMaker/S3 API calls"
+        fi
+    done
+}
+
+check_k8s_nodes() {
+    # Reports Kubernetes node readiness plus HyperPod health labels (kubectl);
+    # bumps ISSUES_FOUND and records issue details for unhealthy nodes.
+    header "Check 2: Kubernetes Node Readiness"
+    local raw_nodes total not_ready health_output health_issues
+    raw_nodes=$(kubectl get nodes --no-headers 2>/dev/null || true)
+    total=$(echo "$raw_nodes" | awk 'NF{c++} END{print c+0}')
+    not_ready=$(echo "$raw_nodes" | { grep -vE " Ready" || true; } | awk 'NF{c++} END{print c+0}')
+
+    info "Total K8s nodes: $total"
+    # Zero rows means kubectl failed or returned nothing — that is not a PASS.
+    if [[ "$total" -eq 0 ]]; then
+        warn "No K8s nodes returned by kubectl — node readiness NOT verified"
+        return
+    fi
+
+    if [[ "$not_ready" -eq 0 ]]; then
+        success "All $total nodes are Ready"
+    else
+        error "$not_ready/$total nodes NOT Ready"
+        ISSUES_FOUND=$((ISSUES_FOUND + 1))
+        add_issue_detail "$not_ready/$total K8s nodes not Ready → hyperpod-node-debugger skill" "P1"
+        echo "$raw_nodes" | { grep -vE " Ready" || true; } | while read -r line; do error "  Not Ready: $line"; done
+    fi
+
+    section "HyperPod Health Labels (all nodes)"
+    # Uses the 4 documented node-health-status values plus deep-health-check-status
+    health_output=$(kubectl get nodes -o json 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+issues = 0
+for node in data.get('items', []):
+    name = node['metadata']['name']
+    labels = node['metadata'].get('labels', {})
+    health       = labels.get('sagemaker.amazonaws.com/node-health-status', '')
+    deep         = labels.get('sagemaker.amazonaws.com/deep-health-check-status', '')
+    fault_type   = labels.get('sagemaker.amazonaws.com/fault-types', '')
+    fault_reason = labels.get('sagemaker.amazonaws.com/fault-reasons', '')
+
+    ok = health in ('', 'Schedulable') and deep in ('', 'Passed') and not fault_type
+    tag = '[PASS]' if ok else '[FAIL]'
+    if not ok:
+        issues += 1
+    line = f'  {tag} {name}: health={health or \"(none)\"}'
+    if deep:       line += f'  deep={deep}'
+    if fault_type: line += f'  fault={fault_type}'
+    print(line)
+    if health == 'Unschedulable':
+        print('         -> Running deep health checks (~2h), temporarily unavailable')
+    elif health == 'UnschedulablePendingReplacement':
+        print('         -> Failed health checks — needs replacement (NodeRecovery=Automatic will auto-replace)')
+    elif health == 'UnschedulablePendingReboot':
+        print('         -> Unhealthy — rebooting to re-run health checks')
+    if deep == 'InProgress': print('         -> Deep health check in progress')
+    elif deep == 'Failed':   print('         -> Deep health check FAILED — node will be replaced')
+    if fault_type: print(f'         -> Fault: {fault_type} | {fault_reason}')
+print(f'ISSUES={issues}')
+" 2>/dev/null || echo "ISSUES=0")
+
+    health_issues=$(echo "$health_output" | grep "^ISSUES=" | cut -d= -f2 || echo 0)
+    echo "$health_output" | { grep -v "^ISSUES=" || true; } | while IFS= read -r line; do echo -e "$line"; done
+    # `if` rather than `[[ ... ]] && ...`: the short-circuit form returns non-zero
+    # when the test is false, which aborts the script under `set -e`.
+    if [[ "${health_issues:-0}" -gt 0 ]]; then
+        ISSUES_FOUND=$((ISSUES_FOUND + health_issues))
+        add_issue_detail "$health_issues node(s) with unhealthy HyperPod health labels → hyperpod-node-debugger skill" "P1"
+    fi
+}
+
+check_pod_status() {
+    # Summarizes pod health for the selected namespace (all namespaces when
+    # NAMESPACE is empty), optionally filtered by JOB_NAME. Counts Failed/Unknown,
+    # Pending and crash-looping/OOM-killed pods, bumps ISSUES_FOUND once per
+    # non-empty category, and prints up to five example pods for each.
+    local ns_flag ns_label pods_json pod_output p_total p_failed p_pending p_crash
+    if [[ -n "$NAMESPACE" ]]; then
+        ns_flag=(-n "$NAMESPACE")
+        ns_label="'$NAMESPACE'"
+    else
+        ns_flag=(-A)
+        ns_label="all namespaces"
+    fi
+    header "Check 5: Pod / Job Status ($ns_label)"
+
+    local job_filter=()
+    [[ -n "$JOB_NAME" ]] && job_filter=(-l "job-name=$JOB_NAME")
+
+    # `${arr[@]+"${arr[@]}"}` — expand only if defined; plain `${arr[@]}`
+    # trips `set -u` on empty arrays under bash 4.2 (AL2 default).
+    pods_json=$(kubectl get pods "${ns_flag[@]}" ${job_filter[@]+"${job_filter[@]}"} -o json 2>/dev/null \
+                || echo '{"items":[]}')
+
+    pod_output=$(python3 -c "
+import sys, json
+items = json.load(sys.stdin)['items']
+total   = len(items)
+failed  = [p for p in items if p.get('status',{}).get('phase') in ('Failed','Unknown')]
+pending = [p for p in items if p.get('status',{}).get('phase') == 'Pending']
+# One (pod, reason, restarts) entry per pod, taken from its first offending container.
+crashes = []
+for p in items:
+    for cs in p.get('status',{}).get('containerStatuses',[]):
+        waiting = cs.get('state',{}).get('waiting',{}).get('reason','')
+        # OOMKilled is reported as a terminated reason (state or lastState), not a waiting reason.
+        oom = any(cs.get(k,{}).get('terminated',{}).get('reason') == 'OOMKilled' for k in ('state','lastState'))
+        if oom or cs.get('restartCount',0)>2 or waiting in ('CrashLoopBackOff','OOMKilled','Error'):
+            crashes.append((p, 'OOMKilled' if oom else (waiting or 'CrashLoop'), cs.get('restartCount',0)))
+            break
+
+print(f'TOTAL={total}')
+print(f'FAILED={len(failed)}')
+print(f'PENDING={len(pending)}')
+print(f'CRASH={len(crashes)}')
+
+for p in failed[:5]:
+    name = p['metadata']['name']
+    ns   = p['metadata']['namespace']
+    msg  = p.get('status',{}).get('message','')[:150]
+    print(f'FAILED_POD={ns}/{name}: {msg}')
+for p in pending[:5]:
+    name = p['metadata']['name']
+    ns   = p['metadata']['namespace']
+    for c in p.get('status',{}).get('conditions',[]):
+        if c.get('status')=='False':
+            print(f'PENDING_POD={ns}/{name}: {c.get(\"message\",\"\")[:120]}')
+for p, reason, restarts in crashes[:5]:
+    name = p['metadata']['name']
+    ns   = p['metadata']['namespace']
+    print(f'CRASH_POD={ns}/{name}: {reason} restarts={restarts}')
+" <<< "$pods_json" 2>/dev/null || echo "TOTAL=0
+FAILED=0
+PENDING=0
+CRASH=0")
+
+    # Parse counts outside of pipe to avoid subshell variable loss
+    p_total=$(echo "$pod_output" | grep "^TOTAL=" | cut -d= -f2 || echo 0)
+    p_failed=$(echo "$pod_output" | grep "^FAILED=" | cut -d= -f2 || echo 0)
+    p_pending=$(echo "$pod_output" | grep "^PENDING=" | cut -d= -f2 || echo 0)
+    p_crash=$(echo "$pod_output" | grep "^CRASH=" | cut -d= -f2 || echo 0)
+
+    info "  Total pods: ${p_total:-0}"
+
+    if [[ "${p_failed:-0}" -gt 0 ]]; then
+        error "  Failed/Unknown pods: $p_failed"; ISSUES_FOUND=$((ISSUES_FOUND+1)); add_issue_detail "$p_failed Failed/Unknown pod(s) → references/debugging-guide.md § 20 Pending / CrashLoopBackOff / Init-Container Failures" "P1"
+    else
+        success "  No failed pods"
+    fi
+    if [[ "${p_pending:-0}" -gt 0 ]]; then
+        warn "  Pending pods: $p_pending"; ISSUES_FOUND=$((ISSUES_FOUND+1)); add_issue_detail "$p_pending Pending pod(s) → references/debugging-guide.md § 20 Pending / CrashLoopBackOff / Init-Container Failures" "P1"
+    else
+        success "  No pending pods"
+    fi
+    if [[ "${p_crash:-0}" -gt 0 ]]; then
+        error "  CrashLoop/OOM pods: $p_crash"; ISSUES_FOUND=$((ISSUES_FOUND+1)); add_issue_detail "$p_crash CrashLoopBackOff/OOM pod(s) → references/debugging-guide.md § 20 Pending / CrashLoopBackOff / Init-Container Failures" "P1"
+    else
+        success "  No crashloop pods"
+    fi
+
+    # `|| true` — grep returns 1 on no-match; with `pipefail` that kills the
+    # whole function, silently skipping the rest of the diagnostic.
+    echo "$pod_output" | { grep "^FAILED_POD=" || true; } | while IFS= read -r line; do error "    ${line#FAILED_POD=}"; done
+    echo "$pod_output" | { grep "^PENDING_POD=" || true; } | while IFS= read -r line; do warn  "    ${line#PENDING_POD=}"; done
+    echo "$pod_output" | { grep "^CRASH_POD="   || true; } | while IFS= read -r line; do error "    ${line#CRASH_POD=}"; done
+}
+
+# Checks EKS-specific prerequisites that cause NCCL failures before training starts:
+#   - Headless service for MASTER_ADDR DNS resolution
+#   - Init container failures blocking training containers
+#   - /dev/shm volume mount (K8s default 64MB is too small for NCCL)
+check_nccl_infra_prereqs() {
+    header "Check 5b: NCCL Infrastructure Prerequisites"
+
+    local ns_flag ns_label
+    if [[ -n "$NAMESPACE" ]]; then
+        ns_flag=(-n "$NAMESPACE")
+        ns_label="'$NAMESPACE'"
+    else
+        ns_flag=(-A)
+        ns_label="all namespaces"
+    fi
+
+    local job_filter=()
+    [[ -n "$JOB_NAME" ]] && job_filter=(-l "job-name=$JOB_NAME")
+
+    # MASTER_ADDR DNS resolution requires a headless service (ClusterIP: None)
+    # Without it, pods get DNS like "10-0-1-5.default.pod.cluster.local" which
+    # doesn't resolve from other pods → rendezvous timeout
+    section "Headless Service (MASTER_ADDR DNS)"
+    local headless_svcs
+    headless_svcs=$(kubectl get svc "${ns_flag[@]}" -o json 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+found = []
+for svc in data.get('items', []):
+    spec = svc.get('spec', {})
+    if spec.get('clusterIP') == 'None':
+        name = svc['metadata']['name']
+        ns = svc['metadata']['namespace']
+        sel = spec.get('selector', {})
+        found.append(f'{ns}/{name} selector={sel}')
+print(f'COUNT={len(found)}')
+for f in found[:10]:
+    print(f)
+" 2>/dev/null || echo "COUNT=0")
+
+    local hl_count
+    hl_count=$(echo "$headless_svcs" | grep "^COUNT=" | cut -d= -f2 || echo 0)
+    if [[ "${hl_count:-0}" -gt 0 ]]; then
+        success "Headless service(s) found (${hl_count}) — MASTER_ADDR DNS can resolve"
+        echo "$headless_svcs" | { grep -v "^COUNT=" || true; } | while IFS= read -r line; do
+            if [[ -n "$line" ]]; then info "  $line"; fi
+        done
+    else
+        warn "No headless services found in $ns_label"
+        warn "  If MASTER_ADDR uses a hostname, DNS resolution will fail"
+        warn "  Example: spec.clusterIP: None, spec.selector: {app: my-training-job}"
+    fi
+
+    # Init containers must complete before training container starts.
+    # Common failures: S3 data download, config fetch, health check wait
+    section "Init Container Status"
+    local init_issues
+    init_issues=$(kubectl get pods "${ns_flag[@]}" ${job_filter[@]+"${job_filter[@]}"} -o json 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+issues = 0
+for pod in data.get('items', []):
+    name = pod['metadata']['name']
+    ns = pod['metadata']['namespace']
+    for ics in pod.get('status', {}).get('initContainerStatuses', []):
+        state = ics.get('state', {})
+        if 'waiting' in state:
+            reason = state['waiting'].get('reason', '')
+            msg = state['waiting'].get('message', '')[:100]
+            if reason in ('CrashLoopBackOff', 'Error', 'ImagePullBackOff', 'ErrImagePull'):
+                print(f'FAIL:{ns}/{name}: init container \"{ics[\"name\"]}\" {reason}: {msg}')
+                issues += 1
+        elif 'terminated' in state and state['terminated'].get('exitCode', 0) != 0:
+            reason = state['terminated'].get('reason', 'Error')
+            print(f'FAIL:{ns}/{name}: init container \"{ics[\"name\"]}\" exited {state[\"terminated\"][\"exitCode\"]}: {reason}')
+            issues += 1
+print(f'ISSUES={issues}')
+" 2>/dev/null || echo "ISSUES=0")
+
+    local init_count
+    init_count=$(echo "$init_issues" | grep "^ISSUES=" | cut -d= -f2 || echo 0)
+    if [[ "${init_count:-0}" -gt 0 ]]; then
+        error "  $init_count init container failure(s) — training containers cannot start"
+        ISSUES_FOUND=$((ISSUES_FOUND + 1))
+        add_issue_detail "$init_count failed init container(s) blocking training → references/debugging-guide.md § 20 Pending / CrashLoopBackOff / Init-Container Failures" "P1"
+        echo "$init_issues" | { grep "^FAIL:" || true; } | while IFS= read -r line; do
+            error "    ${line#FAIL:}"
+        done
+    else
+        success "No init container failures"
+    fi
+
+    # K8s default /dev/shm = 64MB. NCCL needs ≥1GB. Without emptyDir mount,
+    # training gets "failed to extend /dev/shm/nccl-*" or SIGBUS.
+    section "/dev/shm Volume Mount"
+    if [[ -n "$JOB_NAME" ]]; then
+        # Same scope as the sections above: -n NAMESPACE when set, otherwise -A.
+        local shm_check
+        shm_check=$(kubectl get pods "${ns_flag[@]}" -l "job-name=$JOB_NAME" -o json 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+pods = data.get('items', [])
+if not pods:
+    print('NO_PODS')
+else:
+    pod = pods[0]
+    vols = pod.get('spec', {}).get('volumes', [])
+    has_dshm = any(
+        v.get('emptyDir', {}).get('medium') == 'Memory'
+        for v in vols
+        if any(vm.get('mountPath') == '/dev/shm'
+               for c in pod.get('spec', {}).get('containers', [])
+               for vm in c.get('volumeMounts', [])
+               if vm.get('name') == v.get('name'))
+    )
+    if has_dshm:
+        print('OK')
+    else:
+        print('MISSING')
+" 2>/dev/null || echo "UNKNOWN")
+
+        case "$shm_check" in
+            OK)      success "/dev/shm mounted as emptyDir Memory — NCCL shared memory OK" ;;
+            MISSING) warn "/dev/shm NOT mounted as emptyDir Memory (K8s default = 64MB)"
+                     warn "  NCCL will fail with 'failed to extend /dev/shm/nccl-*' or Bus error"
+                     warn "    volumes: [{name: dshm, emptyDir: {medium: Memory, sizeLimit: '10Gi'}}]"
+                     warn "    volumeMounts: [{name: dshm, mountPath: /dev/shm}]"
+                     ISSUES_FOUND=$((ISSUES_FOUND + 1))
+                     add_issue_detail "/dev/shm not mounted as emptyDir Memory → references/debugging-guide.md § 17 RDMA Memory Registration Failure" "P1" ;;
+            NO_PODS) info "No pods found for job '$JOB_NAME' — /dev/shm check skipped" ;;
+            *)       info "/dev/shm mount status unknown" ;;
+        esac
+    else
+        info "/dev/shm check requires --job flag (skipped)"
+    fi
+}
+
+analyze_nccl_logs() {
+    # Scans the last 500 log lines of each workload pod for known NCCL/EFA error
+    # strings; each hit bumps ISSUES_FOUND and records a P1 issue detail.
+    header "Check 6: NCCL Log Pattern Analysis"
+
+    local job_filter=()
+    [[ -n "$JOB_NAME" ]] && job_filter=(-l "job-name=$JOB_NAME")
+
+    local pod_entries
+    if [[ -n "$NAMESPACE" ]]; then
+        pod_entries=$(kubectl get pods -n "$NAMESPACE" ${job_filter[@]+"${job_filter[@]}"} --no-headers 2>/dev/null \
+            | awk -v ns="$NAMESPACE" '{print ns"/"$1}' || echo "")
+    else
+        pod_entries=$(kubectl get pods -A ${job_filter[@]+"${job_filter[@]}"} --no-headers 2>/dev/null \
+            | awk '{print $1"/"$2}' \
+            | grep -vE "^(kube-system|kube-public|kube-node-lease|aws-hyperpod)/" || true)
+    fi
+
+    if [[ -z "$pod_entries" ]]; then
+        info "No workload pods found to analyze logs"
+        return
+    fi
+
+    declare -A NCCL_PATTERNS=(
+        ["Timeout waiting for"]="TIMEOUT_RENDEZVOUS:Rendezvous timed out — peer ranks not responding"
+        ["Connection refused"]="CONN_REFUSED:TCP refused — check MASTER_ADDR/MASTER_PORT"
+        ["Address already in use"]="PORT_CONFLICT:Port already in use — change MASTER_PORT"
+        ["NCCL WARN Connect to"]="CONNECT_FAIL:NCCL peer connection failed — check SG/NetworkPolicy"
+        ["network is unreachable"]="NET_UNREACHABLE:Network unreachable — VPC/routing issue"
+        ["Error in Store"]="STORE_ERR:Distributed store error — usually rendezvous timeout"
+        ["DistStoreError"]="STORE_ERR:Distributed store error (PyTorch 2.x) — usually rendezvous timeout"
+        ["RendezvousConnectionError"]="RDZV_CONN_ERR:Torch elastic rendezvous connection failed — check MASTER_ADDR DNS + SG"
+        ["RendezvousTimeout"]="RDZV_TIMEOUT:Torch elastic rendezvous timed out — peers not reachable"
+        ["Name or service not known"]="DNS_FAIL:DNS resolution failed for MASTER_ADDR — check headless service or /etc/hosts"
+        ["getaddrinfo failed"]="DNS_FAIL:DNS resolution failed — headless service missing or CoreDNS issue"
+        ["Watchdog timeout"]="WATCHDOG_TIMEOUT:AllReduce watchdog expired — straggler or OOM"
+        ["unhandled system error"]="SYSTEM_ERROR:NCCL system error — GPU/EFA hardware issue"
+        ["unhandled cuda error"]="CUDA_ERROR:CUDA runtime error — GPU driver crash or hardware fault"
+        ["peer access is not supported"]="P2P_FAIL:GPU peer access blocked — ACS enabled or IOMMU misconfigured"
+        ["NCCL WARN Cuda failure"]="CUDA_ERROR:CUDA failure inside NCCL — GPU hardware or driver issue"
+        ["fi_getinfo failed"]="EFA_INIT_FAIL:EFA libfabric init failed — EFA not available or wrong NCCL_SOCKET_IFNAME"
+        ["NCCL_OFI_RDMA"]="OFI_ERROR:aws-ofi-nccl plugin error — check EFA driver and OFI NCCL version"
+        ["Call to ibv_reg_mr failed"]="RDMA_REG_FAIL:EFA/RDMA memory registration failed — memlock limit too low"
+        ["NET/OFI Using TCP"]="EFA_TCP_FALLBACK:NCCL fell back to TCP instead of EFA — 10-100x slower than expected"
+        ["Failed to load NCCL"]="NCCL_LOAD_FAIL:Failed to load NCCL library — libnccl.so missing or LD_LIBRARY_PATH wrong"
+        ["libnccl-net.so"]="OFI_LOAD_FAIL:Failed to load aws-ofi-nccl plugin — libnccl-net.so not found"
+        ["OOMKilled"]="OOM_KILL:Container killed (OOM) — reduce batch size or increase memory limit"
+        ["CUDA out of memory"]="CUDA_OOM:GPU out of memory — reduce batch size or model size"
+        ["cudaMalloc failed"]="CUDA_OOM:GPU cudaMalloc failed — reduce batch size or model size"
+        ["failed to extend /dev/shm"]="SHM_FULL:NCCL shared memory /dev/shm full — mount emptyDir with 10Gi sizeLimit"
+        ["Bus error"]="SHM_FULL:/dev/shm too small or SIGBUS — mount emptyDir with 10Gi sizeLimit"
+        ["NCCL function not found"]="NCCL_VERSION_MISMATCH:NCCL version mismatch across nodes — mixed container images"
+        ["Incompatible NCCL version"]="NCCL_VERSION_MISMATCH:NCCL version mismatch across nodes — mixed container images"
+        ["Could not find interface"]="IFACE_NOT_FOUND:NCCL_SOCKET_IFNAME points to missing interface"
+        ["world_size mismatch"]="WORLD_SIZE_MISMATCH:WORLD_SIZE doesn't match running process count"
+        ["doesn't have NCCL built in"]="NCCL_NOT_BUILT:PyTorch compiled without NCCL — rebuild with USE_NCCL=1 or use AWS DLC image"
+        ["CUDA_VISIBLE_DEVICES"]="CUDA_VIS_DEV:CUDA_VISIBLE_DEVICES misconfigured — GPUs not visible to training process"
+        ["unlink shared memory"]="SHM_STALE:Stale NCCL shared memory from previous run — systemd RemoveIPC=yes or manual cleanup"
+        ["Call to ncclCommAbort"]="NCCL_COMM_ABORT:NCCL communicator aborted — check for straggler node or hardware fault"
+        ["MNNVL topology"]="MNNVL_TOPO_FAIL:NCCL MNNVL topology search failed — memlock=unlimited + stack=unlimited causes 2MB thread stack; fix: ulimit -l 8388608 -s 8192"
+        ["ENOMEM"]="ENOMEM:Memory registration/allocation failed — check memlock limits and available GPU memory"
+        ["invalid alignment"]="CUDA_ALIGN_ERR:CUDA memory alignment error — possible driver/NCCL version incompatibility"
+    )
+
+    local issues_in_logs=false entry ns pod logs pod_phase prev_logs pattern meaning code desc
+
+    while IFS= read -r entry; do
+        ns="${entry%%/*}"; pod="${entry#*/}"
+        section "Logs: $ns/$pod"
+
+        # Use --tail=500 to catch patterns even in longer outputs.
+        # For Failed/Error pods, also check --previous (logs from the crashed container instance).
+        pod_phase=$(kubectl get pod -n "$ns" "$pod" -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
+        logs=$(kubectl logs -n "$ns" "$pod" --tail=500 2>/dev/null || echo "")
+        if [[ -z "$logs" ]]; then
+            logs=$(kubectl logs -n "$ns" "$pod" --previous --tail=500 2>/dev/null || echo "")
+        elif [[ "$pod_phase" == "Failed" ]]; then
+            prev_logs=$(kubectl logs -n "$ns" "$pod" --previous --tail=500 2>/dev/null || echo "")
+            if [[ -n "$prev_logs" ]]; then logs="${logs}"$'\n'"${prev_logs}"; fi
+        fi
+
+        if [[ -z "$logs" ]]; then
+            info "  No logs available"
+            continue
+        fi
+
+        # Here-strings, not `echo | grep -q`: under pipefail an early grep exit can
+        # SIGPIPE echo and hide the match. -F: the keys are literal strings, not regexes.
+        for pattern in "${!NCCL_PATTERNS[@]}"; do
+            if grep -qiF -- "$pattern" <<< "$logs"; then
+                meaning="${NCCL_PATTERNS[$pattern]}"
+                code="${meaning%%:*}"
+                desc="${meaning#*:}"
+                error "  DETECTED [$code]: $desc"
+                { grep -iF -- "$pattern" <<< "$logs" || true; } | tail -3 | while IFS= read -r logline; do
+                    echo -e "    ${YELLOW}> $logline${RESET}"
+                done
+                issues_in_logs=true
+                ISSUES_FOUND=$((ISSUES_FOUND + 1))
+                add_issue_detail "NCCL log pattern [$code] in pod $pod: $desc → references/error-patterns-quick-ref.md" "P1"
+            fi
+        done
+
+        if grep -qiE "BASELINE TEST PASSED|AllReduce SUCCESS|Training complete" <<< "$logs"; then
+            success "  Pod $pod: completed successfully"
+        fi
+    done <<< "$pod_entries"
+
+    $issues_in_logs || success "No NCCL error patterns found in pod logs"
+}
+
+check_nccl_env_vars() {
+    # Audits NCCL/EFA-related env vars inside the first Running pod of the job
+    # (namespace defaults to 'default'); adds the warning count to ISSUES_FOUND.
+    header "Check 7: NCCL Environment Variable Audit"
+
+    local job_filter=()
+    [[ -n "$JOB_NAME" ]] && job_filter=(-l "job-name=$JOB_NAME")
+
+    local ns="${NAMESPACE:-default}" first_pod pod_env env_audit_out env_warn_count
+    first_pod=$(kubectl get pods -n "$ns" ${job_filter[@]+"${job_filter[@]}"} --no-headers 2>/dev/null \
+        | grep -E " Running " | head -1 | awk '{print $1}' || echo "")
+
+    if [[ -z "$first_pod" ]]; then
+        info "No Running pods found for env var audit (only meaningful during active training)"
+        return
+    fi
+
+    info "Checking env vars in Running pod: $ns/$first_pod"
+    pod_env=$(kubectl exec -n "$ns" "$first_pod" -- env 2>/dev/null || echo "")
+
+    if [[ -z "$pod_env" ]]; then
+        warn "Could not exec into $first_pod"
+        return
+    fi
+
+    # Capture Python output; the sentinel line feeds issue accounting below.
+    env_audit_out=$(python3 - <<'PYEOF' "$pod_env"
+import sys
+pod_env = sys.argv[1] if len(sys.argv) > 1 else ""
+env_map = {}
+for line in pod_env.strip().split('\n'):
+    if '=' in line:
+        k, _, v = line.partition('=')
+        env_map[k.strip()] = v.strip()
+
+# (rec_value, severity, description)
+# severity WARN = counts as issue; INFO = advisory only
+checks = {
+    'MASTER_ADDR':            (None,          'WARN', 'Must be rank-0 pod hostname/IP'),
+    'MASTER_PORT':            ('29500',       'WARN', 'Must match across all ranks'),
+    'WORLD_SIZE':             (None,          'WARN', 'Must equal total processes'),
+    'RANK':                   (None,          'WARN', 'Must be unique 0..WORLD_SIZE-1'),
+    'NCCL_SOCKET_IFNAME':     ('^lo,docker,efa,veth,virbr', 'WARN', 'Exclude non-VPC interfaces (loopback/docker/EFA control/veth)'),
+    'NCCL_TIMEOUT':           ('1200',        'WARN', 'Default 600s too short for large clusters'),
+    'FI_PROVIDER':            ('efa',         'INFO', 'Set to efa on EFA instances; omit for CPU-only'),
+    'FI_EFA_USE_DEVICE_RDMA': ('1',           'INFO', 'Required for full EFA RDMA performance'),
+    'NCCL_DEBUG':             ('WARN',        'INFO', 'Enable for diagnostics (use WARN not INFO in prod)'),
+}
+
+print("  {:<28} {:<22} {}".format('Variable','Value','Status'))
+print("  " + "-"*68)
+warn_count = 0
+for var,(rec,sev,desc) in checks.items():
+    val = env_map.get(var)
+    if val:
+        print(f"  [SET]  {var:<26} {val:<22}")
+    elif sev == 'WARN':
+        warn_count += 1
+        print(f"  [WARN] {var:<26} {'(not set)':<22}  <- {desc}")
+    else:
+        print(f"  [INFO] {var:<26} {'(not set)':<22}  <- {desc}")
+
+nccl_debug = env_map.get('NCCL_DEBUG', '')
+if nccl_debug.upper() == 'INFO':
+    warn_count += 1
+    print("\n  [WARN] NCCL_DEBUG=INFO detected in production job — verbose logging adds runtime overhead; set to WARN for production")
+elif nccl_debug.upper() == 'TRACE':
+    warn_count += 1
+    print("\n  [WARN] NCCL_DEBUG=TRACE detected — TRACE prints replayable trace info on every NCCL call (per the NCCL env-var doc); large overhead and gigabytes of logs per rank, set to WARN immediately")
+
+# NCCL_TIMEOUT value validation (heuristic used here: WORLD_SIZE * 5 + 600)
+nccl_timeout_str = env_map.get('NCCL_TIMEOUT', '')
+world_size_str = env_map.get('WORLD_SIZE', '0')
+try:
+    world_size = int(world_size_str)
+except ValueError:
+    world_size = 0
+if nccl_timeout_str and world_size > 0:
+    try:
+        nccl_timeout = int(nccl_timeout_str)
+        recommended = world_size * 5 + 600
+        if nccl_timeout < recommended:
+            warn_count += 1
+            print(f"\n  [WARN] NCCL_TIMEOUT={nccl_timeout}s may be too low for {world_size} ranks (recommended >= {recommended}s)")
+    except ValueError:
+        pass
+
+# Large cluster checks (WORLD_SIZE above 256)
+if world_size > 256:
+    warn_count += 1
+    print(f"\n  [WARN] WORLD_SIZE={world_size} (large cluster) — verify memlock and stack ulimits")
+
+if warn_count == 0:
+    print("\n  [PASS] All critical NCCL env vars configured")
+else:
+    print(f"\n  [WARN] {warn_count} critical NCCL env var(s) not set or misconfigured")
+
+# Sentinel line consumed by the caller — DO NOT remove.
+print(f"__WARN_COUNT__={warn_count}")
+PYEOF
+) || warn "NCCL env var audit did not complete — output below may be partial"
+    # Guard both greps: a no-match exit of 1 would abort the script under pipefail.
+    echo "$env_audit_out" | { grep -v '^__WARN_COUNT__=' || true; }
+    env_warn_count=$(echo "$env_audit_out" | grep '^__WARN_COUNT__=' | cut -d= -f2 || echo 0)
+    if [[ "${env_warn_count:-0}" =~ ^[0-9]+$ ]] && (( env_warn_count > 0 )); then
+        ISSUES_FOUND=$((ISSUES_FOUND + env_warn_count))
+        add_issue_detail "${env_warn_count} NCCL env var issue(s) in pod ${ns}/${first_pod} → references/operations.md § 5 NCCL environment variable reference" "P1"
+    fi
+}
+
+# EFA device plugin + NCCL version consistency. kubectl-only, no active job needed.
+check_efa_k8s() {
+    header "Check 2b: EFA K8s Device Plugin & NCCL Version Consistency"
+
+    # Without this DaemonSet, pods can't request vpc.amazonaws.com/efa resources
+    # and EFA interfaces won't be mounted into training containers.
+    local efa_ds
+    efa_ds=$(kubectl get daemonset -A 2>/dev/null | grep -iE "efa|aws-efa" | head -3 || echo "")
+
+    if [[ -n "$efa_ds" ]]; then
+        success "EFA device plugin DaemonSet found:"
+        echo "$efa_ds" | while IFS= read -r line; do info "  $line"; done
+    else
+        # Missing plugin is a FAIL only if any pod requests vpc.amazonaws.com/efa.
+        local ns_flag=(); if [[ -n "$NAMESPACE" ]]; then ns_flag=(-n "$NAMESPACE"); else ns_flag=(-A); fi
+        local efa_requested
+        efa_requested=$(kubectl get pods "${ns_flag[@]}" -o json 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+for pod in data.get('items', []):
+    for c in pod.get('spec', {}).get('containers', []):
+        lims = c.get('resources', {}).get('limits', {})
+        if 'vpc.amazonaws.com/efa' in lims:
+            ns = pod['metadata']['namespace']
+            name = pod['metadata']['name']
+            count = lims['vpc.amazonaws.com/efa']
+            print(f'  {ns}/{name}: requests {count} EFA interface(s)')
+" 2>/dev/null || echo "")
+
+        if [[ -n "$efa_requested" ]]; then
+            error "Pods request EFA resources but EFA device plugin DaemonSet NOT found!"
+            error "  EFA interfaces will NOT be mounted into training containers"
+            echo "$efa_requested"
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "EFA device plugin DaemonSet missing → references/operations.md § 5 NCCL environment variable reference / references/debugging-guide.md § 6 EFA Configuration" "P0"
+        else
+            info "EFA device plugin not detected (OK if no pods request vpc.amazonaws.com/efa)"
+        fi
+    fi
+
+    # Mixed NCCL versions across nodes → 'NCCL function not found' at init.
+    # Two independent probes:
+    #   - torch.cuda.nccl.version(): works only if PyTorch is installed.
+    #   - libnccl.so on disk: authoritative — this is what actually loads at
+    #     runtime, works for any image (PyTorch, JAX, raw NCCL, custom).
+    if [[ -n "$JOB_NAME" ]]; then
+        section "NCCL Version Consistency (job: $JOB_NAME)"
+        local ns="${NAMESPACE:-default}"
+        local job_pods
+        job_pods=$(kubectl get pods -n "$ns" -l "job-name=$JOB_NAME" --no-headers 2>/dev/null \
+            | grep -E " Running " | awk '{print $1}' | head -4 || echo "")
+
+        if [[ -z "$job_pods" ]]; then
+            info "No Running pods in job '$JOB_NAME' — version check skipped"
+        else
+            # Read-only probe: find libnccl.so*, extract embedded version string,
+            # fall back to SONAME filename parsing when `strings` is unavailable.
+            # Variables below are expanded inside the remote pod via `kubectl exec
+            # sh -c`, NOT locally — the quoted heredoc prevents local expansion.
+            local lib_probe
+            lib_probe=$(cat <<'REMOTE_PROBE'
+NCCL_LIB=$(find /usr/local/cuda/lib64 /usr/lib /usr/lib64 /usr/lib/x86_64-linux-gnu /opt/nccl/lib /opt/amazon/ofi-nccl/lib -maxdepth 3 -name "libnccl.so*" -type f 2>/dev/null | head -1)
+if [ -z "$NCCL_LIB" ]; then echo "not-found"; exit 0; fi
+VER=$(strings "$NCCL_LIB" 2>/dev/null | grep -oE "NCCL version [0-9]+\.[0-9]+\.[0-9]+" | head -1 | sed "s/NCCL version //")
+[ -z "$VER" ] && VER=$(basename "$(readlink -f "$NCCL_LIB")" 2>/dev/null | grep -oE "[0-9]+\.[0-9]+\.[0-9]+" | head -1)
+[ -z "$VER" ] && VER="present-no-version"
+echo "$VER"
+REMOTE_PROBE
+)
+            local torch_versions=()
+            local lib_versions=()
+            for pod in $job_pods; do
+                local tver lver
+                tver=$(kubectl exec -n "$ns" "$pod" -- \
+                    python3 -c "import torch; print(torch.cuda.nccl.version())" 2>/dev/null \
+                    || echo "unavailable")
+                lver=$(kubectl exec -n "$ns" "$pod" -- sh -c "$lib_probe" 2>/dev/null \
+                    || echo "unavailable")
+                info "  $pod: torch.nccl=$tver  libnccl.so=$lver"
+                torch_versions+=("$tver")
+                lib_versions+=("$lver")
+            done
+
+            local unique_torch unique_lib
+            unique_torch=$(printf '%s\n' "${torch_versions[@]}" | grep -v unavailable | sort -u | wc -l | tr -d ' ')
+            unique_lib=$(printf '%s\n' "${lib_versions[@]}" \
+                | grep -vE "unavailable|not-found|present-no-version" | sort -u | wc -l | tr -d ' ')
+
+            if [[ "$unique_torch" -gt 1 ]]; then
+                error "NCCL VERSION MISMATCH (torch.cuda.nccl.version) across pods — will cause 'NCCL function not found' at init!"
+                ISSUES_FOUND=$((ISSUES_FOUND + 1))
+                add_issue_detail "NCCL version mismatch across pods (torch) → references/debugging-guide.md § 10 NCCL Version Mismatch" "P1"
+            fi
+            if [[ "$unique_lib" -gt 1 ]]; then
+                error "libnccl.so VERSION MISMATCH across pods — mixed NCCL libraries will cause symbol errors at init!"
+                ISSUES_FOUND=$((ISSUES_FOUND + 1))
+                add_issue_detail "libnccl.so version mismatch across pods → references/debugging-guide.md § 10 NCCL Version Mismatch" "P1"
+            fi
+            if [[ "$unique_torch" -le 1 ]] && [[ "$unique_lib" -le 1 ]]; then
+                if [[ "$unique_torch" -eq 1 ]] || [[ "$unique_lib" -eq 1 ]]; then
+                    success "NCCL version consistent across ${#lib_versions[@]} pod(s)"
+                else
+                    info "NCCL version unavailable (neither torch nor libnccl.so could be probed)"
+                fi
+            fi
+        fi
+    fi
+}
+
+check_network_policies() {
+    header "Check 9: Kubernetes NetworkPolicy Scan"
+
+    local np_flag np_label
+    if [[ -n "$NAMESPACE" ]]; then
+        np_flag=(-n "$NAMESPACE")
+        np_label="'$NAMESPACE'"
+    else
+        np_flag=(-A)
+        np_label="all namespaces"
+    fi
+
+    local policies
+    policies=$(kubectl get networkpolicy "${np_flag[@]}" 2>/dev/null || echo "")
+
+    if [[ -z "$policies" ]] || echo "$policies" | grep -q "No resources found"; then
+        success "No NetworkPolicies in $np_label — all traffic allowed"
+        return
+    fi
+
+    # Informational — only raise a finding when the per-policy scan below
+    # identifies one that actually blocks all ingress/egress. Narrow allow-list
+    # policies (e.g. operator-scoped ingress) are common and not a defect.
+    info "NetworkPolicies found in $np_label — review each for NCCL impact:"
+    echo "$policies"
+
+    local scope_flag
+    local scope_flag=()
+    if [[ -n "$NAMESPACE" ]]; then scope_flag=(-n "$NAMESPACE"); else scope_flag=(-A); fi
+    kubectl get networkpolicy "${scope_flag[@]}" -o json 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+for pol in data.get('items', []):
+    name = pol['metadata']['name']
+    ns   = pol['metadata']['namespace']
+    spec = pol.get('spec', {})
+    types   = spec.get('policyTypes', [])
+    ingress = spec.get('ingress', [])
+    egress  = spec.get('egress', [])
+    print(f'  Policy: {ns}/{name}  |  Types: {types}')
+    if 'Ingress' in types and not ingress:
+        print(f'    [FAIL] BLOCKS ALL INBOUND — will break NCCL rendezvous and AllReduce!')
+    if 'Egress' in types and not egress:
+        print(f'    [FAIL] BLOCKS ALL OUTBOUND — will break NCCL communication!')
+    if ('Ingress' not in types) and ('Egress' not in types):
+        print(f'    [INFO] Policy has no policyTypes — acts as allow-all')
+" 2>/dev/null
+
+    local scope_flag2
+    local scope_flag2=()
+    if [[ -n "$NAMESPACE" ]]; then
+        scope_flag2=(-n "$NAMESPACE")
+    else
+        scope_flag2=(-A)
+    fi
+    local blocking_list
+    blocking_list=$(kubectl get networkpolicy "${scope_flag2[@]}" -o json 2>/dev/null | python3 -c "
+import sys, json
+try:
+    data = json.load(sys.stdin)
+except json.JSONDecodeError:
+    # kubectl returned non-JSON (empty stdin, error text, or version-skew output).
+    # Skip this check rather than aborting the overall diagnostic run.
+    sys.exit(0)
+for pol in data.get('items', []):
+    name = pol['metadata']['name']
+    ns   = pol['metadata']['namespace']
+    spec = pol.get('spec', {})
+    types   = spec.get('policyTypes', [])
+    ingress = spec.get('ingress', [])
+    egress  = spec.get('egress', [])
+    blocks = ('Ingress' in types and not ingress) or ('Egress' in types and not egress)
+    if blocks:
+        print(f'{ns}/{name}')
+" 2>/dev/null || echo "")
+
+    if [[ -n "$blocking_list" ]]; then
+        while IFS= read -r bp; do
+            [[ -z "$bp" ]] && continue
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "Blocking NetworkPolicy $bp may prevent NCCL traffic → references/operations.md § 8 NCCL-specific remediations (NetworkPolicy)" "P1"
+        done <<< "$blocking_list"
+    fi
+}
+
+# Populates SSM_CLUSTER_ID and the SSM_NODES array (up to SAMPLE_NODES entries).
+# Each entry is "INSTANCE_ID GROUP_NAME".
+# Prefers worker/compute nodes over controller/head nodes.
+# Respects --node <INSTANCE_ID> if provided.
+# Globals: CLUSTER_NAME, REGION, NODE_ID, SAMPLE_NODES (read);
+#          SSM_CLUSTER_ID, SSM_NODES (written).
+# Returns: 0 if at least one node was selected; 1 if the cluster ARN cannot be
+#          resolved, SAMPLE_NODES is not a number, or no Running node is found.
+resolve_cluster_nodes_for_ssm() {
+    SSM_CLUSTER_ID=""
+    SSM_NODES=()
+
+    local cluster_arn
+    cluster_arn=$(aws sagemaker describe-cluster \
+        --cluster-name "$CLUSTER_NAME" --region "$REGION" \
+        --query 'ClusterArn' --output text 2>/dev/null || echo "")
+
+    if [[ -z "$cluster_arn" || "$cluster_arn" == "None" ]]; then
+        debug "resolve_cluster_nodes_for_ssm: describe-cluster returned empty ARN"
+        return 1
+    fi
+
+    # The cluster ID is the last '/'-separated component of the ARN.
+    SSM_CLUSTER_ID="${cluster_arn##*/}"
+
+    local nodes_json
+    nodes_json=$(sagemaker_list_paginated list-cluster-nodes ClusterNodeSummaries)
+
+    if [[ -n "$NODE_ID" ]]; then
+        # Explicit --node: look up its instance group, defaulting to "worker"
+        # when the node is not listed or the lookup itself fails.
+        local grp
+        grp=$(echo "$nodes_json" | python3 -c "
+import sys, json
+target = sys.argv[1]
+nodes = json.load(sys.stdin).get('ClusterNodeSummaries', [])
+for n in nodes:
+    if n.get('InstanceId') == target:
+        print(n.get('InstanceGroupName','worker'))
+        break
+" "$NODE_ID" 2>/dev/null | head -1 || true)
+        [[ -z "$grp" ]] && grp="worker"
+        SSM_NODES=("$NODE_ID $grp")
+        return 0
+    fi
+
+    # SAMPLE_NODES is handed to Python as an argument (like NODE_ID above), never
+    # spliced into the program text, so a bad value is rejected here by name.
+    if [[ ! "$SAMPLE_NODES" =~ ^[0-9]+$ ]]; then
+        debug "resolve_cluster_nodes_for_ssm: SAMPLE_NODES='$SAMPLE_NODES' is not a non-negative integer"
+        return 1
+    fi
+
+    # For NCCL diagnostics, hardware probes (nvidia-smi, fi_info -p efa,
+    # neuron-ls) only produce meaningful signal on GPU / accelerator nodes.
+    # Prioritize by type: GPU/Neuron first, other Running compute next, then
+    # fall back to any Running node so the script still reports on a cluster
+    # that has only CPU nodes.
+    local picked
+    picked=$(echo "$nodes_json" | python3 -c "
+import sys, json
+sample = int(sys.argv[1])
+nodes = json.load(sys.stdin).get('ClusterNodeSummaries', [])
+
+# Instance-type prefixes that carry NVIDIA GPUs or AWS Trainium/Inferentia.
+# A node's instance type shows up in ClusterNodeSummaries as e.g. 'ml.p5.48xlarge'.
+GPU_PREFIXES = ('ml.p3', 'ml.p3dn', 'ml.p4d', 'ml.p4de', 'ml.p5', 'ml.p5e',
+                'ml.p5en', 'ml.p6', 'ml.g4dn', 'ml.g5', 'ml.g6', 'ml.g6e', 'ml.g7e')
+NEURON_PREFIXES = ('ml.trn1', 'ml.trn2', 'ml.inf2')
+ACCEL_PREFIXES = GPU_PREFIXES + NEURON_PREFIXES
+
+def group(n):
+    return n.get('InstanceGroupName') or ''
+
+def tier(n):
+    # 0 = accelerator compute, 1 = other compute, 2 = controller/head/master (last resort)
+    if any(x in group(n).lower() for x in ('controller', 'head', 'master')):
+        return 2
+    return 0 if (n.get('InstanceType') or '').startswith(ACCEL_PREFIXES) else 1
+
+running = [n for n in nodes
+           if n.get('InstanceStatus', {}).get('Status', '') == 'Running' and n.get('InstanceId')]
+# sorted() is stable, so nodes keep their listing order within a tier.
+for n in sorted(running, key=tier)[:sample]:
+    print(n['InstanceId'] + ' ' + (group(n) or 'worker'))
+" "$SAMPLE_NODES" 2>/dev/null || echo "")
+
+    if [[ -z "$picked" ]]; then
+        debug "resolve_cluster_nodes_for_ssm: no Running nodes found"
+        return 1
+    fi
+
+    local line
+    while IFS= read -r line; do
+        [[ -n "$line" ]] && SSM_NODES+=("$line")
+    done <<< "$picked"
+
+    return 0
+}
+
+# Usage: _ssm_run INSTANCE_ID GROUP_NAME CLUSTER_ID SCRIPT_BODY
+# Runs SCRIPT_BODY on a cluster node via an SSM non-interactive session.
+# Globals: REGION (read); _TEMP_FILES (the parameter file is appended to it).
+# Output:  session output minus SSM banners and the echoed command. stderr is
+#          merged in, so after a failed session this is the error text.
+# Returns: 1 on invalid arguments or local setup failure (no output), else 0.
+_ssm_run() {
+    local instance_id="$1" group_name="$2" cluster_id="$3" script_body="$4"
+
+    # Validate inputs before interpolating into the SSM target string.
+    [[ -z "$instance_id" || -z "$group_name" || -z "$cluster_id" || -z "$script_body" ]] && return 1
+    [[ ! "$instance_id" =~ ^i-[0-9a-f]{8,17}$ ]] && return 1
+    [[ ! "$group_name"  =~ ^[A-Za-z0-9._-]+$ ]] && return 1
+    [[ ! "$cluster_id"  =~ ^[A-Za-z0-9._-]+$ ]] && return 1
+
+    local target="sagemaker-cluster:${cluster_id}_${group_name}-${instance_id}"
+
+    local tmpfile
+    tmpfile=$(mktemp "${TMPDIR:-/tmp}/nccl-ssm-XXXXXX.json") || return 1
+    chmod 600 "$tmpfile" 2>/dev/null || true
+    _TEMP_FILES+=("$tmpfile")
+    # AWS-StartNonInteractiveCommand collapses newlines in a single command
+    # element, so embed the multi-line script as a base64 payload.
+    local cmd_b64
+    cmd_b64=$(printf '%s' "$script_body" | base64 | tr -d '\n') || { rm -f "$tmpfile"; return 1; }
+    local remote="bash -c \"echo $cmd_b64 | base64 -d | bash\""
+    python3 -c "import json,sys; print(json.dumps({'command':[sys.argv[1]]}))" "$remote" > "$tmpfile" 2>/dev/null || { rm -f "$tmpfile"; return 1; }
+
+    # session-manager-plugin races to close before flushing its last stdout
+    # block; `unbuffer` (from the `expect` package) gives it a PTY and avoids
+    # the resulting "Cannot perform start session: EOF". Required at startup.
+    # Retry transient SSM session errors (EOF, throttling, i/o timeout).
+    # Do not retry AccessDenied / UnauthorizedOperation — permanent IAM denials.
+    local out attempt=0 max_attempts=5
+    while (( attempt < max_attempts )); do
+        out=$(unbuffer timeout 180 aws ssm start-session --target "$target" --region "$REGION" \
+            --document-name AWS-StartNonInteractiveCommand \
+            --parameters "file://$tmpfile" 2>&1 || echo "")
+        # Fatal (don't retry) — permanent IAM or agent state.
+        if echo "$out" | grep -qiE "AccessDenied|UnauthorizedOperation|not authorized to perform|TargetNotConnected"; then
+            break
+        fi
+        if ! echo "$out" | grep -qiE "Cannot perform start session|EOF$|SessionManagerPlugin is not found|i/o timeout|ThrottlingException|RequestLimitExceeded|InternalFailure|ServiceUnavailable"; then
+            break
+        fi
+        attempt=$((attempt + 1))
+        # Back off only when another attempt follows; no wait after the last one.
+        if (( attempt < max_attempts )); then sleep $((attempt * 3)); fi
+    done
+    rm -f "$tmpfile"
+    # Strip SSM session banners and the echoed base64 command line. The full
+    # command is matched on its own; the "| base64 -d | bash" tail is required
+    # only for partial echoes. Trailing whitespace (e.g. a PTY CR) is tolerated.
+    echo "$out" | grep -vE '^(Starting session with SessionId:|Exiting session with sessionId:|[[:space:]]*$)' \
+                | grep -vE "^(bash -c \"echo [A-Za-z0-9+/=]+ \| base64 -d \| bash\"|(echo '[A-Za-z0-9+/=]+'|[A-Za-z0-9+/=]{40,}={0,2})[[:space:]]*\|?[[:space:]]*base64?[[:space:]]*-?d?[[:space:]]*\|?[[:space:]]*bash\"?)[[:space:]]*\$" || true
+}
+
+# Self-contained bash script executed on each HyperPod compute node via SSM.
+# Covers GPU, EFA, NCCL library, network, memory, and process health.
+_NODE_DIAG_SCRIPT=$(cat <<'NODE_SCRIPT'
+#!/bin/bash
+# HyperPod NCCL Node Hardware Diagnostics
+# Runs ON the compute node via SSM — NOT on the local machine.
+export PATH="/opt/amazon/efa/bin:/usr/local/cuda/bin:$PATH"
+
+echo "=== NODE DIAGNOSTICS ==="
+echo "Host: $(hostname)"
+echo "Date: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
+echo "Kernel: $(uname -r)"
+
+# Instance type via IMDS (v2)
+IMDS_TOKEN=$(curl -sf -m 3 -X PUT "http://169.254.169.254/latest/api/token" \
+    -H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null || echo "")
+if [ -n "$IMDS_TOKEN" ]; then
+    INSTANCE_TYPE=$(curl -sf -m 3 -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" \
+        "http://169.254.169.254/latest/meta-data/instance-type" 2>/dev/null || echo "unknown")
+    AZ=$(curl -sf -m 3 -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" \
+        "http://169.254.169.254/latest/meta-data/placement/availability-zone" 2>/dev/null || echo "unknown")
+else
+    INSTANCE_TYPE="unknown"
+    AZ="unknown"
+fi
+echo "Instance: ${INSTANCE_TYPE} | AZ: ${AZ}"
+echo ""
+
+echo "--- GPU ---"
+# Require both the binary AND at least one GPU visible. nvidia-smi is preinstalled
+# on some non-GPU instance types (t3/c5) but returns "No devices were found" —
+# reporting that as [FAIL] would be a false positive on controllers/logins.
+if command -v nvidia-smi &>/dev/null && nvidia-smi -L 2>/dev/null | grep -q "^GPU"; then
+    nvidia-smi --query-gpu=index,name,driver_version,memory.used,memory.total,temperature.gpu,utilization.gpu \
+        --format=csv,noheader 2>/dev/null \
+        && echo "" \
+        || echo "[FAIL] nvidia-smi query failed"
+
+    # XID errors indicate hardware faults that will cause NCCL to abort.
+    # Modern A100/H100 drivers log XIDs to dmesg but NOT to nvidia-smi -q,
+    # so check both sources — verified on-hardware with A100 driver 580.126
+    # where an injected XID 31 appeared in dmesg but was invisible to -q.
+    XID_DMESG=$(dmesg 2>/dev/null | grep -E 'NVRM: Xid' | tail -5)
+    XID_SMI=$(nvidia-smi -q 2>/dev/null | grep -E '^[[:space:]]*Xid' | head -5)
+    if [ -n "$XID_DMESG" ] || [ -n "$XID_SMI" ]; then
+        echo "[FAIL] GPU XID ERRORS DETECTED (hardware fault — NCCL will abort):"
+        [ -n "$XID_DMESG" ] && echo "$XID_DMESG"
+        [ -n "$XID_SMI" ] && echo "$XID_SMI"
+    else
+        echo "[PASS] No GPU XID errors"
+    fi
+
+    # Only surface nonzero ECC counts. 'ECC Errors' section header and
+    # 'Uncorrectable ... : 0' lines fire on every healthy GPU.
+    ECC=$(nvidia-smi -q 2>/dev/null | awk '
+        /Uncorrectable/ { if ($NF ~ /^[0-9]+$/ && $NF+0 > 0) print }
+    ' | head -5)
+    [ -n "$ECC" ] && echo "[FAIL] GPU uncorrectable ECC errors detected: $ECC" || echo "[PASS] No ECC errors"
+
+    # Row-remap state — marginal GPU memory. Pending rows need a reset to finalize;
+    # Failed means exceeded remap capacity (bad memory). Silent degrader that
+    # default DCGM medium + memtest in some driver versions miss entirely.
+    REMAP=$(nvidia-smi --query-remapped-rows=gpu_bus_id,remapped_rows.pending,remapped_rows.failure \
+        --format=csv,noheader 2>/dev/null)
+    if [ -n "$REMAP" ]; then
+        PENDING_SUM=$(echo "$REMAP" | awk -F, '{gsub(/ /,""); s+=$2} END {print s+0}')
+        FAILED_COUNT=$(echo "$REMAP" | awk -F, '{gsub(/ /,""); if ($3=="Yes" || $3=="1") c++} END {print c+0}')
+        if [ "$FAILED_COUNT" -gt 0 ]; then
+            echo "[FAIL] GPU row-remap FAILED on $FAILED_COUNT device(s) — bad memory, replace GPU"
+        elif [ "$PENDING_SUM" -gt 0 ]; then
+            echo "[FAIL] GPU row-remap PENDING ($PENDING_SUM row(s)) — marginal memory; reset/reboot to finalize"
+            echo "       If pending persists across reboots, firmware may be stuck — replace GPU"
+        else
+            echo "[PASS] GPU row-remap: no pending or failed rows"
+        fi
+    fi
+
+    # DCGM health — complements XID/ECC above. Parse Fail/Warn verdicts only
+    # (Pass is not authoritative on DCGM <= 3.3.9 due to memtest bug).
+    if command -v dcgmi >/dev/null 2>&1; then
+        DCGM_OUT=$(dcgmi health --check -j 2>/dev/null || dcgmi health --check 2>/dev/null || echo "")
+        if echo "$DCGM_OUT" | grep -qiE '"overall_health"\s*:\s*"(Fail|Warn)"|HEALTH_RESULT_FAIL|HEALTH_RESULT_WARN|Health Monitor Report.*(Fail|Warn)'; then
+            echo "[FAIL] DCGM health check reports Fail/Warn — inspect with 'dcgmi health --check'"
+        fi
+    fi
+
+    # DCGM nvvs log presence — HyperPod deep-health-check writes here.
+    if [ -d /var/log/nvidia-dcgm ]; then
+        NVVS_LATEST=$(find /var/log/nvidia-dcgm -maxdepth 1 -name 'nvvs*.log' -printf '%T@ %p\n' 2>/dev/null | sort -nr | head -1 | awk '{print $2}')
+        if [ -n "$NVVS_LATEST" ]; then
+            if tail -n 200 "$NVVS_LATEST" 2>/dev/null | grep -qiE 'row ?remap.*(pending|fail)|FAIL: |Error: '; then
+                echo "[FAIL] DCGM nvvs log contains failure / row-remap signals: $NVVS_LATEST"
+            fi
+        fi
+    fi
+
+    # NVLink — important for p4d/p5 multi-GPU NCCL bandwidth.
+    # Output format across driver versions:
+    #   - 'Link N: X GB/s'   (active, driver 470+)
+    #   - 'Link N: Active'   (older drivers)
+    #   - 'error'/'fail'/'inactive' keywords when degraded
+    NVLINK=$(nvidia-smi nvlink --status 2>/dev/null | head -200)
+    if echo "$NVLINK" | grep -qiE "error|fail|inactive"; then
+        echo "[FAIL] NVLink errors/inactive links detected (replace node):"
+        echo "$NVLINK" | grep -iE "error|fail|inactive"
+    else
+        ACTIVE_COUNT=$(echo "$NVLINK" | grep -cE "Link [0-9]+:[[:space:]]+([0-9]+ GB/s|Active)" || true)
+        if [ "${ACTIVE_COUNT:-0}" -gt 0 ]; then
+            echo "[PASS] NVLink: $ACTIVE_COUNT active link(s)"
+        else
+            echo "[INFO] NVLink not available (expected on single-GPU or non-NVLink instances)"
+        fi
+    fi
+
+    # GPU P2P topology — critical for intra-node NCCL AllReduce performance
+    echo ""
+    echo "--- GPU P2P Topology (nvidia-smi topo) ---"
+    nvidia-smi topo -m 2>/dev/null | head -25 | while IFS= read -r line; do
+        if echo "$line" | grep -qiE "NV[0-9]|NVLink"; then
+            echo "  [PASS] $line"
+        elif echo "$line" | grep -qiE "PIX|PXB|PHB|SOC"; then
+            echo "  [WARN] $line  <- PCIe path (slower than NVLink)"
+        else
+            echo "  [INFO] $line"
+        fi
+    done
+
+    # PCI ACS — intercepts GPU Direct P2P → 10-50x slower intra-node AllReduce or hang
+    echo ""
+    echo "--- PCI ACS (Access Control Services) ---"
+    if command -v lspci &>/dev/null; then
+        ACS_ENABLED=$(lspci -vvv 2>/dev/null | grep -A20 "PCI bridge\|Root Port\|Upstream Port" \
+            | grep "ACSCtl:" | { grep -c "SrcValid+" 2>/dev/null; true; })
+        if [ "$ACS_ENABLED" -gt 0 ] 2>/dev/null; then
+            echo "[FAIL] ACS enabled on $ACS_ENABLED PCI bridge(s) — GPU Direct P2P blocked!"
+            echo "       Symptom: 'NCCL WARN P2P not supported between dev X and dev Y'"
+            echo "       Impact:  10-50x slower intra-node AllReduce"
+        else
+            echo "[PASS] ACS not enabled on PCI bridges — GPU Direct P2P unobstructed"
+        fi
+    else
+        echo "[INFO] lspci not available — install pciutils to check ACS"
+    fi
+
+    IOMMU=$(dmesg 2>/dev/null | grep -iE "iommu.*enabled|dmar.*enabled" | head -2 || \
+            grep -oE "intel_iommu=[^ ]+|iommu=[^ ]+" /proc/cmdline 2>/dev/null | head -1 || echo "")
+    if [ -n "$IOMMU" ]; then
+        echo "[WARN] IOMMU may be enabled: $IOMMU"
+        echo "       On baremetal: disable VT-d/IOMMU in BIOS for best GPU Direct P2P"
+        echo "       In VMs: normal — use ATS on network adapters"
+    else
+        echo "[PASS] IOMMU: not detected as enabled"
+    fi
+
+    [ "${NCCL_P2P_DISABLE:-0}" = "1" ] && \
+        echo "[WARN] NCCL_P2P_DISABLE=1 set — workaround active, performance degraded" || true
+
+    # nvidia-peermem — GPU Direct RDMA to NIC (required for EFA↔GPU on p4d/p5)
+    echo ""
+    echo "--- nvidia-peermem (GPU Direct RDMA) ---"
+    if lsmod 2>/dev/null | grep -q "nvidia_peermem\|nv_peer_mem"; then
+        echo "[PASS] nvidia-peermem loaded — GPU Direct RDMA to EFA/NIC enabled"
+    else
+        # Kernel 5.12+ uses DMA-BUF instead of nvidia-peermem.
+        KVER_MAJOR=$(uname -r | cut -d. -f1)
+        KVER_MINOR=$(uname -r | cut -d. -f2)
+        if [ "$KVER_MAJOR" -gt 5 ] || { [ "$KVER_MAJOR" -eq 5 ] && [ "$KVER_MINOR" -ge 12 ]; } 2>/dev/null; then
+            echo "[INFO] nvidia-peermem not loaded; kernel $(uname -r) supports DMA-BUF (auto-detected)"
+        else
+            echo "[WARN] nvidia-peermem NOT loaded — EFA↔GPU copies go through CPU"
+        fi
+    fi
+else
+    if command -v nvidia-smi &>/dev/null; then
+        echo "[INFO] nvidia-smi installed but no GPU devices visible — likely a CPU-only node (controller/login)"
+    else
+        echo "[INFO] nvidia-smi not found — CPU-only node or GPU driver not installed"
+    fi
+fi
+echo ""
+
+echo "--- EFA ---"
+
+if lsmod 2>/dev/null | grep -q "^efa "; then
+    EFA_MOD_VER=$(modinfo efa 2>/dev/null | grep "^version:" | awk '{print $2}' || echo "unknown")
+    echo "[PASS] EFA kernel module loaded (version: ${EFA_MOD_VER})"
+else
+    EFA_DEVS=$(ls /dev/infiniband/uverbs* 2>/dev/null || echo "")
+    EFA_IFACES=$(ip -br link show 2>/dev/null | grep -cE "^efa" || echo 0)
+    if [ -n "$EFA_DEVS" ] || [ "$EFA_IFACES" -gt 0 ] 2>/dev/null; then
+        echo "[FAIL] EFA devices present but kernel module NOT loaded — NCCL EFA will fail"
+    else
+        echo "[INFO] EFA kernel module not loaded (expected on non-EFA instances)"
+    fi
+fi
+
+FI_CMD=""
+command -v fi_info &>/dev/null && FI_CMD="fi_info"
+[ -z "$FI_CMD" ] && [ -f /opt/amazon/efa/bin/fi_info ] && FI_CMD="/opt/amazon/efa/bin/fi_info"
+
+if [ -n "$FI_CMD" ]; then
+    EFA_OUTPUT=$($FI_CMD -p efa 2>&1)
+    if echo "$EFA_OUTPUT" | grep -q "provider: efa"; then
+        EFA_COUNT=$(echo "$EFA_OUTPUT" | { grep -c "provider: efa" 2>/dev/null; true; })
+        echo "[PASS] EFA provider available: $EFA_COUNT interface(s)"
+        echo "$EFA_OUTPUT" | grep "device:" | head -5
+
+        # Validate EFA count against expected per-instance-type counts. A subset
+        # of NICs silently failing to attach is a top NCCL failure mode (training
+        # runs at reduced bandwidth with no error). Counts per AWS EC2 docs.
+        IMDS_TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
+            -H "X-aws-ec2-metadata-token-ttl-seconds: 60" --connect-timeout 2 2>/dev/null || echo "")
+        if [ -n "$IMDS_TOKEN" ]; then
+            INST_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" \
+                http://169.254.169.254/latest/meta-data/instance-type --connect-timeout 2 2>/dev/null || echo "")
+            # Counts only included where AWS publishes them in the EC2 EFA
+            # docs (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html).
+            # For other types the doc lists bandwidth but not card count, so we
+            # skip the check rather than guess.
+            case "$INST_TYPE" in
+                p5.48xlarge|p5e.48xlarge)   EXPECTED_EFA=32 ;;
+                p5en.48xlarge)              EXPECTED_EFA=16 ;;
+                p4d.24xlarge)               EXPECTED_EFA=4 ;;
+                p6-b200.48xlarge)           EXPECTED_EFA=8 ;;
+                p6-b300.48xlarge)           EXPECTED_EFA=17 ;;
+                p6e-gb200.36xlarge)         EXPECTED_EFA=17 ;;
+                *)                          EXPECTED_EFA=0 ;;
+            esac
+            if [ "$EXPECTED_EFA" -gt 0 ] 2>/dev/null; then
+                if [ "$EFA_COUNT" -lt "$EXPECTED_EFA" ] 2>/dev/null; then
+                    echo "[FAIL] EFA count mismatch on ${INST_TYPE}: got ${EFA_COUNT}, expected ${EXPECTED_EFA}"
+                    echo "       A subset of NICs failed to attach — NCCL will run at reduced bandwidth"
+                else
+                    echo "[PASS] EFA count matches ${INST_TYPE} expected value (${EXPECTED_EFA})"
+                fi
+            else
+                echo "[INFO] EFA count validation skipped — no expected value for ${INST_TYPE:-unknown}"
+            fi
+        fi
+    else
+        # Determine whether EFA is expected — absence on non-EFA instance types
+        # (t3, c5, controllers) is normal, not a failure.
+        INST_TYPE_CHECK="${INST_TYPE:-}"
+        case "$INST_TYPE_CHECK" in
+            p4d.*|p4de.*|p5.*|p5e.*|p5en.*|p6*|trn1.*|trn2.*)
+                echo "[FAIL] EFA provider NOT available on ${INST_TYPE_CHECK}"
+                echo "  fi_info -p efa returned no results"
+                echo "  Required for NCCL on this instance type — training will fall back to TCP (very slow)"
+                ;;
+            *)
+                echo "[INFO] EFA provider not available — expected on non-EFA instance type (${INST_TYPE_CHECK:-unknown})"
+                ;;
+        esac
+    fi
+    TCP_COUNT=$($FI_CMD -p tcp 2>/dev/null | { grep -c "provider: tcp" 2>/dev/null; true; })
+    LF_VER=$($FI_CMD --version 2>&1 | grep libfabric | sed 's/.*: //' | head -1)
+    echo "  libfabric: ${LF_VER:-unknown}  |  TCP fallback endpoints: $TCP_COUNT"
+else
+    echo "[INFO] fi_info not found — EFA tools not installed (OK for non-EFA instances)"
+fi
+
+[ -f /opt/amazon/efa_installed_packages ] && \
+    grep "# EFA installer version" /opt/amazon/efa_installed_packages | head -1 \
+    || echo "[INFO] /opt/amazon/efa_installed_packages not found"
+
+# aws-ofi-nccl — bridges NCCL and EFA, required for GPU training on EFA instances
+OFI_LIB=$(find /opt/amazon/ofi-nccl /usr/local/lib /usr/lib /opt/aws-ofi-nccl/lib \
+    -maxdepth 4 -name "libnccl-net.so" 2>/dev/null | head -1)
+NCCL_NET_PLUGIN_ENV="${NCCL_NET_PLUGIN:-}"
+if [ -n "$NCCL_NET_PLUGIN_ENV" ]; then
+    [ -f "$NCCL_NET_PLUGIN_ENV" ] && \
+        echo "[PASS] NCCL_NET_PLUGIN=$NCCL_NET_PLUGIN_ENV (file exists)" || \
+        echo "[FAIL] NCCL_NET_PLUGIN=$NCCL_NET_PLUGIN_ENV — FILE NOT FOUND! NCCL EFA will fail"
+elif [ -n "$OFI_LIB" ]; then
+    echo "[PASS] aws-ofi-nccl plugin: $OFI_LIB"
+else
+    # FAIL only if FI_PROVIDER=efa is set; otherwise informational.
+    [ "${FI_PROVIDER:-}" = "efa" ] && \
+        echo "[FAIL] FI_PROVIDER=efa but aws-ofi-nccl plugin not found — NCCL EFA will fail" || \
+        echo "[INFO] aws-ofi-nccl not found (required for EFA+NCCL; not needed for CPU-only)"
+fi
+
+# Hugepages — improve EFA/RDMA memory registration performance
+HP_2M=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages 2>/dev/null || echo 0)
+HP_1G=$(cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages 2>/dev/null || echo 0)
+if [ "$HP_2M" -gt 0 ] 2>/dev/null; then
+    HP_FREE=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages 2>/dev/null || echo 0)
+    echo "[PASS] 2MB hugepages: ${HP_2M} total, ${HP_FREE} free"
+    [ "$HP_FREE" -eq 0 ] && echo "[WARN] All hugepages in use — RDMA may have reduced performance"
+elif [ "$HP_1G" -gt 0 ] 2>/dev/null; then
+    echo "[PASS] 1GB hugepages: ${HP_1G} allocated"
+else
+    echo "[INFO] No hugepages configured (set vm.nr_hugepages=512 for optimal EFA RDMA)"
+fi
+echo ""
+
+echo "--- NCCL ---"
+NCCL_LIB=$(find /usr/local/cuda*/lib* /usr/lib /opt/nccl/lib 2>/dev/null \
+    -maxdepth 4 -name "libnccl.so*" 2>/dev/null | head -3)
+if [ -n "$NCCL_LIB" ]; then
+    echo "[PASS] NCCL library found:"
+    echo "$NCCL_LIB" | while read -r l; do echo "  $l"; done
+else
+    echo "[INFO] NCCL library not found (install NCCL for distributed GPU training)"
+fi
+
+NCCL_HDR=$(find /usr/local/cuda*/include /usr/include /opt/nccl/include 2>/dev/null \
+    -maxdepth 3 -name "nccl.h" 2>/dev/null | head -1)
+if [ -n "$NCCL_HDR" ]; then
+    NCCL_VER=$(grep -E "NCCL_MAJOR|NCCL_MINOR|NCCL_PATCH" "$NCCL_HDR" 2>/dev/null \
+        | awk '{print $3}' | tr '\n' '.' | sed 's/\.$//')
+    [ -n "$NCCL_VER" ] && echo "  NCCL version: $NCCL_VER"
+fi
+
+CUDA_DRV=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 | tr -d ' ' || echo "")
+if [ -n "$CUDA_DRV" ] && [ -n "$NCCL_VER" ]; then
+    DRV_MAJOR=$(echo "$CUDA_DRV" | cut -d. -f1)
+    NCCL_MAJOR=$(echo "$NCCL_VER" | cut -d. -f1)
+    NCCL_MINOR=$(echo "$NCCL_VER" | cut -d. -f2)
+    # NCCL 2.20+ requires CUDA driver >= 525
+    if { [ "$NCCL_MAJOR" -gt 2 ] || { [ "$NCCL_MAJOR" -eq 2 ] && [ "${NCCL_MINOR:-0}" -ge 20 ]; }; } && [ "$DRV_MAJOR" -gt 0 ] && [ "$DRV_MAJOR" -lt 525 ] 2>/dev/null; then
+        echo "[WARN] NCCL $NCCL_VER may require CUDA driver >= 525; found $CUDA_DRV"
+        echo "       Symptom: 'NCCL function not found' on mixed-version nodes"
+    fi
+fi
+echo ""
+
+echo "--- Network Interfaces ---"
+ip -br addr show 2>/dev/null | while IFS= read -r line; do
+    IFACE=$(echo "$line" | awk '{print $1}')
+    STATE=$(echo "$line" | awk '{print $2}')
+    ADDR=$(echo "$line" | awk '{print $3}')
+    if   echo "$IFACE" | grep -q "^lo";                       then TYPE="loopback"
+    elif echo "$IFACE" | grep -qE "^efa|^rdma";               then TYPE="EFA device"
+    elif echo "$IFACE" | grep -qE "^ib[0-9]";                 then TYPE="InfiniBand"
+    elif echo "$IFACE" | grep -qE "^eth|^ens|^enp|^en[0-9]"; then TYPE="VPC ENI"
+    elif echo "$IFACE" | grep -qE "^docker|^br-|^veth";       then TYPE="container bridge"
+    else TYPE="other"; fi
+    printf "  %-18s %-8s %-20s (%s)\n" "$IFACE" "$STATE" "${ADDR:--}" "$TYPE"
+done
+echo ""
+
+echo "--- MTU ---"
+ip -br link show 2>/dev/null | grep -v "^lo" | while IFS= read -r line; do
+    IFACE=$(echo "$line" | awk '{print $1}')
+    MTU=$(ip link show "$IFACE" 2>/dev/null | grep -o "mtu [0-9]*" | awk '{print $2}')
+    [ -z "$MTU" ] && continue
+    if   echo "$IFACE" | grep -qE "docker|br-|veth"; then echo "  [INFO] $IFACE: MTU=$MTU (container bridge — OK)"
+    elif [ "$MTU" -ge 9000 ] 2>/dev/null;             then echo "  [PASS] $IFACE: MTU=$MTU (jumbo frames — optimal for EFA)"
+    else echo "  [WARN] $IFACE: MTU=$MTU — expected 9001 for EFA/RDMA (fragmentation risk for large tensors)"; fi
+done
+echo ""
+
+echo "--- Memory & Limits ---"
+free -h
+echo ""
+
+SHM_SIZE=$(df -BG /dev/shm 2>/dev/null | tail -1 | awk '{print $2}' | tr -d 'G')
+SHM_FS=$(df -T /dev/shm 2>/dev/null | tail -1 | awk '{print $2}' || echo "unknown")
+if [ -n "$SHM_SIZE" ] && [ "$SHM_SIZE" -ge 1 ] 2>/dev/null; then
+    echo "[PASS] /dev/shm: ${SHM_SIZE}GB (fs: ${SHM_FS})"
+    [ "$SHM_SIZE" -lt 4 ] 2>/dev/null && \
+        echo "[WARN] /dev/shm ${SHM_SIZE}GB < 4GB — consider 4GB+ for large model training"
+else
+    echo "[FAIL] /dev/shm: ${SHM_SIZE:-0}GB — NCCL needs ≥1GB (K8s default=64MB)"
+    echo "       Symptom: 'failed to extend /dev/shm/nccl-*' or Bus error"
+fi
+[ "$SHM_FS" != "tmpfs" ] && [ "$SHM_FS" != "unknown" ] && \
+    echo "[WARN] /dev/shm fs type: $SHM_FS (expected tmpfs)"
+
+MEMLOCK=$(ulimit -l 2>/dev/null || echo "unknown")
+if [ "$MEMLOCK" = "0" ]; then
+    echo "[FAIL] memlock=0 — InfiniBand/EFA RDMA memory registration WILL FAIL"
+    echo "       Symptom: 'NCCL WARN Call to ibv_reg_mr failed'"
+elif [ -n "$MEMLOCK" ] && [ "$MEMLOCK" != "unlimited" ] && [ "$MEMLOCK" -ge 8388608 ] 2>/dev/null; then
+    echo "[PASS] memlock=${MEMLOCK}KB (≥8GB — OK)"
+elif [ "$MEMLOCK" = "unlimited" ]; then
+    echo "[INFO] memlock=unlimited (OK for RDMA; see stack check below for libc quirk)"
+else
+    echo "[INFO] memlock=${MEMLOCK}KB"
+fi
+
+# Stack size — GNU libc quirk: when memlock=unlimited, thread stack is reduced to 2MB.
+# NCCL topology graph search (especially MNNVL on 256+ nodes) needs 8MB+ stack.
+STACK=$(ulimit -s 2>/dev/null || echo "unknown")
+if [ "$MEMLOCK" = "unlimited" ] && [ "$STACK" = "unlimited" ]; then
+    echo "[WARN] memlock=unlimited + stack=unlimited — GNU libc reduces NCCL thread stack to 2MB"
+    echo "       NCCL MNNVL/large topology graph search needs 8MB+ and will fail"
+elif [ "$STACK" = "unlimited" ]; then
+    echo "[PASS] stack=unlimited (memlock is bounded, so libc quirk does not apply)"
+elif [ "$STACK" != "unknown" ] && [ "$STACK" -lt 4096 ] 2>/dev/null; then
+    echo "[FAIL] stack=${STACK}KB — too small for NCCL topology search (need ≥4096KB)"
+else
+    echo "[PASS] stack=${STACK:-unknown}KB"
+fi
+
+# systemd RemoveIPC — deletes NCCL shm files when session ends (Slurm nodes)
+# Strip comment lines first; many distros ship logind.conf with `#RemoveIPC=yes`
+# as the documented default, which would false-WARN on a substring match.
+if [ -f /etc/systemd/logind.conf ]; then
+    REMOVEIPC=$(grep -v '^[[:space:]]*#' /etc/systemd/logind.conf 2>/dev/null \
+                  | grep -i "RemoveIPC" | tail -1 || echo "")
+    if [ -z "$REMOVEIPC" ]; then
+        echo "[WARN] RemoveIPC unset in /etc/systemd/logind.conf — defaults to 'yes' on RHEL/Amazon Linux"
+        echo "       Symptom: 'unlink shared memory /dev/shm/nccl-* failed: No such file'"
+    elif echo "$REMOVEIPC" | grep -qi "yes\|true\|1"; then
+        echo "[WARN] systemd RemoveIPC=yes — NCCL shm files will be deleted at session end"
+        echo "       Symptom: 'unlink shared memory /dev/shm/nccl-* failed: No such file'"
+    else
+        echo "[PASS] systemd RemoveIPC=no — NCCL shm files will not be deleted"
+    fi
+fi
+
+# cuMem NUMA (NCCL 2.23+)
+NUMA_NODES=$(ls /sys/devices/system/node/ 2>/dev/null | { grep -c "^node[0-9]" 2>/dev/null; true; })
+if [ "$NUMA_NODES" -gt 0 ] 2>/dev/null; then
+    echo "[PASS] NUMA topology: $NUMA_NODES node(s) visible (cuMem host alloc OK)"
+else
+    echo "[WARN] NUMA topology not visible — cuMem host allocations may fail"
+fi
+echo ""
+
+echo "--- NCCL RAS ---"
+# RAS port is configurable via NCCL_RAS_ADDR. Probe whatever is in the
+# training process's environment; skip if no candidate is found rather than
+# hard-coding a port that may not match every NCCL build.
+NC_CMD=$(command -v nc 2>/dev/null || command -v ncat 2>/dev/null || echo "")
+if [ -n "$NC_CMD" ]; then
+    RAS_PID=$(pgrep -f "python|torchrun|mpirun" 2>/dev/null | head -1)
+    RAS_ADDR=""
+    if [ -n "$RAS_PID" ] && [ -r "/proc/$RAS_PID/environ" ]; then
+        RAS_ADDR=$(tr '\0' '\n' < "/proc/$RAS_PID/environ" 2>/dev/null \
+                   | awk -F= '/^NCCL_RAS_ADDR=/{print $2}' | head -1)
+    fi
+    RAS_HOST="${RAS_ADDR%:*}"; RAS_PORT="${RAS_ADDR##*:}"
+    if [ -n "$RAS_PORT" ] && [ "$RAS_PORT" != "$RAS_ADDR" ]; then
+        RAS=$(echo "status" | timeout 3 $NC_CMD -w 2 "${RAS_HOST:-localhost}" "$RAS_PORT" 2>/dev/null || echo "")
+        if [ -n "$RAS" ]; then
+            echo "[PASS] NCCL RAS responding at ${RAS_HOST:-localhost}:${RAS_PORT}:"
+            echo "$RAS" | head -10
+        else
+            echo "[INFO] NCCL RAS port ${RAS_PORT} not responding — training job may not be using RAS, or RAS is disabled (NCCL_RAS_ENABLE=0)"
+        fi
+    else
+        echo "[INFO] NCCL_RAS_ADDR not set in any training process — skipping RAS probe (set NCCL_RAS_ADDR=<host>:<port> and re-run during training to enable)"
+    fi
+else
+    echo "[INFO] nc/ncat not found — cannot probe NCCL RAS"
+fi
+echo ""
+
+echo "--- Active Training Processes ---"
+PROCS=$(ps aux 2>/dev/null | grep -E "python|torchrun|mpirun|nccl_test" | grep -v grep | head -10)
+if [ -n "$PROCS" ]; then
+    echo "$PROCS"
+else
+    echo "[INFO] No active training processes"
+fi
+echo ""
+
+echo "--- Recent Hardware Errors (dmesg) ---"
+DMESG=$(dmesg 2>/dev/null | grep -iE "xid|nvrm|efa|ib_core|rdma|correctable|uncorrectable|acs|iommu" \
+    | tail -20 || echo "")
+if [ -n "$DMESG" ]; then
+    echo "$DMESG"
+else
+    echo "[PASS] No hardware errors in dmesg"
+fi
+
+# iptables / nftables — host-level firewall rules that block NCCL
+echo "--- Host Firewall (iptables/nftables) ---"
+IPT_DROP=0
+if command -v iptables &>/dev/null; then
+    IPT_DROP=$(iptables -L -n 2>/dev/null | grep -cE "DROP|REJECT" || echo 0)
+    if [ "$IPT_DROP" -gt 0 ] 2>/dev/null; then
+        echo "[WARN] iptables has $IPT_DROP DROP/REJECT rules — may block NCCL traffic"
+        iptables -L -n 2>/dev/null | grep -E "DROP|REJECT" | head -5
+        echo "       Verify NCCL ports (29400-29500, RDMA) are not blocked"
+    else
+        echo "[PASS] iptables: no DROP/REJECT rules"
+    fi
+elif command -v nft &>/dev/null; then
+    NFT_DROP=$(nft list ruleset 2>/dev/null | grep -cE "drop|reject" || echo 0)
+    if [ "$NFT_DROP" -gt 0 ] 2>/dev/null; then
+        echo "[WARN] nftables has $NFT_DROP drop/reject rules — may block NCCL traffic"
+    else
+        echo "[PASS] nftables: no drop/reject rules"
+    fi
+else
+    echo "[INFO] iptables/nftables not found"
+fi
+echo ""
+
+echo "--- Stale NCCL Shared Memory ---"
+STALE_SHM=$(ls /dev/shm/nccl-* 2>/dev/null || echo "")
+if [ -n "$STALE_SHM" ]; then
+    STALE_COUNT=$(echo "$STALE_SHM" | wc -l)
+    echo "[WARN] $STALE_COUNT stale NCCL shared memory file(s) found:"
+    echo "$STALE_SHM" | head -5
+    echo "       From a previous training run — may cause 'file exists' errors"
+else
+    echo "[PASS] No stale NCCL shared memory files"
+fi
+echo ""
+
+# EFA Latency Check (fi_ping) — catches degraded EFA ports (straggler #1 cause)
+echo "--- EFA Latency (fi_ping self-test) ---"
+FI_PING_CMD=""
+command -v fi_ping &>/dev/null && FI_PING_CMD="fi_ping"
+[ -z "$FI_PING_CMD" ] && [ -f /opt/amazon/efa/bin/fi_ping ] && FI_PING_CMD="/opt/amazon/efa/bin/fi_ping"
+
+if [ -n "$FI_PING_CMD" ]; then
+    # Self-ping on loopback — tests EFA stack without needing a second node
+    # A degraded EFA port shows high latency (>20us) even on self-ping
+    # Validate FI_PING_CMD is a known safe EFA binary path (not user-controlled)
+    if [[ ! "$FI_PING_CMD" =~ ^(/opt/amazon/efa/bin/fi_ping|fi_ping)$ ]]; then
+        echo "[SKIP] fi_ping path not recognised: $FI_PING_CMD"
+    else
+        # Try EFA provider first; if it succeeds, the result reflects EFA. If
+        # EFA isn't reachable on loopback (some kernels), fall back to TCP — but
+        # label it explicitly so a TCP latency isn't reported as if it were EFA.
+        # nosemgrep: ai.ai-best-practices.hooks-dns-exfiltration.hooks-dns-exfiltration.hooks-dns-exfiltration-generic -- FI_PING_CMD validated to known EFA binary path above; targets loopback 127.0.0.1
+        PING_OUT=$($FI_PING_CMD -p efa -I 10 127.0.0.1 2>/dev/null || echo "")
+        PROVIDER="efa"
+        if [ -z "$PING_OUT" ]; then
+            # nosemgrep: ai.ai-best-practices.hooks-dns-exfiltration.hooks-dns-exfiltration.hooks-dns-exfiltration-generic -- FI_PING_CMD validated above; loopback only
+            PING_OUT=$($FI_PING_CMD -p tcp -I 10 127.0.0.1 2>/dev/null || echo "")
+            PROVIDER="tcp"
+        fi
+        if [ -n "$PING_OUT" ]; then
+            LATENCY=$(echo "$PING_OUT" | grep -oE "[0-9]+\.[0-9]+ us" | tail -1 || echo "")
+            LAT_VAL=$(echo "$LATENCY" | grep -oE "[0-9]+" | head -1 || echo 0)
+            if [ "$PROVIDER" = "tcp" ]; then
+                # TCP loopback latency does NOT reflect EFA path health; an EFA
+                # straggler will not be visible here. Surface as INFO, not PASS/WARN.
+                echo "[INFO] fi_ping fell back to provider=tcp (EFA loopback unreachable) — latency=${LATENCY:-?}; this does NOT measure EFA path health"
+                echo "       For EFA latency, run fi_ping/fi_pingpong between two real nodes (not loopback) — see references/performance-testing.md"
+            elif [ -n "$LATENCY" ]; then
+                if [ "$LAT_VAL" -gt 20 ] 2>/dev/null; then
+                    echo "[WARN] fi_ping latency (provider=efa): $LATENCY (>20us — EFA port may be degraded; normal is 1-5us)"
+                    echo "       Impact: straggler AllReduce, training much slower than expected"
+                    echo "       Action: drain this node and replace via HyperPod API"
+                else
+                    echo "[PASS] fi_ping latency (provider=efa): $LATENCY"
+                fi
+            else
+                echo "[INFO] fi_ping (provider=efa) ran but no latency value extracted"
+                echo "$PING_OUT" | tail -3
+            fi
+        else
+            echo "[INFO] fi_ping self-test skipped (no EFA/TCP provider reachable)"
+        fi
+    fi
+else
+    echo "[INFO] fi_ping not found (install EFA tools for latency testing)"
+fi
+echo ""
+
+echo "=== END NODE DIAGNOSTICS ==="
+NODE_SCRIPT
+)
+
+# Strategy for 100s of nodes:
+#   1. Resolve all Running compute nodes via HyperPod API (paginated)
+#   2. Sample --sample-nodes (default 3) for SSM hardware checks
+#   3. Each SSM call has a 60s timeout
+#   4. Results show per-node summary; failures are highlighted. Nodes are
+#      tallied as PASS, FAIL, UNREACHABLE (SSM transport error) or SKIPPED
+#      (no GPU/EFA present); UNREACHABLE and SKIPPED are counted separately.
+#   5. This check does NOT increment ISSUES_FOUND (hardware checks are advisory)
+#      unless a critical hardware fault is detected (XID errors, EFA fail on GPU instance)
+check_node_hardware_via_ssm() {
+    header "Check 8: Node Hardware Checks (via SSM — runs ON cluster nodes)"
+
+    info "Resolving cluster nodes for SSM..."
+    if ! resolve_cluster_nodes_for_ssm; then
+        info "Could not resolve cluster nodes via HyperPod API"
+        info "  (DescribeCluster needs sagemaker:DescribeCluster on this cluster)"
+        info "  To check a specific node: --node <INSTANCE_ID>"
+        return
+    fi
+
+    if [[ ${#SSM_NODES[@]} -eq 0 ]]; then
+        info "No Running compute nodes found in cluster"
+        return
+    fi
+
+    local total_nodes="${#SSM_NODES[@]}"
+    info "Sampling $total_nodes node(s) for hardware checks (use --sample-nodes N for more)"
+    info "Cluster ID: $SSM_CLUSTER_ID"
+
+    local entry node_pass=0 node_warn=0 node_fail=0 node_skip=0
+
+    for entry in "${SSM_NODES[@]}"; do
+        local instance_id group_name
+        IFS=$' \t' read -r instance_id group_name _ <<< "$entry"
+        local target="sagemaker-cluster:${SSM_CLUSTER_ID}_${group_name}-${instance_id}"
+
+        section "Node: $instance_id ($group_name)"
+        info "  SSM target: $target"
+        info "  Connecting (timeout 60s)..."
+
+        local output
+        output=$(_ssm_run "$instance_id" "$group_name" "$SSM_CLUSTER_ID" "$_NODE_DIAG_SCRIPT")
+
+        # Detect SSM transport failures. Letting error text fall through as
+        # diagnostic output produces a misleading "0 [PASS]" finding.
+        if [[ -z "$output" ]] || echo "$output" | grep -qiE "SessionManagerPlugin|error.*session|not authorized|AccessDenied|Could not connect|^Cannot perform start session|EOF$|ThrottlingException|RequestLimitExceeded|InternalFailure|ServiceUnavailable|TargetNotConnected"; then
+            warn "  SSM connection failed for $instance_id → references/operations.md § 3 SSM target format (HyperPod)"
+            node_warn=$((node_warn + 1))
+            continue
+        fi
+
+        echo "$output"
+
+        local passes fails
+        passes=$(echo "$output" | { grep -c "\[PASS\]" 2>/dev/null; true; })
+        fails=$(echo "$output" | { grep -c "\[FAIL\]" 2>/dev/null; true; })
+
+        # Non-GPU / non-EFA nodes (controllers, logins, CPU families) sampled
+        # as a fallback. Flag as SKIP rather than PASS — a PASS on a node
+        # without GPU/EFA is meaningless for NCCL.
+        local is_non_gpu=false
+        if echo "$output" | grep -qE "^\[INFO\].*(CPU-only node|non-EFA instance|no GPU devices visible|nvidia-smi not found)" \
+            && ! echo "$output" | grep -qE "^\[PASS\] EFA provider available|^\[PASS\] GPU row-remap"; then
+            is_non_gpu=true
+        fi
+
+        if [[ "$fails" -gt 0 ]]; then
+            error "  Node $instance_id: $fails hardware issue(s) detected — see above"
+            node_fail=$((node_fail + 1))
+            # XID errors and EFA failure are cluster-level issues; test each on its own so a node with both records both.
+            if echo "$output" | grep -q "\[FAIL\] GPU XID"; then
+                ISSUES_FOUND=$((ISSUES_FOUND + 1))
+                add_issue_detail "XID errors on GPU hardware ($instance_id) → references/operations.md § 8 NCCL-specific remediations (Node reboot / replacement); hyperpod-node-debugger skill" "P0"
+            fi
+            if echo "$output" | grep -q "\[FAIL\] EFA provider NOT"; then
+                ISSUES_FOUND=$((ISSUES_FOUND + 1))
+                add_issue_detail "EFA provider failure on $instance_id → references/debugging-guide.md § 6 EFA Configuration / § 13 EFA TCP Fallback" "P0"
+            fi
+        elif $is_non_gpu; then
+            info "  Node $instance_id: no GPU/EFA present — skipping (NCCL checks apply only to GPU/EFA compute nodes)"
+            node_skip=$((node_skip + 1))
+        else
+            success "  Node $instance_id: hardware checks passed ($passes [PASS])"
+            node_pass=$((node_pass + 1))
+        fi
+    done
+
+    echo ""
+    info "Hardware check summary: $node_pass PASS | $node_warn UNREACHABLE | $node_skip SKIPPED | $node_fail FAIL"
+    if [[ "$node_fail" -gt 0 ]]; then
+        warn "  $node_fail node(s) have hardware issues — check above for details"
+        warn "  For ALL nodes: re-run with --sample-nodes <total> to check every node"
+    fi
+    if [[ "$node_warn" -gt 0 ]]; then
+        warn "  $node_warn node(s) unreachable via SSM — verify SSM agent and IAM permissions"
+    fi
+}
+
+# CloudWatch covers ALL nodes at once without per-node SSM calls.
+# This runs for EKS when K8S_CONNECTED=false (can't use kubectl logs).
+check_cloudwatch_nccl_logs() {
+    header "Check 6b: NCCL Pattern Analysis via CloudWatch"
+
+    local cluster_arn cluster_id
+    cluster_arn=$(aws sagemaker describe-cluster \
+        --cluster-name "$CLUSTER_NAME" --region "$REGION" \
+        --query 'ClusterArn' --output text 2>/dev/null || echo "")
+    cluster_id=$(echo "$cluster_arn" | awk -F'/' '{print $NF}')
+
+    if [[ -z "$cluster_id" || "$cluster_id" == "None" ]]; then
+        info "Cluster ID unavailable — skipping CloudWatch log analysis"
+        return
+    fi
+
+    local log_group="/aws/sagemaker/Clusters/${CLUSTER_NAME}/${cluster_id}"
+    info "CloudWatch log group: $log_group"
+
+    local lg_exists
+    lg_exists=$(aws logs describe-log-groups \
+        --log-group-name-prefix "$log_group" --region "$REGION" \
+        --query 'logGroups[0].logGroupName' --output text 2>&1) || {
+        if echo "$lg_exists" | grep -qiE "AccessDenied|UnauthorizedOperation"; then
+            warn "Permission denied: logs:DescribeLogGroups — check IAM policy"
+        fi
+        lg_exists=""
+    }
+
+    if [[ -z "$lg_exists" || "$lg_exists" == "None" ]]; then
+        info "CloudWatch log group not found — CloudWatch agent may not be configured"
+        info "  Enable the CloudWatch agent in the cluster's lifecycle script (see operations.md § 4)"
+        return
+    fi
+
+    local start_time=$(( ($(date +%s) - 7200) * 1000 ))
+    local patterns=(
+        "NCCL WARN" "Watchdog timeout" "Timeout waiting for"
+        "fi_getinfo failed" "unhandled system error" "nccl error"
+        "Connection refused" "NCCL_OFI_RDMA"
+    )
+
+    local found_any=false
+    for pattern in "${patterns[@]}"; do
+        local matches
+        matches=$(aws logs filter-log-events \
+            --log-group-name "$log_group" \
+            --filter-pattern "\"$pattern\"" \
+            --start-time "$start_time" \
+            --region "$REGION" \
+            --query 'events[*].{t:timestamp,s:logStreamName,m:message}' \
+            --output json 2>/dev/null || echo "[]")
+
+        local count
+        count=$(echo "$matches" | python3 -c \
+            "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo 0)
+
+        if [[ "$count" -gt 0 ]]; then
+            error "CloudWatch: '$pattern' found $count time(s) in last 2h:"
+            echo "$matches" | python3 -c "
+import sys,json,datetime
+events=json.load(sys.stdin)[:5]
+for e in events:
+    ts=datetime.datetime.utcfromtimestamp(e['t']//1000).strftime('%H:%M:%S')
+    stream=e['s'][:30]
+    msg=e['m'][:120].strip()
+    print(f'  [{ts}] {stream}: {msg}')
+" 2>/dev/null
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "CloudWatch pattern '$pattern' found ${count} time(s) → references/error-patterns-quick-ref.md" "P1"
+            found_any=true
+        fi
+    done
+
+    $found_any || success "No NCCL error patterns in CloudWatch logs (last 2h)"
+}
+
+# Slurm: run the command in $1 on the head node via SSM (start-session, not send-command)
+run_slurm_cmd_via_ssm() {
+    local cmd="$1"  # command line to execute on the selected node
+    # Returns 1 when no node can be resolved/selected, else _ssm_run's status.
+    if ! resolve_cluster_nodes_for_ssm; then
+        return 1
+    fi
+
+    # Paginate list-cluster-nodes so controller/head nodes in the last page
+    # of a large cluster aren't missed.
+    local all_nodes
+    all_nodes=$(sagemaker_list_paginated list-cluster-nodes ClusterNodeSummaries)
+    # Pick a controller/head/master group node, else the first Running node.
+    local head_entry
+    head_entry=$(echo "$all_nodes" | python3 -c "
+import sys,json
+nodes=json.load(sys.stdin).get('ClusterNodeSummaries',[])
+for n in nodes:
+    g=n.get('InstanceGroupName','').lower()
+    if any(x in g for x in ['controller','head','master']):
+        print(n['InstanceId'] + ' ' + n['InstanceGroupName'])
+        break
+else:
+    for n in nodes:
+        if n.get('InstanceStatus',{}).get('Status') == 'Running':
+            print(n['InstanceId'] + ' ' + n['InstanceGroupName'])
+            break
+" 2>/dev/null || echo "")
+
+    [[ -z "$head_entry" ]] && return 1
+
+    local iid grp
+    iid=$(echo "$head_entry" | awk '{print $1}')
+    grp=$(echo "$head_entry" | awk '{print $2}')
+    # head_entry is "<instance-id> <group-name>"; run the caller's command there.
+    _ssm_run "$iid" "$grp" "$SSM_CLUSTER_ID" "$cmd"
+}
+
+check_slurm_nodes() {
+    header "Check 2 [Slurm]: Node States"
+
+    local sinfo_output=""
+    if command -v sinfo &>/dev/null; then
+        sinfo_output=$(sinfo -o "%N %T %30E" --noheader 2>/dev/null || echo "")
+    else
+        sinfo_output=$(run_slurm_cmd_via_ssm "sinfo -o '%N %T %30E' --noheader" || echo "")
+    fi
+
+    # Treat SSM transport errors as retrieval failures, not as healthy state.
+    # Without this, "Cannot perform start session: EOF" is non-empty and falls
+    # through the empty-check below → grep finds no "down" → misleading [PASS].
+    if echo "$sinfo_output" | grep -qiE "^(Cannot perform start session|SessionManagerPlugin is not found)|EOF$|TargetNotConnected|ThrottlingException|RequestLimitExceeded|InternalFailure|ServiceUnavailable|AccessDenied|UnauthorizedOperation|not authorized to perform"; then
+        warn "Could not retrieve Slurm node states — SSM transient error after retries"
+        info "  Rerun the diagnostic; if persistent, delegate to hyperpod-ssm skill for manual probe."
+        return
+    fi
+
+    if [[ -z "$sinfo_output" ]]; then
+        warn "Could not retrieve Slurm node states"
+        return
+    fi
+
+    local down drained
+    down=$(echo "$sinfo_output" | grep -E "\bdown\b|\bdraining\b" | awk '{print $1}' || echo "")
+    drained=$(echo "$sinfo_output" | grep -E "\bdrained\b" | awk '{print $1}' || echo "")
+
+    if [[ -z "$down" && -z "$drained" ]]; then
+        success "All Slurm nodes: UP/IDLE/ALLOC — no NCCL-impacting states"
+    else
+        if [[ -n "$down" ]]; then
+            error "DOWN/DRAINING nodes: $down"
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "Slurm nodes DOWN/DRAINING: $down → references/operations.md § 7 Slurm — NCCL-specific operations" "P1"
+            while IFS= read -r node; do
+                [[ -z "$node" ]] && continue
+            done <<< "$(echo "$down" | tr ',' '\n')"
+        fi
+        [[ -n "$drained" ]] && warn "DRAINED nodes (not available): $drained"
+    fi
+
+    section "Slurm Job Queue"
+    local q=""
+    if command -v squeue &>/dev/null; then
+        q=$(squeue -o "%i %j %T %R %N" --noheader 2>/dev/null || echo "")
+    fi
+    if [[ -z "$q" ]]; then
+        q=$(run_slurm_cmd_via_ssm "squeue -o '%i %j %T %R %N' --noheader" 2>/dev/null || echo "")
+    fi
+
+    # Same SSM-error detection as above — without this, the error string is
+    # parsed as a job list and produces false "stuck" rows.
+    if echo "$q" | grep -qiE "^(Cannot perform start session|SessionManagerPlugin is not found)|EOF$|TargetNotConnected|ThrottlingException|RequestLimitExceeded|InternalFailure|ServiceUnavailable|AccessDenied|UnauthorizedOperation|not authorized to perform"; then
+        warn "Could not retrieve Slurm job queue — SSM transient error after retries"
+        q=""
+    fi
+
+    if [[ -z "$q" ]]; then
+        info "No jobs in queue"
+    else
+        local stuck
+        stuck=$(echo "$q" | grep -E "PENDING|COMPLETING" | head -10 || echo "")
+        if [[ -n "$stuck" ]]; then
+            warn "Stuck PENDING/COMPLETING jobs:"
+            echo "$stuck"
+            ISSUES_FOUND=$((ISSUES_FOUND+1))
+            add_issue_detail "Stuck PENDING/COMPLETING Slurm jobs → references/operations.md § 7 Slurm — NCCL-specific operations" "P1"
+        else
+            success "No stuck jobs in queue"
+        fi
+        info "Queue (top 10):"; echo "$q" | head -10
+    fi
+}
+
+check_slurm_nccl_logs() {  # Slurm Check 6: prints its own header, then delegates to the CloudWatch scan
+    header "Check 6 [Slurm]: NCCL Log Pattern Analysis"
+    check_cloudwatch_nccl_logs
+}
+
+check_slurm_nccl_env() {
+    header "Check 7 [Slurm]: NCCL Environment Variable Audit (via SSM)"
+
+    local env_check
+    env_check=$(run_slurm_cmd_via_ssm \
+        "{ cat /etc/profile.d/nccl.sh /opt/ml/config/nccl.conf /etc/slurm/prolog.d/*.sh 2>/dev/null; env; } \
+         | grep -E '^(NCCL_|FI_|MASTER_)' | sort -u | head -30 || echo '(none)'" \
+        2>/dev/null || echo "")
+
+    # If SSM returned a transport error, don't interpret it as the controller's
+    # env output — that produces false "FI_PROVIDER=efa not set" warnings.
+    if echo "$env_check" | grep -qiE "^(Cannot perform start session|SessionManagerPlugin is not found)|EOF$|TargetNotConnected|ThrottlingException|RequestLimitExceeded|InternalFailure|ServiceUnavailable|AccessDenied|UnauthorizedOperation|not authorized to perform"; then
+        warn "Could not retrieve NCCL env vars from controller — SSM transient error after retries"
+        info "  Rerun the diagnostic; if persistent, delegate to hyperpod-ssm skill."
+        return
+    fi
+
+    if [[ -n "$env_check" && "$env_check" != "(none)" ]]; then
+        info "NCCL/EFA env vars on head node:"
+        echo "$env_check" | while IFS= read -r line; do info "  $line"; done
+
+        local warn_count=0
+        if echo "$env_check" | grep -q "NCCL_DEBUG=INFO"; then
+            warn "NCCL_DEBUG=INFO detected — verbose logging adds runtime overhead. Set NCCL_DEBUG=WARN for production."
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "NCCL_DEBUG=INFO in Slurm env (set NCCL_DEBUG=WARN in production) → references/operations.md § 5 NCCL environment variable reference" "P1"
+            warn_count=$((warn_count + 1))
+        fi
+        if echo "$env_check" | grep -q "NCCL_DEBUG=TRACE"; then
+            warn "NCCL_DEBUG=TRACE detected — TRACE prints replayable trace info on every NCCL call (large overhead and verbose logs). Set NCCL_DEBUG=WARN immediately."
+            ISSUES_FOUND=$((ISSUES_FOUND + 1))
+            add_issue_detail "NCCL_DEBUG=TRACE in Slurm env (set NCCL_DEBUG=WARN immediately) → references/operations.md § 5 NCCL environment variable reference" "P0"
+            warn_count=$((warn_count + 1))
+        fi
+        if ! echo "$env_check" | grep -q "FI_PROVIDER=efa"; then
+            warn "FI_PROVIDER=efa not set — EFA may not be used for NCCL transport"
+            warn_count=$((warn_count + 1))
+        fi
+        if ! echo "$env_check" | grep -q "NCCL_SOCKET_IFNAME"; then
+            warn "NCCL_SOCKET_IFNAME not set — NCCL may pick wrong interface. Recommend: ^lo,docker,efa,veth"
+            warn_count=$((warn_count + 1))
+        fi
+        if [[ "$warn_count" -eq 0 ]]; then
+            success "System-level NCCL env vars look correct"
+        fi
+    else
+        info "No NCCL env vars found in system config on head node"
+        info "  (Expected — NCCL vars are typically set in job scripts, not system-wide)"
+    fi
+}
+
+check_slurm_controller_health() {
+    # Slurm controller health — retry up to 3× before declaring it down, because
+    # SSM cold-start / session-service EOF errors are common on the first call.
+    header "Check 0 [Slurm]: Controller Health"
+    # Match the status word itself ("is UP", or the combined "are UP/DOWN" form); a loose
+    # case-insensitive "slurmctld.*UP" also matches "Slurmctld(backup) ... is DOWN".
+    local ping_result="" up_re='(is|are) (DOWN/)?UP'
+    for _ in 1 2 3; do
+        ping_result=$(run_slurm_cmd_via_ssm "scontrol ping 2>/dev/null" || echo "")
+        echo "$ping_result" | grep -qE "$up_re" && break
+        sleep 3
+    done
+    if echo "$ping_result" | grep -qE "$up_re"; then
+        success "slurmctld is responsive"
+    elif echo "$ping_result" | grep -qiE "AccessDenied|UnauthorizedOperation|not authorized to perform"; then
+        # IAM denial ≠ Slurm failure. Reporting "slurmctld down" would be wrong
+        # and would send the customer down a Slurm-rescue path for an IAM issue.
+        warn "Could not check slurmctld — caller lacks ssm:StartSession on this cluster"
+        info "  Grant ssm:StartSession on the HyperPod cluster ARN and rerun."
+    elif echo "$ping_result" | grep -qiE "Cannot perform start session|SessionManager|EOF$|TargetNotConnected|ConnectTimeout|ServiceError|ThrottlingException|RequestLimitExceeded|InternalFailure|ServiceUnavailable"; then
+        # Transport-level SSM errors — not a Slurm failure. Downgrade to WARN.
+        warn "Could not reach controller via SSM (transient): $(echo "$ping_result" | head -1)"
+        info "  Rerun the diagnostic; if the error persists, delegate to hyperpod-ssm skill."
+    elif [[ -n "$ping_result" ]]; then
+        error "slurmctld not responding — all Slurm operations blocked"
+        info "  Controller response: $(echo "$ping_result" | head -1)"
+        ISSUES_FOUND=$((ISSUES_FOUND + 1))
+        add_issue_detail "slurmctld down on controller → references/operations.md § 7 Slurm — NCCL-specific operations" "P0"
+    else
+        info "Could not reach controller via SSM — slurmctld status unknown"
+    fi
+
+    local munge_result
+    munge_result=$(run_slurm_cmd_via_ssm "systemctl is-active munge 2>/dev/null || echo munge_inactive" || echo "")
+    if echo "$munge_result" | grep -q "^active"; then
+        success "munge authentication service active"
+    elif echo "$munge_result" | grep -q "munge_inactive"; then
+        error "munge service inactive — Slurm auth will fail"
+        ISSUES_FOUND=$((ISSUES_FOUND + 1))
+        add_issue_detail "munge service inactive → references/operations.md § 7 Slurm — NCCL-specific operations" "P0"
+    fi
+}
+
+run_slurm_checks() {
+    check_slurm_controller_health
+    check_cluster_health
+    check_slurm_nodes
+    check_cluster_events
+    check_security_groups
+    check_slurm_nccl_logs
+    check_slurm_nccl_env
+    check_node_hardware_via_ssm
+}
+
+print_summary() {
+    header "NCCL Diagnostic Summary"
+    echo ""
+    echo -e "  Cluster:      ${BOLD}$CLUSTER_NAME${RESET}"
+    echo -e "  Region:       ${BOLD}$REGION${RESET}"
+    echo -e "  Orchestrator: ${BOLD}${ORCHESTRATOR^^}${RESET}"
+    [[ "$ORCHESTRATOR" == "eks" ]] && \
+        echo -e "  Namespace:    ${BOLD}${NAMESPACE:-all}${RESET}"
+    [[ -n "$JOB_NAME" ]]  && echo -e "  Job:          ${BOLD}$JOB_NAME${RESET}"
+    [[ -n "$NODE_ID" ]]   && echo -e "  Node:         ${BOLD}$NODE_ID${RESET}"
+    echo -e "  Mode:         ${BOLD}READ-ONLY${RESET} (no changes applied)"
+    echo ""
+    echo -e "  ┌──────────────────────────────────┐"
+    echo -e "  │  Issues Found:  ${RED}${BOLD}$ISSUES_FOUND${RESET}                │"
+    echo -e "  └──────────────────────────────────┘"
+
+    if [[ ${#ISSUE_DETAILS[@]} -gt 0 ]]; then
+        echo ""
+        echo "  Issue Details (prioritized):"
+        for priority in P0 P1 P2; do
+            local has_items=false
+            for detail in "${ISSUE_DETAILS[@]}"; do
+                if [[ "$detail" == "${priority}|"* ]]; then
+                    if ! $has_items; then
+                        case "$priority" in
+                            P0) echo -e "    ${RED}${BOLD}[$priority — Fix Immediately]${RESET}" ;;
+                            P1) echo -e "    ${YELLOW}${BOLD}[$priority — Fix Soon]${RESET}" ;;
+                            P2) echo -e "    ${BOLD}[$priority — Advisory]${RESET}" ;;
+                        esac
+                        has_items=true
+                    fi
+                    echo "      → ${detail#*|}"
+                fi
+            done
+        done
+    fi
+    echo ""
+
+    if [[ "$ISSUES_FOUND" -eq 0 ]]; then
+        success "No actionable NCCL issues detected — cluster looks healthy"
+        echo ""
+        info "If training is still hanging, check:"
+        echo "  1. CloudWatch: aws logs filter-log-events --log-group-name /aws/sagemaker/Clusters/$CLUSTER_NAME/..."
+        echo "  2. Version check: hyperpod-version-checker skill"
+        echo "  3. Full diagnostics: hyperpod-issue-report skill"
+    else
+        warn "$ISSUES_FOUND issue(s) found — see the Issue Details list above."
+        warn "Each issue line includes a reference pointer (→ references/<file>.md § <section>)."
+        warn "The hyperpod-nccl skill will read these findings, look up the matching section,"
+        warn "and guide you through remediation. This script does not modify cluster state."
+    fi
+    echo ""
+    echo -e "${BOLD}References:${RESET}"
+    echo "  Debugging guide:  references/debugging-guide.md"
+    echo "  Operations:       references/operations.md"
+    echo "  Performance test: references/performance-testing.md"
+    echo ""
+}
+
+# Entry point. Detects the orchestrator, prints the run banner, runs the
+# orchestrator-specific read-only checks, prints the summary, then exits.
+# Exit status: 0 when ISSUE_DETAILS holds no P0/P1 entry, 1 otherwise.
+main() {
+    header "NCCL Diagnostic — SageMaker HyperPod (read-only)"
+
+    detect_orchestrator
+
+    echo -e "  Cluster:      ${BOLD}$CLUSTER_NAME${RESET}"
+    echo -e "  Region:       ${BOLD}$REGION${RESET}"
+    echo -e "  Orchestrator: ${BOLD}${ORCHESTRATOR^^}${RESET}"
+    if [[ "$ORCHESTRATOR" == "eks" ]]; then
+        echo -e "  Namespace:    ${BOLD}${NAMESPACE:-all}${RESET}"
+    fi
+    info "READ-ONLY DIAGNOSTIC — no cluster state will be modified."
+    info "This script collects signals only. The hyperpod-nccl skill interprets findings"
+    info "and looks up remediation in references/*.md."
+    echo ""
+
+    check_prerequisites
+
+    if [[ "$ORCHESTRATOR" != "slurm" ]]; then
+        info "Running EKS NCCL diagnostics..."
+
+        # These checks run whether or not kubectl is authenticated.
+        check_cluster_health
+        check_cluster_events
+        check_security_groups
+
+        if ! $K8S_CONNECTED; then
+            warn "K8s checks skipped (2, 2b, 5, 5b, 6, 7, 9) — kubectl not authenticated"
+            # The CloudWatch log analysis has no kubectl dependency.
+            check_cloudwatch_nccl_logs
+        else
+            check_k8s_nodes
+            check_efa_k8s
+            check_pod_status
+            check_nccl_infra_prereqs
+            analyze_nccl_logs
+            check_nccl_env_vars
+            check_network_policies
+        fi
+
+        check_node_hardware_via_ssm
+    else
+        info "Running Slurm NCCL diagnostics..."
+        run_slurm_checks
+    fi
+
+    print_summary
+
+    # Only P0/P1 entries make the run fail; P2 entries are informational.
+    local _blocking=0
+    for _issue in "${ISSUE_DETAILS[@]:-}"; do
+        # The ":-" default yields one empty word for an empty array; skip it.
+        if [[ -n "$_issue" ]]; then
+            case "${_issue%%|*}" in
+                P0|P1) _blocking=$((_blocking + 1)) ;;
+            esac
+        fi
+    done
+    if [[ "$_blocking" -eq 0 ]]; then
+        exit 0
+    fi
+    exit 1
+}
+
+main "$@"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-node-debugger/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/SKILL.md
new file mode 100644
index 00000000..84bf226e
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/SKILL.md
@@ -0,0 +1,269 @@
+---
+name: hyperpod-node-debugger
+description: Diagnose and remediate per-node issues on a HyperPod cluster (EKS or Slurm) — a specific node is unhealthy, unresponsive, stuck, or needs replacing. Covers on-node EFA, GPU / accelerator hardware (XID, ECC, NVLink, row-remap, DCGM), Slurm node down/drained, disk and memory pressure, per-node lifecycle-script failures, SSM agent, container runtime, kernel panics, pod networking. Read-only. Not for cluster-wide provisioning (→ hyperpod-cluster-debugger), NCCL (→ hyperpod-nccl), or MFU (→ hyperpod-performance-debugger).
+metadata:
+  version: "0.0.1"
+---
+
+# HyperPod Node Debugger
+
+**Operating policy.** Run read-only diagnostics yourself. Never run a command that changes cluster, node, or workload state — present each one as a **Suggested command (run this yourself)** block and wait for the customer. Destructive order: **investigate → reboot → replace** (replace destroys root + secondary volumes; not supported on Slurm controller nodes). Never discard training state, logs, or caches on speculation.
+
+**IaC note (always include with mutation commands).** When you suggest any command that changes cluster, VPC, SG, subnet, or EKS configuration (e.g. `authorize-security-group-*`, `modify-vpc-attribute`, `update-cluster`, `kubectl label/cordon/drain`, `create namespace`, `set env daemonset`), ask the customer first whether the cluster / VPC / SG is managed by Infrastructure-as-Code (CloudFormation, CDK, Terraform, Pulumi). If yes, tell them: "Apply this change in your IaC source first, then deploy through the pipeline — running the command directly will drift from your template and the next stack update may overwrite it." If they need to fix the issue immediately and the IaC change will follow, flag the drift explicitly so they remember to reconcile.
+
+Read-only triage. `scripts/triage-cluster.sh` (and helpers `check-efa-sg.sh`, `check-node-reachability.sh`, `check-vpc-config.sh`) read state and print each issue as `[FAIL] ... → references/node-diagnostics-detail.md § <section>`. Catalog of customer-ticket patterns: [references/node-issue-catalog.md](references/node-issue-catalog.md).
+
+---
+
+## Workflow
+
+1. Collect cluster name, region, suspect instance ID, exact error string from logs.
+2. Run `scripts/triage-cluster.sh` (add `--node <INSTANCE-ID>` to focus one node).
+3. For every `[FAIL]` / issue entry, `Read` the referenced section.
+4. Present: what the script detected (copy the line verbatim), root cause, exact command(s) with instance/SG IDs filled in, blast radius (e.g. "reboots i-xxx", "wipes volumes on replacement"). For any command that mutates cluster/VPC/SG/EKS state, ask whether the affected resource is IaC-managed and surface the drift warning from the operating-policy note above.
+5. Wait for explicit customer approval. Destructive order: investigate → reboot → replace.
+6. Re-run triage to confirm. Iterate if not cleared.
+
+## Step 1: Triage
+
+```bash
+bash scripts/triage-cluster.sh --cluster <CLUSTER_NAME_OR_ARN> --region <REGION>
+
+# Focus on one node:
+bash scripts/triage-cluster.sh --cluster <CLUSTER_NAME_OR_ARN> --region <REGION> --node <INSTANCE_ID>
+```
+
+One pass collects: cluster status + NodeRecovery, events, per-node health (HyperPod + EKS labels, Slurm states), VPC/SG snapshot, CloudWatch availability, SSM readiness, on-node resource checks (disk, memory, /dev/shm, OOM, NVMe, time sync, SSM agent), Slurm node→instance mapping.
+
+Tags: `[PASS]` passed · `[FAIL]` issue with a `→ references/...` pointer · `[WARN]` advisory · `[INFO]` informational. Priorities: **P0** blocks operation · **P1** degraded · **P2** informational.
+
+## Step 2: Match signal → section
+
+**Events (`list-cluster-events`) — provisioning-time:**
+
+| Event                                                                       | Section                                                         |
+| --------------------------------------------------------------------------- | --------------------------------------------------------------- |
+| `"EFA health checks did not run successfully"` (public-doc verbatim signal) | **[A: EFA/SG](#a-efa--security-group)**                         |
+| Instance bootstrap or network-misconfiguration event                        | **[A](#a-efa--security-group)** + **[B: VPC](#b-vpc--routing)** |
+| Lifecycle-script failure or timeout                                         | **[D: Lifecycle](#d-lifecycle-scripts)**                        |
+| Insufficient-capacity or AZ-mismatch failure at creation                    | **[C: Capacity](#c-capacity--az)**                              |
+| Hardware failure / `UnschedulablePendingReplacement`                        | **[F: Hardware](#f-hardware--auto-repair)**                     |
+
+**EKS labels:**
+
+| Label                                                 | Section                                                          |
+| ----------------------------------------------------- | ---------------------------------------------------------------- |
+| `node-health-status: UnschedulablePendingReplacement` | **[F](#f-hardware--auto-repair)**                                |
+| `node-health-status: UnschedulablePendingReboot`      | **[F](#f-hardware--auto-repair)**                                |
+| `deep-health-check-status: Failed`                    | **[G](#g-gpu--accelerator)** → **[F](#f-hardware--auto-repair)** |
+
+**Symptoms:**
+
+| Symptom                                                  | Section                                                         |
+| -------------------------------------------------------- | --------------------------------------------------------------- |
+| Training hangs at NCCL init / AllReduce                  | **[A](#a-efa--security-group)** → **[E](#e-software-versions)** |
+| Slurm node `down` / `"Node unexpectedly rebooted"`       | **[H: Slurm](#h-slurm-node-management)**                        |
+| Jobs stuck PENDING / COMPLETING                          | **[H](#h-slurm-node-management)**                               |
+| Auto-repair not triggering                               | **[F](#f-hardware--auto-repair)**                               |
+| GPU not visible / XID / ECC errors                       | **[G](#g-gpu--accelerator)**                                    |
+| GPU row-remap pending/failed / silent NaNs / DCGM Fail   | **[G § G.1.a/b](#g-gpu--accelerator)**                          |
+| Disk full / OOM / `"Cannot allocate memory"`             | **[I: Resources](#i-resource-exhaustion)**                      |
+| Wrong vCPU count (e.g. 96 instead of 192 on p5.48xlarge) | **[J: Config](#j-configuration)**                               |
+| Container CrashLoopBackOff / runtime crash               | **[M: Container Runtime](#m-container-runtime)**                |
+| `aws-node` CrashLoopBackOff / gRPC 50051 refused         | **[O: CNI / Pod Networking](#o-cni--pod-networking)**           |
+| Pods stuck Pending with no IP / CNI error                | **[O](#o-cni--pod-networking)**                                 |
+| DNS resolution / `enableDnsSupport`                      | **[B § B.2](#b-vpc--routing)**                                  |
+| Public subnet / IGW misconfigured                        | **[B § B.3](#b-vpc--routing)**                                  |
+| Missing VPC endpoints (ECR / STS / FSx)                  | **[B § B.4](#b-vpc--routing)**                                  |
+| EKS VPC / SG mismatch with HyperPod                      | **[B § B.5](#b-vpc--routing)**                                  |
+| Kernel panic / watchdog / hung task                      | **[N: Kernel](#n-kernel--system)**                              |
+| Need shell on a node                                     | **[K: SSM](#k-node-access-via-ssm)**                            |
+| Collect logs for AWS Support                             | **[L: Log Collection](#l-log-collection)**                      |
+
+---
+
+## A: EFA / Security Group
+
+Per the HyperPod prerequisites doc, the SG must allow all inbound and outbound to itself. `scripts/check-efa-sg.sh` validates self-ref rules on every cluster SG. On-node EFA check via `scripts/check-node-reachability.sh` over SSM. Full: [§ A](references/node-diagnostics-detail.md#a-efa--security-group).
+
+## B: VPC / Routing
+
+SG/subnet VPC mismatch, missing S3 Gateway endpoint, EKS auth mode, worker→controller routing, VPC DNS support, private-subnet + NAT / VPC endpoints, EKS↔HyperPod VPC alignment. `scripts/check-vpc-config.sh`. Full: [§ B](references/node-diagnostics-detail.md#b-vpc--routing).
+
+## C: Capacity / AZ
+
+Insufficient-capacity failure at creation, or no subnets in the AZ where capacity is available. Check AZ offerings via `describe-instance-type-offerings`, then change subnet AZ or use Flexible Training Plans / ODCR. Full: [§ C](references/node-diagnostics-detail.md#c-capacity--az).
+
+## D: Lifecycle Scripts
+
+Surfaced in cluster events + CloudWatch under `LifecycleConfig/<group>/<instance-id>`. Common: S3 connectivity, IAM gaps, CRLF line endings, infinite loops, parameter-name mismatch. Full: [§ D](references/node-diagnostics-detail.md#d-lifecycle-scripts).
+
+## E: Software Versions
+
+Delegate to `hyperpod-version-checker` to compare NVIDIA driver, CUDA, NCCL, EFA installer, OFI NCCL, PyTorch across nodes. Ensure job env has `FI_PROVIDER=efa`, `FI_EFA_USE_DEVICE_RDMA=1`, `NCCL_SOCKET_IFNAME=^lo,docker`. Full: [§ E](references/node-diagnostics-detail.md#e-software-versions).
+
+## F: Hardware / Auto-Repair
+
+Confirm `NodeRecovery=Automatic`, inspect the EKS health labels + `sagemaker.amazonaws.com/fault-details` annotation, and read the `SagemakerHealthMonitoringAgent/<group>/<instance>` CloudWatch stream. HMA runs passive background checks on GPU and Neuron state and **reboots** the node on count mismatch (per the HMA doc: "if there's a mismatch between the expected number of GPUs … and the count returned by `nvidia-smi`, then HMA reboots the node"; same for `neuron-ls`). Manual recovery order: reboot first, replace only if reboot fails; the preferred path is the batch APIs (`BatchRebootClusterNodes` / `BatchReplaceClusterNodes`). Full: [§ F](references/node-diagnostics-detail.md#f-hardware--auto-repair) · patterns: [node-issue-catalog.md](references/node-issue-catalog.md).
+
+## G: GPU / Accelerator
+
+**NVIDIA (p4d/p5/g5/g6):** `nvidia-smi` + `dmesg` over SSM for Xid, ECC, thermal throttling. Xid classification per NVIDIA's catalog: 13 Graphics Engine Exception (application-level), 31 GPU memory page fault (application, can be driver/HW), 63 GPU memory remapping event (HW/ECC), 71 CE4 Error (HW copy engine), 74 NVLink Error (HW), 79 GPU has fallen off the bus (PCIe bus), 109 Context Switch Timeout Error (HW). Any uncorrectable ECC → drain and replace. Row-remap state is the authoritative silent-degradation signal (§ G.1.a).
+
+**Trainium / Inferentia (trn1/trn2/inf2):** Neuron SDK — `neuron-ls`, `neuron-top`, `neuron-monitor`. `nvidia-smi` does not apply.
+
+GPU / accelerator failures flow into § F for reboot / replace. Full: [§ G](references/node-diagnostics-detail.md#g-gpuaccelerator).
+
+## H: Slurm Node Management
+
+Node down/unresponsive, unexpected reboots, stuck PENDING/COMPLETING jobs, Slurm-to-instance-ID translation. Primary access is SSM; diagnose `slurmd` first, fix the root cause, then start/resume the node per § H. Full: [§ H](references/node-diagnostics-detail.md#h-slurm-node-management).
+
+## I: Resource Exhaustion
+
+Disk full (HyperPod root volume defaults to 100 GB and is not intended to grow post-creation), OOM, `os.fork()` memory error, `/dev/shm` exhaustion, inode exhaustion. Fork-memory fix: `export FI_EFA_USE_HUGE_PAGE=0`. Redirect bulk data to `/opt/sagemaker` (secondary EBS) or `/opt/dlami/nvme` (instance store). Full: [§ I](references/node-diagnostics-detail.md#i-resource-exhaustion).
+
+## J: Configuration
+
+p5.48xlarge reports 96 vCPU instead of 192 → set `ThreadsPerCore=2` via `update-cluster`. Full: [§ J](references/node-diagnostics-detail.md#j-configuration).
+
+## K: Node Access via SSM
+
+No direct SSH on HyperPod. Target format `sagemaker-cluster:<CLUSTER_ID>_<GROUP>-<INSTANCE_ID>`. Failures: plugin missing, wrong prefix, IAM, VPC endpoints. Full: [§ K](references/node-diagnostics-detail.md#k-node-access-via-ssm).
+
+## L: Log Collection
+
+Delegate to `hyperpod-issue-report` for S3-stored bundles. Key CloudWatch streams: `LifecycleConfig/<group>/<instance-id>`, `SagemakerHealthMonitoringAgent/<group>/<instance-id>`. Full: [§ L](references/node-diagnostics-detail.md#l-log-collection).
+
+## M: Container Runtime
+
+CrashLoopBackOff, OOMKilled, ImagePullBackOff, RunContainerError on EKS. `kubectl describe pod` + on-node `crictl ps -a`, `journalctl -u containerd`. Full: [§ M](references/node-diagnostics-detail.md#m-container-runtime).
+
+## N: Kernel & System
+
+Kernel panic, watchdog timeout, soft lockup, unexpected reboots not explained by HyperPod health monitoring. `dmesg | grep -iE 'panic|watchdog|hung_task|NMI'` + `journalctl -b -1`. nvrm-related signatures point at NVIDIA driver crashes. Full: [§ N](references/node-diagnostics-detail.md#n-kernel--system).
+
+## O: CNI / Pod Networking
+
+VPC CNI (`aws-node`) failures, IPAMD errors, gRPC 127.0.0.1:50051 refused, pods stuck `Pending` with `FailedCreatePodSandBox`. Script auto-checks `aws-node`, `kube-proxy`, CoreDNS. Full: [§ O](references/node-diagnostics-detail.md#o-cni--pod-networking).
+
+---
+
+## Prerequisites
+
+- `aws` CLI v2, recent enough to support the HyperPod cluster commands (`describe-cluster`, `list-cluster-nodes`, `batch-reboot-cluster-nodes`, `batch-replace-cluster-nodes`)
+- `python3`, `bash` 4+ (associative arrays are required by the scripts)
+- `kubectl` authenticated to the EKS cluster (K8s checks skipped if absent)
+- `session-manager-plugin` for on-node hardware checks
+- `unbuffer` (from the `expect` package) — optional; if missing, SSM on-node probes are skipped while the rest of the triage still runs. Install via `yum install expect` / `apt install expect`.
+
+## Defaults
+
+- **Region** — required: pass `--region` or set `$AWS_DEFAULT_REGION`.
+- **Target scope** — all nodes; `--node <ID>` focuses one.
+- **Event window** — up to 500 most recent events (5 × 100, paginated).
+- **Node list cap** — up to 20,000 nodes (200 × 100); warns on cap.
+- **SSM probes** — 180 s per node with retry-on-throttle.
+- **Colors** — auto-disabled on non-TTY; `--no-color` to force off.
+
+## Error handling
+
+| Failure                                         | Script                                                 | Tell the customer                                                    |
+| ----------------------------------------------- | ------------------------------------------------------ | -------------------------------------------------------------------- |
+| `aws sts get-caller-identity` fails             | Exit 1                                                 | "Fix AWS credentials and rerun."                                     |
+| `describe-cluster` fails                        | Exit 1 after listing region's clusters                 | "Confirm cluster name and region."                                   |
+| `sagemaker:*` / `ec2:*` / `logs:*` AccessDenied | Warn, add `Missing IAM permission for <API>`, continue | "Grant the listed IAM action and rerun."                             |
+| `kubectl` absent or unauthenticated             | Skip K8s checks                                        | "Install/authenticate kubectl (see § K)."                            |
+| `session-manager-plugin` absent                 | Skip on-node probes                                    | "Install session-manager-plugin (see § K)."                          |
+| SSM `start-session` fails or times out (180s)   | Mark node unreachable with `→ § K` pointer             | "Rerun with `--node <ID>` to isolate; verify SSM agent on the node." |
+| Cluster > 20,000 nodes                          | First 20,000 paginated; warn                           | "Use `--node` to target specific nodes."                             |
+
+Exit codes: `0` triage complete · `1` cluster not found or fatal prerequisite missing.
+
+## IAM permissions
+
+Read-only diagnostic — covers `triage-cluster.sh`, `check-efa-sg.sh`, `check-vpc-config.sh`, and `check-node-reachability.sh`:
+
+```json
+{
+  "Action": [
+    "sagemaker:DescribeCluster",
+    "sagemaker:DescribeClusterNode",
+    "sagemaker:ListClusterNodes",
+    "sagemaker:ListClusterEvents",
+    "sagemaker:ListClusters",
+    "eks:DescribeCluster",
+    "ec2:DescribeSecurityGroups",
+    "ec2:DescribeSubnets",
+    "ec2:DescribeVpcs",
+    "ec2:DescribeVpcAttribute",
+    "ec2:DescribeVpcEndpoints",
+    "ec2:DescribeRouteTables",
+    "ec2:DescribeNetworkInterfaces",
+    "ec2:DescribeInstances",
+    "ec2:DescribeInstanceTypeOfferings",
+    "ec2:DescribeInstanceTypes",
+    "logs:DescribeLogGroups",
+    "logs:DescribeLogStreams",
+    "logs:FilterLogEvents",
+    "ssm:StartSession",
+    "ssm:TerminateSession",
+    "servicequotas:GetServiceQuota"
+  ]
+}
+```
+
+`sts:GetCallerIdentity` is not listed because AWS allows that call without any IAM permission. SSM on HyperPod uses `start-session` against `sagemaker-cluster:<cluster-id>_<group>-<iid>` targets — not `send-command` against bare instance IDs. For remediation commands, grant the matching write permission (e.g. `ec2:AuthorizeSecurityGroupIngress` / `Egress`, `ec2:RevokeSecurityGroupIngress` / `Egress`, `ec2:ModifyVpcAttribute`, `sagemaker:UpdateCluster`, `sagemaker:BatchRebootClusterNodes`, `sagemaker:BatchReplaceClusterNodes`). Not needed for the diagnostic itself.
+
+## Skill delegation
+
+| Need                                                   | Use                                                          |
+| ------------------------------------------------------ | ------------------------------------------------------------ |
+| Cluster creation / deployment failures                 | `hyperpod-cluster-debugger` (§ A / B / C / H + `--validate`) |
+| Cluster-wide SSM outage                                | `hyperpod-cluster-debugger` § F                              |
+| Single-node SSM failure                                | stay here — § K                                              |
+| Cluster-wide EFA health-check failure at creation time | `hyperpod-cluster-debugger` § A                              |
+| Single-node EFA failure post-provisioning              | stay here — § A                                              |
+| NCCL AllReduce / collective-op timeouts (distributed)  | `hyperpod-nccl`                                              |
+| Silent GPU NaNs on a specific node (row-remap / DCGM)  | stay here — § G.1 (even if discovered by NCCL)               |
+| Post-deployment cluster-wide management                | `hyperpod-cluster-debugger`                                  |
+| Shell / commands on nodes                              | `hyperpod-ssm`                                               |
+| CUDA / NCCL / EFA version comparison                   | `hyperpod-version-checker`                                   |
+| Diagnostic bundle for AWS Support                      | `hyperpod-issue-report`                                      |
+| Training performance / MFU degradation                 | `hyperpod-performance-debugger`                              |
+
+## Escalate to AWS Support
+
+Escalate when:
+
+1. SG rules correct and reachability passes but EFA still fails.
+2. VPC correct but K8s bootstrap fails — check VPC flow logs for REJECT.
+3. Hardware failure where replacement keeps failing (bad physical host).
+4. Node replacement fails with an insufficient-capacity signal despite a valid ODCR.
+
+### Before opening the case
+
+```bash
+# 1. Cluster identity + affected node status
+aws sagemaker describe-cluster --cluster-name <CLUSTER> --region <REGION>
+aws sagemaker list-cluster-nodes --cluster-name <CLUSTER> --region <REGION> \
+  --query "ClusterNodeSummaries[?InstanceId=='<INSTANCE_ID>']"
+
+# 2. Triage bundle (scoped to the affected node where possible)
+bash scripts/triage-cluster.sh --cluster <CLUSTER> --region <REGION> --node <INSTANCE_ID> > triage.txt
+
+# 3. Per-node log/config bundle to S3 (delegates to hyperpod-issue-report)
+#    See skills/hyperpod-issue-report/SKILL.md for the exact invocation.
+```
+
+### Include in the case
+
+- Cluster name + ARN and AWS region
+- Orchestrator (EKS or Slurm)
+- Affected instance IDs / node names / instance-group names
+- Timestamp window (UTC start / end) of the failure
+- Exact error strings observed (copy verbatim from pod logs, CloudWatch, dmesg, events)
+- XID numbers / ECC counts / DCGM output where hardware is implicated
+- `triage.txt` from step 2 above
+- S3 URI of the `hyperpod-issue-report` bundle from step 3
+
+Patterns from real customer tickets: [node-issue-catalog.md](references/node-issue-catalog.md).
diff --git a/plugins/sagemaker-ai/skills/hyperpod-node-debugger/references/node-diagnostics-detail.md b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/references/node-diagnostics-detail.md
new file mode 100644
index 00000000..15bc2b9c
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/references/node-diagnostics-detail.md
@@ -0,0 +1,1074 @@
+# Node Diagnostics Detail
+
+Full diagnostic procedures, commands, and fixes for each section referenced from [SKILL.md](../SKILL.md).
+
+---
+
+## A: EFA / Security Group
+
+**Signals:** `"EFA health checks did not run successfully"`, EFA send/recv timeouts, NCCL connectivity fails.
+
+```bash
+bash scripts/check-efa-sg.sh --cluster <CLUSTER> --region <REGION>
+```
+
+Required rules on every cluster SG (per the HyperPod prerequisites doc — "configure the security group to allow all inbound and outbound traffic to and from the security group itself"):
+
+1. **Outbound self-ref (all protocols, source = SG)** — required for EFA.
+2. **Inbound self-ref (all protocols, source = SG)** — required for node-to-node communication.
+
+**Do not add `0.0.0.0/0` outbound to the EFA security group.** Per the HyperPod prerequisites doc: "avoid using `0.0.0.0/0` for outbound rules, as this may cause EFA health check failures." Outbound internet traffic for AWS API calls, package downloads, and image pulls must be routed at the **subnet** level — via a NAT gateway in private subnets, or via VPC interface/gateway endpoints in air-gapped VPCs (see § B.4).
+
+The script prints `[PASS]` / `[FAIL]` per rule.
+
+### Suggested command — add EFA SG self-referencing rules (run this yourself)
+
+**Preconditions:** the rule check above (`scripts/check-efa-sg.sh`) reports `[FAIL]` on inbound or outbound self-ref for `<SG_ID>`; `<SG_ID>` is one of the security groups attached to the HyperPod cluster (`describe-cluster → VpcConfig.SecurityGroupIds`); apply once **per SG** if multiple are attached; for IaC-managed SGs, see the operating-policy IaC note in SKILL.md before running directly.
+
+**Command:**
+
+```bash
+aws ec2 authorize-security-group-egress --group-id <SG_ID> --region <REGION> \
+  --ip-permissions '[{"IpProtocol":"-1","UserIdGroupPairs":[{"GroupId":"<SG_ID>","Description":"HyperPod EFA intra-SG"}]}]'
+
+aws ec2 authorize-security-group-ingress --group-id <SG_ID> --region <REGION> \
+  --ip-permissions '[{"IpProtocol":"-1","UserIdGroupPairs":[{"GroupId":"<SG_ID>","Description":"HyperPod intra-SG"}]}]'
+```
+
+**Blast radius:** opens all protocols between instances that share this SG (intended scope for intra-cluster EFA traffic) — does not open anything to the internet or to other SGs. Idempotent: `InvalidPermission.Duplicate` = the rule already exists. Reversible with `revoke-security-group-ingress`/`revoke-security-group-egress` using the same `--ip-permissions` payload. For outbound internet access, route at the subnet level (NAT gateway or VPC endpoints) — not via a `0.0.0.0/0` rule on this SG (per HyperPod prerequisites).
+
+**For provisioned nodes with EFA problems**, use the `hyperpod-ssm` skill to upload and run `check-node-reachability.sh`, or spot-check:
+
+```bash
+bash skills/hyperpod-ssm/scripts/ssm-exec.sh --target <TARGET> --region <REGION> 'fi_info -p efa'
+```
+
+---
+
+## B: VPC / Routing
+
+**Signals:** `"bootstrap failed...network misconfiguration"`, S3 timeout, subnet/VPC mismatch, DNS resolution failure, node unreachable despite correct SG.
+
+```bash
+bash scripts/check-vpc-config.sh --cluster <CLUSTER> --region <REGION>
+```
+
+### B.1 Common errors
+
+| Error                                                 | Fix (each is a mutation — see Suggested-command blocks below or in the referenced section)                                                                                                                                                                                    |
+| ----------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| SG and subnet in different VPCs                       | Move SG to same VPC as subnet                                                                                                                                                                                                                                                 |
+| S3 timeout (endpoint unreachable from private subnet) | Add an S3 Gateway VPC endpoint — see [hyperpod-cluster-debugger § lifecycle-scripts](../../hyperpod-cluster-debugger/references/lifecycle-scripts.md) for the Suggested-command block                                                                                         |
+| EKS auth mode is `CONFIG_MAP` only                    | Access entries require `API` or `API_AND_CONFIG_MAP`; switching the auth mode is a cluster-level change — see the EKS access-entries docs and [hyperpod-cluster-debugger § D](../../hyperpod-cluster-debugger/references/cluster-diagnostics-detail.md#d-eks-access--kubectl) |
+| `aws-hyperpod` namespace missing                      | `kubectl create namespace aws-hyperpod` — customer-run. **Preconditions:** namespace is genuinely missing (not just RBAC denial). **Blast radius:** creates a new namespace; low risk, but confirm which namespace HyperPod expects on this cluster version                   |
+| Workers can't reach EKS controller                    | Add route to EKS VPC CIDR in worker subnet; check VPC flow logs                                                                                                                                                                                                               |
+
+### B.2 VPC DNS
+
+HyperPod requires both `enableDnsSupport` and `enableDnsHostnames` on the VPC. Without these, EKS internal DNS, internal hostnames, and `ip-x-x-x-x` Slurm nodenames fail to resolve.
+
+Diagnose (read-only):
+
+```bash
+aws ec2 describe-vpc-attribute --vpc-id <VPC> --attribute enableDnsSupport   --region <R> --query 'EnableDnsSupport.Value'
+aws ec2 describe-vpc-attribute --vpc-id <VPC> --attribute enableDnsHostnames --region <R> --query 'EnableDnsHostnames.Value'
+```
+
+### Suggested command — enable VPC DNS attributes (run this yourself)
+
+**Preconditions:** VPC is customer-owned in this account (cannot modify attributes on a VPC shared from another account via RAM); current values are `false` (verify with the read-only `describe-vpc-attribute` calls above — calling modify on already-enabled attributes is a harmless no-op but wastes a call); change is acceptable cluster-wide (every instance in the VPC gains Amazon DNS resolution and internal hostnames).
+
+**Command:**
+
+```bash
+aws ec2 modify-vpc-attribute --vpc-id <VPC> --region <R> --enable-dns-support '{"Value":true}'
+aws ec2 modify-vpc-attribute --vpc-id <VPC> --region <R> --enable-dns-hostnames '{"Value":true}'
+```
+
+**Blast radius:** additive — enables Amazon-provided DNS resolution and `ip-x-x-x-x` internal hostnames for every existing and future instance in this VPC. Does not affect existing IPs, routes, or SGs. Reversible by setting the values to `false`, but disabling on a live HyperPod cluster will break EKS internal DNS and Slurm nodename resolution.
+
+### B.3 Private subnets
+
+HyperPod subnets should be private — route tables should not have a direct default route to an IGW. If outbound internet is needed, route `0.0.0.0/0` via a NAT Gateway in a separate public subnet. In air-gapped VPCs, the default route can be absent and outbound goes through VPC endpoints (§ B.4).
+
+```bash
+aws ec2 describe-route-tables \
+  --filters "Name=association.subnet-id,Values=<subnet-1>,<subnet-2>" \
+  --region <R> \
+  --query "RouteTables[*].{Assoc:Associations[?SubnetId!=\`null\`].SubnetId,Routes:Routes[?DestinationCidrBlock==\`0.0.0.0/0\`]}" \
+  --output json
+```
+
+| Route target for `0.0.0.0/0` | Subnet type                  | Action                                         |
+| ---------------------------- | ---------------------------- | ---------------------------------------------- |
+| `igw-*`                      | Public — not supported       | Remove IGW route; use a NAT Gateway            |
+| `nat-*`                      | Private with internet egress | OK                                             |
+| Absent                       | Fully private / air-gapped   | OK if VPC endpoints are configured — see § B.4 |
+| `vpce-*`                     | Endpoint-only routing        | OK                                             |
+
+### B.4 VPC endpoints (internet-disabled VPCs)
+
+When there is no NAT Gateway, nodes need a VPC endpoint for every AWS service they call — interface endpoints for most services, plus a gateway endpoint for S3. Interface endpoints listen on TCP/443 — each interface endpoint's SG must allow inbound 443 from the HyperPod subnet CIDR.
+
+| Endpoint                                   | Type      | Required     | Purpose                                                                                                                                             |
+| ------------------------------------------ | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `com.amazonaws.<region>.s3`                | Gateway   | **Yes**      | Lifecycle scripts, DLC image layers                                                                                                                 |
+| `com.amazonaws.<region>.ecr.api`           | Interface | **Yes**      | ECR authentication                                                                                                                                  |
+| `com.amazonaws.<region>.ecr.dkr`           | Interface | **Yes**      | Pull container images                                                                                                                               |
+| `com.amazonaws.<region>.sts`               | Interface | **Yes**      | STS calls (AssumeRole, GetCallerIdentity)                                                                                                           |
+| `com.amazonaws.<region>.ssm`               | Interface | **Yes**      | SSM Session Manager                                                                                                                                 |
+| `com.amazonaws.<region>.ssmmessages`       | Interface | **Yes**      | SSM session traffic                                                                                                                                 |
+| `com.amazonaws.<region>.ec2messages`       | Interface | **Yes**      | SSM heartbeats                                                                                                                                      |
+| `com.amazonaws.<region>.ec2`               | Interface | **Yes**      | EC2 control-plane API (DescribeInstances, EBS volume operations) — instance metadata is link-local (169.254.169.254) and does not use this endpoint |
+| `com.amazonaws.<region>.sagemaker.api`     | Interface | **Yes**      | HyperPod control plane                                                                                                                              |
+| `com.amazonaws.<region>.sagemaker.runtime` | Interface | **Yes**      | Runtime calls                                                                                                                                       |
+| `com.amazonaws.<region>.logs`              | Interface | **Yes**      | CloudWatch lifecycle + health-monitoring-agent logs                                                                                                 |
+| `com.amazonaws.<region>.eks`               | Interface | EKS only     | Required if EKS endpoint is private-only                                                                                                            |
+| `com.amazonaws.<region>.fsx`               | Interface | If using FSx | Required for FSx for Lustre / OpenZFS                                                                                                               |
+
+### B.5 EKS ↔ HyperPod VPC alignment
+
+When orchestrator is EKS, the EKS cluster and the HyperPod cluster must share a VPC. The SG attached to the HyperPod cluster must either be attached to the EKS cluster itself OR the EKS cluster SG must allow inbound from the HyperPod SG.
+
+Diagnose (read-only):
+
+```bash
+aws sagemaker describe-cluster --cluster-name <HP>  --region <R> --query 'VpcConfig.{Subnets:Subnets,SGs:SecurityGroupIds}'
+aws eks describe-cluster       --name         <EKS> --region <R> --query 'cluster.resourcesVpcConfig.{VPC:vpcId,SGs:securityGroupIds,ClusterSG:clusterSecurityGroupId}'
+```
+
+### Suggested command — allow HyperPod SG inbound on the EKS cluster SG (run this yourself)
+
+**Preconditions:** the orchestrator is EKS and the HyperPod cluster is in the same VPC as the EKS cluster (verify with the read-only `describe-cluster` calls above); `<EKS_CLUSTER_SG>` is the **EKS-managed cluster SG** (`clusterSecurityGroupId` from `eks describe-cluster`), **not** a worker SG; `<HP_SG>` is one of the security groups attached to the HyperPod cluster (`VpcConfig.SecurityGroupIds`); the customer prefers the SG-allow approach over re-attaching the HyperPod SG directly to the EKS cluster (both are valid; this rule is needed only when they're not attached).
+
+**Command:**
+
+```bash
+aws ec2 authorize-security-group-ingress --group-id <EKS_CLUSTER_SG> --region <R> \
+  --ip-permissions "[{\"IpProtocol\":\"-1\",\"UserIdGroupPairs\":[{\"GroupId\":\"<HP_SG>\",\"Description\":\"HyperPod worker traffic\"}]}]"
+```
+
+**Blast radius:** opens all protocols from every ENI using `<HP_SG>` to the EKS control-plane SG — scoped to two SGs, not the world. Idempotent: returns `InvalidPermission.Duplicate` if the rule already exists. Reversible with `revoke-security-group-ingress` and the same `--ip-permissions` payload.
+
+---
+
+## C: Capacity / AZ
+
+**Signals:** insufficient-capacity or AZ-mismatch failure at creation or replacement time.
+
+```bash
+aws ec2 describe-instance-type-offerings \
+  --location-type availability-zone \
+  --filters "Name=instance-type,Values=<INSTANCE_TYPE>" \
+  --region <REGION> --query 'InstanceTypeOfferings[*].Location'
+```
+
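+Fix: add a subnet in an AZ that offers the instance type and has capacity, or use Flexible Training Plans / ODCR. The command above lists the AZs where the instance type is offered; it does not report whether capacity is currently available.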
+
+---
+
+## D: Lifecycle Scripts
+
+**Signals:** `"Lifecycle scripts did not run successfully"` or `"timed out"` in events.
+
+```bash
+CLUSTER_NAME="<C>"
+REGION="<R>"
+CLUSTER_ID=$(aws sagemaker describe-cluster --cluster-name "$CLUSTER_NAME" --region "$REGION" \
+  --query 'ClusterArn' --output text | cut -d/ -f2)
+LOG_GROUP="/aws/sagemaker/Clusters/${CLUSTER_NAME}/${CLUSTER_ID}"
+aws logs describe-log-streams --log-group-name "$LOG_GROUP" --region "$REGION" \
+  --query 'logStreams[?starts_with(logStreamName,`LifecycleConfig`)].logStreamName' --output table
+```
+
+On-node:
+
+```bash
+bash skills/hyperpod-ssm/scripts/ssm-exec.sh --target <TARGET> --region <REGION> \
+  'cat /var/log/provision/provisioning.log'
+```
+
+| Log error                                | Fix                                                        |
+| ---------------------------------------- | ---------------------------------------------------------- |
+| `Connect timeout on endpoint URL: s3://` | Add S3 VPC Gateway endpoint                                |
+| `AccessDenied` on S3                     | Add `s3:GetObject` + `s3:ListBucket` to execution role     |
+| Script never exits                       | Add proper exit; check infinite loops; test script locally |
+| `CRLF line terminators`                  | `dos2unix script.sh` before uploading                      |
+| `provisioning_parameters.json` mismatch  | Instance group names must match between script and API     |
+
+---
+
+## E: Software Versions
+
+**Signals:** NCCL hangs after node replacement, training fails after AMI update, version drift across nodes.
+
+**Delegate to `hyperpod-version-checker`** — compares NVIDIA driver, CUDA, NCCL, EFA installer, OFI NCCL, PyTorch across all nodes.
+
+### Quick spot-check on a node (via `hyperpod-ssm`)
+
+```bash
+bash skills/hyperpod-ssm/scripts/ssm-exec.sh --target <TARGET> --region <REGION> \
+  'nvidia-smi --query-gpu=driver_version --format=csv,noheader && \
+   nvcc --version | grep "release" && \
+   head -3 /opt/amazon/efa_installed_packages && \
+   python3 -c "import torch; print(torch.__version__, torch.version.cuda)"'
+```
+
+### CUDA driver vs `nvcc` toolkit
+
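+The CUDA driver (`nvidia-smi`) and the CUDA toolkit / `nvcc` (`nvcc --version`) must be a supported pair — a newer toolkit cannot target an older driver. Mismatch commonly causes `CUDA driver version is insufficient for CUDA runtime version` or kernel-launch segfaults. (`CUDA error: no kernel image is available for execution on the device` usually means the binary was not built for this GPU's compute capability — check the build's target architectures rather than the driver.)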
+
+```bash
+nvidia-smi | grep "CUDA Version"         # max CUDA the driver supports
+nvcc --version | grep "release"          # installed toolkit
+```
+
+Compatibility matrix: see the NVIDIA CUDA Toolkit Release Notes for the toolkit version in use.
+
+### EFA / NCCL / libfabric
+
+EFA installer version and AWS OFI NCCL version must be paired per the EFA changelog:
+
+```bash
+cat /opt/amazon/efa_installed_packages | head -10
+fi_info -p efa | head -5
+```
+
+Compatibility matrix: see the AWS EFA installer changelog for the version in use.
+
+### Container vs host mismatches
+
+If training works on the host but fails in the container (or vice versa), the cause is almost always one of:
+
+1. **EFA libraries not mounted into the container** — container must see `/opt/amazon/efa`, `/opt/amazon/openmpi`, and `/dev/infiniband`. Without these NCCL silently falls back to TCP.
+2. **`LD_LIBRARY_PATH` missing EFA / CUDA paths inside the container**:
+
+   ```bash
+   export LD_LIBRARY_PATH=/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+   ```
+
+3. **PyTorch / TF built against a different CUDA major than the host driver supports** — rebuild from a base image whose CUDA matches the host (e.g. AWS DLC `pytorch-training:<ver>-gpu-py<ver>-cu<host-major>-ubuntu*`).
+
+After a driver upgrade, CUDA devices may fail to init until the node is rebooted. Use `batch-reboot-cluster-nodes` (§ F) and re-run training.
+
+### Required job-launcher env vars
+
+Per the EC2 EFA-with-NCCL guide, set `FI_EFA_USE_DEVICE_RDMA=1` on RDMA-capable instances. For NCCL over EFA, also set `FI_PROVIDER=efa` and `NCCL_SOCKET_IFNAME=^lo,docker` to keep NCCL's bootstrap off the loopback / docker interfaces. `NCCL_TIMEOUT` (seconds) is not AWS-prescribed — tune to your job's longest expected collective if jobs trip the default; otherwise leave unset.
+
+### Validation
+
+For PyTorch environment and EFA / network-stack validation, use the AWS-published validation guides for SageMaker HyperPod (available from the AWS SageMaker HyperPod documentation).
+
+---
+
+## F: Hardware / Auto-Repair
+
+**Signals:** hardware failure event, EKS label `UnschedulablePendingReplacement`, XID errors, auto-repair not triggering.
+
+```bash
+# NodeRecovery on each group
+aws sagemaker describe-cluster --cluster-name <C> --region <R> \
+  --query 'InstanceGroups[*].{Group:InstanceGroupName,Recovery:NodeRecovery}'
+
+# EKS: all node repair labels at once
+kubectl get nodes -o custom-columns='NODE:.metadata.name,HEALTH:.metadata.labels.sagemaker\.amazonaws\.com/node-health-status,FAULT:.metadata.labels.sagemaker\.amazonaws\.com/fault-types'
+
+# Repair events — ListClusterEvents returns `Events[*]` with field `Description`
+aws sagemaker list-cluster-events --cluster-name <C> --region <R> \
+  --query 'Events[?contains(Description,`replacement`) || contains(Description,`reboot`) || contains(Description,`hardware`)]' \
+  --output table
+
+# Slurm: HMA auto-recovery is triggered by the health-monitoring agent (not the Slurm reason).
+# The Slurm "Action:Reboot" / "Action:Replace" reason is the manual-recovery path — a user sets
+# it to ask HyperPod to reboot/replace the node. See "Suggested command — manually trigger recovery on Slurm" below.
+sinfo -o "%N %T %30E"
+```
+
+### Suggested command — batch-reboot (run this yourself, soft recovery first)
+
+**Preconditions:**
+
+- Fault is plausibly transient (deep-health-check failure, driver hang, stuck process) and reboot may clear it. For confirmed hardware faults (uncorrectable ECC, GPU off-bus, NVLink), skip to batch-replace below.
+- Each node ID belongs to this cluster (verify with `list-cluster-nodes`).
+- Workload on the node can tolerate a restart — training processes on the node are interrupted.
+- On Slurm: confirm that rebooting will not disrupt critical cluster operations (per the API doc note); prefer to drain the node first via `scontrol update node=<ip-ipv4> state=drain reason="planned reboot"` (Slurm requires a reason when draining) to avoid the "Node unexpectedly rebooted" flag (§ H).
+- `NodeIds` batch size: 1–25 per call (API limit).
+
+**Command:**
+
+```bash
+aws sagemaker batch-reboot-cluster-nodes --cluster-name <C> --region <R> --node-ids '["<ID>"]'
+```
+
+**Blast radius:** per the API doc, "performs a graceful reboot… by calling the Amazon EC2 RebootInstances API." Preserves instance identity, root volume, and secondary volumes — **no data loss**. Training processes on the node are interrupted; pods on EKS are evicted by kubelet during the restart and rescheduled by the workload controller after the node returns Ready. Recovery time depends on instance type, AMI boot time, and any post-boot lifecycle work.
+
+### Suggested command — batch-replace (run this yourself, only if reboot did not clear the fault)
+
+**Preconditions:**
+
+- Reboot attempted first and did not clear the fault.
+- Hardware fault confirmed (uncorrectable ECC, GPU bus / NVLink errors, EFA hardware failure); not a software or config issue.
+- Data on root + secondary volumes is backed up to S3 or FSx — **per the API doc: "Replacing nodes destroys all instance volumes, including both root and secondary volumes. All data stored on these volumes will be permanently lost and cannot be recovered."**
+- Target is **NOT** a Slurm controller node — per the API doc: "For SageMaker HyperPod clusters using the Slurm workload manager, you cannot replace instances that are configured as Slurm controller nodes."
+- Cluster has been patched via `UpdateClusterSoftware` — per the API doc: "If you want to invoke this API on an existing cluster, you'll first need to patch the cluster by running the UpdateClusterSoftware API."
+- `NodeIds` batch size: 1–25 per call (API limit).
+
+**Command:**
+
+```bash
+aws sagemaker batch-replace-cluster-nodes --cluster-name <C> --region <R> --node-ids '["<ID>"]'
+```
+
+**Blast radius:** destroys root + secondary volumes on the replaced instance (permanent data loss). New hardware is provisioned with the same AMI and instance configuration.
+
+**Karpenter note:** Karpenter's documented design provisions nodes from pending/unschedulable pods (see Karpenter docs on disruption/provisioning), not as a one-for-one node replacement service. So on Karpenter-managed clusters, `BatchReplaceClusterNodes` terminates the node but **does not by itself guarantee a Karpenter-launched replacement** — Karpenter creates a new node only if pods become unschedulable on remaining capacity. If you need a guaranteed replacement, ensure workload configuration (pod anti-affinity, resource requests) forces pods to a new node.
+
+**Common blockers:** `NodeRecovery=None` (enable it), health agent hasn't detected yet (check `SagemakerHealthMonitoringAgent/<group>/<instance>` stream), lifecycle script failing on replacement (check `LifecycleConfig` stream), insufficient capacity, cluster not `InService`.
+
+### HMA detection events
+
+The Health Monitoring Agent emits `HealthMonitoringAgentDetectionEvent` records to CloudWatch. Use these to read fault history before triggering a manual replace.
+
+```bash
+CLUSTER_ID=$(aws sagemaker describe-cluster --cluster-name <C> --region <R> \
+  --query 'ClusterArn' --output text | cut -d/ -f2)
+
+aws logs filter-log-events \
+  --log-group-name "/aws/sagemaker/Clusters/<C>/${CLUSTER_ID}" \
+  --log-stream-name-prefix "SagemakerHealthMonitoringAgent/" \
+  --filter-pattern 'HealthMonitoringAgentDetectionEvent' \
+  --region <R> \
+  --query 'events[*].[timestamp,logStreamName,message]' --output table
+```
+
+Reference: the SageMaker HyperPod EKS Health Monitoring Agent documentation.
+
+### Repeat-Xid analysis
+
+A hardware-caused Xid will recur after each reboot because reboot does not repair hardware. If you see the same Xid on the same instance more than once, the node almost certainly needs to be replaced rather than rebooted again.
+
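+Count Xid occurrences per instance from the HMA detection stream in the customer-visible cluster log group. The block below is a CloudWatch Logs Insights query (not a shell command) — run it in the Logs Insights console or via `aws logs start-query` against that log group:
+
+```text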
+# Log group: /aws/sagemaker/Clusters/<CLUSTER>/<CLUSTER_ID>
+# Stream prefix: SagemakerHealthMonitoringAgent/
+fields @timestamp, @logStream, @message
+| parse @message /Xid.*?:\s*(?<xidCode>\d+)/
+| filter @message like /HealthMonitoringAgentDetectionEvent/ and @message like /Xid/
+| stats count(*) as errorCount,
+        earliest(@timestamp) as firstError,
+        latest(@timestamp) as lastError
+  by @logStream, xidCode, bin(1h) as hourBin
+| sort hourBin desc, errorCount desc
+```
+
+A recurring same-Xid + same-instance row is the signal to replace rather than reboot. The exact recurrence threshold is operator choice — many teams use ≥ 2 within a single time window as the trigger.
+
+### Node-level fault details (EKS)
+
+When HMA detects a fault it writes a four-part response onto the node (per the HyperPod HMA documentation):
+
+- **Labels**: `sagemaker.amazonaws.com/node-health-status`, `sagemaker.amazonaws.com/fault-types`, `sagemaker.amazonaws.com/fault-reasons`
+- **Taint**: `sagemaker.amazonaws.com/node-health-status=Unschedulable:NoSchedule`
+- **Annotation**: `sagemaker.amazonaws.com/fault-details` — JSON array recording recent faults with timestamps; check the HyperPod HMA doc for the current retention limit
+- **Condition** (per the HMA doc): `Type` = fault type, `Status` = `True`, `Reason` = fault reason, `LastTransitionTime` = fault occurrence time. After a successful recovery the condition status flips back to `False`.
+
+```bash
+kubectl get node <NODE> -o jsonpath='{.metadata.annotations.sagemaker\.amazonaws\.com/fault-details}' | jq
+kubectl get node <NODE> -o jsonpath='{.status.conditions}' | jq '.[] | select(.type|contains("GPU"))'
+```
+
+### Manually trigger reboot or replace on EKS (kubectl label)
+
+If HMA has not detected a fault but the customer has independent evidence, a label can trigger the existing HyperPod recovery path.
+
+### Suggested command — trigger replace on EKS (run this yourself)
+
+**Preconditions:** `NodeRecovery=Automatic` on the instance group; hardware fault confirmed on `<NODE>` (not a software/config issue); data on root + secondary volumes is backed up; cluster has been patched via `UpdateClusterSoftware` if this is the first replace on an existing cluster. Per the HyperPod EKS manual-recovery doc, the **preferred path is the Reboot/Replace APIs** (`BatchReplaceClusterNodes`); labelling is an alternative that activates the same recovery process.
+
+**Command:**
+
+```bash
+kubectl label nodes <NODE> sagemaker.amazonaws.com/node-health-status=UnschedulablePendingReplacement --overwrite
+```
+
+**Blast radius:** marks the node for replacement. Destroys root + secondary volumes on the replaced instance — all data on those volumes is lost. New hardware is provisioned with the same AMI.
+
+### Suggested command — trigger reboot on EKS (run this yourself)
+
+**Preconditions:** `NodeRecovery=Automatic` on the instance group; fault is plausibly transient (deep-health-check failure, driver hang) and reboot may clear it; workload can tolerate restart.
+
+**Command:**
+
+```bash
+kubectl label nodes <NODE> sagemaker.amazonaws.com/node-health-status=UnschedulablePendingReboot --overwrite
+```
+
+**Blast radius:** soft recovery — preserves identity, root volume, and secondary volumes. Training processes on the node are interrupted.
+
+### Suggested command — manually trigger recovery on Slurm (run this yourself)
+
+Per the HyperPod Slurm manual-recovery doc, the **preferred path is the batch APIs** (`BatchRebootClusterNodes`/`BatchReplaceClusterNodes`) — the `scontrol` commands below are documented as a legacy alternative that requires direct Slurm-controller access. Both paths activate the same HyperPod recovery processes.
+
+**Preconditions:** Slurm orchestrator; `scontrol` run on the controller via SSM; customer has decided between reboot (transient fault) and replace (confirmed hardware fault); replace target is NOT a Slurm controller node; data backed up for replace; cluster has been patched via `UpdateClusterSoftware` if invoking replace on an existing cluster.
+
+**Command:**
+
+```bash
+# Reboot — soft recovery:
+scontrol update node=<ip-ipv4> state=fail reason="Action:Reboot"
+
+# Replace — destroys root + secondary volumes:
+scontrol update node=<ip-ipv4> state=fail reason="Action:Replace"
+```
+
+Per the HyperPod Slurm manual-recovery doc: for `Action:Replace` the node goes into `fail`, waits for running jobs to finish, then is replaced with a fresh instance using the same host name. For either command, do not change the node state or restart `slurmctld` while recovery is in progress — this can leave the node stuck.
+
+**Last-resort force** — if the node is stuck in `fail`, the HyperPod Slurm manual-recovery doc provides `scontrol update node=<ip-ipv4> state=down reason="Action:Replace"` as a last resort. Per the doc: "this requires administrator privileges (sudo permissions)" and (warning) "it forces kill all jobs, and you might lose all unsaved work." Confirm with the customer that lost in-flight work is acceptable before running.
+
+**Blast radius:** drains the named node. `Action:Replace` inherits the same blast radius as `batch-replace-cluster-nodes` (root + secondary volumes destroyed). `state=down` additionally force-kills running jobs.
+
+### Suggested command — force-delete a stuck Terminating pod (last resort; run this yourself)
+
+**Preconditions:** pod has been in `Terminating` state on `<NODE>` for >30 minutes; the node is quarantined (cordoned, fault confirmed); customer has approved the forced deletion; you understand the API server will remove the pod object immediately even if the container is still running on the node.
+
+**Command:**
+
+```bash
+kubectl cordon <NODE>
+kubectl delete pods <POD> -n <NAMESPACE> --grace-period=0 --force
+```
+
+**Blast radius:** `--grace-period=0 --force` removes the pod from the API without waiting for kubelet to confirm termination — the container may continue running on the node until the node is rebooted or replaced. Only appropriate when the node will be rebooted/replaced afterward. For a healthy node, use the default `kubectl delete pod` and let the grace period elapse.
+
+---
+
+## G: GPU/Accelerator
+
+**Signals:** GPU off bus, `deep-health-check-status: Failed`, XID errors, low utilization, ECC errors, thermal throttling, NeuronCore errors.
+
+### G.1: NVIDIA (p4d/p5/g5/g6)
+
+Run on the affected node via `hyperpod-ssm`:
+
+```bash
+bash skills/hyperpod-ssm/scripts/ssm-exec.sh --target <TARGET> --region <REGION> \
+  'nvidia-smi -L; nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,ecc.errors.uncorrected.volatile.total --format=csv; nvidia-smi -q | grep -E "Xid|Error Type|ECC"; dmesg | grep -i "xid\|nvrm\|pcie\|error" | tail -20'
+```
+
+**ECC:** any uncorrectable error (UCE) → drain and replace. Correctable errors are background noise individually but a growing rate across many GPUs is worth escalating. For detailed GPU diagnostics (NVLink, dmon, XID codes), see [node-issue-catalog.md § 2](node-issue-catalog.md#2-gpu--accelerator).
+
+**Xid reference (per NVIDIA Xid error catalog):** common Xid numbers seen in HyperPod dmesg / HMA `fault-details`:
+
+| Xid | NVIDIA name                  | Class | Typical cause                                                                                 |
+| --- | ---------------------------- | ----- | --------------------------------------------------------------------------------------------- |
+| 13  | Graphics Engine Exception    | App   | User-application fault (out-of-bounds, illegal instruction / register)                        |
+| 31  | GPU memory page fault        | App   | Illegal address access by a chip unit (usually an application bug; occasionally driver or HW) |
+| 63  | GPU memory remapping event   | HW    | ECC memory event; on Ampere+ provides row-remapper detail (see § G.1.a for row-remap triage)  |
+| 71  | CE4 Error                    | HW    | Copy Engine 4 exception (seen in HMA example detection logs on HyperPod p-family instances)   |
+| 74  | NVLINK Error                 | HW    | NVLink connectivity issue between GPUs / NVSwitch                                             |
+| 79  | GPU has fallen off the bus   | Bus   | Driver cannot reach GPU over PCIe — failing link or GPU (drain + replace)                     |
+| 109 | Context Switch Timeout Error | HW    | Timeout during GPU context switch                                                             |
+
+For an App-classified Xid (13, 31), investigate the workload before replacing hardware; HMA will reboot on the fault but a software cause will recur until the workload is fixed.
+
+#### G.1.a Row-remap state (silent memory degradation)
+
+Row-remapping is the mechanism that permanently reassigns physical memory rows around defects on H100 / A100 GPUs. The remap state is the most reliable signal of _silent_ memory degradation — accuracy regressions, sporadic NaNs, and intermittent NCCL hangs that no XID or ECC count explains.
+
+```bash
+nvidia-smi --query-remapped-rows=gpu_bus_id,remapped_rows.correctable,remapped_rows.uncorrectable,remapped_rows.pending,remapped_rows.failure \
+  --format=csv
+```
+
+| State                               | Meaning                                                    | Action                                                                             |
+| ----------------------------------- | ---------------------------------------------------------- | ---------------------------------------------------------------------------------- |
+| `pending = 0`, `failure = No`       | Healthy                                                    | None                                                                               |
+| `pending > 0`                       | Remap staged but needs a GPU reset / reboot to take effect | Reboot via `batch-reboot-cluster-nodes` (§ F); recheck — pending should reach 0    |
+| `pending > 0` persists after reboot | Remap stuck "pending" — memory is silently degrading       | Drain and replace via `batch-replace-cluster-nodes` (§ F); escalate to AWS Support |
+| `failure = Yes`                     | Remap capacity exceeded                                    | Drain and replace (§ F)                                                            |
+
+`uncorrectable > 0` with `pending = 0` means historical rows that have already been remapped — fine going forward, but a high count is a warning sign for the hardware cohort.
+
+#### G.1.b DCGM health and nvvs logs
+
+HyperPod runs DCGM as part of the deep-health-check. Findings are under `/var/log/nvidia-dcgm/`.
+
+```bash
+dcgmi health --check -j
+
+ls -1t /var/log/nvidia-dcgm/ | head
+tail -n 200 "$(ls -1t /var/log/nvidia-dcgm/nvvs*.log | head -1)"
+```
+
+Treat only **Fail** / **Warn** verdicts as authoritative. For comprehensive data collection before opening a ticket:
+
+```bash
+sudo nvidia-bug-report.sh                                    # NVIDIA's authoritative bundle
+sudo tar -czf /tmp/nvidia-dcgm-logs.tgz /var/log/nvidia-dcgm/
+```
+
+Attach both to the AWS Support case along with the triage script output.
+
+### G.2: AWS Trainium / Inferentia (trn1/trn2/inf2)
+
+These use the **AWS Neuron SDK**, not CUDA. `nvidia-smi` will not work.
+
+**Quick health check (via SSM):**
+
+```bash
+bash skills/hyperpod-ssm/scripts/ssm-exec.sh --target <TARGET> --region <REGION> \
+  'neuron-ls && neuron-top -n 1 2>/dev/null || echo "neuron-top not available" && dmesg | grep -i "neuron\|nrt\|error" | tail -20'
+```
+
+| Command                       | Shows                                                        |
+| ----------------------------- | ------------------------------------------------------------ |
+| `neuron-ls`                   | Lists all NeuronCore devices, count, status                  |
+| `neuron-top`                  | Live utilization (NeuronCore %, memory, model loaded)        |
+| `neuron-monitor`              | JSON metrics stream                                          |
+| `dmesg \| grep -i neuron`     | Kernel-level Neuron errors                                   |
+| `systemctl status neuron-rtd` | Neuron Runtime daemon (older AMIs; deprecated in SDK ≥ 2.10) |
+| `pip show neuronx-cc`         | Neuron Compiler version                                      |
+| `pip show torch-neuronx`      | PyTorch Neuron version                                       |
+
+**Per-chip counts** (AWS Neuron architecture docs):
+
+| Chip                        | Cores per chip |
+| --------------------------- | -------------- |
+| Trainium1 (NeuronCore-v2)   | 2              |
+| Inferentia2 (NeuronCore-v2) | 2              |
+
+Trainium2 uses **NeuronCore-v3**, with a different per-chip core count and HBM topology than the v2 chips above; check the AWS Neuron Trainium2 architecture doc and `neuron-ls` on the node for the authoritative numbers.
+
+For the number of chips (NeuronDevices) per instance type — total NeuronCores = NeuronDevices × per-chip cores — use `neuron-ls` on the node as the source of truth; the AWS EC2 Trn1 / Trn2 / Inf2 instance-types docs are the authoritative reference if you need a number before node access is available. Per the HyperPod HMA doc: "Neuron Device Count validation — if there's a mismatch between the actual number of neuron device count in a particular instance type and the count returned by `neuron-ls`, then HMA **reboots** the node." Replacement only happens if reboots fail to clear the fault.
+
+**Common issues:**
+
+| Symptom                                    | Likely cause                                      | Action                                                                                                                                                                                                                                                  |
+| ------------------------------------------ | ------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `neuron-ls` shows 0 devices                | Neuron kernel driver not loaded                   | Check `lsmod \| grep neuron`; if the module is absent, the AMI is missing the Neuron driver — use the AWS Neuron DLAMI or rebuild the AMI with Neuron support (loading kernel modules on a running cluster node is a mutation; do not attempt in-place) |
+| `neuron-ls: command not found`             | Neuron SDK not installed                          | Install from the AWS Neuron repo, or use the AWS Neuron DLAMI                                                                                                                                                                                           |
+| NeuronCore count < expected                | Device failure / driver issue / partial detection | Reboot the node (§ F). If the count is still low, replace.                                                                                                                                                                                              |
+| `NRT_UNRECOVERABLE_ERROR` in dmesg or logs | Unrecoverable NeuronDevice fault                  | Drain and replace (§ F). Do not attempt software-only recovery.                                                                                                                                                                                         |
+| OOM on NeuronDevice (HBM exhaustion)       | Model + activations + optimizer exceed HBM        | Increase tensor-parallel degree, enable activation checkpointing, or scale up                                                                                                                                                                           |
+| Version mismatch across nodes              | AMI drift after partial replacement               | Pin Neuron package versions in the lifecycle script so replacements converge                                                                                                                                                                            |
+
+### Accelerator failure → Section F
+
+Drain the node, then follow the reboot / replace Suggested-command blocks in § F.
+
+### Suggested command — drain the node before reboot/replace (run this yourself)
+
+**Preconditions:** accelerator failure confirmed on `<node-name>` (GPU off-bus, uncorrectable ECC, NeuronDevice `NRT_UNRECOVERABLE_ERROR`); customer accepts that pods using `emptyDir` volumes on this node will lose that data when evicted (EKS path); on Slurm, customer accepts that no new jobs will be scheduled to the node until `state=resume` runs after recovery; you understand drain is preparation for reboot/replace, not a fix on its own.
+
+**Command:**
+
+```bash
+# EKS — cordon prevents new pods; drain evicts existing pods (emptyDir data lost).
+kubectl cordon <node-name>
+kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data
+
+# Slurm — on the controller via SSM. Running jobs continue until they finish; no new jobs are scheduled.
+scontrol update nodename=<node-name> state=drain reason="Accelerator failure -- replacing"
+```
+
+**Blast radius:** EKS — `--delete-emptydir-data` discards any in-pod scratch in `emptyDir` volumes (training caches, ephemeral checkpoints not persisted to PVC/`/opt/sagemaker`); pods are rescheduled elsewhere if capacity exists, otherwise stay Pending. Slurm — running jobs finish on the node; pending jobs route around it. Drain is reversible (`kubectl uncordon` / `scontrol update state=resume`) only if you decide not to proceed with reboot/replace.
+
+---
+
+## H: Slurm Node Management
+
+**Signals:** Node `down`, `"Node unexpectedly rebooted"`, jobs stuck PENDING/COMPLETING, `scontrol ping` fails.
+
+### Node down / unresponsive
+
+```bash
+sinfo -o "%N %T %30E"          # state + reason
+scontrol show node <NODE>      # full details
+
+# Connectivity checks
+ping <node-ip>
+ssh <node-name>
+srun -w <node-name> hostname
+```
+
+Diagnose (read-only):
+
+```bash
+bash skills/hyperpod-ssm/scripts/ssm-exec.sh --target <TARGET> --region <REGION> \
+  'sudo systemctl status slurmd && free -h && df -h'
+```
+
+### Suggested command — bring the node back in Slurm (run this yourself)
+
+**Preconditions:** root cause of the original `slurmd` failure has been **identified and resolved** (disk full, OOM, config parse error) — if the underlying issue is unfixed, `slurmd` will stop again immediately after `start`; node passes a basic health probe (`free -h`, `df -h /`, `df -h /opt/sagemaker`); customer accepts that pending jobs may schedule onto this node immediately after `state=resume`; do not run `systemctl enable slurmd` if the unit was _deliberately_ disabled by an admin (verify it was an unexpected reboot, not a config choice).
+
+**Command:**
+
+```bash
+# 1. On the affected node — start (and enable, if an unexpected reboot just
+#    knocked the unit out of auto-start):
+sudo systemctl start slurmd
+sudo systemctl enable slurmd   # only if it was not enabled before
+
+# 2. On the Slurm controller — return the node to the idle pool:
+scontrol update nodename=<N> state=resume
+```
+
+**Blast radius:** node returns to `idle` and pending jobs may schedule immediately. `enable` makes the unit auto-start on boot — if the unit was previously disabled by lifecycle script or admin, this changes that policy on this node only. Reversible: `scontrol update state=drain` to take it back out of scheduling, `systemctl disable slurmd` to revert auto-start. If start+resume does not hold (slurmd dies or node rejoins as `down`), escalate to batch-reboot then batch-replace (§ F).
+
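+Before any intentional reboot of a Slurm compute node, run `scontrol update nodename=<NODE> state=drain reason="<why>"` first (Slurm requires a `reason` when draining a node) and `scontrol update nodename=<NODE> state=resume` after — this avoids Slurm flagging the node as unexpectedly rebooted.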
+
+### Jobs stuck PENDING / COMPLETING → restart slurmctld
+
+**When:** PENDING with `Reason=Resources` despite free nodes, GRES miscalculation, COMPLETING after replacement, `scontrol ping` fails.
+
+### Suggested command — restart slurmctld (run this yourself)
+
+**Preconditions:** restart is targeting a **specific known cause** that an in-memory restart fixes (cached COMPLETING state after replacement, GRES miscalculation, scheduler not recomputing after node moves); the underlying cluster config is intact — `slurm.conf` parses cleanly (`scontrol show config >/dev/null` succeeds), `StateSaveLocation` is reachable and not full; the customer is OK with a brief scheduler pause during which no new jobs schedule and `scontrol`/`squeue`/`sbatch` calls return transient errors; no node recovery operation (`Action:Reboot`/`Action:Replace`) is in progress — restarting the controller mid-recovery can leave the affected node stuck.
+
+**Command:**
+
+```bash
+sudo systemctl restart slurmctld && sinfo && squeue
+```
+
+**Blast radius:** brief scheduler pause; running jobs are not interrupted (slurmd keeps them going); pending queue and node states are preserved on disk via `StateSaveLocation`. New job submissions during the restart window receive a transient error and must be retried by the user. If `systemctl restart` does not return, the daemon is hung — investigate a stuck `StateSaveLocation` (full disk, NFS hang) before any forcible kill, since killing slurmctld with corrupt state files can lose the queue.
+
+### Slurm node name → instance ID
+
+`list-cluster-nodes` does **not** return `PrivateDnsHostname` — that field is only populated by `describe-cluster-node`. So the mapping is a two-step call: list the instance IDs in the cluster, then describe each one to get the DNS hostname.
+
+```bash
+# 1. List candidate instance IDs (running nodes only, skip utility groups)
+aws sagemaker list-cluster-nodes --cluster-name <C> --region <R> \
+  --query 'ClusterNodeSummaries[?InstanceStatus.Status==`Running`].[InstanceId,InstanceGroupName,InstanceType]' \
+  --output text
+
+# 2. For each candidate, fetch the DNS hostname and match against the Slurm name
+NODE="ip-10-1-2-3"   # Slurm node name
+for IID in $(aws sagemaker list-cluster-nodes --cluster-name <C> --region <R> \
+               --query 'ClusterNodeSummaries[?InstanceStatus.Status==`Running`].InstanceId' --output text); do
+  DNS=$(aws sagemaker describe-cluster-node --cluster-name <C> --region <R> --node-id "$IID" \
+          --query 'NodeDetails.PrivateDnsHostname' --output text 2>/dev/null)
+  case "$DNS" in
+    "$NODE."*) echo "$NODE → $IID"; break ;;
+  esac
+done
+```
+
+**Scale note:** the for-loop above issues one `describe-cluster-node` API call per Running instance until it finds a match. On clusters with thousands of running nodes that's a lot of API calls; SageMaker has a default rate limit on the `Describe*` family (~10 TPS) so this can take minutes and incur throttling. For large clusters, use `dump_cluster_nodes_info.py` (AWS samples repo `awsome-distributed-training`) once to generate a CSV of IP ↔ instance-ID mappings, then look up locally.
+
+---
+
+## I: Resource Exhaustion
+
+**Signals:** Disk full, OOM kills, `"Cannot allocate memory"` at `os.fork()`, inode exhaustion, `/dev/shm` full.
+
+### Diagnose (via `hyperpod-ssm` on the node)
+
+```bash
+df -h && df -i                             # disk + inodes
+free -h                                    # RAM
+df -h /dev/shm                             # shared memory
+dmesg | grep -i oom | tail -10             # OOM kills
+sudo du -h --max-depth=1 / 2>/dev/null | sort -hr | head -15
+cat /proc/meminfo | grep Huge              # huge pages
+```
+
+### I.1: "Cannot allocate memory" at os.fork()
+
+**Symptoms:** `OSError: [Errno 12] Cannot allocate memory` during `os.fork()`, DataLoader crashes, `Failed to register memory` during EFA init, segfaults during NCCL.
+
+**Fix (in order):**
+
+1. `export FI_EFA_USE_HUGE_PAGE=0` — try this first; add to job script, container entrypoint, or `/etc/environment`. Disabling EFA huge pages avoids the fork-time memory-registration path that fails when huge pages aren't pre-allocated.
+2. Increase shared memory:
+   - Docker: `docker run --shm-size=8g ...`
+   - Kubernetes:
+
+     ```yaml
+     volumes:
+     - name: dshm
+       emptyDir: { medium: Memory, sizeLimit: 8Gi }
+     volumeMounts:
+     - { name: dshm, mountPath: /dev/shm }
+     ```
+
+3. Tune PyTorch DataLoader: `num_workers=4` (lower), `persistent_workers=True`, `pin_memory=False` if not bottlenecked on host→GPU copy.
+4. Reduce batch size to lower parent-process memory before fork.
+
+**If you need `FI_EFA_USE_HUGE_PAGE=1`**, pre-allocate huge pages first.
+
+### Suggested command — pre-allocate huge pages on a node (run this yourself)
+
+**Preconditions:** the workload requires `FI_EFA_USE_HUGE_PAGE=1` (most jobs do **not** — `=0` is the simpler fix and resolves the fork-time error on its own); free RAM on the node can absorb the reservation (1024 × 2 MiB = 2 GiB; check with `free -h` first); no existing process on the node already depends on a different `nr_hugepages` value; customer accepts that the persistent file (`/etc/sysctl.d/99-hugepages.conf`) survives reboots — on a node that may later be replaced, the file is destroyed with the volumes (replacement will recreate from the AMI/lifecycle script).
+
+**Command:**
+
+```bash
+cat /proc/sys/vm/nr_hugepages                             # current
+echo 1024 | sudo tee /proc/sys/vm/nr_hugepages            # 1024 × 2 MiB = 2 GiB, runtime-only
+echo 'vm.nr_hugepages=1024' | sudo tee -a /etc/sysctl.d/99-hugepages.conf   # persist across reboots
+```
+
+**Blast radius:** reduces RAM available to other processes on the node by ~2 GiB immediately. Persistent file change applies on every boot of _this_ node — bake the same value into the lifecycle script so replacement nodes match. Setting `FI_EFA_USE_HUGE_PAGE=1` without pre-allocation is the root cause of the fork-time failure; setting it after pre-allocation fixes that path.
+
+### I.2: Root Volume Exhausted
+
+The default HyperPod root volume is **100 GB EBS**. **Do not plan to grow it post-creation** — redirect heavy data to `/opt/sagemaker` (secondary EBS, sized at instance-group creation) or `/opt/dlami/nvme` (NVMe instance store on P/G families). For shared persistence use FSx for Lustre / OpenZFS or S3.
+
+| Mount             | Type                                                        | Persistence              | Best for                                      |
+| ----------------- | ----------------------------------------------------------- | ------------------------ | --------------------------------------------- |
+| `/opt/sagemaker`  | Secondary EBS (configurable per group)                      | Persistent               | Checkpoints, app data, logs, container images |
+| `/opt/dlami/nvme` | NVMe instance store (on instance types that ship with NVMe) | **Lost on stop/replace** | Scratch, caches, temp files                   |
+| FSx for Lustre    | Shared                                                      | Persistent               | Large datasets, shared models                 |
+| FSx for OpenZFS   | Shared                                                      | Persistent               | Mixed workloads, snapshots                    |
+| Amazon S3         | Object storage                                              | Persistent               | Large datasets, archives                      |
+
+### Suggested command — reclaim disk space (run this yourself)
+
+**Preconditions:** root-volume exhaustion confirmed (`df -h /` shows near-100%); customer has identified what is consuming space (`du -sh /var/* /opt/* 2>/dev/null | sort -h`); no training job is currently writing to the affected paths; you have **inspected** `/var/log/` and decided which files are safe to remove (never run a blanket wipe — target specific files identified by `du`); no running containers will be surprised by `docker system prune`.
+
+**Command:**
+
+```bash
+# 1. Shrink journald — deletes the oldest archived journal files until usage is under the cap (not undoable; safe to re-run)
+sudo journalctl --vacuum-size=500M
+
+# 2. Remove rotated logs YOU HAVE IDENTIFIED as safe to delete. Example
+#    commands — review the file list first and adapt the globs:
+ls -lah /var/log/*.log.* /var/log/*/*.gz 2>/dev/null   # inspect
+# Then, targeted deletes for the specific logs you chose:
+sudo rm -f /var/log/<specific-file>.log.N
+
+# 3. Package-manager caches (safe):
+sudo apt-get clean 2>/dev/null || sudo yum clean all 2>/dev/null
+
+# 4. Docker prune — removes stopped containers, unused networks, build cache, and (with -a)
+#    ALL images not used by any container, not just dangling ones. Add --volumes only if you know no named volume holds training data.
+docker system prune -a -f 2>/dev/null
+```
+
+**Blast radius:** `journalctl --vacuum-size` and package-manager `clean` are low-risk. Targeted `rm` in `/var/log/` is safe for rotated-and-gzipped files (`*.gz`) but a blanket `rm -f /var/log/*.log.*` can delete logs an incident team needs — always inspect first. `docker system prune -a` without `--volumes` leaves named volumes intact; adding `--volumes` will delete any unattached named volumes (including ones holding model checkpoints if not mounted at prune time).
+
+**Redirect data:**
+
+```bash
+# Environment variables
+export TORCH_HOME=/opt/sagemaker/torch_cache
+export HF_HOME=/opt/sagemaker/huggingface_cache
+export TRANSFORMERS_CACHE=/opt/sagemaker/transformers_cache
+export TMPDIR=/opt/dlami/nvme/tmp && mkdir -p $TMPDIR
+
+# Training scripts (Python assignments, not shell — set these in the training code)
+checkpoint_dir = "/opt/sagemaker/checkpoints"
+cache_dir = "/opt/dlami/nvme/cache"
+```
+
+For K8s pods, mount `/opt/sagemaker` and `/opt/dlami/nvme` as `hostPath` volumes. Check the customer's lifecycle script — the awsome-distributed-training samples typically point container runtimes at these paths, but custom scripts may not. **Prevention:** size secondary EBS generously at instance-group creation; growing it later is more disruptive than over-provisioning up front.
+
+### I.3: OOM events
+
+Triage signal: `[P1] OOM events on node <i-xxx>`.
+
+```bash
+sudo dmesg -T | grep -i -B2 -A30 "Out of memory" | tail -80
+ps auxf --sort=-%mem | head -20
+```
+
+The fix is in the workload spec (pod `resources.limits.memory`, batch size, DataLoader workers) — there is no on-node remediation command to run.
+
+### I.4: Inode exhaustion
+
+Triage signal: `[P1] Inode exhaustion <N>% on /`. Small files (pip caches, HF caches, container image layers) can exhaust inodes before disk space.
+
+Diagnose (read-only):
+
+```bash
+df -i /
+# Top inode hoarders (file counts grouped by the first two path components, e.g. /var/lib):
+sudo find / -xdev -type f 2>/dev/null | awk -F/ '{print $1"/"$2"/"$3}' | sort | uniq -c | sort -rn | head -20
+```
+
+### Suggested command — reclaim inodes (run this yourself)
+
+**Preconditions:** inode exhaustion confirmed on `/` (`df -i /` near 100%); top hoarders identified via `find` above; no training job is currently writing to or reading from `~/.cache/huggingface` or `~/.cache/pip` (these caches may hold model weights that would need to be re-downloaded — check with the customer before deleting); if you plan to add `--volumes` to `docker system prune` (the command below omits it), the customer has confirmed no unattached named volume holds data they need.
+
+**Command:**
+
+```bash
+# 1. pip cache — fast to rebuild; safe.
+rm -rf ~/.cache/pip/*
+
+# 2. Hugging Face cache — CONTAINS DOWNLOADED MODEL WEIGHTS. Delete only
+#    if the customer accepts re-download cost (can be many GB and minutes).
+#    Preferably: `du -sh ~/.cache/huggingface/*` and remove only the specific
+#    entries they are not using.
+du -sh ~/.cache/huggingface/* 2>/dev/null   # inspect first
+# Then, targeted:
+rm -rf ~/.cache/huggingface/<specific-model-dir>
+
+# 3. journald (safe):
+sudo journalctl --vacuum-size=200M
+
+# 4. Docker prune — see blast-radius note in I.2. Only add --volumes if
+#    the customer has confirmed no named volume holds training data.
+docker system prune -a -f 2>/dev/null || true
+```
+
+**Blast radius:** `rm -rf ~/.cache/huggingface/*` can destroy large model weights requiring slow re-downloads (potentially interrupting training on adjacent jobs that share the cache). `docker system prune -a --volumes -f` without care can delete named volumes holding checkpoints. Always inspect (`du`) and delete targeted paths rather than using wildcards across the whole cache. Redirect caches to `/opt/sagemaker` or `/opt/dlami/nvme` (see I.2) as a long-term fix — separate filesystems with their own inode tables.
+
+---
+
+## J: Configuration
+
+**Signals:** p5.48xlarge shows 96 vCPU instead of 192 (half the expected vCPU count).
+
+### Suggested command — enable SMT via ThreadsPerCore (run this yourself)
+
+**Preconditions:** instance-type confirmed as one where SMT is disabled by default and the workload wants both threads (e.g., p5.48xlarge 96→192); **every field for every instance group is derived from the current `describe-cluster` output** (`update-cluster` replaces the whole `InstanceGroups` list — any mistyped field silently changes cluster config); you understand that changing `ThreadsPerCore` rolls the instance group through replacement.
+
+**Command:**
+
+```bash
+aws sagemaker update-cluster --cluster-name <C> --region <R> \
+  --instance-groups '[{"InstanceGroupName":"<G>","InstanceType":"ml.p5.48xlarge",
+    "InstanceCount":<N>,"ThreadsPerCore":2,
+    "LifeCycleConfig":{"SourceS3Uri":"<URI>","OnCreate":"<SCRIPT>"},
+    "ExecutionRole":"<ROLE>"}]'
+```
+
+**Blast radius:** any instance group omitted from the list is deleted; any field drift (instance type, count, lifecycle config, execution role) is applied as-is. Rolls nodes through replacement — which destroys root + secondary volumes per instance. Coordinate with the workload owner before running.
+
+---
+
+## K: Node Access via SSM
+
+Direct SSH is not available on HyperPod — SSM is the primary node access method. The target format and connection procedure are identical for EKS and Slurm.
+
+### Quick-start: connect in 4 commands
+
+```bash
+CLUSTER_NAME="my-hyperpod-cluster"
+REGION="us-east-1"
+
+# 1. Cluster ID is the ARN suffix — NOT the cluster name
+CLUSTER_ID=$(aws sagemaker describe-cluster \
+  --cluster-name "$CLUSTER_NAME" --region "$REGION" \
+  --query 'ClusterArn' --output text | cut -d/ -f2)
+
+# 2. List nodes
+aws sagemaker list-cluster-nodes --cluster-name "$CLUSTER_NAME" --region "$REGION" \
+  --query 'ClusterNodeSummaries[*].[InstanceGroupName,InstanceId,InstanceStatus.Status]' --output table
+
+# 3. Build the target
+TARGET="sagemaker-cluster:${CLUSTER_ID}_<GROUP>-<INSTANCE_ID>"
+
+# 4. Connect
+aws ssm start-session --target "$TARGET" --region "$REGION"
+```
+
+### From a Slurm node name (e.g. ip-10-1-2-3)
+
+`PrivateDnsHostname` is only returned by `describe-cluster-node` (not by `list-cluster-nodes`), so map via the two-step procedure in § H "Slurm node name → instance ID" — then build the SSM target with the resolved instance ID.
+
+### Non-interactive command execution
+
+```bash
+bash skills/hyperpod-ssm/scripts/ssm-exec.sh --target "$TARGET" --region "$REGION" \
+  'nvidia-smi && free -h && df -h'
+```
+
+### Essential on-node checks
+
+| Check                  | Command                                                 |
+| ---------------------- | ------------------------------------------------------- |
+| System health          | `uptime && free -h && df -h`                            |
+| GPU (NVIDIA)           | `nvidia-smi`                                            |
+| Accelerator (Trainium) | `neuron-ls && neuron-top -n 1`                          |
+| EFA                    | `fi_info -p efa`                                        |
+| NCCL/EFA env           | `env \| grep -E "FI_\|NCCL_"`                           |
+| OOM / errors           | `dmesg \| grep -i "oom\|xid\|nvrm\|neuron" \| tail -20` |
+| Provisioning           | `cat /var/log/provision/provisioning.log`               |
+| Slurmd (Slurm only)    | `sudo systemctl status slurmd`                          |
+
+### Prerequisites
+
+```bash
+session-manager-plugin --version
+# If missing, install session-manager-plugin for your OS — see the
+# AWS Systems Manager Session Manager documentation for current packages.
+```
+
+IAM:
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [{
+    "Effect": "Allow",
+    "Action": [
+      "sagemaker:DescribeCluster",
+      "sagemaker:DescribeClusterNode",
+      "sagemaker:ListClusterNodes",
+      "ssm:StartSession",
+      "ssm:TerminateSession"
+    ],
+    "Resource": "*"
+  }]
+}
+```
+
+### SSM not working?
+
+| Error                                   | Fix                                                                                    |
+| --------------------------------------- | -------------------------------------------------------------------------------------- |
+| `SessionManagerPlugin is not found`     | Install plugin; restart terminal                                                       |
+| `Target is not connected`               | Use `sagemaker-cluster:` prefix (not bare `i-xxx`); verify region; verify node Running |
+| `InvalidTarget` / `ValidationException` | Format must be exactly `sagemaker-cluster:<CLUSTER_ID>_<GROUP>-<INSTANCE_ID>`          |
+| `Access denied`                         | Need `ssm:StartSession`, `sagemaker:DescribeCluster`, `sagemaker:ListClusterNodes`     |
+| Connection timeout                      | Check VPC endpoints (SSM, SSMMessages, EC2Messages); verify node Running               |
+
+---
+
+## L: Log Collection
+
+**Delegate to `hyperpod-issue-report`** for comprehensive S3-stored diagnostics.
+
+| Log               | Group                                 | Stream                                                 |
+| ----------------- | ------------------------------------- | ------------------------------------------------------ |
+| Lifecycle scripts | `/aws/sagemaker/Clusters/<name>/<id>` | `LifecycleConfig/<group>/<instance-id>`                |
+| Health monitoring | `/aws/sagemaker/Clusters/<name>/<id>` | `SagemakerHealthMonitoringAgent/<group>/<instance-id>` |
+
+---
+
+## M: Container Runtime
+
+**Signals:** CrashLoopBackOff, ImagePullBackOff, RunContainerError, container OOM kills (EKS clusters).
+
+```bash
+# Pod-level (from workstation)
+kubectl describe pod <POD> -n <NAMESPACE>
+kubectl logs <POD> -n <NAMESPACE> --previous       # logs from last crash
+
+# On-node (via SSM)
+bash skills/hyperpod-ssm/scripts/ssm-exec.sh --target <TARGET> --region <REGION> \
+  'sudo crictl ps -a | head -20 && sudo crictl logs --tail 30 <CONTAINER_ID> && journalctl -u containerd --no-pager -n 50'
+```
+
+| Symptom                   | Cause                               | Fix                                                               |
+| ------------------------- | ----------------------------------- | ----------------------------------------------------------------- |
+| `CrashLoopBackOff`        | Training process crashes repeatedly | `kubectl logs --previous`; likely OOM, missing lib, or NCCL error |
+| `OOMKilled`               | Container exceeded memory limit     | Raise `resources.limits.memory` or reduce batch size              |
+| `ImagePullBackOff`        | Image not found or auth failure     | Verify ECR URI; ECR access via VPC endpoint or internet           |
+| `RunContainerError`       | Runtime can't start container       | `journalctl -u containerd`; may be disk full or GPU device issue  |
+| `ContainerCreating` stuck | Volume mount or device plugin issue | Check EFA device plugin DaemonSet, volume mounts, CSI drivers     |
+
+If containerd is crashing or OOM-ing, check disk on `/var/lib/containerd` (lives on the root 100 GB volume). Move container storage to `/opt/sagemaker` if needed.
+
+---
+
+## N: Kernel & System
+
+**Signals:** Kernel panic, watchdog timeout, NMI, system hang, unexpected reboot not explained by HyperPod health monitoring.
+
+```bash
+bash skills/hyperpod-ssm/scripts/ssm-exec.sh --target <TARGET> --region <REGION> \
+  'dmesg | grep -iE "panic|watchdog|hung_task|NMI|nvrm|Call Trace|BUG:" | tail -30 && journalctl -b -1 --no-pager -n 50 2>/dev/null || echo "No previous boot journal"'
+```
+
+| Signal                       | Likely cause                               | Action                                                                                     |
+| ---------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------ |
+| `Kernel panic - not syncing` | Critical kernel error                      | Full `dmesg`; nvrm-related signatures suggest NVIDIA driver — reboot, replace if recurring |
+| `watchdog: BUG: soft lockup` | CPU stuck in kernel code                   | Often NVLink/PCIe issues on GPU instances; reboot, replace if recurring                    |
+| `hung_task_timeout`          | Process stuck in uninterruptible sleep     | Check disk I/O (`iostat`), NFS hangs, deadlocked GPU ops                                   |
+| `NMI received`               | Hardware interrupt                         | Drain and replace (§ F)                                                                    |
+| `mce: [Hardware Error]`      | Machine check exception                    | CPU/memory hardware failure — replace                                                      |
+| Repeated unexpected reboots  | Health agent triggered reboot for HW fault | Check `SagemakerHealthMonitoringAgent` logs; expected if auto-repair is working            |
+
+Previous boot logs:
+
+```bash
+journalctl -b -1 --no-pager | tail -100
+last reboot | head -5
+who -b
+```
+
+Recurring panics on the same node after reboot → hardware is likely bad; drain and replace (§ F).
+
+---
+
+## O: CNI / Pod Networking
+
+VPC CNI plugin (`aws-node`) failures prevent pods from getting IP addresses — breaks all pod networking on affected nodes. Pattern seen in customer cases: HyperPod GPU node is `Ready` but `aws-node` is in `CrashLoopBackOff`, pod sandbox creation fails with `gRPC 127.0.0.1:50051 refused`.
+
+### Diagnose
+
+```bash
+kubectl get ds -n kube-system aws-node                     # DESIRED vs READY mismatch
+kubectl get pods -n kube-system -l k8s-app=aws-node -o wide  # CrashLoopBackOff / Error / high RESTARTS
+
+# Pod logs
+kubectl logs -n kube-system <aws-node-pod> -c aws-node --tail=100
+kubectl logs -n kube-system <aws-node-pod> -c aws-eks-nodeagent --tail=50
+
+# IPAMD-specific
+kubectl logs -n kube-system <aws-node-pod> -c aws-node --tail=100 | grep -iE "ipamd|eni|ip pool|failed"
+
+# Related DaemonSets
+kubectl get pods -n kube-system -l k8s-app=kube-proxy
+kubectl get pods -n kube-system -l k8s-app=kube-dns
+```
+
+| Log pattern                                         | Root cause                                        | Fix                                                           |
+| --------------------------------------------------- | ------------------------------------------------- | ------------------------------------------------------------- |
+| `gRPC connection refused 127.0.0.1:50051`           | IPAMD not running; aws-node init container failed | Restart aws-node pod; check node IAM role                     |
+| `Failed to create ENI` / `ENI limit reached`        | Instance-type ENI limit reached                   | Reduce pod density or enable prefix delegation                |
+| `UnauthorizedOperation: ec2:CreateNetworkInterface` | Node IAM role missing EC2 permissions             | Add `AmazonEKS_CNI_Policy` to the node role                   |
+| `Failed to pull image` on aws-node                  | ECR unreachable in private VPC                    | Add `com.amazonaws.<region>.ecr.api` and `.dkr` VPC endpoints |
+| `Insufficient IP addresses`                         | Subnet exhausted                                  | Larger subnet or enable prefix delegation                     |
+| `ipamd: failed to increase IP pool`                 | Cannot allocate warm-pool IPs                     | Check ENI limits, subnet capacity, SG rules                   |
+
+Diagnose (read-only):
+
+```bash
+aws ec2 describe-subnets --subnet-ids <SUBNET_ID> --region <REGION> \
+  --query 'Subnets[0].{SubnetId:SubnetId,AvailableIPs:AvailableIpAddressCount,CIDR:CidrBlock}'
+```
+
+### Suggested command — restart a crashing aws-node pod (run this yourself)
+
+**Preconditions:** root cause has been investigated and is plausibly transient (e.g., a stuck IPAMD process). For persistent crashes from IAM, VPC, or subnet exhaustion, fix the underlying issue first — restarting the pod will only loop. The customer accepts brief CNI unavailability on this node (a few seconds while the daemonset respawns).
+
+**Command:**
+
+```bash
+kubectl delete pod -n kube-system <aws-node-pod-name>
+```
+
+**Blast radius:** the daemonset respawns the pod within seconds; during the gap, pods being scheduled or deleted on this node may briefly fail IP assignment. Already-running pods with assigned IPs are unaffected. Reversible by definition (replacement pod is identical).
+
+### Suggested command — enable prefix delegation for higher pod density (run this yourself)
+
+**Preconditions:** cluster admin has approved the operational change; you understand that prefix delegation changes ENI allocation behavior for every node managed by this daemonset; no existing workload relies on the previous per-IP allocation pattern.
+
+**Command:**
+
+```bash
+kubectl set env daemonset aws-node -n kube-system ENABLE_PREFIX_DELEGATION=true
+```
+
+**Blast radius:** cluster-wide change to the VPC CNI configuration. New pods scheduled after the rollout get IPs from ENI prefixes rather than individual secondary IPs. Existing pods keep their IPs. Reverting requires `kubectl set env daemonset aws-node -n kube-system ENABLE_PREFIX_DELEGATION-` (note the trailing `-`) and may leave ENIs in an unexpected state until nodes cycle.
+
+The node role needs `AmazonEKS_CNI_Policy` or equivalent: `ec2:CreateNetworkInterface`, `DeleteNetworkInterface`, `DescribeNetworkInterfaces`, `AssignPrivateIpAddresses`, `UnassignPrivateIpAddresses`, `AttachNetworkInterface`, `DetachNetworkInterface`.
+
+### Escalate
+
+If `aws-node` keeps crashing after restart with no clear error, and IAM + VPC + subnet are all correct, escalate with:
+
+```bash
+kubectl describe ds -n kube-system aws-node
+kubectl logs -n kube-system -l k8s-app=aws-node --tail=200
+kubectl get nodes -o wide
+```
diff --git a/plugins/sagemaker-ai/skills/hyperpod-node-debugger/references/node-issue-catalog.md b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/references/node-issue-catalog.md
new file mode 100644
index 00000000..53a6e990
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/references/node-issue-catalog.md
@@ -0,0 +1,141 @@
+# Node Issue Catalog
+
+Patterns seen in real customer cases. Each entry: symptoms → root cause → diagnostic → fix. For the full remediation procedures see [node-diagnostics-detail.md](node-diagnostics-detail.md); this catalog is the quick-pattern lookup.
+
+---
+
+## 1. EFA
+
+### 1.1 Primary EFA health-check failure
+
+Covered in [node-diagnostics-detail.md § A](node-diagnostics-detail.md#a-efa--security-group).
+
+### 1.2 EFA not working after node replacement
+
+**Symptoms:** Training hangs at NCCL init after replacing one or more nodes; `fi_info -p efa` returns no providers on the replacement; other nodes work.
+
+**Root cause:** EFA driver not loaded, or version drift after an AMI update.
+
+```bash
+# On the affected node (via SSM):
+lsmod | grep efa                            # efa module loaded?
+fi_info -p efa                              # EFA endpoints visible?
+cat /opt/amazon/efa_installed_packages      # version
+```
+
+**Fix:** Compare versions across nodes with the `hyperpod-version-checker` skill. If versions differ, the lifecycle script likely needs updating.
+
+### 1.3 EFA intermittent failures
+
+**Symptoms:** Training works sometimes, randomly hangs; NCCL logs show `Using network TCP` on some iterations (EFA fallback).
+
+**Root cause:** EFA interface flapping, NIC errors, or PCIe issues.
+
+```bash
+# On the affected node (via SSM):
+ip -s link show 2>/dev/null | grep -A5 "RX\|TX"   # errors / drops
+dmesg | grep -i "efa\|pcie\|error" | tail -20
+bash /tmp/check-node-reachability.sh               # full EFA health check (upload the script to the node first via ssm-exec.sh --upload)
+```
+
+---
+
+## 2. GPU / Accelerator
+
+### 2.1 GPU off bus (XID 79)
+
+**Symptoms:** `nvidia-smi` shows fewer GPUs than expected; `dmesg` has `Xid 79: GPU has fallen off the bus`; training fails with CUDA device not found.
+
+**Root cause:** Hardware — GPU disconnected from PCIe bus.
+
+```bash
+nvidia-smi -L | wc -l              # visible GPUs
+dmesg | grep -i "xid.*79\|off the bus"
+lspci | grep -i nvidia | wc -l     # physical GPU count
+```
+
+**Fix:** Drain and replace — see the Suggested-command blocks in [node-diagnostics-detail.md § G (drain)](node-diagnostics-detail.md#accelerator-failure--section-f) and [§ F (batch-replace)](node-diagnostics-detail.md#f-hardware--auto-repair) for Preconditions / Blast-radius. Root + secondary volumes are destroyed on replace.
+
+### 2.2 ECC errors
+
+**Symptoms:** `nvidia-smi -q` shows non-zero ECC counts; training produces NaNs or incorrect gradients; throughput degrades on a specific GPU.
+
+```bash
+nvidia-smi -q | grep -A 10 "ECC Errors"
+nvidia-smi --query-gpu=index,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total --format=csv
+```
+
+Correctable errors (CE) at a low, steady rate are normal background noise. **Any uncorrectable error (UCE) indicates failing memory — drain and replace.** A persistently growing CE rate is also a warning sign and worth escalating even without UCE.
+
+### 2.3 Thermal throttling
+
+**Symptoms:** GPU utilization drops periodically; `nvidia-smi dmon` shows rising temperature and clock ramp-down; training throughput varies over time.
+
+```bash
+nvidia-smi dmon -s pucvmet -d 5
+nvidia-smi --query-gpu=temperature.gpu,power.draw,clocks.current.sm --format=csv
+```
+
+Persistent throttling on a single GPU when others stay cool typically points at a hardware-level thermal or power-delivery issue — drain and replace, and capture `nvidia-bug-report.sh` for the support case.
+
+### 2.4 NVLink failures
+
+**Symptoms:** Inter-GPU communication slow on the same node; `nvidia-smi nvlink --status` shows inactive links; XID 74 in dmesg.
+
+```bash
+nvidia-smi nvlink --status
+nvidia-smi topo -m             # should show NVLinks, not PHB-only paths
+dmesg | grep -i "xid.*74\|nvlink"
+```
+
+**Fix:** Drain and replace.
+
+---
+
+## 3. Slurm
+
+### 3.1 "Node unexpectedly rebooted"
+
+**Symptoms:** `sinfo` shows node `down`; reason `"Node unexpectedly rebooted"`; node is actually running and accessible.
+
+**Root cause:** Node rebooted without notifying Slurm; slurmd may not have restarted.
+
+```bash
+scontrol show node <NODE> | grep -E "State|Reason"
+# On node via SSM:
+sudo systemctl status slurmd
+```
+
+**Fix:** restart slurmd on the node and resume on the controller — see [node-diagnostics-detail.md § H (Slurm Node Management)](node-diagnostics-detail.md#h-slurm-node-management) for the framed procedure.
+
+### 3.2 Jobs stuck COMPLETING after node replacement
+
+**Symptoms:** Jobs stay in COMPLETING indefinitely; node was recently replaced.
+
+**Root cause:** slurmctld cached the COMPLETING state and keeps waiting for the replaced node.
+
+**Fix:** restart slurmctld (preserves running jobs, queue, and node states) — see the Suggested-command block in [node-diagnostics-detail.md § H (Jobs stuck PENDING / COMPLETING)](node-diagnostics-detail.md#jobs-stuck-pending--completing--restart-slurmctld).
+
+### 3.3 GRES (GPU) miscalculation
+
+**Symptoms:** Jobs stuck PENDING with `Reason=Resources` despite free GPUs; `scontrol show node` shows the wrong GRES count.
+
+**Root cause:** GRES resources not released after job completion or node replacement.
+
+**Fix:** restart slurmctld — same Suggested-command block as 3.2 above. Verify with `scontrol show node <NODE> | grep Gres`.
+
+---
+
+## 4. Configuration
+
+### 4.1 Wrong vCPU count (e.g. 96 on p5.48xlarge instead of 192)
+
+**Symptoms:** `nproc` shows half the expected vCPU count for the instance family; jobs configured for the full count can't schedule.
+
+**Fix:** See [node-diagnostics-detail.md § J](node-diagnostics-detail.md#j-configuration) for the `update-cluster` fix using `ThreadsPerCore`.
+
+---
+
+## 5. Resource exhaustion
+
+See [node-diagnostics-detail.md § I](node-diagnostics-detail.md#i-resource-exhaustion) — full coverage of root volume exhaustion, `os.fork()` memory error with EFA, OOM kills, inode exhaustion, and time sync.
diff --git a/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-efa-sg.sh b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-efa-sg.sh
new file mode 100755
index 00000000..cf8677df
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-efa-sg.sh
@@ -0,0 +1,355 @@
+#!/usr/bin/env bash
+# check-efa-sg.sh
+#
+# Identify and diagnose EFA security group rules for a HyperPod cluster.
+# Automatically extracts the cluster's exact VPC, subnets, and security groups
+# from the cluster ARN — works correctly even in accounts with 1000s of resources.
+#
+# Usage (preferred — cluster-centric, auto-discovers resources):
+#   bash check-efa-sg.sh --cluster <cluster-name-or-arn> --region <region>
+#
+# Usage (direct SG mode — when SG is already known):
+#   bash check-efa-sg.sh --sg-id <sg-id> --region <region>
+#
+# Exit codes:
+#   0 — all required rules in place
+#   1 — one or more required rules missing
+
+set -euo pipefail
+
+for cmd in aws python3; do  # fail fast when a required CLI is missing
+  command -v "$cmd" &>/dev/null || {
+    echo "ERROR: '$cmd' is required but not found. Install it and retry." >&2  # diagnostics belong on stderr
+    exit 1
+  }
+done
+
+CLUSTER=""
+SG_ID=""
+REGION="${AWS_DEFAULT_REGION:-}"
+USE_COLOR=true
+
+usage() {
+  cat <<EOF
+Usage:
+  $0 --cluster <cluster-name-or-arn> --region <region> [--no-color]
+  $0 --sg-id   <sg-id>               --region <region> [--no-color]
+
+Read-only diagnostic for EFA-related security group rules on a HyperPod
+cluster. Reports inbound/outbound self-referencing rules and warns on
+0.0.0.0/0 outbound (which the HyperPod docs advise against on the EFA SG).
+On any [FAIL] the script ends with a pointer to
+"references/node-diagnostics-detail.md § A (EFA / Security Group)".
+
+Options:
+  --cluster   Auto-discovers SGs, subnets, VPC from the cluster (preferred).
+  --sg-id     Check a specific security group directly.
+  --region    AWS region (required unless \$AWS_DEFAULT_REGION is set).
+  --no-color  Disable ANSI colors.
+  -h, --help  Show this message.
+
+Exit codes:
+  0  All required rules present.
+  1  One or more required rules missing.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do  # ${2?...} aborts with a clear message when a flag is given without its value
+  case "$1" in
+    --cluster)  CLUSTER="${2?--cluster requires a value}"; shift 2 ;;
+    --sg-id)    SG_ID="${2?--sg-id requires a value}";     shift 2 ;;
+    --region)   REGION="${2?--region requires a value}";   shift 2 ;;
+    --no-color) USE_COLOR=false; shift ;;
+    -h|--help)  usage; exit 0 ;;
+    *) echo "Unknown argument: $1" >&2; usage >&2; exit 1 ;;
+  esac
+done
+
+if [[ -z "$CLUSTER" && -z "$SG_ID" ]]; then
+  usage >&2
+  exit 1
+fi
+
+if [[ -z "$REGION" ]]; then
+  echo "ERROR: --region is required (or set AWS_DEFAULT_REGION before running)." >&2
+  exit 2
+fi
+
+# Mutually exclusive: --cluster auto-discovers SGs, --sg-id targets one specific SG.
+# Passing both was silently ignoring --sg-id — error instead so the caller notices.
+if [[ -n "$CLUSTER" && -n "$SG_ID" ]]; then
+  echo "ERROR: --cluster and --sg-id are mutually exclusive (pick one)" >&2
+  exit 2
+fi
+
+if [[ -n "$SG_ID" && ! "$SG_ID" =~ ^sg-[a-fA-F0-9]{8,17}$ ]]; then  # reject malformed IDs before calling AWS
+  echo "ERROR: Invalid security group ID format: '$SG_ID' (expected sg-<hex>, e.g. sg-0abc1234def56789a)" >&2
+  exit 1
+fi
+
+if ! [ -t 1 ] || [ "${TERM:-}" = "dumb" ]; then
+  USE_COLOR=false
+fi
+if "$USE_COLOR"; then
+  RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
+  BOLD='\033[1m'; NC='\033[0m'
+else
+  RED=''; GREEN=''; YELLOW=''; BOLD=''; NC=''
+fi
+
+check_single_sg() {
+  local sg_id="$1"
+  local region="$2"
+  local issues=0
+
+  echo ""
+  echo -e "${BOLD}=== EFA Security Group Diagnostic ===${NC}"
+  echo -e "Security Group: ${BOLD}${sg_id}${NC}  Region: ${BOLD}${region}${NC}"
+  echo ""
+
+  local sg_json
+  sg_json=$(aws ec2 describe-security-groups \
+    --group-ids "$sg_id" \
+    --region "$region" \
+    --cli-read-timeout 30 \
+    --output json 2>&1) || {
+    echo -e "${RED}ERROR: Cannot describe security group '$sg_id' in region '$region'${NC}"
+    echo "$sg_json"
+    return 1
+  }
+
+  # Distinguish "API succeeded but returned empty" (auth-denied or malformed JSON
+  # still yielding exit 0) from "SG genuinely has no rules". Without this, the
+  # three rule checks below would each emit [FAIL], misleading the customer
+  # into thinking rules are missing when the check itself could not run.
+  local sg_count
+  sg_count=$(echo "$sg_json" | python3 -c "import sys,json; d=json.load(sys.stdin); print(len(d.get('SecurityGroups',[])))" 2>/dev/null || echo 0)
+  if [[ "$sg_count" -eq 0 ]]; then
+    echo -e "  ${YELLOW}[WARN]${NC} Unable to check SG rules — describe-security-groups returned no data for '$sg_id' (possible IAM denial or stale ID)"
+    echo -e "         → references/node-diagnostics-detail.md § A (EFA / Security Group)"
+    return 0
+  fi
+
+  local sg_name vpc_id
+  sg_name=$(echo "$sg_json" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['SecurityGroups'][0].get('GroupName','unknown'))" 2>/dev/null || echo "unknown")
+  vpc_id=$(echo "$sg_json"  | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['SecurityGroups'][0].get('VpcId','unknown'))"   2>/dev/null || echo "unknown")
+  echo -e "Name: ${sg_name}  |  VPC: ${vpc_id}"
+  echo ""
+
+  echo -e "${BOLD}--- Inbound Rules ---${NC}"
+  echo "$sg_json" | python3 -c "
+import sys, json
+d = json.load(sys.stdin)['SecurityGroups'][0]
+rules = d.get('IpPermissions', [])
+if not rules:
+    print('  (none)')
+for r in rules:
+    proto = r.get('IpProtocol', '?')
+    srcs  = [g.get('GroupId','') for g in r.get('UserIdGroupPairs', [])]
+    cidrs = [c.get('CidrIp','') for c in r.get('IpRanges', [])]
+    for s in srcs:  print(f'  proto={proto} source=sg:{s}')
+    for c in cidrs: print(f'  proto={proto} source={c}')
+" 2>/dev/null
+
+  echo ""
+  echo -e "${BOLD}--- Outbound Rules ---${NC}"
+  echo "$sg_json" | python3 -c "
+import sys, json
+d = json.load(sys.stdin)['SecurityGroups'][0]
+rules = d.get('IpPermissionsEgress', [])
+if not rules:
+    print('  (none)')
+for r in rules:
+    proto = r.get('IpProtocol', '?')
+    dests = [g.get('GroupId','') for g in r.get('UserIdGroupPairs', [])]
+    cidrs = [c.get('CidrIp','') for c in r.get('IpRanges', [])]
+    for s in dests: print(f'  proto={proto} dest=sg:{s}')
+    for c in cidrs: print(f'  proto={proto} dest={c}')
+" 2>/dev/null
+
+  echo ""
+  echo -e "${BOLD}--- Rule Check Results ---${NC}"
+
+  local inbound_self outbound_self outbound_inet
+  inbound_self=$(echo "$sg_json" | SG_CHECK_ID="$sg_id" python3 -c "
+import sys, json, os
+sg=os.environ['SG_CHECK_ID']
+d = json.load(sys.stdin)['SecurityGroups'][0]
+for r in d.get('IpPermissions', []):
+    if r.get('IpProtocol') == '-1':
+        if any(g.get('GroupId') == sg for g in r.get('UserIdGroupPairs', [])):
+            print('found'); exit(0)
+" 2>/dev/null || echo "")
+
+  outbound_self=$(echo "$sg_json" | SG_CHECK_ID="$sg_id" python3 -c "
+import sys, json, os
+sg=os.environ['SG_CHECK_ID']
+d = json.load(sys.stdin)['SecurityGroups'][0]
+for r in d.get('IpPermissionsEgress', []):
+    if r.get('IpProtocol') == '-1':
+        if any(g.get('GroupId') == sg for g in r.get('UserIdGroupPairs', [])):
+            print('found'); exit(0)
+" 2>/dev/null || echo "")
+
+  outbound_inet=$(echo "$sg_json" | python3 -c "
+import sys, json
+d = json.load(sys.stdin)['SecurityGroups'][0]
+for r in d.get('IpPermissionsEgress', []):
+    if r.get('IpProtocol') == '-1':
+        if any(c.get('CidrIp') == '0.0.0.0/0' for c in r.get('IpRanges', [])):
+            print('found'); exit(0)
+" 2>/dev/null || echo "")
+
+  if [[ "$inbound_self" == "found" ]]; then
+    echo -e "  ${GREEN}[PASS]${NC} Inbound self-referencing rule (all traffic from ${sg_id})"
+  else
+    echo -e "  ${RED}[FAIL]${NC} Missing inbound self-referencing rule (all traffic from ${sg_id})"
+    issues=$((issues+1))
+  fi
+
+  if [[ "$outbound_self" == "found" ]]; then
+    echo -e "  ${GREEN}[PASS]${NC} Outbound self-referencing rule (all traffic to ${sg_id}) ← required for EFA"
+  else
+    echo -e "  ${RED}[FAIL]${NC} Missing outbound self-referencing rule ← ${BOLD}PRIMARY cause of EFA health check failure${NC}"
+    issues=$((issues+1))
+  fi
+
+  if [[ "$outbound_inet" == "found" ]]; then
+    echo -e "  ${YELLOW}[WARN]${NC} Outbound 0.0.0.0/0 rule present — HyperPod docs advise against this on the EFA SG (can cause EFA health check failures). Move internet egress to the subnet (NAT or VPC endpoints)."
+  else
+    echo -e "  ${GREEN}[PASS]${NC} No outbound 0.0.0.0/0 on EFA SG (correct per HyperPod prerequisites)"
+  fi
+
+  if [[ $issues -gt 0 ]]; then
+    echo ""
+    echo -e "  ${YELLOW}→ See references/node-diagnostics-detail.md § A (EFA / Security Group) for remediation.${NC}"
+  fi
+
+  return "$issues"
+}
+
+if [[ -n "$CLUSTER" ]]; then
+  echo ""
+  echo -e "${BOLD}=== HyperPod Cluster Resource Discovery ===${NC}"
+  echo -e "Cluster: ${BOLD}${CLUSTER}${NC}"
+  echo -e "Region:  ${BOLD}${REGION}${NC}"
+  echo ""
+
+  CLUSTER_JSON=$(aws sagemaker describe-cluster \
+    --cluster-name "$CLUSTER" \
+    --region "$REGION" \
+    --cli-read-timeout 30 \
+    --output json 2>&1) || {
+    echo -e "${RED}ERROR: Cannot find cluster '$CLUSTER' in region '$REGION'${NC}"
+    echo ""
+    echo "Available clusters in this region:"
+    aws sagemaker list-clusters --region "$REGION" \
+      --query 'ClusterSummaries[*].{Name:ClusterName,Status:ClusterStatus,ARN:ClusterArn}' \
+      --output table 2>/dev/null || echo "  (unable to list clusters)"
+    echo "$CLUSTER_JSON"
+    exit 1
+  }
+
+  CLUSTER_ARN=$(echo "$CLUSTER_JSON"    | python3 -c "import sys,json; print(json.load(sys.stdin).get('ClusterArn',''))"    2>/dev/null || echo "")
+  CLUSTER_STATUS=$(echo "$CLUSTER_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin).get('ClusterStatus',''))" 2>/dev/null || echo "")
+  ORCHESTRATOR=$(echo "$CLUSTER_JSON"   | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+print('EKS' if 'Eks' in d.get('Orchestrator',{}) else 'Slurm')
+" 2>/dev/null || echo "Unknown")
+
+  echo -e "  ARN:          ${CLUSTER_ARN}"
+  echo -e "  Status:       ${CLUSTER_STATUS}"
+  echo -e "  Orchestrator: ${ORCHESTRATOR}"
+
+  RESOURCES=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+vpc=d.get('VpcConfig',{})
+sgs=vpc.get('SecurityGroupIds',[])
+subnets=vpc.get('Subnets',[])
+print('SGs='     + ','.join(sgs))
+print('Subnets=' + ','.join(subnets))
+" 2>/dev/null || echo "")
+
+  CLUSTER_SGS=$(echo "$RESOURCES"     | grep "^SGs="     | cut -d= -f2)
+  CLUSTER_SUBNETS=$(echo "$RESOURCES" | grep "^Subnets=" | cut -d= -f2)
+
+  if [[ -z "$CLUSTER_SGS" ]]; then
+    echo -e "${YELLOW}[WARN]${NC} No SecurityGroupIds in cluster VpcConfig — cluster may not have customer VPC"
+    exit 0
+  fi
+
+  VPC_ID="unknown"
+  if [[ -n "$CLUSTER_SUBNETS" ]]; then
+    FIRST_SUBNET=$(echo "$CLUSTER_SUBNETS" | tr ',' '\n' | head -1)
+    VPC_ID=$(aws ec2 describe-subnets \
+      --subnet-ids "$FIRST_SUBNET" \
+      --region "$REGION" \
+      --query 'Subnets[0].VpcId' \
+      --output text 2>/dev/null || echo "unknown")
+  fi
+
+  echo ""
+  echo -e "${BOLD}  Resources owned by cluster '${CLUSTER}':${NC}"
+  echo -e "  VPC:              ${VPC_ID}"
+  echo -e "  Security Groups:  ${CLUSTER_SGS}"
+  echo -e "  Subnets:          ${CLUSTER_SUBNETS}"
+
+  if [[ -n "$CLUSTER_SUBNETS" ]]; then
+    echo ""
+    echo -e "${BOLD}  Subnet details:${NC}"
+    IFS=',' read -ra _subnet_arr <<< "$CLUSTER_SUBNETS"
+    aws ec2 describe-subnets \
+      --subnet-ids "${_subnet_arr[@]}" \
+      --region "$REGION" \
+      --query 'Subnets[*].{SubnetId:SubnetId,AZ:AvailabilityZone,FreeIPs:AvailableIpAddressCount,VpcId:VpcId}' \
+      --output table 2>/dev/null || echo "  (unable to describe subnets)"
+  fi
+
+  echo ""
+  TOTAL_ISSUES=0
+  # CLUSTER_SGS is guaranteed non-empty at the -z guard above, but defend anyway.
+  # grep -c returns exit 1 on zero matches under pipefail, so suppress and then
+  # explicitly branch on the count rather than letting 0 silently fall through.
+  SG_COUNT=$(echo "$CLUSTER_SGS" | tr ',' '\n' | grep -c . || true)
+  if [[ "${SG_COUNT:-0}" -eq 0 ]]; then
+    echo -e "  ${YELLOW}[WARN]${NC} No security groups resolved from CLUSTER_SGS — cannot run EFA rule check"
+    echo -e "         → references/node-diagnostics-detail.md § A (EFA / Security Group)"
+    exit 0
+  fi
+  echo -e "${BOLD}Checking ${SG_COUNT} security group(s) for cluster '${CLUSTER}'...${NC}"
+
+  for SG in $(echo "$CLUSTER_SGS" | tr ',' ' '); do
+    echo ""
+    echo -e "${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+    # Capture the return code with `|| sg_rc=$?` so a non-zero return survives `set -e` —
+    # otherwise the first SG with issues aborts the loop and later SGs are never checked.
+    sg_rc=0
+    check_single_sg "$SG" "$REGION" || sg_rc=$?
+    TOTAL_ISSUES=$((TOTAL_ISSUES + sg_rc))
+  done
+
+  echo ""
+  echo -e "${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+  if [[ $TOTAL_ISSUES -gt 0 ]]; then
+    echo -e "${RED}${BOLD}RESULT: ${TOTAL_ISSUES} security group rule issue(s) found for cluster '${CLUSTER}'${NC}"
+    echo "Fix the [FAIL] rules above (see references/node-diagnostics-detail.md § A for the Suggested-command block); if cluster creation was failing on EFA health checks, retry creation after fixing."
+    echo ""
+    echo "Verify after fixing:"
+    echo "  bash check-efa-sg.sh --cluster ${CLUSTER} --region ${REGION}"
+    exit 1
+  else
+    echo -e "${GREEN}${BOLD}RESULT: All EFA security group rules correctly configured for cluster '${CLUSTER}'${NC}"
+    echo ""
+    echo "If EFA health checks still fail:"
+    echo "  1. Verify all instance groups use one of these SGs: ${CLUSTER_SGS}"
+    echo "  2. Run check-node-reachability.sh on affected nodes via hyperpod-ssm skill"
+    exit 0
+  fi
+fi
+
+if [[ -n "$SG_ID" ]]; then
+  check_single_sg "$SG_ID" "$REGION" || exit 1  # function returns the issue count (1-2); normalize to the documented exit 1
+  exit 0
+fi
diff --git a/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-node-reachability.sh b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-node-reachability.sh
new file mode 100755
index 00000000..95f8f31a
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-node-reachability.sh
@@ -0,0 +1,389 @@
+#!/usr/bin/env bash
+# check-node-reachability.sh
+#
+# Diagnose EFA reachability and inter-node communication health on a single
+# HyperPod node. Run this on each node via the hyperpod-ssm skill.
+#
+# Usage (via ssm-exec.sh):
+#   ssm-exec.sh --target <TARGET> --upload scripts/check-node-reachability.sh /tmp/check-node-reachability.sh
+#   ssm-exec.sh --target <TARGET> 'bash /tmp/check-node-reachability.sh'
+#
+# Usage (direct on node):
+#   bash check-node-reachability.sh [--json] [--no-color]
+#
+# Exit codes:
+#   0 — all critical checks passed
+#   1 — one or more critical checks failed
+
+set -euo pipefail
+
+# Note: this script runs ON the node (via SSM), so aws CLI may not be present.
+# No tool is required up front; sections probe for optional tools (fi_info, ethtool, aws, ip, neuron-ls) before use.
+
+JSON_MODE=false
+USE_COLOR=true
+
+usage() {   # print CLI help to stdout; callers redirect to stderr on bad arguments
+  cat <<EOF
+Usage: bash check-node-reachability.sh [--json] [--no-color]
+
+Read-only on-node diagnostic for EFA reachability and inter-node communication
+health. Must be executed on a HyperPod compute node (typically via the
+hyperpod-ssm skill). Checks the EFA kernel module, /dev/infiniband devices,
+libfabric EFA provider, EFA interfaces, EFA count for the instance type,
+aws-ofi-nccl plugin, IMDS, interface error counters and Neuron devices.
+
+Options:
+  --json       Emit findings as JSON instead of human-readable output.
+  --no-color   Disable ANSI colors.
+  -h, --help   Show this message.
+
+Exit codes:
+  0  All critical checks passed.
+  1  One or more critical checks failed.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --json)     JSON_MODE=true;  shift ;;
+    --no-color) USE_COLOR=false; shift ;;
+    -h|--help)  usage; exit 0 ;;
+    *) echo "Unknown argument: $1" >&2; usage >&2; exit 1 ;;
+  esac
+done
+
+# Colors — auto-disable when stdout isn't a TTY.
+if ! [ -t 1 ] || [ "${TERM:-}" = "dumb" ]; then
+  USE_COLOR=false
+fi
+if "$USE_COLOR"; then
+  RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
+  BOLD='\033[1m';   NC='\033[0m'
+else
+  RED=''; GREEN=''; YELLOW=''; BOLD=''; NC=''
+fi
+
+HOSTNAME=$(hostname 2>/dev/null || echo "unknown")
+TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+CRITICAL_FAILURES=0
+declare -A RESULTS   # associative array: check_name → pass|fail|warn|skip
+
+pass()  { RESULTS["$1"]="pass";  [[ "$JSON_MODE" == false ]] && echo -e "  ${GREEN}[PASS]${NC}  $1${2:+ — $2}"; }
+fail()  { RESULTS["$1"]="fail";  CRITICAL_FAILURES=$((CRITICAL_FAILURES+1)); \
+           [[ "$JSON_MODE" == false ]] && echo -e "  ${RED}[FAIL]${NC}  $1${2:+ — $2}"; }
+warn()  { RESULTS["$1"]="warn";  [[ "$JSON_MODE" == false ]] && echo -e "  ${YELLOW}[WARN]${NC}  $1${2:+ — $2}"; }
+skip()  { RESULTS["$1"]="skip";  [[ "$JSON_MODE" == false ]] && echo -e "         [SKIP]  $1${2:+ — $2}"; }
+info()  { [[ "$JSON_MODE" == false ]] && echo -e "         $1"; }
+
+if [[ "$JSON_MODE" == false ]]; then
+  echo ""
+  echo -e "${BOLD}=== HyperPod Node EFA Reachability Check ===${NC}"
+  echo -e "Host:      ${BOLD}${HOSTNAME}${NC}"
+  echo -e "Timestamp: ${TIMESTAMP}"
+  echo ""
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo -e "${BOLD}--- EFA Kernel Module ---${NC}"; fi
+
+EFA_MODULE=$(lsmod 2>/dev/null | grep -E '^efa\b' | awk '{print $1}' || true)
+if [[ -n "$EFA_MODULE" ]]; then
+  EFA_MODULE_VER=$(modinfo efa 2>/dev/null | grep -E '^version:' | awk '{print $2}' || echo "unknown")
+  pass "efa_kernel_module" "loaded (version: ${EFA_MODULE_VER})"
+else
+  # Read-only invariant: detect only, never `sudo modprobe efa` — loading kernel
+  # modules mutates node state, which the hyperpod-ssm skill's approval flow owns.
+  fail "efa_kernel_module" "not loaded — see references/node-diagnostics-detail.md § A (EFA / Security Group)"
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo ""; echo -e "${BOLD}--- EFA Devices ---${NC}"; fi
+
+# shellcheck disable=SC2010  # /dev/ entries are kernel-named, safe to ls|grep
+EFA_DEVICES=$(ls /dev/infiniband/ 2>/dev/null | grep -E 'rdma_cm|uverbs|efa' || true)
+
+if [[ -n "$EFA_DEVICES" ]]; then
+  pass "efa_devices_present" "found in /dev/infiniband/: $(echo "$EFA_DEVICES" | tr '\n' ' ')"
+else
+  fail "efa_devices_present" "/dev/infiniband/ is empty or missing — EFA hardware not detected"
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo ""; echo -e "${BOLD}--- libfabric EFA Provider ---${NC}"; fi
+
+if command -v fi_info &>/dev/null; then
+  # If the previous section found no EFA hardware, fi_info failing is expected —
+  # don't emit [FAIL] on top of the hardware [FAIL], which would double-count and
+  # conflate "libfabric can't see EFA" with "node has no EFA at all".
+  if [[ -z "$EFA_DEVICES" ]]; then
+    skip "fi_info_efa_provider" "no EFA devices detected upstream — see efa_devices_present"
+  else
+    FI_EXIT=0
+    FI_OUTPUT=$(fi_info -p efa 2>&1) || FI_EXIT=$?
+    if echo "$FI_OUTPUT" | grep -q "provider: efa"; then
+      EFA_PROVIDER_COUNT=$(echo "$FI_OUTPUT" | { grep -c "provider: efa" 2>/dev/null; true; })
+      pass "fi_info_efa_provider" "EFA provider found (${EFA_PROVIDER_COUNT} endpoint(s))"
+      info "$(echo "$FI_OUTPUT" | grep -E 'provider:|fabric:|domain:|version:' | head -8 | sed 's/^/    /')"
+    else
+      fail "fi_info_efa_provider" "fi_info -p efa returned no EFA provider (exit code ${FI_EXIT}) — libfabric cannot enumerate EFA devices. See references/node-diagnostics-detail.md § A (EFA / Security Group)"
+      info "fi_info output: ${FI_OUTPUT:0:200}"
+    fi
+  fi
+else
+  warn "fi_info_efa_provider" "fi_info not found — install libfabric to run this check (fi_info comes with EFA installer)"
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo ""; echo -e "${BOLD}--- EFA Network Interfaces ---${NC}"; fi
+
+# EFA interfaces typically appear as eth0/ens* for primary + rdmaX or efa* for EFA devices
+# EFA ifaces on p5/p5en use regular kernel names (ens*) — filter by driver via ethtool
+# rather than by name pattern (the old 'rdma|efa' name grep misses ens* on p5).
+EFA_IFACES=""
+if command -v ethtool &>/dev/null; then
+  while IFS= read -r iface; do
+    [[ -z "$iface" ]] && continue
+    # '|| true': ethtool -i can exit non-zero (no driver info, or the iface vanished mid-scan);
+    # under pipefail + set -e that would abort the whole script instead of skipping the iface.
+    DRIVER=$(ethtool -i "$iface" 2>/dev/null | awk -F': ' '/^driver:/{print $2}' || true)
+    if [[ "$DRIVER" == "efa" ]]; then EFA_IFACES+="${iface}"$'\n'; fi
+  done < <(ip -o link show 2>/dev/null | awk -F': ' '{print $2}' | awk -F'@' '{print $1}' | grep -v '^lo$')
+fi
+# Fallback to name-based detection for older kernels / containers without ethtool
+if [[ -z "$EFA_IFACES" ]]; then
+  EFA_IFACES=$(ip link show 2>/dev/null | grep -E 'rdma|efa' | awk -F': ' '{print $2}' | tr -d '@' || true)
+fi
+REGULAR_IFACES=$(ip link show 2>/dev/null | grep -E 'state UP' | awk -F': ' '{print $2}' | tr -d '@' || true)
+
+if [[ -n "$EFA_IFACES" ]]; then
+  pass "efa_interfaces_up" "EFA interfaces found: $(echo "$EFA_IFACES" | tr '\n' ' ')"
+  while IFS= read -r iface; do
+    [[ -z "$iface" ]] && continue
+    IP=$(ip addr show "$iface" 2>/dev/null | grep 'inet ' | awk '{print $2}' || true)
+    if [[ -n "$IP" ]]; then
+      info "  $iface → $IP"
+    else
+      warn "efa_interface_ip_${iface}" "interface $iface has no IP address — check DHCP/subnet config"
+    fi
+  done <<< "$EFA_IFACES"
+else
+  info "No EFA interfaces detected (by driver or name)"
+  if [[ -n "$REGULAR_IFACES" ]]; then
+    skip "efa_interfaces_up" "no separate EFA interface — primary interfaces: $(echo "$REGULAR_IFACES" | tr '\n' ' ' | head -c 80)"
+  else
+    warn "efa_interfaces_up" "no UP network interfaces found"
+  fi
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo ""; echo -e "${BOLD}--- EFA Installation ---${NC}"; fi
+
+EFA_VER_FILE="/opt/amazon/efa_installed_packages"
+if [[ -f "$EFA_VER_FILE" ]]; then
+  # Format is "EFA installer version: 1.30.0" — grab only the version token.
+  EFA_VER=$(grep -iE '^EFA installer version' "$EFA_VER_FILE" 2>/dev/null \
+              | head -1 \
+              | grep -oE '[0-9]+\.[0-9]+(\.[0-9]+)?' \
+              | head -1 || echo "")
+  if [[ -z "$EFA_VER" ]]; then
+    warn "efa_installer_present" "EFA installer file present but version line not parsed"
+  else
+    pass "efa_installer_present" "EFA installer version: ${EFA_VER}"
+  fi
+else
+  warn "efa_installer_present" "EFA installer marker not found at ${EFA_VER_FILE} — EFA may not be installed via standard method"
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo ""; echo -e "${BOLD}--- NCCL / OFI Configuration ---${NC}"; fi
+
+NCCL_VARS=("FI_PROVIDER" "FI_EFA_USE_DEVICE_RDMA" "NCCL_SOCKET_IFNAME" "NCCL_ALGO" "LD_LIBRARY_PATH")
+ANY_NCCL_SET=false
+for var in "${NCCL_VARS[@]}"; do
+  val="${!var:-}"
+  if [[ -n "$val" ]]; then
+    info "  ${var}=${val}"
+    ANY_NCCL_SET=true
+  fi
+done
+
+if "$ANY_NCCL_SET"; then
+  FI_PROVIDER_VAL="${FI_PROVIDER:-}"
+  if [[ -n "$FI_PROVIDER_VAL" && "$FI_PROVIDER_VAL" != "efa" ]]; then
+    warn "nccl_fi_provider" "FI_PROVIDER=${FI_PROVIDER_VAL} — for EFA workloads this should be 'efa'"
+  elif [[ "$FI_PROVIDER_VAL" == "efa" ]]; then
+    pass "nccl_fi_provider" "FI_PROVIDER=efa"
+  fi
+else
+  skip "nccl_env_vars" "no NCCL/OFI env vars set in current shell — may be set in job launcher environment"
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo ""; echo -e "${BOLD}--- AWS OFI NCCL Plugin ---${NC}"; fi
+
+OFI_LIB=$(find /opt/amazon/efa /opt/aws-ofi-nccl /usr/local/lib /usr/lib \
+  -name "libnccl-net.so*" -o -name "aws-ofi-nccl.so*" 2>/dev/null | head -1 || true)
+
+if [[ -n "$OFI_LIB" ]]; then
+  pass "aws_ofi_nccl_plugin" "found: ${OFI_LIB}"
+else
+  if [[ -f "$EFA_VER_FILE" ]] && grep -q "ofi\|OFI" "$EFA_VER_FILE" 2>/dev/null; then
+    pass "aws_ofi_nccl_plugin" "referenced in ${EFA_VER_FILE}"
+  else
+    warn "aws_ofi_nccl_plugin" "libnccl-net.so not found — required for EFA-accelerated NCCL (distributed training)"
+  fi
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo ""; echo -e "${BOLD}--- Instance Metadata Reachability ---${NC}"; fi
+
+IMDS_TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
+  -H "X-aws-ec2-metadata-token-ttl-seconds: 60" --connect-timeout 3 -m 5 2>/dev/null || true)
+
+if [[ -n "$IMDS_TOKEN" ]]; then
+  INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" \
+    http://169.254.169.254/latest/meta-data/instance-type --connect-timeout 3 -m 5 2>/dev/null || echo "unknown")
+  LOCAL_IP=$(curl -s -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" \
+    http://169.254.169.254/latest/meta-data/local-ipv4 --connect-timeout 3 -m 5 2>/dev/null || echo "unknown")
+  pass "imds_reachable" "instance-type=${INSTANCE_TYPE}, local-ipv4=${LOCAL_IP}"
+
+  # Static list of EFA-capable families; unknown types fall through to the
+  # EC2 API check. aws CLI may not be present on-node, so the static path
+  # covers the common case.
+  case "$INSTANCE_TYPE" in
+    p4de*|p4d*|p5en*|p5e*|p5*|p6*|trn1*|trn2*|inf2*|g5.48xlarge|g6e.48xlarge|g6.48xlarge|hpc6a*|hpc6id*|hpc7a*|hpc7g*|dl1*|dl2q*)
+      pass "efa_capable_instance" "${INSTANCE_TYPE} supports EFA" ;;
+    *)
+      if command -v aws &>/dev/null; then
+        EFA_CHECK=$(aws ec2 describe-instance-types \
+          --instance-types "${INSTANCE_TYPE}" \
+          --query 'InstanceTypes[0].NetworkInfo.EfaSupported' \
+          --output text 2>/dev/null || echo "unknown")
+        if [[ "$EFA_CHECK" == "True" ]]; then
+          pass "efa_capable_instance" "${INSTANCE_TYPE} supports EFA (verified via API)"
+        elif [[ "$EFA_CHECK" == "False" ]]; then
+          warn "efa_capable_instance" "${INSTANCE_TYPE} does NOT support EFA"
+        else
+          warn "efa_capable_instance" "${INSTANCE_TYPE} — could not verify EFA support"
+        fi
+      else
+        warn "efa_capable_instance" "${INSTANCE_TYPE} — not in known EFA list; verify with: aws ec2 describe-instance-types --instance-types ${INSTANCE_TYPE} --query 'InstanceTypes[0].NetworkInfo.EfaSupported'"
+      fi
+      ;;
+  esac
+  # Multi-EFA validation — counts per EC2 instance-type documentation.
+  # NOTE: EFA counts vary between instance families (p5en has fewer than p5/p5e).
+  EXPECTED_EFA=0
+  case "$INSTANCE_TYPE" in
+    p5.48xlarge|p5e.48xlarge)   EXPECTED_EFA=32 ;;
+    p5en.48xlarge)              EXPECTED_EFA=16 ;;
+    p4d.24xlarge|p4de.24xlarge) EXPECTED_EFA=4 ;;
+    trn1.32xlarge)              EXPECTED_EFA=8 ;;
+    trn2.48xlarge)              EXPECTED_EFA=16 ;;
+    # p6 family and newer: don't hardcode counts; discover via ethtool to avoid false FAILs.
+  esac
+
+  if [[ "$EXPECTED_EFA" -gt 0 ]]; then
+    # Count uverbs devices. '|| true' keeps a missing /dev/infiniband (find exits 1) from aborting via pipefail + set -e; wc still prints 0.
+    ACTUAL_EFA=$(find /dev/infiniband -maxdepth 1 -name 'uverbs*' 2>/dev/null | wc -l || true)
+    [[ -z "$ACTUAL_EFA" ]] && ACTUAL_EFA=0
+    if [[ "$ACTUAL_EFA" -ge "$EXPECTED_EFA" ]]; then
+      pass "multi_efa_interfaces" "${ACTUAL_EFA}/${EXPECTED_EFA} EFA interfaces present for ${INSTANCE_TYPE}"
+    elif [[ "$ACTUAL_EFA" -gt 0 ]]; then
+      warn "multi_efa_interfaces" "only ${ACTUAL_EFA}/${EXPECTED_EFA} EFA interfaces — some may not be attached or driver issue"
+    else
+      fail "multi_efa_interfaces" "0/${EXPECTED_EFA} EFA interfaces on ${INSTANCE_TYPE} — EFA driver or attachment issue"
+    fi
+  fi
+else
+  warn "imds_reachable" "IMDS not reachable. If running inside a container: check IMDSv2 HttpPutResponseHopLimit on the instance (default 1 is often too low for container networking — set to 2 or higher). Otherwise: verify the instance metadata service is enabled (HttpEndpoint != disabled) and that no local iptables / nftables rules block 169.254.169.254. Note: SGs do not filter link-local addresses."
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo ""; echo -e "${BOLD}--- Network Interface Statistics ---${NC}"; fi
+
+if command -v ip &>/dev/null; then
+  IFACE_ERRORS=$(ip -s link show 2>/dev/null | awk '
+    BEGIN { rx_err=0; tx_err=0; iface="" }
+    /^[0-9]+:/ {
+      if (iface != "" && (rx_err > 0 || tx_err > 0))
+        print "  " iface ": RX errors=" rx_err " TX errors=" tx_err
+      iface=$2; gsub(/:$/, "", iface)
+      rx_err=0; tx_err=0
+    }
+    /RX:/ { getline; rx_err=$3+0 }
+    /TX:/ { getline; tx_err=$3+0 }
+    END {
+      if (iface != "" && (rx_err > 0 || tx_err > 0))
+        print "  " iface ": RX errors=" rx_err " TX errors=" tx_err
+    }
+  ' || true)
+
+  if [[ -n "$IFACE_ERRORS" ]]; then
+    warn "network_interface_errors" "interfaces with errors detected:"
+    info "$IFACE_ERRORS"
+  else
+    pass "network_interface_errors" "no RX/TX errors on active interfaces"
+  fi
+else
+  skip "network_interface_errors" "ip command not available"
+fi
+
+if [[ "$JSON_MODE" == false ]]; then echo ""; echo -e "${BOLD}--- Neuron Devices (Trainium/Inferentia) ---${NC}"; fi
+
+if command -v neuron-ls &>/dev/null; then
+  NEURON_OUTPUT=$(neuron-ls 2>&1 || true)
+  NEURON_DEVICE_COUNT=$(echo "$NEURON_OUTPUT" | { grep -c "neuron_device" 2>/dev/null; true; })
+  if [[ "$NEURON_DEVICE_COUNT" -gt 0 ]]; then
+    pass "neuron_devices" "${NEURON_DEVICE_COUNT} Neuron device(s) detected"
+    info "$(echo "$NEURON_OUTPUT" | head -10 | sed 's/^/    /')"
+  else
+    NEURON_MOD=$(lsmod 2>/dev/null | grep -E '^neuron' || true)
+    if [[ -n "$NEURON_MOD" ]]; then
+      warn "neuron_devices" "Neuron driver loaded but neuron-ls shows 0 devices → references/node-diagnostics-detail.md § G.2 (Trainium/Inferentia)"
+    else
+      fail "neuron_devices" "Neuron driver not loaded → references/node-diagnostics-detail.md § G.2 (Trainium/Inferentia)"
+    fi
+  fi
+elif ls /dev/neuron* &>/dev/null 2>&1; then
+  NEURON_DEV_COUNT=$(find /dev -maxdepth 1 -name 'neuron*' 2>/dev/null | wc -l)
+  NEURON_DEV_COUNT=${NEURON_DEV_COUNT:-0}
+  warn "neuron_devices" "${NEURON_DEV_COUNT} /dev/neuron* device(s) found but neuron-ls not installed → references/node-diagnostics-detail.md § G.2 (Trainium/Inferentia)"
+else
+  skip "neuron_devices" "not a Trainium/Inferentia instance (no Neuron devices)"
+fi
+
+if [[ "$JSON_MODE" == false ]]; then
+  echo ""
+  echo -e "${BOLD}--- Summary ---${NC}"
+  TOTAL=${#RESULTS[@]}
+  PASSED=$(printf '%s\n' "${RESULTS[@]}" | { grep -c "^pass$" 2>/dev/null; true; })
+  WARNED=$(printf '%s\n' "${RESULTS[@]}" | { grep -c "^warn$" 2>/dev/null; true; })
+  FAILED=$(printf '%s\n' "${RESULTS[@]}" | { grep -c "^fail$" 2>/dev/null; true; })
+  SKIPPED=$(printf '%s\n' "${RESULTS[@]}" | { grep -c "^skip$" 2>/dev/null; true; })
+  echo -e "  Host: ${HOSTNAME}"
+  echo -e "  Checks: ${TOTAL} total | ${GREEN}${PASSED} passed${NC} | ${YELLOW}${WARNED} warnings${NC} | ${RED}${FAILED} failed${NC} | ${SKIPPED} skipped"
+
+  if [[ $CRITICAL_FAILURES -eq 0 ]]; then
+    echo -e "\n  ${GREEN}${BOLD}Node EFA reachability checks PASSED.${NC}"
+    echo "  If inter-node communication still fails, verify security group rules with check-efa-sg.sh"
+    echo "  and compare EFA versions across nodes with the hyperpod-version-checker skill."
+  else
+    echo -e "\n  ${RED}${BOLD}Node EFA reachability checks FAILED (${CRITICAL_FAILURES} critical issue(s)).${NC}"
+    echo "  See [FAIL] items above. Each finding ends with a pointer of the form"
+    echo "  '→ references/node-diagnostics-detail.md § <section>' — open that section"
+    echo "  for root cause and remediation. Remediation lives in references, not in scripts."
+  fi
+  echo ""
+else
+  CHECKS_JSON=""
+  for key in "${!RESULTS[@]}"; do
+    val="${RESULTS[$key]}"
+    CHECKS_JSON+="\"${key}\": \"${val}\","
+  done
+  CHECKS_JSON="${CHECKS_JSON%,}"  # remove trailing comma
+
+  cat <<EOF
+{
+  "hostname": "${HOSTNAME}",
+  "timestamp": "${TIMESTAMP}",
+  "critical_failures": ${CRITICAL_FAILURES},
+  "overall_pass": $([ $CRITICAL_FAILURES -eq 0 ] && echo true || echo false),
+  "checks": { ${CHECKS_JSON} }
+}
+EOF
+fi
+
+exit "$([[ $CRITICAL_FAILURES -eq 0 ]] && echo 0 || echo 1)"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-vpc-config.sh b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-vpc-config.sh
new file mode 100755
index 00000000..b6eb8d58
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/check-vpc-config.sh
@@ -0,0 +1,508 @@
+#!/usr/bin/env bash
+# check-vpc-config.sh
+#
+# Diagnose VPC, subnet, and EKS configuration for a HyperPod cluster.
+# Automatically extracts ALL resources (VPC, subnets, SGs) from the cluster —
+# no need to know resource IDs in advance, even in accounts with 1000s of resources.
+#
+# Checks: VPC alignment, subnet AZ, IP availability, ENI limits,
+#         EKS auth mode, HyperPod namespace, VPC endpoints.
+#
+# Usage (cluster-centric — preferred):
+#   bash check-vpc-config.sh --cluster <name-or-arn> --region <region>
+#   bash check-vpc-config.sh --cluster <name-or-arn> --region <region> --eks-name <eks-cluster>
+#
+# Exit codes:
+#   0 — all checks passed (warnings may still be present)
+#   1 — one or more critical checks failed
+
+set -euo pipefail
+
+for cmd in aws python3; do
+  command -v "$cmd" &>/dev/null || {
+    echo "ERROR: '$cmd' is required but not found. Install it and retry."
+    exit 1
+  }
+done
+
+CLUSTER=""
+REGION="${AWS_DEFAULT_REGION:-}"
+EKS_NAME=""
+USE_COLOR=true
+
+usage() {
+  cat <<EOF
+Usage: $0 --cluster <name-or-arn> --region <region> [options]
+
+Read-only diagnostic for VPC / subnet / EKS configuration on a HyperPod
+cluster. Reports VPC alignment, subnet AZ, IP availability, ENI limits,
+EKS auth mode, HyperPod namespace presence, and VPC endpoint presence.
+Each [FAIL] line includes a pointer of the form
+"→ references/node-diagnostics-detail.md § B (VPC / Routing)".
+
+Options:
+  --cluster     HyperPod cluster name or ARN (required).
+  --region      AWS region (required unless \$AWS_DEFAULT_REGION is set).
+  --eks-name    EKS cluster name if different from the HyperPod cluster name.
+  --no-color    Disable ANSI colors.
+  -h, --help    Show this message.
+
+Exit codes:
+  0  All checks passed (warnings may still be present).
+  1  One or more critical checks failed.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --cluster)   CLUSTER="$2";   shift 2 ;;
+    --region)    REGION="$2";    shift 2 ;;
+    --eks-name)  EKS_NAME="$2";  shift 2 ;;
+    --no-color)  USE_COLOR=false; shift ;;
+    -h|--help)   usage; exit 0 ;;
+    *) echo "Unknown argument: $1" >&2; usage >&2; exit 1 ;;
+  esac
+done
+
+if [[ -z "$CLUSTER" ]]; then
+  usage >&2
+  exit 1
+fi
+
+if [[ -z "$REGION" ]]; then
+  echo "ERROR: --region is required (or set AWS_DEFAULT_REGION before running)." >&2
+  exit 2
+fi
+
+if ! [ -t 1 ] || [ "${TERM:-}" = "dumb" ]; then
+  USE_COLOR=false
+fi
+if "$USE_COLOR"; then
+  RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
+  BOLD='\033[1m'; NC='\033[0m'
+else
+  RED=''; GREEN=''; YELLOW=''; BOLD=''; NC=''
+fi
+
+ENI_QUOTA_CODE="L-DF5E4CA3"   # AWS Service Quotas code for "Network interfaces per Region"
+
+CRITICAL_FAILURES=0
+
+pass()  { echo -e "  ${GREEN}[PASS]${NC}  $1${2:+ — $2}"; }
+fail()  { CRITICAL_FAILURES=$((CRITICAL_FAILURES+1)); echo -e "  ${RED}[FAIL]${NC}  $1${2:+ — $2}"; }
+warn()  { echo -e "  ${YELLOW}[WARN]${NC}  $1${2:+ — $2}"; }
+info()  { echo -e "         $1"; }
+header(){ echo ""; echo -e "${BOLD}--- $1 ---${NC}"; }
+
+echo ""
+echo -e "${BOLD}=== HyperPod VPC Configuration Check ===${NC}"
+echo -e "Cluster: ${BOLD}${CLUSTER}${NC}"
+echo -e "Region:  ${BOLD}${REGION}${NC}"
+
+header "1. Cluster VPC Configuration"
+
+CLUSTER_JSON=$(aws sagemaker describe-cluster \
+  --cluster-name "$CLUSTER" \
+  --region "$REGION" \
+  --cli-read-timeout 30 \
+  --output json 2>&1) || {
+  echo -e "${RED}ERROR: Could not describe cluster '$CLUSTER' in region '$REGION'${NC}"
+  echo "$CLUSTER_JSON"
+  exit 1
+}
+
+CLUSTER_STATUS=$(echo "$CLUSTER_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('ClusterStatus','unknown'))" 2>/dev/null || echo "unknown")
+ORCHESTRATOR=$(echo "$CLUSTER_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); o=d.get('Orchestrator',{}); print('EKS' if 'Eks' in o else 'Slurm')" 2>/dev/null || echo "unknown")
+NODE_RECOVERY=$(echo "$CLUSTER_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('NodeRecovery','Unknown'))" 2>/dev/null || echo "Unknown")
+
+info "Status: $CLUSTER_STATUS | Orchestrator: $ORCHESTRATOR | NodeRecovery: $NODE_RECOVERY"
+
+SUBNET_IDS=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+subnets=d.get('VpcConfig',{}).get('Subnets',[])
+print(' '.join(subnets))
+" 2>/dev/null || echo "")
+
+SG_IDS=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+sgs=d.get('VpcConfig',{}).get('SecurityGroupIds',[])
+print(' '.join(sgs))
+" 2>/dev/null || echo "")
+
+if [[ -n "$SUBNET_IDS" ]]; then
+  pass "VpcConfig found"
+  info "Subnets: $SUBNET_IDS"
+  info "SecurityGroups: $SG_IDS"
+else
+  warn "VpcConfig" "no VpcConfig found in cluster"
+fi
+
+if [[ "$ORCHESTRATOR" == "EKS" && -z "$EKS_NAME" ]]; then
+  EKS_NAME=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+arn=d.get('Orchestrator',{}).get('Eks',{}).get('ClusterArn','')
+print(arn.split('/')[-1] if arn else '')
+" 2>/dev/null || echo "")
+fi
+
+if [[ -n "$SUBNET_IDS" ]]; then
+  header "2. Subnet VPC Alignment"
+
+  read -ra _subnet_arr <<< "$SUBNET_IDS"
+  SUBNET_JSON=$(aws ec2 describe-subnets \
+    --subnet-ids "${_subnet_arr[@]}" \
+    --region "$REGION" \
+    --cli-read-timeout 30 \
+    --output json 2>/dev/null || echo '{"Subnets":[]}')
+
+  VPC_IDS=$(echo "$SUBNET_JSON" | python3 -c "
+import sys,json
+subnets=json.load(sys.stdin).get('Subnets',[])
+vpc_ids=set(s.get('VpcId','?') for s in subnets)
+for s in subnets:
+    free=s.get('AvailableIpAddressCount',0)
+    az=s.get('AvailabilityZone','?')
+    sid=s.get('SubnetId','?')
+    vpc=s.get('VpcId','?')
+    flag='LOW IPs' if free < 10 else ''
+    print(f'  {sid}: VPC={vpc} AZ={az} FreeIPs={free} {flag}')
+print('VPCS=' + ','.join(vpc_ids))
+" 2>/dev/null || echo "")
+
+  echo "$VPC_IDS" | grep -v "^VPCS=" || true
+
+  UNIQUE_VPCS=$(echo "$VPC_IDS" | grep "^VPCS=" | cut -d= -f2 | tr ',' '\n' | sort -u | tr '\n' ',' | sed 's/,$//')
+  VPC_COUNT=$(echo "$UNIQUE_VPCS" | tr ',' '\n' | { grep -c . 2>/dev/null; true; })
+
+  if [[ "$VPC_COUNT" -gt 1 ]]; then
+    fail "Subnet VPC alignment" "Subnets are in DIFFERENT VPCs: $UNIQUE_VPCS — all must be in the same VPC → references/node-diagnostics-detail.md § B (VPC / Routing)"
+  elif [[ "$VPC_COUNT" -eq 1 ]]; then
+    pass "Subnet VPC alignment" "All subnets in VPC: $UNIQUE_VPCS"
+  else
+    # VPC_COUNT=0 means describe-subnets returned empty — usually an IAM denial
+    # on ec2:DescribeSubnets or a stale subnet ID. Without this branch the
+    # check would silently fall through and the customer sees no line at all.
+    warn "Subnet VPC alignment" "Unable to determine VPC — describe-subnets returned no data (check IAM ec2:DescribeSubnets) → references/node-diagnostics-detail.md § B (VPC / Routing)"
+  fi
+
+  if [[ -n "$SG_IDS" ]]; then
+    read -ra _sg_arr <<< "$SG_IDS"
+    SG_JSON=$(aws ec2 describe-security-groups \
+      --group-ids "${_sg_arr[@]}" \
+      --region "$REGION" \
+      --output json 2>/dev/null || echo '{"SecurityGroups":[]}')
+
+    SG_VPC_CHECK=$(echo "$SG_JSON" | SUBNET_VPC="$UNIQUE_VPCS" python3 -c "
+import sys, json, os
+sgs=json.load(sys.stdin).get('SecurityGroups',[])
+subnet_vpc=os.environ.get('SUBNET_VPC','')
+subnet_vpc_set=set(subnet_vpc.split(',')) if subnet_vpc else set()
+all_ok=True
+for sg in sgs:
+    sgid=sg.get('GroupId','?')
+    vpc=sg.get('VpcId','?')
+    if vpc not in subnet_vpc_set:
+        print(f'MISMATCH:{sgid} is in VPC {vpc} but subnets are in {subnet_vpc}')
+        all_ok=False
+    else:
+        print(f'OK:{sgid} in {vpc}')
+print('RESULT=' + ('PASS' if all_ok else 'FAIL'))
+" 2>/dev/null || echo "RESULT=SKIP")
+
+    echo "$SG_VPC_CHECK" | grep -v "^RESULT=" | sed 's/^OK:/  [OK]   SG /;s/^MISMATCH:/  [FAIL] SG /' || true
+    SG_RESULT=$(echo "$SG_VPC_CHECK" | grep "^RESULT=" | cut -d= -f2)
+    if [[ "$SG_RESULT" == "PASS" ]]; then
+      pass "SecurityGroup VPC alignment"
+    elif [[ "$SG_RESULT" == "FAIL" ]]; then
+      fail "SecurityGroup VPC alignment" "SG and subnet must be in the same VPC → references/node-diagnostics-detail.md § B (VPC / Routing)"
+    else
+      # SG_RESULT is "SKIP" (json parse error) or empty (describe-security-groups
+      # returned nothing). Either way the check did not run — say so, don't
+      # leave the customer staring at a missing line.
+      warn "SecurityGroup VPC alignment" "Unable to verify — describe-security-groups returned no usable data (check IAM ec2:DescribeSecurityGroups) → references/node-diagnostics-detail.md § B (VPC / Routing)"
+    fi
+  fi
+fi
+
+header "2a. VPC DNS Support & Hostnames"
+
+# HyperPod needs enableDnsSupport + enableDnsHostnames so EKS service DNS and node hostnames resolve.
+: "${UNIQUE_VPCS:=}"   # only assigned in section 2 (skipped when no subnets were found); default it so set -u cannot abort here
+if [[ -n "$UNIQUE_VPCS" && "$UNIQUE_VPCS" != *,* ]]; then
+  DNS_SUPPORT=$(aws ec2 describe-vpc-attribute \
+    --vpc-id "$UNIQUE_VPCS" --attribute enableDnsSupport \
+    --region "$REGION" \
+    --query 'EnableDnsSupport.Value' --output text 2>/dev/null || echo "unknown")
+  DNS_HOSTNAMES=$(aws ec2 describe-vpc-attribute \
+    --vpc-id "$UNIQUE_VPCS" --attribute enableDnsHostnames \
+    --region "$REGION" \
+    --query 'EnableDnsHostnames.Value' --output text 2>/dev/null || echo "unknown")
+
+  if [[ "$DNS_SUPPORT" == "True" ]]; then
+    pass "VPC enableDnsSupport" "enabled"
+  else
+    fail "VPC enableDnsSupport" "must be True — EKS internal DNS and node hostname resolution will fail. → references/node-diagnostics-detail.md § B (VPC / Routing)"
+  fi
+  if [[ "$DNS_HOSTNAMES" == "True" ]]; then
+    pass "VPC enableDnsHostnames" "enabled"
+  else
+    fail "VPC enableDnsHostnames" "must be True — EKS internal DNS and node hostname resolution will fail. → references/node-diagnostics-detail.md § B (VPC / Routing)"
+  fi
+else
+  warn "VPC DNS attributes" "skipped — subnets span multiple VPCs or no VPC resolved"
+fi
+
+header "2b. Private Subnet / Routing"
+
+# HyperPod requires private subnets — a subnet is "public" if its route table has
+# a default route (0.0.0.0/0) pointing at an internet gateway. For outbound
+# access from a private subnet, the default route must point at a NAT gateway
+# (or be absent in a fully air-gapped VPC that relies on VPC endpoints).
+if [[ -n "$SUBNET_IDS" ]]; then
+  PRIVATE_CHECK=$(aws ec2 describe-route-tables \
+    --filters "Name=association.subnet-id,Values=$(echo "$SUBNET_IDS" | tr ' ' ',')" \
+    --region "$REGION" \
+    --query "RouteTables[*].{SubnetAssoc:Associations[?SubnetId!=\`null\`].SubnetId,Routes:Routes[?DestinationCidrBlock==\`0.0.0.0/0\`].{Target:GatewayId,NatGw:NatGatewayId}}" \
+    --output json 2>/dev/null || echo '[]')
+  # Feed the loop by process substitution, not a pipe: a piped while-loop runs in a subshell, so fail() increments of CRITICAL_FAILURES would be lost.
+  while IFS=: read -r level msg; do
+    case "$level" in
+      PASS) pass "Private subnet routing" "$msg" ;;
+      FAIL) fail "Private subnet routing" "$msg → references/node-diagnostics-detail.md § B (VPC / Routing)" ;;
+      WARN) warn "Private subnet routing" "$msg" ;;
+      INFO) info "$msg" ;;
+    esac
+  done < <(echo "$PRIVATE_CHECK" | python3 -c "
+import sys, json
+rts = json.load(sys.stdin)
+if not rts:
+    print('INFO:no route tables associated — subnets likely use the main route table')
+    sys.exit(0)
+for rt in rts:
+    subs = rt.get('SubnetAssoc', []) or []
+    routes = rt.get('Routes', []) or []
+    for r in routes:
+        tgt = (r.get('Target') or '') or ''
+        nat = (r.get('NatGw') or '') or ''
+        subs_str = ','.join(subs) if subs else '(main)'
+        if tgt.startswith('igw-'):
+            print(f'FAIL:Subnet(s) {subs_str} route 0.0.0.0/0 -> Internet Gateway ({tgt}). HyperPod requires PRIVATE subnets; use a NAT gateway instead.')
+        elif nat.startswith('nat-'):
+            print(f'PASS:Subnet(s) {subs_str} route 0.0.0.0/0 -> NAT Gateway ({nat}) — private subnet, outbound via NAT.')
+        elif tgt.startswith('vpce-'):
+            print(f'INFO:Subnet(s) {subs_str} route 0.0.0.0/0 -> VPC endpoint ({tgt})')
+        else:
+            print(f'INFO:Subnet(s) {subs_str} route 0.0.0.0/0 -> {tgt or nat or \"unknown\"}')
+" 2>/dev/null)
+fi
+
+header "3. IP Address Availability"
+
+if [[ -n "$SUBNET_IDS" ]]; then
+  # Classify each subnet by free-IP count: under 5 is FAIL, under 50 is WARN, otherwise PASS.
+  _IP_CHECK=$(python3 -c "
+import sys,json
+subnets=json.load(sys.stdin).get('Subnets',[])
+for s in subnets:
+    free=s.get('AvailableIpAddressCount',0)
+    sid=s.get('SubnetId','?')
+    if free < 5:
+        print(f'FAIL:{sid} only {free} free IPs — CRITICALLY LOW')
+    elif free < 50:
+        print(f'WARN:{sid} only {free} free IPs — consider expanding CIDR')
+    else:
+        print(f'PASS:{sid} has {free} free IPs')
+" <<< "$SUBNET_JSON" 2>/dev/null || echo "")
+  while IFS= read -r line; do
+    if [[ -z "$line" ]]; then continue; fi
+    level=${line%%:*}   # text before the first ':'
+    msg=${line#*:}      # everything after the first ':'
+    case "$level" in
+      PASS) pass "IP availability" "$msg" ;;
+      WARN) warn "IP availability" "$msg" ;;
+      FAIL) fail "IP availability" "$msg → references/node-diagnostics-detail.md § B (VPC / Routing)" ;;
+    esac
+  done <<< "$_IP_CHECK"
+fi
+
+header "4. ENI Limits"
+
+if [[ -n "$UNIQUE_VPCS" ]]; then
+  VPC_ID=$(echo "$UNIQUE_VPCS" | tr ',' '\n' | head -1)
+  ENI_COUNT=$(aws ec2 describe-network-interfaces \
+    --filters "Name=vpc-id,Values=$VPC_ID" \
+    --region "$REGION" \
+    --query 'length(NetworkInterfaces)' \
+    --output text 2>/dev/null || echo "unknown")
+  # --output text evaluates --query once per result page, so a VPC with more
+  # ENIs than one page holds yields several counts; fold them into one total.
+  [[ "$ENI_COUNT" =~ ^[0-9]+([[:space:]]+[0-9]+)+$ ]] && ENI_COUNT=$(( ${ENI_COUNT//[[:space:]]/+} ))
+  ENI_QUOTA=$(aws service-quotas get-service-quota \
+    --service-code ec2 \
+    --quota-code "$ENI_QUOTA_CODE" \
+    --region "$REGION" \
+    --query 'Quota.Value' \
+    --output text 2>/dev/null || echo "unknown")
+  info "Current ENI count in VPC $VPC_ID: $ENI_COUNT"
+  info "ENI quota for region: $ENI_QUOTA"
+  # Values are passed as argv (never spliced into Python source). Any non-numeric
+  # count/quota ("unknown", "None", quota 0) yields "?" and is reported as WARN, not PASS.
+  USAGE_PCT=$(python3 -c 'import sys; c=int(sys.argv[1]); q=int(float(sys.argv[2])); print(int(c/q*100) if q > 0 else "?")' "$ENI_COUNT" "$ENI_QUOTA" 2>/dev/null || echo "?")
+  if [[ ! "$USAGE_PCT" =~ ^[0-9]+$ ]]; then
+    warn "ENI limits" "Could not determine ENI usage — verify manually → references/node-diagnostics-detail.md § B (VPC / Routing)"
+  elif (( USAGE_PCT > 80 )); then
+    warn "ENI limits" "${USAGE_PCT}% of quota used — request increase via Service Quotas if provisioning fails → references/node-diagnostics-detail.md § B (VPC / Routing)"
+  else
+    pass "ENI limits" "${ENI_COUNT}/${ENI_QUOTA} ENIs used (${USAGE_PCT}%)"
+  fi
+fi
+
+if [[ "$ORCHESTRATOR" == "EKS" && -n "$EKS_NAME" ]]; then
+  header "5. EKS Prerequisites"
+
+  EKS_DESC=$(aws eks describe-cluster \
+    --name "$EKS_NAME" \
+    --region "$REGION" \
+    --output json 2>/dev/null || echo '{}')
+
+  # VPC alignment — the EKS cluster's VPC must match the HyperPod cluster's VPC.
+  EKS_VPC=$(echo "$EKS_DESC" | python3 -c "import sys,json; print(json.load(sys.stdin).get('cluster',{}).get('resourcesVpcConfig',{}).get('vpcId',''))" 2>/dev/null || echo "")
+  if [[ -n "$EKS_VPC" && -n "$UNIQUE_VPCS" ]]; then
+    if [[ ",$UNIQUE_VPCS," == *",$EKS_VPC,"* ]]; then
+      pass "EKS VPC alignment" "EKS cluster in same VPC as HyperPod ($EKS_VPC)"
+    else
+      fail "EKS VPC alignment" "EKS cluster is in VPC $EKS_VPC but HyperPod subnets are in $UNIQUE_VPCS — they must match → references/node-diagnostics-detail.md § B (VPC / Routing)"
+    fi
+  fi
+
+  # SG cross-reference — the HyperPod cluster SG must either be attached to the
+  # EKS cluster, OR the EKS cluster SG must allow inbound from the HyperPod SG.
+  EKS_SGS=$(echo "$EKS_DESC" | python3 -c "
+import sys,json
+d=json.load(sys.stdin).get('cluster',{}).get('resourcesVpcConfig',{})
+all_sgs = set(d.get('securityGroupIds',[]) or [])
+csg = d.get('clusterSecurityGroupId','')
+if csg: all_sgs.add(csg)
+print(' '.join(sorted(all_sgs)))
+" 2>/dev/null || echo "")
+
+  if [[ -n "$EKS_SGS" && -n "$SG_IDS" ]]; then
+    HP_SG_SET=$(echo "$SG_IDS" | tr ',' ' ')
+    SG_ATTACHED=false
+    for hp in $HP_SG_SET; do
+      for eks in $EKS_SGS; do
+        [[ "$hp" == "$eks" ]] && { SG_ATTACHED=true; break 2; }
+      done
+    done
+    if "$SG_ATTACHED"; then
+      pass "HyperPod SG on EKS" "HyperPod SG is attached to the EKS cluster"
+    else
+      EKS_SG_LIST=$(echo "$EKS_SGS" | tr ' ' ',' | sed 's/,$//')
+      read -r -a EKS_SG_ARR <<< "$EKS_SGS"
+      EKS_INGRESS=$(aws ec2 describe-security-groups \
+        --group-ids "${EKS_SG_ARR[@]}" \
+        --region "$REGION" --output json 2>/dev/null || echo '{"SecurityGroups":[]}')
+      CROSS_OK=$(echo "$EKS_INGRESS" | HP_SGS="$SG_IDS" python3 -c "
+import sys,json,os
+hp=set(os.environ.get('HP_SGS','').replace(',', ' ').split())
+sgs=json.load(sys.stdin).get('SecurityGroups',[])
+for sg in sgs:
+    for rule in sg.get('IpPermissions',[]):
+        for pair in rule.get('UserIdGroupPairs',[]):
+            if pair.get('GroupId','') in hp:
+                print('YES'); sys.exit(0)
+print('NO')
+" 2>/dev/null || echo "UNKNOWN")
+      if [[ "$CROSS_OK" == "YES" ]]; then
+        pass "HyperPod<->EKS SG" "EKS cluster SG ($EKS_SG_LIST) allows inbound from HyperPod SG"
+      else
+        fail "HyperPod<->EKS SG" "HyperPod SG is NOT attached to EKS and EKS SG ($EKS_SG_LIST) does not allow inbound from HyperPod SG → references/node-diagnostics-detail.md § A (EFA / Security Group)"
+      fi
+    fi
+  fi
+
+  EKS_AUTH=$(echo "$EKS_DESC" | python3 -c "import sys,json; print(json.load(sys.stdin).get('cluster',{}).get('accessConfig',{}).get('authenticationMode','unknown'))" 2>/dev/null || echo "unknown")
+
+  if [[ "$EKS_AUTH" == "CONFIG_MAP" ]]; then
+    warn "EKS auth mode" "CONFIG_MAP-only; access entries require API or API_AND_CONFIG_MAP — see the EKS access-entries documentation for the switching procedure"
+  elif [[ "$EKS_AUTH" == "API" || "$EKS_AUTH" == "API_AND_CONFIG_MAP" ]]; then
+    pass "EKS auth mode" "$EKS_AUTH"
+  else
+    warn "EKS auth mode" "Could not determine ($EKS_AUTH) — verify manually"
+  fi
+
+  # EKS endpoint accessibility (reuses $EKS_DESC captured above).
+  PUB=$(echo "$EKS_DESC" | python3 -c "import sys,json; print(json.load(sys.stdin).get('cluster',{}).get('resourcesVpcConfig',{}).get('endpointPublicAccess',False))" 2>/dev/null || echo "false")
+  PRIV=$(echo "$EKS_DESC" | python3 -c "import sys,json; print(json.load(sys.stdin).get('cluster',{}).get('resourcesVpcConfig',{}).get('endpointPrivateAccess',False))" 2>/dev/null || echo "false")
+
+  info "EKS endpoint: public=$PUB, private=$PRIV"
+  if [[ "$PUB" == "False" && "$PRIV" == "True" ]]; then
+    warn "EKS endpoint" "Private-only endpoint — ensure worker subnets can reach EKS API (port 443), create EKS VPC endpoint if needed"
+  elif [[ "$PUB" == "True" ]]; then
+    pass "EKS endpoint" "Public access enabled"
+  fi
+
+  if command -v kubectl &>/dev/null; then
+    if kubectl get namespace aws-hyperpod &>/dev/null 2>&1; then
+      pass "aws-hyperpod namespace" "exists"
+    else
+      fail "aws-hyperpod namespace" "Missing → references/node-diagnostics-detail.md § B (VPC / Routing)"
+    fi
+  else
+    warn "aws-hyperpod namespace" "kubectl not found — check skipped"
+  fi
+fi
+
+header "6. VPC Endpoints"
+
+if [[ -n "$UNIQUE_VPCS" ]]; then
+  VPC_ID=$(echo "$UNIQUE_VPCS" | tr ',' '\n' | head -1)
+  ENDPOINTS=$(aws ec2 describe-vpc-endpoints \
+    --filters "Name=vpc-id,Values=$VPC_ID" \
+    --region "$REGION" \
+    --query "VpcEndpoints[?State==\`available\`].ServiceName" \
+    --output text 2>/dev/null || echo "")
+
+  # Required for private/air-gapped VPCs. Port 443 is the default for every
+  # interface endpoint below; S3 uses a Gateway endpoint over the route table.
+  # FSx users additionally need com.amazonaws.<region>.fsx if using FSx on Lustre/OpenZFS.
+  REQUIRED_ENDPOINTS=("s3" "ecr.api" "ecr.dkr" "sts" "ssm" "ssmmessages" "ec2messages" "ec2" "sagemaker.api" "sagemaker.runtime" "logs")
+  for svc in "${REQUIRED_ENDPOINTS[@]}"; do
+
+    if echo "$ENDPOINTS" | grep -qE "\.${svc}$|\.${svc}[^a-z]"; then
+      pass "VPC endpoint: $svc"
+    else
+      warn "VPC endpoint: $svc" "not found — required for internet-disabled (private) VPCs; skip if outbound 0.0.0.0/0 via NAT is available → references/node-diagnostics-detail.md § B (VPC / Routing)"
+    fi
+  done
+
+  if [[ "$ORCHESTRATOR" == "EKS" ]]; then
+    if echo "$ENDPOINTS" | grep -qE "\.eks$|\.eks[^a-z]"; then
+      pass "VPC endpoint: eks"
+    else
+      warn "VPC endpoint: eks" "not found — needed if EKS endpoint is private-only → references/node-diagnostics-detail.md § B (VPC / Routing)"
+    fi
+  fi
+
+  if ! echo "$ENDPOINTS" | grep -qE "\.fsx"; then
+    info "VPC endpoint: fsx — not present (only required if this cluster uses FSx for Lustre or OpenZFS in a private/air-gapped VPC)"
+  fi
+fi
+
+echo ""
+echo -e "${BOLD}--- Summary ---${NC}"
+
+if [[ $CRITICAL_FAILURES -eq 0 ]]; then
+  echo -e "  ${GREEN}${BOLD}VPC configuration checks PASSED (${CRITICAL_FAILURES} critical issues).${NC}"
+  echo "  If cluster creation still fails, check EFA security group rules:"
+  echo "  bash check-efa-sg.sh --sg-id <SG_ID> --region $REGION"
+else
+  echo -e "  ${RED}${BOLD}VPC configuration checks FAILED (${CRITICAL_FAILURES} critical issue(s)).${NC}"
+  echo "  Fix the [FAIL] items above and retry cluster creation."
+fi
+echo ""
+
+exit "$([[ $CRITICAL_FAILURES -eq 0 ]] && echo 0 || echo 1)"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/triage-cluster.sh b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/triage-cluster.sh
new file mode 100755
index 00000000..fe0eeda1
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-node-debugger/scripts/triage-cluster.sh
@@ -0,0 +1,1258 @@
+#!/usr/bin/env bash
+# triage-cluster.sh — read-only HyperPod node triage.
+#
+# Collects signals to route node issues to the right reference section:
+#   - Cluster status, orchestrator, NodeRecovery
+#   - Cluster events (root-cause signal for provisioning failures)
+#   - Per-node health (HyperPod + EKS labels, Slurm state)
+#   - VPC / SG config
+#   - SSM reachability to compute nodes (hardware checks)
+#
+# Read-only: never modifies cluster state, never prints remediation commands.
+# Each [FAIL] / added issue carries a pointer of the form
+#   "... → references/node-diagnostics-detail.md § <section>"
+# which the hyperpod-node-debugger skill uses to look up remediation.
+#
+# Usage:
+#   bash triage-cluster.sh --cluster <name-or-arn> --region <region>
+#   bash triage-cluster.sh --cluster <name-or-arn> --region <region> --node <instance-id>
+#
+# Exit codes:
+#   0  No critical (P0/P1) issues; P2 informational findings are allowed.
+#   1  One or more critical issues, or a fatal prerequisite / cluster-not-found.
+#   2  Invalid argument.
+
+set -euo pipefail
+
+for cmd in aws python3; do
+  command -v "$cmd" &>/dev/null || {
+    echo "ERROR: '$cmd' is required but not found." >&2
+    exit 1
+  }
+done
+
+HAS_UNBUFFER=true
+if ! command -v unbuffer &>/dev/null; then
+  HAS_UNBUFFER=false
+fi
+
+CLUSTER=""
+REGION="${AWS_DEFAULT_REGION:-}"
+TARGET_NODE=""
+USE_COLOR=true
+
+usage() {
+  cat <<EOF
+Usage: $0 --cluster <name-or-arn> --region <region> [options]
+
+Options:
+  --cluster <name-or-arn>   HyperPod cluster name or ARN (required)
+  --region <region>         AWS region (required unless \$AWS_DEFAULT_REGION is set)
+  --node <instance-id>      Focus on a single instance ID
+  --no-color                Disable ANSI colors
+  -h, --help                This message
+
+Read-only diagnostic. Every [FAIL] line carries a pointer like
+"→ references/node-diagnostics-detail.md § <section>" which the
+hyperpod-node-debugger skill uses to look up remediation.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --cluster)  [[ $# -lt 2 ]] && { echo "ERROR: --cluster needs a value"; exit 2; }
+                [[ ! "$2" =~ ^(arn:aws[a-z-]*:sagemaker:[a-z0-9-]+:[0-9]{12}:cluster/[a-z0-9]{12}|[a-zA-Z0-9]([-a-zA-Z0-9]{0,62}))$ ]] && { echo "ERROR: --cluster must be a valid HyperPod cluster name or ARN (got '$2')"; exit 2; }
+                CLUSTER="$2"; shift 2 ;;
+    --region)   [[ $# -lt 2 ]] && { echo "ERROR: --region needs a value"; exit 2; }
+                [[ ! "$2" =~ ^[a-z]{2}(-[a-z]+){1,2}-[0-9]+$ ]] && { echo "ERROR: --region must be a valid AWS region (got '$2')"; exit 2; }
+                REGION="$2"; shift 2 ;;
+    --node)     [[ $# -lt 2 ]] && { echo "ERROR: --node needs a value"; exit 2; }
+                [[ ! "$2" =~ ^i-[0-9a-f]{8,17}$ ]] && { echo "ERROR: --node must be an EC2 instance ID (i-xxxxxxxx...)"; exit 2; }
+                TARGET_NODE="$2"; shift 2 ;;
+    --no-color) USE_COLOR=false; shift ;;
+    -h|--help)  usage; exit 0 ;;
+    *) echo "Unknown argument: $1"; usage; exit 2 ;;
+  esac
+done
+
+[[ -z "$CLUSTER" ]] && {  # a missing required flag is an invalid-argument error: stderr + exit 2, matching the --region guard
+  echo "Usage: $0 --cluster <name-or-arn> --region <region> [--node <instance-id>]" >&2
+  exit 2
+}
+
+if [[ -z "$REGION" ]]; then
+  echo "ERROR: --region is required (or set AWS_DEFAULT_REGION before running)." >&2
+  exit 2
+fi
+
+_CREDS=$(aws sts get-caller-identity --output json 2>&1) || {
+  echo "ERROR: AWS credentials not configured or expired."
+  echo "$_CREDS"
+  echo ""
+  echo "→ references/node-diagnostics-detail.md § K (Node Access via SSM) for credential setup"
+  exit 1
+}
+
+# Auto-disable colors when stdout is not a TTY (agent-piped / redirected).
+if ! [ -t 1 ] || [ "${TERM:-}" = "dumb" ]; then
+  USE_COLOR=false
+fi
+if "$USE_COLOR"; then
+  RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
+  CYAN='\033[0;36m'; BOLD='\033[1m'; NC='\033[0m'
+else
+  RED=''; GREEN=''; YELLOW=''; CYAN=''; BOLD=''; NC=''
+fi
+
+section() { echo ""; echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════${NC}"; echo -e "${BOLD}${CYAN}  $1${NC}"; echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════${NC}"; }
+ok()      { echo -e "  ${GREEN}[PASS]${NC} $1"; }
+warn()    { echo -e "  ${YELLOW}[WARN]${NC} $1"; }
+bad()     { echo -e "  ${RED}[FAIL]${NC} $1"; }
+info()    { echo -e "  ${BOLD}[INFO]${NC} $1"; }
+
+ISSUES_FOUND=()  # accumulated findings, each stored as "<priority>|<message>"
+add_issue() {  # usage: add_issue <message> [priority] — priority defaults to P1
+  local priority="${2:-P1}"
+  ISSUES_FOUND+=("${priority}|$1")
+}
+
+aws_check_perms() {  # usage: aws_check_perms <captured-output> <api-name> — scans the output for IAM-denial text
+  local result="$1" api_name="$2"
+  if echo "$result" | grep -qiE "AccessDenied|UnauthorizedOperation|not authorized|AuthorizationError"; then
+    warn "Permission denied: $api_name — results may be incomplete"
+    add_issue "Missing IAM permission for $api_name → references/node-diagnostics-detail.md § K (Node Access via SSM)" "P1"
+    return 0  # denial text found: warned and recorded as a P1 issue
+  fi
+  return 1  # no denial text; NOTE(review): a bare call reaching this path exits the script under `set -e`
+}
+
+_TEMP_FILES=()
+cleanup_temp() {
+  [[ ${#_TEMP_FILES[@]} -gt 0 ]] && rm -f "${_TEMP_FILES[@]}" 2>/dev/null || true
+}
+trap cleanup_temp EXIT
+
+# Run a shell command on a HyperPod node via SSM.
+#
+# HyperPod uses a SageMaker-managed instance fleet, so `aws ssm send-command`
+# with a bare instance-id is not supported. The supported path is
+# `aws ssm start-session` with target `sagemaker-cluster:<cluster-id>_<group>-<iid>`
+# and document `AWS-StartNonInteractiveCommand`.
+#
+# Usage: ssm_run_on_node <instance-id> <instance-group-name> "<shell command>"
+# Returns remote stdout. start-session does not propagate the remote exit code.
+ssm_run_on_node() {
+  local iid="${1:-}" grp="${2:-}" cmd="${3:-}"  # defaults keep `set -u` from aborting on a short call; validated below
+  [[ -z "$iid" || -z "$grp" || -z "$cmd" ]] && return 1
+  [[ ! "$iid" =~ ^i-[0-9a-f]{8,17}$ ]] && return 1
+  [[ -z "${CLUSTER_ID:-}" ]] && return 1
+  [[ ! "$grp" =~ ^[A-Za-z0-9._-]+$ ]] && return 1
+
+  if [[ "${HAS_UNBUFFER:-true}" != "true" ]]; then
+    echo "  [SKIP] on-node SSM probe skipped — install 'unbuffer' (expect package) to enable" >&2
+    return 1
+  fi
+
+  local target="sagemaker-cluster:${CLUSTER_ID}_${grp}-${iid}"
+  local tmp; tmp=$(mktemp 2>/dev/null) || return 1
+  chmod 600 "$tmp" 2>/dev/null || true
+  _TEMP_FILES+=("$tmp")
+  # Embed the command as base64 because AWS-StartNonInteractiveCommand
+  # collapses newlines in a single command element.
+  local cmd_b64
+  cmd_b64=$(printf '%s' "$cmd" | base64 | tr -d '\n') || return 1
+  local remote="bash -c \"echo $cmd_b64 | base64 -d | bash\""
+  python3 -c "import json,sys; print(json.dumps({'command':[sys.argv[1]]}))" "$remote" > "$tmp" || return 1
+
+  local attempt=0 out rc
+  while (( attempt < 5 )); do
+    rc=0  # reset per attempt; status is captured with `|| rc=$?` so `set -e` cannot abort before the retry logic runs
+    out=$(unbuffer timeout 180 aws ssm start-session \
+      --target "$target" \
+      --document-name AWS-StartNonInteractiveCommand \
+      --parameters "file://$tmp" \
+      --region "$REGION" 2>&1) || rc=$?
+    # SSM sometimes returns rc=0 with a transport error baked into stdout —
+    # retry those (EOF, SessionManagerPlugin not found, i/o timeout).
+    if (( rc == 0 )) && ! echo "$out" | grep -qiE "Cannot perform start session|EOF$|SessionManagerPlugin is not found|ERROR: Unable to|i/o timeout"; then
+      # Strip SSM session banners and the echoed base64 command line.
+      echo "$out" | grep -vE '^(Starting session with SessionId:|Exiting session with sessionId:|\s*$)' \
+                  | grep -vE "^(bash -c \"echo [A-Za-z0-9+/=]+ \| base64 -d \| bash\"|echo '[A-Za-z0-9+/=]+'|[A-Za-z0-9+/=]{40,}={0,2})[[:space:]]*\|?[[:space:]]*base64?[[:space:]]*-?d?[[:space:]]*\|?[[:space:]]*bash\"?\$" || true
+      return 0
+    fi
+    if echo "$out" | grep -qiE "ThrottlingException|RequestLimitExceeded|InternalFailure|InternalError|ServiceUnavailable|TooManyUpdates|Cannot perform start session|EOF$|SessionManagerPlugin is not found|i/o timeout"; then
+      attempt=$((attempt + 1))
+      sleep $((attempt * 3))
+      continue
+    fi
+    # Non-transient error; surface stderr so callers can diagnose.
+    echo "$out" >&2
+    return 1
+  done
+  return 1
+}
+
+echo ""
+echo -e "${CYAN}${BOLD}HyperPod Node Triage — READ-ONLY${NC}"
+echo -e "${CYAN}   No cluster state will be modified. Each issue line below includes a${NC}"
+echo -e "${CYAN}   pointer to references/node-diagnostics-detail.md for remediation.${NC}"
+
+section "1. Cluster Identity"
+
+CLUSTER_JSON=$(aws sagemaker describe-cluster \
+  --cluster-name "$CLUSTER" \
+  --region "$REGION" \
+  --cli-read-timeout 30 \
+  --output json 2>&1) || {
+  echo -e "${RED}ERROR: Cannot find cluster '$CLUSTER' in region '$REGION'${NC}"
+  echo ""
+  echo "Available clusters in $REGION:"
+  aws sagemaker list-clusters --region "$REGION" \
+    --query 'ClusterSummaries[*].{Name:ClusterName,Status:ClusterStatus,ARN:ClusterArn}' \
+    --output table 2>/dev/null || echo "  (unable to list)"
+  exit 1
+}
+
+CLUSTER_ARN=$(echo "$CLUSTER_JSON"    | python3 -c "import sys,json; print(json.load(sys.stdin).get('ClusterArn',''))"                2>/dev/null || echo "")
+CLUSTER_STATUS=$(echo "$CLUSTER_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin).get('ClusterStatus',''))"             2>/dev/null || echo "")
+ORCHESTRATOR=$(echo "$CLUSTER_JSON"   | python3 -c "import sys,json; d=json.load(sys.stdin); print('EKS' if 'Eks' in d.get('Orchestrator',{}) else 'Slurm')" 2>/dev/null || echo "Unknown")
+NODE_RECOVERY=$(echo "$CLUSTER_JSON"  | python3 -c "import sys,json; print(json.load(sys.stdin).get('NodeRecovery','Unknown'))"       2>/dev/null || echo "Unknown")
+CLUSTER_ID=$(echo "$CLUSTER_ARN" | cut -d/ -f2 2>/dev/null || echo "")
+
+echo -e "  ARN:          ${CLUSTER_ARN}"
+echo -e "  Status:       ${BOLD}${CLUSTER_STATUS}${NC}"
+echo -e "  Orchestrator: ${ORCHESTRATOR}"
+echo -e "  NodeRecovery: ${NODE_RECOVERY}"
+echo -e "  ClusterId:    ${CLUSTER_ID}"
+
+[[ "$NODE_RECOVERY" == "None" || "$NODE_RECOVERY" == "Unknown" ]] && \
+  warn "NodeRecovery is '$NODE_RECOVERY' — auto-replacement disabled. Manual intervention required for hardware failures."
+
+section "2. Cluster Events (Root Cause Signals)"
+
+# Fetch multiple pages and merge into a single JSON blob. Cap at 500 events to
+# bound memory and runtime on long-lived clusters (each page is up to 100).
+fetch_cluster_events() {
+  local merged='[]' token='' page_json i=0
+  while (( i < 5 )) && [[ -z "$token" || "$token" =~ ^[a-zA-Z0-9/+]*={0,2}$ ]]; do
+    # The loop condition ends pagination on a non-empty token that is not a
+    # strict base64 string (e.g. stray text in $token): sending it would raise
+    # ValidationException, and dropping it would re-fetch page 1 as duplicates.
+    if [[ -n "$token" ]]; then
+      page_json=$(aws sagemaker list-cluster-events \
+        --cluster-name "$CLUSTER" --region "$REGION" \
+        --max-results 100 --next-token "$token" \
+        --cli-read-timeout 30 --output json 2>&1) || break
+    else
+      page_json=$(aws sagemaker list-cluster-events \
+        --cluster-name "$CLUSTER" --region "$REGION" \
+        --max-results 100 \
+        --cli-read-timeout 30 --output json 2>&1) || break
+    fi
+    local combined
+    combined=$(printf '%s\0%s' "$merged" "$page_json" | python3 -c "
+import sys, json
+blob = sys.stdin.buffer.read()
+try:
+    a, b = blob.split(b'\0', 1)
+    merged = json.loads(a)
+    page = json.loads(b)
+except (json.JSONDecodeError, ValueError):
+    sys.exit(2)
+merged.extend(page.get('Events', []))
+print(json.dumps(merged))
+print(page.get('NextToken','') or '')
+" 2>/dev/null) || break
+    merged=$(printf '%s\n' "$combined" | sed -n '1p')
+    token=$(printf '%s\n'  "$combined" | sed -n '2p')
+    i=$((i+1))
+    [[ -z "$token" ]] && break
+  done
+  printf '%s' "$merged" | python3 -c "
+import sys, json
+try:
+    print(json.dumps({'Events': json.loads(sys.stdin.read())}))
+except json.JSONDecodeError:
+    print('{\"Events\":[]}')
+" 2>/dev/null || echo '{"Events":[]}'
+}
+
+EVENTS=$(fetch_cluster_events)
+if [[ -z "$EVENTS" ]] || echo "$EVENTS" | grep -qE "AccessDenied|not authorized"; then
+  aws_check_perms "$EVENTS" "sagemaker:ListClusterEvents"
+  EVENTS='{"Events":[]}'
+fi
+
+EVENT_COUNT=$(echo "$EVENTS" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('Events',[])))" 2>/dev/null || echo "0")
+
+if [[ "$EVENT_COUNT" -gt 0 ]]; then
+  echo -e "  Found ${BOLD}${EVENT_COUNT}${NC} cluster events. Recent events:"
+  echo ""
+
+  echo "$EVENTS" | python3 -c "
+import sys, json
+events = json.load(sys.stdin).get('Events', [])
+for e in events[:20]:
+    ts = e.get('EventTime','?')
+    msg = e.get('Description','') or ''
+    grp = e.get('InstanceGroupName','') or ''
+    rt = e.get('ResourceType','') or ''
+    tag = ''
+    low = msg.lower()
+    if 'EFA health checks did not run' in msg:
+        tag = ' ← [GO TO SECTION A: EFA/SG FIX]'
+    elif 'bootstrap failed' in low and 'network' in low:
+        tag = ' ← [GO TO SECTION A+B: VPC/EKS FIX]'
+    elif 'Lifecycle scripts' in msg or 'lifecycle script' in low:
+        tag = ' ← [GO TO SECTION D: LIFECYCLE FIX]'
+    elif 'hardware failure' in low:
+        tag = ' ← [GO TO SECTION F: HARDWARE]'
+    elif 'insufficient capacity' in low or 'sufficient capacity' in low:
+        tag = ' ← [GO TO SECTION C: CAPACITY]'
+    elif 'failed to provision' in low:
+        tag = ' ← [CHECK SECTION C or F]'
+    elif 'successfully' in low and 'failed' not in low:
+        tag = ' [OK]'
+    label = (grp or rt or '?')
+    print(f'  [{label}] {ts}')
+    print(f'    {msg[:120]}{\"...\" if len(msg) > 120 else \"\"}{tag}')
+    print()
+" 2>/dev/null
+
+  FAILURE_EVENTS=$(echo "$EVENTS" | python3 -c "
+import sys,json
+events=json.load(sys.stdin).get('Events',[])
+fails=[(e.get('Description','') or '') for e in events if any(k in (e.get('Description','') or '').lower() for k in ['failed','error','timeout','fault','unhealthy'])]
+for f in fails[:5]:
+    print(f)
+" 2>/dev/null || echo "")
+
+  if echo "$FAILURE_EVENTS" | grep -qi "efa health"; then
+    add_issue "EFA health check failure → references/node-diagnostics-detail.md § A (EFA / Security Group)" "P0"
+  fi
+  if echo "$FAILURE_EVENTS" | grep -qi "network misconfiguration\|bootstrap failed"; then
+    add_issue "K8s bootstrap network error → references/node-diagnostics-detail.md § A (EFA / Security Group) + § B (VPC / Routing)" "P1"
+  fi
+  if echo "$FAILURE_EVENTS" | grep -qi "lifecycle script"; then
+    add_issue "Lifecycle script failure → references/node-diagnostics-detail.md § D (Lifecycle Scripts)" "P1"
+  fi
+  if echo "$FAILURE_EVENTS" | grep -qi "hardware failure"; then
+    add_issue "Hardware failure detected → references/node-diagnostics-detail.md § F (Hardware / Auto-Repair)" "P1"
+  fi
+  if echo "$FAILURE_EVENTS" | grep -qi "insufficient capacity"; then
+    add_issue "Insufficient capacity → references/node-diagnostics-detail.md § C (Capacity / AZ)" "P1"
+  fi
+else
+  warn "No cluster events available (may be Slurm cluster or no events yet)"
+fi
+
+section "3. Node Health Status"
+
+# Paginate list-cluster-nodes — default page is only 10 nodes, so large clusters
+# would otherwise be diagnosed on a tiny sample.
+fetch_all_cluster_nodes() {
+  local merged='[]' token='' page_json combined i=0
+  local max_pages=200  # 200 × 100 = 20 000 nodes, supports 7k+ clusters
+  while (( i < max_pages )) && [[ -z "$token" || "$token" =~ ^[a-zA-Z0-9/+]*={0,2}$ ]]; do
+    # A malformed token ends pagination (loop condition) instead of re-fetching page 1 as duplicates.
+    if [[ -n "$token" ]]; then
+      page_json=$(aws sagemaker list-cluster-nodes \
+        --cluster-name "$CLUSTER" --region "$REGION" \
+        --max-results 100 --next-token "$token" \
+        --cli-read-timeout 30 --output json 2>&1) || break
+    else
+      page_json=$(aws sagemaker list-cluster-nodes \
+        --cluster-name "$CLUSTER" --region "$REGION" \
+        --max-results 100 \
+        --cli-read-timeout 30 --output json 2>&1) || break
+    fi
+    # Merge via stdin (NUL-delimited) to avoid ARG_MAX truncation at ~500 nodes.
+    combined=$(printf '%s\0%s' "$merged" "$page_json" | python3 -c "
+import sys, json
+blob = sys.stdin.buffer.read()
+try:
+    a, b = blob.split(b'\0', 1)
+    merged = json.loads(a)
+    page = json.loads(b)
+except (json.JSONDecodeError, ValueError):
+    sys.exit(2)
+merged.extend(page.get('ClusterNodeSummaries', []))
+print(json.dumps(merged))
+print(page.get('NextToken','') or '')
+" 2>/dev/null) || break
+    merged=$(printf '%s\n' "$combined" | sed -n '1p')
+    token=$(printf '%s\n'  "$combined" | sed -n '2p')
+    i=$((i+1))
+    [[ -z "$token" ]] && break
+  done
+  if [[ -n "$token" ]]; then  # a leftover token means later pages were never fetched (page cap, API error, or unusable token)
+    echo "WARN: list-cluster-nodes stopped after ${i} page(s) with more results pending (page cap ${max_pages}, API error, or unusable NextToken). Diagnostic sample is incomplete." >&2
+  fi
+  printf '%s' "$merged" | python3 -c "
+import sys, json
+try:
+    print(json.dumps({'ClusterNodeSummaries': json.loads(sys.stdin.read())}))
+except json.JSONDecodeError:
+    print('{\"ClusterNodeSummaries\":[]}')
+" 2>/dev/null || echo '{"ClusterNodeSummaries":[]}'
+}
+
+NODES_JSON=$(fetch_all_cluster_nodes)
+if [[ -z "$NODES_JSON" ]] || echo "$NODES_JSON" | grep -qE "AccessDenied|not authorized"; then
+  aws_check_perms "$NODES_JSON" "sagemaker:ListClusterNodes"
+  NODES_JSON='{"ClusterNodeSummaries":[]}'
+fi
+
+TOTAL_NODES=$(echo "$NODES_JSON"   | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('ClusterNodeSummaries',[])))" 2>/dev/null || echo "0")
+RUNNING_NODES=$(echo "$NODES_JSON" | python3 -c "import sys,json; print(sum(1 for n in json.load(sys.stdin).get('ClusterNodeSummaries',[]) if n.get('InstanceStatus',{}).get('Status')=='Running'))" 2>/dev/null || echo "0")
+BAD_NODES=$(echo "$NODES_JSON"     | python3 -c "import sys,json; print(sum(1 for n in json.load(sys.stdin).get('ClusterNodeSummaries',[]) if n.get('InstanceStatus',{}).get('Status') not in ('Running','')))" 2>/dev/null || echo "0")
+
+echo -e "  Total: ${TOTAL_NODES}  Running: ${GREEN}${RUNNING_NODES}${NC}  Problems: ${RED}${BAD_NODES}${NC}"
+
+if [[ "$BAD_NODES" -gt 0 ]]; then
+  echo ""
+  echo -e "  ${RED}Non-Running nodes:${NC}"
+  echo "$NODES_JSON" | python3 -c "
+import sys,json
+nodes=json.load(sys.stdin).get('ClusterNodeSummaries',[])
+for n in nodes:
+    status=n.get('InstanceStatus',{})
+    st=status.get('Status','?')
+    if st not in ('Running',''):
+        iid=n.get('InstanceId','?')
+        grp=n.get('InstanceGroupName','?')
+        itype=n.get('InstanceType','?')
+        msg=status.get('Message','')
+        print(f'  [FAIL] {iid} ({grp} / {itype})')
+        print(f'    Status: {st}')
+        if msg:
+            print(f'    Message: {msg[:100]}')
+        print()
+" 2>/dev/null
+  add_issue "$BAD_NODES node(s) not Running → references/node-diagnostics-detail.md § F (Hardware / Auto-Repair)" "P1"
+else
+  ok "All $TOTAL_NODES nodes are Running"
+fi
+
+if [[ -n "$TARGET_NODE" ]]; then
+  echo ""
+  echo -e "  ${BOLD}Targeted node: ${TARGET_NODE}${NC}"
+  NODE_DETAIL=$(aws sagemaker describe-cluster-node \
+    --cluster-name "$CLUSTER" \
+    --node-id "$TARGET_NODE" \
+    --region "$REGION" \
+    --cli-read-timeout 30 \
+    --output json 2>&1 || true)
+  if echo "$NODE_DETAIL" | grep -qiE "ResourceNotFound|not found|ValidationException"; then
+    bad "Node '$TARGET_NODE' not found in cluster '$CLUSTER'"
+    info "Verify the instance ID belongs to this cluster:"
+    info "  aws sagemaker list-cluster-nodes --cluster-name $CLUSTER --region $REGION --query 'ClusterNodeSummaries[*].InstanceId' --output text"
+    add_issue "Node $TARGET_NODE not found in cluster $CLUSTER → verify --cluster and --node arguments" "P0"
+    TARGET_NODE=""  # clear so downstream SSM probe doesn't retry on nonexistent node
+  elif echo "$NODE_DETAIL" | grep -qiE "AccessDenied|UnauthorizedOperation"; then
+    warn "Permission denied: sagemaker:DescribeClusterNode — check IAM policy"
+  else
+    echo "$NODE_DETAIL" | python3 -c "
+import sys,json
+try:
+    d=json.load(sys.stdin).get('NodeDetails',{})
+    st=d.get('InstanceStatus',{})
+    print(f'  Status: {st.get(\"Status\",\"?\")}')
+    print(f'  Launch: {d.get(\"LaunchTime\",\"?\")}')
+    print(f'  Message: {st.get(\"Message\",\"\")}')
+    print(f'  Type: {d.get(\"InstanceType\",\"?\")}')
+    print(f'  Group: {d.get(\"InstanceGroupName\",\"?\")}')
+except Exception:
+    pass
+" 2>/dev/null
+  fi
+fi
+
+if [[ "$ORCHESTRATOR" == "EKS" ]]; then
+  section "4. EKS Node Health Labels"
+
+  if command -v kubectl &>/dev/null; then
+    UNHEALTHY_LABELS=$(kubectl get nodes \
+      -l 'sagemaker.amazonaws.com/node-health-status notin (Schedulable)' \
+      -o custom-columns='NODE:.metadata.name,HEALTH:.metadata.labels.sagemaker\.amazonaws\.com/node-health-status,FAULT:.metadata.labels.sagemaker\.amazonaws\.com/fault-types,DHC:.metadata.labels.sagemaker\.amazonaws\.com/deep-health-check-status' \
+      --no-headers 2>/dev/null || echo "")
+
+    if [[ -n "$UNHEALTHY_LABELS" ]]; then
+      bad "Nodes with health issues:"
+      while IFS= read -r line; do
+        echo "    $line"
+        if echo "$line" | grep -q "PendingReplacement"; then
+          add_issue "Node pending replacement (UnschedulablePendingReplacement) → references/node-diagnostics-detail.md § F (Hardware / Auto-Repair)" "P1"
+        elif echo "$line" | grep -q "PendingReboot"; then
+          add_issue "Node pending reboot (UnschedulablePendingReboot) → references/node-diagnostics-detail.md § F (Hardware / Auto-Repair)" "P1"
+        fi
+      done <<< "$UNHEALTHY_LABELS"
+    else
+      ok "All EKS nodes have healthy labels (Schedulable)"
+    fi
+
+    # Check deep health check status. Under `set -o pipefail`, a failed kubectl
+    # with `| wc -l || echo 0` yields "0\n0". Count safely via a tmp var.
+    DHC_FAILED_OUT=$(kubectl get nodes \
+      -l 'sagemaker.amazonaws.com/deep-health-check-status=Failed' \
+      -o name 2>/dev/null || true)
+    DHC_FAILED=$(echo -n "$DHC_FAILED_OUT" | grep -c . || true)
+    [[ -z "$DHC_FAILED" ]] && DHC_FAILED=0
+    [[ "$DHC_FAILED" -gt 0 ]] && bad "$DHC_FAILED node(s) have deep-health-check-status=Failed → references/node-diagnostics-detail.md § G (GPU/Accelerator) + § F (Hardware / Auto-Repair)"
+  else
+    warn "kubectl not available — cannot check EKS node labels (install kubectl to enable this check)"
+  fi
+fi
+
+if [[ "$ORCHESTRATOR" == "EKS" ]] && command -v kubectl &>/dev/null; then
+  section "4a. EKS CNI & System Pod Health"
+
+  CNI_ISSUES=0; AWS_NODE_OK=0   # AWS_NODE_OK becomes 1 only on a confirmed PASS
+  # aws-node (VPC CNI plugin) — if this crashes, no pods can get IPs
+  AWS_NODE_DS=$(kubectl get ds -n kube-system aws-node -o json 2>/dev/null || echo "")
+  if [[ -n "$AWS_NODE_DS" ]]; then
+    AWS_NODE_STATUS=$(echo "$AWS_NODE_DS" | python3 -c "
+import sys, json
+ds = json.load(sys.stdin)
+desired = ds.get('status',{}).get('desiredNumberScheduled', 0)
+ready = ds.get('status',{}).get('numberReady', 0)
+unavail = ds.get('status',{}).get('numberUnavailable', 0)
+if unavail > 0:
+    print(f'FAIL:{unavail} of {desired} aws-node pods not ready — pod networking broken on those nodes')
+elif ready == desired and desired > 0:
+    print(f'PASS:aws-node DaemonSet healthy ({ready}/{desired} ready)')
+elif desired == 0:
+    print('WARN:aws-node DaemonSet has 0 desired pods')
+else:
+    print(f'WARN:aws-node DaemonSet {ready}/{desired} ready')
+" 2>/dev/null || echo "")
+    if [[ -n "$AWS_NODE_STATUS" ]]; then
+      _level="${AWS_NODE_STATUS%%:*}"
+      _msg="${AWS_NODE_STATUS#*:}"
+      case "$_level" in
+        PASS) ok "$_msg"; AWS_NODE_OK=1 ;;
+        FAIL) bad "$_msg"
+              add_issue "aws-node (VPC CNI) pods failing → references/node-diagnostics-detail.md § O (CNI / Pod Networking)" "P0"
+              CNI_ISSUES=$((CNI_ISSUES + 1))
+              ;;
+        WARN) warn "$_msg" ;;
+      esac
+    fi
+
+    CNI_CRASHES=$(kubectl get pods -n kube-system -l k8s-app=aws-node --no-headers 2>/dev/null \
+      | grep -iE "CrashLoopBackOff|Error|ImagePullBackOff" || true)
+    if [[ -n "$CNI_CRASHES" ]]; then
+      bad "aws-node pods in crash state:"
+      echo "$CNI_CRASHES" | while IFS= read -r line; do echo "    $line"; done
+      add_issue "aws-node CrashLoopBackOff — pod networking broken → references/node-diagnostics-detail.md § O (CNI / Pod Networking)" "P0"
+      CNI_ISSUES=$((CNI_ISSUES + 1))
+
+      CNI_LOGS=$(kubectl logs -n kube-system -l k8s-app=aws-node --tail=20 2>/dev/null | \
+        grep -iE "error|failed|refused|timeout|fatal|gRPC|ipamd|eni" | tail -5 || true)
+      if [[ -n "$CNI_LOGS" ]]; then
+        info "Recent aws-node error logs:"
+        echo "$CNI_LOGS" | while IFS= read -r line; do info "  $line"; done
+      fi
+    fi
+  else
+    info "aws-node DaemonSet not found in kube-system (may use alternate CNI)"
+  fi
+
+  # kube-proxy — if down, service networking breaks
+  KP_CRASHES=$(kubectl get pods -n kube-system -l k8s-app=kube-proxy --no-headers 2>/dev/null \
+    | grep -iE "CrashLoopBackOff|Error|ImagePullBackOff" || true)
+  if [[ -n "$KP_CRASHES" ]]; then
+    bad "kube-proxy pods in crash state:"
+    echo "$KP_CRASHES" | while IFS= read -r line; do echo "    $line"; done
+    add_issue "kube-proxy crash — service networking broken → references/node-diagnostics-detail.md § O (CNI / Pod Networking)" "P0"
+    CNI_ISSUES=$((CNI_ISSUES + 1))
+  fi
+
+  # CoreDNS — if down, DNS resolution fails (NCCL MASTER_ADDR, service discovery)
+  COREDNS_CRASHES=$(kubectl get pods -n kube-system -l k8s-app=kube-dns --no-headers 2>/dev/null \
+    | grep -iE "CrashLoopBackOff|Error|ImagePullBackOff" || true)
+  if [[ -n "$COREDNS_CRASHES" ]]; then
+    bad "CoreDNS pods in crash state — DNS resolution will fail:"
+    echo "$COREDNS_CRASHES" | while IFS= read -r line; do echo "    $line"; done
+    add_issue "CoreDNS crash — DNS broken → references/node-diagnostics-detail.md § O (CNI / Pod Networking)" "P0"
+    CNI_ISSUES=$((CNI_ISSUES + 1))
+  fi
+  if [[ "$CNI_ISSUES" -eq 0 && "$AWS_NODE_OK" -eq 1 ]]; then
+    ok "kube-system networking pods healthy (aws-node, kube-proxy, CoreDNS)"
+  fi
+fi
+
+if [[ "$ORCHESTRATOR" == "Slurm" ]]; then
+  section "4b. Slurm Node States"
+  if command -v sinfo &>/dev/null; then
+    SINFO_RC=0; SINFO_OUT=$(sinfo -o "%N %T %30E" --noheader 2>/dev/null) || SINFO_RC=$?
+    SLURM_DOWN=$(echo "$SINFO_OUT" | grep -iE "down|drain|fail" || true)
+    if [[ "$SINFO_RC" -ne 0 ]]; then  # empty output from a failed query is not "all healthy"
+      warn "sinfo failed (exit ${SINFO_RC}) — cannot determine Slurm node states; slurmctld may be unreachable"
+    elif [[ -n "$SLURM_DOWN" ]]; then
+      bad "Slurm nodes with issues:"
+      echo "$SLURM_DOWN" | while IFS= read -r line; do echo "    $line"; done
+      DOWN_COUNT=$(echo "$SLURM_DOWN" | grep -c .)
+      add_issue "$DOWN_COUNT Slurm node(s) down/drained → references/node-diagnostics-detail.md § H (Slurm Node Management)" "P1"
+    else
+      ok "All Slurm nodes show idle/alloc/mixed state"
+    fi
+
+    STUCK_JOBS=$(squeue -o "%i %j %T %R %N" --noheader 2>/dev/null | grep -iE "COMPLETING|CONFIGURING" || true)
+    if [[ -n "$STUCK_JOBS" ]]; then
+      warn "Stuck jobs detected (COMPLETING/CONFIGURING):"
+      echo "$STUCK_JOBS" | head -5 | while IFS= read -r line; do echo "    $line"; done
+      add_issue "Stuck Slurm jobs → references/node-diagnostics-detail.md § H (Slurm Node Management)" "P1"
+    fi
+  else
+    info "Slurm CLI not available locally — to check Slurm node states, SSM into the head node:"
+    info "  sinfo -o '%N %T %30E'"
+    info "  squeue -o '%i %j %T %R %N'"
+    info ""
+    info "Or use SSM to run remotely:"
+    if [[ -n "$CLUSTER_ID" ]]; then
+      HEAD_NODE=$(echo "$NODES_JSON" | python3 -c "
+import sys,json
+nodes=json.load(sys.stdin).get('ClusterNodeSummaries',[])
+for n in nodes:
+    g=n.get('InstanceGroupName','').lower()
+    if any(x in g for x in ['controller','head','master']):
+        print(n.get('InstanceId','') + ' ' + n.get('InstanceGroupName',''))
+        break
+else:
+    for n in nodes:
+        if n.get('InstanceStatus',{}).get('Status')=='Running':
+            print(n.get('InstanceId','') + ' ' + n.get('InstanceGroupName',''))
+            break
+" 2>/dev/null || echo "")
+      if [[ -n "$HEAD_NODE" ]]; then
+        H_IID=$(echo "$HEAD_NODE" | awk '{print $1}')
+        H_GRP=$(echo "$HEAD_NODE" | awk '{print $2}')
+        info "  aws ssm start-session --target sagemaker-cluster:${CLUSTER_ID}_${H_GRP}-${H_IID} --region $REGION"
+      fi
+    fi
+    if command -v session-manager-plugin &>/dev/null && [[ -n "${HEAD_NODE:-}" ]]; then
+      IFS=' ' read -r H_IID H_GRP _ <<< "$HEAD_NODE"  # re-derive both; do not rely on the branch above
+      # Validate instance ID format — defense-in-depth against unexpected input.
+      if [[ "$H_IID" =~ ^i-[0-9a-f]{8,17}$ ]]; then
+        info ""
+        info "Running Slurm checks via SSM on controller ${H_IID}..."
+        # Unique delimiter prevents false matches if check output happens to contain marker text.
+        local_nonce=$(date +%s%N 2>/dev/null || echo "$RANDOM")
+        SLURM_CHECK_SH=$(cat <<EOF
+echo SLURM_CHECK_START_${local_nonce}
+scontrol show config >/dev/null 2>&1 || echo SLURMCTLD_DOWN_${local_nonce}
+echo DOWN_NODES_${local_nonce}
+sinfo -o '%20N %10T %30E' --noheader 2>/dev/null | grep -iE 'down|drain|fail' | head -10
+echo END_DOWN_${local_nonce}
+echo STUCK_COUNT_${local_nonce}
+squeue -o '%i %T' --noheader 2>/dev/null | grep -cE 'COMPLETING|CONFIGURING' || true
+echo MUNGE_${local_nonce}
+systemctl is-active munge 2>/dev/null || echo inactive
+echo SLURM_CHECK_END_${local_nonce}
+EOF
+)
+        SSM_STDOUT=$(ssm_run_on_node "$H_IID" "$H_GRP" "$SLURM_CHECK_SH" || echo "")
+        if ! echo "$SSM_STDOUT" | grep -q "SLURM_CHECK_START_${local_nonce}"; then
+          warn "Slurm SSM probe returned no usable output — controller may be unreachable or SSM agent not responding"
+          add_issue "Slurm controller SSM probe failed → references/node-diagnostics-detail.md § K (Node Access via SSM) + § H (Slurm Node Management)" "P1"
+        else
+          # Start marker present — parse each nonce-delimited section of the probe output.
+          if echo "$SSM_STDOUT" | grep -q "SLURMCTLD_DOWN_${local_nonce}"; then
+            bad "slurmctld not responding on controller — all Slurm operations blocked"
+            add_issue "slurmctld down → references/node-diagnostics-detail.md § H (Slurm Node Management)" "P0"
+          else
+            ok "slurmctld responding"
+          fi
+          SSM_DOWN_LINES=$(echo "$SSM_STDOUT" | sed -n "/^DOWN_NODES_${local_nonce}\$/,/^END_DOWN_${local_nonce}\$/p" | grep -v "^DOWN_NODES_\|^END_DOWN_" | grep -v "^$" || true)
+          if [[ -n "$SSM_DOWN_LINES" ]]; then
+            bad "Slurm nodes with issues (via SSM):"
+            echo "$SSM_DOWN_LINES" | while IFS= read -r line; do info "  $line"; done
+            SSM_DOWN_COUNT=$(echo "$SSM_DOWN_LINES" | grep -c .)
+            add_issue "$SSM_DOWN_COUNT Slurm node(s) down/drained → references/node-diagnostics-detail.md § H (Slurm Node Management)" "P1"
+          else
+            ok "All Slurm nodes healthy (via SSM)"
+          fi
+          STUCK_COUNT=$(echo "$SSM_STDOUT" | sed -n "/^STUCK_COUNT_${local_nonce}\$/{n;p;}" | tr -d '[:space:]')
+          [[ "${STUCK_COUNT:-0}" =~ ^[0-9]+$ ]] && [[ "${STUCK_COUNT:-0}" -gt 0 ]] && \
+            add_issue "$STUCK_COUNT stuck Slurm jobs → references/node-diagnostics-detail.md § H (Slurm Node Management)" "P1"
+          if echo "$SSM_STDOUT" | sed -n "/^MUNGE_${local_nonce}\$/{n;p;}" | grep -qvxE 'active|activating|reloading'; then
+            bad "munge authentication service not active on controller"
+            add_issue "munge service not active → references/node-diagnostics-detail.md § H (Slurm Node Management)" "P0"
+          fi
+        fi
+      fi
+    fi
+  fi
+fi
+
+section "5. Cluster VPC Resources"
+
+RESOURCES=$(echo "$CLUSTER_JSON" | python3 -c "
+import sys,json
+d=json.load(sys.stdin)
+vpc=d.get('VpcConfig',{})
+sgs=vpc.get('SecurityGroupIds',[])
+subnets=vpc.get('Subnets',[])
+print('SGs=' + ','.join(sgs))
+print('Subnets=' + ','.join(subnets))
+" 2>/dev/null || echo "")
+
+CLUSTER_SGS=$(echo "$RESOURCES"     | grep "^SGs="     | cut -d= -f2)
+CLUSTER_SUBNETS=$(echo "$RESOURCES" | grep "^Subnets=" | cut -d= -f2)
+
+if [[ -n "$CLUSTER_SGS" ]]; then
+  echo -e "  Security Groups: ${BOLD}${CLUSTER_SGS}${NC}"
+  echo -e "  Subnets:         ${BOLD}${CLUSTER_SUBNETS}${NC}"
+  for SG in $(echo "$CLUSTER_SGS" | tr ',' ' '); do
+    # Flatten the egress GroupIds and match in bash: a nested JMESPath filter
+    # (UserIdGroupPairs[?GroupId=='...']) inside the projection returns empty
+    # under AWS CLI even when the rule exists, false-flagging healthy SGs as P0.
+    _SG_RC=0
+    _SG_RESULT=$(aws ec2 describe-security-groups \
+      --group-ids "$SG" --region "$REGION" \
+      --cli-read-timeout 15 \
+      --query "SecurityGroups[0].IpPermissionsEgress[?IpProtocol=='-1'].UserIdGroupPairs[].GroupId" \
+      --output text 2>&1) || _SG_RC=$?
+    if aws_check_perms "$_SG_RESULT" "ec2:DescribeSecurityGroups"; then
+      info "SG check skipped for $SG (permission denied)"; continue
+    elif [[ "$_SG_RC" -ne 0 ]]; then  # timeout/throttle/etc. — error text is not evidence of a missing rule
+      warn "SG check skipped for $SG (describe-security-groups failed) — EFA rule not verified"; continue
+    fi
+    if echo "$_SG_RESULT" | tr '\t' '\n' | grep -qxF "$SG"; then
+      ok "SG ${SG} has outbound self-referencing rule (EFA ready)"
+    else
+      bad "SG ${SG} missing outbound self-referencing rule → EFA will fail"
+      add_issue "Missing SG outbound self-ref rule on ${SG} → references/node-diagnostics-detail.md § A (EFA / Security Group)" "P0"
+    fi
+  done
+
+  if [[ -n "$CLUSTER_SUBNETS" ]]; then
+    echo ""
+    # Split the comma-separated list into an array so each subnet ID is its own argument.
+    IFS=',' read -ra _subnet_arr <<< "$CLUSTER_SUBNETS"
+    _SUB_RESULT=$(aws ec2 describe-subnets \
+      --subnet-ids "${_subnet_arr[@]}" \
+      --region "$REGION" \
+      --cli-read-timeout 15 \
+      --query 'Subnets[*].{SubnetId:SubnetId,AZ:AvailabilityZone,FreeIPs:AvailableIpAddressCount}' \
+      --output table 2>&1)
+    if ! aws_check_perms "$_SUB_RESULT" "ec2:DescribeSubnets"; then
+      echo "$_SUB_RESULT"
+    fi
+  fi
+else
+  warn "No VpcConfig found in cluster — cluster may not have customer VPC"
+fi
+
+section "6. CloudWatch Logs"
+
+if [[ -n "$CLUSTER_ID" ]]; then
+  CLUSTER_NAME_ONLY=$(echo "$CLUSTER" | awk -F/ '{print $NF}')
+  LOG_GROUP="/aws/sagemaker/Clusters/${CLUSTER_NAME_ONLY}/${CLUSTER_ID}"
+  echo -e "  Log group: ${LOG_GROUP}"
+
+  _LOG_RESULT=$(aws logs describe-log-groups \
+    --log-group-name-prefix "$LOG_GROUP" \
+    --region "$REGION" \
+    --cli-read-timeout 15 \
+    --query 'logGroups[0].logGroupName' \
+    --output text 2>&1)
+  if aws_check_perms "$_LOG_RESULT" "logs:DescribeLogGroups"; then
+    LOG_EXISTS="None"
+  else
+    LOG_EXISTS="$_LOG_RESULT"
+  fi
+  # Only a name under our prefix confirms the group; "None", empty, or CLI error text does not.
+  if [[ "$LOG_EXISTS" != "$LOG_GROUP"* ]]; then
+    warn "No CloudWatch log group found — logs may not be configured or cluster is new"
+    info "Expected: $LOG_GROUP"
+  else
+    ok "Log group exists: $LOG_EXISTS"
+
+    # Count recent log streams — paginate so the count reflects all streams,
+    # not just the first 50 (default CloudWatch page size).
+    STREAM_COUNT=0
+    _LS_TOKEN=""; _LS_I=0
+    while (( _LS_I < 20 )); do
+      # The token is opaque and comes from the previous response, so pass it
+      # through verbatim as one quoted argument. Do not regex-filter it: a
+      # rejected token would drop --next-token, re-fetch page 1 and double-count.
+      # If the API rejects a token, the call fails and the loop stops.
+      _LS_ARGS=(--log-group-name "$LOG_GROUP" --region "$REGION"
+        --cli-read-timeout 15 --limit 50 --output json)
+      [[ -n "$_LS_TOKEN" ]] && _LS_ARGS+=(--next-token "$_LS_TOKEN")
+      _LS_PAGE=$(aws logs describe-log-streams "${_LS_ARGS[@]}" 2>/dev/null) || break
+      _LS_INC=$(echo "$_LS_PAGE" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('logStreams',[])))" 2>/dev/null || echo 0)
+      STREAM_COUNT=$((STREAM_COUNT + _LS_INC))
+      _LS_TOKEN=$(echo "$_LS_PAGE" | python3 -c "import sys,json; print(json.load(sys.stdin).get('nextToken',''))" 2>/dev/null || echo "")
+      _LS_I=$((_LS_I + 1))
+      [[ -z "$_LS_TOKEN" ]] && break
+    done
+    # A token left over (page cap hit or a later page failed) means the count is a lower bound.
+    _LS_MORE=""; [[ -n "$_LS_TOKEN" ]] && _LS_MORE="+"
+    info "${STREAM_COUNT}${_LS_MORE} log stream(s) available"
+    info "To view: aws logs describe-log-streams --log-group-name \"$LOG_GROUP\" --region $REGION --output table"
+  fi
+fi
+
+section "7. SSM Connectivity"
+
+if command -v session-manager-plugin &>/dev/null; then
+  # `command -v` only verifies the binary exists — run --version to confirm it
+  # actually works (permissions, broken install, etc.).
+  if SSM_VER=$(session-manager-plugin --version 2>/dev/null); then
+    ok "SSM Session Manager plugin installed (${SSM_VER})"
+  else
+    warn "SSM Session Manager plugin installed but --version failed — plugin may be corrupt or missing libs"
+    add_issue "SSM plugin installed but broken → references/node-diagnostics-detail.md § K (Node Access via SSM)" "P1"
+  fi
+else
+  warn "SSM Session Manager plugin NOT found"
+  info "Install session-manager-plugin (see AWS Systems Manager documentation)"
+  add_issue "SSM plugin missing → references/node-diagnostics-detail.md § K (Node Access via SSM)" "P2"
+fi
+
+RUNNING_IDS=$(echo "$NODES_JSON" | python3 -c "
+import sys,json
+nodes=json.load(sys.stdin).get('ClusterNodeSummaries',[])
+ids=[n.get('InstanceId') for n in nodes if n.get('InstanceStatus',{}).get('Status')=='Running']
+print(','.join(ids[:3]))
+" 2>/dev/null || echo "")
+
+if [[ -n "$RUNNING_IDS" ]]; then
+  ok "Running nodes available for SSM (examples: ${RUNNING_IDS})"
+  info "Use hyperpod-ssm skill with cluster ID: ${CLUSTER_ID}"
+else
+  warn "No Running nodes found — SSM access not possible until nodes are healthy"
+fi
+
+# 8: On-Node Resource Checks (Memory / Storage / Utilities)
+# Runs via SSM on the target node (or first running node) to detect resource
+# exhaustion issues that only show up on-node: disk full, /dev/shm too small,
+# huge pages misconfigured, OOM signals.
+
+NODE_TO_PROBE=$TARGET_NODE
+NODE_TO_PROBE_GROUP=""
+
+if [[ -z "$NODE_TO_PROBE" ]]; then
+  # No explicit target: pick a Running node, preferring accelerator workers.
+  # Probing a CPU-only utility node yields empty GPU / EFA sections, which
+  # cannot be told apart from broken hardware. Order of preference is
+  # accelerator worker, then any other worker, then a controller/head node.
+  NODE_TO_PROBE=$(echo "$NODES_JSON" | python3 -c "
+import sys, json
+summaries = json.load(sys.stdin).get('ClusterNodeSummaries', [])
+
+# Accelerator families: NVIDIA GPU instance types followed by Trainium/Inferentia.
+ACCEL_PREFIXES = ('ml.p3', 'ml.p3dn', 'ml.p4d', 'ml.p4de', 'ml.p5', 'ml.p5e',
+                  'ml.p5en', 'ml.p6', 'ml.g4dn', 'ml.g5', 'ml.g6', 'ml.g6e', 'ml.g7e',
+                  'ml.trn1', 'ml.trn2', 'ml.inf2')
+
+def rank(node):
+    # 0 = accelerator worker, 1 = other worker, 2 = controller/head/master group.
+    group = (node.get('InstanceGroupName','') or '').lower()
+    if any(tag in group for tag in ('controller', 'head', 'master')):
+        return 2
+    accel = (node.get('InstanceType','') or '').startswith(ACCEL_PREFIXES)
+    return 0 if accel else 1
+
+running = [s for s in summaries if s.get('InstanceStatus', {}).get('Status','') == 'Running']
+# min() returns the first lowest-ranked node, so list order breaks ties.
+if running:
+    print(min(running, key=rank).get('InstanceId', ''))
+" 2>/dev/null || echo "")
+fi
+
+if [[ -n "$NODE_TO_PROBE" ]]; then
+  NODE_TO_PROBE_GROUP=$(echo "$NODES_JSON" | NODE_ID_ENV="$NODE_TO_PROBE" python3 -c "
+import sys,json,os
+wanted=os.environ['NODE_ID_ENV']
+summaries=json.load(sys.stdin).get('ClusterNodeSummaries',[])
+# First summary whose InstanceId matches; prints nothing when there is none.
+groups=[s.get('InstanceGroupName','') for s in summaries if s.get('InstanceId','')==wanted]
+if groups:
+    print(groups[0])
+" 2>/dev/null || echo "")
+fi
+
+if [[ -n "$NODE_TO_PROBE" ]] \
+    && [[ "$NODE_TO_PROBE" =~ ^i-[0-9a-f]{8,17}$ ]] \
+    && [[ -n "$NODE_TO_PROBE_GROUP" ]] \
+    && command -v session-manager-plugin &>/dev/null; then
+  section "8. On-Node Resource Checks (via SSM)"
+  info "Probing node: $NODE_TO_PROBE (group: ${NODE_TO_PROBE_GROUP})"
+
+  resource_nonce=$(date +%s%N 2>/dev/null || echo "$RANDOM")  # per-run suffix for the section markers
+  RESOURCE_SH=$(cat <<EOF
+echo RESOURCE_CHECK_START_${resource_nonce}
+echo DISK_ROOT_${resource_nonce}
+df -h / 2>/dev/null | tail -1
+echo DISK_OPT_${resource_nonce}
+grep -qs ' /opt/sagemaker ' /proc/mounts && df -h /opt/sagemaker 2>/dev/null | tail -1 || echo NOT_MOUNTED
+echo DISK_NVME_${resource_nonce}
+grep -qs ' /opt/dlami/nvme ' /proc/mounts && df -h /opt/dlami/nvme 2>/dev/null | tail -1 || echo NOT_MOUNTED
+echo SHM_SIZE_${resource_nonce}
+df -h /dev/shm 2>/dev/null | tail -1
+echo MEMORY_INFO_${resource_nonce}
+free -h | grep Mem
+echo HUGEPAGES_${resource_nonce}
+cat /proc/meminfo 2>/dev/null | grep -i huge | head -5
+echo EFA_HUGE_PAGE_${resource_nonce}
+env 2>/dev/null | grep FI_EFA_USE_HUGE_PAGE || echo NOT_SET
+echo OOM_RECENT_${resource_nonce}
+dmesg 2>/dev/null | grep -iE 'oom|out of memory|cannot allocate' | tail -5 || echo NONE
+echo INODE_CHECK_${resource_nonce}
+df -i / 2>/dev/null | tail -1
+echo TIME_SYNC_${resource_nonce}
+chronyc tracking 2>/dev/null | grep -E 'System time|Leap status' || timedatectl status 2>/dev/null | grep -E 'synchronized|NTP service' || echo UNKNOWN
+echo SSM_AGENT_${resource_nonce}
+systemctl is-active amazon-ssm-agent 2>/dev/null || echo inactive
+echo NVME_MOUNTS_${resource_nonce}
+lsblk -nr -o NAME,MOUNTPOINT 2>/dev/null | grep -E 'nvme[0-9]+n[0-9]+\$' | head -10 || echo NONE
+echo GPU_XID_${resource_nonce}
+if command -v nvidia-smi >/dev/null 2>&1; then
+  _gpu_xid_out=\$(
+    dmesg 2>/dev/null | grep -E 'NVRM: Xid' | tail -10
+    nvidia-smi -q 2>/dev/null | awk '
+      /Uncorrectable/                                                { if (\$NF ~ /^[0-9]+\$/ && \$NF+0 > 0) print; next }
+      /Pending Page (Blacklist|Blocklist|Retirement)/                { if (\$NF ~ /^[0-9]+\$/ && \$NF+0 > 0) print; next }
+    '
+  )
+  if [ -z "\$_gpu_xid_out" ]; then echo NONE; else echo "\$_gpu_xid_out" | head -20; fi
+else
+  echo NO_NVIDIA_SMI
+fi
+echo GPU_REMAP_${resource_nonce}
+# Row-remap state: 'Pending' rows indicate marginal GPU memory that needs a reset
+# to finalize the remap. If remap is reported Failed, the GPU is bad.
+# A stuck 'Pending' state across reboots is a known firmware edge case that can
+# silently degrade training without NCCL/DCGM flagging it — capture explicitly.
+if command -v nvidia-smi >/dev/null 2>&1; then
+  nvidia-smi --query-remapped-rows=gpu_bus_id,remapped_rows.correctable,remapped_rows.uncorrectable,remapped_rows.pending,remapped_rows.failure \
+    --format=csv,noheader 2>/dev/null | head -16 || echo UNSUPPORTED
+else
+  echo NO_NVIDIA_SMI
+fi
+echo GPU_DCGM_${resource_nonce}
+# DCGM health summary. Presence of 'Health Monitor Report' + 'PASS'/'Warn'/'Fail'
+# tells us DCGM has run recently. Absence is informational, not an error.
+# Row-remap errors surface here on drivers where nvidia-smi lags the firmware.
+if command -v dcgmi >/dev/null 2>&1; then
+  dcgmi health --check -j 2>/dev/null | head -40 || dcgmi health --check 2>/dev/null | head -20 || echo DCGM_UNAVAILABLE
+else
+  echo NO_DCGMI
+fi
+echo GPU_DCGM_LOGS_${resource_nonce}
+# DCGM nvvs log presence — SageMaker HyperPod runs DCGM medium/memtest as part
+# of deep-health-check. If this log is present the node has been health-checked
+# recently; tail captures last run result.
+if [ -d /var/log/nvidia-dcgm ] 2>/dev/null; then
+  find /var/log/nvidia-dcgm -maxdepth 1 -type f -printf '%f\n' 2>/dev/null | head -5
+  # \$ escapes are required: this heredoc is <<EOF (not <<'EOF'), so unescaped
+  # shell variables would expand locally. Keep \$ to defer to the remote shell.
+  NVVS_LATEST=\$(find /var/log/nvidia-dcgm -maxdepth 1 -name 'nvvs*.log' -printf '%T@ %p\n' 2>/dev/null | sort -nr | head -1 | awk '{print \$2}')
+  if [ -n "\$NVVS_LATEST" ]; then
+    echo "--- tail of \$NVVS_LATEST ---"
+    tail -n 5 "\$NVVS_LATEST" 2>/dev/null || true
+  fi
+else
+  echo NO_DCGM_LOG_DIR
+fi
+echo KERNEL_PANIC_${resource_nonce}
+dmesg 2>/dev/null | grep -iE 'Kernel panic - not syncing|watchdog: BUG|soft lockup|hard lockup|hung_task: blocked|BUG: unable to handle|BUG: kernel NULL|NMI watchdog' | tail -10 || echo NONE
+echo CONTAINERD_${resource_nonce}
+if command -v systemctl >/dev/null 2>&1; then
+  systemctl is-active containerd 2>/dev/null || echo inactive
+else
+  echo UNKNOWN
+fi
+echo RESOURCE_CHECK_END_${resource_nonce}
+EOF
+)
+  RES_STDOUT=$(ssm_run_on_node "$NODE_TO_PROBE" "$NODE_TO_PROBE_GROUP" "$RESOURCE_SH" || echo "")
+
+  extract_section() {
+    local head_marker="${1}_${resource_nonce}" tail_marker="${2}_${resource_nonce}"
+    # Emit the marker-delimited range, then drop the marker lines. grep -v exits 1
+    # when nothing is left; the || true keeps that valid empty result from failing.
+    echo "$RES_STDOUT" | sed -n "/^${head_marker}\$/,/^${tail_marker}\$/p" \
+      | grep -v "^${head_marker}\$\|^${tail_marker}\$" || true
+  }
+
+  if echo "$RES_STDOUT" | grep -q "RESOURCE_CHECK_START_${resource_nonce}"; then
+    echo ""
+    echo -e "  ${BOLD}Storage:${NC}"
+    ROOT_LINE=$(extract_section DISK_ROOT DISK_OPT | head -1)
+    if [[ -n "$ROOT_LINE" ]]; then
+      ROOT_USE_PCT=$(echo "$ROOT_LINE" | awk '{print $5}' | tr -d '%')
+      if [[ "$ROOT_USE_PCT" =~ ^[0-9]+$ ]] && [[ "$ROOT_USE_PCT" -gt 90 ]]; then
+        bad "Root volume: ${ROOT_USE_PCT}% used — CRITICALLY FULL (100GB fixed, cannot expand)"
+        add_issue "Root volume ${ROOT_USE_PCT}% full → references/node-diagnostics-detail.md § I (Resource Exhaustion)" "P0"
+      elif [[ "$ROOT_USE_PCT" =~ ^[0-9]+$ ]] && [[ "$ROOT_USE_PCT" -gt 80 ]]; then
+        warn "Root volume: ${ROOT_USE_PCT}% used — approaching full"
+        add_issue "Root volume ${ROOT_USE_PCT}% used → references/node-diagnostics-detail.md § I (Resource Exhaustion)" "P1"
+      else
+        ok "Root volume: ${ROOT_USE_PCT:-?}% used"
+      fi
+    fi
+
+    OPT_LINE=$(extract_section DISK_OPT DISK_NVME | head -1)
+    if [[ "$OPT_LINE" != "NOT_MOUNTED" && -n "$OPT_LINE" ]]; then
+      OPT_USE=$(echo "$OPT_LINE" | awk '{print $5}' | tr -d '%')
+      if [[ "$OPT_USE" =~ ^[0-9]+$ ]] && [[ "$OPT_USE" -gt 90 ]]; then
+        warn "/opt/sagemaker: ${OPT_USE}% used — secondary EBS nearing full"
+        add_issue "/opt/sagemaker ${OPT_USE}% full → references/node-diagnostics-detail.md § I (Resource Exhaustion)" "P1"
+      else
+        ok "/opt/sagemaker: ${OPT_USE:-?}% used"
+      fi
+    fi
+
+    NVME_LINE=$(extract_section DISK_NVME SHM_SIZE | head -1)
+    if [[ "$NVME_LINE" != "NOT_MOUNTED" && -n "$NVME_LINE" ]]; then
+      ok "NVMe instance store: mounted at /opt/dlami/nvme"
+    else
+      # NVMe is expected on large training instances — look up the type and flag if not mounted.
+      INSTANCE_TYPE_LOC=$(echo "$NODES_JSON" | NODE_ID_ENV="$NODE_TO_PROBE" python3 -c "
+import sys,json,os
+target=os.environ['NODE_ID_ENV']
+for n in json.load(sys.stdin).get('ClusterNodeSummaries',[]):
+    if n.get('InstanceId','')==target:
+        print(n.get('InstanceType',''))
+        break
+" 2>/dev/null || echo "")
+      if [[ "$INSTANCE_TYPE_LOC" =~ ^ml\.(p4de?|p5(e|en)?|p6[a-z0-9-]*|trn1n?|trn2)\. ]]; then
+        warn "/opt/dlami/nvme not mounted on $INSTANCE_TYPE_LOC — instance store expected"
+        add_issue "NVMe instance store not mounted on $NODE_TO_PROBE ($INSTANCE_TYPE_LOC) → references/node-diagnostics-detail.md § I (Resource Exhaustion)" "P1"
+      fi
+    fi
+
+    INODE_LINE=$(extract_section INODE_CHECK TIME_SYNC | head -1)
+    if [[ -n "$INODE_LINE" ]]; then
+      INODE_PCT=$(echo "$INODE_LINE" | awk '{print $5}' | tr -d '%')
+      if [[ "$INODE_PCT" =~ ^[0-9]+$ ]] && [[ "$INODE_PCT" -gt 90 ]]; then
+        bad "Inode usage: ${INODE_PCT}% — filesystem running out of inodes"
+        add_issue "Inode exhaustion ${INODE_PCT}% → references/node-diagnostics-detail.md § I (Resource Exhaustion)" "P1"
+      fi
+    fi
+
+    echo ""
+    echo -e "  ${BOLD}Memory:${NC}"
+    MEM_LINE=$(extract_section MEMORY_INFO HUGEPAGES | head -1)
+    if [[ -n "$MEM_LINE" ]]; then info "RAM: $MEM_LINE"; fi
+
+    SHM_LINE=$(extract_section SHM_SIZE MEMORY_INFO | head -1)
+    if [[ -n "$SHM_LINE" ]]; then
+      SHM_SIZE=$(awk '{print $2}' <<< "$SHM_LINE")
+      SHM_USE_PCT=$(awk '{print $5}' <<< "$SHM_LINE" | tr -d '%')
+      if [[ ! "$SHM_USE_PCT" =~ ^[0-9]+$ ]] || [[ "$SHM_USE_PCT" -le 80 ]]; then
+        ok "/dev/shm: ${SHM_USE_PCT:-?}% used (size: ${SHM_SIZE:-?})"
+      else
+        warn "/dev/shm: ${SHM_USE_PCT}% used (size: $SHM_SIZE) — NCCL may fail with 'Bus error'"
+        add_issue "/dev/shm ${SHM_USE_PCT}% full → references/node-diagnostics-detail.md § I (Resource Exhaustion)" "P1"
+      fi
+    fi
+
+    EFA_HP=$(extract_section EFA_HUGE_PAGE OOM_RECENT | head -1)
+    if [[ "$EFA_HP" == "NOT_SET" ]]; then
+      HUGEPAGES_TOTAL=$(extract_section HUGEPAGES EFA_HUGE_PAGE | awk '/HugePages_Total/ {print $2}')
+      if [[ "${HUGEPAGES_TOTAL:-0}" == "0" ]]; then
+        warn "FI_EFA_USE_HUGE_PAGE not set and HugePages_Total=0"
+        add_issue "FI_EFA_USE_HUGE_PAGE not configured → references/node-diagnostics-detail.md § I (Resource Exhaustion)" "P2"
+      fi
+    elif [[ "$EFA_HP" == *"=0"* ]]; then
+      ok "FI_EFA_USE_HUGE_PAGE=0 (huge pages disabled for EFA — os.fork() safe)"
+    fi
+
+    # A blank spacer line precedes the OOM verdict whichever way it goes.
+    OOM_LINES=$(extract_section OOM_RECENT INODE_CHECK | { grep -v "^NONE$" || true; } | head -3)
+    echo ""
+    if [[ -z "$OOM_LINES" ]]; then
+      ok "No recent OOM events"
+    else
+      bad "Recent OOM events detected on node:"
+      echo "$OOM_LINES" | while IFS= read -r line; do info "  $line"; done
+      add_issue "OOM events on node $NODE_TO_PROBE → references/node-diagnostics-detail.md § I (Resource Exhaustion)" "P1"
+    fi
+
+    # Time sync health — clock drift breaks TLS/SigV4 and Slurm accounting.
+    TIME_STATUS=$(extract_section TIME_SYNC SSM_AGENT | head -3)
+    if echo "$TIME_STATUS" | grep -qiE "synchronized: no|Not synchronised|UNKNOWN"; then
+      warn "Time sync unhealthy — chronyc/timedatectl reports not synchronised"
+      info "Clock drift breaks TLS/IAM (SigV4) and Slurm accounting"
+      add_issue "Node $NODE_TO_PROBE time sync not healthy → references/node-diagnostics-detail.md § I (Resource Exhaustion)" "P1"
+    elif [[ -n "$TIME_STATUS" ]]; then
+      ok "Time sync healthy"
+    fi
+
+    # SSM agent health — the probe got through, but flag any non-running unit state (inactive, failed, ...).
+    SSM_AGENT_STATUS=$(extract_section SSM_AGENT NVME_MOUNTS | head -1)
+    if [[ -n "$SSM_AGENT_STATUS" && ! "$SSM_AGENT_STATUS" =~ ^(active|activating|reloading)$ ]]; then
+      warn "amazon-ssm-agent reported ${SSM_AGENT_STATUS} — may be restarting or broken"
+      add_issue "amazon-ssm-agent ${SSM_AGENT_STATUS} on $NODE_TO_PROBE → references/node-diagnostics-detail.md § K (Node Access via SSM)" "P1"
+    fi
+
+    # GPU XID / ECC / page-retirement — hardware faults visible via nvidia-smi query.
+    GPU_XID_LINES=$(extract_section GPU_XID GPU_REMAP | { grep -v "^NONE$" || true; } | { grep -v "^NO_NVIDIA_SMI$" || true; } | head -5)
+    if [[ -n "$GPU_XID_LINES" ]]; then
+      echo ""
+      bad "GPU XID / ECC / page-retirement signals on node $NODE_TO_PROBE:"
+      echo "$GPU_XID_LINES" | while IFS= read -r line; do info "  $line"; done
+      add_issue "GPU XID / ECC / page-retirement on $NODE_TO_PROBE → references/node-diagnostics-detail.md § G (GPU/Accelerator) + § F (Hardware / Auto-Repair)" "P0"
+    fi
+
+    # GPU row-remapping — marginal GPU memory. A pending remap that never clears
+    # indicates a firmware edge case where the remap is stuck; a failed remap means
+    # the GPU is bad and must be replaced. Silent degrader — NCCL and DCGM's
+    # default checks can miss this.
+    GPU_REMAP_LINES=$(extract_section GPU_REMAP GPU_DCGM | { grep -v "^NO_NVIDIA_SMI$" || true; } | { grep -v "^UNSUPPORTED$" || true; })
+    if [[ -n "$GPU_REMAP_LINES" ]]; then
+      # Columns (csv,noheader): gpu_bus_id, correctable, uncorrectable, pending, failure
+      REMAP_PENDING_TOTAL=0
+      REMAP_FAILED_TOTAL=0
+      REMAP_UNCORRECT_TOTAL=0
+      while IFS= read -r line; do
+        [[ -z "$line" ]] && continue
+        # Strip spaces and split on commas in one step. pending/failure are per-GPU
+        # flags (printed as Yes/No or 1/0), so they are counted as devices, not rows.
+        IFS=, read -r _ _ _u _p _f _ <<< "${line// /}"
+        [[ "$_p" == "Yes" || ( "$_p" =~ ^[0-9]+$ && "$_p" -gt 0 ) ]] && REMAP_PENDING_TOTAL=$((REMAP_PENDING_TOTAL + 1))
+        [[ "$_u" =~ ^[0-9]+$ ]] && REMAP_UNCORRECT_TOTAL=$((REMAP_UNCORRECT_TOTAL + _u))
+        [[ "$_f" == "Yes" || "$_f" == "1" ]] && REMAP_FAILED_TOTAL=$((REMAP_FAILED_TOTAL + 1))
+      done <<< "$GPU_REMAP_LINES"
+      if [[ "$REMAP_FAILED_TOTAL" -gt 0 ]]; then
+        bad "GPU row-remap FAILED on $REMAP_FAILED_TOTAL device(s) — GPU has exceeded remap capacity"
+        add_issue "GPU row-remap failure on $NODE_TO_PROBE (bad memory, replace GPU) → references/node-diagnostics-detail.md § G (GPU/Accelerator) + § F (Hardware / Auto-Repair)" "P0"
+      elif [[ "$REMAP_PENDING_TOTAL" -gt 0 ]]; then
+        bad "GPU row-remap PENDING on $REMAP_PENDING_TOTAL device(s) — rows awaiting reset"
+        info "  Pending remaps indicate marginal memory that a GPU reset/reboot should finalize."
+        info "  If pending persists across reboots, the firmware may be stuck (known edge case) — escalate."
+        add_issue "GPU row-remap pending on $NODE_TO_PROBE (reset/reboot to finalize; if stuck, marginal memory) → references/node-diagnostics-detail.md § G (GPU/Accelerator) + § F (Hardware / Auto-Repair)" "P1"
+      elif [[ "$REMAP_UNCORRECT_TOTAL" -gt 0 ]]; then
+        warn "GPU has $REMAP_UNCORRECT_TOTAL uncorrectable remapped rows (healthy now, but history of faults)"
+      fi
+    fi
+
+    # DCGM health — flag when the JSON or text report shows an overall Fail/Warn.
+    GPU_DCGM_LINES=$(extract_section GPU_DCGM GPU_DCGM_LOGS | { grep -v "^NO_DCGMI$" || true; } | { grep -v "^DCGM_UNAVAILABLE$" || true; })
+    if [[ -n "$GPU_DCGM_LINES" ]] \
+        && echo "$GPU_DCGM_LINES" | grep -qiE '"overall_health"[[:space:]]*:[[:space:]]*"(Fail|Warn)"|HEALTH_RESULT_FAIL|HEALTH_RESULT_WARN|Health Monitor Report.*(Fail|Warn)'; then
+      bad "DCGM health check reported Fail/Warn on $NODE_TO_PROBE"
+      add_issue "DCGM health Fail/Warn on $NODE_TO_PROBE → references/node-diagnostics-detail.md § G (GPU/Accelerator)" "P0"
+    fi
+
+    # DCGM log presence — informational. Confirms deep-health-check history.
+    GPU_DCGM_LOG_LINES=$(extract_section GPU_DCGM_LOGS KERNEL_PANIC)
+    if echo "$GPU_DCGM_LOG_LINES" | grep -qi "nvvs"; then
+      ok "DCGM nvvs logs present on $NODE_TO_PROBE (/var/log/nvidia-dcgm/)"
+      if echo "$GPU_DCGM_LOG_LINES" | grep -qE "^--- tail"; then
+        DCGM_TAIL=$(echo "$GPU_DCGM_LOG_LINES" | sed -n '/^--- tail/,$p' | head -20)
+        if echo "$DCGM_TAIL" | grep -qiE 'FAIL|Error:|row ?remap.*(pending|fail)'; then
+          warn "DCGM nvvs log tail contains failure/row-remap signals — inspect on node:"
+          echo "$DCGM_TAIL" | while IFS= read -r line; do info "  $line"; done
+          add_issue "DCGM nvvs log shows failure/row-remap signals on $NODE_TO_PROBE → references/node-diagnostics-detail.md § G (GPU/Accelerator)" "P0"
+        fi
+      fi
+    fi
+
+    # Kernel panic / watchdog / hung task signals — indicate node-level instability.
+    KERNEL_PANIC_LINES=$(extract_section KERNEL_PANIC CONTAINERD | { grep -v "^NONE$" || true; } | head -5)
+    if [[ -n "$KERNEL_PANIC_LINES" ]]; then
+      echo ""
+      bad "Kernel panic / watchdog / hung_task signals on node $NODE_TO_PROBE:"
+      echo "$KERNEL_PANIC_LINES" | while IFS= read -r line; do info "  $line"; done
+      add_issue "Kernel panic / watchdog on $NODE_TO_PROBE → references/node-diagnostics-detail.md § N (Kernel & System)" "P0"
+    fi
+
+    # containerd health (EKS only; a missing unit also reports "inactive") — if the runtime is down, every pod fails.
+    CONTAINERD_STATUS=$(extract_section CONTAINERD RESOURCE_CHECK_END | head -1)
+    if [[ "$ORCHESTRATOR" == "EKS" && "$CONTAINERD_STATUS" =~ ^(inactive|failed|deactivating)$ ]]; then
+      warn "containerd is ${CONTAINERD_STATUS} on $NODE_TO_PROBE — all pods on this node will fail"
+      add_issue "containerd ${CONTAINERD_STATUS} on $NODE_TO_PROBE → references/node-diagnostics-detail.md § M (Container Runtime)" "P0"
+    fi
+
+  else
+    warn "SSM command returned no output — node may not be reachable"
+    add_issue "Cannot reach node $NODE_TO_PROBE via SSM → references/node-diagnostics-detail.md § K (Node Access via SSM)" "P1"
+  fi
+else
+  if [[ -z "$NODE_TO_PROBE" ]]; then
+    info "No running nodes to probe for resource checks"
+  elif ! command -v session-manager-plugin &>/dev/null; then
+    info "SSM plugin not installed — skipping on-node resource checks → references/node-diagnostics-detail.md § K (Node Access via SSM)"
+  else warn "Node $NODE_TO_PROBE is not a valid instance ID in this cluster's node list — skipping on-node resource checks"; fi
+fi
+
+if [[ "$ORCHESTRATOR" == "Slurm" ]] && [[ "$TOTAL_NODES" -gt 0 ]]; then
+  section "8b. Slurm Node Mapping"
+  info "Slurm node name → HyperPod instance ID mapping:"
+  info "(PrivateDnsHostname is not in list-cluster-nodes; use 'describe-cluster-node --node-id <i-...>' to retrieve it for a specific instance.)"
+  echo ""
+  echo "$NODES_JSON" | python3 -c "
+import sys,json
+summaries=json.load(sys.stdin).get('ClusterNodeSummaries',[])
+shown=summaries[:20]
+print(f'  {\"Instance ID\":<22} {\"Group\":<20} {\"Type\":<22} {\"Status\"}')
+print(f'  {\"─\"*22} {\"─\"*20} {\"─\"*22} {\"─\"*10}')
+for s in shown:
+    cells=(s.get('InstanceId','?'), s.get('InstanceGroupName','?'), s.get('InstanceType','?'))
+    state=s.get('InstanceStatus',{}).get('Status','?')
+    print(f'  {cells[0]:<22} {cells[1]:<20} {cells[2]:<22} {state}')
+hidden=len(summaries)-len(shown)
+if hidden > 0:
+    print(f'  ... and {hidden} more nodes')
+" 2>/dev/null
+  echo ""
+  info "To get PrivateDnsHostname for a specific instance: aws sagemaker describe-cluster-node --cluster-name $CLUSTER --region $REGION --node-id <i-...> --query 'NodeDetails.PrivateDnsHostname' --output text"
+fi
+
+section "9. Triage Summary"
+
+echo ""
+if [[ ${#ISSUES_FOUND[@]} -eq 0 ]]; then
+  echo -e "  ${GREEN}${BOLD}No critical issues detected from available signals.${NC}"
+  echo ""
+  echo "  Next steps:"
+  echo "  • If cluster is still failing: check cluster events above for error details"
+  echo "  • For node-level issues: use hyperpod-ssm skill to inspect nodes directly"
+  echo "  • For EFA issues: bash scripts/check-efa-sg.sh --cluster ${CLUSTER} --region ${REGION}"
+else
+  echo -e "  ${RED}${BOLD}Issues found (${#ISSUES_FOUND[@]}):${NC}"
+  echo ""
+  for priority in P0 P1 P2; do
+    has_items=false
+    for issue in "${ISSUES_FOUND[@]}"; do
+      if [[ "$issue" == "${priority}|"* ]]; then
+        if ! "$has_items"; then
+          case "$priority" in
+            P0) echo -e "  ${RED}${BOLD}[$priority — Fix Immediately]${NC}" ;;
+            P1) echo -e "  ${YELLOW}${BOLD}[$priority — Fix Soon]${NC}" ;;
+            P2) echo -e "  ${BOLD}[$priority — Informational]${NC}" ;;
+          esac
+          has_items=true
+        fi
+        echo -e "    → ${issue#*|}"
+      fi
+    done
+  done
+  echo ""
+  echo -e "  ${BOLD}Recommended next steps:${NC}"
+  echo "  1. Address P0 issues first, then P1. Each issue above includes a"
+  echo "     pointer of the form '→ references/node-diagnostics-detail.md § X'."
+  echo "  2. The hyperpod-node-debugger skill will open the referenced section"
+  echo "     and guide you through the fix with explicit approval."
+  echo "  3. After fixing, re-run: bash scripts/triage-cluster.sh --cluster ${CLUSTER} --region ${REGION}"
+  echo "  4. For shell access on nodes, use the hyperpod-ssm skill."
+fi
+
+echo ""
+echo -e "${BOLD}Cluster: ${CLUSTER}  |  Region: ${REGION}  |  Orchestrator: ${ORCHESTRATOR}${NC}"
+echo ""
+
+# Exit 1 only on critical (P0/P1) issues so CI / retry loops don't fail on
+# P2 informational findings. The ${arr[@]+...} form expands to nothing for an
+# empty array, which bash older than 4.4 rejects under `set -u` otherwise.
+_critical=0
+for _issue in ${ISSUES_FOUND[@]+"${ISSUES_FOUND[@]}"}; do
+  case "${_issue%%|*}" in P0|P1) _critical=$((_critical+1)) ;; esac
+done
+if [[ "$_critical" -gt 0 ]]; then exit 1; else exit 0; fi
diff --git a/plugins/sagemaker-ai/skills/hyperpod-performance-debugger/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-performance-debugger/SKILL.md
new file mode 100644
index 00000000..3ba1bdbe
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-performance-debugger/SKILL.md
@@ -0,0 +1,185 @@
+---
+name: hyperpod-performance-debugger
+description: Diagnose performance issues on Amazon SageMaker HyperPod clusters — uneven NCCL bandwidth across nodes and poor filesystem throughput. Read-only. Surfaces host-side signals (Xid, ECC, NVLink, EFA reachability, FSx saturation) and routes to the appropriate sibling skill (hyperpod-node-debugger, hyperpod-nccl, hyperpod-version-checker, hyperpod-issue-report) for any remediation. Triggers on uneven NCCL across nodes, straggler node, FSx slow, checkpoint slow, dataloader slow, filesystem bottleneck, FSx throughput, cross-AZ latency, topology mismatch.
+metadata:
+  version: "0.0.1"
+---
+
+# HyperPod Performance Debugger
+
+1. **Uneven NCCL performance across nodes** — workload faster on some node sets than others, pairwise bandwidth variance, suspected straggler.
+2. **Poor filesystem performance** — training stalled on data loading, checkpoint save/load dominating step time, FSx throughput saturated.
+
+## Scope and delegation
+
+Route findings outside the two in-scope scenarios to the owner skill below.
+
+| Concern observed                                                       | Route to                                                     |
+| ---------------------------------------------------------------------- | ------------------------------------------------------------ |
+| GPU hardware fault, ECC, NVLink, Xid, DCGM diagnostics, drain/replace  | `hyperpod-node-debugger` (§ F Hardware/Auto-Repair, § G GPU) |
+| `Cannot allocate memory` at `os.fork()`, root volume exhausted         | `hyperpod-node-debugger` (§ I Resource Exhaustion)           |
+| NCCL timeouts, hangs, AllReduce stalls, EFA TCP fallback, RDMA memlock | `hyperpod-nccl`                                              |
+| EFA / NCCL / CUDA / NVIDIA driver version drift across nodes           | `hyperpod-version-checker`                                   |
+| EFA self-referencing security-group rule missing — single node         | `hyperpod-node-debugger` § A (EFA / Security Group)          |
+| EFA self-referencing security-group rule missing — cluster-wide        | `hyperpod-cluster-debugger` § A (EFA Health Checks)          |
+| Slurm node state changes (drain / resume / reboot)                     | `hyperpod-slurm-debugger`                                    |
+| Diagnostic bundle for AWS Support                                      | `hyperpod-issue-report`                                      |
+| Shell access on a node                                                 | `hyperpod-ssm`                                               |
+
+## Operating policy
+
+- Read-only. Print commands the customer runs; do not execute commands that modify state.
+- Container vs host version comparisons go through `hyperpod-version-checker`.
+- Xid lines, ECC counts, NVLink lane state, and thermal readings get surfaced; the catalog and verdict live in `hyperpod-node-debugger` § G.
+
+## Workflow
+
+1. Confirm the symptom is **uneven NCCL** or **poor filesystem performance**. If neither, route to the matching sibling skill above.
+2. Run `scripts/perf-snapshot.sh` (read-only) to gather host-side signals for the suspect node and FSx filesystems mounted on it.
+3. For each `[CONCERN]` line in the script output, open the matching section below and read the supporting reference.
+4. After the per-incident diagnosis, recommend the HyperPod platform health features in [§ Continuous health coverage](#continuous-health-coverage) so the customer gets ongoing protection.
+
+## Step 1: Run the snapshot
+
+```bash
+bash scripts/perf-snapshot.sh --cluster <CLUSTER_NAME_OR_ARN> --region <REGION>
+
+# Scope to one suspect node:
+bash scripts/perf-snapshot.sh --cluster <C> --region <R> --node <INSTANCE_ID>
+```
+
+The script samples one node by default. It collects host-side data via `hyperpod-ssm`: `nvidia-smi` output (temperature, SM clocks, PCIe link width, ECC, NVLink, `topo -m`), recent `dmesg` Xid lines, EFA port state and `fi_info` provider visibility, EFA installer + kernel module versions, CPU governor, NVL72 Fabric Manager state, FSx CloudWatch utilization, `df -h` / `lfs df -h` per mount, host iowait, `/dev/shm` size, and root-volume usage. All read-only.
+
+Tags: `[OK]` healthy · `[CONCERN]` signal worth investigating (carries a `→` pointer to the owner skill) · `[INFO]` informational.
+
+**Host vs container scope.** The script runs on the host via SSM and reports host-scope values. Many setups ship the EFA / libfabric / OFI-NCCL / CUDA stack inside the training container by design — a host value of `unknown` is not by itself a defect. What matters for performance is the stack the workload actually uses. Verify versions inside the container (and across nodes) via `hyperpod-version-checker` before drawing conclusions.
+
+## Step 2: Match signal → section
+
+| Observation                                                                   | Section                                                                                                                                |
+| ----------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
+| Pairwise NCCL bandwidth varies across node pairs / suspected straggler        | **[A: Uneven NCCL Performance](#a-uneven-nccl-performance)**                                                                           |
+| Nodes spread across AZs / network-node-layer labels / UltraServer boundaries  | **[A](#a-uneven-nccl-performance)**                                                                                                    |
+| EFA port not ACTIVE on a node, missing OFI plugin, or FI provider not visible | **[A](#a-uneven-nccl-performance)** + route to `hyperpod-node-debugger` § A; `hyperpod-version-checker` for cross-node version compare |
+| `iostat` shows high iowait, FSx CloudWatch utilization sustained near 100%    | **[B: Poor Filesystem Performance](#b-poor-filesystem-performance)**                                                                   |
+| DataLoader stalls, checkpoint dominates step time                             | **[B](#b-poor-filesystem-performance)**                                                                                                |
+| Xid line in `dmesg`, uncorrectable ECC, inactive NVLink lane, GPU ≥ 88°C      | Route to `hyperpod-node-debugger` § G                                                                                                  |
+| Container vs host version drift suspected                                     | Route to `hyperpod-version-checker`                                                                                                    |
+| `Cannot allocate memory` at `os.fork()`, root volume full, OOM events         | Route to `hyperpod-node-debugger` § I                                                                                                  |
+| NCCL timeout, hang, TCP fallback (`NET/OFI Using TCP`), RDMA memlock          | Route to `hyperpod-nccl`                                                                                                               |
+
+---
+
+## A: Uneven NCCL Performance
+
+The customer reports identical training jobs running with different step times on different node sets, pairwise bandwidth variance, or some allocations consistently slower than others despite identical code.
+
+Per the official troubleshooting guide, the common contributing factors are network topology differences between nodes (cross-AZ, cross-rack, cross-UltraServer), degraded EFA performance on some nodes, mixed instance types or generations within an instance group, and CPU frequency scaling differences.
+
+### Diagnostic pass (read-only)
+
+The host-side data points — GPU thermal/ECC/PCIe/clocks, Xid, NVLink lanes, EFA port state and provider visibility, CPU governor, EFA/OFI/driver versions, `nvidia-smi topo -m` — are all collected by `scripts/perf-snapshot.sh` (Step 1 above). The script tags `[CONCERN]` with thresholds and emits routing pointers; rerun it per suspect node via `--node <INSTANCE_ID>`.
+
+For driver / CUDA / NCCL / EFA / OFI version drift across nodes, run the `hyperpod-version-checker` skill.
+
+### Pairwise NCCL bandwidth test
+
+Run the standard `nccl-tests` recipes from [awslabs/awsome-distributed-training](https://github.com/awslabs/awsome-distributed-training/tree/main/micro-benchmarks/nccl-tests). For an N-node cluster, run all-reduce across every pair and record `busbw` for each pair. Pairs more than ~5% below the run mean (the threshold the AWS validation script flags) are straggler candidates.
+
+Expected `busbw` per SKU is published in the [AI-on-HyperPod NCCL test guide](https://awslabs.github.io/ai-on-sagemaker-hyperpod/docs/slurm-orchestration/validation-and-testing/performance-testing/nccl-tests). Benchmark the specific instance type before relying on a number.
+
+Pairwise scripts, HyperPod topology surfaces (HyperPod API, EKS labels, Slurm `topology.conf`), and GB200 NVL72 specifics are in [references/perf-details.md § Uneven NCCL](references/perf-details.md#uneven-nccl).
+
+### Topology verification
+
+HyperPod exposes topology through three operator-visible surfaces:
+
+- **HyperPod API**: `aws sagemaker describe-cluster-node` returns `NodeDetails.Placement.AvailabilityZone` / `AvailabilityZoneId` and `NodeDetails.UltraServerInfo.Id` (UltraServer SKUs only).
+- **EKS labels**: `topology.kubernetes.io/zone`, `topology.k8s.aws/network-node-layer-{1,2,3}` (highest-numbered = closest to instance), `topology.k8s.aws/ultraserver-id`.
+- **Slurm**: HyperPod auto-generates `topology.conf`. Inspect via `scontrol show topology`.
+
+Tightly coupled work should share the same AZ, the same highest-numbered network-node-layer label (EKS) or the same Slurm topology block, and — for NVL72 jobs — the same `UltraServerInfo.Id` / `topology.k8s.aws/ultraserver-id`. If the cluster is spread across AZs or layers, co-location must be re-established at provisioning time. Route provisioning changes to `hyperpod-cluster-debugger` § B (Capacity & AZ).
+
+---
+
+## B: Poor Filesystem Performance
+
+The customer reports training bottlenecked on data loading, checkpoint save/load dominating step time, executables/scripts loading slowly, or `iowait` high.
+
+Per the official troubleshooting guide, the resolution path follows this order:
+
+1. Check CloudWatch metrics on the filesystem.
+2. Check the provisioned performance configuration against workload requirements.
+3. Investigate which operations are causing the I/O — workload demand vs inefficient pattern.
+4. Consider upgrading provisioned performance.
+5. Choose the filesystem type that matches the I/O pattern.
+
+This skill covers steps 1–3. Steps 4–5 are customer decisions; surface the data and let the customer pick.
+
+### Diagnostic pass (read-only)
+
+`scripts/perf-snapshot.sh` (Step 1 above) covers the on-node side of this pass: it discovers FSx mounts, calls `aws cloudwatch get-metric-statistics` on `DataReadBytes` and (for OpenZFS) `FileServerDiskIopsUtilization`, prints `df -h` for `/fsx /opt/dlami/nvme /opt/sagemaker`, runs `lfs df -h` per Lustre mount, and reports `iostat` iowait. It tags `[CONCERN]` when OpenZFS IOPS utilization sustains ≥ 80% or iowait > 20%.
+
+For longer windows or additional metrics (`DataWriteBytes`, Lustre `DiskIopsUtilization`, OpenZFS `FileServerDiskThroughputUtilization`), drive the query directly:
+
+```bash
+aws cloudwatch get-metric-statistics --region <REGION> \
+  --namespace AWS/FSx --metric-name DataReadBytes \
+  --dimensions Name=FileSystemId,Value=<FSID> \
+  --start-time "$(date -u -d '3 hours ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-3H +%Y-%m-%dT%H:%M:%SZ)" \
+  --end-time   "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+  --period 60 --statistics Sum Maximum
+```
+
+The full per-filesystem-type metric catalog is in [references/perf-details.md § Filesystem](references/perf-details.md#filesystem).
+
+### Branches
+
+**Provisioned capacity is saturated.** CloudWatch utilization sustained near 100% across the workload window. Customer decision: scale up the filesystem.
+
+- FSx for Lustre throughput scales with `StorageCapacity × PerUnitStorageThroughput`; capacity changes are non-disruptive.
+- FSx for OpenZFS — increase provisioned IOPS or throughput.
+
+**I/O pattern is inefficient.** CloudWatch shows headroom but the workload is still I/O-bound. Customer decision: change the application.
+
+- DataLoader: raise `num_workers`, set `pin_memory=True`, `persistent_workers=True`.
+- Checkpointing: use async + sharded (`torch.distributed.checkpoint.async_save` plus FSDP `SHARDED_STATE_DICT`). `FULL_STATE_DICT` serializes through rank 0 and is a frequent root cause.
+- Small-file workloads: Lustre is optimized for large sequential I/O. For millions of small files, use WebDataset / tar shards, FSx for OpenZFS, or NVMe scratch.
+
+Filesystem-selection guidance and the async-checkpoint pattern are in [references/perf-details.md § Filesystem](references/perf-details.md#filesystem).
+
+---
+
+## Continuous health coverage
+
+Once the immediate incident is diagnosed, recommend HyperPod's built-in health features so problems are caught before the next training run rather than after another customer-reported regression.
+
+- **Enable `NodeRecovery=Automatic`** on the cluster. The Health Monitoring Agent (HMA) continuously monitors GPU- and Trainium-based instances and marks instances unhealthy on detected failure. With auto-recovery enabled, HyperPod reboots or replaces the node — no operator intervention.
+- **Enable `OnStartDeepHealthChecks` on every GPU instance group** with both check categories:
+  - `InstanceStress` — `stress-ng` on CPU/memory/disk, GPU and PCI device count verification, DCGM level-4 diagnostics (memory test included), and EFA loopback bandwidth/latency.
+  - `InstanceConnectivity` — multi-node NCCL all-reduce.
+
+  Every newly provisioned or auto-replaced node passes the same hardware bar before accepting jobs.
+
+- **Run on-demand deep health checks** when this skill or any sibling surfaces a hardware concern but the cluster is mid-workload. `aws sagemaker start-cluster-health-check` runs the same checks against a specific instance group; nodes are placed in a Slurm maintenance reservation and the check is queued until any running job completes (not preempted). Console: **HyperPod → Clusters → Instances → Run deep health checks**.
+
+  Not supported when `NodeProvisioningMode=Continuous`; one on-demand request per cluster at a time. Requires the latest AMI — run `UpdateClusterSoftware` first.
+
+Logs land in CloudWatch at `/aws/sagemaker/Clusters/<cluster_name>/<cluster_id>` under `DeepHealthCheckResults/<log_stream_id>`, and on each node at `/var/log/aws/clusters/sagemaker-deep-health-check.log`.
+
+## References
+
+- [references/perf-details.md](references/perf-details.md) — pairwise NCCL test recipes, HyperPod topology check, GB200 NVL72 placement; CloudWatch metric catalog per filesystem type, async-checkpoint pattern, filesystem selection guide.
+
+External:
+
+- Amazon SageMaker HyperPod troubleshooting guide (official): <https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md>
+- AI-on-HyperPod NCCL performance tests (expected `busbw` per SKU): <https://awslabs.github.io/ai-on-sagemaker-hyperpod/docs/slurm-orchestration/validation-and-testing/performance-testing/nccl-tests>
+- Amazon SageMaker HyperPod resiliency (NodeRecovery, HMA, auto-resume): <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency.html>
+- Amazon SageMaker HyperPod deep health checks: <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-deep-health-checks.html>
+- StartClusterHealthCheck API: <https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_StartClusterHealthCheck.html>
+- Amazon EC2 instance topology / network-node-layer labels: <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/how-ec2-instance-topology-works.html>
+- Amazon FSx for Lustre performance: <https://docs.aws.amazon.com/fsx/latest/LustreGuide/performance.html>
+- Amazon FSx for OpenZFS metrics: <https://docs.aws.amazon.com/fsx/latest/OpenZFSGuide/fsx-openzfs-metrics.html>
+- AWS Elastic Fabric Adapter and NCCL: <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html>
+- awslabs/awsome-distributed-training NCCL tests: <https://github.com/awslabs/awsome-distributed-training/tree/main/micro-benchmarks/nccl-tests>
diff --git a/plugins/sagemaker-ai/skills/hyperpod-performance-debugger/references/perf-details.md b/plugins/sagemaker-ai/skills/hyperpod-performance-debugger/references/perf-details.md
new file mode 100644
index 00000000..5ec2c8da
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-performance-debugger/references/perf-details.md
@@ -0,0 +1,202 @@
+# Performance Details
+
+Supplementary detail for `hyperpod-performance-debugger`. Two sections, matching the two scenarios the parent SKILL.md covers.
+
+## Contents
+
+1. [Uneven NCCL](#uneven-nccl)
+2. [Filesystem](#filesystem)
+3. [References](#references)
+
+---
+
+## Uneven NCCL
+
+### Pairwise NCCL all-reduce test
+
+Use the `nccl-tests` recipes from [awslabs/awsome-distributed-training](https://github.com/awslabs/awsome-distributed-training/tree/main/micro-benchmarks/nccl-tests). The repo ships `micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch` and a topology-aware pairwise sweep under `micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/`. For an N-node cluster, run all-reduce across every pair and record `busbw` for each pair. Pairs more than ~5% below the run mean (the threshold the AWS validation script flags) are straggler candidates.
+
+The topology-aware submit script uses `sbatch --array` to fan out pairwise jobs. The repo also ships `process_nccl_results.sh` as a CSV post-processor for the raw test output; it does not itself apply an outlier threshold — compare results against the published expected `busbw`.
+
+**Single-pair run on Slurm:**
+
+```bash
+sbatch -N 2 -w <NODE_A>,<NODE_B> nccl-tests-container.sbatch
+```
+
+**N-node aggregate run from a prebuilt container with NCCL + nccl-tests + aws-ofi-nccl baked in:**
+
+```bash
+srun -N <N> --mpi=pmix /path/in/container/all_reduce_perf -b 8 -e 8G -f 2 -g 8
+```
+
+### Expected bandwidth
+
+Always benchmark the specific SKU before relying on a number — averages across message sizes are misleading; focus on the message sizes the workload actually uses. AWS publishes expected `busbw` per SKU in the AI-on-HyperPod NCCL test guide.
+
+### EFA error-counter check (host)
+
+Non-zero per-port counters mean packet loss or link issues. An unreadable counter file is treated as 0 rather than flagged, so a device that does not expose these files stays quiet — on such devices, check `ports/1/hw_counters/` (e.g. `rx_drops`) instead. The data point names a specific node; route to `hyperpod-node-debugger` § A (EFA / Security Group) for the deeper read.
+
+**Check per-port EFA error counters via SSM:**
+
+```bash
+for dev in /sys/class/infiniband/*/; do
+  name=$(basename "$dev")
+  rcv_err=$(cat "$dev/ports/1/counters/port_rcv_errors" 2>/dev/null)
+  xmit_disc=$(cat "$dev/ports/1/counters/port_xmit_discards" 2>/dev/null)
+  if [ "${rcv_err:-0}" != "0" ] || [ "${xmit_disc:-0}" != "0" ]; then
+    echo "PROBLEM: $name rcv_errors=$rcv_err xmit_discards=$xmit_disc"
+  fi
+done
+```
+
+EFA firmware should also match across nodes (compare via `hyperpod-version-checker`):
+
+```bash
+cat /sys/class/infiniband/*/fw_ver 2>/dev/null
+```
+
+### HyperPod topology surfaces
+
+HyperPod models co-location through three operator-visible surfaces — check each one that applies to the cluster.
+
+**Validate per-node AZ and UltraServer assignment via the HyperPod API:**
+
+```bash
+for id in $(aws sagemaker list-cluster-nodes --cluster-name <C> --region <R> \
+             --query 'ClusterNodeSummaries[*].InstanceId' --output text); do
+  aws sagemaker describe-cluster-node --cluster-name <C> --region <R> \
+    --node-id "$id" \
+    --query 'NodeDetails.{ID:InstanceId,AZ:Placement.AvailabilityZone,AZID:Placement.AvailabilityZoneId,UltraServer:UltraServerInfo.Id}' \
+    --output table
+done
+```
+
+**Check EKS topology labels:**
+
+```bash
+kubectl get nodes -L \
+  topology.kubernetes.io/zone,\
+  topology.k8s.aws/network-node-layer-1,\
+  topology.k8s.aws/network-node-layer-2,\
+  topology.k8s.aws/network-node-layer-3,\
+  topology.k8s.aws/ultraserver-id
+```
+
+**Check Slurm topology:**
+
+```bash
+scontrol show topology
+grep -E 'TopologyPlugin|BlockSizes' \
+  /var/spool/slurm/slurm.conf /var/spool/slurm/topology.conf 2>/dev/null
+```
+
+Tightly coupled work should share the same AZ, the same highest-numbered `network-node-layer-*` label (EKS) or the same Slurm topology block, and — for NVL72 jobs — the same `UltraServerInfo.Id` / `topology.k8s.aws/ultraserver-id`. If the cluster is spread across AZs or layers, co-location has to be re-established at provisioning time. Route provisioning changes to `hyperpod-cluster-debugger` § B (Capacity & AZ).
+
+### EFA version consistency
+
+All nodes in the training group must run identical EFA and OFI-NCCL versions. Mismatches can materially degrade pairwise bandwidth. Compare across nodes via `hyperpod-version-checker`.
+
+### GB200 NVL72 UltraServer
+
+`p6e-gb200.36xlarge` is fundamentally different from p5/p6-b200. One UltraServer = 18 instances × 4 Blackwell GPUs = 72 GPUs inside one NVLink domain, stitched across the 18 instances by NVIDIA IMEX.
+
+For uneven-NCCL triage on NVL72:
+
+- If the variance is **inside one UltraServer**, the IMEX / NVLink fabric is a candidate. Surface `nvidia-smi topo -m` and `systemctl status nvidia-fabricmanager` as data points; route to `hyperpod-node-debugger` § G for the deeper read. Fabric failures hard-fail CUDA init with SXid errors rather than silently degrading, so a clean `nvidia-smi` typically rules out the fabric.
+- If the variance is **across UltraServers**, the workload placement could be wrong — the NVL72 is meant to contain a single tightly coupled group. Verify the auto-configured `topology/block` (Slurm, `BlockSizes=18`) or the EKS `topology.k8s.aws/ultraserver-id` label.
+
+---
+
+## Filesystem
+
+### CloudWatch metrics per filesystem type
+
+All metrics live in the `AWS/FSx` namespace. Dimension: `FileSystemId`.
+
+#### FSx for Lustre (`FileSystemType: LUSTRE`)
+
+| Metric                    | What it means                                | Statistic |
+| ------------------------- | -------------------------------------------- | --------- |
+| `DataReadBytes`           | Aggregate read throughput (Bytes)            | Sum       |
+| `DataWriteBytes`          | Aggregate write throughput (Bytes)           | Sum       |
+| `MetadataOperations`      | File-open, stat, readdir rate (Count)        | Sum       |
+| `FreeDataStorageCapacity` | Remaining bytes — low values throttle writes | Minimum   |
+| `DiskIopsUtilization`     | % of provisioned IOPS in use (Percent)       | Maximum   |
+
+Lustre throughput scales as `StorageCapacity_TiB × PerUnitStorageThroughput_MBps`. Capacity changes are non-disruptive.
+
+#### FSx for OpenZFS (`FileSystemType: OPENZFS`)
+
+| Metric                                       | What it means                              | Statistic        |
+| -------------------------------------------- | ------------------------------------------ | ---------------- |
+| `DataReadBytes` / `DataWriteBytes`           | Aggregate throughput (Bytes)               | Sum              |
+| `DataReadOperations` / `DataWriteOperations` | Client IOPS (Count)                        | Sum              |
+| `NetworkThroughputUtilization`               | % of provisioned network throughput in use | Average, Maximum |
+| `FileServerDiskIopsUtilization`              | % of disk IOPS in use                      | Average, Maximum |
+| `FileServerDiskThroughputUtilization`        | % of disk throughput in use                | Average, Maximum |
+| `CPUUtilization`                             | File server CPU %                          | Average, Maximum |
+
+The utilization metrics (percent) are the authoritative saturation signals. There is no `ReadIOPS` metric in `AWS/FSx` — that is an Amazon RDS metric; use `DataReadOperations` / `DataWriteOperations` for client IOPS.
+
+#### EBS (`AWS/EBS` namespace)
+
+`VolumeReadOps`, `VolumeWriteOps`, `VolumeQueueLength`. A sustained `VolumeQueueLength > 1` typically indicates the volume is the bottleneck. For `gp3`, also compare against the provisioned IOPS / throughput configured on the volume.
+
+### NVMe (instance-local)
+
+Mounted at `/opt/dlami/nvme`. **Ephemeral** — data is lost on stop, replace, or hardware failure. Use for scratch and caches, not persistent state. Available capacity varies by instance type.
+
+### Secondary EBS volume (`/opt/sagemaker`)
+
+The secondary EBS volume is the persistent per-instance storage HyperPod attaches at `/opt/sagemaker`. It is configured per instance group via `ClusterEbsVolumeConfig` (root volume is fixed; secondary is what you size). When the volume backing it fills up and the customer needs more space, there are two paths.
+
+#### Path 1 — Resize via the instance group (takes effect on replacement)
+
+`ClusterEbsVolumeConfig` carries `VolumeSizeInGB` on each instance group. Update the instance group with a larger value via an `UpdateCluster` call or through CloudFormation/Terraform.
+
+Important: the new size applies to **newly provisioned or replaced nodes**, not to running nodes. Existing nodes keep their original secondary EBS until they're replaced (auto-recovery, on-demand deep health check that fails, or `BatchReplaceClusterNodes`).
+
+When to use this path:
+
+- The customer wants the new size to be the standard for the instance group going forward.
+- A rolling replacement is acceptable (data on `/opt/sagemaker` of the existing nodes does not survive replacement — checkpoints / artifacts on shared storage like FSx are unaffected).
+
+#### Path 2 — Attach an extra EBS volume to a running node (EKS only)
+
+`AttachClusterNodeVolume` attaches an existing EBS volume to a running HyperPod EKS node without replacement. This is the EBS CSI driver path — typically driven by Kubernetes PersistentVolumeClaims rather than called directly, but the API is available for ad-hoc attachment.
+
+Constraints (per the API):
+
+- EKS-orchestrated cluster only; the cluster must be `InService`.
+- The target node cannot be in a Restricted Instance Group (RIG).
+- The EBS volume must already exist and be in the `available` state, in the same AZ as the node.
+- A complementary `DetachClusterNodeVolume` removes the volume.
+
+### Filesystem selection by pattern
+
+| Pattern                       | Best fit                               | Why                                     |
+| ----------------------------- | -------------------------------------- | --------------------------------------- |
+| Large sequential I/O          | FSx for Lustre                         | Striping scales with OSTs               |
+| Small random I/O, mixed reads | FSx for OpenZFS                        | POSIX + better small-file performance   |
+| Temporary high-perf scratch   | NVMe (`/opt/dlami/nvme`)               | High aggregate throughput, zero network |
+| Single-node persistent        | EBS (`/opt/sagemaker`)                 | Root fixed at 100 GiB; EBS configurable |
+| Datasets (cold + warm)        | S3 + Mountpoint-S3 for streaming reads | Scales infinitely, no provisioned limit |
+
+For HyperPod Slurm, the default lifecycle script offers FSx for OpenZFS as an alternative to Lustre for home directories — useful when the home tree has small-file metadata pressure.
+
+---
+
+## References
+
+- Amazon SageMaker HyperPod troubleshooting guide (official): <https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md>
+- AI-on-HyperPod NCCL performance test guide (expected `busbw` per SKU): <https://awslabs.github.io/ai-on-sagemaker-hyperpod/docs/slurm-orchestration/validation-and-testing/performance-testing/nccl-tests>
+- AI-on-HyperPod GPU stress testing: <https://awslabs.github.io/ai-on-sagemaker-hyperpod/docs/validation-and-testing/performance-testing/gpu-stress-testing>
+- Amazon SageMaker HyperPod resiliency (NodeRecovery, Health Monitoring Agent, auto-resume): <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency.html>
+- Amazon SageMaker HyperPod deep health checks: <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-deep-health-checks.html>
+- AWS Elastic Fabric Adapter and NCCL: <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html>
+- Amazon FSx for Lustre performance: <https://docs.aws.amazon.com/fsx/latest/LustreGuide/performance.html>
+- Amazon FSx for OpenZFS metrics: <https://docs.aws.amazon.com/fsx/latest/OpenZFSGuide/fsx-openzfs-metrics.html>
+- awslabs/awsome-distributed-training NCCL tests: <https://github.com/awslabs/awsome-distributed-training/tree/main/micro-benchmarks/nccl-tests>
+- Amazon EC2 instance topology (network-node-layer ordering): <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/how-ec2-instance-topology-works.html>
diff --git a/plugins/sagemaker-ai/skills/hyperpod-performance-debugger/scripts/perf-snapshot.sh b/plugins/sagemaker-ai/skills/hyperpod-performance-debugger/scripts/perf-snapshot.sh
new file mode 100755
index 00000000..0b9f9095
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-performance-debugger/scripts/perf-snapshot.sh
@@ -0,0 +1,666 @@
+#!/usr/bin/env bash
+# perf-snapshot.sh
+#
+# Read-only host-side snapshot for the two performance scenarios that
+# hyperpod-performance-debugger covers:
+#
+#   A. Uneven NCCL performance (host-side EFA reachability, NVLink, Fabric
+#      Manager, recent dmesg events that contextualize bandwidth variance)
+#   B. Poor filesystem performance (FSx CloudWatch utilization for actually
+#      mounted filesystems, on-node iowait)
+#
+#
+# Usage:
+#   bash perf-snapshot.sh --cluster <NAME|ARN> --region <REGION>
+#   bash perf-snapshot.sh --cluster <N> --region <R> --node <INSTANCE_ID>
+#   bash perf-snapshot.sh --cluster <N> --region <R> --no-color > report.txt
+#
+# Required IAM (on the calling principal):
+#   sagemaker:DescribeCluster, sagemaker:ListClusterNodes,
+#     sagemaker:DescribeClusterNode
+#   fsx:DescribeFileSystems
+#   cloudwatch:GetMetricStatistics
+#   ssm:StartSession, ssm:TerminateSession
+#
+# Note: HyperPod-managed instances are not reliably addressable via
+# ec2:DescribeInstances from the operator role, so this script stays on
+# SageMaker HyperPod APIs + IMDS (via SSM) for per-instance metadata.
+#
+# Prerequisites on the calling machine:
+#   bash 4+ (mapfile), aws CLI v2, jq, session-manager-plugin (for SSM calls),
+#   timeout (GNU coreutils; bounds SSM calls), unbuffer (from the `expect` package;
+#   works around a session-manager-plugin stdout race — see ssm_run below).
+
+set -uo pipefail
+
+# ---------------------------------------------------------------------------
+# Argument parsing
+# ---------------------------------------------------------------------------
+CLUSTER=""
+REGION="${AWS_DEFAULT_REGION:-us-east-1}"
+TARGET_NODE=""
+NO_COLOR="${NO_COLOR:-}"
+
+usage() {  # print the file's leading comment block, stopping at the first non-comment line
+  awk 'NR == 1 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "$0"
+}
+
+while [[ $# -gt 0 ]]; do  # a failed `shift 2` shifts nothing: a value-less flag must exit, not spin
+  case "$1" in
+    --cluster)   CLUSTER="${2:-}";     shift 2 || { echo "Error: $1 requires a value" >&2; exit 2; } ;;
+    --region)    REGION="${2:-}";      shift 2 || { echo "Error: $1 requires a value" >&2; exit 2; } ;;
+    --node)      TARGET_NODE="${2:-}"; shift 2 || { echo "Error: $1 requires a value" >&2; exit 2; } ;;
+    --no-color)  NO_COLOR=1;           shift 1 ;;
+    -h|--help)   usage; exit 0 ;;
+    *) echo "Unknown arg: $1" >&2; usage; exit 2 ;;
+  esac
+done
+
+# ---------------------------------------------------------------------------
+# Input validation — these values flow into AWS API calls and SSM payloads.
+# ---------------------------------------------------------------------------
+[[ -z "$CLUSTER" ]] && { echo "Error: --cluster required" >&2; exit 2; }
+
+# Cluster name or ARN (see AWS SageMaker BatchReplaceClusterNodesRequest pattern)
+if ! [[ "$CLUSTER" =~ ^(arn:aws[a-z-]*:sagemaker:[a-z0-9-]*:[0-9]{12}:cluster/[a-z0-9]{12})$|^[a-zA-Z0-9][-a-zA-Z0-9]{0,62}$ ]]; then
+  echo "Error: invalid cluster name or ARN: $CLUSTER" >&2
+  exit 2
+fi
+
+# Region
+if ! [[ "$REGION" =~ ^[a-z]{2}-[a-z]+-[0-9]{1,2}$ ]]; then
+  echo "Error: invalid region: $REGION" >&2
+  exit 2
+fi
+
+# Optional node — EC2 instance ID
+if [[ -n "$TARGET_NODE" ]] && ! [[ "$TARGET_NODE" =~ ^i-[a-f0-9]{8,17}$ ]]; then
+  echo "Error: invalid --node (expected i-<hex>): $TARGET_NODE" >&2
+  exit 2
+fi
+
+# Dependency check
+for cmd in aws jq; do
+  command -v "$cmd" >/dev/null 2>&1 || { echo "Error: '$cmd' is required" >&2; exit 2; }
+done
+if ! command -v session-manager-plugin >/dev/null 2>&1; then
+  echo "Warning: session-manager-plugin not found; on-node probes will fail" >&2
+fi
+if ! command -v unbuffer >/dev/null 2>&1; then
+  echo "Warning: 'unbuffer' (from the 'expect' package) not found — SSM calls" >&2
+  echo "         can intermittently return empty output. Install with" >&2
+  echo "         'sudo yum install expect' / 'sudo apt install expect' / 'brew install expect'." >&2
+fi
+
+# ---------------------------------------------------------------------------
+# Output helpers (TTY-gated; respect NO_COLOR)
+# ---------------------------------------------------------------------------
+if [[ -t 1 ]] && [[ -z "$NO_COLOR" ]]; then
+  GREEN=$'\033[0;32m'; YELLOW=$'\033[1;33m'
+  CYAN=$'\033[0;36m';  BOLD=$'\033[1m';    NC=$'\033[0m'
+else
+  GREEN=""; YELLOW=""; CYAN=""; BOLD=""; NC=""
+fi
+
+section() { printf '\n%s== %s ==%s\n' "${BOLD}${CYAN}" "$1" "$NC"; }
+ok()      { printf '  %s[OK     ]%s %s\n' "$GREEN" "$NC" "$1"; }
+concern() { printf '  %s[CONCERN]%s %s\n' "$YELLOW" "$NC" "$1"; }
+info()    { printf '             %s\n' "$1"; }
+
+# Pointers (sibling skill / SKILL.md section to read after a [CONCERN] line)
+NEXT=()
+
+# ---------------------------------------------------------------------------
+# Cluster + node list
+# ---------------------------------------------------------------------------
+DESC=$(aws sagemaker describe-cluster --cluster-name "$CLUSTER" --region "$REGION" --output json 2>&1) \
+  || { echo "Error: describe-cluster failed: $DESC" >&2; exit 3; }
+CLUSTER_ID=$(echo "$DESC" | jq -r '.ClusterArn' | awk -F/ '{print $NF}')
+
+NODES=$(aws sagemaker list-cluster-nodes --cluster-name "$CLUSTER" --region "$REGION" --output json 2>&1) \
+  || { echo "Error: list-cluster-nodes failed: $NODES" >&2; exit 3; }
+
+# Pick target node
+if [[ -n "$TARGET_NODE" ]]; then
+  TGT_ID="$TARGET_NODE"
+else
+  TGT_ID=$(echo "$NODES" | jq -r '
+    [.ClusterNodeSummaries[] | select(.InstanceGroupName|test("controller|head";"i")|not)][0].InstanceId
+    // .ClusterNodeSummaries[0].InstanceId // empty')
+fi
+[[ -z "$TGT_ID" ]] && { echo "Error: no nodes found in cluster" >&2; exit 3; }
+
+TGT_GROUP=$(echo "$NODES" | jq -r --arg id "$TGT_ID" \
+  '.ClusterNodeSummaries[] | select(.InstanceId==$id) | .InstanceGroupName // empty')
+[[ -z "$TGT_GROUP" ]] && { echo "Error: node $TGT_ID not found in cluster" >&2; exit 3; }
+
+SSM_TARGET="sagemaker-cluster:${CLUSTER_ID}_${TGT_GROUP}-${TGT_ID}"
+
+# Instance type from list-cluster-nodes output (already fetched). No EC2 call.
+INSTANCE_TYPE=$(echo "$NODES" | jq -r --arg id "$TGT_ID" \
+  '.ClusterNodeSummaries[] | select(.InstanceId==$id) | .InstanceType // empty')
+IS_NVL72=0
+if [[ "$INSTANCE_TYPE" =~ ^ml\.p6e-gb200|^ml\.p6e-gb300|^p6e-gb200|^p6e-gb300 ]]; then
+  IS_NVL72=1
+fi
+
+# ---------------------------------------------------------------------------
+# ssm_run TARGET CMD — run CMD via `bash -c` on TARGET through an SSM session.
+# Injection-safe (request passed as a JSON file). Prints the remote output on
+# stdout with session banners, ANSI colors and CRs stripped; stderr is dropped.
+# Bounded to 60s when timeout/gtimeout exists (not every platform ships one).
+# ---------------------------------------------------------------------------
+ssm_run() {
+  local target="$1" cmd="$2" json_file
+  local -a runner=(aws)
+  json_file=$(mktemp) || return 1
+  # shellcheck disable=SC2064
+  trap "rm -f '$json_file'" RETURN
+  jq -n --arg t "$target" --arg c "$cmd" '{
+    Target: $t,
+    DocumentName: "AWS-StartNonInteractiveCommand",
+    Parameters: { command: [ ("bash -c " + ($c | @sh)) ] }
+  }' > "$json_file"
+
+  command -v unbuffer >/dev/null 2>&1 && runner=(unbuffer aws)
+  # Probe for the timeout binary: invoking a missing one would fail every call
+  # silently (stderr is discarded) and each on-node check would misreport.
+  if command -v timeout >/dev/null 2>&1; then runner=(timeout 60 "${runner[@]}")
+  elif command -v gtimeout >/dev/null 2>&1; then runner=(gtimeout 60 "${runner[@]}"); fi
+  "${runner[@]}" ssm start-session --region "$REGION" \
+    --cli-input-json "file://${json_file}" 2>/dev/null \
+    | sed -e $'s/\r$//' -e $'s/\033\\[[0-9;]*m//g' \
+          -e '/^Starting session/d' \
+          -e '/^Exiting session/d' \
+          -e '/^Cannot perform start session: EOF$/d'
+}
+
+# ssm_json: run a payload that is expected to print a single JSON document on
+# stdout. On parse failure (probe missing, jq absent, command timeout) returns
+# the empty object so callers can use jq with safe defaults.
+ssm_json() {
+  local node="$1" payload="$2" reply
+  reply=$(ssm_run "$node" "$payload")
+  # Anything jq cannot parse (or a bare null/false document) collapses to {}.
+  if ! jq -e . <<<"$reply" >/dev/null 2>&1; then
+    reply='{}'
+  fi
+  printf '%s' "$reply"
+}
+
+# ---------------------------------------------------------------------------
+# A. Uneven NCCL — placement and EFA reachability data points
+# ---------------------------------------------------------------------------
+section "A. NCCL topology & EFA reachability"
+
+# AZ placement — use sagemaker:DescribeClusterNode which returns
+# Placement.AvailabilityZone. No ec2:DescribeInstances needed.
+#
+# DescribeClusterNode has no batch form, so this is O(N) API calls. Cap the
+# sample to keep runtime bounded; a single outlier AZ is enough to surface
+# the concern. Customer can run sagemaker list-cluster-nodes for a full audit.
+mapfile -t ALL_IDS < <(echo "$NODES" | jq -r '.ClusterNodeSummaries[].InstanceId // empty')
+AZ_SAMPLE_CAP=20
+if [[ "${#ALL_IDS[@]}" -eq 0 ]]; then
+  info "no instance IDs in cluster node list; skipping placement check"
+else
+  SAMPLE_N=${#ALL_IDS[@]}
+  TRUNCATED=0
+  if (( SAMPLE_N > AZ_SAMPLE_CAP )); then
+    SAMPLE_N=$AZ_SAMPLE_CAP
+    TRUNCATED=1
+  fi
+  AZS=""
+  for ((i = 0; i < SAMPLE_N; i++)); do
+    id="${ALL_IDS[$i]}"
+    az=$(aws sagemaker describe-cluster-node --cluster-name "$CLUSTER" --region "$REGION" \
+      --node-id "$id" --query 'NodeDetails.Placement.AvailabilityZone' --output text 2>/dev/null) || az=""
+    [[ -n "$az" && "$az" != "None" ]] && AZS+="${az}"$'\n'
+  done
+  UNIQ_AZ=$(echo "$AZS" | awk 'NF' | sort -u | wc -l)
+  if (( UNIQ_AZ > 1 )); then
+    concern "sampled nodes span $UNIQ_AZ AZs — cross-AZ placement is a known cause of uneven NCCL"
+    info "→ SKILL.md § A (Uneven NCCL); for re-provisioning, → hyperpod-cluster-debugger § B"
+    NEXT+=("A")
+  elif (( UNIQ_AZ == 1 )); then
+    ok "sampled nodes share a single AZ"
+  else
+    info "no AZ returned by DescribeClusterNode; skipping placement check"
+  fi
+  (( TRUNCATED )) && info "sampled first $AZ_SAMPLE_CAP of ${#ALL_IDS[@]} nodes; sagemaker list-cluster-nodes for a full audit"
+fi
+
+# EFA + container toolkit stack versions — sample from the target node so the
+# customer has a starting point. For cross-node comparison, route to
+# hyperpod-version-checker rather than re-implementing it here.
+STACK_JSON=$(ssm_json "$SSM_TARGET" '
+  pkgver() {
+    pkg=$1
+    if command -v dpkg >/dev/null 2>&1; then
+      v=$(dpkg-query -W -f="\${Version}" "$pkg" 2>/dev/null)
+    fi
+    if [ -z "${v:-}" ] && command -v rpm >/dev/null 2>&1; then
+      v=$(rpm -q --qf "%{VERSION}-%{RELEASE}" "$pkg" 2>/dev/null)
+      case "$v" in [0-9]*) ;; *) v="" ;; esac
+    fi
+    printf "%s" "${v:-}"
+  }
+
+  efa_inst=$(grep -iE "^EFA[[:space:]]+(installer[[:space:]]+)?version" \
+    /opt/amazon/efa_installed_packages 2>/dev/null \
+    | head -1 | sed -E "s/.*[:=][[:space:]]*//")
+  efa_mod=$(modinfo efa 2>/dev/null | awk "/^version:/ {print \$2; exit}")
+  ofi=$(pkgver aws-ofi-nccl)
+  # fi_info -v means --verbose; only --version prints the "libfabric: <ver>" line parsed here.
+  libfabric=$(fi_info --version 2>/dev/null | awk -F": " "/libfabric/{print \$2; exit}")
+  driver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1)
+  nvct=$(pkgver nvidia-container-toolkit)
+  jq -n \
+    --arg efa_installer "$efa_inst" \
+    --arg efa_kmod      "$efa_mod"  \
+    --arg ofi_nccl      "$ofi"      \
+    --arg libfabric     "$libfabric" \
+    --arg driver        "$driver"   \
+    --arg nvct          "$nvct"     \
+    "{efa_installer:\$efa_installer, efa_kmod:\$efa_kmod, ofi_nccl:\$ofi_nccl, libfabric:\$libfabric, driver:\$driver, nvct:\$nvct}"
+')
+# Render to operator output. The `nz` helper maps null/empty values to
+# "unknown" so no column is blank when a component is absent from the host.
+while IFS=$'\t' read -r k v; do
+  info "$TGT_ID host: ${k}=${v}"
+done < <(echo "$STACK_JSON" | jq -r '
+  def nz(x): if (x // "") == "" then "unknown" else x end;
+  . as $s
+  | [
+      ["EFA",       (nz($s.efa_installer) + " (kmod=" + nz($s.efa_kmod) + ")")],
+      ["OFI_NCCL",   nz($s.ofi_nccl)],
+      ["LIBFABRIC",  nz($s.libfabric)],
+      ["DRIVER",     nz($s.driver)],
+      ["NVCT",       nz($s.nvct)]
+    ]
+  | .[] | @tsv
+')
+info "values above are host-scope; the workload may use a different EFA/OFI/CUDA stack inside the container — verify via hyperpod-version-checker"
+info "for cross-node version comparison, → hyperpod-version-checker"
+
+# EFA fabric reachability — port state and provider visibility. SG-level rules
+# are not directly inspectable from this role; route to hyperpod-cluster-debugger
+# § A for the cluster-wide EFA SG check.
+EFA_JSON=$(ssm_json "$SSM_TARGET" '
+  total=0; active=0
+  for p in /sys/class/infiniband/*/ports/1/state; do
+    [ -e "$p" ] || continue
+    total=$((total+1))
+    grep -q ACTIVE "$p" 2>/dev/null && active=$((active+1))
+  done
+  if fi_info -p efa >/dev/null 2>&1; then
+    fi_info_ok=true
+  else
+    fi_info_ok=false
+  fi
+  jq -n \
+    --argjson total       "$total" \
+    --argjson active      "$active" \
+    --argjson fi_info_ok  "$fi_info_ok" \
+    "{ports:{total:\$total, active:\$active}, fi_info_ok:\$fi_info_ok}"
+')
+IFS=$'\t' read -r EFA_TOTAL EFA_ACTIVE EFA_FI_OK < <(echo "$EFA_JSON" \
+  | jq -r '[.ports.total // -1, .ports.active // 0, .fi_info_ok // false] | @tsv')
+EFA_TOTAL=${EFA_TOTAL:--1}   # -1: the probe returned nothing (ssm_json gave {}), not "zero devices"
+if (( EFA_TOTAL < 0 )); then
+  info "EFA probe returned no data from $TGT_ID (SSM session failed or jq missing on node); EFA checks skipped"
+elif (( EFA_TOTAL > 0 && EFA_ACTIVE == EFA_TOTAL )); then
+  ok "EFA port state on $TGT_ID: ${EFA_ACTIVE}/${EFA_TOTAL} ACTIVE"
+else
+  if (( EFA_TOTAL == 0 )); then concern "no EFA devices visible on $TGT_ID"
+  else concern "EFA port state on $TGT_ID: ${EFA_ACTIVE}/${EFA_TOTAL} ACTIVE"; fi
+  info "→ hyperpod-node-debugger § A (EFA / Security Group)"
+  NEXT+=("A")
+fi
+if [[ "$EFA_FI_OK" != "true" ]] && (( EFA_TOTAL > 0 )); then
+  concern "libfabric does not see the EFA provider on $TGT_ID — NCCL would fall back to TCP"
+  info "→ hyperpod-nccl § 13 (EFA TCP fallback) / hyperpod-cluster-debugger § A"
+  NEXT+=("A")
+fi
+info "EFA self-referencing security-group rule is a cluster-wide check — → hyperpod-cluster-debugger § A"
+
+# GPU/NIC topology snapshot — raw informational print so the operator can see
+# how PCIe / NVLink edges connect GPUs to NICs without re-running on the node.
+TOPO=$(ssm_run "$SSM_TARGET" "nvidia-smi topo -m 2>/dev/null")
+if [[ -n "$TOPO" ]]; then
+  info "nvidia-smi topo -m on $TGT_ID:"
+  echo "$TOPO" | sed 's/^/             /'
+fi
+
+# ---------------------------------------------------------------------------
+# B. Filesystem — CloudWatch utilization + on-node iowait
+# ---------------------------------------------------------------------------
+section "B. Filesystem saturation"
+
+# Scope FSx query to filesystems actually mounted on the target node.
+FSIDS_JSON=$(ssm_json "$SSM_TARGET" '
+  ids=$(mount | awk "/lustre|zfs/ {print \$1}" | grep -oE "fs-[a-f0-9]+" | sort -u)
+  if [ -z "$ids" ]; then
+    echo "[]"
+  else
+    printf "%s\n" "$ids" | jq -R . | jq -s .
+  fi
+')
+mapfile -t FSID_ARRAY < <(echo "$FSIDS_JSON" | jq -r '.[]?')
+
+if [[ ${#FSID_ARRAY[@]} -eq 0 ]]; then
+  info "no FSx filesystems mounted on $TGT_ID"
+else
+  FSX_DESC=$(aws fsx describe-file-systems --region "$REGION" \
+    --file-system-ids "${FSID_ARRAY[@]}" --output json 2>/dev/null || echo '{}')
+  FSCOUNT=$(echo "$FSX_DESC" | jq '.FileSystems | length // 0')
+
+  if (( FSCOUNT == 0 )); then
+    info "FSx filesystems ${FSID_ARRAY[*]} are mounted but describe-file-systems returned nothing (cross-account?)"
+  else
+    while IFS=$'\t' read -r fsid fstype; do
+      [[ -z "$fsid" ]] && continue
+      # Sum per 60 s period = bytes/min; the Maximum statistic is a per-disk figure, not file-system throughput.
+      val=$(aws cloudwatch get-metric-statistics --region "$REGION" \
+        --namespace AWS/FSx --metric-name DataReadBytes \
+        --dimensions "Name=FileSystemId,Value=${fsid}" \
+        --start-time "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -v-1H +%Y-%m-%dT%H:%M:%S)" \
+        --end-time   "$(date -u +%Y-%m-%dT%H:%M:%S)" \
+        --period 60 --statistics Sum --output json 2>/dev/null \
+        | jq -r '[.Datapoints[].Sum] | max // 0')
+      info "${fstype} ${fsid}: max 1h DataReadBytes = ${val} bytes/min"
+      if [[ "$fstype" == "OPENZFS" ]]; then
+        util=$(aws cloudwatch get-metric-statistics --region "$REGION" \
+          --namespace AWS/FSx --metric-name FileServerDiskIopsUtilization \
+          --dimensions "Name=FileSystemId,Value=${fsid}" \
+          --start-time "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -v-1H +%Y-%m-%dT%H:%M:%S)" \
+          --end-time   "$(date -u +%Y-%m-%dT%H:%M:%S)" \
+          --period 60 --statistics Maximum --output json 2>/dev/null \
+          | jq -r '[.Datapoints[].Maximum] | max // 0')
+        info "         max 1h FileServerDiskIopsUtilization = ${util}%"
+        util_int=${util%.*}
+        if [[ "$util_int" =~ ^[0-9]+$ ]] && (( util_int >= 80 )); then
+          concern "OpenZFS $fsid disk IOPS utilization sustained ≥ 80% (peak ${util}%)"
+          info "→ SKILL.md § B (Poor Filesystem Performance)"
+          NEXT+=("B")
+        fi
+      fi
+    done < <(echo "$FSX_DESC" | jq -r '.FileSystems[]? | [.FileSystemId, .FileSystemType] | @tsv')
+    info "review the FSx dashboards for sustained near-provisioned-limit usage (script reports peaks only)"
+  fi
+fi
+
+# On-node mount-point capacity — surface usage on FSx / NVMe / SageMaker paths.
+# Includes lfs df per Lustre mount so the operator can see OST/MDT fill.
+DF_JSON=$(ssm_json "$SSM_TARGET" '
+  mounts="[]"
+  for p in /fsx /opt/dlami/nvme /opt/sagemaker; do
+    [ -e "$p" ] || continue
+    line=$(df -h "$p" 2>/dev/null | awk "NR==2") || continue
+    [ -z "$line" ] && continue
+    fs=$(echo "$line"  | awk "{print \$1}")
+    sz=$(echo "$line"  | awk "{print \$2}")
+    used=$(echo "$line" | awk "{print \$3}")
+    avail=$(echo "$line" | awk "{print \$4}")
+    pct=$(echo "$line" | awk "{print \$5}")
+    entry=$(jq -n \
+      --arg path  "$p"    --arg fs    "$fs"   --arg size "$sz" \
+      --arg used  "$used" --arg avail "$avail" --arg pct "$pct" \
+      "{path:\$path, fs:\$fs, size:\$size, used:\$used, avail:\$avail, pct:\$pct}")
+    mounts=$(jq --argjson e "$entry" ". + [\$e]" <<< "$mounts")
+  done
+
+  lustre="[]"
+  while IFS= read -r mnt; do
+    [ -z "$mnt" ] && continue
+    out=$(lfs df -h "$mnt" 2>/dev/null) || continue
+    [ -z "$out" ] && continue
+    rows=$(printf "%s\n" "$out" | jq -R . | jq -s .)
+    entry=$(jq -n --arg mnt "$mnt" --argjson rows "$rows" \
+      "{mount:\$mnt, rows:\$rows}")
+    lustre=$(jq --argjson e "$entry" ". + [\$e]" <<< "$lustre")
+  done < <(mount | awk "/lustre/ {print \$3}")
+
+  jq -n \
+    --argjson mounts "$mounts" \
+    --argjson lustre "$lustre" \
+    "{mounts:\$mounts, lustre:\$lustre}"
+')
+while IFS=$'\t' read -r path fs size used avail pct; do
+  [[ -z "$path" ]] && continue
+  info "df ${path}: ${used} used / ${size} (${pct}, ${avail} free) on ${fs}"
+done < <(echo "$DF_JSON" | jq -r '.mounts[]? | [.path, .fs, .size, .used, .avail, .pct] | @tsv')
+
+LAST_LFS_MNT=""
+while IFS=$'\t' read -r mnt row; do
+  [[ -z "$mnt" ]] && continue
+  if [[ "$mnt" != "$LAST_LFS_MNT" ]]; then
+    info "lfs df -h ${mnt}:"
+    LAST_LFS_MNT="$mnt"
+  fi
+  info "             ${row}"
+done < <(echo "$DF_JSON" | jq -r '.lustre[]? | . as $e | $e.rows[] | [$e.mount, .] | @tsv')
+
+# On-node iowait via iostat. iostat ends each report with a blank line, so the
+# last record has no 4th field; keep %iowait from the last all-numeric row.
+IOWAIT=$(ssm_run "$SSM_TARGET" "iostat -c 1 2 2>/dev/null | awk '\$1 ~ /^[0-9.]+\$/ && NF >= 6 {v = \$4} END {print v}'" | tr -d '\r \n')
+if [[ -n "$IOWAIT" ]]; then
+  IOWAIT_INT=${IOWAIT%.*}
+  if [[ "$IOWAIT_INT" =~ ^[0-9]+$ ]]; then
+    info "$TGT_ID iowait: ${IOWAIT}%"
+    if (( IOWAIT_INT > 20 )); then
+      concern "iowait on $TGT_ID is ${IOWAIT}%"
+      info "→ SKILL.md § B (Poor Filesystem Performance)"
+      NEXT+=("B")
+    fi
+  fi
+fi
+
+# ---------------------------------------------------------------------------
+# Adjacent host data points — out of scope for this skill but commonly relevant.
+# Reported as data points only; remediation is owned by sibling skills.
+# ---------------------------------------------------------------------------
+section "Adjacent data points (out of scope — see sibling skills)"
+
+# GPU thermal / ECC / NVLink / Xid — surface as concerns; routing goes to
+# hyperpod-node-debugger § G. Do NOT classify cause from a single reading.
+GPU_OUT=$(ssm_run "$SSM_TARGET" "nvidia-smi --query-gpu=index,temperature.gpu,clocks.current.sm,clocks.max.sm,pcie.link.width.current,pcie.link.width.max,ecc.errors.uncorrected.volatile.total,ecc.errors.uncorrected.aggregate.total --format=csv,noheader,nounits 2>&1 | head -16")
+
+if echo "$GPU_OUT" | grep -qiE 'command not found|no devices|NVIDIA-SMI has failed'; then
+  info "no NVIDIA GPU detected on $TGT_ID"
+else
+  HOT=0; UNCORR_VOL=0; UNCORR_AGG=0; GPUS=0; PCIE_DEGRADED=0; SM_THROTTLED=0
+  # Blanks and CRs are stripped from the whole CSV up front: SSM output can be
+  # CRLF, and a CR left on the last column (aggregate ECC) would fail the
+  # numeric test and hide errors. Only rows with a numeric index count as GPUs.
+  while IFS=',' read -r idx temp sm_cur sm_max pcie_cur pcie_max unc_vol unc_agg; do
+    [[ "$idx" =~ ^[0-9]+$ ]] || continue
+
+    GPUS=$((GPUS+1))
+    [[ "$temp" =~ ^[0-9]+$ && "$temp" -ge 88 ]] && HOT=$((HOT+1))
+    [[ "$unc_vol" =~ ^[0-9]+$ && "$unc_vol" -gt 0 ]] && UNCORR_VOL=$((UNCORR_VOL+1))
+    [[ "$unc_agg" =~ ^[0-9]+$ && "$unc_agg" -gt 0 ]] && UNCORR_AGG=$((UNCORR_AGG+1))
+    if [[ "$pcie_cur" =~ ^[0-9]+$ && "$pcie_max" =~ ^[0-9]+$ ]] && (( pcie_cur < pcie_max )); then
+      PCIE_DEGRADED=$((PCIE_DEGRADED+1))
+    fi
+    # Workload-time clock check would need correlation; skip silently when idle.
+    if [[ "$sm_cur" =~ ^[0-9]+$ && "$sm_max" =~ ^[0-9]+$ ]] && (( sm_max > 0 )) \
+       && (( sm_cur * 100 < sm_max * 50 )) && [[ "$temp" =~ ^[0-9]+$ ]] && (( temp >= 80 )); then
+      SM_THROTTLED=$((SM_THROTTLED+1))
+    fi
+  done < <(printf '%s\n' "$GPU_OUT" | tr -d ' \r')
+
+  if (( GPUS == 0 )); then
+    info "no GPU rows returned from $TGT_ID (failed SSM probe or empty nvidia-smi output); GPU checks skipped"
+  else
+    info "$GPUS GPUs visible on $TGT_ID"
+  fi
+  if (( HOT > 0 )); then
+    concern "$HOT GPU(s) at or above the H100 SXM5 software-throttle point (≥ 88°C)"
+    info "data point only — correlate with workload before drawing a conclusion"
+    info "→ hyperpod-node-debugger § G (GPU / Accelerator)"
+    NEXT+=("G")
+  fi
+  if (( PCIE_DEGRADED > 0 )); then
+    concern "$PCIE_DEGRADED GPU(s) report PCIe link width below max"
+    info "→ hyperpod-node-debugger § G (GPU / Accelerator)"
+    NEXT+=("G")
+  fi
+  if (( SM_THROTTLED > 0 )); then
+    concern "$SM_THROTTLED GPU(s) running SM clock < 50% of max while ≥ 80°C — possible thermal throttling"
+    info "→ hyperpod-node-debugger § G (GPU / Accelerator)"
+    NEXT+=("G")
+  fi
+  if (( UNCORR_VOL > 0 )); then
+    concern "$UNCORR_VOL GPU(s) report uncorrectable ECC (volatile)"
+    info "→ hyperpod-node-debugger § G (GPU / Accelerator)"
+    NEXT+=("G")
+  fi
+  if (( UNCORR_AGG > 0 )); then
+    concern "$UNCORR_AGG GPU(s) report uncorrectable ECC (aggregate / lifetime)"
+    info "→ hyperpod-node-debugger § G (GPU / Accelerator)"
+    NEXT+=("G")
+  fi
+  if (( GPUS > 0 && HOT == 0 && UNCORR_VOL == 0 && UNCORR_AGG == 0 && PCIE_DEGRADED == 0 && SM_THROTTLED == 0 )); then
+    ok "no thermal / ECC / PCIe / clock concerns visible on $TGT_ID"
+  fi
+fi
+
+# CPU frequency governor — uneven across nodes is a known straggler cause.
+GOV=$(ssm_run "$SSM_TARGET" "cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor 2>/dev/null")
+GOV=$(echo "$GOV" | tr -d '\r\n ')
+if [[ -n "$GOV" ]]; then
+  info "CPU governor on $TGT_ID: ${GOV}"
+  if [[ "$GOV" != "performance" ]]; then
+    concern "CPU governor is '${GOV}' (not 'performance') on $TGT_ID — known cause of uneven NCCL"
+    info "→ SKILL.md § A (Uneven NCCL); compare across nodes with hyperpod-version-checker"
+    NEXT+=("A")
+  fi
+fi
+
+# Recent Xid lines — surface, do NOT classify. NOTE(review): an empty reply (e.g. a failed SSM probe) also takes the "ok" branch.
+XID=$(ssm_run "$SSM_TARGET" "dmesg -T 2>/dev/null | grep -i 'Xid' | tail -5")
+if [[ -n "$XID" ]]; then
+  concern "recent Xid line(s) in dmesg on $TGT_ID — surface only; → hyperpod-node-debugger § G for the catalog"
+  echo "$XID" | sed 's/^/             /'
+  NEXT+=("G")
+else
+  ok "no Xid lines in recent dmesg"
+fi
+
+# NVLink lane status / errors — concern, don't classify. CRs are stripped so a
+# trailing CR cannot hide a non-zero counter, and a reply with no nvidia-smi
+# text (failed probe, or no nvidia-smi on the node) is "no data", not healthy.
+NVLINK=$(ssm_run "$SSM_TARGET" 'nvidia-smi nvlink -s 2>/dev/null; echo "----"; nvidia-smi nvlink -e 2>/dev/null' | tr -d '\r')
+if echo "$NVLINK" | grep -qiE 'has no supported GPU|command not found|no devices'; then
+  info "NVLink: not supported on this instance (skipped)"
+elif [[ -z "${NVLINK//[-[:space:]]/}" ]]; then
+  info "NVLink: probe returned no data from $TGT_ID (skipped)"
+else
+  INACTIVE=$(echo "$NVLINK" | awk '/^GPU/{gpu=$0; next} /[Ii]nactive/ {print gpu":"$0}' | wc -l)
+  ERR_LINES=$(echo "$NVLINK" | awk 'BEGIN{errs=0} /^GPU/{gpu=$0; next} /[Ee]rror/{for(i=1;i<=NF;i++) if($i ~ /^[0-9]+$/ && $i>0) errs++} END{print errs}')
+  if (( INACTIVE > 0 )); then
+    concern "$INACTIVE NVLink lane(s) report inactive on $TGT_ID"
+    info "→ hyperpod-node-debugger § G (GPU / Accelerator)"
+    NEXT+=("G")
+  elif (( ERR_LINES > 0 )); then
+    concern "NVLink error counters non-zero on some lanes on $TGT_ID"
+    info "→ hyperpod-node-debugger § G (GPU / Accelerator)"
+    NEXT+=("G")
+  else
+    ok "NVLink lanes active, no error counters"
+  fi
+fi
+
+# Fabric Manager — required on NVL72 UltraServers. `systemctl is-active` prints
+# the state (inactive/failed/…) and also exits non-zero when not active, so the
+# "missing" fallback is applied only when nothing at all was printed.
+if (( IS_NVL72 )); then
+  FM=$(ssm_run "$SSM_TARGET" 's=$(systemctl is-active nvidia-fabricmanager 2>/dev/null); echo "${s:-missing}"')
+  FM=$(echo "$FM" | tr -d '\r\n ')
+  case "$FM" in
+    active) ok "Fabric Manager active (required for $INSTANCE_TYPE NVLink fabric)" ;;
+    *)
+      concern "Fabric Manager state=${FM:-missing} on $INSTANCE_TYPE"
+      info "→ hyperpod-node-debugger § G (GPU / Accelerator)"
+      NEXT+=("G")
+      ;;
+  esac
+fi
+
+# /dev/shm and root-volume usage — surface, don't act
+HOST_INFO_JSON=$(ssm_json "$SSM_TARGET" '
+  shm_present=false
+  shm_size_gib=0
+  shm_used_gib=0
+  if [ -d /dev/shm ]; then
+    shm_present=true
+    read -r size_k used_k _ < <(df -k /dev/shm 2>/dev/null | awk "NR==2{print \$2, \$3}")
+    shm_size_gib=$(awk -v k="${size_k:-0}" "BEGIN{printf \"%.1f\", k/1024/1024}")
+    shm_used_gib=$(awk -v k="${used_k:-0}" "BEGIN{printf \"%.1f\", k/1024/1024}")
+  fi
+  root_pct=$(df / 2>/dev/null | awk "NR==2 {gsub(\"%\",\"\",\$5); print \$5+0}")
+  root_avail_k=$(df -k / 2>/dev/null | awk "NR==2 {print \$4}")
+  root_avail_gib=$(awk -v k="${root_avail_k:-0}" "BEGIN{printf \"%.1f\", k/1024/1024}")
+
+  jq -n \
+    --argjson shm_present   "$shm_present" \
+    --argjson shm_size_gib  "$shm_size_gib" \
+    --argjson shm_used_gib  "$shm_used_gib" \
+    --argjson root_pct      "${root_pct:-0}" \
+    --argjson root_avail_gib "$root_avail_gib" \
+    "{shm:{present:\$shm_present, size_gib:\$shm_size_gib, used_gib:\$shm_used_gib}, root:{used_pct:\$root_pct, avail_gib:\$root_avail_gib}}"
+')
+SHM_PRESENT=$(echo "$HOST_INFO_JSON" | jq -r '.shm.present | if . == null then "unknown" else . end')  # unknown = probe gave {}
+SHM_SIZE=$(echo "$HOST_INFO_JSON" | jq -r '(.shm.size_gib // 0) | . * 10 | floor / 10 | tostring | if test("\\.") then . else . + ".0" end')
+SHM_USED=$(echo "$HOST_INFO_JSON" | jq -r '(.shm.used_gib // 0) | . * 10 | floor / 10 | tostring | if test("\\.") then . else . + ".0" end')
+ROOT_PCT=$(echo "$HOST_INFO_JSON" | jq -r '.root.used_pct // empty')  # empty on a failed probe, so no fake "0%" is printed
+ROOT_AVAIL=$(echo "$HOST_INFO_JSON" | jq -r '(.root.avail_gib // 0) | . * 10 | floor / 10 | tostring | if test("\\.") then . else . + ".0" end')
+if [[ "$SHM_PRESENT" == "unknown" ]]; then
+  info "host probe returned no data from $TGT_ID (SSM session failed or jq missing on node); /dev/shm and / checks skipped"
+elif [[ "$SHM_PRESENT" != "true" ]]; then
+  concern "/dev/shm not present on host"
+  info "→ hyperpod-node-debugger § I (Resource Exhaustion) / hyperpod-nccl § 17"
+  NEXT+=("I")
+else
+  info "/dev/shm (host): ${SHM_USED} GiB used of ${SHM_SIZE} GiB"
+  if [[ "${SHM_SIZE%.*}" =~ ^[0-9]+$ ]] && (( ${SHM_SIZE%.*} < 16 )); then
+    concern "/dev/shm (host) is ${SHM_SIZE} GiB"
+    info "container view may differ (EKS emptyDir, enroot ipc-unshare); → hyperpod-node-debugger § I"
+    NEXT+=("I")
+  fi
+fi
+
+if [[ "$ROOT_PCT" =~ ^[0-9]+$ ]]; then
+  info "/ used: ${ROOT_PCT}% (${ROOT_AVAIL} GiB free of fixed 100 GiB root)"
+  if (( ROOT_PCT >= 90 )); then
+    concern "/ is ${ROOT_PCT}% full on $TGT_ID"
+    info "→ hyperpod-node-debugger § I.2 (Root Volume Exhausted)"
+    NEXT+=("I")
+  fi
+fi
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+section "Summary"
+if [[ ${#NEXT[@]} -eq 0 ]]; then
+  ok "no concerns surfaced for the in-scope perf categories"
+  info "if the customer still reports slowness, route to the matching sibling skill (hyperpod-nccl, hyperpod-node-debugger, hyperpod-version-checker)"
+else
+  mapfile -t UNIQ < <(printf '%s\n' "${NEXT[@]}" | sort -u)
+  for h in "${UNIQ[@]}"; do
+    case "$h" in
+      A) printf "  ${BOLD}see SKILL.md § A (Uneven NCCL Performance)${NC}\n" ;;
+      B) printf "  ${BOLD}see SKILL.md § B (Poor Filesystem Performance)${NC}\n" ;;
+      G) printf "  ${BOLD}see hyperpod-node-debugger § G (GPU / Accelerator) — adjacent data point${NC}\n" ;;
+      I) printf "  ${BOLD}see hyperpod-node-debugger § I (Resource Exhaustion) — adjacent data point${NC}\n" ;;
+    esac
+  done
+fi
+
+printf "\n"
+info "sampled one node: $TGT_ID (${INSTANCE_TYPE:-unknown-type}) in group $TGT_GROUP"
+info "re-run with --node <INSTANCE_ID> to target a specific node"
+info "for continuous coverage of GPU / EFA / multi-node NCCL health, enable HyperPod NodeRecovery (HMA) and OnStartDeepHealthChecks"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/SKILL.md
new file mode 100644
index 00000000..bb1fbc3c
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/SKILL.md
@@ -0,0 +1,243 @@
+---
+name: hyperpod-slurm-debugger
+description: Diagnostic-only skill for Slurm scheduler and node-daemon issues on Amazon SageMaker HyperPod Slurm clusters. Scope mirrors the HyperPod troubleshooting guide. Invoke when the user reports a Slurm node stuck in down/drain, "Node unexpectedly rebooted" after auto-repair, slurmd not running, jobs stuck PENDING with REASON=Resources while sinfo shows idle nodes, jobs stuck COMPLETING after node replacement, GRES/GPU counts wrong, scontrol ping failing, slurmctld unresponsive, an Action:Reboot/Replace request that did not trigger HyperPod auto-recovery, or auto-resume not restarting a job. Also triggers on "drain before reboot", "diagnose a Slurm node", "investigate stuck jobs."
+metadata:
+  version: "0.0.1"
+---
+
+# HyperPod Slurm Debugger
+
+Diagnostic-only. Identify and classify Slurm scheduler and node-daemon issues on
+HyperPod Slurm clusters. Do not run, recommend, or print any state-mutating command.
+For remediation, link to the official AWS or Slurm documentation.
+
+## When to invoke
+
+Invoke when the user reports any of the symptoms in the [decision table](#decision-table).
+
+## When NOT to invoke
+
+- Cluster has `Orchestrator.Eks` — invoke `hyperpod-node-debugger` or `hyperpod-nccl`.
+- Single-node hardware fault with healthy Slurm scheduler — invoke `hyperpod-node-debugger`.
+- NCCL training-hang investigation — invoke `hyperpod-nccl`.
+- Node unreachable via SSM — invoke `hyperpod-ssm`.
+
+## Constraints
+
+- Read-only. Do not run, recommend, or print state-mutating commands.
+- For any remediation, link to AWS or Slurm docs. The user authorizes and executes.
+- IaC-managed cluster (Terraform / CloudFormation / CDK): warn that direct mutation
+  drifts the live state from the IaC plan.
+
+Canonical recovery URLs:
+[references/slurm-details.md → Authoritative recovery documentation](references/slurm-details.md#authoritative-recovery-documentation).
+
+## Prerequisites
+
+- AWS CLI v2, authenticated for the target account and region with permissions:
+  - `sagemaker:DescribeCluster`, `sagemaker:ListClusterNodes`
+  - `ssm:StartSession` on the HyperPod-created SSM document
+- [Session Manager plugin](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html)
+  installed locally.
+- `jq` ≥ 1.6.
+- `unbuffer` (from the `expect` package). Required — without it `aws ssm start-session`
+  returns empty stdout intermittently with `Cannot perform start session: EOF` and every
+  check silently misreports. Install: `expect` package on Amazon Linux / RHEL / Debian /
+  Ubuntu / macOS. Script exits at prerequisite check if missing.
+
+## Procedure
+
+### Step 1 — Collect inputs
+
+Ask the user for:
+
+1. HyperPod cluster name (not Slurm partition name).
+2. AWS region.
+3. Optional: a specific Slurm node name.
+
+### Step 2 — Confirm orchestrator
+
+```bash
+aws sagemaker describe-cluster --cluster-name <NAME/ARN> --region <REGION> \
+  --query 'Orchestrator' --output json
+```
+
+If `Orchestrator.Eks` is present, stop. Route per [When NOT to invoke](#when-not-to-invoke).
+
+### Step 3 — Run the diagnostic script
+
+```bash
+bash scripts/slurm-diagnose.sh --cluster <NAME> --region <REGION>
+# Scope to a node:
+bash scripts/slurm-diagnose.sh --cluster <NAME> --region <REGION> --node <SLURM_NODE>
+```
+
+Relay the script output to the user verbatim.
+
+### Step 4 — Map findings → docs
+
+For each finding, look up the section in the [decision table](#decision-table) and link
+the user to the corresponding AWS / Slurm doc. Do not type out remediation commands.
+
+## Decision table
+
+| Symptom (`sinfo -o "%N %T %30E"` or script finding)         | Section                                                |
+| ----------------------------------------------------------- | ------------------------------------------------------ |
+| Node state = `down` or `down*`, reason other than below     | [A: Node Down](#a-node-down)                           |
+| Node state = `down*`, Reason = `Node unexpectedly rebooted` | [B: Unexpected Reboot](#b-unexpected-reboot)           |
+| Jobs `PENDING` with `REASON=Resources` while nodes are idle | [C: Controller State](#c-controller-state)             |
+| Jobs stuck `COMPLETING` after node replacement              | [C: Controller State](#c-controller-state)             |
+| `scontrol ping` returns `DOWN` for the controller           | [C: Controller State](#c-controller-state)             |
+| GRES (GPU) counts incorrect or not released                 | [C: Controller State](#c-controller-state)             |
+| `state=fail` issued but no recovery occurred                | [D: Action Reason Mismatch](#d-action-reason-mismatch) |
+| Accounting errors or RPC errors mentioning `dbd`            | [C: Controller State](#c-controller-state) (slurmdbd)  |
+| `slurm.conf` edited; new partitions or nodes not visible    | [C: Controller State](#c-controller-state) (config)    |
+| Job exited on a hardware failure but did not restart        | [E: Auto-resume](#e-auto-resume)                       |
+
+## Defaults
+
+| Behavior             | Default                                                                                            | Override                   |
+| -------------------- | -------------------------------------------------------------------------------------------------- | -------------------------- |
+| Mode                 | read-only — always; no remediation flag exists                                                     | n/a                        |
+| Region               | `$AWS_DEFAULT_REGION`, falling back to `us-east-1`                                                 | `--region <R>`             |
+| Scope                | all nodes in `down` / `drain` / `fail` / "unexpectedly rebooted"                                   | `--node <SLURM_NODE_NAME>` |
+| Output               | colorized terminal                                                                                 | `--no-color`               |
+| SSM target format    | `sagemaker-cluster:<clusterId>_<instanceGroupName>-<instanceId>` (derived)                         | n/a                        |
+| Controller discovery | `--controller-group` (if set) → `SlurmConfig.NodeType=Controller` → `provisioning_parameters.json` | `--controller-group <N>`   |
+
+## Error handling
+
+| Failure                                            | Skill behavior                         | Required user action                            |
+| -------------------------------------------------- | -------------------------------------- | ----------------------------------------------- |
+| `describe-cluster` fails                           | Print AWS error; exit 1                | Fix credentials/region; verify cluster name     |
+| Cluster has `Orchestrator.Eks`                     | Exit 1 with pointer to EKS-side skills | Use `hyperpod-node-debugger` or `hyperpod-nccl` |
+| `session-manager-plugin` missing / SSM unreachable | `sinfo` returns empty; exit 1          | Install plugin; verify node `InService`         |
+| Disk ≥ 95 % full on a `down` node                  | Report finding `disk-full-<node>`      | Refer to AWS troubleshooting docs               |
+| Missing `jq`, `aws`, or `unbuffer`                 | Exit 1 at prerequisite check           | Install per [Prerequisites](#prerequisites)     |
+
+---
+
+## A: Node Down
+
+Node is `down` because `slurmd` stopped responding. Causes: `slurmd` crash, disk full,
+OOM, network partition, hardware fault.
+
+Script checks: `systemctl is-active slurmd`, `srun -w <NODE> hostname` (RPC layer), disk,
+memory.
+
+Link: <https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md>
+
+If node returns to `down` after a manual resume → escalate to `hyperpod-node-debugger`.
+
+Context: [references/slurm-details.md § A](references/slurm-details.md#-a-node-down--diagnostic-context).
+
+---
+
+## B: Unexpected Reboot
+
+Node is `down*` with Reason `"Node unexpectedly rebooted"` because `slurmd`
+re-registered after an out-of-band reboot. Upstream Slurm behavior, not HyperPod.
+Node is typically healthy.
+
+Links:
+
+- <https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md>
+- <https://slurm.schedmd.com/scontrol.html> (`state=resume` semantics)
+
+If node reboots again within minutes → escalate to `hyperpod-node-debugger`.
+
+Context: [references/slurm-details.md § B](references/slurm-details.md#-b-unexpected-reboot--diagnostic-context).
+
+---
+
+## C: Controller State
+
+`slurmctld` in-memory state can desync from the on-disk state. A controller restart reloads from `StateSaveLocation` and clears bad caches. User decides and executes.
+
+Restart may help:
+
+| Symptom                                            | Why                                         |
+| -------------------------------------------------- | ------------------------------------------- |
+| `PENDING` with `REASON=Resources`, idle nodes      | Re-evaluates the queue                      |
+| Jobs stuck `COMPLETING` after node replacement     | Controller held a reference to the old node |
+| GRES (GPU, EFA) not released after a job ends      | Resource accounting de-synced               |
+| Nodes stuck `Unknown` after reboot, `slurmd` is up | Re-registration was not processed           |
+| `scontrol ping` times out                          | Controller event loop is hung               |
+| Lost connection to `slurmdbd` / RPC errors         | DBD connection wedged                       |
+
+Do NOT restart when:
+
+- HyperPod replacement (`Action:Replace`) in progress on any node — concurrent changes
+  fail the replacement.
+- Only one compute node is bad — restart `slurmd` on that node.
+- `sinfo` and `squeue` are responsive — problem is elsewhere.
+- `journalctl -u slurmctld` not reviewed yet — panic / OOM will reproduce.
+- `slurm.conf` was just edited — try `scontrol reconfigure` first.
+
+### Folded triggers
+
+- **slurmdbd disconnected** — `sacct` fails, accounting fields show `Unknown`,
+  controller log spams `Unable to contact slurmdbd`. Restore `slurmdbd` before
+  considering controller restart.
+  <https://slurm.schedmd.com/accounting.html> ·
+  [details](references/slurm-details.md#slurmdbd-connectivity).
+- **Stale config** — `slurm.conf` / `topology.conf` mtime > slurmctld start.
+  `scontrol reconfigure` first; restart is fallback.
+  <https://slurm.schedmd.com/scontrol.html> ·
+  [details](references/slurm-details.md#scontrol-reconfigure-vs-restart).
+
+Restart procedure / what's preserved:
+
+- <https://slurm.schedmd.com/slurmctld.html>
+- <https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md>
+
+Context: [references/slurm-details.md § C](references/slurm-details.md#-c-controller-state--diagnostic-context).
+
+---
+
+## D: Action Reason Mismatch
+
+`scontrol update state=fail reason=...` was issued with a `reason` that does not match
+`Action:Reboot` or `Action:Replace` exactly. HyperPod silently ignores anything else.
+Script detects near-misses on nodes in `fail` state.
+
+Required strings (exact match, case-sensitive; no added whitespace or trailing punctuation):
+
+- `Action:Reboot`
+- `Action:Replace`
+
+Link: <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-replace-faulty-instance.html>
+
+Context: [references/slurm-details.md § Action reason-string validation](references/slurm-details.md#action-reason-string-validation).
+
+---
+
+## E: Auto-resume
+
+`--auto-resume=1` is an `srun` step option. It re-runs the step after HMA (the Health
+Monitoring Agent) flags a node and Automatic node recovery replaces it.
+
+Why it didn't restart the job:
+
+- Flag on `sbatch` not `srun` — it is a per-step option; an `--auto-resume` `sbatch` directive is silently ignored.
+- HMA did not flag the node — failure was application/transient, not hardware. Step
+  exits as a normal Slurm failure.
+- Cluster `NodeRecovery` is `None` — faulty nodes are labeled but not replaced.
+- No checkpointing — the step does restart, but from the beginning each time, so no progress is kept.
+- AMI predates HMA support (released 2025-09-11) — needs AMI / cluster-software update.
+
+Link: <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-auto-resume.html>
+
+Context: [references/slurm-details.md § HyperPod auto-resume](references/slurm-details.md#hyperpod-auto-resume).
+
+---
+
+## Escalation
+
+| Condition                                                       | Next skill                            |
+| --------------------------------------------------------------- | ------------------------------------- |
+| Node returns to `down` shortly after a manual resume            | `hyperpod-node-debugger` (hardware)   |
+| `slurmd` logs contain CUDA / NVIDIA / XID errors                | `hyperpod-node-debugger` § G          |
+| Disk full or `/dev/shm` exhausted                               | `hyperpod-node-debugger` § I          |
+| Node unreachable via SSM                                        | `hyperpod-ssm`                        |
+| Controller restart does not clear `COMPLETING` after 2 attempts | `hyperpod-issue-report` + AWS Support |
diff --git a/plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/references/slurm-details.md b/plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/references/slurm-details.md
new file mode 100644
index 00000000..2678f4a3
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/references/slurm-details.md
@@ -0,0 +1,318 @@
+# Slurm Details
+
+Diagnostic context for `hyperpod-slurm-debugger`. Diagnostic-only — do not run,
+recommend, or print state-mutating commands. Link to AWS / Slurm docs for remediation.
+
+## Table of contents
+
+- [Authoritative recovery documentation](#authoritative-recovery-documentation)
+- [HyperPod auto-resume](#hyperpod-auto-resume)
+- [Action reason-string validation](#action-reason-string-validation)
+- [§ A: Node down — diagnostic context](#-a-node-down--diagnostic-context)
+- [§ B: Unexpected reboot — diagnostic context](#-b-unexpected-reboot--diagnostic-context)
+- [§ C: Controller state — diagnostic context](#-c-controller-state--diagnostic-context)
+  - [scontrol reconfigure vs restart](#scontrol-reconfigure-vs-restart)
+  - [slurmdbd connectivity](#slurmdbd-connectivity)
+
+---
+
+## Authoritative recovery documentation
+
+- HyperPod Slurm troubleshooting:
+  <https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md>
+- Replace a faulty Slurm instance:
+  <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-replace-faulty-instance.html>
+- HyperPod auto-resume:
+  <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-auto-resume.html>
+- `BatchRebootClusterNodes`:
+  <https://docs.aws.amazon.com/cli/latest/reference/sagemaker/batch-reboot-cluster-nodes.html>
+- `BatchReplaceClusterNodes`:
+  <https://docs.aws.amazon.com/cli/latest/reference/sagemaker/batch-replace-cluster-nodes.html>
+- `scontrol(1)`: <https://slurm.schedmd.com/scontrol.html>
+- `slurmctld(8)`: <https://slurm.schedmd.com/slurmctld.html>
+- `slurm.conf(5)`: <https://slurm.schedmd.com/slurm.conf.html>
+- Slurm accounting: <https://slurm.schedmd.com/accounting.html>
+- Slurm authentication (munge): <https://slurm.schedmd.com/authentication.html>
+
+---
+
+## HyperPod auto-resume
+
+Three separate features that compose:
+
+- **HMA (Health Monitoring Agent)** — runs hardware checks (NVIDIA SMI, Neuron sysfs,
+  EFA) continuously, independent of jobs. Marks faulty nodes for drain.
+- **Automatic node recovery** (cluster `NodeRecovery` setting; `Automatic` or `None`) —
+  when `Automatic`, replaces drained nodes after their jobs exit.
+- **`--auto-resume=1`** (`srun` step option) — re-runs the step after HMA + node
+  recovery replace a node in its allocation.
+
+**Auto-resume itself does not run health checks.** HMA does. Auto-resume reacts to
+HMA-triggered replacements. The AWS doc's "How auto-resume works" section is misleading
+on this point — the authoritative description is in the "How automatic node recovery
+and auto-resume work together" section, which states:
+_"If the HMA detects a hardware fault, the node is marked for drain regardless of
+job-level status. With node automatic recovery enabled, the nodes are automatically
+replaced once all the jobs running in the nodes exit. In this scenario, for jobs with
+auto-resume enabled, if there is a non-zero exit status in the step, the auto resume
+kicks in."_
+
+If HMA does not flag a node, auto-resume does not fire — the step exits as a normal
+Slurm failure.
+
+<https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-auto-resume.html>
+
+### Verify auto-resume ran (read-only)
+
+```bash
+# Replace events in slurmctld log:
+sudo journalctl -u slurmctld --since "2 hours ago" | grep -E 'auto.?resume|Action:Replace|replac'
+
+# Last reason and boot time on the node:
+scontrol show node <NODE> | grep -i 'reason\|boot'
+
+# Job-step events from accounting:
+sacct -j <JOBID> -o JobID,JobName,State,ExitCode,NodeList,Start,End -X
+```
+
+Same `JOBID` after `NodeList` change → auto-resume succeeded.
+
+### Why auto-resume didn't restart
+
+- **Flag on `sbatch` not `srun`** — per-step option; an `--auto-resume` `sbatch` directive is ignored.
+- **HMA did not flag the node** — auto-resume only reacts to HMA-triggered
+  replacements. Inspect `dmesg` and `journalctl -k` for hardware signals (XID, MCE,
+  PCIe AER, EFA driver errors). None → not hardware; failure was application or
+  transient and auto-resume cannot fire.
+- **Cluster `NodeRecovery` is `None`** — HMA labels faulty nodes but nothing replaces
+  them. Confirm: `aws sagemaker describe-cluster ... --query NodeRecovery`.
+- **AMI predates HMA support** (released 2025-09-11). Script flags this by checking for
+  `--auto-resume` in `srun --help`.
+- **Concurrent manual `Action:Replace`** racing with the automatic replacement.
+
+---
+
+## Action reason-string validation
+
+HyperPod auto-recovery matches the Slurm node `Reason` field exactly, case-sensitive:
+
+| Intent  | Required reason  |
+| ------- | ---------------- |
+| Reboot  | `Action:Reboot`  |
+| Replace | `Action:Replace` |
+
+Any mismatch is silently ignored. Common near-misses:
+
+- `action:replace` — wrong case
+- `Action: Reboot` — extra space after colon
+- `Action:Reboot⎵` (where `⎵` is whitespace) — trailing whitespace
+- `Action:Reboot.` — trailing punctuation
+- `Reboot` / `replace this` — wrong format
+
+Verify (read-only):
+
+```bash
+sinfo -o "%N %T %30E" | grep <NODE>
+scontrol show node <NODE> | grep -i reason
+```
+
+Canonical command form per AWS docs (do not run from this skill — operator-executed):
+
+```bash
+scontrol update node=<ip-ipv4> state=fail reason="Action:Reboot"
+scontrol update node=<ip-ipv4> state=fail reason="Action:Replace"
+```
+
+Re-issue procedure: <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-replace-faulty-instance.html>
+
+---
+
+## § A: Node down — diagnostic context
+
+`slurmd` stopped responding. Causes: `slurmd` crash/stop, disk full, OOM, network
+partition, hardware fault.
+
+### Inspection (read-only)
+
+```bash
+# Head node:
+sinfo -o "%N %T %30E" | grep -E 'down|drain'
+scontrol show node <NODE>           # Reason, LastBusyTime, Boot
+
+# Reachability per layer:
+ping -c 3 <NODE>                     # L3
+srun -w <NODE> hostname              # Slurm RPC
+ssh <NODE> true                      # SSH (if configured)
+
+# Affected node (via SSM):
+systemctl status slurmd
+journalctl -u slurmd -n 200 --no-pager
+journalctl -xe -n 100 --no-pager     # kernel errors, OOM kills
+free -h
+df -h
+df -h /dev/shm
+```
+
+### Findings → docs
+
+| Finding                         | Link                                                                |
+| ------------------------------- | ------------------------------------------------------------------- |
+| `slurmd` stopped, logs clean    | HyperPod Slurm troubleshooting (Authoritative recovery)             |
+| `slurmd` crashing, munge errors | <https://slurm.schedmd.com/authentication.html>                     |
+| Disk full                       | HyperPod storage layout (`/opt/sagemaker`, `/opt/dlami/nvme`, FSx)  |
+| OOM in `dmesg`                  | Right-size workload — AWS instance-type docs                        |
+| Kernel panic / recent reboot    | [§ B: Unexpected reboot](#-b-unexpected-reboot--diagnostic-context) |
+| GPU XID / ECC errors in `dmesg` | `hyperpod-node-debugger` § G                                        |
+
+If node returns to `down` after manual recovery → `hyperpod-node-debugger` (hardware).
+
+---
+
+## § B: Unexpected reboot — diagnostic context
+
+`slurmd` re-registered after an out-of-band reboot (kernel panic, watchdog, manual
+reboot, HyperPod auto-repair). Slurm marks the node `down*` with reason
+`Node unexpectedly rebooted` and refuses scheduling. **Upstream Slurm behavior, not
+HyperPod-specific** — protects pending jobs from landing on a node with potentially
+corrupt local state (partial checkpoints, half-written scratch).
+
+Node is usually fine. Resume procedure:
+
+- <https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md>
+- <https://slurm.schedmd.com/scontrol.html> (`state=resume` semantics)
+
+If the node loops through reboots → kernel / hardware issue. Inspect `dmesg` and
+`journalctl -b -1` (previous boot) before any further action. Route to
+`hyperpod-node-debugger`.
+
+---
+
+## § C: Controller state — diagnostic context
+
+`slurmctld` in-memory state desynced from disk-persisted state. Standard restart reloads
+from `StateSaveLocation` (typically `/var/spool/slurmctld/` on HyperPod, but
+admin-configured — confirm with `scontrol show config | grep StateSaveLocation`).
+
+### What's preserved across a restart
+
+Per [`slurmctld(8)`](https://slurm.schedmd.com/slurmctld.html), without `-c` the restart
+preserves running jobs plus node state of `DOWN`, `DRAINED`, and `DRAINING` nodes with
+their Reason field.
+
+**Recovered from `StateSaveLocation`:**
+
+- Running jobs (continue executing on compute nodes; reconnect when controller is back).
+- Pending queue (`squeue` returns the same queue).
+- `DOWN`, `DRAINED`, `DRAINING` node states + Reason field.
+- Accounting records (via `slurmdbd`).
+
+**Re-read from `slurm.conf` on startup:**
+
+- Partition definitions, `NodeName` definitions, scheduling parameters.
+
+**Reset (this is what fixes the symptoms):**
+
+- In-memory scheduling decisions and priority calculations.
+- GRES / TRES accounting caches.
+- Hung RPC connections to compute nodes.
+- Stale `REASON=Resources` on pending jobs.
+- Stuck `COMPLETING` tracking.
+
+### Pre-restart inspection (read-only)
+
+```bash
+scontrol show config | grep StateSaveLocation
+STATE=$(scontrol show config | awk -F= '/^StateSaveLocation/ {gsub(/ /,"",$2); print $2; exit}')
+sudo ls -la "$STATE"      # should have recent state files
+```
+
+If the directory is missing or empty, do NOT restart — recover state file from backup
+first. `slurmctld -c` (clean start) purges every job from the controller.
+
+Restart procedure:
+
+- <https://slurm.schedmd.com/slurmctld.html>
+- <https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md>
+
+When-to-restart vs when-not-to: see [SKILL.md § C](../SKILL.md#c-controller-state).
+
+### scontrol reconfigure vs restart
+
+`slurm.conf` / `topology.conf` / `gres.conf` was edited; controller has stale config
+in memory. Two reload paths:
+
+**`scontrol reconfigure`** — no downtime. Reloads `slurm.conf` in place. Per
+[`scontrol(1)`](https://slurm.schedmd.com/scontrol.html), cannot change daemons'
+listening TCP port or `AuthType`; changing `AuthType` requires terminating all Slurm
+daemons + commands per [`slurm.conf(5)`](https://slurm.schedmd.com/slurm.conf.html).
+
+**`systemctl restart slurmctld`** — ~5–30s scheduling pause. Required for changes that
+`scontrol reconfigure` rejects. In practice operators also restart for structural
+changes (adding/removing nodes, `NodeName` changes, topology rewrites) since
+reconfigure isn't guaranteed to apply them cleanly.
+
+Pre-reload inspection (read-only):
+
+```bash
+# HyperPod installs to /opt/slurm-<version>/etc/, not /etc/slurm/:
+CONF=$(scontrol show config | awk -F= '/^SLURM_CONF/ {gsub(/ /,"",$2); print $2; exit}')
+ls -la "$CONF"
+# After reload, watch for parse errors:
+journalctl -u slurmctld -n 50 --no-pager
+```
+
+No syntax-check flag exists for `slurmctld` or `slurmdbd`. Errors surface in
+`journalctl` after reload.
+
+`scontrol reconfigure` only reloads the controller's view. Compute nodes read their own
+copy of `slurm.conf` from disk. If the lifecycle script doesn't push `slurm.conf` to
+every node (via shared FSx mount or explicit copy step), node-side `slurmd` runs with
+stale config until restarted.
+
+### slurmdbd connectivity
+
+`slurmctld` cannot reach `slurmdbd`. Scheduler keeps running; accounting fails. Symptoms
+look like a controller hang but aren't.
+
+**Symptoms:**
+
+- `sacctmgr show stats` returns `Unable to contact slurmdbd` or `Connection refused`.
+- `sacct -j <JOBID>` returns `Sockets disabled` or no rows.
+- `journalctl -u slurmctld | grep -i dbd` shows repeated reconnect attempts.
+- New jobs complete but accounting records never appear in `sacct`.
+
+**Diagnose (read-only):**
+
+```bash
+systemctl status slurmdbd
+journalctl -u slurmdbd  -n 100 --no-pager
+journalctl -u slurmctld -n 100 --no-pager | grep -iE 'dbd|accounting'
+
+# slurmdbd.conf path — HyperPod uses /opt/slurm-<version>/etc/:
+SLURMDBD_CONF=$(find /opt/slurm*/etc /etc/slurm -name slurmdbd.conf 2>/dev/null | head -1)
+sudo grep -E 'StorageHost|StoragePort|StorageUser' "$SLURMDBD_CONF"
+
+nc -vz <StorageHost> <StoragePort>     # default port 3306
+```
+
+**Common causes:**
+
+| Cause                                         | Link                                             |
+| --------------------------------------------- | ------------------------------------------------ |
+| `slurmdbd` daemon stopped or crashed          | <https://slurm.schedmd.com/accounting.html>      |
+| MySQL / MariaDB endpoint unreachable          | Restore SG / VPC route; slurmdbd self-recovers   |
+| `slurmdbd.conf` `StoragePass` wrong / rotated | <https://slurm.schedmd.com/slurmdbd.conf.html>   |
+| Disk full on slurmdbd host                    | Daemon won't start without log-file write access |
+| Schema migration pending after Slurm upgrade  | <https://slurm.schedmd.com/upgrades.html>        |
+
+**Recovery order:**
+
+1. Restore `slurmdbd`. Running jobs are unaffected — no time pressure.
+2. Verify with `sacctmgr show stats` (rollup counters, no errors).
+3. Only then evaluate whether `slurmctld` itself needs a restart. If `slurmctld`
+   recovered the DBD connection on its own, no restart is needed. If the controller log
+   still shows stuck DBD-RPC threads, see
+   [§ C](#-c-controller-state--diagnostic-context).
+
+If the database is RDS / Aurora / managed, check snapshot windows and maintenance
+events — a brief failover can leave `slurmctld` with a wedged connection.
diff --git a/plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/scripts/slurm-diagnose.sh b/plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/scripts/slurm-diagnose.sh
new file mode 100755
index 00000000..b25bd83e
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-slurm-debugger/scripts/slurm-diagnose.sh
@@ -0,0 +1,802 @@
+#!/usr/bin/env bash
+# slurm-diagnose.sh
+#
+# Read-only diagnostic for Slurm node-management issues on Amazon SageMaker HyperPod
+# Slurm clusters. Covers the scenarios documented in the HyperPod troubleshooting guide:
+#
+#   A. Node DOWN / not responding
+#   B. Node DOWN with reason "Node unexpectedly rebooted"
+#   C. Controller state — slurmctld desync, plus the two folded triggers:
+#      C (slurmdbd): accounting daemon connectivity
+#      C (config):   pending slurm.conf reconfiguration
+#   D. Auto-recovery reason-string mismatches (Action:Reboot / Action:Replace)
+#   E. HyperPod --auto-resume support and recent missed-resume detection
+#
+# Security model:
+#   - All CLI inputs are validated against strict regexes at parse time.
+#   - All AWS-derived values (instance IDs, group names, node names) are validated before
+#     they reach any shell context — invalid values cause an immediate exit.
+#   - Remote SSM payloads are base64-encoded literals; server-derived values are
+#     prepended to the remote script as `export VAR='<jq @sh-quoted VALUE>'` lines so
+#     they are never string-interpolated into shell commands.
+#   - Local printf calls use `%s` with the data as a separate argument; format-string
+#     attacks via server values are not possible.
+#
+# The script never mutates cluster state.
+#
+# Usage:
+#   bash slurm-diagnose.sh --cluster <NAME-or-ARN> --region <REGION>
+#   bash slurm-diagnose.sh --cluster <N> --region <R> --node <SLURM_NODE>
+#   bash slurm-diagnose.sh --cluster <N> --region <R> --controller-group <NAME>
+#
+# Optional flags:
+#   --node <SLURM_NODE>       Scope inspection to a single Slurm node.
+#   --controller-group <N>    Override controller-group discovery (for self-managed
+#                             Slurm clusters where SlurmConfig is not set).
+#   --no-color                Plain output (no ANSI colors).
+
+set -euo pipefail
+
+CLUSTER=""
+REGION="${AWS_REGION:-${AWS_DEFAULT_REGION:-us-east-1}}"  # AWS_REGION wins, as in the AWS CLI; --region overrides both
+TARGET_NODE=""
+CONTROLLER_GROUP_OVERRIDE=""
+USE_COLOR=true
+
+# --- Input-validation helpers -------------------------------------------------
+# Each validator prints the value if valid, exits non-zero if not. All callsites
+# capture into a local variable; failure aborts the script via `set -e`.
+
+# AWS region: lowercase letters, digits, dashes only.
+validate_region() {
+  local v="${1-}"
+  [[ "$v" =~ ^[a-z]{2,3}-[a-z]+-[0-9]+$ ]] || { echo "Error: invalid region: $v" >&2; exit 2; }
+  printf '%s' "$v"
+}
+
+# HyperPod cluster name OR ARN. Names are 1-63 chars of [a-zA-Z0-9_-]; ARNs match the
+# documented SageMaker cluster ARN shape.
+validate_cluster() {
+  local candidate="${1-}"
+  local arn_re='^arn:aws[a-zA-Z-]*:sagemaker:[a-z0-9-]+:[0-9]{12}:cluster/[a-zA-Z0-9-]+$'
+  local name_re='^[a-zA-Z0-9_-]{1,63}$'
+  # Either shape is acceptable; anything else is rejected with exit status 2.
+  if ! [[ "$candidate" =~ $arn_re || "$candidate" =~ $name_re ]]; then
+    echo "Error: invalid cluster name/ARN: $candidate" >&2
+    exit 2
+  fi
+  printf '%s' "$candidate"
+}
+
+# Slurm node names on HyperPod follow the `ip-x-x-x-x` form, but admins may rename.
+# Allow [a-zA-Z0-9._-]+ with length 1..253; reject anything that could escape a shell.
+validate_node_name() {
+  local v="${1-}"
+  [[ "$v" =~ ^[a-zA-Z0-9._-]{1,253}$ ]] || { echo "Error: invalid node name: $v" >&2; exit 2; }
+  printf '%s' "$v"
+}
+
+# EC2 instance IDs: i- followed by 8 or 17 hex characters. Documented and stable.
+validate_instance_id() {
+  local v="${1-}"
+  [[ "$v" =~ ^i-[a-f0-9]{8}([a-f0-9]{9})?$ ]] || { echo "Error: invalid instance ID: $v" >&2; exit 2; }
+  printf '%s' "$v"
+}
+
+# Cluster ID (from ARN): lowercase alphanumeric, currently 12 chars (e.g.
+# qrmv6xhralg4). Allow 4..32 to be future-tolerant.
+validate_cluster_id() {
+  local v="${1-}"
+  [[ "$v" =~ ^[a-z0-9]{4,32}$ ]] || { echo "Error: invalid cluster ID: $v" >&2; exit 2; }
+  printf '%s' "$v"
+}
+
+# Instance group name: SageMaker allows 1..63 chars [a-zA-Z0-9_-] per the
+# CreateCluster API.
+validate_group_name() {
+  local v="${1-}"
+  [[ "$v" =~ ^[a-zA-Z0-9_-]{1,63}$ ]] || { echo "Error: invalid instance group name: $v" >&2; exit 2; }
+  printf '%s' "$v"
+}
+
+# --- Argument parsing ---------------------------------------------------------
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --cluster)
+      [[ $# -lt 2 ]] && { echo "Error: --cluster requires a value" >&2; exit 2; }
+      CLUSTER=$(validate_cluster "$2"); shift 2 ;;
+    --region)
+      [[ $# -lt 2 ]] && { echo "Error: --region requires a value" >&2; exit 2; }
+      REGION=$(validate_region "$2"); shift 2 ;;
+    --node)
+      [[ $# -lt 2 ]] && { echo "Error: --node requires a value" >&2; exit 2; }
+      TARGET_NODE=$(validate_node_name "$2"); shift 2 ;;
+    --controller-group)
+      [[ $# -lt 2 ]] && { echo "Error: --controller-group requires a value" >&2; exit 2; }
+      CONTROLLER_GROUP_OVERRIDE=$(validate_group_name "$2"); shift 2 ;;
+    --no-color) USE_COLOR=false;  shift ;;
+    -h|--help)
+      # Print every leading-comment line at the top of this file (lines 2..N until the
+      # first non-comment line). Robust against future header edits.
+      awk 'NR==1{next} /^#/{sub(/^# ?/,""); print; next} {exit}' "$0"
+      exit 0 ;;
+    --*) echo "Error: unknown flag: $1" >&2; exit 2 ;;
+    *)   echo "Error: unexpected positional argument: $1" >&2; exit 2 ;;
+  esac
+done
+
+[[ -z "$CLUSTER" ]] && { echo "Error: --cluster is required" >&2; exit 2; }
+REGION=$(validate_region "$REGION")  # validate even when sourced from env default
+
+# --- Prerequisite checks ------------------------------------------------------
+command -v aws >/dev/null 2>&1 || { echo "Error: aws CLI is required (v2 recommended)." >&2; exit 1; }
+command -v jq  >/dev/null 2>&1 || { echo "Error: jq is required. Install with your package manager." >&2; exit 1; }
+
+# `unbuffer` (from the `expect` package) attaches a PTY to aws ssm start-session, which
+# avoids a known race where session-manager-plugin closes stdout before flushing and the
+# caller sees "Cannot perform start session: EOF" with empty output. Without it, every
+# SSM command silently returns empty, causing every downstream check to misreport.
+command -v unbuffer >/dev/null 2>&1 || {
+  echo "Error: unbuffer (from the 'expect' package) is required." >&2
+  echo "       Install: sudo yum install expect | sudo apt install expect | brew install expect" >&2
+  exit 1
+}
+
+# --- Output formatting --------------------------------------------------------
+if "$USE_COLOR"; then
+  RED=$'\033[0;31m'; GREEN=$'\033[0;32m'; YELLOW=$'\033[1;33m'
+  CYAN=$'\033[0;36m'; BOLD=$'\033[1m'; NC=$'\033[0m'
+else
+  RED=''; GREEN=''; YELLOW=''; CYAN=''; BOLD=''; NC=''
+fi
+
+# All status helpers use %s with the message as a separate arg — never embed message
+# text into the format string. Strip ANSI escape sequences from incoming server data
+# so a malicious or buggy upstream cannot rewrite the operator's terminal.
+_sanitize() {
+  # Drop ANSI CSI sequences, bell and a trailing CR; $'...' hands sed literal bytes (\xNN and \r are not portable sed escapes).
+  sed -e $'s/\x1b\\[[0-9;?]*[a-zA-Z]//g' -e $'s/\x07//g' -e $'s/\r$//' <<< "${1-}"
+}
+section() { printf '\n%s%s=== %s ===%s\n' "$BOLD" "$CYAN" "$(_sanitize "$1")" "$NC"; }
+ok()    { printf '  %s[PASS]%s %s\n' "$GREEN"  "$NC" "$(_sanitize "$1")"; }
+warn()  { printf '  %s[WARN]%s %s\n' "$YELLOW" "$NC" "$(_sanitize "$1")"; }
+bad()   { printf '  %s[FAIL]%s %s\n' "$RED"    "$NC" "$(_sanitize "$1")"; }
+info()  { printf '         %s\n' "$(_sanitize "$1")"; }
+hint()  { printf '  %s[NEXT]%s %s\n' "$CYAN"   "$NC" "$(_sanitize "$1")"; }
+
+ISSUES=()
+NEXT_STEPS=()
+
+# --- Verify cluster + orchestrator --------------------------------------------
+section "1. Cluster identity"
+DESC=$(aws sagemaker describe-cluster --cluster-name "$CLUSTER" --region "$REGION" \
+  --output json 2>&1) || { bad "cannot describe cluster: $DESC"; exit 1; }
+
+ORCH=$(jq -r '.Orchestrator // {} | keys[0] // "Slurm"' <<< "$DESC")
+if [[ "$ORCH" == "Eks" ]]; then
+  bad "cluster uses EKS orchestrator - this skill is for Slurm only"
+  info "use hyperpod-node-debugger or hyperpod-nccl instead"
+  exit 1
+fi
+
+# Managed Slurm vs self-managed Slurm:
+#   - Managed: DescribeCluster.Orchestrator.Slurm is present AND the cluster was created
+#     with the SlurmConfig API parameter — InstanceGroups[].SlurmConfig.NodeType identifies
+#     controllers, login nodes, workers. AWS docs treat this as the authoritative source.
+#   - Self-managed: anything else. The customer brought their own Slurm setup via the
+#     lifecycle scripts and InstanceGroups[].SlurmConfig is empty. The controller-group
+#     name lives in /opt/ml/config/provisioning_parameters.json on every node, or the
+#     customer can pass --controller-group <NAME>.
+HAS_SLURM_CONFIG=$(jq -r '
+  any(.InstanceGroups[]?; (.SlurmConfig // {}) != {})
+' <<< "$DESC")
+CLUSTER_NAME=$(jq -r '.ClusterName // "unknown"' <<< "$DESC")
+CLUSTER_STATUS=$(jq -r '.ClusterStatus // "unknown"' <<< "$DESC")
+if [[ "$HAS_SLURM_CONFIG" == "true" ]]; then
+  ok "Managed Slurm cluster: $CLUSTER_NAME  status=$CLUSTER_STATUS"
+else
+  ok "Self-managed Slurm cluster: $CLUSTER_NAME  status=$CLUSTER_STATUS"
+fi
+
+# Cluster ID from ARN. Validate before it gets embedded into SSM target strings.
+CLUSTER_ID=$(jq -r '.ClusterArn // "" | split("/") | last' <<< "$DESC")
+[[ -n "$CLUSTER_ID" ]] || { bad "cannot extract cluster ID from ARN"; exit 1; }
+CLUSTER_ID=$(validate_cluster_id "$CLUSTER_ID")
+
+# --- SSM remote-execution helper ----------------------------------------------
+#
+# `ssm_run` runs a command on a HyperPod node via SSM (read-only).
+#
+# Design notes:
+#   1. The remote script is base64-encoded locally and decoded remotely. The agent's
+#      command parameter is a fixed `sh -c "echo <BASE64> | base64 -d | bash"`; the
+#      base64 string contains only [A-Za-z0-9+/=] and is safe inside double quotes.
+#      Nothing from the script's caller appears unescaped in the SSM-agent's argv.
+#   2. Server-derived values that need to be visible to the remote script are passed
+#      as named environment variables (`VAR=VALUE` trailing args). Each value is run
+#      through `jq @sh` (single-quoted shell-safe encoding with `'\''` escapes) and
+#      prepended to the remote script as `export VAR='<safely-quoted>'; ...`. The remote
+#      shell reads them as `$NODE`, `$NODELIST`, etc. — values never reach a remote
+#      shell-eval context as raw interpolated text.
+#   3. `unbuffer` is required to defeat the SSM "Cannot perform start session: EOF"
+#      race; the prerequisite check above guarantees it's present.
+#   4. Returns the underlying aws-cli exit code so callers can distinguish transport
+#      failures from successful empty output.
+#
+# Usage:
+#   ssm_run TARGET REMOTE_SCRIPT [VAR=VALUE ...]
+ssm_run() {
+  local target="$1"; shift
+  local script="$1"; shift
+  local export_block="" raw_kv key val safe_val
+  for raw_kv in "$@"; do
+    [[ "$raw_kv" =~ ^([A-Za-z_][A-Za-z0-9_]*)=(.*)$ ]] || {
+      echo "ssm_run: invalid VAR=VALUE: $raw_kv" >&2
+      return 2
+    }
+    key="${BASH_REMATCH[1]}"
+    val="${BASH_REMATCH[2]}"
+    # jq's @sh produces single-quoted shell-safe text with embedded `'\''` escapes.
+    safe_val=$(jq -nr --arg v "$val" '$v | @sh')
+    export_block+="export ${key}=${safe_val}; "
+  done
+  local full_script="${export_block}${script}"
+  local b64
+  # Encode without line wrapping. The no-wrap flag is spelled -w0 on GNU base64 and
+  # -b0 on BSD/macOS, so rather than probing `base64 --help` output, simply delete
+  # the newlines: base64 text contains none except those added by wrapping, so the
+  # result is identical on every platform.
+  b64=$(printf '%s' "$full_script" | base64 | tr -d '\n')
+  local wrapper="sh -c \"echo $b64 | base64 -d | bash\""
+  local params
+  params=$(jq -nc --arg c "$wrapper" '{command: [$c]}')
+  local out rc=0
+  out=$(unbuffer aws ssm start-session --region "$REGION" --target "$target" \
+        --document-name AWS-StartNonInteractiveCommand \
+        --parameters "$params" 2>&1) || rc=$?
+  # NOTE: do NOT strip 'Cannot perform start session' here — ssm_transport_failed()
+  # keys on that line. Only session chrome, ANSI escapes and trailing CRs are removed;
+  # $'...' hands sed literal ESC/CR bytes because \x1b and \r are not portable in sed.
+  printf '%s' "$out" \
+    | sed -e $'s/\x1b\\[[0-9;?]*[a-zA-Z]//g' -e $'s/\r$//' \
+          -e '/^Starting session/d' \
+          -e '/^Exiting session/d'
+  return "$rc"
+}
+
+# Returns 0 if the SSM raw output indicates a transport-layer failure (no command
+# output, session refused, EOF before flush) — distinct from "command ran and returned
+# nothing." Used to bail out early rather than misreport every downstream check.
+ssm_transport_failed() {
+  local signals='Cannot perform start session|TargetNotConnected|InvalidTarget|AccessDeniedException|UnauthorizedOperation'
+  grep -qiE -- "$signals" <<< "${1-}"
+}
+
+# --- Find controller node -----------------------------------------------------
+NODES_JSON=$(aws sagemaker list-cluster-nodes --cluster-name "$CLUSTER" --region "$REGION" \
+  --output json 2>&1) || { bad "list-cluster-nodes failed: $NODES_JSON"; exit 1; }
+
+# Discovery priority:
+#   1. --controller-group <NAME>          (operator override — always wins)
+#   2. InstanceGroups[].SlurmConfig.NodeType == "Controller"   (managed-Slurm authoritative)
+#   3. /opt/ml/config/provisioning_parameters.json on a probe node   (self-managed fallback)
+#   4. Refuse to guess — print available groups and exit.
+# We never guess based on instance-group naming — that's a lifecycle-script convention,
+# not a guarantee, and getting it wrong sends every command to a non-controller.
+CONTROLLER_GROUP=""
+CONTROLLER_DISCOVERY_METHOD=""
+
+# (1) Operator override — always wins.
+if [[ -n "$CONTROLLER_GROUP_OVERRIDE" ]]; then
+  CONTROLLER_GROUP="$CONTROLLER_GROUP_OVERRIDE"
+  CONTROLLER_DISCOVERY_METHOD="--controller-group flag"
+fi
+
+# (2) Managed-Slurm authoritative source.
+if [[ -z "$CONTROLLER_GROUP" && "$HAS_SLURM_CONFIG" == "true" ]]; then
+  CONTROLLER_GROUP=$(jq -r '
+    .InstanceGroups[]?
+    | select((.SlurmConfig.NodeType // "") == "Controller")
+    | .InstanceGroupName' <<< "$DESC" | head -1)
+  if [[ -n "$CONTROLLER_GROUP" ]]; then
+    CONTROLLER_DISCOVERY_METHOD="DescribeCluster.SlurmConfig"
+  fi
+fi
+
+# (3) Self-managed: read provisioning_parameters.json from any node.
+# The lifecycle-script convention is that this file is dropped at the same path on every
+# node, so we pick any node arbitrarily, SSM in, and read the controller_group field.
+if [[ -z "$CONTROLLER_GROUP" ]]; then
+  PROBE_ID=$(jq -r '.ClusterNodeSummaries[0].InstanceId // ""' <<< "$NODES_JSON")
+  PROBE_GROUP=$(jq -r '.ClusterNodeSummaries[0].InstanceGroupName // ""' <<< "$NODES_JSON")
+  if [[ -n "$PROBE_ID" && -n "$PROBE_GROUP" ]]; then
+    PROBE_ID_V=$(validate_instance_id "$PROBE_ID")
+    PROBE_GROUP_V=$(validate_group_name "$PROBE_GROUP")
+    PROBE_TARGET="sagemaker-cluster:${CLUSTER_ID}_${PROBE_GROUP_V}-${PROBE_ID_V}"
+    # Field name varies between lifecycle-script generations — try both.
+    PROV_GROUP=$(ssm_run "$PROBE_TARGET" \
+      'jq -r ".controller_group // .ControllerGroup // empty" /opt/ml/config/provisioning_parameters.json 2>/dev/null' \
+      2>/dev/null | tr -d '\r\n' || true)
+    if [[ -n "$PROV_GROUP" ]]; then
+      MATCHED=$(jq -r --arg g "$PROV_GROUP" \
+        '[.ClusterNodeSummaries[]? | select(.InstanceGroupName == $g)] | length' <<< "$NODES_JSON")
+      if [[ "$MATCHED" -gt 0 ]]; then
+        CONTROLLER_GROUP="$PROV_GROUP"
+        CONTROLLER_DISCOVERY_METHOD="provisioning_parameters.json on $PROBE_ID_V"
+      fi
+    fi
+  fi
+fi
+
+# (4) Out of options — refuse to guess. Tell the operator how to unblock.
+if [[ -z "$CONTROLLER_GROUP" ]]; then
+  bad "cannot identify the Slurm controller instance group"
+  if [[ "$HAS_SLURM_CONFIG" == "true" ]]; then
+    info "no InstanceGroup has SlurmConfig.NodeType=Controller in DescribeCluster output"
+    info "this is unexpected for a managed-Slurm cluster — verify the cluster was"
+    info "created with the SlurmConfig parameter, or pass --controller-group <NAME>."
+  else
+    info "self-managed Slurm cluster — provisioning_parameters.json was not readable"
+    info "from a probe node, and no --controller-group flag was provided."
+    info ""
+    info "Resolve by either:"
+    info "  1. inspecting the head node manually:"
+    info "       aws ssm start-session --target ${PROBE_TARGET:-<SSM_TARGET_OF_ANY_NODE>} --region $REGION"  # PROBE_TARGET is unset when no node was listed; set -u would abort here
+    info "       cat /opt/ml/config/provisioning_parameters.json | jq ."
+    info "  2. re-running with the controller group's name:"
+    info "       --controller-group <INSTANCE_GROUP_NAME>"
+    info ""
+    info "Available instance groups in this cluster:"
+    jq -r '.ClusterNodeSummaries[]? | "  - " + .InstanceGroupName + "  (" + .InstanceId + ")"' \
+      <<< "$NODES_JSON" | sort -u
+  fi
+  exit 1
+fi
+CONTROLLER_GROUP=$(validate_group_name "$CONTROLLER_GROUP")
+
+# Pick the first node from the controller group.
+CONTROLLER_ID=$(jq -r --arg g "$CONTROLLER_GROUP" \
+  '.ClusterNodeSummaries[]? | select(.InstanceGroupName == $g) | .InstanceId' <<< "$NODES_JSON" | head -1)
+[[ -n "$CONTROLLER_ID" ]] || { bad "controller group $CONTROLLER_GROUP has no nodes"; exit 1; }
+CONTROLLER_ID=$(validate_instance_id "$CONTROLLER_ID")
+
+ok "controller node: $CONTROLLER_ID (group=$CONTROLLER_GROUP, source=$CONTROLLER_DISCOVERY_METHOD)"
+
+SSM_HEAD="sagemaker-cluster:${CLUSTER_ID}_${CONTROLLER_GROUP}-${CONTROLLER_ID}"
+
+# --- Collect Slurm state from head node ---------------------------------------
+section "2. Slurm cluster state (from head node)"
+SSM_PROBE=$(ssm_run "$SSM_HEAD" 'echo SSM_OK' || true)
+if ! grep -q '^SSM_OK$' <<< "$SSM_PROBE"; then
+  bad "cannot reach head node via SSM — every downstream check would be unreliable"
+  if ssm_transport_failed "$SSM_PROBE"; then
+    info "  transport error detected (TargetNotConnected, AccessDenied, or EOF race)"
+  fi
+  info "  reproduce manually with the same target and region:"
+  info "    aws ssm start-session --target $SSM_HEAD --region $REGION"
+  info "  if that fails, route to the hyperpod-ssm skill before retrying."
+  exit 1
+fi
+ok "SSM transport to head node working"
+
+SINFO_OUT=$(ssm_run "$SSM_HEAD" 'sinfo -h -o "%N|%T|%E" 2>&1 | head -200' || true)
+if [[ $(printf '%s\n' "$SINFO_OUT" | wc -l) -ge 200 ]]; then
+  warn "sinfo output reached the 200-line cap — node-state results may be truncated on this large cluster"
+fi
+if grep -qi 'command not found' <<< "$SINFO_OUT"; then
+  bad "sinfo not installed on head node — Slurm lifecycle script may not have run"
+  info "verify on the node:  systemctl status slurmctld; ls /opt/slurm*/etc /etc/slurm 2>/dev/null"
+  exit 1
+fi
+if [[ -z "$SINFO_OUT" ]]; then
+  warn "sinfo returned no rows — empty cluster, or controller not yet responding"
+fi
+
+# Parse sinfo lines. Node names from sinfo are server-controlled; validate before they
+# can be embedded into any later command. Values that fail validation are dropped, not
+# trusted; we report the count of skipped entries so the operator notices.
+DOWN_NODES=()
+REBOOT_NODES=()
+FAIL_NODES=()
+BAD_REASON_NODES=()
+SKIPPED_INVALID=0
+while IFS='|' read -r node state reason; do
+  [[ -z "$node" ]] && continue
+  if ! [[ "$node" =~ ^[a-zA-Z0-9._-]{1,253}$ ]]; then
+    SKIPPED_INVALID=$((SKIPPED_INVALID+1))
+    continue
+  fi
+  # Reasons can contain spaces and punctuation; allow them but strip ANSI/control chars.
+  reason="$(_sanitize "$reason")"
+  if grep -qi 'fail' <<< "$state"; then
+    if [[ "$reason" =~ ^Action:(Reboot|Replace)$ ]]; then
+      FAIL_NODES+=("$node|$reason")
+    elif grep -qiE 'action[ :_-]*re(boot|place)|reboot|replace' <<< "$reason"; then
+      BAD_REASON_NODES+=("$node|$reason")
+    fi
+  fi
+  if grep -qiE 'down|drain' <<< "$state"; then
+    if grep -qi 'unexpectedly rebooted' <<< "$reason"; then
+      REBOOT_NODES+=("$node")
+    else
+      DOWN_NODES+=("$node|$reason")
+    fi
+  fi
+done <<< "$SINFO_OUT"
+[[ "$SKIPPED_INVALID" -gt 0 ]] && warn "$SKIPPED_INVALID sinfo row(s) had invalid node names and were ignored"
+
+if [[ ${#DOWN_NODES[@]} -eq 0 && ${#REBOOT_NODES[@]} -eq 0 && ${#FAIL_NODES[@]} -eq 0 && ${#BAD_REASON_NODES[@]} -eq 0 ]]; then
+  ok "all nodes in healthy Slurm states"
+else
+  [[ ${#DOWN_NODES[@]}       -gt 0 ]] && bad   "${#DOWN_NODES[@]} node(s) DOWN/DRAIN (Section A)"
+  [[ ${#REBOOT_NODES[@]}     -gt 0 ]] && bad   "${#REBOOT_NODES[@]} node(s) with 'unexpectedly rebooted' (Section B)"
+  [[ ${#FAIL_NODES[@]}       -gt 0 ]] && warn  "${#FAIL_NODES[@]} node(s) in fail state with valid Action:* reason (HyperPod recovery in progress)"
+  [[ ${#BAD_REASON_NODES[@]} -gt 0 ]] && bad   "${#BAD_REASON_NODES[@]} node(s) in fail state with non-matching reason (Section D)"
+fi
+
+# --- Section D: Action:* reason-string validation -----------------------------
+if [[ ${#BAD_REASON_NODES[@]} -gt 0 ]]; then
+  section "D. Reason-string mismatch — HyperPod auto-recovery will NOT trigger"
+  for entry in "${BAD_REASON_NODES[@]}"; do
+    n="${entry%%|*}"; r="${entry#*|}"
+    bad "$n: reason='$r'"
+  done
+  info "the reason field must match exactly: Action:Reboot  or  Action:Replace"
+  info "(case-sensitive, no spaces, no trailing punctuation)"
+  hint "for re-issue procedure, see:"
+  info "  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-replace-faulty-instance.html"
+  info "  references/slurm-details.md#action-reason-string-validation"
+  ISSUES+=("bad-action-reason")
+  NEXT_STEPS+=("see AWS replace-faulty-instance docs (link above)")
+fi
+
+# --- Detect in-progress HyperPod replacements (informational) -----------------
+if [[ ${#FAIL_NODES[@]} -gt 0 ]]; then
+  section "  HyperPod recovery in progress (do not interfere)"
+  for entry in "${FAIL_NODES[@]}"; do
+    n="${entry%%|*}"; r="${entry#*|}"
+    info "$n ($r)"
+  done
+  info "AWS docs: do NOT change node state or restart slurmctld until this completes."
+  info "If a replacement seems stuck > 30 min, see:"
+  info "  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-replace-faulty-instance.html"
+fi
+
+# --- Check controller health --------------------------------------------------
+section "3. slurmctld health"
+PING_OUT=$(ssm_run "$SSM_HEAD" 'scontrol ping 2>&1' || true)
+PING_FIRST_LINE=$(head -1 <<< "$PING_OUT" | tr -d '\r')
+if grep -qw 'UP' <<< "$PING_OUT"; then  # whole word, case-sensitive: "group" / "lookup" / "PowerUser" must not count as UP
+  ok "slurmctld responding: $(tr '\n' ' ' <<< "$PING_OUT")"
+elif [[ -z "$PING_OUT" ]] || ssm_transport_failed "$PING_OUT"; then
+  warn "could not get a response from scontrol ping — cannot determine controller health"
+  info "this is most likely an SSM transport problem, not a hung controller"
+  info "do NOT restart slurmctld based on this finding alone"
+elif grep -qw 'DOWN' <<< "$PING_OUT"; then
+  bad "slurmctld reports DOWN: $PING_FIRST_LINE"
+  ISSUES+=("controller-hung")
+  NEXT_STEPS+=("controller restart — see references/slurm-details.md#-c-controller-state--diagnostic-context")
+else
+  bad "slurmctld responded with an unrecognized status: $PING_FIRST_LINE"
+  ISSUES+=("controller-hung")
+  NEXT_STEPS+=("inspect logs first; controller restart only if logs confirm a hang")
+fi
+
+# --- Section C-1: slurmdbd connectivity (controller-state restart trigger) ---
+section "C (slurmdbd): accounting daemon connectivity"
+DBD_OUT=$(ssm_run "$SSM_HEAD" 'sacctmgr -i show stats 2>&1 | head -20' || true)
+if grep -qiE 'unable to contact|connection refused|cannot connect|no slurmdbd' <<< "$DBD_OUT"; then
+  bad "slurmctld cannot reach slurmdbd"
+  info "$(head -3 <<< "$DBD_OUT")"
+  hint "diagnostic and recovery procedure:"
+  info "  https://slurm.schedmd.com/accounting.html"
+  info "  references/slurm-details.md#slurmdbd-connectivity"
+  ISSUES+=("slurmdbd-disconnected")
+  NEXT_STEPS+=("restore slurmdbd connectivity (see AWS / Slurm docs linked above)")
+elif grep -qiE 'rollup|rpc' <<< "$DBD_OUT"; then
+  ok "slurmdbd reachable"
+else
+  warn "could not determine slurmdbd state from sacctmgr output"
+  info "if accounting is configured, run on the head node: sacctmgr show stats"
+fi
+
+# --- Section C-2: pending slurm.conf reconfiguration (controller-state restart trigger) ---
+# HyperPod's slurm.conf lives at /opt/slurm-<version>/etc/slurm.conf rather than the
+# upstream /etc/slurm/slurm.conf, so the remote script asks scontrol where the live
+# config is. The output is a `<conf-mtime>|<ctld-start>|<conf-path>` line that we
+# match strictly with a regex before parsing.
+section "C (config): slurm.conf freshness"
+read -r -d '' F_REMOTE <<'REMOTE_F' || true
+set -e
+# nosemgrep: bash.lang.correctness.unquoted-expansion.unquoted-variable-expansion-in-command
+_CONF="$(scontrol show config 2>/dev/null | awk -F= '/^SLURM_CONF/ {gsub(/ /,"",$2); print $2; exit}')"
+CONF_MTIME=0
+if [ -n "$_CONF" ] && [ -r "$_CONF" ]; then
+  CONF_MTIME=$(stat -c %Y "$_CONF" 2>/dev/null || echo 0)
+fi
+CTLD_TS=$(systemctl show slurmctld -p ActiveEnterTimestamp --value 2>/dev/null || true)
+CTLD_START=0
+if [ -n "$CTLD_TS" ]; then
+  CTLD_START=$(date -d "$CTLD_TS" +%s 2>/dev/null || echo 0)
+fi
+printf 'F_RESULT|%s|%s|%s\n' "${CONF_MTIME}" "${CTLD_START}" "${_CONF}"
+REMOTE_F
+F_LINE=$(ssm_run "$SSM_HEAD" "$F_REMOTE" 2>/dev/null | grep -E '^F_RESULT\|[0-9]+\|[0-9]+\|' | head -1 || true)
+if [[ "$F_LINE" =~ ^F_RESULT\|([0-9]+)\|([0-9]+)\|(.*)$ ]]; then
+  CONF_MTIME="${BASH_REMATCH[1]}"
+  CTLD_START="${BASH_REMATCH[2]}"
+  CONF_PATH="${BASH_REMATCH[3]}"
+  # CONF_PATH is echoed into operator-facing text, so reject anything but a plain absolute path.
+  if ! [[ "$CONF_PATH" =~ ^/[A-Za-z0-9._/-]+$ ]]; then
+    warn "slurm.conf path returned by remote did not validate; skipping freshness check"
+  elif [[ "$CONF_MTIME" -eq 0 || "$CTLD_START" -eq 0 ]]; then  # remote reports 0 for a timestamp it could not read: unknown, not fresh
+    warn "could not determine slurm.conf vs slurmctld timestamps"
+  elif [[ "$CONF_MTIME" -gt "$CTLD_START" ]]; then
+    DELTA=$((CONF_MTIME - CTLD_START))
+    warn "$CONF_PATH modified ${DELTA}s after slurmctld last started — config may be stale in memory"
+    hint "for the reload-vs-restart decision and procedure, see:"
+    info "  https://slurm.schedmd.com/scontrol.html"
+    info "  https://slurm.schedmd.com/slurm.conf.html"
+    info "  references/slurm-details.md#scontrol-reconfigure-vs-restart"
+    ISSUES+=("stale-conf"); NEXT_STEPS+=("review reload procedure in linked docs")
+  else
+    ok "slurm.conf older than slurmctld start time — no pending reconfigure"
+  fi
+else
+  warn "could not determine slurm.conf vs slurmctld timestamps"
+fi
+
+# --- Check for stuck jobs -----------------------------------------------------
+section "4. Job queue health"
+SQUEUE_OUT=$(ssm_run "$SSM_HEAD" 'squeue -h -o "%i|%T|%r" 2>&1 | head -200' || true)
+if [[ $(printf '%s\n' "$SQUEUE_OUT" | wc -l) -ge 200 ]]; then
+  warn "squeue output reached the 200-line cap — stuck-job counts below may underreport on this large cluster"
+fi
+STUCK_PENDING=0
+STUCK_COMPLETING=0
+while IFS='|' read -r jobid state reason; do
+  [[ -z "$jobid" ]] && continue
+  [[ "$state" == "PENDING" && "$reason" == "Resources" ]] && STUCK_PENDING=$((STUCK_PENDING+1))
+  [[ "$state" == "COMPLETING" ]] && STUCK_COMPLETING=$((STUCK_COMPLETING+1))
+done <<< "$SQUEUE_OUT"
+
+if [[ $STUCK_PENDING -gt 0 ]]; then
+  warn "$STUCK_PENDING job(s) PENDING with Reason=Resources"
+  if [[ ${#DOWN_NODES[@]} -eq 0 ]]; then
+    ISSUES+=("stuck-pending-with-idle-nodes")
+    NEXT_STEPS+=("controller restart — Section C")
+  fi
+fi
+if [[ $STUCK_COMPLETING -gt 0 ]]; then
+  bad "$STUCK_COMPLETING job(s) stuck in COMPLETING"
+  ISSUES+=("stuck-completing")
+  NEXT_STEPS+=("controller restart — Section C")
+fi
+[[ $STUCK_PENDING -eq 0 && $STUCK_COMPLETING -eq 0 ]] && ok "no stuck jobs"
+
+# --- Per-node inspection (read-only) ------------------------------------------
+inspect_node() {
+  local slurm_node="$1"
+  # Read-only health probe of one Slurm node ($1): slurmd state, root-disk usage, memory, srun reachability.
+  # The name is re-validated here as defense-in-depth even though all upstream paths validate.
+  slurm_node=$(validate_node_name "$slurm_node")
+
+  local instance_id group ssm_target
+  # PrivateDnsName looks like `ip-10-1-2-3.us-west-2.compute.internal`. The strict
+  # `<name>.` match handles the default `ip-x-x-x-x` form and rejects the false
+  # positive where node `ip-10-1-2-3` would otherwise also match
+  # `ip-10-1-2-30.<region>.compute.internal`.
+  instance_id=$(jq -r --arg dns "$slurm_node" '
+    .ClusterNodeSummaries[]?
+    | select((.PrivateDnsName // "") | startswith($dns + "."))
+    | .InstanceId' <<< "$NODES_JSON" | head -1)
+  if [[ -z "$instance_id" ]]; then
+    if [[ ! "$slurm_node" =~ ^ip-[0-9]+-[0-9]+-[0-9]+-[0-9]+$ ]]; then
+      warn "$slurm_node: not in the default ip-X-X-X-X form — cannot auto-map this Slurm node name to an instance ID from here. Look up its address via 'scontrol show node $slurm_node | grep NodeAddr' on the controller and match it against the list-cluster-nodes output."
+    else
+      warn "$slurm_node: cannot map to instance ID (PrivateDnsName mismatch — verify node is in this cluster)"
+    fi
+    return
+  fi
+  instance_id=$(validate_instance_id "$instance_id")
+
+  group=$(jq -r --arg id "$instance_id" \
+    '.ClusterNodeSummaries[] | select(.InstanceId==$id) | .InstanceGroupName // ""' <<< "$NODES_JSON")
+  group=$(validate_group_name "$group")
+  ssm_target="sagemaker-cluster:${CLUSTER_ID}_${group}-${instance_id}"
+
+  local slurmd_status disk mem rpc_check
+  slurmd_status=$(ssm_run "$ssm_target" 'systemctl is-active slurmd 2>&1' | tr -d '\r\n' || true)
+  disk=$(ssm_run         "$ssm_target" 'df -h / | awk "NR==2 {print \$5}"' | tr -d '\r\n' || true)
+  mem=$(ssm_run          "$ssm_target" 'free -h | awk "/Mem:/ {print \$3\"/\"\$2}"' | tr -d '\r\n' || true)
+
+  # Slurm-RPC reachability: srun -w "$NODE" hostname. The remote script reads $NODE
+  # from the environment, so the slurm node name is never string-interpolated into
+  # the remote shell — it lives in env-var space the whole way.
+  rpc_check=$(ssm_run "$SSM_HEAD" 'timeout 10 srun --immediate=5 -w "$NODE" hostname 2>&1 | tail -1' \
+              "NODE=$slurm_node" | tr -d '\r\n' || true)
+
+  info "$slurm_node ($instance_id): slurmd=$slurmd_status disk=$disk mem=$mem"
+  info "  srun RPC: ${rpc_check:-<no output>}"
+
+  local disk_num="${disk%\%}"
+  if [[ "$disk_num" =~ ^[0-9]+$ && "$disk_num" -ge 95 ]]; then
+    bad "  $slurm_node: root volume ${disk} — clean up before any restart"
+    info "  HyperPod storage layout: https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md"
+    ISSUES+=("disk-full-$slurm_node")
+    NEXT_STEPS+=("clean disk on $slurm_node before recovery")
+  fi
+  if [[ "$slurmd_status" != "active" ]]; then
+    bad "  $slurm_node: slurmd is '$slurmd_status'"
+    info "  for recovery procedure, see:"
+    info "    https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md"
+  fi
+  if [[ -n "$rpc_check" ]] && grep -qiE 'auth|munge|invalid' <<< "$rpc_check"; then
+    bad "  $slurm_node: srun reports auth/munge error — slurmd-controller trust broken"
+    info "  for munge troubleshooting, see Slurm authentication docs:"
+    info "    https://slurm.schedmd.com/authentication.html"
+  fi
+}
+
+if [[ -n "$TARGET_NODE" ]]; then
+  section "5. Inspecting node: $TARGET_NODE"
+  inspect_node "$TARGET_NODE"
+elif [[ ${#DOWN_NODES[@]} -gt 0 || ${#REBOOT_NODES[@]} -gt 0 ]]; then
+  section "5. Inspecting affected nodes"
+  for entry in "${DOWN_NODES[@]-}"; do
+    [[ -z "$entry" ]] && continue
+    inspect_node "${entry%%|*}"
+  done
+  for n in "${REBOOT_NODES[@]-}"; do
+    [[ -z "$n" ]] && continue
+    inspect_node "$n"
+  done
+fi
+
+# --- Section E: HyperPod auto-resume support + recent missed-resume detection ---
+section "E. Auto-resume support"
+
+AR_HELP=$(ssm_run "$SSM_HEAD" 'srun --help 2>&1 | grep -i auto-resume | head -3' || true)
+if [[ -n "$AR_HELP" ]]; then
+  ok "srun --auto-resume is available on this cluster"
+else
+  warn "srun --auto-resume not found in srun --help output"
+  info "this AMI / Slurm build may predate HyperPod auto-resume support"
+  info "see: references/slurm-details.md#hyperpod-auto-resume"
+  ISSUES+=("auto-resume-unsupported")
+  NEXT_STEPS+=("upgrade the cluster AMI / Slurm package to enable --auto-resume")
+fi
+
+read -r -d '' G_FAILS <<'REMOTE_G' || true
+sacct -X -n --starttime=now-6hours \
+  -o JobID,State,ExitCode,NodeList \
+  --state=NODE_FAIL,FAILED 2>/dev/null \
+  | awk 'NF>=4 && $4!~/None/ {print $1"|"$2"|"$4}' | head -50
+REMOTE_G
+RECENT_FAILS=$(ssm_run "$SSM_HEAD" "$G_FAILS" 2>/dev/null || true)
+
+# Missed auto-resume heuristic: collect recent single-node NODE_FAIL/FAILED jobs
+# whose node has since rebooted (BootTime within the last 6h) into MISSED_AR.
+MISSED_AR=()
+NOW_EPOCH=$(date +%s)
+while IFS='|' read -r jobid state nodelist; do
+  [[ -z "$jobid" ]] && continue
+  # Only single-node failures — multi-node lists need a real range expander.
+  [[ "$nodelist" == *,* || "$nodelist" == *\[* ]] && continue
+  # Validate before passing to remote.
+  [[ "$nodelist" =~ ^[a-zA-Z0-9._-]{1,253}$ ]] || continue
+  # A successful HyperPod replace clears the node's Reason field once the new instance
+  # registers, so grepping for "Action:Replace" is unreliable. Instead, flag the job when
+  # the node's current BootTime is within the last 6h (replaced or rebooted recently).
+  # BootTime is not compared to the job's End time; it is parsed with GNU date, falling
+  # back to BSD/macOS date, in the local timezone (assumed to match the head node's).
+  BOOT_LINE=$(ssm_run "$SSM_HEAD" 'scontrol show node "$NODE" 2>/dev/null | tr " " "\n" | grep "^BootTime="' \
+              "NODE=$nodelist" | head -1 | tr -d '\r\n' || true)
+  BOOT_STR="${BOOT_LINE#BootTime=}"
+  [[ -z "$BOOT_STR" || "$BOOT_STR" == "Unknown" ]] && continue
+  BOOT_EPOCH=$(date -d "$BOOT_STR" +%s 2>/dev/null || date -j -f '%Y-%m-%dT%H:%M:%S' "$BOOT_STR" +%s 2>/dev/null || echo 0)
+  [[ "$BOOT_EPOCH" =~ ^[0-9]+$ && "$BOOT_EPOCH" -gt 0 ]] || continue
+  AGE=$((NOW_EPOCH - BOOT_EPOCH))
+  if [[ $AGE -ge 0 && $AGE -le 21600 ]]; then  # 6h window
+    MISSED_AR+=("$jobid|$state|$nodelist|$BOOT_STR")
+  fi
+done <<< "$RECENT_FAILS"
+
+if [[ ${#MISSED_AR[@]} -gt 0 ]]; then
+  bad "${#MISSED_AR[@]} recent job(s) failed on a node that was rebooted/replaced shortly after — possible missed auto-resume:"
+  for entry in "${MISSED_AR[@]}"; do
+    IFS='|' read -r jobid state nodelist boot <<< "$entry"
+    info "  job $jobid ($state) on $nodelist (node BootTime=$boot)"
+  done
+  info "(heuristic: node BootTime is within the last 6h, suggesting a replace or reboot)"
+  hint "verify the launch command used srun --auto-resume=1 (NOT just sbatch):"
+  info "  sacct -j <JOBID> -o JobID,JobName,Submit,Start,End,State,ExitCode,NodeList -X"
+  info "  scontrol show job <JOBID>   # only if still in the controller's recent history"
+  info "see: references/slurm-details.md#hyperpod-auto-resume"
+  ISSUES+=("missed-auto-resume")
+  NEXT_STEPS+=("verify --auto-resume=1 is on the srun line, not just sbatch")
+elif [[ -n "$RECENT_FAILS" ]]; then
+  ok "recent failed jobs do not match the missed-auto-resume pattern"
+else
+  ok "no recent NODE_FAIL / FAILED jobs in the last 6h"
+fi
+
+# --- Findings → documentation links ------------------------------------------
+# This skill is diagnostic-only. It never prints a remediation command. For each
+# finding, point the user at the authoritative doc and let them act.
+section "Where to read next"
+
+if [[ ${#REBOOT_NODES[@]} -gt 0 ]]; then
+  hint "Section B — nodes flagged 'unexpectedly rebooted':"
+  for n in "${REBOOT_NODES[@]}"; do
+    info "  $n"
+  done
+  info "  HyperPod Slurm troubleshooting:"
+  info "    https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md"
+  info "  diagnostic context: references/slurm-details.md#-b-unexpected-reboot--diagnostic-context"
+fi
+
+if [[ ${#DOWN_NODES[@]} -gt 0 ]]; then
+  hint "Section A — nodes DOWN/DRAIN:"
+  for entry in "${DOWN_NODES[@]}"; do
+    n="${entry%%|*}"; r="${entry#*|}"
+    info "  $n  (reason: $r)"
+  done
+  info "  HyperPod Slurm troubleshooting:"
+  info "    https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md"
+  info "  if the node flaps after a manual recovery → route to hyperpod-node-debugger"
+fi
+
+CTRL_RESTART_REASON=""
+ISSUES_STR=" ${ISSUES[*]-} "
+[[ "$ISSUES_STR" == *" controller-hung "* ]]               && CTRL_RESTART_REASON="scontrol ping failed"
+[[ "$ISSUES_STR" == *" stuck-completing "* ]]              && CTRL_RESTART_REASON="${CTRL_RESTART_REASON:+$CTRL_RESTART_REASON, }jobs stuck COMPLETING"
+[[ "$ISSUES_STR" == *" stuck-pending-with-idle-nodes "* ]] && CTRL_RESTART_REASON="${CTRL_RESTART_REASON:+$CTRL_RESTART_REASON, }jobs PENDING with idle nodes"
+
+if [[ -n "$CTRL_RESTART_REASON" ]]; then
+  hint "Section C — controller-state issue ($CTRL_RESTART_REASON):"
+  info "  Slurm slurmctld(8) — for what is preserved across a controller restart:"
+  info "    https://slurm.schedmd.com/slurmctld.html"
+  info "  HyperPod Slurm troubleshooting:"
+  info "    https://github.com/aws/sagemaker-hyperpod-cluster-setup/blob/troubleshooting-doc-20250917/troubleshoot/index.md"
+  if [[ ${#FAIL_NODES[@]} -gt 0 ]]; then
+    warn "HyperPod recovery is in progress on:"
+    for entry in "${FAIL_NODES[@]}"; do
+      n="${entry%%|*}"
+      info "  $n"
+    done
+    info "AWS docs warn against changing node state or restarting slurmctld during a"
+    info "replacement; wait for it to complete, then re-run this script."
+  fi
+  info "  diagnostic context: references/slurm-details.md#-c-controller-state--diagnostic-context"
+fi
+
+if [[ "$ISSUES_STR" == *" missed-auto-resume "* ]]; then
+  hint "Section E — recent job failed on a node that was later replaced:"
+  info "  the most common cause is --auto-resume on sbatch instead of srun."
+  info "  Use SageMaker HyperPod auto-resume:"
+  info "    https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency-slurm-auto-resume.html"
+  info "  diagnostic context: references/slurm-details.md#hyperpod-auto-resume"
+fi
+
+# --- Summary ------------------------------------------------------------------
+# Print the finding count, each ISSUES entry, then each NEXT_STEPS entry.
+# NB: ${#arr[@]} accepts no default operator; "${#ISSUES[@]-0}" is a bad substitution.
+section "Summary"
+printf '  Issues detected: %d\n' "${#ISSUES[@]}"
+if [[ ${#ISSUES[@]} -eq 0 ]]; then
+  ok "cluster Slurm state is healthy"
+else
+  printf '\n  Findings:\n'
+  for i in "${ISSUES[@]}"; do
+    info "- $i"
+  done
+fi
+
+if [[ ${#NEXT_STEPS[@]} -gt 0 ]]; then
+  printf '\n  Where to read next:\n'
+  for s in "${NEXT_STEPS[@]}"; do
+    info "- $s"
+  done
+fi
+
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md
index fe09f313..0dfd14cc 100755
--- a/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md
@@ -7,6 +7,13 @@ metadata:
 
 # HyperPod SSM Access
 
+## Prerequisites
+
+- **`aws` CLI v2**, authenticated for the target account/Region.
+- **`session-manager-plugin`** — a separate install from the AWS CLI; `aws ssm start-session` requires it.
+- **`jq`** — the scripts build JSON payloads with it.
+- **`unbuffer`** (from the `expect` package) — wraps `aws ssm start-session` with a PTY so the session-manager-plugin flushes stdout instead of racing to close. Without it, calls intermittently return empty output with `Cannot perform start session: EOF` even when the command ran. Install with `sudo yum install expect`, `sudo apt install expect`, or `brew install expect`. `ssm-exec.sh` detects and uses it automatically; falls back with a warning if missing.
+
 ## SSM Target Format
 
 Target: `sagemaker-cluster:<CLUSTER_ID>_<GROUP_NAME>-<INSTANCE_ID>`
@@ -59,21 +66,22 @@ SSM `start-session` rate limit: **3 TPS** per account. Plan batch size and delay
 
 ## Manual SSM Commands
 
-When the scripts aren't suitable, use `aws ssm start-session` directly with `AWS-StartNonInteractiveCommand`:
+When the scripts aren't suitable, use `aws ssm start-session` directly with `AWS-StartNonInteractiveCommand`. Wrap every invocation in `unbuffer` — without it, stdout is intermittently empty (see Prerequisites).
 
 ```bash
 cat > /tmp/cmd.json << 'EOF'
 {"command": ["bash -c 'echo hello && whoami'"]}
 EOF
 
-aws ssm start-session \
+unbuffer aws ssm start-session \
   --target sagemaker-cluster:{CLUSTER_ID}_{GROUP_NAME}-{INSTANCE_ID} \
   --region REGION \
   --document-name AWS-StartNonInteractiveCommand \
   --parameters file:///tmp/cmd.json
 ```
 
-Always use a JSON file for `--parameters` — inline parameters break with special characters.
+- Always use a JSON file for `--parameters` — inline parameters break with special characters.
+- The document's `command` parameter is argv, not shell input. Wrap multi-statement scripts in `bash -c '...'` so pipes, semicolons, and redirects evaluate.
 
 ## Common Diagnostic Commands
 
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh
index bf9bb28e..d6b8dce0 100755
--- a/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh
@@ -86,8 +86,26 @@ case "$MODE" in
     ;;
 esac
 
-aws ssm start-session \
-  --target "$TARGET" \
-  --region "$REGION" \
-  --document-name AWS-StartNonInteractiveCommand \
-  --parameters "file://$TMPFILE"
+# When stdout is a pipe, session-manager-plugin can exit before flushing it, so
+# the caller intermittently sees empty stdout (with "Cannot perform start
+# session: EOF") even when the command ran. `unbuffer` (expect) attaches a PTY,
+# which avoids the race. See https://github.com/aws/amazon-ssm-agent/issues/358.
+# If `unbuffer` isn't on PATH, warn and fall back to the bare invocation.
+# NOTE(review): `exec` skips any EXIT trap (e.g. $TMPFILE cleanup) — confirm.
+if command -v unbuffer >/dev/null 2>&1; then
+  exec unbuffer aws ssm start-session \
+    --target "$TARGET" \
+    --region "$REGION" \
+    --document-name AWS-StartNonInteractiveCommand \
+    --parameters "file://$TMPFILE"
+else
+  echo "Warning: 'unbuffer' (from the 'expect' package) is not installed." >&2
+  echo "         Without it, 'aws ssm start-session' will intermittently return empty" >&2
+  echo "         stdout with 'Cannot perform start session: EOF'." >&2
+  echo "         Install with: sudo yum install expect | sudo apt install expect | brew install expect" >&2
+  exec aws ssm start-session \
+    --target "$TARGET" \
+    --region "$REGION" \
+    --document-name AWS-StartNonInteractiveCommand \
+    --parameters "file://$TMPFILE"
+fi
diff --git a/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh b/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh
index 03f5cb28..27ba88a3 100755
--- a/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh
+++ b/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh
@@ -31,10 +31,11 @@ while [[ $# -gt 0 ]]; do
 done
 
 # --- Color setup ---
+
 if [[ "$USE_COLOR" == "true" ]] && [ -t 1 ] && [[ "$JSON_OUTPUT" != "true" ]]; then
-    RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
+    GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
 else
-    RED=''; GREEN=''; YELLOW=''; BLUE=''; NC=''
+    GREEN=''; YELLOW=''; BLUE=''; NC=''
 fi
 
 # --- Output file ---
@@ -74,9 +75,10 @@ else
     fi
 fi
 IS_NEURON=false
-IS_GPU=false
 [[ "$INSTANCE_TYPE" =~ (^|\.)(trn|inf) ]] && IS_NEURON=true
-[[ "$INSTANCE_TYPE" =~ (^|\.)(p[0-9]|g[0-9]) ]] && IS_GPU=true
+# GPU detection is driven by `cmd_exists nvidia-smi` at each GPU section below —
+# no explicit IS_GPU flag needed. Keeps GPU checks working on instances where
+# the driver is present but the regex would miss (e.g. new p-family SKUs).
 
 # JSON-safe string escape via jq (handles all special/unicode characters correctly)
 json_escape() { jq -rn --arg v "$1" '$v | @json | .[1:-1]'; }