Skip to content

Commit 1cfd655

Browse files
committed
chore: address review comments
Signed-off-by: Ajay Mishra <ajmishra@nvidia.com>
1 parent 11b40b7 commit 1cfd655

15 files changed

Lines changed: 210 additions & 308 deletions

File tree

distros/kubernetes/nvsentinel/charts/gpu-health-monitor/files/dcgmerrorsmapping.csv

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 121 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,121 @@
1+
DCGM_FR_UNKNOWN,CONTACT_SUPPORT
2+
DCGM_FR_UNRECOGNIZED,CONTACT_SUPPORT
3+
DCGM_FR_PCI_REPLAY_RATE,CONTACT_SUPPORT
4+
DCGM_FR_VOLATILE_DBE_DETECTED,COMPONENT_RESET
5+
DCGM_FR_VOLATILE_SBE_DETECTED,NONE
6+
DCGM_FR_PENDING_PAGE_RETIREMENTS,NONE
7+
DCGM_FR_RETIRED_PAGES_LIMIT,CONTACT_SUPPORT
8+
DCGM_FR_RETIRED_PAGES_DBE_LIMIT,CONTACT_SUPPORT
9+
DCGM_FR_CORRUPT_INFOROM,COMPONENT_RESET
10+
DCGM_FR_CLOCKS_EVENT_THERMAL,CONTACT_SUPPORT
11+
DCGM_FR_CLOCK_THROTTLE_THERMAL,NONE
12+
DCGM_FR_POWER_UNREADABLE,RESTART_VM
13+
DCGM_FR_CLOCKS_EVENT_POWER,NONE
14+
DCGM_FR_CLOCK_THROTTLE_POWER,NONE
15+
DCGM_FR_NVLINK_ERROR_THRESHOLD,NONE
16+
DCGM_FR_NVLINK_DOWN,RESTART_VM
17+
DCGM_FR_NVSWITCH_FATAL_ERROR,CONTACT_SUPPORT
18+
DCGM_FR_NVSWITCH_NON_FATAL_ERROR,NONE
19+
DCGM_FR_NVSWITCH_DOWN,COMPONENT_RESET
20+
DCGM_FR_NO_ACCESS_TO_FILE,CONTACT_SUPPORT
21+
DCGM_FR_NVML_API,CONTACT_SUPPORT
22+
DCGM_FR_DEVICE_COUNT_MISMATCH,CONTACT_SUPPORT
23+
DCGM_FR_BAD_PARAMETER,CONTACT_SUPPORT
24+
DCGM_FR_CANNOT_OPEN_LIB,CONTACT_SUPPORT
25+
DCGM_FR_DENYLISTED_DRIVER,CONTACT_SUPPORT
26+
DCGM_FR_NVML_LIB_BAD,CONTACT_SUPPORT
27+
DCGM_FR_GRAPHICS_PROCESSES,CONTACT_SUPPORT
28+
DCGM_FR_HOSTENGINE_CONN,CONTACT_SUPPORT
29+
DCGM_FR_FIELD_QUERY,RESTART_VM
30+
DCGM_FR_BAD_CUDA_ENV,CONTACT_SUPPORT
31+
DCGM_FR_PERSISTENCE_MODE,CONTACT_SUPPORT
32+
DCGM_FR_LOW_BANDWIDTH,CONTACT_SUPPORT
33+
DCGM_FR_HIGH_LATENCY,CONTACT_SUPPORT
34+
DCGM_FR_CANNOT_GET_FIELD_TAG,CONTACT_SUPPORT
35+
DCGM_FR_FIELD_VIOLATION,RESTART_VM
36+
DCGM_FR_FIELD_THRESHOLD,RESTART_VM
37+
DCGM_FR_FIELD_VIOLATION_DBL,RESTART_VM
38+
DCGM_FR_FIELD_THRESHOLD_DBL,RESTART_VM
39+
DCGM_FR_UNSUPPORTED_FIELD_TYPE,RESTART_VM
40+
DCGM_FR_FIELD_THRESHOLD_TS,RESTART_VM
41+
DCGM_FR_FIELD_THRESHOLD_TS_DBL,RESTART_VM
42+
DCGM_FR_THERMAL_VIOLATIONS,CONTACT_SUPPORT
43+
DCGM_FR_THERMAL_VIOLATIONS_TS,CONTACT_SUPPORT
44+
DCGM_FR_TEMP_VIOLATION,CONTACT_SUPPORT
45+
DCGM_FR_CLOCKS_EVENT_VIOLATION,RESTART_VM
46+
DCGM_FR_THROTTLING_VIOLATION,RESTART_VM
47+
DCGM_FR_INTERNAL,CONTACT_SUPPORT
48+
DCGM_FR_PCIE_GENERATION,CONTACT_SUPPORT
49+
DCGM_FR_PCIE_WIDTH,CONTACT_SUPPORT
50+
DCGM_FR_ABORTED,NONE
51+
DCGM_FR_TEST_DISABLED,NONE
52+
DCGM_FR_CANNOT_GET_STAT,CONTACT_SUPPORT
53+
DCGM_FR_STRESS_LEVEL,RESTART_VM
54+
DCGM_FR_CUDA_API,CONTACT_SUPPORT
55+
DCGM_FR_FAULTY_MEMORY,CONTACT_SUPPORT
56+
DCGM_FR_CANNOT_SET_WATCHES,RESTART_VM
57+
DCGM_FR_CUDA_UNBOUND,RESTART_VM
58+
DCGM_FR_ECC_DISABLED,CONTACT_SUPPORT
59+
DCGM_FR_MEMORY_ALLOC,RESTART_VM
60+
DCGM_FR_CUDA_DBE,CONTACT_SUPPORT
61+
DCGM_FR_MEMORY_MISMATCH,CONTACT_SUPPORT
62+
DCGM_FR_CUDA_DEVICE,CONTACT_SUPPORT
63+
DCGM_FR_ECC_UNSUPPORTED,CONTACT_SUPPORT
64+
DCGM_FR_ECC_PENDING,RESTART_VM
65+
DCGM_FR_MEMORY_BANDWIDTH,RESTART_VM
66+
DCGM_FR_TARGET_POWER,NONE
67+
DCGM_FR_API_FAIL,RESTART_VM
68+
DCGM_FR_API_FAIL_GPU,RESTART_VM
69+
DCGM_FR_CUDA_CONTEXT,CONTACT_SUPPORT
70+
DCGM_FR_DCGM_API,CONTACT_SUPPORT
71+
DCGM_FR_CONCURRENT_GPUS,CONTACT_SUPPORT
72+
DCGM_FR_TOO_MANY_ERRORS,CONTACT_SUPPORT
73+
DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD,CONTACT_SUPPORT
74+
DCGM_FR_NVLINK_ERROR_CRITICAL,CONTACT_SUPPORT
75+
DCGM_FR_ENFORCED_POWER_LIMIT,CONTACT_SUPPORT
76+
DCGM_FR_MEMORY_ALLOC_HOST,RESTART_VM
77+
DCGM_FR_GPU_OP_MODE,CONTACT_SUPPORT
78+
DCGM_FR_NO_MEMORY_CLOCKS,CONTACT_SUPPORT
79+
DCGM_FR_NO_GRAPHICS_CLOCKS,NONE
80+
DCGM_FR_HAD_TO_RESTORE_STATE,RESTART_VM
81+
DCGM_FR_L1TAG_UNSUPPORTED,CONTACT_SUPPORT
82+
DCGM_FR_L1TAG_MISCOMPARE,CONTACT_SUPPORT
83+
DCGM_FR_ROW_REMAP_FAILURE,CONTACT_SUPPORT
84+
DCGM_FR_UNCONTAINED_ERROR,RESTART_VM
85+
DCGM_FR_EMPTY_GPU_LIST,CONTACT_SUPPORT
86+
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS,RESTART_VM
87+
DCGM_FR_UNCORRECTABLE_ROW_REMAP,NONE
88+
DCGM_FR_PENDING_ROW_REMAP,COMPONENT_RESET
89+
DCGM_FR_BROKEN_P2P_MEMORY_DEVICE,CONTACT_SUPPORT
90+
DCGM_FR_BROKEN_P2P_WRITER_DEVICE,CONTACT_SUPPORT
91+
DCGM_FR_NVSWITCH_NVLINK_DOWN,CONTACT_SUPPORT
92+
DCGM_FR_EUD_BINARY_PERMISSIONS,CONTACT_SUPPORT
93+
DCGM_FR_EUD_NON_ROOT_USER,CONTACT_SUPPORT
94+
DCGM_FR_EUD_SPAWN_FAILURE,CONTACT_SUPPORT
95+
DCGM_FR_EUD_TIMEOUT,CONTACT_SUPPORT
96+
DCGM_FR_EUD_ZOMBIE,CONTACT_SUPPORT
97+
DCGM_FR_EUD_NON_ZERO_EXIT_CODE,CONTACT_SUPPORT
98+
DCGM_FR_EUD_TEST_FAILED,CONTACT_SUPPORT
99+
DCGM_FR_FILE_CREATE_PERMISSIONS,CONTACT_SUPPORT
100+
DCGM_FR_PAUSE_RESUME_FAILED,CONTACT_SUPPORT
101+
DCGM_FR_PCIE_H_REPLAY_VIOLATION,CONTACT_SUPPORT
102+
DCGM_FR_GPU_EXPECTED_NVLINKS_UP,CONTACT_SUPPORT
103+
DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP,CONTACT_SUPPORT
104+
DCGM_FR_XID_ERROR,NONE
105+
DCGM_FR_SBE_VIOLATION,CONTACT_SUPPORT
106+
DCGM_FR_DBE_VIOLATION,CONTACT_SUPPORT
107+
DCGM_FR_PCIE_REPLAY_VIOLATION,CONTACT_SUPPORT
108+
DCGM_FR_SBE_THRESHOLD_VIOLATION,CONTACT_SUPPORT
109+
DCGM_FR_DBE_THRESHOLD_VIOLATION,CONTACT_SUPPORT
110+
DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION,CONTACT_SUPPORT
111+
DCGM_FR_CUDA_FM_NOT_INITIALIZED,CONTACT_SUPPORT
112+
DCGM_FR_SXID_ERROR,RESTART_VM
113+
DCGM_FR_GFLOPS_THRESHOLD_VIOLATION,CONTACT_SUPPORT
114+
DCGM_FR_NAN_VALUE,CONTACT_SUPPORT
115+
DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR,CONTACT_SUPPORT
116+
DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE,CONTACT_SUPPORT
117+
DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE,CONTACT_SUPPORT
118+
DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE,CONTACT_SUPPORT
119+
DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE,CONTACT_SUPPORT
120+
DCGM_FR_TEST_SKIPPED,NONE
121+
DCGM_FR_ERROR_SENTINEL,NONE

distros/kubernetes/nvsentinel/charts/preflight/files/dcgmerrorsmapping.csv

Lines changed: 0 additions & 1 deletion
This file was deleted.

distros/kubernetes/nvsentinel/charts/preflight/templates/configmap.yaml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,4 +32,3 @@ data:
3232
processingStrategy: {{ include "preflight.processingStrategy" . | quote }}
3333
initContainers:
3434
{{- toYaml .Values.initContainers | nindent 6 }}
35-
{{ (.Files.Glob "files/dcgmerrorsmapping.csv").AsConfig | indent 2 }}

distros/kubernetes/nvsentinel/charts/preflight/templates/deployment.yaml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,9 +42,6 @@ spec:
4242
- --port={{ .Values.webhook.port }}
4343
- --cert-dir=/certs
4444
- --config=/etc/preflight/config.yaml
45-
env:
46-
- name: PREFLIGHT_CONFIGMAP_NAME
47-
value: {{ include "preflight.fullname" . }}-config
4845
ports:
4946
- name: https
5047
containerPort: {{ .Values.webhook.port }}

distros/kubernetes/nvsentinel/charts/preflight/values.yaml

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,9 @@
1515
replicaCount: 1
1616

1717
image:
18-
repository: ghcr.io/nvidia/nvsentinel/preflight
18+
repository: xrfxlp/preflight
1919
pullPolicy: IfNotPresent
20-
tag: "latest"
20+
tag: 09550402
2121

2222
imagePullSecrets: []
2323
nameOverride: ""
@@ -97,13 +97,10 @@ dcgm:
9797

9898
initContainers:
9999
- name: preflight-dcgm-diag
100-
image: ghcr.io/nvidia/nvsentinel/preflight-dcgm-diag:latest
100+
image: xrfxlp/preflight-dcgm-diag:09550402
101101
volumeMounts:
102102
- name: nvsentinel-socket
103103
mountPath: /var/run
104-
- name: dcgm-error-mapping
105-
mountPath: /etc/dcgm
106-
readOnly: true
107104
# Full corev1.Container fields supported:
108105
# resources:
109106
# limits:

distros/kubernetes/nvsentinel/files/dcgmerrorsmapping.csv

Lines changed: 0 additions & 121 deletions
This file was deleted.

preflight-checks/dcgm-diag/Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,10 @@ ENV PYTHONPATH=${PYTHONPATH} \
5959
COPY --from=build /app/dist/*.whl ./
6060
COPY --from=build /app/constraints.txt ./
6161
RUN --mount=type=cache,target=/root/.cache/pip \
62-
pip install ./dcgm_diag*.whl --constraint constraints.txt
62+
pip install ./nvsentinel_dcgm_diag*.whl --constraint constraints.txt
63+
64+
# Bundle error mapping file (static release data, not runtime config)
65+
COPY distros/kubernetes/nvsentinel/charts/gpu-health-monitor/files/dcgmerrorsmapping.csv /etc/dcgm/dcgmerrorsmapping.csv
6366

6467
RUN groupadd -r nvsentinel && useradd -r -g nvsentinel nvsentinel
6568
USER nvsentinel

preflight-checks/dcgm-diag/dcgm_diag/__main__.py

Lines changed: 26 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -69,46 +69,45 @@ def main() -> None:
6969

7070
failures = [r for r in results if r.status == "fail"]
7171
warnings = [r for r in results if r.status == "warn"]
72-
73-
for r in results:
74-
log.info(
75-
"Test result",
76-
extra={
77-
"test": r.test_name,
78-
"status": r.status,
79-
"gpu": r.gpu_index,
80-
"error_code": r.error_code,
81-
"error": r.error_message,
82-
},
83-
)
72+
passes = [r for r in results if r.status == "pass"]
8473

8574
log.info(
8675
"Diagnostic summary",
8776
extra={
88-
"passed": len(results) - len(failures) - len(warnings),
77+
"passed": len(passes),
8978
"failed": len(failures),
9079
"warned": len(warnings),
80+
"skipped": len(results) - len(passes) - len(failures) - len(warnings),
9181
"total": len(results),
9282
},
9383
)
9484

95-
for r in failures:
96-
msg = f"{r.test_name} (GPU {r.gpu_index}): {r.error_message}"
97-
log.error("DCGM diagnostic failed", extra={"gpu": r.gpu_uuid, "message": msg})
98-
reporter.send_event(gpu_uuid=r.gpu_uuid, is_healthy=False, is_fatal=True, message=msg, error_code=r.error_code)
85+
# Send one event per test result with specific test name
86+
for r in results:
87+
if r.status not in ("pass", "warn", "fail"):
88+
continue
9989

100-
if failures:
101-
sys.exit(1)
90+
is_pass = r.status == "pass"
91+
is_fatal = r.status == "fail"
92+
message = "Test passed" if is_pass else r.error_message
10293

103-
for r in warnings:
104-
msg = f"{r.test_name} (GPU {r.gpu_index}): {r.error_message}"
105-
log.warning("DCGM diagnostic warning", extra={"gpu": r.gpu_uuid, "message": msg})
106-
reporter.send_event(gpu_uuid=r.gpu_uuid, is_healthy=False, is_fatal=False, message=msg, error_code=r.error_code)
94+
log.log(
95+
logging.INFO if is_pass else (logging.ERROR if is_fatal else logging.WARNING),
96+
f"Test {r.status}",
97+
extra={"gpu": r.gpu_uuid, "test": r.test_name, "error_code": r.error_code, "detail": message},
98+
)
99+
reporter.send_event(
100+
gpu_uuid=r.gpu_uuid,
101+
is_healthy=is_pass,
102+
is_fatal=is_fatal,
103+
message=message,
104+
error_code=r.error_code if not is_pass else 0,
105+
test_name=r.test_name,
106+
)
107107

108-
failed_gpus = {r.gpu_uuid for r in failures + warnings}
109-
for uuid in diag.get_all_gpu_uuids():
110-
if uuid not in failed_gpus:
111-
reporter.send_event(gpu_uuid=uuid, is_healthy=True, is_fatal=False, message="DCGM diagnostic passed")
108+
if failures:
109+
log.error("DCGM diagnostic check failed")
110+
sys.exit(1)
112111

113112
log.info("DCGM diagnostic check passed")
114113
sys.exit(0)

0 commit comments

Comments
 (0)