Commit fecc08e

feat: add GPU reset support to syslog-health-monitor

Signed-off-by: Nathan Herz <nherz@nvidia.com>
Parent: 4038245

7 files changed: 410 additions, 26 deletions

docs/configuration/syslog-health-monitor.md

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
 
 ## Overview
 
-The Syslog Health Monitor module watches system logs for GPU errors (XID/SXID) and GPU-fallen-off events by reading journald logs. This document covers all Helm configuration options for system administrators.
+The Syslog Health Monitor module watches system logs for GPU errors (XID/SXID), GPU-fallen-off, and GPU reset events by reading journald logs. This document covers all Helm configuration options for system administrators.
 
 ## Configuration Reference
 
@@ -55,7 +55,7 @@ syslog-health-monitor:
 ### Check Types
 
 #### SysLogsXIDError
-Monitors for XID (GPU error) messages in system logs. XIDs are NVIDIA GPU error codes that indicate hardware or software issues.
+Monitors for XID (GPU error) and GPU reset messages in system logs. XIDs are NVIDIA GPU error codes that indicate hardware or software issues.
 
 #### SysLogsSXIDError
 Monitors for SXID messages specific to NVSwitch errors in multi-GPU configurations.
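
For reference, the concrete log lines these checks match look like the following examples, both quoted from the implementation notes in xid_handler.go later in this commit:

```
Xid (PCI:0000:03:00): 48, pid=91237, name=nv-hostengine, Ch 00000076, errorString CTX SWITCH TIMEOUT, Info 0x3c046
GPU reset executed: GPU-455d8f70-2051-db6c-0430-ffc457bff834
```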

docs/syslog-health-monitor.md

Lines changed: 10 additions & 6 deletions
@@ -2,26 +2,27 @@
 
 ## Overview
 
-The Syslog Health Monitor watches system logs for GPU-related errors that may not be caught by DCGM. It monitors journald/syslog for XID errors, SXID errors (NVSwitch/NVLink errors), and GPU fallen-off-bus events - critical failures that indicate serious GPU, NVSwitch, or driver problems.
+The Syslog Health Monitor watches system logs for GPU-related errors that may not be caught by DCGM. It monitors journald/syslog for XID errors, SXID errors (NVSwitch/NVLink errors), and GPU fallen-off-bus events - critical failures that indicate serious GPU, NVSwitch, or driver problems. In addition to failures, it monitors system logs for other GPU-related events, such as GPU resets, to indicate that a required remediation action has completed.
 
 Think of it as a log analyzer that reads between the lines - catching GPU and NVSwitch problems recorded in system logs that other monitoring might miss.
 
 ### Why Do You Need This?
 
-Some GPU and NVSwitch failures manifest in system logs before DCGM can detect them:
+Some GPU and NVSwitch failures or events manifest in system logs before DCGM can detect them:
 
 - **XID errors**: GPU hardware errors logged by the NVIDIA driver
 - **SXID errors**: NVSwitch errors related to NVSwitch and NVLink interconnects
 - **GPU fallen off the bus**: GPU became inaccessible to the system
+- **GPU reset**: A GPU reset was executed by nvidia-smi
 
-These errors often appear in system logs first and can indicate imminent GPU or fabric failure, making early detection critical for preventing workload disruptions.
+These errors or events often appear in system logs first and can indicate imminent GPU or fabric failure, making early detection critical for preventing workload disruptions or for returning GPUs to service.
 
 ## How It Works
 
 The Syslog Health Monitor runs as a DaemonSet on GPU nodes:
 
 1. Reads journald logs from the host system
-2. Parses log entries for GPU-related error patterns (XID, SXID, fallen-off-bus)
+2. Parses log entries for GPU-related error patterns (XID, SXID, fallen-off-bus, GPU reset)
 3. Maintains cursor position to avoid re-processing old logs
 4. For XID errors, uses embedded NVIDIA XID Catalog spreadsheet to determine recommended actions
 5. Optionally analyzes XID errors via XID analyzer sidecar for custom logic
@@ -38,7 +39,7 @@ syslog-health-monitor:
   enabled: true
 
   enabledChecks:
-    - SysLogsXIDError      # GPU XID hardware errors
+    - SysLogsXIDError      # GPU XID hardware errors and GPU reset events
     - SysLogsSXIDError     # NVSwitch/NVLink SXID errors
     - SysLogsGPUFallenOff  # GPU fallen off the bus
 
@@ -75,7 +76,10 @@ NVSwitch errors related to the high-speed NVLink interconnect fabric:
 - Fabric-level issues affecting multi-GPU communication
 
 ### GPU Fallen Off Bus
-GPU became inaccessible to the system - critical failure requiring immediate attention.
+A GPU became inaccessible to the system - a critical failure requiring immediate attention.
+
+### GPU Reset
+A GPU was reset by nvidia-smi, indicating that a remediation action for a previous GPU failure has completed.
 
 ## Key Features
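
The detection described in step 2 of "How It Works" amounts to classifying each journald line against a small set of patterns. Below is a minimal standalone sketch: the XID and GPU-reset patterns mirror the regexes added in this commit, while the fallen-off-bus wording is an assumption included only for illustration.

```go
package main

import (
	"fmt"
	"regexp"
)

var (
	// Patterns mirroring the log-line shapes this commit documents; the
	// fallen-off-bus pattern is an illustrative assumption.
	xidLine    = regexp.MustCompile(`Xid \(PCI:([0-9a-fA-F:.]+)\): (\d+),`)
	resetLine  = regexp.MustCompile(`GPU reset executed: (GPU-[0-9a-fA-F-]+)`)
	fallenLine = regexp.MustCompile(`GPU has fallen off the bus`)
)

// classify labels a syslog line the way the monitor's checks would.
func classify(line string) string {
	switch {
	case resetLine.MatchString(line):
		return "GPU reset -> healthy event"
	case xidLine.MatchString(line):
		return "XID error -> unhealthy event"
	case fallenLine.MatchString(line):
		return "GPU fallen off the bus -> unhealthy event"
	}

	return "not GPU-related"
}

func main() {
	fmt.Println(classify("GPU reset executed: GPU-455d8f70-2051-db6c-0430-ffc457bff834"))
	fmt.Println(classify("Xid (PCI:0000:03:00): 48, pid=91237, name=nv-hostengine, Ch 00000076, errorString CTX SWITCH TIMEOUT, Info 0x3c046"))
}
```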

health-monitors/syslog-health-monitor/pkg/metadata/reader.go

Lines changed: 16 additions & 0 deletions
@@ -34,6 +34,7 @@ type Reader struct {
 	metadata *model.GPUMetadata
 
 	pciToGPU      map[string]*model.GPUInfo
+	uuidToInfo    map[string]*model.GPUInfo
 	nvswitchLinks map[string]map[int]*gpuLinkInfo
 }
 
@@ -80,12 +81,14 @@ func (r *Reader) load() error {
 
 func (r *Reader) buildMaps() {
 	r.pciToGPU = make(map[string]*model.GPUInfo)
+	r.uuidToInfo = make(map[string]*model.GPUInfo)
 	r.nvswitchLinks = make(map[string]map[int]*gpuLinkInfo)
 
 	for i := range r.metadata.GPUs {
 		gpu := &r.metadata.GPUs[i]
 		normPCI := normalizePCI(gpu.PCIAddress)
 		r.pciToGPU[normPCI] = gpu
+		r.uuidToInfo[gpu.UUID] = gpu
 
 		for _, link := range gpu.NVLinks {
 			remotePCI := normalizePCI(link.RemotePCIAddress)
@@ -102,6 +105,19 @@ func (r *Reader) buildMaps() {
 	}
 }
 
+func (r *Reader) GetInfoByUUID(uuid string) (*model.GPUInfo, error) {
+	if err := r.ensureLoaded(); err != nil {
+		return nil, fmt.Errorf("failed to load metadata for UUID lookup %s: %w", uuid, err)
+	}
+
+	gpu, ok := r.uuidToInfo[uuid]
+	if !ok {
+		return nil, fmt.Errorf("GPU not found for UUID: %s", uuid)
+	}
+
+	return gpu, nil
+}
+
 func (r *Reader) GetGPUByPCI(pci string) (*model.GPUInfo, error) {
 	if err := r.ensureLoaded(); err != nil {
 		return nil, fmt.Errorf("failed to load metadata for PCI lookup %s: %w", pci, err)

health-monitors/syslog-health-monitor/pkg/metadata/reader_test.go

Lines changed: 47 additions & 0 deletions
@@ -173,6 +173,53 @@ func TestGetGPUByPCI(t *testing.T) {
 	}
 }
 
+func TestGetInfoByUUID(t *testing.T) {
+	tmpDir := t.TempDir()
+	metadataFile := filepath.Join(tmpDir, "gpu_metadata.json")
+	require.NoError(t, os.WriteFile(metadataFile, []byte(testMetadataJSON), 0600))
+
+	reader := NewReader(metadataFile)
+
+	tests := []struct {
+		name    string
+		uuid    string
+		wantID  int
+		wantErr bool
+	}{
+		{
+			name:    "exact match for GPU 0",
+			uuid:    "GPU-00000000-0000-0000-0000-000000000000",
+			wantID:  0,
+			wantErr: false,
+		},
+		{
+			name:    "exact match for GPU 1",
+			uuid:    "GPU-11111111-1111-1111-1111-111111111111",
+			wantID:  1,
+			wantErr: false,
+		},
+		{
+			name:    "GPU not found",
+			uuid:    "GPU-123",
+			wantID:  -1,
+			wantErr: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			gpu, err := reader.GetInfoByUUID(tt.uuid)
+			if tt.wantErr {
+				require.Error(t, err)
+				require.Nil(t, gpu)
+			} else {
+				require.NoError(t, err)
+				require.NotNil(t, gpu)
+				require.Equal(t, tt.wantID, gpu.GPUID)
+			}
+		})
+	}
+}
+
 func TestGetGPUByNVSwitchLink(t *testing.T) {
 	tmpDir := t.TempDir()
 	metadataFile := filepath.Join(tmpDir, "gpu_metadata.json")

health-monitors/syslog-health-monitor/pkg/xid/types.go

Lines changed: 2 additions & 1 deletion
@@ -22,7 +22,8 @@ import (
 )
 
 var (
-	reNvrmMap = regexp.MustCompile(`NVRM: GPU at PCI:([0-9a-fA-F:.]+): (GPU-[0-9a-fA-F-]+)`)
+	reNvrmMap   = regexp.MustCompile(`NVRM: GPU at PCI:([0-9a-fA-F:.]+): (GPU-[0-9a-fA-F-]+)`)
+	gpuResetMap = regexp.MustCompile(`GPU reset executed: (GPU-[0-9a-fA-F-]+)`)
 )
 
 type XIDHandler struct {
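
A quick standalone check of the two patterns; the GPU reset sample line is quoted from this commit's implementation notes, while the NVRM mapping line is reconstructed to fit the regex:

```go
package main

import (
	"fmt"
	"regexp"
)

var (
	reNvrmMap   = regexp.MustCompile(`NVRM: GPU at PCI:([0-9a-fA-F:.]+): (GPU-[0-9a-fA-F-]+)`)
	gpuResetMap = regexp.MustCompile(`GPU reset executed: (GPU-[0-9a-fA-F-]+)`)
)

func main() {
	// NVRM PCI-to-UUID mapping line: group 1 is the PCI address, group 2 the GPU UUID.
	nvrm := "NVRM: GPU at PCI:0000:03:00: GPU-455d8f70-2051-db6c-0430-ffc457bff834"
	if m := reNvrmMap.FindStringSubmatch(nvrm); len(m) >= 3 {
		fmt.Println("pci:", m[1], "uuid:", m[2])
	}

	// GPU reset line: group 1 is the GPU UUID.
	reset := "GPU reset executed: GPU-455d8f70-2051-db6c-0430-ffc457bff834"
	if m := gpuResetMap.FindStringSubmatch(reset); len(m) >= 2 {
		fmt.Println("reset uuid:", m[1])
	}
}
```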

health-monitors/syslog-health-monitor/pkg/xid/xid_handler.go

Lines changed: 124 additions & 13 deletions
@@ -30,6 +30,10 @@ import (
 	"github.com/nvidia/nvsentinel/health-monitors/syslog-health-monitor/pkg/xid/parser"
 )
 
+const (
+	healthyHealthEventMessage = "No Health Failures"
+)
+
 func NewXIDHandler(nodeName, defaultAgentName,
 	defaultComponentClass, checkName, xidAnalyserEndpoint, metadataPath string) (*XIDHandler, error) {
 	config := parser.ParserConfig{
@@ -72,6 +76,11 @@ func (xidHandler *XIDHandler) ProcessLine(message string) (*pb.HealthEvents, err
 		return nil, nil
 	}
 
+	if uuid := xidHandler.parseGPUResetLine(message); len(uuid) != 0 {
+		slog.Info("GPU was reset, creating healthy HealthEvent", "GPU_UUID", uuid)
+		return xidHandler.createHealthEventGPUResetEvent(uuid)
+	}
+
 	xidResp, err := xidHandler.parser.Parse(message)
 	if err != nil {
 		slog.Debug("XID parsing failed for message",
@@ -89,6 +98,15 @@
 	return xidHandler.createHealthEventFromResponse(xidResp, message), nil
 }
 
+func (xidHandler *XIDHandler) parseGPUResetLine(message string) string {
+	m := gpuResetMap.FindStringSubmatch(message)
+	if len(m) >= 2 {
+		return m[1]
+	}
+
+	return ""
+}
+
 func (xidHandler *XIDHandler) parseNVRMGPUMapLine(message string) (string, string) {
 	m := reNvrmMap.FindStringSubmatch(message)
 	if len(m) >= 3 {
@@ -112,38 +130,72 @@
 	}, recommendedAction)
 }
 
-func (xidHandler *XIDHandler) getGPUUUID(normPCI string) string {
+func (xidHandler *XIDHandler) getGPUUUID(normPCI string) (uuid string, fromMetadata bool) {
 	gpuInfo, err := xidHandler.metadataReader.GetGPUByPCI(normPCI)
 	if err == nil && gpuInfo != nil {
-		return gpuInfo.UUID
+		return gpuInfo.UUID, true
 	}
 
 	if err != nil {
 		slog.Error("Error getting GPU UUID from metadata", "pci", normPCI, "error", err)
 	}
 
 	if uuid, ok := xidHandler.pciToGPUUUID[normPCI]; ok {
-		return uuid
+		return uuid, false
 	}
 
-	return ""
+	return "", false
 }
 
+/*
+In addition to the PCI, we always add the GPU UUID as an impacted entity if it is available from either dmesg or from
+the metadata-collector. The COMPONENT_RESET remediation action requires that the PCI and GPU UUID be available in the
+initial unhealthy event we send. Additionally, the corresponding healthy event triggered after the COMPONENT_RESET
+must include the same PCI and GPU UUID impacted entities as the initial event. As a result, we only permit the
+COMPONENT_RESET action if the GPU UUID was sourced from the metadata-collector, to ensure that the same impacted
+entities can be fetched after a reset occurs. If the GPU UUID does not exist or is sourced from dmesg, we still
+include it as an impacted entity but override the remediation action from COMPONENT_RESET to RESTART_VM.
+
+Unhealthy event generation:
+1. An XID 48 error occurs in syslog, which includes the PCI 0000:03:00:
+   Xid (PCI:0000:03:00): 48, pid=91237, name=nv-hostengine, Ch 00000076, errorString CTX SWITCH TIMEOUT, Info 0x3c046
+
+2. Using the metadata-collector, look up the corresponding GPU UUID for PCI 0000:03:00, which is
+   GPU-455d8f70-2051-db6c-0430-ffc457bff834.
+
+3. Include this PCI and GPU UUID in the list of impacted entities in our unhealthy HealthEvent with the
+   COMPONENT_RESET remediation action.
+
+Healthy event generation:
+1. A GPU reset occurs in syslog, which includes the GPU UUID:
+   GPU reset executed: GPU-455d8f70-2051-db6c-0430-ffc457bff834
+
+2. Using the metadata-collector, look up the corresponding PCI for the given GPU UUID.
+
+3. Include this PCI and GPU UUID in the list of impacted entities in our healthy HealthEvent.
+
+Implementation details:
+- The xid-handler takes care of overriding the remediation action from COMPONENT_RESET to RESTART_VM if the GPU UUID
+  is not available in the HealthEvent. This prevents either requiring a healthEventOverrides or forcing each future
+  module to derive whether to proceed with a COMPONENT_RESET or RESTART_VM based on whether the GPU UUID is present
+  in the impacted entities (specifically, node-drainer needs this to determine whether to do a partial drain, and
+  fault-remediation needs it for maintenance resource selection).
+- Note that it would be possible to not include the PCI as an impacted entity in COMPONENT_RESET health events, which
+  would allow us to always do a GPU reset if the GPU UUID could be fetched from any source (metadata-collector or
+  dmesg). Recall that the GPU UUID itself is provided in the syslog GPU reset log line (whereas the PCI needs to be
+  dynamically looked up from the metadata-collector, because Janitor does not accept the PCI as input, nor does it
+  look up the PCI before writing the syslog event). However, we do not want to conditionally add entity impact
+  depending on the needs of healthy event generation, nor do we want to add custom logic to allow the
+  fault-quarantine-module to clear conditions when only a subset of impacted entities recovers.
+*/
 func (xidHandler *XIDHandler) createHealthEventFromResponse(
 	xidResp *parser.Response,
 	message string,
 ) *pb.HealthEvents {
-	entities := []*pb.Entity{
-		{EntityType: "PCI", EntityValue: xidResp.Result.PCIE},
-	}
-
 	normPCI := xidHandler.normalizePCI(xidResp.Result.PCIE)
+	uuid, fromMetadata := xidHandler.getGPUUUID(normPCI)
 
-	if uuid := xidHandler.getGPUUUID(normPCI); uuid != "" {
-		entities = append(entities, &pb.Entity{
-			EntityType: "GPU_UUID", EntityValue: uuid,
-		})
-	}
+	entities := getDefaultImpactedEntities(normPCI, uuid)
 
 	if xidResp.Result.Metadata != nil {
 		var metadata []*pb.Entity
@@ -169,6 +221,14 @@
 	).Inc()
 
 	recommendedAction := common.MapActionStringToProto(xidResp.Result.Resolution)
+	// If the GPU UUID wasn't sourced from the metadata-collector (it was either unavailable or only known from
+	// dmesg), override the recommended action from COMPONENT_RESET to RESTART_VM.
+	if !fromMetadata && recommendedAction == pb.RecommendedAction_COMPONENT_RESET {
+		slog.Info("Overriding recommended action from COMPONENT_RESET to RESTART_VM", "pci", normPCI, "gpuUUID", uuid)
+
+		recommendedAction = pb.RecommendedAction_RESTART_VM
+	}
+
 	event := &pb.HealthEvent{
 		Version:            1,
 		Agent:              xidHandler.defaultAgentName,
@@ -191,6 +251,40 @@
 	}
 }
 
+func (xidHandler *XIDHandler) createHealthEventGPUResetEvent(uuid string) (*pb.HealthEvents, error) {
+	gpuInfo, err := xidHandler.metadataReader.GetInfoByUUID(uuid)
+	// There's no point in sending a healthy HealthEvent with only the GPU UUID and not the PCI, because that event
+	// would not match all impacted entities tracked by the fault-quarantine-module, so we return an error rather
+	// than send the event with partial information.
+	if err != nil {
+		return nil, fmt.Errorf("failed to look up GPU info using UUID %s: %w", uuid, err)
+	}
+
+	if len(gpuInfo.PCIAddress) == 0 {
+		return nil, fmt.Errorf("no PCI address in metadata for UUID %s", uuid)
+	}
+
+	entities := getDefaultImpactedEntities(gpuInfo.PCIAddress, uuid)
+	event := &pb.HealthEvent{
+		Version:            1,
+		Agent:              xidHandler.defaultAgentName,
+		CheckName:          xidHandler.checkName,
+		ComponentClass:     xidHandler.defaultComponentClass,
+		GeneratedTimestamp: timestamppb.New(time.Now()),
+		EntitiesImpacted:   entities,
+		Message:            healthyHealthEventMessage,
+		IsFatal:            false,
+		IsHealthy:          true,
+		NodeName:           xidHandler.nodeName,
+		RecommendedAction:  pb.RecommendedAction_NONE,
+	}
+
+	return &pb.HealthEvents{
+		Version: 1,
+		Events:  []*pb.HealthEvent{event},
+	}, nil
+}
+
 func getXID13Metadata(metadata map[string]string) []*pb.Entity {
 	entities := []*pb.Entity{}
 
@@ -235,3 +329,20 @@ func getXID74Metadata(metadata map[string]string) []*pb.Entity {
 
 	return entities
 }
+
+func getDefaultImpactedEntities(pci, uuid string) []*pb.Entity {
+	entities := []*pb.Entity{
+		{
+			EntityType:  "PCI",
+			EntityValue: pci,
+		},
+	}
+	if len(uuid) > 0 {
+		entities = append(entities, &pb.Entity{
+			EntityType:  "GPU_UUID",
+			EntityValue: uuid,
+		})
+	}
+
+	return entities
+}
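
The override described in the comment block above reduces to one rule: COMPONENT_RESET survives only when the GPU UUID came from the metadata-collector. A standalone sketch of that rule, with local constants standing in for the generated pb.RecommendedAction enum:

```go
package main

import "fmt"

// RecommendedAction stands in for the generated pb.RecommendedAction enum.
type RecommendedAction int

const (
	ComponentReset RecommendedAction = iota
	RestartVM
)

// resolveAction mirrors the override in createHealthEventFromResponse: a
// COMPONENT_RESET is only kept if the UUID was sourced from the
// metadata-collector, because the healthy event emitted after the reset must
// be able to resolve the same PCI + GPU_UUID entity pair.
func resolveAction(action RecommendedAction, uuidFromMetadata bool) RecommendedAction {
	if action == ComponentReset && !uuidFromMetadata {
		return RestartVM
	}

	return action
}

func main() {
	fmt.Println(resolveAction(ComponentReset, true))  // 0: reset the GPU in place
	fmt.Println(resolveAction(ComponentReset, false)) // 1: fall back to restarting the VM
}
```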
