Commit fecc08e

feat: add GPU reset support to syslog-health-monitor

Signed-off-by: Nathan Herz <nherz@nvidia.com>
Parent: 4038245

7 files changed: 410 additions, 26 deletions

docs/configuration/syslog-health-monitor.md

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
 
 ## Overview
 
-The Syslog Health Monitor module watches system logs for GPU errors (XID/SXID) and GPU-fallen-off events by reading journald logs. This document covers all Helm configuration options for system administrators.
+The Syslog Health Monitor module watches system logs for GPU errors (XID/SXID), GPU-fallen-off, and GPU reset events by reading journald logs. This document covers all Helm configuration options for system administrators.
 
 ## Configuration Reference
 
@@ -55,7 +55,7 @@ syslog-health-monitor:
 ### Check Types
 
 #### SysLogsXIDError
-Monitors for XID (GPU error) messages in system logs. XIDs are NVIDIA GPU error codes that indicate hardware or software issues.
+Monitors for XID (GPU error) and GPU reset messages in system logs. XIDs are NVIDIA GPU error codes that indicate hardware or software issues.
 
 #### SysLogsSXIDError
 Monitors for SXID messages specific to NVSwitch errors in multi-GPU configurations.
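
For reference, the concrete log lines these checks match look like the following examples, both quoted from the implementation notes in xid_handler.go later in this commit:

```
Xid (PCI:0000:03:00): 48, pid=91237, name=nv-hostengine, Ch 00000076, errorString CTX SWITCH TIMEOUT, Info 0x3c046
GPU reset executed: GPU-455d8f70-2051-db6c-0430-ffc457bff834
```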

docs/syslog-health-monitor.md

Lines changed: 10 additions & 6 deletions
@@ -2,26 +2,27 @@
 
 ## Overview
 
-The Syslog Health Monitor watches system logs for GPU-related errors that may not be caught by DCGM. It monitors journald/syslog for XID errors, SXID errors (NVSwitch/NVLink errors), and GPU fallen-off-bus events - critical failures that indicate serious GPU, NVSwitch, or driver problems.
+The Syslog Health Monitor watches system logs for GPU-related errors that may not be caught by DCGM. It monitors journald/syslog for XID errors, SXID errors (NVSwitch/NVLink errors), and GPU fallen-off-bus events - critical failures that indicate serious GPU, NVSwitch, or driver problems. In addition to failures, it monitors system logs for other GPU-related events, such as GPU resets, to indicate that a required remediation action has completed.
 
 Think of it as a log analyzer that reads between the lines - catching GPU and NVSwitch problems recorded in system logs that other monitoring might miss.
 
 ### Why Do You Need This?
 
-Some GPU and NVSwitch failures manifest in system logs before DCGM can detect them:
+Some GPU and NVSwitch failures or events manifest in system logs before DCGM can detect them:
 
 - **XID errors**: GPU hardware errors logged by the NVIDIA driver
 - **SXID errors**: NVSwitch errors related to NVSwitch and NVLink interconnects
 - **GPU fallen off the bus**: GPU became inaccessible to the system
+- **GPU reset**: A GPU reset was executed by nvidia-smi
 
-These errors often appear in system logs first and can indicate imminent GPU or fabric failure, making early detection critical for preventing workload disruptions.
+These errors or events often appear in system logs first and can indicate imminent GPU or fabric failure, making early detection critical for preventing workload disruptions or for returning GPUs to service.
 
 ## How It Works
 
 The Syslog Health Monitor runs as a DaemonSet on GPU nodes:
 
 1. Reads journald logs from the host system
-2. Parses log entries for GPU-related error patterns (XID, SXID, fallen-off-bus)
+2. Parses log entries for GPU-related error patterns (XID, SXID, fallen-off-bus, GPU reset)
 3. Maintains cursor position to avoid re-processing old logs
 4. For XID errors, uses embedded NVIDIA XID Catalog spreadsheet to determine recommended actions
 5. Optionally analyzes XID errors via XID analyzer sidecar for custom logic
@@ -38,7 +39,7 @@ syslog-health-monitor:
   enabled: true
 
   enabledChecks:
-    - SysLogsXIDError      # GPU XID hardware errors
+    - SysLogsXIDError      # GPU XID hardware errors and GPU reset events
     - SysLogsSXIDError     # NVSwitch/NVLink SXID errors
     - SysLogsGPUFallenOff  # GPU fallen off the bus
 
@@ -75,7 +76,10 @@ NVSwitch errors related to the high-speed NVLink interconnect fabric:
 - Fabric-level issues affecting multi-GPU communication
 
 ### GPU Fallen Off Bus
-GPU became inaccessible to the system - critical failure requiring immediate attention.
+A GPU became inaccessible to the system - a critical failure requiring immediate attention.
+
+### GPU Reset
+A GPU was reset by nvidia-smi, indicating that a remediation action for a previous GPU failure has completed.
 
 ## Key Features
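
The detection described in step 2 of "How It Works" amounts to classifying each journald line against a small set of patterns. Below is a minimal standalone sketch: the XID and GPU-reset patterns mirror the regexes added in this commit, while the fallen-off-bus wording is an assumption included only for illustration.

```go
package main

import (
	"fmt"
	"regexp"
)

var (
	// Patterns mirroring the log-line shapes this commit documents; the
	// fallen-off-bus pattern is an illustrative assumption.
	xidLine    = regexp.MustCompile(`Xid \(PCI:([0-9a-fA-F:.]+)\): (\d+),`)
	resetLine  = regexp.MustCompile(`GPU reset executed: (GPU-[0-9a-fA-F-]+)`)
	fallenLine = regexp.MustCompile(`GPU has fallen off the bus`)
)

// classify labels a syslog line the way the monitor's checks would.
func classify(line string) string {
	switch {
	case resetLine.MatchString(line):
		return "GPU reset -> healthy event"
	case xidLine.MatchString(line):
		return "XID error -> unhealthy event"
	case fallenLine.MatchString(line):
		return "GPU fallen off the bus -> unhealthy event"
	}

	return "not GPU-related"
}

func main() {
	fmt.Println(classify("GPU reset executed: GPU-455d8f70-2051-db6c-0430-ffc457bff834"))
	fmt.Println(classify("Xid (PCI:0000:03:00): 48, pid=91237, name=nv-hostengine, Ch 00000076, errorString CTX SWITCH TIMEOUT, Info 0x3c046"))
}
```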

health-monitors/syslog-health-monitor/pkg/metadata/reader.go

Lines changed: 16 additions & 0 deletions
@@ -34,6 +34,7 @@ type Reader struct {
 	metadata *model.GPUMetadata
 
 	pciToGPU      map[string]*model.GPUInfo
+	uuidToInfo    map[string]*model.GPUInfo
 	nvswitchLinks map[string]map[int]*gpuLinkInfo
 }
 
@@ -80,12 +81,14 @@ func (r *Reader) load() error {
 
 func (r *Reader) buildMaps() {
 	r.pciToGPU = make(map[string]*model.GPUInfo)
+	r.uuidToInfo = make(map[string]*model.GPUInfo)
 	r.nvswitchLinks = make(map[string]map[int]*gpuLinkInfo)
 
 	for i := range r.metadata.GPUs {
 		gpu := &r.metadata.GPUs[i]
 		normPCI := normalizePCI(gpu.PCIAddress)
 		r.pciToGPU[normPCI] = gpu
+		r.uuidToInfo[gpu.UUID] = gpu
 
 		for _, link := range gpu.NVLinks {
 			remotePCI := normalizePCI(link.RemotePCIAddress)
@@ -102,6 +105,19 @@ func (r *Reader) buildMaps() {
 	}
 }
 
+func (r *Reader) GetInfoByUUID(uuid string) (*model.GPUInfo, error) {
+	if err := r.ensureLoaded(); err != nil {
+		return nil, fmt.Errorf("failed to load metadata for UUID lookup %s: %w", uuid, err)
+	}
+
+	gpu, ok := r.uuidToInfo[uuid]
+	if !ok {
+		return nil, fmt.Errorf("GPU not found for UUID: %s", uuid)
+	}
+
+	return gpu, nil
+}
+
 func (r *Reader) GetGPUByPCI(pci string) (*model.GPUInfo, error) {
 	if err := r.ensureLoaded(); err != nil {
 		return nil, fmt.Errorf("failed to load metadata for PCI lookup %s: %w", pci, err)

health-monitors/syslog-health-monitor/pkg/metadata/reader_test.go

Lines changed: 47 additions & 0 deletions
@@ -173,6 +173,53 @@ func TestGetGPUByPCI(t *testing.T) {
 	}
 }
 
+func TestGetInfoByUUID(t *testing.T) {
+	tmpDir := t.TempDir()
+	metadataFile := filepath.Join(tmpDir, "gpu_metadata.json")
+	require.NoError(t, os.WriteFile(metadataFile, []byte(testMetadataJSON), 0600))
+
+	reader := NewReader(metadataFile)
+
+	tests := []struct {
+		name    string
+		uuid    string
+		wantID  int
+		wantErr bool
+	}{
+		{
+			name:    "exact match for GPU 0",
+			uuid:    "GPU-00000000-0000-0000-0000-000000000000",
+			wantID:  0,
+			wantErr: false,
+		},
+		{
+			name:    "exact match for GPU 1",
+			uuid:    "GPU-11111111-1111-1111-1111-111111111111",
+			wantID:  1,
+			wantErr: false,
+		},
+		{
+			name:    "GPU not found",
+			uuid:    "GPU-123",
+			wantID:  -1,
+			wantErr: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			gpu, err := reader.GetInfoByUUID(tt.uuid)
+			if tt.wantErr {
+				require.Error(t, err)
+				require.Nil(t, gpu)
+			} else {
+				require.NoError(t, err)
+				require.NotNil(t, gpu)
+				require.Equal(t, tt.wantID, gpu.GPUID)
+			}
+		})
+	}
+}
+
 func TestGetGPUByNVSwitchLink(t *testing.T) {
 	tmpDir := t.TempDir()
 	metadataFile := filepath.Join(tmpDir, "gpu_metadata.json")

health-monitors/syslog-health-monitor/pkg/xid/types.go

Lines changed: 2 additions & 1 deletion
@@ -22,7 +22,8 @@ import (
 )
 
 var (
-	reNvrmMap = regexp.MustCompile(`NVRM: GPU at PCI:([0-9a-fA-F:.]+): (GPU-[0-9a-fA-F-]+)`)
+	reNvrmMap   = regexp.MustCompile(`NVRM: GPU at PCI:([0-9a-fA-F:.]+): (GPU-[0-9a-fA-F-]+)`)
+	gpuResetMap = regexp.MustCompile(`GPU reset executed: (GPU-[0-9a-fA-F-]+)`)
 )
 
 type XIDHandler struct {
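
A quick standalone check of the two patterns; the GPU reset sample line is quoted from this commit's implementation notes, while the NVRM mapping line is reconstructed to fit the regex:

```go
package main

import (
	"fmt"
	"regexp"
)

var (
	reNvrmMap   = regexp.MustCompile(`NVRM: GPU at PCI:([0-9a-fA-F:.]+): (GPU-[0-9a-fA-F-]+)`)
	gpuResetMap = regexp.MustCompile(`GPU reset executed: (GPU-[0-9a-fA-F-]+)`)
)

func main() {
	// NVRM PCI-to-UUID mapping line: group 1 is the PCI address, group 2 the GPU UUID.
	nvrm := "NVRM: GPU at PCI:0000:03:00: GPU-455d8f70-2051-db6c-0430-ffc457bff834"
	if m := reNvrmMap.FindStringSubmatch(nvrm); len(m) >= 3 {
		fmt.Println("pci:", m[1], "uuid:", m[2])
	}

	// GPU reset line: group 1 is the GPU UUID.
	reset := "GPU reset executed: GPU-455d8f70-2051-db6c-0430-ffc457bff834"
	if m := gpuResetMap.FindStringSubmatch(reset); len(m) >= 2 {
		fmt.Println("reset uuid:", m[1])
	}
}
```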

health-monitors/syslog-health-monitor/pkg/xid/xid_handler.go

Lines changed: 124 additions & 13 deletions
@@ -30,6 +30,10 @@ import (
 	"github.com/nvidia/nvsentinel/health-monitors/syslog-health-monitor/pkg/xid/parser"
 )
 
+const (
+	healthyHealthEventMessage = "No Health Failures"
+)
+
 func NewXIDHandler(nodeName, defaultAgentName,
 	defaultComponentClass, checkName, xidAnalyserEndpoint, metadataPath string) (*XIDHandler, error) {
 	config := parser.ParserConfig{
@@ -72,6 +76,11 @@ func (xidHandler *XIDHandler) ProcessLine(message string) (*pb.HealthEvents, err
 		return nil, nil
 	}
 
+	if uuid := xidHandler.parseGPUResetLine(message); len(uuid) != 0 {
+		slog.Info("GPU was reset, creating healthy HealthEvent", "GPU_UUID", uuid)
+		return xidHandler.createHealthEventGPUResetEvent(uuid)
+	}
+
 	xidResp, err := xidHandler.parser.Parse(message)
 	if err != nil {
 		slog.Debug("XID parsing failed for message",
@@ -89,6 +98,15 @@
 	return xidHandler.createHealthEventFromResponse(xidResp, message), nil
 }
 
+func (xidHandler *XIDHandler) parseGPUResetLine(message string) string {
+	m := gpuResetMap.FindStringSubmatch(message)
+	if len(m) >= 2 {
+		return m[1]
+	}
+
+	return ""
+}
+
 func (xidHandler *XIDHandler) parseNVRMGPUMapLine(message string) (string, string) {
 	m := reNvrmMap.FindStringSubmatch(message)
 	if len(m) >= 3 {
@@ -112,38 +130,72 @@
 	}, recommendedAction)
 }
 
-func (xidHandler *XIDHandler) getGPUUUID(normPCI string) string {
+func (xidHandler *XIDHandler) getGPUUUID(normPCI string) (uuid string, fromMetadata bool) {
 	gpuInfo, err := xidHandler.metadataReader.GetGPUByPCI(normPCI)
 	if err == nil && gpuInfo != nil {
-		return gpuInfo.UUID
+		return gpuInfo.UUID, true
 	}
 
 	if err != nil {
 		slog.Error("Error getting GPU UUID from metadata", "pci", normPCI, "error", err)
 	}
 
 	if uuid, ok := xidHandler.pciToGPUUUID[normPCI]; ok {
-		return uuid
+		return uuid, false
 	}
 
-	return ""
+	return "", false
 }
 
+/*
+In addition to the PCI, we always add the GPU UUID as an impacted entity if it is available from either dmesg or from
+the metadata-collector. The COMPONENT_RESET remediation action requires that the PCI and GPU UUID be available in the
+initial unhealthy event we send. Additionally, the corresponding healthy event triggered after the COMPONENT_RESET
+must include the same PCI and GPU UUID impacted entities as the initial event. As a result, we only permit the
+COMPONENT_RESET action if the GPU UUID was sourced from the metadata-collector, to ensure that the same impacted
+entities can be fetched after a reset occurs. If the GPU UUID does not exist or is sourced from dmesg, we still
+include it as an impacted entity but override the remediation action from COMPONENT_RESET to RESTART_VM.
+
+Unhealthy event generation:
+1. An XID 48 error occurs in syslog, which includes the PCI 0000:03:00:
+   Xid (PCI:0000:03:00): 48, pid=91237, name=nv-hostengine, Ch 00000076, errorString CTX SWITCH TIMEOUT, Info 0x3c046
+
+2. Using the metadata-collector, look up the corresponding GPU UUID for PCI 0000:03:00, which is
+   GPU-455d8f70-2051-db6c-0430-ffc457bff834.
+
+3. Include this PCI and GPU UUID in the list of impacted entities in our unhealthy HealthEvent with the
+   COMPONENT_RESET remediation action.
+
+Healthy event generation:
+1. A GPU reset occurs in syslog, which includes the GPU UUID:
+   GPU reset executed: GPU-455d8f70-2051-db6c-0430-ffc457bff834
+
+2. Using the metadata-collector, look up the corresponding PCI for the given GPU UUID.
+
+3. Include this PCI and GPU UUID in the list of impacted entities in our healthy HealthEvent.
+
+Implementation details:
+- The xid-handler takes care of overriding the remediation action from COMPONENT_RESET to RESTART_VM if the GPU UUID
+  is not available in the HealthEvent. This prevents either requiring a healthEventOverrides or forcing each future
+  module to derive whether to proceed with a COMPONENT_RESET or RESTART_VM based on whether the GPU UUID is present
+  in the impacted entities (specifically, node-drainer needs this to determine whether to do a partial drain, and
+  fault-remediation needs it for maintenance resource selection).
+- Note that it would be possible to not include the PCI as an impacted entity in COMPONENT_RESET health events, which
+  would allow us to always do a GPU reset if the GPU UUID could be fetched from any source (metadata-collector or
+  dmesg). Recall that the GPU UUID itself is provided in the syslog GPU reset log line (whereas the PCI needs to be
+  dynamically looked up from the metadata-collector, because Janitor does not accept the PCI as input, nor does it
+  look up the PCI before writing the syslog event). However, we do not want to conditionally add entity impact
+  depending on the needs of healthy event generation, nor do we want to add custom logic to allow the
+  fault-quarantine-module to clear conditions when only a subset of impacted entities recovers.
+*/
 func (xidHandler *XIDHandler) createHealthEventFromResponse(
 	xidResp *parser.Response,
 	message string,
 ) *pb.HealthEvents {
-	entities := []*pb.Entity{
-		{EntityType: "PCI", EntityValue: xidResp.Result.PCIE},
-	}
-
 	normPCI := xidHandler.normalizePCI(xidResp.Result.PCIE)
+	uuid, fromMetadata := xidHandler.getGPUUUID(normPCI)
 
-	if uuid := xidHandler.getGPUUUID(normPCI); uuid != "" {
-		entities = append(entities, &pb.Entity{
-			EntityType: "GPU_UUID", EntityValue: uuid,
-		})
-	}
+	entities := getDefaultImpactedEntities(normPCI, uuid)
 
 	if xidResp.Result.Metadata != nil {
 		var metadata []*pb.Entity
@@ -169,6 +221,14 @@
 	).Inc()
 
 	recommendedAction := common.MapActionStringToProto(xidResp.Result.Resolution)
+	// If the GPU UUID wasn't sourced from the metadata-collector (it was either unavailable or only known from
+	// dmesg), override the recommended action from COMPONENT_RESET to RESTART_VM.
+	if !fromMetadata && recommendedAction == pb.RecommendedAction_COMPONENT_RESET {
+		slog.Info("Overriding recommended action from COMPONENT_RESET to RESTART_VM", "pci", normPCI, "gpuUUID", uuid)
+
+		recommendedAction = pb.RecommendedAction_RESTART_VM
+	}
+
 	event := &pb.HealthEvent{
 		Version:            1,
 		Agent:              xidHandler.defaultAgentName,
@@ -191,6 +251,40 @@
 	}
 }
 
+func (xidHandler *XIDHandler) createHealthEventGPUResetEvent(uuid string) (*pb.HealthEvents, error) {
+	gpuInfo, err := xidHandler.metadataReader.GetInfoByUUID(uuid)
+	// There's no point in sending a healthy HealthEvent with only the GPU UUID and not the PCI, because that event
+	// would not match all impacted entities tracked by the fault-quarantine-module, so we return an error rather
+	// than send the event with partial information.
+	if err != nil {
+		return nil, fmt.Errorf("failed to look up GPU info using UUID %s: %w", uuid, err)
+	}
+
+	if len(gpuInfo.PCIAddress) == 0 {
+		return nil, fmt.Errorf("no PCI address in metadata for UUID %s", uuid)
+	}
+
+	entities := getDefaultImpactedEntities(gpuInfo.PCIAddress, uuid)
+	event := &pb.HealthEvent{
+		Version:            1,
+		Agent:              xidHandler.defaultAgentName,
+		CheckName:          xidHandler.checkName,
+		ComponentClass:     xidHandler.defaultComponentClass,
+		GeneratedTimestamp: timestamppb.New(time.Now()),
+		EntitiesImpacted:   entities,
+		Message:            healthyHealthEventMessage,
+		IsFatal:            false,
+		IsHealthy:          true,
+		NodeName:           xidHandler.nodeName,
+		RecommendedAction:  pb.RecommendedAction_NONE,
+	}
+
+	return &pb.HealthEvents{
+		Version: 1,
+		Events:  []*pb.HealthEvent{event},
+	}, nil
+}
+
 func getXID13Metadata(metadata map[string]string) []*pb.Entity {
 	entities := []*pb.Entity{}
 
@@ -235,3 +329,20 @@ func getXID74Metadata(metadata map[string]string) []*pb.Entity {
 
 	return entities
 }
+
+func getDefaultImpactedEntities(pci, uuid string) []*pb.Entity {
+	entities := []*pb.Entity{
+		{
+			EntityType:  "PCI",
+			EntityValue: pci,
+		},
+	}
+	if len(uuid) > 0 {
+		entities = append(entities, &pb.Entity{
+			EntityType:  "GPU_UUID",
+			EntityValue: uuid,
+		})
+	}
+
+	return entities
+}
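
The override described in the comment block above reduces to one rule: COMPONENT_RESET survives only when the GPU UUID came from the metadata-collector. A standalone sketch of that rule, with local constants standing in for the generated pb.RecommendedAction enum:

```go
package main

import "fmt"

// RecommendedAction stands in for the generated pb.RecommendedAction enum.
type RecommendedAction int

const (
	ComponentReset RecommendedAction = iota
	RestartVM
)

// resolveAction mirrors the override in createHealthEventFromResponse: a
// COMPONENT_RESET is only kept if the UUID was sourced from the
// metadata-collector, because the healthy event emitted after the reset must
// be able to resolve the same PCI + GPU_UUID entity pair.
func resolveAction(action RecommendedAction, uuidFromMetadata bool) RecommendedAction {
	if action == ComponentReset && !uuidFromMetadata {
		return RestartVM
	}

	return action
}

func main() {
	fmt.Println(resolveAction(ComponentReset, true))  // 0: reset the GPU in place
	fmt.Println(resolveAction(ComponentReset, false)) // 1: fall back to restarting the VM
}
```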
