From 6066b899f36b9969f335b6ae2a0fc5aeb5ed861c Mon Sep 17 00:00:00 2001 From: Nathan Herz Date: Fri, 9 Jan 2026 09:36:13 -0800 Subject: [PATCH] feat: add GPU reset support to syslog-health-monitor Signed-off-by: Nathan Herz --- docs/configuration/syslog-health-monitor.md | 4 +- docs/syslog-health-monitor.md | 16 +- .../pkg/metadata/reader.go | 16 ++ .../pkg/metadata/reader_test.go | 47 ++++ .../syslog-health-monitor/pkg/xid/types.go | 3 +- .../pkg/xid/xid_handler.go | 138 ++++++++++-- .../pkg/xid/xid_handler_test.go | 213 +++++++++++++++++- tests/gpu_health_monitor_test.go | 4 +- tests/helpers/kube.go | 16 +- tests/node_drainer_test.go | 2 +- tests/syslog_health_monitor_test.go | 67 +++++- 11 files changed, 487 insertions(+), 39 deletions(-) diff --git a/docs/configuration/syslog-health-monitor.md b/docs/configuration/syslog-health-monitor.md index 7d8b18dbf..ad1f07bb3 100644 --- a/docs/configuration/syslog-health-monitor.md +++ b/docs/configuration/syslog-health-monitor.md @@ -2,7 +2,7 @@ ## Overview -The Syslog Health Monitor module watches system logs for GPU errors (XID/SXID) and GPU-fallen-off events by reading journald logs. This document covers all Helm configuration options for system administrators. +The Syslog Health Monitor module watches system logs for GPU errors (XID/SXID), GPU-fallen-off, and GPU reset events by reading journald logs. This document covers all Helm configuration options for system administrators. ## Configuration Reference @@ -55,7 +55,7 @@ syslog-health-monitor: ### Check Types #### SysLogsXIDError -Monitors for XID (GPU error) messages in system logs. XIDs are NVIDIA GPU error codes that indicate hardware or software issues. +Monitors for XID (GPU error) and GPU reset messages in system logs. XIDs are NVIDIA GPU error codes that indicate hardware or software issues. #### SysLogsSXIDError Monitors for SXID messages specific to NVSwitch errors in multi-GPU configurations. diff --git a/docs/syslog-health-monitor.md b/docs/syslog-health-monitor.md index 7999d79fa..f7d1a157a 100644 --- a/docs/syslog-health-monitor.md +++ b/docs/syslog-health-monitor.md @@ -2,26 +2,27 @@ ## Overview -The Syslog Health Monitor watches system logs for GPU-related errors that may not be caught by DCGM. It monitors journald/syslog for XID errors, SXID errors (NVSwitch/NVLink errors), and GPU fallen-off-bus events - critical failures that indicate serious GPU, NVSwitch, or driver problems. +The Syslog Health Monitor watches system logs for GPU-related errors that may not be caught by DCGM. It monitors journald/syslog for XID errors, SXID errors (NVSwitch/NVLink errors), and GPU fallen-off-bus events - critical failures that indicate serious GPU, NVSwitch, or driver problems. In addition to failures, it monitors system logs for other GPU-related events such as GPU resets to indicate that a required remediation action has completed. Think of it as a log analyzer that reads between the lines - catching GPU and NVSwitch problems recorded in system logs that other monitoring might miss. ### Why Do You Need This? 
-Some GPU and NVSwitch failures manifest in system logs before DCGM can detect them:
+Some GPU and NVSwitch failures or events manifest in system logs before DCGM can detect them:
 
 - **XID errors**: GPU hardware errors logged by the NVIDIA driver
 - **SXID errors**: NVSwitch errors related to NVSwitch and NVLink interconnects
 - **GPU fallen off the bus**: GPU became inaccessible to the system
+- **GPU Reset**: A GPU reset was executed by nvidia-smi
 
-These errors often appear in system logs first and can indicate imminent GPU or fabric failure, making early detection critical for preventing workload disruptions.
+These errors and events often appear in system logs first and can indicate imminent GPU or fabric failure, making early detection critical both for preventing workload disruptions and for returning GPUs to service promptly.
 
 ## How It Works
 
 The Syslog Health Monitor runs as a DaemonSet on GPU nodes:
 
 1. Reads journald logs from the host system
-2. Parses log entries for GPU-related error patterns (XID, SXID, fallen-off-bus)
+2. Parses log entries for GPU-related error patterns (XID, SXID, fallen-off-bus, GPU reset)
 3. Maintains cursor position to avoid re-processing old logs
 4. For XID errors, uses embedded NVIDIA XID Catalog spreadsheet to determine recommended actions
 5. Optionally analyzes XID errors via XID analyzer sidecar for custom logic
@@ -38,7 +39,7 @@ syslog-health-monitor:
   enabled: true
 
   enabledChecks:
-    - SysLogsXIDError       # GPU XID hardware errors
+    - SysLogsXIDError       # GPU XID hardware errors and GPU reset events
     - SysLogsSXIDError      # NVSwitch/NVLink SXID errors
     - SysLogsGPUFallenOff   # GPU fallen off the bus
 
@@ -75,7 +76,10 @@ NVSwitch errors related to the high-speed NVLink interconnect fabric:
 - Fabric-level issues affecting multi-GPU communication
 
 ### GPU Fallen Off Bus
-GPU became inaccessible to the system - critical failure requiring immediate attention.
+A GPU became inaccessible to the system - a critical failure requiring immediate attention.
+
+### GPU Reset
+A GPU was reset by nvidia-smi, indicating that a remediation action for a previous GPU failure has completed.
 
## Key Features diff --git a/health-monitors/syslog-health-monitor/pkg/metadata/reader.go b/health-monitors/syslog-health-monitor/pkg/metadata/reader.go index 36fd9d8db..058153a08 100644 --- a/health-monitors/syslog-health-monitor/pkg/metadata/reader.go +++ b/health-monitors/syslog-health-monitor/pkg/metadata/reader.go @@ -34,6 +34,7 @@ type Reader struct { metadata *model.GPUMetadata pciToGPU map[string]*model.GPUInfo + uuidToInfo map[string]*model.GPUInfo nvswitchLinks map[string]map[int]*gpuLinkInfo } @@ -80,12 +81,14 @@ func (r *Reader) load() error { func (r *Reader) buildMaps() { r.pciToGPU = make(map[string]*model.GPUInfo) + r.uuidToInfo = make(map[string]*model.GPUInfo) r.nvswitchLinks = make(map[string]map[int]*gpuLinkInfo) for i := range r.metadata.GPUs { gpu := &r.metadata.GPUs[i] normPCI := normalizePCI(gpu.PCIAddress) r.pciToGPU[normPCI] = gpu + r.uuidToInfo[gpu.UUID] = gpu for _, link := range gpu.NVLinks { remotePCI := normalizePCI(link.RemotePCIAddress) @@ -102,6 +105,19 @@ func (r *Reader) buildMaps() { } } +func (r *Reader) GetInfoByUUID(uuid string) (*model.GPUInfo, error) { + if err := r.ensureLoaded(); err != nil { + return nil, fmt.Errorf("failed to load metadata for UUID lookup %s: %w", uuid, err) + } + + gpu, ok := r.uuidToInfo[uuid] + if !ok { + return nil, fmt.Errorf("GPU not found for UUID: %s", uuid) + } + + return gpu, nil +} + func (r *Reader) GetGPUByPCI(pci string) (*model.GPUInfo, error) { if err := r.ensureLoaded(); err != nil { return nil, fmt.Errorf("failed to load metadata for PCI lookup %s: %w", pci, err) diff --git a/health-monitors/syslog-health-monitor/pkg/metadata/reader_test.go b/health-monitors/syslog-health-monitor/pkg/metadata/reader_test.go index 6a9df2dd6..474ab5469 100644 --- a/health-monitors/syslog-health-monitor/pkg/metadata/reader_test.go +++ b/health-monitors/syslog-health-monitor/pkg/metadata/reader_test.go @@ -173,6 +173,53 @@ func TestGetGPUByPCI(t *testing.T) { } } +func TestGetInfoByUUID(t *testing.T) { + tmpDir := t.TempDir() + metadataFile := filepath.Join(tmpDir, "gpu_metadata.json") + require.NoError(t, os.WriteFile(metadataFile, []byte(testMetadataJSON), 0600)) + + reader := NewReader(metadataFile) + + tests := []struct { + name string + uuid string + wantID int + wantErr bool + }{ + { + name: "exact match for GPU 0", + uuid: "GPU-00000000-0000-0000-0000-000000000000", + wantID: 0, + wantErr: false, + }, + { + name: "exact match for GPU 1", + uuid: "GPU-11111111-1111-1111-1111-111111111111", + wantID: 1, + wantErr: false, + }, + { + name: "GPU not found", + uuid: "GPU-123", + wantID: -1, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gpu, err := reader.GetInfoByUUID(tt.uuid) + if tt.wantErr { + require.Error(t, err) + require.Nil(t, gpu) + } else { + require.NoError(t, err) + require.NotNil(t, gpu) + require.Equal(t, tt.wantID, gpu.GPUID) + } + }) + } +} + func TestGetGPUByNVSwitchLink(t *testing.T) { tmpDir := t.TempDir() metadataFile := filepath.Join(tmpDir, "gpu_metadata.json") diff --git a/health-monitors/syslog-health-monitor/pkg/xid/types.go b/health-monitors/syslog-health-monitor/pkg/xid/types.go index 44cab65f8..69fe9b736 100644 --- a/health-monitors/syslog-health-monitor/pkg/xid/types.go +++ b/health-monitors/syslog-health-monitor/pkg/xid/types.go @@ -22,7 +22,8 @@ import ( ) var ( - reNvrmMap = regexp.MustCompile(`NVRM: GPU at PCI:([0-9a-fA-F:.]+): (GPU-[0-9a-fA-F-]+)`) + reNvrmMap = regexp.MustCompile(`NVRM: GPU at PCI:([0-9a-fA-F:.]+): (GPU-[0-9a-fA-F-]+)`) + 
gpuResetMap = regexp.MustCompile(`GPU reset executed: (GPU-[0-9a-fA-F-]+)`)
 )
 
 type XIDHandler struct {
diff --git a/health-monitors/syslog-health-monitor/pkg/xid/xid_handler.go b/health-monitors/syslog-health-monitor/pkg/xid/xid_handler.go
index d8615df5f..59311fb28 100644
--- a/health-monitors/syslog-health-monitor/pkg/xid/xid_handler.go
+++ b/health-monitors/syslog-health-monitor/pkg/xid/xid_handler.go
@@ -30,6 +30,10 @@ import (
 	"github.com/nvidia/nvsentinel/health-monitors/syslog-health-monitor/pkg/xid/parser"
 )
 
+const (
+	healthyHealthEventMessage = "No Health Failures"
+)
+
 func NewXIDHandler(nodeName, defaultAgentName, defaultComponentClass, checkName,
 	xidAnalyserEndpoint, metadataPath string) (*XIDHandler, error) {
 	config := parser.ParserConfig{
@@ -72,6 +76,11 @@ func (xidHandler *XIDHandler) ProcessLine(message string) (*pb.HealthEvents, err
 		return nil, nil
 	}
 
+	if uuid := xidHandler.parseGPUResetLine(message); len(uuid) != 0 {
+		slog.Info("GPU was reset, creating healthy HealthEvent", "GPU_UUID", uuid)
+		return xidHandler.createHealthEventGPUResetEvent(uuid)
+	}
+
 	xidResp, err := xidHandler.parser.Parse(message)
 	if err != nil {
 		slog.Debug("XID parsing failed for message",
@@ -89,6 +98,15 @@ func (xidHandler *XIDHandler) ProcessLine(message string) (*pb.HealthEvents, err
 	return xidHandler.createHealthEventFromResponse(xidResp, message), nil
 }
 
+func (xidHandler *XIDHandler) parseGPUResetLine(message string) string {
+	m := gpuResetMap.FindStringSubmatch(message)
+	if len(m) >= 2 {
+		return m[1]
+	}
+
+	return ""
+}
+
 func (xidHandler *XIDHandler) parseNVRMGPUMapLine(message string) (string, string) {
 	m := reNvrmMap.FindStringSubmatch(message)
 	if len(m) >= 3 {
@@ -112,10 +130,10 @@ func (xidHandler *XIDHandler) determineFatality(recommendedAction pb.Recommended
 	}, recommendedAction)
 }
 
-func (xidHandler *XIDHandler) getGPUUUID(normPCI string) string {
+func (xidHandler *XIDHandler) getGPUUUID(normPCI string) (uuid string, fromMetadata bool) {
 	gpuInfo, err := xidHandler.metadataReader.GetGPUByPCI(normPCI)
 	if err == nil && gpuInfo != nil {
-		return gpuInfo.UUID
+		return gpuInfo.UUID, true
 	}
 
 	if err != nil {
@@ -123,27 +141,61 @@ func (xidHandler *XIDHandler) getGPUUUID(normPCI string) string {
 	if uuid, ok := xidHandler.pciToGPUUUID[normPCI]; ok {
-		return uuid
+		return uuid, false
 	}
 
-	return ""
+	return "", false
 }
 
+/*
+In addition to the PCI, we will always add the GPU UUID as an impacted entity if it is available from either dmesg
+or from the metadata-collector. The COMPONENT_RESET remediation action requires that the PCI and GPU UUID be available in
+the initial unhealthy event we're sending. Additionally, the corresponding healthy event triggered after the
+COMPONENT_RESET requires that the same PCI and GPU UUID impacted entities be included as in the initial event. As a result,
+we will only permit the COMPONENT_RESET action if the GPU UUID was sourced from the metadata-collector, to ensure that
+the same impacted entities can be fetched after a reset occurs. If the GPU UUID is missing or was sourced only from dmesg,
+we still include it as an impacted entity when available, but override the remediation action from COMPONENT_RESET to RESTART_VM.
+
+Unhealthy event generation:
+1. An XID 48 error occurs in syslog, which includes the PCI 0000:03:00:
+Xid (PCI:0000:03:00): 48, pid=91237, name=nv-hostengine, Ch 00000076, errorString CTX SWITCH TIMEOUT, Info 0x3c046
+
+2. Using the metadata-collector, look up the corresponding GPU UUID for PCI 0000:03:00, which is
+GPU-455d8f70-2051-db6c-0430-ffc457bff834
+
+3. 
Include this PCI and GPU UUID in the list of impacted entities in our unhealthy HealthEvent with the COMPONENT_RESET
+remediation action.
+
+Healthy event generation:
+1. A GPU reset occurs in syslog, which includes the GPU UUID:
+GPU reset executed: GPU-455d8f70-2051-db6c-0430-ffc457bff834
+
+2. Using the metadata-collector, look up the corresponding PCI for the given GPU UUID.
+
+3. Include this PCI and GPU UUID in the list of impacted entities in our healthy HealthEvent.
+
+Implementation details:
+- The xid-handler will take care of overriding the remediation action from COMPONENT_RESET to RESTART_VM if the GPU UUID
+is not available in the HealthEvent. This avoids both requiring a healthEventOverrides and forcing each future
+module to derive whether to proceed with a COMPONENT_RESET or RESTART_VM based on whether the GPU UUID is present in
+impacted entities (specifically, node-drainer needs this to determine if we do a partial drain and fault-remediation needs
+this for the maintenance resource selection).
+- Note that it would be possible to not include the PCI as an impacted entity in COMPONENT_RESET health events, which
+would allow us to always do a GPU reset if the GPU UUID could be fetched from any source (metadata-collector or dmesg).
+Recall that the GPU UUID itself is provided in the syslog GPU reset log line (whereas the PCI needs to be dynamically
+looked up from the metadata-collector because Janitor does not accept the PCI as input, nor does it look up the PCI
+before writing the syslog event). However, we do not want to conditionally add entity impact depending on the needs of
+healthy event generation, nor do we want to add custom logic to allow the fault-quarantine-module to clear conditions on
+a subset of impacted entities recovering.
+*/
 func (xidHandler *XIDHandler) createHealthEventFromResponse(
 	xidResp *parser.Response,
 	message string,
 ) *pb.HealthEvents {
-	entities := []*pb.Entity{
-		{EntityType: "PCI", EntityValue: xidResp.Result.PCIE},
-	}
-
 	normPCI := xidHandler.normalizePCI(xidResp.Result.PCIE)
+	uuid, fromMetadata := xidHandler.getGPUUUID(normPCI)
 
-	if uuid := xidHandler.getGPUUUID(normPCI); uuid != "" {
-		entities = append(entities, &pb.Entity{
-			EntityType: "GPU_UUID", EntityValue: uuid,
-		})
-	}
+	entities := getDefaultImpactedEntities(normPCI, uuid)
 
 	if xidResp.Result.Metadata != nil {
 		var metadata []*pb.Entity
@@ -169,6 +221,14 @@ func (xidHandler *XIDHandler) createHealthEventFromResponse(
 	).Inc()
 
 	recommendedAction := common.MapActionStringToProto(xidResp.Result.Resolution)
+	// If the GPU UUID was not sourced from the metadata-collector (it is missing or came only from dmesg),
+	// override the recommended action from COMPONENT_RESET to RESTART_VM.
+ if !fromMetadata && recommendedAction == pb.RecommendedAction_COMPONENT_RESET { + slog.Info("Overriding recommended action from COMPONENT_RESET to RESTART_VM", "pci", normPCI, "gpuUUID", uuid) + + recommendedAction = pb.RecommendedAction_RESTART_VM + } + event := &pb.HealthEvent{ Version: 1, Agent: xidHandler.defaultAgentName, @@ -191,6 +251,41 @@ func (xidHandler *XIDHandler) createHealthEventFromResponse( } } +func (xidHandler *XIDHandler) createHealthEventGPUResetEvent(uuid string) (*pb.HealthEvents, error) { + gpuInfo, err := xidHandler.metadataReader.GetInfoByUUID(uuid) + // There's no point in sending a healthy HealthEvent with only GPU UUID and not PCI because that healthy HealthEvent + // will not match all impacted entities tracked by the fault-quarantine-module so we will return an error rather than + // send the event with partial information. + if err != nil { + return nil, fmt.Errorf("failed to look up GPU info using UUID %s: %w", uuid, err) + } + + if len(gpuInfo.PCIAddress) == 0 { + return nil, fmt.Errorf("failed to look up PCI info using UUID %s", uuid) + } + + normPCI := xidHandler.normalizePCI(gpuInfo.PCIAddress) + entities := getDefaultImpactedEntities(normPCI, uuid) + event := &pb.HealthEvent{ + Version: 1, + Agent: xidHandler.defaultAgentName, + CheckName: xidHandler.checkName, + ComponentClass: xidHandler.defaultComponentClass, + GeneratedTimestamp: timestamppb.New(time.Now()), + EntitiesImpacted: entities, + Message: healthyHealthEventMessage, + IsFatal: false, + IsHealthy: true, + NodeName: xidHandler.nodeName, + RecommendedAction: pb.RecommendedAction_NONE, + } + + return &pb.HealthEvents{ + Version: 1, + Events: []*pb.HealthEvent{event}, + }, nil +} + func getXID13Metadata(metadata map[string]string) []*pb.Entity { entities := []*pb.Entity{} @@ -235,3 +330,20 @@ func getXID74Metadata(metadata map[string]string) []*pb.Entity { return entities } + +func getDefaultImpactedEntities(pci, uuid string) []*pb.Entity { + entities := []*pb.Entity{ + { + EntityType: "PCI", + EntityValue: pci, + }, + } + if len(uuid) > 0 { + entities = append(entities, &pb.Entity{ + EntityType: "GPU_UUID", + EntityValue: uuid, + }) + } + + return entities +} diff --git a/health-monitors/syslog-health-monitor/pkg/xid/xid_handler_test.go b/health-monitors/syslog-health-monitor/pkg/xid/xid_handler_test.go index e80275e22..997059584 100644 --- a/health-monitors/syslog-health-monitor/pkg/xid/xid_handler_test.go +++ b/health-monitors/syslog-health-monitor/pkg/xid/xid_handler_test.go @@ -16,6 +16,8 @@ package xid import ( "errors" + "os" + "path/filepath" "testing" "github.com/stretchr/testify/assert" @@ -25,6 +27,64 @@ import ( "github.com/nvidia/nvsentinel/health-monitors/syslog-health-monitor/pkg/xid/parser" ) +const ( + testMetadataJSON = ` +{ + "version": "1.0", + "timestamp": "2025-12-10T18:12:55Z", + "node_name": "test-node", + "chassis_serial": null, + "gpus": [ + { + "gpu_id": 0, + "uuid": "GPU-123", + "pci_address": "0000:00:08.0", + "serial_number": "1655322020697", + "device_name": "NVIDIA H100 80GB HBM3", + "nvlinks": [ + { + "link_id": 0, + "remote_pci_address": "ffff:ff:ff.0", + "remote_link_id": 0 + }, + { + "link_id": 1, + "remote_pci_address": "ffff:ff:ff.0", + "remote_link_id": 0 + } + ] + } + ] +}` + testMetadataMissingPCIJSON = ` +{ + "version": "1.0", + "timestamp": "2025-12-10T18:12:55Z", + "node_name": "test-node", + "chassis_serial": null, + "gpus": [ + { + "gpu_id": 0, + "uuid": "GPU-123", + "serial_number": "1655322020697", + "device_name": "NVIDIA H100 80GB HBM3", + 
"nvlinks": [ + { + "link_id": 0, + "remote_pci_address": "ffff:ff:ff.0", + "remote_link_id": 0 + }, + { + "link_id": 1, + "remote_pci_address": "ffff:ff:ff.0", + "remote_link_id": 0 + } + ] + } + ] +}` +) + func TestParseNVRMGPUMapLine(t *testing.T) { xidHandler := &XIDHandler{} @@ -57,6 +117,39 @@ func TestParseNVRMGPUMapLine(t *testing.T) { } } +func TestParseGPUResetLine(t *testing.T) { + xidHandler := &XIDHandler{} + + testCases := []struct { + name string + line string + gpuId string + }{ + { + name: "Valid GPU Reset Line", + line: "GPU reset executed: GPU-123", + gpuId: "GPU-123", + }, + { + name: "Invalid GPU Reset Line", + line: "GPU reset executed:", + gpuId: "", + }, + { + name: "XID Log Line GPU", + line: "NVRM: GPU at PCI:0000:00:08.0: GPU-123", + gpuId: "", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gpuId := xidHandler.parseGPUResetLine(tc.line) + assert.Equal(t, tc.gpuId, gpuId) + }) + } +} + func TestNormalizePCI(t *testing.T) { xidHandler := &XIDHandler{} @@ -127,6 +220,12 @@ func (m *mockParser) Parse(message string) (*parser.Response, error) { } func TestProcessLine(t *testing.T) { + tmpDir := t.TempDir() + metadataFile := filepath.Join(tmpDir, "gpu_metadata.json") + require.NoError(t, os.WriteFile(metadataFile, []byte(testMetadataJSON), 0600)) + metadataFileMissingPCI := filepath.Join(tmpDir, "gpu_metadata_missing_pci.json") + require.NoError(t, os.WriteFile(metadataFileMissingPCI, []byte(testMetadataMissingPCIJSON), 0600)) + testCases := []struct { name string message string @@ -188,13 +287,82 @@ func TestProcessLine(t *testing.T) { assert.Equal(t, "Xid 79", event.ErrorCode[0]) require.Len(t, event.EntitiesImpacted, 1) assert.Equal(t, "PCI", event.EntitiesImpacted[0].EntityType) - assert.Equal(t, "0000:00:08.0", event.EntitiesImpacted[0].EntityValue) + assert.Equal(t, "0000:00:08", event.EntitiesImpacted[0].EntityValue) // Issue #197: Message field stores full journal, no Metadata duplication assert.Empty(t, event.Metadata) }, }, { - name: "Valid XID with GPU UUID", + name: "Valid GPU Reset Message", + message: "GPU reset executed: GPU-123", + setupHandler: func() *XIDHandler { + h, _ := NewXIDHandler("test-node", "test-agent", "GPU", "xid-check", "", metadataFile) + return h + }, + expectEvent: true, + expectError: false, + validateEvent: func(t *testing.T, events *pb.HealthEvents) { + require.NotNil(t, events) + require.Len(t, events.Events, 1) + event := events.Events[0] + expectedEvent := &pb.HealthEvent{ + Version: 1, + Agent: "test-agent", + CheckName: "xid-check", + ComponentClass: "GPU", + EntitiesImpacted: []*pb.Entity{ + { + EntityType: "PCI", + EntityValue: "0000:00:08", + }, + { + EntityType: "GPU_UUID", + EntityValue: "GPU-123", + }, + }, + Message: healthyHealthEventMessage, + IsFatal: false, + IsHealthy: true, + NodeName: "test-node", + RecommendedAction: pb.RecommendedAction_NONE, + } + assert.NotNil(t, event.GeneratedTimestamp) + event.GeneratedTimestamp = nil + assert.Equal(t, expectedEvent, event) + }, + }, + { + name: "Valid GPU Reset Message with Metadata Collector not Initialized", + message: "GPU reset executed: GPU-123", + setupHandler: func() *XIDHandler { + h, _ := NewXIDHandler("test-node", "test-agent", "GPU", "xid-check", "", "/tmp/metadata.json") + return h + }, + expectEvent: false, + expectError: true, + }, + { + name: "Valid GPU Reset Message with Metadata Collector missing GPU UUID", + message: "GPU reset executed: GPU-456", + setupHandler: func() *XIDHandler { + h, _ := 
NewXIDHandler("test-node", "test-agent", "GPU", "xid-check", "", metadataFile) + return h + }, + expectEvent: false, + expectError: true, + }, + { + name: "Valid GPU Reset Message with Metadata Collector not containing PCI", + message: "GPU reset executed: GPU-123", + setupHandler: func() *XIDHandler { + h, _ := NewXIDHandler("test-node", "test-agent", "GPU", "xid-check", "", metadataFileMissingPCI) + return h + }, + expectEvent: false, + expectError: true, + }, + { + name: "Valid XID with GPU UUID from NVRM: RESET_GPU overridden to RESTART_VM", message: "NVRM: Xid (PCI:0000:00:08.0): 79, pid=12345, name=test-process", setupHandler: func() *XIDHandler { h, _ := NewXIDHandler("test-node", "test-agent", "GPU", "xid-check", "", "/tmp/metadata.json") @@ -206,7 +374,7 @@ func TestProcessLine(t *testing.T) { DecodedXIDStr: "Xid 79", PCIE: "0000:00:08.0", Mnemonic: "GPU has fallen off the bus", - Resolution: "CONTACT_SUPPORT", + Resolution: "RESET_GPU", Number: 79, }, }, nil @@ -226,6 +394,43 @@ func TestProcessLine(t *testing.T) { assert.Equal(t, "GPU_UUID", event.EntitiesImpacted[1].EntityType) assert.Equal(t, "GPU-12345678-1234-1234-1234-123456789012", event.EntitiesImpacted[1].EntityValue) assert.Equal(t, "NVRM: Xid (PCI:0000:00:08.0): 79, pid=12345, name=test-process", event.Message) + assert.Equal(t, pb.RecommendedAction_RESTART_VM, event.RecommendedAction) + assert.Empty(t, event.Metadata) + }, + }, + { + name: "Valid XID with GPU UUID from Metadata Collector: RESET_GPU kept", + message: "NVRM: Xid (PCI:0000:00:08.0): 79, pid=12345, name=test-process", + setupHandler: func() *XIDHandler { + h, _ := NewXIDHandler("test-node", "test-agent", "GPU", "xid-check", "", metadataFile) + h.parser = &mockParser{ + parseFunc: func(msg string) (*parser.Response, error) { + return &parser.Response{ + Success: true, + Result: parser.XIDDetails{ + DecodedXIDStr: "Xid 79", + PCIE: "0000:00:08.0", + Mnemonic: "GPU has fallen off the bus", + Resolution: "RESET_GPU", + Number: 79, + }, + }, nil + }, + } + return h + }, + expectEvent: true, + expectError: false, + validateEvent: func(t *testing.T, events *pb.HealthEvents) { + require.NotNil(t, events) + require.Len(t, events.Events, 1) + event := events.Events[0] + require.Len(t, event.EntitiesImpacted, 2) + assert.Equal(t, "PCI", event.EntitiesImpacted[0].EntityType) + assert.Equal(t, "GPU_UUID", event.EntitiesImpacted[1].EntityType) + assert.Equal(t, "GPU-123", event.EntitiesImpacted[1].EntityValue) + assert.Equal(t, "NVRM: Xid (PCI:0000:00:08.0): 79, pid=12345, name=test-process", event.Message) + assert.Equal(t, pb.RecommendedAction_COMPONENT_RESET, event.RecommendedAction) assert.Empty(t, event.Metadata) }, }, @@ -367,7 +572,7 @@ func TestCreateHealthEventFromResponse(t *testing.T) { event := events.Events[0] require.Len(t, event.EntitiesImpacted, 2) assert.Equal(t, "PCI", event.EntitiesImpacted[0].EntityType) - assert.Equal(t, "0000:00:09.0", event.EntitiesImpacted[0].EntityValue) + assert.Equal(t, "0000:00:09", event.EntitiesImpacted[0].EntityValue) assert.Equal(t, "GPU_UUID", event.EntitiesImpacted[1].EntityType) assert.Equal(t, "GPU-ABCDEF12-3456-7890-ABCD-EF1234567890", event.EntitiesImpacted[1].EntityValue) assert.Equal(t, "Test XID message", event.Message) diff --git a/tests/gpu_health_monitor_test.go b/tests/gpu_health_monitor_test.go index e5b086bb5..0a76cef11 100644 --- a/tests/gpu_health_monitor_test.go +++ b/tests/gpu_health_monitor_test.go @@ -63,7 +63,7 @@ func TestGPUHealthMonitorMultipleErrors(t *testing.T) { helpers.InjectMetadata(t, ctx, 
client, helpers.NVSentinelNamespace, testNodeName, metadata) t.Logf("Restarting GPU health monitor pod %s to load metadata", gpuHealthMonitorPod.Name) - err = helpers.DeletePod(ctx, client, helpers.NVSentinelNamespace, gpuHealthMonitorPod.Name) + err = helpers.DeletePod(ctx, t, client, helpers.NVSentinelNamespace, gpuHealthMonitorPod.Name, false) require.NoError(t, err, "failed to restart GPU health monitor pod") helpers.WaitForPodsDeleted(ctx, t, client, helpers.NVSentinelNamespace, []string{gpuHealthMonitorPod.Name}) @@ -300,7 +300,7 @@ func TestGPUHealthMonitorDCGMConnectionError(t *testing.T) { require.NoError(t, err, "failed to patch DCGM service port") t.Logf("Restarting GPU health monitor pod %s to trigger reconnection", gpuHealthMonitorPodName) - err = helpers.DeletePod(ctx, client, helpers.NVSentinelNamespace, gpuHealthMonitorPodName) + err = helpers.DeletePod(ctx, t, client, helpers.NVSentinelNamespace, gpuHealthMonitorPodName, false) require.NoError(t, err, "failed to restart GPU health monitor pod") t.Logf("Waiting for GpuDcgmConnectivityFailure condition on node %s", nodeName) diff --git a/tests/helpers/kube.go b/tests/helpers/kube.go index fe9fea7ee..c50163d8d 100755 --- a/tests/helpers/kube.go +++ b/tests/helpers/kube.go @@ -474,7 +474,8 @@ func GetNodeByName(ctx context.Context, c klient.Client, nodeName string) (*v1.N return &node, nil } -func DeletePod(ctx context.Context, c klient.Client, namespace, podName string) error { +func DeletePod(ctx context.Context, t *testing.T, c klient.Client, namespace, podName string, + waitForRemoval bool) error { pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: podName, @@ -487,6 +488,17 @@ func DeletePod(ctx context.Context, c klient.Client, namespace, podName string) return fmt.Errorf("failed to delete pod %s: %w", podName, err) } + if waitForRemoval { + require.Eventually(t, func() bool { + err = c.Resources().Get(ctx, podName, namespace, pod) + if err != nil { + return apierrors.IsNotFound(err) + } + + return false + }, EventuallyWaitTimeout, WaitInterval, "pod %s should be removed from API", podName) + } + return nil } @@ -739,7 +751,7 @@ func DrainRunningPodsInNamespace(ctx context.Context, t *testing.T, c klient.Cli t.Logf("Found running pod: %s, deleting it", pod.Name) - err = DeletePod(ctx, c, namespace, pod.Name) + err = DeletePod(ctx, t, c, namespace, pod.Name, false) if err != nil { t.Errorf("Failed to delete pod %s: %v", pod.Name, err) } else { diff --git a/tests/node_drainer_test.go b/tests/node_drainer_test.go index 56c34c07a..d74b2858f 100644 --- a/tests/node_drainer_test.go +++ b/tests/node_drainer_test.go @@ -94,7 +94,7 @@ func TestNodeDrainerEvictionModes(t *testing.T) { helpers.AssertPodsNeverDeleted(ctx, t, client, "kube-system", kubeSystemPods) t.Log("Phase 1: Finalizer pod stuck in Terminating") - err = helpers.DeletePod(ctx, client, "immediate-test", finalizerPod) + err = helpers.DeletePod(ctx, t, client, "immediate-test", finalizerPod, false) require.NoError(t, err) require.Eventually(t, func() bool { diff --git a/tests/syslog_health_monitor_test.go b/tests/syslog_health_monitor_test.go index c0b717823..db875d925 100644 --- a/tests/syslog_health_monitor_test.go +++ b/tests/syslog_health_monitor_test.go @@ -83,6 +83,57 @@ func TestSyslogHealthMonitorXIDDetection(t *testing.T) { return ctx }) + // We require that the "Inject XID error requiring GPU reset" and "Inject GPU reset event" tests run first since + // these 2 tests will ensure that the SysLogsXIDError is added in response to an XID error and cleared in 
response + // to a GPU reset event. This ensures the remaining tests run against the same syslog-health-monitor pod and node, + // with no unhealthy node status conditions, without needing to recycle the pod during the test execution. + feature.Assess("Inject XID error requiring GPU reset", func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context { + client, err := c.NewClient() + require.NoError(t, err, "failed to create kubernetes client") + + nodeName := ctx.Value(keySyslogNodeName).(string) + xidMessages := []string{ + "kernel: [16450076.435595] NVRM: Xid (PCI:0002:00:00): 119, pid=1582259, name=nvc:[driver], Timeout after 6s of waiting for RPC response from GPU1 GSP! Expected function 76 (GSP_RM_CONTROL) (0x20802a02 0x8).", + } + expectedSequencePatterns := []string{ + `ErrorCode:119 PCI:0002:00:00 GPU_UUID:GPU-22222222-2222-2222-2222-222222222222 kernel:.*?NVRM: Xid \(PCI:0002:00:00\): 119.*?Recommended Action=COMPONENT_RESET`, + } + + helpers.InjectSyslogMessages(t, stubJournalHTTPPort, xidMessages) + + t.Log("Verifying node condition contains XID error with GPU UUID using regex pattern") + require.Eventually(t, func() bool { + return helpers.VerifyNodeConditionMatchesSequence(t, ctx, client, nodeName, + "SysLogsXIDError", "SysLogsXIDErrorIsNotHealthy", expectedSequencePatterns) + }, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "Node condition should contain XID error with GPU UUID") + + return ctx + }) + + feature.Assess("Inject GPU reset event", func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context { + client, err := c.NewClient() + require.NoError(t, err, "failed to create kubernetes client") + + nodeName := ctx.Value(keySyslogNodeName).(string) + gpuResetMessages := []string{ + "kernel: [16450076.435595] GPU reset executed: GPU-22222222-2222-2222-2222-222222222222", + } + + helpers.InjectSyslogMessages(t, stubJournalHTTPPort, gpuResetMessages) + + t.Logf("Waiting for SysLogsXIDError condition to be cleared from node %s", nodeName) + require.Eventually(t, func() bool { + condition, err := helpers.CheckNodeConditionExists(ctx, client, nodeName, + "SysLogsXIDError", "SysLogsXIDErrorIsHealthy") + if err != nil { + return false + } + return condition != nil && condition.Status == v1.ConditionFalse + }, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "SysLogsXIDError condition should be cleared") + + return ctx + }) + feature.Assess("Inject burst XID errors and verify aggregation", func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context { client, err := c.NewClient() require.NoError(t, err, "failed to create kubernetes client") @@ -201,11 +252,14 @@ func TestSyslogHealthMonitorXIDDetection(t *testing.T) { } nodeName := nodeNameVal.(string) + t.Logf("Cleaning up metadata from node %s", nodeName) + helpers.DeleteMetadata(t, ctx, client, helpers.NVSentinelNamespace, nodeName) + podNameVal := ctx.Value(keySyslogPodName) if podNameVal != nil { podName := podNameVal.(string) t.Logf("Restarting syslog-health-monitor pod %s to clear conditions", podName) - err = helpers.DeletePod(ctx, client, helpers.NVSentinelNamespace, podName) + err = helpers.DeletePod(ctx, t, client, helpers.NVSentinelNamespace, podName, true) if err != nil { t.Logf("Warning: failed to delete pod: %v", err) } else { @@ -221,9 +275,6 @@ func TestSyslogHealthMonitorXIDDetection(t *testing.T) { } } - t.Logf("Cleaning up metadata from node %s", nodeName) - helpers.DeleteMetadata(t, ctx, client, helpers.NVSentinelNamespace, nodeName) - t.Logf("Removing 
ManagedByNVSentinel label from node %s", nodeName) err = helpers.RemoveNodeManagedByNVSentinelLabel(ctx, client, nodeName) if err != nil { @@ -452,11 +503,14 @@ func TestSyslogHealthMonitorSXIDDetection(t *testing.T) { } nodeName := nodeNameVal.(string) + t.Logf("Cleaning up metadata from node %s", nodeName) + helpers.DeleteMetadata(t, ctx, client, helpers.NVSentinelNamespace, nodeName) + podNameVal := ctx.Value(keySyslogPodName) if podNameVal != nil { podName := podNameVal.(string) t.Logf("Restarting syslog-health-monitor pod %s", podName) - err = helpers.DeletePod(ctx, client, helpers.NVSentinelNamespace, podName) + err = helpers.DeletePod(ctx, t, client, helpers.NVSentinelNamespace, podName, true) if err != nil { t.Logf("Warning: failed to delete pod: %v", err) } else { @@ -471,9 +525,6 @@ func TestSyslogHealthMonitorSXIDDetection(t *testing.T) { } } - t.Logf("Cleaning up metadata from node %s", nodeName) - helpers.DeleteMetadata(t, ctx, client, helpers.NVSentinelNamespace, nodeName) - t.Logf("Removing ManagedByNVSentinel label from node %s", nodeName) err = helpers.RemoveNodeManagedByNVSentinelLabel(ctx, client, nodeName) if err != nil {