diff --git a/components/nvidia/collector/device_info.go b/components/nvidia/collector/device_info.go index e034fd48..5ef90a9f 100644 --- a/components/nvidia/collector/device_info.go +++ b/components/nvidia/collector/device_info.go @@ -42,8 +42,9 @@ type DeviceInfo struct { Utilization UtilizationInfo `json:"utilization_info" yaml:"utilization_info"` NVLinkStates NVLinkStates `json:"nvlink_state" yaml:"nvlink_state"` MemoryErrors MemoryErrors `json:"ecc_event" yaml:"ecc_event"` - NProcess int `json:"nprocess" yaml:"nprocess"` - PartialErrors []string `json:"partial_errors,omitempty" yaml:"partial_errors,omitempty"` + NProcess int `json:"nprocess" yaml:"nprocess"` + Processes ComputeProcesses `json:"compute_processes" yaml:"compute_processes"` + PartialErrors []string `json:"partial_errors,omitempty" yaml:"partial_errors,omitempty"` } func (deviceInfo *DeviceInfo) JSON() ([]byte, error) { @@ -165,13 +166,14 @@ func (deviceInfo *DeviceInfo) Get(device nvml.Device, index int, driverVersion s deviceInfo.PartialErrors = append(deviceInfo.PartialErrors, fmt.Sprintf("failed to get nvlink states: %v", err2)) } - // Get the number of processes using the GPU - processInfo, err := device.GetComputeRunningProcesses() - if !errors.Is(err, nvml.SUCCESS) { - deviceInfo.PartialErrors = append(deviceInfo.PartialErrors, fmt.Sprintf("failed to get processes: %v", nvml.ErrorString(err))) + // Get the compute processes using the GPU (PID, comm, used GPU memory). + // NProcess is derived from the resulting list so its meaning is unchanged. 
+ err2 = deviceInfo.Processes.Get(device, uuid) + if err2 != nil { + deviceInfo.PartialErrors = append(deviceInfo.PartialErrors, fmt.Sprintf("failed to get compute processes: %v", err2)) deviceInfo.NProcess = 0 } else { - deviceInfo.NProcess = len(processInfo) + deviceInfo.NProcess = len(deviceInfo.Processes) } if len(deviceInfo.PartialErrors) > 0 { diff --git a/components/nvidia/collector/process_info.go b/components/nvidia/collector/process_info.go new file mode 100644 index 00000000..3c33ff6d --- /dev/null +++ b/components/nvidia/collector/process_info.go @@ -0,0 +1,74 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package collector + +import ( + "errors" + "fmt" + "os" + "strings" + + "github.com/scitix/sichek/components/common" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +type ComputeProcesses []ComputeProcess + +type ComputeProcess struct { + Pid uint32 `json:"pid" yaml:"pid"` + Comm string `json:"comm,omitempty" yaml:"comm,omitempty"` + UsedGpuMemoryMiB uint64 `json:"used_gpu_memory_mib" yaml:"used_gpu_memory_mib"` +} + +func (p ComputeProcesses) JSON() ([]byte, error) { + return common.JSON(p) +} + +func (p ComputeProcesses) ToString() string { + return common.ToString(p) +} + +func (p *ComputeProcesses) Get(device nvml.Device, uuid string) error { + *p = make(ComputeProcesses, 0) + procs, ret := device.GetComputeRunningProcesses() + if !errors.Is(ret, nvml.SUCCESS) { + return fmt.Errorf("failed to get compute running processes for GPU %v: %v", uuid, nvml.ErrorString(ret)) + } + + out := make(ComputeProcesses, 0, len(procs)) + for _, pi := range procs { + out = append(out, ComputeProcess{ + Pid: pi.Pid, + Comm: readProcComm(pi.Pid), + UsedGpuMemoryMiB: pi.UsedGpuMemory / (1024 * 1024), + }) + } + *p = out + return nil +} + +// readProcComm reads /proc/<pid>/comm and returns the process name without +// the trailing newline. Returns an empty string on any failure (process exited, +// PID not visible from this namespace, permission denied) — callers fall back +// to the raw PID. 
+func readProcComm(pid uint32) string { + data, err := os.ReadFile(fmt.Sprintf("/proc/%d/comm", pid)) + if err != nil { + return "" + } + return strings.TrimRight(string(data), "\n") +} diff --git a/config/default_user_config.yaml b/config/default_user_config.yaml index a1f0230e..ddd5a09e 100644 --- a/config/default_user_config.yaml +++ b/config/default_user_config.yaml @@ -44,7 +44,11 @@ gpuevents: mock: false enable_metrics: true ignore_namespaces: [] - ignored_checkers: [] + # GPUHang temporarily disabled: current rule has known false-positive + # sources (pviol thermal bug, rxpci/txpci delta semantics, strict 8/8 AND). + # See docs/gpu-hang-detection-summary.md before re-enabling. + ignored_checkers: + - "GPUHang" podlog: query_interval: 10s