Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2eedd91
fix(transceiver): concurrent collect and BDF dedup for RoCE environments
Apr 23, 2026
2702544
fix(transceiver): skip SR-IOV VF ethernet interfaces and cap worker pool
Apr 23, 2026
36e5674
feat(consts): differentiate alert colors by severity level
Apr 23, 2026
4e87b40
docs: add sichek-collector design spec
Apr 23, 2026
5c57d6c
docs: add implementation plans for sichek-collector and reporter module
Apr 23, 2026
5fc3304
feat(service): reporter config loader with defaults
Apr 24, 2026
eefd0e9
feat(service): Reporter.pushOnce with gzip + retry
Apr 24, 2026
604d4f8
feat(service): Reporter.Run periodic loop with panic recover
Apr 24, 2026
ac6bc83
feat(service): ResolveNodeName prefers NODE_NAME env
Apr 24, 2026
8eecbc1
feat(service): wire Reporter into DaemonService lifecycle
Apr 24, 2026
1bfc9a7
feat(config): add reporter block (disabled by default)
Apr 24, 2026
da2466e
fix(infiniband): show upstream BDF in PCIETreeWidth/Speed checks
Apr 30, 2026
0288568
fix(infiniband): raise PCIETreeSpeed/Width severity to Critical
Apr 30, 2026
48e476b
feat(infiniband): per-port collection for multi-plane HCAs (CX8 4P RoCE)
May 7, 2026
b553058
fix(infiniband): pcie tree checkers read tree spec, not link spec
May 7, 2026
3d2a026
fix(infiniband): no panic / readable error when init fails
May 7, 2026
f67e991
chore(infiniband): drop mezz_* from zy spec ib_devs
May 7, 2026
2dc1f33
Merge origin/feat/roce-multiplane into feat/roce-multiplane-bundle
May 7, 2026
621f965
Merge origin/fix/ib-pcie-tree-show-upstream-bdf into feat/roce-multip…
May 7, 2026
4b64bab
Merge origin/feat/sichek-collector into feat/roce-multiplane-bundle
May 7, 2026
c3fc1f3
Merge origin/feat/alert-color-by-level into feat/roce-multiplane-bundle
May 7, 2026
4c931d0
Merge origin/fix/transceiver-roce-concurrent-collect into feat/roce-m…
May 7, 2026
d2a69fe
Merge origin/main (sichek config sync) into feat/roce-multiplane-bundle
May 7, 2026
128c3f0
Merge branch 'scitix:main' into main
lzi-a11y May 8, 2026
b39de21
Merge branch 'scitix:main' into main
lzi-a11y May 18, 2026
53ef767
chore(gpuevents): mute GPUHang checker pending rule rework
May 21, 2026
9277940
feat(nvidia): collect compute processes per GPU and surface in snapshot
May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions components/nvidia/collector/device_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ type DeviceInfo struct {
Utilization UtilizationInfo `json:"utilization_info" yaml:"utilization_info"`
NVLinkStates NVLinkStates `json:"nvlink_state" yaml:"nvlink_state"`
MemoryErrors MemoryErrors `json:"ecc_event" yaml:"ecc_event"`
NProcess int `json:"nprocess" yaml:"nprocess"`
PartialErrors []string `json:"partial_errors,omitempty" yaml:"partial_errors,omitempty"`
NProcess int `json:"nprocess" yaml:"nprocess"`
Processes ComputeProcesses `json:"compute_processes" yaml:"compute_processes"`
PartialErrors []string `json:"partial_errors,omitempty" yaml:"partial_errors,omitempty"`
}

func (deviceInfo *DeviceInfo) JSON() ([]byte, error) {
Expand Down Expand Up @@ -165,13 +166,14 @@ func (deviceInfo *DeviceInfo) Get(device nvml.Device, index int, driverVersion s
deviceInfo.PartialErrors = append(deviceInfo.PartialErrors, fmt.Sprintf("failed to get nvlink states: %v", err2))
}

// Get the number of processes using the GPU
processInfo, err := device.GetComputeRunningProcesses()
if !errors.Is(err, nvml.SUCCESS) {
deviceInfo.PartialErrors = append(deviceInfo.PartialErrors, fmt.Sprintf("failed to get processes: %v", nvml.ErrorString(err)))
// Get the compute processes using the GPU (PID, comm, used GPU memory).
// NProcess is derived from the resulting list so its meaning is unchanged.
err2 = deviceInfo.Processes.Get(device, uuid)
if err2 != nil {
deviceInfo.PartialErrors = append(deviceInfo.PartialErrors, fmt.Sprintf("failed to get compute processes: %v", err2))
deviceInfo.NProcess = 0
} else {
deviceInfo.NProcess = len(processInfo)
deviceInfo.NProcess = len(deviceInfo.Processes)
}

if len(deviceInfo.PartialErrors) > 0 {
Expand Down
74 changes: 74 additions & 0 deletions components/nvidia/collector/process_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
Copyright 2024 The Scitix Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package collector

import (
"errors"
"fmt"
"os"
"strings"

"github.com/scitix/sichek/components/common"

"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// ComputeProcesses is the list of compute processes collected for a single
// GPU device. It is filled in by its Get method.
type ComputeProcesses []ComputeProcess

// ComputeProcess describes one compute process reported by NVML for a GPU.
type ComputeProcess struct {
// Pid is the process ID as reported by NVML.
Pid uint32 `json:"pid" yaml:"pid"`
// Comm is the process name read from /proc/<pid>/comm. It is empty (and
// omitted from JSON/YAML output) when that file could not be read.
Comm string `json:"comm,omitempty" yaml:"comm,omitempty"`
// UsedGpuMemoryMiB is the NVML-reported used GPU memory divided by
// 1024*1024 (integer division, so any remainder is truncated).
UsedGpuMemoryMiB uint64 `json:"used_gpu_memory_mib" yaml:"used_gpu_memory_mib"`
}

// JSON serializes the process list by delegating to common.JSON and returns
// the encoded bytes together with any error that helper reports.
func (p ComputeProcesses) JSON() ([]byte, error) {
	encoded, err := common.JSON(p)
	return encoded, err
}

// ToString renders the process list as a string by delegating to
// common.ToString; the exact format is whatever that helper produces.
func (p ComputeProcesses) ToString() string {
	rendered := common.ToString(p)
	return rendered
}

// Get replaces *p with the compute processes currently running on device.
//
// For every entry returned by device.GetComputeRunningProcesses it records the
// PID, the process name read from /proc/<pid>/comm (empty when unreadable) and
// the used GPU memory converted to MiB. uuid is only used to label the
// returned error.
//
// When the NVML query fails, *p is left as an empty, non-nil list and a
// non-nil error describing the NVML return code is returned.
func (p *ComputeProcesses) Get(device nvml.Device, uuid string) error {
	const (
		bytesPerMiB = 1024 * 1024
		// NVML reports NVML_VALUE_NOT_AVAILABLE (all bits set) in
		// usedGpuMemory when the per-process figure is unknown. Dividing that
		// sentinel would surface a bogus ~17.6 trillion MiB reading.
		memNotAvailable = ^uint64(0)
	)

	// Reset first so a failed query never leaves entries from a previous call.
	// The list is kept non-nil, presumably so serializers emit an empty list
	// rather than null.
	*p = make(ComputeProcesses, 0)
	procs, ret := device.GetComputeRunningProcesses()
	if !errors.Is(ret, nvml.SUCCESS) {
		return fmt.Errorf("failed to get compute running processes for GPU %v: %v", uuid, nvml.ErrorString(ret))
	}

	out := make(ComputeProcesses, 0, len(procs))
	for _, pi := range procs {
		// Report 0 MiB when NVML could not determine the usage.
		var usedMiB uint64
		if pi.UsedGpuMemory != memNotAvailable {
			usedMiB = pi.UsedGpuMemory / bytesPerMiB
		}
		out = append(out, ComputeProcess{
			Pid:              pi.Pid,
			Comm:             readProcComm(pi.Pid),
			UsedGpuMemoryMiB: usedMiB,
		})
	}
	*p = out
	return nil
}

// readProcComm looks up the command name of pid via /proc/<pid>/comm and
// returns it with trailing newlines stripped. Any read failure (for example
// the process has exited, the PID is not visible from this namespace, or
// permission is denied) yields an empty string so callers can fall back to
// the PID alone.
func readProcComm(pid uint32) string {
	commPath := "/proc/" + fmt.Sprint(pid) + "/comm"
	raw, readErr := os.ReadFile(commPath)
	if readErr == nil {
		return strings.TrimRight(string(raw), "\n")
	}
	return ""
}
6 changes: 5 additions & 1 deletion config/default_user_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@ gpuevents:
mock: false
enable_metrics: true
ignore_namespaces: []
ignored_checkers: []
# GPUHang temporarily disabled: current rule has known false-positive
# sources (pviol thermal bug, rxpci/txpci delta semantics, strict 8/8 AND).
# See docs/gpu-hang-detection-summary.md before re-enabling.
ignored_checkers:
- "GPUHang"

podlog:
query_interval: 10s
Expand Down