diff --git a/backend/pkg/libarcane/system/gpu.go b/backend/pkg/libarcane/system/gpu.go index a785391b08..ff374a127d 100644 --- a/backend/pkg/libarcane/system/gpu.go +++ b/backend/pkg/libarcane/system/gpu.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/csv" + "encoding/json" "fmt" "log/slog" "os" @@ -93,7 +94,10 @@ func (m *GPUMonitor) statsForTypeInternal(ctx context.Context, gpuType string) ( case "amd": return getAMDStatsInternal(ctx) case "intel": - return getIntelStatsInternal(ctx) + m.cacheMu.RLock() + toolPath := m.toolPath + m.cacheMu.RUnlock() + return getIntelStatsInternal(ctx, toolPath) default: return nil, fmt.Errorf("no supported GPU found") } @@ -138,7 +142,14 @@ func (m *GPUMonitor) detectInternal(ctx context.Context) error { slog.InfoContext(ctx, "Using configured GPU type", "type", "intel") return nil } - return fmt.Errorf("intel_gpu_top not found but GPU_TYPE set to intel") + // intel_gpu_top absent — fall back to sysfs so the GPU still shows in the + // dashboard (ARM builds, minimal containers, Proxmox passthrough). + if HasIntelGPU() { + m.markDetectedInternal("intel", "") + slog.InfoContext(ctx, "Using configured GPU type via sysfs (intel_gpu_top not found)", "type", "intel") + return nil + } + return fmt.Errorf("intel_gpu_top not found and no Intel GPU detected in sysfs, but GPU_TYPE set to intel") default: slog.WarnContext(ctx, "Invalid GPU_TYPE specified, falling back to auto-detection", "gpu_type", t) } @@ -159,6 +170,12 @@ func (m *GPUMonitor) detectInternal(ctx context.Context) error { slog.InfoContext(ctx, "Intel GPU detected", "tool", "intel_gpu_top", "path", path) return nil } + // Last resort: sysfs vendor-ID detection so the GPU shows up even without the tool. 
+ if HasIntelGPU() { + m.markDetectedInternal("intel", "") + slog.InfoContext(ctx, "Intel GPU detected", "method", "sysfs") + return nil + } m.detectionDone = true return fmt.Errorf("no supported GPU found") @@ -292,10 +309,138 @@ func getAMDStatsInternal(ctx context.Context) ([]systemtypes.GPUStats, error) { return stats, nil } -func getIntelStatsInternal(ctx context.Context) ([]systemtypes.GPUStats, error) { - stats := []systemtypes.GPUStats{ - {Name: "Intel GPU", Index: 0, MemoryUsed: 0, MemoryTotal: 0}, +// intelGPUTopOutput is the subset of intel_gpu_top JSON we care about. +// The memory block is only present on discrete GPUs (Intel Arc and later). +type intelGPUTopOutput struct { + Memory *struct { + Unit string `json:"unit"` + Local *struct { + Total float64 `json:"total"` + Free float64 `json:"free"` + } `json:"local"` + } `json:"memory"` +} + +// findIntelDRICardsInternal returns /dev/dri/cardN paths for cards whose PCI vendor +// is Intel (0x8086). This handles Proxmox and similar setups where card0 is a +// VirtIO display adapter and the real Arc GPU sits on card1 or higher. +func findIntelDRICardsInternal() []string { + entries, err := os.ReadDir(AMDGPUSysfsPath) + if err != nil { + return nil + } + var cards []string + for _, entry := range entries { + name := entry.Name() + if !strings.HasPrefix(name, "card") || strings.Contains(name, "-") { + continue + } + data, err := os.ReadFile(fmt.Sprintf("%s/%s/device/vendor", AMDGPUSysfsPath, name)) + if err != nil { + continue + } + if strings.TrimSpace(string(data)) == "0x8086" { + cards = append(cards, fmt.Sprintf("/dev/dri/%s", name)) + } + } + return cards +} + +// HasIntelGPU reports whether at least one Intel GPU is present via DRM sysfs. +func HasIntelGPU() bool { + return len(findIntelDRICardsInternal()) > 0 +} + +// getIntelStatsInternal iterates over every Intel DRI card and collects VRAM stats. +// toolPath is the path to intel_gpu_top (empty string → sysfs-only, no memory stats). 
+func getIntelStatsInternal(ctx context.Context, toolPath string) ([]systemtypes.GPUStats, error) {
+	// Enumerate the Intel cards first; with none present there is nothing to report.
+	cards := findIntelDRICardsInternal()
+	if len(cards) == 0 {
+		return nil, fmt.Errorf("no Intel GPU found in sysfs")
 	}
-	slog.DebugContext(ctx, "Intel GPU detected but detailed stats not yet implemented")
+
+	// An empty toolPath means memory figures are never queried and stay at their zero value.
+	queryMemory := toolPath != ""
+	stats := make([]systemtypes.GPUStats, 0, len(cards))
+	for idx, card := range cards {
+		gpu := systemtypes.GPUStats{Name: intelGPUNameInternal(card), Index: idx}
+		if queryMemory {
+			mem, err := intelGPUTopMemoryInternal(ctx, toolPath, card)
+			if err != nil {
+				// A failed query is not fatal: the card is still listed, just without memory data.
+				slog.DebugContext(ctx, "intel_gpu_top memory query failed", "card", card, "error", err)
+			} else {
+				gpu.MemoryUsed, gpu.MemoryTotal = mem.used, mem.total
+			}
+		}
+		stats = append(stats, gpu)
+	}
+
+	slog.DebugContext(ctx, "Collected Intel GPU stats", "gpu_count", len(stats))
 	return stats, nil
 }
+
+type intelMemStats struct{ used, total float64 }
+
+// intelGPUTopMemoryInternal runs intel_gpu_top for a single card and returns VRAM stats.
+func intelGPUTopMemoryInternal(ctx context.Context, toolPath, cardPath string) (intelMemStats, error) {
+	// Bound the child process so a stuck tool cannot stall stats collection.
+	runCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
+	defer cancel()
+
+	// NOTE(review): upstream igt-gpu-tools documents -c as "CSV output", not a
+	// sample count (newer releases appear to use -n <samples>) — confirm against
+	// the intel_gpu_top version this is deployed with.
+	cmd := exec.CommandContext(runCtx, toolPath, //nolint:gosec // toolPath from exec.LookPath, cardPath from sysfs vendor check
+		"-d", fmt.Sprintf("drm:%s", cardPath),
+		"-J", "-s", "100", "-c", "1")
+	out, err := cmd.Output()
+	if err != nil {
+		// A child killed by the context only reports "signal: killed"; surface
+		// the deadline or cancellation that caused it as the wrapped error.
+		if ctxErr := runCtx.Err(); ctxErr != nil {
+			return intelMemStats{}, fmt.Errorf("intel_gpu_top: %w (%v)", ctxErr, err)
+		}
+		return intelMemStats{}, fmt.Errorf("intel_gpu_top: %w", err)
+	}
+	return parseIntelGPUTopMemoryInternal(ctx, out)
+}
+
+// parseIntelGPUTopMemoryInternal decodes intel_gpu_top JSON output and converts the
+// local-memory figures to bytes. The payload may be a single object or an array
+// of samples, in which case the first sample is used. An error is returned for
+// empty output, malformed JSON, or output without a memory.local block.
+func parseIntelGPUTopMemoryInternal(ctx context.Context, raw []byte) (intelMemStats, error) {
+	data := bytes.TrimSpace(raw)
+	if len(data) == 0 {
+		return intelMemStats{}, fmt.Errorf("parse intel_gpu_top output: empty output")
+	}
+
+	var sample intelGPUTopOutput
+	if data[0] == '[' {
+		var samples []intelGPUTopOutput
+		if err := json.Unmarshal(data, &samples); err != nil {
+			return intelMemStats{}, fmt.Errorf("parse intel_gpu_top array: %w", err)
+		}
+		if len(samples) == 0 {
+			return intelMemStats{}, fmt.Errorf("parse intel_gpu_top array: empty output")
+		}
+		sample = samples[0]
+	} else if err := json.Unmarshal(data, &sample); err != nil {
+		return intelMemStats{}, fmt.Errorf("parse intel_gpu_top object: %w", err)
+	}
+
+	if sample.Memory == nil || sample.Memory.Local == nil {
+		return intelMemStats{}, fmt.Errorf("no local memory info in intel_gpu_top output")
+	}
+	local := sample.Memory.Local
+
+	unit := strings.ToLower(strings.TrimSpace(sample.Memory.Unit))
+	var scale float64
+	switch unit {
+	case "b", "bytes":
+		scale = 1
+	case "kib":
+		scale = 1024
+	case "mib":
+		scale = 1024 * 1024
+	case "gib":
+		scale = 1024 * 1024 * 1024
+	default:
+		slog.WarnContext(ctx, "Unknown intel_gpu_top memory unit, treating as bytes", "unit", unit)
+		scale = 1
+	}
+
+	total := local.Total * scale
+	// Clamp at zero so an inconsistent report (free > total) never yields negative usage.
+	used := max((local.Total-local.Free)*scale, 0)
+	return intelMemStats{used: used, total: total}, nil
+}
+
+// intelGPUNameInternal returns a human-readable label for an Intel DRI card.
+// Prefers the DRM "label" sysfs attribute; falls back to "Intel GPU (cardN)".
+func intelGPUNameInternal(cardPath string) string {
+	// Reduce "/dev/dri/cardN" to "cardN" for both the sysfs lookup and the fallback text.
+	node := strings.TrimPrefix(cardPath, "/dev/dri/")
+	fallback := fmt.Sprintf("Intel GPU (%s)", node)
+
+	labelFile := fmt.Sprintf("%s/%s/device/label", AMDGPUSysfsPath, node)
+	raw, err := os.ReadFile(labelFile)
+	if err != nil {
+		// No readable label attribute for this card: use the generic name.
+		return fallback
+	}
+
+	// A label that is blank after trimming is treated the same as a missing one.
+	name := strings.TrimSpace(string(raw))
+	if name == "" {
+		return fallback
+	}
+	return name
+}