Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 151 additions & 6 deletions backend/pkg/libarcane/system/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bytes"
"context"
"encoding/csv"
"encoding/json"
"fmt"
"log/slog"
"os"
Expand Down Expand Up @@ -93,7 +94,10 @@ func (m *GPUMonitor) statsForTypeInternal(ctx context.Context, gpuType string) (
case "amd":
return getAMDStatsInternal(ctx)
case "intel":
return getIntelStatsInternal(ctx)
m.cacheMu.RLock()
toolPath := m.toolPath
m.cacheMu.RUnlock()
return getIntelStatsInternal(ctx, toolPath)
default:
return nil, fmt.Errorf("no supported GPU found")
}
Expand Down Expand Up @@ -138,7 +142,14 @@ func (m *GPUMonitor) detectInternal(ctx context.Context) error {
slog.InfoContext(ctx, "Using configured GPU type", "type", "intel")
return nil
}
return fmt.Errorf("intel_gpu_top not found but GPU_TYPE set to intel")
// intel_gpu_top absent — fall back to sysfs so the GPU still shows in the
// dashboard (ARM builds, minimal containers, Proxmox passthrough).
if HasIntelGPU() {
m.markDetectedInternal("intel", "")
slog.InfoContext(ctx, "Using configured GPU type via sysfs (intel_gpu_top not found)", "type", "intel")
return nil
}
return fmt.Errorf("intel_gpu_top not found and no Intel GPU detected in sysfs, but GPU_TYPE set to intel")
default:
slog.WarnContext(ctx, "Invalid GPU_TYPE specified, falling back to auto-detection", "gpu_type", t)
}
Expand All @@ -159,6 +170,12 @@ func (m *GPUMonitor) detectInternal(ctx context.Context) error {
slog.InfoContext(ctx, "Intel GPU detected", "tool", "intel_gpu_top", "path", path)
return nil
}
// Last resort: sysfs vendor-ID detection so the GPU shows up even without the tool.
if HasIntelGPU() {
m.markDetectedInternal("intel", "")
slog.InfoContext(ctx, "Intel GPU detected", "method", "sysfs")
return nil
}

m.detectionDone = true
return fmt.Errorf("no supported GPU found")
Expand Down Expand Up @@ -292,10 +309,138 @@ func getAMDStatsInternal(ctx context.Context) ([]systemtypes.GPUStats, error) {
return stats, nil
}

func getIntelStatsInternal(ctx context.Context) ([]systemtypes.GPUStats, error) {
stats := []systemtypes.GPUStats{
{Name: "Intel GPU", Index: 0, MemoryUsed: 0, MemoryTotal: 0},
// intelGPUTopOutput is the subset of intel_gpu_top JSON we care about.
// The memory block is only present on discrete GPUs (Intel Arc and later).
//
// Memory and Local are pointers so that a sample lacking either object decodes
// to nil; intelGPUTopMemoryInternal checks both for nil before reading them.
type intelGPUTopOutput struct {
	// Memory mirrors the top-level "memory" JSON object.
	Memory *struct {
		// Unit is the unit label for Total and Free. The consumer compares it
		// case-insensitively and scales "mib" and "gib" to bytes.
		Unit string `json:"unit"`
		// Local mirrors the nested "local" JSON object.
		// NOTE(review): presumably device-local VRAM — confirm against the
		// intel_gpu_top output of the targeted igt-gpu-tools version.
		Local *struct {
			// Total is the reported capacity, expressed in Unit.
			Total float64 `json:"total"`
			// Free is the reported unused amount, expressed in Unit.
			Free float64 `json:"free"`
		} `json:"local"`
	} `json:"memory"`
}

// findIntelDRICardsInternal lists the /dev/dri/cardN device paths that belong
// to Intel GPUs.
//
// It walks the entries of AMDGPUSysfsPath, keeps only names that start with
// "card" and contain no "-" (which filters out names like card0-DP-1), and
// selects the ones whose device/vendor file reads 0x8086, Intel's PCI vendor
// ID. Matching on the vendor instead of assuming card0 covers Proxmox and
// similar setups where card0 is a VirtIO display adapter and the real Arc GPU
// sits on card1 or higher.
//
// The result is nil when the directory cannot be read or nothing matches.
// Entries whose vendor file cannot be read are silently skipped.
func findIntelDRICardsInternal() []string {
	const intelVendorID = "0x8086"

	dirEntries, readErr := os.ReadDir(AMDGPUSysfsPath)
	if readErr != nil {
		return nil
	}

	var intelCards []string
	for _, dirEntry := range dirEntries {
		cardName := dirEntry.Name()

		isPrimaryCard := strings.HasPrefix(cardName, "card") && !strings.Contains(cardName, "-")
		if !isPrimaryCard {
			continue
		}

		vendorFile := fmt.Sprintf("%s/%s/device/vendor", AMDGPUSysfsPath, cardName)
		raw, err := os.ReadFile(vendorFile)
		if err != nil {
			continue
		}
		if strings.TrimSpace(string(raw)) != intelVendorID {
			continue
		}

		intelCards = append(intelCards, "/dev/dri/"+cardName)
	}
	return intelCards
}

// HasIntelGPU reports whether the DRM sysfs scan performed by
// findIntelDRICardsInternal finds at least one Intel card.
func HasIntelGPU() bool {
	intelCards := findIntelDRICardsInternal()
	return len(intelCards) != 0
}

// getIntelStatsInternal builds one GPUStats entry per Intel DRI card found by
// findIntelDRICardsInternal.
//
// Parameters:
//   - ctx: passed to the intel_gpu_top invocation and to the log calls.
//   - toolPath: path to intel_gpu_top. An empty string selects sysfs-only mode:
//     cards are still listed, but MemoryUsed and MemoryTotal keep their zero
//     values.
//
// Returns the collected entries, with Index counting Intel cards from 0 in
// discovery order, or an error when no Intel card is present in sysfs.
func getIntelStatsInternal(ctx context.Context, toolPath string) ([]systemtypes.GPUStats, error) {
	intelCards := findIntelDRICardsInternal()
	if len(intelCards) == 0 {
		return nil, fmt.Errorf("no Intel GPU found in sysfs")
	}

	// One entry per card, so the final length is known up front.
	stats := make([]systemtypes.GPUStats, 0, len(intelCards))
	for i, cardPath := range intelCards {
		entry := systemtypes.GPUStats{
			Name:  intelGPUNameInternal(cardPath),
			Index: i,
		}
		if toolPath != "" {
			// Memory figures are best-effort: a failed query is logged at debug
			// level and the card is still reported with zeroed memory fields.
			if mem, err := intelGPUTopMemoryInternal(ctx, toolPath, cardPath); err == nil {
				entry.MemoryUsed = mem.used
				entry.MemoryTotal = mem.total
			} else {
				slog.DebugContext(ctx, "intel_gpu_top memory query failed", "card", cardPath, "error", err)
			}
		}
		stats = append(stats, entry)
	}

	slog.DebugContext(ctx, "Collected Intel GPU stats", "gpu_count", len(stats))
	return stats, nil
}

// intelMemStats is the memory reading intelGPUTopMemoryInternal returns for a
// single card. total is the reported capacity and used is capacity minus the
// reported free amount; both are multiplied by the unit scale, which yields
// bytes when intel_gpu_top reports MiB or GiB (unknown units are left unscaled).
type intelMemStats struct{ used, total float64 }

// intelGPUTopMemoryInternal invokes intel_gpu_top against one card and converts
// the "memory.local" figures of its JSON output into an intelMemStats.
//
// Parameters:
//   - ctx: parent context; the invocation is additionally capped at 3 seconds.
//   - toolPath: path to the intel_gpu_top binary.
//   - cardPath: device node such as /dev/dri/card1, passed as "-d drm:<cardPath>".
//
// The output may be a single JSON object or a JSON array, in which case the
// first element is used. An error is returned when the command fails, the
// output cannot be parsed, the array is empty, or the memory/local objects are
// missing. The unit is matched case-insensitively: "mib" and "gib" are scaled
// to bytes, anything else triggers a warning and is used unscaled.
//
// NOTE(review): the igt-gpu-tools man page appears to list -c as the CSV output
// switch rather than a sample count — confirm that "-c 1" makes the installed
// intel_gpu_top exit after one JSON sample instead of running until the
// timeout kills it.
func intelGPUTopMemoryInternal(ctx context.Context, toolPath, cardPath string) (intelMemStats, error) {
	var none intelMemStats

	ctx, stop := context.WithTimeout(ctx, 3*time.Second)
	defer stop()

	args := []string{"-d", "drm:" + cardPath, "-J", "-s", "100", "-c", "1"}
	raw, runErr := exec.CommandContext(ctx, toolPath, args...).Output() //nolint:gosec // toolPath from exec.LookPath, cardPath from sysfs vendor check
	if runErr != nil {
		return none, fmt.Errorf("intel_gpu_top: %w", runErr)
	}

	// Accept either a bare object or an array of samples.
	payload := bytes.TrimSpace(raw)
	var sample intelGPUTopOutput
	if bytes.HasPrefix(payload, []byte("[")) {
		var samples []intelGPUTopOutput
		if err := json.Unmarshal(payload, &samples); err != nil {
			return none, fmt.Errorf("parse intel_gpu_top array: %w", err)
		}
		if len(samples) == 0 {
			return none, fmt.Errorf("parse intel_gpu_top array: empty output")
		}
		sample = samples[0]
	} else if err := json.Unmarshal(payload, &sample); err != nil {
		return none, fmt.Errorf("parse intel_gpu_top object: %w", err)
	}

	memory := sample.Memory
	if memory == nil || memory.Local == nil {
		return none, fmt.Errorf("no local memory info in intel_gpu_top output")
	}

	// Multipliers that turn the reported unit into bytes.
	bytesPerUnit := map[string]float64{
		"mib": 1024 * 1024,
		"gib": 1024 * 1024 * 1024,
	}
	unit := strings.ToLower(strings.TrimSpace(memory.Unit))
	scale, known := bytesPerUnit[unit]
	if !known {
		slog.WarnContext(ctx, "Unknown intel_gpu_top memory unit, treating as bytes", "unit", unit)
		scale = 1
	}

	local := memory.Local
	return intelMemStats{
		used:  (local.Total - local.Free) * scale,
		total: local.Total * scale,
	}, nil
}

// intelGPUNameInternal picks a display name for an Intel DRI card.
//
// cardPath is a device node such as /dev/dri/card1. The function reads
// <AMDGPUSysfsPath>/<cardN>/device/label and returns its trimmed content when
// the file is readable and non-blank; otherwise it returns the generic
// "Intel GPU (cardN)" form.
func intelGPUNameInternal(cardPath string) string {
	cardName := strings.TrimPrefix(cardPath, "/dev/dri/")
	fallback := fmt.Sprintf("Intel GPU (%s)", cardName)

	labelFile := fmt.Sprintf("%s/%s/device/label", AMDGPUSysfsPath, cardName)
	raw, err := os.ReadFile(labelFile)
	if err != nil {
		return fallback
	}

	label := strings.TrimSpace(string(raw))
	if label == "" {
		return fallback
	}
	return label
}
Loading