From 40b779c6defff6b13f01ba78ae8fa0a49af5731d Mon Sep 17 00:00:00 2001 From: Ho Lim Date: Wed, 17 Jun 2026 18:32:45 -0700 Subject: [PATCH] fix: detect gpu for toolkit cdi preflight --- src/lib/onboard.ts | 25 ++++--- .../machine/handlers/preflight.test.ts | 26 ++++++- src/lib/onboard/machine/handlers/preflight.ts | 8 ++- src/lib/onboard/preflight-cdi.test.ts | 50 ++++++++++++++ src/lib/onboard/preflight.ts | 69 ++++++++++++++----- 5 files changed, 142 insertions(+), 36 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index f2f5967d13..6b3d70a46e 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -1581,8 +1581,9 @@ function waitForSandboxReady(sandboxName: string, attempts = 10, delaySeconds = // ── Step 1: Preflight ──────────────────────────────────────────── -// Keep the Docker CDI guard near preflight so resume hits the same early failure path. -// Jetson/Tegra uses Docker's NVIDIA runtime backend and is exempt from CDI. +/** + * Fails onboarding when Docker CDI injection is configured but the NVIDIA GPU spec is invalid. + */ function assertCdiNvidiaGpuSpecPresent( host: ReturnType, optedOutGpuPassthrough: boolean, @@ -1605,11 +1606,9 @@ type PreflightOptions = Pick< optedOutGpuPassthrough?: boolean; }; -// Reject unsupported container runtimes (currently only Podman with the -// Linux Docker-driver gateway) before any Docker-specific probes. Both -// the fresh preflight and `--resume` backstop call this — if `docker` -// resolves to Podman, surface the unsupported-runtime message instead of -// running bridge/DNS diagnostics that would be misleading. +/** + * Rejects unsupported runtimes before Docker-specific bridge, DNS, and CDI probes. + */ function rejectUnsupportedContainerRuntime(host: ReturnType): void { if (isLinuxDockerDriverGatewayEnabled() && host.runtime === "podman") { console.error(` ✗ ${cliDisplayName()} onboarding now uses OpenShell's Docker driver.`); @@ -1619,13 +1618,14 @@ function rejectUnsupportedContainerRuntime(host: ReturnType): } } +/** + * Runs host preflight and blocks early on Docker, GPU, CDI, and runtime problems. + */ async function preflight( preflightOpts: PreflightOptions = {}, ): Promise> { step(1, 8, "Preflight checks"); - const host = assessHost(); - // Docker / runtime if (!host.dockerReachable) { console.error(" Docker is not reachable. Please fix Docker and try again."); @@ -1644,12 +1644,11 @@ async function preflight( device: preflightOpts.sandboxGpuDevice ?? null, }); exitOnSandboxGpuConfigErrors(sandboxGpuConfig); - const optedOutGpuPassthrough = + const explicitlyOptedOutGpuPassthrough = preflightOpts.optedOutGpuPassthrough === true || preflightOpts.noGpu === true || - !sandboxGpuConfig.sandboxGpuEnabled; - assertCdiNvidiaGpuSpecPresent(host, optedOutGpuPassthrough, sandboxGpuConfig.hostGpuPlatform); - + sandboxGpuConfig.mode === "0"; + assertCdiNvidiaGpuSpecPresent(host, explicitlyOptedOutGpuPassthrough, sandboxGpuConfig.hostGpuPlatform); assertDockerBridgeAndContainerDnsHealthy(host, isNonInteractive()); if (host.runtime !== "unknown") { diff --git a/src/lib/onboard/machine/handlers/preflight.test.ts b/src/lib/onboard/machine/handlers/preflight.test.ts index 15296f6369..5152e30a9a 100644 --- a/src/lib/onboard/machine/handlers/preflight.test.ts +++ b/src/lib/onboard/machine/handlers/preflight.test.ts @@ -186,13 +186,35 @@ describe("handlePreflightState", () => { expect(harness.deps.startRecordedStep).not.toHaveBeenCalled(); expect(harness.deps.assertCdiNvidiaGpuSpecPresent).toHaveBeenCalledWith( { cdiNvidiaGpuSpecMissing: false }, - true, + false, undefined, ); expect(harness.deps.validateSandboxGpuPreflight).toHaveBeenCalledOnce(); expect(result.resumePreflight).toBe(true); }); + it("keeps CDI guard active on resume when auto mode disables GPU after failed detection", async () => { + const session = createSession(); + session.steps.preflight.status = "complete"; + session.gpuPassthrough = false; + const assertCdiNvidiaGpuSpecPresent = vi.fn(); + const host = { cdiNvidiaGpuSpecMissing: true }; + const harness = createDeps({ + detectGpu: vi.fn(() => null), + getResumeSandboxGpuOverrides: vi.fn(() => ({ flag: null, device: null })), + resolveSandboxGpuConfig, + assessHost: () => host, + assertCdiNvidiaGpuSpecPresent, + }); + + await handlePreflightState({ + ...baseOptions(harness.deps, session), + resume: true, + }); + + expect(assertCdiNvidiaGpuSpecPresent).toHaveBeenCalledWith(host, false, null); + }); + it("passes host GPU platform into the resumed CDI guard", async () => { const session = createSession(); session.steps.preflight.status = "complete"; @@ -220,7 +242,7 @@ describe("handlePreflightState", () => { expect(assertCdiNvidiaGpuSpecPresent).toHaveBeenCalledWith( { cdiNvidiaGpuSpecMissing: false }, - true, + false, "jetson", ); }); diff --git a/src/lib/onboard/machine/handlers/preflight.ts b/src/lib/onboard/machine/handlers/preflight.ts index 10c7d0f235..6d4c5c85bd 100644 --- a/src/lib/onboard/machine/handlers/preflight.ts +++ b/src/lib/onboard/machine/handlers/preflight.ts @@ -97,6 +97,10 @@ function envHasSandboxGpuOverride(env: NodeJS.ProcessEnv): boolean { return env.NEMOCLAW_SANDBOX_GPU !== undefined || env.NEMOCLAW_SANDBOX_GPU_DEVICE !== undefined; } +/** + * Executes or revalidates the preflight state while preserving the user's + * effective sandbox GPU intent across fresh onboarding and resume. + */ export async function handlePreflightState< Gpu, SandboxEntry, @@ -143,9 +147,7 @@ export async function handlePreflightState< }); deps.validateSandboxGpuPreflight(resumeSandboxGpuConfig); const resumeOptedOutGpuPassthrough = - noGpu || - (!gpuRequested && session?.gpuPassthrough === false) || - !resumeSandboxGpuConfig.sandboxGpuEnabled; + noGpu || effectiveSandboxGpuFlag === "disable" || resumeSandboxGpuConfig.mode === "0"; const resumeHost = deps.assessHost(); // Reject unsupported runtimes (Podman) BEFORE the CDI GPU-spec // backstop and the Docker-specific bridge/DNS probes so Podman diff --git a/src/lib/onboard/preflight-cdi.test.ts b/src/lib/onboard/preflight-cdi.test.ts index 786fe3bee7..6701224db2 100644 --- a/src/lib/onboard/preflight-cdi.test.ts +++ b/src/lib/onboard/preflight-cdi.test.ts @@ -8,6 +8,9 @@ import { assessHost, planHostRemediation } from "../../../dist/lib/onboard/prefl type HostAssessment = Parameters[0]; +/** + * Creates a Linux Docker host assessment with NVIDIA CDI defaults for focused overrides. + */ function baseAssessment(overrides: Partial = {}): HostAssessment { return { platform: "linux", @@ -38,6 +41,9 @@ function baseAssessment(overrides: Partial = {}): HostAssessment }; } +/** + * Emulates the systemctl/stat probes needed by CDI staleness remediation tests. + */ function healthySystemctlAndStat(command: readonly string[]) { if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; if (command[0] === "systemctl" && command[1] === "is-active") return "active"; @@ -67,6 +73,50 @@ describe("assessHost — CDI", () => { expect(result.cdiNvidiaGpuSpecMissing).toBe(true); }); + it("plans toolkit bootstrap when PCI detects NVIDIA hardware but nvidia-smi and nvidia-ctk are absent", () => { + const result = assessHost({ + platform: "linux", + env: {}, + release: "6.8.0-58-generic", + readFileImpl: (filePath: string) => + filePath.endsWith("other.yaml") + ? "cdiVersion: 0.5.0\nkind: vendor.example/device\ndevices: []\n" + : "Linux version 6.8.0-58-generic", + readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["other.yaml"] : []), + runCaptureImpl: (command: readonly string[]) => { + if (command.join(" ").includes("apt-get")) return "/usr/bin/apt-get"; + if (command[0] === "lspci") { + return "01:00.0 VGA compatible controller: NVIDIA Corporation GA102 [GeForce RTX 3090]\n"; + } + if (command[0] === "systemctl" && command[1] === "is-active") return "active"; + if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled"; + return ""; + }, + dockerInfoOutput: JSON.stringify({ + ServerVersion: "27.0", + OperatingSystem: "Ubuntu 24.04", + CDISpecDirs: ["/etc/cdi", "/var/run/cdi"], + }), + commandExistsImpl: (name: string) => + name === "docker" || name === "lspci" || name === "systemctl", + }); + + expect(result.hasNvidiaGpu).toBe(true); + expect(result.nvidiaContainerToolkitInstalled).toBe(false); + expect(result.cdiNvidiaGpuSpecMissing).toBe(true); + + const action = planHostRemediation(result).find( + (entry: { id: string }) => entry.id === "install_nvidia_container_toolkit", + ); + expect(action).toBeTruthy(); + expect(action?.blocking).toBe(true); + expect(action?.commands).toContain("sudo apt-get install -y nvidia-container-toolkit"); + expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi generate"))).toBe( + true, + ); + expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe(true); + }); + it("does not flag the host when an nvidia.com/gpu YAML spec is present", () => { const result = assessHost({ platform: "linux", diff --git a/src/lib/onboard/preflight.ts b/src/lib/onboard/preflight.ts index e2bfaa35de..d0b429e126 100644 --- a/src/lib/onboard/preflight.ts +++ b/src/lib/onboard/preflight.ts @@ -385,11 +385,32 @@ function isHeadlessLikely(env: NodeJS.ProcessEnv): boolean { return !env.DISPLAY && !env.WAYLAND_DISPLAY && !env.TERM_PROGRAM; } -function detectNvidiaGpu(runCaptureImpl: RunCaptureFn): boolean { - if (!commandExists("nvidia-smi", runCaptureImpl)) { - return false; - } - return Boolean(String(runCaptureImpl(["nvidia-smi", "-L"], { ignoreError: true }) || "").trim()); +/** + * Detects NVIDIA hardware via nvidia-smi first, then Linux PCI data as a toolkit-free fallback. + */ +function detectNvidiaGpu(opts: { + platform: NodeJS.Platform | string; + isWsl: boolean; + runCaptureImpl: RunCaptureFn; + commandExistsImpl?: (commandName: string) => boolean; +}): boolean { + const commandExistsImpl = + opts.commandExistsImpl ?? + ((commandName: string) => commandExists(commandName, opts.runCaptureImpl)); + if (commandExistsImpl("nvidia-smi")) { + const smiOutput = opts.runCaptureImpl(["nvidia-smi", "-L"], { ignoreError: true }); + if (String(smiOutput || "").trim()) return true; + } + + if (opts.platform !== "linux" || opts.isWsl || !commandExistsImpl("lspci")) return false; + const pciOutput = opts.runCaptureImpl(["lspci", "-nn"], { ignoreError: true }); + return String(pciOutput || "") + .split(/\r?\n/) + .some( + (line) => + /nvidia/i.test(line) && + /(vga compatible controller|3d controller|display controller)/i.test(line), + ); } function detectPackageManager(runCaptureImpl: RunCaptureFn): PackageManager { @@ -447,6 +468,9 @@ export function buildContainerToolkitBootstrapCommands( ]; } +/** + * Builds the host capability snapshot used to plan preflight remediation. + */ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { const platform = opts.platform ?? process.platform; const env = opts.env ?? process.env; @@ -456,12 +480,33 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { runCapture(command, { ignoreError: options?.ignoreError ?? false })); const readFileImpl = opts.readFileImpl ?? fs.readFileSync; const readdirImpl = opts.readdirImpl ?? ((dir: string) => fs.readdirSync(dir)); + const shouldReadLinuxHostDetails = platform === "linux"; + const release = opts.release ?? (shouldReadLinuxHostDetails ? os.release() : ""); + const procVersion = + opts.procVersion ?? + (shouldReadLinuxHostDetails + ? (() => { + try { + return readFileImpl("/proc/version", "utf-8"); + } catch { + return ""; + } + })() + : ""); + const isWslHost = detectWsl({ platform, env, release, procVersion }); const dockerInstalled = opts.commandExistsImpl?.("docker") ?? commandExists("docker", runCaptureImpl); const nodeInstalled = opts.commandExistsImpl?.("node") ?? commandExists("node", runCaptureImpl); const openshellInstalled = opts.commandExistsImpl?.("openshell") ?? commandExists("openshell", runCaptureImpl); - const hasNvidiaGpu = opts.gpuProbeImpl?.() ?? detectNvidiaGpu(runCaptureImpl); + const hasNvidiaGpu = + opts.gpuProbeImpl?.() ?? + detectNvidiaGpu({ + platform, + isWsl: isWslHost, + runCaptureImpl, + commandExistsImpl: opts.commandExistsImpl, + }); const nvidiaContainerToolkitInstalled = opts.commandExistsImpl?.("nvidia-ctk") ?? commandExists("nvidia-ctk", runCaptureImpl); const packageManager = detectPackageManager(runCaptureImpl); @@ -480,22 +525,10 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment { dockerReachable = true; dockerRunning = true; } - - const release = opts.release ?? os.release(); - const procVersion = - opts.procVersion ?? - (() => { - try { - return readFileImpl("/proc/version", "utf-8"); - } catch { - return ""; - } - })(); let runtime = inferContainerRuntime(dockerInfoOutput); if (dockerReachable && runtime === "unknown" && platform === "linux") { runtime = "docker"; } - const isWslHost = detectWsl({ platform, env, release, procVersion }); const dockerCgroupVersion = dockerReachable ? parseDockerCgroupVersion(dockerInfoOutput) : "unknown";