Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 12 additions & 13 deletions src/lib/onboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1581,8 +1581,9 @@ function waitForSandboxReady(sandboxName: string, attempts = 10, delaySeconds =

// ── Step 1: Preflight ────────────────────────────────────────────

// Keep the Docker CDI guard near preflight so resume hits the same early failure path.
// Jetson/Tegra uses Docker's NVIDIA runtime backend and is exempt from CDI.
/**
* Fails onboarding when Docker CDI injection is configured but the NVIDIA GPU spec is invalid.
*/
function assertCdiNvidiaGpuSpecPresent(
host: ReturnType<typeof assessHost>,
optedOutGpuPassthrough: boolean,
Expand All @@ -1605,11 +1606,9 @@ type PreflightOptions = Pick<
optedOutGpuPassthrough?: boolean;
};

// Reject unsupported container runtimes (currently only Podman with the
// Linux Docker-driver gateway) before any Docker-specific probes. Both
// the fresh preflight and `--resume` backstop call this — if `docker`
// resolves to Podman, surface the unsupported-runtime message instead of
// running bridge/DNS diagnostics that would be misleading.
/**
* Rejects unsupported runtimes before Docker-specific bridge, DNS, and CDI probes.
*/
function rejectUnsupportedContainerRuntime(host: ReturnType<typeof assessHost>): void {
if (isLinuxDockerDriverGatewayEnabled() && host.runtime === "podman") {
console.error(` ✗ ${cliDisplayName()} onboarding now uses OpenShell's Docker driver.`);
Expand All @@ -1619,13 +1618,14 @@ function rejectUnsupportedContainerRuntime(host: ReturnType<typeof assessHost>):
}
}

/**
* Runs host preflight and blocks early on Docker, GPU, CDI, and runtime problems.
*/
async function preflight(
preflightOpts: PreflightOptions = {},
): Promise<ReturnType<typeof nim.detectGpu>> {
step(1, 8, "Preflight checks");

const host = assessHost();

// Docker / runtime
if (!host.dockerReachable) {
console.error(" Docker is not reachable. Please fix Docker and try again.");
Expand All @@ -1644,12 +1644,11 @@ async function preflight(
device: preflightOpts.sandboxGpuDevice ?? null,
});
exitOnSandboxGpuConfigErrors(sandboxGpuConfig);
const optedOutGpuPassthrough =
const explicitlyOptedOutGpuPassthrough =
preflightOpts.optedOutGpuPassthrough === true ||
preflightOpts.noGpu === true ||
!sandboxGpuConfig.sandboxGpuEnabled;
assertCdiNvidiaGpuSpecPresent(host, optedOutGpuPassthrough, sandboxGpuConfig.hostGpuPlatform);

sandboxGpuConfig.mode === "0";
assertCdiNvidiaGpuSpecPresent(host, explicitlyOptedOutGpuPassthrough, sandboxGpuConfig.hostGpuPlatform);
assertDockerBridgeAndContainerDnsHealthy(host, isNonInteractive());

if (host.runtime !== "unknown") {
Expand Down
26 changes: 24 additions & 2 deletions src/lib/onboard/machine/handlers/preflight.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,35 @@ describe("handlePreflightState", () => {
expect(harness.deps.startRecordedStep).not.toHaveBeenCalled();
expect(harness.deps.assertCdiNvidiaGpuSpecPresent).toHaveBeenCalledWith(
{ cdiNvidiaGpuSpecMissing: false },
true,
false,
undefined,
);
expect(harness.deps.validateSandboxGpuPreflight).toHaveBeenCalledOnce();
expect(result.resumePreflight).toBe(true);
});

it("keeps CDI guard active on resume when auto mode disables GPU after failed detection", async () => {
  // Resumed session: preflight already finished and GPU passthrough was recorded as off.
  const resumedSession = createSession();
  resumedSession.steps.preflight.status = "complete";
  resumedSession.gpuPassthrough = false;

  // Spy standing in for the CDI guard, plus a host that reports the NVIDIA CDI spec as missing.
  const cdiGuard = vi.fn();
  const hostWithMissingSpec = { cdiNvidiaGpuSpecMissing: true };

  // GPU detection finds nothing and no resume overrides are supplied.
  const { deps } = createDeps({
    detectGpu: vi.fn(() => null),
    getResumeSandboxGpuOverrides: vi.fn(() => ({ flag: null, device: null })),
    resolveSandboxGpuConfig,
    assessHost: () => hostWithMissingSpec,
    assertCdiNvidiaGpuSpecPresent: cdiGuard,
  });

  const resumeOptions = { ...baseOptions(deps, resumedSession), resume: true };
  await handlePreflightState(resumeOptions);

  // The guard must receive `false` for the opted-out argument, i.e. it is not bypassed.
  expect(cdiGuard).toHaveBeenCalledWith(hostWithMissingSpec, false, null);
});

it("passes host GPU platform into the resumed CDI guard", async () => {
const session = createSession();
session.steps.preflight.status = "complete";
Expand Down Expand Up @@ -220,7 +242,7 @@ describe("handlePreflightState", () => {

expect(assertCdiNvidiaGpuSpecPresent).toHaveBeenCalledWith(
{ cdiNvidiaGpuSpecMissing: false },
true,
false,
"jetson",
);
});
Expand Down
8 changes: 5 additions & 3 deletions src/lib/onboard/machine/handlers/preflight.ts
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ function envHasSandboxGpuOverride(env: NodeJS.ProcessEnv): boolean {
return env.NEMOCLAW_SANDBOX_GPU !== undefined || env.NEMOCLAW_SANDBOX_GPU_DEVICE !== undefined;
}

/**
* Executes or revalidates the preflight state while preserving the user's
* effective sandbox GPU intent across fresh onboarding and resume.
*/
export async function handlePreflightState<
Gpu,
SandboxEntry,
Expand Down Expand Up @@ -143,9 +147,7 @@ export async function handlePreflightState<
});
deps.validateSandboxGpuPreflight(resumeSandboxGpuConfig);
const resumeOptedOutGpuPassthrough =
noGpu ||
(!gpuRequested && session?.gpuPassthrough === false) ||
!resumeSandboxGpuConfig.sandboxGpuEnabled;
noGpu || effectiveSandboxGpuFlag === "disable" || resumeSandboxGpuConfig.mode === "0";
const resumeHost = deps.assessHost();
// Reject unsupported runtimes (Podman) BEFORE the CDI GPU-spec
// backstop and the Docker-specific bridge/DNS probes so Podman
Expand Down
50 changes: 50 additions & 0 deletions src/lib/onboard/preflight-cdi.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import { assessHost, planHostRemediation } from "../../../dist/lib/onboard/prefl

// Host assessment shape, derived from the first parameter of planHostRemediation.
type HostAssessment = Parameters<typeof planHostRemediation>[0];

/**
* Creates a Linux Docker host assessment with NVIDIA CDI defaults for focused overrides.
*/
function baseAssessment(overrides: Partial<HostAssessment> = {}): HostAssessment {
return {
platform: "linux",
Expand Down Expand Up @@ -38,6 +41,9 @@ function baseAssessment(overrides: Partial<HostAssessment> = {}): HostAssessment
};
}

/**
* Emulates the systemctl/stat probes needed by CDI staleness remediation tests.
*/
function healthySystemctlAndStat(command: readonly string[]) {
if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled";
if (command[0] === "systemctl" && command[1] === "is-active") return "active";
Expand Down Expand Up @@ -67,6 +73,50 @@ describe("assessHost — CDI", () => {
expect(result.cdiNvidiaGpuSpecMissing).toBe(true);
});

it("plans toolkit bootstrap when PCI detects NVIDIA hardware but nvidia-smi and nvidia-ctk are absent", () => {
  // Host fixture: only docker, lspci, and systemctl resolve as commands, so neither
  // nvidia-smi nor nvidia-ctk is available. /etc/cdi holds a single spec whose kind is
  // vendor.example/device rather than an NVIDIA one.
  const result = assessHost({
    platform: "linux",
    env: {},
    release: "6.8.0-58-generic",
    readFileImpl: (filePath: string) =>
      filePath.endsWith("other.yaml")
        ? "cdiVersion: 0.5.0\nkind: vendor.example/device\ndevices: []\n"
        : "Linux version 6.8.0-58-generic",
    readdirImpl: (dir: string) => (dir === "/etc/cdi" ? ["other.yaml"] : []),
    runCaptureImpl: (command: readonly string[]) => {
      if (command.join(" ").includes("apt-get")) return "/usr/bin/apt-get";
      if (command[0] === "lspci") {
        // PCI listing is the only source that reveals the NVIDIA card in this fixture.
        return "01:00.0 VGA compatible controller: NVIDIA Corporation GA102 [GeForce RTX 3090]\n";
      }
      if (command[0] === "systemctl" && command[1] === "is-active") return "active";
      if (command[0] === "systemctl" && command[1] === "is-enabled") return "enabled";
      return "";
    },
    dockerInfoOutput: JSON.stringify({
      ServerVersion: "27.0",
      OperatingSystem: "Ubuntu 24.04",
      CDISpecDirs: ["/etc/cdi", "/var/run/cdi"],
    }),
    commandExistsImpl: (name: string) =>
      name === "docker" || name === "lspci" || name === "systemctl",
  });

  // The GPU is detected, the toolkit is not, and the NVIDIA CDI spec is flagged missing.
  expect(result.hasNvidiaGpu).toBe(true);
  expect(result.nvidiaContainerToolkitInstalled).toBe(false);
  expect(result.cdiNvidiaGpuSpecMissing).toBe(true);

  // Remediation must include a blocking toolkit install that also generates and lists CDI specs.
  const action = planHostRemediation(result).find(
    (entry: { id: string }) => entry.id === "install_nvidia_container_toolkit",
  );
  expect(action).toBeTruthy();
  expect(action?.blocking).toBe(true);
  expect(action?.commands).toContain("sudo apt-get install -y nvidia-container-toolkit");
  expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi generate"))).toBe(
    true,
  );
  expect(action?.commands.some((command) => command.includes("nvidia-ctk cdi list"))).toBe(true);
});

it("does not flag the host when an nvidia.com/gpu YAML spec is present", () => {
const result = assessHost({
platform: "linux",
Expand Down
69 changes: 51 additions & 18 deletions src/lib/onboard/preflight.ts
Original file line number Diff line number Diff line change
Expand Up @@ -385,11 +385,32 @@ function isHeadlessLikely(env: NodeJS.ProcessEnv): boolean {
return !env.DISPLAY && !env.WAYLAND_DISPLAY && !env.TERM_PROGRAM;
}

function detectNvidiaGpu(runCaptureImpl: RunCaptureFn): boolean {
if (!commandExists("nvidia-smi", runCaptureImpl)) {
return false;
}
return Boolean(String(runCaptureImpl(["nvidia-smi", "-L"], { ignoreError: true }) || "").trim());
/**
 * Reports whether the host appears to contain an NVIDIA GPU.
 *
 * `nvidia-smi -L` is tried first when the command exists; any non-blank output
 * counts as a detection. Otherwise, on Linux hosts that are not WSL and that have
 * `lspci`, the `lspci -nn` listing is scanned for a line naming NVIDIA together
 * with a VGA, 3D, or display controller class.
 *
 * @param opts.platform Platform identifier; the PCI fallback runs only for "linux".
 * @param opts.isWsl When true, the PCI fallback is skipped.
 * @param opts.runCaptureImpl Runs a command and yields its captured output.
 * @param opts.commandExistsImpl Optional command lookup; defaults to `commandExists`
 *   driven by `runCaptureImpl`.
 * @returns True when either probe finds NVIDIA hardware, false otherwise.
 */
function detectNvidiaGpu(opts: {
  platform: NodeJS.Platform | string;
  isWsl: boolean;
  runCaptureImpl: RunCaptureFn;
  commandExistsImpl?: (commandName: string) => boolean;
}): boolean {
  const hasCommand =
    opts.commandExistsImpl ??
    ((commandName: string) => commandExists(commandName, opts.runCaptureImpl));
  // Probe failures are tolerated; a missing or falsy result is treated as empty output.
  const capture = (command: string[]): string =>
    String(opts.runCaptureImpl(command, { ignoreError: true }) || "");

  // Primary probe: the driver utility lists at least one device.
  if (hasCommand("nvidia-smi") && capture(["nvidia-smi", "-L"]).trim().length > 0) {
    return true;
  }

  // Fallback probe applies only to non-WSL Linux hosts that ship lspci.
  const pciFallbackApplies = opts.platform === "linux" && !opts.isWsl;
  if (!pciFallbackApplies || !hasCommand("lspci")) {
    return false;
  }

  const vendorPattern = /nvidia/i;
  const controllerPattern = /(vga compatible controller|3d controller|display controller)/i;
  for (const pciLine of capture(["lspci", "-nn"]).split(/\r?\n/)) {
    // Both the vendor and a graphics controller class must appear on the same line.
    if (vendorPattern.test(pciLine) && controllerPattern.test(pciLine)) {
      return true;
    }
  }
  return false;
}

function detectPackageManager(runCaptureImpl: RunCaptureFn): PackageManager {
Expand Down Expand Up @@ -447,6 +468,9 @@ export function buildContainerToolkitBootstrapCommands(
];
}

/**
* Builds the host capability snapshot used to plan preflight remediation.
*/
export function assessHost(opts: AssessHostOpts = {}): HostAssessment {
const platform = opts.platform ?? process.platform;
const env = opts.env ?? process.env;
Expand All @@ -456,12 +480,33 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment {
runCapture(command, { ignoreError: options?.ignoreError ?? false }));
const readFileImpl = opts.readFileImpl ?? fs.readFileSync;
const readdirImpl = opts.readdirImpl ?? ((dir: string) => fs.readdirSync(dir));
const shouldReadLinuxHostDetails = platform === "linux";
const release = opts.release ?? (shouldReadLinuxHostDetails ? os.release() : "");
const procVersion =
opts.procVersion ??
(shouldReadLinuxHostDetails
? (() => {
try {
return readFileImpl("/proc/version", "utf-8");
} catch {
return "";
}
})()
: "");
const isWslHost = detectWsl({ platform, env, release, procVersion });
const dockerInstalled =
opts.commandExistsImpl?.("docker") ?? commandExists("docker", runCaptureImpl);
const nodeInstalled = opts.commandExistsImpl?.("node") ?? commandExists("node", runCaptureImpl);
const openshellInstalled =
opts.commandExistsImpl?.("openshell") ?? commandExists("openshell", runCaptureImpl);
const hasNvidiaGpu = opts.gpuProbeImpl?.() ?? detectNvidiaGpu(runCaptureImpl);
const hasNvidiaGpu =
opts.gpuProbeImpl?.() ??
detectNvidiaGpu({
platform,
isWsl: isWslHost,
runCaptureImpl,
commandExistsImpl: opts.commandExistsImpl,
});
const nvidiaContainerToolkitInstalled =
opts.commandExistsImpl?.("nvidia-ctk") ?? commandExists("nvidia-ctk", runCaptureImpl);
const packageManager = detectPackageManager(runCaptureImpl);
Expand All @@ -480,22 +525,10 @@ export function assessHost(opts: AssessHostOpts = {}): HostAssessment {
dockerReachable = true;
dockerRunning = true;
}

const release = opts.release ?? os.release();
const procVersion =
opts.procVersion ??
(() => {
try {
return readFileImpl("/proc/version", "utf-8");
} catch {
return "";
}
})();
let runtime = inferContainerRuntime(dockerInfoOutput);
if (dockerReachable && runtime === "unknown" && platform === "linux") {
runtime = "docker";
}
const isWslHost = detectWsl({ platform, env, release, procVersion });
const dockerCgroupVersion = dockerReachable
? parseDockerCgroupVersion(dockerInfoOutput)
: "unknown";
Expand Down
Loading