diff --git a/src/arc_llama/cli.py b/src/arc_llama/cli.py index 994d98e..3029cad 100644 --- a/src/arc_llama/cli.py +++ b/src/arc_llama/cli.py @@ -35,7 +35,7 @@ init_config_from_detection, load_config, ) -from arc_llama.detect import detect_gpus, lspci_intel_gpus +from arc_llama.detect import detect_gpus, lspci_intel_gpus, render_nodes_in_dev from arc_llama.models import ( add_local_model, discover_ggufs, @@ -208,12 +208,90 @@ def doctor(ctx: click.Context) -> None: else: console.print(" lspci shows no Intel display devices either.") + # DRM device nodes in /dev/dri/ + console.print("\n /dev/dri/ render nodes:") + dev_dri = Path("/dev/dri") + if not dev_dri.exists(): + console.print( + " [red]/dev/dri/ does not exist[/red] — no DRM device nodes " + "present in this environment." + ) + console.print( + " In a container: pass the device through with --device=/dev/dri/") + else: + render_nodes = render_nodes_in_dev() + all_dri = sorted(dev_dri.iterdir()) if dev_dri.exists() else [] + if not render_nodes: + console.print( + " [yellow]no renderD* nodes found[/yellow] (all entries: " + + (", ".join(p.name for p in all_dri) or "none") + + ")" + ) + else: + for node in render_nodes: + try: + stat = node.stat() + mode = oct(stat.st_mode)[-3:] + except OSError: + mode = "???" + console.print(f" [green]{node}[/green] (mode={mode})") + + # Cross-reference: driver bound but render node absent + _missing_render = [ + g for g in gpus if g.driver in ("xe", "i915") and g.drm_render is None + ] + if _missing_render: + console.print() + for g in _missing_render: + console.print( + f" [red]WARNING[/red]: {g.name} has `{g.driver}` driver bound " + f"but no DRM render node was created." + ) + console.print( + " This is the most common cause of " + "\"No device of requested type available\" from SYCL.\n" + " Steps to resolve:\n" + " 1. Check render nodes: ls -la /dev/dri/\n" + " 2. Check driver init: dmesg | grep -E '(xe|i915|drm)' | tail -30\n" + " 3. In containers: ensure host /dev/dri is passed through " + "(--device=/dev/dri/ or --device=/dev/dri/renderD128)\n" + " 4. Add to groups: sudo usermod -aG render,video $USER (re-login)\n" + " 5. Reload driver: sudo modprobe -r xe && sudo modprobe xe" + ) + # External tools console.print("\n external tools:") for tool in ("clinfo", "sycl-ls", "intel_gpu_top", "nvtop", "lspci"): path = shutil.which(tool) console.print(f" {tool:<14} {path or '— missing —'}") + # sycl-ls device enumeration (most useful signal for "can SYCL see the GPU?") + sycl_ls_bin = shutil.which("sycl-ls") + console.print("\n sycl-ls device enumeration:") + if not sycl_ls_bin: + console.print(" [yellow]sycl-ls not found — install Intel oneAPI Base Toolkit[/yellow]") + else: + try: + sl = subprocess.run( + [sycl_ls_bin], capture_output=True, text=True, timeout=10 + ) + output = sl.stdout.strip() + if output: + for line in output.splitlines(): + console.print(f" {line}") + else: + console.print( + " [yellow](no output — SYCL sees no devices; " + "GPU/render-node access likely missing)[/yellow]" + ) + if sl.returncode != 0: + console.print(f" [yellow]sycl-ls exited {sl.returncode}[/yellow]") + if sl.stderr.strip(): + for line in sl.stderr.strip().splitlines()[:5]: + console.print(f" [dim]{line}[/dim]") + except subprocess.TimeoutExpired: + console.print(" [yellow]sycl-ls timed out[/yellow]") + # Permissions console.print("\n user groups:") try: @@ -227,8 +305,7 @@ def doctor(ctx: click.Context) -> None: console.print(f" {needed:<14} {marker}") if "render" not in groups or "video" not in groups: console.print( - " [yellow]→ add yourself with `sudo usermod -aG render,video $USER` " - "and re-login.[/yellow]" + " [yellow]→ sudo usermod -aG render,video $USER then log out and back in.[/yellow]" ) # oneAPI diff --git a/src/arc_llama/detect.py b/src/arc_llama/detect.py index 81921de..209eb4c 100644 --- a/src/arc_llama/detect.py +++ b/src/arc_llama/detect.py @@ -141,6 +141,12 @@ def _scan_pci() -> list[DetectedGPU]: ) if driver is None: gpu.notes.append("No kernel driver bound — install `xe` or `i915` modules.") + elif render is None: + gpu.notes.append( + f"driver '{driver}' bound but no render node found in sysfs " + "(DRM render node not created). " + "Check: ls /dev/dri/ and dmesg | grep -E '(xe|i915|drm)'" + ) found.append(gpu) for i, g in enumerate(found): g._index = i # type: ignore[attr-defined] @@ -225,6 +231,14 @@ def detect_gpus(enrich: bool = True) -> list[DetectedGPU]: return gpus +def render_nodes_in_dev() -> list[Path]: + """Return renderD* device nodes present under /dev/dri/, sorted by name.""" + dev_dri = Path("/dev/dri") + if not dev_dri.exists(): + return [] + return sorted(p for p in dev_dri.iterdir() if p.name.startswith("renderD")) + + def lspci_intel_gpus() -> str: """Return raw `lspci -nn` output filtered to Intel display devices. diff --git a/src/arc_llama/launcher.py b/src/arc_llama/launcher.py index 3c38b69..a79f9bd 100644 --- a/src/arc_llama/launcher.py +++ b/src/arc_llama/launcher.py @@ -16,6 +16,7 @@ import os import signal import subprocess +import tempfile import time from dataclasses import dataclass from pathlib import Path @@ -162,6 +163,48 @@ def build_plan( ) +def _surface_crash_logs(name: str, log_path: Path | None) -> None: + """Read the tail of the server's stderr log and emit diagnostic hints. + + Called when the subprocess exits before passing the health check. If the + log contains the canonical SYCL "no device" message we print an actionable + checklist so the user knows exactly what to fix without having to grep logs + themselves. + """ + if log_path is None or not log_path.exists(): + return + try: + with open(log_path, "rb") as f: + f.seek(0, 2) + size = f.tell() + f.seek(max(0, size - 8192)) + tail = f.read().decode("utf-8", errors="replace") + except Exception: + return + lines = [line for line in tail.splitlines() if line.strip()][-40:] + if not lines: + return + log.error("[%s] last output before crash:", name) + for line in lines: + log.error("[%s] %s", name, line) + combined = tail.lower() + if "no device of requested type available" in combined: + log.error( + "[%s] SYCL/level_zero found no compute device. Checklist:\n" + " 1. render nodes present? ls /dev/dri/renderD*\n" + " 2. user in render group? sudo usermod -aG render,video $USER (re-login)\n" + " 3. driver init errors? dmesg | grep -E '(xe|i915|drm)'\n" + " 4. device visible? sycl-ls\n" + " 5. full diagnostics: arc-llama doctor", + name, + ) + elif "level_zero" in combined and ("error" in combined or "failed" in combined): + log.error( + "[%s] level_zero adapter error — run `sycl-ls` and `arc-llama doctor`.", + name, + ) + + class LlamaServer: """One llama-server subprocess. Lifecycle: start → wait_ready → stop.""" @@ -170,6 +213,8 @@ def __init__(self, plan: LaunchPlan, name: str = "llama-server"): self.name = name self.process: subprocess.Popen[bytes] | None = None self.started_at: float | None = None + self._log_path: Path | None = None # path to current stderr/log file + self._is_tmp_log: bool = False # True when _log_path is a temp file @property def is_running(self) -> bool: @@ -181,11 +226,23 @@ def start(self, log_dir: Path | None = None) -> None: return stdout = subprocess.DEVNULL stderr = subprocess.DEVNULL + _tmp_fh = None # temp file handle to close after Popen if log_dir is not None: log_dir.mkdir(parents=True, exist_ok=True) log_path = log_dir / f"{self.name}.log" stdout = open(log_path, "ab") stderr = subprocess.STDOUT + self._log_path = log_path + self._is_tmp_log = False + else: + # Capture stderr to a temp file so wait_ready can surface crash + # messages such as "No device of requested type available". + _tmp_fh = tempfile.NamedTemporaryFile( + prefix=f"arc-llama-{self.name}-", suffix=".log", delete=False + ) + stderr = _tmp_fh + self._log_path = Path(_tmp_fh.name) + self._is_tmp_log = True log.info("[%s] starting: %s", self.name, " ".join(self.plan.argv)) self.process = subprocess.Popen( self.plan.argv, @@ -194,6 +251,9 @@ def start(self, log_dir: Path | None = None) -> None: stderr=stderr, preexec_fn=_preexec_isolate_and_pdeathsig, ) + # Parent closes its write copy; child keeps its own inherited fd. + if _tmp_fh is not None: + _tmp_fh.close() self.started_at = time.time() async def wait_ready(self, timeout: float = DEFAULT_HEALTH_TIMEOUT) -> bool: @@ -202,6 +262,7 @@ async def wait_ready(self, timeout: float = DEFAULT_HEALTH_TIMEOUT) -> bool: while time.time() < deadline: if not self.is_running: log.warning("[%s] process exited before becoming healthy", self.name) + _surface_crash_logs(self.name, self._log_path) return False try: r = await client.get(self.plan.health_url) @@ -236,3 +297,10 @@ def stop(self, drain_seconds: float = 3.0) -> None: pass self.process = None self.started_at = None + if self._is_tmp_log and self._log_path is not None: + try: + self._log_path.unlink(missing_ok=True) + except Exception: + pass + self._log_path = None + self._is_tmp_log = False diff --git a/src/arc_llama/server.py b/src/arc_llama/server.py index 3b7effb..5ad081b 100644 --- a/src/arc_llama/server.py +++ b/src/arc_llama/server.py @@ -242,7 +242,7 @@ async def admin_edit_model(name: str, request: Request) -> dict: try: ub = int(body["ubatch_size"]) except (TypeError, ValueError): - raise HTTPException(status_code=400, detail="ubatch_size must be an integer") + raise HTTPException(status_code=400, detail="ubatch_size must be an integer") from None if not (1 <= ub <= 4096): raise HTTPException(status_code=400, detail="ubatch_size must be 1..4096") recipe["ubatch_size"] = ub diff --git a/tests/test_launcher.py b/tests/test_launcher.py index 24a8544..921f244 100644 --- a/tests/test_launcher.py +++ b/tests/test_launcher.py @@ -191,7 +191,6 @@ async def test_wait_ready_true_when_healthy(self, monkeypatch: pytest.MonkeyPatc srv.started_at = 0.0 import httpx - original_get = httpx.AsyncClient.get async def _fake_get(self, url): if "/health" in url: diff --git a/tests/test_recipes.py b/tests/test_recipes.py index c05a0af..0d417f8 100644 --- a/tests/test_recipes.py +++ b/tests/test_recipes.py @@ -1,8 +1,6 @@ """Tests for arc_llama.recipes — VRAM math, recipe generation, KV sizing.""" from __future__ import annotations -import pytest - from arc_llama.arch import Arch from arc_llama.recipes import ( DEFAULT_CTX_CAP,