offbyonebit · offbyonebit · May 27, 2026 · May 27, 2026
diff --git a/src/arc_llama/cli.py b/src/arc_llama/cli.py
@@ -35,7 +35,7 @@
     init_config_from_detection,
     load_config,
 )
-from arc_llama.detect import detect_gpus, lspci_intel_gpus
+from arc_llama.detect import detect_gpus, lspci_intel_gpus, render_nodes_in_dev
 from arc_llama.models import (
     add_local_model,
     discover_ggufs,
@@ -208,12 +208,90 @@ def doctor(ctx: click.Context) -> None:
         else:
             console.print("    lspci shows no Intel display devices either.")
 
+    # DRM device nodes in /dev/dri/
+    console.print("\n  /dev/dri/ render nodes:")
+    dev_dri = Path("/dev/dri")
+    if not dev_dri.exists():
+        console.print(
+            "    [red]/dev/dri/ does not exist[/red] — no DRM device nodes "
+            "present in this environment."
+        )
+        console.print(
+            "    In a container: pass the device through with --device=/dev/dri/")
+    else:
+        render_nodes = render_nodes_in_dev()
+        all_dri = sorted(dev_dri.iterdir()) if dev_dri.exists() else []
+        if not render_nodes:
+            console.print(
+                "    [yellow]no renderD* nodes found[/yellow] (all entries: "
+                + (", ".join(p.name for p in all_dri) or "none")
+                + ")"
+            )
+        else:
+            for node in render_nodes:
+                try:
+                    stat = node.stat()
+                    mode = oct(stat.st_mode)[-3:]
+                except OSError:
+                    mode = "???"
+                console.print(f"    [green]{node}[/green]  (mode={mode})")
+
+    # Cross-reference: driver bound but render node absent
+    _missing_render = [
+        g for g in gpus if g.driver in ("xe", "i915") and g.drm_render is None
+    ]
+    if _missing_render:
+        console.print()
+        for g in _missing_render:
+            console.print(
+                f"  [red]WARNING[/red]: {g.name} has `{g.driver}` driver bound "
+                f"but no DRM render node was created."
+            )
+        console.print(
+            "  This is the most common cause of "
+            "\"No device of requested type available\" from SYCL.\n"
+            "  Steps to resolve:\n"
+            "    1. Check render nodes:  ls -la /dev/dri/\n"
+            "    2. Check driver init:   dmesg | grep -E '(xe|i915|drm)' | tail -30\n"
+            "    3. In containers:       ensure host /dev/dri is passed through "
+            "(--device=/dev/dri/ or --device=/dev/dri/renderD128)\n"
+            "    4. Add to groups:       sudo usermod -aG render,video $USER  (re-login)\n"
+            "    5. Reload driver:       sudo modprobe -r xe && sudo modprobe xe"
+        )
+
     # External tools
     console.print("\n  external tools:")
     for tool in ("clinfo", "sycl-ls", "intel_gpu_top", "nvtop", "lspci"):
         path = shutil.which(tool)
         console.print(f"    {tool:<14} {path or '— missing —'}")
 
+    # sycl-ls device enumeration (most useful signal for "can SYCL see the GPU?")
+    sycl_ls_bin = shutil.which("sycl-ls")
+    console.print("\n  sycl-ls device enumeration:")
+    if not sycl_ls_bin:
+        console.print("    [yellow]sycl-ls not found — install Intel oneAPI Base Toolkit[/yellow]")
+    else:
+        try:
+            sl = subprocess.run(
+                [sycl_ls_bin], capture_output=True, text=True, timeout=10
+            )
+            output = sl.stdout.strip()
+            if output:
+                for line in output.splitlines():
+                    console.print(f"    {line}")
+            else:
+                console.print(
+                    "    [yellow](no output — SYCL sees no devices; "
+                    "GPU/render-node access likely missing)[/yellow]"
+                )
+            if sl.returncode != 0:
+                console.print(f"    [yellow]sycl-ls exited {sl.returncode}[/yellow]")
+                if sl.stderr.strip():
+                    for line in sl.stderr.strip().splitlines()[:5]:
+                        console.print(f"    [dim]{line}[/dim]")
+        except subprocess.TimeoutExpired:
+            console.print("    [yellow]sycl-ls timed out[/yellow]")
+
     # Permissions
     console.print("\n  user groups:")
     try:
@@ -227,8 +305,7 @@ def doctor(ctx: click.Context) -> None:
         console.print(f"    {needed:<14} {marker}")
     if "render" not in groups or "video" not in groups:
         console.print(
-            "    [yellow]→ add yourself with `sudo usermod -aG render,video $USER` "
-            "and re-login.[/yellow]"
+            "    [yellow]→ sudo usermod -aG render,video $USER  then log out and back in.[/yellow]"
         )
 
     # oneAPI

diff --git a/src/arc_llama/detect.py b/src/arc_llama/detect.py
@@ -141,6 +141,12 @@ def _scan_pci() -> list[DetectedGPU]:
         )
         if driver is None:
             gpu.notes.append("No kernel driver bound — install `xe` or `i915` modules.")
+        elif render is None:
+            gpu.notes.append(
+                f"driver '{driver}' bound but no render node found in sysfs "
+                "(DRM render node not created). "
+                "Check: ls /dev/dri/  and  dmesg | grep -E '(xe|i915|drm)'"
+            )
         found.append(gpu)
     for i, g in enumerate(found):
         g._index = i  # type: ignore[attr-defined]
@@ -225,6 +231,14 @@ def detect_gpus(enrich: bool = True) -> list[DetectedGPU]:
     return gpus
 
 
+def render_nodes_in_dev() -> list[Path]:
+    """Return renderD* nodes under /dev/dri/ sorted by name ([] if absent/unreadable)."""
+    try:
+        return sorted(p for p in Path("/dev/dri").iterdir() if p.name.startswith("renderD"))
+    except OSError:  # missing, not a directory, or permission denied: report no nodes
+        return []
+
+
 def lspci_intel_gpus() -> str:
     """Return raw `lspci -nn` output filtered to Intel display devices.
 

diff --git a/src/arc_llama/launcher.py b/src/arc_llama/launcher.py
@@ -16,6 +16,7 @@
 import os
 import signal
 import subprocess
+import tempfile
 import time
 from dataclasses import dataclass
 from pathlib import Path
@@ -162,6 +163,48 @@ def build_plan(
     )
 
 
+def _surface_crash_logs(name: str, log_path: Path | None) -> None:
+    """Log the tail of the server's captured output and emit diagnostic hints.
+
+    Called when the subprocess exits before passing the health check.  Echoes up
+    to the last 40 non-blank lines found in the final 8 KiB of ``log_path``; if
+    that tail contains the canonical SYCL "no device" message, an actionable
+    checklist follows.  Best-effort: a missing or unreadable log is skipped.
+    """
+    if log_path is None:
+        return
+    try:
+        with open(log_path, "rb") as f:
+            size = f.seek(0, os.SEEK_END)  # seek() returns the new offset
+            f.seek(max(0, size - 8192))
+            tail = f.read().decode("utf-8", errors="replace")
+    except OSError:  # log vanished or is unreadable; nothing to surface
+        return
+    lines = [line for line in tail.splitlines() if line.strip()][-40:]
+    if not lines:
+        return
+    log.error("[%s] last output before crash:", name)
+    for line in lines:
+        log.error("[%s]   %s", name, line)
+    # Hints match against the whole 8 KiB tail, not only the lines echoed above.
+    combined = tail.lower()
+    if "no device of requested type available" in combined:
+        log.error(
+            "[%s] SYCL/level_zero found no compute device. Checklist:\n"
+            "  1. render nodes present?  ls /dev/dri/renderD*\n"
+            "  2. user in render group?  sudo usermod -aG render,video $USER  (re-login)\n"
+            "  3. driver init errors?    dmesg | grep -E '(xe|i915|drm)'\n"
+            "  4. device visible?        sycl-ls\n"
+            "  5. full diagnostics:      arc-llama doctor",
+            name,
+        )
+    elif "level_zero" in combined and ("error" in combined or "failed" in combined):
+        log.error(
+            "[%s] level_zero adapter error — run `sycl-ls` and `arc-llama doctor`.",
+            name,
+        )
+
+
 class LlamaServer:
     """One llama-server subprocess. Lifecycle: start → wait_ready → stop."""
 
@@ -170,6 +213,8 @@ def __init__(self, plan: LaunchPlan, name: str = "llama-server"):
         self.name = name
         self.process: subprocess.Popen[bytes] | None = None
         self.started_at: float | None = None
+        self._log_path: Path | None = None  # path to current stderr/log file
+        self._is_tmp_log: bool = False       # True when _log_path is a temp file
 
     @property
     def is_running(self) -> bool:
@@ -181,11 +226,23 @@ def start(self, log_dir: Path | None = None) -> None:
             return
         stdout = subprocess.DEVNULL
         stderr = subprocess.DEVNULL
+        _tmp_fh = None  # temp file handle to close after Popen
         if log_dir is not None:
             log_dir.mkdir(parents=True, exist_ok=True)
             log_path = log_dir / f"{self.name}.log"
             stdout = open(log_path, "ab")
             stderr = subprocess.STDOUT
+            self._log_path = log_path
+            self._is_tmp_log = False
+        else:
+            # Capture stderr to a temp file so wait_ready can surface crash
+            # messages such as "No device of requested type available".
+            _tmp_fh = tempfile.NamedTemporaryFile(
+                prefix=f"arc-llama-{self.name}-", suffix=".log", delete=False
+            )
+            stderr = _tmp_fh
+            self._log_path = Path(_tmp_fh.name)
+            self._is_tmp_log = True
         log.info("[%s] starting: %s", self.name, " ".join(self.plan.argv))
         self.process = subprocess.Popen(
             self.plan.argv,
@@ -194,6 +251,9 @@ def start(self, log_dir: Path | None = None) -> None:
             stderr=stderr,
             preexec_fn=_preexec_isolate_and_pdeathsig,
         )
+        # Parent closes its write copy; child keeps its own inherited fd.
+        if _tmp_fh is not None:
+            _tmp_fh.close()
         self.started_at = time.time()
 
     async def wait_ready(self, timeout: float = DEFAULT_HEALTH_TIMEOUT) -> bool:
@@ -202,6 +262,7 @@ async def wait_ready(self, timeout: float = DEFAULT_HEALTH_TIMEOUT) -> bool:
             while time.time() < deadline:
                 if not self.is_running:
                     log.warning("[%s] process exited before becoming healthy", self.name)
+                    _surface_crash_logs(self.name, self._log_path)
                     return False
                 try:
                     r = await client.get(self.plan.health_url)
@@ -236,3 +297,10 @@ def stop(self, drain_seconds: float = 3.0) -> None:
                 pass
         self.process = None
         self.started_at = None
+        if self._is_tmp_log and self._log_path is not None:
+            try:
+                self._log_path.unlink(missing_ok=True)
+            except Exception:
+                pass
+            self._log_path = None
+            self._is_tmp_log = False
diff --git a/src/arc_llama/server.py b/src/arc_llama/server.py
@@ -242,7 +242,7 @@ async def admin_edit_model(name: str, request: Request) -> dict:
             try:
                 ub = int(body["ubatch_size"])
             except (TypeError, ValueError):
-                raise HTTPException(status_code=400, detail="ubatch_size must be an integer")
+                raise HTTPException(status_code=400, detail="ubatch_size must be an integer") from None
             if not (1 <= ub <= 4096):
                 raise HTTPException(status_code=400, detail="ubatch_size must be 1..4096")
             recipe["ubatch_size"] = ub

diff --git a/tests/test_launcher.py b/tests/test_launcher.py
@@ -191,7 +191,6 @@ async def test_wait_ready_true_when_healthy(self, monkeypatch: pytest.MonkeyPatc
         srv.started_at = 0.0
 
         import httpx
-        original_get = httpx.AsyncClient.get
 
         async def _fake_get(self, url):
             if "/health" in url:

diff --git a/tests/test_recipes.py b/tests/test_recipes.py
@@ -1,8 +1,6 @@
 """Tests for arc_llama.recipes — VRAM math, recipe generation, KV sizing."""
 from __future__ import annotations
 
-import pytest
-
 from arc_llama.arch import Arch
 from arc_llama.recipes import (
     DEFAULT_CTX_CAP,