diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/.dockerignore b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/.dockerignore new file mode 100644 index 000000000..e42216960 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/.dockerignore @@ -0,0 +1,18 @@ +.git +.env +.env.* +!.env.example +__pycache__/ +*.py[cod] +*$py.class +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.venv/ +venv/ +*.log +*.sif +dist/ +build/ +.DS_Store +Thumbs.db diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/.env.example b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/.env.example new file mode 100644 index 000000000..82d25c27d --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/.env.example @@ -0,0 +1,44 @@ +# Docker Compose reaches a vLLM server running on the host through +# host.docker.internal. Host-side curl can use http://127.0.0.1:8001/v1. +VLLM_API_BASE=http://host.docker.internal:8001/v1 +VLLM_API_KEY=EMPTY +VLLM_MODEL=vllm_local + +# ─── Model family ──────────────────────────────────────────────────────────── +# Selects which prompt/output contract the agent uses: +# nemotron — nvidia/Nemotron-3-Nano-Omni (## Action / ## Code markdown) +# holotron — Hcompany/Holotron-3-Nano (JSON tool_call via H Company agent loop) +# The vLLM container should be serving the matching model. 
+MODEL_FAMILY=nemotron + +# ─── Agent settings ───────────────────────────────────────────────────────── +ENABLE_THINKING=true +TRUNCATE_HISTORY_THINKING=false +MAX_STEPS=150 +MAX_IMAGE_HISTORY=3 +MODEL_MAX_TOKENS=20480 +REASONING_BUDGET=16384 +REASONING_GRACE_TOKENS=1024 +MODEL_ATTEMPT_TIMEOUT=120 +MODEL_MAX_RETRIES=3 +MODEL_RETRY_SLEEP=5 +COMPUTER_WAIT_SECONDS=3 + +# ─── Server ───────────────────────────────────────────────────────────────── +DEMO_PORT=8000 +DOCKER_SOCKET=/var/run/docker.sock +DESKTOP_CONTAINER_SERVICE=desktop +DOCKER_RESTART_TIMEOUT=10 + +# ─── Desktop environment ─────────────────────────────────────────────────── +# Desktop API port inside the Compose network +DESKTOP_API_PORT=5000 + +# KasmVNC port inside the Compose network +DESKTOP_VNC_PORT=6901 + +# Desktop password (used for VNC access and agent's sudo operations) +DESKTOP_PASSWORD=password + +# Screen resolution (must match model training: 1920x1080) +SCREEN_RESOLUTION=1920x1080 diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/.gitignore b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/.gitignore new file mode 100644 index 000000000..7a7e2714c --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/.gitignore @@ -0,0 +1,26 @@ +# Environment +.env +*.sif + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.egg-info/ +dist/ +build/ +.venv/ +venv/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/Dockerfile.server b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/Dockerfile.server new file mode 100644 index 000000000..a70dfc930 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/Dockerfile.server @@ -0,0 +1,14 @@ +# FastAPI server for the agent loop + web UI + VNC proxy +FROM 
python:3.11-slim@sha256:6d85378d88a19cd4d76079817532d62232be95757cb45945a99fec8e8084b9c2 + +# NOTE(review): there is no USER instruction, so the server process runs as root. +# The README says this container talks to the mounted Docker socket - confirm whether a non-root user with socket access is feasible. + +# All relative COPY destinations below resolve against /app. +WORKDIR /app + +# Dependencies are installed before the source is copied so this layer stays cached until requirements.txt changes. +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Application code: the FastAPI package and the web UI assets. +COPY server/ server/ +COPY web/ web/ + +# Documents the listening port only; publishing it to the host is left to the container runtime. +EXPOSE 8000 + +# Exec-form CMD: uvicorn is started directly (no wrapping shell) and listens on all interfaces on port 8000. +CMD ["uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/README.md b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/README.md new file mode 100644 index 000000000..bf1715e6b --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/README.md @@ -0,0 +1,428 @@ +# Computer Use Agent with Nemotron-3 Nano Omni & Holotron-3-Nano + +A self-contained, reproducible demo showing how to run **NVIDIA Nemotron-3 Nano Omni 30B** or **H Company Holotron-3-Nano** as a Computer Use Agent (CUA) that autonomously drives a live desktop environment via screenshots -> reasoning -> pyautogui actions. + +Both models share the same Omni-Nano-v3 backbone and run on the same vLLM TP=2 setup, but use **different prompt/output contracts**: + +| | Nemotron-3 Nano Omni (default) | Holotron-3-Nano | +|---|---|---| +| Output | Free-text `## Action: … ## Code: ` | Strict JSON `{note, thought, tool_call: {…}}` | +| Coordinates | Floats in `[0, 1]` projected to pixels | Integers in `[0, 1000]` scaled to pixels | +| Constraint | Reasoning parser | vLLM `structured_outputs` JSON schema | +| Tools | `pyautogui.*` + `computer.wait`/`computer.terminate` | 12-tool union (click_desktop, write_desktop, drag_to_desktop, scroll_desktop, hotkey_desktop, update_plan, answer, …) | +| Selection | `MODEL_FAMILY=nemotron` (default) | `MODEL_FAMILY=holotron` | + +Switch between them with the `MODEL_FAMILY` env var; the FastAPI server picks the right agent class at startup. **Only one vLLM container can serve at a time** on a 2-GPU host — stop the active vLLM and start the sibling for the other model. See [Launch vLLM](#launch-vllm) below.
+ +Inference uses an OpenAI-compatible **vLLM** endpoint. You can run vLLM locally on a GPU machine or point the demo at a remote vLLM server. + +``` + Your browser (http://localhost:8000) + ┌──────────────────────────────────────────────────────────────────────┐ + │ ┌───────────────────────────┬──────────────────────────────────────┐│ + │ │ Live Desktop │ Side Panel ││ + │ │ (KasmVNC iframe) │ ┌─ Desktop ─────────────────────┐ ││ + │ │ │ │ Ready — model name [Restart] │ ││ + │ │ │ └────────────────────────────────┘ ││ + │ │ │ ┌─ Task ────────────────────────┐ ││ + │ │ │ │ "Open Chrome, go to │ ││ + │ │ ┌─────────────────────┐ │ │ amazon.com, search keyboard" │ ││ + │ │ │ Ubuntu GNOME Desktop│ │ │ [▶ Run] [■ Stop] │ ││ + │ │ │ Chrome, Firefox, │ │ └────────────────────────────────┘ ││ + │ │ │ VS Code, LibreOffice│ │ ┌─ Live Agent Trace ────────────┐ ││ + │ │ │ GIMP, VLC, Mail │ │ │ step 1: 💭 I see a desktop… │ ││ + │ │ │ │ │ │ → click(0.5, 0.9) │ ││ + │ │ │ Agent controls this │ │ │ step 2: 💭 Chrome opened… │ ││ + │ │ │ in real-time ←──────│──│──│──── streaming reasoning │ ││ + │ │ └─────────────────────┘ │ └────────────────────────────────┘ ││ + │ └───────────────────────────┴──────────────────────────────────────┘│ + └──────────────────────────────────────────────────────────────────────┘ + + ┌──────────────────────────────┐ + │ vLLM OpenAI endpoint │ + │ http://.../v1 │ + │ Nemotron-3 Nano Omni 30B │ + │ or Hcompany/Holotron-3-Nano │ + │ - Vision + Reasoning │ + │ - Streaming response │ + └──────────────────────────────┘ +``` + +## Quick Start + +```bash +# 1. Start vLLM in a separate terminal, or point .env at an existing vLLM URL. +# See "Launch vLLM" below. + +# 2. Clone and configure the demo +git clone https://github.com/NVIDIA-NeMo/Nemotron.git +cd Nemotron/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni +cp .env.example .env + +# 3. Start the desktop + web server +docker compose up -d + +# 4. 
Open the demo +open http://localhost:8000 +``` + +The default `.env.example` expects vLLM on the host at `http://127.0.0.1:8001/v1`; the server container reaches that same endpoint as `http://host.docker.internal:8001/v1`. + +The web UI shows: +- **Left pane**: Live KasmVNC desktop — you see exactly what the agent sees and does +- **Desktop Environment card**: Readiness status plus a **Restart** button for the OS container +- **Task Instruction card**: Type a task, **Run** the agent, or **Stop** the current inference/action loop +- **Live Agent Trace**: Streaming model reasoning, parsed actions, execution output, retry notices, and errors + +## Prerequisites + +| Requirement | Notes | +|---|---| +| Docker + Docker Compose | Desktop or server, any OS | +| NVIDIA GPU machine or remote vLLM endpoint | Required for vLLM inference | +| Hugging Face access | Required if your vLLM server downloads the model from Hugging Face | +| ~12 GB disk | For the desktop container image and build cache | + +Nemotron-3 Nano Omni BF16 model weights are about 62 GB. Holotron-3-Nano weights are similar (also a 30B-class A3B model). Plan GPU memory and disk accordingly, or use a quantized variant supported by your vLLM setup. + +## Launch vLLM + +The demo only needs an OpenAI-compatible `/v1/chat/completions` endpoint. The default config assumes this endpoint is available on the host at `http://127.0.0.1:8001/v1`. + +### Option A: Docker — Nemotron-3 Nano Omni (default) + +This starts vLLM with the BF16 Hugging Face model ID and serves it as `vllm_local`, which matches `.env.example`. 
+ +```bash +docker pull vllm/vllm-openai:v0.20.0 + +docker run --rm -it \ + --gpus all \ + --ipc=host \ + --shm-size=16g \ + -p 8001:8001 \ + --name nano-omni-vllm \ + --entrypoint /bin/bash \ + vllm/vllm-openai:v0.20.0 -lc ' + pip install "vllm[audio]" && + vllm serve nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16 \ + --served-model-name vllm_local \ + --host 0.0.0.0 \ + --port 8001 \ + --trust-remote-code \ + --max-model-len 131072 \ + --gpu-memory-utilization 0.9 \ + --enable-prefix-caching \ + --max-num-seqs 8 \ + --allowed-local-media-path / \ + --reasoning-parser nemotron_v3 + ' +``` + +If your Hugging Face account is required to access the model, export `HF_TOKEN` and add `-e HF_TOKEN` to the `docker run` command. + +The agent sends `truncate_history_thinking=false` in `chat_template_kwargs` by default, so vLLM preserves previous-step thinking traces when rendering multi-turn GUI history. This is an inference request setting, not a `vllm serve` launch flag. + +Verify vLLM before starting the demo: + +```bash +curl -sS http://127.0.0.1:8001/v1/models | python3 -m json.tool +``` + +If vLLM runs on another machine, update `.env`: + +```bash +VLLM_API_BASE=http://YOUR_VLLM_HOST:8001/v1 +VLLM_API_KEY=EMPTY +VLLM_MODEL=vllm_local +``` + +When the FastAPI server runs inside Docker Compose and vLLM runs on the same host, keep `VLLM_API_BASE=http://host.docker.internal:8001/v1`. + +### Option B: Docker — Hcompany/Holotron-3-Nano + +To run the H Company Holotron model instead, stop the Nemotron container and start a sibling. 
Holotron is served with the same `nemotron_v3` reasoning parser as Nemotron: + +```bash +docker stop nano-omni-vllm 2>/dev/null && docker rm nano-omni-vllm 2>/dev/null + +docker run -d \ + --gpus '"device=0,1"' \ + --ipc=host \ + --shm-size=16g \ + -p 8011:8011 \ + --name nano-omni-vllm-holotron \ + -v $HOME/.cache/huggingface:/root/.cache/huggingface \ + --entrypoint /bin/bash \ + vllm/vllm-openai:v0.20.0 -lc ' + pip install "vllm[audio]" && + exec vllm serve Hcompany/Holotron-3-Nano \ + --served-model-name holotron_local \ + --host 0.0.0.0 \ + --port 8011 \ + --tensor-parallel-size 2 \ + --trust-remote-code \ + --max-model-len 131072 \ + --gpu-memory-utilization 0.9 \ + --enable-prefix-caching \ + --max-num-seqs 8 \ + --allowed-local-media-path / \ + --reasoning-parser nemotron_v3 + ' +``` + +Then point `.env` at this endpoint and select the matching model family: + +```bash +VLLM_API_BASE=http://host.docker.internal:8011/v1 +VLLM_API_KEY=EMPTY +VLLM_MODEL=holotron_local +MODEL_FAMILY=holotron +``` + +Restart the demo server to pick up the new agent class: + +```bash +docker compose up -d --build server +curl -s http://localhost:8000/health # → "model_family": "holotron" +``` + +The Holotron agent uses the official H Company `holo-nano` agent loop: +- 12-tool JSON schema (click/double_click/move_to/drag_to/scroll/write/key_down/key_up/hotkey/hold_and_tap_key/update_plan/answer) +- vLLM `structured_outputs` constrained decoding so the model can never emit malformed JSON +- `` / `` / `` user-message wrappers around each turn +- Image budget: oldest screenshots are demoted to `[Image omitted by context cleaning]` text placeholders, last 3 kept + +### Option C: Native vLLM + +Use the same `vllm serve` flags from Options A or B after installing vLLM 0.20.0 in your Python environment. Keep the served model name aligned with `VLLM_MODEL`.
+ +## Switching Between Models + +The demo supports both Nemotron-3 Nano Omni and Hcompany/Holotron-3-Nano, but a typical 2× A6000 host has only enough GPU memory to serve **one model at a time**. Switching is a 3-step ritual: (1) replace the vLLM container, (2) point `.env` at the new endpoint and family, (3) rebuild the FastAPI server container so it picks up the new env vars. + +### From Nemotron → Holotron + +```bash +# 1. Replace the vLLM container +docker stop nano-omni-vllm 2>/dev/null && docker rm nano-omni-vllm 2>/dev/null + +docker run -d \ + --gpus '"device=0,1"' --ipc=host --shm-size=16g \ + -p 8011:8011 --name nano-omni-vllm-holotron \ + -v $HOME/.cache/huggingface:/root/.cache/huggingface \ + --entrypoint /bin/bash vllm/vllm-openai:v0.20.0 -lc ' + pip install "vllm[audio]" && + exec vllm serve Hcompany/Holotron-3-Nano \ + --served-model-name holotron_local \ + --host 0.0.0.0 --port 8011 \ + --tensor-parallel-size 2 --trust-remote-code \ + --max-model-len 131072 --gpu-memory-utilization 0.9 \ + --enable-prefix-caching --max-num-seqs 8 \ + --allowed-local-media-path / --reasoning-parser nemotron_v3 + ' + +# Wait for vLLM to be ready (first launch downloads weights): +until curl -sf http://127.0.0.1:8011/v1/models >/dev/null; do sleep 5; done + +# 2. Update .env (these three values must match the running vLLM; VLLM_API_KEY is left unchanged) +sed -i \ + -e 's|^VLLM_API_BASE=.*|VLLM_API_BASE=http://host.docker.internal:8011/v1|' \ + -e 's|^VLLM_MODEL=.*|VLLM_MODEL=holotron_local|' \ + -e 's|^MODEL_FAMILY=.*|MODEL_FAMILY=holotron|' \ + .env + +# 3. Rebuild and recreate the demo server container so it starts with the new env vars +docker compose up -d --build server + +# Verify +curl -s http://localhost:8000/health | python -m json.tool # model_family: holotron +``` + +### From Holotron → Nemotron + +```bash +# 1.
Replace the vLLM container +docker stop nano-omni-vllm-holotron 2>/dev/null && docker rm nano-omni-vllm-holotron 2>/dev/null + +docker run -d \ + --gpus '"device=0,1"' --ipc=host --shm-size=16g \ + -p 8001:8001 --name nano-omni-vllm \ + -v $HOME/.cache/huggingface:/root/.cache/huggingface \ + --entrypoint /bin/bash vllm/vllm-openai:v0.20.0 -lc ' + pip install "vllm[audio]" && + exec vllm serve nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16 \ + --served-model-name vllm_local \ + --host 0.0.0.0 --port 8001 \ + --tensor-parallel-size 2 --trust-remote-code \ + --max-model-len 131072 --gpu-memory-utilization 0.9 \ + --enable-prefix-caching --max-num-seqs 8 \ + --allowed-local-media-path / --reasoning-parser nemotron_v3 + ' + +until curl -sf http://127.0.0.1:8001/v1/models >/dev/null; do sleep 5; done + +# 2. Update .env +sed -i \ + -e 's|^VLLM_API_BASE=.*|VLLM_API_BASE=http://host.docker.internal:8001/v1|' \ + -e 's|^VLLM_MODEL=.*|VLLM_MODEL=vllm_local|' \ + -e 's|^MODEL_FAMILY=.*|MODEL_FAMILY=nemotron|' \ + .env + +# 3. Rebuild the demo server +docker compose up -d --build server + +curl -s http://localhost:8000/health | python -m json.tool # model_family: nemotron +``` + +### Switching matrix + +`MODEL_FAMILY` must match what the vLLM container is serving — running Holotron weights with `MODEL_FAMILY=nemotron` (or vice versa) produces immediate parse errors because the prompt contracts are incompatible. + +| Model | `VLLM_API_BASE` | `VLLM_MODEL` | `MODEL_FAMILY` | vLLM port | +|---|---|---|---|---| +| Nemotron-3 Nano Omni | `http://host.docker.internal:8001/v1` | `vllm_local` | `nemotron` | 8001 | +| Hcompany/Holotron-3-Nano | `http://host.docker.internal:8011/v1` | `holotron_local` | `holotron` | 8011 | + +The two families use different ports by convention (8001 vs 8011) so you can leave both `.env` blocks ready and only flip `MODEL_FAMILY` + the matching `VLLM_*` lines. 
The web UI's `/health` endpoint always reports the live `model_family` so you can confirm which model is currently driving the demo. + +## How It Works + +1. A **Docker container** (`desktop/`) runs a full Ubuntu 22.04 GNOME desktop with: + - **KasmVNC** for browser-accessible live desktop viewing + - A minimal Flask API for screenshots, screen size, health, and command execution + - Desktop apps: Chrome, Firefox, LibreOffice, GIMP, VLC, VS Code, Thunderbird, Files, and Terminal + - A clean desktop canvas with launchers kept in the dock/app grid instead of desktop icons + - Based on the [ProRL-Agent-Server desktop recipe](https://github.com/NVIDIA-NeMo/ProRL-Agent-Server/blob/docker_osworld/osworld-docker/Dockerfile), trimmed for this demo + +2. The **FastAPI backend** (`server/`) orchestrates the agent loop: + - Takes a screenshot from the desktop API + - Sends screenshot + instruction + history through the configured vLLM endpoint + - Parses the model's response — `## Action` / `## Code` for Nemotron, or a JSON `tool_call` for Holotron — into a uniform `ParsedStep` + - Executes the resulting pyautogui commands inside the desktop container + - Feeds tool execution results back into the conversation as `` for Holotron's next turn + - Cancels active inference/tasks immediately when **Stop** is clicked + - Retries model calls with configurable per-attempt timeout settings + - Restarts the desktop container through the Docker Engine API when **Restart** is clicked + - Repeats until the model calls `computer.terminate(status="success")` (Nemotron) or `answer` (Holotron) + +3. The **web frontend** (`web/`) shows the live KasmVNC desktop in an iframe, exposes Run/Stop/Restart controls, and streams the agent's reasoning tokens in real-time via SSE. 
+ +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ docker compose │ +│ │ +│ ┌──────────────────────────────┐ ┌───────────────────────────┐ │ +│ │ server (FastAPI :8000) │ │ desktop (GNOME :6901) │ │ +│ │ ├─ /web/* static UI │ │ ├─ KasmVNC │ │ +│ │ ├─ /vnc/* proxy ─────│────│──│ (browser desktop) │ │ +│ │ ├─ /agent/* loop │ │ ├─ Desktop API :5000 │ │ +│ │ │ ├─ screenshot ────────│────│──│── /screenshot │ │ +│ │ │ └─ execute ───────────│────│──│── /execute │ │ +│ │ ├─ /env/restart ───────────│────│──┤ restart container │ │ +│ │ └──────────────────────────┘ │ ├─ Desktop apps │ │ +│ │ │ │ │ Chrome, Code, Office │ │ +│ │ │ vLLM /v1 endpoint │ └───────────────────────│ │ +│ │ └─────────────────────────│──────────────────────────┘ │ +│ └───────────────────────────────────────────────────────────────── │ +└─────────────────────────────────────────────────────────────────────┘ + │ + ▼ + OpenAI-compatible vLLM endpoint + (Nemotron-3 Nano Omni 30B or Hcompany/Holotron-3-Nano) +``` + +## Configuration + +All settings go in `.env` (see `.env.example`): + +| Variable | Default | Description | +|---|---|---| +| `VLLM_API_BASE` | `http://host.docker.internal:8001/v1` | OpenAI-compatible vLLM base URL for Docker Compose | +| `VLLM_API_KEY` | `EMPTY` | Bearer token for vLLM | +| `VLLM_MODEL` | `vllm_local` | vLLM served model name (use `holotron_local` for the Holotron container) | +| `MODEL_FAMILY` | `nemotron` | Prompt/output contract: `nemotron` or `holotron`. Must match the model the vLLM container is serving. | +| `ENABLE_THINKING` | `true` | Enable reasoning mode (`` tags). Honored by both families. 
| +| `TRUNCATE_HISTORY_THINKING` | `false` | Keep previous-step thinking traces in chat-template history | +| `MAX_STEPS` | `150` | Maximum agent steps | +| `MAX_IMAGE_HISTORY` | `3` | Screenshots kept in context window | +| `MODEL_MAX_TOKENS` | `20480` | Maximum generated tokens per model attempt | +| `REASONING_BUDGET` | `16384` | Max reasoning tokens | +| `REASONING_GRACE_TOKENS` | `1024` | Extra thinking-token allowance before final answer generation | +| `MODEL_ATTEMPT_TIMEOUT` | `120` | Seconds before one model attempt is timed out | +| `MODEL_MAX_RETRIES` | `3` | Maximum model attempts per agent step | +| `MODEL_RETRY_SLEEP` | `5` | Seconds to wait between model retry attempts | +| `COMPUTER_WAIT_SECONDS` | `3` | Duration for explicit `computer.wait` actions generated by the model | +| `DEMO_PORT` | `8000` | Port for the web UI | +| `DOCKER_SOCKET` | `/var/run/docker.sock` | Docker Engine socket used by `/env/restart` | +| `DESKTOP_CONTAINER_SERVICE` | `desktop` | Compose service name to restart | +| `DOCKER_RESTART_TIMEOUT` | `10` | Docker stop timeout, in seconds, during restart | +| `DESKTOP_API_PORT` | `5000` | Desktop API port inside the Compose network | +| `DESKTOP_VNC_PORT` | `6901` | KasmVNC port inside the Compose network | +| `DESKTOP_PASSWORD` | `password` | Desktop login password | +| `SCREEN_RESOLUTION` | `1920x1080` | Desktop resolution | + +The server container mounts `/var/run/docker.sock` so the **Restart** button can restart only the Compose desktop service. Treat Docker socket access as host-level administrative access and expose this demo only in trusted development environments. 
+ +## Project Structure + +``` +computer-use-agent-with-omni/ +├── README.md +├── .env.example # Template — copy to .env and configure inference +├── docker-compose.yml # One-command setup +├── Dockerfile.server # FastAPI server container +├── requirements.txt # Python deps for the server +├── desktop/ # Desktop container build context +│ ├── Dockerfile # Ubuntu GNOME + KasmVNC + minimal desktop API +│ ├── requirements-desktop-api.txt # Python deps for the desktop API +│ ├── desktop-api/ # Flask API server (health, screenshot, execute) +│ │ └── main.py +│ ├── gnome-config/ # GNOME session file +│ ├── startup-scripts/ # Custom GNOME startup +│ ├── logind-mock.py # Mock systemd-logind for container +│ └── kasmvnc-entrypoint.sh # Container entrypoint +├── server/ +│ ├── __init__.py +│ ├── main.py # FastAPI app (REST + SSE + VNC proxy); MODEL_FAMILY dispatch +│ ├── agent.py # NemotronAgent: ## Action / ## Code prompt + parsing + coord projection +│ ├── holotron_agent.py # HolotronAgent: H Company agent-loop (12-tool JSON schema, structured_outputs) +│ ├── vllm_inference.py # vLLM OpenAI-compatible inference path (both families) +│ ├── agent_runner.py # Async screenshot→model→action loop (model-family agnostic) +│ └── desktop_client.py # HTTP client for the desktop container API +└── web/ + ├── index.html # Two-pane UI (VNC iframe + side panel) + ├── sidepanel.js # SSE + REST glue + └── style.css # Dark theme +``` + +## Development + +```bash +# Run the server outside Docker (desktop container must be running and reachable) +docker compose up -d desktop +pip install -r requirements.txt +python -m uvicorn server.main:app --host 0.0.0.0 --port 8000 --reload +``` + +The default Compose file does not publish desktop ports to the host. For host-side server development, add a local Compose override that maps `127.0.0.1:5000:5000` and `127.0.0.1:6901:6901`, then set `DESKTOP_HOST=localhost`. 
When running the server outside Docker, `/env/restart` uses the host Docker socket path from `DOCKER_SOCKET`; the default `/var/run/docker.sock` works on typical Linux Docker installs. + +## API Reference + +| Endpoint | Method | Description | +|---|---|---| +| `/` | GET | Redirects to web UI | +| `/health` | GET | Health check + desktop status | +| `/env/screenshot` | GET | Live PNG screenshot | +| `/env/restart` | POST | Stop active jobs, restart the desktop container, wait for readiness | +| `/agent/start` | POST | Start agent task `{instruction, max_steps?}` | +| `/agent/{job_id}/stop` | POST | Cancel a running task/inference | +| `/agent/{job_id}/status` | GET | Job status | +| `/agent/{job_id}/events` | GET | SSE stream of reasoning + actions | +| `/vnc/*` | * | KasmVNC reverse proxy | + +## License + +Apache 2.0 diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/Dockerfile b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/Dockerfile new file mode 100644 index 000000000..26519e838 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/Dockerfile @@ -0,0 +1,378 @@ +# =========================================================================== +# Computer Use Agent with Omni desktop - KasmVNC + GNOME + minimal desktop API +# Based on the ProRL-Agent-Server desktop recipe, with a reduced package and +# API surface for this demo. 
+# +# Base: kasmweb/ubuntu-jammy-desktop pinned by digest (KasmVNC + Xvnc built-in) +# DE: GNOME Shell +# Apps: Chrome, Firefox, LibreOffice, GIMP, VLC, VS Code, Thunderbird +# API: Minimal Flask server on port 5000 +# +# Build: docker build -t computer-use-agent-desktop ./desktop +# Run: docker run -d --shm-size=2g computer-use-agent-desktop +# =========================================================================== +FROM kasmweb/ubuntu-jammy-desktop@sha256:288de2b78d30d42dba80a9b3c806459762f8d404c3b851ad7483e030a9dad1d8 + +# The package and system configuration steps below run as root. +USER root + +# =========================================================================== +# 1. Remove default XFCE (Kasm core ships with XFCE) +# =========================================================================== +# One layer, four ordered steps: +# a) delete every *.list / *.sources file under /etc/apt/sources.list.d; +# b) purge the XFCE packages, best-effort (stderr hidden, failure ignored via `|| true`); +# c) mark the desktop apps as manually installed, also best-effort - presumably so the +# autoremove in (d) does not take apps inherited from the base image (confirm); +# d) autoremove orphaned dependencies and drop the apt package lists. +# Steps are joined with `;` after each `|| true`, so (c) and (d) run even when (b) fails. +RUN rm -f /etc/apt/sources.list.d/*.list /etc/apt/sources.list.d/*.sources \ + && apt-get remove -y --purge \ + xfce4 xfce4-session xfce4-panel xfce4-settings xfce4-terminal \ + xfce4-appfinder xfce4-notifyd xfce4-screensaver xfce4-helpers \ + xfce4-pulseaudio-plugin \ + xfdesktop4 xfdesktop4-data xfwm4 thunar \ + libxfce4ui-2-0 libxfce4ui-common libxfce4ui-utils \ + libxfce4panel-2.0-4 libxfce4util7 libxfce4util-bin libxfce4util-common \ + libgarcon-1-0 libgarcon-gtk3-1-0 libexo-2-0 \ + xfconf libxfconf-0-3 gir1.2-xfconf-0 gir1.2-libxfce4util-1.0 \ + elementary-xfce-icon-theme 2>/dev/null \ + || true; \ + apt-mark manual code firefox google-chrome-stable \ + libreoffice-writer libreoffice-calc libreoffice-impress libreoffice-gtk3 \ + vlc gimp thunderbird 2>/dev/null || true; \ + apt-get autoremove -y \ + && rm -rf /var/lib/apt/lists/* + +# =========================================================================== +# 2.
System dependencies +# =========================================================================== +# Base tooling, D-Bus, Python runtime, X11 automation helpers and fonts +# (one package per line, alphabetical, so additions diff cleanly). +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + curl \ + dbus \ + dbus-user-session \ + dbus-x11 \ + fontconfig \ + fonts-dejavu \ + fonts-liberation \ + fonts-noto \ + fonts-noto-cjk \ + gir1.2-gtk-3.0 \ + gnupg \ + libgdk-pixbuf2.0-bin \ + librsvg2-common \ + locales \ + lsb-release \ + python3 \ + python3-dbus \ + python3-gi \ + python3-gi-cairo \ + python3-pip \ + python3-tk \ + scrot \ + software-properties-common \ + sudo \ + tzdata \ + wget \ + wmctrl \ + x11-xserver-utils \ + xclip \ + xdotool \ + xfonts-100dpi \ + xfonts-75dpi \ + xfonts-base \ + xsel \ + xterm \ + && rm -rf /var/lib/apt/lists/* + +# Generate the en_US UTF-8 locale, then select it for all later layers and at runtime. +RUN locale-gen en_US.UTF-8 +ENV LANG=en_US.UTF-8 \ + LANGUAGE=en_US:en \ + LC_ALL=en_US.UTF-8 + +# Make the interpreter reachable under the unversioned `python` name as well. +RUN ln -sf /usr/bin/python3 /usr/bin/python + +# =========================================================================== +# 3. GNOME Shell desktop +# =========================================================================== +# GNOME Shell, session, extensions, file manager and themes (one package per line, alphabetical). +# After the install, the GNOME Software launcher's Icon= entry is rewritten to `software-store`. +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + adwaita-icon-theme \ + adwaita-icon-theme-full \ + dconf-cli \ + dconf-service \ + glib-networking \ + gnome-session \ + gnome-settings-daemon \ + gnome-shell \ + gnome-shell-extension-appindicator \ + gnome-shell-extension-desktop-icons-ng \ + gnome-shell-extension-ubuntu-dock \ + gnome-software \ + gnome-terminal \ + gnome-themes-extra \ + gsettings-desktop-schemas \ + gvfs \ + gvfs-backends \ + mutter \ + nautilus \ + xdg-user-dirs \ + xdg-utils \ + yaru-theme-gnome-shell \ + yaru-theme-gtk \ + yaru-theme-icon \ + yaru-theme-sound \ + yelp \ + && sed -i 's/^Icon=.*/Icon=software-store/' /usr/share/applications/org.gnome.Software.desktop \ + && rm -rf /var/lib/apt/lists/* + +# =========================================================================== +# 4.
Applications +# =========================================================================== + +# Google Chrome +# The signing key is downloaded to a temporary file rather than piped into gpg: +# RUN uses /bin/sh without pipefail, so a failed wget inside a pipe would be hidden +# behind gpg's exit status. Chained with `&&`, a failed download stops the build at its real cause. +# The trailing `rm -f …/google-chrome.sources` removes that file if it exists after the install. +RUN install -d -m 0755 /etc/apt/keyrings \ + && wget -qO /tmp/google-linux-signing-key.pub https://dl.google.com/linux/linux_signing_key.pub \ + && gpg --dearmor --batch --yes -o /etc/apt/keyrings/google-linux-signing-keyring.gpg /tmp/google-linux-signing-key.pub \ + && rm -f /tmp/google-linux-signing-key.pub \ + && chmod 0644 /etc/apt/keyrings/google-linux-signing-keyring.gpg \ + && echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] https://dl.google.com/linux/chrome/deb/ stable main" \ + > /etc/apt/sources.list.d/google-chrome.list \ + && apt-get update && apt-get install -y google-chrome-stable \ + && rm -f /etc/apt/sources.list.d/google-chrome.sources \ + && rm -rf /var/lib/apt/lists/* + +# LibreOffice, VLC, GIMP, Thunderbird +# After the install, LibreOffice's program directory is added to the dynamic linker search path. +RUN apt-get update && apt-get install -y --no-install-recommends \ + libreoffice-writer libreoffice-calc libreoffice-impress libreoffice-gtk3 \ + vlc vlc-plugin-access-extra \ + gimp gimp-data \ + thunderbird \ + && rm -rf /var/lib/apt/lists/* \ + && echo "/usr/lib/libreoffice/program" > /etc/ld.so.conf.d/libreoffice.conf \ + && ldconfig + +# VS Code +# Same key handling as Chrome: download to a file first so a failed wget fails the build. +RUN install -d -m 0755 /etc/apt/keyrings \ + && wget -qO /tmp/microsoft.asc https://packages.microsoft.com/keys/microsoft.asc \ + && gpg --dearmor --batch --yes -o /etc/apt/keyrings/microsoft.gpg /tmp/microsoft.asc \ + && rm -f /tmp/microsoft.asc \ + && chmod 0644 /etc/apt/keyrings/microsoft.gpg \ + && echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/microsoft.gpg] https://packages.microsoft.com/repos/vscode stable main" \ + > /etc/apt/sources.list.d/vscode.list \ + && apt-get update && apt-get install -y code \ + && rm -rf /var/lib/apt/lists/* + +# =========================================================================== +# 5.
Python packages for the desktop API +# =========================================================================== +# The first attempt passes --break-system-packages with stderr hidden; if that command fails for +# any reason, the same install is retried without the flag (presumably for pip versions that do +# not know the flag - confirm). Errors from the first attempt are therefore not visible in the build log. +COPY requirements-desktop-api.txt /tmp/requirements-desktop-api.txt +RUN pip3 install --no-cache-dir --break-system-packages -r /tmp/requirements-desktop-api.txt 2>/dev/null \ + || pip3 install --no-cache-dir -r /tmp/requirements-desktop-api.txt + +# =========================================================================== +# 6. User setup (kasm-user already exists from base image) +# =========================================================================== +# Gives kasm-user passwordless sudo, adds it to the sudo/audio/video groups, and makes +# /home/user a symlink to /home/kasm-user. +RUN echo "kasm-user ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers \ + && usermod -aG sudo,audio,video kasm-user \ + && ln -sf /home/kasm-user /home/user +# Runs xdg-user-dirs-update in a kasm-user login shell with LC_ALL=C. +RUN su - kasm-user -c "LC_ALL=C xdg-user-dirs-update" + +# =========================================================================== +# 7. Application wrappers +# =========================================================================== + +# Chrome wrapper +# Writes /usr/local/bin/google-chrome; the quoted heredoc delimiter keeps every `$…` literal at build time. +# At launch the script: +# - uses ~/.config/google-chrome-debug as the profile unless the caller passes --user-data-dir=…; +# - removes leftover SingletonLock/SingletonSocket/SingletonCookie files from that directory; +# - always adds --no-sandbox, the first-run/default-browser suppressions, --start-maximized and the SwiftShader GL flags; +# - exec's /usr/bin/google-chrome-stable with those flags followed by the caller's arguments. +RUN cat > /usr/local/bin/google-chrome << 'CHROMEWRAP' +#!/bin/bash +set -euo pipefail +CHROME_BIN="/usr/bin/google-chrome-stable" +CHROME_DIR="${HOME}/.config/google-chrome-debug" +has_user_data_dir=false +for arg in "$@"; do + [[ "$arg" == --user-data-dir=* ]] && has_user_data_dir=true +done +mkdir -p "$CHROME_DIR" +rm -f "$CHROME_DIR/SingletonLock" "$CHROME_DIR/SingletonSocket" "$CHROME_DIR/SingletonCookie" || true +extra_args=(--no-sandbox --no-first-run --no-default-browser-check --start-maximized + --enable-webgl --ignore-gpu-blocklist --use-gl=swiftshader --enable-unsafe-swiftshader) +$has_user_data_dir || extra_args+=(--user-data-dir="$CHROME_DIR") +exec "$CHROME_BIN" "${extra_args[@]}" "$@" +CHROMEWRAP +# Make the wrapper executable and also expose it as google-chrome-wrapper. +RUN chmod +x /usr/local/bin/google-chrome \ + && ln -sf /usr/local/bin/google-chrome /usr/local/bin/google-chrome-wrapper + +# Desktop launcher entry whose Exec= line starts Chrome through /usr/local/bin/google-chrome. +RUN cat > /usr/share/applications/google-chrome.desktop << 'EOF' +[Desktop Entry] +Version=1.0 +Name=Google Chrome +Exec=/usr/local/bin/google-chrome %U
+StartupNotify=true +Terminal=false +Icon=google-chrome +Type=Application +Categories=Network;WebBrowser; +EOF + +# VS Code wrapper +# Writes /usr/local/bin/code: exports DONT_PROMPT_WSL_INSTALL=1 and exec's /usr/bin/code with --no-sandbox +# followed by the caller's arguments. +RUN cat > /usr/local/bin/code << 'CODEWRAP' +#!/bin/bash +export DONT_PROMPT_WSL_INSTALL=1 +exec /usr/bin/code --no-sandbox "$@" +CODEWRAP +# Make the wrapper executable, alias it as code-wrapper, then point the Exec= lines of code.desktop at it. +# Both sed edits are best-effort: stderr is hidden and failures are ignored via `|| true`. +RUN chmod +x /usr/local/bin/code \ + && ln -sf /usr/local/bin/code /usr/local/bin/code-wrapper \ + && sed -i 's|Exec=/usr/share/code/code|Exec=/usr/local/bin/code|g' /usr/share/applications/code.desktop 2>/dev/null || true \ + && sed -i 's|Exec=code|Exec=/usr/local/bin/code|g' /usr/share/applications/code.desktop 2>/dev/null || true + +# =========================================================================== +# 8. Application configs +# =========================================================================== +# Per-application defaults written directly under /home/kasm-user/.config: +# - VLC: vlcrc with qt-privacy-ask=0; +# - VS Code: settings.json with zoom level, font size, telemetry off and update mode none; +# - LibreOffice: registrymodifications.xcu (see review note below); +# - Chrome: ~/.config/google-chrome replaced by a symlink to ~/.config/google-chrome-debug; +# - GIMP: gimprc selecting the "Legacy" icon theme at medium size. +# NOTE(review): these steps run as root and this section does not chown the created files - +# confirm ownership is handed to kasm-user elsewhere. +# NOTE(review): the LibreOffice printf writes only blank lines and two `false` lines, which is not +# XML - confirm the intended registrymodifications.xcu content against the upstream recipe. +RUN mkdir -p /home/kasm-user/.config/vlc \ + && printf '[qt]\nqt-privacy-ask=0\n' > /home/kasm-user/.config/vlc/vlcrc +RUN mkdir -p /home/kasm-user/.config/Code/User \ + && printf '{"window.zoomLevel":0,"editor.fontSize":14,"telemetry.telemetryLevel":"off","update.mode":"none"}\n' \ + > /home/kasm-user/.config/Code/User/settings.json +RUN mkdir -p /home/kasm-user/.config/libreoffice/4/user \ + && printf '\n\nfalse\nfalse\n\n' \ + > /home/kasm-user/.config/libreoffice/4/user/registrymodifications.xcu +RUN mkdir -p /home/kasm-user/.config/google-chrome-debug \ + && rm -rf /home/kasm-user/.config/google-chrome \ + && ln -s /home/kasm-user/.config/google-chrome-debug /home/kasm-user/.config/google-chrome +RUN mkdir -p /home/kasm-user/.config/GIMP/2.10 \ + && printf '# GIMP gimprc\n(icon-theme "Legacy")\n(icon-size medium)\n' \ + > /home/kasm-user/.config/GIMP/2.10/gimprc + +# =========================================================================== +# 9.
Disable GNOME Keyring password prompt +# =========================================================================== +RUN mv /usr/share/dbus-1/services/org.gnome.keyring.service /usr/share/dbus-1/services/org.gnome.keyring.service.disabled 2>/dev/null || true \ + && mv /usr/share/dbus-1/services/org.freedesktop.secrets.service /usr/share/dbus-1/services/org.freedesktop.secrets.service.disabled 2>/dev/null || true +RUN mkdir -p /home/kasm-user/.local/share/keyrings \ + && printf '[keyring]\ndisplay-name=Login\nctime=1234567890\nmtime=1234567890\nlock-on-idle=false\nlock-after=false\n' \ + > /home/kasm-user/.local/share/keyrings/login.keyring \ + && echo "login" > /home/kasm-user/.local/share/keyrings/default + +# =========================================================================== +# 10. Custom GNOME session for KasmVNC +# =========================================================================== +COPY gnome-config/kasm.session /usr/share/gnome-session/sessions/kasm.session +RUN mkdir -p /etc/xdg/autostart.disabled \ + && for f in gnome-initial-setup-first-login.desktop \ + gnome-initial-setup-copy-worker.desktop \ + org.gnome.SettingsDaemon.UsbProtection.desktop \ + org.gnome.SettingsDaemon.Rfkill.desktop \ + org.gnome.SettingsDaemon.Wacom.desktop \ + org.gnome.SettingsDaemon.Wwan.desktop \ + org.gnome.SettingsDaemon.Smartcard.desktop \ + org.gnome.SettingsDaemon.Sharing.desktop \ + org.gnome.SettingsDaemon.PrintNotifications.desktop; do \ + [ -f /etc/xdg/autostart/$f ] && mv /etc/xdg/autostart/$f /etc/xdg/autostart.disabled/ || true; \ + done + +# =========================================================================== +# 11. 
dconf defaults +# =========================================================================== +RUN mkdir -p /etc/dconf/profile /etc/dconf/db/local.d /etc/dconf/db/local.d/locks \ + && printf 'user-db:user\nsystem-db:local\n' > /etc/dconf/profile/user + +RUN cat > /etc/dconf/db/local.d/00-nano-omni << 'DCONF_EOF' +[org/gnome/shell] +favorite-apps=['google-chrome.desktop', 'thunderbird.desktop', 'code.desktop', 'vlc.desktop', 'libreoffice-writer.desktop', 'libreoffice-calc.desktop', 'libreoffice-impress.desktop', 'gimp.desktop', 'org.gnome.Nautilus.desktop', 'org.gnome.Software.desktop', 'yelp.desktop'] +enabled-extensions=['ubuntu-dock@ubuntu.com', 'ubuntu-appindicators@ubuntu.com'] +welcome-dialog-last-shown-version='99.0' + +[org/gnome/shell/extensions/dash-to-dock] +show-trash=true +show-mounts=false +show-show-apps-button=false + +[org/gnome/desktop/background] +show-desktop-icons=false +picture-uri='file:///usr/share/backgrounds/warty-final-ubuntu.png' +picture-uri-dark='file:///usr/share/backgrounds/warty-final-ubuntu.png' + +[org/gnome/shell/extensions/ding] +show-home=false +show-trash=false +show-volumes=false +show-network-volumes=false + +[org/gnome/desktop/interface] +gtk-theme='Yaru' +icon-theme='Yaru' +cursor-theme='Yaru' +show-battery-percentage=false +toolkit-accessibility=false +enable-animations=false + +[org/gnome/desktop/wm/preferences] +button-layout=':minimize,maximize,close' + +[org/gnome/desktop/screensaver] +lock-enabled=false +idle-activation-enabled=false + +[org/gnome/desktop/lockdown] +disable-lock-screen=true + +[org/gnome/desktop/session] +idle-delay=uint32 0 + +[org/gnome/desktop/notifications] +show-in-lock-screen=false + +[org/gnome/desktop/input-sources] +sources=[('xkb', 'us')] + +[org/gnome/settings-daemon/plugins/power] +sleep-inactive-ac-type='nothing' +sleep-inactive-battery-type='nothing' + +[org/gnome/mutter] +check-alive-timeout=uint32 0 + +[org/gnome/terminal/legacy] +theme-variant='dark' +DCONF_EOF + +RUN cat > 
/etc/dconf/db/local.d/locks/00-nano-omni-locks << 'EOF' +/org/gnome/desktop/screensaver/lock-enabled +/org/gnome/desktop/screensaver/idle-activation-enabled +/org/gnome/desktop/lockdown/disable-lock-screen +/org/gnome/desktop/session/idle-delay +/org/gnome/settings-daemon/plugins/power/sleep-inactive-ac-type +/org/gnome/settings-daemon/plugins/power/sleep-inactive-battery-type +EOF + +RUN dconf update + +# =========================================================================== +# 12. D-Bus + polkit fixes +# =========================================================================== +RUN cat > /usr/share/dbus-1/system-services/org.freedesktop.timedate1.service << 'EOF' +[D-BUS Service] +Name=org.freedesktop.timedate1 +Exec=/lib/systemd/systemd-timedated +User=root +SystemdService=dbus-org.freedesktop.timedate1.service +AssumedAppArmorLabel=unconfined +EOF +RUN mkdir -p /etc/polkit-1/localauthority/50-local.d \ + && cat > /etc/polkit-1/localauthority/50-local.d/10-timedate.pkla << 'EOF' +[Allow user timedate changes] +Identity=unix-user:kasm-user +Action=org.freedesktop.timedate1.set-time;org.freedesktop.timedate1.set-timezone;org.freedesktop.timedate1.set-local-rtc;org.freedesktop.timedate1.set-ntp +ResultAny=yes +ResultInactive=yes +ResultActive=yes +EOF + +# =========================================================================== +# 13. KasmVNC resolution + desktop_ready +# =========================================================================== +RUN sed -i 's/width: 1024/width: 1920/' /usr/share/kasmvnc/kasmvnc_defaults.yaml \ + && sed -i 's/height: 768/height: 1080/' /usr/share/kasmvnc/kasmvnc_defaults.yaml + +RUN printf '#!/usr/bin/env bash\nuntil pids=$(pidof gnome-shell); do sleep .5; done\n' \ + > /usr/bin/desktop_ready \ + && chmod +x /usr/bin/desktop_ready + +# =========================================================================== +# 14. 
Minimal desktop Flask API +# =========================================================================== +COPY desktop-api/main.py /home/kasm-user/server/main.py +RUN mkdir -p /home/kasm-user/server/screenshots + +# =========================================================================== +# 15. Logind mock + KasmVNC entrypoint + custom GNOME startup +# =========================================================================== +COPY logind-mock.py /usr/local/bin/logind-mock.py +RUN chmod +x /usr/local/bin/logind-mock.py + +COPY kasmvnc-entrypoint.sh /dockerstartup/kasmvnc-entrypoint.sh +RUN chmod +x /dockerstartup/kasmvnc-entrypoint.sh + +COPY startup-scripts/custom_startup_gnome.sh /dockerstartup/custom_startup.sh +RUN chmod +x /dockerstartup/custom_startup.sh + +# =========================================================================== +# 16. Fix ownership + environment +# =========================================================================== +RUN chown -R kasm-user:kasm-user /home/kasm-user + +ENV LD_LIBRARY_PATH= +ENV VNC_RESOLUTION=1920x1080 +ENV START_XFCE4=0 +ENV START_DE=gnome-shell +ENV GNOME_SHELL_SESSION_MODE=ubuntu +ENV XDG_SESSION_TYPE=x11 +ENV XDG_SESSION_DESKTOP=gnome +ENV XDG_CURRENT_DESKTOP=ubuntu:GNOME +ENV GDK_BACKEND=x11 + +ENV API_PORT=5000 +ENV VNC_PORT=6901 + +EXPOSE 5000 6901 +USER root +ENTRYPOINT ["/dockerstartup/kasmvnc-entrypoint.sh"] +CMD ["--wait"] diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/desktop-api/main.py b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/desktop-api/main.py new file mode 100644 index 000000000..b4ad5dce8 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/desktop-api/main.py @@ -0,0 +1,114 @@ +"""Minimal desktop API used by nano_omni_demo. + +The FastAPI server only needs four operations from the desktop container: +readiness, screenshot capture, screen size, and command execution. 
This file +keeps that surface intentionally small for public release. +""" + +from __future__ import annotations + +import os +import subprocess +from io import BytesIO +from typing import Any + +import pyautogui +from flask import Flask, Response, jsonify, request + + +SERVER_VERSION = "2026.04.30.minimal" +MAX_COMMAND_TIMEOUT = 120 +MAX_OUTPUT_CHARS = 20000 + +app = Flask(__name__) + +pyautogui.FAILSAFE = False +pyautogui.PAUSE = 0 + + +def _json_error(message: str, status: int = 400): + return jsonify({"status": "error", "message": message}), status + + +def _command_from_request(data: dict[str, Any]) -> list[str] | None: + command = data.get("command") + if not isinstance(command, list) or not command: + return None + if not all(isinstance(arg, str) for arg in command): + return None + return [ + os.path.expanduser(arg) if arg.startswith("~/") else arg + for arg in command + ] + + +@app.get("/health") +def health(): + return jsonify({"status": "ok", "version": SERVER_VERSION}) + + +@app.get("/screenshot") +def screenshot(): + try: + image = pyautogui.screenshot() + if image.mode != "RGB": + image = image.convert("RGB") + buf = BytesIO() + image.save(buf, format="PNG") + return Response(buf.getvalue(), mimetype="image/png") + except Exception as exc: + return _json_error(f"screenshot failed: {exc}", 500) + + +@app.post("/screen_size") +def screen_size(): + try: + size = pyautogui.size() + return jsonify({"width": int(size.width), "height": int(size.height)}) + except Exception as exc: + return _json_error(f"screen size failed: {exc}", 500) + + +@app.post("/execute") +def execute_command(): + data = request.get_json(silent=True) or {} + command = _command_from_request(data) + if command is None: + return _json_error("command must be a non-empty list of strings") + + try: + timeout = min(float(data.get("timeout", MAX_COMMAND_TIMEOUT)), MAX_COMMAND_TIMEOUT) + except (TypeError, ValueError): + timeout = MAX_COMMAND_TIMEOUT + + try: + result = subprocess.run( + 
command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=timeout, + shell=False, + check=False, + ) + except subprocess.TimeoutExpired: + return _json_error(f"command timed out after {timeout:g}s", 500) + except Exception as exc: + return _json_error(f"command failed: {exc}", 500) + + output = result.stdout or "" + if result.stderr: + output = f"{output}\n{result.stderr}" if output else result.stderr + output = output[:MAX_OUTPUT_CHARS] + + return jsonify({ + "status": "success" if result.returncode == 0 else "error", + "output": output, + "error": (result.stderr or "")[:MAX_OUTPUT_CHARS], + "returncode": result.returncode, + }) + + +if __name__ == "__main__": + port = int(os.getenv("API_PORT", "5000")) + app.run(host="0.0.0.0", port=port, debug=False) diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/gnome-config/kasm.session b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/gnome-config/kasm.session new file mode 100644 index 000000000..411830022 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/gnome-config/kasm.session @@ -0,0 +1,3 @@ +[GNOME Session] +Name=GNOME (Kasm) +RequiredComponents=org.gnome.Shell;org.gnome.SettingsDaemon.A11ySettings;org.gnome.SettingsDaemon.Color;org.gnome.SettingsDaemon.Datetime;org.gnome.SettingsDaemon.Housekeeping;org.gnome.SettingsDaemon.Keyboard;org.gnome.SettingsDaemon.MediaKeys;org.gnome.SettingsDaemon.ScreensaverProxy;org.gnome.SettingsDaemon.Sound;org.gnome.SettingsDaemon.XSettings; diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/kasmvnc-entrypoint.sh b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/kasmvnc-entrypoint.sh new file mode 100644 index 000000000..844b9cf1b --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/kasmvnc-entrypoint.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# 
--------------------------------------------------------------------------- +# kasmvnc-entrypoint.sh — Wrapper entrypoint that starts system services +# needed by GNOME (D-Bus, logind mock) as root, then drops to kasm-user for +# the normal Kasm startup chain. +# --------------------------------------------------------------------------- + +echo "[entrypoint] Starting kasmvnc-entrypoint.sh" >&2 + +# Generate machine-id if missing +if [ ! -s /etc/machine-id ]; then + dbus-uuidgen > /etc/machine-id 2>/dev/null || cat /proc/sys/kernel/random/uuid | tr -d '-' > /etc/machine-id + cp /etc/machine-id /var/lib/dbus/machine-id 2>/dev/null || true +fi + +# Fix PAM on EL8/EL9: remove pam_systemd which hangs without real systemd +if [ -f /etc/pam.d/system-auth ]; then + sed -i '/pam_systemd/d' /etc/pam.d/system-auth 2>/dev/null || true + sed -i '/pam_systemd/d' /etc/pam.d/password-auth 2>/dev/null || true +fi + +# Remove D-Bus service activation files that try to start systemd services +# These hang in containers without real systemd (both system and session buses) +rm -f /usr/share/dbus-1/system-services/org.freedesktop.RealtimeKit1.service 2>/dev/null || true +rm -f /usr/share/dbus-1/system-services/org.freedesktop.systemd1.service 2>/dev/null || true +rm -f /usr/share/dbus-1/services/org.freedesktop.systemd1.service 2>/dev/null || true + +# Start system D-Bus (GNOME needs it for various org.freedesktop.* services) +mkdir -p /run/dbus +rm -f /run/dbus/pid /run/dbus/system_bus_socket 2>/dev/null || true +dbus-daemon --system --nofork --nopidfile & +sleep 0.3 + +# Mark system as "systemd-like" for timedatectl etc. +mkdir -p /run/systemd/system + +# Start logind mock (GNOME shell requires org.freedesktop.login1) +PYTHON3_BIN=$(command -v python3.9 2>/dev/null || command -v python3 2>/dev/null || echo python3) +$PYTHON3_BIN /usr/local/bin/logind-mock.py & +sleep 0.5 + +echo "[entrypoint] Dropping to kasm-user..." 
>&2 + +# Drop to kasm-user preserving environment (VNC_RESOLUTION etc.) +# Set HOME explicitly so kasm-user doesn't inherit root's HOME +export HOME=/home/kasm-user + +# Start session D-Bus as kasm-user (not root) so DEs can use it properly +DBUS_SOCKET="/tmp/dbus-session-kasm" +rm -f "$DBUS_SOCKET" 2>/dev/null || true +if command -v gosu &>/dev/null; then + gosu kasm-user dbus-daemon --session --address="unix:path=$DBUS_SOCKET" --nofork --nopidfile & +else + su -m -s /bin/bash kasm-user -c "dbus-daemon --session --address='unix:path=$DBUS_SOCKET' --nofork --nopidfile" & +fi +sleep 0.3 +export DBUS_SESSION_BUS_ADDRESS="unix:path=$DBUS_SOCKET" + +# Wrap dbus-launch to use our pre-started session bus instead of spawning new ones +DBUS_LAUNCH_REAL=$(command -v dbus-launch 2>/dev/null || echo "") +if [ -n "$DBUS_LAUNCH_REAL" ] && [ -x "$DBUS_LAUNCH_REAL" ]; then + mv "$DBUS_LAUNCH_REAL" "${DBUS_LAUNCH_REAL}.real" 2>/dev/null || true + cat > "$DBUS_LAUNCH_REAL" << 'DBUSWRAP' +#!/bin/bash +if [ -n "$DBUS_SESSION_BUS_ADDRESS" ]; then + echo "DBUS_SESSION_BUS_ADDRESS=$DBUS_SESSION_BUS_ADDRESS" + echo "DBUS_SESSION_BUS_PID=0" + exit 0 +fi +exec "$(dirname "$0")/dbus-launch.real" "$@" +DBUSWRAP + chmod +x "$DBUS_LAUNCH_REAL" +fi + +# Drop privileges: gosu first (clean, no PAM), fallback to su -m +if command -v gosu &>/dev/null; then + echo "[entrypoint] Using gosu" >&2 + exec gosu kasm-user /bin/bash -c \ + 'exec /dockerstartup/kasm_default_profile.sh /dockerstartup/vnc_startup.sh /dockerstartup/kasm_startup.sh "$@"' \ + -- "$@" +else + echo "[entrypoint] Using su -m" >&2 + exec su -m -s /bin/bash kasm-user -c \ + 'exec /dockerstartup/kasm_default_profile.sh /dockerstartup/vnc_startup.sh /dockerstartup/kasm_startup.sh "$@"' \ + -- "$@" +fi diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/logind-mock.py b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/logind-mock.py new file mode 100644 index 000000000..9c25103ae 
--- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/logind-mock.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +""" +Minimal mock of org.freedesktop.login1 (systemd-logind) for GNOME Shell. + +GNOME Shell's loginManager.js creates a LoginManagerSystemd proxy that calls: + - GetSession('auto') -> returns a session object path + - Session.Type property -> returns 'x11' + - Manager signals: PrepareForSleep, PrepareForShutdown + +This mock provides just enough of the D-Bus interface to prevent the +JS exception that crashes gnome-shell in containers without systemd. +""" + +import dbus +import dbus.service +import dbus.mainloop.glib +from gi.repository import GLib + +BUS_NAME = "org.freedesktop.login1" +MANAGER_PATH = "/org/freedesktop/login1" +SESSION_PATH = "/org/freedesktop/login1/session/auto" +SEAT_PATH = "/org/freedesktop/login1/seat/seat0" +USER_PATH = "/org/freedesktop/login1/user/_1000" + +MANAGER_IFACE = "org.freedesktop.login1.Manager" +SESSION_IFACE = "org.freedesktop.login1.Session" +SEAT_IFACE = "org.freedesktop.login1.Seat" +USER_IFACE = "org.freedesktop.login1.User" +PROP_IFACE = "org.freedesktop.DBus.Properties" + + +class MockSession(dbus.service.Object): + def __init__(self, bus): + super().__init__(bus, SESSION_PATH) + + @dbus.service.method(SESSION_IFACE, in_signature="b", out_signature="") + def TakeControl(self, force): + pass + + @dbus.service.method(SESSION_IFACE, in_signature="", out_signature="") + def ReleaseControl(self): + pass + + @dbus.service.method(SESSION_IFACE, in_signature="uu", out_signature="hb") + def TakeDevice(self, major, minor): + import os + path = f"/dev/char/{major}:{minor}" + try: + fd = os.open(path, os.O_RDWR | os.O_CLOEXEC | os.O_NOCTTY | os.O_NONBLOCK) + except OSError: + fd = os.open("/dev/null", os.O_RDWR) + return dbus.types.UnixFd(fd), dbus.Boolean(False) + + @dbus.service.method(SESSION_IFACE, in_signature="uu", out_signature="") + def ReleaseDevice(self, major, minor): + 
pass + + @dbus.service.method(SESSION_IFACE, in_signature="", out_signature="") + def Activate(self): + pass + + @dbus.service.signal(SESSION_IFACE, signature="ub") + def PauseDevice(self, major, minor): + pass + + @dbus.service.signal(SESSION_IFACE, signature="u") + def ResumeDevice(self, major): + pass + + @dbus.service.method(PROP_IFACE, in_signature="ss", out_signature="v") + def Get(self, interface, prop): + props = { + "Id": dbus.String("auto", variant_level=1), + "Name": dbus.String("kasm-user", variant_level=1), + "User": dbus.Struct([dbus.UInt32(1000), dbus.ObjectPath(USER_PATH)], variant_level=1), + "Seat": dbus.Struct([dbus.String("seat0"), dbus.ObjectPath(SEAT_PATH)], variant_level=1), + "Type": dbus.String("x11", variant_level=1), + "Class": dbus.String("user", variant_level=1), + "Active": dbus.Boolean(True, variant_level=1), + "State": dbus.String("active", variant_level=1), + "Display": dbus.String(":1", variant_level=1), + "Remote": dbus.Boolean(False, variant_level=1), + } + return props.get(prop, dbus.String("", variant_level=1)) + + @dbus.service.method(PROP_IFACE, in_signature="s", out_signature="a{sv}") + def GetAll(self, interface): + return { + "Id": dbus.String("auto"), + "Name": dbus.String("kasm-user"), + "Type": dbus.String("x11"), + "Class": dbus.String("user"), + "Active": dbus.Boolean(True), + "State": dbus.String("active"), + "Display": dbus.String(":1"), + "Remote": dbus.Boolean(False), + } + + +class MockSeat(dbus.service.Object): + def __init__(self, bus): + super().__init__(bus, SEAT_PATH) + + @dbus.service.method(PROP_IFACE, in_signature="ss", out_signature="v") + def Get(self, interface, prop): + props = { + "Id": dbus.String("seat0", variant_level=1), + "CanGraphical": dbus.Boolean(True, variant_level=1), + "CanMultiSession": dbus.Boolean(False, variant_level=1), + } + return props.get(prop, dbus.String("", variant_level=1)) + + +class MockUser(dbus.service.Object): + def __init__(self, bus): + super().__init__(bus, USER_PATH) 
+ + @dbus.service.method(PROP_IFACE, in_signature="ss", out_signature="v") + def Get(self, interface, prop): + props = { + "Name": dbus.String("kasm-user", variant_level=1), + "UID": dbus.UInt32(1000, variant_level=1), + "State": dbus.String("active", variant_level=1), + } + return props.get(prop, dbus.String("", variant_level=1)) + + +class MockManager(dbus.service.Object): + def __init__(self, bus): + super().__init__(bus, MANAGER_PATH) + + @dbus.service.method(MANAGER_IFACE, in_signature="s", out_signature="o") + def GetSession(self, session_id): + return dbus.ObjectPath(SESSION_PATH) + + @dbus.service.method(MANAGER_IFACE, in_signature="s", out_signature="o") + def GetSessionByPID(self, pid): + return dbus.ObjectPath(SESSION_PATH) + + @dbus.service.method(MANAGER_IFACE, in_signature="u", out_signature="o") + def GetUser(self, uid): + return dbus.ObjectPath(USER_PATH) + + @dbus.service.method(MANAGER_IFACE, in_signature="s", out_signature="o") + def GetSeat(self, seat_id): + return dbus.ObjectPath(SEAT_PATH) + + @dbus.service.method(MANAGER_IFACE, in_signature="", out_signature="a(susso)") + def ListSessions(self): + return [(dbus.String("auto"), dbus.UInt32(1000), dbus.String("kasm-user"), + dbus.String("seat0"), dbus.ObjectPath(SESSION_PATH))] + + @dbus.service.method(MANAGER_IFACE, in_signature="ssss", out_signature="h") + def Inhibit(self, what, who, why, mode): + import os + r, w = os.pipe() + return dbus.types.UnixFd(r) + + @dbus.service.method(MANAGER_IFACE, in_signature="", out_signature="s") + def CanSuspend(self): + return "no" + + @dbus.service.method(MANAGER_IFACE, in_signature="", out_signature="s") + def CanHibernate(self): + return "no" + + @dbus.service.method(MANAGER_IFACE, in_signature="", out_signature="s") + def CanPowerOff(self): + return "no" + + @dbus.service.method(MANAGER_IFACE, in_signature="", out_signature="s") + def CanReboot(self): + return "no" + + @dbus.service.method(PROP_IFACE, in_signature="ss", out_signature="v") + def 
Get(self, interface, prop): + props = { + "IdleHint": dbus.Boolean(False, variant_level=1), + "IdleSinceHint": dbus.UInt64(0, variant_level=1), + "IdleSinceHintMonotonic": dbus.UInt64(0, variant_level=1), + "PreparingForShutdown": dbus.Boolean(False, variant_level=1), + "PreparingForSleep": dbus.Boolean(False, variant_level=1), + } + return props.get(prop, dbus.Boolean(False, variant_level=1)) + + @dbus.service.method(PROP_IFACE, in_signature="s", out_signature="a{sv}") + def GetAll(self, interface): + return { + "IdleHint": dbus.Boolean(False), + "PreparingForShutdown": dbus.Boolean(False), + "PreparingForSleep": dbus.Boolean(False), + } + + @dbus.service.signal(MANAGER_IFACE, signature="b") + def PrepareForSleep(self, active): + pass + + @dbus.service.signal(MANAGER_IFACE, signature="b") + def PrepareForShutdown(self, active): + pass + + +def main(): + dbus.mainloop.glib.DBusGMainLoop(set_as_default=True) + bus = dbus.SystemBus() + bus_name = dbus.service.BusName(BUS_NAME, bus) + + MockManager(bus) + MockSession(bus) + MockSeat(bus) + MockUser(bus) + + print(f"logind-mock: Registered {BUS_NAME} on system bus") + loop = GLib.MainLoop() + loop.run() + + +if __name__ == "__main__": + main() diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/requirements-desktop-api.txt b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/requirements-desktop-api.txt new file mode 100644 index 000000000..d7516a360 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/requirements-desktop-api.txt @@ -0,0 +1,17 @@ +blinker==1.9.0 +click==8.3.3 +Flask==3.1.3 +itsdangerous==2.2.0 +Jinja2==3.1.6 +MarkupSafe==3.0.3 +MouseInfo==0.1.3 +Pillow==12.2.0 +PyAutoGUI==0.9.54 +PyGetWindow==0.0.9 +PyMsgBox==2.0.1 +pyperclip==1.11.0 +PyRect==0.2.0 +PyScreeze==1.0.1 +python3-Xlib==0.15 +pytweening==1.2.0 +Werkzeug==3.1.8 diff --git 
a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/startup-scripts/custom_startup_gnome.sh b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/startup-scripts/custom_startup_gnome.sh new file mode 100755 index 000000000..910e0f13c --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/desktop/startup-scripts/custom_startup_gnome.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# --------------------------------------------------------------------------- +# custom_startup_gnome.sh — Launch GNOME session + desktop Flask API +# --------------------------------------------------------------------------- + +echo "--- GNOME + desktop API custom startup ---" + +export DISPLAY=${DISPLAY:-:1} +export XDG_SESSION_TYPE=x11 +export XDG_SESSION_DESKTOP=gnome +export XDG_CURRENT_DESKTOP=${XDG_CURRENT_DESKTOP:-GNOME} +export GDK_BACKEND=x11 + +# D-Bus session bus +if [ -z "$DBUS_SESSION_BUS_ADDRESS" ]; then + eval "$(dbus-launch --sh-syntax)" + export DBUS_SESSION_BUS_ADDRESS +fi + +# XDG runtime dir +XDG_RUNTIME_DIR="/tmp/runtime-$(id -u)" +mkdir -p "$XDG_RUNTIME_DIR" +chmod 700 "$XDG_RUNTIME_DIR" +export XDG_RUNTIME_DIR + +dconf update 2>/dev/null || true + +# Set resolution to 1920x1080 +xrandr --output VNC-0 --mode 1920x1080 2>/dev/null || true + +# Start gvfsd for Nautilus +( /usr/libexec/gvfsd || /usr/lib/gvfsd || /usr/lib/gvfs/gvfsd || true ) &>/dev/null & + +clear_desktop_icons() { + local desktop_dir="${HOME}/Desktop" + [ -d "$desktop_dir" ] || return 0 + + chmod 755 "$desktop_dir" 2>/dev/null || true + find "$desktop_dir" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true + gsettings set org.gnome.desktop.background show-desktop-icons false 2>/dev/null || true + gsettings set org.gnome.shell.extensions.ding show-home false 2>/dev/null || true + gsettings set org.gnome.shell.extensions.ding show-trash false 2>/dev/null || true + gsettings set org.gnome.shell.extensions.ding show-volumes false 
2>/dev/null || true + gsettings set org.gnome.shell.extensions.ding show-network-volumes false 2>/dev/null || true + gsettings set org.gnome.shell enabled-extensions "['ubuntu-dock@ubuntu.com', 'ubuntu-appindicators@ubuntu.com']" 2>/dev/null || true +} + +# Keep the desktop canvas empty; apps remain available from the dock and app grid. +clear_desktop_icons + +# Dismiss GNOME Activities overview after shell starts +( + sleep 15 + xdotool key Escape 2>/dev/null || true + sleep 2 + xdotool key Escape 2>/dev/null || true +) & + +# Launch desktop Flask API in background (waits for gnome-shell) +( + sleep 15 + echo "Starting desktop Flask API on port ${API_PORT:-5000}" + cd /home/kasm-user/server + PYTHON3=$(command -v python3.9 2>/dev/null || command -v python3 2>/dev/null || echo python3); exec $PYTHON3 main.py +) & + +# Launch GNOME session (foreground) +# Check if systemd is actually running (PID 1), not just the directory mock +echo "Starting GNOME session on $DISPLAY" +if pidof systemd > /dev/null 2>&1 || dbus-send --system --print-reply --dest=org.freedesktop.systemd1 /org/freedesktop/systemd1 org.freedesktop.DBus.Peer.Ping > /dev/null 2>&1; then + exec gnome-session --disable-acceleration-check 2>&1 +else + # No real systemd — start gnome-shell directly (GNOME 42+ requires systemd for session) + echo "No systemd detected, starting gnome-shell directly" + # Try --x11 first (GNOME 43+), fall back to plain gnome-shell if flag unsupported + if gnome-shell --help 2>&1 | grep -q -- '--x11'; then + exec gnome-shell --x11 2>&1 + else + exec gnome-shell 2>&1 + fi +fi diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/docker-compose.yml b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/docker-compose.yml new file mode 100644 index 000000000..9bf19d938 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/docker-compose.yml @@ -0,0 +1,37 @@ +services: + # Desktop environment: Ubuntu GNOME + KasmVNC 
+ minimal desktop API + desktop: + build: + context: ./desktop + dockerfile: Dockerfile + expose: + - "6901" # KasmVNC, reached through the FastAPI /vnc proxy + - "5000" # Desktop API, reached only from the server service + environment: + - VNC_PW=${DESKTOP_PASSWORD:-password} + - VNC_RESOLUTION=${SCREEN_RESOLUTION:-1920x1080} + shm_size: "2g" + restart: unless-stopped + + # FastAPI server: agent loop + web UI + VNC proxy + server: + build: + context: . + dockerfile: Dockerfile.server + ports: + - "${DEMO_PORT:-8000}:8000" + env_file: + - .env + environment: + - DESKTOP_HOST=desktop + - DESKTOP_API_PORT=5000 + - DESKTOP_VNC_PORT=6901 + - DOCKER_SOCKET=/var/run/docker.sock + - DESKTOP_CONTAINER_SERVICE=desktop + extra_hosts: + - "host.docker.internal:host-gateway" + volumes: + - /var/run/docker.sock:/var/run/docker.sock + depends_on: + - desktop + restart: unless-stopped diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/requirements.txt b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/requirements.txt new file mode 100644 index 000000000..7fe03c4e5 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/requirements.txt @@ -0,0 +1,29 @@ +annotated-doc==0.0.4 +annotated-types==0.7.0 +anyio==4.13.0 +certifi==2026.4.22 +click==8.3.3 +distro==1.9.0 +fastapi==0.136.1 +h11==0.16.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +idna==3.13 +jiter==0.14.0 +loguru==0.7.3 +openai==2.33.0 +Pillow==12.2.0 +pydantic==2.13.3 +pydantic_core==2.46.3 +python-dotenv==1.2.2 +PyYAML==6.0.3 +sniffio==1.3.1 +starlette==1.0.0 +tqdm==4.67.3 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +uvicorn==0.46.0 +uvloop==0.22.1 +watchfiles==1.1.1 +websockets==16.0 diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/__init__.py b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/agent.py b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/agent.py new file mode 100644 index 000000000..9c332898a --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/agent.py @@ -0,0 +1,664 @@ +"""Nemotron-3 Nano Omni prompt, history, and response parser. + +This agent observes a desktop environment via screenshots and generates +executable pyautogui actions to complete automation tasks through an +OpenAI-compatible vLLM server. + +The prompt format matches the Nemotron CUA training specification: +- System prompt with password injection +- Multi-turn history with screenshots (limited sliding window) +- Older steps summarized as text-only history +- Response parsed for ## Action / ## Code blocks +- Coordinates projected from relative [0,1] to absolute pixels +""" + +from __future__ import annotations + +import asyncio +import ast +import base64 +import json +import re +from dataclasses import dataclass, field +from typing import Any, Awaitable, Callable, Optional + +from openai import AsyncOpenAI +from loguru import logger + +# Type for the optional streaming delta callback +DeltaCallback = Callable[[str, str], Awaitable[None]] + +# ── Prompt templates (from Nemotron CUA training spec) ─────────────────────── + +INSTRUCTION_TEMPLATE = ( + "# Task Instruction:\n{instruction}\n\n" + "Please generate the next move according to the screenshot, task " + "instruction and previous steps (if provided).\n" +) +STEP_TEMPLATE = "# Step {step_num}:\n" + +SYSTEM_PROMPT_THINKING = """\ +You are a GUI agent. You are given an instruction, a screenshot of the screen and your previous interactions with the computer. You need to perform a series of actions to complete the task. The passoword of the computer is {password}. 
+ +For each step, provide your response in this format: +{thought} +## Action: +{action} +## Code: +{code} + +In the code section, the code should be either pyautogui code or one of the following functions wrapped in the code block: +- {"name": "computer.wait", "description": "Make the computer wait for 20 seconds for installation, running code, etc.", "parameters": {"type": "object", "properties": {}, "required": []}} +- {"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {"type": "object", "properties": {"status": {"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}, "answer": {"type": "string", "description": "The answer of the task"}}, "required": ["status"]}}\ +""" + +SYSTEM_PROMPT_NON_THINKING = """\ +You are a GUI agent. You are given an instruction, a screenshot of the screen and your previous interactions with the computer. You need to perform a series of actions to complete the task. The passoword of the computer is {password}. 
# Positional parameter order of the pyautogui calls that accept screen
# coordinates.  Only the positions of "x" and "y" are consulted when a call
# is rewritten; every other argument is passed through untouched.
_PYAUTOGUI_PARAM_NAMES: dict[str, list[str]] = {
    "click": ["x", "y", "clicks", "interval", "button", "duration", "pause"],
    "rightClick": ["x", "y", "duration", "tween", "pause"],
    "middleClick": ["x", "y", "duration", "tween", "pause"],
    "doubleClick": ["x", "y", "interval", "button", "duration", "pause"],
    "tripleClick": ["x", "y", "interval", "button", "duration", "pause"],
    "moveTo": ["x", "y", "duration", "tween", "pause"],
    "dragTo": ["x", "y", "duration", "button", "mouseDownUp", "pause"],
    "mouseDown": ["x", "y"],
    "mouseUp": ["x", "y"],
    "scroll": ["clicks", "x", "y"],
    "hscroll": ["clicks", "x", "y"],
    "vscroll": ["clicks", "x", "y"],
}


def project_pyautogui_coords(code: str, screen_w: int, screen_h: int) -> str:
    """Replace relative (0..1) coords in pyautogui.* calls with absolute px.

    Every ``pyautogui.<func>(...)`` call in ``code`` is inspected.  When both
    an ``x`` and a ``y`` argument can be read as numeric literals they are
    rewritten in place:

    * both values <= 1.0  -> treated as fractions of the screen and scaled
      by ``screen_w`` / ``screen_h``;
    * otherwise           -> treated as pixels and rounded to integers.

    All other arguments keep their original form (positional stays
    positional, keyword stays keyword, non-literal expressions are preserved)
    so the rewritten call binds its arguments exactly like the original one.

    Args:
        code: Source text that may contain pyautogui calls.
        screen_w: Screen width in pixels.
        screen_h: Screen height in pixels.

    Returns:
        ``code`` with the coordinates of every recognised call projected.
        Calls that cannot be parsed, or that do not carry both coordinates,
        are returned unchanged.
    """
    # The argument list may not contain ")" - nested calls are left alone.
    call_re = re.compile(r"pyautogui\.(\w+)\(([^\)]*)\)")

    def _rewrite(match: "re.Match[str]") -> str:
        original = match.group(0)
        func_name, args_src = match.group(1), match.group(2)
        try:
            call = ast.parse(f"f({args_src})", mode="eval").body
        except SyntaxError:
            return original
        if not isinstance(call, ast.Call):
            return original

        # Locate the AST nodes that hold x and y.
        param_names = _PYAUTOGUI_PARAM_NAMES.get(func_name, [])
        coord_nodes: dict[str, ast.expr] = {}
        for idx, node in enumerate(call.args):
            if isinstance(node, ast.Starred):
                # Positions after *args are unknowable statically.
                break
            if idx < len(param_names) and param_names[idx] in ("x", "y"):
                coord_nodes[param_names[idx]] = node
        for kw in call.keywords:
            if kw.arg in ("x", "y"):
                coord_nodes[kw.arg] = kw.value
        if "x" not in coord_nodes or "y" not in coord_nodes:
            return original

        try:
            x_rel = float(ast.literal_eval(coord_nodes["x"]))
            y_rel = float(ast.literal_eval(coord_nodes["y"]))
            if x_rel <= 1.0 and y_rel <= 1.0:
                x_px = int(round(x_rel * screen_w))
                y_px = int(round(y_rel * screen_h))
            else:
                x_px = int(round(x_rel))
                y_px = int(round(y_rel))
        except (TypeError, ValueError, SyntaxError, OverflowError):
            # Non-literal, non-numeric or non-finite coordinate: leave as is.
            return original

        substituted = {
            id(coord_nodes["x"]): str(x_px),
            id(coord_nodes["y"]): str(y_px),
        }

        def _render(node: ast.AST) -> str:
            if id(node) in substituted:
                return substituted[id(node)]
            return ast.unparse(node)

        parts = [_render(node) for node in call.args]
        for kw in call.keywords:
            rendered = _render(kw.value)
            # kw.arg is None for a ``**mapping`` argument.
            parts.append(f"**{rendered}" if kw.arg is None else f"{kw.arg}={rendered}")
        return f"pyautogui.{func_name}({', '.join(parts)})"

    return call_re.sub(_rewrite, code)
def parse_response(
    content: str,
    reasoning_content: str,
    screen_w: int,
    screen_h: int,
    *,
    thinking: bool,
) -> ParsedStep:
    """Parse model output into a ParsedStep.

    Args:
        content: Final answer text returned by the model.
        reasoning_content: Text from the model's reasoning stream.
        screen_w: Screen width in pixels, used to project relative coords.
        screen_h: Screen height in pixels, used to project relative coords.
        thinking: Whether the request ran in thinking mode.  In thinking mode
            the thought comes from ``reasoning_content``; otherwise it is read
            from a ``## Thought`` section of ``content``.

    Returns:
        A ParsedStep.  ``status`` is ``"error"`` (with ``error`` set) when no
        action / code could be extracted, ``"wait"`` / ``"done"`` / ``"fail"``
        for the ``computer.*`` helper calls, and ``"continue"`` for ordinary
        pyautogui code.
    """

    def _strip_think_block(text: str) -> str:
        # Keep only what follows the last closing think tag.  Splitting on an
        # empty string is not allowed (str.rsplit raises ValueError), so the
        # marker must be spelled out.
        marker = "</think>"
        if marker in text:
            return text.rsplit(marker, 1)[-1]
        return text

    def _clean_code_section(section: str) -> str:
        # Prefer the last fenced block; otherwise strip stray backticks.
        section = section.strip()
        fenced = re.findall(
            r"```[A-Za-z0-9_-]*\s*(.*?)\s*```",
            section,
            re.DOTALL,
        )
        if fenced:
            return fenced[-1].strip()
        section = section.strip("`").strip()
        section = re.sub(r"\s*```\s*$", "", section).strip()
        return section

    def _terminal_status(block: str) -> Optional[tuple[str, str]]:
        # Returns (code, status) for computer.wait / computer.terminate,
        # ("", "error") for a terminate call without a recognisable status,
        # and None for ordinary executable code.
        lower = block.lower()
        if "computer.wait" in lower:
            return "WAIT", "wait"
        if "computer.terminate" in lower:
            # An explicit status argument wins over substring heuristics, so
            # a successful run whose answer text mentions "fail" is not
            # reported as a failure.
            explicit = re.search(
                r"status[\"']?\s*[=:]\s*[\"'](success|failure)[\"']",
                lower,
            )
            if explicit:
                if explicit.group(1) == "success":
                    return "DONE", "done"
                return "FAIL", "fail"
            if "failure" in lower or "fail" in lower:
                return "FAIL", "fail"
            if "success" in lower:
                return "DONE", "done"
            return "", "error"
        return None

    def _parse_content(candidate: str, thought: str) -> ParsedStep:
        out = ParsedStep(thought=thought)
        candidate = _strip_think_block(candidate).lstrip()

        if not thinking:
            thought_m = re.search(
                r"^##\s*Thought\s*:?[\n\r]+(.*?)(?=^##\s*Action:|^##|\Z)",
                candidate,
                re.DOTALL | re.MULTILINE,
            )
            out.thought = thought_m.group(1).strip() if thought_m else ""

        action_heading = re.search(
            r"^\s*##\s*Action\s*:?",
            candidate,
            flags=re.MULTILINE | re.IGNORECASE,
        )
        if not action_heading:
            out.status = "error"
            out.error = "missing action after parsing"
            return out

        # Restrict parsing to the first "## Action" section: drop anything
        # before it and anything from a second "## Action" heading onwards.
        candidate = candidate[action_heading.start():]
        next_action = re.search(
            r"\n\s*##\s*Action\s*:?",
            candidate[len(action_heading.group(0)) :],
            flags=re.IGNORECASE,
        )
        if next_action:
            span_end = len(action_heading.group(0)) + next_action.start()
            candidate = candidate[:span_end]

        action_m = re.search(
            r"##\s*Action\s*:?\s*(.*?)(?=\s*##\s*Code\b|\Z)",
            candidate,
            re.DOTALL | re.IGNORECASE,
        )
        if action_m:
            out.action = action_m.group(1).strip()

        code_sections = re.findall(
            r"##\s*Code\s*:?\s*(.*?)(?=\s*##\s*Code\b|\s*##\s*Action\b|\Z)",
            candidate,
            re.DOTALL | re.IGNORECASE,
        )
        if not code_sections:
            # No "## Code" heading: fall back to any fenced block.
            fenced_blocks = re.findall(
                r"```(?:code|python|py)?\s*(.*?)\s*```",
                candidate,
                re.DOTALL | re.IGNORECASE,
            )
            code_sections = fenced_blocks

        if not code_sections:
            out.status = "error"
            out.error = "no code block found"
            return out

        executable_blocks: list[str] = []
        original_blocks: list[str] = []
        for raw_section in code_sections:
            block = _clean_code_section(raw_section)
            if not block:
                continue
            original_blocks.append(block)
            terminal = _terminal_status(block)
            if terminal is not None:
                if executable_blocks:
                    # Some responses can append repeated terminate calls
                    # after a valid pyautogui action. The first executable
                    # action is the safe one to run for this step.
                    continue
                code, status = terminal
                if status == "error":
                    out.status = "error"
                    out.error = "computer.terminate without explicit status"
                else:
                    out.code, out.status = code, status
                    out.original_code = block
                return out
            executable_blocks.append(block)

        if not executable_blocks:
            out.status = "error"
            out.error = "no executable code found"
            return out

        block = "\n".join(executable_blocks).strip()
        out.original_code = "\n".join(original_blocks).strip()
        out.code = project_pyautogui_coords(block, screen_w, screen_h)
        if not out.action or not out.code:
            out.status = "error"
            out.error = "missing action or code after parsing"
        return out

    thought = reasoning_content.strip() if thinking else ""
    parsed = _parse_content(content, thought)
    if parsed.status != "error":
        return parsed

    # Some OpenAI-compatible vLLM responses may place the formatted action/code
    # block in the reasoning stream instead of the final content field. Only
    # fall back when reasoning contains explicit response headings, so random
    # scratch-pad examples are not executed.
    if thinking:
        action_heading = re.search(r"^##\s*Action\b", reasoning_content, re.MULTILINE)
    else:
        action_heading = None
    if action_heading:
        fallback_thought = reasoning_content[: action_heading.start()].strip() or thought
        fallback = _parse_content(reasoning_content[action_heading.start():], fallback_thought)
        if fallback.status != "error":
            return fallback

    return parsed
Holotron uses this to feed + ```` back as the next user turn.""" + return None + + @property + def system_prompt(self) -> str: + tmpl = SYSTEM_PROMPT_THINKING if self.thinking else SYSTEM_PROMPT_NON_THINKING + return tmpl.replace("{password}", self.password) + + @property + def assistant_template(self) -> str: + return ( + ASSISTANT_HISTORY_TEMPLATE_THINKING + if self.thinking + else ASSISTANT_HISTORY_TEMPLATE_NON_THINKING + ) + + def _b64(self, data: bytes) -> str: + return base64.b64encode(data).decode("ascii") + + def build_messages(self, instruction: str, current_png: bytes) -> list[dict]: + """Construct the chat messages array for the API call.""" + messages: list[dict] = [{"role": "system", "content": self.system_prompt}] + instr_block = INSTRUCTION_TEMPLATE.format(instruction=instruction) + + n_with_images = min(len(self.history), max(1, self.max_image_history_length) - 1) + image_window_start = len(self.history) - n_with_images + + # Text-only history for older steps + text_history = "" + if image_window_start > 0: + parts = [] + for i in range(image_window_start): + parts.append( + STEP_TEMPLATE.format(step_num=i + 1) + + TEXT_HISTORY_TEMPLATE.format( + thought=self.history[i].thought, + action=self.history[i].action, + ) + ) + text_history = "# Previous History Actions:\n" + "\n".join(parts) + + # Image-included history (recent steps) + for i in range(image_window_start, len(self.history)): + user_text = instr_block + if i == image_window_start and text_history: + user_text += text_history + "\n" + user_text += f"You are currently on Step {i + 1}.\n" + messages.append({ + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{self._b64(self.history[i].screenshot_png)}"}, + }, + {"type": "text", "text": user_text}, + ], + }) + messages.append({ + "role": "assistant", + "content": self.assistant_template.format( + thought=self.history[i].thought, + action=self.history[i].action, + ), + }) + + # Current step 
+ current_text = instr_block + if n_with_images == 0 and text_history: + current_text += text_history + "\n" + current_text += f"You are currently on Step {len(self.history) + 1}.\n" + + messages.append({ + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{self._b64(current_png)}"}, + }, + {"type": "text", "text": current_text}, + ], + }) + return messages + + def build_extra_body(self) -> dict[str, Any]: + """Build reasoning controls for the local or remote vLLM path. + + Nemotron-3 Nano Omni uses thinking mode for CUA quality. The model docs + and vLLM launch path both describe the same contract: enable thinking + in chat template kwargs, give the model a reasoning budget, and reserve + a small grace window so generation can transition from reasoning into + the final `## Action` / `## Code` answer instead of ending at length. + """ + + if not self.thinking: + return { + "chat_template_kwargs": { + "enable_thinking": False, + "truncate_history_thinking": self.truncate_history_thinking, + } + } + + return { + "thinking_token_budget": self.reasoning_budget + self.reasoning_grace_tokens, + "chat_template_kwargs": { + "enable_thinking": True, + "reasoning_budget": self.reasoning_budget, + "truncate_history_thinking": self.truncate_history_thinking, + }, + } + + async def step( + self, + instruction: str, + screenshot_png: bytes, + screen_size: tuple[int, int], + *, + delta_callback: Optional[DeltaCallback] = None, + ) -> ParsedStep: + """Send one OpenAI-compatible inference request and parse the result. 
+ + Retry logic: + - Retry request, timeout, finish-reason, and parse failures + - Each model attempt has a bounded wall-clock timeout + - Temperature floor bumped to 0.2 on subsequent attempts + - finish_reason must be "stop" for a valid response + """ + + messages = self.build_messages(instruction, screenshot_png) + + client = AsyncOpenAI( + base_url=self.api_base, + api_key=self.api_key, + ) + + max_retry = max(1, self.max_retry) + last_error = "unknown" + parsed: Optional[ParsedStep] = None + + for attempt in range(max_retry): + # Match cua_demo: bump temperature floor on retries so the model + # isn't deterministically stuck producing the same malformed output + temperature = self.temperature if attempt == 0 else max(0.2, self.temperature) + + if delta_callback is not None and attempt > 0: + # Surface a visible marker into the live stream so the UI + # shows that we're retrying and why. + try: + await delta_callback( + f"\n\n[retry {attempt + 1}/{max_retry}: {last_error}]\n\n", + "", + ) + except Exception: + pass + + if delta_callback is not None: + try: + await delta_callback( + ( + f"\n\n[model attempt {attempt + 1}/{max_retry}; " + f"timeout {self.model_attempt_timeout:g}s]\n\n" + ), + "", + ) + except Exception: + pass + + try: + extra_body = self.build_extra_body() + + logger.info( + "model attempt {}/{} (timeout={}s, temperature={})", + attempt + 1, + max_retry, + self.model_attempt_timeout, + temperature, + ) + + content, reasoning, finish_reason = await asyncio.wait_for( + self._stream_completion( + client, + messages, + temperature, + extra_body, + delta_callback, + ), + timeout=self.model_attempt_timeout, + ) + + # Match cua_demo: check finish_reason is "stop" + if finish_reason not in (None, "stop"): + last_error = f"unexpected finish_reason={finish_reason}" + logger.warning( + "attempt {}/{}: {}; retrying", + attempt + 1, + max_retry, + last_error, + ) + if attempt + 1 < max_retry: + await asyncio.sleep(self.retry_sleep) + continue + + parsed = 
parse_response( + content, reasoning, screen_size[0], screen_size[1], + thinking=self.thinking, + ) + + if parsed.status != "error": + break + last_error = parsed.error or "parse error" + logger.warning( + "attempt {}/{}: parse error: {}; retrying", + attempt + 1, + max_retry, + last_error, + ) + if attempt + 1 < max_retry: + await asyncio.sleep(self.retry_sleep) + + except asyncio.TimeoutError: + last_error = f"model timed out after {self.model_attempt_timeout:g}s" + logger.warning( + "attempt {}/{}: {}; retrying", + attempt + 1, + max_retry, + last_error, + ) + if attempt + 1 < max_retry: + await asyncio.sleep(self.retry_sleep) + + except Exception as e: + last_error = f"request failed: {e}" + logger.error( + "attempt {}/{}: {}; retrying", + attempt + 1, + max_retry, + last_error, + ) + if attempt + 1 < max_retry: + await asyncio.sleep(self.retry_sleep) + continue + + if parsed is None or parsed.status == "error": + return ParsedStep( + status="error", + error=f"all {max_retry} attempts failed: {last_error}", + ) + + # Record history for successful steps + if parsed.status in {"continue", "wait"}: + self.history.append( + _Turn(screenshot_png=screenshot_png, thought=parsed.thought, action=parsed.action) + ) + return parsed + + async def _stream_completion( + self, + client: AsyncOpenAI, + messages: list[dict], + temperature: float, + extra_body: dict[str, Any], + delta_callback: Optional[DeltaCallback], + ) -> tuple[str, str, Optional[str]]: + """Run one streaming model request and return content, reasoning, finish.""" + + reasoning_buf: list[str] = [] + content_buf: list[str] = [] + finish_reason: Optional[str] = None + + stream = await client.chat.completions.create( + model=self.model, + messages=messages, + temperature=temperature, + top_p=self.top_p, + max_tokens=self.max_tokens, + stream=True, + extra_body=extra_body if extra_body else None, + ) + + async with stream: + async for chunk in stream: + if not chunk.choices: + continue + choice = 
chunk.choices[0] + delta = choice.delta + + # Handle reasoning content (thinking tokens). vLLM variants + # have used both field names. + r_delta = ( + getattr(delta, "reasoning_content", None) + or getattr(delta, "reasoning", None) + or "" + ) + c_delta = delta.content or "" + + if r_delta: + reasoning_buf.append(r_delta) + if c_delta: + content_buf.append(c_delta) + + if (r_delta or c_delta) and delta_callback: + try: + await delta_callback(r_delta, c_delta) + except Exception: + logger.warning("delta_callback raised; continuing stream") + + # Capture finish_reason from the final chunk + if choice.finish_reason: + finish_reason = choice.finish_reason + + return "".join(content_buf), "".join(reasoning_buf), finish_reason diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/agent_runner.py b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/agent_runner.py new file mode 100644 index 000000000..5543674d5 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/agent_runner.py @@ -0,0 +1,192 @@ +"""Agent loop that drives a NemotronAgent against the desktop container. + +Publishes events to an asyncio queue so the FastAPI SSE endpoint can +stream them to the browser in real-time. 
class AgentRunner:
    """Runs the observe -> model -> act loop for one job at a time.

    Each loop iteration takes a screenshot, asks the agent for the next step,
    and either finishes the job or executes the returned pyautogui code on
    the desktop.  Progress is published as dict events on ``job.events``.
    """

    def __init__(
        self,
        agent: NemotronAgent,
        desktop: DesktopClient,
        *,
        max_steps: int = 40,
        wait_after_action: float = 0.0,
        computer_wait_seconds: float = 3.0,
    ) -> None:
        """Store the collaborators and loop settings.

        Args:
            agent: Agent whose ``step`` / ``reset`` / ``record_tool_result``
                methods drive the loop.
            desktop: Client used for screenshots and code execution.
            max_steps: Upper bound on loop iterations per job.
            wait_after_action: Seconds to pause after executing an action.
            computer_wait_seconds: Seconds to pause when the model asks to
                wait; negative values are clamped to 0.
        """
        self.agent = agent
        self.desktop = desktop
        self.max_steps = max_steps
        self.wait_after_action = wait_after_action
        self.computer_wait_seconds = max(0.0, computer_wait_seconds)

    def start(self, instruction: str) -> AgentJob:
        """Create a job, reset the agent, and schedule the loop as a task.

        Uses ``asyncio.create_task``, so it must be called while an event
        loop is running.
        """
        job = AgentJob(
            job_id=f"job-{uuid.uuid4().hex[:8]}",
            instruction=instruction,
            started_at=time.time(),
        )
        self.agent.reset()
        job._task = asyncio.create_task(self._run(job))
        return job

    async def _emit(self, job: AgentJob, kind: str, **payload) -> None:
        """Queue one event: ``kind``, a wall-clock ``ts``, plus ``payload``."""
        await job.events.put({"kind": kind, "ts": time.time(), **payload})

    async def _run(self, job: AgentJob) -> None:
        """Drive the agent loop until the job reaches a terminal status.

        Terminal statuses set here: ``done``, ``failed`` (model reported
        failure or the step ceiling was hit), ``stopped`` (stop flag set or
        task cancelled) and ``error``.  A ``finished`` event is always
        emitted last.
        """
        job.status = "running"
        await self._emit(job, "started", instruction=job.instruction)

        try:
            screen_w, screen_h = await self.desktop.screen_size()
            await self._emit(job, "screen_size", width=screen_w, height=screen_h)

            for step in range(1, self.max_steps + 1):
                # Stop flag is re-checked before every potentially slow phase.
                if job._stop.is_set():
                    job.status = "stopped"
                    await self._emit(job, "stopped")
                    return

                await self._emit(job, "step_started", step=step)

                # Take screenshot
                try:
                    png = await self.desktop.screenshot()
                except Exception as exc:
                    job.status = "error"
                    job.error = f"screenshot failed: {exc}"
                    await self._emit(job, "error", message=job.error)
                    return

                # Stream reasoning tokens to frontend
                # (``_step=step`` binds the current step number at definition
                # time rather than reading the loop variable later.)
                async def _on_delta(r_delta: str, c_delta: str, _step=step) -> None:
                    await self._emit(
                        job, "thought_delta",
                        step=_step,
                        reasoning=r_delta,
                        content=c_delta,
                    )

                # Call the model
                parsed: ParsedStep = await self.agent.step(
                    job.instruction, png, (screen_w, screen_h),
                    delta_callback=_on_delta,
                )

                # The model call can take a long time; honour a stop request
                # that arrived meanwhile before acting on its answer.
                if job._stop.is_set():
                    job.status = "stopped"
                    await self._emit(job, "stopped")
                    return

                # The summary event carries at most 500 characters of thought.
                await self._emit(
                    job, "thought",
                    step=step,
                    thought=parsed.thought[:500] if parsed.thought else "",
                    action=parsed.action,
                    code=parsed.original_code,
                    status=parsed.status,
                )

                if parsed.status == "error":
                    job.status = "error"
                    job.error = parsed.error or "parse error"
                    await self._emit(job, "error", message=job.error)
                    return
                if parsed.status == "done":
                    job.status = "done"
                    await self._emit(job, "done", step=step, action=parsed.action)
                    return
                if parsed.status == "fail":
                    job.status = "failed"
                    await self._emit(job, "failed", step=step, action=parsed.action)
                    return
                if parsed.status == "wait":
                    await self._emit(job, "wait", step=step, seconds=self.computer_wait_seconds)
                    # Waiting on the stop event (instead of sleeping) lets a
                    # stop request end the pause early; a timeout just means
                    # the full wait elapsed.
                    try:
                        await asyncio.wait_for(
                            job._stop.wait(),
                            timeout=self.computer_wait_seconds,
                        )
                    except asyncio.TimeoutError:
                        pass
                    if job._stop.is_set():
                        job.status = "stopped"
                        await self._emit(job, "stopped")
                        return
                    continue

                # Execute pyautogui code
                if job._stop.is_set():
                    job.status = "stopped"
                    await self._emit(job, "stopped")
                    return
                try:
                    output = await self.desktop.run_pyautogui(parsed.code)
                    await self._emit(
                        job, "executed", step=step, code=parsed.code, output=output[:500]
                    )
                    self.agent.record_tool_result(parsed.tool_name or "pyautogui", output)
                except Exception as exc:
                    # An execution failure does not end the job: it is
                    # reported and handed to the agent, and the loop goes on.
                    err = str(exc)
                    await self._emit(
                        job, "execute_error", step=step, code=parsed.code, message=err
                    )
                    self.agent.record_tool_result(
                        parsed.tool_name or "pyautogui", f"error: {err}"
                    )

                # Interruptible pause after the action (same pattern as the
                # "wait" branch above).
                try:
                    await asyncio.wait_for(
                        job._stop.wait(),
                        timeout=self.wait_after_action,
                    )
                except asyncio.TimeoutError:
                    pass
                if job._stop.is_set():
                    job.status = "stopped"
                    await self._emit(job, "stopped")
                    return

            # Hit step ceiling
            job.status = "failed"
            await self._emit(job, "failed", reason="max_steps")

        except asyncio.CancelledError:
            # Record the stop, then re-raise so the task is marked cancelled.
            job.status = "stopped"
            await self._emit(job, "stopped")
            raise
        except Exception as exc:
            logger.exception("agent loop crashed")
            job.status = "error"
            job.error = str(exc)
            await self._emit(job, "error", message=str(exc))
        finally:
            # Runs on every exit path, including the early returns above.
            job.finished_at = time.time()
            await self._emit(job, "finished", status=job.status)
class DesktopClient:
    """Client for communicating with the desktop container.

    Every call opens a short-lived ``httpx.AsyncClient`` against
    ``http://{host}:{api_port}`` and closes it when the call returns.
    """

    def __init__(self, host: str = "localhost", api_port: int = 5000, *, timeout: float = 60.0) -> None:
        """Remember the base URL and the per-request timeout (seconds)."""
        self.base = f"http://{host}:{api_port}"
        self._timeout = timeout

    async def screenshot(self) -> bytes:
        """Capture current desktop as PNG bytes.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        async with httpx.AsyncClient(timeout=self._timeout) as c:
            r = await c.get(f"{self.base}/screenshot")
            r.raise_for_status()
            return r.content

    async def screen_size(self) -> tuple[int, int]:
        """Get desktop resolution as ``(width, height)``.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        async with httpx.AsyncClient(timeout=self._timeout) as c:
            r = await c.post(f"{self.base}/screen_size")
            r.raise_for_status()
            data = r.json()
            return int(data["width"]), int(data["height"])

    async def execute(self, command: list[str]) -> str:
        """Execute a command inside the desktop container.

        Returns:
            The ``output`` field of the JSON reply, or ``""`` when absent.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        async with httpx.AsyncClient(timeout=self._timeout) as c:
            r = await c.post(f"{self.base}/execute", json={"command": command})
            r.raise_for_status()
            return r.json().get("output", "")

    async def run_pyautogui(self, code: str) -> str:
        """Run a pyautogui snippet inside the desktop.

        Wraps the code with imports and executes via python3 -c.
        """
        wrapper = (
            "import pyautogui, time\n"
            "pyautogui.FAILSAFE = False\n"
            f"{code}\n"
        )
        return await self.execute(["python3", "-c", wrapper])

    async def wait_ready(self, max_seconds: float = 120.0, interval: float = 2.0) -> None:
        """Poll the desktop API until it's responsive.

        Args:
            max_seconds: How long to keep polling before giving up.
            interval: Pause between polls, in seconds.

        Raises:
            TimeoutError: If ``/health`` never returned 200 in time.  The
                message carries the last exception or HTTP status seen.
        """
        # Monotonic clock: the deadline must not move when the wall clock is
        # adjusted (NTP sync, manual change) while we are polling.
        deadline = time.monotonic() + max_seconds
        last_err: Exception | str | None = None
        while time.monotonic() < deadline:
            try:
                async with httpx.AsyncClient(timeout=10) as c:
                    r = await c.get(f"{self.base}/health")
                    if r.status_code == 200:
                        return
                    # Keep the status so the timeout message is informative.
                    last_err = f"HTTP {r.status_code}"
            except Exception as exc:
                last_err = exc
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline.
            await asyncio.sleep(min(interval, remaining))
        raise TimeoutError(f"Desktop API never became ready after {max_seconds}s: {last_err}")

    async def is_ready(self) -> bool:
        """Quick check if the desktop is responding.

        Returns ``False`` on any exception instead of raising.
        """
        try:
            async with httpx.AsyncClient(timeout=5) as c:
                r = await c.get(f"{self.base}/health")
                return r.status_code == 200
        except Exception:
            return False
# Shared base for the tool payload models in this module.  ``extra="forbid"``
# makes validation reject any key a model does not declare.
class _Tool(BaseModel):
    model_config = ConfigDict(extra="forbid")
# ``x`` / ``y`` are validated to the integer range 0..1000; per the module
# docstring they are scaled to absolute pixels before pyautogui code is sent
# to the desktop container.  ``button`` defaults to the left mouse button.
class ClickDesktop(_Tool):
    """Click at (x, y) on a UI element."""

    tool_name: Literal["click_desktop"]
    element: str = Field(description="Detailed description of the target element.")
    x: int = Field(ge=0, le=1000, description="X coordinate as integer in [0, 1000].")
    y: int = Field(ge=0, le=1000, description="Y coordinate as integer in [0, 1000].")
    button: MouseButton = Field(default="left")
# ``x`` / ``y`` are validated to the integer range 0..1000 (same coordinate
# convention as the other pointer tools).  ``scroll_size`` is limited to
# 1..100 wheel clicks and defaults to 10.
class ScrollDesktop(_Tool):
    """Move to (x, y) and scroll in a direction."""

    tool_name: Literal["scroll_desktop"]
    element: str = Field(description="Description of the region to scroll.")
    x: int = Field(ge=0, le=1000)
    y: int = Field(ge=0, le=1000)
    direction: ScrollDirection
    scroll_size: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Number of mouse wheel clicks to scroll.",
    )
# NOTE(review): only `#` comments are added here. The class docstring and
# Field descriptions presumably end up in _SCHEMA (Step.model_json_schema()),
# which is both embedded in the system prompt and sent as the structured
# output constraint — confirm before rewording them.
class Step(BaseModel):
    """The structured output the model emits each turn."""

    # Reject any key other than note / thought / tool_call.
    model_config = ConfigDict(extra="forbid")
    # Optional running memory; when present it is prefixed with "Note: " and
    # merged into the displayed thought text.
    note: Optional[str] = Field(
        default=None,
        description=(
            "Persistent notes — extract task-relevant facts from the current "
            "screen. Set to null when nothing new is worth recording."
        ),
    )
    # Required free-text reasoning for this turn.
    thought: str = Field(
        description="Brief reasoning about progress and the next action."
    )
    # Exactly one member of the ToolCall union; the concrete class decides
    # whether the step ends the run (answer), is plan-only (update_plan), or
    # is rendered to pyautogui code.
    tool_call: ToolCall = Field(description="Exactly one tool to invoke.")
**Persistence through adaptation**: When blocked, analyze why. Refine your approach, try alternative methods, or pivot to new strategies until momentum returns. +4. **Methodical progress**: Each action should clarify, advance, or eliminate possibilities. Avoid aimless steps. Plan clearly, execute cleanly, verify always. +5. **Structured awareness**: Track what's known, pending, and missing. Record essential evidence before changing windows or applications; your notes are your only lasting memory. +6. **Closure with care**: Conclude only when the targeted result is verifiably reached or all information is gathered beyond doubt. +7. **Complete retention**: Make your answer as complete and verifiable as possible. Everything else (notes, thoughts, intermediate steps) will be lost forever. + +# Workflow + +You are an agent using the ReAct pattern to iteratively (1) observe the environment, (2) reason about next steps and (3) act in a series of steps. +You must emit at each step a single JSON object with keys `note`, `thought` and `tool_call`. + +## `note` + +GOAL: Persist information from the previous observation. +- Only the last few screenshots are kept in memory; notes persist. Before changing screens or windows, extract ALL task-relevant information into your note (you cannot return to review old screenshots). +- Extract visible information and evidence from the current environment state in relation to your task. +- Capture ALL relevant data in detail: values, short text excerpts, tables, context, application names, window titles, file paths, dialog messages, button states, timestamps, etc. +- Notes build on top of previous notes. New notes must be distinct from previous notes. Never restate old info. Set to null if nothing new. + +## `thought` + +GOAL: Reason strategically about next steps. +- Assess whether your past tool call was successful or not. +- Detect loops: Have you performed this same action before? 
If yes and it failed previously, you MUST pivot to a different approach. +- Assess progress: advancing toward completion, temporarily blocked, or fundamentally stuck. +- Identify remaining gaps: information needed, actions to perform, etc. +- Reason about next steps: continue, shortcut, backtrack, pivot, change strategy, etc. +- Select the optimal next tool to call. +- Reason about the arguments to pass to the tool. + +## `tool_call` + +GOAL: Emit a single tool call in JSON format strictly following the provided schema. +- Select one of the available tools as chosen in your reasoning. +- Each tool's schema specifies required vs optional arguments and their types (consult it before calling). +- Call the tool and its arguments in JSON format strictly following the provided schema. +- Invalid tool calls will not be executed. +- Emit only a single tool call per step. + +# Tools + +## Desktop Control Tools + +You control a live desktop environment via dedicated tools (mouse, keyboard, hotkeys). +At each step, you will be provided with a fresh screenshot of the current desktop screen. + +Coordinates `x` and `y` are integers in `[0, 1000]`, normalized against the +screenshot. The origin is the top-left corner. Coordinates will be scaled +to absolute pixels by the host driver before execution. + +Available desktop tools: +- `click_desktop(element, x, y, button="left")` — single click. +- `double_click_desktop(element, x, y)` — double click. +- `move_to_desktop(element, x, y)` — move cursor only, no click. +- `drag_to_desktop(element, x, y)` — drag from CURRENT cursor position to (x, y); call `move_to_desktop` first to set the drag origin (use this to select text, move files, draw, etc.). +- `scroll_desktop(element, x, y, direction, scroll_size=10)` — move to (x, y) then scroll up/down/left/right by `scroll_size` mouse wheel clicks (max 100). 
+- `write_desktop(content, press_enter=False, overwrite=False)` — type text at the current focus; `overwrite=True` selects-all then deletes first; `press_enter=True` presses Enter when done. +- `hotkey_desktop(keys, repeat_count=1)` — press multiple keys in order then release them in reverse, e.g. `keys=["ctrl","c"]`. Use this for keyboard shortcuts. +- `hold_and_tap_key_desktop(hold_keys, tap_keys)` — hold modifiers while tapping a sequence (e.g. hold `["ctrl"]`, tap `["a","c"]`). +- `key_down_desktop(key)` / `key_up_desktop(key)` — fine-grained press/release; pair them. + +### Critical blockers + +Critical blockers are obstacles that prevent progress on the current path. Handle them immediately, then pivot to alternative approaches. Only report a blocker as insurmountable after exhausting all viable alternatives: alternative methods, different navigation paths, simplified approaches, or workarounds. + +1. **Permission/authentication dialogs**: Handle or dismiss as needed. The sudo / lock password for this machine is `{password}` — use it only when the task explicitly requires it. +2. **Application not responding**: Wait briefly, then force quit and restart if necessary. +3. **Missing applications**: Attempt to locate or launch from alternative paths; report if unavailable and required. +4. **System dialogs/alerts**: Read carefully, dismiss or acknowledge appropriately to proceed. +5. **Locked files/permissions**: Try alternative locations or methods; report only if specifically required resource is inaccessible. + +### Strategic interaction + +Desktop applications expose various UI elements and shortcuts for efficient interaction. Use their logical structure and conventions. +- **Keyboard shortcuts**: Leverage common operations (Ctrl+C for copy, Ctrl+V for paste, Ctrl+S for save, Ctrl+F for find, etc.). +- **Application launching**: Click the dock/taskbar icons or use the application launcher. 
+- **Window management**: Use hotkeys for switching apps (Alt+Tab), or click window controls. +- **File operations**: Navigate file managers, use drag-and-drop for moving files (`move_to_desktop` then `drag_to_desktop`), or keyboard shortcuts for copy/paste/save operations. +- **Menu navigation**: Click menu items or use keyboard navigation (Alt+key). +- **Text input**: Text can be typed at the current cursor location. Focus input fields by clicking them first when needed. +- **Scrolling**: Navigate long content with `scroll_desktop` or `pagedown`/`pageup` keys. +- **Selecting text**: `move_to_desktop` to the start, then `drag_to_desktop` to the end. Or click then `hold_and_tap_key_desktop(["shift"], ["end"])`. + +### Element Localization + +Tools include an `element` string to describe the target UI element for traceability and clarity. Provide a clear, uniquely identifying description, including: +1. **Visible text**: Exact label, button text, placeholder text, heading, etc. +2. **Visual attributes**: Color, icon, shape, size, state, etc. +3. **Position**: Rough indication of location, quadrant, container, etc. +4. **Context anchors**: Description of nearby element(s) and their relative position. + +Good example: "Blue 'Search' button with magnifying glass icon, top-right of header, next to login link" +Bad example: "search button" + +## Planning + +Use `update_plan` to create, track, and adapt your task plan. + +- **Reasoning**: Before calling the tool, analyze in your `thought`: consider task breakdown, current gaps, dependencies, and complexity when creating an initial plan; diagnose failures, invalidated assumptions, and alternative routes when replanning. +- **When to plan**: Create a plan within your first 2-3 steps for non-trivial tasks (>5 steps or multiple sources). You may perform 1-2 exploratory actions first to understand the landscape, then plan based on findings. 
+- **Goal design**: Design goals that are action-oriented with concise titles beginning with a verb, achievable with concrete target states or success criteria, progressive where each goal unlocks the next, and right-sized with fewer goals for simple tasks and more for complex tasks. +- **Tracking progress**: Keep status up-to-date to stay on track and detect when stuck. When you complete a goal, call `update_plan` with the full goal list where the completed goal has status='done' and the next goal has status='running'. Always include all goals in your list. Only one goal should be 'running' at a time. Finish only when all goals are 'done'. +- **Replanning**: Replan immediately when you've attempted the same approach twice and failed both times, when new facts invalidate initial assumptions or partial results, when the user changes the task scope or priorities, or when a goal becomes impossible to complete (not just difficult). Include your done/failed goals plus new goals in the list. + +## Answer Tool + +Calling `answer` will terminate your task and send the final answer to the user. +This is an irreversible action; you must be certain beyond a doubt that all task requirements are verifiably met before terminating. + +### Termination criteria + +Call `answer` only when ALL of the following are true: + +1. **Task requirements met**: All requested information gathered OR target state reached and verified +2. **Evidence captured**: Verifiable proof exists in your notes (window/app names, window titles, file paths, dialog texts, confirmations, data values) +3. **Exploration exhausted**: No viable alternative approaches or methods remain untried +4. **Contradictions resolved**: Conflicting information has been cross-checked or acknowledged as unresolvable +5. **Actions confirmed**: Any file operations, form submissions, or system changes show clear success confirmation +6. 
**Notes complete**: All visible observations from the current screen recorded (screenshots are ephemeral) + +Do NOT call `answer` if: +- Alternative approaches exist that might accomplish the task +- You have not verified critical results when feasible +- Current screen contains unrecorded information relevant to the task +- A blocker was encountered but workarounds remain unexplored +- You reached a state but lack concrete evidence + +When in doubt about termination, ask: "If I had to defend this answer in court, do I have the evidence?" If no, continue working. + +### How to call the `answer` tool + +Your final answer is the ONLY artifact forwarded to the user; everything else is lost. + +**Rules** +1. Include as much information as necessary for a self-contained, verifiable output. Address every task requirement explicitly. +2. Prioritize completeness over brevity; avoid omitting critical details. +3. Synthesize ALL relevant information you have observed into one coherent answer. +4. Maximize retention (all that is not passed as an answer is lost forever). +5. Assume the reader has zero context about your process. Your answer must be self-contained and defensible. 
+ +# System settings + +The current date is {start_time} +Maximal budget: {max_steps} steps or {max_time_s}s +Environment: Local desktop ({os_name}) +Effort level: High +Persona: Meticulous, precise, detail-oriented +Output format: JSON +""" + + +def _build_system_prompt( + *, + password: str, + max_steps: int, + max_time_s: int, + os_name: str = "Linux", +) -> str: + body = _SYSTEM_PROMPT_TEMPLATE.format( + start_time=datetime.datetime.now().strftime("%Y-%m-%d"), + max_steps=max_steps, + max_time_s=max_time_s, + os_name=os_name, + password=password, + ) + return ( + body + + "\n\n```json\n" + + json.dumps(_SCHEMA) + + "\n```\n\n" + ) + + +# ── pyautogui code rendering ──────────────────────────────────────────────── + +def _scale(c: int, dim: int) -> int: + """Scale a normalized [0, 1000] integer coord to absolute pixels.""" + return int(round(max(0, min(int(c), 1000)) / 1000.0 * dim)) + + +def _q(s: str) -> str: + """Python-source-safe literal for typewrite/keys.""" + return repr(s) + + +def _render_click(args: ClickDesktop, w: int, h: int) -> str: + return ( + f"pyautogui.click({_scale(args.x, w)}, {_scale(args.y, h)}," + f" button={args.button!r})" + ) + + +def _render_double_click(args: DoubleClickDesktop, w: int, h: int) -> str: + return ( + f"pyautogui.doubleClick({_scale(args.x, w)}, {_scale(args.y, h)}," + f" button='left')" + ) + + +def _render_drag_to(args: DragToDesktop, w: int, h: int) -> str: + x, y = _scale(args.x, w), _scale(args.y, h) + return ( + "import time\n" + "pyautogui.mouseDown(button='left')\n" + "time.sleep(0.05)\n" + f"pyautogui.moveTo({x}, {y}, duration=0.2)\n" + "time.sleep(0.05)\n" + "pyautogui.mouseUp(button='left')" + ) + + +def _render_scroll(args: ScrollDesktop, w: int, h: int) -> str: + x, y = _scale(args.x, w), _scale(args.y, h) + sz = int(args.scroll_size) + lines = [f"pyautogui.moveTo({x}, {y})"] + if args.direction == "up": + lines.append(f"pyautogui.scroll({sz})") + elif args.direction == "down": + 
def _render_write(args: WriteDesktop) -> str:
    """Build the pyautogui snippet that types ``args.content``.

    The snippet is assembled from up to three parts joined by newlines:
    an optional select-all + delete prelude (when ``overwrite`` is set),
    the ``typewrite`` call itself, and an optional trailing Enter press
    (when ``press_enter`` is set).
    """
    clear_first = (
        ["pyautogui.hotkey('ctrl', 'a')", "pyautogui.press('delete')"]
        if args.overwrite
        else []
    )
    # _q() turns the text into a Python literal so it is safe to embed.
    type_text = f"pyautogui.typewrite({_q(args.content)}, interval=0.02)"
    confirm = ["pyautogui.press('enter')"] if args.press_enter else []
    return "\n".join([*clear_first, type_text, *confirm])
{tc.direction} ×{tc.scroll_size} at ({tc.x},{tc.y})" + f" [{tc.element}]" + ) + if isinstance(tc, MoveToDesktop): + return f"Move to ({tc.x},{tc.y}) — {tc.element}" + if isinstance(tc, WriteDesktop): + suffix = " + Enter" if tc.press_enter else "" + prefix = "Overwrite with " if tc.overwrite else "Type " + return f"{prefix}{tc.content!r}{suffix}" + if isinstance(tc, KeyDownDesktop): + return f"Key down {tc.key}" + if isinstance(tc, KeyUpDesktop): + return f"Key up {tc.key}" + if isinstance(tc, HotkeyDesktop): + n = f" ×{tc.repeat_count}" if tc.repeat_count > 1 else "" + return f"Hotkey {'+'.join(tc.keys)}{n}" + if isinstance(tc, HoldAndTapKeyDesktop): + return f"Hold {'+'.join(tc.hold_keys)} tap {','.join(tc.tap_keys)}" + if isinstance(tc, UpdatePlan): + running = next( + (g.title for g in tc.goals if g.status == "running"), None + ) + done = sum(1 for g in tc.goals if g.status == "done") + return ( + f"Update plan ({done}/{len(tc.goals)} done; " + f"running: {running or '—'})" + ) + if isinstance(tc, Answer): + return f"Answer: {tc.content}" + return str(tc) + + +def _to_parsed_step(step: Step, screen_w: int, screen_h: int) -> ParsedStep: + """Translate a parsed Step into the demo's :class:`ParsedStep`.""" + tc = step.tool_call + parts: list[str] = [] + if step.note: + parts.append(f"Note: {step.note}") + if step.thought: + parts.append(step.thought) + thought_text = "\n".join(parts) + + # Terminal — answer ends the run. + if isinstance(tc, Answer): + return ParsedStep( + thought=thought_text, + action=_render_action_text(tc), + code="DONE", + original_code="answer", + status="done", + tool_name="answer", + ) + + # Plan-only — no desktop action; treat as a wait so the runner + # re-screenshots and lets the model take its next concrete step. 
+ if isinstance(tc, UpdatePlan): + return ParsedStep( + thought=thought_text, + action=_render_action_text(tc), + code="WAIT", + original_code="update_plan", + status="wait", + tool_name="update_plan", + ) + + # Desktop tools — emit pyautogui source for run_pyautogui(). + if isinstance(tc, ClickDesktop): + code = _render_click(tc, screen_w, screen_h) + elif isinstance(tc, DoubleClickDesktop): + code = _render_double_click(tc, screen_w, screen_h) + elif isinstance(tc, DragToDesktop): + code = _render_drag_to(tc, screen_w, screen_h) + elif isinstance(tc, ScrollDesktop): + code = _render_scroll(tc, screen_w, screen_h) + elif isinstance(tc, MoveToDesktop): + code = _render_move_to(tc, screen_w, screen_h) + elif isinstance(tc, WriteDesktop): + code = _render_write(tc) + elif isinstance(tc, KeyDownDesktop): + code = _render_key_down(tc) + elif isinstance(tc, KeyUpDesktop): + code = _render_key_up(tc) + elif isinstance(tc, HotkeyDesktop): + code = _render_hotkey(tc) + elif isinstance(tc, HoldAndTapKeyDesktop): + code = _render_hold_and_tap(tc) + else: + return ParsedStep( + thought=thought_text, + status="error", + error=f"unknown tool {getattr(tc, 'tool_name', '?')}", + ) + + return ParsedStep( + thought=thought_text, + action=_render_action_text(tc), + code=code, + original_code=code, + status="continue", + tool_name=tc.tool_name, + ) + + +# ── Image-budget trimming (mirrors holo-nano agent.py) ────────────────────── + +def _trim_to_last_n_images(messages: list[dict], n: int = 3) -> None: + """Demote older image_url chunks to a placeholder text marker. + + Mirrors the holo-nano harness behaviour where evicted observations + become "[Image omitted by context cleaning]" within the same + ``...`` wrapper. 
+ """ + seen = 0 + for msg in reversed(messages): + if msg.get("role") != "user" or not isinstance(msg.get("content"), list): + continue + for chunk in msg["content"]: + if chunk.get("type") != "image_url": + continue + seen += 1 + if seen > n: + chunk["type"] = "text" + chunk["text"] = "[Image omitted by context cleaning]" + chunk.pop("image_url", None) + + +# ── Agent ─────────────────────────────────────────────────────────────────── + +@dataclass +class HolotronAgent: + """Desktop automation agent powered by Hcompany/Holotron-3-Nano via vLLM.""" + + api_key: str + api_base: str = "http://host.docker.internal:8011/v1" + model: str = "holotron_local" + max_tokens: int = 4096 + top_p: float = 0.95 + temperature: float = 0.8 + max_image_history_length: int = 3 + password: str = "password" + + # Plan/step budget surfaced into the system prompt. + max_steps: int = 50 + max_time_s: int = 1800 + os_name: str = "Linux" + reasoning_effort: str = "medium" # vLLM extra_body field + + # Accepted-but-ignored knobs so main.py can pass the same kwargs as + # for NemotronAgent without branching: + thinking: bool = True + truncate_history_thinking: bool = False + reasoning_budget: int = 16384 + reasoning_grace_tokens: int = 1024 + model_attempt_timeout: float = 120.0 + max_retry: int = 3 + retry_sleep: float = 5.0 + + # Internal mutable state. Tuple is (tool_name, output, is_error). 
+ _messages: list[dict] = field(default_factory=list) + _pending_tool_result: Optional[tuple[str, str, bool]] = None + _instruction_seeded: bool = False + + def reset(self) -> None: + self._messages = [{ + "role": "system", + "content": _build_system_prompt( + password=self.password, + max_steps=self.max_steps, + max_time_s=self.max_time_s, + os_name=self.os_name, + ), + }] + self._pending_tool_result = None + self._instruction_seeded = False + + # ── Helpers ──────────────────────────────────────────────────────────── + + @staticmethod + def _b64(data: bytes) -> str: + return base64.b64encode(data).decode("ascii") + + def _append_observation(self, png: bytes) -> None: + self._messages.append({ + "role": "user", + "content": [ + {"type": "text", "text": "\n"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{self._b64(png)}" + }, + }, + {"type": "text", "text": "\n"}, + ], + }) + + def _flush_pending_tool_result(self) -> None: + if self._pending_tool_result is None: + return + tool_name, output, is_error = self._pending_tool_result + self._pending_tool_result = None + tag = "error" if is_error else "tool_output" + self._messages.append({ + "role": "user", + "content": f'<{tag} tool="{tool_name}">\n{output}\n', + }) + + def _extra_body(self) -> dict[str, Any]: + body: dict[str, Any] = { + "structured_outputs": {"json": _SCHEMA}, + "reasoning_effort": self.reasoning_effort, + } + if not self.thinking: + body["chat_template_kwargs"] = {"enable_thinking": False} + return body + + def record_tool_result(self, tool_name: str, output: str) -> None: + """Store the last tool's stdout/stderr for emission on the next turn.""" + truncated = (output or "")[:2000] + is_error = truncated.startswith("error:") + # Map non-desktop tool names (which never reach the runner's + # run_pyautogui path) to a default tag. 
    async def step(
        self,
        instruction: str,
        screenshot_png: bytes,
        screen_size: tuple[int, int],
        *,
        delta_callback: Optional[DeltaCallback] = None,
        client: Optional[httpx.AsyncClient] = None,
    ) -> ParsedStep:
        """Run one observe → model → parse turn and return the next action.

        The conversation is extended with the task (first call only), any
        queued tool result and the new screenshot; older screenshots are
        trimmed to ``max_image_history_length``. A streaming chat completion
        is then requested, with up to ``max(1, self.max_retry)`` attempts.

        Args:
            instruction: The user's task; only used the first time it is
                seeded into the history.
            screenshot_png: Current screen capture, appended as an image
                observation.
            screen_size: ``(width, height)`` in pixels, used to scale the
                model's normalized coordinates.
            delta_callback: Optional async callable invoked with
                ``(reasoning_delta, content_delta)``; also receives
                attempt/retry banners. Exceptions it raises while emitting
                those banners are ignored.
            client: Optional shared HTTP client. When omitted, a temporary
                client is created for this call and closed before returning.

        Returns:
            The :class:`ParsedStep` built from the validated model output, or
            a ``ParsedStep`` with ``status="error"`` when every attempt
            failed.
        """
        if not self._messages:
            self.reset()

        # Seed the user's task once at the start of the conversation.
        if not self._instruction_seeded:
            self._messages.append({
                "role": "user",
                "content": f"Task: {instruction}",
            })
            self._instruction_seeded = True

        # Render any queued tool result before the next observation so the
        # model can react to outcomes from its previous action.
        self._flush_pending_tool_result()
        self._append_observation(screenshot_png)
        _trim_to_last_n_images(self._messages, n=self.max_image_history_length)
        # NOTE(review): the messages appended above are not rolled back when
        # all attempts fail, so a failed step leaves its observation in the
        # history without a matching assistant turn — confirm this is intended.

        endpoint = self.api_base.rstrip("/") + "/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        # Only close the client in `finally` if it was created here.
        owns_client = client is None
        # NOTE(review): verify=False turns off TLS certificate verification
        # for the temporary client — confirm this is acceptable for
        # non-local endpoints.
        client = client or httpx.AsyncClient(
            timeout=self.model_attempt_timeout + 5,
            verify=False,
        )

        max_retry = max(1, self.max_retry)
        last_error = "unknown"
        parsed_step: Optional[Step] = None
        finish_reason: Optional[str] = None

        try:
            for attempt in range(max_retry):
                # First attempt uses the configured temperature; retries use
                # max(0.4, temperature - 0.1).
                temperature = (
                    self.temperature
                    if attempt == 0
                    else max(0.4, self.temperature - 0.1)
                )
                payload = {
                    "model": self.model,
                    "messages": self._messages,
                    "temperature": temperature,
                    "top_p": self.top_p,
                    "max_tokens": self.max_tokens,
                    "stream": True,
                }
                # Adds the structured-output schema and reasoning settings.
                payload.update(self._extra_body())

                # Surface attempt/retry banners to the UI stream; this is
                # best-effort, so callback failures are deliberately ignored.
                if delta_callback is not None:
                    try:
                        if attempt > 0:
                            await delta_callback(
                                f"\n\n[retry {attempt + 1}/{max_retry}: "
                                f"{last_error}]\n\n",
                                "",
                            )
                        await delta_callback(
                            (
                                f"\n\n[Holotron attempt "
                                f"{attempt + 1}/{max_retry}; timeout "
                                f"{self.model_attempt_timeout:g}s]\n\n"
                            ),
                            "",
                        )
                    except Exception:
                        pass

                try:
                    logger.info(
                        "Holotron attempt {}/{} (endpoint={}, timeout={}s, "
                        "temperature={})",
                        attempt + 1,
                        max_retry,
                        endpoint,
                        self.model_attempt_timeout,
                        temperature,
                    )
                    # Bound the whole streamed response, not just individual
                    # socket operations, by the per-attempt timeout.
                    content, _reasoning, finish_reason = await asyncio.wait_for(
                        self._stream_completion(
                            client, endpoint, payload, headers, delta_callback,
                        ),
                        timeout=self.model_attempt_timeout,
                    )

                    # Anything other than a clean stop (or no reported
                    # reason) is treated as a failed attempt and retried
                    # without attempting to parse the content.
                    if finish_reason not in (None, "stop"):
                        last_error = (
                            f"unexpected finish_reason={finish_reason}"
                        )
                        logger.warning(
                            "Holotron attempt {}/{}: {}; retrying",
                            attempt + 1,
                            max_retry,
                            last_error,
                        )
                        if attempt + 1 < max_retry:
                            await asyncio.sleep(self.retry_sleep)
                        continue

                    # Success path: a schema-valid Step ends the retry loop.
                    try:
                        parsed_step = Step.model_validate_json(content)
                        break
                    except ValidationError as exc:
                        last_error = f"json validation: {str(exc)[:200]}"
                    except Exception as exc:  # noqa: BLE001
                        last_error = (
                            f"parse error: {type(exc).__name__}: {exc}"
                        )

                    # Reached only when parsing failed above.
                    logger.warning(
                        "Holotron attempt {}/{}: {}; retrying",
                        attempt + 1,
                        max_retry,
                        last_error,
                    )
                    if attempt + 1 < max_retry:
                        await asyncio.sleep(self.retry_sleep)

                except asyncio.TimeoutError:
                    last_error = (
                        f"model timed out after {self.model_attempt_timeout:g}s"
                    )
                    logger.warning(
                        "Holotron attempt {}/{}: {}; retrying",
                        attempt + 1,
                        max_retry,
                        last_error,
                    )
                    if attempt + 1 < max_retry:
                        await asyncio.sleep(self.retry_sleep)

                except Exception as exc:  # noqa: BLE001
                    # Covers HTTP/transport failures raised while streaming.
                    last_error = f"request failed: {exc}"
                    logger.error(
                        "Holotron attempt {}/{}: {}; retrying",
                        attempt + 1,
                        max_retry,
                        last_error,
                    )
                    if attempt + 1 < max_retry:
                        await asyncio.sleep(self.retry_sleep)

        finally:
            if owns_client:
                await client.aclose()

        if parsed_step is None:
            return ParsedStep(
                status="error",
                error=f"all {max_retry} attempts failed: {last_error}",
            )

        # Push parsed JSON back into history (NOT raw model output) — per
        # the H Company harness, only the canonical Step JSON should be
        # round-tripped so the model never sees its own pre-validation prose.
        self._messages.append({
            "role": "assistant",
            "content": parsed_step.model_dump_json(),
        })

        screen_w, screen_h = screen_size
        return _to_parsed_step(parsed_step, screen_w, screen_h)
"".join(content_buf), "".join(reasoning_buf), finish_reason diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/main.py b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/main.py new file mode 100644 index 000000000..e647dba7a --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/main.py @@ -0,0 +1,506 @@ +"""FastAPI entrypoint for nano_omni_demo. + +Endpoints +--------- +GET / -> redirects to web UI +GET /health -> health check + desktop status +GET /env/screenshot -> live PNG from desktop +POST /env/restart -> restart the desktop container +POST /agent/start -> {instruction} -> {job_id} +POST /agent/{job_id}/stop -> stop the running job +GET /agent/{job_id}/status -> job status +GET /agent/{job_id}/events -> SSE stream of agent events +/vnc/* -> reverse proxy to KasmVNC (no cert/auth issues) +/vnc/websockify -> WebSocket proxy for noVNC +""" + +from __future__ import annotations + +import asyncio +import base64 +import json +import os +import socket +import ssl +import time +from contextlib import asynccontextmanager +from pathlib import Path +from typing import Optional + +from dotenv import load_dotenv + +# Load .env from project root +load_dotenv(Path(__file__).resolve().parent.parent / ".env", override=False) + +import httpx +import websockets +from fastapi import FastAPI, HTTPException, Request, WebSocket, WebSocketDisconnect +from fastapi.responses import JSONResponse, RedirectResponse, Response, StreamingResponse +from fastapi.staticfiles import StaticFiles +from loguru import logger +from pydantic import BaseModel, Field + +from server.agent_runner import AgentJob, AgentRunner +from server.desktop_client import DesktopClient +from server.holotron_agent import HolotronAgent +from server.vllm_inference import VllmInferenceAgent + +# ── Config (from .env) ────────────────────────────────────────────────────── + +MODEL_FAMILY = os.getenv("MODEL_FAMILY", 
"nemotron").strip().lower() +VLLM_API_BASE = os.getenv("VLLM_API_BASE", "http://host.docker.internal:8001/v1") +VLLM_API_KEY = os.getenv("VLLM_API_KEY", "EMPTY") +VLLM_MODEL = os.getenv("VLLM_MODEL", os.getenv("VLLM_SERVED_MODEL_NAME", "vllm_local")) +ENABLE_THINKING = os.getenv("ENABLE_THINKING", "true").lower() == "true" +TRUNCATE_HISTORY_THINKING = ( + os.getenv("TRUNCATE_HISTORY_THINKING", "false").lower() == "true" +) +MAX_STEPS = int(os.getenv("MAX_STEPS", "150")) +MAX_IMAGE_HISTORY = int(os.getenv("MAX_IMAGE_HISTORY", "3")) +MODEL_MAX_TOKENS = int(os.getenv("MODEL_MAX_TOKENS", "20480")) +REASONING_BUDGET = int(os.getenv("REASONING_BUDGET", "16384")) +REASONING_GRACE_TOKENS = int(os.getenv("REASONING_GRACE_TOKENS", "1024")) +MODEL_ATTEMPT_TIMEOUT = float(os.getenv("MODEL_ATTEMPT_TIMEOUT", "120")) +MODEL_MAX_RETRIES = int(os.getenv("MODEL_MAX_RETRIES", "3")) +MODEL_RETRY_SLEEP = float(os.getenv("MODEL_RETRY_SLEEP", "5")) +COMPUTER_WAIT_SECONDS = max(0.0, float(os.getenv("COMPUTER_WAIT_SECONDS", "3"))) +DESKTOP_HOST = os.getenv("DESKTOP_HOST", "localhost") +DESKTOP_API_PORT = int(os.getenv("DESKTOP_API_PORT", "5000")) +DESKTOP_VNC_PORT = int(os.getenv("DESKTOP_VNC_PORT", "6901")) +DESKTOP_PASSWORD = os.getenv("DESKTOP_PASSWORD", "password") +DOCKER_SOCKET = os.getenv("DOCKER_SOCKET", "/var/run/docker.sock") +DESKTOP_CONTAINER_NAME = os.getenv("DESKTOP_CONTAINER_NAME", "") +DESKTOP_CONTAINER_SERVICE = os.getenv("DESKTOP_CONTAINER_SERVICE", "desktop") +DOCKER_RESTART_TIMEOUT = int(os.getenv("DOCKER_RESTART_TIMEOUT", "10")) + +# KasmVNC credentials (match the desktop container defaults) +KASM_VNC_USER = os.getenv("VNC_USER", "kasm_user") +KASM_VNC_PW = os.getenv("VNC_PW", "password") + +WEB_DIR = Path(__file__).resolve().parent.parent / "web" + + +def _active_model() -> str: + return VLLM_MODEL + + +def _active_api_base() -> str: + return VLLM_API_BASE + + +# ── State ─────────────────────────────────────────────────────────────────── + + +class _State: + desktop: 
DesktopClient + jobs: dict[str, AgentJob] = {} + + +state = _State() + + +@asynccontextmanager +async def lifespan(app: FastAPI): + if MODEL_FAMILY not in {"nemotron", "holotron"}: + logger.error( + f"Unknown MODEL_FAMILY={MODEL_FAMILY!r}; expected 'nemotron' or 'holotron'." + ) + state.desktop = DesktopClient(host=DESKTOP_HOST, api_port=DESKTOP_API_PORT) + state.jobs = {} + logger.info( + "nano_omni_demo started (family={}, model={}, base={}, thinking={})", + MODEL_FAMILY, + _active_model(), + _active_api_base(), + ENABLE_THINKING, + ) + logger.info(f"Desktop: {DESKTOP_HOST}:{DESKTOP_API_PORT} (VNC: {DESKTOP_VNC_PORT})") + yield + + +app = FastAPI(title="nano_omni_demo", lifespan=lifespan) + + +@app.middleware("http") +async def add_cross_origin_isolation_headers(request: Request, call_next): + response = await call_next(request) + response.headers.setdefault("Cross-Origin-Opener-Policy", "same-origin") + response.headers.setdefault("Cross-Origin-Embedder-Policy", "require-corp") + response.headers.setdefault("Cross-Origin-Resource-Policy", "same-origin") + return response + + +# Mount static web frontend +if WEB_DIR.exists(): + app.mount("/web", StaticFiles(directory=str(WEB_DIR), html=True), name="web") + + +# ── DTOs ──────────────────────────────────────────────────────────────────── + + +class StartAgentRequest(BaseModel): + instruction: str = Field(..., min_length=1) + max_steps: Optional[int] = None + + +# ── Helpers ───────────────────────────────────────────────────────────────── + + +async def _stop_job(job: AgentJob) -> None: + job._stop.set() + if job.status in {"pending", "running"}: + job.status = "stopped" + await job.events.put({"kind": "stopping", "ts": time.time()}) + if job._task is not None and not job._task.done(): + job._task.cancel() + try: + await asyncio.wait_for(asyncio.shield(job._task), timeout=2) + except (asyncio.CancelledError, asyncio.TimeoutError): + pass + + +async def _stop_active_jobs() -> None: + await asyncio.gather( + *[ + 
_stop_job(job) + for job in list(state.jobs.values()) + if job.status in {"pending", "running"} + ], + return_exceptions=True, + ) + + +def _docker_client() -> httpx.AsyncClient: + if not os.path.exists(DOCKER_SOCKET): + raise HTTPException( + 503, + f"Docker socket not available at {DOCKER_SOCKET}; cannot restart desktop container.", + ) + transport = httpx.AsyncHTTPTransport(uds=DOCKER_SOCKET) + return httpx.AsyncClient(transport=transport, base_url="http://docker", timeout=30.0) + + +def _container_name(container: dict, container_id: str) -> str: + names = container.get("Names") + if isinstance(names, list) and names: + return str(names[0]).lstrip("/") + if isinstance(names, str) and names: + return names.lstrip("/") + name = container.get("Name") + if isinstance(name, str) and name: + return name.lstrip("/") + return container_id[:12] + + +async def _compose_project_label(client: httpx.AsyncClient) -> str | None: + try: + r = await client.get(f"/containers/{socket.gethostname()}/json") + r.raise_for_status() + return r.json().get("Config", {}).get("Labels", {}).get("com.docker.compose.project") + except Exception as exc: + logger.warning(f"Could not inspect server container for compose project label: {exc}") + return None + + +async def _find_desktop_container(client: httpx.AsyncClient) -> dict: + if DESKTOP_CONTAINER_NAME: + r = await client.get(f"/containers/{DESKTOP_CONTAINER_NAME}/json") + if r.status_code == 404: + raise HTTPException(404, f"Desktop container {DESKTOP_CONTAINER_NAME!r} not found") + r.raise_for_status() + data = r.json() + return { + "Id": data.get("Id"), + "Names": data.get("Name", "").lstrip("/"), + } + + labels = [f"com.docker.compose.service={DESKTOP_CONTAINER_SERVICE}"] + project = await _compose_project_label(client) + if project: + labels.append(f"com.docker.compose.project={project}") + + filters = json.dumps({"label": labels}) + r = await client.get("/containers/json", params={"all": "true", "filters": filters}) + 
r.raise_for_status() + containers = r.json() + if not containers: + raise HTTPException( + 404, + f"No Docker container found for compose service {DESKTOP_CONTAINER_SERVICE!r}", + ) + return containers[0] + + +# ── Routes ────────────────────────────────────────────────────────────────── + + +@app.get("/", include_in_schema=False) +async def root() -> RedirectResponse: + return RedirectResponse(url="/web/index.html") + + +@app.get("/health") +async def health() -> dict: + desktop_ok = await state.desktop.is_ready() + return { + "status": "ok", + "desktop": "ready" if desktop_ok else "not_ready", + "model_family": MODEL_FAMILY, + "model": _active_model(), + "api_base": _active_api_base(), + "enable_thinking": ENABLE_THINKING, + "truncate_history_thinking": TRUNCATE_HISTORY_THINKING, + "model_max_tokens": MODEL_MAX_TOKENS, + "reasoning_budget": REASONING_BUDGET, + "reasoning_grace_tokens": REASONING_GRACE_TOKENS, + "model_attempt_timeout": MODEL_ATTEMPT_TIMEOUT, + "model_max_retries": MODEL_MAX_RETRIES, + "computer_wait_seconds": COMPUTER_WAIT_SECONDS, + "vnc_password": KASM_VNC_PW, + } + + +@app.get("/env/screenshot") +async def env_screenshot() -> Response: + try: + png = await state.desktop.screenshot() + return Response(content=png, media_type="image/png") + except Exception as e: + raise HTTPException(503, f"Desktop not ready: {e}") + + +@app.post("/env/restart") +async def env_restart() -> dict: + await _stop_active_jobs() + + async with _docker_client() as client: + container = await _find_desktop_container(client) + container_id = container.get("Id") + if not container_id: + raise HTTPException(500, "Docker did not return a desktop container id") + + logger.info(f"Restarting desktop container {container_id[:12]}") + r = await client.post( + f"/containers/{container_id}/restart", + params={"t": str(DOCKER_RESTART_TIMEOUT)}, + ) + if r.status_code not in {204, 304}: + raise HTTPException(r.status_code, f"Docker restart failed: {r.text}") + + try: + await 
state.desktop.wait_ready(max_seconds=120, interval=2) + except Exception as exc: + raise HTTPException(503, f"Desktop restarted but did not become ready: {exc}") + + return { + "status": "ready", + "desktop": "ready", + "container": _container_name(container, container_id), + "model": _active_model(), + } + + +@app.post("/agent/start") +async def agent_start(req: StartAgentRequest) -> dict: + if MODEL_FAMILY not in {"nemotron", "holotron"}: + raise HTTPException( + 500, + f"Unknown MODEL_FAMILY={MODEL_FAMILY!r}. Use 'nemotron' or 'holotron'.", + ) + if not await state.desktop.is_ready(): + raise HTTPException(503, "Desktop container not ready. Is it running? (docker compose up)") + + await _stop_active_jobs() + + if MODEL_FAMILY == "holotron": + agent_cls = HolotronAgent + else: + agent_cls = VllmInferenceAgent + agent = agent_cls( + api_key=VLLM_API_KEY, + api_base=VLLM_API_BASE, + model=VLLM_MODEL, + max_tokens=MODEL_MAX_TOKENS, + max_image_history_length=MAX_IMAGE_HISTORY, + thinking=ENABLE_THINKING, + truncate_history_thinking=TRUNCATE_HISTORY_THINKING, + reasoning_budget=REASONING_BUDGET, + reasoning_grace_tokens=REASONING_GRACE_TOKENS, + model_attempt_timeout=MODEL_ATTEMPT_TIMEOUT, + max_retry=MODEL_MAX_RETRIES, + retry_sleep=MODEL_RETRY_SLEEP, + password=DESKTOP_PASSWORD, + ) + runner = AgentRunner( + agent, + state.desktop, + max_steps=req.max_steps or MAX_STEPS, + computer_wait_seconds=COMPUTER_WAIT_SECONDS, + ) + job = runner.start(req.instruction) + state.jobs[job.job_id] = job + return {"job_id": job.job_id, "status": job.status} + + +@app.post("/agent/{job_id}/stop") +async def agent_stop(job_id: str) -> dict: + job = state.jobs.get(job_id) + if job is None: + raise HTTPException(404, "unknown job") + await _stop_job(job) + return {"job_id": job_id, "status": job.status} + + +@app.get("/agent/{job_id}/status") +async def agent_status(job_id: str) -> dict: + job = state.jobs.get(job_id) + if job is None: + raise HTTPException(404, "unknown job") + 
return { + "job_id": job.job_id, + "status": job.status, + "error": job.error, + "started_at": job.started_at, + "finished_at": job.finished_at, + } + + +@app.get("/agent/{job_id}/events") +async def agent_events(job_id: str, request: Request) -> StreamingResponse: + job = state.jobs.get(job_id) + if job is None: + raise HTTPException(404, "unknown job") + + async def gen(): + while True: + if await request.is_disconnected(): + return + try: + ev = await asyncio.wait_for(job.events.get(), timeout=15) + except asyncio.TimeoutError: + yield ": keep-alive\n\n" + if job.status not in {"running", "pending"}: + return + continue + yield f"data: {json.dumps(ev)}\n\n" + if ev.get("kind") == "finished": + return + + return StreamingResponse( + gen(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache, no-transform", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) + + +# ── KasmVNC Reverse Proxy ─────────────────────────────────────────────────── +# +# Proxy KasmVNC through the FastAPI server so users only need to expose +# port 8000. No self-signed cert warnings, no Basic Auth prompts. 
@app.api_route(
    "/vnc/{path:path}",
    methods=["GET", "POST", "PUT", "DELETE", "OPTIONS", "HEAD", "PATCH"],
)
async def vnc_http_proxy(path: str, request: Request) -> Response:
    """Forward an HTTP request under ``/vnc/`` to the KasmVNC HTTPS endpoint.

    The upstream URL is ``https://DESKTOP_HOST:DESKTOP_VNC_PORT/<path>`` plus
    the original query string. The ``Host`` and ``Authorization`` headers are
    replaced with the desktop's address and the precomputed Basic credentials,
    and TLS verification is disabled for the upstream call.

    Raises:
        HTTPException: 502 when the upstream request fails with an
            ``httpx.HTTPError``.
    """
    upstream = f"https://{DESKTOP_HOST}:{DESKTOP_VNC_PORT}/{path}"
    if request.url.query:
        upstream += f"?{request.url.query}"

    # Do not forward the browser's Accept-Encoding. The response's
    # Content-Encoding header is stripped below because httpx hands back decoded
    # bytes, so the upstream must only be offered codecs httpx can decode.
    # Leaving the header out lets httpx advertise its own supported set;
    # otherwise a br/zstd reply could reach the browser still compressed but
    # unlabeled.
    dropped_request_headers = _HOP_BY_HOP | {"accept-encoding"}
    fwd_headers = {
        k: v
        for k, v in request.headers.items()
        if k.lower() not in dropped_request_headers
    }
    fwd_headers["host"] = f"{DESKTOP_HOST}:{DESKTOP_VNC_PORT}"
    fwd_headers["authorization"] = _KASM_AUTH
    body = await request.body()

    async with httpx.AsyncClient(verify=False, timeout=30.0, follow_redirects=False) as c:
        try:
            r = await c.request(request.method, upstream, content=body, headers=fwd_headers)
        except httpx.HTTPError as e:
            raise HTTPException(502, f"vnc proxy error: {e}") from e

    # Set-Cookie is handled separately: a dict can hold only one value per
    # name, and comma-joining several cookies produces an invalid header.
    dropped_response_headers = _HOP_BY_HOP | {"set-cookie"}
    resp_headers = {
        k: v for k, v in r.headers.items() if k.lower() not in dropped_response_headers
    }
    response = Response(
        content=r.content,
        status_code=r.status_code,
        headers=resp_headers,
        media_type=r.headers.get("content-type"),
    )
    for cookie in r.headers.get_list("set-cookie"):
        response.headers.append("set-cookie", cookie)
    return response
subprotocols=["binary"], + additional_headers=extra_headers, + max_size=None, + open_timeout=10, + ping_interval=None, + ) as upstream: + + async def c2u() -> None: + try: + while True: + msg = await client_ws.receive() + t = msg.get("type") + if t == "websocket.disconnect": + return + if "bytes" in msg and msg["bytes"] is not None: + await upstream.send(msg["bytes"]) + elif "text" in msg and msg["text"] is not None: + await upstream.send(msg["text"]) + except (WebSocketDisconnect, websockets.ConnectionClosed): + return + + async def u2c() -> None: + try: + async for msg in upstream: + if isinstance(msg, (bytes, bytearray)): + await client_ws.send_bytes(bytes(msg)) + else: + await client_ws.send_text(msg) + except (RuntimeError, WebSocketDisconnect, websockets.ConnectionClosed): + return + + tasks = {asyncio.create_task(c2u()), asyncio.create_task(u2c())} + done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) + for task in pending: + task.cancel() + await asyncio.gather(*done, *pending, return_exceptions=True) + except Exception as e: + logger.warning(f"vnc ws proxy failed: {e}") + try: + await client_ws.close(code=1011) + except Exception: + pass diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/vllm_inference.py b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/vllm_inference.py new file mode 100644 index 000000000..42000d8e6 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/server/vllm_inference.py @@ -0,0 +1,252 @@ +"""vLLM OpenAI-compatible inference path for the desktop agent. + +This path mirrors the request/response shape used by yidong72/cua_demo: +direct HTTP calls to a vLLM `/v1/chat/completions` endpoint, SSE streaming, +and `reasoning_content` parsed from the streamed delta payload generated by +vLLM's `nemotron_v3` reasoning parser. 
+""" + +from __future__ import annotations + +import asyncio +import json +from typing import Optional + +import httpx +from loguru import logger + +from server.agent import DeltaCallback, NemotronAgent, ParsedStep, _Turn, parse_response + + +class VllmInferenceAgent(NemotronAgent): + """Desktop automation agent powered by a local or remote vLLM endpoint.""" + + async def step( + self, + instruction: str, + screenshot_png: bytes, + screen_size: tuple[int, int], + *, + delta_callback: Optional[DeltaCallback] = None, + client: Optional[httpx.AsyncClient] = None, + ) -> ParsedStep: + messages = self.build_messages(instruction, screenshot_png) + endpoint = self.api_base.rstrip("/") + "/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}", + } + + max_retry = max(1, self.max_retry) + last_error = "unknown" + parsed: Optional[ParsedStep] = None + owns_client = client is None + client = client or httpx.AsyncClient( + timeout=self.model_attempt_timeout + 5, + verify=False, + ) + + try: + for attempt in range(max_retry): + temperature = self.temperature if attempt == 0 else max(0.2, self.temperature) + payload = { + "model": self.model, + "messages": messages, + "temperature": temperature, + "top_p": self.top_p, + "max_tokens": self.max_tokens, + } + payload.update(self.build_extra_body()) + + if delta_callback is not None and attempt > 0: + try: + await delta_callback( + f"\n\n[retry {attempt + 1}/{max_retry}: {last_error}]\n\n", + "", + ) + except Exception: + pass + + if delta_callback is not None: + try: + await delta_callback( + ( + f"\n\n[vLLM model attempt {attempt + 1}/{max_retry}; " + f"timeout {self.model_attempt_timeout:g}s]\n\n" + ), + "", + ) + except Exception: + pass + + try: + logger.info( + "vLLM model attempt {}/{} (endpoint={}, timeout={}s, temperature={})", + attempt + 1, + max_retry, + endpoint, + self.model_attempt_timeout, + temperature, + ) + + if delta_callback is None: + content, 
reasoning, finish_reason = await asyncio.wait_for( + self._post_completion(client, endpoint, payload, headers), + timeout=self.model_attempt_timeout, + ) + else: + payload["stream"] = True + content, reasoning, finish_reason = await asyncio.wait_for( + self._stream_completion( + client, + endpoint, + payload, + headers, + delta_callback, + ), + timeout=self.model_attempt_timeout, + ) + + if finish_reason not in (None, "stop"): + last_error = f"unexpected finish_reason={finish_reason}" + logger.warning( + "vLLM attempt {}/{}: {}; retrying", + attempt + 1, + max_retry, + last_error, + ) + if attempt + 1 < max_retry: + await asyncio.sleep(self.retry_sleep) + continue + + parsed = parse_response( + content, + reasoning, + screen_size[0], + screen_size[1], + thinking=self.thinking, + ) + if parsed.status != "error": + break + last_error = parsed.error or "parse error" + logger.warning( + "vLLM attempt {}/{}: parse error: {}; retrying", + attempt + 1, + max_retry, + last_error, + ) + if attempt + 1 < max_retry: + await asyncio.sleep(self.retry_sleep) + + except asyncio.TimeoutError: + last_error = f"model timed out after {self.model_attempt_timeout:g}s" + logger.warning( + "vLLM attempt {}/{}: {}; retrying", + attempt + 1, + max_retry, + last_error, + ) + if attempt + 1 < max_retry: + await asyncio.sleep(self.retry_sleep) + + except Exception as exc: + last_error = f"request failed: {exc}" + logger.error( + "vLLM attempt {}/{}: {}; retrying", + attempt + 1, + max_retry, + last_error, + ) + if attempt + 1 < max_retry: + await asyncio.sleep(self.retry_sleep) + + finally: + if owns_client: + await client.aclose() + + if parsed is None or parsed.status == "error": + return ParsedStep( + status="error", + error=f"all {max_retry} attempts failed: {last_error}", + ) + + if parsed.status in {"continue", "wait"}: + self.history.append( + _Turn( + screenshot_png=screenshot_png, + thought=parsed.thought, + action=parsed.action, + ) + ) + return parsed + + async def 
_post_completion( + self, + client: httpx.AsyncClient, + endpoint: str, + payload: dict, + headers: dict, + ) -> tuple[str, str, Optional[str]]: + resp = await client.post(endpoint, json=payload, headers=headers) + if resp.status_code != 200: + raise RuntimeError(f"vLLM HTTP {resp.status_code}: {resp.text[:400]}") + body = resp.json() + choice = (body.get("choices") or [{}])[0] + message = choice.get("message") or {} + return ( + message.get("content") or "", + message.get("reasoning_content") or message.get("reasoning") or "", + choice.get("finish_reason"), + ) + + async def _stream_completion( + self, + client: httpx.AsyncClient, + endpoint: str, + payload: dict, + headers: dict, + delta_callback: DeltaCallback, + ) -> tuple[str, str, Optional[str]]: + reasoning_buf: list[str] = [] + content_buf: list[str] = [] + finish_reason: Optional[str] = None + + async with client.stream("POST", endpoint, json=payload, headers=headers) as resp: + if resp.status_code != 200: + err_body = (await resp.aread()).decode("utf-8", errors="replace") + raise RuntimeError(f"vLLM stream HTTP {resp.status_code}: {err_body[:400]}") + + async for raw in resp.aiter_lines(): + if not raw or not raw.startswith("data:"): + continue + data = raw[len("data:") :].strip() + if not data or data == "[DONE]": + continue + try: + chunk = json.loads(data) + except json.JSONDecodeError: + logger.warning("vLLM stream returned non-JSON data chunk: {!r}", data[:200]) + continue + + choices = chunk.get("choices") or [] + if not choices: + continue + choice = choices[0] + delta = choice.get("delta") or {} + r_delta = delta.get("reasoning_content") or delta.get("reasoning") or "" + c_delta = delta.get("content") or "" + + if r_delta: + reasoning_buf.append(r_delta) + if c_delta: + content_buf.append(c_delta) + if r_delta or c_delta: + try: + await delta_callback(r_delta, c_delta) + except Exception: + logger.warning("delta_callback raised; continuing vLLM stream") + + if choice.get("finish_reason"): + 
finish_reason = choice["finish_reason"] + + return "".join(content_buf), "".join(reasoning_buf), finish_reason diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/tests/test_agent_parser.py b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/tests/test_agent_parser.py new file mode 100644 index 000000000..c52b26af8 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/tests/test_agent_parser.py @@ -0,0 +1,203 @@ +import unittest +import asyncio +import json + +import httpx + +from server.agent import NemotronAgent, parse_response +from server.vllm_inference import VllmInferenceAgent + + +class AgentParserTests(unittest.TestCase): + def test_thinking_extra_body_matches_nemotron_omni_contract(self): + agent = NemotronAgent( + api_key="test", + reasoning_budget=16384, + reasoning_grace_tokens=1024, + thinking=True, + ) + + self.assertEqual( + agent.build_extra_body(), + { + "thinking_token_budget": 17408, + "chat_template_kwargs": { + "enable_thinking": True, + "reasoning_budget": 16384, + "truncate_history_thinking": False, + }, + }, + ) + + def test_non_thinking_extra_body_disables_thinking_explicitly(self): + agent = NemotronAgent(api_key="test", thinking=False) + + self.assertEqual( + agent.build_extra_body(), + { + "chat_template_kwargs": { + "enable_thinking": False, + "truncate_history_thinking": False, + } + }, + ) + + def test_parses_same_line_action_and_code_fence(self): + parsed = parse_response( + "## Action: Click Chrome.\n## Code: ```python\npyautogui.click(0.5, 0.5)\n```", + "click chrome", + 1920, + 1080, + thinking=True, + ) + + self.assertEqual(parsed.status, "continue") + self.assertEqual(parsed.action, "Click Chrome.") + self.assertEqual(parsed.code, "pyautogui.click(960, 540)") + + def test_falls_back_to_reasoning_when_action_is_in_reasoning(self): + reasoning = ( + "I should click the browser.\n" + "## Action: Click Chrome.\n" + "## Code:\n" + 
"```python\npyautogui.click(0.25, 0.75)\n```" + ) + + parsed = parse_response("", reasoning, 1600, 1200, thinking=True) + + self.assertEqual(parsed.status, "continue") + self.assertEqual(parsed.action, "Click Chrome.") + self.assertEqual(parsed.code, "pyautogui.click(400, 900)") + + def test_accepts_json_fenced_computer_function_inside_code_section(self): + parsed = parse_response( + ( + "## Action: Done.\n" + "## Code:\n" + "```json\n" + '{"name":"computer.terminate","parameters":{"status":"success"}}\n' + "```" + ), + "done", + 1920, + 1080, + thinking=True, + ) + + self.assertEqual(parsed.status, "done") + self.assertEqual(parsed.code, "DONE") + + def test_strips_unmatched_trailing_code_fence_from_code_section(self): + parsed = parse_response( + ( + "## Action: Type query.\n" + "## Code: pyautogui.click(0.5, 0.5)\n" + 'pyautogui.typewrite("NVDA stock price")\n' + "```" + ), + "", + 1000, + 1000, + thinking=True, + ) + + self.assertEqual(parsed.status, "continue") + self.assertEqual( + parsed.code, + 'pyautogui.click(500, 500)\npyautogui.typewrite("NVDA stock price")', + ) + + def test_does_not_execute_reasoning_json_without_action_heading(self): + parsed = parse_response( + "", + 'Example only:\n```json\n{"type":"text","text":"hello"}\n```', + 1920, + 1080, + thinking=True, + ) + + self.assertEqual(parsed.status, "error") + self.assertIn("action", parsed.error or "") + + def test_uses_first_executable_code_and_ignores_trailing_terminate_spam(self): + content = ( + "The previous on the image.\n" + "## Action: Press the Enter key to execute the search.\n" + "## Code: pyautogui.keyDown('return')\n" + "## Code: pyautogui.keyUp('return')\n" + "\n" + "## Action: Press the Enter key to execute the search.\n" + "## Code: pyautogui.keyDown('return')\n" + "## Code: pyautogui.keyUp('return')\n" + "## Code: computer.terminate(status='success')\n" + "## Code: computer.terminate(status='success')\n" + ) + + parsed = parse_response(content, "", 1920, 1080, thinking=True) + + 
self.assertEqual(parsed.status, "continue") + self.assertIn("Press the Enter key", parsed.action) + self.assertEqual( + parsed.code, + "pyautogui.keyDown('return')\npyautogui.keyUp('return')", + ) + + def test_vllm_path_parses_stubbed_openai_response(self): + png = b"\x89PNG\r\n\x1a\n" + b"\x00" * 16 + agent = VllmInferenceAgent( + api_key="EMPTY", + api_base="http://stub/v1", + model="vllm_local", + max_retry=1, + thinking=True, + ) + + async def handler(request: httpx.Request) -> httpx.Response: + body = json.loads(request.content) + self.assertEqual(body["model"], "vllm_local") + self.assertEqual(body["thinking_token_budget"], 17408) + self.assertEqual( + body["chat_template_kwargs"], + { + "enable_thinking": True, + "reasoning_budget": 16384, + "truncate_history_thinking": False, + }, + ) + self.assertTrue( + any( + isinstance(message["content"], list) + and any(part["type"] == "image_url" for part in message["content"]) + for message in body["messages"] + ) + ) + return httpx.Response( + 200, + json={ + "choices": [ + { + "finish_reason": "stop", + "message": { + "reasoning_content": "click around", + "content": ( + "## Action:\nclick the icon\n## Code:\n" + "```python\npyautogui.click(0.25, 0.75)\n```" + ), + }, + } + ] + }, + ) + + async def run_step(): + transport = httpx.MockTransport(handler) + async with httpx.AsyncClient(transport=transport) as client: + return await agent.step("do a thing", png, (1600, 1200), client=client) + + parsed = asyncio.run(run_step()) + self.assertEqual(parsed.status, "continue") + self.assertEqual(parsed.code, "pyautogui.click(400, 900)") + + +if __name__ == "__main__": + unittest.main() diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/web/index.html b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/web/index.html new file mode 100644 index 000000000..c5ba449e2 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/web/index.html @@ -0,0 +1,55 @@ + + 
+ + + Computer Use Agent with Omni - Nemotron-3 Nano Omni + + + +
+

Computer Use Agent with Omni

+ Nemotron-3 Nano Omni driving a live desktop + checking… +
+ +
+
+
+

Desktop container not detected.

+

Run docker compose up -d to start it.

+
+ +
+ + +
+ + + + diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/web/sidepanel.js b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/web/sidepanel.js new file mode 100644 index 000000000..1cf422143 --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/web/sidepanel.js @@ -0,0 +1,356 @@ +// Computer Use Agent with Omni - side panel logic +// Connects to the FastAPI backend for desktop status, agent control, and SSE events. +// Uses KasmVNC iframe for live desktop viewing. + +const $ = (id) => document.getElementById(id); +const btnRun = $("btn-run"); +const btnStop = $("btn-stop"); +const btnRestartDesktop = $("btn-restart-desktop"); +const envStatus = $("env-status"); +const agentStatus = $("agent-status"); +const envBadge = $("env-badge"); +const log = $("event-log"); +const placeholder = $("vnc-placeholder"); +const vncFrame = $("vnc-frame"); +const instruction = $("instruction"); + +let state = { + jobId: null, + eventSource: null, + desktopRestarting: false, +}; + +// ── Desktop status ───────────────────────────────────────────────────────── + +async function checkDesktop() { + if (state.desktopRestarting) return false; + try { + const r = await fetch("/health"); + const j = await r.json(); + btnRestartDesktop.disabled = false; + if (j.desktop === "ready") { + envStatus.textContent = `Ready — ${j.model}`; + setBadge("ready", "ready"); + btnRun.disabled = false; + + showVnc(j); + return true; + } else { + envStatus.textContent = "Desktop not ready — run: docker compose up -d"; + setBadge("offline", "offline"); + btnRun.disabled = true; + return false; + } + } catch (e) { + envStatus.textContent = `Cannot reach server: ${e.message}`; + setBadge("offline", "offline"); + btnRun.disabled = true; + btnRestartDesktop.disabled = true; + return false; + } +} + +function showVnc(health, force = false) { + const wasHidden = vncFrame.hidden; + const params = new URLSearchParams({ + autoconnect: "1", + 
resize: "scale", + reconnect: "1", + reconnect_delay: "1500", + path: "vnc/websockify", + password: health.vnc_password || "password", + kasmvnc_mode_preference: "image", + }); + const vncUrl = `/vnc/vnc.html?${params.toString()}`; + placeholder.hidden = true; + vncFrame.hidden = false; + if (force || wasHidden || vncFrame.src === "about:blank") { + vncFrame.src = vncUrl; + } +} + +function showPlaceholder(message) { + placeholder.innerHTML = `

${escape(message)}

`; + placeholder.hidden = false; + vncFrame.hidden = true; + vncFrame.src = "about:blank"; +} + +async function restartDesktop() { + state.desktopRestarting = true; + envStatus.textContent = "restarting desktop container…"; + setBadge("running", "restarting"); + btnRestartDesktop.disabled = true; + btnRun.disabled = true; + btnStop.disabled = true; + showPlaceholder("Restarting desktop container..."); + + try { + const r = await fetch("/env/restart", { method: "POST" }); + if (!r.ok) { + const text = await r.text(); + throw new Error(text); + } + state.desktopRestarting = false; + const ready = await checkDesktop(); + if (!ready) { + throw new Error("desktop restart finished, but health check is not ready"); + } + const health = await (await fetch("/health")).json(); + showVnc(health, true); + } catch (e) { + state.desktopRestarting = false; + envStatus.textContent = `restart failed: ${e.message}`; + setBadge("error", "error"); + btnRestartDesktop.disabled = false; + } +} + +// ── Agent lifecycle ──────────────────────────────────────────────────────── + +async function runAgent() { + if (!instruction.value.trim()) return; + log.innerHTML = ""; + agentStatus.textContent = "starting…"; + setBadge("running", "running"); + btnRun.disabled = true; + btnStop.disabled = false; + + try { + const r = await fetch("/agent/start", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + instruction: instruction.value.trim(), + }), + }); + if (!r.ok) { + const text = await r.text(); + throw new Error(text); + } + const j = await r.json(); + state.jobId = j.job_id; + agentStatus.textContent = `running (${j.job_id})`; + streamEvents(j.job_id); + } catch (e) { + agentStatus.textContent = `start failed: ${e.message}`; + setBadge("error", "error"); + btnRun.disabled = false; + btnStop.disabled = true; + } +} + +function streamEvents(jobId) { + if (state.eventSource) state.eventSource.close(); + const es = new 
EventSource(`/agent/${jobId}/events`); + state.eventSource = es; + es.onmessage = (msg) => { + if (!msg.data) return; + let ev; + try { ev = JSON.parse(msg.data); } catch { return; } + appendEvent(ev); + if (["done", "failed", "error", "stopping", "stopped"].includes(ev.kind)) { + agentStatus.textContent = ev.kind; + if (ev.kind === "done") setBadge("ready", "done ✓"); + else if (ev.kind === "failed") setBadge("error", "failed"); + else setBadge("ready", ev.kind); + } + if (ev.kind === "finished") { + es.close(); + btnRun.disabled = false; + btnStop.disabled = true; + } + }; + es.onerror = () => { + agentStatus.textContent = "stream disconnected"; + }; +} + +async function stopAgent() { + if (!state.jobId) return; + agentStatus.textContent = "stopping…"; + btnStop.disabled = true; + try { + const r = await fetch(`/agent/${state.jobId}/stop`, { method: "POST" }); + if (!r.ok) { + const text = await r.text(); + throw new Error(text); + } + } catch (e) { + agentStatus.textContent = `stop failed: ${e.message}`; + btnStop.disabled = false; + } +} + +// ── UI helpers ───────────────────────────────────────────────────────────── + +function setBadge(cls, text) { + envBadge.className = "badge " + cls; + envBadge.textContent = text; +} + +// Per-step DOM nodes and buffered text for live streaming +const liveStep = { + step: null, + container: null, + thought: null, + content: null, + pendingReasoning: "", + pendingContent: "", + flushScheduled: false, +}; + +function ensureLiveStep(step) { + if (liveStep.step === step && liveStep.container && liveStep.container.isConnected) { + return; + } + const div = document.createElement("div"); + div.className = "event"; + div.innerHTML = `step ${step}` + + `
` + + `
`; + log.appendChild(div); + liveStep.step = step; + liveStep.container = div; + liveStep.thought = div.querySelector(".live-thought"); + liveStep.content = div.querySelector(".live-content"); + log.scrollTop = log.scrollHeight; +} + +function scheduleLiveFlush() { + if (liveStep.flushScheduled) return; + liveStep.flushScheduled = true; + requestAnimationFrame(() => { + liveStep.flushScheduled = false; + if (!liveStep.container || !liveStep.container.isConnected) { + liveStep.pendingReasoning = ""; + liveStep.pendingContent = ""; + return; + } + if (liveStep.pendingReasoning) { + if (!liveStep.thought.textContent) { + liveStep.thought.textContent = "reasoning\n"; + } + liveStep.thought.textContent += liveStep.pendingReasoning; + liveStep.pendingReasoning = ""; + } + if (liveStep.pendingContent) { + if (!liveStep.content.textContent) { + liveStep.content.textContent = "response\n"; + } + liveStep.content.textContent += liveStep.pendingContent; + liveStep.pendingContent = ""; + } + log.scrollTop = log.scrollHeight; + }); +} + +function appendEvent(ev) { + // Streaming deltas + if (ev.kind === "thought_delta") { + ensureLiveStep(ev.step); + if (ev.reasoning) { + liveStep.pendingReasoning += ev.reasoning; + } + if (ev.content) { + liveStep.pendingContent += ev.content; + } + if (ev.reasoning || ev.content) { + agentStatus.textContent = `streaming step ${ev.step}…`; + scheduleLiveFlush(); + } + return; + } + + const div = document.createElement("div"); + div.className = "event"; + let body; + switch (ev.kind) { + case "started": + body = `▶ started${escape(ev.instruction)}`; + break; + case "screen_size": + body = `screen${ev.width}×${ev.height}`; + break; + case "step_started": + liveStep.step = null; + liveStep.container = null; + liveStep.pendingReasoning = ""; + liveStep.pendingContent = ""; + liveStep.flushScheduled = false; + agentStatus.textContent = `step ${ev.step}: waiting for model…`; + body = `step ${ev.step}`; + break; + case "thought": + if (liveStep.step 
=== ev.step && liveStep.container) { + if (ev.action) { + const a = document.createElement("div"); + a.className = "action"; + a.textContent = "→ " + ev.action; + liveStep.container.appendChild(a); + } + if (ev.code) { + const c = document.createElement("div"); + c.className = "code"; + c.textContent = ev.code; + liveStep.container.appendChild(c); + } + log.scrollTop = log.scrollHeight; + return; + } + body = `step ${ev.step}` + + (ev.thought ? `
💭 ${escape(ev.thought)}
` : "") + + (ev.action ? `
→ ${escape(ev.action)}
` : "") + + (ev.code ? `
${escape(ev.code)}
` : ""); + break; + case "executed": + body = `⚡ executed` + + (ev.output ? `
${escape(ev.output)}
` : ""); + break; + case "execute_error": + body = `⚠ exec err
${escape(ev.message)}
`; + break; + case "wait": + body = `⏳ wait${ev.seconds}s`; + break; + case "done": + body = `✓ DONE`; + break; + case "failed": + body = `✗ FAILED${ev.reason || ""}`; + break; + case "stopped": + body = `■ stopped`; + break; + case "stopping": + body = `■ stopping`; + break; + case "error": + body = `⚠ error
${escape(ev.message || "")}
`; + break; + case "finished": + body = `— finished (${ev.status}) —`; + break; + default: + body = `${escape(ev.kind)}${escape(JSON.stringify(ev))}`; + } + div.innerHTML = body; + log.appendChild(div); + log.scrollTop = log.scrollHeight; +} + +function escape(s) { + if (s === undefined || s === null) return ""; + return String(s) + .replaceAll("&", "&").replaceAll("<", "<").replaceAll(">", ">"); +} + +// ── Wire up ──────────────────────────────────────────────────────────────── + +btnRun.addEventListener("click", runAgent); +btnStop.addEventListener("click", stopAgent); +btnRestartDesktop.addEventListener("click", restartDesktop); + +// Initial check + periodic polling +checkDesktop(); +setInterval(checkDesktop, 10000); diff --git a/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/web/style.css b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/web/style.css new file mode 100644 index 000000000..2f39dc2eb --- /dev/null +++ b/usage-cookbook/Nemotron-3-Nano-Omni/computer-use-agent-with-omni/web/style.css @@ -0,0 +1,248 @@ +/* Computer Use Agent with Omni - dark theme */ + +:root { + --bg: #1a1b26; + --surface: #24283b; + --border: #3b4261; + --text: #c0caf5; + --text-dim: #565f89; + --accent: #7aa2f7; + --green: #9ece6a; + --red: #f7768e; + --yellow: #e0af68; +} + +* { box-sizing: border-box; margin: 0; padding: 0; } + +body { + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + background: var(--bg); + color: var(--text); + height: 100vh; + display: flex; + flex-direction: column; + overflow: hidden; +} + +header { + display: flex; + align-items: center; + gap: 1rem; + padding: 0.75rem 1.5rem; + background: var(--surface); + border-bottom: 1px solid var(--border); +} + +header h1 { + font-size: 1.25rem; + font-weight: 600; +} + +header .subtitle { + color: var(--text-dim); + font-size: 0.85rem; +} + +.badge { + margin-left: auto; + padding: 0.25rem 0.75rem; + border-radius: 1rem; + font-size: 0.75rem; 
+  font-weight: 600;
+  text-transform: uppercase;
+}
+.badge.ready   { background: var(--green); color: #1a1b26; }
+.badge.offline { background: var(--border); color: var(--text-dim); }
+.badge.error   { background: var(--red); color: #1a1b26; }
+.badge.running { background: var(--accent); color: #1a1b26; }
+/* Page body: two-column grid, flexible left column plus a fixed 400px right column. */
+main {
+  flex: 1;
+  display: grid;
+  grid-template-columns: 1fr 400px;
+  gap: 0;
+  overflow: hidden;
+}
+/* Black, clipped pane that centers its content with flexbox. */
+#vnc-pane {
+  position: relative;
+  overflow: hidden;
+  background: #000;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+}
+/* Sized to 100% of its container, without a border. */
+#vnc-frame {
+  width: 100%;
+  height: 100%;
+  border: none;
+}
+/* Dimmed, centered placeholder text. */
+#vnc-placeholder {
+  color: var(--text-dim);
+  text-align: center;
+  line-height: 1.8;
+}
+/* Inline code inside the placeholder gets a surface-colored chip. */
+#vnc-placeholder code {
+  background: var(--surface);
+  padding: 0.2rem 0.5rem;
+  border-radius: 4px;
+  font-size: 0.9rem;
+}
+/* Vertically stacked, scrollable panel separated by a left border. */
+#side-panel {
+  display: flex;
+  flex-direction: column;
+  gap: 0.75rem;
+  padding: 0.75rem;
+  overflow-y: auto;
+  border-left: 1px solid var(--border);
+  background: var(--surface);
+}
+/* Cards: bordered boxes; .grow takes the remaining height (min-height: 0 lets it shrink). */
+.card {
+  background: var(--bg);
+  border: 1px solid var(--border);
+  border-radius: 8px;
+  padding: 1rem;
+}
+.card-title-row {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: 0.75rem;
+  margin-bottom: 0.5rem;
+}
+.card.grow {
+  flex: 1;
+  display: flex;
+  flex-direction: column;
+  min-height: 0;
+}
+.card h2 {
+  font-size: 0.85rem;
+  font-weight: 600;
+  color: var(--accent);
+  margin-bottom: 0.5rem;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+}
+.card-title-row h2 {
+  margin-bottom: 0;
+}
+/* Form controls. */
+textarea {
+  width: 100%;
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: 6px;
+  color: var(--text);
+  padding: 0.5rem;
+  font-size: 0.85rem;
+  resize: vertical;
+  font-family: inherit;
+}
+textarea:focus { outline: 1px solid var(--accent); }
+/* Horizontal group with a small gap between items. */
+.row {
+  display: flex;
+  gap: 0.5rem;
+  margin-top: 0.5rem;
+}
+/* Buttons: the accent hover style applies only while the button is enabled. */
+button {
+  padding: 0.4rem 1rem;
+  border: 1px solid var(--border);
+  border-radius: 6px;
+  background: var(--surface);
+  color: var(--text);
+  font-size: 0.85rem;
+  cursor: pointer;
+  transition: all 0.15s;
+}
+button:hover:not(:disabled) {
+  background: var(--accent);
+  color: var(--bg);
+  border-color: var(--accent);
+}
+button:disabled {
+  opacity: 0.4;
+  cursor: not-allowed;
+}
+button.secondary {
+  padding: 0.25rem 0.65rem;
+  font-size: 0.75rem;
+}
+/* Small dimmed status line. */
+.status {
+  font-size: 0.8rem;
+  color: var(--text-dim);
+  margin-top: 0.5rem;
+}
+/* Scrollable monospace log; flex: 1 with min-height: 0 lets it scroll instead of growing. */
+#event-log {
+  flex: 1;
+  overflow-y: auto;
+  font-size: 0.8rem;
+  font-family: "JetBrains Mono", "Fira Code", monospace;
+  line-height: 1.5;
+  padding: 0.5rem 0;
+  min-height: 0;
+}
+/* One log entry, separated from the next by a bottom border. */
+.event {
+  padding: 0.3rem 0;
+  border-bottom: 1px solid var(--border);
+}
+/* Inline label chip; .step and .err override its colors. */
+.tag {
+  display: inline-block;
+  padding: 0.1rem 0.4rem;
+  border-radius: 3px;
+  font-size: 0.7rem;
+  font-weight: 600;
+  margin-right: 0.4rem;
+  background: var(--border);
+  color: var(--text);
+}
+.tag.step { background: var(--accent); color: var(--bg); }
+.tag.err  { background: var(--red); color: var(--bg); }
+/* Italic, whitespace-preserving text capped at 200px with its own scrollbar. */
+.thought {
+  color: var(--text-dim);
+  font-style: italic;
+  margin: 0.2rem 0;
+  white-space: pre-wrap;
+  word-break: break-word;
+  max-height: 200px;
+  overflow-y: auto;
+}
+/* Green action line. */
+.action {
+  color: var(--green);
+  margin: 0.2rem 0;
+}
+/* Whitespace-preserving code box; break-all allows breaks inside long tokens. */
+.code {
+  background: var(--surface);
+  padding: 0.3rem 0.5rem;
+  border-radius: 4px;
+  margin: 0.2rem 0;
+  white-space: pre-wrap;
+  word-break: break-all;
+  font-size: 0.75rem;
+}
+/* Red error text. */
+.err {
+  color: var(--red);
+}
+/* At 900px wide or less: single column with a 40vh top row; the side panel's border moves from left to top. */
+@media (max-width: 900px) {
+  main {
+    grid-template-columns: 1fr;
+    grid-template-rows: 40vh 1fr;
+  }
+  #side-panel { border-left: none; border-top: 1px solid var(--border); }
+}