huggingface · Abhilekh-Meda · May 17, 2026
diff --git a/agent/context_manager/manager.py b/agent/context_manager/manager.py
@@ -255,29 +255,21 @@ def _load_system_prompt(
         # Get HF user info from OAuth token
         hf_user_info = _get_hf_username(hf_token)
 
+        if local_mode:
+            import os
+
+            cwd = os.getcwd()
+        else:
+            cwd = None
+
         template = Template(template_str)
         static_prompt = template.render(
             tools=tool_specs,
             num_tools=len(tool_specs),
+            local_mode=local_mode,
+            cwd=cwd,
         )
 
-        # CLI-specific context for local mode
-        if local_mode:
-            import os
-
-            cwd = os.getcwd()
-            local_context = (
-                f"\n\n# CLI / Local mode\n\n"
-                f"You are running as a local CLI tool on the user's machine. "
-                f"There is NO sandbox — bash, read, write, and edit operate directly "
-                f"on the local filesystem.\n\n"
-                f"Working directory: {cwd}\n"
-                f"Use absolute paths or paths relative to the working directory. "
-                f"Do NOT use /app/ paths — that is a sandbox convention that does not apply here.\n"
-                f"The sandbox_create tool is NOT available. Run code directly with bash."
-            )
-            static_prompt += local_context
-
         return (
             f"{static_prompt}\n\n"
             f"[Session context: Date={current_date}, Time={current_time}, "

diff --git a/agent/prompts/system_prompt_v3.yaml b/agent/prompts/system_prompt_v3.yaml
@@ -101,6 +101,7 @@ system_prompt: |
   Looking at data is the best way to boost performance of any ML model plus it reduces the likelihood of failed jobs later.
 
   # When submitting a training job
+  {%- if not local_mode %}
 
   Never pass a local machine path to hf_jobs.script, such as /Users/..., /home/..., /fsx/..., or a repo checkout path. HF Jobs runs in a fresh cloud environment where local files do not exist. For hf_jobs.script, use exactly one of:
     - inline Python source code
@@ -109,11 +110,23 @@ system_prompt: |
   If you wrote or tested a script locally, read the file content and submit it inline, or write it into the sandbox first.
 
   GPU preflight is mandatory before hf_jobs when the job will run on GPU, or when the script loads a model, uses CUDA, bf16/fp16, quantization, flash attention, or torch.compile. First create a GPU sandbox with sandbox_create (t4-small minimum; choose larger hardware when VRAM requires it), run a tiny smoke test there using the same imports, model-loading path, training entrypoint, and a tiny dataset/subset, then fix failures before submitting. If you skip GPU sandbox preflight, state why before calling hf_jobs.
+  {%- else %}
+
+  Never pass a path from this workstation ({{ cwd }}, /Users/..., /home/..., /fsx/...) to hf_jobs.script. HF Jobs runs in a fresh cloud environment where your local filesystem does not exist. For hf_jobs.script, use exactly one of:
+    - inline Python source code (read the file locally and submit its contents inline)
+    - a public/raw URL
+
+  Before hf_jobs, run a CPU smoke test locally with bash (same imports, model-loading path, training entrypoint, tiny dataset/subset) and fix failures before submitting. GPU code paths (CUDA, bf16/fp16, quantization, flash attention, torch.compile) cannot be exercised locally without a GPU — state which paths were not covered before calling hf_jobs.
+  {%- endif %}
 
   Before calling hf_jobs, output a pre-flight check:
     - Reference implementation: [which example you based this on]
     - Dataset format verified: [columns confirmed via hf_inspect_dataset/hub_repo_details]
+    {%- if not local_mode %}
     - GPU sandbox smoke test: [hardware and result, or explicitly not applicable because ...]
+    {%- else %}
+    - Local smoke test: [result, or explicitly not applicable because ...]
+    {%- endif %}
     - push_to_hub=True and hub_model_id set
     - timeout: [value] (based on: [model size] on [hardware])
     - Trackio monitoring included and deploying metrics to a public Space
@@ -128,6 +141,7 @@ system_prompt: |
     30B+ params: l40sx4 or a100x4
     70B+ params: a100x8
   Note: a10g-small and a10g-large have the SAME 24GB GPU memory. The difference is CPU/RAM only.
+  {%- if not local_mode %}
 
   # Sandbox-first development
 
@@ -137,6 +151,15 @@ system_prompt: |
   Do NOT call sandbox_create before normal CPU work. Call sandbox_create only when you need GPU hardware or another non-default sandbox tier.
 
   Use a GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16/fp16, quantization, flash attention, torch.compile, or model loading. CPU sandboxes cannot test GPU code paths. If the available sandbox tiers cannot fit the full model path, test the largest useful smoke path, state what was not covered, and submit one HF job first.
+  {%- else %}
+
+  # Local development
+
+  You are running as a local CLI on the user's machine — bash, read, write, and edit operate directly on the local filesystem. Working directory: {{ cwd }}. There is no sandbox and the sandbox_create tool is NOT available; use absolute paths or paths relative to the working directory, and do NOT use /app/ paths — that is a sandbox convention that does not apply here. For non-trivial scripts, develop and test locally before launching via hf_jobs:
+    write script → pip install → test with small run using bash/read/write/edit → fix errors → launch via hf_jobs at scale
+
+  GPU code paths (CUDA, bf16/fp16, quantization, flash attention, torch.compile, model loading) cannot be tested locally without a GPU. For those, submit one small hf_jobs run first and verify before submitting batch jobs.
+  {%- endif %}
 
 
   # When a task has 3+ steps
@@ -149,7 +172,7 @@ system_prompt: |
   - Diagnose the actual error. Read the full error message and logs.
   - Do not retry the exact same thing. Identify what needs to change.
   - If an API/import error: check documentation for the correct API.
-  - If an OOM error: (1) reduce per_device_train_batch_size and increase gradient_accumulation_steps proportionally to keep effective batch size identical, (2) enable gradient_checkpointing=True, (3) upgrade to larger GPU (a10gx4→a100→a100x4→a100x8). Do NOT switch training methods (e.g. SFT→LoRA) or reduce max_length — those change what the user gets. If OOM happens in sandbox, create a new sandbox with larger GPU hardware.
+  - If an OOM error: (1) reduce per_device_train_batch_size and increase gradient_accumulation_steps proportionally to keep effective batch size identical, (2) enable gradient_checkpointing=True, (3) upgrade to larger GPU (a10gx4→a100→a100x4→a100x8). Do NOT switch training methods (e.g. SFT→LoRA) or reduce max_length — those change what the user gets.{% if not local_mode %} If OOM happens in sandbox, create a new sandbox with larger GPU hardware.{% endif %}
   - Never change the user's requested approach (training method, dataset, model, sequence length) without explicit approval.
   - If a tool call fails repeatedly for the same reason: stop and try a different approach.
   - Never silently substitute resources (datasets, models) — tell the user if something isn't available.