1 change: 1 addition & 0 deletions agents/claude_reprompt/human_readable_trace.py
31 changes: 31 additions & 0 deletions agents/claude_reprompt/solve.sh
@@ -0,0 +1,31 @@
#!/bin/bash
unset GEMINI_API_KEY
unset CODEX_API_KEY

export BASH_MAX_TIMEOUT_MS="36000000"  # 10 hours

MIN_REMAINING_MINUTES=30

claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \
--dangerously-skip-permissions "$PROMPT"

# Re-prompt loop: if the agent finishes early, resume the session
while true; do
TIMER_OUTPUT=$(bash timer.sh 2>/dev/null)
if echo "$TIMER_OUTPUT" | grep -q "expired"; then
break
fi

REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)')
REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+')
    TOTAL_REMAINING_MINS=$(( 10#$REMAINING_HOURS * 60 + 10#$REMAINING_MINS ))  # force base 10: zero-padded values like "08" would otherwise be parsed as octal

if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then
break
fi

CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance."

claude --print --verbose --continue --model "$AGENT_CONFIG" --output-format stream-json \
--dangerously-skip-permissions "$CONTINUATION_PROMPT"
done
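
The loop above assumes timer.sh prints the remaining time as H:MM (e.g. "2:05") until the budget runs out, then the word "expired". timer.sh itself is not part of this diff; a minimal sketch of a compatible timer, assuming a hypothetical DEADLINE_EPOCH environment variable holding the deadline as a Unix timestamp:

#!/bin/bash
# Hypothetical timer.sh sketch -- not part of this PR.
# Assumes DEADLINE_EPOCH holds the wall-clock deadline as a Unix timestamp.
now=$(date +%s)
remaining=$(( DEADLINE_EPOCH - now ))
if [ "$remaining" -le 0 ]; then
    echo "expired"
else
    printf '%d:%02d\n' $(( remaining / 3600 )) $(( (remaining % 3600) / 60 ))
fi

Note the zero-padded minutes field: this is why the arithmetic in the re-prompt loop forces base 10 with the 10# prefix.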
1 change: 1 addition & 0 deletions agents/codex_xhigh/human_readable_trace.py
12 changes: 12 additions & 0 deletions agents/codex_xhigh/solve.sh
@@ -0,0 +1,12 @@
#!/bin/bash
unset ANTHROPIC_API_KEY
unset GEMINI_API_KEY

# Set reasoning effort to xhigh (prepend to ensure precedence)
file=/home/ben/.codex/config.toml
tmp="$(mktemp)"
printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp"
[ -f "$file" ] && cat "$file" >> "$tmp"
mv "$tmp" "$file"

codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT"
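
Since printf writes the override first and the old contents are appended after it, the first line of the file is deterministic and easy to verify:

# First line should now be the effort override
head -n 1 /home/ben/.codex/config.toml   # -> model_reasoning_effort = "xhigh"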
1 change: 1 addition & 0 deletions agents/codex_xhigh_reprompt/human_readable_trace.py
34 changes: 34 additions & 0 deletions agents/codex_xhigh_reprompt/solve.sh
@@ -0,0 +1,34 @@
#!/bin/bash
unset ANTHROPIC_API_KEY
unset GEMINI_API_KEY

# Set reasoning effort to xhigh (prepend to ensure precedence)
file=/home/ben/.codex/config.toml
tmp="$(mktemp)"
printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp"
[ -f "$file" ] && cat "$file" >> "$tmp"
mv "$tmp" "$file"

MIN_REMAINING_MINUTES=30

codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT"

# Re-prompt loop: if the agent finishes early, resume the session
while true; do
TIMER_OUTPUT=$(bash timer.sh 2>/dev/null)
if echo "$TIMER_OUTPUT" | grep -q "expired"; then
break
fi

REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)')
REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+')
    TOTAL_REMAINING_MINS=$(( 10#$REMAINING_HOURS * 60 + 10#$REMAINING_MINS ))  # force base 10: zero-padded values like "08" would otherwise be parsed as octal

if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then
break
fi

CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance."

codex --search exec resume --last --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$CONTINUATION_PROMPT"
done
78 changes: 78 additions & 0 deletions containers/gpt_5_5.def
@@ -0,0 +1,78 @@
Bootstrap: docker
From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04

%files
containers/requirements-direct.txt /opt/requirements-direct.txt

%post
chmod 1777 /tmp
# Set environment variables
export DEBIAN_FRONTEND=noninteractive

# Update and install system dependencies
apt-get update && apt-get install -y \
python3.10 \
python3-dev \
git \
wget \
curl \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Create python and python3 symlinks
ln -sf /usr/bin/python3.10 /usr/bin/python3
ln -sf /usr/bin/python3.10 /usr/bin/python

# Install Node.js (LTS version 22.x) for npm
curl -fsSL https://deb.nodesource.com/setup_22.x | bash -
apt-get install -y nodejs

# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="/root/.local/bin:$PATH"

uv pip install --system --no-cache vllm==0.11.0 --torch-backend=auto

# Pinned direct dependencies
uv pip install --system --no-cache -r /opt/requirements-direct.txt

# flash-attn (needs no-build-isolation)
uv pip install --system --no-cache flash-attn==2.8.3 --no-build-isolation

# Update CLI harnesses to latest stable versions
# OpenCode doesn't support DeepSeek V4 yet.
npm install -g \
@anthropic-ai/claude-code@2.1.116 \
@openai/codex@0.124.0 \
@google/gemini-cli@0.39.1 \
opencode-ai@1.14.20

# install inspect evals
mkdir -p /opt
cd /opt
git clone https://github.com/UKGovernmentBEIS/inspect_evals.git
cd /opt/inspect_evals
git checkout 06001a83e6d7c709c2ede0570dce7f1031a0bad8
uv pip install --system --no-cache .

# Install the Inspect AI fork with vLLM stdout debugging
mkdir -p /opt
cd /opt
git clone https://github.com/rank-and-file/inspect_ai_vllm_stdout.git
cd inspect_ai_vllm_stdout
uv pip install --system --no-cache .

%environment
export PATH="/root/.local/bin:$PATH"
export NO_PROXY="localhost,127.0.0.1"
export no_proxy="localhost,127.0.0.1"

%runscript
exec python3 "$@"

%labels
Version v1.0
Description Python ML container with CUDA support for transformers and LLM training (using uv) + AI CLI tools

%help
Note: Use the --nv flag to enable NVIDIA GPU support when running the container.
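
As the %help text notes, GPU access requires the --nv flag. A typical build-and-run sequence, assuming Apptainer (or Singularity) with an illustrative image name and script:

# Build the image from the definition file (may need root or --fakeroot)
apptainer build gpt_5_5.sif containers/gpt_5_5.def
# %runscript execs python3, so the arguments are a script and its args
apptainer run --nv gpt_5_5.sif train.py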
14 changes: 12 additions & 2 deletions dev_utils/extract_traces.py
@@ -153,6 +153,11 @@ def main():
         nargs="+",
         help="Input directory names (relative to RESULTS_BASE) to process"
     )
+    parser.add_argument(
+        "--all",
+        action="store_true",
+        help="Copy all runs, not just the latest per task (default: latest only)"
+    )
     args = parser.parse_args()
 
     output_base = Path(OUTPUT_DIR)
@@ -175,8 +180,12 @@
 
         print(f"\n[{input_dir_name}]")
 
-        # Iterate over only the latest subdirectories (highest ID per prefix)
-        for subdir in sorted(get_latest_subdirs(input_dir)):
+        # Iterate over subdirectories (latest per task by default, all with --all)
+        if args.all:
+            subdirs = sorted(d for d in input_dir.iterdir() if d.is_dir())
+        else:
+            subdirs = sorted(get_latest_subdirs(input_dir))
+        for subdir in subdirs:
             # Determine source file (prefer solve_parsed.txt)
             src_file = subdir / "solve_parsed.txt"
             if not src_file.exists():
@@ -201,6 +210,7 @@
             copy_other_files(subdir, dest_dir, 'contamination_judgement.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'disallowed_model_judgement.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'error.log', 'judgement.log', api_keys=api_keys)
+            copy_other_files(subdir, dest_dir, 'time_taken.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'system_monitor.log', api_keys=api_keys, optional=True)
 
             tag = " [sanitized]" if was_sanitized else ""
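
A hypothetical invocation of the new flag, copying every run directory instead of only the latest per task (directory name illustrative, relative to RESULTS_BASE):

python3 dev_utils/extract_traces.py claude_reprompt --all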
4 changes: 3 additions & 1 deletion dev_utils/limit_hit_list.py
@@ -10,11 +10,13 @@
     "You've hit your limit", # Claude Code Pro subscription limit
     "spending_limit", # Anthropic/OpenAI spending limit
     "billing_hard_limit", # OpenAI billing hard limit
-    "insufficient_quota", # OpenAI quota exceeded
+    "insufficient_quota", # OpenAI quota exceeded (structured error code)
+    "Quota exceeded. Check your plan", # OpenAI/Codex quota exceeded (turn.failed message)
     "budget_exceeded", # General budget error
     "plan does not yet include", # Z.AI subscription plan restriction
     "token_expired", # OpenAI/Codex expired auth token
     "Failed to refresh token", # Codex CLI refresh token failure
+    "Reconnecting... 5/5", # Codex CLI exhausted stream-reconnect retries
 ]
 
 
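
Since the markers are literal substrings (note the dots in "Reconnecting... 5/5"), a quick manual scan over a results tree can use fixed-string grep (path illustrative):

# -F: fixed strings, so regex metacharacters in the markers stay literal
# -r: recurse, -l: print only matching file names
grep -rlF "Quota exceeded. Check your plan" results/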
16 changes: 10 additions & 6 deletions dev_utils/terminated_finder.py
@@ -11,19 +11,23 @@ def get_results_dir():
     return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
 
 
+KILLED_RE = re.compile(rb"run_task\.sh: line \d+: \d+ Killed")
+
+
 def classify_error(error_log_path: Path) -> str | None:
     """Classify the error in error.log. Returns 'terminated', 'killed', or None."""
     if not error_log_path.exists():
         return None
     try:
-        content = error_log_path.read_text()
-        if content.startswith("Terminated"):
-            return "terminated"
-        if re.search(r"\bKilled\b", content):
-            return "killed"
-        return None
+        with open(error_log_path, "rb") as f:
+            head = f.read(4096)
     except Exception:
         return None
+    if head.startswith(b"Terminated"):
+        return "terminated"
+    if KILLED_RE.search(head):
+        return "killed"
+    return None
 
 
 def get_latest_runs(method_path: Path):
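
The rewritten classify_error reads only the first 4 KiB, in binary mode, so huge or undecodable logs no longer matter. A shell spot-check of the same two patterns on a single log (path illustrative):

# Equivalent to head.startswith(b"Terminated"): test only the first line
head -c 4096 results/run_01/error.log | head -n 1 | grep -qa '^Terminated' && echo terminated
# Equivalent to KILLED_RE.search(head)
head -c 4096 results/run_01/error.log | grep -Eqa 'run_task\.sh: line [0-9]+: [0-9]+ Killed' && echo killed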