1 change: 1 addition & 0 deletions agents/claude_reprompt/human_readable_trace.py
31 changes: 31 additions & 0 deletions agents/claude_reprompt/solve.sh
@@ -0,0 +1,31 @@
#!/bin/bash
unset GEMINI_API_KEY
unset CODEX_API_KEY

export BASH_MAX_TIMEOUT_MS="36000000"  # 10 hours

MIN_REMAINING_MINUTES=30

claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \
--dangerously-skip-permissions "$PROMPT"

# Re-prompt loop: if the agent finishes early, resume the session
while true; do
TIMER_OUTPUT=$(bash timer.sh 2>/dev/null)
if echo "$TIMER_OUTPUT" | grep -q "expired"; then
break
fi

REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)')
REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+')
    TOTAL_REMAINING_MINS=$(( 10#$REMAINING_HOURS * 60 + 10#$REMAINING_MINS ))  # force base 10: zero-padded values like "08" would otherwise be parsed as octal

if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then
break
fi

CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance."

claude --print --verbose --continue --model "$AGENT_CONFIG" --output-format stream-json \
--dangerously-skip-permissions "$CONTINUATION_PROMPT"
done
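
The loop above assumes timer.sh prints the remaining time as H:MM (e.g. "2:05") until the budget runs out, then the word "expired". timer.sh itself is not part of this diff; a minimal sketch of a compatible timer, assuming a hypothetical DEADLINE_EPOCH environment variable holding the deadline as a Unix timestamp:

#!/bin/bash
# Hypothetical timer.sh sketch -- not part of this PR.
# Assumes DEADLINE_EPOCH holds the wall-clock deadline as a Unix timestamp.
now=$(date +%s)
remaining=$(( DEADLINE_EPOCH - now ))
if [ "$remaining" -le 0 ]; then
    echo "expired"
else
    printf '%d:%02d\n' $(( remaining / 3600 )) $(( (remaining % 3600) / 60 ))
fi

Note the zero-padded minutes field: this is why the arithmetic in the re-prompt loop forces base 10 with the 10# prefix.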
1 change: 1 addition & 0 deletions agents/codex_xhigh/human_readable_trace.py
12 changes: 12 additions & 0 deletions agents/codex_xhigh/solve.sh
@@ -0,0 +1,12 @@
#!/bin/bash
unset ANTHROPIC_API_KEY
unset GEMINI_API_KEY

# Set reasoning effort to xhigh (prepend to ensure precedence)
file=/home/ben/.codex/config.toml
tmp="$(mktemp)"
printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp"
[ -f "$file" ] && cat "$file" >> "$tmp"
mv "$tmp" "$file"

codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT"
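
Since printf writes the override first and the old contents are appended after it, the first line of the file is deterministic and easy to verify:

# First line should now be the effort override
head -n 1 /home/ben/.codex/config.toml   # -> model_reasoning_effort = "xhigh"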
1 change: 1 addition & 0 deletions agents/codex_xhigh_reprompt/human_readable_trace.py
34 changes: 34 additions & 0 deletions agents/codex_xhigh_reprompt/solve.sh
@@ -0,0 +1,34 @@
#!/bin/bash
unset ANTHROPIC_API_KEY
unset GEMINI_API_KEY

# Set reasoning effort to xhigh (prepend to ensure precedence)
file=/home/ben/.codex/config.toml
tmp="$(mktemp)"
printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp"
[ -f "$file" ] && cat "$file" >> "$tmp"
mv "$tmp" "$file"

MIN_REMAINING_MINUTES=30

codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT"

# Re-prompt loop: if the agent finishes early, resume the session
while true; do
TIMER_OUTPUT=$(bash timer.sh 2>/dev/null)
if echo "$TIMER_OUTPUT" | grep -q "expired"; then
break
fi

REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)')
REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+')
    TOTAL_REMAINING_MINS=$(( 10#$REMAINING_HOURS * 60 + 10#$REMAINING_MINS ))  # force base 10: zero-padded values like "08" would otherwise be parsed as octal

if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then
break
fi

CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance."

codex --search exec resume --last --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$CONTINUATION_PROMPT"
done
78 changes: 78 additions & 0 deletions containers/gpt_5_5.def
@@ -0,0 +1,78 @@
Bootstrap: docker
From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04

%files
containers/requirements-direct.txt /opt/requirements-direct.txt

%post
chmod 1777 /tmp
# Set environment variables
export DEBIAN_FRONTEND=noninteractive

# Update and install system dependencies
apt-get update && apt-get install -y \
python3.10 \
python3-dev \
git \
wget \
curl \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Create python and python3 symlinks
ln -sf /usr/bin/python3.10 /usr/bin/python3
ln -sf /usr/bin/python3.10 /usr/bin/python

# Install Node.js (LTS version 22.x) for npm
curl -fsSL https://deb.nodesource.com/setup_22.x | bash -
apt-get install -y nodejs

# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="/root/.local/bin:$PATH"

uv pip install --system --no-cache vllm==0.11.0 --torch-backend=auto

# Pinned direct dependencies
uv pip install --system --no-cache -r /opt/requirements-direct.txt

# flash-attn (needs no-build-isolation)
uv pip install --system --no-cache flash-attn==2.8.3 --no-build-isolation

# Update CLI harnesses to latest stable versions
# OpenCode doesn't support DeepSeek V4 yet.
npm install -g \
@anthropic-ai/claude-code@2.1.116 \
@openai/codex@0.124.0 \
@google/gemini-cli@0.39.1 \
opencode-ai@1.14.20

# install inspect evals
mkdir -p /opt
cd /opt
git clone https://github.com/UKGovernmentBEIS/inspect_evals.git
cd /opt/inspect_evals
git checkout 06001a83e6d7c709c2ede0570dce7f1031a0bad8
uv pip install --system --no-cache .

# Install the Inspect AI fork with vLLM stdout debugging
mkdir -p /opt
cd /opt
git clone https://github.com/rank-and-file/inspect_ai_vllm_stdout.git
cd inspect_ai_vllm_stdout
uv pip install --system --no-cache .

%environment
export PATH="/root/.local/bin:$PATH"
export NO_PROXY="localhost,127.0.0.1"
export no_proxy="localhost,127.0.0.1"

%runscript
exec python3 "$@"

%labels
Version v1.0
Description Python ML container with CUDA support for transformers and LLM training (using uv) + AI CLI tools

%help
Note: Use the --nv flag to enable NVIDIA GPU support when running the container.
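
As the %help text notes, GPU access requires the --nv flag. A typical build-and-run sequence, assuming Apptainer (or Singularity) with an illustrative image name and script:

# Build the image from the definition file (may need root or --fakeroot)
apptainer build gpt_5_5.sif containers/gpt_5_5.def
# %runscript execs python3, so the arguments are a script and its args
apptainer run --nv gpt_5_5.sif train.py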
14 changes: 12 additions & 2 deletions dev_utils/extract_traces.py
@@ -153,6 +153,11 @@ def main():
         nargs="+",
         help="Input directory names (relative to RESULTS_BASE) to process"
     )
+    parser.add_argument(
+        "--all",
+        action="store_true",
+        help="Copy all runs, not just the latest per task (default: latest only)"
+    )
     args = parser.parse_args()
 
     output_base = Path(OUTPUT_DIR)
@@ -175,8 +180,12 @@
 
         print(f"\n[{input_dir_name}]")
 
-        # Iterate over only the latest subdirectories (highest ID per prefix)
-        for subdir in sorted(get_latest_subdirs(input_dir)):
+        # Iterate over subdirectories (latest per task by default, all with --all)
+        if args.all:
+            subdirs = sorted(d for d in input_dir.iterdir() if d.is_dir())
+        else:
+            subdirs = sorted(get_latest_subdirs(input_dir))
+        for subdir in subdirs:
             # Determine source file (prefer solve_parsed.txt)
             src_file = subdir / "solve_parsed.txt"
             if not src_file.exists():
@@ -201,6 +210,7 @@
             copy_other_files(subdir, dest_dir, 'contamination_judgement.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'disallowed_model_judgement.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'error.log', 'judgement.log', api_keys=api_keys)
+            copy_other_files(subdir, dest_dir, 'time_taken.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'system_monitor.log', api_keys=api_keys, optional=True)
 
             tag = " [sanitized]" if was_sanitized else ""
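
A hypothetical invocation of the new flag, copying every run directory instead of only the latest per task (directory name illustrative, relative to RESULTS_BASE):

python3 dev_utils/extract_traces.py claude_reprompt --all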
4 changes: 3 additions & 1 deletion dev_utils/limit_hit_list.py
@@ -10,11 +10,13 @@
     "You've hit your limit", # Claude Code Pro subscription limit
     "spending_limit", # Anthropic/OpenAI spending limit
     "billing_hard_limit", # OpenAI billing hard limit
-    "insufficient_quota", # OpenAI quota exceeded
+    "insufficient_quota", # OpenAI quota exceeded (structured error code)
+    "Quota exceeded. Check your plan", # OpenAI/Codex quota exceeded (turn.failed message)
     "budget_exceeded", # General budget error
     "plan does not yet include", # Z.AI subscription plan restriction
     "token_expired", # OpenAI/Codex expired auth token
     "Failed to refresh token", # Codex CLI refresh token failure
+    "Reconnecting... 5/5", # Codex CLI exhausted stream-reconnect retries
 ]
 
 
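
Since the markers are literal substrings (note the dots in "Reconnecting... 5/5"), a quick manual scan over a results tree can use fixed-string grep (path illustrative):

# -F: fixed strings, so regex metacharacters in the markers stay literal
# -r: recurse, -l: print only matching file names
grep -rlF "Quota exceeded. Check your plan" results/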
16 changes: 10 additions & 6 deletions dev_utils/terminated_finder.py
@@ -11,19 +11,23 @@ def get_results_dir():
     return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
 
 
+KILLED_RE = re.compile(rb"run_task\.sh: line \d+: \d+ Killed")
+
+
 def classify_error(error_log_path: Path) -> str | None:
     """Classify the error in error.log. Returns 'terminated', 'killed', or None."""
     if not error_log_path.exists():
         return None
     try:
-        content = error_log_path.read_text()
-        if content.startswith("Terminated"):
-            return "terminated"
-        if re.search(r"\bKilled\b", content):
-            return "killed"
-        return None
+        with open(error_log_path, "rb") as f:
+            head = f.read(4096)
     except Exception:
         return None
+    if head.startswith(b"Terminated"):
+        return "terminated"
+    if KILLED_RE.search(head):
+        return "killed"
+    return None
 
 
 def get_latest_runs(method_path: Path):
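
The rewritten classify_error reads only the first 4 KiB, in binary mode, so huge or undecodable logs no longer matter. A shell spot-check of the same two patterns on a single log (path illustrative):

# Equivalent to head.startswith(b"Terminated"): test only the first line
head -c 4096 results/run_01/error.log | head -n 1 | grep -qa '^Terminated' && echo terminated
# Equivalent to KILLED_RE.search(head)
head -c 4096 results/run_01/error.log | grep -Eqa 'run_task\.sh: line [0-9]+: [0-9]+ Killed' && echo killed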