From 2d6f1d5ffc009aefea020a1e0aaea8150be59fd7 Mon Sep 17 00:00:00 2001
From: hrdkbhatnagar <hrdk.bhatnagar@gmail.com>
Date: Tue, 5 May 2026 21:36:57 +0200
Subject: [PATCH] refactor aggregation scripts completely; fix dev_utils
 scripts; add new agents; add rerun judge

---
 .../claude_reprompt/human_readable_trace.py   |   1 +
 agents/claude_reprompt/solve.sh               |  31 ++
 agents/codex_xhigh/human_readable_trace.py    |   1 +
 agents/codex_xhigh/solve.sh                   |  12 +
 .../human_readable_trace.py                   |   1 +
 agents/codex_xhigh_reprompt/solve.sh          |  34 ++
 containers/gpt_5_5.def                        |  78 ++++
 dev_utils/extract_traces.py                   |  14 +-
 dev_utils/limit_hit_list.py                   |   4 +-
 dev_utils/terminated_finder.py                |  16 +-
 scripts/README.md                             | 173 ++++++++
 scripts/aggregate.py                          | 363 ++++++++++++++++
 scripts/aggregate_metrics_runs.py             |  72 +++
 scripts/baselines.json                        | 114 +++++
 scripts/collect.py                            | 255 +++++++++++
 scripts/constants.py                          |   8 +-
 scripts/rerun_eval_n_times.sh                 | 152 +++++++
 scripts/utils.py                              | 410 ++++++++++++++++++
 scripts/verify.py                             | 261 +++++++++++
 src/commit_utils/commit.sh                    | 106 +++--
 src/commit_utils/rerun_eval.sub               |  15 +
 .../rerun_judge/README.md                     | 100 +++++
 .../rerun_judge/aggregate_rerun_results.py    | 149 +++++++
 .../rerun_judge/commit_rerun_judge.sh         |  88 ++++
 .../rerun_judge/list_results.py               |  74 ++++
 .../rerun_judge/rerun_judge.sub               |  12 +
 .../rerun_judge/rerun_single.sh               |  19 +
 .../rerun_judge/utils.py                      | 135 ++++++
 src/disallowed_usage_judge/run_judge.sh       | 124 ++++++
 29 files changed, 2765 insertions(+), 57 deletions(-)
 create mode 120000 agents/claude_reprompt/human_readable_trace.py
 create mode 100755 agents/claude_reprompt/solve.sh
 create mode 120000 agents/codex_xhigh/human_readable_trace.py
 create mode 100755 agents/codex_xhigh/solve.sh
 create mode 120000 agents/codex_xhigh_reprompt/human_readable_trace.py
 create mode 100755 agents/codex_xhigh_reprompt/solve.sh
 create mode 100644 containers/gpt_5_5.def
 create mode 100644 scripts/README.md
 create mode 100644 scripts/aggregate.py
 create mode 100755 scripts/aggregate_metrics_runs.py
 create mode 100644 scripts/baselines.json
 create mode 100644 scripts/collect.py
 create mode 100755 scripts/rerun_eval_n_times.sh
 create mode 100644 scripts/utils.py
 create mode 100644 scripts/verify.py
 create mode 100644 src/commit_utils/rerun_eval.sub
 create mode 100644 src/disallowed_usage_judge/rerun_judge/README.md
 create mode 100644 src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py
 create mode 100755 src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh
 create mode 100644 src/disallowed_usage_judge/rerun_judge/list_results.py
 create mode 100644 src/disallowed_usage_judge/rerun_judge/rerun_judge.sub
 create mode 100755 src/disallowed_usage_judge/rerun_judge/rerun_single.sh
 create mode 100644 src/disallowed_usage_judge/rerun_judge/utils.py
 create mode 100755 src/disallowed_usage_judge/run_judge.sh

diff --git a/agents/claude_reprompt/human_readable_trace.py b/agents/claude_reprompt/human_readable_trace.py
new file mode 120000
index 0000000..d643db0
--- /dev/null
+++ b/agents/claude_reprompt/human_readable_trace.py
@@ -0,0 +1 @@
+../claude/human_readable_trace.py
\ No newline at end of file
diff --git a/agents/claude_reprompt/solve.sh b/agents/claude_reprompt/solve.sh
new file mode 100755
index 0000000..b0b25a4
--- /dev/null
+++ b/agents/claude_reprompt/solve.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+unset GEMINI_API_KEY
+unset CODEX_API_KEY
+
+export BASH_MAX_TIMEOUT_MS="36000000"
+
+MIN_REMAINING_MINUTES=30
+
+claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \
+    --dangerously-skip-permissions "$PROMPT"
+
+# Re-prompt loop: if the agent finishes early, resume the session
+while true; do
+    TIMER_OUTPUT=$(bash timer.sh 2>/dev/null)
+    if echo "$TIMER_OUTPUT" | grep -q "expired"; then
+        break
+    fi
+    # Keep only the first match of each field so extra ":NN" groups cannot leak in.
+    REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)' | head -n 1)
+    REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+' | head -n 1)
+    # 10# forces base 10 ("08"/"09" are invalid octal); :-0 makes empty timer output end the loop.
+    TOTAL_REMAINING_MINS=$(( 10#${REMAINING_HOURS:-0} * 60 + 10#${REMAINING_MINS:-0} ))
+    if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then
+        break
+    fi
+
+    CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance."
+
+    claude --print --verbose --continue --model "$AGENT_CONFIG" --output-format stream-json \
+        --dangerously-skip-permissions "$CONTINUATION_PROMPT"
+done
diff --git a/agents/codex_xhigh/human_readable_trace.py b/agents/codex_xhigh/human_readable_trace.py
new file mode 120000
index 0000000..9cf1a5d
--- /dev/null
+++ b/agents/codex_xhigh/human_readable_trace.py
@@ -0,0 +1 @@
+../codex/human_readable_trace.py
\ No newline at end of file
diff --git a/agents/codex_xhigh/solve.sh b/agents/codex_xhigh/solve.sh
new file mode 100755
index 0000000..443f1c5
--- /dev/null
+++ b/agents/codex_xhigh/solve.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+unset ANTHROPIC_API_KEY
+unset GEMINI_API_KEY
+
+# Prepend xhigh reasoning effort; drop any old top-level copy (TOML forbids duplicate keys)
+file=/home/ben/.codex/config.toml
+tmp="$(mktemp)"
+printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp"
+[ -f "$file" ] && awk '/^\[/{t=1} t || !/^[ \t]*model_reasoning_effort[ \t]*=/' "$file" >> "$tmp"
+mv "$tmp" "$file"
+
+codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT"
diff --git a/agents/codex_xhigh_reprompt/human_readable_trace.py b/agents/codex_xhigh_reprompt/human_readable_trace.py
new file mode 120000
index 0000000..9cf1a5d
--- /dev/null
+++ b/agents/codex_xhigh_reprompt/human_readable_trace.py
@@ -0,0 +1 @@
+../codex/human_readable_trace.py
\ No newline at end of file
diff --git a/agents/codex_xhigh_reprompt/solve.sh b/agents/codex_xhigh_reprompt/solve.sh
new file mode 100755
index 0000000..3afc973
--- /dev/null
+++ b/agents/codex_xhigh_reprompt/solve.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+unset ANTHROPIC_API_KEY
+unset GEMINI_API_KEY
+
+# Prepend xhigh reasoning effort; drop any old top-level copy (TOML forbids duplicate keys)
+file=/home/ben/.codex/config.toml
+tmp="$(mktemp)"
+printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp"
+[ -f "$file" ] && awk '/^\[/{t=1} t || !/^[ \t]*model_reasoning_effort[ \t]*=/' "$file" >> "$tmp"
+mv "$tmp" "$file"
+
+MIN_REMAINING_MINUTES=30
+
+codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT"
+
+# Re-prompt loop: if the agent finishes early, resume the session
+while true; do
+    TIMER_OUTPUT=$(bash timer.sh 2>/dev/null)
+    if echo "$TIMER_OUTPUT" | grep -q "expired"; then
+        break
+    fi
+    # Keep only the first match of each field so extra ":NN" groups cannot leak in.
+    REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)' | head -n 1)
+    REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+' | head -n 1)
+    # 10# forces base 10 ("08"/"09" are invalid octal); :-0 makes empty timer output end the loop.
+    TOTAL_REMAINING_MINS=$(( 10#${REMAINING_HOURS:-0} * 60 + 10#${REMAINING_MINS:-0} ))
+    if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then
+        break
+    fi
+
+    CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance."
+
+    codex --search exec resume --last --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$CONTINUATION_PROMPT"
+done
diff --git a/containers/gpt_5_5.def b/containers/gpt_5_5.def
new file mode 100644
index 0000000..5010544
--- /dev/null
+++ b/containers/gpt_5_5.def
@@ -0,0 +1,78 @@
+Bootstrap: docker
+From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
+
+%files
+    containers/requirements-direct.txt /opt/requirements-direct.txt
+
+%post
+    chmod 1777 /tmp
+    # Set environment variables
+    export DEBIAN_FRONTEND=noninteractive
+
+    # Update and install system dependencies
+    apt-get update && apt-get install -y \
+        python3.10 \
+        python3-dev \
+        git \
+        wget \
+        curl \
+        build-essential \
+        && rm -rf /var/lib/apt/lists/*
+
+    # Create python3 symlink
+    ln -sf /usr/bin/python3.10 /usr/bin/python3
+    ln -sf /usr/bin/python3.10 /usr/bin/python
+    
+    # Install Node.js (LTS version 22.x) for npm
+    curl -fsSL https://deb.nodesource.com/setup_22.x | bash -
+    apt-get install -y nodejs
+    
+    # Install uv
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    export PATH="/root/.local/bin:$PATH"
+    
+    uv pip install --system --no-cache vllm==0.11.0 --torch-backend=auto
+
+    #  Pinned direct dependencies
+    uv pip install --system --no-cache -r /opt/requirements-direct.txt
+
+    #  flash-attn (needs no-build-isolation)
+    uv pip install --system --no-cache flash-attn==2.8.3 --no-build-isolation
+
+    #  update CLI harnesses to the latest stable versions
+    # OpenCode doesn't support DeepSeek V4 yet. 
+    npm install -g \
+        @anthropic-ai/claude-code@2.1.116 \
+        @openai/codex@0.124.0 \
+        @google/gemini-cli@0.39.1 \
+        opencode-ai@1.14.20
+
+    # install inspect evals
+    mkdir -p /opt
+    cd /opt
+    git clone https://github.com/UKGovernmentBEIS/inspect_evals.git
+    cd /opt/inspect_evals
+    git checkout 06001a83e6d7c709c2ede0570dce7f1031a0bad8
+    uv pip install --system --no-cache .
+
+    # install inspect ai with debug 
+    mkdir -p /opt
+    cd /opt
+    git clone https://github.com/rank-and-file/inspect_ai_vllm_stdout.git
+    cd inspect_ai_vllm_stdout
+    uv pip install --system --no-cache .
+    
+%environment
+    export PATH="/root/.local/bin:$PATH"
+    export NO_PROXY="localhost,127.0.0.1"
+    export no_proxy="localhost,127.0.0.1"
+
+%runscript
+    exec python3 "$@"
+
+%labels
+    Version v1.0
+    Description Python ML container with CUDA support for transformers and LLM training (using uv) + AI CLI tools
+
+%help
+    Note: Use the --nv flag to enable NVIDIA GPU support when running the container.
diff --git a/dev_utils/extract_traces.py b/dev_utils/extract_traces.py
index 74e2e7c..ae4affe 100644
--- a/dev_utils/extract_traces.py
+++ b/dev_utils/extract_traces.py
@@ -153,6 +153,11 @@ def main():
         nargs="+",
         help="Input directory names (relative to RESULTS_BASE) to process"
     )
+    parser.add_argument(
+        "--all",
+        action="store_true",
+        help="Copy all runs, not just the latest per task (default: latest only)"
+    )
     args = parser.parse_args()
 
     output_base = Path(OUTPUT_DIR)
@@ -175,8 +180,12 @@ def main():
 
         print(f"\n[{input_dir_name}]")
 
-        # Iterate over only the latest subdirectories (highest ID per prefix)
-        for subdir in sorted(get_latest_subdirs(input_dir)):
+        # Iterate over subdirectories (latest per task by default, all with --all)
+        if args.all:
+            subdirs = sorted(d for d in input_dir.iterdir() if d.is_dir())
+        else:
+            subdirs = sorted(get_latest_subdirs(input_dir))
+        for subdir in subdirs:
             # Determine source file (prefer solve_parsed.txt)
             src_file = subdir / "solve_parsed.txt"
             if not src_file.exists():
@@ -201,6 +210,7 @@ def main():
             copy_other_files(subdir, dest_dir, 'contamination_judgement.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'disallowed_model_judgement.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'error.log', 'judgement.log', api_keys=api_keys)
+            copy_other_files(subdir, dest_dir, 'time_taken.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'system_monitor.log', api_keys=api_keys, optional=True)
 
             tag = " [sanitized]" if was_sanitized else ""
diff --git a/dev_utils/limit_hit_list.py b/dev_utils/limit_hit_list.py
index 12f5809..9bfa144 100644
--- a/dev_utils/limit_hit_list.py
+++ b/dev_utils/limit_hit_list.py
@@ -10,11 +10,13 @@
     "You've hit your limit",         # Claude Code Pro subscription limit
     "spending_limit",                 # Anthropic/OpenAI spending limit
     "billing_hard_limit",            # OpenAI billing hard limit
-    "insufficient_quota",            # OpenAI quota exceeded
+    "insufficient_quota",            # OpenAI quota exceeded (structured error code)
+    "Quota exceeded. Check your plan",  # OpenAI/Codex quota exceeded (turn.failed message)
     "budget_exceeded",               # General budget error
     "plan does not yet include",     # Z.AI subscription plan restriction
     "token_expired",                 # OpenAI/Codex expired auth token
     "Failed to refresh token",       # Codex CLI refresh token failure
+    "Reconnecting... 5/5",           # Codex CLI exhausted stream-reconnect retries
 ]
 
 
diff --git a/dev_utils/terminated_finder.py b/dev_utils/terminated_finder.py
index f7af378..90e21fe 100644
--- a/dev_utils/terminated_finder.py
+++ b/dev_utils/terminated_finder.py
@@ -11,19 +11,23 @@ def get_results_dir():
     return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
 
 
+KILLED_RE = re.compile(rb"run_task\.sh: line \d+: \d+ Killed")
+
+
 def classify_error(error_log_path: Path) -> str | None:
     """Classify the error in error.log. Returns 'terminated', 'killed', or None."""
     if not error_log_path.exists():
         return None
     try:
-        content = error_log_path.read_text()
-        if content.startswith("Terminated"):
-            return "terminated"
-        if re.search(r"\bKilled\b", content):
-            return "killed"
-        return None
+        with open(error_log_path, "rb") as f:
+            head = f.read(4096)
     except Exception:
         return None
+    if head.startswith(b"Terminated"):
+        return "terminated"
+    if KILLED_RE.search(head):
+        return "killed"
+    return None
 
 
 def get_latest_runs(method_path: Path):
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..4f08e87
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,173 @@
+# scripts
+
+Post-hoc analysis utilities for PostTrainBench result directories. Most scripts
+here read the contents of `$POST_TRAIN_BENCH_RESULTS_DIR` and produce CSV /
+JSON aggregates; the exception is `rerun_eval_n_times.sh`, which actually
+re-runs the model on a GPU.
+
+## Aggregating results into CSVs
+
+The recommended pipeline is two scripts: `collect.py` reads raw run dirs into
+per-method CSVs, then `aggregate.py` rolls those into per-agent avg/std and
+the weighted leaderboard metric. Together they replace ~10 older scripts
+(`aggregate_methods.py`, `aggregate_final.py`, `aggregate_contamination.py`,
+`aggregate_time.py`, `compute_single_metrics*.py`, etc., still kept in the
+tree for reference — see "Legacy per-stage scripts" below).
+
+### Typical flow
+
+From the repo root, with `POST_TRAIN_BENCH_RESULTS_DIR` pointing at the raw
+results tree:
+
+```bash
+# 1. Collect raw per-run data into per-method CSVs.
+#    Reads metrics.json + contamination/disallowed_model judgements + time_taken.txt,
+#    applies baseline-zeroshot fallback for contaminated/errored cells.
+#    Writes:
+#      final_{method}.csv          — score grid (model x benchmark) with fallback
+#      contamination_{method}.csv  — flags ("", "C", "M", "MC", or error string)
+#      time_overview.csv           — average wall time per method
+python scripts/collect.py
+
+# 2. Aggregate across runs/agents and compute the weighted leaderboard metric.
+#    Reads final_{method}.csv produced above. Writes:
+#      aggregated_avg_{agent}.csv  — per-cell mean across runs (one per multi-run agent)
+#      aggregated_std_{agent}.csv  — per-cell sample stddev (n-1)
+#      single_metrics.csv          — weighted score per individual run
+#      single_metrics_aggregated.csv  — agent-level avg/std/n on the weighted metric
+#      time_aggregated.csv         — agent-level avg/std wall time
+python scripts/aggregate.py
+```
+
+`aggregate.py` skips agents whose run CSVs aren't present in this results
+dir, so it's safe to run against a partial tree.
+
+### `collect.py` flags
+
+```bash
+python scripts/collect.py \
+    --data-dir /path/to/results \      # default: $POST_TRAIN_BENCH_RESULTS_DIR
+    --output-dir /path/to/out \        # default: same as --data-dir
+    --min-run-id 17000000 \            # inclusive lower bound on cluster_id
+    --max-run-id 17200000              # exclusive upper bound on cluster_id
+```
+
+### `aggregate.py` flags
+
+By default `--all` is implied (write everything). Use the flags below to
+restrict to one stage:
+
+```bash
+python scripts/aggregate.py --per-cell      # only aggregated_avg/std_{agent}.csv
+python scripts/aggregate.py --leaderboard   # only single_metrics{,_aggregated}.csv
+python scripts/aggregate.py --time          # only time_aggregated.csv
+```
+
+Same `--data-dir` / `--output-dir` flags as `collect.py`.
+
+### Hardcoded things
+
+| File | What it pins |
+|---|---|
+| `constants.py` (`HARDCODED_AGENT_MAP`) | Which run directories belong to which agent (multi-run agents are how stddev is computed) |
+| `constants.py` (`HARDCODED_BENCHMARKS`, `EXPECTED_MODELS`) | Benchmark + base-model lists |
+| `factors.json` | Per-benchmark weights for the weighted leaderboard metric |
+| `baselines.json` | Hardcoded zero-shot + few-shot baseline scores; used as fallback for contaminated/errored cells (no longer recomputed at every run) |
+
+To add a new agent: add its run-dir names to `HARDCODED_AGENT_MAP` in
+`constants.py`. To add a new benchmark: extend `HARDCODED_BENCHMARKS` and add
+a weight to `factors.json`.
+
+### `verify.py` (refactor regression check)
+
+`verify.py` is a one-off script used when the new pipeline was
+rolled out — it compares two CSV output dirs cell-by-cell with float
+tolerance, to confirm the new pipeline reproduces the old one's values
+(except for filename renames). Not part of the normal workflow.
+
+```bash
+python scripts/verify.py \
+    --ground-truth /fast/.../ptb_results_old \
+    --new-output   /fast/.../ptb_results_new
+```
+
+## Other helpers
+
+| Script | Description |
+|---|---|
+| `compute_claude_costs.py` | Claude API spend rollup |
+| `extract_token_usage.py` | Token-usage extraction from agent traces |
+| `migrate_judgement_files.py` | One-off: migrate older judgement file naming |
+| `list_safetensors.py` | List safetensors files under a result tree |
+| `parse_all_to_human_readable.sh` | Run human-readable trace parsers across results |
+| `baselines.json`, `factors.json`, `constants.py`, `utils.py` | Shared config / helpers |
+
+## Legacy per-stage scripts
+
+These predate `collect.py` + `aggregate.py` and are kept for reference; the
+two new scripts cover what they did with less duplication. Prefer the new
+pipeline for fresh analysis.
+
+| Script | Replaced by |
+|---|---|
+| `aggregate_methods.py` | `collect.py` |
+| `aggregate_contamination.py` | `collect.py` |
+| `aggregate_final.py` | `collect.py` |
+| `aggregate_time.py` | `collect.py` (writes `time_overview.csv`) |
+| `aggregate_avg_stddev.py` | `aggregate.py --per-cell` |
+| `aggregate_avg_stddev_over_benchmarks.py` | (dropped — deprecated artifact) |
+| `aggregate_time_avg_stddev.py` | `aggregate.py --time` |
+| `aggregate_time_baselines.py` | (dropped — baselines hardcoded) |
+| `aggregate_summary.py`, `aggregate_together.py` | `aggregate.py --leaderboard` |
+| `compute_single_metrics.py`, `compute_single_metrics_avg_stddev.py` | `aggregate.py --leaderboard` |
+| `compute_baseline_metrics.py`, `compute_baseline_metrics_by_benchmark.py` | `baselines.json` (no longer recomputed) |
+
+
+## Re-evaluating a finished run N times
+
+`rerun_eval_n_times.sh` re-evaluates a job's `final_model/` N times and writes
+mean / std / stderr / min / max per metric into `metrics_averaged.json`. Useful
+because each job's standard `metrics.json` is a single decoding sample per
+question and does not capture decoding noise.
+
+It mirrors `src/run_task.sh`'s evaluation step exactly:
+
+- runs `src/eval/tasks/<task>/evaluate.py` (the live source — **not** the
+  potentially-modified snapshot in `<EVAL_DIR>/task/`)
+- inside the same `${POST_TRAIN_BENCH_CONTAINER_NAME}.sif` container
+- with the same fuse-overlayfs HF cache pattern (`with_huggingface_overlay`)
+- using the same `--max-tokens` fallback ladder per task
+
+Per-run JSONs are written to `<EVAL_DIR>/reruns/run_{i}.json` (with
+`run_{i}_{level}.log` alongside). The aggregated file is `<EVAL_DIR>/metrics_averaged.json`.
+
+### Files
+
+| File | Description |
+|---|---|
+| `rerun_eval_n_times.sh` | Driver: re-runs `evaluate.py` N times on one EVAL_DIR and aggregates |
+| `aggregate_metrics_runs.py` | Helper called by the driver: computes mean/std/stderr/min/max from per-run JSONs |
+| `../src/commit_utils/rerun_eval.sub` | HTCondor submission file |
+
+### Usage
+
+#### Locally on a GPU node
+
+From the repo root:
+
+```bash
+scripts/rerun_eval_n_times.sh /path/to/EVAL_DIR 5
+```
+
+`EVAL_DIR` must be an existing job directory containing `final_model/`. The
+task name is parsed from the basename (`<task>_<model_safe>_<cluster_id>`) to
+pick the correct max-tokens fallback ladder.
+
+#### HTCondor 
+
+```bash
+condor_submit_bid 50 \
+  -a "eval_dir=/path/to/EVAL_DIR" \
+  -a "n=5" \
+  src/commit_utils/rerun_eval.sub
+```
diff --git a/scripts/aggregate.py b/scripts/aggregate.py
new file mode 100644
index 0000000..b1b2ee9
--- /dev/null
+++ b/scripts/aggregate.py
@@ -0,0 +1,363 @@
+#!/usr/bin/env python3
+"""
+Aggregate results across multiple runs per agent.
+
+# For comparing to previous version: 
+Replaces: aggregate_avg_stddev.py, aggregate_avg_stddev_over_benchmarks.py,
+          compute_single_metrics.py, compute_single_metrics_avg_stddev.py,
+          aggregate_time_avg_stddev.py, aggregate_summary.py,
+          aggregate_together.py, compute_baseline_metrics.py,
+          compute_baseline_metrics_by_benchmark.py
+
+Reads final_{method}.csv files produced by collect.py and computes:
+  --per-cell     : aggregated_avg_{agent}.csv, aggregated_std_{agent}.csv
+  --leaderboard  : single_metrics.csv, single_metrics_aggregated.csv
+  --time         : time_aggregated.csv
+  --all          : everything (default)
+
+Usage:
+    python aggregate.py
+    python aggregate.py --data-dir /path/to/results --output-dir /path/to/output
+    python aggregate.py --per-cell --leaderboard
+"""
+import argparse
+import csv
+import os
+import re
+
+from utils import (
+    get_results_dir,
+    load_csv_as_dict,
+    write_csv,
+    load_factors,
+    mean,
+    stddev,
+    is_number,
+    format_time_hms,
+    HARDCODED_AGENT_MAP,
+    HARDCODED_BENCHMARKS,
+    EXPECTED_MODELS,
+)
+
+
+# ---------------------------------------------------------------------------
+# Per-cell avg/std across runs (replaces aggregate_avg_stddev.py)
+# ---------------------------------------------------------------------------
+
+def aggregate_per_cell(
+    agent_name: str,
+    method_names: list[str],
+    data_dir: str,
+    output_dir: str,
+) -> tuple[dict, dict]:
+    """
+    For each (model, benchmark) cell, compute mean and sample stddev across the runs
+    in method_names; write aggregated_{avg,std}_{agent}.csv and return both grids.
+    """
+    all_data = []
+    all_models = None  # sorted model list of the first run; every later run must match it
+
+    for method_name in method_names:
+        csv_path = os.path.join(data_dir, f"final_{method_name}.csv")
+        data, _ = load_csv_as_dict(csv_path)
+
+        models = sorted(data.keys())
+        if all_models is None:
+            all_models = models
+        else:
+            assert all_models == models, (
+                f"Model mismatch for {method_name}: "
+                f"expected {all_models}, got {models}"
+            )
+        all_data.append(data)
+
+    avg_data = {}
+    std_data = {}
+
+    for model in all_models:
+        avg_data[model] = {}
+        std_data[model] = {}
+
+        for bench in HARDCODED_BENCHMARKS:
+            values = []
+            for data in all_data:
+                values.append(float(data[model][bench]))  # raises on a missing or non-numeric cell
+
+            avg_data[model][bench] = str(mean(values))
+            std_data[model][bench] = str(stddev(values))
+
+    avg_path = os.path.join(output_dir, f"aggregated_avg_{agent_name}.csv")
+    write_csv(avg_path, all_models, HARDCODED_BENCHMARKS, avg_data)
+    print(f"Written: {avg_path}")
+
+    std_path = os.path.join(output_dir, f"aggregated_std_{agent_name}.csv")
+    write_csv(std_path, all_models, HARDCODED_BENCHMARKS, std_data)
+    print(f"Written: {std_path}")
+
+    return avg_data, std_data
+
+
+# ---------------------------------------------------------------------------
+# Weighted single metric (replaces compute_single_metrics*.py)
+# ---------------------------------------------------------------------------
+
+def compute_weighted_metric(
+    data: dict[str, dict[str, str]],
+    factors: dict[str, float],
+) -> float:
+    """
+    Weighted sum over the benchmarks in factors: per benchmark, sum the non-empty
+    cells, divide by the total number of models, multiply by its factor, and add up.
+    """
+    valid_benchmarks = set(factors.keys())
+    total = 0.0
+    num_models = len(data)
+    for bench in sorted(valid_benchmarks):
+        values = []
+        for model in data:
+            val_str = data[model].get(bench, "")
+            if val_str == "":
+                continue
+            values.append(float(val_str))
+        if values:
+            avg_value = sum(values) / num_models  # NOTE(review): divisor counts models with blank cells too - confirm intended
+            total += avg_value * factors[bench]
+    return total
+
+
+def aggregate_leaderboard(data_dir: str, output_dir: str):
+    """
+    Compute weighted metric for every final_*.csv that has all expected models.
+    Then group by HARDCODED_AGENT_MAP for avg/std.
+
+    Also writes final_avg_{agent}.csv and final_std_{agent}.csv (identical to
+    aggregated_ versions) so their metrics appear in single_metrics.csv.
+
+    Per-run final_{method}.csv inputs live in data_dir while the final_avg_/
+    final_std_ files are written to output_dir, so both directories are scanned;
+    otherwise a separate --output-dir would leave every agent "missing metrics".
+
+    Writes single_metrics.csv (method, metric) and single_metrics_aggregated.csv
+    (agent, avg, std, n) to output_dir. A final_*.csv that fails to load aborts
+    the run after a warning is printed.
+    """
+    factors = load_factors()
+
+    # Phase 1: compute per-cell avg/std and write final_avg/std files
+    # so they get picked up in the metric scan below
+    for agent_name, method_names in HARDCODED_AGENT_MAP.items():
+        avg_data, std_data = _load_avg_std_for_agent(
+            agent_name, method_names, data_dir
+        )
+        if avg_data is None:
+            continue  # at least one run CSV is missing for this agent
+        for kind, cells in (("avg", avg_data), ("std", std_data)):
+            path = os.path.join(output_dir, f"final_{kind}_{agent_name}.csv")
+            write_csv(path, sorted(cells.keys()), HARDCODED_BENCHMARKS, cells)
+
+    # Phase 2: compute metrics for ALL final_*.csv files. data_dir is scanned
+    # first and output_dir last, so files written in phase 1 win on a name clash.
+    scan_dirs = [data_dir]
+    if os.path.abspath(output_dir) != os.path.abspath(data_dir):
+        scan_dirs.append(output_dir)
+
+    all_metrics = {}
+    for scan_dir in filter(os.path.isdir, scan_dirs):
+        for filename in os.listdir(scan_dir):
+            if not (filename.startswith("final_") and filename.endswith(".csv")):
+                continue
+            if filename.startswith("final_time_"):
+                continue
+
+            csv_path = os.path.join(scan_dir, filename)
+            try:
+                data, _ = load_csv_as_dict(csv_path)
+            except Exception:
+                print(f"Warning: could not load {csv_path}.")
+                raise
+
+            # Files that do not cover exactly the expected model set are ignored.
+            if set(data.keys()) != EXPECTED_MODELS:
+                continue
+
+            method_name = filename[len("final_"):-len(".csv")]
+            all_metrics[method_name] = compute_weighted_metric(data, factors)
+
+    # Write individual metrics
+    metrics_path = os.path.join(output_dir, "single_metrics.csv")
+    with open(metrics_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["method", "metric"])
+        for method_name in sorted(all_metrics.keys()):
+            writer.writerow([method_name, all_metrics[method_name]])
+    print(f"Written: {metrics_path}")
+
+    # Compute aggregated metrics per agent group
+    aggregated_path = os.path.join(output_dir, "single_metrics_aggregated.csv")
+    with open(aggregated_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["agent", "avg", "std", "n"])
+        for agent_name in sorted(HARDCODED_AGENT_MAP.keys()):
+            method_names = HARDCODED_AGENT_MAP[agent_name]
+            # Skip agents with missing runs
+            if not all(m in all_metrics for m in method_names):
+                print(f"Skipping agent {agent_name} in leaderboard: missing metrics")
+                continue
+            metrics = [all_metrics[m] for m in method_names]
+            writer.writerow([
+                agent_name,
+                mean(metrics),
+                stddev(metrics),
+                len(metrics),
+            ])
+    print(f"Written: {aggregated_path}")
+
+
+def _load_avg_std_for_agent(
+    agent_name: str,  # not used in the body
+    method_names: list[str],
+    data_dir: str,
+) -> tuple[dict | None, dict | None]:
+    """Per-cell mean/stddev over an agent's runs; (None, None) if any final_{run}.csv is missing."""
+    all_data = []
+    all_models = None
+
+    for method_name in method_names:
+        csv_path = os.path.join(data_dir, f"final_{method_name}.csv")
+        if not os.path.exists(csv_path):
+            return None, None
+        data, _ = load_csv_as_dict(csv_path)
+        models = sorted(data.keys())
+        if all_models is None:  # model list comes from the first run only; later runs are not cross-checked
+            all_models = models
+        all_data.append(data)
+
+    avg_data = {}
+    std_data = {}
+    for model in all_models:
+        avg_data[model] = {}
+        std_data[model] = {}
+        for bench in HARDCODED_BENCHMARKS:
+            values = [float(d[model][bench]) for d in all_data]  # one value per run
+            avg_data[model][bench] = str(mean(values))
+            std_data[model][bench] = str(stddev(values))
+
+    return avg_data, std_data
+
+
+# ---------------------------------------------------------------------------
+# Time aggregation (replaces aggregate_time_avg_stddev.py)
+# ---------------------------------------------------------------------------
+
+def parse_time_to_hours(time_str: str) -> float:
+    """Convert an 'H:MM:SS' string (e.g. '8:17:28') to fractional hours."""
+    # Only the first three colon-separated fields are read.
+    fields = time_str.split(":")
+    whole, mins, secs = int(fields[0]), int(fields[1]), int(fields[2])
+    hours_and_minutes = whole + mins / 60
+    return hours_and_minutes + secs / 3600
+
+
+def aggregate_time(data_dir: str, output_dir: str) -> None:
+    """
+    Read time_overview.csv (or legacy aggregated_time_overview.csv) from data_dir, group by
+    HARDCODED_AGENT_MAP, and write avg/std/n per agent to time_aggregated.csv in output_dir.
+    """
+    # Try new name first, fall back to old name
+    time_csv_path = os.path.join(data_dir, "time_overview.csv")
+    if not os.path.exists(time_csv_path):
+        time_csv_path = os.path.join(data_dir, "aggregated_time_overview.csv")
+
+    time_data = {}
+    with open(time_csv_path, "r", newline="") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            method = row["method"]
+            avg_time = row["average_time"]
+            if avg_time and avg_time != "N/A":  # blank and "N/A" rows are skipped
+                time_data[method] = parse_time_to_hours(avg_time)
+
+    output_path = os.path.join(output_dir, "time_aggregated.csv")
+    with open(output_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["agent", "avg_time", "std_time", "n"])
+        for agent_name, method_names in HARDCODED_AGENT_MAP.items():
+            if not all(m in time_data for m in method_names):
+                print(f"Skipping agent {agent_name} in time: missing data")
+                continue
+            hours_list = [time_data[m] for m in method_names]
+            writer.writerow([
+                agent_name,
+                format_time_hms(int(mean(hours_list) * 3600)),  # hours -> whole seconds (truncated)
+                format_time_hms(int(stddev(hours_list) * 3600)),
+                len(hours_list),
+            ])
+    print(f"Written: {output_path}")
+
+
+def _all_finals_exist(method_names: list[str], data_dir: str) -> bool:
+    """Return True only when every method has a final_{method}.csv in data_dir."""
+    for method in method_names:
+        if not os.path.exists(os.path.join(data_dir, f"final_{method}.csv")):
+            return False
+    return True
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Aggregate results across multiple runs per agent."
+    )
+    parser.add_argument(
+        "--data-dir",
+        default=None,
+        help="Directory containing final_*.csv files (from collect.py). "
+        "Defaults to POST_TRAIN_BENCH_RESULTS_DIR or 'results'.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        default=None,
+        help="Directory to write output CSVs. Defaults to same as --data-dir.",
+    )
+    parser.add_argument("--per-cell", action="store_true",
+                        help="Write per-cell avg/std CSVs per agent.")
+    parser.add_argument("--leaderboard", action="store_true",
+                        help="Write single_metrics.csv and single_metrics_aggregated.csv.")
+    parser.add_argument("--time", action="store_true",
+                        help="Write time_aggregated.csv.")
+    parser.add_argument("--all", action="store_true",
+                        help="Write everything (default if no flags given).")
+    return parser.parse_args()  # parses sys.argv[1:]
+
+
+def main() -> None:
+    args = parse_args()
+
+    data_dir = args.data_dir or get_results_dir()
+    output_dir = args.output_dir or data_dir
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    do_all = args.all or not (args.per_cell or args.leaderboard or args.time)  # no stage flag behaves like --all
+
+    if do_all or args.per_cell:
+        for agent_name, method_names in HARDCODED_AGENT_MAP.items():
+            # Skip agents whose run data isn't available
+            if not _all_finals_exist(method_names, data_dir):
+                print(f"Skipping agent {agent_name}: missing final CSVs")
+                continue
+            print(f"Processing agent: {agent_name}")
+            aggregate_per_cell(agent_name, method_names, data_dir, output_dir)
+
+    if do_all or args.leaderboard:
+        aggregate_leaderboard(data_dir, output_dir)
+
+    if do_all or args.time:
+        aggregate_time(data_dir, output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/aggregate_metrics_runs.py b/scripts/aggregate_metrics_runs.py
new file mode 100755
index 0000000..26ef798
--- /dev/null
+++ b/scripts/aggregate_metrics_runs.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""Aggregate per-run metrics JSON files into a single metrics_averaged.json.
+
+Reads every file matching --runs-glob, treats top-level numeric keys as
+per-run metric values, and writes mean/std/stderr/min/max per key plus the
+raw per-run records and source file list.
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import math
+import sys
+
+
+def _numeric(x: object) -> bool:
+    """Return True for int/float values, excluding bool."""
+    if isinstance(x, bool):
+        # bool is a subclass of int, so it has to be rejected explicitly.
+        return False
+    return isinstance(x, (int, float))
+
+
+def main() -> None:
+    """Aggregate the per-run metrics files matched by --runs-glob.
+
+    Every matched file must hold a JSON object. For each top-level key that
+    has at least one numeric (int/float, non-bool) value across the runs,
+    the mean, sample standard deviation (0.0 for a single value), standard
+    error, min, max and count are computed. The result, together with the
+    raw per-run records and the sorted list of source files, is written to
+    --output as JSON.
+
+    Exits with an error message when no file matches the glob, or when a
+    matched file is not valid JSON or is not a JSON object.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--runs-glob", required=True,
+                        help="Glob matching per-run metrics JSON files.")
+    parser.add_argument("--output", required=True,
+                        help="Path to write the aggregated metrics JSON.")
+    args = parser.parse_args()
+
+    paths = sorted(glob.glob(args.runs_glob))
+    if not paths:
+        sys.exit(f"no run files matched {args.runs_glob}")
+
+    runs: list[dict] = []
+    for path in paths:
+        try:
+            with open(path, "r") as f:
+                run = json.load(f)
+        except ValueError as err:
+            # json.JSONDecodeError is a ValueError; name the offending file
+            # instead of surfacing a bare traceback.
+            sys.exit(f"could not parse {path} as JSON: {err}")
+        if not isinstance(run, dict):
+            # The key-wise aggregation below needs a mapping per run.
+            sys.exit(
+                f"{path}: expected a JSON object at top level, "
+                f"got {type(run).__name__}"
+            )
+        runs.append(run)
+
+    keys = sorted({k for r in runs for k in r.keys()})
+
+    aggregated: dict[str, dict[str, float | int]] = {}
+    for k in keys:
+        # Runs that lack the key or hold a non-numeric value are left out
+        # of that key's statistics ("n" records how many contributed).
+        vals = [r[k] for r in runs if k in r and _numeric(r[k])]
+        if not vals:
+            continue
+        mean = sum(vals) / len(vals)
+        if len(vals) > 1:
+            # Sample variance (n - 1 in the denominator).
+            variance = sum((x - mean) ** 2 for x in vals) / (len(vals) - 1)
+            std = math.sqrt(variance)
+        else:
+            std = 0.0
+        aggregated[k] = {
+            "mean": mean,
+            "std": std,
+            "stderr": std / math.sqrt(len(vals)),
+            "min": min(vals),
+            "max": max(vals),
+            "n": len(vals),
+        }
+
+    out = {
+        "n_runs": len(runs),
+        "metrics": aggregated,
+        "per_run": runs,
+        "run_files": paths,
+    }
+
+    with open(args.output, "w") as f:
+        json.dump(out, f, indent=2)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/baselines.json b/scripts/baselines.json
new file mode 100644
index 0000000..d80799c
--- /dev/null
+++ b/scripts/baselines.json
@@ -0,0 +1,114 @@
+{
+    "zeroshot": {
+        "Qwen3-1.7B": {
+            "aime2025": 0.26666666666666666,
+            "arenahardwriting": 0.5,
+            "bfcl": 0.94,
+            "gpqamain": 0.3549107142857143,
+            "gsm8k": 0.8847611827141774,
+            "healthbench": 0.44918867035528026,
+            "humaneval": 0.6890243902439024
+        },
+        "Qwen3-1.7B-Base": {
+            "aime2025": 0.0,
+            "arenahardwriting": 0.009142053445850914,
+            "bfcl": 0.0,
+            "gpqamain": 0.140625,
+            "gsm8k": 0.12661106899166036,
+            "healthbench": 0.07537565969807473,
+            "humaneval": 0.07926829268292683
+        },
+        "Qwen3-4B": {
+            "aime2025": 0.5333333333333333,
+            "arenahardwriting": 0.8683943089430894,
+            "bfcl": 0.95,
+            "gpqamain": 0.44642857142857145,
+            "gsm8k": 0.9378316906747536,
+            "healthbench": 0.5272399437524256,
+            "humaneval": 0.774390243902439
+        },
+        "Qwen3-4B-Base": {
+            "aime2025": 0.03333333333333333,
+            "arenahardwriting": 0.03417533432392273,
+            "bfcl": 0.0,
+            "gpqamain": 0.13392857142857142,
+            "gsm8k": 0.4184988627748294,
+            "healthbench": 0.13383521639663787,
+            "humaneval": 0.36585365853658536
+        },
+        "SmolLM3-3B": {
+            "aime2025": 0.26666666666666666,
+            "arenahardwriting": 0.492,
+            "bfcl": 0.84,
+            "gpqamain": 0.3325892857142857,
+            "gsm8k": 0.8218347232752085,
+            "healthbench": 0.2957717718639611,
+            "humaneval": 0.7012195121951219
+        },
+        "SmolLM3-3B-Base": {
+            "aime2025": 0.03333333333333333,
+            "arenahardwriting": 0.004225352112676056,
+            "bfcl": 0.0,
+            "gpqamain": 0.049107142857142856,
+            "gsm8k": 0.21076573161485973,
+            "healthbench": 0.0,
+            "humaneval": 0.06097560975609756
+        },
+        "gemma-3-4b-it": {
+            "aime2025": 0.1,
+            "arenahardwriting": 0.948,
+            "bfcl": 0.67,
+            "gpqamain": 0.31473214285714285,
+            "gsm8k": 0.8354814253222138,
+            "healthbench": 0.46063396051286026,
+            "humaneval": 0.6951219512195121
+        },
+        "gemma-3-4b-pt": {
+            "aime2025": 0.0,
+            "arenahardwriting": 0.0028530670470756064,
+            "bfcl": 0.06,
+            "gpqamain": 0.015625,
+            "gsm8k": 0.06141015921152388,
+            "healthbench": 0.17039403723633986,
+            "humaneval": 0.006097560975609756
+        }
+    },
+    "fewshot": {
+        "Qwen3-1.7B-Base": {
+            "aime2025": 0.05333333333333333,
+            "arenahardwriting": 0.05314625850340136,
+            "bfcl": 0.0,
+            "gpqamain": 0.25959821428571417,
+            "gsm8k": 0.46679302501895537,
+            "healthbench": 0.2110110691560308,
+            "humaneval": 0.25243902439024396
+        },
+        "Qwen3-4B-Base": {
+            "aime2025": 0.09000000000000001,
+            "arenahardwriting": 0.19168260038240917,
+            "bfcl": 0.0,
+            "gpqamain": 0.29888392857142837,
+            "gsm8k": 0.7438210765731573,
+            "healthbench": 0.2179351466647625,
+            "humaneval": 0.6774390243902438
+        },
+        "SmolLM3-3B-Base": {
+            "aime2025": 0.06000000000000001,
+            "arenahardwriting": 0.03248811410459588,
+            "bfcl": 0.0,
+            "gpqamain": 0.13236607142857182,
+            "gsm8k": 0.5298711144806676,
+            "healthbench": 0.10165123092180756,
+            "humaneval": 0.3237804878048783
+        },
+        "gemma-3-4b-pt": {
+            "aime2025": 0.0,
+            "arenahardwriting": 0.01257396449704142,
+            "bfcl": 0.06699999999999998,
+            "gpqamain": 0.21406249999999985,
+            "gsm8k": 0.0583775587566339,
+            "healthbench": 0.23317845064882012,
+            "humaneval": 0.004878048780487805
+        }
+    }
+}
diff --git a/scripts/collect.py b/scripts/collect.py
new file mode 100644
index 0000000..2bc33f1
--- /dev/null
+++ b/scripts/collect.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""
+Collect results from raw run directories into per-method CSVs.
+
+# For comparing to previous version: 
+Replaces: aggregate_methods.py, aggregate_contamination.py,
+          aggregate_final.py, aggregate_time.py
+
+For each method directory in the results dir, does a single pass:
+  1. Finds the latest run per (benchmark, model)
+  2. Reads metrics.json, contamination files, and time_taken.txt
+  3. Applies baseline fallback for contaminated or errored cells
+  4. Writes final_{method}.csv, contamination_{method}.csv
+
+Also writes a time_overview.csv summarising average time per method.
+
+Usage:
+    python collect.py
+    python collect.py --data-dir /path/to/results --output-dir /path/to/output
+    python collect.py --min-run-id 100 --max-run-id 200
+"""
+import argparse
+import csv
+import os
+
+from utils import (
+    get_results_dir,
+    get_baseline_fallback_data,
+    walk_latest_runs,
+    load_metrics,
+    load_contamination,
+    load_disallowed_model,
+    combine_contamination_results,
+    load_time_taken,
+    is_number,
+    format_time_hms,
+    BUDGET_SECONDS,
+)
+
+# Directories to skip (baselines are hardcoded in baselines.json)
+SKIP_METHODS = {"baseline", "baseline_zeroshot"}
+
+
+def collect_method(
+    method_path: str,
+    method_name: str,
+    baseline_data: dict[str, dict[str, str]],
+    output_dir: str,
+    min_run_id: int | None = None,
+    max_run_id: int | None = None,
+) -> dict | None:
+    """
+    Gather scores, contamination flags and timings for a single method.
+
+    Only the latest run per (benchmark, model) within the optional run-id
+    bounds is considered. Two files are written into ``output_dir``:
+      - contamination_{method_name}.csv  (flags exactly as collected)
+      - final_{method_name}.csv          (scores after baseline fallback)
+
+    A score cell is replaced by ``baseline_data[model][bench]`` (or "" when
+    no baseline exists) if it is not a number or if its contamination flag
+    is non-blank.
+
+    Returns {"total_seconds": int, "valid_count": int} summed over the runs
+    whose time could be read, or None when the method has no runs.
+    """
+    runs_by_cell = walk_latest_runs(method_path, min_run_id, max_run_id)
+    if not runs_by_cell:
+        return None
+
+    benchmarks = sorted({cell[0] for cell in runs_by_cell})
+    models = sorted({cell[1] for cell in runs_by_cell})
+
+    scores = {}  # {model: {bench: str}}
+    flags = {}  # {model: {bench: str}}
+    elapsed_total = 0
+    elapsed_count = 0
+
+    # One pass over the grid: metrics, contamination and time per cell.
+    for model in models:
+        score_row = scores[model] = {}
+        flag_row = flags[model] = {}
+
+        for bench in benchmarks:
+            run = runs_by_cell.get((bench, model))
+            if run is None:
+                # No run for this combination: leave both cells blank.
+                score_row[bench] = ""
+                flag_row[bench] = ""
+                continue
+
+            run_dir = run["path"]
+
+            score_row[bench] = load_metrics(
+                os.path.join(run_dir, "metrics.json"), method_name
+            )
+
+            contaminated = load_contamination(
+                os.path.join(run_dir, "contamination_judgement.txt")
+            )
+            used_disallowed = load_disallowed_model(
+                os.path.join(run_dir, "disallowed_model_judgement.txt")
+            )
+            flag_row[bench] = combine_contamination_results(
+                contaminated, used_disallowed
+            )
+
+            _, run_seconds = load_time_taken(run_dir)
+            if run_seconds is not None:
+                elapsed_total += run_seconds
+                elapsed_count += 1
+
+    header = ["model"] + benchmarks
+
+    # Contamination flags go out before any fallback is applied.
+    contamination_path = os.path.join(
+        output_dir, f"contamination_{method_name}.csv"
+    )
+    with open(contamination_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        for model in models:
+            writer.writerow(
+                [model] + [flags[model][bench] for bench in benchmarks]
+            )
+    print(f"Written: {contamination_path}")
+
+    # Baseline fallback: keep a cell only when it is numeric AND unflagged.
+    for model in models:
+        for bench in benchmarks:
+            numeric = is_number(scores[model][bench])
+            flagged = bool(flags[model][bench].strip())
+            if numeric and not flagged:
+                continue
+            scores[model][bench] = baseline_data.get(model, {}).get(
+                bench, ""
+            )
+
+    # Final scores, with the fallback applied.
+    final_path = os.path.join(output_dir, f"final_{method_name}.csv")
+    with open(final_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        for model in models:
+            writer.writerow(
+                [model] + [scores[model].get(bench, "") for bench in benchmarks]
+            )
+    print(f"Written: {final_path}")
+
+    return {
+        "total_seconds": elapsed_total,
+        "valid_count": elapsed_count,
+    }
+
+
+def write_time_overview(method_stats: dict[str, dict], output_dir: str):
+    """Write time_overview.csv: one row per method with its average time.
+
+    Each entry of ``method_stats`` provides "total_seconds" and
+    "valid_count". The average is the integer quotient of the two, shown
+    as H:MM:SS and as a percentage of BUDGET_SECONDS; methods without any
+    valid timing get "N/A" in both columns. Rows are sorted by method name.
+    """
+    overview_path = os.path.join(output_dir, "time_overview.csv")
+
+    with open(overview_path, "w", newline="") as f:
+        out = csv.writer(f)
+        out.writerow(["method", "average_time", "percentage"])
+
+        for name in sorted(method_stats.keys()):
+            entry = method_stats[name]
+            summed = entry["total_seconds"]
+            timed_runs = entry["valid_count"]
+
+            if timed_runs <= 0:
+                out.writerow([name, "N/A", "N/A"])
+                continue
+
+            # Floor division keeps the average in whole seconds.
+            mean_secs = summed // timed_runs
+            share = (mean_secs / BUDGET_SECONDS) * 100
+            out.writerow([name, format_time_hms(mean_secs), f"{share:.1f}%"])
+
+    print(f"Written: {overview_path}")
+
+
+def parse_args():
+    """Parse the command line of the collection script.
+
+    Options: --data-dir, --output-dir (both default to None) and the
+    integer run-id bounds --min-run-id / --max-run-id (default None).
+    """
+    parser = argparse.ArgumentParser(
+        description="Collect raw results into per-method CSVs."
+    )
+
+    directory_options = (
+        (
+            "--data-dir",
+            "Directory containing method subdirectories with raw run data. "
+            "Defaults to POST_TRAIN_BENCH_RESULTS_DIR or 'results'.",
+        ),
+        (
+            "--output-dir",
+            "Directory to write output CSVs. Defaults to same as --data-dir.",
+        ),
+    )
+    for flag, text in directory_options:
+        parser.add_argument(flag, default=None, help=text)
+
+    run_id_options = (
+        ("--min-run-id", "Inclusive lower bound for run IDs to consider."),
+        ("--max-run-id", "Exclusive upper bound for run IDs to consider."),
+    )
+    for flag, text in run_id_options:
+        parser.add_argument(flag, type=int, default=None, help=text)
+
+    return parser.parse_args()
+
+
+def main():
+    """Collect every method directory under the data dir into CSVs.
+
+    Each sub-directory that is not listed in SKIP_METHODS is handed to
+    collect_method(); the timing stats it returns are gathered and, if any
+    method produced stats, summarised via write_time_overview().
+    """
+    opts = parse_args()
+
+    source_dir = opts.data_dir or get_results_dir()
+    target_dir = opts.output_dir or source_dir
+    os.makedirs(target_dir, exist_ok=True)
+
+    # Zero-shot baselines (from baselines.json) used as fallback values.
+    fallback = get_baseline_fallback_data()
+
+    timings = {}
+    for name in sorted(os.listdir(source_dir)):
+        candidate = os.path.join(source_dir, name)
+        # Plain files are ignored, and baseline directories are skipped
+        # because their values are hardcoded.
+        if not os.path.isdir(candidate) or name in SKIP_METHODS:
+            continue
+
+        stats = collect_method(
+            candidate,
+            name,
+            fallback,
+            target_dir,
+            min_run_id=opts.min_run_id,
+            max_run_id=opts.max_run_id,
+        )
+        if stats:
+            timings[name] = stats
+
+    if timings:
+        write_time_overview(timings, target_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/constants.py b/scripts/constants.py
index c1dbd61..c45bcef 100644
--- a/scripts/constants.py
+++ b/scripts/constants.py
@@ -62,7 +62,13 @@
     "claude_non_api_claude-opus-4-6_1m__10h_run1",
     "claude_non_api_claude-opus-4-6_1m__10h_run2",
     "claude_non_api_claude-opus-4-6_1m__10h_run3"
-    ]
+    ],
+
+    "Opus-4.7":[
+    "claude_non_api_claude-opus-4-7_10h",
+    "claude_non_api_claude-opus-4-7_10h_run2",
+    "claude_non_api_claude-opus-4-7_10h_run3"
+    ] 
 
 }
 
diff --git a/scripts/rerun_eval_n_times.sh b/scripts/rerun_eval_n_times.sh
new file mode 100755
index 0000000..c3b8926
--- /dev/null
+++ b/scripts/rerun_eval_n_times.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# Re-run the per-task evaluate.py N times on an already-finished EVAL_DIR
+# and aggregate per-run metrics into <EVAL_DIR>/metrics_averaged.json.
+#
+# Usage:
+#   scripts/rerun_eval_n_times.sh <EVAL_DIR> [N]
+#
+# Defaults: N=5.
+#
+# Mirrors run_task.sh's evaluation step: runs src/eval/tasks/<task>/evaluate.py
+# (NOT the snapshot in $EVAL_DIR/task) under the same vllm_debug container with
+# the same fuse-overlayfs HF cache and the same max-tokens fallback ladder.
+#
+# Run from the repo root, on a node with GPUs (submit via
+# src/commit_utils/rerun_eval.sub for cluster execution).
+set -euo pipefail
+
+if [ "$#" -lt 1 ]; then
+    echo "usage: $0 <EVAL_DIR> [N]" >&2
+    exit 1
+fi
+
+EVAL_DIR="$(realpath "$1")"
+N="${2:-5}"
+
+# Reject a non-numeric or zero N up front, before any overlay is mounted or
+# any GPU work starts; otherwise the problem only surfaces later in `seq`
+# or as "no run files matched" from the aggregation step.
+case "$N" in
+    ''|*[!0-9]*)
+        echo "ERROR: N must be a positive integer, got '${N}'" >&2
+        exit 1
+        ;;
+esac
+if [ "$N" -lt 1 ]; then
+    echo "ERROR: N must be a positive integer, got '${N}'" >&2
+    exit 1
+fi
+
+# The model to evaluate is expected at <EVAL_DIR>/final_model.
+if [ ! -d "$EVAL_DIR/final_model" ]; then
+    echo "ERROR: $EVAL_DIR/final_model not found" >&2
+    exit 1
+fi
+
+# Relative path: this only resolves when the script is started from the
+# repo root (see the header comment).
+source src/commit_utils/set_env_vars.sh
+
+# Derive the task name from the EVAL_DIR basename: <task>_<model_safe>_<cluster_id>.
+# `%%_*` strips everything from the first underscore on, leaving <task>.
+EVAL_BASENAME="$(basename "$EVAL_DIR")"
+EVALUATION_TASK="${EVAL_BASENAME%%_*}"
+
+if [ ! -f "src/eval/tasks/${EVALUATION_TASK}/evaluate.py" ]; then
+    echo "ERROR: src/eval/tasks/${EVALUATION_TASK}/evaluate.py not found" >&2
+    echo "       (parsed task '${EVALUATION_TASK}' from $(basename "$EVAL_DIR"))" >&2
+    exit 1
+fi
+
+# REPO_ROOT is the current directory; per-run outputs and logs go to
+# <EVAL_DIR>/reruns.
+REPO_ROOT="$(pwd)"
+RERUNS_DIR="$EVAL_DIR/reruns"
+mkdir -p "$RERUNS_DIR"
+
+# Per-task max-tokens fallback ladder, mirroring run_task.sh.
+# FB1/FB2 are the extra evaluate.py arguments for the first and second
+# fallback attempt; for an unlisted task both are empty, so the fallback
+# attempts repeat the default invocation.
+case "$EVALUATION_TASK" in
+    aime2025)         FB1="--max-tokens 12000";    FB2="--max-tokens 8000" ;;
+    arenahardwriting) FB1="--max-new-tokens 12288"; FB2="--max-new-tokens 8192" ;;
+    bfcl)             FB1="--max-tokens 12000";    FB2="--max-tokens 8000" ;;
+    gpqamain)         FB1="--max-tokens 12000";    FB2="--max-tokens 8000" ;;
+    gsm8k)            FB1="--max-tokens 3000";     FB2="--max-tokens 2000" ;;
+    healthbench)      FB1="--max-new-tokens 12288"; FB2="--max-new-tokens 8192" ;;
+    humaneval)        FB1="--max-tokens 3000";     FB2="--max-tokens 2000" ;;
+    *)                FB1="";                       FB2="" ;;
+esac
+
+# Fuse-overlayfs HF cache so reruns don't pollute the shared HF cache,
+# matching run_task.sh's with_huggingface_overlay helper.
+# `$$` (this shell's PID) keeps the paths unique per invocation.
+# TMP_HF_CACHE is the path the merged overlay is bound to inside the
+# container (see run_one).
+TMP_SUBDIR="/tmp/rerun_eval_$$"
+HF_MERGED="${TMP_SUBDIR}/merged_huggingface"
+TMP_HF_CACHE="/tmp/hf_cache_rerun_$$"
+
+# Mount a fuse-overlayfs at HF_MERGED with the shared cache ($HF_HOME) as the
+# lower layer and a scratch directory under TMP_SUBDIR as the upper layer,
+# so writes land in the scratch directory rather than in $HF_HOME.
+setup_overlay() {
+    mkdir -p "${TMP_SUBDIR}/upper_huggingface"
+    mkdir -p "${TMP_SUBDIR}/fuse_workdir"
+    mkdir -p "${HF_MERGED}"
+    fuse-overlayfs -o \
+        "lowerdir=${HF_HOME},upperdir=${TMP_SUBDIR}/upper_huggingface,workdir=${TMP_SUBDIR}/fuse_workdir" \
+        "${HF_MERGED}"
+}
+
+# Unmount the overlay and delete its scratch directory. Both steps are
+# best-effort: errors are silenced and never change the exit status.
+teardown_overlay() {
+    fusermount -u "${HF_MERGED}" 2>/dev/null || true
+    rm -rf "${TMP_SUBDIR}" 2>/dev/null || true
+}
+# Registered before the mount so cleanup also runs if setup_overlay fails.
+trap teardown_overlay EXIT
+
+setup_overlay
+
+# Run evaluate.py once inside the container.
+#   $1  path of the JSON file evaluate.py should write
+#   $2  extra evaluate.py arguments as one string (may be empty)
+#   $3  log file receiving stdout and stderr
+run_one() {
+    local out_json="$1"
+    local extra="$2"
+    local log="$3"
+
+    # Kill -9 every compute process nvidia-smi reports, then give the GPU a
+    # few seconds before starting. NOTE(review): this is not limited to
+    # processes started by this script.
+    nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null \
+        | xargs -r kill -9 2>/dev/null || true
+    sleep 5
+
+    # 28800s = 8h limit; SIGTERM first, SIGKILL 60s later.
+    # The merged overlay is bound to TMP_HF_CACHE, which is what HF_HOME
+    # points to inside the container.
+    # ${extra} is deliberately unquoted so that e.g. "--max-tokens 12000"
+    # splits into two arguments (and an empty value vanishes).
+    timeout --signal=TERM --kill-after=60s 28800s \
+    apptainer exec \
+        --nv \
+        --env "HF_HOME=${TMP_HF_CACHE}" \
+        --env OPENAI_API_KEY="${OPENAI_API_KEY:-}" \
+        --env VLLM_API_KEY="inspectai" \
+        --env PYTHONNOUSERSITE="1" \
+        --writable-tmpfs \
+        --bind "${REPO_ROOT}:${REPO_ROOT}" \
+        --bind "${HF_MERGED}:${TMP_HF_CACHE}" \
+        --pwd "${REPO_ROOT}/src/eval/tasks/${EVALUATION_TASK}" \
+        "${POST_TRAIN_BENCH_CONTAINERS_DIR}/${POST_TRAIN_BENCH_CONTAINER_NAME}.sif" \
+        python evaluate.py \
+            --model-path "${EVAL_DIR}/final_model" \
+            --templates-dir ../../../../src/eval/templates \
+            --limit -1 \
+            ${extra} \
+            --json-output-file "${out_json}" >"${log}" 2>&1
+}
+
+# Attempt one rerun, walking the fallback ladder (default, FB1, FB2) until
+# an output file exists.
+#   $1  path of the JSON output file
+#   $2  log path prefix; each attempt logs to <prefix>_<level>.log
+# Returns 0 as soon as the output file exists, 1 if no level produced it.
+run_with_fallback() {
+    local out_json="$1"
+    local log_prefix="$2"
+    local extra
+
+    # Remove any earlier result so the existence check reflects this call.
+    rm -f "$out_json"
+
+    for level in default fb1 fb2; do
+        if [ "$level" = "fb1" ]; then
+            extra="$FB1"
+        elif [ "$level" = "fb2" ]; then
+            extra="$FB2"
+        else
+            extra=""
+        fi
+        echo "  [$level] extra='${extra}'"
+        # The attempt's exit status is ignored on purpose; success is
+        # judged only by whether the output file was written.
+        run_one "$out_json" "$extra" "${log_prefix}_${level}.log" || true
+        [ -f "$out_json" ] && return 0
+    done
+    return 1
+}
+
+echo "EVAL_DIR=${EVAL_DIR}"
+echo "EVALUATION_TASK=${EVALUATION_TASK}"
+echo "N=${N}"
+
+# Rerun i writes <RERUNS_DIR>/run_<i>.json and logs to run_<i>_<level>.log.
+# A rerun that fails at every fallback level is reported but does not stop
+# the remaining reruns.
+for i in $(seq 1 "$N"); do
+    out="${RERUNS_DIR}/run_${i}.json"
+    log_prefix="${RERUNS_DIR}/run_${i}"
+    echo "=== rerun ${i} / ${N} ==="
+    if run_with_fallback "$out" "$log_prefix"; then
+        echo "  -> wrote $out"
+    else
+        echo "  -> FAILED all fallbacks for rerun ${i}"
+    fi
+done
+
+# Aggregate whatever run_*.json files exist; with set -e the script exits
+# here if the aggregator fails (e.g. no run file was produced).
+# NOTE(review): the glob also matches run_<i>.json files left over from an
+# earlier invocation with a larger N -- confirm whether those should be
+# included in the average.
+python scripts/aggregate_metrics_runs.py \
+    --runs-glob "${RERUNS_DIR}/run_*.json" \
+    --output "${EVAL_DIR}/metrics_averaged.json"
+
+echo "Wrote ${EVAL_DIR}/metrics_averaged.json"
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000..ab6af6d
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,410 @@
+#!/usr/bin/env python3
+"""Shared constants and utility functions for aggregation scripts."""
+import csv
+import json
+import math
+import os
+import re
+
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+# Directory containing this module; the JSON files below are looked up
+# next to it, independent of the current working directory.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json")  # read by load_factors()
+BASELINES_PATH = os.path.join(SCRIPT_DIR, "baselines.json")  # read by load_baselines()
+
+# Agent display name -> names of the method runs (one final_{method}.csv
+# each) that are aggregated together for that agent.
+HARDCODED_AGENT_MAP = {
+    "Opus-4.5": [
+        "claude_claude-opus-4-5_10h_final_v3",
+        "claude_claude-opus-4-5_10h_v5",
+        "claude_claude-opus-4-5_10h_v6_seed1",
+    ],
+    "GPT-5.1-Codex-Max": [
+        "codex_gpt-5.1-codex-max_10h_final_v3",
+        "codex_gpt-5.1-codex-max_10h_v4_seed1",
+        "codex_gpt-5.1-codex-max_10h_v4_seed2",
+    ],
+    "GPT-5.2-Codex": [
+        "codex_gpt-5.2-codex_10h_v6",
+        "codex_gpt-5.2-codex_10h_v6_seed1",
+        "codex_gpt-5.2-codex_10h_v6_seed2",
+    ],
+    "GPT-5.2": [
+        "codex_gpt-5.2_10h_v4",
+        "codex_gpt-5.2_10h_v6_seed1",
+        "codex_gpt-5.2_10h_v6_seed2",
+    ],
+    "Gemini-3-Pro": [
+        "gemini_models_gemini-3-pro-preview_10h_final_v3",
+        "gemini_models_gemini-3-pro-preview_10h_v5",
+        "gemini_models_gemini-3-pro-preview_10h_v6_seed1",
+    ],
+    "GPT-5.1-Codex-Max Low": [
+        "codexlow_gpt-5.1-codex-max_10h_v7",
+        "codexlow_gpt-5.1-codex-max_10h_v7_seed1",
+    ],
+    "GPT-5.1-Codex-Max High": [
+        "codexhigh_gpt-5.1-codex-max_10h_v7",
+        "codexhigh_gpt-5.1-codex-max_10h_v7_seed1",
+    ],
+    "Opus-4.6": [
+        "claude_claude-opus-4-6_10h_run1_old_container",
+        "claude_claude-opus-4-6_10h_run2",
+        "claude_claude-opus-4-6_10h_run3",
+    ],
+    "GPT-5.3-Codex_Med": [
+        "codex_non_api_gpt-5.3-codex_10h_run1",
+        "codex_non_api_gpt-5.3-codex_10h_run2",
+        "codex_non_api_gpt-5.3-codex_10h_run3",
+    ],
+    "Gemini-3.1-Pro": [
+        "opencode_opencode_gemini-3.1-pro_10h_run1",
+        "opencode_opencode_gemini-3.1-pro_10h_run2",
+        "opencode_opencode_gemini-3.1-pro_10h_run3",
+    ],
+    "GPT-5.3-Codex_High": [
+        "codex_non_api_high_gpt-5.3-codex_10h_run1",
+        "codex_non_api_high_gpt-5.3-codex_10h_run2",
+        "codex_non_api_high_gpt-5.3-codex_10h_run3",
+    ],
+    "GPT-5.4-High": [
+        "codex_non_api_high_gpt-5.4_10h_run1",
+        "codex_non_api_high_gpt-5.4_10h_run2",
+        "codex_non_api_high_gpt-5.4_10h_run3",
+    ],
+    "Opus-4.6-1M": [
+        "claude_non_api_claude-opus-4-6_1m__10h_run1",
+        "claude_non_api_claude-opus-4-6_1m__10h_run2",
+        "claude_non_api_claude-opus-4-6_1m__10h_run3",
+    ],
+    "Opus-4.7": [
+        "claude_non_api_claude-opus-4-7_10h",
+        "claude_non_api_claude-opus-4-7_10h_run2",
+        "claude_non_api_claude-opus-4-7_10h_run3",
+    ],
+    "GPT-5.5-xHigh": [
+        "codex_non_api_xhigh_gpt-5.5_10h_run1",
+        "codex_non_api_xhigh_gpt-5.5_10h_run2",
+    ],
+}
+
+# Benchmark names, in alphabetical order; the same seven names key every
+# per-model entry in baselines.json.
+HARDCODED_BENCHMARKS = [
+    "aime2025",
+    "arenahardwriting",
+    "bfcl",
+    "gpqamain",
+    "gsm8k",
+    "healthbench",
+    "humaneval",
+]
+
+# The four base/pre-trained model names (these are also the models listed
+# under "fewshot" in baselines.json).
+# NOTE(review): presumably the models every method is expected to cover;
+# the consumer of this set is not in this module -- confirm.
+EXPECTED_MODELS = {
+    "Qwen3-1.7B-Base",
+    "Qwen3-4B-Base",
+    "SmolLM3-3B-Base",
+    "gemma-3-4b-pt",
+}
+
+# Per-run time budget in seconds; used as the 100% reference for time
+# percentages.
+BUDGET_SECONDS = 10 * 3600  # 10 hours
+
+
+def load_factors() -> dict:
+    """Return the parsed contents of the JSON file at FACTORS_PATH."""
+    with open(FACTORS_PATH, "r") as handle:
+        factors = json.load(handle)
+    return factors
+
+
+def load_baselines() -> dict:
+    """Return the parsed contents of baselines.json.
+
+    The file has the shape
+    {"zeroshot": {model: {bench: value}}, "fewshot": {...}}
+    with float values.
+    """
+    with open(BASELINES_PATH, "r") as handle:
+        baselines = json.load(handle)
+    return baselines
+
+
+def get_baseline_fallback_data() -> dict[str, dict[str, str]]:
+    """Return the zero-shot baselines as {model: {bench: str_value}}.
+
+    Values are stringified so they can be dropped straight into CSV cells
+    as fallback scores. Replaces reading aggregated_baseline_zeroshot.csv.
+    """
+    zeroshot = load_baselines()["zeroshot"]
+    return {
+        model: {bench: str(score) for bench, score in per_bench.items()}
+        for model, per_bench in zeroshot.items()
+    }
+
+
+# ---------------------------------------------------------------------------
+# Stats
+# ---------------------------------------------------------------------------
+
+def mean(values: list[float]) -> float:
+    """Arithmetic mean of ``values``; an empty list raises ZeroDivisionError."""
+    count = len(values)
+    return sum(values) / count
+
+
+def stddev(values: list[float]) -> float:
+    """Sample standard deviation (n - 1 denominator) of ``values``.
+
+    A single value yields 0.0 instead of dividing by zero, which matches
+    how aggregate_metrics_runs.py reports the std of a single run. An
+    empty list still raises ZeroDivisionError, as the mean is undefined.
+    """
+    avg = sum(values) / len(values)
+    if len(values) < 2:
+        # No spread can be estimated from one observation.
+        return 0.0
+    variance = sum((x - avg) ** 2 for x in values) / (len(values) - 1)
+    return math.sqrt(variance)
+
+
+# ---------------------------------------------------------------------------
+# Paths
+# ---------------------------------------------------------------------------
+
+def get_results_dir() -> str:
+    """Return $POST_TRAIN_BENCH_RESULTS_DIR, or "results" when it is unset."""
+    configured = os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR")
+    return "results" if configured is None else configured
+
+
+# ---------------------------------------------------------------------------
+# CSV I/O
+# ---------------------------------------------------------------------------
+
+def is_number(value: str) -> bool:
+    """Tell whether ``value`` is a non-empty string that float() accepts."""
+    if value:
+        try:
+            float(value)
+        except ValueError:
+            return False
+        return True
+    return False
+
+
+def load_csv_as_dict(csv_path: str) -> tuple[dict[str, dict[str, str]], list[str]]:
+    """
+    Read a model-by-benchmark CSV into {model: {benchmark: value}}.
+
+    The first column holds the model name; the remaining header cells are
+    the benchmark names. Empty rows are skipped, short rows are padded
+    with "" and surplus cells are ignored.
+
+    Returns (data, benchmarks), or ({}, []) when the file is missing or
+    has no header.
+    """
+    if not os.path.exists(csv_path):
+        return {}, []
+
+    with open(csv_path, "r", newline="") as f:
+        rows = csv.reader(f)
+        header = next(rows, None)
+        if not header:
+            return {}, []
+
+        benchmarks = header[1:]
+        width = len(benchmarks)
+        table = {}
+
+        for row in rows:
+            if not row:
+                continue
+            # Normalise the row to exactly one cell per benchmark.
+            cells = row[1:1 + width]
+            cells += [""] * (width - len(cells))
+            table[row[0]] = dict(zip(benchmarks, cells))
+
+    return table, benchmarks
+
+
+def write_csv(
+    path: str,
+    models: list[str],
+    benchmarks: list[str],
+    data: dict[str, dict[str, str]],
+):
+    """Write ``data`` as a CSV with one row per model.
+
+    The header is "model" followed by ``benchmarks``; a benchmark missing
+    from a model's dict is written as an empty cell.
+    """
+    with open(path, "w", newline="") as f:
+        out = csv.writer(f)
+        out.writerow(["model"] + benchmarks)
+        for model in models:
+            cells = [data[model].get(bench, "") for bench in benchmarks]
+            out.writerow([model] + cells)
+
+
+# ---------------------------------------------------------------------------
+# Walking result directories
+# ---------------------------------------------------------------------------
+
+def walk_latest_runs(
+    method_path: str,
+    min_run_id: int | None = None,
+    max_run_id: int | None = None,
+) -> dict[tuple[str, str], dict]:
+    """
+    Walk a method directory and return the latest run per (benchmark, model).
+
+    Every sub-directory name must consist of exactly four "_"-separated
+    fields: benchmark, an ignored second field, model, and an integer run
+    id. Non-directory entries are ignored. Runs with
+    ``run_id < min_run_id`` or ``run_id >= max_run_id`` are left out, and
+    among the rest the highest run id wins for each (benchmark, model).
+
+    Returns {(benchmark, model): {"run_id": int, "path": str}}.
+
+    Raises:
+        ValueError: if a sub-directory name does not match the format.
+    """
+    latest_runs = {}
+
+    for entry in os.listdir(method_path):
+        entry_path = os.path.join(method_path, entry)
+        if not os.path.isdir(entry_path):
+            continue
+
+        try:
+            benchmark, _, model, run_id_str = entry.split("_")
+            run_id = int(run_id_str)
+        except ValueError as err:
+            # Covers both a wrong field count and a non-integer run id.
+            raise ValueError(
+                f"unexpected run directory name {entry!r} in {method_path}: "
+                "expected four '_'-separated fields ending in an integer "
+                "run id"
+            ) from err
+
+        if max_run_id is not None and run_id >= max_run_id:
+            continue
+        if min_run_id is not None and run_id < min_run_id:
+            continue
+
+        key = (benchmark, model)
+        if key not in latest_runs or run_id > latest_runs[key]["run_id"]:
+            latest_runs[key] = {"run_id": run_id, "path": entry_path}
+
+    return latest_runs
+
+
+# ---------------------------------------------------------------------------
+# Metrics loading
+# ---------------------------------------------------------------------------
+
+def load_metrics(metrics_path: str, method_name: str | None = None) -> str:
+    """
+    Return the accuracy as a string, or an error label.
+
+    The accuracy is the "accuracy" entry of the JSON object stored at
+    ``metrics_path``. If the file is missing, unreadable, not valid JSON,
+    not a JSON object, or has no non-null "accuracy", an error label is
+    returned instead:
+      - "ERR"        if ``method_name`` is "baseline_zeroshot"
+      - "not avl."   if time_taken.txt doesn't exist in the run directory
+      - "not stored" if time_taken.txt exists but final_model/ doesn't
+      - "ERR"        otherwise
+    """
+    try:
+        with open(metrics_path, "r") as f:
+            payload = json.load(f)
+    except (OSError, ValueError):
+        # Missing/unreadable file (OSError) or malformed content
+        # (JSONDecodeError and UnicodeDecodeError are ValueErrors): fall
+        # through to the error labels below.
+        payload = None
+
+    if isinstance(payload, dict):
+        acc = payload.get("accuracy")
+        if acc is not None:
+            return str(acc)
+
+    if method_name == "baseline_zeroshot":
+        return "ERR"
+
+    run_dir = os.path.dirname(metrics_path)
+
+    if not os.path.exists(os.path.join(run_dir, "time_taken.txt")):
+        return "not avl."
+
+    if not os.path.isdir(os.path.join(run_dir, "final_model")):
+        return "not stored"
+
+    return "ERR"
+
+
+# ---------------------------------------------------------------------------
+# Contamination loading
+# ---------------------------------------------------------------------------
+
+def load_contamination(contamination_path: str):
+    """Read a contamination judgement file.
+
+    Returns True for "contamination detected", False for
+    "no contamination detected", "IMPORTANT ERR" for any other content,
+    and "ERR" when the file is missing or cannot be read. Surrounding
+    whitespace in the file is ignored.
+    """
+    if not os.path.exists(contamination_path):
+        return "ERR"
+
+    try:
+        with open(contamination_path, "r") as f:
+            verdict = f.read().strip()
+    except Exception:
+        return "ERR"
+
+    known_verdicts = {
+        "contamination detected": True,
+        "no contamination detected": False,
+    }
+    return known_verdicts.get(verdict, "IMPORTANT ERR")
+
+
+def load_disallowed_model(disallowed_path: str):
+    """Read a disallowed-model judgement file.
+
+    Returns True for "disallowed use detected", False for
+    "only allowed use detected", "IMPORTANT ERR" for any other content,
+    and "ERR" when the file is missing or cannot be read. Surrounding
+    whitespace in the file is ignored.
+    """
+    if not os.path.exists(disallowed_path):
+        return "ERR"
+
+    try:
+        with open(disallowed_path, "r") as f:
+            verdict = f.read().strip()
+    except Exception:
+        return "ERR"
+
+    known_verdicts = {
+        "disallowed use detected": True,
+        "only allowed use detected": False,
+    }
+    return known_verdicts.get(verdict, "IMPORTANT ERR")
+
+
+def combine_contamination_results(contamination, disallowed_model) -> str:
+    """
+    Merge the contamination and disallowed-model verdicts into one cell.
+
+    If either verdict is an error label ("ERR" / "IMPORTANT ERR"), the
+    result lists the failing ones as "C:<label>" and/or "M:<label>",
+    space-separated. Otherwise the result is "M" and/or "C" for the
+    truthy verdicts ("MC" when both), or "" when both are clean.
+    """
+    error_labels = ("ERR", "IMPORTANT ERR")
+
+    failures = [
+        f"{tag}:{verdict}"
+        for tag, verdict in (("C", contamination), ("M", disallowed_model))
+        if verdict in error_labels
+    ]
+    if failures:
+        return " ".join(failures)
+
+    # No errors: concatenate the flags, model flag first.
+    model_flag = "M" if disallowed_model else ""
+    contamination_flag = "C" if contamination else ""
+    return model_flag + contamination_flag
+
+
+# ---------------------------------------------------------------------------
+# Time loading
+# ---------------------------------------------------------------------------
+
+def parse_time_hms(time_str: str) -> int | None:
+    """Convert an "H:M:S" string to seconds, or return None if malformed.
+
+    Surrounding whitespace is ignored. Hours may have any number of
+    digits; minutes and seconds take one or two digits and must be < 60.
+    """
+    parsed = re.match(r"^(\d+):(\d{1,2}):(\d{1,2})$", time_str.strip())
+    if parsed is None:
+        return None
+
+    hours, minutes, seconds = (int(field) for field in parsed.groups())
+    if max(minutes, seconds) >= 60:
+        return None
+
+    return (hours * 60 + minutes) * 60 + seconds
+
+
+def format_time_hms(total_seconds: int) -> str:
+    """Render a number of seconds as H:MM:SS (hours are not zero-padded)."""
+    hours, remainder = divmod(total_seconds, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    return f"{hours}:{minutes:02d}:{seconds:02d}"
+
+
+def load_time_taken(run_dir: str) -> tuple[str, int | None]:
+    """
+    Read <run_dir>/time_taken.txt.
+
+    Returns (display_string, total_seconds), where the display string is
+    the time re-formatted as H:MM:SS. Returns ("ERR", None) when the file
+    is missing, unreadable, or does not contain a valid H:M:S time.
+    """
+    failure = ("ERR", None)
+    time_taken_path = os.path.join(run_dir, "time_taken.txt")
+
+    if not os.path.exists(time_taken_path):
+        return failure
+
+    try:
+        with open(time_taken_path, "r") as f:
+            raw = f.read().strip()
+        seconds = parse_time_hms(raw)
+        if seconds is not None:
+            return format_time_hms(seconds), seconds
+        return failure
+    except Exception:
+        return failure
diff --git a/scripts/verify.py b/scripts/verify.py
new file mode 100644
index 0000000..857703a
--- /dev/null
+++ b/scripts/verify.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+"""
+Verify that refactored aggregation scripts produce identical outputs
+to the original pipeline.
+
+Usage:
+    python verify.py --ground-truth /fast/hbhatnagar/ptb_results/ \
+                     --new-output /fast/hbhatnagar/ptb_results_new/
+
+Compares all key output CSVs cell-by-cell:
+  - final_{method}.csv          (per-method score grids)
+  - contamination_{method}.csv  (per-method contamination flags)
+  - single_metrics.csv          (weighted score per run)
+  - single_metrics_aggregated.csv (avg/std per agent group)
+  - aggregated_avg_{agent}.csv  (per-cell avg for multi-run agents)
+  - aggregated_std_{agent}.csv  (per-cell std for multi-run agents)
+  - time_aggregated.csv         (avg/std time per agent)
+"""
+import argparse
+import csv
+import os
+import sys
+
+
+FLOAT_TOLERANCE = 1e-10
+
+
+def is_number(s: str) -> bool:
+    """Tell whether *s* is a non-empty string that ``float()`` accepts."""
+    if not s:
+        return False
+    try:
+        return isinstance(float(s), float)
+    except ValueError:
+        return False
+
+
+def load_csv(path: str) -> list[list[str]]:
+    with open(path, newline="") as csv_file:  # text mode; newline="" as the csv docs advise
+        return list(csv.reader(csv_file))  # one list of cell strings per row
+
+
+def compare_csvs(gt_path: str, new_path: str) -> list[str]:
+    """
+    Diff two CSV files cell by cell over the rows and columns they share.
+
+    Cells match when their text is identical, or when both parse as numbers
+    whose absolute difference is below ``FLOAT_TOLERANCE``.  Differing row or
+    column counts are reported too, and the overlapping part is still checked.
+
+    Returns the list of mismatch descriptions; an empty list means the two
+    files agree.
+    """
+    gt_rows = load_csv(gt_path)
+    new_rows = load_csv(new_path)
+    header = gt_rows[0] if gt_rows else []
+    errors = []
+
+    # A length mismatch is an error in itself, but the shared rows are still
+    # compared: zip() below simply stops at the shorter file.
+    if len(gt_rows) != len(new_rows):
+        errors.append(f"Row count differs: {len(gt_rows)} vs {len(new_rows)}")
+
+    for row_idx, (gt_row, new_row) in enumerate(zip(gt_rows, new_rows)):
+        # Ragged rows: report them, then compare the shared leading columns.
+        if len(gt_row) != len(new_row):
+            errors.append(
+                f"  Row {row_idx}: column count differs: "
+                f"{len(gt_row)} vs {len(new_row)}"
+            )
+
+        for col_idx, (gt_val, new_val) in enumerate(zip(gt_row, new_row)):
+            if gt_val == new_val:
+                continue
+
+            # Numeric cells may differ by less than the tolerance.
+            if (
+                is_number(gt_val)
+                and is_number(new_val)
+                and abs(float(gt_val) - float(new_val)) < FLOAT_TOLERANCE
+            ):
+                continue
+
+            # Data rows get a "(row name, column name)" hint built from the
+            # row's first cell and the ground-truth header row.
+            header_label = ""
+            if row_idx > 0 and header:
+                col_name = header[col_idx] if col_idx < len(header) else "?"
+                row_name = gt_row[0]  # never empty: col_idx indexes into it
+                header_label = f" ({row_name}, {col_name})"
+
+            errors.append(
+                f"  Row {row_idx}, Col {col_idx}{header_label}: "
+                f"'{gt_val}' vs '{new_val}'"
+            )
+
+    return errors
+
+
+def find_matching_files(gt_dir: str, new_dir: str) -> dict[str, tuple[str, str]]:
+    """
+    Pair up the verifiable CSVs that are present in both directories.
+
+    Only names ending in ".csv" that ``should_verify`` accepts are kept.
+    Returns {filename: (gt_path, new_path)}, filled in sorted filename order.
+    """
+    gt_csvs = {name for name in os.listdir(gt_dir) if name.endswith(".csv")}
+    new_csvs = {name for name in os.listdir(new_dir) if name.endswith(".csv")}
+    shared = sorted(gt_csvs.intersection(new_csvs))
+
+    return {
+        name: (os.path.join(gt_dir, name), os.path.join(new_dir, name))
+        for name in shared
+        if should_verify(name)
+    }
+
+
+def should_verify(filename: str) -> bool:
+    """
+    Tell whether *filename* is one of the output CSVs that get compared.
+
+    Verified:
+      - final_*.csv and contamination_*.csv, except the baseline variants
+        (hardcoded in baselines.json, not regenerated) and the deprecated
+        final_avg_* / final_std_* / final_time_* artifacts
+      - single_metrics.csv and single_metrics_aggregated.csv
+      - aggregated_avg_* and aggregated_std_* (per-agent, multi-run agents)
+      - time_aggregated.csv
+
+    Skipped: the aggregated_{avg,std}_over_models.csv files, intermediate
+    aggregated_time_* files, and any name not listed above.
+    """
+    skipped_names = {
+        "aggregated_avg_over_models.csv",
+        "aggregated_std_over_models.csv",
+    }
+    baseline_names = {
+        "final_baseline.csv",
+        "final_baseline_zeroshot.csv",
+        "contamination_baseline.csv",
+        "contamination_baseline_zeroshot.csv",
+    }
+    always_verified = {
+        "single_metrics.csv",
+        "single_metrics_aggregated.csv",
+        "time_aggregated.csv",
+    }
+
+    # Explicit exclusions win over every rule below.
+    if filename in skipped_names or filename.startswith("aggregated_time_"):
+        return False
+
+    is_csv = filename.endswith(".csv")
+    # Per-method score grids.
+    if is_csv and filename.startswith("final_"):
+        deprecated = filename.startswith(("final_avg_", "final_std_", "final_time_"))
+        return not deprecated and filename not in baseline_names
+
+    # Per-method contamination flags.
+    if is_csv and filename.startswith("contamination_"):
+        return filename not in baseline_names
+
+    if filename in always_verified:
+        return True
+
+    # Per-agent avg/std grids; anything else is not verified.
+    return filename.startswith(("aggregated_avg_", "aggregated_std_"))
+
+
+def main():
+    """
+    Command-line entry point: compare ground-truth CSVs against new outputs.
+
+    Prints PASS/FAIL per shared verifiable file, a summary, the verifiable
+    files missing from or extra in the new output, and up to ten mismatches
+    per failed file.  Exits with status 1 when a directory is absent, nothing
+    can be compared, a file fails or a ground-truth file is missing from the
+    new output; exits with status 0 otherwise.
+    """
+    parser = argparse.ArgumentParser(
+        description="Verify refactored aggregation outputs match ground truth."
+    )
+    parser.add_argument(
+        "--ground-truth",
+        required=True,
+        help="Directory with ground truth CSV outputs (from original scripts).",
+    )
+    parser.add_argument(
+        "--new-output",
+        required=True,
+        help="Directory with new CSV outputs (from refactored scripts).",
+    )
+    args = parser.parse_args()
+    gt_dir, new_dir = args.ground_truth, args.new_output
+
+    if not os.path.isdir(gt_dir):
+        print(f"Error: ground truth dir not found: {gt_dir}")
+        sys.exit(1)
+    if not os.path.isdir(new_dir):
+        print(f"Error: new output dir not found: {new_dir}")
+        sys.exit(1)
+
+    matches = find_matching_files(gt_dir, new_dir)
+    if not matches:
+        print("No matching CSV files found to compare.")
+        sys.exit(1)
+
+    # Verifiable files that exist on only one side.  Extras are reported but,
+    # unlike missing files, do not affect the exit status.
+    gt_verifiable = {
+        f for f in os.listdir(gt_dir) if f.endswith(".csv") and should_verify(f)
+    }
+    new_verifiable = {
+        f for f in os.listdir(new_dir) if f.endswith(".csv") and should_verify(f)
+    }
+    missing_from_new = sorted(gt_verifiable - new_verifiable)
+    extra_in_new = sorted(new_verifiable - gt_verifiable)
+
+    total_files = len(matches)
+    print(f"Comparing {total_files} CSV files...\n")
+
+    failure_details = []
+    for filename, (gt_path, new_path) in sorted(matches.items()):
+        errors = compare_csvs(gt_path, new_path)
+        if errors:
+            failure_details.append((filename, errors))
+        print(f"  FAIL  {filename}" if errors else f"  PASS  {filename}")
+    failed = len(failure_details)
+    passed = total_files - failed
+
+    # Summary
+    print(f"\n{'='*60}")
+    print(f"Results: {passed} passed, {failed} failed, {total_files} total")
+
+    if missing_from_new:
+        print(f"\nMISSING from new output ({len(missing_from_new)}):")
+        for f in missing_from_new:
+            print(f"  - {f}")
+
+    if extra_in_new:
+        print(f"\nEXTRA in new output ({len(extra_in_new)}):")
+        for f in extra_in_new:
+            print(f"  + {f}")
+
+    if failure_details:
+        print(f"\nFailure details:")
+        for filename, errors in failure_details:
+            print(f"\n  {filename}:")
+            for err in errors[:10]:  # Cap at 10 errors per file
+                print(f"    {err}")
+            if len(errors) > 10:
+                print(f"    ... and {len(errors) - 10} more")
+
+    if failed or missing_from_new:
+        sys.exit(1)
+    print("\nAll checks passed.")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/commit_utils/commit.sh b/src/commit_utils/commit.sh
index 34abd37..3c43144 100644
--- a/src/commit_utils/commit.sh
+++ b/src/commit_utils/commit.sh
@@ -2,71 +2,83 @@
 source src/commit_utils/set_env_vars.sh
 
 models=(
-    "google/gemma-3-4b-pt"
+    # "google/gemma-3-4b-pt"
     "Qwen/Qwen3-4B-Base"
-    "Qwen/Qwen3-1.7B-Base"
-    "HuggingFaceTB/SmolLM3-3B-Base"
+    # "Qwen/Qwen3-1.7B-Base"
+    # "HuggingFaceTB/SmolLM3-3B-Base"
 )
 
 evals=(
-    "aime2025"
-    "arenahardwriting"
-    "bfcl"
-    "gpqamain"
-    "gsm8k"
-    "humaneval"
+    # "aime2025"
+    # "arenahardwriting"
+    # "bfcl"
+    # "gpqamain"
+    # "gsm8k"
+    # "humaneval"
     "healthbench"
 )
-# export POST_TRAIN_BENCH_EXPERIMENT_NAME="_pushed"
+export POST_TRAIN_BENCH_EXPERIMENT_NAME="_METR"
 for model in "${models[@]}"; do
     for eval in "${evals[@]}"; do
         echo ""
         echo $model on $eval
         if [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor_mpi-is" ]; then
             # Proprietary (API)
-            condor_submit_bid 100 -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=qwen3max" -a "agent_config=qwen3-max-2026-01-23" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=qwen3max" -a "agent_config=qwen3-max-2026-01-23" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
             # Proprietary (Subscription plan)
-            condor_submit_bid 100 -a "agent=codex_non_api" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 150 -a "agent=claude_non_api" -a "agent_config=claude-sonnet-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.2" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=50" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 150 -a "agent=claude_non_api" -a "agent_config=claude-sonnet-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.2" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=50" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-7" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_xhigh" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            
 
             # Multi-GPU runs might need more than 8 CPUs and 128 GB of RAM (use 512 GB to be safe)
-            condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" -a "request_memory=524288" -a "request_cpus=128" src/commit_utils/single_task.sub   
-            condor_submit_bid 500 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=100" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub   
+            condor_submit_bid 100 -a "agent=claude_reprompt" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=100" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub   
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub   
+            # condor_submit_bid 500 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" src/commit_utils/single_task.sub
 
-            # Reprompted variant to push the agent (such as GPT 5.4)
-            condor_submit_bid 100 -a "agent=codex_non_api_high_reprompt" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # # Reprompted variant to push the agent (such as GPT 5.4)
+            # condor_submit_bid 50 -a "agent=codex_non_api_high_reprompt" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_xhigh_reprompt" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_xhigh_reprompt" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_reprompt" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=1" -a "num_hours=5" src/commit_utils/single_task.sub   
 
-            condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude_non_api_max" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=1" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 150 -a "agent=gemini" -a "agent_config=models/gemini-3.1-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+
+
+            # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api_max" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=1" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 150 -a "agent=gemini" -a "agent_config=models/gemini-3.1-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+           
             # OpenCode 
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=anthropic/claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/kimi-k2-thinking" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-4.7-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 500 -a "agent=opencode" -a "agent_config=opencode/gemini-3-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.1-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 50 -a "agent=glm5" -a "agent_config=glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.5-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 100 -a "agent=opencode" -a "agent_config=zai/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/kimi-k2.5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 150 -a "agent=opencode" -a "agent_config=opencode/gemini-3.1-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            sleep 10
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=anthropic/claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/kimi-k2-thinking" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-4.7-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 500 -a "agent=opencode" -a "agent_config=opencode/gemini-3-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.1-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 50 -a "agent=glm5" -a "agent_config=glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.5-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=zai/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/kimi-k2.5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 150 -a "agent=opencode" -a "agent_config=opencode/gemini-3.1-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # sleep 10
         elif [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor" ]; then
             condor_submit_bid -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
             condor_submit_bid -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
diff --git a/src/commit_utils/rerun_eval.sub b/src/commit_utils/rerun_eval.sub
new file mode 100644
index 0000000..62aa4a1
--- /dev/null
+++ b/src/commit_utils/rerun_eval.sub
@@ -0,0 +1,15 @@
+executable = /bin/bash
+num_gpus = 1
+n = 5
+arguments = scripts/rerun_eval_n_times.sh $(eval_dir) $(n)
+environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) HF_HOME=$ENV(HF_HOME)"
+error = rerun_$(Cluster).err
+output = rerun_$(Cluster).out
+log = rerun_$(Cluster).log
+request_memory = 131072
+request_cpus = 16
+request_gpus = $(num_gpus)
+requirements = TARGET.CUDADeviceName == "NVIDIA H100 80GB HBM3" && Machine != "i104.internal.cluster.is.localnet"
+request_disk=400G
++BypassLXCfs="true"
+queue
diff --git a/src/disallowed_usage_judge/rerun_judge/README.md b/src/disallowed_usage_judge/rerun_judge/README.md
new file mode 100644
index 0000000..06dd3fd
--- /dev/null
+++ b/src/disallowed_usage_judge/rerun_judge/README.md
@@ -0,0 +1,100 @@
+# Rerun Judge
+
+Re-run the disallowed-usage / contamination judge on existing result
+directories without re-running the agent or eval. Useful when the judge step
+in `src/run_task.sh` failed mid-run (e.g. API quota hit) and the run dir is
+missing `contamination_judgement.txt` / `disallowed_model_judgement.txt`.
+
+The judge invocation mirrors `src/run_task.sh` exactly: a single GPT-5.1-Codex
+call made through the API path using `CODEX_API_KEY`. All outputs are suffixed
+with `_rerun` so the originals (if any) are preserved.
+
+## Files
+
+| File | Description |
+|---|---|
+| `../run_judge.sh` | Standalone script: run judge on one result dir |
+| `utils.py` | Shared dir-walking / parsing / judgement-loading helpers |
+| `list_results.py` | List + filter result directories |
+| `aggregate_rerun_results.py` | Compare original vs rerun judgements |
+| `rerun_single.sh` | Thin wrapper over `run_judge.sh` (for HTCondor) |
+| `commit_rerun_judge.sh` | Submit HTCondor jobs |
+| `rerun_judge.sub` | HTCondor submission file |
+
+## Usage
+
+### Single directory
+
+```bash
+bash src/disallowed_usage_judge/run_judge.sh /path/to/result_dir
+```
+
+Writes:
+- `contamination_judgement_rerun.txt`
+- `disallowed_model_judgement_rerun.txt`
+- `judge_output_rerun.json`
+- `judge_output_rerun.txt`
+
+### Listing candidates
+
+```bash
+# Every result dir
+python src/disallowed_usage_judge/rerun_judge/list_results.py
+
+# Only dirs where the original judge step failed
+python src/disallowed_usage_judge/rerun_judge/list_results.py --only-missing-judgement
+
+# Just paths, ready for piping
+python src/disallowed_usage_judge/rerun_judge/list_results.py \
+    --only-missing-judgement --paths-only
+```
+
+### Submit HTCondor jobs
+
+```bash
+# All dirs missing original judgement, latest run per method/model/benchmark
+./src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh \
+    --only-missing-judgement --latest-only
+
+# Filter by method
+./src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh \
+    --method "codex_non_api_xhigh_reprompt_gpt-5.5"
+
+# Skip dirs that already have rerun output
+./src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh --skip-existing
+
+# Preview without submitting
+./src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh \
+    --only-missing-judgement --dry-run
+```
+
+### Aggregate / diff
+
+```bash
+# Plain summary
+python src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py
+
+# Only show dirs where the rerun changed the verdict
+python src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py --diff-only
+
+# Only show dirs where the rerun filled a previously-missing judgement
+python src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py --filled-only
+
+# Export to CSV
+python src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py --csv rerun.csv
+```
+
+## Adopting rerun results
+
+`scripts/collect.py` reads `contamination_judgement.txt` /
+`disallowed_model_judgement.txt`, not the `_rerun` variants. Once you're happy
+with the rerun output, copy the `_rerun` files over the originals (or symlink
+them) so `collect.py` picks them up:
+
+```bash
+cp result_dir/contamination_judgement_rerun.txt    result_dir/contamination_judgement.txt
+cp result_dir/disallowed_model_judgement_rerun.txt result_dir/disallowed_model_judgement.txt
+```
+
+(A `--prefer-rerun` flag in `collect.py` would be the cleaner long-term
+option.)
diff --git a/src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py b/src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py
new file mode 100644
index 0000000..8ff5d67
--- /dev/null
+++ b/src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+Aggregate rerun judge results and compare with original judgements.
+
+Usage:
+    python aggregate_rerun_results.py                     # Show summary
+    python aggregate_rerun_results.py --csv output.csv    # Write to CSV
+    python aggregate_rerun_results.py --diff-only         # Only show changed judgements
+    python aggregate_rerun_results.py --filled-only       # Only show dirs where the rerun
+                                                          #   filled in a previously missing
+                                                          #   judgement
+"""
+import argparse
+import csv
+from collections import defaultdict
+from pathlib import Path
+
+from utils import get_result_dirs, parse_result_dir, read_judgement
+
+
+def main():
+    """Compare original and `_rerun` judgement files per result dir; print a summary or write a CSV."""
+    parser = argparse.ArgumentParser(description="Aggregate rerun judge results")
+    parser.add_argument("--csv", type=str, help="Output CSV file")
+    parser.add_argument("--diff-only", action="store_true",
+                        help="Only show results where judgement changed")
+    parser.add_argument("--filled-only", action="store_true",
+                        help="Only show dirs where a missing original was filled by the rerun")
+    parser.add_argument("--method", type=str, help="Filter by method pattern")
+    parser.add_argument("--dirs", type=str, nargs="+",
+                        help="Only process these specific result directories")
+    args = parser.parse_args()
+
+    if args.dirs:
+        result_dirs = [Path(d) for d in args.dirs]
+    else:
+        result_dirs = get_result_dirs(method_pattern=args.method)
+    results = []
+    stats = defaultdict(int)
+
+    for result_dir in result_dirs:
+        try:
+            parsed = parse_result_dir(result_dir)
+        except ValueError:
+            continue
+
+        contam_orig = read_judgement(result_dir / "contamination_judgement.txt")
+        contam_rerun = read_judgement(result_dir / "contamination_judgement_rerun.txt")
+        model_orig = read_judgement(result_dir / "disallowed_model_judgement.txt")
+        model_rerun = read_judgement(result_dir / "disallowed_model_judgement_rerun.txt")
+
+        contam_changed = (
+            contam_rerun is not None and contam_orig is not None and contam_orig != contam_rerun
+        )
+        model_changed = (
+            model_rerun is not None and model_orig is not None and model_orig != model_rerun
+        )
+        contam_filled = contam_orig is None and contam_rerun is not None
+        model_filled = model_orig is None and model_rerun is not None
+
+        stats["total"] += 1
+        if contam_rerun is not None:
+            stats["has_rerun"] += 1
+        if contam_changed:
+            stats["contamination_changed"] += 1
+        if model_changed:
+            stats["model_changed"] += 1
+        if contam_filled:
+            stats["contamination_filled"] += 1
+        if model_filled:
+            stats["model_filled"] += 1
+
+        result = {
+            "method": parsed["method"],
+            "benchmark": parsed["benchmark"],
+            "model": parsed["model_hf"],
+            "cluster_id": parsed["cluster_id"],
+            "contamination_orig": contam_orig,
+            "contamination_rerun": contam_rerun,
+            "contamination_changed": contam_changed,
+            "contamination_filled": contam_filled,
+            "model_orig": model_orig,
+            "model_rerun": model_rerun,
+            "model_changed": model_changed,
+            "model_filled": model_filled,
+            "result_dir": str(result_dir),
+        }
+
+        if args.diff_only and not (contam_changed or model_changed):
+            continue
+        if args.filled_only and not (contam_filled or model_filled):
+            continue
+        results.append(result)
+
+    if args.csv:
+        # Always (re)create the file so a stale CSV cannot outlive an empty result set.
+        with open(args.csv, "w", newline="") as f:
+            if results:
+                writer = csv.DictWriter(f, fieldnames=results[0].keys())
+                writer.writeheader()
+                writer.writerows(results)
+        print(f"Wrote {len(results)} results to {args.csv}")
+    else:
+        print("=" * 80)
+        print("Rerun Judge Results Summary")
+        print("=" * 80)
+        print()
+
+        for result in results:
+            print(f"Method: {result['method']}")
+            print(f"  Folder: {result['result_dir']}")
+            print(f"  Benchmark: {result['benchmark']}  Model: {result['model']}")
+
+            tags = []
+            if result["contamination_changed"]:
+                tags.append("CHANGED")
+            if result["contamination_filled"]:
+                tags.append("FILLED")
+            tag_str = f" [{','.join(tags)}]" if tags else ""
+            print(
+                f"  Contamination: {result['contamination_orig']} -> "
+                f"{result['contamination_rerun']}{tag_str}"
+            )
+
+            tags = []
+            if result["model_changed"]:
+                tags.append("CHANGED")
+            if result["model_filled"]:
+                tags.append("FILLED")
+            tag_str = f" [{','.join(tags)}]" if tags else ""
+            print(
+                f"  Model usage:   {result['model_orig']} -> "
+                f"{result['model_rerun']}{tag_str}"
+            )
+            print()
+
+    print("=" * 80)
+    print("Statistics")
+    print("=" * 80)
+    print(f"Total result directories: {stats['total']}")
+    print(f"With rerun judgements: {stats['has_rerun']}")
+    print(f"Contamination changed (orig vs rerun): {stats['contamination_changed']}")
+    print(f"Model usage changed (orig vs rerun): {stats['model_changed']}")
+    print(f"Contamination filled (no orig, now rerun): {stats['contamination_filled']}")
+    print(f"Model usage filled (no orig, now rerun): {stats['model_filled']}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh b/src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh
new file mode 100755
index 0000000..f677856
--- /dev/null
+++ b/src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+#
+# Submit HTCondor jobs to rerun the judge on past result directories.
+#
+# Usage: commit_rerun_judge.sh [options]
+#
+# Options:
+#   --method <pattern>          Filter to result dirs whose method matches this substring
+#   --benchmark <pattern>       Filter to result dirs whose name matches this substring
+#   --skip-existing             Skip dirs that already have contamination_judgement_rerun.txt
+#   --only-missing-judgement    Only re-run dirs where the original judge step failed
+#                               (no contamination_judgement.txt or disallowed_model_judgement.txt)
+#   --limit <n>                 Process at most n directories
+#   --latest-only               Only the highest cluster_id per (method, model, benchmark)
+#   --dry-run                   Print the dirs that would be submitted, then exit
+
+set -e
+
+METHOD_PATTERN=""
+BENCHMARK_PATTERN=""
+SKIP_EXISTING=""
+ONLY_MISSING=""
+LIMIT=0
+LATEST_ONLY=""
+DRY_RUN=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --method) METHOD_PATTERN="$2"; shift 2 ;;
+        --benchmark) BENCHMARK_PATTERN="$2"; shift 2 ;;
+        --skip-existing) SKIP_EXISTING="1"; shift ;;
+        --only-missing-judgement) ONLY_MISSING="1"; shift ;;
+        --limit) LIMIT="$2"; shift 2 ;;
+        --latest-only) LATEST_ONLY="1"; shift ;;
+        --dry-run) DRY_RUN="1"; shift ;;
+        *) echo "Unknown option: $1" >&2; exit 1 ;;
+    esac
+done
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+source "$REPO_ROOT/src/commit_utils/set_env_vars.sh"
+
+SUB_FILE="$SCRIPT_DIR/rerun_judge.sub"
+
+echo "========================================"
+echo "Submitting rerun judge jobs"
+echo "  Method pattern: ${METHOD_PATTERN:-all}"
+echo "  Benchmark pattern: ${BENCHMARK_PATTERN:-all}"
+echo "  Skip existing: ${SKIP_EXISTING:-no}"
+echo "  Only missing judgement: ${ONLY_MISSING:-no}"
+echo "  Limit: ${LIMIT:-no limit}"
+echo "  Latest only: ${LATEST_ONLY:-no}"
+echo "  Dry run: ${DRY_RUN:-no}"
+echo "========================================"
+
+# Array (not a string) so patterns with spaces or globs stay single arguments.
+LIST_ARGS=(--paths-only)
+[ -n "$METHOD_PATTERN" ] && LIST_ARGS+=(--method "$METHOD_PATTERN")
+[ -n "$BENCHMARK_PATTERN" ] && LIST_ARGS+=(--benchmark "$BENCHMARK_PATTERN")
+[ -n "$SKIP_EXISTING" ] && LIST_ARGS+=(--skip-existing)
+[ -n "$ONLY_MISSING" ] && LIST_ARGS+=(--only-missing-judgement)
+[ "$LIMIT" -gt 0 ] && LIST_ARGS+=(--limit "$LIMIT")
+[ -n "$LATEST_ONLY" ] && LIST_ARGS+=(--latest-only)
+RESULT_DIRS=$(python "$SCRIPT_DIR/list_results.py" "${LIST_ARGS[@]}")
+TOTAL=$(echo "$RESULT_DIRS" | grep -c . || true)  # grep -c already prints 0 on no match
+
+echo "Found $TOTAL result directories"
+
+if [ "$TOTAL" -eq 0 ]; then
+    echo "No directories to process"
+    exit 0
+fi
+
+if [ -n "$DRY_RUN" ]; then
+    echo "$RESULT_DIRS"
+    exit 0
+fi
+
+while read -r result_dir; do
+    [ -z "$result_dir" ] && continue
+    condor_submit_bid 100 -a "result_dir=$result_dir" "$SUB_FILE"
+done <<< "$RESULT_DIRS"
+
+echo ""
+echo "========================================"
+echo "Jobs submitted: $TOTAL"
+echo "========================================"
diff --git a/src/disallowed_usage_judge/rerun_judge/list_results.py b/src/disallowed_usage_judge/rerun_judge/list_results.py
new file mode 100644
index 0000000..e5d44c6
--- /dev/null
+++ b/src/disallowed_usage_judge/rerun_judge/list_results.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+"""
+List and filter result directories for judge rerun.
+
+Examples:
+    python list_results.py                              # List all result directories
+    python list_results.py --method "claude"            # Filter by method substring
+    python list_results.py --benchmark "aime"           # Filter by benchmark substring
+    python list_results.py --skip-existing              # Skip dirs that already have rerun output
+    python list_results.py --only-missing-judgement     # Only dirs where the original judge failed
+    python list_results.py --paths-only                 # Print just paths (for piping)
+    python list_results.py --latest-only                # Latest cluster_id per method/model/benchmark
+"""
+import argparse
+from utils import get_result_dirs
+
+
+def main() -> None:
+    """Print result directories matching the CLI filters, as bare paths or as a flagged listing."""
+    parser = argparse.ArgumentParser(description="List and filter result directories")
+    parser.add_argument("--method", type=str, help="Filter by method pattern")
+    parser.add_argument("--benchmark", type=str, help="Filter by benchmark pattern")
+    parser.add_argument("--skip-existing", action="store_true",
+                        help="Skip directories that already have contamination_judgement_rerun.txt")
+    parser.add_argument("--only-missing-judgement", action="store_true",
+                        help="Only include directories where the original judge step "
+                             "didn't write contamination_judgement.txt or "
+                             "disallowed_model_judgement.txt")
+    parser.add_argument("--paths-only", action="store_true",
+                        help="Print just paths (for piping)")
+    parser.add_argument("--limit", type=int, default=0, help="Limit number of results")
+    parser.add_argument("--latest-only", action="store_true",
+                        help="Only return latest cluster_id per method/model/benchmark")
+    args = parser.parse_args()
+
+    result_dirs = get_result_dirs(
+        method_pattern=args.method,
+        benchmark_pattern=args.benchmark,
+        skip_existing=args.skip_existing,
+        only_missing_judgement=args.only_missing_judgement,
+        limit=args.limit,
+        latest_only=args.latest_only,
+    )
+
+    if args.paths_only:
+        for d in result_dirs:
+            print(d)
+        return
+    # Otherwise print one annotated line per directory, followed by totals.
+    has_rerun_count = 0
+    missing_orig_count = 0
+    for result_dir in result_dirs:
+        has_rerun = (result_dir / "contamination_judgement_rerun.txt").exists()
+        has_orig_contam = (result_dir / "contamination_judgement.txt").exists()
+        has_orig_disallowed = (result_dir / "disallowed_model_judgement.txt").exists()
+        flags = []
+        if has_rerun:
+            flags.append("RERUN")
+            has_rerun_count += 1
+        if not has_orig_contam or not has_orig_disallowed:
+            flags.append("ORIG-MISSING")
+            missing_orig_count += 1
+        flag_str = f" [{','.join(flags)}]" if flags else ""
+        print(f"{result_dir}{flag_str}")
+
+    print()
+    print("=" * 50)
+    print(f"Total: {len(result_dirs)}")
+    print(f"  Already has _rerun output: {has_rerun_count}")
+    print(f"  Missing original judgement files: {missing_orig_count}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/disallowed_usage_judge/rerun_judge/rerun_judge.sub b/src/disallowed_usage_judge/rerun_judge/rerun_judge.sub
new file mode 100644
index 0000000..72e38d2
--- /dev/null
+++ b/src/disallowed_usage_judge/rerun_judge/rerun_judge.sub
@@ -0,0 +1,12 @@
+executable = /bin/bash
+arguments = src/disallowed_usage_judge/rerun_judge/rerun_single.sh $(result_dir)
+environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) HF_HOME=$ENV(HF_HOME)"
+error = test_$(Cluster).err
+output = test_$(Cluster).out
+log = test_$(Cluster).log
+concurrency_limits=user.judge:3333
+request_memory = 32768
+request_cpus = 4
+request_disk=20G
++BypassLXCfs="true"
+queue
diff --git a/src/disallowed_usage_judge/rerun_judge/rerun_single.sh b/src/disallowed_usage_judge/rerun_judge/rerun_single.sh
new file mode 100755
index 0000000..3fbdd6f
--- /dev/null
+++ b/src/disallowed_usage_judge/rerun_judge/rerun_single.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Rerun the judge on a single result directory.
+# Thin wrapper around src/disallowed_usage_judge/run_judge.sh.
+#
+# Usage: rerun_single.sh <result_dir>
+
+set -e
+
+RESULT_DIR="$1"
+
+if [ -z "$RESULT_DIR" ]; then
+    echo "Usage: $0 <result_dir>" >&2
+    exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+exec bash "$SCRIPT_DIR/../run_judge.sh" "$RESULT_DIR"
diff --git a/src/disallowed_usage_judge/rerun_judge/utils.py b/src/disallowed_usage_judge/rerun_judge/utils.py
new file mode 100644
index 0000000..01cba05
--- /dev/null
+++ b/src/disallowed_usage_judge/rerun_judge/utils.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Shared utilities for rerun judge scripts."""
+
+import os
+from pathlib import Path
+
+
+def get_repo_root() -> Path:
+    return Path(__file__).parent.parent.parent.parent
+
+
+def get_results_dir() -> Path:
+    """Return POST_TRAIN_BENCH_RESULTS_DIR as a Path; raise RuntimeError if unset or empty."""
+    if configured := os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR"):
+        return Path(configured)
+    raise RuntimeError("POST_TRAIN_BENCH_RESULTS_DIR is not set")
+
+
+def get_result_dirs(
+    method_pattern: str | None = None,
+    benchmark_pattern: str | None = None,
+    skip_existing: bool = False,
+    only_missing_judgement: bool = False,
+    limit: int = 0,
+    latest_only: bool = False,
+) -> list[Path]:
+    """
+    Walk POST_TRAIN_BENCH_RESULTS_DIR and return matching result directories.
+
+    Args:
+        method_pattern: case-insensitive substring filter on method (parent dir name)
+        benchmark_pattern: case-insensitive substring filter on result dir name
+        skip_existing: skip dirs that already have contamination_judgement_rerun.txt
+        only_missing_judgement: only include dirs missing contamination_judgement.txt
+            and/or disallowed_model_judgement.txt (i.e. the judge previously failed)
+        limit: cap result count (0 = no limit)
+        latest_only: keep only the highest-cluster_id run per (method, model, benchmark)
+    """
+    results_root = get_results_dir()
+    result_dirs = []
+
+    for method_dir in sorted(results_root.iterdir()):
+        if not method_dir.is_dir():
+            continue
+        # Skip hidden entries and the two baseline method directories.
+        method_name = method_dir.name
+        if method_name.startswith(".") or method_name in ("baseline", "baseline_zeroshot"):
+            continue
+
+        if method_pattern and method_pattern.lower() not in method_name.lower():
+            continue
+
+        for result_dir in sorted(method_dir.iterdir()):
+            if not result_dir.is_dir():
+                continue
+            # Only directories that contain a "task" subdirectory count as results.
+            if not (result_dir / "task").is_dir():
+                continue
+
+            if benchmark_pattern and benchmark_pattern.lower() not in result_dir.name.lower():
+                continue
+
+            if skip_existing and (result_dir / "contamination_judgement_rerun.txt").exists():
+                continue
+
+            if only_missing_judgement:
+                has_contam = (result_dir / "contamination_judgement.txt").exists()
+                has_disallowed = (result_dir / "disallowed_model_judgement.txt").exists()
+                if has_contam and has_disallowed:
+                    continue
+
+            result_dirs.append(result_dir)
+
+    if latest_only:
+        result_dirs = _filter_latest_only(result_dirs)
+    # The cap is applied last, i.e. after the latest-only reduction.
+    if limit > 0:
+        result_dirs = result_dirs[:limit]
+
+    return result_dirs
+
+
+def _filter_latest_only(result_dirs: list[Path]) -> list[Path]:
+    """Keep the highest-cluster_id dir per (method, model, benchmark); skip unparsable names."""
+    best_by_key: dict[tuple[str, str, str], tuple[int, Path]] = {}
+    for result_dir in result_dirs:
+        try:
+            parsed = parse_result_dir(result_dir)
+            # A non-numeric suffix is not a cluster id: skip the dir rather than crash.
+            cluster_id = int(parsed["cluster_id"])
+        except ValueError:
+            continue
+        key = (parsed["method"], parsed["model"], parsed["benchmark"])
+        if key not in best_by_key or cluster_id > best_by_key[key][0]:
+            best_by_key[key] = (cluster_id, result_dir)
+
+    return sorted(path for _, path in best_by_key.values())
+
+
+def parse_result_dir(result_dir: Path) -> dict:
+    """
+    Split a result directory path into its naming components.
+
+    Name format: {benchmark}_{provider}_{model}_{cluster_id}; the method is
+    taken from the parent directory's name. Raises ValueError on other shapes.
+    """
+
+    dirname = result_dir.name
+
+    # Peel the cluster id off the right-hand end of the name.
+    head, found, cluster_id = dirname.rpartition("_")
+    if not found:
+        raise ValueError(f"Invalid result directory name: {dirname}")
+
+    # The benchmark is everything before the first underscore; what is left
+    # is the "{provider}_{model}" part.
+    benchmark, found, model_part = head.partition("_")
+    if not found:
+        raise ValueError(f"Invalid result directory name: {dirname}")
+
+    return {
+        "benchmark": benchmark,
+        "model": model_part,
+        # Only the first underscore becomes "/", giving the
+        # "{provider}/{model}" form.
+        "model_hf": model_part.replace("_", "/", 1),
+        "method": result_dir.parent.name,
+        "cluster_id": cluster_id,
+    }
+
+
+def read_judgement(filepath: Path) -> str | None:
+    """Return the whitespace-stripped text of *filepath*, or None when it does not exist."""
+    # A missing file yields None, which stays distinct from an empty ("") judgement.
+    return filepath.read_text().strip() if filepath.exists() else None
diff --git a/src/disallowed_usage_judge/run_judge.sh b/src/disallowed_usage_judge/run_judge.sh
new file mode 100755
index 0000000..487b131
--- /dev/null
+++ b/src/disallowed_usage_judge/run_judge.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+#
+# Re-run the contamination/disallowed-model judge on an existing result directory.
+#
+# Mirrors the judge invocation in src/run_task.sh (single GPT-5.1-Codex via the
+# CODEX_API_KEY) but operates on a result directory that already exists,
+# without re-running the agent or eval. Outputs are written with a `_rerun`
+# suffix so the originals from the run are preserved:
+#   - contamination_judgement_rerun.txt
+#   - disallowed_model_judgement_rerun.txt
+#   - judge_output_rerun.json / judge_output_rerun.txt
+#
+# Usage: run_judge.sh <result_dir>
+
+set -e
+
+RESULT_DIR="$1"
+
+if [ -z "$RESULT_DIR" ]; then
+    echo "Usage: $0 <result_dir>" >&2
+    exit 1
+fi
+
+if [ ! -d "$RESULT_DIR" ]; then
+    echo "Error: result directory does not exist: $RESULT_DIR" >&2
+    exit 1
+fi
+
+if [ ! -d "$RESULT_DIR/task" ]; then
+    echo "Error: no task directory found in $RESULT_DIR" >&2
+    exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+source "$REPO_ROOT/src/commit_utils/set_env_vars.sh"
+
+# The original run_task.sh swaps OPENAI_API_KEY into CODEX_API_KEY before the
+# judge runs. Mirror that here so the script works whether the user has only
+# OPENAI_API_KEY set, only CODEX_API_KEY set, or both.
+if [ -z "${CODEX_API_KEY:-}" ] && [ -n "${OPENAI_API_KEY:-}" ]; then
+    export CODEX_API_KEY="${OPENAI_API_KEY}"
+fi
+
+if [ -z "${CODEX_API_KEY:-}" ]; then
+    echo "Error: neither CODEX_API_KEY nor OPENAI_API_KEY is set" >&2
+    exit 1
+fi
+
+# Parse benchmark/model from result directory name.
+# Format: {benchmark}_{provider}_{model}_{cluster_id}
+DIRNAME=$(basename "$RESULT_DIR")
+BENCHMARK=$(echo "$DIRNAME" | sed -E 's/^([^_]+)_.*/\1/')
+MODEL_PART=$(echo "$DIRNAME" | sed -E 's/^[^_]+_(.*)_[0-9]+$/\1/')
+MODEL_HF=$(echo "$MODEL_PART" | sed 's/_/\//')
+
+echo "Running judge on: $RESULT_DIR"
+echo "  Benchmark: $BENCHMARK | Model: $MODEL_HF"
+
+JUDGE_TASK=$(python "$REPO_ROOT/src/disallowed_usage_judge/get_judge_prompt.py" \
+    --benchmark "$BENCHMARK" \
+    --model "$MODEL_HF")
+
+# Sandbox: copy the task dir so any judgement files written by the judge land
+# in our temp dir, not the canonical result dir, until we explicitly copy them.
+TMP_DIR=$(mktemp -d)
+trap 'rm -rf "$TMP_DIR"' EXIT
+JOB_DIR="$TMP_DIR/job_dir"
+JOB_TMP="$TMP_DIR/tmp"
+mkdir -p "$JOB_DIR" "$JOB_TMP"
+
+cp -r "$RESULT_DIR/task" "$JOB_DIR/task"
+
+# Strip stale judgement files from the sandbox so a CLI failure can't leak old
+# values into this judge's output.
+rm -f "$JOB_DIR/task/contamination_judgement.txt"
+rm -f "$JOB_DIR/task/disallowed_model_judgement.txt"
+
+# Reset codex config (matches src/run_task.sh:196) so any agent-specific
+# settings like model_reasoning_effort don't leak into the judge.
+cp -r "$REPO_ROOT/containers/other_home_data/.codex" "$JOB_DIR/"
+
+# Strip any pre-existing _rerun outputs so a CLI crash can't leave stale data.
+rm -f "$RESULT_DIR/contamination_judgement_rerun.txt"
+rm -f "$RESULT_DIR/disallowed_model_judgement_rerun.txt"
+rm -f "$RESULT_DIR/judge_output_rerun.json"
+rm -f "$RESULT_DIR/judge_output_rerun.txt"
+
+JUDGE_OUTPUT_JSON="$RESULT_DIR/judge_output_rerun.json"
+
+apptainer exec \
+    --nv \
+    -c \
+    --env PATH="/root/.local/bin:/home/ben/.local/bin:$PATH" \
+    --env HF_HOME="${HF_HOME_NEW}" \
+    --env CODEX_API_KEY="${CODEX_API_KEY}" \
+    --env VLLM_API_KEY="inspectai" \
+    --env PYTHONNOUSERSITE="1" \
+    --bind "${JOB_TMP}:/tmp" \
+    --home "${JOB_DIR}:/home/ben" \
+    --pwd "/home/ben/task" \
+    --writable-tmpfs \
+    "${POST_TRAIN_BENCH_CONTAINERS_DIR}/${POST_TRAIN_BENCH_CONTAINER_NAME}.sif" \
+    codex --search -a never exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "gpt-5.1-codex" "$JUDGE_TASK" 2>&1 | tee "$JUDGE_OUTPUT_JSON"
+
+# Convert JSON output to human-readable trace (best-effort: under `set -e` a converter failure must not abort the copy-out below).
+if [ -f "$JUDGE_OUTPUT_JSON" ]; then
+    python "$REPO_ROOT/agents/codex/human_readable_trace.py" "$JUDGE_OUTPUT_JSON" -o "$RESULT_DIR/judge_output_rerun.txt" || echo "  Warning: could not render judge_output_rerun.txt" >&2
+fi
+
+# Copy out judgement files (if the judge produced them).
+if [ -f "$JOB_DIR/task/contamination_judgement.txt" ]; then
+    cp "$JOB_DIR/task/contamination_judgement.txt" "$RESULT_DIR/contamination_judgement_rerun.txt"
+    echo "  Contamination: $(cat "$RESULT_DIR/contamination_judgement_rerun.txt")"
+else
+    echo "  Warning: contamination_judgement.txt not produced by judge"
+fi
+
+if [ -f "$JOB_DIR/task/disallowed_model_judgement.txt" ]; then
+    cp "$JOB_DIR/task/disallowed_model_judgement.txt" "$RESULT_DIR/disallowed_model_judgement_rerun.txt"
+    echo "  Model usage: $(cat "$RESULT_DIR/disallowed_model_judgement_rerun.txt")"
+else
+    echo "  Warning: disallowed_model_judgement.txt not produced by judge"
+fi