From 2d6f1d5ffc009aefea020a1e0aaea8150be59fd7 Mon Sep 17 00:00:00 2001 From: hrdkbhatnagar Date: Tue, 5 May 2026 21:36:57 +0200 Subject: [PATCH] refactor aggregation scripts completely fix dev_utils scriptsadd new agents add rerun judge --- .../claude_reprompt/human_readable_trace.py | 1 + agents/claude_reprompt/solve.sh | 31 ++ agents/codex_xhigh/human_readable_trace.py | 1 + agents/codex_xhigh/solve.sh | 12 + .../human_readable_trace.py | 1 + agents/codex_xhigh_reprompt/solve.sh | 34 ++ containers/gpt_5_5.def | 78 ++++ dev_utils/extract_traces.py | 14 +- dev_utils/limit_hit_list.py | 4 +- dev_utils/terminated_finder.py | 16 +- scripts/README.md | 173 ++++++++ scripts/aggregate.py | 363 ++++++++++++++++ scripts/aggregate_metrics_runs.py | 72 +++ scripts/baselines.json | 114 +++++ scripts/collect.py | 255 +++++++++++ scripts/constants.py | 8 +- scripts/rerun_eval_n_times.sh | 152 +++++++ scripts/utils.py | 410 ++++++++++++++++++ scripts/verify.py | 261 +++++++++++ src/commit_utils/commit.sh | 106 +++-- src/commit_utils/rerun_eval.sub | 15 + .../rerun_judge/README.md | 100 +++++ .../rerun_judge/aggregate_rerun_results.py | 149 +++++++ .../rerun_judge/commit_rerun_judge.sh | 88 ++++ .../rerun_judge/list_results.py | 74 ++++ .../rerun_judge/rerun_judge.sub | 12 + .../rerun_judge/rerun_single.sh | 19 + .../rerun_judge/utils.py | 135 ++++++ src/disallowed_usage_judge/run_judge.sh | 124 ++++++ 29 files changed, 2765 insertions(+), 57 deletions(-) create mode 120000 agents/claude_reprompt/human_readable_trace.py create mode 100755 agents/claude_reprompt/solve.sh create mode 120000 agents/codex_xhigh/human_readable_trace.py create mode 100755 agents/codex_xhigh/solve.sh create mode 120000 agents/codex_xhigh_reprompt/human_readable_trace.py create mode 100755 agents/codex_xhigh_reprompt/solve.sh create mode 100644 containers/gpt_5_5.def create mode 100644 scripts/README.md create mode 100644 scripts/aggregate.py create mode 100755 scripts/aggregate_metrics_runs.py create 
mode 100644 scripts/baselines.json create mode 100644 scripts/collect.py create mode 100755 scripts/rerun_eval_n_times.sh create mode 100644 scripts/utils.py create mode 100644 scripts/verify.py create mode 100644 src/commit_utils/rerun_eval.sub create mode 100644 src/disallowed_usage_judge/rerun_judge/README.md create mode 100644 src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py create mode 100755 src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh create mode 100644 src/disallowed_usage_judge/rerun_judge/list_results.py create mode 100644 src/disallowed_usage_judge/rerun_judge/rerun_judge.sub create mode 100755 src/disallowed_usage_judge/rerun_judge/rerun_single.sh create mode 100644 src/disallowed_usage_judge/rerun_judge/utils.py create mode 100755 src/disallowed_usage_judge/run_judge.sh diff --git a/agents/claude_reprompt/human_readable_trace.py b/agents/claude_reprompt/human_readable_trace.py new file mode 120000 index 0000000..d643db0 --- /dev/null +++ b/agents/claude_reprompt/human_readable_trace.py @@ -0,0 +1 @@ +../claude/human_readable_trace.py \ No newline at end of file diff --git a/agents/claude_reprompt/solve.sh b/agents/claude_reprompt/solve.sh new file mode 100755 index 0000000..b0b25a4 --- /dev/null +++ b/agents/claude_reprompt/solve.sh @@ -0,0 +1,31 @@ +#!/bin/bash +unset GEMINI_API_KEY +unset CODEX_API_KEY + +export BASH_MAX_TIMEOUT_MS="36000000" + +MIN_REMAINING_MINUTES=30 + +claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \ + --dangerously-skip-permissions "$PROMPT" + +# Re-prompt loop: if the agent finishes early, resume the session +while true; do + TIMER_OUTPUT=$(bash timer.sh 2>/dev/null) + if echo "$TIMER_OUTPUT" | grep -q "expired"; then + break + fi + + REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)') + REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+') + TOTAL_REMAINING_MINS=$(( REMAINING_HOURS * 60 + REMAINING_MINS )) + + if [ "$TOTAL_REMAINING_MINS" -lt 
"$MIN_REMAINING_MINUTES" ]; then + break + fi + + CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance." + + claude --print --verbose --continue --model "$AGENT_CONFIG" --output-format stream-json \ + --dangerously-skip-permissions "$CONTINUATION_PROMPT" +done diff --git a/agents/codex_xhigh/human_readable_trace.py b/agents/codex_xhigh/human_readable_trace.py new file mode 120000 index 0000000..9cf1a5d --- /dev/null +++ b/agents/codex_xhigh/human_readable_trace.py @@ -0,0 +1 @@ +../codex/human_readable_trace.py \ No newline at end of file diff --git a/agents/codex_xhigh/solve.sh b/agents/codex_xhigh/solve.sh new file mode 100755 index 0000000..443f1c5 --- /dev/null +++ b/agents/codex_xhigh/solve.sh @@ -0,0 +1,12 @@ +#!/bin/bash +unset ANTHROPIC_API_KEY +unset GEMINI_API_KEY + +# Set reasoning effort to xhigh (prepend to ensure precedence) +file=/home/ben/.codex/config.toml +tmp="$(mktemp)" +printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp" +[ -f "$file" ] && cat "$file" >> "$tmp" +mv "$tmp" "$file" + +codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT" diff --git a/agents/codex_xhigh_reprompt/human_readable_trace.py b/agents/codex_xhigh_reprompt/human_readable_trace.py new file mode 120000 index 0000000..9cf1a5d --- /dev/null +++ b/agents/codex_xhigh_reprompt/human_readable_trace.py @@ -0,0 +1 @@ +../codex/human_readable_trace.py \ No newline at end of file diff --git a/agents/codex_xhigh_reprompt/solve.sh b/agents/codex_xhigh_reprompt/solve.sh new file mode 100755 index 0000000..3afc973 --- /dev/null +++ b/agents/codex_xhigh_reprompt/solve.sh @@ -0,0 +1,34 @@ +#!/bin/bash +unset ANTHROPIC_API_KEY +unset GEMINI_API_KEY + +# Set reasoning effort to xhigh (prepend to ensure precedence) +file=/home/ben/.codex/config.toml +tmp="$(mktemp)" +printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp" +[ 
-f "$file" ] && cat "$file" >> "$tmp" +mv "$tmp" "$file" + +MIN_REMAINING_MINUTES=30 + +codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT" + +# Re-prompt loop: if the agent finishes early, resume the session +while true; do + TIMER_OUTPUT=$(bash timer.sh 2>/dev/null) + if echo "$TIMER_OUTPUT" | grep -q "expired"; then + break + fi + + REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)') + REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+') + TOTAL_REMAINING_MINS=$(( REMAINING_HOURS * 60 + REMAINING_MINS )) + + if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then + break + fi + + CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance." + + codex --search exec resume --last --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$CONTINUATION_PROMPT" +done diff --git a/containers/gpt_5_5.def b/containers/gpt_5_5.def new file mode 100644 index 0000000..5010544 --- /dev/null +++ b/containers/gpt_5_5.def @@ -0,0 +1,78 @@ +Bootstrap: docker +From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 + +%files + containers/requirements-direct.txt /opt/requirements-direct.txt + +%post + chmod 1777 /tmp + # Set environment variables + export DEBIAN_FRONTEND=noninteractive + + # Update and install system dependencies + apt-get update && apt-get install -y \ + python3.10 \ + python3-dev \ + git \ + wget \ + curl \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + + # Create python3 symlink + ln -sf /usr/bin/python3.10 /usr/bin/python3 + ln -sf /usr/bin/python3.10 /usr/bin/python + + # Install Node.js (LTS version 22.x) for npm + curl -fsSL https://deb.nodesource.com/setup_22.x | bash - + apt-get install -y nodejs + + # Install uv + curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH="/root/.local/bin:$PATH" + + uv pip install 
--system --no-cache vllm==0.11.0 --torch-backend=auto + + # Pinned direct dependencies + uv pip install --system --no-cache -r /opt/requirements-direct.txt + + # flash-attn (needs no-build-isolation) + uv pip install --system --no-cache flash-attn==2.8.3 --no-build-isolation + + # update CLI harnesses to the latest stable versions + # OpenCode doesn't support DeepSeek V4 yet. + npm install -g \ + @anthropic-ai/claude-code@2.1.116 \ + @openai/codex@0.124.0 \ + @google/gemini-cli@0.39.1 \ + opencode-ai@1.14.20 + + # install inspect evals + mkdir -p /opt + cd /opt + git clone https://github.com/UKGovernmentBEIS/inspect_evals.git + cd /opt/inspect_evals + git checkout 06001a83e6d7c709c2ede0570dce7f1031a0bad8 + uv pip install --system --no-cache . + + # install inspect ai with debug + mkdir -p /opt + cd /opt + git clone https://github.com/rank-and-file/inspect_ai_vllm_stdout.git + cd inspect_ai_vllm_stdout + uv pip install --system --no-cache . + +%environment + export PATH="/root/.local/bin:$PATH" + export NO_PROXY="localhost,127.0.0.1" + export no_proxy="localhost,127.0.0.1" + +%runscript + exec python3 "$@" + +%labels + Version v1.0 + Description Python ML container with CUDA support for transformers and LLM training (using uv) + AI CLI tools + +%help + Note: Use the --nv flag to enable NVIDIA GPU support when running the container. 
diff --git a/dev_utils/extract_traces.py b/dev_utils/extract_traces.py index 74e2e7c..ae4affe 100644 --- a/dev_utils/extract_traces.py +++ b/dev_utils/extract_traces.py @@ -153,6 +153,11 @@ def main(): nargs="+", help="Input directory names (relative to RESULTS_BASE) to process" ) + parser.add_argument( + "--all", + action="store_true", + help="Copy all runs, not just the latest per task (default: latest only)" + ) args = parser.parse_args() output_base = Path(OUTPUT_DIR) @@ -175,8 +180,12 @@ def main(): print(f"\n[{input_dir_name}]") - # Iterate over only the latest subdirectories (highest ID per prefix) - for subdir in sorted(get_latest_subdirs(input_dir)): + # Iterate over subdirectories (latest per task by default, all with --all) + if args.all: + subdirs = sorted(d for d in input_dir.iterdir() if d.is_dir()) + else: + subdirs = sorted(get_latest_subdirs(input_dir)) + for subdir in subdirs: # Determine source file (prefer solve_parsed.txt) src_file = subdir / "solve_parsed.txt" if not src_file.exists(): @@ -201,6 +210,7 @@ def main(): copy_other_files(subdir, dest_dir, 'contamination_judgement.txt', api_keys=api_keys) copy_other_files(subdir, dest_dir, 'disallowed_model_judgement.txt', api_keys=api_keys) copy_other_files(subdir, dest_dir, 'error.log', 'judgement.log', api_keys=api_keys) + copy_other_files(subdir, dest_dir, 'time_taken.txt', api_keys=api_keys) copy_other_files(subdir, dest_dir, 'system_monitor.log', api_keys=api_keys, optional=True) tag = " [sanitized]" if was_sanitized else "" diff --git a/dev_utils/limit_hit_list.py b/dev_utils/limit_hit_list.py index 12f5809..9bfa144 100644 --- a/dev_utils/limit_hit_list.py +++ b/dev_utils/limit_hit_list.py @@ -10,11 +10,13 @@ "You've hit your limit", # Claude Code Pro subscription limit "spending_limit", # Anthropic/OpenAI spending limit "billing_hard_limit", # OpenAI billing hard limit - "insufficient_quota", # OpenAI quota exceeded + "insufficient_quota", # OpenAI quota exceeded (structured error code) + 
"Quota exceeded. Check your plan", # OpenAI/Codex quota exceeded (turn.failed message) "budget_exceeded", # General budget error "plan does not yet include", # Z.AI subscription plan restriction "token_expired", # OpenAI/Codex expired auth token "Failed to refresh token", # Codex CLI refresh token failure + "Reconnecting... 5/5", # Codex CLI exhausted stream-reconnect retries ] diff --git a/dev_utils/terminated_finder.py b/dev_utils/terminated_finder.py index f7af378..90e21fe 100644 --- a/dev_utils/terminated_finder.py +++ b/dev_utils/terminated_finder.py @@ -11,19 +11,23 @@ def get_results_dir(): return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results") +KILLED_RE = re.compile(rb"run_task\.sh: line \d+: \d+ Killed") + + def classify_error(error_log_path: Path) -> str | None: """Classify the error in error.log. Returns 'terminated', 'killed', or None.""" if not error_log_path.exists(): return None try: - content = error_log_path.read_text() - if content.startswith("Terminated"): - return "terminated" - if re.search(r"\bKilled\b", content): - return "killed" - return None + with open(error_log_path, "rb") as f: + head = f.read(4096) except Exception: return None + if head.startswith(b"Terminated"): + return "terminated" + if KILLED_RE.search(head): + return "killed" + return None def get_latest_runs(method_path: Path): diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..4f08e87 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,173 @@ +# scripts + +Post-hoc analysis utilities for PostTrainBench result directories. Most scripts +here read the contents of `$POST_TRAIN_BENCH_RESULTS_DIR` and produce CSV / +JSON aggregates; the exception is `rerun_eval_n_times.sh`, which actually +re-runs the model on a GPU. 
+ +## Aggregating results into CSVs + +The recommended pipeline is two scripts: `collect.py` reads raw run dirs into +per-method CSVs, then `aggregate.py` rolls those into per-agent avg/std and +the weighted leaderboard metric. Together they replace ~10 older scripts +(`aggregate_methods.py`, `aggregate_final.py`, `aggregate_contamination.py`, +`aggregate_time.py`, `compute_single_metrics*.py`, etc., still kept in the +tree for reference — see "Legacy per-stage scripts" below). + +### Typical flow + +From the repo root, with `POST_TRAIN_BENCH_RESULTS_DIR` pointing at the raw +results tree: + +```bash +# 1. Collect raw per-run data into per-method CSVs. +# Reads metrics.json + contamination/disallowed_model judgements + time_taken.txt, +# applies baseline-zeroshot fallback for contaminated/errored cells. +# Writes: +# final_{method}.csv — score grid (model x benchmark) with fallback +# contamination_{method}.csv — flags ("", "C", "M", "MC", or error string) +# time_overview.csv — average wall time per method +python scripts/collect.py + +# 2. Aggregate across runs/agents and compute the weighted leaderboard metric. +# Reads final_{method}.csv produced above. Writes: +# aggregated_avg_{agent}.csv — per-cell mean across runs (one per multi-run agent) +# aggregated_std_{agent}.csv — per-cell sample stddev (n-1) +# single_metrics.csv — weighted score per individual run +# single_metrics_aggregated.csv — agent-level avg/std/n on the weighted metric +# time_aggregated.csv — agent-level avg/std wall time +python scripts/aggregate.py +``` + +`aggregate.py` skips agents whose run CSVs aren't present in this results +dir, so it's safe to run against a partial tree. 
+ +### `collect.py` flags + +```bash +python scripts/collect.py \ + --data-dir /path/to/results \ # default: $POST_TRAIN_BENCH_RESULTS_DIR + --output-dir /path/to/out \ # default: same as --data-dir + --min-run-id 17000000 \ # inclusive lower bound on cluster_id + --max-run-id 17200000 # exclusive upper bound on cluster_id +``` + +### `aggregate.py` flags + +By default `--all` is implied (write everything). Use the flags below to +restrict to one or more stages (the flags can be combined): + +```bash +python scripts/aggregate.py --per-cell # only aggregated_avg/std_{agent}.csv +python scripts/aggregate.py --leaderboard # only single_metrics{,_aggregated}.csv +python scripts/aggregate.py --time # only time_aggregated.csv +``` + +Same `--data-dir` / `--output-dir` flags as `collect.py`. + +### Hardcoded things + +| File | What it pins | +|---|---| +| `constants.py` (`HARDCODED_AGENT_MAP`) | Which run directories belong to which agent (multi-run agents are how stddev is computed) | +| `constants.py` (`HARDCODED_BENCHMARKS`, `EXPECTED_MODELS`) | Benchmark + base-model lists | +| `factors.json` | Per-benchmark weights for the weighted leaderboard metric | +| `baselines.json` | Hardcoded zero-shot + few-shot baseline scores; used as fallback for contaminated/errored cells (no longer recomputed at every run) | + +To add a new agent: add its run-dir names to `HARDCODED_AGENT_MAP` in +`constants.py`. To add a new benchmark: extend `HARDCODED_BENCHMARKS` and add +a weight to `factors.json`. + +### `verify.py` (refactor regression check) + +`verify.py` is a one-off script that was used when the new pipeline was +rolled out. It compares two CSV output dirs cell-by-cell, with a float +tolerance, to confirm that the new pipeline reproduces the old one's values +(apart from filename renames). Not part of the normal workflow. 
+ +```bash +python scripts/verify.py \ + --ground-truth /fast/.../ptb_results_old \ + --new-output /fast/.../ptb_results_new +``` + +## Other helpers + +| Script | Description | +|---|---| +| `compute_claude_costs.py` | Claude API spend rollup | +| `extract_token_usage.py` | Token-usage extraction from agent traces | +| `migrate_judgement_files.py` | One-off: migrate older judgement file naming | +| `list_safetensors.py` | List safetensors files under a result tree | +| `parse_all_to_human_readable.sh` | Run human-readable trace parsers across results | +| `baselines.json`, `factors.json`, `constants.py`, `utils.py` | Shared config / helpers | + +## Legacy per-stage scripts + +These predate `collect.py` + `aggregate.py` and are kept for reference; the +two new scripts cover what they did with less duplication. Prefer the new +pipeline for fresh analysis. + +| Script | Replaced by | +|---|---| +| `aggregate_methods.py` | `collect.py` | +| `aggregate_contamination.py` | `collect.py` | +| `aggregate_final.py` | `collect.py` | +| `aggregate_time.py` | `collect.py` (writes `time_overview.csv`) | +| `aggregate_avg_stddev.py` | `aggregate.py --per-cell` | +| `aggregate_avg_stddev_over_benchmarks.py` | (dropped — deprecated artifact) | +| `aggregate_time_avg_stddev.py` | `aggregate.py --time` | +| `aggregate_time_baselines.py` | (dropped — baselines hardcoded) | +| `aggregate_summary.py`, `aggregate_together.py` | `aggregate.py --leaderboard` | +| `compute_single_metrics.py`, `compute_single_metrics_avg_stddev.py` | `aggregate.py --leaderboard` | +| `compute_baseline_metrics.py`, `compute_baseline_metrics_by_benchmark.py` | `baselines.json` (no longer recomputed) | + + +## Re-evaluating a finished run N times + +`rerun_eval_n_times.sh` re-evaluates a job's `final_model/` N times and writes +mean / std / stderr / min / max per metric into `metrics_averaged.json`. 
Useful +because each job's standard `metrics.json` is a single decoding sample per +question and does not capture decoding noise. + +It mirrors `src/run_task.sh`'s evaluation step exactly: + +- runs `src/eval/tasks/<task>/evaluate.py` (the live source — **not** the + potentially-modified snapshot in `<EVAL_DIR>/task/`) +- inside the same `${POST_TRAIN_BENCH_CONTAINER_NAME}.sif` container +- with the same fuse-overlayfs HF cache pattern (`with_huggingface_overlay`) +- using the same `--max-tokens` fallback ladder per task + +Per-run JSONs are written to `<EVAL_DIR>/reruns/run_{i}.json` (with +`run_{i}_{level}.log` alongside). The aggregated file is `<EVAL_DIR>/metrics_averaged.json`. + +### Files + +| File | Description | +|---|---| +| `rerun_eval_n_times.sh` | Driver: re-runs `evaluate.py` N times on one EVAL_DIR and aggregates | +| `aggregate_metrics_runs.py` | Helper called by the driver: computes mean/std/stderr/min/max from per-run JSONs | +| `../src/commit_utils/rerun_eval.sub` | HTCondor submission file | + +### Usage + +#### Locally on a GPU node + +From the repo root: + +```bash +scripts/rerun_eval_n_times.sh /path/to/EVAL_DIR 5 +``` + +`EVAL_DIR` must be an existing job directory containing `final_model/`. The +task name is parsed from the basename (`<task>_<model>_<cluster_id>`) to +pick the correct max-tokens fallback ladder. + +#### HTCondor + +```bash +condor_submit_bid 50 \ + -a "eval_dir=/path/to/EVAL_DIR" \ + -a "n=5" \ + src/commit_utils/rerun_eval.sub +``` diff --git a/scripts/aggregate.py b/scripts/aggregate.py new file mode 100644 index 0000000..b1b2ee9 --- /dev/null +++ b/scripts/aggregate.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 +""" +Aggregate results across multiple runs per agent. 
+ +# For comparing to previous version: +Replaces: aggregate_avg_stddev.py, aggregate_avg_stddev_over_benchmarks.py, + compute_single_metrics.py, compute_single_metrics_avg_stddev.py, + aggregate_time_avg_stddev.py, aggregate_summary.py, + aggregate_together.py, compute_baseline_metrics.py, + compute_baseline_metrics_by_benchmark.py + +Reads final_{method}.csv files produced by collect.py and computes: + --per-cell : aggregated_avg_{agent}.csv, aggregated_std_{agent}.csv + --leaderboard : single_metrics.csv, single_metrics_aggregated.csv + --time : time_aggregated.csv + --all : everything (default) + +Usage: + python aggregate.py + python aggregate.py --data-dir /path/to/results --output-dir /path/to/output + python aggregate.py --per-cell --leaderboard +""" +import argparse +import csv +import os +import re + +from utils import ( + get_results_dir, + load_csv_as_dict, + write_csv, + load_factors, + mean, + stddev, + is_number, + format_time_hms, + HARDCODED_AGENT_MAP, + HARDCODED_BENCHMARKS, + EXPECTED_MODELS, +) + + +# --------------------------------------------------------------------------- +# Per-cell avg/std across runs (replaces aggregate_avg_stddev.py) +# --------------------------------------------------------------------------- + +def aggregate_per_cell( + agent_name: str, + method_names: list[str], + data_dir: str, + output_dir: str, +): + """ + For each (model, benchmark) cell, compute mean and sample stddev + across the runs. Write aggregated_avg_{agent}.csv and aggregated_std_{agent}.csv. 
+ """ + all_data = [] + all_models = None + + for method_name in method_names: + csv_path = os.path.join(data_dir, f"final_{method_name}.csv") + data, _ = load_csv_as_dict(csv_path) + + models = sorted(data.keys()) + if all_models is None: + all_models = models + else: + assert all_models == models, ( + f"Model mismatch for {method_name}: " + f"expected {all_models}, got {models}" + ) + all_data.append(data) + + avg_data = {} + std_data = {} + + for model in all_models: + avg_data[model] = {} + std_data[model] = {} + + for bench in HARDCODED_BENCHMARKS: + values = [] + for data in all_data: + values.append(float(data[model][bench])) + + avg_data[model][bench] = str(mean(values)) + std_data[model][bench] = str(stddev(values)) + + avg_path = os.path.join(output_dir, f"aggregated_avg_{agent_name}.csv") + write_csv(avg_path, all_models, HARDCODED_BENCHMARKS, avg_data) + print(f"Written: {avg_path}") + + std_path = os.path.join(output_dir, f"aggregated_std_{agent_name}.csv") + write_csv(std_path, all_models, HARDCODED_BENCHMARKS, std_data) + print(f"Written: {std_path}") + + return avg_data, std_data + + +# --------------------------------------------------------------------------- +# Weighted single metric (replaces compute_single_metrics*.py) +# --------------------------------------------------------------------------- + +def compute_weighted_metric( + data: dict[str, dict[str, str]], + factors: dict[str, float], +) -> float: + """ + Compute weighted sum: for each benchmark, average across models, + multiply by factor, sum. 
+ """ + valid_benchmarks = set(factors.keys()) + total = 0.0 + num_models = len(data) + for bench in sorted(valid_benchmarks): + values = [] + for model in data: + val_str = data[model].get(bench, "") + if val_str == "": + continue + values.append(float(val_str)) + if values: + avg_value = sum(values) / num_models + total += avg_value * factors[bench] + return total + + +def aggregate_leaderboard(data_dir: str, output_dir: str): + """ + Compute weighted metric for every final_*.csv that has all expected models. + Then group by HARDCODED_AGENT_MAP for avg/std. + + Also writes final_avg_{agent}.csv and final_std_{agent}.csv (identical to + aggregated_ versions) so their metrics appear in single_metrics.csv. + """ + factors = load_factors() + valid_benchmarks = set(factors.keys()) + + # Phase 1: compute per-cell avg/std and write final_avg/std files + # so they get picked up in the metric scan below + for agent_name, method_names in HARDCODED_AGENT_MAP.items(): + avg_data, std_data = _load_avg_std_for_agent( + agent_name, method_names, data_dir + ) + if avg_data is not None: + # Write final_avg_{agent}.csv (identical to aggregated_avg_) + avg_path = os.path.join(output_dir, f"final_avg_{agent_name}.csv") + write_csv( + avg_path, + sorted(avg_data.keys()), + HARDCODED_BENCHMARKS, + avg_data, + ) + std_path = os.path.join(output_dir, f"final_std_{agent_name}.csv") + write_csv( + std_path, + sorted(std_data.keys()), + HARDCODED_BENCHMARKS, + std_data, + ) + + # Phase 2: compute metrics for ALL final_*.csv files in the output dir + all_metrics = {} + + for filename in os.listdir(output_dir): + if not filename.startswith("final_"): + continue + if not filename.endswith(".csv"): + continue + if filename.startswith("final_time_"): + continue + + csv_path = os.path.join(output_dir, filename) + try: + data, _ = load_csv_as_dict(csv_path) + except Exception: + print(f"Warning: could not load {csv_path}.") + raise + + if set(data.keys()) != EXPECTED_MODELS: + continue + + 
method_name = filename[len("final_"):-len(".csv")] + all_metrics[method_name] = compute_weighted_metric(data, factors) + + # Write individual metrics + metrics_path = os.path.join(output_dir, "single_metrics.csv") + with open(metrics_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["method", "metric"]) + for method_name in sorted(all_metrics.keys()): + writer.writerow([method_name, all_metrics[method_name]]) + print(f"Written: {metrics_path}") + + # Compute aggregated metrics per agent group + aggregated_path = os.path.join(output_dir, "single_metrics_aggregated.csv") + with open(aggregated_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["agent", "avg", "std", "n"]) + for agent_name in sorted(HARDCODED_AGENT_MAP.keys()): + method_names = HARDCODED_AGENT_MAP[agent_name] + # Skip agents with missing runs + if not all(m in all_metrics for m in method_names): + print(f"Skipping agent {agent_name} in leaderboard: missing metrics") + continue + metrics = [all_metrics[m] for m in method_names] + writer.writerow([ + agent_name, + mean(metrics), + stddev(metrics), + len(metrics), + ]) + print(f"Written: {aggregated_path}") + + +def _load_avg_std_for_agent( + agent_name: str, + method_names: list[str], + data_dir: str, +) -> tuple[dict | None, dict | None]: + """Load final_*.csv for each run and compute per-cell avg/std.""" + all_data = [] + all_models = None + + for method_name in method_names: + csv_path = os.path.join(data_dir, f"final_{method_name}.csv") + if not os.path.exists(csv_path): + return None, None + data, _ = load_csv_as_dict(csv_path) + models = sorted(data.keys()) + if all_models is None: + all_models = models + all_data.append(data) + + avg_data = {} + std_data = {} + for model in all_models: + avg_data[model] = {} + std_data[model] = {} + for bench in HARDCODED_BENCHMARKS: + values = [float(d[model][bench]) for d in all_data] + avg_data[model][bench] = str(mean(values)) + std_data[model][bench] = 
str(stddev(values)) + + return avg_data, std_data + + +# --------------------------------------------------------------------------- +# Time aggregation (replaces aggregate_time_avg_stddev.py) +# --------------------------------------------------------------------------- + +def parse_time_to_hours(time_str: str) -> float: + """Parse time string like '8:17:28' to hours as float.""" + parts = time_str.split(":") + hours = int(parts[0]) + minutes = int(parts[1]) + seconds = int(parts[2]) + return hours + minutes / 60 + seconds / 3600 + + +def aggregate_time(data_dir: str, output_dir: str): + """ + Read time_overview.csv, group by HARDCODED_AGENT_MAP, compute avg/std. + Write time_aggregated.csv. + """ + # Try new name first, fall back to old name + time_csv_path = os.path.join(data_dir, "time_overview.csv") + if not os.path.exists(time_csv_path): + time_csv_path = os.path.join(data_dir, "aggregated_time_overview.csv") + + time_data = {} + with open(time_csv_path, "r", newline="") as f: + reader = csv.DictReader(f) + for row in reader: + method = row["method"] + avg_time = row["average_time"] + if avg_time and avg_time != "N/A": + time_data[method] = parse_time_to_hours(avg_time) + + output_path = os.path.join(output_dir, "time_aggregated.csv") + with open(output_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["agent", "avg_time", "std_time", "n"]) + for agent_name, method_names in HARDCODED_AGENT_MAP.items(): + if not all(m in time_data for m in method_names): + print(f"Skipping agent {agent_name} in time: missing data") + continue + hours_list = [time_data[m] for m in method_names] + writer.writerow([ + agent_name, + format_time_hms(int(mean(hours_list) * 3600)), + format_time_hms(int(stddev(hours_list) * 3600)), + len(hours_list), + ]) + print(f"Written: {output_path}") + + +def _all_finals_exist(method_names: list[str], data_dir: str) -> bool: + """Check if all final_*.csv files exist for the given methods.""" + return all( + 
os.path.exists(os.path.join(data_dir, f"final_{m}.csv")) + for m in method_names + ) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def parse_args(): + parser = argparse.ArgumentParser( + description="Aggregate results across multiple runs per agent." + ) + parser.add_argument( + "--data-dir", + default=None, + help="Directory containing final_*.csv files (from collect.py). " + "Defaults to POST_TRAIN_BENCH_RESULTS_DIR or 'results'.", + ) + parser.add_argument( + "--output-dir", + default=None, + help="Directory to write output CSVs. Defaults to same as --data-dir.", + ) + parser.add_argument("--per-cell", action="store_true", + help="Write per-cell avg/std CSVs per agent.") + parser.add_argument("--leaderboard", action="store_true", + help="Write single_metrics.csv and single_metrics_aggregated.csv.") + parser.add_argument("--time", action="store_true", + help="Write time_aggregated.csv.") + parser.add_argument("--all", action="store_true", + help="Write everything (default if no flags given).") + return parser.parse_args() + + +def main(): + args = parse_args() + + data_dir = args.data_dir or get_results_dir() + output_dir = args.output_dir or data_dir + + os.makedirs(output_dir, exist_ok=True) + + do_all = args.all or not (args.per_cell or args.leaderboard or args.time) + + if do_all or args.per_cell: + for agent_name, method_names in HARDCODED_AGENT_MAP.items(): + # Skip agents whose run data isn't available + if not _all_finals_exist(method_names, data_dir): + print(f"Skipping agent {agent_name}: missing final CSVs") + continue + print(f"Processing agent: {agent_name}") + aggregate_per_cell(agent_name, method_names, data_dir, output_dir) + + if do_all or args.leaderboard: + aggregate_leaderboard(data_dir, output_dir) + + if do_all or args.time: + aggregate_time(data_dir, output_dir) + + +if __name__ == "__main__": + main() diff --git 
a/scripts/aggregate_metrics_runs.py b/scripts/aggregate_metrics_runs.py new file mode 100755 index 0000000..26ef798 --- /dev/null +++ b/scripts/aggregate_metrics_runs.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +"""Aggregate per-run metrics JSON files into a single metrics_averaged.json. + +Reads every file matching --runs-glob, treats top-level numeric keys as +per-run metric values, and writes mean/std/stderr/min/max per key plus the +raw per-run records and source file list. +""" +from __future__ import annotations + +import argparse +import glob +import json +import math +import sys + + +def _numeric(x: object) -> bool: + return isinstance(x, (int, float)) and not isinstance(x, bool) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--runs-glob", required=True, + help="Glob matching per-run metrics JSON files.") + parser.add_argument("--output", required=True, + help="Path to write the aggregated metrics JSON.") + args = parser.parse_args() + + paths = sorted(glob.glob(args.runs_glob)) + if not paths: + sys.exit(f"no run files matched {args.runs_glob}") + + runs: list[dict] = [] + for path in paths: + with open(path, "r") as f: + runs.append(json.load(f)) + + keys = sorted({k for r in runs for k in r.keys()}) + + aggregated: dict[str, dict[str, float | int]] = {} + for k in keys: + vals = [r[k] for r in runs if k in r and _numeric(r[k])] + if not vals: + continue + mean = sum(vals) / len(vals) + if len(vals) > 1: + variance = sum((x - mean) ** 2 for x in vals) / (len(vals) - 1) + std = math.sqrt(variance) + else: + std = 0.0 + aggregated[k] = { + "mean": mean, + "std": std, + "stderr": std / math.sqrt(len(vals)), + "min": min(vals), + "max": max(vals), + "n": len(vals), + } + + out = { + "n_runs": len(runs), + "metrics": aggregated, + "per_run": runs, + "run_files": paths, + } + + with open(args.output, "w") as f: + json.dump(out, f, indent=2) + + +if __name__ == "__main__": + main() diff --git a/scripts/baselines.json 
b/scripts/baselines.json new file mode 100644 index 0000000..d80799c --- /dev/null +++ b/scripts/baselines.json @@ -0,0 +1,114 @@ +{ + "zeroshot": { + "Qwen3-1.7B": { + "aime2025": 0.26666666666666666, + "arenahardwriting": 0.5, + "bfcl": 0.94, + "gpqamain": 0.3549107142857143, + "gsm8k": 0.8847611827141774, + "healthbench": 0.44918867035528026, + "humaneval": 0.6890243902439024 + }, + "Qwen3-1.7B-Base": { + "aime2025": 0.0, + "arenahardwriting": 0.009142053445850914, + "bfcl": 0.0, + "gpqamain": 0.140625, + "gsm8k": 0.12661106899166036, + "healthbench": 0.07537565969807473, + "humaneval": 0.07926829268292683 + }, + "Qwen3-4B": { + "aime2025": 0.5333333333333333, + "arenahardwriting": 0.8683943089430894, + "bfcl": 0.95, + "gpqamain": 0.44642857142857145, + "gsm8k": 0.9378316906747536, + "healthbench": 0.5272399437524256, + "humaneval": 0.774390243902439 + }, + "Qwen3-4B-Base": { + "aime2025": 0.03333333333333333, + "arenahardwriting": 0.03417533432392273, + "bfcl": 0.0, + "gpqamain": 0.13392857142857142, + "gsm8k": 0.4184988627748294, + "healthbench": 0.13383521639663787, + "humaneval": 0.36585365853658536 + }, + "SmolLM3-3B": { + "aime2025": 0.26666666666666666, + "arenahardwriting": 0.492, + "bfcl": 0.84, + "gpqamain": 0.3325892857142857, + "gsm8k": 0.8218347232752085, + "healthbench": 0.2957717718639611, + "humaneval": 0.7012195121951219 + }, + "SmolLM3-3B-Base": { + "aime2025": 0.03333333333333333, + "arenahardwriting": 0.004225352112676056, + "bfcl": 0.0, + "gpqamain": 0.049107142857142856, + "gsm8k": 0.21076573161485973, + "healthbench": 0.0, + "humaneval": 0.06097560975609756 + }, + "gemma-3-4b-it": { + "aime2025": 0.1, + "arenahardwriting": 0.948, + "bfcl": 0.67, + "gpqamain": 0.31473214285714285, + "gsm8k": 0.8354814253222138, + "healthbench": 0.46063396051286026, + "humaneval": 0.6951219512195121 + }, + "gemma-3-4b-pt": { + "aime2025": 0.0, + "arenahardwriting": 0.0028530670470756064, + "bfcl": 0.06, + "gpqamain": 0.015625, + "gsm8k": 0.06141015921152388, 
+ "healthbench": 0.17039403723633986, + "humaneval": 0.006097560975609756 + } + }, + "fewshot": { + "Qwen3-1.7B-Base": { + "aime2025": 0.05333333333333333, + "arenahardwriting": 0.05314625850340136, + "bfcl": 0.0, + "gpqamain": 0.25959821428571417, + "gsm8k": 0.46679302501895537, + "healthbench": 0.2110110691560308, + "humaneval": 0.25243902439024396 + }, + "Qwen3-4B-Base": { + "aime2025": 0.09000000000000001, + "arenahardwriting": 0.19168260038240917, + "bfcl": 0.0, + "gpqamain": 0.29888392857142837, + "gsm8k": 0.7438210765731573, + "healthbench": 0.2179351466647625, + "humaneval": 0.6774390243902438 + }, + "SmolLM3-3B-Base": { + "aime2025": 0.06000000000000001, + "arenahardwriting": 0.03248811410459588, + "bfcl": 0.0, + "gpqamain": 0.13236607142857182, + "gsm8k": 0.5298711144806676, + "healthbench": 0.10165123092180756, + "humaneval": 0.3237804878048783 + }, + "gemma-3-4b-pt": { + "aime2025": 0.0, + "arenahardwriting": 0.01257396449704142, + "bfcl": 0.06699999999999998, + "gpqamain": 0.21406249999999985, + "gsm8k": 0.0583775587566339, + "healthbench": 0.23317845064882012, + "humaneval": 0.004878048780487805 + } + } +} diff --git a/scripts/collect.py b/scripts/collect.py new file mode 100644 index 0000000..2bc33f1 --- /dev/null +++ b/scripts/collect.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +""" +Collect results from raw run directories into per-method CSVs. + +# For comparing to previous version: +Replaces: aggregate_methods.py, aggregate_contamination.py, + aggregate_final.py, aggregate_time.py + +For each method directory in the results dir, does a single pass: + 1. Finds the latest run per (benchmark, model) + 2. Reads metrics.json, contamination files, and time_taken.txt + 3. Applies baseline fallback for contaminated or errored cells + 4. Writes final_{method}.csv, contamination_{method}.csv + +Also writes a time_overview.csv summarising average time per method. 
+ +Usage: + python collect.py + python collect.py --data-dir /path/to/results --output-dir /path/to/output + python collect.py --min-run-id 100 --max-run-id 200 +""" +import argparse +import csv +import os + +from utils import ( + get_results_dir, + get_baseline_fallback_data, + walk_latest_runs, + load_metrics, + load_contamination, + load_disallowed_model, + combine_contamination_results, + load_time_taken, + is_number, + format_time_hms, + BUDGET_SECONDS, +) + +# Directories to skip (baselines are hardcoded in baselines.json) +SKIP_METHODS = {"baseline", "baseline_zeroshot"} + + +def collect_method( + method_path: str, + method_name: str, + baseline_data: dict[str, dict[str, str]], + output_dir: str, + min_run_id: int | None = None, + max_run_id: int | None = None, +) -> dict | None: + """ + Collect results for one method directory. + + Writes: + - final_{method_name}.csv (scores with baseline fallback) + - contamination_{method_name}.csv (contamination flags) + + Returns time stats dict {"total_seconds": int, "valid_count": int} + or None if no runs found. 
+ """ + latest_runs = walk_latest_runs(method_path, min_run_id, max_run_id) + if not latest_runs: + return None + + benchmarks = sorted({b for b, m in latest_runs}) + models = sorted({m for b, m in latest_runs}) + + # Collect metrics, contamination, and time in one pass + metrics_grid = {} # {model: {bench: str}} + contamination_grid = {} # {model: {bench: str}} + time_total_seconds = 0 + time_valid_count = 0 + + for model in models: + metrics_grid[model] = {} + contamination_grid[model] = {} + + for bench in benchmarks: + key = (bench, model) + if key not in latest_runs: + metrics_grid[model][bench] = "" + contamination_grid[model][bench] = "" + continue + + run_dir = latest_runs[key]["path"] + + # Metrics + metrics_path = os.path.join(run_dir, "metrics.json") + metrics_grid[model][bench] = load_metrics(metrics_path, method_name) + + # Contamination + contamination = load_contamination( + os.path.join(run_dir, "contamination_judgement.txt") + ) + disallowed = load_disallowed_model( + os.path.join(run_dir, "disallowed_model_judgement.txt") + ) + contamination_grid[model][bench] = combine_contamination_results( + contamination, disallowed + ) + + # Time + _, seconds = load_time_taken(run_dir) + if seconds is not None: + time_total_seconds += seconds + time_valid_count += 1 + + # Write contamination CSV + contamination_path = os.path.join( + output_dir, f"contamination_{method_name}.csv" + ) + with open(contamination_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["model"] + benchmarks) + for model in models: + row = [model] + for bench in benchmarks: + row.append(contamination_grid[model][bench]) + writer.writerow(row) + print(f"Written: {contamination_path}") + + # Apply baseline fallback: replace cell with baseline if + # (a) value is not a number, OR + # (b) contamination flag is non-empty + for model in models: + for bench in benchmarks: + value = metrics_grid[model][bench] + contamination_value = contamination_grid[model][bench] + + 
needs_baseline = False + if not is_number(value): + needs_baseline = True + if contamination_value.strip(): + needs_baseline = True + + if needs_baseline: + metrics_grid[model][bench] = baseline_data.get(model, {}).get( + bench, "" + ) + + # Write final CSV (scores with baseline fallback applied) + final_path = os.path.join(output_dir, f"final_{method_name}.csv") + with open(final_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["model"] + benchmarks) + for model in models: + row = [model] + for bench in benchmarks: + row.append(metrics_grid[model].get(bench, "")) + writer.writerow(row) + print(f"Written: {final_path}") + + return { + "total_seconds": time_total_seconds, + "valid_count": time_valid_count, + } + + +def write_time_overview(method_stats: dict[str, dict], output_dir: str): + """Write time_overview.csv with average time per method.""" + csv_path = os.path.join(output_dir, "time_overview.csv") + + with open(csv_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["method", "average_time", "percentage"]) + + for method_name in sorted(method_stats.keys()): + stats = method_stats[method_name] + total_secs = stats["total_seconds"] + valid = stats["valid_count"] + + if valid > 0: + avg_secs = total_secs // valid + avg_str = format_time_hms(avg_secs) + pct = (avg_secs / BUDGET_SECONDS) * 100 + pct_str = f"{pct:.1f}%" + else: + avg_str = "N/A" + pct_str = "N/A" + + writer.writerow([method_name, avg_str, pct_str]) + + print(f"Written: {csv_path}") + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Collect raw results into per-method CSVs." + ) + parser.add_argument( + "--data-dir", + default=None, + help="Directory containing method subdirectories with raw run data. " + "Defaults to POST_TRAIN_BENCH_RESULTS_DIR or 'results'.", + ) + parser.add_argument( + "--output-dir", + default=None, + help="Directory to write output CSVs. 
Defaults to same as --data-dir.", + ) + parser.add_argument( + "--min-run-id", + type=int, + default=None, + help="Inclusive lower bound for run IDs to consider.", + ) + parser.add_argument( + "--max-run-id", + type=int, + default=None, + help="Exclusive upper bound for run IDs to consider.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + data_dir = args.data_dir or get_results_dir() + output_dir = args.output_dir or data_dir + + os.makedirs(output_dir, exist_ok=True) + + # Load baseline data for fallback (hardcoded in baselines.json) + baseline_data = get_baseline_fallback_data() + + method_stats = {} + + for method_name in sorted(os.listdir(data_dir)): + method_path = os.path.join(data_dir, method_name) + if not os.path.isdir(method_path): + continue + + # Skip baseline directories — their values are hardcoded + if method_name in SKIP_METHODS: + continue + + stats = collect_method( + method_path, + method_name, + baseline_data, + output_dir, + min_run_id=args.min_run_id, + max_run_id=args.max_run_id, + ) + if stats: + method_stats[method_name] = stats + + if method_stats: + write_time_overview(method_stats, output_dir) + + +if __name__ == "__main__": + main() diff --git a/scripts/constants.py b/scripts/constants.py index c1dbd61..c45bcef 100644 --- a/scripts/constants.py +++ b/scripts/constants.py @@ -62,7 +62,13 @@ "claude_non_api_claude-opus-4-6_1m__10h_run1", "claude_non_api_claude-opus-4-6_1m__10h_run2", "claude_non_api_claude-opus-4-6_1m__10h_run3" - ] + ], + + "Opus-4.7":[ + "claude_non_api_claude-opus-4-7_10h", + "claude_non_api_claude-opus-4-7_10h_run2", + "claude_non_api_claude-opus-4-7_10h_run3" + ] } diff --git a/scripts/rerun_eval_n_times.sh b/scripts/rerun_eval_n_times.sh new file mode 100755 index 0000000..c3b8926 --- /dev/null +++ b/scripts/rerun_eval_n_times.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# Re-run the per-task evaluate.py N times on an already-finished EVAL_DIR +# and aggregate per-run metrics into 
/metrics_averaged.json. +# +# Usage: +# scripts/rerun_eval_n_times.sh [N] +# +# Defaults: N=5. +# +# Mirrors run_task.sh's evaluation step: runs src/eval/tasks//evaluate.py +# (NOT the snapshot in $EVAL_DIR/task) under the same vllm_debug container with +# the same fuse-overlayfs HF cache and the same max-tokens fallback ladder. +# +# Run from the repo root, on a node with GPUs (submit via +# src/commit_utils/rerun_eval.sub for cluster execution). +set -euo pipefail + +if [ "$#" -lt 1 ]; then + echo "usage: $0 [N]" >&2 + exit 1 +fi + +EVAL_DIR="$(realpath "$1")" +N="${2:-5}" + +if [ ! -d "$EVAL_DIR/final_model" ]; then + echo "ERROR: $EVAL_DIR/final_model not found" >&2 + exit 1 +fi + +source src/commit_utils/set_env_vars.sh + +# Derive the task name from the EVAL_DIR basename: __. +EVAL_BASENAME="$(basename "$EVAL_DIR")" +EVALUATION_TASK="${EVAL_BASENAME%%_*}" + +if [ ! -f "src/eval/tasks/${EVALUATION_TASK}/evaluate.py" ]; then + echo "ERROR: src/eval/tasks/${EVALUATION_TASK}/evaluate.py not found" >&2 + echo " (parsed task '${EVALUATION_TASK}' from $(basename "$EVAL_DIR"))" >&2 + exit 1 +fi + +REPO_ROOT="$(pwd)" +RERUNS_DIR="$EVAL_DIR/reruns" +mkdir -p "$RERUNS_DIR" + +# Per-task max-tokens fallback ladder, mirroring run_task.sh. +case "$EVALUATION_TASK" in + aime2025) FB1="--max-tokens 12000"; FB2="--max-tokens 8000" ;; + arenahardwriting) FB1="--max-new-tokens 12288"; FB2="--max-new-tokens 8192" ;; + bfcl) FB1="--max-tokens 12000"; FB2="--max-tokens 8000" ;; + gpqamain) FB1="--max-tokens 12000"; FB2="--max-tokens 8000" ;; + gsm8k) FB1="--max-tokens 3000"; FB2="--max-tokens 2000" ;; + healthbench) FB1="--max-new-tokens 12288"; FB2="--max-new-tokens 8192" ;; + humaneval) FB1="--max-tokens 3000"; FB2="--max-tokens 2000" ;; + *) FB1=""; FB2="" ;; +esac + +# Fuse-overlayfs HF cache so reruns don't pollute the shared HF cache, +# matching run_task.sh's with_huggingface_overlay helper. 
+TMP_SUBDIR="/tmp/rerun_eval_$$" +HF_MERGED="${TMP_SUBDIR}/merged_huggingface" +TMP_HF_CACHE="/tmp/hf_cache_rerun_$$" + +setup_overlay() { + mkdir -p "${TMP_SUBDIR}/upper_huggingface" + mkdir -p "${TMP_SUBDIR}/fuse_workdir" + mkdir -p "${HF_MERGED}" + fuse-overlayfs -o \ + "lowerdir=${HF_HOME},upperdir=${TMP_SUBDIR}/upper_huggingface,workdir=${TMP_SUBDIR}/fuse_workdir" \ + "${HF_MERGED}" +} + +teardown_overlay() { + fusermount -u "${HF_MERGED}" 2>/dev/null || true + rm -rf "${TMP_SUBDIR}" 2>/dev/null || true +} +trap teardown_overlay EXIT + +setup_overlay + +run_one() { + local out_json="$1" + local extra="$2" + local log="$3" + + nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null \ + | xargs -r kill -9 2>/dev/null || true + sleep 5 + + timeout --signal=TERM --kill-after=60s 28800s \ + apptainer exec \ + --nv \ + --env "HF_HOME=${TMP_HF_CACHE}" \ + --env OPENAI_API_KEY="${OPENAI_API_KEY:-}" \ + --env VLLM_API_KEY="inspectai" \ + --env PYTHONNOUSERSITE="1" \ + --writable-tmpfs \ + --bind "${REPO_ROOT}:${REPO_ROOT}" \ + --bind "${HF_MERGED}:${TMP_HF_CACHE}" \ + --pwd "${REPO_ROOT}/src/eval/tasks/${EVALUATION_TASK}" \ + "${POST_TRAIN_BENCH_CONTAINERS_DIR}/${POST_TRAIN_BENCH_CONTAINER_NAME}.sif" \ + python evaluate.py \ + --model-path "${EVAL_DIR}/final_model" \ + --templates-dir ../../../../src/eval/templates \ + --limit -1 \ + ${extra} \ + --json-output-file "${out_json}" >"${log}" 2>&1 +} + +run_with_fallback() { + local out_json="$1" + local log_prefix="$2" + + rm -f "$out_json" + + for level in default fb1 fb2; do + local extra="" + case "$level" in + default) extra="" ;; + fb1) extra="$FB1" ;; + fb2) extra="$FB2" ;; + esac + echo " [$level] extra='${extra}'" + run_one "$out_json" "$extra" "${log_prefix}_${level}.log" || true + if [ -f "$out_json" ]; then + return 0 + fi + done + return 1 +} + +echo "EVAL_DIR=${EVAL_DIR}" +echo "EVALUATION_TASK=${EVALUATION_TASK}" +echo "N=${N}" + +for i in $(seq 1 "$N"); do + out="${RERUNS_DIR}/run_${i}.json" + 
log_prefix="${RERUNS_DIR}/run_${i}" + echo "=== rerun ${i} / ${N} ===" + if run_with_fallback "$out" "$log_prefix"; then + echo " -> wrote $out" + else + echo " -> FAILED all fallbacks for rerun ${i}" + fi +done + +python scripts/aggregate_metrics_runs.py \ + --runs-glob "${RERUNS_DIR}/run_*.json" \ + --output "${EVAL_DIR}/metrics_averaged.json" + +echo "Wrote ${EVAL_DIR}/metrics_averaged.json" diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 0000000..ab6af6d --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +"""Shared constants and utility functions for aggregation scripts.""" +import csv +import json +import math +import os +import re + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json") +BASELINES_PATH = os.path.join(SCRIPT_DIR, "baselines.json") + +HARDCODED_AGENT_MAP = { + "Opus-4.5": [ + "claude_claude-opus-4-5_10h_final_v3", + "claude_claude-opus-4-5_10h_v5", + "claude_claude-opus-4-5_10h_v6_seed1", + ], + "GPT-5.1-Codex-Max": [ + "codex_gpt-5.1-codex-max_10h_final_v3", + "codex_gpt-5.1-codex-max_10h_v4_seed1", + "codex_gpt-5.1-codex-max_10h_v4_seed2", + ], + "GPT-5.2-Codex": [ + "codex_gpt-5.2-codex_10h_v6", + "codex_gpt-5.2-codex_10h_v6_seed1", + "codex_gpt-5.2-codex_10h_v6_seed2", + ], + "GPT-5.2": [ + "codex_gpt-5.2_10h_v4", + "codex_gpt-5.2_10h_v6_seed1", + "codex_gpt-5.2_10h_v6_seed2", + ], + "Gemini-3-Pro": [ + "gemini_models_gemini-3-pro-preview_10h_final_v3", + "gemini_models_gemini-3-pro-preview_10h_v5", + "gemini_models_gemini-3-pro-preview_10h_v6_seed1", + ], + "GPT-5.1-Codex-Max Low": [ + "codexlow_gpt-5.1-codex-max_10h_v7", + "codexlow_gpt-5.1-codex-max_10h_v7_seed1", + ], + "GPT-5.1-Codex-Max High": [ + "codexhigh_gpt-5.1-codex-max_10h_v7", + 
"codexhigh_gpt-5.1-codex-max_10h_v7_seed1", + ], + "Opus-4.6": [ + "claude_claude-opus-4-6_10h_run1_old_container", + "claude_claude-opus-4-6_10h_run2", + "claude_claude-opus-4-6_10h_run3", + ], + "GPT-5.3-Codex_Med": [ + "codex_non_api_gpt-5.3-codex_10h_run1", + "codex_non_api_gpt-5.3-codex_10h_run2", + "codex_non_api_gpt-5.3-codex_10h_run3", + ], + "Gemini-3.1-Pro": [ + "opencode_opencode_gemini-3.1-pro_10h_run1", + "opencode_opencode_gemini-3.1-pro_10h_run2", + "opencode_opencode_gemini-3.1-pro_10h_run3", + ], + "GPT-5.3-Codex_High": [ + "codex_non_api_high_gpt-5.3-codex_10h_run1", + "codex_non_api_high_gpt-5.3-codex_10h_run2", + "codex_non_api_high_gpt-5.3-codex_10h_run3", + ], + "GPT-5.4-High": [ + "codex_non_api_high_gpt-5.4_10h_run1", + "codex_non_api_high_gpt-5.4_10h_run2", + "codex_non_api_high_gpt-5.4_10h_run3", + ], + "Opus-4.6-1M": [ + "claude_non_api_claude-opus-4-6_1m__10h_run1", + "claude_non_api_claude-opus-4-6_1m__10h_run2", + "claude_non_api_claude-opus-4-6_1m__10h_run3", + ], + "Opus-4.7":[ + "claude_non_api_claude-opus-4-7_10h", + "claude_non_api_claude-opus-4-7_10h_run2", + "claude_non_api_claude-opus-4-7_10h_run3", + ], + "GPT-5.5-xHigh":[ + "codex_non_api_xhigh_gpt-5.5_10h_run1", + "codex_non_api_xhigh_gpt-5.5_10h_run2", + + ] +} + +HARDCODED_BENCHMARKS = [ + "aime2025", + "arenahardwriting", + "bfcl", + "gpqamain", + "gsm8k", + "healthbench", + "humaneval", +] + +EXPECTED_MODELS = { + "Qwen3-1.7B-Base", + "Qwen3-4B-Base", + "SmolLM3-3B-Base", + "gemma-3-4b-pt", +} + +BUDGET_SECONDS = 10 * 3600 # 10 hours + + +def load_factors() -> dict: + with open(FACTORS_PATH, "r") as f: + return json.load(f) + + +def load_baselines() -> dict: + """Load hardcoded baseline data from baselines.json. + + Returns {"zeroshot": {model: {bench: value}}, "fewshot": {...}}. + Values are floats. 
+ """ + with open(BASELINES_PATH, "r") as f: + return json.load(f) + + +def get_baseline_fallback_data() -> dict[str, dict[str, str]]: + """Load zeroshot baselines as {model: {bench: str_value}} for fallback. + + This is the replacement for reading aggregated_baseline_zeroshot.csv. + """ + baselines = load_baselines() + data = {} + for model, benchmarks in baselines["zeroshot"].items(): + data[model] = {bench: str(val) for bench, val in benchmarks.items()} + return data + + +# --------------------------------------------------------------------------- +# Stats +# --------------------------------------------------------------------------- + +def mean(values: list[float]) -> float: + return sum(values) / len(values) + + +def stddev(values: list[float]) -> float: + avg = mean(values) + variance = sum((x - avg) ** 2 for x in values) / (len(values) - 1) + return math.sqrt(variance) + + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- + +def get_results_dir() -> str: + return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results") + + +# --------------------------------------------------------------------------- +# CSV I/O +# --------------------------------------------------------------------------- + +def is_number(value: str) -> bool: + if not value: + return False + try: + float(value) + return True + except ValueError: + return False + + +def load_csv_as_dict(csv_path: str) -> tuple[dict[str, dict[str, str]], list[str]]: + """ + Load a CSV into {model: {benchmark: value}}. + Returns (data, benchmarks). Returns ({}, []) if file doesn't exist. 
+ """ + data = {} + benchmarks = [] + + if not os.path.exists(csv_path): + return data, benchmarks + + with open(csv_path, "r", newline="") as f: + reader = csv.reader(f) + header = next(reader, None) + if not header: + return data, benchmarks + + benchmarks = header[1:] + + for row in reader: + if not row: + continue + model = row[0] + data[model] = {} + for i, bench in enumerate(benchmarks): + if i + 1 < len(row): + data[model][bench] = row[i + 1] + else: + data[model][bench] = "" + + return data, benchmarks + + +def write_csv( + path: str, + models: list[str], + benchmarks: list[str], + data: dict[str, dict[str, str]], +): + with open(path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["model"] + benchmarks) + for model in models: + row = [model] + for bench in benchmarks: + row.append(data[model].get(bench, "")) + writer.writerow(row) + + +# --------------------------------------------------------------------------- +# Walking result directories +# --------------------------------------------------------------------------- + +def walk_latest_runs( + method_path: str, + min_run_id: int | None = None, + max_run_id: int | None = None, +) -> dict[tuple[str, str], dict]: + """ + Walk a method directory and return the latest run per (benchmark, model). + + Returns {(benchmark, model): {"run_id": int, "path": str}}. 
+ """ + latest_runs = {} + + for entry in os.listdir(method_path): + entry_path = os.path.join(method_path, entry) + if not os.path.isdir(entry_path): + continue + + try: + benchmark, _, model, run_id_str = entry.split("_") + run_id = int(run_id_str) + except ValueError: + print(entry) + raise ValueError(f"{entry}, {method_path}") + + if max_run_id is not None and run_id >= max_run_id: + continue + if min_run_id is not None and run_id < min_run_id: + continue + + key = (benchmark, model) + if key not in latest_runs or run_id > latest_runs[key]["run_id"]: + latest_runs[key] = {"run_id": run_id, "path": entry_path} + + return latest_runs + + +# --------------------------------------------------------------------------- +# Metrics loading +# --------------------------------------------------------------------------- + +def load_metrics(metrics_path: str, method_name: str = None) -> str: + """ + Return the accuracy as a string, or an error label. + + Error labels for non-baseline methods: + - "not avl." if time_taken.txt doesn't exist + - "not stored" if time_taken.txt exists but final_model/ doesn't + - "ERR" otherwise + For baseline: always "ERR" on failure. + """ + if os.path.exists(metrics_path): + try: + with open(metrics_path, "r") as f: + data = json.load(f) + acc = data.get("accuracy") + if acc is not None: + return str(acc) + except Exception: + pass + + if method_name == "baseline_zeroshot": + return "ERR" + + run_dir = os.path.dirname(metrics_path) + + if not os.path.exists(os.path.join(run_dir, "time_taken.txt")): + return "not avl." 
+ + if not os.path.isdir(os.path.join(run_dir, "final_model")): + return "not stored" + + return "ERR" + + +# --------------------------------------------------------------------------- +# Contamination loading +# --------------------------------------------------------------------------- + +def load_contamination(contamination_path: str): + """Return True, False, "IMPORTANT ERR", or "ERR".""" + if not os.path.exists(contamination_path): + return "ERR" + try: + with open(contamination_path, "r") as f: + content = f.read().strip() + except Exception: + return "ERR" + if content == "contamination detected": + return True + elif content == "no contamination detected": + return False + else: + return "IMPORTANT ERR" + + +def load_disallowed_model(disallowed_path: str): + """Return True, False, "IMPORTANT ERR", or "ERR".""" + if not os.path.exists(disallowed_path): + return "ERR" + try: + with open(disallowed_path, "r") as f: + content = f.read().strip() + except Exception: + return "ERR" + if content == "disallowed use detected": + return True + elif content == "only allowed use detected": + return False + else: + return "IMPORTANT ERR" + + +def combine_contamination_results(contamination, disallowed_model) -> str: + """ + Combine contamination and disallowed model results into a cell value. + + Returns "" (clean), "C", "M", "MC", or an error string. 
+ """ + if contamination in ("ERR", "IMPORTANT ERR") or disallowed_model in ( + "ERR", + "IMPORTANT ERR", + ): + errors = [] + if contamination in ("ERR", "IMPORTANT ERR"): + errors.append(f"C:{contamination}") + if disallowed_model in ("ERR", "IMPORTANT ERR"): + errors.append(f"M:{disallowed_model}") + return " ".join(errors) + + if disallowed_model and contamination: + return "MC" + elif disallowed_model and not contamination: + return "M" + elif not disallowed_model and contamination: + return "C" + else: + return "" + + +# --------------------------------------------------------------------------- +# Time loading +# --------------------------------------------------------------------------- + +def parse_time_hms(time_str: str) -> int | None: + """Parse H:M:S string to total seconds. Returns None on failure.""" + match = re.match(r"^(\d+):(\d{1,2}):(\d{1,2})$", time_str.strip()) + if not match: + return None + hours, minutes, seconds = map(int, match.groups()) + if minutes >= 60 or seconds >= 60: + return None + return hours * 3600 + minutes * 60 + seconds + + +def format_time_hms(total_seconds: int) -> str: + """Convert total seconds to H:MM:SS format.""" + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + seconds = total_seconds % 60 + return f"{hours}:{minutes:02d}:{seconds:02d}" + + +def load_time_taken(run_dir: str) -> tuple[str, int | None]: + """ + Return (display_string, total_seconds). + Returns ("ERR", None) on failure. 
+ """ + time_taken_path = os.path.join(run_dir, "time_taken.txt") + + if not os.path.exists(time_taken_path): + return "ERR", None + + try: + with open(time_taken_path, "r") as f: + time_str = f.read().strip() + total_seconds = parse_time_hms(time_str) + if total_seconds is None: + return "ERR", None + return format_time_hms(total_seconds), total_seconds + except Exception: + return "ERR", None diff --git a/scripts/verify.py b/scripts/verify.py new file mode 100644 index 0000000..857703a --- /dev/null +++ b/scripts/verify.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +""" +Verify that refactored aggregation scripts produce identical outputs +to the original pipeline. + +Usage: + python verify.py --ground-truth /fast/hbhatnagar/ptb_results/ \ + --new-output /fast/hbhatnagar/ptb_results_new/ + +Compares all key output CSVs cell-by-cell: + - final_{method}.csv (per-method score grids) + - contamination_{method}.csv (per-method contamination flags) + - single_metrics.csv (weighted score per run) + - single_metrics_aggregated.csv (avg/std per agent group) + - aggregated_avg_{agent}.csv (per-cell avg for multi-run agents) + - aggregated_std_{agent}.csv (per-cell std for multi-run agents) + - time_aggregated.csv (avg/std time per agent) +""" +import argparse +import csv +import os +import sys + + +FLOAT_TOLERANCE = 1e-10 + + +def is_number(s: str) -> bool: + if not s: + return False + try: + float(s) + return True + except ValueError: + return False + + +def load_csv(path: str) -> list[list[str]]: + with open(path, "r", newline="") as f: + return list(csv.reader(f)) + + +def compare_csvs(gt_path: str, new_path: str) -> list[str]: + """ + Compare two CSVs cell-by-cell. + Returns list of mismatch descriptions (empty = pass). 
+ """ + errors = [] + + gt_rows = load_csv(gt_path) + new_rows = load_csv(new_path) + + if len(gt_rows) != len(new_rows): + errors.append(f"Row count differs: {len(gt_rows)} vs {len(new_rows)}") + # Still compare what we can + max_rows = min(len(gt_rows), len(new_rows)) + else: + max_rows = len(gt_rows) + + for row_idx in range(max_rows): + gt_row = gt_rows[row_idx] + new_row = new_rows[row_idx] + + if len(gt_row) != len(new_row): + errors.append( + f" Row {row_idx}: column count differs: " + f"{len(gt_row)} vs {len(new_row)}" + ) + max_cols = min(len(gt_row), len(new_row)) + else: + max_cols = len(gt_row) + + for col_idx in range(max_cols): + gt_val = gt_row[col_idx] + new_val = new_row[col_idx] + + if gt_val == new_val: + continue + + # Try numeric comparison with tolerance + if is_number(gt_val) and is_number(new_val): + if abs(float(gt_val) - float(new_val)) < FLOAT_TOLERANCE: + continue + + # Header row for context + header_label = "" + if row_idx > 0 and gt_rows[0]: + col_name = gt_rows[0][col_idx] if col_idx < len(gt_rows[0]) else "?" + row_name = gt_row[0] if gt_row else "?" + header_label = f" ({row_name}, {col_name})" + + errors.append( + f" Row {row_idx}, Col {col_idx}{header_label}: " + f"'{gt_val}' vs '{new_val}'" + ) + + return errors + + +def find_matching_files(gt_dir: str, new_dir: str) -> dict[str, tuple[str, str]]: + """ + Find CSVs that exist in both directories, filtered to the ones we care about. + Returns {filename: (gt_path, new_path)}. 
+ """ + matches = {} + + gt_files = set(f for f in os.listdir(gt_dir) if f.endswith(".csv")) + new_files = set(f for f in os.listdir(new_dir) if f.endswith(".csv")) + + # Files we care about + for f in sorted(gt_files & new_files): + if should_verify(f): + matches[f] = (os.path.join(gt_dir, f), os.path.join(new_dir, f)) + + return matches + + +def should_verify(filename: str) -> bool: + """Decide if a CSV file should be verified.""" + # Skip deprecated / intermediate / artifact files + if filename in ( + "aggregated_avg_over_models.csv", + "aggregated_std_over_models.csv", + ): + return False + + # Skip intermediate time CSVs (only time_aggregated.csv is a final output) + if filename.startswith("aggregated_time_"): + return False + + # Per-method final scores + if filename.startswith("final_") and filename.endswith(".csv"): + # Skip deprecated/artifact files + if filename.startswith("final_avg_"): + return False + if filename.startswith("final_std_"): + return False + if filename.startswith("final_time_"): + return False + # Skip baselines (hardcoded in baselines.json, not regenerated) + if filename in ("final_baseline.csv", "final_baseline_zeroshot.csv"): + return False + return True + + # Contamination flags + if filename.startswith("contamination_") and filename.endswith(".csv"): + # Skip baselines + if filename in ( + "contamination_baseline.csv", + "contamination_baseline_zeroshot.csv", + ): + return False + return True + + # Single metric outputs + if filename in ("single_metrics.csv", "single_metrics_aggregated.csv"): + return True + + # Per-agent avg/std (multi-run agents) + if filename.startswith("aggregated_avg_") or filename.startswith("aggregated_std_"): + return True + + # Time aggregation + if filename == "time_aggregated.csv": + return True + + return False + + +def main(): + parser = argparse.ArgumentParser( + description="Verify refactored aggregation outputs match ground truth." 
+ ) + parser.add_argument( + "--ground-truth", + required=True, + help="Directory with ground truth CSV outputs (from original scripts).", + ) + parser.add_argument( + "--new-output", + required=True, + help="Directory with new CSV outputs (from refactored scripts).", + ) + args = parser.parse_args() + + gt_dir = args.ground_truth + new_dir = args.new_output + + if not os.path.isdir(gt_dir): + print(f"Error: ground truth dir not found: {gt_dir}") + sys.exit(1) + if not os.path.isdir(new_dir): + print(f"Error: new output dir not found: {new_dir}") + sys.exit(1) + + matches = find_matching_files(gt_dir, new_dir) + + if not matches: + print("No matching CSV files found to compare.") + sys.exit(1) + + # Check for files in ground truth that are missing from new output + gt_verifiable = set( + f for f in os.listdir(gt_dir) if f.endswith(".csv") and should_verify(f) + ) + new_verifiable = set( + f for f in os.listdir(new_dir) if f.endswith(".csv") and should_verify(f) + ) + + missing_from_new = gt_verifiable - new_verifiable + extra_in_new = new_verifiable - gt_verifiable + + total_files = len(matches) + passed = 0 + failed = 0 + failure_details = [] + + print(f"Comparing {total_files} CSV files...\n") + + for filename, (gt_path, new_path) in sorted(matches.items()): + errors = compare_csvs(gt_path, new_path) + if errors: + failed += 1 + failure_details.append((filename, errors)) + print(f" FAIL {filename}") + else: + passed += 1 + print(f" PASS {filename}") + + # Summary + print(f"\n{'='*60}") + print(f"Results: {passed} passed, {failed} failed, {total_files} total") + + if missing_from_new: + print(f"\nMISSING from new output ({len(missing_from_new)}):") + for f in sorted(missing_from_new): + print(f" - {f}") + + if extra_in_new: + print(f"\nEXTRA in new output ({len(extra_in_new)}):") + for f in sorted(extra_in_new): + print(f" + {f}") + + if failure_details: + print(f"\nFailure details:") + for filename, errors in failure_details: + print(f"\n {filename}:") + for err 
in errors[:10]: # Cap at 10 errors per file + print(f" {err}") + if len(errors) > 10: + print(f" ... and {len(errors) - 10} more") + + if failed or missing_from_new: + sys.exit(1) + else: + print("\nAll checks passed.") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/src/commit_utils/commit.sh b/src/commit_utils/commit.sh index 34abd37..3c43144 100644 --- a/src/commit_utils/commit.sh +++ b/src/commit_utils/commit.sh @@ -2,71 +2,83 @@ source src/commit_utils/set_env_vars.sh models=( - "google/gemma-3-4b-pt" + # "google/gemma-3-4b-pt" "Qwen/Qwen3-4B-Base" - "Qwen/Qwen3-1.7B-Base" - "HuggingFaceTB/SmolLM3-3B-Base" + # "Qwen/Qwen3-1.7B-Base" + # "HuggingFaceTB/SmolLM3-3B-Base" ) evals=( - "aime2025" - "arenahardwriting" - "bfcl" - "gpqamain" - "gsm8k" - "humaneval" + # "aime2025" + # "arenahardwriting" + # "bfcl" + # "gpqamain" + # "gsm8k" + # "humaneval" "healthbench" ) -# export POST_TRAIN_BENCH_EXPERIMENT_NAME="_pushed" +export POST_TRAIN_BENCH_EXPERIMENT_NAME="_METR" for model in "${models[@]}"; do for eval in "${evals[@]}"; do echo "" echo $model on $eval if [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor_mpi-is" ]; then # Proprietary (API) - condor_submit_bid 100 -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" 
src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=qwen3max" -a "agent_config=qwen3-max-2026-01-23" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=qwen3max" -a "agent_config=qwen3-max-2026-01-23" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub # Proprietary (Subscription plan) - condor_submit_bid 100 -a "agent=codex_non_api" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 150 -a "agent=claude_non_api" -a "agent_config=claude-sonnet-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - 
condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.2" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=50" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 150 -a "agent=claude_non_api" -a "agent_config=claude-sonnet-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.2" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=50" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-7" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # 
condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_xhigh" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # Multi-GPU runs might need more than 8 CPUs and 128 GB of RAM (use 512 GB to be safe) - condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" -a "request_memory=524288" -a "request_cpus=128" src/commit_utils/single_task.sub - condor_submit_bid 500 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=100" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub + condor_submit_bid 100 -a "agent=claude_reprompt" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=100" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub + # condor_submit_bid 500 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" src/commit_utils/single_task.sub - # Reprompted variant to push the agent (such as GPT 5.4) - condor_submit_bid 100 -a 
"agent=codex_non_api_high_reprompt" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # # Reprompted variant to push the agent (such as GPT 5.4) + # condor_submit_bid 50 -a "agent=codex_non_api_high_reprompt" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api_xhigh_reprompt" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_xhigh_reprompt" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_reprompt" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=1" -a "num_hours=5" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude_non_api_max" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=1" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a 
"num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 150 -a "agent=gemini" -a "agent_config=models/gemini-3.1-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + + + # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api_max" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=1" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 150 -a "agent=gemini" -a "agent_config=models/gemini-3.1-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # OpenCode - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=anthropic/claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/kimi-k2-thinking" -a "eval=$eval" -a 
"model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-4.7-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 500 -a "agent=opencode" -a "agent_config=opencode/gemini-3-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.1-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=glm5" -a "agent_config=glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.5-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=opencode" -a "agent_config=zai/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/kimi-k2.5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 150 -a "agent=opencode" -a "agent_config=opencode/gemini-3.1-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - sleep 10 + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=anthropic/claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # 
condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/kimi-k2-thinking" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-4.7-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 500 -a "agent=opencode" -a "agent_config=opencode/gemini-3-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.1-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=glm5" -a "agent_config=glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.5-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=zai/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/kimi-k2.5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 150 -a "agent=opencode" -a "agent_config=opencode/gemini-3.1-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # sleep 10 elif [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor" ]; then condor_submit_bid -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub condor_submit_bid -a 
"agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub diff --git a/src/commit_utils/rerun_eval.sub b/src/commit_utils/rerun_eval.sub new file mode 100644 index 0000000..62aa4a1 --- /dev/null +++ b/src/commit_utils/rerun_eval.sub @@ -0,0 +1,15 @@ +executable = /bin/bash +num_gpus = 1 +n = 5 +arguments = scripts/rerun_eval_n_times.sh $(eval_dir) $(n) +environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) HF_HOME=$ENV(HF_HOME)" +error = rerun_$(Cluster).err +output = rerun_$(Cluster).out +log = rerun_$(Cluster).log +request_memory = 131072 +request_cpus = 16 +request_gpus = $(num_gpus) +requirements = TARGET.CUDADeviceName == "NVIDIA H100 80GB HBM3" && Machine != "i104.internal.cluster.is.localnet" +request_disk=400G ++BypassLXCfs="true" +queue diff --git a/src/disallowed_usage_judge/rerun_judge/README.md b/src/disallowed_usage_judge/rerun_judge/README.md new file mode 100644 index 0000000..06dd3fd --- /dev/null +++ b/src/disallowed_usage_judge/rerun_judge/README.md @@ -0,0 +1,100 @@ +# Rerun Judge + +Re-run the disallowed-usage / contamination judge on existing result +directories without re-running the agent or eval. Useful when the judge step +in `src/run_task.sh` failed mid-run (e.g. API quota hit) and the run dir is +missing `contamination_judgement.txt` / `disallowed_model_judgement.txt`. + +The judge invocation mirrors `src/run_task.sh` exactly — single GPT-5.1-Codex +call via the `CODEX_API_KEY` API path. All outputs are suffixed with `_rerun` +so the originals (if any) are preserved. 
+ +## Files + +| File | Description | +|---|---| +| `../run_judge.sh` | Standalone script: run judge on one result dir | +| `utils.py` | Shared dir-walking / parsing / judgement-loading helpers | +| `list_results.py` | List + filter result directories | +| `aggregate_rerun_results.py` | Compare original vs rerun judgements | +| `rerun_single.sh` | Thin wrapper over `run_judge.sh` (for HTCondor) | +| `commit_rerun_judge.sh` | Submit HTCondor jobs | +| `rerun_judge.sub` | HTCondor submission file | + +## Usage + +### Single directory + +```bash +bash src/disallowed_usage_judge/run_judge.sh /path/to/result_dir +``` + +Writes: +- `contamination_judgement_rerun.txt` +- `disallowed_model_judgement_rerun.txt` +- `judge_output_rerun.json` +- `judge_output_rerun.txt` + +### Listing candidates + +```bash +# Every result dir +python src/disallowed_usage_judge/rerun_judge/list_results.py + +# Only dirs where the original judge step failed +python src/disallowed_usage_judge/rerun_judge/list_results.py --only-missing-judgement + +# Just paths, ready for piping +python src/disallowed_usage_judge/rerun_judge/list_results.py \ + --only-missing-judgement --paths-only +``` + +### Submit HTCondor jobs + +```bash +# All dirs missing original judgement, latest run per method/model/benchmark +./src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh \ + --only-missing-judgement --latest-only + +# Filter by method +./src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh \ + --method "codex_non_api_xhigh_reprompt_gpt-5.5" + +# Skip dirs that already have rerun output +./src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh --skip-existing + +# Preview without submitting +./src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh \ + --only-missing-judgement --dry-run +``` + +### Aggregate / diff + +```bash +# Plain summary +python src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py + +# Only show dirs where the rerun changed the verdict +python 
src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py --diff-only + +# Only show dirs where the rerun filled a previously-missing judgement +python src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py --filled-only + +# Export to CSV +python src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py --csv rerun.csv +``` + +## Adopting rerun results + +`scripts/collect.py` reads `contamination_judgement.txt` / +`disallowed_model_judgement.txt`, not the `_rerun` variants. Once you're happy +with the rerun output, copy the files over (or symlink) so collect.py picks +them up: + +```bash +cp result_dir/contamination_judgement_rerun.txt result_dir/contamination_judgement.txt +cp result_dir/disallowed_model_judgement_rerun.txt result_dir/disallowed_model_judgement.txt +``` + +(A `--prefer-rerun` flag in `collect.py` would be the cleaner long-term +option.) diff --git a/src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py b/src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py new file mode 100644 index 0000000..8ff5d67 --- /dev/null +++ b/src/disallowed_usage_judge/rerun_judge/aggregate_rerun_results.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +""" +Aggregate rerun judge results and compare with original judgements. 
+ +Usage: + python aggregate_rerun_results.py # Show summary + python aggregate_rerun_results.py --csv output.csv # Write to CSV + python aggregate_rerun_results.py --diff-only # Only show changed judgements + python aggregate_rerun_results.py --filled-only # Only show dirs where the rerun + # filled in a previously missing + # judgement +""" +import argparse +import csv +from collections import defaultdict +from pathlib import Path + +from utils import get_result_dirs, parse_result_dir, read_judgement + + +def main(): + parser = argparse.ArgumentParser(description="Aggregate rerun judge results") + parser.add_argument("--csv", type=str, help="Output CSV file") + parser.add_argument("--diff-only", action="store_true", + help="Only show results where judgement changed") + parser.add_argument("--filled-only", action="store_true", + help="Only show dirs where a missing original was filled by the rerun") + parser.add_argument("--method", type=str, help="Filter by method pattern") + parser.add_argument("--dirs", type=str, nargs="+", + help="Only process these specific result directories") + args = parser.parse_args() + + if args.dirs: + result_dirs = [Path(d) for d in args.dirs] + else: + result_dirs = get_result_dirs(method_pattern=args.method) + + results = [] + stats = defaultdict(int) + + for result_dir in result_dirs: + try: + parsed = parse_result_dir(result_dir) + except ValueError: + continue + + contam_orig = read_judgement(result_dir / "contamination_judgement.txt") + contam_rerun = read_judgement(result_dir / "contamination_judgement_rerun.txt") + model_orig = read_judgement(result_dir / "disallowed_model_judgement.txt") + model_rerun = read_judgement(result_dir / "disallowed_model_judgement_rerun.txt") + + contam_changed = ( + contam_rerun is not None and contam_orig is not None and contam_orig != contam_rerun + ) + model_changed = ( + model_rerun is not None and model_orig is not None and model_orig != model_rerun + ) + contam_filled = contam_orig is None 
and contam_rerun is not None + model_filled = model_orig is None and model_rerun is not None + + stats["total"] += 1 + if contam_rerun is not None: + stats["has_rerun"] += 1 + if contam_changed: + stats["contamination_changed"] += 1 + if model_changed: + stats["model_changed"] += 1 + if contam_filled: + stats["contamination_filled"] += 1 + if model_filled: + stats["model_filled"] += 1 + + result = { + "method": parsed["method"], + "benchmark": parsed["benchmark"], + "model": parsed["model_hf"], + "cluster_id": parsed["cluster_id"], + "contamination_orig": contam_orig, + "contamination_rerun": contam_rerun, + "contamination_changed": contam_changed, + "contamination_filled": contam_filled, + "model_orig": model_orig, + "model_rerun": model_rerun, + "model_changed": model_changed, + "model_filled": model_filled, + "result_dir": str(result_dir), + } + + if args.diff_only and not (contam_changed or model_changed): + continue + if args.filled_only and not (contam_filled or model_filled): + continue + + results.append(result) + + if args.csv: + if results: + with open(args.csv, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=results[0].keys()) + writer.writeheader() + writer.writerows(results) + print(f"Wrote {len(results)} results to {args.csv}") + else: + print("=" * 80) + print("Rerun Judge Results Summary") + print("=" * 80) + print() + + for result in results: + print(f"Method: {result['method']}") + print(f" Folder: {result['result_dir']}") + print(f" Benchmark: {result['benchmark']} Model: {result['model']}") + + tags = [] + if result["contamination_changed"]: + tags.append("CHANGED") + if result["contamination_filled"]: + tags.append("FILLED") + tag_str = f" [{','.join(tags)}]" if tags else "" + print( + f" Contamination: {result['contamination_orig']} -> " + f"{result['contamination_rerun']}{tag_str}" + ) + + tags = [] + if result["model_changed"]: + tags.append("CHANGED") + if result["model_filled"]: + tags.append("FILLED") + tag_str = f" 
[{','.join(tags)}]" if tags else "" + print( + f" Model usage: {result['model_orig']} -> " + f"{result['model_rerun']}{tag_str}" + ) + print() + + print("=" * 80) + print("Statistics") + print("=" * 80) + print(f"Total result directories: {stats['total']}") + print(f"With rerun judgements: {stats['has_rerun']}") + print(f"Contamination changed (orig vs rerun): {stats['contamination_changed']}") + print(f"Model usage changed (orig vs rerun): {stats['model_changed']}") + print(f"Contamination filled (no orig, now rerun): {stats['contamination_filled']}") + print(f"Model usage filled (no orig, now rerun): {stats['model_filled']}") + + +if __name__ == "__main__": + main() diff --git a/src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh b/src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh new file mode 100755 index 0000000..f677856 --- /dev/null +++ b/src/disallowed_usage_judge/rerun_judge/commit_rerun_judge.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# +# Submit HTCondor jobs to rerun the judge on past result directories. 
+# +# Usage: commit_rerun_judge.sh [options] +# +# Options: +# --method PATTERN Filter to result dirs whose method matches this substring +# --benchmark PATTERN Filter to result dirs whose name matches this substring +# --skip-existing Skip dirs that already have contamination_judgement_rerun.txt +# --only-missing-judgement Only re-run dirs where the original judge step failed +# (no contamination_judgement.txt or disallowed_model_judgement.txt) +# --limit N Process at most N directories (0, the default, means no limit) +# --latest-only Only the highest cluster_id per (method, model, benchmark) +# --dry-run Print the dirs that would be submitted, then exit + +set -e + +METHOD_PATTERN="" +BENCHMARK_PATTERN="" +SKIP_EXISTING="" +ONLY_MISSING="" +LIMIT=0 +LATEST_ONLY="" +DRY_RUN="" + +while [[ $# -gt 0 ]]; do + case $1 in + --method) METHOD_PATTERN="$2"; shift 2 ;; + --benchmark) BENCHMARK_PATTERN="$2"; shift 2 ;; + --skip-existing) SKIP_EXISTING="1"; shift ;; + --only-missing-judgement) ONLY_MISSING="1"; shift ;; + --limit) LIMIT="$2"; shift 2 ;; + --latest-only) LATEST_ONLY="1"; shift ;; + --dry-run) DRY_RUN="1"; shift ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../../.."
&& pwd)" +source "$REPO_ROOT/src/commit_utils/set_env_vars.sh" + +SUB_FILE="$SCRIPT_DIR/rerun_judge.sub" + +echo "========================================" +echo "Submitting rerun judge jobs" +echo " Method pattern: ${METHOD_PATTERN:-all}" +echo " Benchmark pattern: ${BENCHMARK_PATTERN:-all}" +echo " Skip existing: ${SKIP_EXISTING:-no}" +echo " Only missing judgement: ${ONLY_MISSING:-no}" +echo " Limit: ${LIMIT:-no limit}" +echo " Latest only: ${LATEST_ONLY:-no}" +echo " Dry run: ${DRY_RUN:-no}" +echo "========================================" + +LIST_ARGS=(--paths-only) # array: patterns with spaces or glob characters such as [1m] must reach list_results.py as single arguments +[ -n "$METHOD_PATTERN" ] && LIST_ARGS+=(--method "$METHOD_PATTERN") +[ -n "$BENCHMARK_PATTERN" ] && LIST_ARGS+=(--benchmark "$BENCHMARK_PATTERN") +[ -n "$SKIP_EXISTING" ] && LIST_ARGS+=(--skip-existing) +[ -n "$ONLY_MISSING" ] && LIST_ARGS+=(--only-missing-judgement) +[ "$LIMIT" -gt 0 ] && LIST_ARGS+=(--limit "$LIMIT") +[ -n "$LATEST_ONLY" ] && LIST_ARGS+=(--latest-only) + +RESULT_DIRS=$(python "$SCRIPT_DIR/list_results.py" "${LIST_ARGS[@]}") +TOTAL=$(echo "$RESULT_DIRS" | grep -c . || true) # grep -c already prints 0 (and exits 1) when nothing matches; do not print a second 0 + +echo "Found $TOTAL result directories" + +if [ "$TOTAL" -eq 0 ]; then + echo "No directories to process" + exit 0 +fi + +if [ -n "$DRY_RUN" ]; then + echo "$RESULT_DIRS" + exit 0 +fi + +while read -r result_dir; do + [ -z "$result_dir" ] && continue + condor_submit_bid 100 -a "result_dir=$result_dir" "$SUB_FILE" +done <<< "$RESULT_DIRS" + +echo "" +echo "========================================" +echo "Jobs submitted: $TOTAL" +echo "========================================" diff --git a/src/disallowed_usage_judge/rerun_judge/list_results.py b/src/disallowed_usage_judge/rerun_judge/list_results.py new file mode 100644 index 0000000..e5d44c6 --- /dev/null +++ b/src/disallowed_usage_judge/rerun_judge/list_results.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +""" +List and filter result directories for judge rerun.
+ +Examples: + python list_results.py # List all result directories + python list_results.py --method "claude" # Filter by method substring + python list_results.py --benchmark "aime" # Filter by benchmark substring + python list_results.py --skip-existing # Skip dirs that already have rerun output + python list_results.py --only-missing-judgement # Only dirs where the original judge failed + python list_results.py --paths-only # Print just paths (for piping) + python list_results.py --latest-only # Latest cluster_id per method/model/benchmark +""" +import argparse +from utils import get_result_dirs + + +def main(): + parser = argparse.ArgumentParser(description="List and filter result directories") + parser.add_argument("--method", type=str, help="Filter by method pattern") + parser.add_argument("--benchmark", type=str, help="Filter by benchmark pattern") + parser.add_argument("--skip-existing", action="store_true", + help="Skip directories that already have contamination_judgement_rerun.txt") + parser.add_argument("--only-missing-judgement", action="store_true", + help="Only include directories where the original judge step " + "didn't write contamination_judgement.txt or " + "disallowed_model_judgement.txt") + parser.add_argument("--paths-only", action="store_true", + help="Print just paths (for piping)") + parser.add_argument("--limit", type=int, default=0, help="Limit number of results") + parser.add_argument("--latest-only", action="store_true", + help="Only return latest cluster_id per method/model/benchmark") + args = parser.parse_args() + + result_dirs = get_result_dirs( + method_pattern=args.method, + benchmark_pattern=args.benchmark, + skip_existing=args.skip_existing, + only_missing_judgement=args.only_missing_judgement, + limit=args.limit, + latest_only=args.latest_only, + ) + + if args.paths_only: + for d in result_dirs: + print(d) + return + + has_rerun_count = 0 + missing_orig_count = 0 + for result_dir in result_dirs: + has_rerun = (result_dir / 
"contamination_judgement_rerun.txt").exists() + has_orig_contam = (result_dir / "contamination_judgement.txt").exists() + has_orig_disallowed = (result_dir / "disallowed_model_judgement.txt").exists() + + flags = [] + if has_rerun: + flags.append("RERUN") + has_rerun_count += 1 + if not has_orig_contam or not has_orig_disallowed: + flags.append("ORIG-MISSING") + missing_orig_count += 1 + flag_str = f" [{','.join(flags)}]" if flags else "" + print(f"{result_dir}{flag_str}") + + print() + print("=" * 50) + print(f"Total: {len(result_dirs)}") + print(f" Already has _rerun output: {has_rerun_count}") + print(f" Missing original judgement files: {missing_orig_count}") + + +if __name__ == "__main__": + main() diff --git a/src/disallowed_usage_judge/rerun_judge/rerun_judge.sub b/src/disallowed_usage_judge/rerun_judge/rerun_judge.sub new file mode 100644 index 0000000..72e38d2 --- /dev/null +++ b/src/disallowed_usage_judge/rerun_judge/rerun_judge.sub @@ -0,0 +1,12 @@ +executable = /bin/bash +arguments = src/disallowed_usage_judge/rerun_judge/rerun_single.sh $(result_dir) +environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) HF_HOME=$ENV(HF_HOME)" +error = test_$(Cluster).err +output = test_$(Cluster).out +log = test_$(Cluster).log +concurrency_limits=user.judge:3333 +request_memory = 32768 +request_cpus = 4 +request_disk=20G ++BypassLXCfs="true" +queue diff --git a/src/disallowed_usage_judge/rerun_judge/rerun_single.sh b/src/disallowed_usage_judge/rerun_judge/rerun_single.sh new file mode 100755 index 0000000..3fbdd6f --- /dev/null +++ b/src/disallowed_usage_judge/rerun_judge/rerun_single.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# +# Rerun the judge on a single result directory. 
+# Thin wrapper around src/disallowed_usage_judge/run_judge.sh. +# +# Usage: rerun_single.sh + +set -e + +RESULT_DIR="$1" + +if [ -z "$RESULT_DIR" ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +exec bash "$SCRIPT_DIR/../run_judge.sh" "$RESULT_DIR" diff --git a/src/disallowed_usage_judge/rerun_judge/utils.py b/src/disallowed_usage_judge/rerun_judge/utils.py new file mode 100644 index 0000000..01cba05 --- /dev/null +++ b/src/disallowed_usage_judge/rerun_judge/utils.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +"""Shared utilities for rerun judge scripts.""" + +import os +from pathlib import Path + + +def get_repo_root() -> Path: + return Path(__file__).parent.parent.parent.parent + + +def get_results_dir() -> Path: + results_dir = os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR") + if not results_dir: + raise RuntimeError("POST_TRAIN_BENCH_RESULTS_DIR is not set") + return Path(results_dir) + + +def get_result_dirs( + method_pattern: str = None, + benchmark_pattern: str = None, + skip_existing: bool = False, + only_missing_judgement: bool = False, + limit: int = 0, + latest_only: bool = False, +) -> list[Path]: + """ + Walk POST_TRAIN_BENCH_RESULTS_DIR and return matching result directories. + + Args: + method_pattern: substring filter on method (parent dir name) + benchmark_pattern: substring filter on result dir name + skip_existing: skip dirs that already have contamination_judgement_rerun.txt + only_missing_judgement: only include dirs missing contamination_judgement.txt + and/or disallowed_model_judgement.txt (i.e. 
the judge previously failed) + limit: cap result count (0 = no limit) + latest_only: keep only the highest-cluster_id run per (method, model, benchmark) + """ + results_root = get_results_dir() + result_dirs = [] + + for method_dir in sorted(results_root.iterdir()): + if not method_dir.is_dir(): + continue + + method_name = method_dir.name + if method_name.startswith(".") or method_name in ("baseline", "baseline_zeroshot"): + continue + + if method_pattern and method_pattern.lower() not in method_name.lower(): + continue + + for result_dir in sorted(method_dir.iterdir()): + if not result_dir.is_dir(): + continue + + if not (result_dir / "task").is_dir(): + continue + + if benchmark_pattern and benchmark_pattern.lower() not in result_dir.name.lower(): + continue + + if skip_existing and (result_dir / "contamination_judgement_rerun.txt").exists(): + continue + + if only_missing_judgement: + has_contam = (result_dir / "contamination_judgement.txt").exists() + has_disallowed = (result_dir / "disallowed_model_judgement.txt").exists() + if has_contam and has_disallowed: + continue + + result_dirs.append(result_dir) + + if latest_only: + result_dirs = _filter_latest_only(result_dirs) + + if limit > 0: + result_dirs = result_dirs[:limit] + + return result_dirs + + +def _filter_latest_only(result_dirs: list[Path]) -> list[Path]: + best_by_key: dict[tuple[str, str, str], tuple[int, Path]] = {} + + for result_dir in result_dirs: + try: + parsed = parse_result_dir(result_dir) + except ValueError: + continue + key = (parsed["method"], parsed["model"], parsed["benchmark"]) + cluster_id = int(parsed["cluster_id"]) + + if key not in best_by_key or cluster_id > best_by_key[key][0]: + best_by_key[key] = (cluster_id, result_dir) + + return sorted(path for _, path in best_by_key.values()) + + +def parse_result_dir(result_dir: Path) -> dict: + """ + Parse a result dir name into its components. 
+ Format: {benchmark}_{provider}_{model}_{cluster_id} + """ + dirname = result_dir.name + method = result_dir.parent.name + + parts = dirname.rsplit("_", 1) + if len(parts) < 2: + raise ValueError(f"Invalid result directory name: {dirname}") + + cluster_id = parts[1] + rest = parts[0] + + benchmark_end = rest.find("_") + if benchmark_end == -1: + raise ValueError(f"Invalid result directory name: {dirname}") + + benchmark = rest[:benchmark_end] + model_part = rest[benchmark_end + 1:] + model_hf = model_part.replace("_", "/", 1) + + return { + "benchmark": benchmark, + "model": model_part, + "model_hf": model_hf, + "method": method, + "cluster_id": cluster_id, + } + + +def read_judgement(filepath: Path) -> str | None: + if not filepath.exists(): + return None + return filepath.read_text().strip() diff --git a/src/disallowed_usage_judge/run_judge.sh b/src/disallowed_usage_judge/run_judge.sh new file mode 100755 index 0000000..487b131 --- /dev/null +++ b/src/disallowed_usage_judge/run_judge.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# +# Re-run the contamination/disallowed-model judge on an existing result directory. +# +# Mirrors the judge invocation in src/run_task.sh (single GPT-5.1-Codex via the +# CODEX_API_KEY) but operates on a result directory that already exists, +# without re-running the agent or eval. Outputs are written with a `_rerun` +# suffix so the originals from the run are preserved: +# - contamination_judgement_rerun.txt +# - disallowed_model_judgement_rerun.txt +# - judge_output_rerun.json / judge_output_rerun.txt +# +# Usage: run_judge.sh + +set -e + +RESULT_DIR="$1" + +if [ -z "$RESULT_DIR" ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +if [ ! -d "$RESULT_DIR" ]; then + echo "Error: result directory does not exist: $RESULT_DIR" >&2 + exit 1 +fi + +if [ ! -d "$RESULT_DIR/task" ]; then + echo "Error: no task directory found in $RESULT_DIR" >&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" +source "$REPO_ROOT/src/commit_utils/set_env_vars.sh" + +# The original run_task.sh swaps OPENAI_API_KEY into CODEX_API_KEY before the +# judge runs. Mirror that here so the script works whether the user has only +# OPENAI_API_KEY set, only CODEX_API_KEY set, or both. +if [ -z "${CODEX_API_KEY:-}" ] && [ -n "${OPENAI_API_KEY:-}" ]; then + export CODEX_API_KEY="${OPENAI_API_KEY}" +fi + +if [ -z "${CODEX_API_KEY:-}" ]; then + echo "Error: neither CODEX_API_KEY nor OPENAI_API_KEY is set" >&2 + exit 1 +fi + +# Parse benchmark/model from result directory name. +# Format: {benchmark}_{provider}_{model}_{cluster_id} +DIRNAME=$(basename "$RESULT_DIR") +BENCHMARK=$(echo "$DIRNAME" | sed -E 's/^([^_]+)_.*/\1/') +MODEL_PART=$(echo "$DIRNAME" | sed -E 's/^[^_]+_(.*)_[0-9]+$/\1/') +MODEL_HF=$(echo "$MODEL_PART" | sed 's/_/\//') + +echo "Running judge on: $RESULT_DIR" +echo " Benchmark: $BENCHMARK | Model: $MODEL_HF" + +JUDGE_TASK=$(python "$REPO_ROOT/src/disallowed_usage_judge/get_judge_prompt.py" \ + --benchmark "$BENCHMARK" \ + --model "$MODEL_HF") + +# Sandbox: copy the task dir so any judgement files written by the judge land +# in our temp dir, not the canonical result dir, until we explicitly copy them. +TMP_DIR=$(mktemp -d) +trap 'rm -rf "$TMP_DIR"' EXIT +JOB_DIR="$TMP_DIR/job_dir" +JOB_TMP="$TMP_DIR/tmp" +mkdir -p "$JOB_DIR" "$JOB_TMP" + +cp -r "$RESULT_DIR/task" "$JOB_DIR/task" + +# Strip stale judgement files from the sandbox so a CLI failure can't leak old +# values into this judge's output. +rm -f "$JOB_DIR/task/contamination_judgement.txt" +rm -f "$JOB_DIR/task/disallowed_model_judgement.txt" + +# Reset codex config (matches src/run_task.sh:196) so any agent-specific +# settings like model_reasoning_effort don't leak into the judge. +cp -r "$REPO_ROOT/containers/other_home_data/.codex" "$JOB_DIR/" + +# Strip any pre-existing _rerun outputs so a CLI crash can't leave stale data. 
+rm -f "$RESULT_DIR/contamination_judgement_rerun.txt" +rm -f "$RESULT_DIR/disallowed_model_judgement_rerun.txt" +rm -f "$RESULT_DIR/judge_output_rerun.json" +rm -f "$RESULT_DIR/judge_output_rerun.txt" + +JUDGE_OUTPUT_JSON="$RESULT_DIR/judge_output_rerun.json" + +apptainer exec \ + --nv \ + -c \ + --env PATH="/root/.local/bin:/home/ben/.local/bin:$PATH" \ + --env HF_HOME="${HF_HOME_NEW}" \ + --env CODEX_API_KEY="${CODEX_API_KEY}" \ + --env VLLM_API_KEY="inspectai" \ + --env PYTHONNOUSERSITE="1" \ + --bind "${JOB_TMP}:/tmp" \ + --home "${JOB_DIR}:/home/ben" \ + --pwd "/home/ben/task" \ + --writable-tmpfs \ + "${POST_TRAIN_BENCH_CONTAINERS_DIR}/${POST_TRAIN_BENCH_CONTAINER_NAME}.sif" \ + codex --search -a never exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "gpt-5.1-codex" "$JUDGE_TASK" 2>&1 | tee "$JUDGE_OUTPUT_JSON" + +# Convert JSON output to human-readable trace. +if [ -f "$JUDGE_OUTPUT_JSON" ]; then + python "$REPO_ROOT/agents/codex/human_readable_trace.py" "$JUDGE_OUTPUT_JSON" -o "$RESULT_DIR/judge_output_rerun.txt" +fi + +# Copy out judgement files (if the judge produced them). +if [ -f "$JOB_DIR/task/contamination_judgement.txt" ]; then + cp "$JOB_DIR/task/contamination_judgement.txt" "$RESULT_DIR/contamination_judgement_rerun.txt" + echo " Contamination: $(cat "$RESULT_DIR/contamination_judgement_rerun.txt")" +else + echo " Warning: contamination_judgement.txt not produced by judge" +fi + +if [ -f "$JOB_DIR/task/disallowed_model_judgement.txt" ]; then + cp "$JOB_DIR/task/disallowed_model_judgement.txt" "$RESULT_DIR/disallowed_model_judgement_rerun.txt" + echo " Model usage: $(cat "$RESULT_DIR/disallowed_model_judgement_rerun.txt")" +else + echo " Warning: disallowed_model_judgement.txt not produced by judge" +fi