barryroodt · barryroodt · May 27, 2026
diff --git a/NEXT_STEPS.md b/NEXT_STEPS.md
@@ -9,6 +9,13 @@ Tracking what's left after v0.1.2 manual release (2026-05-20).
 - [ ] **Test release.yml on `v0.1.3` tag**, not v0.1.2 — 0.1.2 is already published and would 403 on retry. When ready: bump to 0.1.3, `git tag v0.1.3 && git push --tags`, watch with `gh run watch`.
 - [ ] **Decide on multi-arch image strategy.** Currently `ghcr.io/barryroodt/refine-skill:0.1.2` is linux/arm64 only (manual buildx broke on cross-arch GPG). Release.yml is wired for `docker/build-push-action` multi-arch on GHA runners (amd64-native, no QEMU GPG bug) — should work once tag-triggered.
 
+## Evals (Braintrust)
+
+- [ ] **Install + first run.** `cd evals && npm install`, then `npm run eval:dryrun` (the script sets `REFINE_EVAL_DRYRUN=1` itself) to smoke-test wiring. To push results to Braintrust, set `BRAINTRUST_API_KEY` and run `npm run eval`.
+- [ ] **Free-model validation pass.** Run against Gemini/Groq free tier (`REFINE_EVAL_MODEL=gemini-2.5-flash` or `groq/llama-3.3-70b-versatile`) before paying for sonnet runs.
+- [ ] **Grow fixture set.** Currently 2 fixtures — add ~8 more starter skills spanning grades F→B so scorers see distribution.
+- [ ] **CI gate (later).** Wire `npm --prefix evals run eval` into `release.yml` behind a separate `EVAL=1` env, mirroring the existing `E2E_LLM=1` pattern. Don't block release on it until scorer thresholds are calibrated.
+
 ## Code follow-ups (queued for v0.2.0)
 
 - [ ] **`--provider` CLI flag.** Closes the auto-routing gap for Cerebras, OpenRouter, Azure, Bedrock, AI Gateway, ZAI, MiniMax (all enumerated in MODELS.md as "not yet auto-routed").

diff --git a/evals/README.md b/evals/README.md
@@ -0,0 +1,69 @@
+# refine-skill evals
+
+Braintrust eval harness for `@jumptag/refine-skill`. Each row runs the full CLI
+against a copy of a fixture skill from `e2e/fixtures/`, parses the resulting
+`.refine/log.json`, and is scored by five pure-JS scorers.
+
+## Scorers
+
+| Name | What it measures | 0–1 mapping |
+|---|---|---|
+| `score_lift` | Quality gain (last − first pass) as fraction of remaining headroom | `(last − first) / max(1, max − first)`, clamped to 0–1 |
+| `converged` | Stopped naturally (not at iteration cap) | 1 if `stop_reason ∉ {max_iterations, null}` |
+| `apply_rate` | Items applied vs. proposed across all passes | `applied / proposed`; 1 if no items proposed |
+| `efficiency` | Lift amortised per pass — flags slow-burn runs | `score_lift / passes.length` |
+| `ran_cleanly` | Exit code 0/1 and log present | 0 on crash / missing log |
+
+## Run
+
+```bash
+cd evals
+npm install
+
+# 1. Smoke-test wiring with no Braintrust account:
+REFINE_EVAL_DRYRUN=1 \
+ANTHROPIC_API_KEY=sk-... \
+npm run eval:dryrun
+
+# 2. Push to Braintrust:
+export BRAINTRUST_API_KEY=...
+export ANTHROPIC_API_KEY=...   # match REFINE_EVAL_MODEL provider
+npm run eval
+```
+
+## Knobs
+
+| Env | Default | Effect |
+|---|---|---|
+| `REFINE_EVAL_MODEL` | `claude-haiku-4-5` | Any pi-supported model id |
+| `REFINE_EVAL_ITERATIONS` | `3` | Max passes per run |
+| `REFINE_EVAL_IMAGE` | `refine-skill:dev` | Docker image override |
+| `REFINE_EVAL_FIXTURES` | (all) | Comma-separated subset (e.g. `small-skill`) |
+| `REFINE_EVAL_PROJECT` | `refine-skill` | Braintrust project name |
+| `REFINE_EVAL_DRYRUN` | off | `1` → print JSON, skip Braintrust |
+
+## Free-model swap
+
+`REFINE_EVAL_MODEL` flows straight through to `bin/refine-skill --model`,
+so any model `src/providers.js` routes will work. For Gemini free tier:
+
+```bash
+REFINE_EVAL_MODEL=gemini-2.5-flash GEMINI_API_KEY=... npm run eval
+```
+
+For Groq:
+
+```bash
+REFINE_EVAL_MODEL=groq/llama-3.3-70b-versatile GROQ_API_KEY=... npm run eval
+```
+
+## Cost note
+
+Each row = full Docker run + LLM calls. Expect 1–3 minutes and a handful
+of cents per fixture per model. Keep the fixture set tight while iterating
+on scorers.
+
+## Adding fixtures
+
+Create a new directory `e2e/fixtures/<name>/` (relative to the repo root)
+containing a `SKILL.md`. It's picked up automatically on the next run.
diff --git a/evals/package.json b/evals/package.json
@@ -0,0 +1,14 @@
+{
+  "name": "@jumptag/refine-skill-evals",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "description": "Braintrust evals for @jumptag/refine-skill",
+  "scripts": {
+    "eval": "braintrust eval run.eval.js",
+    "eval:dryrun": "REFINE_EVAL_DRYRUN=1 node run.eval.js"
+  },
+  "dependencies": {
+    "braintrust": "^0.0.190"
+  }
+}
diff --git a/evals/run.eval.js b/evals/run.eval.js
@@ -0,0 +1,187 @@
+// Braintrust eval harness for refine-skill.
+//
+// Run:
+//   cd evals && npm install
+//   export BRAINTRUST_API_KEY=...
+//   export ANTHROPIC_API_KEY=... # or whichever provider matches REFINE_EVAL_MODEL
+//   npm run eval
+//
+// Knobs (env):
+//   REFINE_EVAL_MODEL       pi model id (default: claude-haiku-4-5)
+//   REFINE_EVAL_ITERATIONS  max passes per run (default: 3)
+//   REFINE_EVAL_IMAGE       docker image override (default: refine-skill:dev)
+//   REFINE_EVAL_FIXTURES    comma-separated subset of fixture dir names
+//   REFINE_EVAL_DRYRUN      "1" → skip Braintrust, print per-fixture scores as JSON to stdout
+//   REFINE_EVAL_PROJECT     Braintrust project name (default: refine-skill)
+
+import { Eval } from "braintrust";
+import { spawnSync } from "node:child_process";
+import { mkdtempSync, readFileSync, readdirSync, rmSync, statSync } from "node:fs";
+import { tmpdir } from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+// Resolve every path from this file's location so the harness works from any cwd.
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const REPO_ROOT = path.resolve(__dirname, "..");
+const FIXTURE_ROOT = path.join(REPO_ROOT, "e2e", "fixtures");
+const BIN = path.join(REPO_ROOT, "bin", "refine-skill");
+
+// Env knobs. `||` rather than `??` so that an empty-string value also falls
+// back to the default.
+const MODEL = process.env.REFINE_EVAL_MODEL || "claude-haiku-4-5";
+const ITERATIONS = Number(process.env.REFINE_EVAL_ITERATIONS || 3);
+// Fail fast: a typo such as "three" would otherwise reach the CLI as
+// `--iterations NaN`, once per fixture.
+if (!Number.isInteger(ITERATIONS) || ITERATIONS < 1) {
+  throw new Error(
+    `REFINE_EVAL_ITERATIONS must be a positive integer, got "${process.env.REFINE_EVAL_ITERATIONS}"`
+  );
+}
+const IMAGE = process.env.REFINE_EVAL_IMAGE || "refine-skill:dev";
+// Comma-separated fixture names with blanks dropped; an empty array means
+// "run every fixture".
+const FIXTURE_FILTER = (process.env.REFINE_EVAL_FIXTURES || "")
+  .split(",")
+  .map((s) => s.trim())
+  .filter(Boolean);
+
+// Return the fixture directory names under FIXTURE_ROOT that this run covers:
+// every sub-directory, or only those named in FIXTURE_FILTER when it is
+// non-empty. Throws if FIXTURE_FILTER names a fixture that does not exist.
+function listFixtures() {
+  // Sort so row order does not depend on the platform's directory listing order.
+  const all = readdirSync(FIXTURE_ROOT)
+    .filter((n) => statSync(path.join(FIXTURE_ROOT, n)).isDirectory())
+    .sort();
+  if (FIXTURE_FILTER.length === 0) return all;
+
+  // A typo in REFINE_EVAL_FIXTURES was previously dropped without notice,
+  // yielding a run with fewer (or zero) rows that still looked successful.
+  const unknown = FIXTURE_FILTER.filter((n) => !all.includes(n));
+  if (unknown.length > 0) {
+    throw new Error(
+      `REFINE_EVAL_FIXTURES names unknown fixture(s): ${unknown.join(", ")} (available: ${all.join(", ")})`
+    );
+  }
+  return all.filter((n) => FIXTURE_FILTER.includes(n));
+}
+
+// Copy fixture `name` into a fresh temp directory and return that directory's
+// path, so a run never modifies the checked-in fixture. Throws if the copy
+// cannot be started or exits non-zero.
+function copyFixture(name) {
+  const dst = mkdtempSync(path.join(tmpdir(), `refine-eval-${name}-`));
+  const src = path.join(FIXTURE_ROOT, name);
+  // `${src}/.` copies the directory's contents (dotfiles included) into dst
+  // rather than nesting a `<name>/` directory inside it.
+  const r = spawnSync("cp", ["-r", `${src}/.`, dst], { encoding: "utf8" });
+  // When cp cannot be spawned at all, status and stderr are null; report the
+  // underlying error instead of "cp failed ...: null".
+  if (r.error) {
+    throw new Error(`cp failed for ${name}: ${r.error.message}`, { cause: r.error });
+  }
+  if (r.status !== 0) {
+    throw new Error(
+      `cp failed for ${name} (exit ${r.status}, signal ${r.signal}): ${r.stderr}`
+    );
+  }
+  return dst;
+}
+
+// Run the refine-skill CLI synchronously against `workDir` with the configured
+// iteration cap, model and image. Returns the raw spawnSync result
+// (status, signal, stdout, stderr, error); it does not throw on a non-zero exit.
+function runRefine(workDir) {
+  const args = [
+    workDir,
+    "--iterations",
+    String(ITERATIONS),
+    "--model",
+    MODEL,
+    "--image",
+    IMAGE,
+  ];
+  return spawnSync(BIN, args, {
+    stdio: ["ignore", "pipe", "pipe"],
+    env: process.env,
+    encoding: "utf8",
+    // spawnSync terminates the child once captured stdout/stderr exceeds
+    // maxBuffer (1 MiB by default). Raise the cap so a chatty run is not
+    // killed mid-flight and then scored as a crash.
+    maxBuffer: 64 * 1024 * 1024,
+  });
+}
+
+// Load and parse `<workDir>/.refine/log.json`. Throws if the file is missing,
+// unreadable, or not valid JSON.
+function readLog(workDir) {
+  const raw = readFileSync(path.join(workDir, ".refine", "log.json"), "utf8");
+  return JSON.parse(raw);
+}
+
+// Build one eval row per selected fixture. `input` is what `task` receives;
+// `metadata` records the fixture name plus the model and iteration cap in use.
+const dataset = () => {
+  const rows = [];
+  for (const fixture of listFixtures()) {
+    rows.push({
+      input: { fixture },
+      metadata: { fixture, model: MODEL, iterations: ITERATIONS },
+    });
+  }
+  return rows;
+};
+
+// Run one eval row: copy the fixture to a temp dir, run the CLI against it,
+// and collect the exit status, timing, stderr tail and parsed log.
+//
+// Returns { fixture, exit_code, signal, spawn_error, elapsed_ms, stderr_tail,
+// log, log_error }. `log` is null and `log_error` is set when the log could
+// not be read; `spawn_error` is set when the CLI could not be started (or was
+// killed by spawnSync), in which case `exit_code` is null.
+//
+// The temp dir is removed afterwards; set REFINE_EVAL_KEEP_WORKDIR=1 to keep
+// it for debugging.
+async function task(input) {
+  const work = copyFixture(input.fixture);
+  try {
+    const started = Date.now();
+    const result = runRefine(work);
+    const elapsed_ms = Date.now() - started;
+    let log = null;
+    let log_error = null;
+    try {
+      log = readLog(work);
+    } catch (e) {
+      // A missing or unparseable log is recorded on the row and scored
+      // downstream rather than aborting the whole eval.
+      log_error = String(e?.message ?? e);
+    }
+    return {
+      fixture: input.fixture,
+      exit_code: result.status,
+      signal: result.signal ?? null,
+      spawn_error: result.error ? String(result.error.message ?? result.error) : null,
+      elapsed_ms,
+      // Keep only the last 40 lines so rows stay small.
+      stderr_tail: (result.stderr || "").split("\n").slice(-40).join("\n"),
+      log,
+      log_error,
+    };
+  } finally {
+    // Previously every row left its fixture copy behind in the temp directory.
+    if (process.env.REFINE_EVAL_KEEP_WORKDIR !== "1") {
+      try {
+        rmSync(work, { recursive: true, force: true });
+      } catch (e) {
+        // Best-effort: a cleanup failure must not fail the row.
+        console.warn(`could not remove ${work}: ${e?.message ?? e}`);
+      }
+    }
+  }
+}
+
+// Scorers — each returns `{ name, score }` with `score` in 0..1 (Braintrust convention).
+// Each receives ({output, expected, input, metadata}).
+
+// score_lift: quality gain from the first to the last pass, as a fraction of
+// the headroom left after pass 1. Returns 0 when there is no log, no passes,
+// or the scores are not numeric.
+const scoreLift = ({ output }) => {
+  const passes = output?.log?.passes;
+  if (!Array.isArray(passes) || passes.length === 0) {
+    return { name: "score_lift", score: 0 };
+  }
+  const first = Number(passes[0]?.score ?? 0);
+  const last = Number(passes.at(-1)?.score ?? 0);
+  const max = Number(passes[0]?.max ?? 100);
+  // A non-numeric score would otherwise turn into NaN and be reported as the
+  // scorer's result.
+  if (![first, last, max].every(Number.isFinite)) {
+    return { name: "score_lift", score: 0 };
+  }
+  // Floor the headroom at 1 so a first pass already at max cannot divide by zero.
+  const headroom = Math.max(1, max - first);
+  // A regression (last < first) counts as zero lift, not negative.
+  const lift = Math.max(0, last - first) / headroom;
+  return { name: "score_lift", score: Math.min(1, lift) };
+};
+
+// converged: 1 when the log records a stop reason other than "max_iterations";
+// 0 when the log or the stop reason is missing, or the iteration cap was hit.
+const converged = ({ output }) => {
+  const reason = output?.log?.stop_reason;
+  const stoppedNaturally = Boolean(reason) && reason !== "max_iterations";
+  return { name: "converged", score: stoppedNaturally ? 1 : 0 };
+};
+
+// apply_rate: share of items, across all passes, whose status is "applied".
+// 0 when there is no log; 1 when no items were proposed at all.
+const applyRate = ({ output }) => {
+  const log = output?.log;
+  if (!log) return { name: "apply_rate", score: 0 };
+
+  // Flatten every pass's items into one list; missing `passes` / `items`
+  // count as empty.
+  const allItems = [...(log.passes ?? [])].flatMap((pass) => [...(pass.items ?? [])]);
+  if (allItems.length === 0) {
+    // nothing to apply = vacuously ok
+    return { name: "apply_rate", score: 1 };
+  }
+  const appliedCount = allItems.filter((item) => item.status === "applied").length;
+  return { name: "apply_rate", score: appliedCount / allItems.length };
+};
+
+// efficiency: headroom-normalized lift divided by the number of passes, so a
+// run that burns iterations for small gains scores lower. Returns 0 when there
+// is no log, no passes, or the scores are not numeric.
+const efficiency = ({ output }) => {
+  const log = output?.log;
+  if (!log || !log.passes?.length) return { name: "efficiency", score: 0 };
+  const first = Number(log.passes[0]?.score ?? 0);
+  const last = Number(log.passes.at(-1)?.score ?? 0);
+  const max = Number(log.passes[0]?.max ?? 100);
+  // A non-numeric score would otherwise turn into NaN and be reported as the
+  // scorer's result.
+  if (![first, last, max].every(Number.isFinite)) {
+    return { name: "efficiency", score: 0 };
+  }
+  // Floor the headroom at 1 so a first pass already at max cannot divide by zero.
+  const headroom = Math.max(1, max - first);
+  const lift = Math.max(0, last - first) / headroom;
+  return { name: "efficiency", score: Math.min(1, lift / log.passes.length) };
+};
+
+// ran_cleanly: 1 when the CLI exited with 0 (natural convergence) or 1
+// (max-iter, still a valid run) and the log was read without error.
+// Any other exit code, or a log error, is treated as a crash / config error.
+const ranCleanly = ({ output }) => {
+  const validExit = output?.exit_code === 0 || output?.exit_code === 1;
+  const clean = Boolean(output) && validExit && !output.log_error;
+  return { name: "ran_cleanly", score: clean ? 1 : 0 };
+};
+
+// Scorers applied to every row, in reporting order.
+const scorers = [scoreLift, converged, applyRate, efficiency, ranCleanly];
+
+if (process.env.REFINE_EVAL_DRYRUN !== "1") {
+  // Normal mode: hand the dataset, task and scorers to Braintrust.
+  Eval(process.env.REFINE_EVAL_PROJECT || "refine-skill", {
+    data: dataset,
+    task,
+    scores: scorers,
+    metadata: { model: MODEL, iterations: ITERATIONS, image: IMAGE },
+  });
+} else {
+  // Skip Braintrust — useful to validate task wiring without an API key.
+  // Rows run one at a time; each prints its scores as a JSON document.
+  for (const { input } of dataset()) {
+    const output = await task(input);
+    const scores = {};
+    for (const scorer of scorers) {
+      const { name, score } = scorer({ output, input });
+      scores[name] = score;
+    }
+    console.log(JSON.stringify({ input, scores, exit_code: output.exit_code }, null, 2));
+  }
+}