Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions benchmarks/dataset-agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ without committing raw run folders:
Agent canary actually emitted browser actions before starting a Playwright
compiler.

If `selfHealingAction` is `candidate_rejected`, the benchmark marks the lane as
`failureCategory: "capability_gate"` even when the diagnostic rows score well.
Rejected candidates are useful for debugging, but they are not promotable cron
recipes.

Agent canaries also preserve safe provenance from the TinyFish run payload:
reported step count, whether a streaming URL existed, and top-level result
keys. Raw `streaming_url` values are never persisted. If Agent returns rows but
Expand Down
33 changes: 25 additions & 8 deletions benchmarks/dataset-agent/run-benchmark.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -569,10 +569,13 @@ async function runSystemPrompt(input) {
});
const capabilityGateReason = infraBlockerReason
? null
: playwrightReadinessGateReason({
diagnostics: normalized.diagnostics,
requirePlaywrightReady: input.config.requirePlaywrightReady,
});
: firstString([
selfHealingActionGateReason({ diagnostics: normalized.diagnostics }),
playwrightReadinessGateReason({
diagnostics: normalized.diagnostics,
requirePlaywrightReady: input.config.requirePlaywrightReady,
}),
]);
const status = benchmarkStatusForOutcome({
execution,
parsedPayload,
Expand Down Expand Up @@ -1016,6 +1019,13 @@ export function playwrightReadinessGateReason({
return null;
}

/**
 * Capability gate for rejected self-healing candidates.
 *
 * @param {{ diagnostics?: { selfHealingAction?: unknown } | null }} input
 *   Object carrying the normalized diagnostics of a benchmark lane.
 * @returns {string | null} A failure reason when
 *   `diagnostics.selfHealingAction` is exactly `"candidate_rejected"`;
 *   otherwise `null` (including when diagnostics are missing).
 */
export function selfHealingActionGateReason({ diagnostics }) {
  const candidateWasRejected =
    diagnostics?.selfHealingAction === "candidate_rejected";

  return candidateWasRejected
    ? "Self-healing gate failed: candidate recipe was rejected; rows came from a diagnostic run, not a promoted recipe."
    : null;
}

export function benchmarkStatusForOutcome({
execution,
parsedPayload,
Expand Down Expand Up @@ -1171,10 +1181,13 @@ export async function rescoreBenchmarkRun({ runDirectory, prompts, config }) {
});
const capabilityGateReason = infraBlockerReason
? null
: playwrightReadinessGateReason({
diagnostics: normalized.diagnostics,
requirePlaywrightReady: config.requirePlaywrightReady,
});
: firstString([
selfHealingActionGateReason({ diagnostics: normalized.diagnostics }),
playwrightReadinessGateReason({
diagnostics: normalized.diagnostics,
requirePlaywrightReady: config.requirePlaywrightReady,
}),
]);
const status = benchmarkStatusForOutcome({
execution,
parsedPayload: usablePayload,
Expand Down Expand Up @@ -1832,6 +1845,10 @@ function stringArrayValue(value) {
return [];
}

/**
 * Returns the first non-empty string in `values`, or `null` when no element
 * is a non-empty string.
 *
 * @param {unknown[]} values Candidates, checked in array order.
 * @returns {string | null}
 */
function firstString(values) {
  const isNonEmptyString = (candidate) =>
    typeof candidate === "string" && candidate.length > 0;

  const match = values.find(isNonEmptyString);
  if (match === undefined) {
    return null;
  }
  return match;
}

/**
 * Wraps a string in a one-element array; any non-string value yields an
 * empty array.
 *
 * @param {unknown} value
 * @returns {string[]}
 */
function singleStringArray(value) {
  if (typeof value !== "string") {
    return [];
  }
  return [value];
}
Expand Down
120 changes: 120 additions & 0 deletions benchmarks/dataset-agent/run-benchmark.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
playwrightReadinessGateReason,
rescoreBenchmarkRun,
scoreBenchmarkRows,
selfHealingActionGateReason,
} from "./run-benchmark.mjs";
import { selfHealingDiagnosticsFromTick } from "./adapters/self-healing-output.mjs";

Expand Down Expand Up @@ -295,6 +296,66 @@ test("Playwright readiness gate does not override infrastructure blockers", () =
}), "infra");
});

// A rejected self-healing candidate must fail the lane as a capability gate
// failure even though the rows themselves pass the answer key.
test("self-healing rejection gate fails otherwise passing benchmark output", () => {
  const diagnostics = { selfHealingAction: "candidate_rejected" };
  const gateReason = selfHealingActionGateReason({ diagnostics });
  const score = { passed: true, failureCategory: undefined };

  const outcomeStatus = benchmarkStatusForOutcome({
    execution: { exitCode: 0 },
    parsedPayload: { rows: passingRows() },
    answerKeyScore: score,
    infraBlockerReason: null,
    capabilityGateReason: gateReason,
  });
  assert.equal(outcomeStatus, "failed");
  assert.match(gateReason, /candidate recipe was rejected/i);

  const category = failureCategoryForOutcome({
    status: outcomeStatus,
    infraBlockerReason: null,
    capabilityGateReason: gateReason,
    answerKeyScore: score,
  });
  assert.equal(category, "capability_gate");

  // The reported failure reason is the gate's own message.
  const reportedReason = failureReason({
    execution: { exitCode: 0, timedOut: false },
    parsedPayload: { rows: passingRows() },
    validation: passingValidation,
    answerKeyScore: score,
    infraBlockerReason: null,
    capabilityGateReason: gateReason,
    minRequiredCompleteness: 0.75,
  });
  assert.equal(reportedReason, gateReason);
});

// With an infrastructure blocker present and a null capability gate reason,
// the lane is reported as blocked/infra even though the payload diagnostics
// carry a rejected self-healing candidate.
test("self-healing rejection gate does not override infrastructure blockers", () => {
  const infraReason = "Infrastructure/auth/credits blocker.";
  const gateReason = null;
  const score = { passed: true, failureCategory: undefined };
  const rejectedPayload = {
    rows: passingRows(),
    diagnostics: { selfHealingAction: "candidate_rejected" },
  };

  const outcomeStatus = benchmarkStatusForOutcome({
    execution: { exitCode: 0 },
    parsedPayload: rejectedPayload,
    answerKeyScore: score,
    infraBlockerReason: infraReason,
    capabilityGateReason: gateReason,
  });
  assert.equal(outcomeStatus, "blocked");

  const category = failureCategoryForOutcome({
    status: outcomeStatus,
    infraBlockerReason: infraReason,
    capabilityGateReason: gateReason,
    answerKeyScore: score,
  });
  assert.equal(category, "infra");
});

test("rescore applies Playwright readiness gate semantics", async () => {
const runDirectory = await mkdtemp(join(tmpdir(), "bigset-benchmark-rescore-"));
const artifactDirectory = join(runDirectory, "collection-self-heal", "01-gate-prompt");
Expand Down Expand Up @@ -352,6 +413,65 @@ test("rescore applies Playwright readiness gate semantics", async () => {
assert.equal(rescored.laneResults[0].playwrightCandidateStatus, "not_ready");
});

// Rescoring a saved run whose parsed output reports a rejected self-healing
// candidate must produce a failed lane in the "capability_gate" category.
test("rescore applies self-healing rejection gate semantics", async () => {
// Build a throwaway run directory holding the artifacts of a single lane.
// NOTE(review): the temporary directory is not removed by this test.
const runDirectory = await mkdtemp(join(tmpdir(), "bigset-benchmark-rescore-"));
const artifactDirectory = join(runDirectory, "collection-self-heal", "01-rejected-prompt");
await mkdir(artifactDirectory, { recursive: true });

// Passing rows with no validation issues; only the diagnostics flag the
// rejected candidate.
const parsedPayload = {
rows: passingRows(),
validationIssues: [],
diagnostics: {
selfHealingAction: "candidate_rejected",
},
};
// summary.json lists the one lane that will be rescored.
await writeFile(
join(runDirectory, "summary.json"),
JSON.stringify({
laneResults: [{
system: "collection-self-heal",
promptId: "rejected-prompt",
promptQuality: "good",
artifactDirectory,
exitCode: 0,
timedOut: false,
}],
})
);
// The same payload is stored as parsed-output.json and as stdout.txt;
// stderr.txt is written empty.
await writeFile(
join(artifactDirectory, "parsed-output.json"),
JSON.stringify(parsedPayload)
);
await writeFile(join(artifactDirectory, "stdout.txt"), JSON.stringify(parsedPayload));
await writeFile(join(artifactDirectory, "stderr.txt"), "");

const rescored = await rescoreBenchmarkRun({
runDirectory,
prompts: [{
id: "rejected-prompt",
quality: "good",
persona: "developer",
prompt: "Find official docs.",
expectedStress: "Self-healing rejection gate.",
requiredColumns: ["entity_name", "source_url"],
}],
config: {
promptIds: null,
minRequiredCompleteness: 0.75,
minFactualAccuracy: 0.75,
// requirePlaywrightReady is false here; presumably this keeps the Playwright
// readiness gate out of the picture so the failure can only come from the
// self-healing gate — confirm against playwrightReadinessGateReason.
requirePlaywrightReady: false,
inputUsdPer1M: 0.05,
outputUsdPer1M: 0.5,
tinyFishAgentStepUsd: 0.015,
},
});

// The lane fails with the gate's message and still exposes the original
// self-healing action on the lane result.
assert.equal(rescored.laneResults[0].status, "failed");
assert.equal(rescored.laneResults[0].failureCategory, "capability_gate");
assert.match(rescored.laneResults[0].errorMessage, /candidate recipe was rejected/i);
assert.equal(rescored.laneResults[0].selfHealingAction, "candidate_rejected");
});

function passingRows() {
return [{
cells: {
Expand Down
3 changes: 3 additions & 0 deletions docs/data-collection-agent-migration-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ The current layer does not yet:
`has_streaming_url`, and `result_keys`) when readiness fails; these fields
prove browser work happened without persisting raw streaming URLs or
pretending selectors/clicks exist
- treat `selfHealingAction: "candidate_rejected"` as a capability failure
even if diagnostic rows score well; rejected rows are debug output, not a
promotable self-healing recipe
- full benchmark only after the 2-prompt run is not obviously broken
- live `--dataset-id` dry-run only after Convex/env prerequisites are ready
- `--commit` only on a throwaway dataset first
Expand Down