diff --git a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md index f55bd81..f96e5a9 100644 --- a/benchmarks/dataset-agent/README.md +++ b/benchmarks/dataset-agent/README.md @@ -146,6 +146,11 @@ without committing raw run folders: Agent canary actually emitted browser actions before starting a Playwright compiler. +If `selfHealingAction` is `candidate_rejected`, the benchmark marks the lane as +`failureCategory: "capability_gate"` even when the diagnostic rows score well. +Rejected candidates are useful for debugging, but they are not promotable cron +recipes. + Agent canaries also preserve safe provenance from the TinyFish run payload: reported step count, whether a streaming URL existed, and top-level result keys. Raw `streaming_url` values are never persisted. If Agent returns rows but diff --git a/benchmarks/dataset-agent/run-benchmark.mjs b/benchmarks/dataset-agent/run-benchmark.mjs index 1fe09c2..3b89837 100755 --- a/benchmarks/dataset-agent/run-benchmark.mjs +++ b/benchmarks/dataset-agent/run-benchmark.mjs @@ -569,10 +569,13 @@ async function runSystemPrompt(input) { }); const capabilityGateReason = infraBlockerReason ? null - : playwrightReadinessGateReason({ - diagnostics: normalized.diagnostics, - requirePlaywrightReady: input.config.requirePlaywrightReady, - }); + : firstString([ + selfHealingActionGateReason({ diagnostics: normalized.diagnostics }), + playwrightReadinessGateReason({ + diagnostics: normalized.diagnostics, + requirePlaywrightReady: input.config.requirePlaywrightReady, + }), + ]); const status = benchmarkStatusForOutcome({ execution, parsedPayload, @@ -1016,6 +1019,13 @@ export function playwrightReadinessGateReason({ return null; } +export function selfHealingActionGateReason({ diagnostics }) { + if (diagnostics?.selfHealingAction !== "candidate_rejected") { + return null; + } + return "Self-healing gate failed: candidate recipe was rejected; rows came from a diagnostic run, not a promoted recipe."; +} + export function benchmarkStatusForOutcome({ execution, parsedPayload, @@ -1171,10 +1181,13 @@ export async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { }); const capabilityGateReason = infraBlockerReason ? null - : playwrightReadinessGateReason({ - diagnostics: normalized.diagnostics, - requirePlaywrightReady: config.requirePlaywrightReady, - }); + : firstString([ + selfHealingActionGateReason({ diagnostics: normalized.diagnostics }), + playwrightReadinessGateReason({ + diagnostics: normalized.diagnostics, + requirePlaywrightReady: config.requirePlaywrightReady, + }), + ]); const status = benchmarkStatusForOutcome({ execution, parsedPayload: usablePayload, @@ -1832,6 +1845,10 @@ function stringArrayValue(value) { return []; } +function firstString(values) { + return values.find((value) => typeof value === "string" && value.length > 0) ?? null; +} + function singleStringArray(value) { return typeof value === "string" ? [value] : []; } diff --git a/benchmarks/dataset-agent/run-benchmark.test.mjs b/benchmarks/dataset-agent/run-benchmark.test.mjs index e22c910..534f6c2 100644 --- a/benchmarks/dataset-agent/run-benchmark.test.mjs +++ b/benchmarks/dataset-agent/run-benchmark.test.mjs @@ -13,6 +13,7 @@ import { playwrightReadinessGateReason, rescoreBenchmarkRun, scoreBenchmarkRows, + selfHealingActionGateReason, } from "./run-benchmark.mjs"; import { selfHealingDiagnosticsFromTick } from "./adapters/self-healing-output.mjs"; @@ -295,6 +296,66 @@ test("Playwright readiness gate does not override infrastructure blockers", () = }), "infra"); }); +test("self-healing rejection gate fails otherwise passing benchmark output", () => { + const capabilityGateReason = selfHealingActionGateReason({ + diagnostics: { + selfHealingAction: "candidate_rejected", + }, + }); + const answerKeyScore = { passed: true, failureCategory: undefined }; + const status = benchmarkStatusForOutcome({ + execution: { exitCode: 0 }, + parsedPayload: { rows: passingRows() }, + answerKeyScore, + infraBlockerReason: null, + capabilityGateReason, + }); + + assert.equal(status, "failed"); + assert.match(capabilityGateReason, /candidate recipe was rejected/i); + assert.equal(failureCategoryForOutcome({ + status, + infraBlockerReason: null, + capabilityGateReason, + answerKeyScore, + }), "capability_gate"); + assert.equal(failureReason({ + execution: { exitCode: 0, timedOut: false }, + parsedPayload: { rows: passingRows() }, + validation: passingValidation, + answerKeyScore, + infraBlockerReason: null, + capabilityGateReason, + minRequiredCompleteness: 0.75, + }), capabilityGateReason); +}); + +test("self-healing rejection gate does not override infrastructure blockers", () => { + const infraBlockerReason = "Infrastructure/auth/credits blocker."; + const capabilityGateReason = null; + const answerKeyScore = { passed: true, failureCategory: undefined }; + const status = benchmarkStatusForOutcome({ + execution: { exitCode: 0 }, + parsedPayload: { + rows: passingRows(), + diagnostics: { + selfHealingAction: "candidate_rejected", + }, + }, + answerKeyScore, + infraBlockerReason, + capabilityGateReason, + }); + + assert.equal(status, "blocked"); + assert.equal(failureCategoryForOutcome({ + status, + infraBlockerReason, + capabilityGateReason, + answerKeyScore, + }), "infra"); +}); + test("rescore applies Playwright readiness gate semantics", async () => { const runDirectory = await mkdtemp(join(tmpdir(), "bigset-benchmark-rescore-")); const artifactDirectory = join(runDirectory, "collection-self-heal", "01-gate-prompt"); @@ -352,6 +413,65 @@ test("rescore applies Playwright readiness gate semantics", async () => { assert.equal(rescored.laneResults[0].playwrightCandidateStatus, "not_ready"); }); +test("rescore applies self-healing rejection gate semantics", async () => { + const runDirectory = await mkdtemp(join(tmpdir(), "bigset-benchmark-rescore-")); + const artifactDirectory = join(runDirectory, "collection-self-heal", "01-rejected-prompt"); + await mkdir(artifactDirectory, { recursive: true }); + + const parsedPayload = { + rows: passingRows(), + validationIssues: [], + diagnostics: { + selfHealingAction: "candidate_rejected", + }, + }; + await writeFile( + join(runDirectory, "summary.json"), + JSON.stringify({ + laneResults: [{ + system: "collection-self-heal", + promptId: "rejected-prompt", + promptQuality: "good", + artifactDirectory, + exitCode: 0, + timedOut: false, + }], + }) + ); + await writeFile( + join(artifactDirectory, "parsed-output.json"), + JSON.stringify(parsedPayload) + ); + await writeFile(join(artifactDirectory, "stdout.txt"), JSON.stringify(parsedPayload)); + await writeFile(join(artifactDirectory, "stderr.txt"), ""); + + const rescored = await rescoreBenchmarkRun({ + runDirectory, + prompts: [{ + id: "rejected-prompt", + quality: "good", + persona: "developer", + prompt: "Find official docs.", + expectedStress: "Self-healing rejection gate.", + requiredColumns: ["entity_name", "source_url"], + }], + config: { + promptIds: null, + minRequiredCompleteness: 0.75, + minFactualAccuracy: 0.75, + requirePlaywrightReady: false, + inputUsdPer1M: 0.05, + outputUsdPer1M: 0.5, + tinyFishAgentStepUsd: 0.015, + }, + }); + + assert.equal(rescored.laneResults[0].status, "failed"); + assert.equal(rescored.laneResults[0].failureCategory, "capability_gate"); + assert.match(rescored.laneResults[0].errorMessage, /candidate recipe was rejected/i); + assert.equal(rescored.laneResults[0].selfHealingAction, "candidate_rejected"); +}); + function passingRows() { return [{ cells: { diff --git a/docs/data-collection-agent-migration-plan.md b/docs/data-collection-agent-migration-plan.md index 852ed5a..fcd10b9 100644 --- a/docs/data-collection-agent-migration-plan.md +++ b/docs/data-collection-agent-migration-plan.md @@ -201,6 +201,9 @@ The current layer does not yet: `has_streaming_url`, and `result_keys`) when readiness fails; these fields prove browser work happened without persisting raw streaming URLs or pretending selectors/clicks exist + - treat `selfHealingAction: "candidate_rejected"` as a capability failure + even if diagnostic rows score well; rejected rows are debug output, not a + promotable self-healing recipe - full benchmark only after the 2-prompt run is not obviously broken - live `--dataset-id` dry-run only after Convex/env prerequisites are ready - `--commit` only on a throwaway dataset first