tinyfish-io · giaphutran12 · May 23, 2026
diff --git a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md
@@ -146,6 +146,11 @@ without committing raw run folders:
 Agent canary actually emitted browser actions before starting a Playwright
 compiler.
 
+If `selfHealingAction` is `candidate_rejected`, the benchmark fails the lane
+with `failureCategory: "capability_gate"` even when the diagnostic rows score
+well, unless an infrastructure blocker already applies. Rejected candidates are
+useful for debugging, but they are not promotable cron recipes.
+
 Agent canaries also preserve safe provenance from the TinyFish run payload:
 reported step count, whether a streaming URL existed, and top-level result
 keys. Raw `streaming_url` values are never persisted. If Agent returns rows but

diff --git a/benchmarks/dataset-agent/run-benchmark.mjs b/benchmarks/dataset-agent/run-benchmark.mjs
@@ -569,10 +569,13 @@ async function runSystemPrompt(input) {
   });
   const capabilityGateReason = infraBlockerReason
     ? null
-    : playwrightReadinessGateReason({
-      diagnostics: normalized.diagnostics,
-      requirePlaywrightReady: input.config.requirePlaywrightReady,
-    });
+    : firstString([
+      selfHealingActionGateReason({ diagnostics: normalized.diagnostics }),
+      playwrightReadinessGateReason({
+        diagnostics: normalized.diagnostics,
+        requirePlaywrightReady: input.config.requirePlaywrightReady,
+      }),
+    ]);
   const status = benchmarkStatusForOutcome({
     execution,
     parsedPayload,
@@ -1016,6 +1019,13 @@ export function playwrightReadinessGateReason({
   return null;
 }
 
+export function selfHealingActionGateReason({ diagnostics }) { // Returns a capability-gate failure message when the self-healing candidate was rejected, otherwise null.
+  if (diagnostics?.selfHealingAction !== "candidate_rejected") { // Optional chaining: missing diagnostics also falls through to null.
+    return null;
+  }
+  return "Self-healing gate failed: candidate recipe was rejected; rows came from a diagnostic run, not a promoted recipe.";
+}
+
 export function benchmarkStatusForOutcome({
   execution,
   parsedPayload,
@@ -1171,10 +1181,13 @@ export async function rescoreBenchmarkRun({ runDirectory, prompts, config }) {
     });
     const capabilityGateReason = infraBlockerReason
       ? null
-      : playwrightReadinessGateReason({
-        diagnostics: normalized.diagnostics,
-        requirePlaywrightReady: config.requirePlaywrightReady,
-      });
+      : firstString([
+        selfHealingActionGateReason({ diagnostics: normalized.diagnostics }),
+        playwrightReadinessGateReason({
+          diagnostics: normalized.diagnostics,
+          requirePlaywrightReady: config.requirePlaywrightReady,
+        }),
+      ]);
     const status = benchmarkStatusForOutcome({
       execution,
       parsedPayload: usablePayload,
@@ -1832,6 +1845,10 @@ function stringArrayValue(value) {
   return [];
 }
 
+function firstString(values) { // Returns the first non-empty string in `values` (array order), or null when there is none.
+  return values.find((value) => typeof value === "string" && value.length > 0) ?? null; // `?? null` turns find()'s undefined miss into null.
+}
+
 function singleStringArray(value) {
   return typeof value === "string" ? [value] : [];
 }

diff --git a/benchmarks/dataset-agent/run-benchmark.test.mjs b/benchmarks/dataset-agent/run-benchmark.test.mjs
@@ -13,6 +13,7 @@ import {
   playwrightReadinessGateReason,
   rescoreBenchmarkRun,
   scoreBenchmarkRows,
+  selfHealingActionGateReason,
 } from "./run-benchmark.mjs";
 import { selfHealingDiagnosticsFromTick } from "./adapters/self-healing-output.mjs";
 
@@ -295,6 +296,66 @@ test("Playwright readiness gate does not override infrastructure blockers", () =
   }), "infra");
 });
 
+test("self-healing rejection gate fails otherwise passing benchmark output", () => { // Unit-level check: a rejected candidate must fail a lane whose rows and answer key pass.
+  const capabilityGateReason = selfHealingActionGateReason({
+    diagnostics: {
+      selfHealingAction: "candidate_rejected",
+    },
+  });
+  const answerKeyScore = { passed: true, failureCategory: undefined }; // Passing score, so any failure below comes from the gate.
+  const status = benchmarkStatusForOutcome({
+    execution: { exitCode: 0 },
+    parsedPayload: { rows: passingRows() },
+    answerKeyScore,
+    infraBlockerReason: null,
+    capabilityGateReason,
+  });
+
+  assert.equal(status, "failed");
+  assert.match(capabilityGateReason, /candidate recipe was rejected/i);
+  assert.equal(failureCategoryForOutcome({
+    status,
+    infraBlockerReason: null,
+    capabilityGateReason,
+    answerKeyScore,
+  }), "capability_gate");
+  assert.equal(failureReason({ // The reported reason must be the gate message itself.
+    execution: { exitCode: 0, timedOut: false },
+    parsedPayload: { rows: passingRows() },
+    validation: passingValidation,
+    answerKeyScore,
+    infraBlockerReason: null,
+    capabilityGateReason,
+    minRequiredCompleteness: 0.75,
+  }), capabilityGateReason);
+});
+
+test("self-healing rejection gate does not override infrastructure blockers", () => { // Expects "blocked"/"infra" when an infra blocker is present.
+  const infraBlockerReason = "Infrastructure/auth/credits blocker.";
+  const capabilityGateReason = null; // NOTE(review): hard-coded; selfHealingActionGateReason is never called here, so the inline `infraBlockerReason ? null : ...` short-circuit in the runner is not exercised.
+  const answerKeyScore = { passed: true, failureCategory: undefined };
+  const status = benchmarkStatusForOutcome({
+    execution: { exitCode: 0 },
+    parsedPayload: {
+      rows: passingRows(),
+      diagnostics: { // NOTE(review): presumably unused by benchmarkStatusForOutcome - confirm; the gate reason above is what drives the outcome.
+        selfHealingAction: "candidate_rejected",
+      },
+    },
+    answerKeyScore,
+    infraBlockerReason,
+    capabilityGateReason,
+  });
+
+  assert.equal(status, "blocked");
+  assert.equal(failureCategoryForOutcome({
+    status,
+    infraBlockerReason,
+    capabilityGateReason,
+    answerKeyScore,
+  }), "infra");
+});
+
 test("rescore applies Playwright readiness gate semantics", async () => {
   const runDirectory = await mkdtemp(join(tmpdir(), "bigset-benchmark-rescore-"));
   const artifactDirectory = join(runDirectory, "collection-self-heal", "01-gate-prompt");
@@ -352,6 +413,65 @@ test("rescore applies Playwright readiness gate semantics", async () => {
   assert.equal(rescored.laneResults[0].playwrightCandidateStatus, "not_ready");
 });
 
+test("rescore applies self-healing rejection gate semantics", async () => { // Rescore path: builds a run directory on disk, then checks the rejection gate is applied.
+  const runDirectory = await mkdtemp(join(tmpdir(), "bigset-benchmark-rescore-")); // NOTE(review): this temp directory is not removed anywhere in the visible test body.
+  const artifactDirectory = join(runDirectory, "collection-self-heal", "01-rejected-prompt");
+  await mkdir(artifactDirectory, { recursive: true });
+
+  const parsedPayload = { // Passing rows plus a rejected self-healing action.
+    rows: passingRows(),
+    validationIssues: [],
+    diagnostics: {
+      selfHealingAction: "candidate_rejected",
+    },
+  };
+  await writeFile(
+    join(runDirectory, "summary.json"),
+    JSON.stringify({
+      laneResults: [{
+        system: "collection-self-heal",
+        promptId: "rejected-prompt",
+        promptQuality: "good",
+        artifactDirectory,
+        exitCode: 0,
+        timedOut: false,
+      }],
+    })
+  );
+  await writeFile(
+    join(artifactDirectory, "parsed-output.json"),
+    JSON.stringify(parsedPayload)
+  );
+  await writeFile(join(artifactDirectory, "stdout.txt"), JSON.stringify(parsedPayload));
+  await writeFile(join(artifactDirectory, "stderr.txt"), "");
+
+  const rescored = await rescoreBenchmarkRun({
+    runDirectory,
+    prompts: [{
+      id: "rejected-prompt",
+      quality: "good",
+      persona: "developer",
+      prompt: "Find official docs.",
+      expectedStress: "Self-healing rejection gate.",
+      requiredColumns: ["entity_name", "source_url"],
+    }],
+    config: {
+      promptIds: null,
+      minRequiredCompleteness: 0.75,
+      minFactualAccuracy: 0.75,
+      requirePlaywrightReady: false, // Presumably keeps the Playwright readiness gate out of the way so the failure is attributable to the rejection gate - confirm.
+      inputUsdPer1M: 0.05,
+      outputUsdPer1M: 0.5,
+      tinyFishAgentStepUsd: 0.015,
+    },
+  });
+
+  assert.equal(rescored.laneResults[0].status, "failed");
+  assert.equal(rescored.laneResults[0].failureCategory, "capability_gate");
+  assert.match(rescored.laneResults[0].errorMessage, /candidate recipe was rejected/i);
+  assert.equal(rescored.laneResults[0].selfHealingAction, "candidate_rejected");
+});
+
 function passingRows() {
   return [{
     cells: {

diff --git a/docs/data-collection-agent-migration-plan.md b/docs/data-collection-agent-migration-plan.md
@@ -201,6 +201,9 @@ The current layer does not yet:
      `has_streaming_url`, and `result_keys`) when readiness fails; these fields
      prove browser work happened without persisting raw streaming URLs or
      pretending selectors/clicks exist
+   - treat `selfHealingAction: "candidate_rejected"` as a capability failure
+     even if diagnostic rows score well; rejected rows are debug output, not a
+     promotable self-healing recipe
    - full benchmark only after the 2-prompt run is not obviously broken
    - live `--dataset-id` dry-run only after Convex/env prerequisites are ready
    - `--commit` only on a throwaway dataset first