From 8bafe26e537967d452f31d31157f077e49ca4ed0 Mon Sep 17 00:00:00 2001
From: Edward Tran <giaphutran012@gmail.com>
Date: Sat, 23 May 2026 06:21:35 +0700
Subject: [PATCH] Gate Playwright candidate readiness

---
 .../pipeline/populate-playwright-readiness.ts |  95 ++++++++++++
 backend/src/pipeline/populate-runtime.ts      |  20 +++
 backend/src/pipeline/populate-self-healing.ts |  19 +++
 .../populate-playwright-readiness.test.ts     | 144 ++++++++++++++++++
 backend/test/populate-self-healing.test.ts    |  11 ++
 benchmarks/dataset-agent/README.md            |   7 +
 docs/data-collection-agent-migration-plan.md  |  11 ++
 7 files changed, 307 insertions(+)
 create mode 100644 backend/src/pipeline/populate-playwright-readiness.ts
 create mode 100644 backend/test/populate-playwright-readiness.test.ts

diff --git a/backend/src/pipeline/populate-playwright-readiness.ts b/backend/src/pipeline/populate-playwright-readiness.ts
new file mode 100644
index 0000000..c7a1b59
--- /dev/null
+++ b/backend/src/pipeline/populate-playwright-readiness.ts
@@ -0,0 +1,95 @@
+import type {
+  PopulateProcessTrace,
+  PopulateRuntimeResult,
+  PopulateRuntimeTraceStep,
+} from "./populate-runtime.js";
+
+export type PopulatePlaywrightCandidateReadinessStatus =
+  | "ready" // no blocking reasons were recorded for the run
+  | "not_ready"; // at least one blocking reason was recorded
+
+export interface PopulatePlaywrightCandidateReadiness {
+  status: PopulatePlaywrightCandidateReadinessStatus; // "ready" only when `reasons` is empty
+  reasons: string[]; // one human-readable sentence per failed readiness check
+  browserStepCount: number; // succeeded browser steps carrying a URL, selector, or target text
+  sourceUrlCount: number; // distinct http(s) URLs from fetched URLs and succeeded source artifacts
+}
+
+/**
+ * Decides whether a populate run's trace can seed a Playwright candidate.
+ * Each failed check appends a reason; the status is "ready" only when no
+ * reason was recorded. A missing process trace counts as zero browser steps
+ * and zero source URLs, so those reasons are reported alongside it.
+ * @param input.result Runtime result whose debug process trace is inspected.
+ * @returns Verdict, blocking reasons, and the two supporting counts.
+ */
+export function playwrightCandidateReadinessForRun(input: {
+  result: PopulateRuntimeResult;
+}): PopulatePlaywrightCandidateReadiness {
+  const { result } = input;
+  const trace = result.debug?.processTrace;
+  const blockers: string[] = [];
+
+  if (!trace) {
+    blockers.push("Process trace is missing.");
+  }
+  if (hasAgentDisabledCapabilityDiagnostic(result)) {
+    blockers.push(
+      "TinyFish Agent/browser follow-up was required but disabled for this run."
+    );
+  }
+
+  const browserStepCount = trace ? actionableBrowserSteps(trace).length : 0;
+  if (browserStepCount === 0) {
+    blockers.push(
+      "Trace has no actionable browser steps with URL/selector/target data."
+    );
+  }
+
+  const sourceUrlCount = trace ? sourceUrlCountForTrace(trace) : 0;
+  if (sourceUrlCount === 0) {
+    blockers.push("Trace has no source URLs to anchor a replay script.");
+  }
+  const status = blockers.length === 0 ? "ready" : "not_ready";
+  return { status, reasons: blockers, browserStepCount, sourceUrlCount };
+}
+
+/** Reports whether a validation issue or debug note flags the TinyFish Agent as disabled. */
+function hasAgentDisabledCapabilityDiagnostic(
+  result: PopulateRuntimeResult
+): boolean {
+  const agentDisabled = /Capability diagnostic: TinyFish Agent disabled/i;
+  const debugNotes = result.debug?.notes ?? [];
+  for (const message of [...result.validationIssues, ...debugNotes]) {
+    if (agentDisabled.test(message)) return true;
+  }
+  return false;
+}
+
+/**
+ * Picks the trace steps a replay script could act on.
+ * A step qualifies when it is a succeeded browser step whose browser action
+ * carries a non-empty URL, selector, or target text.
+ * @returns The qualifying steps, in their original trace order.
+ */
+function actionableBrowserSteps(
+  processTrace: PopulateProcessTrace
+): PopulateRuntimeTraceStep[] {
+  return processTrace.steps.filter(({ kind, status, browserAction }) => {
+    const succeededBrowserStep = kind === "browser" && status === "succeeded";
+    if (!succeededBrowserStep || !browserAction) {
+      return false;
+    }
+    const { url, selector, targetText } = browserAction;
+    return Boolean(url || selector || targetText);
+  });
+}
+
+/** Counts distinct http(s) URLs across fetched URLs and succeeded source artifacts. */
+function sourceUrlCountForTrace(processTrace: PopulateProcessTrace): number {
+  const artifactUrls = processTrace.sourceArtifacts
+    .filter((artifact) => artifact.status === "succeeded")
+    .map((artifact) => artifact.url);
+  const candidates = [...processTrace.fetchedUrls, ...artifactUrls];
+  return new Set(candidates.filter((url) => /^https?:\/\//i.test(url))).size;
+}
diff --git a/backend/src/pipeline/populate-runtime.ts b/backend/src/pipeline/populate-runtime.ts
index 0a3cff0..f385e85 100644
--- a/backend/src/pipeline/populate-runtime.ts
+++ b/backend/src/pipeline/populate-runtime.ts
@@ -47,10 +47,29 @@ export type PopulateRuntimeTraceStepKind =
   | "fetch"
   | "insert_row"
   | "agent"
+  | "browser"
   | "extract"
   | "repair"
   | "validation";
 
+export type PopulateRuntimeBrowserActionKind = // value of `action` on a PopulateRuntimeBrowserAction
+  | "navigate"
+  | "click"
+  | "type"
+  | "select"
+  | "wait"
+  | "extract"
+  | "screenshot"
+  | "unknown"; // NOTE(review): presumably the fallback for unclassified actions — confirm with trace producers
+
+export interface PopulateRuntimeBrowserAction {
+  action: PopulateRuntimeBrowserActionKind; // which kind of browser interaction this step records
+  url?: string; // NOTE(review): presumably the page URL the action ran on or navigated to — confirm
+  selector?: string; // NOTE(review): presumably a DOM selector for the target element — confirm
+  targetText?: string; // NOTE(review): presumably the visible text of the target element — confirm
+  valueDescription?: string; // NOTE(review): presumably a redacted description of entered input, not the raw value — confirm
+}
+
 export interface PopulateRuntimeTraceStep {
   kind: PopulateRuntimeTraceStepKind;
   label: string;
@@ -58,6 +77,7 @@ export interface PopulateRuntimeTraceStep {
   input?: Record<string, unknown>;
   output?: Record<string, unknown>;
   error?: string;
+  browserAction?: PopulateRuntimeBrowserAction;
 }
 
 export interface PopulateProcessTraceSourceArtifact {
diff --git a/backend/src/pipeline/populate-self-healing.ts b/backend/src/pipeline/populate-self-healing.ts
index 2ba75ba..06022a4 100644
--- a/backend/src/pipeline/populate-self-healing.ts
+++ b/backend/src/pipeline/populate-self-healing.ts
@@ -13,6 +13,10 @@ import {
   datasetContextSchema,
   type DatasetContext,
 } from "./populate.js";
+import {
+  playwrightCandidateReadinessForRun,
+  type PopulatePlaywrightCandidateReadiness,
+} from "./populate-playwright-readiness.js";
 
 export type PopulateRecipeStatus =
   | "active"
@@ -28,6 +32,7 @@ export type PopulateRecipeArtifactKind =
   | "source-transcript"
   | "captured-rows"
   | "process-trace"
+  | "playwright-candidate-readiness"
   | "playwright-candidate-script";
 
 const MAX_ARTIFACT_TEXT_LENGTH = 20_000;
@@ -884,10 +889,24 @@ function artifactsForRun(input: {
       label: "populate-process-trace",
       content: processTraceArtifactContent(processTrace),
     });
+    artifacts.push({
+      kind: "playwright-candidate-readiness",
+      label: "populate-playwright-candidate-readiness",
+      content: playwrightCandidateReadinessArtifactContent(
+        playwrightCandidateReadinessForRun({ result: input.result })
+      ),
+    });
   }
   return artifacts;
 }
 
+/** Pretty-prints the readiness verdict as JSON, cut to at most MAX_ARTIFACT_TEXT_LENGTH characters. */
+function playwrightCandidateReadinessArtifactContent(
+  readiness: PopulatePlaywrightCandidateReadiness
+): string {
+  return JSON.stringify(readiness, null, 2).substring(0, MAX_ARTIFACT_TEXT_LENGTH);
+}
+
 function processTraceArtifactContent(processTrace: PopulateProcessTrace): string {
   let content = "";
   for (const limits of PROCESS_TRACE_ARTIFACT_LIMITS) {
diff --git a/backend/test/populate-playwright-readiness.test.ts b/backend/test/populate-playwright-readiness.test.ts
new file mode 100644
index 0000000..cd95a09
--- /dev/null
+++ b/backend/test/populate-playwright-readiness.test.ts
@@ -0,0 +1,144 @@
+import assert from "node:assert/strict";
+import { test } from "node:test";
+
+import { playwrightCandidateReadinessForRun } from "../src/pipeline/populate-playwright-readiness.js";
+import type { PopulateRuntimeResult } from "../src/pipeline/populate-runtime.js";
+
+// A trace that only searched and fetched has no browser step to replay,
+// so the gate must reject it.
+test("Playwright candidate readiness rejects search/fetch-only traces", () => {
+  const newsUrl = "https://openai.com/news";
+  const { status, browserStepCount, reasons } = playwrightCandidateReadinessForRun({
+    result: runtimeResult({
+      processTrace: {
+        runtime: "collection",
+        searchQueries: ["OpenAI latest blog"],
+        fetchedUrls: [newsUrl],
+        sourceArtifacts: [
+          { url: newsUrl, status: "succeeded", source: "fetch", label: "news" },
+        ],
+        selectedRowSource: "collection_pipeline",
+        notes: [],
+        steps: [{
+          kind: "fetch",
+          label: "collection-fetched-url",
+          status: "succeeded",
+          input: { url: newsUrl },
+        }],
+      },
+    }),
+  });
+
+  assert.equal(status, "not_ready");
+  assert.equal(browserStepCount, 0);
+  assert.match(reasons.join("\n"), /no actionable browser steps/i);
+});
+
+// Even a trace with a usable browser step is rejected when the run's
+// validation issues carry the "TinyFish Agent disabled" capability
+// diagnostic.
+test("Playwright candidate readiness rejects Agent-disabled capability diagnostics", () => {
+  const formUrl = "https://example.com/form";
+  const agentDisabledIssue =
+    "Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 1 page(s).";
+  const result = runtimeResult({
+    validationIssues: [agentDisabledIssue],
+    processTrace: {
+      runtime: "collection",
+      searchQueries: [],
+      fetchedUrls: [formUrl],
+      sourceArtifacts: [{ url: formUrl, status: "succeeded", source: "fetch" }],
+      selectedRowSource: "collection_pipeline",
+      notes: [],
+      steps: [{
+        kind: "browser",
+        label: "agent-navigation",
+        status: "succeeded",
+        browserAction: {
+          action: "navigate",
+          url: formUrl,
+        },
+      }],
+    },
+  });
+
+  const { status, reasons } = playwrightCandidateReadinessForRun({ result });
+
+  assert.equal(status, "not_ready");
+  assert.match(reasons.join("\n"), /Agent\/browser follow-up/i);
+});
+
+// Happy path: one succeeded browser click with a selector, anchored to a
+// succeeded source artifact, passes every readiness check.
+test("Playwright candidate readiness accepts browser-action traces anchored to sources", () => {
+  const formUrl = "https://example.com/form";
+  const result = runtimeResult({
+    processTrace: {
+      runtime: "collection",
+      searchQueries: [],
+      fetchedUrls: [formUrl],
+      sourceArtifacts: [
+        { url: formUrl, status: "succeeded", source: "agent", label: "browser-canary" },
+      ],
+      selectedRowSource: "collection_pipeline",
+      notes: [],
+      steps: [{
+        kind: "browser",
+        label: "agent-form-submit",
+        status: "succeeded",
+        browserAction: {
+          action: "click",
+          url: formUrl,
+          selector: "button[type=submit]",
+        },
+      }],
+    },
+  });
+
+  const readiness = playwrightCandidateReadinessForRun({ result });
+
+  assert.equal(readiness.status, "ready");
+  assert.deepEqual(readiness.reasons, []);
+  assert.equal(readiness.browserStepCount, 1);
+  assert.equal(readiness.sourceUrlCount, 1);
+});
+
+/**
+ * Builds a minimal runtime result fixture with one OpenAI row. `debug` is
+ * populated only when a process trace is supplied.
+ */
+function runtimeResult(input: {
+  validationIssues?: string[];
+  processTrace?: NonNullable<PopulateRuntimeResult["debug"]>["processTrace"];
+}): PopulateRuntimeResult {
+  const sourceUrl = "https://openai.com/news";
+  const quote = "Release notes";
+  const debug: PopulateRuntimeResult["debug"] = input.processTrace
+    ? {
+        capturedRows: [],
+        capturedSources: [],
+        selectedRowSource: "collection_pipeline",
+        notes: [],
+        processTrace: input.processTrace,
+      }
+    : undefined;
+
+  return {
+    rows: [{
+      cells: { entity_name: "OpenAI", source_url: sourceUrl, evidence_quote: quote },
+      sourceUrls: [sourceUrl],
+      evidence: [{ columnName: "evidence_quote", sourceUrl, quote }],
+      needsReview: false,
+    }],
+    validationIssues: input.validationIssues ?? [],
+    usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
+    metrics: {
+      searchCalls: 0,
+      fetchCalls: 0,
+      browserCalls: 0,
+      agentRuns: 0,
+      agentSteps: 0,
+    },
+    debug,
+  };
+}
diff --git a/backend/test/populate-self-healing.test.ts b/backend/test/populate-self-healing.test.ts
index 7544460..b68356b 100644
--- a/backend/test/populate-self-healing.test.ts
+++ b/backend/test/populate-self-healing.test.ts
@@ -117,6 +117,17 @@ test("Mastra populate recipe runtime maps populate rows into a healthy recipe ru
   assert.deepEqual(trace.searchQueries, ["OpenAI latest blog"]);
   assert.deepEqual(trace.fetchedUrls, ["https://openai.com/news"]);
   assert.equal(trace.selectedRowSource, "insert_row");
+  const readinessArtifact = run.artifacts.find((artifact) =>
+    artifact.kind === "playwright-candidate-readiness"
+  );
+  assert.ok(readinessArtifact);
+  const readiness = JSON.parse(readinessArtifact.content);
+  assert.equal(readiness.status, "not_ready");
+  assert.match(readiness.reasons.join("\n"), /no actionable browser steps/i);
+  assert.equal(
+    run.artifacts.some((artifact) => artifact.kind === "playwright-candidate-script"),
+    false
+  );
 });
 
 test("Mastra populate recipe runtime keeps supplemental fetch misses non-blocking", async () => {
diff --git a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md
index a4e0cc7..ce88a3d 100644
--- a/benchmarks/dataset-agent/README.md
+++ b/benchmarks/dataset-agent/README.md
@@ -81,6 +81,13 @@ Latest `mcp-docs-pages` Agent-enabled canary evidence:
 App and CLI collection-runtime runs use the same runner shape, but load it from
 `POPULATE_COLLECTION_RUNNER_MODULE` when `POPULATE_AGENT_RUNTIME=collection`.
 
+When a runtime exposes trace data, self-healing run records now include a
+`process-trace` artifact and a `playwright-candidate-readiness` artifact that
+says whether the trace is grounded enough for a future Playwright compiler.
+Search and fetch URLs alone are not enough. The gate requires succeeded browser
+steps carrying a URL, selector, or target text (a redacted input description
+alone does not count) before any `playwright-candidate-script` can be emitted.
+
 ## Verify Self-Healing Stack
 
 Use this before asking someone else to migrate a new collection agent into the
diff --git a/docs/data-collection-agent-migration-plan.md b/docs/data-collection-agent-migration-plan.md
index 8c0394c..8175714 100644
--- a/docs/data-collection-agent-migration-plan.md
+++ b/docs/data-collection-agent-migration-plan.md
@@ -92,6 +92,11 @@ The current layer now can:
 - expose structured trace data for both Mastra and collection runs:
   `runtime`, `searchQueries`, `fetchedUrls`, `sourceArtifacts`,
   `selectedRowSource`, `notes`, and ordered `steps`
+- expose a `playwright-candidate-readiness` artifact that explains whether the
+  trace is grounded enough to compile a future Playwright script
+- represent browser actions in the trace contract when a future Agent/canary
+  records URL transitions, selectors, target text, or redacted input
+  descriptions
 - emit a capability diagnostic when no-Agent mode sees pages that need browser,
   form, or detail-page follow-up
 
@@ -103,6 +108,9 @@ The current layer does not yet:
 - run cron from compiled Playwright scripts
 - repair or promote Playwright scripts; repair still changes durable runtime
   instructions only
+- compile search/fetch-only traces into Playwright; traces must include
+  actionable browser steps before the script compiler is allowed to emit a
+  candidate
 - run a green live Convex canary in this local environment
 - prove Agent-enabled collection quality on a full real benchmark
 - prove the collection runtime should replace Mastra as the default app runtime
@@ -166,6 +174,9 @@ The current layer does not yet:
    - 2-prompt real benchmark
    - 1-prompt Agent-enabled capability canary for prompts that need browser or
      detail follow-up
+   - browser-step trace canary that records URL transitions, selectors/targets,
+     and redacted form-input descriptions before any Playwright compiler is
+     enabled
    - full benchmark only after the 2-prompt run is not obviously broken
    - live `--dataset-id` dry-run only after Convex/env prerequisites are ready
    - `--commit` only on a throwaway dataset first