From 8bafe26e537967d452f31d31157f077e49ca4ed0 Mon Sep 17 00:00:00 2001 From: Edward Tran Date: Sat, 23 May 2026 06:21:35 +0700 Subject: [PATCH] Gate Playwright candidate readiness --- .../pipeline/populate-playwright-readiness.ts | 95 ++++++++++++ backend/src/pipeline/populate-runtime.ts | 20 +++ backend/src/pipeline/populate-self-healing.ts | 19 +++ .../populate-playwright-readiness.test.ts | 144 ++++++++++++++++++ backend/test/populate-self-healing.test.ts | 11 ++ benchmarks/dataset-agent/README.md | 7 + docs/data-collection-agent-migration-plan.md | 11 ++ 7 files changed, 307 insertions(+) create mode 100644 backend/src/pipeline/populate-playwright-readiness.ts create mode 100644 backend/test/populate-playwright-readiness.test.ts diff --git a/backend/src/pipeline/populate-playwright-readiness.ts b/backend/src/pipeline/populate-playwright-readiness.ts new file mode 100644 index 0000000..c7a1b59 --- /dev/null +++ b/backend/src/pipeline/populate-playwright-readiness.ts @@ -0,0 +1,95 @@ +import type { + PopulateProcessTrace, + PopulateRuntimeResult, + PopulateRuntimeTraceStep, +} from "./populate-runtime.js"; + +export type PopulatePlaywrightCandidateReadinessStatus = + | "ready" + | "not_ready"; + +export interface PopulatePlaywrightCandidateReadiness { + status: PopulatePlaywrightCandidateReadinessStatus; + reasons: string[]; + browserStepCount: number; + sourceUrlCount: number; +} + +export function playwrightCandidateReadinessForRun(input: { + result: PopulateRuntimeResult; +}): PopulatePlaywrightCandidateReadiness { + const processTrace = input.result.debug?.processTrace; + const reasons: string[] = []; + + if (!processTrace) { + reasons.push("Process trace is missing."); + } + if (hasAgentDisabledCapabilityDiagnostic(input.result)) { + reasons.push( + "TinyFish Agent/browser follow-up was required but disabled for this run." + ); + } + + const browserSteps = processTrace + ? 
actionableBrowserSteps(processTrace) + : []; + if (browserSteps.length === 0) { + reasons.push( + "Trace has no actionable browser steps with URL/selector/target data." + ); + } + + const sourceUrlCount = processTrace + ? sourceUrlCountForTrace(processTrace) + : 0; + if (sourceUrlCount === 0) { + reasons.push("Trace has no source URLs to anchor a replay script."); + } + + return { + status: reasons.length === 0 ? "ready" : "not_ready", + reasons, + browserStepCount: browserSteps.length, + sourceUrlCount, + }; +} + +function hasAgentDisabledCapabilityDiagnostic( + result: PopulateRuntimeResult +): boolean { + const diagnostics = [ + ...result.validationIssues, + ...(result.debug?.notes ?? []), + ]; + return diagnostics.some((diagnostic) => + /Capability diagnostic: TinyFish Agent disabled/i.test(diagnostic) + ); +} + +function actionableBrowserSteps( + processTrace: PopulateProcessTrace +): PopulateRuntimeTraceStep[] { + return processTrace.steps.filter((step) => { + if (step.kind !== "browser" || step.status !== "succeeded") { + return false; + } + const action = step.browserAction; + if (!action) { + return false; + } + return Boolean( + action.url || + action.selector || + action.targetText + ); + }); +} + +function sourceUrlCountForTrace(processTrace: PopulateProcessTrace): number { + return new Set([ + ...processTrace.fetchedUrls, + ...processTrace.sourceArtifacts + .filter((artifact) => artifact.status === "succeeded") + .map((artifact) => artifact.url), + ].filter((url) => /^https?:\/\//i.test(url))).size; +} diff --git a/backend/src/pipeline/populate-runtime.ts b/backend/src/pipeline/populate-runtime.ts index 0a3cff0..f385e85 100644 --- a/backend/src/pipeline/populate-runtime.ts +++ b/backend/src/pipeline/populate-runtime.ts @@ -47,10 +47,29 @@ export type PopulateRuntimeTraceStepKind = | "fetch" | "insert_row" | "agent" + | "browser" | "extract" | "repair" | "validation"; +export type PopulateRuntimeBrowserActionKind = + | "navigate" + | "click" + | "type" 
+ | "select" + | "wait" + | "extract" + | "screenshot" + | "unknown"; + +export interface PopulateRuntimeBrowserAction { + action: PopulateRuntimeBrowserActionKind; + url?: string; + selector?: string; + targetText?: string; + valueDescription?: string; +} + export interface PopulateRuntimeTraceStep { kind: PopulateRuntimeTraceStepKind; label: string; @@ -58,6 +77,7 @@ export interface PopulateRuntimeTraceStep { input?: Record; output?: Record; error?: string; + browserAction?: PopulateRuntimeBrowserAction; } export interface PopulateProcessTraceSourceArtifact { diff --git a/backend/src/pipeline/populate-self-healing.ts b/backend/src/pipeline/populate-self-healing.ts index 2ba75ba..06022a4 100644 --- a/backend/src/pipeline/populate-self-healing.ts +++ b/backend/src/pipeline/populate-self-healing.ts @@ -13,6 +13,10 @@ import { datasetContextSchema, type DatasetContext, } from "./populate.js"; +import { + playwrightCandidateReadinessForRun, + type PopulatePlaywrightCandidateReadiness, +} from "./populate-playwright-readiness.js"; export type PopulateRecipeStatus = | "active" @@ -28,6 +32,7 @@ export type PopulateRecipeArtifactKind = | "source-transcript" | "captured-rows" | "process-trace" + | "playwright-candidate-readiness" | "playwright-candidate-script"; const MAX_ARTIFACT_TEXT_LENGTH = 20_000; @@ -884,10 +889,24 @@ function artifactsForRun(input: { label: "populate-process-trace", content: processTraceArtifactContent(processTrace), }); + artifacts.push({ + kind: "playwright-candidate-readiness", + label: "populate-playwright-candidate-readiness", + content: playwrightCandidateReadinessArtifactContent( + playwrightCandidateReadinessForRun({ result: input.result }) + ), + }); } return artifacts; } +function playwrightCandidateReadinessArtifactContent( + readiness: PopulatePlaywrightCandidateReadiness +): string { + return JSON.stringify(readiness, null, 2) + .slice(0, MAX_ARTIFACT_TEXT_LENGTH); +} + function processTraceArtifactContent(processTrace: 
PopulateProcessTrace): string { let content = ""; for (const limits of PROCESS_TRACE_ARTIFACT_LIMITS) { diff --git a/backend/test/populate-playwright-readiness.test.ts b/backend/test/populate-playwright-readiness.test.ts new file mode 100644 index 0000000..cd95a09 --- /dev/null +++ b/backend/test/populate-playwright-readiness.test.ts @@ -0,0 +1,144 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { playwrightCandidateReadinessForRun } from "../src/pipeline/populate-playwright-readiness.js"; +import type { PopulateRuntimeResult } from "../src/pipeline/populate-runtime.js"; + +test("Playwright candidate readiness rejects search/fetch-only traces", () => { + const readiness = playwrightCandidateReadinessForRun({ + result: runtimeResult({ + processTrace: { + runtime: "collection", + searchQueries: ["OpenAI latest blog"], + fetchedUrls: ["https://openai.com/news"], + sourceArtifacts: [{ + url: "https://openai.com/news", + status: "succeeded", + source: "fetch", + label: "news", + }], + selectedRowSource: "collection_pipeline", + notes: [], + steps: [{ + kind: "fetch", + label: "collection-fetched-url", + status: "succeeded", + input: { url: "https://openai.com/news" }, + }], + }, + }), + }); + + assert.equal(readiness.status, "not_ready"); + assert.equal(readiness.browserStepCount, 0); + assert.match(readiness.reasons.join("\n"), /no actionable browser steps/i); +}); + +test("Playwright candidate readiness rejects Agent-disabled capability diagnostics", () => { + const readiness = playwrightCandidateReadinessForRun({ + result: runtimeResult({ + validationIssues: [ + "Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 1 page(s).", + ], + processTrace: { + runtime: "collection", + searchQueries: [], + fetchedUrls: ["https://example.com/form"], + sourceArtifacts: [{ + url: "https://example.com/form", + status: "succeeded", + source: "fetch", + }], + selectedRowSource: 
"collection_pipeline", + notes: [], + steps: [{ + kind: "browser", + label: "agent-navigation", + status: "succeeded", + browserAction: { + action: "navigate", + url: "https://example.com/form", + }, + }], + }, + }), + }); + + assert.equal(readiness.status, "not_ready"); + assert.match(readiness.reasons.join("\n"), /Agent\/browser follow-up/i); +}); + +test("Playwright candidate readiness accepts browser-action traces anchored to sources", () => { + const readiness = playwrightCandidateReadinessForRun({ + result: runtimeResult({ + processTrace: { + runtime: "collection", + searchQueries: [], + fetchedUrls: ["https://example.com/form"], + sourceArtifacts: [{ + url: "https://example.com/form", + status: "succeeded", + source: "agent", + label: "browser-canary", + }], + selectedRowSource: "collection_pipeline", + notes: [], + steps: [{ + kind: "browser", + label: "agent-form-submit", + status: "succeeded", + browserAction: { + action: "click", + url: "https://example.com/form", + selector: "button[type=submit]", + }, + }], + }, + }), + }); + + assert.equal(readiness.status, "ready"); + assert.deepEqual(readiness.reasons, []); + assert.equal(readiness.browserStepCount, 1); + assert.equal(readiness.sourceUrlCount, 1); +}); + +function runtimeResult(input: { + validationIssues?: string[]; + processTrace?: NonNullable<PopulateRuntimeResult["debug"]>["processTrace"]; +}): PopulateRuntimeResult { + return { + rows: [{ + cells: { + entity_name: "OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "evidence_quote", + sourceUrl: "https://openai.com/news", + quote: "Release notes", + }], + needsReview: false, + }], + validationIssues: input.validationIssues ?? [], + usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 }, + metrics: { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }, + debug: input.processTrace + ? 
{ + capturedRows: [], + capturedSources: [], + selectedRowSource: "collection_pipeline", + notes: [], + processTrace: input.processTrace, + } + : undefined, + }; +} diff --git a/backend/test/populate-self-healing.test.ts b/backend/test/populate-self-healing.test.ts index 7544460..b68356b 100644 --- a/backend/test/populate-self-healing.test.ts +++ b/backend/test/populate-self-healing.test.ts @@ -117,6 +117,17 @@ test("Mastra populate recipe runtime maps populate rows into a healthy recipe ru assert.deepEqual(trace.searchQueries, ["OpenAI latest blog"]); assert.deepEqual(trace.fetchedUrls, ["https://openai.com/news"]); assert.equal(trace.selectedRowSource, "insert_row"); + const readinessArtifact = run.artifacts.find((artifact) => + artifact.kind === "playwright-candidate-readiness" + ); + assert.ok(readinessArtifact); + const readiness = JSON.parse(readinessArtifact.content); + assert.equal(readiness.status, "not_ready"); + assert.match(readiness.reasons.join("\n"), /no actionable browser steps/i); + assert.equal( + run.artifacts.some((artifact) => artifact.kind === "playwright-candidate-script"), + false + ); }); test("Mastra populate recipe runtime keeps supplemental fetch misses non-blocking", async () => { diff --git a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md index a4e0cc7..ce88a3d 100644 --- a/benchmarks/dataset-agent/README.md +++ b/benchmarks/dataset-agent/README.md @@ -81,6 +81,13 @@ Latest `mcp-docs-pages` Agent-enabled canary evidence: App and CLI collection-runtime runs use the same runner shape, but load it from `POPULATE_COLLECTION_RUNNER_MODULE` when `POPULATE_AGENT_RUNTIME=collection`. +Self-healing run records now include a `process-trace` artifact when a runtime +exposes trace data and a `playwright-candidate-readiness` artifact that says +whether the trace is grounded enough for a future Playwright compiler. Search +and fetch URLs alone are not enough. 
The readiness gate expects real browser +actions such as URL transitions, selectors, target text, or redacted input +descriptions before any `playwright-candidate-script` can be emitted. + ## Verify Self-Healing Stack Use this before asking someone else to migrate a new collection agent into the diff --git a/docs/data-collection-agent-migration-plan.md b/docs/data-collection-agent-migration-plan.md index 8c0394c..8175714 100644 --- a/docs/data-collection-agent-migration-plan.md +++ b/docs/data-collection-agent-migration-plan.md @@ -92,6 +92,11 @@ The current layer now can: - expose structured trace data for both Mastra and collection runs: `runtime`, `searchQueries`, `fetchedUrls`, `sourceArtifacts`, `selectedRowSource`, `notes`, and ordered `steps` +- expose a `playwright-candidate-readiness` artifact that explains whether the + trace is grounded enough to compile a future Playwright script +- represent browser actions in the trace contract when a future Agent/canary + records URL transitions, selectors, target text, or redacted input + descriptions - emit a capability diagnostic when no-Agent mode sees pages that need browser, form, or detail-page follow-up @@ -103,6 +108,9 @@ The current layer does not yet: - run cron from compiled Playwright scripts - repair or promote Playwright scripts; repair still changes durable runtime instructions only +- compile search/fetch-only traces into Playwright; traces must include + actionable browser steps before the script compiler is allowed to emit a + candidate - run a green live Convex canary in this local environment - prove Agent-enabled collection quality on a full real benchmark - prove the collection runtime should replace Mastra as the default app runtime @@ -166,6 +174,9 @@ The current layer does not yet: - 2-prompt real benchmark - 1-prompt Agent-enabled capability canary for prompts that need browser or detail follow-up + - browser-step trace canary that records URL transitions, selectors/targets, + and redacted 
form-input descriptions before any Playwright compiler is + enabled - full benchmark only after the 2-prompt run is not obviously broken - live `--dataset-id` dry-run only after Convex/env prerequisites are ready - `--commit` only on a throwaway dataset first