Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions backend/src/pipeline/populate-playwright-readiness.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import type {
PopulateProcessTrace,
PopulateRuntimeResult,
PopulateRuntimeTraceStep,
} from "./populate-runtime.js";

/**
 * Outcome of the Playwright candidate readiness gate: `"ready"` when the gate
 * recorded no blocking reasons for a run, `"not_ready"` otherwise.
 */
export type PopulatePlaywrightCandidateReadinessStatus =
| "ready"
| "not_ready";

/**
 * Readiness report produced by `playwrightCandidateReadinessForRun` for a
 * single populate run.
 */
export interface PopulatePlaywrightCandidateReadiness {
// "ready" only when `reasons` is empty.
status: PopulatePlaywrightCandidateReadinessStatus;
// Human-readable explanations, one per failed readiness check.
reasons: string[];
// Number of succeeded browser steps that carry a URL, selector, or target text.
browserStepCount: number;
// Number of distinct http(s) URLs found among fetched URLs and succeeded source artifacts.
sourceUrlCount: number;
}

/**
 * Decides whether a populate run's process trace is grounded enough to serve
 * as input for a Playwright candidate script.
 *
 * Four checks are evaluated, and each failed check contributes one entry to
 * `reasons` in this fixed order: missing trace, Agent-disabled capability
 * diagnostic, no actionable browser steps, no source URLs. When the trace is
 * missing, both counts are reported as zero, so the two count-based reasons
 * are recorded as well.
 *
 * @param input.result Runtime result whose `debug.processTrace`,
 *   `validationIssues`, and `debug.notes` are inspected.
 * @returns The readiness status, the blocking reasons, and the two counts.
 */
export function playwrightCandidateReadinessForRun(input: {
  result: PopulateRuntimeResult;
}): PopulatePlaywrightCandidateReadiness {
  const { result } = input;
  const trace = result.debug?.processTrace;

  const agentDisabled = hasAgentDisabledCapabilityDiagnostic(result);
  const browserStepCount = trace ? actionableBrowserSteps(trace).length : 0;
  const sourceUrlCount = trace ? sourceUrlCountForTrace(trace) : 0;

  // Ordered table of [check failed?, reason]; the order defines the order of
  // the reported reasons.
  const checks: Array<[boolean, string]> = [
    [!trace, "Process trace is missing."],
    [
      agentDisabled,
      "TinyFish Agent/browser follow-up was required but disabled for this run.",
    ],
    [
      browserStepCount === 0,
      "Trace has no actionable browser steps with URL/selector/target data.",
    ],
    [
      sourceUrlCount === 0,
      "Trace has no source URLs to anchor a replay script.",
    ],
  ];

  const reasons: string[] = [];
  for (const [failed, reason] of checks) {
    if (failed) {
      reasons.push(reason);
    }
  }

  const status = reasons.length === 0 ? "ready" : "not_ready";
  return { status, reasons, browserStepCount, sourceUrlCount };
}

/**
 * Reports whether the run recorded a "TinyFish Agent disabled" capability
 * diagnostic.
 *
 * Both `result.validationIssues` and `result.debug.notes` (when present) are
 * searched; the match is case-insensitive and may occur anywhere in a message.
 */
function hasAgentDisabledCapabilityDiagnostic(
  result: PopulateRuntimeResult
): boolean {
  const agentDisabledPattern = /Capability diagnostic: TinyFish Agent disabled/i;
  const debugNotes = result.debug?.notes ?? [];
  for (const message of [...result.validationIssues, ...debugNotes]) {
    if (agentDisabledPattern.test(message)) {
      return true;
    }
  }
  return false;
}

/**
 * Selects the trace steps that could anchor a replayable browser action.
 *
 * A step qualifies when its kind is `"browser"`, its status is `"succeeded"`,
 * and its `browserAction` carries at least one non-empty anchor: `url`,
 * `selector`, or `targetText`.
 *
 * @returns The qualifying steps, in their original trace order.
 */
function actionableBrowserSteps(
  processTrace: PopulateProcessTrace
): PopulateRuntimeTraceStep[] {
  return processTrace.steps.filter(({ kind, status, browserAction }) => {
    const isSucceededBrowserStep = kind === "browser" && status === "succeeded";
    if (!isSucceededBrowserStep || !browserAction) {
      return false;
    }
    const anchors = [
      browserAction.url,
      browserAction.selector,
      browserAction.targetText,
    ];
    return anchors.some((anchor) => Boolean(anchor));
  });
}

/**
 * Counts the distinct http(s) URLs a trace can point to as sources.
 *
 * Candidates are every entry of `fetchedUrls` plus the `url` of each source
 * artifact whose status is `"succeeded"`. Only values starting with `http://`
 * or `https://` (case-insensitive) are counted, and exact duplicates are
 * counted once.
 */
function sourceUrlCountForTrace(processTrace: PopulateProcessTrace): number {
  const httpUrlPattern = /^https?:\/\//i;
  const succeededArtifactUrls = processTrace.sourceArtifacts
    .filter((artifact) => artifact.status === "succeeded")
    .map((artifact) => artifact.url);

  const distinctUrls = new Set<string>();
  for (const candidate of [...processTrace.fetchedUrls, ...succeededArtifactUrls]) {
    if (httpUrlPattern.test(candidate)) {
      distinctUrls.add(candidate);
    }
  }
  return distinctUrls.size;
}
20 changes: 20 additions & 0 deletions backend/src/pipeline/populate-runtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,37 @@ export type PopulateRuntimeTraceStepKind =
| "fetch"
| "insert_row"
| "agent"
| "browser"
| "extract"
| "repair"
| "validation";

/**
 * Kinds of browser action that a trace step's `browserAction` can record.
 * NOTE(review): `"unknown"` presumably covers recorded actions that do not map
 * onto the other kinds — confirm with the code that produces these steps.
 */
export type PopulateRuntimeBrowserActionKind =
| "navigate"
| "click"
| "type"
| "select"
| "wait"
| "extract"
| "screenshot"
| "unknown";

/**
 * Structured description of one browser action attached to a trace step.
 * Only `action` is required; every other field is optional.
 */
export interface PopulateRuntimeBrowserAction {
// Which kind of browser action was performed.
action: PopulateRuntimeBrowserActionKind;
// URL associated with the action (for example the page navigated to or acted on).
url?: string;
// Selector of the element the action targeted.
selector?: string;
// Text of the element the action targeted.
targetText?: string;
// NOTE(review): presumably a redacted description of the typed/selected value
// rather than the raw value — confirm against the recorder.
valueDescription?: string;
}

/**
 * One ordered entry in a populate process trace.
 */
export interface PopulateRuntimeTraceStep {
// Category of work this step represents (fetch, agent, browser, ...).
kind: PopulateRuntimeTraceStepKind;
// Short identifier for the step.
label: string;
// Outcome of the step.
status: "succeeded" | "failed" | "skipped";
// Optional free-form input payload recorded for the step.
input?: Record<string, unknown>;
// Optional free-form output payload recorded for the step.
output?: Record<string, unknown>;
// Optional error message; presumably set when the step failed — TODO confirm.
error?: string;
// Optional structured browser action details for the step.
browserAction?: PopulateRuntimeBrowserAction;
}

export interface PopulateProcessTraceSourceArtifact {
Expand Down
19 changes: 19 additions & 0 deletions backend/src/pipeline/populate-self-healing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ import {
datasetContextSchema,
type DatasetContext,
} from "./populate.js";
import {
playwrightCandidateReadinessForRun,
type PopulatePlaywrightCandidateReadiness,
} from "./populate-playwright-readiness.js";

export type PopulateRecipeStatus =
| "active"
Expand All @@ -28,6 +32,7 @@ export type PopulateRecipeArtifactKind =
| "source-transcript"
| "captured-rows"
| "process-trace"
| "playwright-candidate-readiness"
| "playwright-candidate-script";

const MAX_ARTIFACT_TEXT_LENGTH = 20_000;
Expand Down Expand Up @@ -884,10 +889,24 @@ function artifactsForRun(input: {
label: "populate-process-trace",
content: processTraceArtifactContent(processTrace),
});
artifacts.push({
kind: "playwright-candidate-readiness",
label: "populate-playwright-candidate-readiness",
content: playwrightCandidateReadinessArtifactContent(
playwrightCandidateReadinessForRun({ result: input.result })
),
});
}
return artifacts;
}

/**
 * Serializes a readiness report as pretty-printed JSON (2-space indent) for
 * storage as artifact content, capped at `MAX_ARTIFACT_TEXT_LENGTH` characters.
 *
 * NOTE: the cap is a plain character cut, so a report longer than the limit
 * yields truncated text that is no longer valid JSON.
 */
function playwrightCandidateReadinessArtifactContent(
  readiness: PopulatePlaywrightCandidateReadiness
): string {
  const serialized = JSON.stringify(readiness, null, 2);
  return serialized.length > MAX_ARTIFACT_TEXT_LENGTH
    ? serialized.slice(0, MAX_ARTIFACT_TEXT_LENGTH)
    : serialized;
}

function processTraceArtifactContent(processTrace: PopulateProcessTrace): string {
let content = "";
for (const limits of PROCESS_TRACE_ARTIFACT_LIMITS) {
Expand Down
144 changes: 144 additions & 0 deletions backend/test/populate-playwright-readiness.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import assert from "node:assert/strict";
import { test } from "node:test";

import { playwrightCandidateReadinessForRun } from "../src/pipeline/populate-playwright-readiness.js";
import type { PopulateRuntimeResult } from "../src/pipeline/populate-runtime.js";

// The trace has a valid source URL but its only step is a fetch step, so no
// browser step qualifies and the gate must report "not_ready".
test("Playwright candidate readiness rejects search/fetch-only traces", () => {
const readiness = playwrightCandidateReadinessForRun({
result: runtimeResult({
processTrace: {
runtime: "collection",
searchQueries: ["OpenAI latest blog"],
fetchedUrls: ["https://openai.com/news"],
sourceArtifacts: [{
url: "https://openai.com/news",
status: "succeeded",
source: "fetch",
label: "news",
}],
selectedRowSource: "collection_pipeline",
notes: [],
steps: [{
kind: "fetch",
label: "collection-fetched-url",
status: "succeeded",
input: { url: "https://openai.com/news" },
}],
},
}),
});

assert.equal(readiness.status, "not_ready");
assert.equal(readiness.browserStepCount, 0);
assert.match(readiness.reasons.join("\n"), /no actionable browser steps/i);
});

// The trace itself has an actionable browser step and a source URL; the
// Agent-disabled capability diagnostic in validationIssues is what blocks
// readiness here.
test("Playwright candidate readiness rejects Agent-disabled capability diagnostics", () => {
const readiness = playwrightCandidateReadinessForRun({
result: runtimeResult({
validationIssues: [
"Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 1 page(s).",
],
processTrace: {
runtime: "collection",
searchQueries: [],
fetchedUrls: ["https://example.com/form"],
sourceArtifacts: [{
url: "https://example.com/form",
status: "succeeded",
source: "fetch",
}],
selectedRowSource: "collection_pipeline",
notes: [],
steps: [{
kind: "browser",
label: "agent-navigation",
status: "succeeded",
browserAction: {
action: "navigate",
url: "https://example.com/form",
},
}],
},
}),
});

assert.equal(readiness.status, "not_ready");
assert.match(readiness.reasons.join("\n"), /Agent\/browser follow-up/i);
});

// Happy path: one succeeded browser step with a URL and selector, one
// succeeded source artifact, and no capability diagnostic, so every check
// passes and the reasons list is empty.
test("Playwright candidate readiness accepts browser-action traces anchored to sources", () => {
const readiness = playwrightCandidateReadinessForRun({
result: runtimeResult({
processTrace: {
runtime: "collection",
searchQueries: [],
fetchedUrls: ["https://example.com/form"],
sourceArtifacts: [{
url: "https://example.com/form",
status: "succeeded",
source: "agent",
label: "browser-canary",
}],
selectedRowSource: "collection_pipeline",
notes: [],
steps: [{
kind: "browser",
label: "agent-form-submit",
status: "succeeded",
browserAction: {
action: "click",
url: "https://example.com/form",
selector: "button[type=submit]",
},
}],
},
}),
});

assert.equal(readiness.status, "ready");
assert.deepEqual(readiness.reasons, []);
assert.equal(readiness.browserStepCount, 1);
assert.equal(readiness.sourceUrlCount, 1);
});

/**
 * Builds a `PopulateRuntimeResult` fixture for the readiness tests.
 *
 * The result always contains one fixed row and zeroed usage/metrics.
 * `validationIssues` defaults to an empty list, and `debug` is populated only
 * when a `processTrace` is supplied; otherwise it is `undefined`.
 */
function runtimeResult(input: {
  validationIssues?: string[];
  processTrace?: NonNullable<PopulateRuntimeResult["debug"]>["processTrace"];
}): PopulateRuntimeResult {
  const sourceUrl = "https://openai.com/news";
  const quote = "Release notes";

  let debug: PopulateRuntimeResult["debug"] = undefined;
  if (input.processTrace) {
    debug = {
      capturedRows: [],
      capturedSources: [],
      selectedRowSource: "collection_pipeline",
      notes: [],
      processTrace: input.processTrace,
    };
  }

  const row = {
    cells: {
      entity_name: "OpenAI",
      source_url: sourceUrl,
      evidence_quote: quote,
    },
    sourceUrls: [sourceUrl],
    evidence: [
      {
        columnName: "evidence_quote",
        sourceUrl,
        quote,
      },
    ],
    needsReview: false,
  };

  return {
    rows: [row],
    validationIssues: input.validationIssues ?? [],
    usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
    metrics: {
      searchCalls: 0,
      fetchCalls: 0,
      browserCalls: 0,
      agentRuns: 0,
      agentSteps: 0,
    },
    debug,
  };
}
11 changes: 11 additions & 0 deletions backend/test/populate-self-healing.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,17 @@ test("Mastra populate recipe runtime maps populate rows into a healthy recipe ru
assert.deepEqual(trace.searchQueries, ["OpenAI latest blog"]);
assert.deepEqual(trace.fetchedUrls, ["https://openai.com/news"]);
assert.equal(trace.selectedRowSource, "insert_row");
const readinessArtifact = run.artifacts.find((artifact) =>
artifact.kind === "playwright-candidate-readiness"
);
assert.ok(readinessArtifact);
const readiness = JSON.parse(readinessArtifact.content);
assert.equal(readiness.status, "not_ready");
assert.match(readiness.reasons.join("\n"), /no actionable browser steps/i);
assert.equal(
run.artifacts.some((artifact) => artifact.kind === "playwright-candidate-script"),
false
);
});

test("Mastra populate recipe runtime keeps supplemental fetch misses non-blocking", async () => {
Expand Down
7 changes: 7 additions & 0 deletions benchmarks/dataset-agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ Latest `mcp-docs-pages` Agent-enabled canary evidence:
App and CLI collection-runtime runs use the same runner shape, but load it from
`POPULATE_COLLECTION_RUNNER_MODULE` when `POPULATE_AGENT_RUNTIME=collection`.

Self-healing run records now include a `process-trace` artifact when a runtime
exposes trace data and a `playwright-candidate-readiness` artifact that says
whether the trace is grounded enough for a future Playwright compiler. Search
and fetch URLs alone are not enough: the readiness gate requires at least one
succeeded browser step that carries a URL, selector, or target text, and at
least one `http(s)` source URL. Redacted input descriptions can be recorded on
browser actions, but they do not make a step actionable on their own. No
`playwright-candidate-script` can be emitted until the gate reports `ready`.

## Verify Self-Healing Stack

Use this before asking someone else to migrate a new collection agent into the
Expand Down
11 changes: 11 additions & 0 deletions docs/data-collection-agent-migration-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ The current layer now can:
- expose structured trace data for both Mastra and collection runs:
`runtime`, `searchQueries`, `fetchedUrls`, `sourceArtifacts`,
`selectedRowSource`, `notes`, and ordered `steps`
- expose a `playwright-candidate-readiness` artifact that explains whether the
trace is grounded enough to compile a future Playwright script
- represent browser actions in the trace contract when a future Agent/canary
records URL transitions, selectors, target text, or redacted input
descriptions
- emit a capability diagnostic when no-Agent mode sees pages that need browser,
form, or detail-page follow-up

Expand All @@ -103,6 +108,9 @@ The current layer does not yet:
- run cron from compiled Playwright scripts
- repair or promote Playwright scripts; repair still changes durable runtime
instructions only
- compile search/fetch-only traces into Playwright; traces must include
actionable browser steps before the script compiler is allowed to emit a
candidate
- run a green live Convex canary in this local environment
- prove Agent-enabled collection quality on a full real benchmark
- prove the collection runtime should replace Mastra as the default app runtime
Expand Down Expand Up @@ -166,6 +174,9 @@ The current layer does not yet:
- 2-prompt real benchmark
- 1-prompt Agent-enabled capability canary for prompts that need browser or
detail follow-up
- browser-step trace canary that records URL transitions, selectors/targets,
and redacted form-input descriptions before any Playwright compiler is
enabled
- full benchmark only after the 2-prompt run is not obviously broken
- live `--dataset-id` dry-run only after Convex/env prerequisites are ready
- `--commit` only on a throwaway dataset first
Expand Down