diff --git a/backend/src/pipeline/collection-agent-runner.ts b/backend/src/pipeline/collection-agent-runner.ts index 5c85a4f..2f7a7ae 100644 --- a/backend/src/pipeline/collection-agent-runner.ts +++ b/backend/src/pipeline/collection-agent-runner.ts @@ -10,6 +10,7 @@ import type { import { populateProcessTraceFromSteps, type PopulateCellValue, + type PopulateRuntimeBrowserAction, type PopulateRuntimeResult, type PopulateRuntimeTraceStep, } from "./populate-runtime.js"; @@ -51,6 +52,8 @@ interface CollectionPipelineResult { search_queries?: string[]; fetched_urls?: string[]; failed_urls?: string[]; + browser_actions?: CollectionBrowserActionReport[]; + agent_browser_actions?: CollectionBrowserActionReport[]; }; repair?: { stats?: CollectionPhaseStats; @@ -59,6 +62,8 @@ interface CollectionPipelineResult { search_queries?: string[]; fetched_urls?: string[]; failed_urls?: string[]; + browser_actions?: CollectionBrowserActionReport[]; + agent_browser_actions?: CollectionBrowserActionReport[]; quality?: { records?: CollectionRecordQuality[]; }; @@ -124,9 +129,25 @@ interface CollectionSourceOutcome { interface CollectionRepairLoopReport { loop_index?: number; repair_queries?: string[]; + browser_actions?: CollectionBrowserActionReport[]; + agent_browser_actions?: CollectionBrowserActionReport[]; stats?: CollectionPhaseStats; } +interface CollectionBrowserActionReport { + action?: string; + url?: string; + selector?: string; + target_text?: string; + targetText?: string; + value_description?: string; + valueDescription?: string; + status?: string; + error?: string; + phase?: string; + label?: string; +} + const AGENT_REQUIRED_TRIAGE_STATUSES = new Set([ "requires_navigation", "requires_form_submission", @@ -312,8 +333,25 @@ function collectionProcessTrace(input: { }, }); } + steps.push(...browserTraceStepsFromReports({ + reports: [ + ...(loop.browser_actions ?? []), + ...(loop.agent_browser_actions ?? []), + ], + defaultPhase: `repair-loop-${loop.loop_index ?? 
"unknown"}`, + })); } + steps.push(...browserTraceStepsFromReports({ + reports: [ + ...(report.browser_actions ?? []), + ...(report.agent_browser_actions ?? []), + ...(report.initial?.browser_actions ?? []), + ...(report.initial?.agent_browser_actions ?? []), + ], + defaultPhase: "initial", + })); + for (const outcome of report.sources?.outcomes ?? []) { if (!outcome.url) { continue; @@ -358,6 +396,92 @@ function collectionDebugNotes(report: CollectionPipelineResult["report"]): strin return notes; } +function browserTraceStepsFromReports(input: { + reports: CollectionBrowserActionReport[]; + defaultPhase: string; +}): PopulateRuntimeTraceStep[] { + return input.reports + .map((report) => browserTraceStepFromReport({ + report, + defaultPhase: input.defaultPhase, + })) + .filter((step): step is PopulateRuntimeTraceStep => Boolean(step)); +} + +function browserTraceStepFromReport(input: { + report: CollectionBrowserActionReport; + defaultPhase: string; +}): PopulateRuntimeTraceStep | undefined { + const browserAction = browserActionFromReport(input.report); + if (!browserAction) { + return undefined; + } + + return { + kind: "browser", + label: input.report.label ?? `collection-browser-${browserAction.action}`, + status: browserActionTraceStatus(input.report.status), + input: { + url: browserAction.url, + selector: browserAction.selector, + targetText: browserAction.targetText, + phase: input.report.phase ?? input.defaultPhase, + }, + error: input.report.error, + browserAction, + }; +} + +function browserActionFromReport( + report: CollectionBrowserActionReport +): PopulateRuntimeBrowserAction | undefined { + const action = browserActionKind(report.action); + const targetText = report.targetText ?? report.target_text; + const valueDescription = + report.valueDescription ?? 
report.value_description; + if (!report.url && !report.selector && !targetText) { + return undefined; + } + return { + action, + url: report.url, + selector: report.selector, + targetText, + valueDescription, + }; +} + +function browserActionKind( + value: string | undefined +): PopulateRuntimeBrowserAction["action"] { + const normalized = value?.trim().toLowerCase(); + if ( + normalized === "navigate" || + normalized === "click" || + normalized === "type" || + normalized === "select" || + normalized === "wait" || + normalized === "extract" || + normalized === "screenshot" + ) { + return normalized; + } + return "unknown"; +} + +function browserActionTraceStatus( + value: string | undefined +): PopulateRuntimeTraceStep["status"] { + const normalized = value?.trim().toLowerCase(); + if (normalized === "failed" || normalized === "error") { + return "failed"; + } + if (normalized === "skipped") { + return "skipped"; + } + return "succeeded"; +} + function sourceOutcomeTraceKind(outcome: CollectionSourceOutcome): PopulateRuntimeTraceStep["kind"] { if (outcome.outcome?.startsWith("agent_")) { return "agent"; diff --git a/backend/test/collection-agent-runner.test.ts b/backend/test/collection-agent-runner.test.ts index 4907f91..2b32b9b 100644 --- a/backend/test/collection-agent-runner.test.ts +++ b/backend/test/collection-agent-runner.test.ts @@ -2,6 +2,7 @@ import assert from "node:assert/strict"; import { test } from "node:test"; import { runCollectionPopulatePipeline } from "../src/pipeline/collection-agent-runner.js"; +import { playwrightCandidateReadinessForRun } from "../src/pipeline/populate-playwright-readiness.js"; test("collection agent runner maps vendored pipeline output into populate runtime result", async () => { const previousEnv = snapshotEnv([ @@ -53,6 +54,77 @@ test("collection agent runner maps vendored pipeline output into populate runtim ), true ); + assert.equal( + result.debug?.processTrace.steps.some((step) => step.kind === "browser"), + false + ); 
+ } finally { + restoreEnv(previousEnv); + } +}); + +test("collection agent runner maps explicit browser action reports into process trace", async () => { + const previousEnv = snapshotEnv([ + "AGENT_POLL_TIMEOUT_MS", + "COLLECTION_AGENT_ENABLE_AGENT", + "COLLECTION_AGENT_PIPELINE_MODULE", + "COLLECTION_AGENT_POLL_TIMEOUT_MS", + ]); + delete process.env.AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_ENABLE_AGENT = "true"; + delete process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_PIPELINE_MODULE = fakeCollectionPipelineModuleUrl({ + expectedCalls: [{ agentEnabled: true, pollTimeoutMs: 480_000 }], + browserActions: [ + { + action: "hover", + url: "https://openai.com/news", + status: "succeeded", + phase: "initial-browser", + label: "browser-open-news", + }, + ], + agentBrowserActions: [ + { + action: "click", + url: "https://openai.com/news", + selector: "a[href*='/news/']", + target_text: "Release notes", + value_description: "not captured", + status: "succeeded", + }, + ], + }); + try { + const result = await runCollectionPopulatePipeline(collectionPipelineInput()); + const browserSteps = result.debug?.processTrace.steps.filter( + (step) => step.kind === "browser" + ) ?? 
[]; + + assert.equal(browserSteps.length, 2); + assert.equal(browserSteps[0]?.browserAction?.action, "unknown"); + assert.equal(browserSteps[0]?.label, "browser-open-news"); + assert.deepEqual(browserSteps[0]?.input, { + url: "https://openai.com/news", + selector: undefined, + targetText: undefined, + phase: "initial-browser", + }); + assert.equal(browserSteps[0]?.error, undefined); + assert.equal(browserSteps[1]?.browserAction?.action, "click"); + assert.equal(browserSteps[1]?.browserAction?.selector, "a[href*='/news/']"); + assert.equal(browserSteps[1]?.browserAction?.targetText, "Release notes"); + assert.equal(browserSteps[1]?.browserAction?.valueDescription, "not captured"); + assert.equal(browserSteps[1]?.status, "succeeded"); + assert.deepEqual( + playwrightCandidateReadinessForRun({ result }), + { + status: "ready", + reasons: [], + browserStepCount: 2, + sourceUrlCount: 2, + } + ); } finally { restoreEnv(previousEnv); } @@ -182,6 +254,8 @@ function fakeCollectionPipelineModuleUrl(input: { pollTimeoutMs?: number; }>; sources?: unknown; + browserActions?: unknown; + agentBrowserActions?: unknown; }): string { const source = ` const moduleLoadPollTimeoutMs = process.env.AGENT_POLL_TIMEOUT_MS ?? null; @@ -275,6 +349,8 @@ function fakeCollectionPipelineModuleUrl(input: { "OpenAI latest AI blog posts", "OpenAI release notes", ], + browser_actions: ${JSON.stringify(input.browserActions ?? [])}, + agent_browser_actions: ${JSON.stringify(input.agentBrowserActions ?? [])}, fetched_urls: [ "https://openai.com/news", "https://openai.com/research", diff --git a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md index ce88a3d..418dc9d 100644 --- a/benchmarks/dataset-agent/README.md +++ b/benchmarks/dataset-agent/README.md @@ -88,6 +88,26 @@ and fetch URLs alone are not enough. 
The readiness gate expects real browser actions such as URL transitions, selectors, target text, or redacted input descriptions before any `playwright-candidate-script` can be emitted. +Collection runners can feed those actions through explicit report fields such +as `browser_actions` or `agent_browser_actions`. BigSet maps only those explicit +actions into `browser` trace steps; it does not infer selectors or clicks from +URLs, source outcomes, or prose diagnostics. A report with no `url`, `selector`, or target text is dropped. + +Mapping is mechanical: + +- `target_text` / `targetText` -> `browserAction.targetText` (the camelCase field wins when both are set) +- `value_description` / `valueDescription` -> `browserAction.valueDescription` (the camelCase field wins when both are set) +- `status` -> `step.status` (`failed` or `error` -> `failed`, `skipped` -> `skipped`, anything else, including a missing status -> `succeeded`) +- `error` -> `step.error` +- `phase` -> `step.input.phase` (falls back to the report scope's default phase when absent) +- unknown action strings -> `browserAction.action = "unknown"` + +When both action arrays are present in the same report scope, BigSet preserves +array order by appending `browser_actions` first and `agent_browser_actions` +second. This is an ingestion contract for a future Meteor/Mengzhe producer or +Agent canary; it does not mean the current vendored pipeline already emits +browser actions. 
+ ## Verify Self-Healing Stack Use this before asking someone else to migrate a new collection agent into the diff --git a/docs/data-collection-agent-migration-plan.md b/docs/data-collection-agent-migration-plan.md index 8175714..8430973 100644 --- a/docs/data-collection-agent-migration-plan.md +++ b/docs/data-collection-agent-migration-plan.md @@ -97,6 +97,13 @@ The current layer now can: - represent browser actions in the trace contract when a future Agent/canary records URL transitions, selectors, target text, or redacted input descriptions +- ingest explicit collection runner `browser_actions` / + `agent_browser_actions` report fields into `browser` trace steps without + inferring missing clicks, selectors, or form inputs from source URLs +- map browser action reports mechanically: `target_text` to `targetText`, + `value_description` to `valueDescription`, `status` to the trace-step status, + `error` to the trace-step error, `phase` to `step.input.phase`, and unknown + action names to `browserAction.action = "unknown"` - emit a capability diagnostic when no-Agent mode sees pages that need browser, form, or detail-page follow-up @@ -111,6 +118,8 @@ The current layer does not yet: - compile search/fetch-only traces into Playwright; traces must include actionable browser steps before the script compiler is allowed to emit a candidate +- infer browser selectors, clicks, or form values from source outcomes; the + collection runner or Agent canary must emit those as explicit action fields - run a green live Convex canary in this local environment - prove Agent-enabled collection quality on a full real benchmark - prove the collection runtime should replace Mastra as the default app runtime @@ -177,6 +186,8 @@ The current layer does not yet: - browser-step trace canary that records URL transitions, selectors/targets, and redacted form-input descriptions before any Playwright compiler is enabled + - confirm the canary emits explicit `agent_browser_actions` or 
equivalent + fields in the collection report; source outcomes alone are not enough - full benchmark only after the 2-prompt run is not obviously broken - live `--dataset-id` dry-run only after Convex/env prerequisites are ready - `--commit` only on a throwaway dataset first @@ -233,6 +244,12 @@ collection runner ignores `recipeInstructions`, repaired recipes cannot change future behavior. If it ignores `requiredColumns` or benchmark metadata, the benchmark can stop measuring the same task. +For the Playwright handoff, Meteor can optionally emit `browser_actions` and +`agent_browser_actions` in the collection report. BigSet preserves each array's +order and appends `browser_actions` before `agent_browser_actions` when both are +present in the same report scope. This is a wrapper ingestion contract only; the +current vendored pipeline is not claimed to emit those fields yet. + The real benchmark command after a runner module exists is: ```bash