Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions backend/src/pipeline/collection-agent-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import type {
import {
populateProcessTraceFromSteps,
type PopulateCellValue,
type PopulateRuntimeBrowserAction,
type PopulateRuntimeResult,
type PopulateRuntimeTraceStep,
} from "./populate-runtime.js";
Expand Down Expand Up @@ -51,6 +52,8 @@ interface CollectionPipelineResult {
search_queries?: string[];
fetched_urls?: string[];
failed_urls?: string[];
browser_actions?: CollectionBrowserActionReport[];
agent_browser_actions?: CollectionBrowserActionReport[];
};
repair?: {
stats?: CollectionPhaseStats;
Expand All @@ -59,6 +62,8 @@ interface CollectionPipelineResult {
search_queries?: string[];
fetched_urls?: string[];
failed_urls?: string[];
browser_actions?: CollectionBrowserActionReport[];
agent_browser_actions?: CollectionBrowserActionReport[];
quality?: {
records?: CollectionRecordQuality[];
};
Expand Down Expand Up @@ -124,9 +129,25 @@ interface CollectionSourceOutcome {
/** Report for a single repair loop of the collection pipeline. */
interface CollectionRepairLoopReport {
// Used to build the default trace phase `repair-loop-<loop_index>` for this
// loop's browser steps; "unknown" is substituted when it is absent.
loop_index?: number;
repair_queries?: string[];
// Explicit browser actions reported for this loop. When both arrays are
// present, `browser_actions` entries are traced before `agent_browser_actions`.
browser_actions?: CollectionBrowserActionReport[];
agent_browser_actions?: CollectionBrowserActionReport[];
stats?: CollectionPhaseStats;
}

/**
 * One browser action as reported by a collection runner. Every field is
 * optional; both snake_case and camelCase spellings are accepted for the
 * target text and value description.
 */
interface CollectionBrowserActionReport {
// Action name; normalized by browserActionKind (trimmed, lower-cased), with
// unrecognized names mapped to "unknown".
action?: string;
// A report must carry at least one of url, selector, or target text to be
// turned into a trace step; otherwise browserActionFromReport skips it.
url?: string;
selector?: string;
// Aliases for the same value; the camelCase spelling wins when both are set.
target_text?: string;
targetText?: string;
// Aliases for the same value; the camelCase spelling wins when both are set.
value_description?: string;
valueDescription?: string;
// Free-form status string; normalized by browserActionTraceStatus.
status?: string;
// Copied verbatim onto the trace step's `error`.
error?: string;
// Overrides the default phase of the report scope the action came from.
phase?: string;
// Overrides the generated `collection-browser-<action>` step label.
label?: string;
}

const AGENT_REQUIRED_TRIAGE_STATUSES = new Set([
"requires_navigation",
"requires_form_submission",
Expand Down Expand Up @@ -312,8 +333,25 @@ function collectionProcessTrace(input: {
},
});
}
steps.push(...browserTraceStepsFromReports({
reports: [
...(loop.browser_actions ?? []),
...(loop.agent_browser_actions ?? []),
],
defaultPhase: `repair-loop-${loop.loop_index ?? "unknown"}`,
}));
}

steps.push(...browserTraceStepsFromReports({
reports: [
...(report.browser_actions ?? []),
...(report.agent_browser_actions ?? []),
...(report.initial?.browser_actions ?? []),
...(report.initial?.agent_browser_actions ?? []),
],
defaultPhase: "initial",
}));

for (const outcome of report.sources?.outcomes ?? []) {
if (!outcome.url) {
continue;
Expand Down Expand Up @@ -358,6 +396,92 @@ function collectionDebugNotes(report: CollectionPipelineResult["report"]): strin
return notes;
}

/**
 * Converts a batch of browser action reports into `browser` trace steps.
 *
 * Reports are processed in array order. Any report for which
 * `browserTraceStepFromReport` yields no step is left out of the result.
 *
 * @param input.reports Browser action reports to convert.
 * @param input.defaultPhase Phase recorded on a step when its report has none.
 * @returns The trace steps that could be built, in report order.
 */
function browserTraceStepsFromReports(input: {
  reports: CollectionBrowserActionReport[];
  defaultPhase: string;
}): PopulateRuntimeTraceStep[] {
  const { reports, defaultPhase } = input;
  const steps: PopulateRuntimeTraceStep[] = [];
  for (const report of reports) {
    const step = browserTraceStepFromReport({ report, defaultPhase });
    if (step) {
      steps.push(step);
    }
  }
  return steps;
}

/**
 * Builds one `browser` trace step from a collection browser action report.
 *
 * @param input.report The reported browser action.
 * @param input.defaultPhase Phase recorded in `step.input.phase` when the
 *   report does not carry its own `phase`.
 * @returns The trace step, or `undefined` when `browserActionFromReport`
 *   cannot build a browser action from the report.
 */
function browserTraceStepFromReport(input: {
  report: CollectionBrowserActionReport;
  defaultPhase: string;
}): PopulateRuntimeTraceStep | undefined {
  const { report, defaultPhase } = input;
  const browserAction = browserActionFromReport(report);
  if (!browserAction) {
    return undefined;
  }

  // A report that carries an error but no status used to be emitted as
  // "succeeded", yielding a self-contradictory step. Infer "failed" only when
  // the producer supplied no status at all; an explicit status always wins.
  const hasExplicitStatus = Boolean(report.status?.trim());
  const status: PopulateRuntimeTraceStep["status"] =
    !hasExplicitStatus && report.error
      ? "failed"
      : browserActionTraceStatus(report.status);

  return {
    kind: "browser",
    // Fall back to a generated label derived from the normalized action kind.
    label: report.label ?? `collection-browser-${browserAction.action}`,
    status,
    input: {
      url: browserAction.url,
      selector: browserAction.selector,
      targetText: browserAction.targetText,
      phase: report.phase ?? defaultPhase,
    },
    error: report.error,
    browserAction,
  };
}

/**
 * Extracts the browser action described by a report.
 *
 * The camelCase spellings (`targetText`, `valueDescription`) take precedence
 * over their snake_case aliases. A report that has no `url`, no `selector`,
 * and no target text is not considered a browser action.
 *
 * @param report The reported browser action.
 * @returns The normalized browser action, or `undefined` when the report has
 *   no url, selector, or target text.
 */
function browserActionFromReport(
  report: CollectionBrowserActionReport
): PopulateRuntimeBrowserAction | undefined {
  const { url, selector } = report;
  const targetText = report.targetText ?? report.target_text;

  const hasLocator =
    Boolean(url) || Boolean(selector) || Boolean(targetText);
  if (!hasLocator) {
    return undefined;
  }

  return {
    action: browserActionKind(report.action),
    url,
    selector,
    targetText,
    valueDescription: report.valueDescription ?? report.value_description,
  };
}

/**
 * Normalizes a reported action name to a known browser action kind.
 *
 * The name is trimmed and lower-cased before matching. Anything that is not
 * one of the recognized kinds, including `undefined`, becomes "unknown".
 *
 * @param value Raw action name from the report.
 * @returns The matching action kind, or "unknown".
 */
function browserActionKind(
  value: string | undefined
): PopulateRuntimeBrowserAction["action"] {
  const recognizedKinds = [
    "navigate",
    "click",
    "type",
    "select",
    "wait",
    "extract",
    "screenshot",
  ] as const;
  const candidate = value?.trim().toLowerCase();
  return recognizedKinds.find((kind) => kind === candidate) ?? "unknown";
}

/**
 * Maps a reported status string onto a trace-step status.
 *
 * The value is trimmed and lower-cased first. "failed" and "error" map to
 * "failed", "skipped" maps to "skipped", and every other value, including
 * `undefined`, maps to "succeeded".
 *
 * @param value Raw status string from the report.
 * @returns The trace-step status.
 */
function browserActionTraceStatus(
  value: string | undefined
): PopulateRuntimeTraceStep["status"] {
  switch (value?.trim().toLowerCase()) {
    case "failed":
    case "error":
      return "failed";
    case "skipped":
      return "skipped";
    default:
      return "succeeded";
  }
}

function sourceOutcomeTraceKind(outcome: CollectionSourceOutcome): PopulateRuntimeTraceStep["kind"] {
if (outcome.outcome?.startsWith("agent_")) {
return "agent";
Expand Down
76 changes: 76 additions & 0 deletions backend/test/collection-agent-runner.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import assert from "node:assert/strict";
import { test } from "node:test";

import { runCollectionPopulatePipeline } from "../src/pipeline/collection-agent-runner.js";
import { playwrightCandidateReadinessForRun } from "../src/pipeline/populate-playwright-readiness.js";

test("collection agent runner maps vendored pipeline output into populate runtime result", async () => {
const previousEnv = snapshotEnv([
Expand Down Expand Up @@ -53,6 +54,77 @@ test("collection agent runner maps vendored pipeline output into populate runtim
),
true
);
assert.equal(
result.debug?.processTrace.steps.some((step) => step.kind === "browser"),
false
);
} finally {
restoreEnv(previousEnv);
}
});

test("collection agent runner maps explicit browser action reports into process trace", async () => {
const previousEnv = snapshotEnv([
"AGENT_POLL_TIMEOUT_MS",
"COLLECTION_AGENT_ENABLE_AGENT",
"COLLECTION_AGENT_PIPELINE_MODULE",
"COLLECTION_AGENT_POLL_TIMEOUT_MS",
]);
delete process.env.AGENT_POLL_TIMEOUT_MS;
process.env.COLLECTION_AGENT_ENABLE_AGENT = "true";
delete process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS;
process.env.COLLECTION_AGENT_PIPELINE_MODULE = fakeCollectionPipelineModuleUrl({
expectedCalls: [{ agentEnabled: true, pollTimeoutMs: 480_000 }],
browserActions: [
{
action: "hover",
url: "https://openai.com/news",
status: "succeeded",
phase: "initial-browser",
label: "browser-open-news",
},
],
agentBrowserActions: [
{
action: "click",
url: "https://openai.com/news",
selector: "a[href*='/news/']",
target_text: "Release notes",
value_description: "not captured",
status: "succeeded",
},
],
});
try {
const result = await runCollectionPopulatePipeline(collectionPipelineInput());
const browserSteps = result.debug?.processTrace.steps.filter(
(step) => step.kind === "browser"
) ?? [];

assert.equal(browserSteps.length, 2);
assert.equal(browserSteps[0]?.browserAction?.action, "unknown");
assert.equal(browserSteps[0]?.label, "browser-open-news");
assert.deepEqual(browserSteps[0]?.input, {
url: "https://openai.com/news",
selector: undefined,
targetText: undefined,
phase: "initial-browser",
});
assert.equal(browserSteps[0]?.error, undefined);
assert.equal(browserSteps[1]?.browserAction?.action, "click");
assert.equal(browserSteps[1]?.browserAction?.selector, "a[href*='/news/']");
assert.equal(browserSteps[1]?.browserAction?.targetText, "Release notes");
assert.equal(browserSteps[1]?.browserAction?.valueDescription, "not captured");
assert.equal(browserSteps[1]?.status, "succeeded");
assert.deepEqual(
playwrightCandidateReadinessForRun({ result }),
{
status: "ready",
reasons: [],
browserStepCount: 2,
sourceUrlCount: 2,
}
);
} finally {
restoreEnv(previousEnv);
}
Expand Down Expand Up @@ -182,6 +254,8 @@ function fakeCollectionPipelineModuleUrl(input: {
pollTimeoutMs?: number;
}>;
sources?: unknown;
browserActions?: unknown;
agentBrowserActions?: unknown;
}): string {
const source = `
const moduleLoadPollTimeoutMs = process.env.AGENT_POLL_TIMEOUT_MS ?? null;
Expand Down Expand Up @@ -275,6 +349,8 @@ function fakeCollectionPipelineModuleUrl(input: {
"OpenAI latest AI blog posts",
"OpenAI release notes",
],
browser_actions: ${JSON.stringify(input.browserActions ?? [])},
agent_browser_actions: ${JSON.stringify(input.agentBrowserActions ?? [])},
fetched_urls: [
"https://openai.com/news",
"https://openai.com/research",
Expand Down
20 changes: 20 additions & 0 deletions benchmarks/dataset-agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,26 @@ and fetch URLs alone are not enough. The readiness gate expects real browser
actions such as URL transitions, selectors, target text, or redacted input
descriptions before any `playwright-candidate-script` can be emitted.

Collection runners can feed those actions through explicit report fields such
as `browser_actions` or `agent_browser_actions`. BigSet maps only those explicit
actions into `browser` trace steps; it does not infer selectors or clicks from
URLs, source outcomes, or prose diagnostics.

Mapping is mechanical:

- `target_text` / `targetText` -> `browserAction.targetText`
- `value_description` / `valueDescription` -> `browserAction.valueDescription`
- `status` -> `step.status`
- `error` -> `step.error`
- `phase` -> `step.input.phase`
- unknown action strings -> `browserAction.action = "unknown"`
- reports with no `url`, `selector`, or target text are skipped and produce no
  trace step

When both action arrays are present in the same report scope, BigSet preserves
array order by appending `browser_actions` first and `agent_browser_actions`
second. This is an ingestion contract for a future Meteor/Mengzhe producer or
Agent canary; it does not mean the current vendored pipeline already emits
browser actions.

## Verify Self-Healing Stack

Use this before asking someone else to migrate a new collection agent into the
Expand Down
17 changes: 17 additions & 0 deletions docs/data-collection-agent-migration-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,13 @@ The current layer now can:
- represent browser actions in the trace contract when a future Agent/canary
records URL transitions, selectors, target text, or redacted input
descriptions
- ingest explicit collection runner `browser_actions` /
`agent_browser_actions` report fields into `browser` trace steps without
inferring missing clicks, selectors, or form inputs from source URLs
- map browser action reports mechanically: `target_text` to `targetText`,
`value_description` to `valueDescription`, `status` to the trace-step status,
`error` to the trace-step error, `phase` to `step.input.phase`, and unknown
action names to `browserAction.action = "unknown"`
- emit a capability diagnostic when no-Agent mode sees pages that need browser,
form, or detail-page follow-up

Expand All @@ -111,6 +118,8 @@ The current layer does not yet:
- compile search/fetch-only traces into Playwright; traces must include
actionable browser steps before the script compiler is allowed to emit a
candidate
- infer browser selectors, clicks, or form values from source outcomes; the
collection runner or Agent canary must emit those as explicit action fields
- run a green live Convex canary in this local environment
- prove Agent-enabled collection quality on a full real benchmark
- prove the collection runtime should replace Mastra as the default app runtime
Expand Down Expand Up @@ -177,6 +186,8 @@ The current layer does not yet:
- browser-step trace canary that records URL transitions, selectors/targets,
and redacted form-input descriptions before any Playwright compiler is
enabled
- confirm the canary emits explicit `agent_browser_actions` or equivalent
fields in the collection report; source outcomes alone are not enough
- full benchmark only after the 2-prompt run is not obviously broken
- live `--dataset-id` dry-run only after Convex/env prerequisites are ready
- `--commit` only on a throwaway dataset first
Expand Down Expand Up @@ -233,6 +244,12 @@ collection runner ignores `recipeInstructions`, repaired recipes cannot change
future behavior. If it ignores `requiredColumns` or benchmark metadata, the
benchmark can stop measuring the same task.

For the Playwright handoff, Meteor can optionally emit `browser_actions` and
`agent_browser_actions` in the collection report. BigSet preserves each array's
order and appends `browser_actions` before `agent_browser_actions` when both are
present in the same report scope. This is a wrapper ingestion contract only; the
current vendored pipeline is not claimed to emit those fields yet.

The real benchmark command after a runner module exists is:

```bash
Expand Down