diff --git a/.env.example b/.env.example index 8959888..5a5c796 100644 --- a/.env.example +++ b/.env.example @@ -1,14 +1,30 @@ -# These are read by docker-compose.dev.yml. +# This is the only local env file BigSet expects. # Copy this file to .env and fill in your values. +# Local service URLs +CLIENT_ORIGIN=http://localhost:3500 +CONVEX_URL=http://localhost:3210 +NEXT_PUBLIC_CONVEX_URL=http://127.0.0.1:3210 +CONVEX_SELF_HOSTED_URL=http://127.0.0.1:3210 +NEXT_PUBLIC_BACKEND_URL=http://localhost:3501 +PORT=3501 + # Clerk — create a free app at https://dashboard.clerk.com +# Enable the Clerk JWT Templates -> Convex template, then set your issuer URL. NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_test_... CLERK_SECRET_KEY=sk_test_... +CLERK_JWT_ISSUER_DOMAIN=https://your-app.clerk.accounts.dev # OpenRouter — required by backend + Mastra for AI model calls. # Generate at https://openrouter.ai/settings/keys OPENROUTER_API_KEY=sk-or-... +# Optional model overrides. +# Schema inference defaults to anthropic/claude-sonnet-4-6. +# Populate and other non-inference tasks default to google/gemini-3.1-flash-lite. +# OPENROUTER_MODEL=google/gemini-3.1-flash-lite +# OPENROUTER_POPULATE_MODEL=google/gemini-3.1-flash-lite + # TinyFish — required by populate agent web search/fetch. # Generate at https://agent.tinyfish.ai/api-keys TINYFISH_API_KEY= @@ -22,6 +38,31 @@ CONVEX_SELF_HOSTED_ADMIN_KEY= # Docker dev overrides this to /app/.bigset/populate-recipes on a named volume. POPULATE_RECIPE_STORE_DIR=.bigset/populate-recipes +# Populate runtime limits. +# POPULATE_MAX_ROWS=100 +# POPULATE_MAX_SEARCH_CALLS=25 +# POPULATE_MAX_FETCH_CALLS=50 +# POPULATE_COMMIT_ROW_LIMIT_PER_HOUR=1000 + +# Browser-action self-healing. Non-secret tunables. 
+POPULATE_ENABLE_BROWSER_ACTION_BOX=true +POPULATE_BROWSER_ACTION_BOX_POLL_INTERVAL_MS=3000 +POPULATE_ENABLE_PLAYWRIGHT_REPLAY=true +POPULATE_ENABLE_PLAYWRIGHT_REPAIR=true +POPULATE_PLAYWRIGHT_HEADLESS=true +# POPULATE_PLAYWRIGHT_EXECUTABLE_PATH= + +# Collection-agent canaries. Leave Mastra as the default app runtime unless +# intentionally benchmarking the collection runner. +# POPULATE_AGENT_RUNTIME=collection +# POPULATE_COLLECTION_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts +COLLECTION_AGENT_ENABLE_TRIAGE=true +COLLECTION_AGENT_ENABLE_AGENT=false +COLLECTION_AGENT_POLL_TIMEOUT_MS=1200000 +AGENT_POLL_TIMEOUT_MS=1200000 +AGENT_REQUEST_TIMEOUT_MS=15000 + # PostHog (optional — leave blank to disable analytics entirely in local dev). # Get from https://us.posthog.com/project/settings/general. NEXT_PUBLIC_POSTHOG_KEY= diff --git a/CLAUDE.md b/CLAUDE.md index 813fbf7..52f9a05 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -8,14 +8,14 @@ Frontend on :3500, backend on :3501, Mastra Studio on :4111, Convex dashboard on 1. Create a free Clerk account at https://clerk.com and create an application. 2. In the Clerk dashboard, go to **JWT Templates** and enable the **Convex** template. -3. Copy `frontend/.env.example` to `frontend/.env.local` and fill in your Clerk keys: +3. Copy `.env.example` to `.env` and fill in your Clerk keys: - `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY` — from Clerk API Keys - `CLERK_SECRET_KEY` — from Clerk API Keys - `CLERK_JWT_ISSUER_DOMAIN` — your Frontend API URL (e.g. `https://your-app.clerk.accounts.dev`) -4. Add an OpenRouter API key to the root `.env` file: `OPENROUTER_API_KEY=sk-or-...` (get one at https://openrouter.ai/settings/keys). Docker Compose reads the root `.env` and passes it to the backend and Mastra containers. -4b. 
Add a TinyFish API key to the root `.env` file: `TINYFISH_API_KEY=...` (get one at https://agent.tinyfish.ai/api-keys). This enables the populate agent to search the web and fetch page content. +4. Add an OpenRouter API key to `.env`: `OPENROUTER_API_KEY=sk-or-...` (get one at https://openrouter.ai/settings/keys). Docker Compose reads root `.env` and passes it to frontend, backend, and Mastra containers. +4b. Add a TinyFish API key to `.env`: `TINYFISH_API_KEY=...` (get one at https://agent.tinyfish.ai/api-keys). This enables the populate agent to search the web and fetch page content. 5. Run `make dev` — this starts all Docker services AND pushes Convex functions automatically. -6. Generate a Convex admin key (first run only): `docker compose exec convex ./generate_admin_key.sh` and add it as `CONVEX_SELF_HOSTED_ADMIN_KEY` in `frontend/.env.local`, then re-run `make dev`. +6. Generate a Convex admin key (first run only): `docker compose exec convex ./generate_admin_key.sh` and add it as `CONVEX_SELF_HOSTED_ADMIN_KEY` in `.env`, then re-run `make dev`. ## Architecture @@ -35,13 +35,13 @@ Convex functions use `ctx.auth.getUserIdentity()` to get the authenticated user. ## Environment Variables -Docker Compose interpolates variables from the root `.env` file. Key variables: +Root `.env` is the only local env file. Docker Compose, package scripts, Convex helper targets, and benchmarks load it. Key variables: - `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY`, `CLERK_SECRET_KEY` — shared by frontend and backend - `OPENROUTER_API_KEY` — used by backend and Mastra for AI model calls - `CONVEX_SELF_HOSTED_ADMIN_KEY` — used by backend for system-level Convex writes - `TINYFISH_API_KEY` — used by the populate agent for web search and fetch (get one at https://agent.tinyfish.ai/api-keys) -The backend container maps `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY` → `CLERK_PUBLISHABLE_KEY` (see `docker-compose.dev.yml`). 
+The backend accepts `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY` as the publishable Clerk key, and the Docker backend container also maps it to `CLERK_PUBLISHABLE_KEY` (see `docker-compose.dev.yml`). ## Convex Deploys diff --git a/README.md b/README.md index ddee776..b5dacbe 100644 --- a/README.md +++ b/README.md @@ -44,16 +44,12 @@ cd bigset Create a Clerk application at [dashboard.clerk.com](https://dashboard.clerk.com), then go to **JWT Templates** and enable the **Convex** template. -### 2. Configure env files +### 2. Configure env ```bash -# Root .env — used by Docker for the frontend container cp .env.example .env -# Fill in NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY and CLERK_SECRET_KEY - -# Frontend .env.local — used by Next.js and Convex CLI -cp frontend/.env.example frontend/.env.local -# Fill in all three Clerk keys (publishable, secret, and JWT issuer domain) +# Fill in NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY, CLERK_SECRET_KEY, +# CLERK_JWT_ISSUER_DOMAIN, OPENROUTER_API_KEY, and TINYFISH_API_KEY ``` > **Required for the create-dataset wizard:** set `OPENROUTER_API_KEY` (used by the schema-inference pipeline). Get one at [openrouter.ai](https://openrouter.ai). Without it the wizard's "Generate Schema" step will fail. @@ -66,7 +62,11 @@ cp frontend/.env.example frontend/.env.local make dev ``` -This starts all Docker services, waits for Convex to be healthy, and deploys Convex functions automatically. Once it's up: +This starts all Docker services, waits for Convex to be healthy, and deploys Convex functions automatically. +`make dev` checks that root `.env` contains real Clerk/OpenRouter/TinyFish +values before it starts Docker. If it reports a placeholder key, replace that +value first. 
+Once it is up: - App: http://localhost:3500 - Convex dashboard: http://localhost:6791 @@ -78,26 +78,31 @@ This starts all Docker services, waits for Convex to be healthy, and deploys Con docker compose exec convex ./generate_admin_key.sh ``` -Paste the output into `frontend/.env.local` as `CONVEX_SELF_HOSTED_ADMIN_KEY`, then re-run `make dev`. +Paste the output into `.env` as `CONVEX_SELF_HOSTED_ADMIN_KEY`, then re-run +`make dev`. + +If `make dev` stops at `CONVEX_SELF_HOSTED_ADMIN_KEY is missing`, that means +Docker/Convex is up far enough for you to run the command above. Generate the +key, paste it into root `.env`, and run `make dev` again. ### 5. Load curated public datasets The landing page and the dashboard's "Curated" section read from a set of 9 system-owned datasets. Load them with: ```bash -cd frontend -npx convex run publicSeed:seedPublicDatasets +make seed-public-datasets ``` The script is **idempotent** — rerunning it skips datasets that already exist (matched by a stable `seedKey`, so renaming a curated dataset never creates a duplicate). To add a 10th curated dataset, append it to `PUBLIC_DATASETS` in [frontend/convex/publicSeed.ts](frontend/convex/publicSeed.ts) with a fresh `seedKey` and rerun the command. To replace existing curated content in place, pass `force: true`: ```bash -npx convex run publicSeed:seedPublicDatasets '{"force":true}' +cd frontend +node ../scripts/with-root-env.mjs npx convex run publicSeed:seedPublicDatasets '{"force":true}' ``` Open [localhost:3500](http://localhost:3500) and click **Get started** to sign in. -> **Note:** Backend env needs no setup — `backend/.env.example` has correct defaults. If you edit Convex functions in `frontend/convex/`, run `make convex-push` to deploy the changes. +> **Note:** root `.env` is the only local env file. If you edit Convex functions in `frontend/convex/`, run `make convex-push` to deploy the changes. 
> **Free tier:** each signed-in account gets **2,500 row operations per calendar month** (resets on the 1st, UTC). The header shows a live usage badge; system-owned curated datasets bypass the quota. @@ -123,12 +128,11 @@ Open [localhost:3500](http://localhost:3500) and click **Get started** to sign i bigset/ ├── frontend/ Next.js 16 — UI + Convex schema & functions │ ├── convex/ Convex functions, schema, authz + quota helpers -│ └── .env.local Clerk + Convex keys (not committed) ├── backend/ Fastify + Mastra — schema inference + (future) agents │ ├── src/pipeline/ Pure schema-inference fn (called by Fastify + Mastra) │ └── src/mastra/ Mastra workflows (Studio at :4111 in dev) ├── scripts/ One-off scripts (e.g. verify-authz.sh) -├── .env Clerk keys for docker-compose (not committed) +├── .env Local env for frontend, backend, Convex CLI, benchmarks (not committed) ├── docker-compose.dev.yml └── Makefile ``` diff --git a/backend/.env.example b/backend/.env.example deleted file mode 100644 index a56d9df..0000000 --- a/backend/.env.example +++ /dev/null @@ -1,21 +0,0 @@ -CLIENT_ORIGIN=http://localhost:3500 -CONVEX_URL=http://localhost:3210 -PORT=3501 -POPULATE_RECIPE_STORE_DIR=.bigset/populate-recipes - -# Required once the backend starts writing rows via internal Convex mutations. -# Generate with: docker compose exec convex ./generate_admin_key.sh -CONVEX_SELF_HOSTED_ADMIN_KEY= - -# Required for user-facing protected routes (JWT verification). -# Same values as the frontend's Clerk keys. -CLERK_SECRET_KEY= -CLERK_PUBLISHABLE_KEY= - -# OpenRouter API key — required by schema inference. -# Generate at https://openrouter.ai/settings/keys -OPENROUTER_API_KEY=sk-or-... - -# TinyFish API key — used by the populate agent for web search and fetch. 
-# Generate at https://agent.tinyfish.ai/api-keys -TINYFISH_API_KEY= diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts b/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts index e84ad75..7df2219 100644 --- a/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts +++ b/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts @@ -5,6 +5,15 @@ import { } from "../memory/index.js"; import { agentGoalSchema, type AgentGoal } from "../models/schemas.js"; import type { DatasetSpec, SourceTriageResult } from "../models/schemas.js"; +import type { LlmMessage } from "../integrations/openrouter.js"; + +export const AGENT_BROWSER_ACTION_CONTRACT = `Browser action reporting contract: +- The Tinyfish Agent result JSON MUST include "agent_browser_actions" next to "records". +- "agent_browser_actions" is an ordered array of browser steps the agent actually performed. +- Each action should use this shape when known: { "action": "navigate|click|type|select|wait|extract|screenshot|unknown", "url": "current page URL", "selector": "CSS selector when known", "target_text": "visible button/link/field text when known", "value_description": "safe description of typed/selected value, never secrets", "status": "succeeded|failed", "error": "failure reason if any", "phase": "initial|search|filter|pagination|detail|form|extract", "label": "short human label" }. +- Record navigation, clicks, form fills, pagination, waits that affected extraction, and final extraction. +- If a selector is unknown, still include url plus target_text when visible. If no browser action happened, return an empty array. +- Do not include raw passwords, tokens, cookies, or private user-entered values in value_description.`; const AGENT_GOAL_SYSTEM = `You are the Navigation Task Agent for a web data collection pipeline. 
@@ -14,8 +23,9 @@ The agent must navigate the site and return structured JSON with extracted data Rules: - Be specific about what to click, search, filter, or paginate. -- State the exact JSON shape to return: { "records": [ { column_name: value, ... } ] } +- State the exact JSON shape to return: { "records": [ { column_name: value, ... } ], "agent_browser_actions": [ ... ] } - Include column names from the schema in the goal. +- Include the browser action reporting contract verbatim enough that the Tinyfish Agent knows it must report replay-oriented actions. - For forms: describe fields to fill and how to submit. - For detail follow-up: explain how to open each item and which fields to collect. - Limit scope (e.g. first 25 rows) to keep runs reliable. @@ -31,34 +41,45 @@ export async function generateAgentGoal(options: { focusFields?: string[]; memory?: WorkflowMemory; }): Promise { - const columnList = options.spec.columns - .map((c) => `${c.name} (${c.type}${c.required ? ", required" : ""})`) - .join(", "); - return completeJson({ label: `agent_goal:${options.triage.final_url}`, schema: agentGoalSchema, - messages: [ - { role: "system", content: AGENT_GOAL_SYSTEM }, - { - role: "user", - content: JSON.stringify({ - user_prompt: options.userPrompt, - triage_status: options.triage.status, - triage_reasoning: options.triage.reasoning, - suggested_action: options.triage.suggested_action, - page_url: options.triage.final_url, - page_title: options.triage.title, - row_grain: options.spec.row_grain, - columns: columnList, - focus_fields: options.focusFields ?? [], - extraction_hints: options.spec.extraction_hints, - workflow_memory: options.memory - ? 
memoryContextForAgents(options.memory) - : undefined, - output_shape: { goal: "string", rationale: "string" }, - }), - }, - ], + messages: buildAgentGoalMessages(options), }); } + +export function buildAgentGoalMessages(options: { + userPrompt: string; + spec: DatasetSpec; + triage: SourceTriageResult; + focusFields?: string[]; + memory?: WorkflowMemory; +}): LlmMessage[] { + const columnList = options.spec.columns + .map((c) => `${c.name} (${c.type}${c.required ? ", required" : ""})`) + .join(", "); + + return [ + { role: "system", content: AGENT_GOAL_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + user_prompt: options.userPrompt, + triage_status: options.triage.status, + triage_reasoning: options.triage.reasoning, + suggested_action: options.triage.suggested_action, + page_url: options.triage.final_url, + page_title: options.triage.title, + row_grain: options.spec.row_grain, + columns: columnList, + focus_fields: options.focusFields ?? [], + extraction_hints: options.spec.extraction_hints, + browser_action_reporting_contract: AGENT_BROWSER_ACTION_CONTRACT, + workflow_memory: options.memory + ? 
memoryContextForAgents(options.memory) + : undefined, + output_shape: { goal: "string", rationale: "string" }, + }), + }, + ]; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts index 1ea3b54..31fa044 100644 --- a/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts +++ b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts @@ -5,7 +5,7 @@ import type { SourceTriageResult, } from "../models/schemas.js"; import { scoreDocsUrlForOfficialSource } from "../records/source-urls.js"; -import { getDomain } from "../utils/url.js"; +import { getDomain, normalizeUrl } from "../utils/url.js"; export interface PromptSourceEntity { name: string; @@ -17,6 +17,7 @@ export interface PromptSourcePolicy { requiresOfficialSource: boolean; entities: PromptSourceEntity[]; searchPhrases: string[]; + explicitSourceUrls: string[]; hint?: string; } @@ -55,6 +56,14 @@ function uniqueStrings(values: string[]): string[] { return [...new Set(values.map((value) => value.trim()).filter(Boolean))]; } +function extractPromptSourceUrls(prompt: string): string[] { + return uniqueStrings( + [...prompt.matchAll(/https?:\/\/[^\s)"'<>]+/gi)].map((match) => + normalizeUrl((match[0] ?? 
"").replace(/[.,;:!?]+$/g, "")), + ), + ); +} + function tokenize(value: string): string[] { return value .toLowerCase() @@ -157,6 +166,7 @@ export function derivePromptSourcePolicy(prompt: string): PromptSourcePolicy { const taskText = taskTextFromPrompt(prompt); const entities = extractExplicitEntities(taskText); const searchPhrases = searchPhrasesForPrompt(taskText); + const explicitSourceUrls = extractPromptSourceUrls(taskText); const lower = taskText.toLowerCase(); const asksForCanonicalSource = searchPhrases.length > 0 || @@ -182,7 +192,7 @@ export function derivePromptSourcePolicy(prompt: string): PromptSourcePolicy { ].join("\n") : undefined; - return { requiresOfficialSource, entities, searchPhrases, hint }; + return { requiresOfficialSource, entities, searchPhrases, explicitSourceUrls, hint }; } export function promptSourceSearchQueries(policy: PromptSourcePolicy): string[] { @@ -236,6 +246,7 @@ export function urlMatchesPromptSourcePolicy( url: string, policy: PromptSourcePolicy, ): boolean { + if (urlMatchesExplicitPromptSource(url, policy)) return true; if (!policy.requiresOfficialSource) return true; const domain = getDomain(url).toLowerCase(); if (GENERIC_HOSTED_DOMAIN.test(domain)) { @@ -246,6 +257,17 @@ export function urlMatchesPromptSourcePolicy( ); } +function urlMatchesExplicitPromptSource( + url: string, + policy: PromptSourcePolicy, +): boolean { + const normalized = normalizeUrl(url); + return policy.explicitSourceUrls.some((sourceUrl) => { + const explicit = normalizeUrl(sourceUrl); + return normalized === explicit || normalized.startsWith(`${explicit}/`); + }); +} + function urlMatchesEntitySourcePolicy( url: string, entity: PromptSourceEntity, @@ -361,6 +383,9 @@ export function recordMatchesPromptSourcePolicy( if (urls.length === 0) { return false; } + if (urls.some((url) => urlMatchesExplicitPromptSource(url, policy))) { + return true; + } return urls.some((url) => urlMatchesEntitySourcePolicy(url, entity, policy)); } diff --git 
a/backend/BigSet_Data_Collection_Agent/src/config.ts b/backend/BigSet_Data_Collection_Agent/src/config.ts index 875747c..3ca55c7 100644 --- a/backend/BigSet_Data_Collection_Agent/src/config.ts +++ b/backend/BigSet_Data_Collection_Agent/src/config.ts @@ -1,3 +1,8 @@ +import { config as loadDotenv } from "dotenv"; +import { fileURLToPath } from "node:url"; + +loadDotenv({ path: fileURLToPath(new URL("../../../.env", import.meta.url)) }); + function readBool(name: string, fallback: boolean): boolean { const raw = process.env[name]; if (raw === undefined || raw === "") return fallback; @@ -86,6 +91,8 @@ export const config = { agentPollConcurrency: readInt("AGENT_POLL_CONCURRENCY", 10), agentPollIntervalMs: readInt("AGENT_POLL_INTERVAL_MS", 3000), agentPollTimeoutMs: readInt("AGENT_POLL_TIMEOUT_MS", 1_200_000), + /** Per HTTP request cap for TinyFish Agent queue/poll/cancel calls. */ + agentRequestTimeoutMs: readInt("AGENT_REQUEST_TIMEOUT_MS", 15_000), triageConcurrency: readInt("TRIAGE_CONCURRENCY", 5), enableQualityScoring: readBool("ENABLE_QUALITY_SCORING", true), /** results.csv only includes rows with all required fields, ranked by quality. */ @@ -108,7 +115,7 @@ export function assertConfig(): void { if (!config.openRouterApiKey) missing.push("OPENROUTER_API_KEY"); if (missing.length > 0) { throw new Error( - `Missing required environment variables: ${missing.join(", ")}. Copy .env.example to .env and fill in values.`, + `Missing required environment variables: ${missing.join(", ")}. 
Copy root .env.example to .env and fill in values.`, ); } } diff --git a/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish-agent.ts b/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish-agent.ts index 4e337f3..b2b2cf9 100644 --- a/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish-agent.ts +++ b/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish-agent.ts @@ -1,6 +1,7 @@ import { RunStatus, TinyFish, type Run } from "@tiny-fish/sdk"; import { config } from "../config.js"; -import { sleep, withRetry } from "../queue/retry.js"; +import type { BrowserActionReport } from "../models/schemas.js"; +import { isRetryableError, sleep, withRetry } from "../queue/retry.js"; import { mapWithConcurrency } from "../utils/concurrency.js"; let client: TinyFish | null = null; @@ -25,6 +26,12 @@ export interface TinyfishAgentRunResult { status: string; result: Record | null; error: string | null; + agent_step_count: number | null; + has_streaming_url: boolean; + has_recording_url: boolean; + capture_artifact_count: number; + result_keys: string[]; + browser_actions: BrowserActionReport[]; } export interface QueueTinyfishAgentResult { @@ -39,46 +46,73 @@ export interface TinyfishAgentJob { export interface TinyfishAgentRunOptions { pollTimeoutMs?: number; + pollIntervalMs?: number; + requestTimeoutMs?: number; + readRun?: TinyfishAgentRunReader; + cancelRun?: TinyfishAgentRunCanceller; } -function runToResult(run: Run): TinyfishAgentRunResult { +type TinyfishAgentRunReader = ( + runId: string, + options: { signal: AbortSignal }, +) => Promise; + +type TinyfishAgentRunCanceller = ( + runId: string, + options: { signal: AbortSignal }, +) => Promise; + +type TinyfishRunWithTrace = Run & { + steps?: unknown; + recording_url?: unknown; + recordingUrl?: unknown; + captures?: unknown; + capture_artifacts?: unknown; + captureArtifacts?: unknown; + artifacts?: unknown; +}; + +export function tinyfishAgentRunResultFromRun(run: Run): 
TinyfishAgentRunResult { const errorMessage = run.error?.message ?? (run.status === RunStatus.FAILED ? "Agent run failed" : null); + const result = (run.result as Record | null) ?? null; + const runWithTrace = run as TinyfishRunWithTrace; return { run_id: run.run_id, status: run.status, - result: (run.result as Record | null) ?? null, + result, error: errorMessage, + agent_step_count: typeof run.num_of_steps === "number" + ? run.num_of_steps + : null, + has_streaming_url: typeof run.streaming_url === "string" && + run.streaming_url.length > 0, + has_recording_url: hasNonEmptyString( + runWithTrace.recording_url ?? runWithTrace.recordingUrl + ), + capture_artifact_count: countCaptureArtifacts(runWithTrace), + result_keys: result ? Object.keys(result).sort() : [], + browser_actions: browserActionsFromRunSteps(runWithTrace), }; } /** Best-effort cancel for async agent runs (POST /v1/runs/{id}/cancel). */ -export async function cancelTinyfishAgentRun(runId: string): Promise { +export async function cancelTinyfishAgentRun( + runId: string, + options: { requestTimeoutMs?: number } = {}, +): Promise { if (!runId.trim()) return; try { await withRetry( - async () => { - const response = await fetch( - `${TINYFISH_API_BASE}/v1/runs/${encodeURIComponent(runId)}/cancel`, - { - method: "POST", - headers: { - "X-API-Key": config.tinyfishApiKey, - "Content-Type": "application/json", - }, - }, - ); - - if (!response.ok) { - const body = await response.text(); - throw new Error( - `Cancel failed (${response.status})${body ? `: ${body.slice(0, 200)}` : ""}`, - ); - } - }, + () => + withRequestTimeout({ + timeoutMs: options.requestTimeoutMs ?? 
config.agentRequestTimeoutMs, + label: `TinyFish Agent cancel ${runId}`, + action: (signal) => sendTinyfishAgentCancel(runId, { signal }), + }), { maxRetries: 1, baseDelayMs: config.retryBaseDelayMs, @@ -94,9 +128,15 @@ export async function cancelTinyfishAgentRun(runId: string): Promise { export async function queueTinyfishAgent( url: string, goal: string, + options: TinyfishAgentRunOptions = {}, ): Promise { const response = await withRetry( - () => getClient().agent.queue({ url, goal }), + () => + withRequestTimeout({ + timeoutMs: options.requestTimeoutMs ?? config.agentRequestTimeoutMs, + label: `TinyFish Agent queue ${url}`, + action: (signal) => getClient().agent.queue({ url, goal }, { signal }), + }), { maxRetries: config.maxRetries, baseDelayMs: config.retryBaseDelayMs, @@ -122,54 +162,62 @@ export async function pollTinyfishAgentUntilDone( ): Promise { const startedAt = Date.now(); const pollTimeoutMs = options.pollTimeoutMs ?? config.agentPollTimeoutMs; + const pollIntervalMs = options.pollIntervalMs ?? config.agentPollIntervalMs; + const requestTimeoutMs = options.requestTimeoutMs ?? config.agentRequestTimeoutMs; + const readRun = options.readRun ?? fetchTinyfishAgentRun; + const cancelRun = options.cancelRun ?? 
sendTinyfishAgentCancel; let lastStatus = RunStatus.PENDING; + let lastPollError: string | null = null; while (true) { - const run = await withRetry( - () => getClient().runs.get(runId), - { - maxRetries: config.maxRetries, - baseDelayMs: config.retryBaseDelayMs, - label: `agent.poll:${runId}`, - }, - ); - - lastStatus = run.status; + const remainingPollMs = pollTimeoutMs - (Date.now() - startedAt); + if (remainingPollMs <= 0) { + return timeoutAgentRunResult({ + runId, + pollTimeoutMs, + requestTimeoutMs, + lastStatus, + lastPollError, + readRun, + cancelRun, + }); + } - if (TERMINAL_STATUSES.has(run.status)) { - return runToResult(run); + let run: Run | null = null; + try { + run = await withRequestTimeout({ + timeoutMs: Math.min(requestTimeoutMs, remainingPollMs), + label: `TinyFish Agent poll ${runId}`, + action: (signal) => readRun(runId, { signal }), + }); + lastPollError = null; + } catch (error) { + lastPollError = error instanceof Error ? error.message : String(error); + if (!isRetryableError(error)) { + throw error; + } } - if (Date.now() - startedAt >= pollTimeoutMs) { - await cancelTinyfishAgentRun(runId); - - try { - const finalRun = await getClient().runs.get(runId); - if (TERMINAL_STATUSES.has(finalRun.status)) { - const result = runToResult(finalRun); - if (finalRun.status === RunStatus.CANCELLED) { - return { - ...result, - error: - result.error ?? - `Agent run cancelled after ${pollTimeoutMs}ms (was ${lastStatus})`, - }; - } - return result; - } - } catch { - // Fall through to TIMEOUT result below. 
+ if (run) { + lastStatus = run.status; + if (TERMINAL_STATUSES.has(run.status)) { + return tinyfishAgentRunResultFromRun(run); } + } - return { - run_id: runId, - status: "TIMEOUT", - result: null, - error: `Agent run timed out after ${pollTimeoutMs}ms (last status: ${lastStatus}); cancel requested`, - }; + if (Date.now() - startedAt >= pollTimeoutMs) { + return timeoutAgentRunResult({ + runId, + pollTimeoutMs, + requestTimeoutMs, + lastStatus, + lastPollError, + readRun, + cancelRun, + }); } - await sleep(config.agentPollIntervalMs); + await sleep(Math.min(pollIntervalMs, pollTimeoutMs - (Date.now() - startedAt))); } } @@ -181,13 +229,19 @@ export async function runTinyfishAgent( goal: string, options: TinyfishAgentRunOptions = {}, ): Promise { - const queued = await queueTinyfishAgent(url, goal); + const queued = await queueTinyfishAgent(url, goal, options); if (queued.error || !queued.run_id) { return { run_id: null, status: RunStatus.FAILED, result: null, error: queued.error ?? "Failed to queue agent run", + agent_step_count: null, + has_streaming_url: false, + has_recording_url: false, + capture_artifact_count: 0, + result_keys: [], + browser_actions: [], }; } return pollTinyfishAgentUntilDone(queued.run_id, options); @@ -206,7 +260,7 @@ export async function runTinyfishAgentsBatch( jobs, config.agentQueueConcurrency, async (job) => { - const queueResult = await queueTinyfishAgent(job.url, job.goal); + const queueResult = await queueTinyfishAgent(job.url, job.goal, options); return { job, ...queueResult }; }, ); @@ -222,6 +276,12 @@ export async function runTinyfishAgentsBatch( status: RunStatus.FAILED, result: null, error: item.error ?? 
"Failed to queue agent run", + agent_step_count: null, + has_streaming_url: false, + has_recording_url: false, + capture_artifact_count: 0, + result_keys: [], + browser_actions: [], }; continue; } @@ -238,3 +298,429 @@ export async function runTinyfishAgentsBatch( return results; } + +async function timeoutAgentRunResult(input: { + runId: string; + pollTimeoutMs: number; + requestTimeoutMs: number; + lastStatus: string; + lastPollError: string | null; + readRun: TinyfishAgentRunReader; + cancelRun: TinyfishAgentRunCanceller; +}): Promise { + await withRequestTimeout({ + timeoutMs: input.requestTimeoutMs, + label: `TinyFish Agent cancel ${input.runId}`, + action: (signal) => input.cancelRun(input.runId, { signal }), + }).catch(() => undefined); + + try { + const finalRun = await withRequestTimeout({ + timeoutMs: input.requestTimeoutMs, + label: `TinyFish Agent final poll ${input.runId}`, + action: (signal) => input.readRun(input.runId, { signal }), + }); + if (TERMINAL_STATUSES.has(finalRun.status)) { + const result = tinyfishAgentRunResultFromRun(finalRun); + if (finalRun.status === RunStatus.CANCELLED) { + return { + ...result, + error: + result.error ?? + `Agent run cancelled after ${input.pollTimeoutMs}ms (was ${input.lastStatus})`, + }; + } + return result; + } + } catch { + // Fall through to TIMEOUT result below. + } + + const lastPollSuffix = input.lastPollError + ? 
`; last poll error: ${input.lastPollError}` + : ""; + return { + run_id: input.runId, + status: "TIMEOUT", + result: null, + error: + `Agent run timed out after ${input.pollTimeoutMs}ms (last status: ${input.lastStatus}); cancel requested${lastPollSuffix}`, + agent_step_count: null, + has_streaming_url: false, + has_recording_url: false, + capture_artifact_count: 0, + result_keys: [], + browser_actions: [], + }; +} + +async function fetchTinyfishAgentRun( + runId: string, + options: { signal: AbortSignal }, +): Promise { + const response = await fetch( + `${TINYFISH_API_BASE}/v1/runs/${encodeURIComponent(runId)}`, + { + headers: { + "X-API-Key": config.tinyfishApiKey, + "Content-Type": "application/json", + }, + signal: options.signal, + }, + ); + + if (!response.ok) { + const body = await response.text(); + throw httpStatusError( + `TinyFish run poll returned HTTP ${response.status}${body ? `: ${body.slice(0, 200)}` : ""}`, + response.status, + ); + } + + return await response.json() as Run; +} + +async function sendTinyfishAgentCancel( + runId: string, + options: { signal: AbortSignal }, +): Promise { + const response = await fetch( + `${TINYFISH_API_BASE}/v1/runs/${encodeURIComponent(runId)}/cancel`, + { + method: "POST", + headers: { + "X-API-Key": config.tinyfishApiKey, + "Content-Type": "application/json", + }, + signal: options.signal, + }, + ); + + if (!response.ok) { + const body = await response.text(); + throw httpStatusError( + `Cancel failed (${response.status})${body ? 
`: ${body.slice(0, 200)}` : ""}`, + response.status, + ); + } +} + +async function withRequestTimeout(input: { + timeoutMs: number; + label: string; + action: (signal: AbortSignal) => Promise; +}): Promise { + const timeoutMs = Math.max(1, Math.floor(input.timeoutMs)); + const controller = new AbortController(); + + return await new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + const error = new Error(`${input.label} timed out after ${timeoutMs}ms`); + controller.abort(error); + reject(error); + }, timeoutMs); + + Promise.resolve() + .then(() => input.action(controller.signal)) + .then(resolve, reject) + .finally(() => clearTimeout(timeout)); + }); +} + +function httpStatusError(message: string, status: number): Error & { status: number } { + const error = new Error(message) as Error & { status: number }; + error.status = status; + return error; +} + +function browserActionsFromRunSteps(run: TinyfishRunWithTrace): BrowserActionReport[] { + const steps = Array.isArray(run.steps) ? 
run.steps : []; + const actions = steps + .map((step) => browserActionFromRunStep(step)) + .filter((action): action is BrowserActionReport => Boolean(action)); + return dedupeBrowserActions(actions); +} + +function browserActionFromRunStep(step: unknown): BrowserActionReport | undefined { + if (!isRecord(step)) { + return undefined; + } + + const action = normalizeBrowserAction( + firstStringAtPaths(step, [ + ["action"], + ["type"], + ["kind"], + ["operation"], + ["tool"], + ["name"], + ["event"], + ]) + ); + const url = firstStringAtPaths(step, [ + ["url"], + ["current_url"], + ["currentUrl"], + ["target_url"], + ["targetUrl"], + ["page_url"], + ["pageUrl"], + ["href"], + ["input", "url"], + ["args", "url"], + ["arguments", "url"], + ["target", "url"], + ["metadata", "url"], + ]); + const selector = firstStringAtPaths(step, [ + ["selector"], + ["locator"], + ["target", "selector"], + ["element", "selector"], + ["input", "selector"], + ["args", "selector"], + ["arguments", "selector"], + ]); + const targetText = targetTextFromStep(step, action); + const status = normalizeStepStatus( + firstStringAtPaths(step, [ + ["status"], + ["state"], + ["outcome"], + ["result", "status"], + ]) + ); + const error = errorMessageFromStep(step); + const phase = firstStringAtPaths(step, [["phase"], ["stage"]]) ?? "agent-step"; + const label = firstStringAtPaths(step, [ + ["label"], + ["description"], + ["summary"], + ["name"], + ["type"], + ]); + const valueDescription = valueDescriptionFromStep(step, action); + + const report: BrowserActionReport = { + action, + url, + selector, + target_text: targetText, + status: status ?? (error ? "failed" : undefined), + error, + phase, + label, + value_description: valueDescription, + }; + + return hasReplayAnchor(report) ? 
report : undefined; +} + +function normalizeBrowserAction(value: string | undefined): string | undefined { + if (!value) { + return undefined; + } + const lower = value.toLowerCase(); + if (/\b(click|tap|press|select)\b/.test(lower)) return "click"; + if (/\b(navigate|goto|go_to|open|visit)\b/.test(lower)) return "navigate"; + if (/\b(fill|type|input|enter_text|set_value)\b/.test(lower)) return "type"; + if (/\b(scroll)\b/.test(lower)) return "scroll"; + if (/\b(wait)\b/.test(lower)) return "wait"; + if (/\b(extract|scrape|read)\b/.test(lower)) return "extract"; + return value.slice(0, 80); +} + +function normalizeStepStatus(value: string | undefined): string | undefined { + if (!value) { + return undefined; + } + const lower = value.toLowerCase(); + if (/\b(success|succeeded|completed|complete|done|ok)\b/.test(lower)) { + return "succeeded"; + } + if (/\b(failed|failure|error)\b/.test(lower)) { + return "failed"; + } + if (/\b(cancelled|canceled)\b/.test(lower)) { + return "cancelled"; + } + return value.slice(0, 80); +} + +function targetTextFromStep( + step: Record, + action: string | undefined +): string | undefined { + const explicitTargetText = firstStringAtPaths(step, [ + ["target_text"], + ["targetText"], + ["target", "text"], + ["element", "text"], + ["input", "target_text"], + ["args", "target_text"], + ["arguments", "target_text"], + ]); + if (explicitTargetText) { + return explicitTargetText; + } + if (action === "fill") { + return firstStringAtPaths(step, [ + ["placeholder"], + ["label"], + ["target", "label"], + ["element", "label"], + ["input", "label"], + ["args", "label"], + ["arguments", "label"], + ]); + } + return firstStringAtPaths(step, [ + ["text"], + ["label"], + ["target", "label"], + ["element", "label"], + ]); +} + +function valueDescriptionFromStep( + step: Record, + action: string | undefined +): string | undefined { + if (action !== "type") { + return firstStringAtPaths(step, [ + ["value_description"], + ["valueDescription"], + ]); + } 
+ + const explicitDescription = firstStringAtPaths(step, [ + ["value_description"], + ["valueDescription"], + ]); + if (explicitDescription) { + return explicitDescription; + } + + const typedValue = firstStringAtPaths(step, [ + ["value"], + ["text"], + ["input", "value"], + ["args", "value"], + ["arguments", "value"], + ]); + return typedValue + ? `redacted typed value (${typedValue.length} chars)` + : "redacted typed value"; +} + +function errorMessageFromStep(step: Record): string | undefined { + const errorValue = valueAtFirstPath(step, [ + ["error"], + ["failure"], + ["failure_reason"], + ["failureReason"], + ["result", "error"], + ]); + if (typeof errorValue === "string") { + return errorValue.slice(0, 200); + } + if (isRecord(errorValue) && typeof errorValue.message === "string") { + return errorValue.message.slice(0, 200); + } + return undefined; +} + +function countCaptureArtifacts(run: TinyfishRunWithTrace): number { + const artifactValues = [ + run.captures, + run.capture_artifacts, + run.captureArtifacts, + run.artifacts, + ]; + return artifactValues.reduce((count, value) => { + if (Array.isArray(value)) { + return count + value.length; + } + if (isRecord(value)) { + return count + Object.keys(value).length; + } + return count; + }, 0); +} + +function firstStringAtPaths( + record: Record, + paths: readonly (readonly string[])[] +): string | undefined { + for (const path of paths) { + const value = valueAtPath(record, path); + if (hasNonEmptyString(value)) { + return value.trim().slice(0, 500); + } + } + return undefined; +} + +function valueAtFirstPath( + record: Record, + paths: readonly (readonly string[])[] +): unknown { + for (const path of paths) { + const value = valueAtPath(record, path); + if (value !== undefined && value !== null) { + return value; + } + } + return undefined; +} + +function valueAtPath( + record: Record, + path: readonly string[] +): unknown { + let value: unknown = record; + for (const key of path) { + if (!isRecord(value)) { 
+ return undefined; + } + value = value[key]; + } + return value; +} + +function hasReplayAnchor(action: BrowserActionReport): boolean { + return Boolean(action.url || action.selector || action.target_text || action.targetText); +} + +function hasNonEmptyString(value: unknown): value is string { + return typeof value === "string" && value.trim().length > 0; +} + +function isRecord(value: unknown): value is Record { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +function dedupeBrowserActions( + actions: BrowserActionReport[] +): BrowserActionReport[] { + const seen = new Set(); + const deduped: BrowserActionReport[] = []; + for (const action of actions) { + const key = JSON.stringify([ + action.action ?? "", + action.url ?? "", + action.selector ?? "", + action.target_text ?? action.targetText ?? "", + action.status ?? "", + action.error ?? "", + action.phase ?? "", + action.label ?? "", + ]); + if (seen.has(key)) { + continue; + } + seen.add(key); + deduped.push(action); + } + return deduped; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/models/schemas.ts b/backend/BigSet_Data_Collection_Agent/src/models/schemas.ts index fe1a059..3bc1265 100644 --- a/backend/BigSet_Data_Collection_Agent/src/models/schemas.ts +++ b/backend/BigSet_Data_Collection_Agent/src/models/schemas.ts @@ -102,6 +102,22 @@ export const agentGoalSchema = z.object({ export type AgentGoal = z.infer; +export const browserActionReportSchema = z.object({ + action: z.string().optional(), + url: z.string().optional(), + selector: z.string().optional(), + target_text: z.string().optional(), + targetText: z.string().optional(), + value_description: z.string().optional(), + valueDescription: z.string().optional(), + status: z.string().optional(), + error: z.string().optional(), + phase: z.string().optional(), + label: z.string().optional(), +}); + +export type BrowserActionReport = z.infer; + export const agentRunRecordSchema = z.object({ url: z.string(), 
status: sourceStatusSchema, @@ -110,6 +126,13 @@ export const agentRunRecordSchema = z.object({ goal: z.string(), records_extracted: z.number(), error: z.string().optional(), + agent_step_count: z.number().nullable().optional(), + has_streaming_url: z.boolean().optional(), + has_recording_url: z.boolean().optional(), + capture_artifact_count: z.number().optional(), + result_keys: z.array(z.string()).optional(), + browser_action_diagnostic: z.string().optional(), + browser_actions: z.array(browserActionReportSchema).optional(), }); export type AgentRunRecord = z.infer; @@ -126,6 +149,11 @@ export const triageSummarySchema = z.object({ skipped: z.number(), records_from_extract: z.number(), records_from_agent: z.number(), + agent_reported_step_count: z.number().optional(), + agent_runs_with_streaming_url: z.number().optional(), + agent_runs_with_recording_url: z.number().optional(), + agent_capture_artifact_count: z.number().optional(), + agent_runs_with_explicit_browser_actions: z.number().optional(), }); export type TriageSummary = z.infer; @@ -152,6 +180,7 @@ export const repairLoopReportSchema = z.object({ loop_index: z.number().int().positive(), diagnosis_summary: z.string().optional(), repair_queries: z.array(z.string()), + agent_browser_actions: z.array(browserActionReportSchema).optional(), rationale: z.string().optional(), missing_fields: z.array(z.string()), records_before: z.number(), @@ -198,6 +227,7 @@ export const runReportSchema = z.object({ search_queries: z.array(z.string()), fetched_urls: z.array(z.string()), failed_urls: z.array(z.string()), + agent_browser_actions: z.array(browserActionReportSchema).optional(), }), repair: repairReportSchema, search_queries: z.array(z.string()), diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/browser-actions.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/browser-actions.ts new file mode 100644 index 0000000..064523c --- /dev/null +++ 
b/backend/BigSet_Data_Collection_Agent/src/orchestrator/browser-actions.ts @@ -0,0 +1,227 @@ +import { + browserActionReportSchema, + type AgentRunRecord, + type BrowserActionReport, +} from "../models/schemas.js"; + +const EXPLICIT_BROWSER_ACTION_ARRAY_KEYS = [ + "browser_actions", + "agent_browser_actions", +] as const; + +export function explicitBrowserActionsFromAgentResult( + input: { + agentResult: Record | null; + pageUrl: string; + } +): BrowserActionReport[] { + if (!input.agentResult) { + return []; + } + + const actions: BrowserActionReport[] = []; + for (const key of EXPLICIT_BROWSER_ACTION_ARRAY_KEYS) { + actions.push(...browserActionsFromValue(input.agentResult[key], input.pageUrl)); + } + actions.push(...browserActionsFromNavigationSummary({ + value: input.agentResult.navigation, + pageUrl: input.pageUrl, + hasExtraction: Boolean(input.agentResult.extraction), + })); + return dedupeBrowserActions(actions); +} + +export function explicitBrowserActionsFromAgentRuns( + agentRuns: AgentRunRecord[] +): BrowserActionReport[] { + return dedupeBrowserActions( + agentRuns.flatMap((run) => run.browser_actions ?? []) + ); +} + +function browserActionsFromValue( + value: unknown, + pageUrl: string +): BrowserActionReport[] { + if (Array.isArray(value)) { + return value + .map((item) => browserActionFromValue(item, pageUrl)) + .filter((action): action is BrowserActionReport => Boolean(action)); + } + const action = browserActionFromValue(value, pageUrl); + return action ? 
[action] : []; +} + +function browserActionFromValue( + value: unknown, + pageUrl: string +): BrowserActionReport | undefined { + if (typeof value === "string") { + return browserActionFromString(value, pageUrl); + } + if (!value || typeof value !== "object" || Array.isArray(value)) { + return undefined; + } + const parsed = browserActionReportSchema.safeParse(value); + if (!parsed.success || !hasReplayAnchor(parsed.data)) { + return undefined; + } + return { + ...parsed.data, + url: parsed.data.url ?? pageUrl, + }; +} + +function browserActionFromString( + value: string, + pageUrl: string +): BrowserActionReport | undefined { + const label = value.trim(); + if (!label) { + return undefined; + } + const url = label.match(/https?:\/\/[^\s,)]+/i)?.[0] + ?.replace(/[.?!]+$/, ""); + if (url) { + return { + action: "navigate", + url, + status: "succeeded", + phase: "navigation", + label, + }; + } + + if (/\bextract\b/i.test(label)) { + return { + action: "extract", + url: pageUrl, + status: "succeeded", + phase: "extract", + label, + }; + } + + const sectionText = targetTextFromNavigationInstruction(label); + if (sectionText) { + return { + action: "click", + url: pageUrl, + target_text: sectionText, + status: "succeeded", + phase: "navigation", + label, + }; + } + + return undefined; +} + +function targetTextFromNavigationInstruction(label: string): string | undefined { + const match = label.match( + /\b(?:navigate|go)\s+to\s+(?:the\s+)?(.+?)(?:\s+(?:section|tab|category|page|area))?(?:\s+(?:of|on|to|for)\b|[.?!]|$)/i + ); + const targetText = match?.[1]?.trim(); + return targetText && !/^https?:\/\//i.test(targetText) + ? 
targetText + : undefined; +} + +function hasReplayAnchor(action: BrowserActionReport): boolean { + return Boolean( + action.url || + action.selector || + action.target_text || + action.targetText + ); +} + +function browserActionsFromNavigationSummary(input: { + value: unknown; + pageUrl: string; + hasExtraction: boolean; +}): BrowserActionReport[] { + if (!input.value || typeof input.value !== "object" || Array.isArray(input.value)) { + return []; + } + const navigation = input.value as Record; + const actions: BrowserActionReport[] = []; + const initialUrl = stringValue(navigation.initial_url ?? navigation.initialUrl); + if (initialUrl) { + actions.push({ + action: "navigate", + url: initialUrl, + status: "succeeded", + phase: "initial", + label: "agent-navigation-start", + }); + } + + const categoryClicked = stringValue( + navigation.category_clicked ?? navigation.categoryClicked + ); + if (categoryClicked) { + actions.push({ + action: "click", + url: initialUrl ?? input.pageUrl, + target_text: categoryClicked, + status: "succeeded", + phase: "navigation", + label: "agent-click-category", + }); + } + + const finalUrl = stringValue(navigation.final_url ?? navigation.finalUrl); + if (finalUrl && finalUrl !== initialUrl) { + actions.push({ + action: "navigate", + url: finalUrl, + status: "succeeded", + phase: "navigation", + label: "agent-navigation-final-url", + }); + } + + if (actions.length > 0 && input.hasExtraction) { + actions.push({ + action: "extract", + url: finalUrl ?? initialUrl ?? input.pageUrl, + status: "succeeded", + phase: "extract", + label: "agent-extract-results", + }); + } + + return actions; +} + +function stringValue(value: unknown): string | undefined { + return typeof value === "string" && value.trim().length > 0 + ? 
value.trim() + : undefined; +} + +export function dedupeBrowserActions( + actions: BrowserActionReport[] +): BrowserActionReport[] { + const seen = new Set(); + const deduped: BrowserActionReport[] = []; + for (const action of actions) { + const key = JSON.stringify([ + action.action ?? "", + action.url ?? "", + action.selector ?? "", + action.target_text ?? action.targetText ?? "", + action.status ?? "", + action.error ?? "", + action.phase ?? "", + action.label ?? "", + ]); + if (seen.has(key)) { + continue; + } + seen.add(key); + deduped.push(action); + } + return deduped; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts index ae6af0d..a8c409a 100644 --- a/backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts +++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts @@ -48,6 +48,7 @@ import { type RunPaths, } from "../storage/run-store.js"; import { normalizeUrl } from "../utils/url.js"; +import { explicitBrowserActionsFromAgentRuns } from "./browser-actions.js"; export interface PipelineOptions { prompt: string; @@ -545,6 +546,9 @@ async function executeRunPipeline( const visualizationCount = benchmarkVisualizationRecords.length; const llmUsage = getCurrentLlmUsage(); + const initialAgentBrowserActions = explicitBrowserActionsFromAgentRuns( + initialAcquisition.agentRuns, + ); const report: RunReport = { run_id: runId, @@ -586,6 +590,7 @@ async function executeRunPipeline( search_queries: initialQueries, fetched_urls: initialAcquisition.fetchedUrls, failed_urls: initialAcquisition.failedUrls, + agent_browser_actions: initialAgentBrowserActions, }, repair: repairReport, search_queries: allSearchQueries, diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts index 4009569..2aed17a 100644 --- 
a/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts +++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts @@ -5,6 +5,7 @@ import { triagePage } from "../agents/source-triage.js"; import { derivePromptSourcePolicy } from "../agents/source-policy.js"; import { config } from "../config.js"; import { runTinyfishAgentsBatch } from "../integrations/tinyfish-agent.js"; +import type { TinyfishAgentRunResult } from "../integrations/tinyfish-agent.js"; import type { WorkflowMemory } from "../memory/index.js"; import { getPrimaryKeyValue } from "../merge/records.js"; import { @@ -26,6 +27,10 @@ import { } from "../queue/pools.js"; import { saveJson, type RunPaths } from "../storage/run-store.js"; import { getDomain } from "../utils/url.js"; +import { + dedupeBrowserActions, + explicitBrowserActionsFromAgentResult, +} from "./browser-actions.js"; import { join } from "node:path"; export interface AgentDeferredEntry { @@ -55,6 +60,67 @@ function emptySummary(): TriageSummary { skipped: 0, records_from_extract: 0, records_from_agent: 0, + agent_reported_step_count: 0, + agent_runs_with_streaming_url: 0, + agent_runs_with_recording_url: 0, + agent_capture_artifact_count: 0, + agent_runs_with_explicit_browser_actions: 0, + }; +} + +function recordAgentRunProvenance( + summary: TriageSummary, + run: TinyfishAgentRunResult, + browserActionCount: number, +): void { + summary.agent_reported_step_count = + (summary.agent_reported_step_count ?? 0) + + (run.agent_step_count ?? 0); + if (run.has_streaming_url) { + summary.agent_runs_with_streaming_url = + (summary.agent_runs_with_streaming_url ?? 0) + 1; + } + if (run.has_recording_url) { + summary.agent_runs_with_recording_url = + (summary.agent_runs_with_recording_url ?? 0) + 1; + } + summary.agent_capture_artifact_count = + (summary.agent_capture_artifact_count ?? 
0) + run.capture_artifact_count; + if (browserActionCount > 0) { + summary.agent_runs_with_explicit_browser_actions = + (summary.agent_runs_with_explicit_browser_actions ?? 0) + 1; + } +} + +function agentRunProvenanceFields(input: { + run: TinyfishAgentRunResult; + recordsExtracted: number; + browserActionCount: number; +}): Pick< + AgentRunRecord, + | "agent_step_count" + | "has_streaming_url" + | "has_recording_url" + | "capture_artifact_count" + | "result_keys" + | "browser_action_diagnostic" +> { + const hasReportedBrowserWork = (input.run.agent_step_count ?? 0) > 0; + const missingExplicitBrowserActions = + hasReportedBrowserWork && input.browserActionCount === 0; + const browserActionDiagnostic = missingExplicitBrowserActions + ? input.recordsExtracted > 0 + ? "Agent completed and returned rows, but polled run payload exposed no explicit browser actions." + : "Agent completed with reported browser work, but polled run payload exposed no explicit browser actions." + : undefined; + + return { + agent_step_count: input.run.agent_step_count, + has_streaming_url: input.run.has_streaming_url, + has_recording_url: input.run.has_recording_url, + capture_artifact_count: input.run.capture_artifact_count, + result_keys: input.run.result_keys, + browser_action_diagnostic: browserActionDiagnostic, }; } @@ -389,8 +455,16 @@ export async function processFetchedPages(options: { jobsToExtract, async ({ job, run }) => { const pageUrl = job.pageUrl; + const browserActions = dedupeBrowserActions([ + ...(run.browser_actions ?? []), + ...explicitBrowserActionsFromAgentResult({ + agentResult: run.result, + pageUrl, + }), + ]); if (run.error || !run.result) { + recordAgentRunProvenance(summary, run, browserActions.length); summary.agent_failed += 1; agentRuns.push({ url: pageUrl, @@ -400,6 +474,14 @@ export async function processFetchedPages(options: { goal: job.goal, records_extracted: 0, error: run.error ?? 
"No result returned", + ...agentRunProvenanceFields({ + run, + recordsExtracted: 0, + browserActionCount: browserActions.length, + }), + browser_actions: browserActions.length > 0 + ? browserActions + : undefined, }); options.log( options.label, @@ -408,6 +490,8 @@ export async function processFetchedPages(options: { return; } + recordAgentRunProvenance(summary, run, browserActions.length); + try { const agentRecords = await extractFromAgentResult({ spec: options.spec, @@ -432,6 +516,14 @@ export async function processFetchedPages(options: { agent_status: run.status, goal: job.goal, records_extracted: agentRecords.length, + ...agentRunProvenanceFields({ + run, + recordsExtracted: agentRecords.length, + browserActionCount: browserActions.length, + }), + browser_actions: browserActions.length > 0 + ? browserActions + : undefined, }); options.log( @@ -450,6 +542,14 @@ export async function processFetchedPages(options: { goal: job.goal, records_extracted: 0, error: msg, + ...agentRunProvenanceFields({ + run, + recordsExtracted: 0, + browserActionCount: browserActions.length, + }), + browser_actions: browserActions.length > 0 + ? 
browserActions + : undefined, }); } }, diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/repair-loop.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/repair-loop.ts index 892f531..1def4e9 100644 --- a/backend/BigSet_Data_Collection_Agent/src/orchestrator/repair-loop.ts +++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/repair-loop.ts @@ -29,6 +29,7 @@ import { runAcquisitionPhase, type AcquisitionResult, } from "./acquisition.js"; +import { explicitBrowserActionsFromAgentRuns } from "./browser-actions.js"; export interface RepairLoopContext { userPrompt: string; @@ -235,6 +236,9 @@ export async function runRepairLoops(options: { loop_index: loopIndex, diagnosis_summary: diagnosis.summary, repair_queries: repairPlan.repair_queries, + agent_browser_actions: explicitBrowserActionsFromAgentRuns( + acquisition.agentRuns + ), rationale: repairPlan.rationale, missing_fields: coverage.field_gaps.map((gap) => gap.column), records_before: recordsBeforeLoop.length, diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 38eb942..fbf4441 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -56,11 +56,11 @@ Writes to Convex via `ConvexHttpClient` in `src/convex.ts`. Import `{ convex, ap ## Environment -Required env vars (see `.env.example`): +Required env vars (see root `.env.example`): - `CONVEX_URL` — Convex instance URL - `CONVEX_SELF_HOSTED_ADMIN_KEY` — for system-level Convex writes (internal mutations) - `CLERK_SECRET_KEY`, `CLERK_PUBLISHABLE_KEY` — for JWT verification - `OPENROUTER_API_KEY` — for AI model calls - `TINYFISH_API_KEY` — for web search and fetch (populate agent). Get one at https://agent.tinyfish.ai/api-keys -In Docker, these are interpolated from the root `.env` file via `docker-compose.dev.yml`. +In Docker, these are loaded from the root `.env` file via `docker-compose.dev.yml`. 
diff --git a/backend/Dockerfile.dev b/backend/Dockerfile.dev index 890a3c1..1642a70 100644 --- a/backend/Dockerfile.dev +++ b/backend/Dockerfile.dev @@ -2,12 +2,24 @@ FROM node:22-alpine WORKDIR /app +RUN apk add --no-cache \ + ca-certificates \ + chromium \ + freetype \ + harfbuzz \ + nss \ + su-exec \ + ttf-freefont + COPY package.json package-lock.json ./ RUN npm ci COPY tsconfig.json ./ COPY src/ ./src/ +COPY BigSet_Data_Collection_Agent/ ./BigSet_Data_Collection_Agent/ +COPY docker-entrypoint.dev.sh /usr/local/bin/docker-entrypoint.dev.sh RUN chown -R node:node /app -USER node +RUN chmod +x /usr/local/bin/docker-entrypoint.dev.sh +ENTRYPOINT ["docker-entrypoint.dev.sh"] CMD ["npx", "tsx", "watch", "src/index.ts"] diff --git a/backend/Dockerfile.mastra b/backend/Dockerfile.mastra index ad3f896..ca46847 100644 --- a/backend/Dockerfile.mastra +++ b/backend/Dockerfile.mastra @@ -2,6 +2,14 @@ FROM node:22-alpine WORKDIR /app +RUN apk add --no-cache \ + ca-certificates \ + chromium \ + freetype \ + harfbuzz \ + nss \ + ttf-freefont + COPY package.json package-lock.json ./ RUN npm ci diff --git a/backend/README.md b/backend/README.md index 107b90c..896f6ac 100644 --- a/backend/README.md +++ b/backend/README.md @@ -5,8 +5,10 @@ Fastify server that handles auth, database, and talks to TinyFish APIs. ## Running ```bash +# From the repo root: cp .env.example .env -# Set BETTER_AUTH_SECRET (openssl rand -base64 32) +# Fill in the root .env file. +cd backend npm install npx drizzle-kit push npm run dev @@ -28,3 +30,14 @@ Starts on [localhost:3501](http://localhost:3501). | `npm run dev` | Start with hot reload | | `npm run build` | Compile TypeScript | | `npm run db:push` | Push schema changes to Postgres | + +Local backend scripts load the repo-root `.env` through `../scripts/with-root-env.mjs`. + +## Self-Healing Commit Cap + +`populate:self-heal --commit` and `POST /populate` use a configurable +per-dataset hourly safety throttle before writing rows. 
Override with +`POPULATE_COMMIT_ROW_LIMIT_PER_HOUR` or CLI +`--commit-row-limit-per-hour`. + +Dry runs and benchmarks do not commit rows, so they do not consume this cap. diff --git a/backend/docker-entrypoint.dev.sh b/backend/docker-entrypoint.dev.sh new file mode 100644 index 0000000..6728b79 --- /dev/null +++ b/backend/docker-entrypoint.dev.sh @@ -0,0 +1,7 @@ +#!/bin/sh +set -eu + +mkdir -p /app/.bigset +chown -R node:node /app/.bigset + +exec su-exec node "$@" diff --git a/backend/package-lock.json b/backend/package-lock.json index ea6aa1c..eaf37c2 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -18,6 +18,7 @@ "dotenv": "^16.4.0", "fastify": "^5.0.0", "fastify-plugin": "^5.1.0", + "playwright-core": "^1.58.2", "zod": "^4.4.3" }, "devDependencies": { @@ -7399,6 +7400,18 @@ "pathe": "^2.0.1" } }, + "node_modules/playwright-core": { + "version": "1.58.2", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.2.tgz", + "integrity": "sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/posthog-node": { "version": "5.34.7", "resolved": "https://registry.npmjs.org/posthog-node/-/posthog-node-5.34.7.tgz", diff --git a/backend/package.json b/backend/package.json index f7784a9..aad84aa 100644 --- a/backend/package.json +++ b/backend/package.json @@ -4,12 +4,12 @@ "type": "module", "private": true, "scripts": { - "dev": "tsx watch src/index.ts", + "dev": "node ../scripts/with-root-env.mjs tsx watch src/index.ts", "test": "node --import tsx --test test/*.test.ts", "build": "tsc", "start": "node dist/index.js", - "mastra:dev": "mastra dev", - "populate:self-heal": "tsx src/pipeline/populate-self-healing-cli.ts" + "mastra:dev": "node ../scripts/with-root-env.mjs mastra dev", + "populate:self-heal": "node ../scripts/with-root-env.mjs tsx 
src/pipeline/populate-self-healing-cli.ts" }, "dependencies": { "@clerk/backend": "^3.4.11", @@ -22,6 +22,7 @@ "dotenv": "^16.4.0", "fastify": "^5.0.0", "fastify-plugin": "^5.1.0", + "playwright-core": "^1.58.2", "zod": "^4.4.3" }, "devDependencies": { diff --git a/backend/prompts/schema-inference.txt b/backend/prompts/schema-inference.txt index 9752429..49d1d07 100644 --- a/backend/prompts/schema-inference.txt +++ b/backend/prompts/schema-inference.txt @@ -18,7 +18,7 @@ Rules: - All column `name` values must be snake_case and unique. - Prefer concrete column choices over speculative ones — better to omit a column than guess wildly. -# @MMeteorL's comments/suggestions: +# Collection pipeline review comments: # This may be too early in the agent workflow to suggest these without more # context. In the current agent system, the agent would first use Tinyfish # search to search for candidate urls and fetch those results for analysis diff --git a/backend/src/env.ts b/backend/src/env.ts index 475994b..e088d2d 100644 --- a/backend/src/env.ts +++ b/backend/src/env.ts @@ -1,4 +1,7 @@ -import "dotenv/config"; +import { config as loadDotenv } from "dotenv"; +import { fileURLToPath } from "node:url"; + +loadDotenv({ path: fileURLToPath(new URL("../../.env", import.meta.url)) }); function required(name: string): string { const value = process.env[name]; @@ -21,7 +24,9 @@ export const env = { // Used by ./clerk-auth.ts to verify JWTs on protected routes (e.g. // /infer-schema). Required for the backend to function. CLERK_SECRET_KEY: process.env.CLERK_SECRET_KEY, - CLERK_PUBLISHABLE_KEY: process.env.CLERK_PUBLISHABLE_KEY, + CLERK_PUBLISHABLE_KEY: + process.env.CLERK_PUBLISHABLE_KEY ?? 
+ process.env.NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY, OPENROUTER_API_KEY: process.env.OPENROUTER_API_KEY, TINYFISH_API_KEY: process.env.TINYFISH_API_KEY, diff --git a/backend/src/index.ts b/backend/src/index.ts index b73b1ae..db3388b 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -1,14 +1,20 @@ import { env } from "./env.js"; import clerkAuthPlugin, { requireAuth } from "./clerk-auth.js"; +import { ConvexPopulateDatasetOwnerLoader } from "./pipeline/populate-dataset-owner-loader.js"; import { ConvexPopulateDatasetRowWriter } from "./pipeline/populate-convex-writer.js"; -import { convex, api } from "./convex.js"; +import { convex, internal } from "./convex.js"; import { createBigSetServer } from "./server.js"; +const datasetOwnerLoader = new ConvexPopulateDatasetOwnerLoader({ + convexClient: convex, + internalApi: internal, +}); + const fastify = await createBigSetServer({ env, authPlugin: clerkAuthPlugin, authPreHandler: requireAuth, - getDatasetById: (datasetId) => convex.query(api.datasets.get, { id: datasetId }), + getDatasetById: (datasetId) => datasetOwnerLoader.loadDataset(datasetId), populateRowWriter: new ConvexPopulateDatasetRowWriter(), }); diff --git a/backend/src/pipeline/collection-agent-runner.ts b/backend/src/pipeline/collection-agent-runner.ts index 9321a06..803a49a 100644 --- a/backend/src/pipeline/collection-agent-runner.ts +++ b/backend/src/pipeline/collection-agent-runner.ts @@ -7,9 +7,12 @@ import type { CollectionPopulatePipelineInput, CollectionPopulatePipelineRunner, } from "./populate-collection-runtime.js"; -import type { - PopulateCellValue, - PopulateRuntimeResult, +import { + populateProcessTraceFromSteps, + type PopulateCellValue, + type PopulateRuntimeBrowserAction, + type PopulateRuntimeResult, + type PopulateRuntimeTraceStep, } from "./populate-runtime.js"; type CollectionPipelineModule = { @@ -36,14 +39,31 @@ interface CollectionPipelineOptions { } interface CollectionPipelineResult { + runId?: string; + paths?: { + 
root?: string; + reportPath?: string; + }; report: { errors?: string[]; dataset_spec?: CollectionDatasetSpec; stats?: CollectionPhaseStats; - initial?: CollectionPhaseStats; + initial?: CollectionPhaseStats & { + search_queries?: string[]; + fetched_urls?: string[]; + failed_urls?: string[]; + browser_actions?: CollectionBrowserActionReport[]; + agent_browser_actions?: CollectionBrowserActionReport[]; + }; repair?: { stats?: CollectionPhaseStats; + loops?: CollectionRepairLoopReport[]; }; + search_queries?: string[]; + fetched_urls?: string[]; + failed_urls?: string[]; + browser_actions?: CollectionBrowserActionReport[]; + agent_browser_actions?: CollectionBrowserActionReport[]; quality?: { records?: CollectionRecordQuality[]; }; @@ -75,6 +95,9 @@ interface CollectionPhaseStats { agent_dispatched?: number; agent_succeeded?: number; agent_failed?: number; + agent_reported_step_count?: number; + agent_runs_with_streaming_url?: number; + agent_runs_with_explicit_browser_actions?: number; }; } @@ -98,8 +121,34 @@ interface CollectionSourcesReport { } interface CollectionSourceOutcome { + url?: string; + phase?: string; outcome?: string; triage_status?: string; + error?: string; + records_extracted?: number; +} + +interface CollectionRepairLoopReport { + loop_index?: number; + repair_queries?: string[]; + browser_actions?: CollectionBrowserActionReport[]; + agent_browser_actions?: CollectionBrowserActionReport[]; + stats?: CollectionPhaseStats; +} + +interface CollectionBrowserActionReport { + action?: string; + url?: string; + selector?: string; + target_text?: string; + targetText?: string; + value_description?: string; + valueDescription?: string; + status?: string; + error?: string; + phase?: string; + label?: string; } const AGENT_REQUIRED_TRIAGE_STATUSES = new Set([ @@ -108,7 +157,7 @@ const AGENT_REQUIRED_TRIAGE_STATUSES = new Set([ "requires_detail_page_followup", ]); -const DEFAULT_COLLECTION_AGENT_POLL_TIMEOUT_MS = 480_000; +const 
DEFAULT_COLLECTION_AGENT_POLL_TIMEOUT_MS = 1_200_000; export const runCollectionPopulatePipeline: CollectionPopulatePipelineRunner = async (input) => { @@ -200,6 +249,16 @@ function collectionPipelineResultToPopulateRuntimeResult(input: { ], usage: usageFromPipeline(input.pipeline), metrics: metricsFromReport(input.pipeline.report), + debug: { + capturedRows: [], + capturedSources: [], + selectedRowSource: rows.length > 0 ? "collection_pipeline" : "none", + notes: collectionDebugNotes(input.pipeline.report), + processTrace: collectionProcessTrace({ + pipeline: input.pipeline, + rows, + }), + }, }; } @@ -231,6 +290,235 @@ function capabilityDiagnosticsFromReport(input: { ]; } +function collectionProcessTrace(input: { + pipeline: CollectionPipelineResult; + rows: Array>; +}) { + const report = input.pipeline.report; + const steps: PopulateRuntimeTraceStep[] = []; + + for (const query of report.search_queries ?? report.initial?.search_queries ?? []) { + steps.push({ + kind: "search", + label: "collection-search-query", + status: "succeeded", + input: { query }, + }); + } + + for (const url of report.fetched_urls ?? report.initial?.fetched_urls ?? []) { + steps.push({ + kind: "fetch", + label: "collection-fetched-url", + status: "succeeded", + input: { url }, + }); + } + + for (const url of report.failed_urls ?? report.initial?.failed_urls ?? []) { + steps.push({ + kind: "fetch", + label: "collection-failed-url", + status: "failed", + input: { url }, + }); + } + + for (const loop of report.repair?.loops ?? []) { + for (const query of loop.repair_queries ?? []) { + steps.push({ + kind: "repair", + label: "collection-repair-query", + status: "succeeded", + input: { + loopIndex: loop.loop_index, + query, + }, + }); + } + steps.push(...browserTraceStepsFromReports({ + reports: [ + ...(loop.browser_actions ?? []), + ...(loop.agent_browser_actions ?? []), + ], + defaultPhase: `repair-loop-${loop.loop_index ?? 
"unknown"}`, + })); + } + + steps.push(...browserTraceStepsFromReports({ + reports: [ + ...(report.browser_actions ?? []), + ...(report.agent_browser_actions ?? []), + ...(report.initial?.browser_actions ?? []), + ...(report.initial?.agent_browser_actions ?? []), + ], + defaultPhase: "initial", + })); + + for (const outcome of report.sources?.outcomes ?? []) { + if (!outcome.url) { + continue; + } + steps.push({ + kind: sourceOutcomeTraceKind(outcome), + label: `collection-source-${outcome.outcome ?? "unknown"}`, + status: sourceOutcomeTraceStatus(outcome), + input: { + url: outcome.url, + phase: outcome.phase, + triageStatus: outcome.triage_status, + }, + output: { + recordsExtracted: outcome.records_extracted, + }, + error: outcome.error, + }); + } + + return populateProcessTraceFromSteps({ + runtime: "collection", + steps, + selectedRowSource: input.rows.length > 0 ? "collection_pipeline" : "none", + notes: collectionDebugNotes(report), + artifactRoot: input.pipeline.paths?.root, + runReportPath: input.pipeline.paths?.reportPath, + }); +} + +function collectionDebugNotes(report: CollectionPipelineResult["report"]): string[] { + const notes = []; + if (report.stats) { + notes.push( + `collection stats: searches=${numberValue(report.stats.search_queries_executed)}, ` + + `fetches=${numberValue(report.stats.pages_fetched)}` + ); + } + if (report.repair?.loops && report.repair.loops.length > 0) { + notes.push(`collection repair loops=${report.repair.loops.length}`); + } + const triage = report.stats?.triage ?? 
report.initial?.triage; + if ( + numberValue(triage?.agent_reported_step_count) > 0 && + numberValue(triage?.agent_runs_with_explicit_browser_actions) === 0 + ) { + notes.push( + `collection Agent reported ${numberValue(triage?.agent_reported_step_count)} step(s), but emitted no explicit browser actions for Playwright replay` + ); + } + return notes; +} + +function browserTraceStepsFromReports(input: { + reports: CollectionBrowserActionReport[]; + defaultPhase: string; +}): PopulateRuntimeTraceStep[] { + return input.reports + .map((report) => browserTraceStepFromReport({ + report, + defaultPhase: input.defaultPhase, + })) + .filter((step): step is PopulateRuntimeTraceStep => Boolean(step)); +} + +function browserTraceStepFromReport(input: { + report: CollectionBrowserActionReport; + defaultPhase: string; +}): PopulateRuntimeTraceStep | undefined { + const browserAction = browserActionFromReport(input.report); + if (!browserAction) { + return undefined; + } + + return { + kind: "browser", + label: input.report.label ?? `collection-browser-${browserAction.action}`, + status: browserActionTraceStatus(input.report.status), + input: { + url: browserAction.url, + selector: browserAction.selector, + targetText: browserAction.targetText, + phase: input.report.phase ?? input.defaultPhase, + }, + error: input.report.error, + browserAction, + }; +} + +function browserActionFromReport( + report: CollectionBrowserActionReport +): PopulateRuntimeBrowserAction | undefined { + const action = browserActionKind(report.action); + const targetText = report.targetText ?? report.target_text; + const valueDescription = + report.valueDescription ?? 
report.value_description; + if (!report.url && !report.selector && !targetText) { + return undefined; + } + return { + action, + url: report.url, + selector: report.selector, + targetText, + valueDescription, + }; +} + +function browserActionKind( + value: string | undefined +): PopulateRuntimeBrowserAction["action"] { + const normalized = value?.trim().toLowerCase(); + if ( + normalized === "navigate" || + normalized === "click" || + normalized === "type" || + normalized === "fill" || + normalized === "select" || + normalized === "wait" || + normalized === "extract" || + normalized === "screenshot" + ) { + return normalized === "fill" ? "type" : normalized; + } + return "unknown"; +} + +function browserActionTraceStatus( + value: string | undefined +): PopulateRuntimeTraceStep["status"] { + const normalized = value?.trim().toLowerCase(); + if (normalized === "failed" || normalized === "error") { + return "failed"; + } + if (normalized === "skipped") { + return "skipped"; + } + return "succeeded"; +} + +function sourceOutcomeTraceKind(outcome: CollectionSourceOutcome): PopulateRuntimeTraceStep["kind"] { + if (outcome.outcome?.startsWith("agent_")) { + return "agent"; + } + if (outcome.outcome === "fetch_failed") { + return "fetch"; + } + return "validation"; +} + +function sourceOutcomeTraceStatus( + outcome: CollectionSourceOutcome +): PopulateRuntimeTraceStep["status"] { + if ( + outcome.outcome && + ["fetch_failed", "skipped", "agent_failed", "agent_deferred", "no_records"].includes( + outcome.outcome + ) + ) { + return "failed"; + } + return "succeeded"; +} + function isAgentRequiredSourceOutcome(outcome: CollectionSourceOutcome): boolean { return ( typeof outcome.triage_status === "string" && @@ -333,17 +621,23 @@ function metricsFromReport(report: CollectionPipelineResult["report"]) { const agentDispatched = numberValue(initialTriage.agent_dispatched) + numberValue(repairTriage.agent_dispatched); + const reportedAgentSteps = + 
numberValue(initialTriage.agent_reported_step_count) + + numberValue(repairTriage.agent_reported_step_count); + const fallbackAgentSteps = + numberValue(initialTriage.agent_succeeded) + + numberValue(initialTriage.agent_failed) + + numberValue(repairTriage.agent_succeeded) + + numberValue(repairTriage.agent_failed); return { searchCalls: numberValue(stats.search_queries_executed), fetchCalls: numberValue(stats.pages_fetched), browserCalls: agentDispatched, agentRuns: agentDispatched, - agentSteps: - numberValue(initialTriage.agent_succeeded) + - numberValue(initialTriage.agent_failed) + - numberValue(repairTriage.agent_succeeded) + - numberValue(repairTriage.agent_failed), + agentSteps: reportedAgentSteps > 0 + ? reportedAgentSteps + : fallbackAgentSteps, }; } diff --git a/backend/src/pipeline/populate-browser-action-box.ts b/backend/src/pipeline/populate-browser-action-box.ts new file mode 100644 index 0000000..cad1a56 --- /dev/null +++ b/backend/src/pipeline/populate-browser-action-box.ts @@ -0,0 +1,781 @@ +import { createHash, randomUUID } from "node:crypto"; + +import { + populateProcessTraceFromSteps, + type PopulateCellValue, + type PopulateRuntimeDebug, + type PopulateRuntimeResult, + type PopulateRuntimeRow, + type PopulateRuntimeTraceStep, +} from "./populate-runtime.js"; +import { + playwrightCandidateReadinessForRun, + type PopulatePlaywrightCandidateReadiness, +} from "./populate-playwright-readiness.js"; +import { playwrightCandidateScriptForRun } from "./populate-playwright-candidate-script.js"; +import { + recordTinyFishTrace, + createTinyFishTraceRecorderClient, + tinyFishTraceProcessSteps, + type TinyFishRecordedTrace, + type TinyFishTraceRecorderClient, +} from "./populate-tinyfish-trace-recorder.js"; + +export interface BrowserActionBoxDatasetSchema { + columns: Array<{ + name: string; + description?: string; + required?: boolean; + }>; + dedupeKey?: string; +} + +export interface BrowserActionBoxRunCaps { + maxAgentSteps: number; + 
maxDurationSeconds: number; + captureHtml: boolean; + captureScreenshots: boolean; +} + +export interface PlaywrightScriptRegistryKey { + sourceUrlCanonical: string; + datasetGoalFingerprint: string; + datasetSchemaFingerprint: string; + promptPolicyVersion: string; + scriptGeneratorVersion: string; +} + +export interface PlaywrightScriptArtifact { + scriptId: string; + sourceUrl: string; + createdAt: string; + status: "draft" | "promoted" | "rejected"; + generatorVersion: string; + registryKey: PlaywrightScriptRegistryKey; + code: string; + diagnostics: string[]; +} + +export interface BrowserActionBoxFirstRunInput { + sourceUrl: string; + datasetGoalPrompt: string; + datasetSchema: BrowserActionBoxDatasetSchema; + runCaps: BrowserActionBoxRunCaps; +} + +export interface BrowserActionBoxFirstRunOutput { + agentCompatibleResult: Record; + runtimeResult: PopulateRuntimeResult; + trace: TinyFishRecordedTrace; + playwrightScript: PlaywrightScriptArtifact | null; + replayReadiness: PopulatePlaywrightCandidateReadiness; + diagnostics: string[]; +} + +export interface BrowserActionBoxReplayInput { + sourceUrl: string; + datasetGoalPrompt: string; + datasetSchema: BrowserActionBoxDatasetSchema; + currentPlaywrightScript: PlaywrightScriptArtifact; + previousSuccessfulOutputProfile: { + fieldsPreviouslyRetrieved: string[]; + rowCountRange?: { min: number; max?: number }; + sourceUrls: string[]; + evidenceRequired: boolean; + }; + runCaps: { + maxReplayAttempts: 1; + maxRepairAttempts: 1; + timeoutMs: number; + }; +} + +export interface PlaywrightReplayTrace { + status: "succeeded" | "failed"; + startedAt: string; + completedAt: string; + scriptId: string; + sourceUrl: string; + failedStepIndex?: number; + failedAction?: string; + currentUrl?: string; + error?: string; + screenshotRef?: string; + htmlRef?: string; + diagnostics: string[]; + steps: PopulateRuntimeTraceStep[]; +} + +export interface BrowserActionBoxReplayOutput { + agentCompatibleResult: Record | null; + 
runtimeResult: PopulateRuntimeResult | null; + trace: PlaywrightReplayTrace; + replayStatus: + | "replay_succeeded" + | "replay_failed" + | "repair_promoted" + | "repair_rejected"; + repairedPlaywrightScript?: PlaywrightScriptArtifact; + diagnostics: string[]; +} + +export interface PlaywrightReplayRunnerResult { + agentCompatibleResult: Record | null; + trace?: Partial; + error?: string; +} + +export interface BrowserActionBoxHooks { + tinyFishClient: TinyFishTraceRecorderClient; + runPlaywrightScript?: ( + input: BrowserActionBoxReplayInput & { + script: PlaywrightScriptArtifact; + } + ) => Promise; + repairPlaywrightScript?: ( + input: BrowserActionBoxReplayInput & { + failedReplay: PlaywrightReplayTrace; + diagnostics: string[]; + } + ) => Promise; + now?: () => Date; +} + +export class BrowserActionBox { + constructor(private readonly hooks: BrowserActionBoxHooks) {} + + async firstRun( + input: BrowserActionBoxFirstRunInput + ): Promise { + const trace = await recordTinyFishTrace({ + sourceUrl: input.sourceUrl, + goal: browserActionBoxGoal(input), + captureHtml: input.runCaps.captureHtml, + captureScreenshots: input.runCaps.captureScreenshots, + maxDurationSeconds: input.runCaps.maxDurationSeconds, + maxAgentSteps: input.runCaps.maxAgentSteps, + client: this.hooks.tinyFishClient, + }); + const agentCompatibleResult = trace.finalResult ?? { rows: [] }; + const runtimeResult = populateRuntimeResultFromAgentCompatibleResult({ + agentCompatibleResult, + datasetSchema: input.datasetSchema, + sourceUrl: input.sourceUrl, + trace, + diagnosticArtifacts: [{ + kind: "tinyfish-trace", + label: "populate-tinyfish-trace", + content: safeJsonStringify(trace), + }], + }); + const replayReadiness = playwrightCandidateReadinessForRun({ + result: runtimeResult, + }); + const code = playwrightCandidateScriptForRun({ result: runtimeResult }); + const playwrightScript = code + ? 
createPlaywrightScriptArtifact({ + sourceUrl: input.sourceUrl, + datasetGoalPrompt: input.datasetGoalPrompt, + datasetSchema: input.datasetSchema, + code, + status: "draft", + createdAt: this.now().toISOString(), + diagnostics: replayReadiness.reasons, + }) + : null; + + return { + agentCompatibleResult, + runtimeResult, + trace, + playwrightScript, + replayReadiness, + diagnostics: trace.diagnostics, + }; + } + + async replay( + input: BrowserActionBoxReplayInput + ): Promise { + const replay = await this.runPlaywrightScript({ + ...input, + script: input.currentPlaywrightScript, + }); + const replayTrace = this.replayTraceFromRunner({ + input, + result: replay, + script: input.currentPlaywrightScript, + }); + const replayValidation = validateReplayAgentCompatibleResult({ + agentCompatibleResult: replay.agentCompatibleResult, + profile: input.previousSuccessfulOutputProfile, + }); + + if (replayTrace.status === "succeeded" && replayValidation.isValid) { + const runtimeResult = populateRuntimeResultFromReplay({ + input, + agentCompatibleResult: replay.agentCompatibleResult!, + trace: replayTrace, + status: "replay_succeeded", + }); + return { + agentCompatibleResult: replay.agentCompatibleResult, + runtimeResult, + trace: replayTrace, + replayStatus: "replay_succeeded", + diagnostics: replayValidation.issues, + }; + } + + const failureDiagnostics = [ + ...replayTrace.diagnostics, + ...replayValidation.issues, + classifyReplayFailure({ + replayTrace, + validationIssues: replayValidation.issues, + }), + ]; + if (!this.hooks.repairPlaywrightScript || input.runCaps.maxRepairAttempts < 1) { + return { + agentCompatibleResult: null, + runtimeResult: null, + trace: replayTrace, + replayStatus: "replay_failed", + diagnostics: failureDiagnostics, + }; + } + + const repairedScript = await this.hooks.repairPlaywrightScript({ + ...input, + failedReplay: replayTrace, + diagnostics: failureDiagnostics, + }); + if (!repairedScript) { + return { + agentCompatibleResult: null, + 
runtimeResult: null, + trace: replayTrace, + replayStatus: "repair_rejected", + diagnostics: [...failureDiagnostics, "Repair did not produce a script candidate."], + }; + } + + const repairedReplay = await this.runPlaywrightScript({ + ...input, + currentPlaywrightScript: repairedScript, + script: repairedScript, + }); + const repairedTrace = this.replayTraceFromRunner({ + input, + result: repairedReplay, + script: repairedScript, + }); + const repairedValidation = validateReplayAgentCompatibleResult({ + agentCompatibleResult: repairedReplay.agentCompatibleResult, + profile: input.previousSuccessfulOutputProfile, + }); + if (repairedTrace.status === "succeeded" && repairedValidation.isValid) { + const promotedScript = { + ...repairedScript, + status: "promoted" as const, + diagnostics: repairedValidation.issues, + }; + const runtimeResult = populateRuntimeResultFromReplay({ + input, + agentCompatibleResult: repairedReplay.agentCompatibleResult!, + trace: repairedTrace, + status: "repair_promoted", + repairedScript: promotedScript, + }); + return { + agentCompatibleResult: repairedReplay.agentCompatibleResult, + runtimeResult, + trace: repairedTrace, + replayStatus: "repair_promoted", + repairedPlaywrightScript: promotedScript, + diagnostics: repairedValidation.issues, + }; + } + + return { + agentCompatibleResult: null, + runtimeResult: null, + trace: repairedTrace, + replayStatus: "repair_rejected", + diagnostics: [ + ...failureDiagnostics, + ...repairedTrace.diagnostics, + ...repairedValidation.issues, + ], + }; + } + + private async runPlaywrightScript( + input: BrowserActionBoxReplayInput & { script: PlaywrightScriptArtifact } + ): Promise { + if (!this.hooks.runPlaywrightScript) { + return { + agentCompatibleResult: null, + error: "No Playwright replay runner is configured.", + }; + } + return this.hooks.runPlaywrightScript(input); + } + + private replayTraceFromRunner(input: { + input: BrowserActionBoxReplayInput; + script: PlaywrightScriptArtifact; + result: 
PlaywrightReplayRunnerResult; + }): PlaywrightReplayTrace { + const now = this.now(); + const startedAt = + input.result.trace?.startedAt ?? + new Date(now.getTime() - 1).toISOString(); + const completedAt = input.result.trace?.completedAt ?? now.toISOString(); + const status = input.result.error || !input.result.agentCompatibleResult + ? "failed" + : input.result.trace?.status ?? "succeeded"; + return { + status, + startedAt, + completedAt, + scriptId: input.script.scriptId, + sourceUrl: input.input.sourceUrl, + failedStepIndex: input.result.trace?.failedStepIndex, + failedAction: input.result.trace?.failedAction, + currentUrl: input.result.trace?.currentUrl, + error: input.result.error ?? input.result.trace?.error, + screenshotRef: input.result.trace?.screenshotRef, + htmlRef: input.result.trace?.htmlRef, + diagnostics: [ + ...(input.result.trace?.diagnostics ?? []), + ...(input.result.error ? [input.result.error] : []), + ], + steps: input.result.trace?.steps ?? [{ + kind: "browser", + label: "playwright-replay", + status: status === "succeeded" ? "succeeded" : "failed", + input: { + sourceUrl: input.input.sourceUrl, + scriptId: input.script.scriptId, + }, + error: input.result.error, + }], + }; + } + + private now(): Date { + return this.hooks.now?.() ?? 
new Date(); + } +} + +export function createTinyFishBrowserActionBox(input: { + apiKey: string; + pollIntervalMs?: number; + runPlaywrightScript?: BrowserActionBoxHooks["runPlaywrightScript"]; + repairPlaywrightScript?: BrowserActionBoxHooks["repairPlaywrightScript"]; +}): BrowserActionBox { + return new BrowserActionBox({ + tinyFishClient: createTinyFishTraceRecorderClient(input), + runPlaywrightScript: input.runPlaywrightScript, + repairPlaywrightScript: input.repairPlaywrightScript, + }); +} + +export function createPlaywrightScriptArtifact(input: { + sourceUrl: string; + datasetGoalPrompt: string; + datasetSchema: BrowserActionBoxDatasetSchema; + code: string; + status: PlaywrightScriptArtifact["status"]; + createdAt: string; + diagnostics?: string[]; +}): PlaywrightScriptArtifact { + const registryKey = playwrightScriptRegistryKey(input); + return { + scriptId: `pw-${shortHash(JSON.stringify(registryKey))}`, + sourceUrl: input.sourceUrl, + createdAt: input.createdAt, + status: input.status, + generatorVersion: registryKey.scriptGeneratorVersion, + registryKey, + code: input.code, + diagnostics: input.diagnostics ?? 
[], + }; +} + +export function playwrightScriptRegistryKey(input: { + sourceUrl: string; + datasetGoalPrompt: string; + datasetSchema: BrowserActionBoxDatasetSchema; +}): PlaywrightScriptRegistryKey { + return { + sourceUrlCanonical: canonicalSourceUrl(input.sourceUrl), + datasetGoalFingerprint: shortHash(input.datasetGoalPrompt), + datasetSchemaFingerprint: shortHash(JSON.stringify(input.datasetSchema)), + promptPolicyVersion: "bigset-populate-v1", + scriptGeneratorVersion: "browser-action-box-v1", + }; +} + +export function populateRuntimeResultFromAgentCompatibleResult(input: { + agentCompatibleResult: Record; + datasetSchema: BrowserActionBoxDatasetSchema; + sourceUrl: string; + trace?: TinyFishRecordedTrace; + replayTrace?: PlaywrightReplayTrace; + diagnosticArtifacts?: NonNullable; +}): PopulateRuntimeResult { + const rows = rowsFromAgentCompatibleResult({ + agentCompatibleResult: input.agentCompatibleResult, + datasetSchema: input.datasetSchema, + fallbackSourceUrl: input.sourceUrl, + }); + const traceSteps = input.trace + ? tinyFishTraceProcessSteps(input.trace) + : input.replayTrace?.steps ?? []; + const processTrace = populateProcessTraceFromSteps({ + runtime: input.trace ? "collection" : "unknown", + steps: traceSteps, + capturedSources: [{ + url: input.sourceUrl, + text: safeJsonStringify(input.agentCompatibleResult).slice(0, 12_000), + source: "synthetic", + }], + selectedRowSource: rows.length > 0 ? "collection_pipeline" : "none", + notes: [ + ...(input.trace?.diagnostics ?? []), + ...(input.replayTrace?.diagnostics ?? []), + ], + }); + return { + rows, + validationIssues: rows.length > 0 ? [] : ["BrowserActionBox returned no rows."], + usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 }, + metrics: { + searchCalls: 0, + fetchCalls: 0, + browserCalls: input.trace || input.replayTrace ? 1 : 0, + agentRuns: input.trace ? 1 : 0, + agentSteps: input.trace?.runSteps.length ?? input.replayTrace?.steps.length ?? 
0, + }, + debug: { + capturedRows: [], + capturedSources: [{ + url: input.sourceUrl, + text: safeJsonStringify(input.agentCompatibleResult).slice(0, 12_000), + source: "synthetic", + }], + selectedRowSource: rows.length > 0 ? "collection_pipeline" : "none", + notes: [ + ...(input.trace?.diagnostics ?? []), + ...(input.replayTrace?.diagnostics ?? []), + ], + processTrace, + diagnosticArtifacts: input.diagnosticArtifacts ?? [], + }, + }; +} + +export function validateReplayAgentCompatibleResult(input: { + agentCompatibleResult: Record | null; + profile: BrowserActionBoxReplayInput["previousSuccessfulOutputProfile"]; +}): { isValid: boolean; issues: string[] } { + if (!input.agentCompatibleResult) { + return { isValid: false, issues: ["Replay returned no Agent-compatible result."] }; + } + const rows = agentCompatibleRows(input.agentCompatibleResult); + const issues: string[] = []; + const minRows = input.profile.rowCountRange?.min ?? 1; + if (rows.length < minRows) { + issues.push(`Replay returned ${rows.length} row(s), below previous minimum ${minRows}.`); + } + if ( + input.profile.rowCountRange?.max !== undefined && + rows.length > input.profile.rowCountRange.max + ) { + issues.push( + `Replay returned ${rows.length} row(s), above previous maximum ${input.profile.rowCountRange.max}.` + ); + } + const missingFields = input.profile.fieldsPreviouslyRetrieved.filter( + (field) => !rows.some((row) => rowHasField(row, field)) + ); + if (missingFields.length > 0) { + issues.push(`Replay missed previously retrieved field(s): ${missingFields.join(", ")}.`); + } + if ( + input.profile.evidenceRequired && + !rows.some((row) => rowHasEvidence(row)) + ) { + issues.push("Replay returned no evidence-backed rows."); + } + return { isValid: issues.length === 0, issues }; +} + +export function classifyReplayFailure(input: { + replayTrace: PlaywrightReplayTrace; + validationIssues: string[]; +}): string { + const text = [ + input.replayTrace.error, + input.replayTrace.currentUrl, + 
input.replayTrace.diagnostics.join("\n"), + input.validationIssues.join("\n"), + ].filter(Boolean).join("\n"); + if (/captcha|verify you are human|bot|blocked/i.test(text)) { + return "blocked/captcha/auth wall"; + } + if (/404|not found|gone|no longer|unavailable/i.test(text)) { + return "source unavailable"; + } + if (input.validationIssues.length > 0) { + return "validation failure"; + } + if (/timeout|selector|locator|click|navigation/i.test(text)) { + return "script failure"; + } + return "script failure"; +} + +function populateRuntimeResultFromReplay(input: { + input: BrowserActionBoxReplayInput; + agentCompatibleResult: Record; + trace: PlaywrightReplayTrace; + status: BrowserActionBoxReplayOutput["replayStatus"]; + repairedScript?: PlaywrightScriptArtifact; +}): PopulateRuntimeResult { + const diagnosticArtifacts: NonNullable = [{ + kind: "playwright-replay-result", + label: "populate-playwright-replay-result", + content: safeJsonStringify({ + replayStatus: input.status, + trace: input.trace, + }), + }]; + if (input.status === "repair_promoted" && input.repairedScript) { + diagnosticArtifacts.push({ + kind: "playwright-repaired-script", + label: "populate-playwright-repaired-script", + content: input.repairedScript.code, + }); + } + return populateRuntimeResultFromAgentCompatibleResult({ + agentCompatibleResult: input.agentCompatibleResult, + datasetSchema: input.input.datasetSchema, + sourceUrl: input.input.sourceUrl, + replayTrace: input.trace, + diagnosticArtifacts, + }); +} + +function rowsFromAgentCompatibleResult(input: { + agentCompatibleResult: Record; + datasetSchema: BrowserActionBoxDatasetSchema; + fallbackSourceUrl: string; +}): PopulateRuntimeRow[] { + const rawRows = agentCompatibleRows(input.agentCompatibleResult); + return rawRows + .map((row) => runtimeRowFromUnknown({ + row, + datasetSchema: input.datasetSchema, + fallbackSourceUrl: input.fallbackSourceUrl, + })) + .filter((row): row is PopulateRuntimeRow => Boolean(row)); +} + +function 
runtimeRowFromUnknown(input: { + row: unknown; + datasetSchema: BrowserActionBoxDatasetSchema; + fallbackSourceUrl: string; +}): PopulateRuntimeRow | undefined { + if (!isRecord(input.row)) { + return undefined; + } + const cells = isRecord(input.row.cells) + ? input.row.cells + : isRecord(input.row.row) + ? input.row.row + : input.row; + const sourceUrls = uniqueHttpUrls([ + ...arrayValue(input.row.sourceUrls).filter(isString), + ...arrayValue(input.row.source_urls).filter(isString), + ...sourceUrlsFromCells(cells), + input.fallbackSourceUrl, + ]); + const evidence = evidenceFromRow({ + row: input.row, + cells, + fallbackSourceUrl: sourceUrls[0] ?? input.fallbackSourceUrl, + }); + const normalizedCells: Record = Object.fromEntries( + input.datasetSchema.columns.map((column) => [ + column.name, + normalizeCellValue(cells[column.name]), + ]) + ); + return { + cells: normalizedCells, + sourceUrls, + evidence, + needsReview: true, + }; +} + +function evidenceFromUnknown( + value: unknown, + fallbackSourceUrl: string +): PopulateRuntimeRow["evidence"] { + return arrayValue(value) + .map((item) => { + if (!isRecord(item)) { + return undefined; + } + const quote = stringValue(item.quote ?? item.text ?? item.evidence); + if (!quote) { + return undefined; + } + return { + columnName: stringValue(item.columnName ?? item.field) ?? "evidence", + sourceUrl: stringValue(item.sourceUrl ?? item.url) ?? fallbackSourceUrl, + quote, + }; + }) + .filter((item): item is PopulateRuntimeRow["evidence"][number] => Boolean(item)); +} + +function evidenceFromRow(input: { + row: Record; + cells: Record; + fallbackSourceUrl: string; +}): PopulateRuntimeRow["evidence"] { + const explicitEvidence = evidenceFromUnknown( + input.row.evidence, + input.fallbackSourceUrl + ); + if (explicitEvidence.length > 0) { + return explicitEvidence; + } + const evidenceQuote = stringValue( + input.cells.evidence_quote ?? + input.cells.evidenceQuote ?? + input.cells.quote + ); + return evidenceQuote + ? 
[{ + columnName: "evidence_quote", + sourceUrl: input.fallbackSourceUrl, + quote: evidenceQuote, + }] + : []; +} + +function browserActionBoxGoal(input: BrowserActionBoxFirstRunInput): string { + return [ + input.datasetGoalPrompt, + "", + "Source URL:", + input.sourceUrl, + "", + "Return JSON with records/rows, source URLs, evidence quotes, and agent_browser_actions when browser actions happen.", + "Columns:", + ...input.datasetSchema.columns.map((column) => + `- ${column.name}${column.description ? `: ${column.description}` : ""}` + ), + ].join("\n"); +} + +function sourceUrlsFromCells(cells: Record): string[] { + return Object.entries(cells) + .filter(([key]) => /(url|link|website|source)/i.test(key)) + .flatMap(([, value]) => typeof value === "string" ? [value] : []); +} + +function uniqueHttpUrls(values: string[]): string[] { + return Array.from(new Set(values.filter((value) => /^https?:\/\//i.test(value)))); +} + +function normalizeCellValue(value: unknown): PopulateCellValue { + if ( + typeof value === "string" || + typeof value === "number" || + typeof value === "boolean" || + value === null || + Array.isArray(value) + ) { + return value; + } + if (isRecord(value)) { + return value; + } + return null; +} + +function rowHasField(row: unknown, field: string): boolean { + if (!isRecord(row)) { + return false; + } + const cells = isRecord(row.cells) ? row.cells : isRecord(row.row) ? row.row : row; + const value = cells[field]; + return value !== undefined && value !== null && value !== ""; +} + +function rowHasEvidence(row: unknown): boolean { + if (!isRecord(row)) { + return false; + } + if (arrayValue(row.evidence).some((item) => + isRecord(item) && Boolean(stringValue(item.quote ?? item.text)) + )) { + return true; + } + const cells = isRecord(row.cells) ? row.cells : isRecord(row.row) ? row.row : row; + return Boolean(stringValue( + cells.evidence_quote ?? + cells.evidenceQuote ?? 
+ cells.quote + )); +} + +function agentCompatibleRows(result: Record): unknown[] { + const direct = arrayValue(result.rows ?? result.records ?? result.result); + if (direct.length > 0) { + return direct; + } + const nested = isRecord(result.result) ? result.result : undefined; + return nested ? arrayValue(nested.rows ?? nested.records) : []; +} + +function canonicalSourceUrl(value: string): string { + try { + const url = new URL(value); + url.hash = ""; + return url.toString().replace(/\/$/, ""); + } catch { + return value.trim(); + } +} + +function shortHash(value: string): string { + return createHash("sha256").update(value).digest("hex").slice(0, 16); +} + +function safeJsonStringify(value: unknown): string { + return JSON.stringify(value, null, 2).slice(0, 20_000); +} + +function arrayValue(value: unknown): unknown[] { + return Array.isArray(value) ? value : []; +} + +function stringValue(value: unknown): string | undefined { + return typeof value === "string" && value.trim() ? value.trim() : undefined; +} + +function isString(value: unknown): value is string { + return typeof value === "string"; +} + +function isRecord(value: unknown): value is Record { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} diff --git a/backend/src/pipeline/populate-convex-writer.ts b/backend/src/pipeline/populate-convex-writer.ts index 78335a0..6c1faf7 100644 --- a/backend/src/pipeline/populate-convex-writer.ts +++ b/backend/src/pipeline/populate-convex-writer.ts @@ -34,6 +34,7 @@ export class ConvexPopulateDatasetRowWriter implements PopulateDatasetRowWriter rows: input.rows.map((row) => ({ data: row.cells, sources: row.sourceUrls, + evidence: row.evidence, })), } ); diff --git a/backend/src/pipeline/populate-dataset-owner-loader.ts b/backend/src/pipeline/populate-dataset-owner-loader.ts new file mode 100644 index 0000000..560424c --- /dev/null +++ b/backend/src/pipeline/populate-dataset-owner-loader.ts @@ -0,0 +1,48 @@ +import { ConvexHttpClient 
} from "convex/browser"; +import { anyApi } from "convex/server"; + +export interface PopulateDatasetOwnerQueryClient { + query(functionReference: unknown, args: unknown): Promise; +} + +export interface PopulateDatasetOwnerRecord { + ownerId: string; +} + +export class ConvexPopulateDatasetOwnerLoader { + constructor( + private readonly input: { + convexClient: PopulateDatasetOwnerQueryClient; + internalApi?: typeof anyApi; + } + ) {} + + async loadDataset(datasetId: string): Promise { + const internalApi = this.input.internalApi ?? anyApi; + const dataset = await this.input.convexClient.query( + internalApi.datasets.getForSystemPopulate, + { id: datasetId } + ); + + if (!dataset || typeof dataset !== "object") { + return null; + } + const ownerId = (dataset as { ownerId?: unknown }).ownerId; + if (typeof ownerId !== "string" || !ownerId) { + throw new Error(`Dataset ${datasetId} is missing ownerId.`); + } + return { ownerId }; + } +} + +export function createConvexPopulateDatasetOwnerLoader(input: { + convexUrl: string; + convexAdminKey: string; +}): ConvexPopulateDatasetOwnerLoader { + const convexClient = new ConvexHttpClient(input.convexUrl); + (convexClient as unknown as { + setAdminAuth(adminKey: string): void; + }).setAdminAuth(input.convexAdminKey); + + return new ConvexPopulateDatasetOwnerLoader({ convexClient }); +} diff --git a/backend/src/pipeline/populate-playwright-candidate-script.ts b/backend/src/pipeline/populate-playwright-candidate-script.ts new file mode 100644 index 0000000..f7ba5b9 --- /dev/null +++ b/backend/src/pipeline/populate-playwright-candidate-script.ts @@ -0,0 +1,225 @@ +import type { + PopulateProcessTrace, + PopulateRuntimeBrowserAction, + PopulateRuntimeResult, + PopulateRuntimeTraceStep, +} from "./populate-runtime.js"; +import { playwrightCandidateReadinessForRun } from "./populate-playwright-readiness.js"; + +const MAX_CANDIDATE_ACTIONS = 100; +const MAX_CANDIDATE_SCRIPT_LENGTH = 19_500; +const CANDIDATE_ACTION_LIMITS = [100, 
/**
 * Builds a candidate Playwright replay script from a run's browser trace.
 *
 * Returns `undefined` when the run is not "ready" per
 * `playwrightCandidateReadinessForRun`, when there is no process trace, when
 * no actionable browser steps exist, or when no action limit in
 * `CANDIDATE_ACTION_LIMITS` yields a script within
 * `MAX_CANDIDATE_SCRIPT_LENGTH` characters.
 *
 * @param input.result - Runtime result whose `debug.processTrace` is replayed.
 * @returns The rendered script source, or `undefined`.
 */
export function playwrightCandidateScriptForRun(input: {
  result: PopulateRuntimeResult;
}): string | undefined {
  const { status } = playwrightCandidateReadinessForRun(input);
  const trace = input.result.debug?.processTrace;
  if (!trace || status !== "ready") {
    return undefined;
  }

  const candidateActions: PlaywrightCandidateAction[] = [];
  const replayableSteps = actionableBrowserSteps(trace).slice(
    0,
    MAX_CANDIDATE_ACTIONS
  );
  for (const step of replayableSteps) {
    // actionableBrowserSteps only keeps steps that carry a browserAction.
    const recorded = step.browserAction!;
    candidateActions.push({
      action: recorded.action,
      label: trimCandidateText(step.label) ?? "browser-action",
      url: trimCandidateText(recorded.url),
      selector: trimCandidateText(recorded.selector),
      targetText: trimCandidateText(recorded.targetText),
      valueDescription: trimCandidateText(recorded.valueDescription),
    });
  }
  if (candidateActions.length === 0) {
    return undefined;
  }

  const sourceUrls = sourceUrlsForTrace(trace);
  // Try progressively smaller action counts until the script fits the cap.
  for (const limit of CANDIDATE_ACTION_LIMITS) {
    const kept = candidateActions.slice(0, limit);
    if (kept.length > 0) {
      const rendered = renderPlaywrightCandidateScript({
        actions: kept,
        sourceUrls,
        omittedActionCount: Math.max(0, candidateActions.length - kept.length),
      });
      if (rendered.length <= MAX_CANDIDATE_SCRIPT_LENGTH) {
        return rendered;
      }
    }
  }
  return undefined;
}
/**
 * Caps a piece of candidate text at 500 characters.
 *
 * @param value - Text to cap; `undefined` passes through unchanged.
 * @returns The text as-is when it is at most 500 characters, otherwise its
 *   first 500 characters followed by " [truncated]".
 */
function trimCandidateText(value: string | undefined): string | undefined {
  const limit = 500;
  if (value === undefined || value.length <= limit) {
    return value;
  }
  return value.slice(0, limit) + " [truncated]";
}
await waitAfterAction(page); + return; + case "extract": + await page.waitForLoadState("domcontentloaded"); + return; + case "screenshot": + notes.push(\`screenshot requested by action: \${action.label}\`); + return; + default: + if (action.url) { + await page.goto(action.url, { waitUntil: "domcontentloaded" }); + } else { + notes.push(\`skipped unknown browser action: \${action.label}\`); + } + } +} + +async function clickTarget(page, action) { + if (action.selector) { + await page.locator(action.selector).first().click(); + return; + } + if (action.targetText) { + await page.getByText(action.targetText, { exact: false }).first().click(); + return; + } + if (action.url) { + await page.goto(action.url, { waitUntil: "domcontentloaded" }); + return; + } + throw new Error(\`click action missing selector, targetText, and url: \${action.label}\`); +} + +async function fillTarget(page, action, context) { + const value = inputValueForAction(action, context); + if (action.selector) { + await page.locator(action.selector).first().fill(value); + return; + } + if (action.targetText) { + await page.getByLabel(action.targetText, { exact: false }).first().fill(value); + return; + } + throw new Error(\`type action missing selector or targetText: \${action.label}\`); +} + +async function selectTarget(page, action, context) { + const value = inputValueForAction(action, context); + if (action.selector) { + await page.locator(action.selector).first().selectOption(value); + return; + } + if (action.targetText) { + await page.getByLabel(action.targetText, { exact: false }).first().selectOption(value); + return; + } + throw new Error(\`select action missing selector or targetText: \${action.label}\`); +} + +function inputValueForAction(action, context) { + const inputs = context.inputs ?? 
/**
 * Decides whether a run carries enough trace data to generate a Playwright
 * candidate script.
 *
 * Four conditions are checked, each adding a reason when it fails: a process
 * trace exists, no "TinyFish Agent disabled" capability diagnostic was
 * reported, at least one actionable browser step exists, and at least one
 * http(s) source URL exists.
 *
 * @param input.result - Runtime result to inspect.
 * @returns Status ("ready" only when no reasons were collected), the reasons,
 *   and the browser-step / source-URL counts.
 */
export function playwrightCandidateReadinessForRun(input: {
  result: PopulateRuntimeResult;
}): PopulatePlaywrightCandidateReadiness {
  const trace = input.result.debug?.processTrace;
  const agentDisabled = hasAgentDisabledCapabilityDiagnostic(input.result);
  const browserStepCount = trace ? actionableBrowserSteps(trace).length : 0;
  const sourceUrlCount = trace ? sourceUrlCountForTrace(trace) : 0;

  // Each entry pairs a failed-condition flag with the reason to report.
  const checks: Array<[boolean, string]> = [
    [!trace, "Process trace is missing."],
    [
      agentDisabled,
      "TinyFish Agent/browser follow-up was required but disabled for this run.",
    ],
    [
      browserStepCount === 0,
      "Trace has no actionable browser steps with URL/selector/target data.",
    ],
    [sourceUrlCount === 0, "Trace has no source URLs to anchor a replay script."],
  ];
  const reasons = checks
    .filter(([failed]) => failed)
    .map(([, reason]) => reason);

  return {
    status: reasons.length === 0 ? "ready" : "not_ready",
    reasons,
    browserStepCount,
    sourceUrlCount,
  };
}

/**
 * Reports whether any validation issue or debug note mentions that the
 * TinyFish Agent capability was disabled.
 */
function hasAgentDisabledCapabilityDiagnostic(
  result: PopulateRuntimeResult
): boolean {
  const marker = /Capability diagnostic: TinyFish Agent disabled/i;
  const mentionsMarker = (diagnostic: string) => marker.test(diagnostic);
  const debugNotes = result.debug?.notes ?? [];
  return (
    result.validationIssues.some(mentionsMarker) ||
    debugNotes.some(mentionsMarker)
  );
}

/**
 * Selects succeeded browser steps whose action has a URL, selector, or
 * target text.
 */
function actionableBrowserSteps(
  processTrace: PopulateProcessTrace
): PopulateRuntimeTraceStep[] {
  return processTrace.steps.filter(
    ({ kind, status, browserAction }) =>
      kind === "browser" &&
      status === "succeeded" &&
      Boolean(
        browserAction &&
          (browserAction.url ||
            browserAction.selector ||
            browserAction.targetText)
      )
  );
}

/**
 * Counts distinct http(s) URLs among the trace's fetched URLs and its
 * succeeded source artifacts.
 */
function sourceUrlCountForTrace(processTrace: PopulateProcessTrace): number {
  const httpUrl = /^https?:\/\//i;
  const distinctUrls = new Set<string>();
  for (const url of processTrace.fetchedUrls) {
    if (httpUrl.test(url)) {
      distinctUrls.add(url);
    }
  }
  for (const artifact of processTrace.sourceArtifacts) {
    if (artifact.status === "succeeded" && httpUrl.test(artifact.url)) {
      distinctUrls.add(artifact.url);
    }
  }
  return distinctUrls.size;
}
/**
 * Launches a local Chromium, runs a generated Playwright script against a new
 * page, and returns agent-compatible rows together with an execution trace.
 *
 * When the script itself returns no rows, rows are extracted from the page's
 * DOM instead. Any error is converted into a "failed" trace rather than being
 * rethrown, and the browser is always closed on the way out.
 *
 * @param input - Replay input plus the script artifact to execute.
 * @param options - Optional executable path, headless flag and launch args.
 * @returns The replay result (rows or `null`) and its trace.
 */
export async function runLocalPlaywrightReplay(
  input: BrowserActionBoxReplayInput & { script: PlaywrightScriptArtifact },
  options: LocalPlaywrightReplayRunnerOptions = {}
): Promise<PlaywrightReplayRunnerResult> {
  const startedAt = new Date().toISOString();
  const steps: PopulateRuntimeTraceStep[] = [];
  let browser: Browser | undefined;
  let page: Page | undefined;

  // Built lazily at each use so a malformed input still fails inside the try.
  const scriptReference = () => ({
    sourceUrl: input.sourceUrl,
    scriptId: input.script.scriptId,
  });

  try {
    const executablePath = options.executablePath ?? findChromiumExecutable();
    const headless = options.headless ?? true;
    browser = await launchChromium({
      executablePath,
      headless,
      launchArgs: options.launchArgs,
    });
    page = await browser.newPage();

    // Per-action timeouts never exceed 30s, even for long overall run caps.
    const actionTimeoutMs = Math.min(input.runCaps.timeoutMs, 30_000);
    page.setDefaultTimeout(actionTimeoutMs);
    page.setDefaultNavigationTimeout(actionTimeoutMs);
    steps.push({
      kind: "browser",
      label: "playwright-launch",
      status: "succeeded",
      input: {
        executablePath: executablePath ?? "playwright-default",
        headless,
      },
    });

    const scriptRun = runScriptModule({
      script: input.script,
      page,
      replayInput: input,
    });
    const recipeResult = await withTimeout(
      scriptRun,
      input.runCaps.timeoutMs,
      "Playwright replay timed out."
    );
    const returnedRows = agentCompatibleRows(recipeResult).length;
    steps.push({
      kind: "browser",
      label: "playwright-script-run",
      status: "succeeded",
      input: scriptReference(),
      output: { returnedRows },
    });

    let agentCompatibleResult = recipeResult;
    if (returnedRows === 0) {
      // The script produced nothing usable; fall back to scraping the page.
      agentCompatibleResult = await extractAgentCompatibleRowsFromPage({
        page,
        replayInput: input,
        recipeResult,
        steps,
      });
    }

    return {
      agentCompatibleResult,
      trace: {
        status: "succeeded",
        startedAt,
        completedAt: new Date().toISOString(),
        currentUrl: safePageUrl(page),
        diagnostics: [],
        steps,
      },
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    steps.push({
      kind: "browser",
      label: "playwright-replay-failure",
      status: "failed",
      input: scriptReference(),
      error: message,
    });
    // Timestamp and URL are captured before the async diagnostics lookup.
    const completedAt = new Date().toISOString();
    const currentUrl = safePageUrl(page);
    const diagnostics = await failureDiagnostics(page, message);
    return {
      agentCompatibleResult: null,
      error: message,
      trace: {
        status: "failed",
        startedAt,
        completedAt,
        currentUrl,
        diagnostics,
        steps,
      },
    };
  } finally {
    await browser?.close().catch(() => undefined);
  }
}
/**
 * Fallback row extraction: scrapes anchor links from the current page, ranks
 * them against the dataset intent, and converts the best ones into rows.
 *
 * If no link survives ranking, a single row is built from a snapshot of the
 * page itself. The outcome is appended to `input.steps` as a
 * "playwright-dom-extract" step.
 *
 * @param input.page - Page the replay script left behind.
 * @param input.replayInput - Replay input (source URL, schema, goal prompt).
 * @param input.recipeResult - Script result; its string `notes` are forwarded.
 * @param input.steps - Trace step list, mutated in place.
 * @returns `{ records, sourceUrls, replayNotes }`.
 */
async function extractAgentCompatibleRowsFromPage(input: {
  page: Page;
  replayInput: BrowserActionBoxReplayInput;
  recipeResult: Record<string, unknown>;
  steps: PopulateRuntimeTraceStep[];
}): Promise<Record<string, unknown>> {
  // This callback runs in the browser context, so it must be self-contained.
  const candidates = await input.page.evaluate(({ sourceUrl }) => {
    const sourceHost = new URL(sourceUrl).hostname.replace(/^www\./, "");
    return Array.from(document.querySelectorAll("a[href]"))
      .flatMap((anchor) => {
        let href: string;
        let host: string;
        try {
          const resolved = new URL(
            anchor.getAttribute("href") ?? "",
            document.baseURI
          );
          href = resolved.href;
          host = resolved.hostname.replace(/^www\./, "");
        } catch {
          // A single malformed href must not abort extraction for the page.
          return [];
        }
        const text = (anchor.textContent ?? "").replace(/\s+/g, " ").trim();
        const ariaLabel = anchor.getAttribute("aria-label") ?? undefined;
        const title = anchor.getAttribute("title") ?? ariaLabel ?? text;
        return [{
          href,
          title: title.trim(),
          text,
          ariaLabel,
          sameHost: host === sourceHost,
        }];
      })
      .filter((candidate) => /^https?:\/\//i.test(candidate.href))
      .filter((candidate) => !/(signin|login|signup|privacy|terms|cookie|contact|mailto:)/i.test(candidate.href))
      .filter((candidate) => candidate.sameHost || candidate.text.length > 12)
      .slice(0, 80);
  }, { sourceUrl: input.replayInput.sourceUrl });

  // Highest score first; ties broken by href so ordering is deterministic.
  const ranked = candidates
    .map((candidate) => ({
      ...candidate,
      score: replayCandidateScore(candidate, input.replayInput),
    }))
    .filter((candidate) => candidate.score > 0)
    .sort((left, right) =>
      right.score - left.score || left.href.localeCompare(right.href)
    );
  const unique = dedupeByHref(ranked).slice(0, targetReplayRowCount(input.replayInput));
  let rows = unique.map((candidate) =>
    rowFromCandidate({
      candidate,
      replayInput: input.replayInput,
    })
  );
  let sourceUrls = unique.map((candidate) => candidate.href);

  if (rows.length === 0) {
    // No usable links: fall back to one row describing the page itself.
    const snapshot = await replayPageSnapshot(input.page);
    const pageRow = rowFromPageSnapshot({
      snapshot,
      replayInput: input.replayInput,
    });
    if (pageRow) {
      rows = [pageRow];
      sourceUrls = [snapshot.url];
    }
  }

  input.steps.push({
    kind: "extract",
    label: "playwright-dom-extract",
    status: rows.length > 0 ? "succeeded" : "failed",
    input: {
      sourceUrl: input.replayInput.sourceUrl,
      candidateCount: candidates.length,
    },
    output: {
      rowCount: rows.length,
    },
    error: rows.length > 0 ? undefined : "Replay DOM extraction found no rows.",
  });
  return {
    records: rows,
    sourceUrls,
    replayNotes: arrayValue(input.recipeResult.notes).filter(isString).slice(0, 20),
  };
}
/**
 * Scores a link candidate for replay extraction; higher is better.
 *
 * Same-host links start at 4 and others at 1. A title of 8+ characters adds
 * 2, link text of 20+ characters adds 1, boilerplate keywords (privacy,
 * login, about, ...) subtract 8, and each intent term found in the combined
 * href/title/text adds 1.
 *
 * @param candidate - Link data, optionally flagged as same-host.
 * @param input - Replay input supplying the intent terms.
 * @returns The numeric score (may be negative).
 */
function replayCandidateScore(
  candidate: LinkCandidate & { sameHost?: boolean },
  input: BrowserActionBoxReplayInput
): number {
  const searchable = [candidate.href, candidate.title, candidate.text]
    .join(" ")
    .toLowerCase();
  const boilerplate =
    /(privacy|terms|login|signin|signup|careers|contact|about|download)/i;

  const base = candidate.sameHost ? 4 : 1;
  const titleBonus = candidate.title.length >= 8 ? 2 : 0;
  const textBonus = candidate.text.length >= 20 ? 1 : 0;
  const boilerplatePenalty = boilerplate.test(searchable) ? 8 : 0;
  const matchedTerms = replayIntentTerms(input).filter((term) =>
    searchable.includes(term)
  ).length;

  return base + titleBonus + textBonus - boilerplatePenalty + matchedTerms;
}

/**
 * Derives up to 20 distinct lowercase keywords (4+ characters, starting with
 * a letter) from the dataset goal prompt followed by the column names,
 * skipping a small list of generic stop words.
 */
function replayIntentTerms(input: BrowserActionBoxReplayInput): string[] {
  const wordPattern = /[a-z][a-z0-9-]{3,}/g;
  const stopWord =
    /^(with|from|this|that|rows?|each|page|source|include|current|public)$/;
  const terms = new Set<string>();

  const collect = (text: string): void => {
    for (const word of text.toLowerCase().match(wordPattern) ?? []) {
      if (!stopWord.test(word)) {
        terms.add(word);
      }
    }
  };

  collect(input.datasetGoalPrompt);
  for (const column of input.datasetSchema.columns) {
    collect(column.name);
  }
  return Array.from(terms).slice(0, 20);
}
/**
 * Removes candidates whose canonicalized `href` was already seen, keeping the
 * first occurrence and the original order.
 *
 * @param candidates - Items carrying an `href`.
 * @returns A new array without canonical-URL duplicates.
 */
function dedupeByHref<T extends { href: string }>(candidates: T[]): T[] {
  const seenHrefs = new Set<string>();
  return candidates.filter((candidate) => {
    const key = canonicalUrl(candidate.href);
    if (seenHrefs.has(key)) {
      return false;
    }
    seenHrefs.add(key);
    return true;
  });
}
/**
 * Collects human-readable diagnostics for a failed replay.
 *
 * Runs inside the replay's catch handler, so it must not throw itself: the
 * page title lookup swallows errors and the URL lookup goes through
 * `safePageUrl`.
 *
 * @param page - Page that was being driven, if one was created.
 * @param message - The failure message; always the first entry.
 * @returns The message, then "Current URL: ..." when the URL is readable,
 *   then "Page title: ..." (first 160 characters) when a title is available.
 */
async function failureDiagnostics(
  page: Page | undefined,
  message: string
): Promise<string[]> {
  if (!page) {
    return [message];
  }
  const title = await page.title().catch(() => "");
  const currentUrl = safePageUrl(page);
  return [
    message,
    ...(currentUrl === undefined ? [] : [`Current URL: ${currentUrl}`]),
    ...(title ? [`Page title: ${title.slice(0, 160)}`] : []),
  ];
}

/** Returns the page's current URL, or `undefined` if there is no page or reading it throws. */
function safePageUrl(page: Page | undefined): string | undefined {
  try {
    return page?.url();
  } catch {
    return undefined;
  }
}
/**
 * Normalizes a URL for equality checks by dropping its fragment and a single
 * trailing slash. Text that does not parse as a URL is returned unchanged.
 */
function canonicalUrl(value: string): string {
  let parsed: URL;
  try {
    parsed = new URL(value);
  } catch {
    return value;
  }
  parsed.hash = "";
  const serialized = parsed.toString();
  return serialized.endsWith("/") ? serialized.slice(0, -1) : serialized;
}

/** Returns the first 8 hex characters of the SHA-256 digest of `value`. */
function shortHash(value: string): string {
  const digest = createHash("sha256").update(value).digest("hex");
  return digest.slice(0, 8);
}

/** Passes arrays through; anything else becomes an empty array. */
function arrayValue(value: unknown): unknown[] {
  if (Array.isArray(value)) {
    return value;
  }
  return [];
}

/** Type guard: true only for primitive strings. */
function isString(value: unknown): value is string {
  return typeof value === "string";
}

/** Type guard: true for non-null objects that are not arrays. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === "object" && !Array.isArray(value);
}
/**
 * Interprets an environment variable as a boolean flag.
 *
 * @param value - Raw variable value; `undefined` means "not set".
 * @param fallback - Result when the variable is not set.
 * @returns `fallback` when unset; otherwise `true` only for "1", "true",
 *   "yes" or "on" (case-insensitive, surrounding whitespace ignored).
 */
function booleanEnv(value: string | undefined, fallback: boolean): boolean {
  if (value === undefined) {
    return fallback;
  }
  switch (value.trim().toLowerCase()) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    default:
      return false;
  }
}

/**
 * Interprets an environment variable as a positive base-10 integer.
 *
 * @param value - Raw variable value.
 * @param fallback - Result when the variable is unset, blank, unparseable,
 *   or not greater than zero.
 * @returns The parsed integer or `fallback`.
 */
function positiveIntEnv(value: string | undefined, fallback: number): number {
  const trimmed = value?.trim();
  if (!trimmed) {
    return fallback;
  }
  const parsed = Number.parseInt(trimmed, 10);
  if (Number.isFinite(parsed) && parsed > 0) {
    return parsed;
  }
  return fallback;
}
+ searchQueries: string[]; + fetchedUrls: string[]; + sourceArtifacts: PopulateProcessTraceSourceArtifact[]; + selectedRowSource: + | "insert_row" + | "structured_recovery" + | "collection_pipeline" + | "none"; + notes: string[]; + steps: PopulateRuntimeTraceStep[]; + artifactRoot?: string; + runReportPath?: string; } export interface PopulateRuntimeDebug { capturedRows: PopulateRuntimeCapturedInsertedRow[]; capturedSources: PopulateRuntimeCapturedSource[]; - selectedRowSource: "insert_row" | "structured_recovery" | "none"; + selectedRowSource: + | "insert_row" + | "structured_recovery" + | "collection_pipeline" + | "none"; notes: string[]; + processTrace: PopulateProcessTrace; + diagnosticArtifacts?: Array<{ + kind: string; + label: string; + content: string; + }>; } export interface PopulateRuntimeResult { @@ -109,7 +191,12 @@ export async function runPopulateRuntime(input: { context: DatasetContext; webTools?: PopulateRuntimeWebTools; agentRunner?: PopulateRuntimeAgentRunner; + browserActionBox?: Pick; maxRows?: number; + sourcePlanner?: { + enabled?: boolean; + fetchLimit?: number; + }; }): Promise { const parsedContext = datasetContextSchema.parse(input.context); const clarificationResult = clarificationResultForContext(parsedContext); @@ -119,8 +206,11 @@ export async function runPopulateRuntime(input: { const capturedRows: PopulateRuntimeCapturedInsertedRow[] = []; const capturedSources: PopulateRuntimeCapturedSource[] = []; + const processTraceSteps: PopulateRuntimeTraceStep[] = []; const validationIssues: string[] = []; const debugNotes: string[] = []; + const diagnosticArtifacts: NonNullable = []; + const browserActionRows: PopulateRuntimeRow[] = []; const metrics = emptyMetrics(); const webTools = input.webTools ?? createTinyFishWebTools(); const tools = createPopulateRuntimeTools({ @@ -131,7 +221,73 @@ export async function runPopulateRuntime(input: { metrics, webTools, maxRows: input.maxRows ?? 
10, + processTraceSteps, }); + if (input.sourcePlanner?.enabled ?? !input.agentRunner) { + await seedCapturedSourcesFromPlannedSearches({ + context: parsedContext, + webTools, + capturedSources, + validationIssues, + metrics, + processTraceSteps, + fetchLimit: input.sourcePlanner?.fetchLimit ?? 6, + }); + } + await runBrowserActionBoxForDeferredSources({ + context: parsedContext, + capturedSources, + browserActionBox: input.browserActionBox, + browserActionRows, + processTraceSteps, + validationIssues, + debugNotes, + diagnosticArtifacts, + metrics, + maxRows: input.maxRows ?? 10, + }); + await seedCapturedSourcesFromContextUrls({ + context: parsedContext, + webTools, + capturedSources, + validationIssues, + metrics, + processTraceSteps, + }); + const explicitUrlRows = deterministicRowsFromCapturedSources({ + context: parsedContext, + capturedSources, + maxRows: input.maxRows ?? 10, + }); + if (urlsFromText(parsedContext.description).length > 0 && explicitUrlRows.length > 0) { + debugNotes.push( + "Explicit URL shortcut built title/URL rows from fetched source snippets." + ); + const processTrace = populateProcessTraceFromSteps({ + runtime: input.agentRunner ? 
"mastra-injected" : "mastra", + steps: processTraceSteps, + capturedSources, + selectedRowSource: "structured_recovery", + notes: debugNotes, + }); + return { + rows: explicitUrlRows, + validationIssues: Array.from(new Set([ + ...validationIssues, + ...validateRuntimeRows(explicitUrlRows), + ])), + usage: emptyUsage(), + metrics, + debug: { + capturedRows, + capturedSources, + selectedRowSource: "structured_recovery", + notes: debugNotes, + processTrace, + diagnosticArtifacts, + }, + }; + } const prompt = buildPopulatePrompt(parsedContext); let agentOutput: unknown; @@ -139,21 +295,74 @@ export async function runPopulateRuntime(input: { try { agentOutput = await input.agentRunner({ prompt, tools }); metrics.agentRuns += 1; + processTraceSteps.push({ + kind: "agent", + label: "populate-agent-injected", + status: "succeeded", + input: { + promptCharacters: prompt.length, + toolNames: Object.keys(tools), + }, + output: { + capturedRowCount: capturedRows.length, + capturedSourceCount: capturedSources.length, + }, + }); } catch (error) { - validationIssues.push(populateAgentFailureMessage(error)); + const message = populateAgentFailureMessage(error); + validationIssues.push(message); + processTraceSteps.push({ + kind: "agent", + label: "populate-agent-injected", + status: "failed", + input: { + promptCharacters: prompt.length, + toolNames: Object.keys(tools), + }, + error: message, + }); } } else { try { const agent = createRuntimePopulateAgent({ tools }); agentOutput = await agent.generate(prompt); metrics.agentRuns += 1; + processTraceSteps.push({ + kind: "agent", + label: "populate-agent-mastra", + status: "succeeded", + input: { + promptCharacters: prompt.length, + toolNames: Object.keys(tools), + }, + output: { + capturedRowCount: capturedRows.length, + capturedSourceCount: capturedSources.length, + }, + }); } catch (error) { - validationIssues.push(populateAgentFailureMessage(error)); + const message = populateAgentFailureMessage(error); + 
validationIssues.push(message); + processTraceSteps.push({ + kind: "agent", + label: "populate-agent-mastra", + status: "failed", + input: { + promptCharacters: prompt.length, + toolNames: Object.keys(tools), + }, + error: message, + }); } } - const insertedRows = capturedRows.map((row) => benchmarkRowFromInsertedData(row.data)); + const insertedRows = capturedRows.map((row) => + benchmarkRowFromInsertedData({ + data: row.data, + capturedSources, + }) + ); const insertedRowIssues = validateRuntimeRows(insertedRows); if ( !input.agentRunner && @@ -173,15 +382,32 @@ export async function runPopulateRuntime(input: { capturedSources, }); metrics.agentRuns += 1; + processTraceSteps.push({ + kind: "extract", + label: "structured-row-recovery", + status: "succeeded", + input: { + capturedSourceCount: capturedSources.length, + }, + }); } catch (error) { - validationIssues.push( - `Structured row generation failed: ${ - error instanceof Error ? error.message : String(error) - }` - ); + const message = `Structured row generation failed: ${ + error instanceof Error ? error.message : String(error) + }`; + validationIssues.push(message); + processTraceSteps.push({ + kind: "extract", + label: "structured-row-recovery", + status: "failed", + input: { + capturedSourceCount: capturedSources.length, + }, + error: message, + }); } } + const validationIssueCountBeforeStructuredRows = validationIssues.length; const structuredRows = benchmarkRowsFromStructuredOutput({ output: structuredOutputFromAgentResult(agentOutput), maxRows: input.maxRows ?? 10, @@ -191,12 +417,44 @@ export async function runPopulateRuntime(input: { validationIssues, debugNotes, }); - const structuredRowIssues = validateRuntimeRows(structuredRows); + const structuredOutputValidationIssues = validationIssues.slice( + validationIssueCountBeforeStructuredRows + ); + const deterministicRows = deterministicRowsFromCapturedSources({ + context: parsedContext, + capturedSources, + maxRows: input.maxRows ?? 
10, + }); + const rawStructuredRowIssues = validateRuntimeRows(structuredRows); + const deterministicRowIssues = validateRuntimeRows(deterministicRows); + const shouldUseDeterministicRows = + deterministicRows.length > 0 && + deterministicRowIssues.length === 0 && + ( + structuredRows.length === 0 || + rawStructuredRowIssues.length > 0 || + structuredOutputValidationIssues.some((issue) => + /approximation|manual review|not present|not accompanied|only .*listing page/i.test(issue) + ) + ); + if (shouldUseDeterministicRows) { + validationIssues.splice(validationIssueCountBeforeStructuredRows); + debugNotes.push( + "Deterministic source fallback built title/URL rows from captured source snippets." + ); + } + const fallbackStructuredRows = shouldUseDeterministicRows + ? deterministicRows + : [ + ...browserActionRows, + ...structuredRows, + ]; + const structuredRowIssues = validateRuntimeRows(fallbackStructuredRows); if ( insertedRows.length > 0 && insertedRowIssues.length === 0 && - structuredRows.length > 0 && - hasContradictingStructuredRows(insertedRows, structuredRows) + fallbackStructuredRows.length > 0 && + hasContradictingStructuredRows(insertedRows, fallbackStructuredRows) ) { validationIssues.push( "Structured populate rows differed from insert_row rows and were ignored." @@ -205,14 +463,21 @@ export async function runPopulateRuntime(input: { const rows = selectBestRuntimeRows({ insertedRows, insertedRowIssues, - structuredRows, + structuredRows: fallbackStructuredRows, structuredRowIssues, debugNotes, }); const selectedRowSource = selectedRowSourceForRows({ rows, insertedRows, - structuredRows, + structuredRows: fallbackStructuredRows, + }); + const processTrace = populateProcessTraceFromSteps({ + runtime: input.agentRunner ? 
"mastra-injected" : "mastra", + steps: processTraceSteps, + capturedSources, + selectedRowSource, + notes: debugNotes, }); validationIssues.push(...validateRuntimeRows(rows)); @@ -226,6 +491,8 @@ export async function runPopulateRuntime(input: { capturedSources, selectedRowSource, notes: debugNotes, + processTrace, + diagnosticArtifacts, }, }; } @@ -244,6 +511,213 @@ function createRuntimePopulateAgent(input: { tools: Record }) { }); } +async function seedCapturedSourcesFromPlannedSearches(input: { + context: DatasetContext; + webTools: PopulateRuntimeWebTools; + capturedSources: PopulateRuntimeCapturedSource[]; + validationIssues: string[]; + metrics: PopulateRuntimeResult["metrics"]; + processTraceSteps: PopulateRuntimeTraceStep[]; + fetchLimit: number; +}): Promise { + if (urlsFromText(userPromptDescription(input.context.description)).length > 0) { + return; + } + + const searchResults: PopulateWebSearchResult[] = []; + for (const query of plannedSourceSearchQueries(input.context)) { + input.metrics.searchCalls += 1; + try { + const results = await input.webTools.search({ query }); + searchResults.push(...results); + input.processTraceSteps.push({ + kind: "search", + label: "source-planner-search", + status: "succeeded", + input: { query }, + output: { + resultCount: results.length, + urls: results.map((result) => result.url).slice(0, 10), + }, + }); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + input.validationIssues.push(`Source planner search failed: ${message}`); + input.processTraceSteps.push({ + kind: "search", + label: "source-planner-search", + status: "failed", + input: { query }, + error: message, + }); + } + } + + const ranked = rankPopulateSearchResults({ + context: input.context, + results: searchResults, + }); + const fetchUrls = buildPopulateFetchPlan({ + rankedResults: ranked, + fetchLimit: input.fetchLimit, + }); + for (const url of fetchUrls) { + input.metrics.fetchCalls += 1; + try { + const page = await input.webTools.fetch({ url }); + input.capturedSources.push({ + url, + text: [page.title, page.text].filter(Boolean).join("\n"), + source: "fetch", + }); + input.processTraceSteps.push({ + kind: "fetch", + label: "source-planner-fetch", + status: "succeeded", + input: { url }, + output: { + title: page.title, + textCharacters: page.text?.length ?? 0, + expectationScore: ranked.find((result) => + result.canonicalUrl === url + )?.expectationScore, + }, + }); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + input.validationIssues.push(`Source planner fetch failed for ${url}: ${message}`); + input.processTraceSteps.push({ + kind: "fetch", + label: "source-planner-fetch", + status: "failed", + input: { url }, + error: message, + }); + } + } +} + +function plannedSourceSearchQueries(context: DatasetContext): string[] { + const searchPhrase = taskSearchPhrase(context); + const entities = entityCandidatesFromDescription( + userPromptDescription(context.description) + ).slice(0, 3); + const queries = entities.length > 0 + ? 
entities.map((entity) => `${entity} ${searchPhrase} official source`) + : [`${searchPhrase} official source`]; + return Array.from(new Set(queries)).slice(0, 4); +} + +async function runBrowserActionBoxForDeferredSources(input: { + context: DatasetContext; + capturedSources: PopulateRuntimeCapturedSource[]; + browserActionBox?: Pick; + browserActionRows: PopulateRuntimeRow[]; + processTraceSteps: PopulateRuntimeTraceStep[]; + validationIssues: string[]; + debugNotes: string[]; + diagnosticArtifacts: NonNullable; + metrics: PopulateRuntimeResult["metrics"]; + maxRows: number; +}): Promise { + const candidates = input.capturedSources + .filter((source) => source.source === "fetch") + .map((source) => ({ + source, + triage: triageFetchedPageForPopulate({ + context: input.context, + url: source.url, + page: { + title: firstUsefulSourceTitle(source.text), + text: source.text, + }, + }), + })); + + for (const candidate of candidates) { + input.processTraceSteps.push({ + kind: "validation", + label: "source-fetch-triage", + status: "succeeded", + input: { + url: candidate.source.url, + }, + output: { + status: candidate.triage.status, + confidence: candidate.triage.confidence, + reason: candidate.triage.reason, + }, + }); + } + + const browserCandidate = candidates.find((candidate) => + candidate.triage.status === "requires_navigation" || + candidate.triage.status === "requires_form_submission" || + candidate.triage.status === "requires_detail_page_followup" + ); + if (!browserCandidate) { + return; + } + + if (!input.browserActionBox) { + input.debugNotes.push( + `BrowserActionBox not configured for ${browserCandidate.source.url}; replay readiness remains not_ready until a real browser-action trace exists.` + ); + return; + } + + try { + const output = await input.browserActionBox.firstRun({ + sourceUrl: browserCandidate.source.url, + datasetGoalPrompt: userPromptDescription(input.context.description), + datasetSchema: 
browserActionBoxDatasetSchemaFromContext(input.context), + runCaps: { + maxAgentSteps: 20, + maxDurationSeconds: 120, + captureHtml: true, + captureScreenshots: true, + }, + }); + input.browserActionRows.push(...output.runtimeResult.rows.slice(0, input.maxRows)); + input.validationIssues.push(...output.runtimeResult.validationIssues); + input.metrics.browserCalls += 1; + input.metrics.agentRuns += 1; + input.metrics.agentSteps += output.trace.runSteps.length; + input.processTraceSteps.push(...(output.runtimeResult.debug?.processTrace.steps ?? [])); + input.debugNotes.push( + `BrowserActionBox first run for ${browserCandidate.source.url}: replay_${output.replayReadiness.status}.` + ); + input.debugNotes.push(...output.diagnostics); + input.diagnosticArtifacts.push(...(output.runtimeResult.debug?.diagnosticArtifacts ?? [])); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + input.validationIssues.push( + `BrowserActionBox first run failed for ${browserCandidate.source.url}: ${message}` + ); + input.processTraceSteps.push({ + kind: "agent", + label: "browser-action-box-first-run", + status: "failed", + input: { + url: browserCandidate.source.url, + }, + error: message, + }); + } +} + +function browserActionBoxDatasetSchemaFromContext( + context: DatasetContext +): BrowserActionBoxDatasetSchema { + return { + columns: context.columns.map((column) => ({ + name: column.name, + description: column.description, + required: column.nullable !== true, + })), + }; +} + function clarificationResultForContext( context: DatasetContext ): PopulateRuntimeResult | undefined { @@ -281,6 +755,15 @@ function emptyClarificationResult(validationIssues: string[]): PopulateRuntimeRe capturedSources: [], selectedRowSource: "none", notes: [], + processTrace: { + runtime: "unknown", + searchQueries: [], + fetchedUrls: [], + sourceArtifacts: [], + selectedRowSource: "none", + notes: [], + steps: [], + }, }, }; } @@ -292,7 +775,9 @@ async function 
enrichCapturedSourcesForStructuredFallback(input: { metrics: PopulateRuntimeResult["metrics"]; webTools: PopulateRuntimeWebTools; }) { - const entities = entityCandidatesFromDescription(input.context.description); + const entities = entityCandidatesFromDescription( + userPromptDescription(input.context.description) + ); const newSources: PopulateRuntimeCapturedSource[] = []; for (const entity of entities.slice(0, 4)) { let results: PopulateWebSearchResult[] = []; @@ -327,6 +812,7 @@ async function enrichCapturedSourcesForStructuredFallback(input: { newSources.push({ url: result.url, text: [result.title, result.snippet].filter(Boolean).join("\n"), + source: "search", }); input.metrics.fetchCalls += 1; try { @@ -334,6 +820,7 @@ async function enrichCapturedSourcesForStructuredFallback(input: { newSources.push({ url: result.url, text: [page.title, page.text].filter(Boolean).join("\n"), + source: "fetch", }); } catch (error) { input.validationIssues.push( @@ -360,6 +847,7 @@ async function captureDirectOfficialSource(input: { input.newSources.push({ url: input.url, text: `${input.entity} official source\n${input.url}`, + source: "synthetic", }); input.input.metrics.fetchCalls += 1; try { @@ -367,6 +855,7 @@ async function captureDirectOfficialSource(input: { input.newSources.push({ url: input.url, text: [page.title, page.text].filter(Boolean).join("\n"), + source: "fetch", }); } catch (error) { input.input.validationIssues.push( @@ -462,16 +951,22 @@ function taskSearchPhrase(context: DatasetContext): string { if (/\b(latest|blog|post|release|date)\b/i.test(taskText)) { return "latest official source title date URL"; } - return truncateForPrompt(context.description, 120); + return truncateForPrompt(userPromptDescription(context.description), 120); } function contextText(context: DatasetContext): string { return [ - context.description, + userPromptDescription(context.description), ...context.columns.map((column) => `${column.name} ${column.description ?? 
""}`), ].join(" "); } +function userPromptDescription(description: string): string { + return description + .split(/\n\s*Durable recipe instructions:\s*/i)[0] + ?.trim() || description.trim(); +} + function uniqueSearchResults(results: PopulateWebSearchResult[]): PopulateWebSearchResult[] { const byUrl = new Map(); for (const result of results) { @@ -565,7 +1060,14 @@ function buildStructuredRowsPrompt(input: { capturedSources: PopulateRuntimeCapturedSource[]; }): string { const columnNames = input.context.columns.map((column) => column.name); - const entities = entityCandidatesFromDescription(input.context.description); + const columnRequirements = input.context.columns.map((column) => ({ + name: column.name, + nullable: column.nullable === true, + description: column.description ?? "", + })); + const entities = entityCandidatesFromDescription( + userPromptDescription(input.context.description) + ); const officialHints = Object.fromEntries( entities.map((entity) => [ entity, @@ -585,8 +1087,8 @@ function buildStructuredRowsPrompt(input: { return `Dataset description: ${input.context.description} -Required columns: -${JSON.stringify(columnNames)} +Columns: +${JSON.stringify(columnRequirements)} Named entities, when present: ${JSON.stringify(entities)} @@ -601,7 +1103,9 @@ Return rows using this exact shape: { "rows": [{ "cells": {}, "sourceUrls": [], "evidence": [{ "columnName": "", "sourceUrl": "", "quote": "" }], "needsReview": true }], "validationIssues": [] } Rules: -- cells must contain exactly the required columns. +- cells must contain exactly the listed columns. +- non-nullable cells must only be filled with facts directly present in the transcript. +- nullable cells may be null when the source transcript does not support a value. - sourceUrls must contain exact URLs from the captured source transcript. - evidence.sourceUrl must exactly match one captured source URL. - evidence.quote must be copied verbatim from that source text. 
@@ -691,6 +1195,121 @@ function selectedRowSourceForRows(input: { return "none"; } +export function populateProcessTraceFromSteps(input: { + runtime: PopulateProcessTrace["runtime"]; + steps: PopulateRuntimeTraceStep[]; + capturedSources?: PopulateRuntimeCapturedSource[]; + selectedRowSource: PopulateProcessTrace["selectedRowSource"]; + notes?: string[]; + artifactRoot?: string; + runReportPath?: string; +}): PopulateProcessTrace { + const searchQueries = input.steps.flatMap((step) => { + const query = step.kind === "search" + ? stringValue(step.input?.query) + : undefined; + return query ? [query] : []; + }); + const fetchedUrls = input.steps.flatMap((step) => { + const url = step.kind === "fetch" + ? stringValue(step.input?.url) + : undefined; + return url ? [url] : []; + }); + const sourceArtifacts: PopulateProcessTraceSourceArtifact[] = [ + ...(input.capturedSources ?? []).map((source) => ({ + url: source.url, + status: "succeeded" as const, + source: capturedSourceArtifactSource(source.source), + label: "captured-source", + })), + ...input.steps + .filter((step) => step.kind === "search" && Array.isArray(step.output?.urls)) + .flatMap((step) => + (step.output?.urls as unknown[]).flatMap((url) => { + const sourceUrl = stringValue(url); + return sourceUrl + ? [{ + url: sourceUrl, + status: step.status, + source: "search" as const, + label: step.label, + error: step.error, + }] + : []; + }) + ), + ...input.steps + .filter((step) => step.kind === "fetch") + .flatMap((step) => { + const sourceUrl = stringValue(step.input?.url); + return sourceUrl + ? [{ + url: sourceUrl, + status: step.status, + source: "fetch" as const, + label: step.label, + error: step.error, + }] + : []; + }), + ...input.steps + .filter((step) => step.kind === "agent") + .flatMap((step) => { + const sourceUrl = stringValue(step.input?.url); + return sourceUrl + ? 
[{ + url: sourceUrl, + status: step.status, + source: "agent" as const, + label: step.label, + error: step.error, + }] + : []; + }), + ]; + + return { + runtime: input.runtime, + searchQueries: Array.from(new Set(searchQueries)), + fetchedUrls: uniqueHttpUrls(fetchedUrls), + sourceArtifacts: dedupeProcessTraceSourceArtifacts(sourceArtifacts), + selectedRowSource: input.selectedRowSource, + notes: input.notes ?? [], + steps: input.steps, + artifactRoot: input.artifactRoot, + runReportPath: input.runReportPath, + }; +} + +function capturedSourceArtifactSource( + source: PopulateRuntimeCapturedSource["source"] +): PopulateProcessTraceSourceArtifact["source"] { + if (source === "search" || source === "fetch") { + return source; + } + return "unknown"; +} + +function dedupeProcessTraceSourceArtifacts( + artifacts: PopulateProcessTraceSourceArtifact[] +): PopulateProcessTraceSourceArtifact[] { + const seen = new Set(); + const uniqueArtifacts: PopulateProcessTraceSourceArtifact[] = []; + for (const artifact of artifacts) { + if (!/^https?:\/\//i.test(artifact.url)) { + continue; + } + const key = `${artifact.url}|${artifact.status}|${artifact.source}|${artifact.label ?? 
""}`; + if (seen.has(key)) { + continue; + } + seen.add(key); + uniqueArtifacts.push(artifact); + } + return uniqueArtifacts; +} + function createPopulateRuntimeTools(input: { datasetId: string; capturedRows: PopulateRuntimeCapturedInsertedRow[]; @@ -699,6 +1318,7 @@ function createPopulateRuntimeTools(input: { metrics: PopulateRuntimeResult["metrics"]; webTools: PopulateRuntimeWebTools; maxRows: number; + processTraceSteps: PopulateRuntimeTraceStep[]; }) { return { insert_row: createTool({ @@ -714,18 +1334,50 @@ function createPopulateRuntimeTools(input: { }), execute: async ({ datasetId, data }) => { if (datasetId !== input.datasetId) { + input.processTraceSteps.push({ + kind: "insert_row", + label: "insert_row", + status: "failed", + input: { + datasetId, + columnNames: Object.keys(data), + }, + error: `datasetId must be ${input.datasetId}.`, + }); return { success: false, error: `datasetId must be ${input.datasetId}.`, }; } if (input.capturedRows.length >= input.maxRows) { + input.processTraceSteps.push({ + kind: "insert_row", + label: "insert_row", + status: "failed", + input: { + datasetId, + columnNames: Object.keys(data), + }, + error: `Row cap reached for this benchmark run (${input.maxRows}).`, + }); return { success: false, error: `Row cap reached for this benchmark run (${input.maxRows}).`, }; } input.capturedRows.push({ datasetId, data }); + input.processTraceSteps.push({ + kind: "insert_row", + label: "insert_row", + status: "succeeded", + input: { + datasetId, + columnNames: Object.keys(data), + }, + output: { + capturedRowCount: input.capturedRows.length, + }, + }); return { success: true }; }, }), @@ -749,12 +1401,30 @@ function createPopulateRuntimeTools(input: { ...results.map((result) => ({ url: result.url, text: [result.title, result.snippet].filter(Boolean).join("\n"), + source: "search" as const, })) ); + input.processTraceSteps.push({ + kind: "search", + label: "search_web", + status: "succeeded", + input: { query }, + output: { + 
resultCount: results.length, + urls: results.map((result) => result.url).slice(0, 10), + }, + }); return { results }; } catch (error) { const message = error instanceof Error ? error.message : String(error); input.validationIssues.push(`search_web failed: ${message}`); + input.processTraceSteps.push({ + kind: "search", + label: "search_web", + status: "failed", + input: { query }, + error: message, + }); return { error: message }; } }, @@ -775,11 +1445,29 @@ function createPopulateRuntimeTools(input: { input.capturedSources.push({ url, text: [page.title, page.text].filter(Boolean).join("\n"), + source: "fetch", + }); + input.processTraceSteps.push({ + kind: "fetch", + label: "fetch_page", + status: "succeeded", + input: { url }, + output: { + title: page.title, + textCharacters: page.text?.length ?? 0, + }, }); return page; } catch (error) { const message = error instanceof Error ? error.message : String(error); input.validationIssues.push(`fetch_page failed: ${message}`); + input.processTraceSteps.push({ + kind: "fetch", + label: "fetch_page", + status: "failed", + input: { url }, + error: message, + }); return { error: message }; } }, @@ -794,6 +1482,57 @@ function createPopulateRuntimeTools(input: { }; } +async function seedCapturedSourcesFromContextUrls(input: { + context: DatasetContext; + webTools: PopulateRuntimeWebTools; + capturedSources: PopulateRuntimeCapturedSource[]; + validationIssues: string[]; + metrics: PopulateRuntimeResult["metrics"]; + processTraceSteps: PopulateRuntimeTraceStep[]; +}): Promise { + const urls = urlsFromText( + userPromptDescription(input.context.description) + ).slice(0, 5); + for (const url of urls) { + input.metrics.fetchCalls += 1; + try { + const page = await input.webTools.fetch({ url }); + input.capturedSources.push({ + url, + text: [page.title, page.text].filter(Boolean).join("\n"), + source: "fetch", + }); + input.processTraceSteps.push({ + kind: "fetch", + label: "context-url-fetch", + status: "succeeded", + input: { 
url }, + output: { + title: page.title, + textCharacters: page.text?.length ?? 0, + }, + }); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + input.validationIssues.push(`context URL fetch failed for ${url}: ${message}`); + input.processTraceSteps.push({ + kind: "fetch", + label: "context-url-fetch", + status: "failed", + input: { url }, + error: message, + }); + } + } +} + +function urlsFromText(value: string): string[] { + return Array.from(new Set( + [...value.matchAll(/https?:\/\/[^\s),]+/gi)] + .map((match) => match[0].replace(/[.,;:]+$/, "")) + )); +} + function createTinyFishWebTools(): PopulateRuntimeWebTools { return { async search({ query }) { @@ -845,15 +1584,19 @@ function createTinyFishWebTools(): PopulateRuntimeWebTools { }; } -function benchmarkRowFromInsertedData( - data: Record -): PopulateRuntimeRow { - const cells = normalizeCells(data); +function benchmarkRowFromInsertedData(input: { + data: Record; + capturedSources: PopulateRuntimeCapturedSource[]; +}): PopulateRuntimeRow { + const cells = normalizeCells(input.data); const sourceUrls = sourceUrlsFromData(cells); + const evidence = evidenceFromData(cells, sourceUrls).filter((item) => + isEvidenceBackedByCapturedSource(item, input.capturedSources) + ); return { cells, sourceUrls, - evidence: evidenceFromData(cells, sourceUrls), + evidence, needsReview: true, }; } @@ -930,6 +1673,172 @@ function benchmarkRowsFromStructuredOutput(input: { return selectRepresentativeRows(rows, input.context); } +function deterministicRowsFromCapturedSources(input: { + context: DatasetContext; + capturedSources: PopulateRuntimeCapturedSource[]; + maxRows: number; +}): PopulateRuntimeRow[] { + const explicitSourceUrls = urlsFromText( + userPromptDescription(input.context.description) + ); + const titleColumn = input.context.columns.find((column) => + /title|name/i.test(column.name) + ); + const urlColumn = input.context.columns.find((column) => + 
/url|link|website/i.test(column.name) + ); + if (!titleColumn || !urlColumn) { + return []; + } + const requiredColumns = input.context.columns.filter( + (column) => column.nullable !== true + ); + const canBuildRequiredColumns = requiredColumns.every((column) => + column.name === titleColumn.name || column.name === urlColumn.name + ); + if (!canBuildRequiredColumns) { + return []; + } + + const seenUrls = new Set(); + return input.capturedSources + .filter((source) => source.url && !seenUrls.has(source.url)) + .map((source) => { + seenUrls.add(source.url); + return source; + }) + .map((source) => ({ + source, + title: firstUsefulSourceTitle(source.text), + score: capturedSourceRelevanceScore(source, input.context), + })) + .filter((candidate) => + candidate.title && + candidate.score > 0 && + sourceMatchesExplicitUrlScope(candidate.source.url, explicitSourceUrls) && + !isListingSource(candidate.source, candidate.title) + ) + .sort((a, b) => b.score - a.score) + .slice(0, input.maxRows) + .map(({ source, title }) => { + const cells = Object.fromEntries( + input.context.columns.map((column) => { + if (column.name === titleColumn.name) { + return [column.name, title]; + } + if (column.name === urlColumn.name) { + return [column.name, source.url]; + } + return [column.name, null]; + }) + ) as Record; + return { + cells, + sourceUrls: [source.url], + evidence: [{ + columnName: titleColumn.name, + sourceUrl: source.url, + quote: title, + }], + needsReview: true, + }; + }); +} + +function sourceMatchesExplicitUrlScope( + sourceUrl: string, + explicitSourceUrls: string[] +): boolean { + if (explicitSourceUrls.length === 0) { + return true; + } + const source = parseHttpUrl(sourceUrl); + if (!source) { + return false; + } + return explicitSourceUrls.some((explicitUrl) => { + const explicit = parseHttpUrl(explicitUrl); + if (!explicit) { + return false; + } + if (normalizedUrlWithoutHash(source) === normalizedUrlWithoutHash(explicit)) { + return true; + } + return 
source.hostname === explicit.hostname; + }); +} + +function parseHttpUrl(value: string): URL | undefined { + try { + const url = new URL(value); + return /^https?:$/i.test(url.protocol) ? url : undefined; + } catch { + return undefined; + } +} + +function normalizedUrlWithoutHash(url: URL): string { + const normalized = new URL(url.toString()); + normalized.hash = ""; + return normalized.toString().replace(/\/$/, ""); +} + +function firstUsefulSourceTitle(text: string): string { + return text + .split("\n") + .map((line) => line.trim()) + .find((line) => + line.length >= 8 && + line.length <= 160 && + !/^https?:\/\//i.test(line) && + !/^source\s+\d+/i.test(line) + ) ?? ""; +} + +function capturedSourceRelevanceScore( + source: PopulateRuntimeCapturedSource, + context: DatasetContext +): number { + const text = `${source.url}\n${source.text}`.toLowerCase(); + const descriptionTokens = userPromptDescription(context.description) + .toLowerCase() + .split(/[^a-z0-9]+/) + .filter((token) => + token.length >= 4 && + !["from", "with", "post", "posts", "title", "titles", "url", "urls", "article", "articles", "find"].includes(token) + ); + let score = 1; + for (const token of new Set(descriptionTokens)) { + if (text.includes(token)) { + score += 1; + } + } + if (/\/index\//i.test(source.url)) { + score += 2; + } + if (/\/news\/product/i.test(source.url)) { + score += 2; + } + if (/openai\.com\/news\/?$|openai\.com\/news\/(product-releases|research|company-announcements)\/?$/i.test(source.url)) { + score -= 3; + } + if (/mcp/i.test(source.url) && !/mcp/i.test(userPromptDescription(context.description))) { + score -= 4; + } + return score; +} + +function isListingSource( + source: PopulateRuntimeCapturedSource, + title: string +): boolean { + return ( + /openai\.com\/news\/?$|openai\.com\/news\/(product-releases|research|company-announcements)\/?$/i.test(source.url) || + /\b(newsroom|recent news)\b/i.test(title) || + /^openai news$/i.test(title) + ); +} + function 
validateStructuredRowColumns( cells: Record, requestedColumns: string[] @@ -1071,7 +1980,9 @@ function selectRepresentativeRows( rows: PopulateRuntimeRow[], context: DatasetContext ): PopulateRuntimeRow[] { - const entities = entityCandidatesFromDescription(context.description); + const entities = entityCandidatesFromDescription( + userPromptDescription(context.description) + ); if (entities.length < 2 || rows.length <= entities.length) { return rows; } @@ -1211,12 +2122,34 @@ function validateRuntimeRows(rows: PopulateRuntimeRow[]): string[] { if (rows.some((row) => row.sourceUrls.length === 0)) { issues.push("One or more Mastra populate rows have no source URL."); } + if (rows.some((row) => row.sourceUrls.some((sourceUrl) => !isHttpUrl(sourceUrl)))) { + issues.push("One or more Mastra populate rows have invalid source URLs."); + } if (rows.some((row) => row.evidence.length === 0)) { issues.push("Mastra populate rows do not include per-row evidence quotes yet."); } + if (rows.some((row) => + row.evidence.some((item) => !item.quote.trim()) + )) { + issues.push("One or more Mastra populate evidence quotes are blank."); + } + if (rows.some((row) => + row.evidence.some((item) => !row.sourceUrls.includes(item.sourceUrl)) + )) { + issues.push("One or more Mastra populate evidence URLs do not match row source URLs."); + } return issues; } +function isHttpUrl(value: string): boolean { + try { + const url = new URL(value); + return url.protocol === "http:" || url.protocol === "https:"; + } catch { + return false; + } +} + function firstPresentColumn(data: Record): string { return Object.keys(data)[0] ?? 
"entity_name"; } diff --git a/backend/src/pipeline/populate-self-healing-command.ts b/backend/src/pipeline/populate-self-healing-command.ts index 3436017..2bbca72 100644 --- a/backend/src/pipeline/populate-self-healing-command.ts +++ b/backend/src/pipeline/populate-self-healing-command.ts @@ -16,6 +16,8 @@ import { type CreatePopulateRecipeRuntimeInput, } from "./populate-runtime-selection.js"; +export const DEFAULT_COMMIT_ROW_LIMIT_PER_HOUR = 1000; + export interface PopulateSelfHealingCliOptions { datasetId?: string; contextPath?: string; @@ -23,6 +25,7 @@ export interface PopulateSelfHealingCliOptions { shouldCommitRows: boolean; recipeStoreDirectory?: string; maxRows?: number; + commitRowLimitPerHour?: number; } export interface PopulateSelfHealingCliDependencies { @@ -90,6 +93,15 @@ export async function runPopulateSelfHealingCli( rowWriter, shouldCommitRows: options.shouldCommitRows, runtime, + commitRowLimit: options.shouldCommitRows + ? { + maxRowsPerWindow: commitRowLimitPerHour({ + optionValue: options.commitRowLimitPerHour, + envValue: input.env.POPULATE_COMMIT_ROW_LIMIT_PER_HOUR, + }), + windowMs: 60 * 60 * 1_000, + } + : undefined, }); writeStdout(JSON.stringify(summaryForResult(result, !options.shouldCommitRows))); @@ -151,6 +163,14 @@ export function parsePopulateSelfHealingCliArgs( } options.maxRows = parsed; index += 1; + } else if (arg === "--commit-row-limit-per-hour") { + const value = argv[index + 1]; + const parsed = Number(value); + if (!Number.isInteger(parsed) || parsed <= 0) { + throw new Error("--commit-row-limit-per-hour requires a positive integer."); + } + options.commitRowLimitPerHour = parsed; + index += 1; } else { throw new Error(`Unknown argument: ${arg}`); } @@ -232,6 +252,7 @@ function summaryForResult( action: result.action, datasetId: result.datasetId, committedRows: result.committedRows, + commitLimit: result.commitLimit, rowCount: diagnosticRun?.rows.length ?? 
0, validationIssues: result.validationIssues, rejectionReasons: result.rejectionReasons, @@ -240,6 +261,25 @@ function summaryForResult( }; } +function commitRowLimitPerHour(input: { + optionValue?: number; + envValue?: string; +}): number { + if (input.optionValue !== undefined) { + return input.optionValue; + } + if (input.envValue === undefined || input.envValue === "") { + return DEFAULT_COMMIT_ROW_LIMIT_PER_HOUR; + } + const parsed = Number(input.envValue); + if (!Number.isInteger(parsed) || parsed <= 0) { + throw new Error( + "POPULATE_COMMIT_ROW_LIMIT_PER_HOUR must be a positive integer." + ); + } + return parsed; +} + async function readProcessStdin(): Promise { let text = ""; for await (const chunk of process.stdin) { diff --git a/backend/src/pipeline/populate-self-healing-runner.ts b/backend/src/pipeline/populate-self-healing-runner.ts index 3e3347d..fdce64e 100644 --- a/backend/src/pipeline/populate-self-healing-runner.ts +++ b/backend/src/pipeline/populate-self-healing-runner.ts @@ -1,4 +1,6 @@ -import { join } from "node:path"; +import { randomUUID } from "node:crypto"; +import { mkdir, readFile, rm, writeFile } from "node:fs/promises"; +import { dirname, join } from "node:path"; import type { DatasetContext } from "./populate.js"; import { @@ -6,6 +8,7 @@ import { FileSystemPopulateRecipeStore, MastraPopulateRecipeRuntime, SelfHealingPopulateRecipeService, + safeRowsForPopulateCommit, type PopulateRecipeAuthor, type PopulateRecipeRunResult, type PopulateRecipeRuntime, @@ -25,6 +28,244 @@ export interface PopulateDatasetWriteResult { insertedRowCount: number; } +export interface PopulateDatasetRowCommitLimit { + maxRowsPerWindow: number; + windowMs: number; + now?: () => Date; + limiter?: PopulateDatasetRowCommitLimiter; +} + +export interface PopulateDatasetRowCommitLimiter { + committedRowCount(input: { + datasetId: string; + since: Date; + now: Date; + }): Promise; + reserveCommit(input: { + datasetId: string; + rowCount: number; + since: Date; + 
now: Date; + maxRowsPerWindow: number; + }): Promise; +} + +export interface PopulateDatasetRowCommitReservation { + decision: PopulateDatasetRowCommitLimitDecision; + confirm(input: { rowCount: number }): Promise; + release(): Promise; +} + +interface PopulateDatasetRowCommitLimitCheck { + datasetId: string; + rowCount: number; + now: Date; + windowStartedAt: Date; + maxRowsPerWindow: number; + committedRowsInWindow: number; +} + +interface FileSystemCommitLedgerEntry { + datasetId: string; + committedAt: string; + rowCount: number; + reservationId?: string; + status?: "reserved" | "committed"; +} + +interface CommitLedgerReservationInput { + entries: FileSystemCommitLedgerEntry[]; + reservationId: string; + datasetId: string; + rowCount: number; + now: Date; + since: Date; + maxRowsPerWindow: number; +} + +interface CommitLedgerReservationState { + entries: FileSystemCommitLedgerEntry[]; + decision: PopulateDatasetRowCommitLimitDecision; + reservation?: FileSystemCommitLedgerEntry; +} + +interface CommitLedgerMutationInput { + reservationId: string; + datasetId: string; + rowCount?: number; +} + +interface CommitLedgerState { + entries: FileSystemCommitLedgerEntry[]; +} + +interface CommitLedgerStore { + mutateDatasetLedger( + datasetId: string, + mutate: (state: CommitLedgerState) => Promise | T + ): Promise; +} + +interface CommitLedgerReservation { + store: CommitLedgerStore; + reservationId: string; + datasetId: string; + decision: PopulateDatasetRowCommitLimitDecision; +} + +function commitLedgerReservation( + input: CommitLedgerReservation +): PopulateDatasetRowCommitReservation { + return { + decision: input.decision, + async confirm(confirmInput) { + await input.store.mutateDatasetLedger(input.datasetId, (state) => { + confirmReservation({ + entries: state.entries, + reservationId: input.reservationId, + datasetId: input.datasetId, + rowCount: confirmInput.rowCount, + }); + }); + }, + async release() { + await 
input.store.mutateDatasetLedger(input.datasetId, (state) => { + releaseReservation({ + entries: state.entries, + reservationId: input.reservationId, + datasetId: input.datasetId, + }); + }); + }, + }; +} + +function deniedCommitReservation( + decision: PopulateDatasetRowCommitLimitDecision +): PopulateDatasetRowCommitReservation { + return { + decision, + async confirm() { + return undefined; + }, + async release() { + return undefined; + }, + }; +} + +function reserveInLedger(input: CommitLedgerReservationInput): CommitLedgerReservationState { + const committedRowsInWindow = entriesInWindow(input.entries, { + datasetId: input.datasetId, + since: input.since, + now: input.now, + }).reduce((total, entry) => total + entry.rowCount, 0); + const decision = commitLimitDecisionFromCheck({ + datasetId: input.datasetId, + rowCount: input.rowCount, + now: input.now, + windowStartedAt: input.since, + maxRowsPerWindow: input.maxRowsPerWindow, + committedRowsInWindow, + }); + + if (!decision.isAllowed) { + return { entries: input.entries, decision }; + } + + const reservation = { + datasetId: input.datasetId, + committedAt: input.now.toISOString(), + rowCount: input.rowCount, + reservationId: input.reservationId, + status: "reserved" as const, + }; + return { + entries: [...input.entries, reservation], + decision, + reservation, + }; +} + +function confirmReservation(input: CommitLedgerMutationInput & { + entries: FileSystemCommitLedgerEntry[]; +}): void { + const entry = matchingReservation(input.entries, input); + if (!entry) { + return; + } + entry.status = "committed"; + if (input.rowCount !== undefined) { + entry.rowCount = input.rowCount; + } +} + +function releaseReservation(input: CommitLedgerMutationInput & { + entries: FileSystemCommitLedgerEntry[]; +}): void { + const index = input.entries.findIndex((entry) => + entry.datasetId === input.datasetId && + entry.reservationId === input.reservationId + ); + if (index >= 0) { + input.entries.splice(index, 1); + } +} + 
+function matchingReservation( + entries: FileSystemCommitLedgerEntry[], + input: CommitLedgerMutationInput +): FileSystemCommitLedgerEntry | undefined { + return entries.find((entry) => + entry.datasetId === input.datasetId && + entry.reservationId === input.reservationId + ); +} + +function commitLimitDecisionFromCheck( + input: PopulateDatasetRowCommitLimitCheck +): PopulateDatasetRowCommitLimitDecision { + const remainingRowsInWindow = Math.max( + 0, + input.maxRowsPerWindow - input.committedRowsInWindow + ); + const isAllowed = input.rowCount <= remainingRowsInWindow; + + return { + isAllowed, + datasetId: input.datasetId, + requestedRowCount: input.rowCount, + maxRowsPerWindow: input.maxRowsPerWindow, + committedRowsInWindow: input.committedRowsInWindow, + remainingRowsInWindow, + windowStartedAt: input.windowStartedAt.toISOString(), + windowEndsAt: input.now.toISOString(), + reason: isAllowed + ? undefined + : `Commit row cap exceeded for ${input.datasetId}: requested ${input.rowCount}, remaining ${remainingRowsInWindow} of ${input.maxRowsPerWindow} rows in the current window.`, + }; +} + +function reservationId(): string { + return randomUUID(); +} + +export interface PopulateDatasetRowCommitLimitDecision { + isAllowed: boolean; + datasetId: string; + requestedRowCount: number; + maxRowsPerWindow: number; + committedRowsInWindow: number; + remainingRowsInWindow: number; + windowStartedAt: string; + windowEndsAt: string; + reason?: string; +} + +export type RunSelfHealingPopulateAction = + | SelfHealingPopulateTickResult["action"] + | "commit_rate_limited"; + export interface RunSelfHealingPopulateInput { context: DatasetContext; store?: PopulateRecipeStore; @@ -33,18 +274,21 @@ export interface RunSelfHealingPopulateInput { rowWriter?: PopulateDatasetRowWriter; shouldCommitRows?: boolean; recipeStoreDirectory?: string; + commitRowLimit?: PopulateDatasetRowCommitLimit; } export interface RunSelfHealingPopulateResult { success: boolean; - action: 
SelfHealingPopulateTickResult["action"]; + action: RunSelfHealingPopulateAction; datasetId: string; selectedRun?: PopulateRecipeRunResult; diagnosticRun?: PopulateRecipeRunResult; committedRows?: PopulateDatasetWriteResult; + commitLimit?: PopulateDatasetRowCommitLimitDecision; + validationState?: PopulateRecipeRunResult["productionValidation"]["state"]; rejectionReasons: string[]; validationIssues: string[]; - tick: SelfHealingPopulateTickResult; + tick?: SelfHealingPopulateTickResult; } export async function runSelfHealingPopulate( @@ -54,6 +298,22 @@ export async function runSelfHealingPopulate( throw new Error("rowWriter is required when shouldCommitRows is true."); } const rowWriter = input.rowWriter; + const commitLimiter = commitLimiterForInput(input); + + if (input.shouldCommitRows && commitLimiter) { + const preflightDecision = await commitLimitDecision({ + context: input.context, + rowCount: 1, + commitRowLimit: input.commitRowLimit!, + limiter: commitLimiter, + }); + if (!preflightDecision.isAllowed && preflightDecision.remainingRowsInWindow <= 0) { + return commitRateLimitedResult({ + datasetId: input.context.datasetId, + decision: preflightDecision, + }); + } + } const store = input.store ?? new FileSystemPopulateRecipeStore( input.recipeStoreDirectory ?? defaultPopulateRecipeStoreDirectory() @@ -67,15 +327,44 @@ export async function runSelfHealingPopulate( datasetId: input.context.datasetId, context: input.context, }); - const selectedRun = successfulRunForTick(tick); + const selectedRun = committableRunForTick(tick); const diagnosticRun = diagnosticRunForTick(tick); + const rowsToCommit = selectedRun + ? 
safeRowsForPopulateCommit({ context: input.context, run: selectedRun }) + : []; let committedRows: PopulateDatasetWriteResult | undefined; + let commitLimit: PopulateDatasetRowCommitLimitDecision | undefined; if (input.shouldCommitRows && selectedRun && rowWriter) { - committedRows = await rowWriter.replaceRows({ - datasetId: input.context.datasetId, - rows: selectedRun.rows, - }); + let reservation: PopulateDatasetRowCommitReservation | undefined; + if (commitLimiter) { + reservation = await reserveCommitRows({ + context: input.context, + rowCount: rowsToCommit.length, + commitRowLimit: input.commitRowLimit!, + limiter: commitLimiter, + }); + commitLimit = reservation.decision; + if (!commitLimit.isAllowed) { + return commitRateLimitedResult({ + datasetId: input.context.datasetId, + decision: commitLimit, + selectedRun, + diagnosticRun, + tick, + }); + } + } + try { + committedRows = await rowWriter.replaceRows({ + datasetId: input.context.datasetId, + rows: rowsToCommit, + }); + } catch (error) { + await reservation?.release(); + throw error; + } + await reservation?.confirm({ rowCount: committedRows.insertedRowCount }); } return { @@ -85,12 +374,21 @@ export async function runSelfHealingPopulate( selectedRun, diagnosticRun, committedRows, + commitLimit, + validationState: selectedRun?.productionValidation.state ?? + diagnosticRun?.productionValidation.state, rejectionReasons: tick.rejectionReasons, validationIssues: validationIssuesForSelfHealingTick(tick), tick, }; } +function committableRunForTick( + tick: SelfHealingPopulateTickResult +): PopulateRecipeRunResult | undefined { + return successfulRunForTick(tick) ?? acceptedPartialRunForTick(tick); +} + export function successfulRunForTick( tick: SelfHealingPopulateTickResult ): PopulateRecipeRunResult | undefined { @@ -112,6 +410,15 @@ export function diagnosticRunForTick( return successfulRunForTick(tick) ?? tick.candidateRun ?? 
tick.activeRun; } +function acceptedPartialRunForTick( + tick: SelfHealingPopulateTickResult +): PopulateRecipeRunResult | undefined { + return [tick.candidateRun, tick.activeRun].find((run) => + run?.productionValidation.state === "accepted_partial" && + run.productionValidation.safeRowCount > 0 + ); +} + export function validationIssuesForSelfHealingTick( tick: SelfHealingPopulateTickResult ): string[] { @@ -126,3 +433,272 @@ export function validationIssuesForSelfHealingTick( function defaultPopulateRecipeStoreDirectory(): string { return join(process.cwd(), ".bigset", "populate-recipes"); } + +function commitLimiterForInput( + input: RunSelfHealingPopulateInput +): PopulateDatasetRowCommitLimiter | undefined { + if (!input.shouldCommitRows || !input.commitRowLimit) { + return undefined; + } + return input.commitRowLimit.limiter ?? new FileSystemPopulateDatasetRowCommitLimiter( + join( + input.recipeStoreDirectory ?? defaultPopulateRecipeStoreDirectory(), + "commit-ledger" + ) + ); +} + +async function commitLimitDecision(input: { + context: DatasetContext; + rowCount: number; + commitRowLimit: PopulateDatasetRowCommitLimit; + limiter: PopulateDatasetRowCommitLimiter; +}): Promise { + const now = input.commitRowLimit.now?.() ?? new Date(); + const windowStartedAt = new Date(now.getTime() - input.commitRowLimit.windowMs); + const committedRowsInWindow = await input.limiter.committedRowCount({ + datasetId: input.context.datasetId, + since: windowStartedAt, + now, + }); + return commitLimitDecisionFromCheck({ + datasetId: input.context.datasetId, + rowCount: input.rowCount, + now, + windowStartedAt, + maxRowsPerWindow: input.commitRowLimit.maxRowsPerWindow, + committedRowsInWindow, + }); +} + +async function reserveCommitRows(input: { + context: DatasetContext; + rowCount: number; + commitRowLimit: PopulateDatasetRowCommitLimit; + limiter: PopulateDatasetRowCommitLimiter; +}): Promise { + const now = input.commitRowLimit.now?.() ?? 
new Date(); + const windowStartedAt = new Date(now.getTime() - input.commitRowLimit.windowMs); + return input.limiter.reserveCommit({ + datasetId: input.context.datasetId, + rowCount: input.rowCount, + since: windowStartedAt, + now, + maxRowsPerWindow: input.commitRowLimit.maxRowsPerWindow, + }); +} + +function commitRateLimitedResult(input: { + datasetId: string; + decision: PopulateDatasetRowCommitLimitDecision; + selectedRun?: PopulateRecipeRunResult; + diagnosticRun?: PopulateRecipeRunResult; + tick?: SelfHealingPopulateTickResult; +}): RunSelfHealingPopulateResult { + const reason = input.decision.reason ?? + `Commit row cap exceeded for ${input.datasetId}.`; + return { + success: false, + action: "commit_rate_limited", + datasetId: input.datasetId, + selectedRun: input.selectedRun, + diagnosticRun: input.diagnosticRun ?? input.selectedRun, + commitLimit: input.decision, + validationState: input.selectedRun?.productionValidation.state ?? + input.diagnosticRun?.productionValidation.state, + rejectionReasons: [reason], + validationIssues: [reason], + tick: input.tick, + }; +} + +export class InMemoryPopulateDatasetRowCommitLimiter +implements PopulateDatasetRowCommitLimiter, CommitLedgerStore { + private readonly entries: FileSystemCommitLedgerEntry[] = []; + + async committedRowCount(input: { + datasetId: string; + since: Date; + now: Date; + }): Promise { + return entriesInWindow(this.entries, input) + .reduce((total, entry) => total + entry.rowCount, 0); + } + + async reserveCommit(input: { + datasetId: string; + rowCount: number; + since: Date; + now: Date; + maxRowsPerWindow: number; + }): Promise { + const id = reservationId(); + const state = reserveInLedger({ + entries: this.entries, + reservationId: id, + datasetId: input.datasetId, + rowCount: input.rowCount, + since: input.since, + now: input.now, + maxRowsPerWindow: input.maxRowsPerWindow, + }); + this.entries.splice(0, this.entries.length, ...state.entries); + return state.reservation + ? 
commitLedgerReservation({ + store: this, + reservationId: id, + datasetId: input.datasetId, + decision: state.decision, + }) + : deniedCommitReservation(state.decision); + } + + async mutateDatasetLedger( + _datasetId: string, + mutate: (state: CommitLedgerState) => Promise | T + ): Promise { + return mutate({ entries: this.entries }); + } +} + +export class FileSystemPopulateDatasetRowCommitLimiter +implements PopulateDatasetRowCommitLimiter, CommitLedgerStore { + constructor(private readonly rootDirectory: string) {} + + async committedRowCount(input: { + datasetId: string; + since: Date; + now: Date; + }): Promise { + return entriesInWindow(await this.readEntries(input.datasetId), input) + .reduce((total, entry) => total + entry.rowCount, 0); + } + + async reserveCommit(input: { + datasetId: string; + rowCount: number; + since: Date; + now: Date; + maxRowsPerWindow: number; + }): Promise { + const id = reservationId(); + const state = await this.mutateDatasetLedger(input.datasetId, (ledger) => { + const reservationState = reserveInLedger({ + entries: ledger.entries, + reservationId: id, + datasetId: input.datasetId, + rowCount: input.rowCount, + since: input.since, + now: input.now, + maxRowsPerWindow: input.maxRowsPerWindow, + }); + ledger.entries.splice(0, ledger.entries.length, ...reservationState.entries); + return reservationState; + }); + return state.reservation + ? 
commitLedgerReservation({ + store: this, + reservationId: id, + datasetId: input.datasetId, + decision: state.decision, + }) + : deniedCommitReservation(state.decision); + } + + async mutateDatasetLedger( + datasetId: string, + mutate: (state: CommitLedgerState) => Promise | T + ): Promise { + const lockPath = await this.acquireLock(datasetId); + try { + const entries = await this.readEntries(datasetId); + const state = { entries }; + const result = await mutate(state); + await this.writeEntries(datasetId, state.entries); + return result; + } finally { + await rm(lockPath, { recursive: true, force: true }); + } + } + + private async acquireLock(datasetId: string): Promise { + await mkdir(this.rootDirectory, { recursive: true }); + const lockPath = this.lockPath(datasetId); + const startedAt = Date.now(); + while (true) { + try { + await mkdir(lockPath); + return lockPath; + } catch (error) { + if (!isNodeError(error) || error.code !== "EEXIST") { + throw error; + } + if (Date.now() - startedAt > 5_000) { + throw new Error(`Timed out waiting for commit ledger lock for ${datasetId}.`); + } + await sleep(25); + } + } + } + + private lockPath(datasetId: string): string { + return join(this.rootDirectory, `${safePathSegment(datasetId)}.lock`); + } + + private async readEntries(datasetId: string): Promise { + try { + const text = await readFile(this.ledgerPath(datasetId), "utf8"); + const parsed = JSON.parse(text) as { entries?: FileSystemCommitLedgerEntry[] }; + return Array.isArray(parsed.entries) ? 
parsed.entries : []; + } catch (error) { + if (isNodeError(error) && error.code === "ENOENT") { + return []; + } + throw error; + } + } + + private async writeEntries( + datasetId: string, + entries: FileSystemCommitLedgerEntry[] + ): Promise { + const path = this.ledgerPath(datasetId); + await mkdir(dirname(path), { recursive: true }); + await writeFile(path, `${JSON.stringify({ entries }, null, 2)}\n`, "utf8"); + } + + private ledgerPath(datasetId: string): string { + return join(this.rootDirectory, `${safePathSegment(datasetId)}.json`); + } +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function entriesInWindow( + entries: FileSystemCommitLedgerEntry[], + input: { + datasetId: string; + since: Date; + now: Date; + } +): FileSystemCommitLedgerEntry[] { + return entries.filter((entry) => { + if (entry.datasetId !== input.datasetId) { + return false; + } + const committedAtMs = Date.parse(entry.committedAt); + return Number.isFinite(committedAtMs) && + committedAtMs >= input.since.getTime() && + committedAtMs <= input.now.getTime(); + }); +} + +function safePathSegment(value: string): string { + return value.replace(/[^a-zA-Z0-9._-]/g, "_"); +} + +function isNodeError(error: unknown): error is NodeJS.ErrnoException { + return error instanceof Error && "code" in error; +} diff --git a/backend/src/pipeline/populate-self-healing.ts b/backend/src/pipeline/populate-self-healing.ts index b5f89e2..a7f1207 100644 --- a/backend/src/pipeline/populate-self-healing.ts +++ b/backend/src/pipeline/populate-self-healing.ts @@ -2,16 +2,28 @@ import { mkdir, readFile, writeFile } from "node:fs/promises"; import { join } from "node:path"; import { + type PopulateProcessTrace, type PopulateRuntimeAgentRunner, type PopulateRuntimeResult, type PopulateRuntimeRow, type PopulateRuntimeWebTools, runPopulateRuntime, } from "./populate-runtime.js"; +import { + createPlaywrightScriptArtifact, + type BrowserActionBox, + type 
BrowserActionBoxDatasetSchema, + type PlaywrightScriptArtifact, +} from "./populate-browser-action-box.js"; import { datasetContextSchema, type DatasetContext, } from "./populate.js"; +import { + playwrightCandidateReadinessForRun, + type PopulatePlaywrightCandidateReadiness, +} from "./populate-playwright-readiness.js"; +import { playwrightCandidateScriptForRun } from "./populate-playwright-candidate-script.js"; export type PopulateRecipeStatus = | "active" @@ -25,9 +37,23 @@ export type PopulateRecipeArtifactKind = | "text" | "stderr" | "source-transcript" - | "captured-rows"; + | "captured-rows" + | "process-trace" + | "playwright-candidate-readiness" + | "playwright-candidate-script" + | "tinyfish-trace" + | "playwright-replay-result" + | "playwright-repair-diagnostic" + | "playwright-repaired-script" + | "validation-result"; const MAX_ARTIFACT_TEXT_LENGTH = 20_000; +const PROCESS_TRACE_ARTIFACT_LIMITS = [ + { maxItems: 100, maxNestedItems: 25, maxStringLength: 500 }, + { maxItems: 50, maxNestedItems: 10, maxStringLength: 240 }, + { maxItems: 25, maxNestedItems: 8, maxStringLength: 120 }, + { maxItems: 10, maxNestedItems: 5, maxStringLength: 80 }, +] as const; export interface PopulateRecipe { recipeId: string; @@ -41,6 +67,7 @@ export interface PopulateRecipe { createdBy: "agent" | "human" | "system"; lastSuccessfulRunAt?: string; lastValidationScore?: number; + playwrightScript?: PlaywrightScriptArtifact; } export interface PopulateRecipeArtifact { @@ -50,19 +77,38 @@ export interface PopulateRecipeArtifact { } export interface PopulateRecipeProductionValidation { + state: PopulateValidationState; isValid: boolean; score: number; rowCount: number; + safeRowCount: number; requestedCellCompletenessRatio: number; sourceUrlCoverageRatio: number; evidenceCoverageRatio: number; expectedEntityCoverageRatio: number; expectedEntities: string[]; missingExpectedEntities: string[]; + coveragePolicy: PopulateValidationCoveragePolicy; + targetSource: string; 
criticalIssues: string[]; warnings: string[]; } +export type PopulateValidationState = + | "accepted_full" + | "accepted_partial" + | "rejected"; + +export type PopulateValidationCoveragePolicy = + | "partial_allowed" + | "full_required"; + +interface PopulateValidationIntent { + expectedEntities: string[]; + coveragePolicy: PopulateValidationCoveragePolicy; + targetSource: string; +} + export interface PopulateRecipeRunResult extends PopulateRuntimeResult { recipeId: string; recipeVersion: number; @@ -103,6 +149,7 @@ export interface StoredPopulateRecipeRunRecord { runStatus: PopulateRecipeRunStatus; completedAt: string; productionValidation: PopulateRecipeProductionValidation; + artifacts: PopulateRecipeArtifact[]; } export interface PopulateRecipeStoreSnapshot { @@ -140,6 +187,7 @@ export class MastraPopulateRecipeRuntime implements PopulateRecipeRuntime { runPopulate?: typeof runPopulateRuntime; webTools?: PopulateRuntimeWebTools; agentRunner?: PopulateRuntimeAgentRunner; + browserActionBox?: Pick; maxRows?: number; } = {} ) {} @@ -156,12 +204,50 @@ export class MastraPopulateRecipeRuntime implements PopulateRecipeRuntime { let failureMessage: string | undefined; try { - result = await runtime({ - context, - webTools: this.input.webTools, - agentRunner: this.input.agentRunner, - maxRows: this.input.maxRows, - }); + if (input.recipe.playwrightScript && this.input.browserActionBox) { + const replayOutput = await this.input.browserActionBox.replay({ + sourceUrl: input.recipe.playwrightScript.sourceUrl, + datasetGoalPrompt: input.context.description, + datasetSchema: browserActionBoxDatasetSchemaFromContext(input.context), + currentPlaywrightScript: input.recipe.playwrightScript, + previousSuccessfulOutputProfile: { + fieldsPreviouslyRetrieved: input.recipe.requestedColumns, + rowCountRange: { min: 1 }, + sourceUrls: [input.recipe.playwrightScript.sourceUrl], + evidenceRequired: true, + }, + runCaps: { + maxReplayAttempts: 1, + maxRepairAttempts: 1, + timeoutMs: 
30_000, + }, + }); + result = replayOutput.runtimeResult ?? emptyPopulateRuntimeResult([ + `BrowserActionBox ${replayOutput.replayStatus}: ${replayOutput.diagnostics.join("; ")}`, + ]); + if (result.debug) { + result.debug.diagnosticArtifacts = [ + ...(result.debug.diagnosticArtifacts ?? []), + { + kind: "playwright-repair-diagnostic", + label: "populate-playwright-repair-diagnostic", + content: JSON.stringify({ + replayStatus: replayOutput.replayStatus, + diagnostics: replayOutput.diagnostics, + trace: replayOutput.trace, + }, null, 2), + }, + ]; + } + } else { + result = await runtime({ + context, + webTools: this.input.webTools, + agentRunner: this.input.agentRunner, + browserActionBox: this.input.browserActionBox, + maxRows: this.input.maxRows, + }); + } } catch (error) { failureMessage = error instanceof Error ? error.message : String(error); result = emptyPopulateRuntimeResult([failureMessage]); @@ -202,7 +288,7 @@ export function populateRecipeRunResultFromRuntimeResult(input: { ...input.result, recipeId: input.recipe.recipeId, recipeVersion: input.recipe.version, - runStatus: productionValidation.isValid ? "succeeded" : "failed", + runStatus: productionValidation.state === "rejected" ? "failed" : "succeeded", startedAt: input.startedAt, completedAt, runtimeMs: Date.now() - input.startedAtMs, @@ -282,6 +368,16 @@ export class SelfHealingPopulateRecipeService { }; } + if (shouldRejectAfterBoundedReplayFailure({ activeRecipe, activeRun })) { + return { + datasetId: input.datasetId, + action: "candidate_rejected", + activeRecipe, + activeRun, + rejectionReasons: replayFailureRejectionReasons(activeRun), + }; + } + const nextVersion = await this.nextVersion(input.datasetId); const candidateRecipe = normalizeCandidateRecipe({ recipe: await this.input.author.repairRecipe({ @@ -454,7 +550,10 @@ export class FileSystemPopulateRecipeStore implements PopulateRecipeStore { return { datasetId, recipes: parsed.recipes ?? [], - runRecords: parsed.runRecords ?? 
[], + runRecords: (parsed.runRecords ?? []).map((record) => ({ + ...record, + artifacts: record.artifacts ?? [], + })), }; } catch (error) { if (isNodeError(error) && error.code === "ENOENT") { @@ -560,13 +659,25 @@ function requestedColumnNames(context: DatasetContext): string[] { return context.columns.map((column) => column.name); } +function requiredColumnNamesForValidation(context: DatasetContext): string[] { + return context.columns + .filter((column) => column.nullable !== true) + .map((column) => column.name); +} + +function columnRequirementLabels(context: DatasetContext): string { + return context.columns + .map((column) => `${column.name}${column.nullable ? " (nullable)" : " (required)"}`) + .join(", "); +} + function initialRuntimeInstructions(context: DatasetContext): string { return [ "Use search_web before fetch_page unless an official source URL is already obvious.", "Prefer official docs, pricing, blog, product, or company pages over third-party summaries.", "Every inserted row must include source_url and evidence_quote cells when those columns exist.", "Every inserted row must include at least one source URL and one evidence quote.", - `Requested columns: ${requestedColumnNames(context).join(", ")}.`, + `Requested columns: ${columnRequirementLabels(context)}.`, ].join("\n"); } @@ -618,13 +729,43 @@ function validatePopulateRuntimeResult(input: { result: PopulateRuntimeResult; context: DatasetContext; }): PopulateRecipeProductionValidation { - const requestedColumns = input.context.columns.map((column) => column.name); - const expectedEntities = expectedEntitiesFromContext(input.context); + const requestedColumns = requestedColumnNames(input.context); + const requiredColumns = requiredColumnNamesForValidation(input.context); + const nullableColumns = requestedColumns.filter( + (columnName) => !requiredColumns.includes(columnName) + ); + const validationIntent = validationIntentFromContext(input.context); + const expectedEntities = 
validationIntent.expectedEntities; const entityCoverage = expectedEntityCoverage({ rows: input.result.rows, expectedEntities, }); const rowCount = input.result.rows.length; + const rowSafety = input.result.rows.map((row, index) => + productionRowSafety({ + row, + rowNumber: index + 1, + requiredColumns, + }) + ); + const rowCriticalIssues = rowSafety.flatMap((safety) => safety.criticalIssues); + const dataCriticalIssues = criticalValidationIssues({ + validationIssues: input.result.validationIssues, + nullableColumns, + requiredColumns, + }); + const coverageIssues = coverageIssuesFromValidationIntent({ + missingExpectedEntities: entityCoverage.missingExpectedEntities, + }); + const safeRowCount = rowSafety.filter((safety) => safety.isSafe).length; + const state = validationStateFromSignals({ + rowCount, + safeRowCount, + rowCriticalIssues, + dataCriticalIssues, + coverageIssues, + coveragePolicy: validationIntent.coveragePolicy, + }); const requestedCellCompletenessRatio = averageRatio( input.result.rows.map((row) => cellCompletenessRatio(row, requestedColumns)) ); @@ -634,12 +775,6 @@ function validatePopulateRuntimeResult(input: { const evidenceCoverageRatio = averageRatio( input.result.rows.map((row) => row.evidence.length > 0 ? 1 : 0) ); - const criticalIssues = criticalIssuesForRows({ - rows: input.result.rows, - requestedColumns, - validationIssues: input.result.validationIssues, - missingExpectedEntities: entityCoverage.missingExpectedEntities, - }); const scoreComponents = [ requestedCellCompletenessRatio, sourceUrlCoverageRatio, @@ -651,58 +786,188 @@ function validatePopulateRuntimeResult(input: { const score = rowCount === 0 ? 
0 : averageRatio(scoreComponents); + const criticalIssues = Array.from(new Set([ + ...rowCriticalIssues, + ...dataCriticalIssues, + ...coverageIssues, + ])); return { - isValid: criticalIssues.length === 0, + state, + isValid: state === "accepted_full", score, rowCount, + safeRowCount, requestedCellCompletenessRatio, sourceUrlCoverageRatio, evidenceCoverageRatio, expectedEntityCoverageRatio: entityCoverage.expectedEntityCoverageRatio, expectedEntities, missingExpectedEntities: entityCoverage.missingExpectedEntities, + coveragePolicy: validationIntent.coveragePolicy, + targetSource: validationIntent.targetSource, criticalIssues, warnings: input.result.validationIssues, }; } -function criticalIssuesForRows(input: { - rows: PopulateRuntimeRow[]; - requestedColumns: string[]; - validationIssues: string[]; +function validationStateFromSignals(input: { + rowCount: number; + safeRowCount: number; + rowCriticalIssues: string[]; + dataCriticalIssues: string[]; + coverageIssues: string[]; + coveragePolicy: PopulateValidationCoveragePolicy; +}): PopulateValidationState { + if (input.rowCount === 0 || input.safeRowCount === 0) { + return "rejected"; + } + if (input.rowCriticalIssues.length > 0 || input.dataCriticalIssues.length > 0) { + return "rejected"; + } + if (input.coverageIssues.length > 0) { + return input.coveragePolicy === "partial_allowed" + ? 
"accepted_partial" + : "rejected"; + } + return "accepted_full"; +} + +function coverageIssuesFromValidationIntent(input: { missingExpectedEntities: string[]; }): string[] { - const issues: string[] = []; - if (input.rows.length === 0) { - issues.push("Populate runtime returned no rows."); + if (input.missingExpectedEntities.length === 0) { + return []; + } + return [ + `Missing expected entities: ${input.missingExpectedEntities.join(", ")}.`, + ]; +} + +function productionRowSafety(input: { + row: PopulateRuntimeRow; + rowNumber: number; + requiredColumns: string[]; +}): { + isSafe: boolean; + criticalIssues: string[]; +} { + const criticalIssues: string[] = []; + const missingColumns = input.requiredColumns.filter( + (columnName) => isMissingCellValue(input.row.cells[columnName]) + ); + if (missingColumns.length > 0) { + criticalIssues.push( + `Row ${input.rowNumber} missing requested columns: ${missingColumns.join(", ")}.` + ); + } + if (input.row.sourceUrls.length === 0) { + criticalIssues.push(`Row ${input.rowNumber} has no source URL.`); + } + if (input.row.sourceUrls.some((sourceUrl) => !isHttpUrl(sourceUrl))) { + criticalIssues.push(`Row ${input.rowNumber} has an invalid source URL.`); } - if (input.missingExpectedEntities.length > 0) { - issues.push( - `Missing expected entities: ${input.missingExpectedEntities.join(", ")}.` + if (input.row.evidence.length === 0) { + criticalIssues.push(`Row ${input.rowNumber} has no evidence quote.`); + } + if (input.row.evidence.some((item) => !item.quote.trim())) { + criticalIssues.push(`Row ${input.rowNumber} has a blank evidence quote.`); + } + if ( + input.row.evidence.some((item) => !input.row.sourceUrls.includes(item.sourceUrl)) + ) { + criticalIssues.push( + `Row ${input.rowNumber} has evidence that does not match a row source URL.` ); } - input.rows.forEach((row, index) => { - const missingColumns = input.requestedColumns.filter( - (columnName) => isMissingCellValue(row.cells[columnName]) + return { + isSafe: 
criticalIssues.length === 0, + criticalIssues, + }; +} + +function isHttpUrl(value: string): boolean { + try { + const url = new URL(value); + return url.protocol === "http:" || url.protocol === "https:"; + } catch { + return false; + } +} + +function criticalValidationIssues(input: { + validationIssues: string[]; + nullableColumns: string[]; + requiredColumns: string[]; +}): string[] { + return input.validationIssues.filter((issue) => + /failed|missing|no rows|not found|invented|invalid|approximation|manual review|not present|could not be determined|left blank|unavailable/i.test(issue) && + !isNullableColumnOnlyWarning(issue, { + nullableColumns: input.nullableColumns, + requiredColumns: input.requiredColumns, + }) && + !isNonBlockingOperationalWarning(issue) + ); +} + +export function safeRowsForPopulateCommit(input: { + context: DatasetContext; + run: PopulateRecipeRunResult; +}): PopulateRuntimeRow[] { + const context = datasetContextSchema.parse(input.context); + const requiredColumns = requiredColumnNamesForValidation(context); + return input.run.rows.filter((row, index) => + productionRowSafety({ + row, + rowNumber: index + 1, + requiredColumns, + }).isSafe + ); +} + +function isNullableColumnOnlyWarning( + issue: string, + input: { + nullableColumns: string[]; + requiredColumns: string[]; + } +): boolean { + if (!/not present|could not be determined|left blank|unavailable/i.test(issue)) { + return false; + } + const mentionsNullableColumn = input.nullableColumns.some((columnName) => + issueMentionsColumn(issue, columnName) + ); + if (!mentionsNullableColumn) { + return false; + } + return !input.requiredColumns.some((columnName) => + issueMentionsColumn(issue, columnName) + ); +} + +function issueMentionsColumn(issue: string, columnName: string): boolean { + const normalizedIssue = normalizeColumnMention(issue); + const normalizedColumnName = normalizeColumnMention(columnName); + const meaningfulTokens = columnName + .split(/[^a-z0-9]+/i) + .map((part) => 
part.trim().toLowerCase()) + .filter((part) => part.length >= 3) + .filter((part) => + !["source", "url", "link", "evidence", "quote", "field", "value"].includes(part) ); - if (missingColumns.length > 0) { - issues.push(`Row ${index + 1} missing requested columns: ${missingColumns.join(", ")}.`); - } - if (row.sourceUrls.length === 0) { - issues.push(`Row ${index + 1} has no source URL.`); - } - if (row.evidence.length === 0) { - issues.push(`Row ${index + 1} has no evidence quote.`); - } - }); - input.validationIssues - .filter((issue) => - /failed|missing|no rows|not found|invented|invalid/i.test(issue) && - !isNonBlockingOperationalWarning(issue) - ) - .forEach((issue) => issues.push(issue)); - return Array.from(new Set(issues)); + if (meaningfulTokens.length === 0) { + return normalizedIssue.includes(normalizedColumnName); + } + return meaningfulTokens.some((part) => + normalizedIssue.includes(part.replace(/s$/, "")) + ); +} + +function normalizeColumnMention(value: string): string { + return value + .toLowerCase() + .replace(/[^a-z0-9]+/g, " "); } function cellCompletenessRatio( @@ -718,15 +983,22 @@ function cellCompletenessRatio( return filledCount / requestedColumns.length; } +function validationIntentFromContext(context: DatasetContext): PopulateValidationIntent { + return { + expectedEntities: expectedEntitiesFromContext(context), + coveragePolicy: coveragePolicyFromContext(context), + targetSource: targetSourceFromContext(context), + }; +} + function expectedEntitiesFromContext(context: DatasetContext): string[] { - const fromSegment = context.description.match(/\bfrom\s+([^?.]+)/i)?.[1]; - if (!fromSegment) { - return []; - } - const entities = fromSegment - .split(/,|\band\b/i) - .map((entity) => entity.replace(/\b(the|a|an)\b/gi, " ").trim()) - .map((entity) => entity.replace(/\s+/g, " ")) + const description = context.description.replace(/\s+/g, " ").trim(); + const entitySegments = [ + ...entitySegmentsAfterConnectors(description), + 
capitalizedListSegment(description), + ].filter((segment): segment is string => Boolean(segment)); + const entities = entitySegments + .flatMap((segment) => entitiesFromSegment(segment)) .filter((entity) => entity.length >= 2 && entity.length <= 60 && @@ -735,6 +1007,60 @@ function expectedEntitiesFromContext(context: DatasetContext): string[] { return entities.length >= 2 ? Array.from(new Set(entities)) : []; } +function entitySegmentsAfterConnectors(description: string): string[] { + return Array.from( + description.matchAll( + /\b(?:from|for|across|among|between)\s+([^?.]+)/gi + ) + ).map((match) => stopEntitySegment(match[1] ?? "")); +} + +function capitalizedListSegment(description: string): string | undefined { + const commaList = description.match( + /\b([A-Z][A-Za-z0-9.&-]*(?:\s+[A-Z][A-Za-z0-9.&-]*)?(?:\s*,\s*[A-Z][A-Za-z0-9.&-]*(?:\s+[A-Z][A-Za-z0-9.&-]*)?)+(?:,?\s+and\s+[A-Z][A-Za-z0-9.&-]*(?:\s+[A-Z][A-Za-z0-9.&-]*)?)?)\b/ + )?.[1]; + return commaList ? stopEntitySegment(commaList) : undefined; +} + +function stopEntitySegment(segment: string): string { + return segment + .split(/\b(?:include|includes|including|with|where|that|whose|prefer|using|one row|one record|as|by)\b/i)[0] + ?.trim() ?? 
""; +} + +function entitiesFromSegment(segment: string): string[] { + return segment + .split(/,|\band\b/i) + .map((entity) => entity.replace(/\b(the|a|an)\b/gi, " ").trim()) + .map((entity) => entity.replace(/\s+/g, " ")) + .map((entity) => entity.replace(/[.:;]+$/g, "").trim()) + .filter((entity) => !isValidationIntentStopword(entity)); +} + +function isValidationIntentStopword(value: string): boolean { + return /^(create|dataset|current|public|api|model|pricing|latest|posts?|articles?|companies|sources?|official|pages?)$/i.test(value); +} + +function coveragePolicyFromContext( + context: DatasetContext +): PopulateValidationCoveragePolicy { + return /\b(no partial|full coverage|required coverage|must include all|every expected|all expected)\b/i.test( + context.description + ) + ? "full_required" + : "partial_allowed"; +} + +function targetSourceFromContext(context: DatasetContext): string { + if (/\bofficial\b/i.test(context.description)) { + return "official public pages"; + } + if (/\b(public|website|docs|blog|news|pricing)\b/i.test(context.description)) { + return "public web sources"; + } + return "source-backed public data"; +} + function expectedEntityCoverage(input: { rows: PopulateRuntimeRow[]; expectedEntities: string[]; @@ -776,7 +1102,7 @@ function rowIdentityText(row: PopulateRuntimeRow): string { } function isNonBlockingOperationalWarning(issue: string): boolean { - return /^Structured fallback (search|fetch) failed/i.test(issue); + return /^(Structured fallback (search|fetch) failed|search_web failed|fetch_page failed|context URL fetch failed)/i.test(issue); } function isMissingCellValue(value: unknown): boolean { @@ -826,8 +1152,34 @@ function artifactsForRun(input: { content: debugNotes.join("\n").slice(0, MAX_ARTIFACT_TEXT_LENGTH), }); } + artifacts.push({ + kind: "validation-result", + label: "populate-validation-result", + content: JSON.stringify(input.productionValidation, null, 2) + .slice(0, MAX_ARTIFACT_TEXT_LENGTH), + }); + for (const 
artifact of input.result.debug?.diagnosticArtifacts ?? []) { + const kind = runtimeDiagnosticArtifactKind(artifact.kind); + if (!kind) { + continue; + } + artifacts.push({ + kind, + label: artifact.label, + content: artifact.content.slice(0, MAX_ARTIFACT_TEXT_LENGTH), + }); + } const capturedSources = input.result.debug?.capturedSources ?? []; const capturedRows = input.result.debug?.capturedRows ?? []; + const processTrace = input.result.debug?.processTrace ?? { + runtime: "unknown", + searchQueries: [], + fetchedUrls: [], + sourceArtifacts: [], + selectedRowSource: "none", + notes: [], + steps: [], + }; if (capturedSources.length > 0) { artifacts.push({ kind: "source-transcript", @@ -851,9 +1203,172 @@ function artifactsForRun(input: { .slice(0, MAX_ARTIFACT_TEXT_LENGTH), }); } + if ( + processTrace.steps.length > 0 || + processTrace.searchQueries.length > 0 || + processTrace.fetchedUrls.length > 0 || + processTrace.sourceArtifacts.length > 0 + ) { + const playwrightCandidateScript = playwrightCandidateScriptForRun({ + result: input.result, + }); + artifacts.push({ + kind: "process-trace", + label: "populate-process-trace", + content: processTraceArtifactContent(processTrace), + }); + artifacts.push({ + kind: "playwright-candidate-readiness", + label: "populate-playwright-candidate-readiness", + content: playwrightCandidateReadinessArtifactContent( + playwrightCandidateReadinessForRun({ result: input.result }) + ), + }); + if ( + playwrightCandidateScript && + playwrightCandidateScript.length <= MAX_ARTIFACT_TEXT_LENGTH + ) { + artifacts.push({ + kind: "playwright-candidate-script", + label: "populate-playwright-candidate-script", + content: playwrightCandidateScript, + }); + } + } return artifacts; } +function runtimeDiagnosticArtifactKind( + value: string +): PopulateRecipeArtifactKind | undefined { + if ( + value === "tinyfish-trace" || + value === "playwright-replay-result" || + value === "playwright-repair-diagnostic" || + value === 
"playwright-repaired-script" + ) { + return value; + } + return undefined; +} + +function playwrightCandidateReadinessArtifactContent( + readiness: PopulatePlaywrightCandidateReadiness +): string { + return JSON.stringify(readiness, null, 2) + .slice(0, MAX_ARTIFACT_TEXT_LENGTH); +} + +function processTraceArtifactContent(processTrace: PopulateProcessTrace): string { + let content = ""; + for (const limits of PROCESS_TRACE_ARTIFACT_LIMITS) { + content = JSON.stringify(truncatedProcessTrace(processTrace, limits), null, 2); + if (content.length <= MAX_ARTIFACT_TEXT_LENGTH) { + return content; + } + } + return content; +} + +function truncatedProcessTrace( + processTrace: PopulateProcessTrace, + limits: typeof PROCESS_TRACE_ARTIFACT_LIMITS[number] +) { + return { + ...processTrace, + truncated: hasProcessTraceOverflow(processTrace, limits), + searchQueries: processTrace.searchQueries + .slice(0, limits.maxItems) + .map((query) => truncateArtifactString(query, limits)), + fetchedUrls: processTrace.fetchedUrls + .slice(0, limits.maxItems) + .map((url) => truncateArtifactString(url, limits)), + sourceArtifacts: processTrace.sourceArtifacts.slice(0, limits.maxItems).map((artifact) => ({ + ...artifact, + url: truncateArtifactString(artifact.url, limits), + label: artifact.label + ? truncateArtifactString(artifact.label, limits) + : artifact.label, + error: artifact.error + ? truncateArtifactString(artifact.error, limits) + : artifact.error, + })), + notes: processTrace.notes + .slice(0, limits.maxItems) + .map((note) => truncateArtifactString(note, limits)), + steps: processTrace.steps.slice(0, limits.maxItems).map((step) => ({ + ...step, + label: truncateArtifactString(step.label, limits), + input: truncateArtifactJson(step.input, limits), + output: truncateArtifactJson(step.output, limits), + error: step.error ? 
truncateArtifactString(step.error, limits) : step.error, + })), + }; +} + +function hasProcessTraceOverflow( + processTrace: PopulateProcessTrace, + limits: typeof PROCESS_TRACE_ARTIFACT_LIMITS[number] +): boolean { + return ( + processTrace.searchQueries.length > limits.maxItems || + processTrace.fetchedUrls.length > limits.maxItems || + processTrace.sourceArtifacts.length > limits.maxItems || + processTrace.notes.length > limits.maxItems || + processTrace.steps.length > limits.maxItems || + processTrace.searchQueries.some((query) => query.length > limits.maxStringLength) || + processTrace.fetchedUrls.some((url) => url.length > limits.maxStringLength) || + processTrace.notes.some((note) => note.length > limits.maxStringLength) || + processTrace.sourceArtifacts.some((artifact) => + [ + artifact.url, + artifact.label ?? "", + artifact.error ?? "", + ].some((value) => value.length > limits.maxStringLength) + ) || + processTrace.steps.some((step) => + [ + step.label, + step.error ?? "", + ].some((value) => value.length > limits.maxStringLength) + ) + ); +} + +function truncateArtifactJson( + value: unknown, + limits: typeof PROCESS_TRACE_ARTIFACT_LIMITS[number] +): unknown { + if (typeof value === "string") { + return truncateArtifactString(value, limits); + } + if (Array.isArray(value)) { + return value + .slice(0, limits.maxNestedItems) + .map((nestedValue) => truncateArtifactJson(nestedValue, limits)); + } + if (value && typeof value === "object") { + return Object.fromEntries( + Object.entries(value as Record) + .slice(0, limits.maxNestedItems) + .map(([key, nestedValue]) => [ + key, + truncateArtifactJson(nestedValue, limits), + ]) + ); + } + return value; +} + +function truncateArtifactString( + value: string, + limits: typeof PROCESS_TRACE_ARTIFACT_LIMITS[number] +): string { + return value.length > limits.maxStringLength + ? 
`${value.slice(0, limits.maxStringLength)}\n[truncated]` + : value; +} + export function emptyPopulateRuntimeResult(validationIssues: string[]): PopulateRuntimeResult { return { rows: [], @@ -875,6 +1390,15 @@ export function emptyPopulateRuntimeResult(validationIssues: string[]): Populate capturedSources: [], selectedRowSource: "none", notes: [], + processTrace: { + runtime: "unknown", + searchQueries: [], + fetchedUrls: [], + sourceArtifacts: [], + selectedRowSource: "none", + notes: [], + steps: [], + }, }, }; } @@ -884,6 +1408,28 @@ function isHealthyRun(runResult: PopulateRecipeRunResult): boolean { runResult.productionValidation.isValid; } +function shouldRejectAfterBoundedReplayFailure(input: { + activeRecipe: PopulateRecipe; + activeRun: PopulateRecipeRunResult; +}): boolean { + return Boolean(input.activeRecipe.playwrightScript) && + input.activeRun.artifacts.some((artifact) => + artifact.kind === "playwright-repair-diagnostic" || + artifact.kind === "playwright-replay-result" + ); +} + +function replayFailureRejectionReasons( + activeRun: PopulateRecipeRunResult +): string[] { + return Array.from(new Set([ + ...activeRun.productionValidation.criticalIssues, + ...activeRun.validationIssues, + ...activeRun.productionValidation.warnings, + "Promoted Playwright replay/repair failed; keeping prior active script.", + ])).filter(Boolean); +} + function shouldPromoteCandidate(input: { activeRecipe: PopulateRecipe; activeRun: PopulateRecipeRunResult; @@ -915,6 +1461,18 @@ function rejectionReasonsForCandidate(input: { return Array.from(new Set(reasons)); } +function browserActionBoxDatasetSchemaFromContext( + context: DatasetContext +): BrowserActionBoxDatasetSchema { + return { + columns: context.columns.map((column) => ({ + name: column.name, + description: column.description, + required: column.nullable !== true, + })), + }; +} + function successfulRecipe( recipe: PopulateRecipe, runResult: PopulateRecipeRunResult @@ -924,9 +1482,51 @@ function 
successfulRecipe( status: "active", lastSuccessfulRunAt: runResult.completedAt, lastValidationScore: runResult.productionValidation.score, + playwrightScript: promotedPlaywrightScriptFromRunResult(recipe, runResult), }; } +function promotedPlaywrightScriptFromRunResult( + recipe: PopulateRecipe, + runResult: PopulateRecipeRunResult +): PlaywrightScriptArtifact | undefined { + const scriptArtifact = runResult.artifacts.find((artifact) => + artifact.kind === "playwright-repaired-script" + ) ?? runResult.artifacts.find((artifact) => + artifact.kind === "playwright-candidate-script" + ); + if (!scriptArtifact?.content.trim()) { + return recipe.playwrightScript; + } + const sourceUrl = firstRunSourceUrl(runResult); + if (!sourceUrl) { + return recipe.playwrightScript; + } + return createPlaywrightScriptArtifact({ + sourceUrl, + datasetGoalPrompt: recipe.sourceDescription, + datasetSchema: { + columns: recipe.requestedColumns.map((name) => ({ + name, + required: true, + })), + }, + code: scriptArtifact.content, + status: "promoted", + createdAt: runResult.completedAt, + diagnostics: [], + }); +} + +function firstRunSourceUrl( + runResult: PopulateRecipeRunResult +): string | undefined { + return runResult.rows.flatMap((row) => row.sourceUrls)[0] ?? 
+ runResult.debug?.processTrace.sourceArtifacts.find((artifact) => + artifact.status === "succeeded" + )?.url; +} + function runRecordFromRunResult( runResult: PopulateRecipeRunResult ): StoredPopulateRecipeRunRecord { @@ -936,6 +1536,7 @@ function runRecordFromRunResult( runStatus: runResult.runStatus, completedAt: runResult.completedAt, productionValidation: runResult.productionValidation, + artifacts: runResult.artifacts, }; } diff --git a/backend/src/pipeline/populate-source-planner.ts b/backend/src/pipeline/populate-source-planner.ts new file mode 100644 index 0000000..db0d310 --- /dev/null +++ b/backend/src/pipeline/populate-source-planner.ts @@ -0,0 +1,311 @@ +import type { DatasetContext } from "./populate.js"; +import type { + PopulateFetchedPage, + PopulateRuntimeRow, + PopulateWebSearchResult, +} from "./populate-runtime.js"; + +export type PopulateSourceTriageStatus = + | "extract_now" + | "requires_navigation" + | "requires_form_submission" + | "requires_detail_page_followup" + | "blocked" + | "irrelevant" + | "low_value"; + +export interface PopulateRankedSearchResult extends PopulateWebSearchResult { + canonicalUrl: string; + expectationScore: number; + lowTrustReason?: string; +} + +export interface PopulateSourceTriageResult { + status: PopulateSourceTriageStatus; + confidence: number; + reason: string; + suggestedAction?: string; +} + +const LOW_TRUST_HOST_PATTERNS = [ + /(^|\.)reddit\.com$/i, + /(^|\.)quora\.com$/i, + /(^|\.)medium\.com$/i, + /(^|\.)linkedin\.com$/i, + /(^|\.)x\.com$/i, + /(^|\.)twitter\.com$/i, + /(^|\.)facebook\.com$/i, + /(^|\.)instagram\.com$/i, +]; + +const SOURCE_PLANNER_FETCH_LIMIT_DEFAULT = 8; + +export function canonicalPopulateSourceUrl(url: string): string { + try { + const parsed = new URL(url.trim()); + parsed.hash = ""; + if (parsed.pathname !== "/") { + parsed.pathname = parsed.pathname.replace(/\/+$/, ""); + } + parsed.searchParams.sort(); + return parsed.toString(); + } catch { + return url.trim(); + } +} + 
+export function rankPopulateSearchResults(input: { + context: DatasetContext; + results: PopulateWebSearchResult[]; +}): PopulateRankedSearchResult[] { + const byCanonicalUrl = new Map(); + for (const result of input.results) { + const canonicalUrl = canonicalPopulateSourceUrl(result.url); + const ranked = { + ...result, + canonicalUrl, + ...scorePopulateSearchResult({ + context: input.context, + result: { ...result, url: canonicalUrl }, + }), + }; + const existing = byCanonicalUrl.get(canonicalUrl); + if (!existing || ranked.expectationScore > existing.expectationScore) { + byCanonicalUrl.set(canonicalUrl, ranked); + } + } + + return [...byCanonicalUrl.values()].sort((left, right) => { + if (right.expectationScore !== left.expectationScore) { + return right.expectationScore - left.expectationScore; + } + return left.canonicalUrl.localeCompare(right.canonicalUrl); + }); +} + +export function buildPopulateFetchPlan(input: { + rankedResults: PopulateRankedSearchResult[]; + fetchLimit?: number; +}): string[] { + return input.rankedResults + .slice(0, input.fetchLimit ?? SOURCE_PLANNER_FETCH_LIMIT_DEFAULT) + .map((result) => result.canonicalUrl); +} + +export function triageFetchedPageForPopulate(input: { + context: DatasetContext; + url: string; + page: PopulateFetchedPage; +}): PopulateSourceTriageResult { + const text = [input.page.title, input.page.text].filter(Boolean).join("\n"); + const normalizedText = text.toLowerCase(); + const normalizedUrl = input.url.toLowerCase(); + + if (isBlockedPageText(normalizedText)) { + return { + status: "blocked", + confidence: 0.9, + reason: "Page appears blocked by auth, captcha, access control, or anti-bot copy.", + suggestedAction: "Use browser diagnostics only if the data is publicly accessible.", + }; + } + + if (/\b(search|filter|location|zipcode|zip code|enter your|select)\b/i.test(text)) { + return { + status: /submit|form|zipcode|zip code|enter your/i.test(text) + ? 
"requires_form_submission" + : "requires_navigation", + confidence: 0.75, + reason: "Page likely requires browser interaction before the requested rows are visible.", + suggestedAction: "Navigate the page, apply required filters, then extract source-backed rows.", + }; + } + + if (/\/search|\/locator|\/directory|\/catalog|\/companies|\/jobs/i.test(normalizedUrl)) { + return { + status: "requires_detail_page_followup", + confidence: 0.7, + reason: "URL looks like a listing or directory that may need detail-page follow-up.", + suggestedAction: "Open relevant detail pages and extract requested fields from those public pages.", + }; + } + + const relevantTokenCount = relevantPlannerTokens(input.context) + .filter((token) => normalizedText.includes(token)).length; + if (relevantTokenCount === 0 && normalizedText.length > 0) { + return { + status: "low_value", + confidence: 0.65, + reason: "Fetched text has little overlap with the dataset prompt or columns.", + }; + } + + if (normalizedText.length < 200) { + return { + status: "low_value", + confidence: 0.6, + reason: "Fetched text is too short to support source-backed rows.", + }; + } + + return { + status: "extract_now", + confidence: Math.min(0.95, 0.55 + relevantTokenCount * 0.08), + reason: "Fetched text appears to contain enough inline public data to attempt extraction before browser spend.", + }; +} + +export function directRowsFromFetchedPage(input: { + context: DatasetContext; + url: string; + page: PopulateFetchedPage; + maxRows?: number; +}): PopulateRuntimeRow[] { + const titleColumn = input.context.columns.find((column) => + /title|name/i.test(column.name) + ); + const urlColumn = input.context.columns.find((column) => + /url|link|website|source/i.test(column.name) + ); + if (!titleColumn || !urlColumn) { + return []; + } + + const requiredColumns = input.context.columns.filter( + (column) => column.nullable !== true + ); + if ( + requiredColumns.some((column) => + column.name !== titleColumn.name && 
column.name !== urlColumn.name + ) + ) { + return []; + } + + const title = firstUsefulLine([input.page.title, input.page.text].filter(Boolean).join("\n")); + if (!title) { + return []; + } + + const cells = Object.fromEntries( + input.context.columns.map((column) => { + if (column.name === titleColumn.name) { + return [column.name, title]; + } + if (column.name === urlColumn.name) { + return [column.name, input.url]; + } + return [column.name, null]; + }) + ); + + return [{ + cells, + sourceUrls: [input.url], + evidence: [{ + columnName: titleColumn.name, + sourceUrl: input.url, + quote: title, + }], + needsReview: true, + }].slice(0, input.maxRows ?? 1); +} + +function scorePopulateSearchResult(input: { + context: DatasetContext; + result: PopulateWebSearchResult; +}): Pick { + const lowTrustReason = lowTrustSourceReason(input.result.url); + if (lowTrustReason) { + return { expectationScore: 1, lowTrustReason }; + } + + const haystack = [ + input.result.title, + input.result.snippet, + input.result.url, + ].filter(Boolean).join(" ").toLowerCase(); + let score = 2; + if ((input.result.snippet?.length ?? 0) >= 40) score += 1; + if (/\b(official|docs|documentation|pricing|blog|news|release|careers|jobs)\b/.test(haystack)) { + score += 1; + } + if (plannerHostLooksPrimary(input.result.url)) { + score += 0.5; + } + const overlap = relevantPlannerTokens(input.context) + .filter((token) => haystack.includes(token)).length; + score += Math.min(1.5, overlap * 0.3); + + return { + expectationScore: Math.max(1, Math.min(5, Math.round(score * 10) / 10)), + }; +} + +function lowTrustSourceReason(url: string): string | undefined { + try { + const host = new URL(url).hostname.replace(/^www\./i, ""); + return LOW_TRUST_HOST_PATTERNS.some((pattern) => pattern.test(host)) + ? 
`low-trust host: ${host}` + : undefined; + } catch { + return "invalid URL"; + } +} + +function plannerHostLooksPrimary(url: string): boolean { + try { + const host = new URL(url).hostname; + return /\.(com|org|io|ai|dev|gov|edu)$/i.test(host); + } catch { + return false; + } +} + +function relevantPlannerTokens(context: DatasetContext): string[] { + return Array.from(new Set([ + userPromptDescription(context.description), + ...context.columns.map((column) => `${column.name} ${column.description ?? ""}`), + ] + .join(" ") + .toLowerCase() + .split(/[^a-z0-9]+/) + .filter((token) => + token.length >= 4 && + ![ + "with", + "from", + "that", + "this", + "have", + "source", + "sources", + "column", + "columns", + "include", + "latest", + "find", + ].includes(token) + ))); +} + +function userPromptDescription(description: string): string { + return description + .split(/\n\s*Durable recipe instructions:\s*/i)[0] + ?.trim() || description.trim(); +} + +function isBlockedPageText(text: string): boolean { + return /\b(captcha|access denied|forbidden|sign in to continue|enable javascript|unusual traffic|verify you are human)\b/i.test(text); +} + +function firstUsefulLine(text: string): string { + return text + .split(/\r?\n/) + .map((line) => line.trim()) + .find((line) => + line.length >= 8 && + line.length <= 200 && + !/^https?:\/\//i.test(line) + ) ?? 
""; +} diff --git a/backend/src/pipeline/populate-tinyfish-trace-recorder.ts b/backend/src/pipeline/populate-tinyfish-trace-recorder.ts new file mode 100644 index 0000000..bd213a8 --- /dev/null +++ b/backend/src/pipeline/populate-tinyfish-trace-recorder.ts @@ -0,0 +1,830 @@ +import type { + PopulateRuntimeBrowserAction, + PopulateRuntimeTraceStep, +} from "./populate-runtime.js"; +import { TinyFish } from "@tiny-fish/sdk"; + +export type TinyFishRecordedTraceStatus = + | "completed" + | "failed" + | "cancelled" + | "timed_out" + | "unknown"; + +export interface TinyFishSseEvent { + type: string; + message?: string; + data?: Record; + createdAt: string; +} + +export interface TinyFishArtifactRef { + kind: "screenshot" | "html" | "recording" | "streaming" | "unknown"; + url?: string; + endpoint?: string; + stepId?: string; + label?: string; +} + +export interface TinyFishRunStep { + index: number; + id?: string; + action?: string; + status?: string; + urlBefore?: string; + urlAfter?: string; + selector?: string; + targetText?: string; + valueSummary?: string; + error?: string; + startedAt?: string; + completedAt?: string; + durationMs?: number; + artifactRefs: TinyFishArtifactRef[]; +} + +export interface TinyFishRecordedTrace { + provider: "tinyfish"; + sourceUrl: string; + goal: string; + runId: string | null; + status: TinyFishRecordedTraceStatus; + sseEvents: TinyFishSseEvent[]; + runSteps: TinyFishRunStep[]; + artifactRefs: TinyFishArtifactRef[]; + finalResult: Record | null; + normalizedBrowserActions: PopulateRuntimeBrowserAction[]; + diagnostics: string[]; +} + +export interface TinyFishTraceRecorderClient { + runAgent(input: { + sourceUrl: string; + goal: string; + captureHtml: boolean; + captureScreenshots: boolean; + maxDurationSeconds: number; + maxAgentSteps: number; + }): Promise<{ + runId?: string | null; + status?: string; + finalResult?: Record | null; + sseEvents?: unknown[]; + runDetail?: Record | null; + diagnostics?: string[]; + }>; + 
getRun?(runId: string): Promise | null>; +} + +export function createTinyFishTraceRecorderClient(input: { + apiKey: string; + pollIntervalMs?: number; + baseUrl?: string; +}): TinyFishTraceRecorderClient { + const baseURL = input.baseUrl ?? "https://agent.tinyfish.ai"; + const client = new TinyFish({ apiKey: input.apiKey, baseURL }); + const pollIntervalMs = input.pollIntervalMs ?? 3_000; + const rawApiClient = createTinyFishRawApiClient({ + apiKey: input.apiKey, + baseUrl: baseURL, + }); + return { + async runAgent(runInput) { + return runAgentWithStreamFirst({ + client, + rawApiClient, + pollIntervalMs, + runInput, + }); + }, + async getRun(runId) { + return rawApiClient.getRun(runId); + }, + }; +} + +function createTinyFishRawApiClient(input: { + apiKey: string; + baseUrl: string; +}) { + return { + async getRun(runId: string): Promise | null> { + const response = await fetch( + `${input.baseUrl}/v1/runs/${encodeURIComponent(runId)}`, + { + headers: { + "X-API-Key": input.apiKey, + Accept: "application/json", + }, + } + ); + if (!response.ok) { + return null; + } + return recordValue(await response.json().catch(() => null)) ?? null; + }, + }; +} + +async function runAgentWithStreamFirst(input: { + client: TinyFish; + rawApiClient: ReturnType; + pollIntervalMs: number; + runInput: Parameters[0]; +}): ReturnType { + const controller = new AbortController(); + const timeout = setTimeout( + () => controller.abort(), + input.runInput.maxDurationSeconds * 1_000 + ); + const sseEvents: unknown[] = []; + let runId: string | null = null; + let status: string | undefined; + let finalResult: Record | null = null; + + try { + const stream = await input.client.agent.stream( + { + url: input.runInput.sourceUrl, + goal: input.runInput.goal, + }, + { signal: controller.signal } + ); + for await (const event of stream) { + const eventRecord = recordValue(event); + sseEvents.push(event); + runId = stringValue(eventRecord?.run_id) ?? 
runId; + if (eventRecord?.type === "COMPLETE") { + status = stringValue(eventRecord.status) ?? status; + finalResult = recordValue(eventRecord.result) ?? finalResult; + await stream.close().catch(() => undefined); + break; + } + } + } catch (err) { + const diagnostic = err instanceof Error ? err.message : String(err); + if (!controller.signal.aborted || !runId) { + clearTimeout(timeout); + return runAgentWithQueueFallback({ + client: input.client, + rawApiClient: input.rawApiClient, + pollIntervalMs: input.pollIntervalMs, + runInput: input.runInput, + initialDiagnostics: [diagnostic], + }); + } + status = "TIMEOUT"; + } finally { + clearTimeout(timeout); + } + + const runDetail = runId ? await input.rawApiClient.getRun(runId) : null; + const runDetailStatus = stringValue(runDetail?.status); + const finalStatus = status === "TIMEOUT" && runDetailStatus + ? runDetailStatus + : status ?? runDetailStatus ?? "UNKNOWN"; + return { + runId, + status: finalStatus, + finalResult: finalResult ?? recordValue(runDetail?.result) ?? null, + sseEvents, + runDetail, + diagnostics: controller.signal.aborted && finalStatus === "TIMEOUT" + ? [`TinyFish Agent stream timed out after ${input.runInput.maxDurationSeconds}s.`] + : [], + }; +} + +async function runAgentWithQueueFallback(input: { + client: TinyFish; + rawApiClient: ReturnType; + pollIntervalMs: number; + runInput: Parameters[0]; + initialDiagnostics?: string[]; +}): ReturnType { + const queued = await input.client.agent.queue({ + url: input.runInput.sourceUrl, + goal: input.runInput.goal, + }); + if (queued.error || !queued.run_id) { + return { + runId: queued.run_id ?? null, + status: "FAILED", + finalResult: null, + diagnostics: [ + ...(input.initialDiagnostics ?? []), + queued.error?.message ?? 
"TinyFish Agent queue returned no run id.", + ], + }; + } + + const startedAt = Date.now(); + let runDetail: Record | null = null; + while (Date.now() - startedAt < input.runInput.maxDurationSeconds * 1_000) { + runDetail = await input.rawApiClient.getRun(queued.run_id); + const status = String(runDetail?.status ?? ""); + if (/COMPLETED|FAILED|CANCELLED|CANCELED/i.test(status)) { + const error = recordValue(runDetail?.error); + return { + runId: queued.run_id, + status, + runDetail, + finalResult: recordValue(runDetail?.result) ?? null, + diagnostics: [ + ...(input.initialDiagnostics ?? []), + ...(stringValue(error?.message) ? [stringValue(error?.message)!] : []), + ], + }; + } + await sleep(input.pollIntervalMs); + } + + return { + runId: queued.run_id, + status: "TIMEOUT", + runDetail, + finalResult: null, + diagnostics: [ + ...(input.initialDiagnostics ?? []), + `TinyFish Agent run timed out after ${input.runInput.maxDurationSeconds}s.`, + ], + }; +} + +export async function recordTinyFishTrace(input: { + sourceUrl: string; + goal: string; + captureHtml: boolean; + captureScreenshots: boolean; + maxDurationSeconds: number; + maxAgentSteps: number; + client: TinyFishTraceRecorderClient; +}): Promise { + const run = await input.client.runAgent({ + sourceUrl: input.sourceUrl, + goal: input.goal, + captureHtml: input.captureHtml, + captureScreenshots: input.captureScreenshots, + maxDurationSeconds: input.maxDurationSeconds, + maxAgentSteps: input.maxAgentSteps, + }); + const runId = stringValue(run.runId) ?? stringValue(run.runDetail?.run_id) ?? null; + const runDetail = + run.runDetail ?? + (runId && input.client.getRun ? await input.client.getRun(runId) : null); + + return normalizeTinyFishRecordedTrace({ + sourceUrl: input.sourceUrl, + goal: input.goal, + runId, + status: run.status ?? stringValue(runDetail?.status), + sseEvents: run.sseEvents ?? [], + runDetail, + finalResult: + run.finalResult ?? + recordValue(runDetail?.result) ?? 
+ null, + diagnostics: run.diagnostics ?? [], + }); +} + +/** Builds a TinyFishRecordedTrace from raw run data: parses run steps from runDetail.steps, collects and dedupes artifact refs and browser actions (from run steps and from the final result), normalizes status and SSE events, and appends a diagnostic when no browser action could be derived. */ export function normalizeTinyFishRecordedTrace(input: { + sourceUrl: string; + goal: string; + runId?: string | null; + status?: string; + sseEvents?: unknown[]; + runDetail?: Record | null; + finalResult?: Record | null; + diagnostics?: string[]; +}): TinyFishRecordedTrace { + const runSteps = runStepsFromRunDetail(input.runDetail); + const artifactRefs = dedupeArtifactRefs([ + ...artifactRefsFromRunDetail(input.runDetail, input.runId ?? null), + ...runSteps.flatMap((step) => step.artifactRefs), + ]); + const normalizedBrowserActions = dedupeBrowserActions([ + ...runSteps + .map((step) => browserActionFromRunStep(step, input.sourceUrl)) + .filter((action): action is PopulateRuntimeBrowserAction => Boolean(action)), + ...browserActionsFromAgentResult(input.finalResult), + ]); + + return { + provider: "tinyfish", + sourceUrl: input.sourceUrl, + goal: input.goal, + runId: input.runId ?? stringValue(input.runDetail?.run_id) ?? null, + status: normalizeTraceStatus(input.status ?? stringValue(input.runDetail?.status)), + sseEvents: (input.sseEvents ?? []).map(normalizeSseEvent), + runSteps, + artifactRefs, + finalResult: input.finalResult ?? null, + normalizedBrowserActions, + diagnostics: [ + ...(input.diagnostics ?? []), + ...(normalizedBrowserActions.length === 0 + ? ["TinyFish trace contains no explicit replayable browser actions."] + : []), + ], + }; +} + +/** Converts a recorded trace into process-trace steps: one agent step summarizing the run (succeeded only when trace.status is completed, otherwise failed with the first diagnostic as error) followed by one browser step per normalized browser action. */ export function tinyFishTraceProcessSteps( + trace: TinyFishRecordedTrace +): PopulateRuntimeTraceStep[] { + const agentSteps: PopulateRuntimeTraceStep[] = [{ + kind: "agent", + label: "tinyfish-agent-run", + status: trace.status === "completed" ? 
"succeeded" : "failed", + input: { + url: trace.sourceUrl, + runId: trace.runId, + goalCharacters: trace.goal.length, + }, + output: { + sseEventCount: trace.sseEvents.length, + runStepCount: trace.runSteps.length, + artifactRefCount: trace.artifactRefs.length, + browserActionCount: trace.normalizedBrowserActions.length, + }, + error: trace.status === "completed" ? undefined : trace.diagnostics[0], + }]; + + /* Every normalized browser action is emitted with status succeeded and a 1-based label suffix. */ const browserSteps = trace.normalizedBrowserActions.map((action, index) => ({ + kind: "browser" as const, + label: `tinyfish-browser-${action.action}-${index + 1}`, + status: "succeeded" as const, + input: { + url: action.url, + selector: action.selector, + targetText: action.targetText, + }, + browserAction: action, + })); + + return [...agentSteps, ...browserSteps]; +} + +/** Parses runDetail.steps into TinyFishRunStep objects, dropping entries that are not plain records. */ function runStepsFromRunDetail( + runDetail: Record | null | undefined +): TinyFishRunStep[] { + const rawSteps = arrayValue(runDetail?.steps); + return rawSteps + .map((step, index) => runStepFromUnknown(step, index)) + .filter((step): step is TinyFishRunStep => Boolean(step)); +} + +/** Maps one raw step record onto TinyFishRunStep, probing several snake_case, camelCase and nested key spellings for each field. Returns undefined for non-record values. */ function runStepFromUnknown( + value: unknown, + index: number +): TinyFishRunStep | undefined { + if (!isRecord(value)) { + return undefined; + } + const id = firstStringAtPaths(value, [["id"], ["step_id"], ["stepId"]]); + return { + index, + id, + action: firstStringAtPaths(value, [ + ["action"], + ["type"], + ["kind"], + ["operation"], + ["description"], + ["summary"], + ["name"], + ]), + status: firstStringAtPaths(value, [["status"], ["state"], ["outcome"]]), + urlBefore: firstStringAtPaths(value, [ + ["url_before"], + ["urlBefore"], + ["before", "url"], + ["input", "url_before"], + ]), + urlAfter: firstStringAtPaths(value, [ + ["url_after"], + ["urlAfter"], + ["current_url"], + ["currentUrl"], + ["url"], + ["page_url"], + ["pageUrl"], + ["after", "url"], + ["input", "url"], + ["args", "url"], + ]), + selector: firstStringAtPaths(value, [ + ["selector"], + ["locator"], + ["target", "selector"], + 
["element", "selector"], + ["input", "selector"], + ["args", "selector"], + ]), + targetText: firstStringAtPaths(value, [ + ["target_text"], + ["targetText"], + ["target", "text"], + ["element", "text"], + ["text"], + ["label"], + ]), + valueSummary: safeValueSummary(value), + error: errorMessageFromRecord(value), + startedAt: firstStringAtPaths(value, [["started_at"], ["startedAt"]]), + completedAt: firstStringAtPaths(value, [["completed_at"], ["completedAt"]]), + durationMs: numberValueAtPaths(value, [["duration_ms"], ["durationMs"], ["duration"]]), + artifactRefs: artifactRefsFromStep(value, id), + }; +} + +/** Derives a browser action from a run step. Returns undefined when the step action text maps to no known kind, or when there is no url, selector or target text. fallbackSourceUrl is used only for navigate, extract and screenshot actions that carry no url of their own. */ function browserActionFromRunStep( + step: TinyFishRunStep, + fallbackSourceUrl?: string +): PopulateRuntimeBrowserAction | undefined { + const action = normalizeBrowserActionKind(step.action); + if (!action) { + return undefined; + } + const url = + step.urlAfter ?? + step.urlBefore ?? + (action === "navigate" || action === "extract" || action === "screenshot" + ? fallbackSourceUrl + : undefined); + if (!url && !step.selector && !step.targetText) { + return undefined; + } + return { + action, + url, + selector: step.selector, + targetText: step.targetText, + valueDescription: step.valueSummary, + }; +} + +/** Collects browser actions reported in the agent result, reading browser_actions and agent_browser_actions at the top level and on each row returned by agentCompatibleRows. Entries that cannot be normalized are dropped. */ function browserActionsFromAgentResult( + result: Record | null | undefined +): PopulateRuntimeBrowserAction[] { + if (!result) { + return []; + } + const rawActions = [ + ...arrayValue(result.browser_actions), + ...arrayValue(result.agent_browser_actions), + ...agentCompatibleRows(result).flatMap((row) => { + if (!isRecord(row)) return []; + return [ + ...arrayValue(row.browser_actions), + ...arrayValue(row.agent_browser_actions), + ]; + }), + ]; + return rawActions + .map(browserActionFromUnknown) + .filter((action): action is PopulateRuntimeBrowserAction => Boolean(action)); +} + +/** Normalizes one reported action, given either as a free-text string or as a record. Returns undefined when no action kind is recognized or when nothing identifies a target (url, selector or target text). */ function browserActionFromUnknown(value: unknown): PopulateRuntimeBrowserAction | undefined { + if (typeof value === "string") { + const action = 
normalizeBrowserActionKind(value); + if (!action) { + return undefined; + } + /* For string reports, the first http(s) URL found in the text becomes the action url. */ const url = value.match(/https?:\/\/[^\s)]+/i)?.[0]; + const targetText = targetTextFromActionString(value); + if (!url && !targetText) { + return undefined; + } + return { + action, + url, + targetText, + }; + } + if (!isRecord(value)) { + return undefined; + } + const action = normalizeBrowserActionKind( + firstStringAtPaths(value, [["action"], ["kind"], ["type"], ["name"], ["label"]]) + ); + if (!action) { + return undefined; + } + const targetText = firstStringAtPaths(value, [ + ["targetText"], + ["target_text"], + ["target", "text"], + ["label"], + ]); + const browserAction = { + action, + url: firstStringAtPaths(value, [["url"], ["pageUrl"], ["page_url"], ["href"]]), + selector: firstStringAtPaths(value, [["selector"], ["locator"]]), + targetText, + valueDescription: safeValueSummary(value), + }; + return browserAction.url || browserAction.selector || browserAction.targetText + ? browserAction + : undefined; +} + +/** Maps free-text action wording onto a browser action kind by whole-word keyword match on the lowercased text. The checks run in a fixed order, so the first matching keyword group wins. Returns undefined for empty or unrecognized input. */ function normalizeBrowserActionKind( + value: string | undefined +): PopulateRuntimeBrowserAction["action"] | undefined { + if (!value) { + return undefined; + } + const normalized = value.toLowerCase(); + if (/\b(goto|go to|navigate|visit|open)\b/.test(normalized)) return "navigate"; + if (/\b(click|tap|press)\b/.test(normalized)) return "click"; + if (/\b(type|fill|input|enter)\b/.test(normalized)) return "type"; + if (/\b(select|choose)\b/.test(normalized)) return "select"; + if (/\b(wait|pause)\b/.test(normalized)) return "wait"; + if (/\b(extract|scrape|read|collect)\b/.test(normalized)) return "extract"; + if (/\b(screenshot|capture)\b/.test(normalized)) return "screenshot"; + return undefined; +} + +/** Collects artifact references from a run detail payload: streaming and recording URLs, entries of capture_artifacts, captureArtifacts and artifacts, and, when runId is set, endpoint templates for per-step html and screenshot. */ function artifactRefsFromRunDetail( + runDetail: Record | null | undefined, + runId: string | null +): TinyFishArtifactRef[] { + if (!runDetail) { + return []; + } + const refs: TinyFishArtifactRef[] = []; + for (const key of ["streaming_url", "streamingUrl"] as const) { 
+ const url = stringValue(runDetail[key]); + if (url) refs.push({ kind: "streaming", url, label: key }); + } + for (const key of ["recording_url", "recordingUrl"] as const) { + const url = stringValue(runDetail[key]); + if (url) refs.push({ kind: "recording", url, label: key }); + } + for (const artifact of [ + ...arrayValue(runDetail.capture_artifacts), + ...arrayValue(runDetail.captureArtifacts), + ...arrayValue(runDetail.artifacts), + ]) { + if (!isRecord(artifact)) { + continue; + } + refs.push({ + kind: artifactKindFromString(firstStringAtPaths(artifact, [["kind"], ["type"]])), + url: firstStringAtPaths(artifact, [["url"], ["href"]]), + endpoint: firstStringAtPaths(artifact, [["endpoint"]]), + stepId: firstStringAtPaths(artifact, [["step_id"], ["stepId"]]), + label: firstStringAtPaths(artifact, [["label"], ["name"]]), + }); + } + /* The endpoint strings keep a literal {stepId} placeholder; only the run id is filled in, URI-encoded. */ if (runId) { + refs.push({ + kind: "html", + endpoint: `/v1/runs/${encodeURIComponent(runId)}/steps/{stepId}/html`, + label: "documented-step-html-endpoint-template", + }); + refs.push({ + kind: "screenshot", + endpoint: `/v1/runs/${encodeURIComponent(runId)}/steps/{stepId}/screenshot`, + label: "documented-step-screenshot-endpoint-template", + }); + } + return refs; +} + +/** Collects screenshot and html references found directly on a step record, tagging each with stepId and the key it was read from. */ function artifactRefsFromStep( + step: Record, + stepId: string | undefined +): TinyFishArtifactRef[] { + const refs: TinyFishArtifactRef[] = []; + for (const key of ["screenshot_url", "screenshotUrl"] as const) { + const url = stringValue(step[key]); + if (url) refs.push({ kind: "screenshot", url, stepId, label: key }); + } + for (const key of ["screenshot"] as const) { + const url = stringValue(step[key]); + if (url) refs.push({ kind: "screenshot", url, stepId, label: key }); + } + for (const key of ["html_url", "htmlUrl"] as const) { + const url = stringValue(step[key]); + if (url) refs.push({ kind: "html", url, stepId, label: key }); + } + for (const key of ["html"] as const) { + const url = stringValue(step[key]); + if (url) refs.push({ kind: "html", url, 
stepId, label: key }); + } + return refs; +} + +/** Normalizes a raw SSE event. Non-record values become an UNKNOWN event (a string payload is kept as the message, cut to 500 characters). createdAt defaults to the current time when the event carries no timestamp. */ function normalizeSseEvent(value: unknown): TinyFishSseEvent { + if (!isRecord(value)) { + return { + type: "UNKNOWN", + message: typeof value === "string" ? value.slice(0, 500) : undefined, + createdAt: new Date().toISOString(), + }; + } + return { + type: firstStringAtPaths(value, [["type"], ["event"], ["name"]]) ?? "UNKNOWN", + message: firstStringAtPaths(value, [ + ["message"], + ["text"], + ["purpose"], + ["data", "message"], + ]), + data: redactedSseData(value), + createdAt: + firstStringAtPaths(value, [["createdAt"], ["created_at"], ["timestamp"]]) ?? + new Date().toISOString(), + }; +} + +/** Maps a provider status string onto TinyFishRecordedTraceStatus by substring match on the lowercased value, checked in order: completed, failed, cancelled, timed_out; anything else is unknown. */ function normalizeTraceStatus(value: string | undefined): TinyFishRecordedTraceStatus { + const normalized = value?.toLowerCase() ?? ""; + if (/complete|completed|success|succeeded/.test(normalized)) return "completed"; + if (/fail|error/.test(normalized)) return "failed"; + if (/cancel/.test(normalized)) return "cancelled"; + if (/timeout|timed_out/.test(normalized)) return "timed_out"; + return "unknown"; +} + +/** Maps an artifact kind or type string onto an artifact ref kind by substring match on the lowercased value; unmatched values yield unknown. */ function artifactKindFromString(value: string | undefined): TinyFishArtifactRef["kind"] { + const normalized = value?.toLowerCase() ?? 
""; + if (/screenshot|image|png|jpeg|jpg/.test(normalized)) return "screenshot"; + if (/html|dom/.test(normalized)) return "html"; + if (/recording|video/.test(normalized)) return "recording"; + if (/stream/.test(normalized)) return "streaming"; + return "unknown"; +} + +/** Extracts target text from a free-text action: a quoted phrase of 2 to 120 characters if present, otherwise the words following click, select, choose or press up to the next period, comma or semicolon (at most 80 characters). */ function targetTextFromActionString(value: string): string | undefined { + const quoted = value.match(/["'“”]([^"'“”]{2,120})["'“”]/)?.[1]; + if (quoted) { + return quoted; + } + const section = value.match(/\b(?:click|select|choose|press)\s+(?:the\s+)?([^.,;]{2,80})/i)?.[1]; + return section?.trim(); +} + +/** Returns a log-safe summary of a typed or selected value. Values that mention password, token, secret, key, cookie or auth are replaced by a fixed placeholder, and values longer than 80 characters are replaced by a note giving their length. */ function safeValueSummary(record: Record): string | undefined { + const raw = firstStringAtPaths(record, [ + ["value_description"], + ["valueDescription"], + ["value"], + ["text"], + ["input", "value"], + ["args", "value"], + ]); + if (!raw) { + return undefined; + } + if (/(password|token|secret|key|cookie|auth)/i.test(raw)) { + return "redacted sensitive value"; + } + return raw.length > 80 ? `redacted typed value (${raw.length} chars)` : raw; +} + +/** Reads an error message from the first present error-like field, accepting either a string or an object with a string message; the result is cut to 500 characters. */ function errorMessageFromRecord(record: Record): string | undefined { + const raw = valueAtFirstPath(record, [ + ["error"], + ["failure"], + ["failure_reason"], + ["failureReason"], + ["result", "error"], + ]); + if (typeof raw === "string") { + return raw.slice(0, 500); + } + if (isRecord(raw) && typeof raw.message === "string") { + return raw.message.slice(0, 500); + } + return undefined; +} + +/** Removes duplicate artifact refs, keeping first occurrences; identity is the JSON of kind, url, endpoint, stepId and label. */ function dedupeArtifactRefs(refs: TinyFishArtifactRef[]): TinyFishArtifactRef[] { + const seen = new Set(); + const deduped: TinyFishArtifactRef[] = []; + for (const ref of refs) { + const key = JSON.stringify([ref.kind, ref.url, ref.endpoint, ref.stepId, ref.label]); + if (seen.has(key)) continue; + seen.add(key); + deduped.push(ref); + } + return deduped; +} + +/** Removes duplicate browser actions, keeping first occurrences. */ function dedupeBrowserActions( + actions: PopulateRuntimeBrowserAction[] +): PopulateRuntimeBrowserAction[] { + const seen = new Set(); + const deduped: PopulateRuntimeBrowserAction[] = []; + for 
(const action of actions) { + /* Identity is the JSON of action, url, selector, targetText and valueDescription. */ const key = JSON.stringify([ + action.action, + action.url, + action.selector, + action.targetText, + action.valueDescription, + ]); + if (seen.has(key)) continue; + seen.add(key); + deduped.push(action); + } + return deduped; +} + +/** Returns the first non-blank string found at any of the given key paths, trimmed and capped at 500 characters. */ function firstStringAtPaths( + record: Record, + paths: readonly (readonly string[])[] +): string | undefined { + for (const path of paths) { + const value = valueAtPath(record, path); + if (typeof value === "string" && value.trim()) { + return value.trim().slice(0, 500); + } + } + return undefined; +} + +/** Returns the first finite number found at any of the given key paths. */ function numberValueAtPaths( + record: Record, + paths: readonly (readonly string[])[] +): number | undefined { + for (const path of paths) { + const value = valueAtPath(record, path); + if (typeof value === "number" && Number.isFinite(value)) { + return value; + } + } + return undefined; +} + +/** Returns the first value at any of the given key paths that is neither undefined nor null. */ function valueAtFirstPath( + record: Record, + paths: readonly (readonly string[])[] +): unknown { + for (const path of paths) { + const value = valueAtPath(record, path); + if (value !== undefined && value !== null) { + return value; + } + } + return undefined; +} + +/** Walks the key path through nested records; returns undefined as soon as a non-record is reached before the path ends. */ function valueAtPath( + record: Record, + path: readonly string[] +): unknown { + let value: unknown = record; + for (const key of path) { + if (!isRecord(value)) { + return undefined; + } + value = value[key]; + } + return value; +} + +/** Keeps only string, number and boolean entries whose key does not contain streaming, url, token, secret, key, cookie or auth; returns undefined when nothing is left. */ function redactedSseData(value: Record): Record | undefined { + const safeEntries = Object.entries(value) + .filter(([key]) => !/streaming|url|token|secret|key|cookie|auth/i.test(key)) + .filter(([, entryValue]) => + typeof entryValue === "string" || + typeof entryValue === "number" || + typeof entryValue === "boolean" + ); + return safeEntries.length > 0 ? Object.fromEntries(safeEntries) : undefined; +} + +/** Finds the row array in an agent result: the first non-nullish of rows, records or result when it is a non-empty array, otherwise rows or records nested under a record-valued result. */ function agentCompatibleRows(result: Record): unknown[] { + const direct = arrayValue(result.rows ?? result.records ?? 
result.result); + if (direct.length > 0) { + return direct; + } + const nested = recordValue(result.result); + return nested ? arrayValue(nested.rows ?? nested.records) : []; +} + +/** Returns the value when it is an array, otherwise an empty array. */ function arrayValue(value: unknown): unknown[] { + return Array.isArray(value) ? value : []; +} + +/** Returns the value when isRecord accepts it, otherwise undefined. */ function recordValue(value: unknown): Record | undefined { + return isRecord(value) ? value : undefined; +} + +/** Returns the trimmed string, or undefined for non-strings and blank strings. */ function stringValue(value: unknown): string | undefined { + return typeof value === "string" && value.trim() ? value.trim() : undefined; +} + +/** True for truthy values of type object that are not arrays. */ function isRecord(value: unknown): value is Record { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +/** Resolves after ms milliseconds. */ function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/backend/src/pipeline/populate.ts b/backend/src/pipeline/populate.ts index 1524d34..ab160c6 100644 --- a/backend/src/pipeline/populate.ts +++ b/backend/src/pipeline/populate.ts @@ -4,6 +4,7 @@ export const populateColumnSchema = z.object({ name: z.string(), type: z.enum(["text", "number", "boolean", "url", "date"]), description: z.optional(z.string()), + nullable: z.optional(z.boolean()), }); export type PopulateColumn = z.infer; diff --git a/backend/src/server.ts b/backend/src/server.ts index aa93ea7..2a92b61 100644 --- a/backend/src/server.ts +++ b/backend/src/server.ts @@ -17,6 +17,7 @@ import { createPopulateRecipeRuntime, type CreatePopulateRecipeRuntimeInput, } from "./pipeline/populate-runtime-selection.js"; +import { DEFAULT_COMMIT_ROW_LIMIT_PER_HOUR } from "./pipeline/populate-self-healing-command.js"; export interface BigSetServerEnv { CLIENT_ORIGIN: string; @@ -25,6 +26,7 @@ export interface BigSetServerEnv { OPENROUTER_API_KEY?: string; TINYFISH_API_KEY?: string; POPULATE_RECIPE_STORE_DIR: string; + POPULATE_COMMIT_ROW_LIMIT_PER_HOUR?: string; } export interface BigSetPopulateDataset { @@ -134,6 +136,10 @@ export async function createBigSetServer( rowWriter: input.populateRowWriter, 
shouldCommitRows: true, runtime, + commitRowLimit: { + maxRowsPerWindow: commitRowLimitPerHour(input.env), + windowMs: 60 * 60 * 1_000, /* one hour in milliseconds */ + }, }); req.log.info({ @@ -176,11 +182,32 @@ function responseSafePopulateResult( action: result.action, datasetId: result.datasetId, success: result.success, + validationState: result.validationState, committedRows: result.committedRows, + commitLimit: result.commitLimit, rejectionReasons: result.rejectionReasons, validationIssues: result.validationIssues, productionValidation: diagnosticRun?.productionValidation, metrics: diagnosticRun?.metrics, rowCount: diagnosticRun?.rows.length ?? 0, + /* At most the first five rows are echoed back, reduced to cells, sourceUrls, evidence and needsReview. */ sampleRows: (diagnosticRun?.rows ?? []).slice(0, 5).map((row) => ({ + cells: row.cells, + sourceUrls: row.sourceUrls, + evidence: row.evidence, + needsReview: row.needsReview, + })), }; } + +/** Resolves the hourly commit row limit from POPULATE_COMMIT_ROW_LIMIT_PER_HOUR. Falls back to DEFAULT_COMMIT_ROW_LIMIT_PER_HOUR when the variable is unset or empty, and throws when it is set to anything other than a positive integer. */ function commitRowLimitPerHour(env: BigSetServerEnv): number { + if (!env.POPULATE_COMMIT_ROW_LIMIT_PER_HOUR) { + return DEFAULT_COMMIT_ROW_LIMIT_PER_HOUR; + } + const parsed = Number(env.POPULATE_COMMIT_ROW_LIMIT_PER_HOUR); + if (!Number.isInteger(parsed) || parsed <= 0) { + throw new Error( + "POPULATE_COMMIT_ROW_LIMIT_PER_HOUR must be a positive integer." 
+ ); + } + return parsed; +} diff --git a/backend/test/agent-goal-contract.test.ts b/backend/test/agent-goal-contract.test.ts new file mode 100644 index 0000000..bad81ef --- /dev/null +++ b/backend/test/agent-goal-contract.test.ts @@ -0,0 +1,62 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + AGENT_BROWSER_ACTION_CONTRACT, + buildAgentGoalMessages, +} from "../BigSet_Data_Collection_Agent/src/agents/agent-goal.js"; + +/* Builds goal messages for a two-column spec and a requires_navigation triage result, then checks the prompt and payload for the browser action reporting contract. */ test("Agent goal prompt requires producer-side browser action reporting", () => { + const messages = buildAgentGoalMessages({ + userPrompt: "Find SaaS pricing pages.", + spec: { + intent_summary: "Find pricing pages.", + target_row_count: 3, + row_grain: "company", + columns: [ + { + name: "company_name", + type: "string", + description: "Company name", + required: true, + }, + { + name: "pricing_url", + type: "string", + description: "Pricing page URL", + required: true, + }, + ], + dedupe_keys: ["company_name"], + search_queries: ["SaaS pricing"], + extraction_hints: "Prefer official pricing pages.", + }, + triage: { + url: "https://example.com", + final_url: "https://example.com/pricing", + title: "Pricing", + status: "requires_navigation", + confidence: 0.9, + source_data_confidence: 0.8, + expected_yield: "partial", + reasoning: "Needs click-through navigation.", + suggested_action: "Open pricing details.", + }, + }); + + const systemPrompt = messages.find((message) => message.role === "system") + ?.content ?? ""; + const userPayload = JSON.parse( + messages.find((message) => message.role === "user")?.content ?? 
"{}" + ); + + /* The system prompt has to mention both agent_browser_actions and records. */ assert.match(systemPrompt, /agent_browser_actions/); + assert.match(systemPrompt, /records/); + assert.match(AGENT_BROWSER_ACTION_CONTRACT, /selector/); + assert.match(AGENT_BROWSER_ACTION_CONTRACT, /target_text/); + assert.match(AGENT_BROWSER_ACTION_CONTRACT, /value_description/); + /* The user payload must carry the contract constant unchanged. */ assert.equal( + userPayload.browser_action_reporting_contract, + AGENT_BROWSER_ACTION_CONTRACT + ); +}); diff --git a/backend/test/collection-agent-runner.test.ts b/backend/test/collection-agent-runner.test.ts index 1b88c6e..f7b2532 100644 --- a/backend/test/collection-agent-runner.test.ts +++ b/backend/test/collection-agent-runner.test.ts @@ -2,6 +2,7 @@ import assert from "node:assert/strict"; import { test } from "node:test"; import { runCollectionPopulatePipeline } from "../src/pipeline/collection-agent-runner.js"; +import { playwrightCandidateReadinessForRun } from "../src/pipeline/populate-playwright-readiness.js"; test("collection agent runner maps vendored pipeline output into populate runtime result", async () => { const previousEnv = snapshotEnv([ @@ -36,6 +37,129 @@ test("collection agent runner maps vendored pipeline output into populate runtim assert.equal(result.metrics.browserCalls, 3); assert.equal(result.metrics.agentRuns, 3); assert.equal(result.metrics.agentSteps, 3); + assert.equal(result.debug?.selectedRowSource, "collection_pipeline"); + assert.equal(result.debug?.processTrace.runtime, "collection"); + assert.deepEqual(result.debug?.processTrace.searchQueries, [ + "OpenAI latest AI blog posts", + "OpenAI release notes", + ]); + assert.deepEqual(result.debug?.processTrace.fetchedUrls, [ + "https://openai.com/news", + "https://openai.com/research", + ]); + assert.equal( + result.debug?.processTrace.sourceArtifacts.some((artifact) => + artifact.url === "https://openai.com/news" && + artifact.status === "succeeded" + ), + true + ); + assert.equal( + result.debug?.processTrace.steps.some((step) => step.kind === "browser"), + false + ); + } finally { + 
restoreEnv(previousEnv); + } +}); + +/* Feeds one unrecognized action (hover) and one recognized action (click) through the fake pipeline module and inspects the browser steps that reach the process trace. */ test("collection agent runner maps explicit browser action reports into process trace", async () => { + const previousEnv = snapshotEnv([ + "AGENT_POLL_TIMEOUT_MS", + "COLLECTION_AGENT_ENABLE_AGENT", + "COLLECTION_AGENT_PIPELINE_MODULE", + "COLLECTION_AGENT_POLL_TIMEOUT_MS", + ]); + /* Both timeout variables are cleared; the fake module is told to expect a pollTimeoutMs of 1_200_000. */ delete process.env.AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_ENABLE_AGENT = "true"; + delete process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_PIPELINE_MODULE = fakeCollectionPipelineModuleUrl({ + expectedCalls: [{ agentEnabled: true, pollTimeoutMs: 1_200_000 }], + browserActions: [ + { + action: "hover", + url: "https://openai.com/news", + status: "succeeded", + phase: "initial-browser", + label: "browser-open-news", + }, + ], + agentBrowserActions: [ + { + action: "click", + url: "https://openai.com/news", + selector: "a[href*='/news/']", + target_text: "Release notes", + value_description: "not captured", + status: "succeeded", + }, + ], + }); + try { + const result = await runCollectionPopulatePipeline(collectionPipelineInput()); + const browserSteps = result.debug?.processTrace.steps.filter( + (step) => step.kind === "browser" + ) ?? 
[]; + + assert.equal(browserSteps.length, 2); + /* The hover report is expected to surface with action unknown while keeping its label and phase. */ assert.equal(browserSteps[0]?.browserAction?.action, "unknown"); + assert.equal(browserSteps[0]?.label, "browser-open-news"); + assert.deepEqual(browserSteps[0]?.input, { + url: "https://openai.com/news", + selector: undefined, + targetText: undefined, + phase: "initial-browser", + }); + assert.equal(browserSteps[0]?.error, undefined); + assert.equal(browserSteps[1]?.browserAction?.action, "click"); + assert.equal(browserSteps[1]?.browserAction?.selector, "a[href*='/news/']"); + assert.equal(browserSteps[1]?.browserAction?.targetText, "Release notes"); + assert.equal(browserSteps[1]?.browserAction?.valueDescription, "not captured"); + assert.equal(browserSteps[1]?.status, "succeeded"); + assert.deepEqual( + playwrightCandidateReadinessForRun({ result }), + { + status: "ready", + reasons: [], + browserStepCount: 2, + sourceUrlCount: 2, + } + ); + } finally { + restoreEnv(previousEnv); + } +}); + +/* The fake pipeline reports 4 agent steps but zero runs with explicit browser actions; the result must carry a note saying so and must not be Playwright-ready. */ test("collection agent runner surfaces Agent provenance when actions are missing", async () => { + const previousEnv = snapshotEnv([ + "AGENT_POLL_TIMEOUT_MS", + "COLLECTION_AGENT_ENABLE_AGENT", + "COLLECTION_AGENT_PIPELINE_MODULE", + "COLLECTION_AGENT_POLL_TIMEOUT_MS", + ]); + delete process.env.AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_ENABLE_AGENT = "true"; + delete process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_PIPELINE_MODULE = fakeCollectionPipelineModuleUrl({ + expectedCalls: [{ agentEnabled: true, pollTimeoutMs: 1_200_000 }], + agentReportedStepCount: 4, + agentRunsWithStreamingUrl: 1, + agentRunsWithExplicitBrowserActions: 0, + }); + try { + const result = await runCollectionPopulatePipeline(collectionPipelineInput()); + + assert.equal(result.metrics.agentSteps, 4); + assert.equal( + result.debug?.processTrace.notes.some((note) => + /reported 4 step\(s\), but emitted no explicit browser actions/i.test(note) + ), + true + ); + assert.equal( + 
playwrightCandidateReadinessForRun({ result }).status, + "not_ready" + ); } finally { restoreEnv(previousEnv); } @@ -165,6 +289,11 @@ function fakeCollectionPipelineModuleUrl(input: { pollTimeoutMs?: number; }>; sources?: unknown; + /* Optional overrides that are JSON-serialized into the generated fake pipeline module source. */ browserActions?: unknown; + agentBrowserActions?: unknown; + agentReportedStepCount?: number; + agentRunsWithStreamingUrl?: number; + agentRunsWithExplicitBrowserActions?: number; }): string { const source = ` const moduleLoadPollTimeoutMs = process.env.AGENT_POLL_TIMEOUT_MS ?? null; @@ -202,6 +331,11 @@ function fakeCollectionPipelineModuleUrl(input: { throw new Error("required columns missing from benchmark context"); } return { + runId: "fake-run-1", + paths: { + root: "/tmp/fake-run-1", + reportPath: "/tmp/fake-run-1/run_report.json", + }, report: { errors: [], dataset_spec: { @@ -215,16 +349,35 @@ function fakeCollectionPipelineModuleUrl(input: { agent_dispatched: 1, agent_succeeded: 1, agent_failed: 0, + agent_reported_step_count: ${JSON.stringify(input.agentReportedStepCount)}, + agent_runs_with_streaming_url: ${JSON.stringify(input.agentRunsWithStreamingUrl)}, + agent_runs_with_explicit_browser_actions: ${JSON.stringify(input.agentRunsWithExplicitBrowserActions)}, }, }, initial: { + search_queries: [ + "OpenAI latest AI blog posts", + "OpenAI release notes", + ], + fetched_urls: [ + "https://openai.com/news", + "https://openai.com/research", + ], + failed_urls: [], triage: { agent_dispatched: 1, agent_succeeded: 1, agent_failed: 0, + agent_reported_step_count: ${JSON.stringify(input.agentReportedStepCount)}, + agent_runs_with_streaming_url: ${JSON.stringify(input.agentRunsWithStreamingUrl)}, + agent_runs_with_explicit_browser_actions: ${JSON.stringify(input.agentRunsWithExplicitBrowserActions)}, }, }, repair: { + loops: [{ + loop_index: 1, + repair_queries: ["OpenAI blog official source_url evidence"], + }], stats: { triage: { agent_dispatched: 2, @@ -236,7 +389,26 @@ function fakeCollectionPipelineModuleUrl(input: { quality: { 
records: [{ record_id: "pk:openai", needs_review: true }], }, - sources: ${JSON.stringify(input.sources ?? { outcomes: [] })}, + search_queries: [ + "OpenAI latest AI blog posts", + "OpenAI release notes", + ], + browser_actions: ${JSON.stringify(input.browserActions ?? [])}, + agent_browser_actions: ${JSON.stringify(input.agentBrowserActions ?? [])}, + fetched_urls: [ + "https://openai.com/news", + "https://openai.com/research", + ], + failed_urls: [], + sources: ${JSON.stringify(input.sources ?? { + outcomes: [{ + url: "https://openai.com/news", + outcome: "success", + phase: "initial", + triage_status: "extract_now", + records_extracted: 1, + }], + })}, llm_usage: { prompt_tokens: 1, completion_tokens: 1, diff --git a/backend/test/collection-browser-actions.test.ts b/backend/test/collection-browser-actions.test.ts new file mode 100644 index 0000000..91d9e9e --- /dev/null +++ b/backend/test/collection-browser-actions.test.ts @@ -0,0 +1,264 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + explicitBrowserActionsFromAgentResult, + explicitBrowserActionsFromAgentRuns, +} from "../BigSet_Data_Collection_Agent/src/orchestrator/browser-actions.js"; +import { + agentRunRecordSchema, + runReportSchema, +} from "../BigSet_Data_Collection_Agent/src/models/schemas.js"; + +/* Expects only browser_actions and agent_browser_actions to be read: the non-object entry and the generic actions key are ignored, and pageUrl fills in a missing url. */ test("explicit browser actions are copied from Agent results without generic inference", () => { + const actions = explicitBrowserActionsFromAgentResult({ + pageUrl: "https://example.com/start", + agentResult: { + browser_actions: [ + { + action: "navigate", + url: "https://example.com/start", + status: "succeeded", + phase: "initial", + }, + "not an action", + ], + agent_browser_actions: [{ + action: "click", + selector: "button[type=submit]", + target_text: "Submit", + value_description: "redacted", + status: "succeeded", + }], + actions: [{ + action: "click", + selector: "#generic-actions-are-ignored", + }], + }, + }); + + assert.equal(actions.length, 2); + 
assert.deepEqual(actions[0], { + action: "navigate", + url: "https://example.com/start", + status: "succeeded", + phase: "initial", + }); + assert.deepEqual(actions[1], { + action: "click", + url: "https://example.com/start", + selector: "button[type=submit]", + target_text: "Submit", + value_description: "redacted", + status: "succeeded", + }); +}); + +/* A navigation and extraction summary without explicit action arrays is expected to yield navigate, click and extract actions, all on pageUrl. */ test("Agent navigation summaries become replayable browser actions", () => { + const actions = explicitBrowserActionsFromAgentResult({ + pageUrl: "https://example.com/start", + agentResult: { + navigation: { + initial_url: "https://example.com/start", + category_clicked: "K-CUP® PODS", + }, + extraction: { + total_items: 25, + }, + }, + }); + + assert.deepEqual(actions, [ + { + action: "navigate", + url: "https://example.com/start", + status: "succeeded", + phase: "initial", + label: "agent-navigation-start", + }, + { + action: "click", + url: "https://example.com/start", + target_text: "K-CUP® PODS", + status: "succeeded", + phase: "navigation", + label: "agent-click-category", + }, + { + action: "extract", + url: "https://example.com/start", + status: "succeeded", + phase: "extract", + label: "agent-extract-results", + }, + ]); +}); + +/* Free-text reports are expected to be classified by their wording; the second report begins with Navigate but is expected back as a click on the named section. */ test("Agent string browser action reports become replayable actions", () => { + const actions = explicitBrowserActionsFromAgentResult({ + pageUrl: "https://example.com/store", + agentResult: { + agent_browser_actions: [ + "Navigate to the store page at https://example.com/store.", + "Navigate to the K-Cup Pods section of the store to locate product listings.", + "Extract the first 25 products, collecting name, price, image URL, stock status, numerical price, and source URL.", + ], + }, + }); + + assert.deepEqual(actions, [ + { + action: "navigate", + url: "https://example.com/store", + status: "succeeded", + phase: "navigation", + label: "Navigate to the store page at https://example.com/store.", + }, + { + action: "click", + url: "https://example.com/store", + target_text: "K-Cup 
Pods", + status: "succeeded", + phase: "navigation", + label: + "Navigate to the K-Cup Pods section of the store to locate product listings.", + }, + { + action: "extract", + url: "https://example.com/store", + status: "succeeded", + phase: "extract", + label: + "Extract the first 25 products, collecting name, price, image URL, stock status, numerical price, and source URL.", + }, + ]); +}); + +/* Parses browser action arrays through agentRunRecordSchema and runReportSchema and checks that they come back unchanged. */ test("Agent run records and run reports persist browser action arrays", () => { + const browserActions = [{ + action: "click", + url: "https://example.com/start", + selector: "button[type=submit]", + target_text: "Submit", + value_description: "redacted", + status: "succeeded", + phase: "initial", + }]; + const agentRun = agentRunRecordSchema.parse({ + url: "https://example.com/start", + status: "requires_form_submission", + run_id: "run-1", + agent_status: "COMPLETED", + goal: "Submit the form and extract the result.", + records_extracted: 1, + agent_step_count: 3, + has_streaming_url: true, + result_keys: ["records"], + browser_action_diagnostic: "Agent completed and returned rows, but polled run payload exposed no explicit browser actions.", + browser_actions: browserActions, + }); + + assert.equal(agentRun.agent_step_count, 3); + assert.equal(agentRun.has_streaming_url, true); + assert.deepEqual(agentRun.result_keys, ["records"]); + + assert.deepEqual( + explicitBrowserActionsFromAgentRuns([agentRun]), + browserActions + ); + + const report = runReportSchema.parse({ + run_id: "run-1", + prompt: "Find form-backed data.", + target_rows: 1, + started_at: "2026-05-23T00:00:00.000Z", + finished_at: "2026-05-23T00:00:01.000Z", + duration_ms: 1_000, + dataset_spec: datasetSpec(), + stats: { + ...phaseStats(), + records_after_merge: 1, + visualization_records: 1, + }, + initial: { + ...phaseStats(), + search_queries: ["example form"], + fetched_urls: ["https://example.com/start"], + failed_urls: [], + agent_browser_actions: browserActions, + }, + repair: { + attempted: true, + 
total_loops: 1, + loops: [{ + loop_index: 1, + repair_queries: ["example form details"], + agent_browser_actions: browserActions, + missing_fields: [], + records_before: 0, + records_after: 1, + fields_filled: {}, + stats: phaseStats(), + }], + missing_fields: [], + repair_queries: ["example form details"], + records_before: 0, + records_after: 1, + fields_filled: {}, + stats: phaseStats(), + }, + search_queries: ["example form", "example form details"], + fetched_urls: ["https://example.com/start"], + failed_urls: [], + errors: [], + }); + + assert.deepEqual(report.initial.agent_browser_actions, browserActions); + assert.deepEqual(report.repair.loops[0]?.agent_browser_actions, browserActions); +}); + +function datasetSpec() { + return { + intent_summary: "Find form-backed data.", + target_row_count: 1, + row_grain: "company", + columns: [{ + name: "entity_name", + type: "string", + description: "Entity name", + required: true, + }], + dedupe_keys: ["entity_name"], + search_queries: ["example form"], + extraction_hints: "Use source-backed rows.", + }; +} + +function phaseStats() { + return { + search_queries_executed: 1, + search_results_collected: 1, + unique_urls_selected: 1, + pages_fetched: 1, + pages_failed: 0, + raw_records_extracted: 1, + triage: { + pages_triaged: 1, + by_status: { + requires_form_submission: 1, + }, + extract_now: 0, + agent_candidates: 1, + agent_dispatched: 1, + agent_deferred: 0, + agent_succeeded: 1, + agent_failed: 0, + skipped: 0, + records_from_extract: 0, + records_from_agent: 1, + agent_reported_step_count: 3, + agent_runs_with_streaming_url: 1, + agent_runs_with_explicit_browser_actions: 1, + }, + }; +} diff --git a/backend/test/collection-source-policy.test.ts b/backend/test/collection-source-policy.test.ts index 48b6ac2..eb990c6 100644 --- a/backend/test/collection-source-policy.test.ts +++ b/backend/test/collection-source-policy.test.ts @@ -122,6 +122,63 @@ test("prompt source policy prefers entity-owned domains over 
third-party proof", ); }); +test("prompt source policy trusts explicit source URLs from the user's prompt", () => { + const policy = derivePromptSourcePolicy( + "make a table from these public OpenAI API docs pages with only page title and page URL: https://developers.openai.com/api/docs/mcp https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ); + const triage: SourceTriageResult = { + url: "https://developers.openai.com/api/docs/mcp", + final_url: "https://developers.openai.com/api/docs/mcp", + title: "Building MCP servers for ChatGPT Apps and API integrations", + status: "extract_now", + confidence: 0.9, + source_data_confidence: 0.8, + expected_yield: "complete", + reasoning: "User provided this exact source URL.", + }; + const spec: DatasetSpec = { + intent_summary: "Collect docs pages.", + target_row_count: 2, + row_grain: "one row per docs page", + columns: [ + { + name: "page_url", + type: "string", + description: "Page URL.", + required: true, + }, + { + name: "page_title", + type: "string", + description: "Page title.", + required: true, + }, + ], + dedupe_keys: ["page_url"], + search_queries: [], + extraction_hints: "", + }; + const record: ExtractedRecord = { + row: { + page_url: "https://developers.openai.com/api/docs/mcp", + page_title: "Building MCP servers for ChatGPT Apps and API integrations", + }, + evidence: [], + source_urls: ["https://developers.openai.com/api/docs/mcp"], + }; + + assert.deepEqual(policy.explicitSourceUrls, [ + "https://developers.openai.com/api/docs/mcp", + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ]); + assert.equal( + urlMatchesPromptSourcePolicy("https://developers.openai.com/api/docs/mcp", policy), + true, + ); + assert.equal(applyPromptSourcePolicyToTriageResult(triage, policy).status, "extract_now"); + assert.equal(recordMatchesPromptSourcePolicy(record, spec, policy), true); +}); + test("prompt source policy downgrades third-party extraction triage", () => { const policy 
= derivePromptSourcePolicy( "For Stripe, Paddle, and Chargebee, collect the official pricing page URL and plan names.", diff --git a/backend/test/populate-browser-action-box.test.ts b/backend/test/populate-browser-action-box.test.ts new file mode 100644 index 0000000..f05dc2f --- /dev/null +++ b/backend/test/populate-browser-action-box.test.ts @@ -0,0 +1,682 @@ +import assert from "node:assert/strict"; +import { existsSync } from "node:fs"; +import { test } from "node:test"; + +import { + BrowserActionBox, + classifyReplayFailure, + createPlaywrightScriptArtifact, + validateReplayAgentCompatibleResult, +} from "../src/pipeline/populate-browser-action-box.js"; +import { + buildPopulateFetchPlan, + directRowsFromFetchedPage, + rankPopulateSearchResults, + triageFetchedPageForPopulate, +} from "../src/pipeline/populate-source-planner.js"; +import { normalizeTinyFishRecordedTrace } from "../src/pipeline/populate-tinyfish-trace-recorder.js"; +import { + createDeterministicPlaywrightRepair, + createLocalPlaywrightReplayRunner, +} from "../src/pipeline/populate-playwright-replay-runner.js"; +import type { DatasetContext } from "../src/pipeline/populate.js"; + +const context: DatasetContext = { + datasetId: "dataset-browser-action-box", + datasetName: "Browser action box", + description: "Find official pricing pages for OpenAI and Anthropic.", + columns: [ + { + name: "provider", + type: "text", + description: "Provider name.", + }, + { + name: "source_url", + type: "url", + description: "Official source URL.", + }, + { + name: "evidence_quote", + type: "text", + description: "Evidence quote.", + }, + ], +}; + +const browserSchema = { + columns: context.columns.map((column) => ({ + name: column.name, + description: column.description, + required: true, + })), +}; + +test("source planner ranks, dedupes, deprioritizes low-trust URLs, and caps fetches", () => { + const ranked = rankPopulateSearchResults({ + context, + results: [ + { + title: "Someone discusses OpenAI 
pricing", + snippet: "A forum thread.", + url: "https://reddit.com/r/openai/comments/1", + }, + { + title: "OpenAI API Pricing", + snippet: "Official API pricing for current public models.", + url: "https://openai.com/api/pricing#models", + }, + { + title: "OpenAI API Pricing duplicate", + snippet: "Official docs and pricing.", + url: "https://openai.com/api/pricing", + }, + { + title: "Anthropic pricing", + snippet: "Official Claude pricing details and docs.", + url: "https://docs.anthropic.com/en/docs/about-claude/pricing", + }, + ], + }); + + assert.equal(ranked.length, 3); + assert.deepEqual( + ranked.slice(0, 2).map((result) => result.canonicalUrl).sort(), + [ + "https://docs.anthropic.com/en/docs/about-claude/pricing", + "https://openai.com/api/pricing", + ] + ); + assert.ok((ranked[0]?.expectationScore ?? 0) > (ranked[2]?.expectationScore ?? 0)); + assert.match(ranked[2]?.lowTrustReason ?? "", /low-trust/); + assert.deepEqual(buildPopulateFetchPlan({ rankedResults: ranked, fetchLimit: 2 }).sort(), [ + "https://docs.anthropic.com/en/docs/about-claude/pricing", + "https://openai.com/api/pricing", + ]); +}); + +test("fetch triage separates direct extraction from browser-heavy pages", () => { + const direct = triageFetchedPageForPopulate({ + context, + url: "https://openai.com/api/pricing", + page: { + title: "OpenAI API Pricing", + text: "OpenAI official pricing. Input tokens and output tokens are listed for every model.".repeat(4), + }, + }); + assert.equal(direct.status, "extract_now"); + + const form = triageFetchedPageForPopulate({ + context, + url: "https://example.com/locator", + page: { + title: "Location finder", + text: "Enter your zip code and submit the form to search current locations.", + }, + }); + assert.equal(form.status, "requires_form_submission"); + + const blocked = triageFetchedPageForPopulate({ + context, + url: "https://example.com/protected", + page: { + title: "Verify", + text: "Please verify you are human. 
Captcha required.", + }, + }); + assert.equal(blocked.status, "blocked"); +}); + +test("direct fetch extraction only fills title/url schemas without browser spend", () => { + const rows = directRowsFromFetchedPage({ + context: { + ...context, + columns: [ + { name: "Post Title", type: "text", description: "Title." }, + { name: "Post URL", type: "url", description: "URL." }, + ], + }, + url: "https://openai.com/news/product-releases/introducing-gpt-5", + page: { + title: "Introducing GPT-5", + text: "Introducing GPT-5\nOpenAI product release details.", + }, + }); + + assert.equal(rows.length, 1); + assert.equal(rows[0]?.cells["Post Title"], "Introducing GPT-5"); + assert.equal( + rows[0]?.cells["Post URL"], + "https://openai.com/news/product-releases/introducing-gpt-5" + ); + assert.equal(rows[0]?.evidence[0]?.quote, "Introducing GPT-5"); +}); + +test("BrowserActionBox first run records TinyFish trace and emits draft script when actions are explicit", async () => { + const box = new BrowserActionBox({ + now: () => new Date("2026-05-24T00:00:00.000Z"), + tinyFishClient: { + async runAgent() { + return { + runId: "run-123", + status: "COMPLETED", + sseEvents: [{ type: "PROGRESS", message: "Clicked Pricing" }], + runDetail: { + run_id: "run-123", + status: "COMPLETED", + streaming_url: "https://agent.tinyfish.ai/runs/run-123/live", + steps: [{ + id: "step-1", + action: "navigate", + status: "completed", + url: "https://openai.com/api/pricing", + screenshot_url: "https://agent.tinyfish.ai/runs/run-123/step-1.jpg", + }, { + id: "step-2", + action: "click", + status: "completed", + target_text: "Pricing", + url: "https://openai.com/api/pricing", + }], + }, + finalResult: { + records: [{ + provider: "OpenAI", + source_url: "https://openai.com/api/pricing", + evidence_quote: "OpenAI official pricing", + evidence: [{ + field: "provider", + url: "https://openai.com/api/pricing", + quote: "OpenAI official pricing", + }], + }], + agent_browser_actions: [{ + action: "click", + 
url: "https://openai.com/api/pricing", + target_text: "Pricing", + status: "succeeded", + }], + }, + }; + }, + }, + }); + + const output = await box.firstRun({ + sourceUrl: "https://openai.com/api/pricing", + datasetGoalPrompt: context.description, + datasetSchema: browserSchema, + runCaps: { + maxAgentSteps: 8, + maxDurationSeconds: 120, + captureHtml: true, + captureScreenshots: true, + }, + }); + + assert.equal(output.trace.runId, "run-123"); + assert.equal(output.trace.normalizedBrowserActions.length, 2); + assert.equal(output.runtimeResult.rows.length, 1); + assert.equal(output.replayReadiness.status, "ready"); + assert.ok(output.playwrightScript); + assert.match(output.playwrightScript?.code ?? "", /runDatasetRecipe/); + assert.ok( + output.runtimeResult.debug?.diagnosticArtifacts?.some((artifact) => + artifact.kind === "tinyfish-trace" + ) + ); +}); + +test("BrowserActionBox first run accepts raw TinyFish result arrays and raw run steps", async () => { + const box = new BrowserActionBox({ + now: () => new Date("2026-05-24T00:00:00.000Z"), + tinyFishClient: { + async runAgent() { + return { + runId: "run-raw", + status: "COMPLETED", + sseEvents: [{ + type: "PROGRESS", + purpose: "Extract article cards.", + timestamp: "2026-05-24T00:00:00.000Z", + streaming_url: "https://agent.tinyfish.ai/private-preview", + }], + runDetail: { + run_id: "run-raw", + status: "COMPLETED", + steps: [{ + id: "step-1", + status: "RUNNING", + action: "Navigate to the source page.", + screenshot: "https://agent.tinyfish.ai/runs/run-raw/step-1.jpg", + duration: 1000, + }, { + id: "step-2", + status: "RUNNING", + action: "Extract titles and URLs.", + html: "https://agent.tinyfish.ai/runs/run-raw/step-2.html", + duration: 500, + }], + }, + finalResult: { + result: [{ + provider: "OpenAI", + source_url: "https://openai.com/api/pricing", + evidence_quote: "OpenAI official pricing", + agent_browser_actions: [ + "visit_url_tool: https://openai.com/api/pricing", + ], + }], + }, + }; + }, + 
}, + }); + + const output = await box.firstRun({ + sourceUrl: "https://openai.com/api/pricing", + datasetGoalPrompt: context.description, + datasetSchema: browserSchema, + runCaps: { + maxAgentSteps: 8, + maxDurationSeconds: 120, + captureHtml: true, + captureScreenshots: true, + }, + }); + + assert.equal(output.runtimeResult.rows.length, 1); + assert.equal(output.runtimeResult.rows[0]?.evidence[0]?.quote, "OpenAI official pricing"); + assert.equal(output.replayReadiness.status, "ready"); + assert.ok(output.playwrightScript); + assert.ok(output.trace.artifactRefs.some((artifact) => artifact.kind === "screenshot")); + assert.ok(output.trace.artifactRefs.some((artifact) => artifact.kind === "html")); +}); + +test("TinyFish trace normalization redacts streaming URLs from SSE data", () => { + const trace = normalizeTinyFishRecordedTrace({ + sourceUrl: "https://openai.com/api/pricing", + goal: "Collect pricing.", + runId: "run-redacted", + status: "COMPLETED", + sseEvents: [{ + type: "PROGRESS", + purpose: "Click pricing.", + streaming_url: "https://agent.tinyfish.ai/private-preview", + timestamp: "2026-05-24T00:00:00.000Z", + }], + runDetail: { + run_id: "run-redacted", + status: "COMPLETED", + steps: [{ + id: "step-1", + action: "Navigate to source page.", + status: "COMPLETED", + }], + }, + finalResult: { result: [] }, + }); + + assert.equal(trace.sseEvents[0]?.message, "Click pricing."); + assert.equal(trace.sseEvents[0]?.data?.streaming_url, undefined); + assert.equal(trace.normalizedBrowserActions[0]?.url, "https://openai.com/api/pricing"); +}); + +test("BrowserActionBox replay returns candidate rows without calling TinyFish Agent", async () => { + let tinyFishCalls = 0; + let replayCalls = 0; + const script = scriptArtifact("console.log('replay');"); + const box = new BrowserActionBox({ + tinyFishClient: { + async runAgent() { + tinyFishCalls += 1; + throw new Error("TinyFish should not run during replay"); + }, + }, + async runPlaywrightScript() { + replayCalls 
+= 1; + return { + agentCompatibleResult: agentCompatibleRows(), + trace: { + status: "succeeded", + steps: [{ + kind: "browser", + label: "playwright-replay", + status: "succeeded", + }], + }, + }; + }, + }); + + const output = await box.replay(replayInput(script)); + + assert.equal(output.replayStatus, "replay_succeeded"); + assert.equal(output.runtimeResult?.rows.length, 1); + assert.equal(tinyFishCalls, 0); + assert.equal(replayCalls, 1); +}); + +test("BrowserActionBox repair is one-shot and only emits repaired script after validation passes", async () => { + const calls: string[] = []; + const script = scriptArtifact("throw new Error('stale selector');"); + const repaired = scriptArtifact("console.log('repaired');"); + const box = new BrowserActionBox({ + tinyFishClient: { + async runAgent() { + throw new Error("TinyFish should not run during replay repair"); + }, + }, + async runPlaywrightScript({ script: currentScript }) { + calls.push(currentScript.code); + if (currentScript.code.includes("stale selector")) { + return { + agentCompatibleResult: null, + error: "locator button.old timed out", + trace: { + status: "failed", + failedStepIndex: 1, + failedAction: "click old button", + currentUrl: "https://openai.com/api/pricing", + }, + }; + } + return { + agentCompatibleResult: agentCompatibleRows(), + trace: { status: "succeeded" }, + }; + }, + async repairPlaywrightScript() { + return repaired; + }, + }); + + const output = await box.replay(replayInput(script)); + + assert.equal(output.replayStatus, "repair_promoted"); + assert.equal(output.repairedPlaywrightScript?.code, repaired.code); + assert.deepEqual(calls, [script.code, repaired.code]); +}); + +test("local Playwright replay runner executes a script and extracts rows", { + skip: !localChromiumExecutable(), +}, async () => { + const sourceUrl = "https://example.com/releases"; + const datasetSchema = { + columns: [ + { name: "post_title", required: true }, + { name: "post_url", required: true }, + { name: 
"evidence_quote", required: true }, + ], + }; + const script = createPlaywrightScriptArtifact({ + sourceUrl, + datasetGoalPrompt: "Collect product releases with titles, URLs, and evidence.", + datasetSchema, + code: ` + export async function runDatasetRecipe(context) { + await context.page.setContent(\` +
+ Alpha product release May 1, 2026 + Beta product release May 2, 2026 +
+ \`); + return { rows: [] }; + } + `, + status: "promoted", + createdAt: "2026-05-24T00:00:00.000Z", + }); + const runner = createLocalPlaywrightReplayRunner({ + executablePath: localChromiumExecutable(), + }); + + const output = await runner({ + sourceUrl, + datasetGoalPrompt: "Collect product releases with titles, URLs, and evidence.", + datasetSchema, + currentPlaywrightScript: script, + script, + previousSuccessfulOutputProfile: { + fieldsPreviouslyRetrieved: ["post_title", "post_url", "evidence_quote"], + rowCountRange: { min: 2 }, + sourceUrls: [sourceUrl], + evidenceRequired: true, + }, + runCaps: { + maxReplayAttempts: 1, + maxRepairAttempts: 1, + timeoutMs: 15_000, + }, + }); + + assert.equal(output.error, undefined); + assert.equal(output.trace?.status, "succeeded"); + assert.equal( + validateReplayAgentCompatibleResult({ + agentCompatibleResult: output.agentCompatibleResult, + profile: { + fieldsPreviouslyRetrieved: ["post_title", "post_url", "evidence_quote"], + rowCountRange: { min: 2 }, + sourceUrls: [sourceUrl], + evidenceRequired: true, + }, + }).isValid, + true + ); +}); + +test("local Playwright replay runner extracts current page evidence when links are absent", { + skip: !localChromiumExecutable(), +}, async () => { + const sourceUrl = "https://example.com/releases"; + const datasetSchema = { + columns: [ + { name: "page_title", required: true }, + { name: "source_url", required: true }, + { name: "evidence_quote", required: true }, + ], + }; + const script = createPlaywrightScriptArtifact({ + sourceUrl, + datasetGoalPrompt: "Collect page title, URL, and visible evidence.", + datasetSchema, + code: ` + export async function runDatasetRecipe(context) { + await context.page.setContent(\` +
+

Example release notes

+

Visible release evidence from the current public page.

+
+ \`); + return { rows: [] }; + } + `, + status: "promoted", + createdAt: "2026-05-24T00:00:00.000Z", + }); + const runner = createLocalPlaywrightReplayRunner({ + executablePath: localChromiumExecutable(), + }); + + const output = await runner({ + sourceUrl, + datasetGoalPrompt: "Collect page title, URL, and visible evidence.", + datasetSchema, + currentPlaywrightScript: script, + script, + previousSuccessfulOutputProfile: { + fieldsPreviouslyRetrieved: ["page_title", "source_url", "evidence_quote"], + rowCountRange: { min: 1 }, + sourceUrls: [sourceUrl], + evidenceRequired: true, + }, + runCaps: { + maxReplayAttempts: 1, + maxRepairAttempts: 1, + timeoutMs: 15_000, + }, + }); + + assert.equal(output.error, undefined); + assert.equal( + validateReplayAgentCompatibleResult({ + agentCompatibleResult: output.agentCompatibleResult, + profile: { + fieldsPreviouslyRetrieved: ["page_title", "source_url", "evidence_quote"], + rowCountRange: { min: 1 }, + sourceUrls: [sourceUrl], + evidenceRequired: true, + }, + }).isValid, + true + ); +}); + +test("deterministic Playwright repair retargets generated script URLs to the source URL", async () => { + const repair = createDeterministicPlaywrightRepair(); + const broken = createPlaywrightScriptArtifact({ + sourceUrl: "https://example.com/releases", + datasetGoalPrompt: context.description, + datasetSchema: browserSchema, + code: ` + const browserActions = [{"action":"navigate","url":"https://example.invalid/broken"}]; + const sourceUrls = ["https://example.invalid/broken"]; + export async function runDatasetRecipe() { return { rows: [], sourceUrls }; } + `, + status: "promoted", + createdAt: "2026-05-24T00:00:00.000Z", + }); + + const repaired = await repair({ + ...replayInput(broken), + sourceUrl: "https://example.com/releases", + failedReplay: { + status: "failed", + startedAt: "2026-05-24T00:00:00.000Z", + completedAt: "2026-05-24T00:00:01.000Z", + scriptId: broken.scriptId, + sourceUrl: "https://example.com/releases", + 
currentUrl: "https://example.invalid/broken", + error: "navigation failed", + diagnostics: ["script failure"], + steps: [], + }, + diagnostics: ["script failure"], + }); + + assert.ok(repaired); + assert.match(repaired.code, /https:\/\/example\.com\/releases/); + assert.doesNotMatch(repaired.code, /example\.invalid/); +}); + +test("replay validation and classification distinguish broken scripts from validation failures", () => { + assert.deepEqual( + validateReplayAgentCompatibleResult({ + agentCompatibleResult: { records: [] }, + profile: { + fieldsPreviouslyRetrieved: ["provider"], + rowCountRange: { min: 1 }, + sourceUrls: ["https://openai.com/api/pricing"], + evidenceRequired: true, + }, + }), + { + isValid: false, + issues: [ + "Replay returned 0 row(s), below previous minimum 1.", + "Replay missed previously retrieved field(s): provider.", + "Replay returned no evidence-backed rows.", + ], + } + ); + + assert.equal( + classifyReplayFailure({ + replayTrace: { + status: "failed", + startedAt: "2026-05-24T00:00:00.000Z", + completedAt: "2026-05-24T00:00:01.000Z", + scriptId: "script", + sourceUrl: "https://example.com", + error: "locator timed out", + diagnostics: [], + steps: [], + }, + validationIssues: [], + }), + "script failure" + ); + assert.equal( + classifyReplayFailure({ + replayTrace: { + status: "failed", + startedAt: "2026-05-24T00:00:00.000Z", + completedAt: "2026-05-24T00:00:01.000Z", + scriptId: "script", + sourceUrl: "https://example.com", + error: "Captcha required", + diagnostics: [], + steps: [], + }, + validationIssues: [], + }), + "blocked/captcha/auth wall" + ); +}); + +function replayInput(script: ReturnType) { + return { + sourceUrl: "https://openai.com/api/pricing", + datasetGoalPrompt: context.description, + datasetSchema: browserSchema, + currentPlaywrightScript: script, + previousSuccessfulOutputProfile: { + fieldsPreviouslyRetrieved: ["provider", "source_url", "evidence_quote"], + rowCountRange: { min: 1 }, + sourceUrls: 
["https://openai.com/api/pricing"], + evidenceRequired: true, + }, + runCaps: { + maxReplayAttempts: 1 as const, + maxRepairAttempts: 1 as const, + timeoutMs: 30_000, + }, + }; +} + +function scriptArtifact(code: string) { + return createPlaywrightScriptArtifact({ + sourceUrl: "https://openai.com/api/pricing", + datasetGoalPrompt: context.description, + datasetSchema: browserSchema, + code, + status: "promoted", + createdAt: "2026-05-24T00:00:00.000Z", + }); +} + +function agentCompatibleRows() { + return { + records: [{ + provider: "OpenAI", + source_url: "https://openai.com/api/pricing", + evidence_quote: "OpenAI official pricing", + evidence: [{ + field: "provider", + url: "https://openai.com/api/pricing", + quote: "OpenAI official pricing", + }], + }], + }; +} + +function localChromiumExecutable(): string | undefined { + return [ + process.env.POPULATE_PLAYWRIGHT_EXECUTABLE_PATH, + "/usr/bin/chromium-browser", + "/usr/bin/chromium", + "/Applications/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing", + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + ].find((candidate): candidate is string => Boolean(candidate && existsSync(candidate))); +} diff --git a/backend/test/populate-convex-writer.test.ts b/backend/test/populate-convex-writer.test.ts index d347b9f..7a60c4d 100644 --- a/backend/test/populate-convex-writer.test.ts +++ b/backend/test/populate-convex-writer.test.ts @@ -52,12 +52,17 @@ test("Convex populate row writer uses one atomic replace mutation", async () => assert.equal(calls[0]?.functionReference, replaceByDataset); assert.deepEqual(calls[0]?.args, { datasetId: "dataset-ai-posts", - rows: [{ - data: { - entity_name: "OpenAI", - source_url: "https://openai.com/news", - }, - sources: ["https://openai.com/news"], - }], - }); + rows: [{ + data: { + entity_name: "OpenAI", + source_url: "https://openai.com/news", + }, + sources: ["https://openai.com/news"], + evidence: [{ + columnName: "entity_name", + sourceUrl: 
"https://openai.com/news", + quote: "OpenAI", + }], + }], + }); }); diff --git a/backend/test/populate-dataset-owner-loader.test.ts b/backend/test/populate-dataset-owner-loader.test.ts new file mode 100644 index 0000000..52e05ae --- /dev/null +++ b/backend/test/populate-dataset-owner-loader.test.ts @@ -0,0 +1,67 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { ConvexPopulateDatasetOwnerLoader } from "../src/pipeline/populate-dataset-owner-loader.js"; + +test("Convex dataset owner loader uses trusted system populate query", async () => { + const getForSystemPopulate = Symbol("getForSystemPopulate"); + const calls: Array<{ functionReference: unknown; args: unknown }> = []; + const loader = new ConvexPopulateDatasetOwnerLoader({ + internalApi: { + datasets: { + getForSystemPopulate, + }, + }, + convexClient: { + async query(functionReference, args) { + calls.push({ functionReference, args }); + return { ownerId: "user-1" }; + }, + }, + }); + + const dataset = await loader.loadDataset("dataset-ai-posts"); + + assert.deepEqual(calls, [{ + functionReference: getForSystemPopulate, + args: { id: "dataset-ai-posts" }, + }]); + assert.deepEqual(dataset, { ownerId: "user-1" }); +}); + +test("Convex dataset owner loader returns null for missing dataset", async () => { + const loader = new ConvexPopulateDatasetOwnerLoader({ + internalApi: { + datasets: { + getForSystemPopulate: Symbol("getForSystemPopulate"), + }, + }, + convexClient: { + async query() { + return null; + }, + }, + }); + + assert.equal(await loader.loadDataset("missing-dataset"), null); +}); + +test("Convex dataset owner loader rejects malformed dataset owner", async () => { + const loader = new ConvexPopulateDatasetOwnerLoader({ + internalApi: { + datasets: { + getForSystemPopulate: Symbol("getForSystemPopulate"), + }, + }, + convexClient: { + async query() { + return { ownerId: "" }; + }, + }, + }); + + await assert.rejects( + loader.loadDataset("dataset-ai-posts"), + 
/Dataset dataset-ai-posts is missing ownerId/ + ); +}); diff --git a/backend/test/populate-playwright-readiness.test.ts b/backend/test/populate-playwright-readiness.test.ts new file mode 100644 index 0000000..cd95a09 --- /dev/null +++ b/backend/test/populate-playwright-readiness.test.ts @@ -0,0 +1,144 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { playwrightCandidateReadinessForRun } from "../src/pipeline/populate-playwright-readiness.js"; +import type { PopulateRuntimeResult } from "../src/pipeline/populate-runtime.js"; + +test("Playwright candidate readiness rejects search/fetch-only traces", () => { + const readiness = playwrightCandidateReadinessForRun({ + result: runtimeResult({ + processTrace: { + runtime: "collection", + searchQueries: ["OpenAI latest blog"], + fetchedUrls: ["https://openai.com/news"], + sourceArtifacts: [{ + url: "https://openai.com/news", + status: "succeeded", + source: "fetch", + label: "news", + }], + selectedRowSource: "collection_pipeline", + notes: [], + steps: [{ + kind: "fetch", + label: "collection-fetched-url", + status: "succeeded", + input: { url: "https://openai.com/news" }, + }], + }, + }), + }); + + assert.equal(readiness.status, "not_ready"); + assert.equal(readiness.browserStepCount, 0); + assert.match(readiness.reasons.join("\n"), /no actionable browser steps/i); +}); + +test("Playwright candidate readiness rejects Agent-disabled capability diagnostics", () => { + const readiness = playwrightCandidateReadinessForRun({ + result: runtimeResult({ + validationIssues: [ + "Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 1 page(s).", + ], + processTrace: { + runtime: "collection", + searchQueries: [], + fetchedUrls: ["https://example.com/form"], + sourceArtifacts: [{ + url: "https://example.com/form", + status: "succeeded", + source: "fetch", + }], + selectedRowSource: "collection_pipeline", + notes: [], + steps: [{ + kind: 
"browser", + label: "agent-navigation", + status: "succeeded", + browserAction: { + action: "navigate", + url: "https://example.com/form", + }, + }], + }, + }), + }); + + assert.equal(readiness.status, "not_ready"); + assert.match(readiness.reasons.join("\n"), /Agent\/browser follow-up/i); +}); + +test("Playwright candidate readiness accepts browser-action traces anchored to sources", () => { + const readiness = playwrightCandidateReadinessForRun({ + result: runtimeResult({ + processTrace: { + runtime: "collection", + searchQueries: [], + fetchedUrls: ["https://example.com/form"], + sourceArtifacts: [{ + url: "https://example.com/form", + status: "succeeded", + source: "agent", + label: "browser-canary", + }], + selectedRowSource: "collection_pipeline", + notes: [], + steps: [{ + kind: "browser", + label: "agent-form-submit", + status: "succeeded", + browserAction: { + action: "click", + url: "https://example.com/form", + selector: "button[type=submit]", + }, + }], + }, + }), + }); + + assert.equal(readiness.status, "ready"); + assert.deepEqual(readiness.reasons, []); + assert.equal(readiness.browserStepCount, 1); + assert.equal(readiness.sourceUrlCount, 1); +}); + +function runtimeResult(input: { + validationIssues?: string[]; + processTrace?: NonNullable["processTrace"]; +}): PopulateRuntimeResult { + return { + rows: [{ + cells: { + entity_name: "OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "evidence_quote", + sourceUrl: "https://openai.com/news", + quote: "Release notes", + }], + needsReview: false, + }], + validationIssues: input.validationIssues ?? [], + usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 }, + metrics: { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }, + debug: input.processTrace + ? 
{ + capturedRows: [], + capturedSources: [], + selectedRowSource: "collection_pipeline", + notes: [], + processTrace: input.processTrace, + } + : undefined, + }; +} diff --git a/backend/test/populate-runtime.test.ts b/backend/test/populate-runtime.test.ts index 5172f8e..ae44307 100644 --- a/backend/test/populate-runtime.test.ts +++ b/backend/test/populate-runtime.test.ts @@ -1,6 +1,7 @@ import assert from "node:assert/strict"; import { test } from "node:test"; +import { BrowserActionBox } from "../src/pipeline/populate-browser-action-box.js"; import { runPopulateRuntime } from "../src/pipeline/populate-runtime.js"; interface ToolLike { @@ -92,6 +93,39 @@ test("populate runtime captures rows through injected tools without Convex write assert.deepEqual(result.validationIssues, []); }); +test("populate runtime strips unbacked insert_row evidence before validation", async () => { + const result = await runPopulateRuntime({ + context, + webTools: { + search: async () => [], + fetch: async () => ({}), + }, + agentRunner: async ({ tools }) => { + const insertRow = tools.insert_row as ToolLike< + { datasetId: string; data: Record }, + { success: boolean } + >; + + await insertRow.execute({ + datasetId: "benchmark-dataset", + data: { + entity_name: "OpenAI", + latest_post_title: "Invented post", + source_url: "https://openai.com/news", + evidence_quote: "Invented quote never fetched", + }, + }); + }, + }); + + assert.equal(result.rows.length, 1); + assert.deepEqual(result.rows[0]?.evidence, []); + assert.match( + result.validationIssues.join("\n"), + /evidence quotes/i + ); +}); + test("populate runtime accepts structured fallback rows backed by captured sources", async () => { const result = await runPopulateRuntime({ context, @@ -147,6 +181,390 @@ test("populate runtime accepts structured fallback rows backed by captured sourc assert.deepEqual(result.validationIssues, []); }); +test("populate runtime can pre-rank and fetch planned sources before agent work", async () => { 
+ const calls: string[] = []; + const result = await runPopulateRuntime({ + context, + sourcePlanner: { + enabled: true, + fetchLimit: 1, + }, + webTools: { + search: async ({ query }) => { + calls.push(`search:${query}`); + return [ + { + title: "Forum copy", + snippet: "OpenAI pricing discussion", + url: "https://reddit.com/r/openai/comments/pricing", + }, + { + title: "OpenAI API Pricing", + snippet: "Official API pricing page.", + url: "https://openai.com/api/pricing", + }, + ]; + }, + fetch: async ({ url }) => { + calls.push(`fetch:${url}`); + return { + title: "OpenAI API Pricing", + text: "Official API pricing page.", + }; + }, + }, + agentRunner: async () => ({ + rows: [{ + cells: { + entity_name: "OpenAI", + latest_post_title: "OpenAI API Pricing", + source_url: "https://openai.com/api/pricing", + evidence_quote: "Official API pricing page.", + }, + sourceUrls: ["https://openai.com/api/pricing"], + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/api/pricing", + quote: "Official API pricing page.", + }], + }], + }), + }); + + assert.ok(calls.some((call) => call.startsWith("search:"))); + assert.deepEqual( + calls.filter((call) => call.startsWith("fetch:")), + ["fetch:https://openai.com/api/pricing"] + ); + assert.equal(result.rows.length, 1); + assert.equal(result.metrics.searchCalls, 1); + assert.equal(result.metrics.fetchCalls, 1); + assert.ok(result.debug?.processTrace.steps.some((step) => + step.label === "source-planner-search" + )); +}); + +test("populate runtime plans searches from user prompt before durable recipe instructions", async () => { + const searchQueries: string[] = []; + + await runPopulateRuntime({ + context: { + ...context, + description: [ + "nyc big techs that are hiring", + "", + "Durable recipe instructions:", + "Use search_web before fetch_page unless an official source URL is already obvious.", + "Prefer official docs, pricing, blog, product, or company pages over third-party summaries.", + 
].join("\n"), + columns: [ + { + name: "Company Name", + type: "text", + description: "The official name of the big tech company.", + }, + { + name: "Careers Page URL", + type: "url", + description: "Direct URL to the company's careers page.", + nullable: true, + }, + ], + }, + sourcePlanner: { + enabled: true, + fetchLimit: 0, + }, + webTools: { + search: async ({ query }) => { + searchQueries.push(query); + return []; + }, + fetch: async () => { + throw new Error("fetch should not run with fetchLimit 0"); + }, + }, + agentRunner: async () => ({ rows: [] }), + }); + + assert.deepEqual(searchQueries, [ + "nyc big techs that are hiring official source", + ]); +}); + +test("populate runtime calls BrowserActionBox for browser-heavy fetched sources", async () => { + let browserActionBoxCalls = 0; + const browserActionBox = new BrowserActionBox({ + tinyFishClient: { + async runAgent() { + browserActionBoxCalls += 1; + return { + runId: "run-browser-heavy", + status: "COMPLETED", + runDetail: { + run_id: "run-browser-heavy", + status: "COMPLETED", + steps: [{ + action: "navigate", + status: "completed", + url: "https://example.com/locator", + }, { + action: "type", + status: "completed", + target_text: "Search", + value: "OpenAI", + url: "https://example.com/locator", + }], + }, + finalResult: { + records: [{ + entity_name: "OpenAI", + latest_post_title: "OpenAI locator result", + source_url: "https://example.com/locator", + evidence_quote: "OpenAI locator result", + evidence: [{ + field: "latest_post_title", + url: "https://example.com/locator", + quote: "OpenAI locator result", + }], + }], + agent_browser_actions: [{ + action: "type", + url: "https://example.com/locator", + target_text: "Search", + value_description: "redacted typed value", + }], + }, + }; + }, + }, + }); + + const result = await runPopulateRuntime({ + context, + browserActionBox, + sourcePlanner: { + enabled: true, + fetchLimit: 1, + }, + webTools: { + search: async () => [ + { + title: "Example 
locator", + snippet: "Search and filter current entries.", + url: "https://example.com/locator", + }, + ], + fetch: async () => ({ + title: "Example locator", + text: "Enter your search term and submit the form to see current entries.", + }), + }, + agentRunner: async () => ({ rows: [] }), + }); + + assert.equal(browserActionBoxCalls, 1); + assert.equal(result.rows.length, 1); + assert.equal(result.rows[0]?.cells.latest_post_title, "OpenAI locator result"); + assert.ok(result.debug?.processTrace.steps.some((step) => + step.label === "tinyfish-agent-run" + )); + assert.ok(result.debug?.diagnosticArtifacts?.some((artifact) => + artifact.kind === "tinyfish-trace" + )); +}); + +test("populate runtime builds simple title URL rows from captured sources", async () => { + const result = await runPopulateRuntime({ + context: { + datasetId: "product-releases", + datasetName: "OpenAI product releases", + description: + "find OpenAI product release articles from https://openai.com/news/product-releases/ with post title and post URL", + columns: [ + { + name: "Post Title", + type: "text" as const, + description: "Post title.", + }, + { + name: "Post URL", + type: "url" as const, + description: "Post URL.", + }, + ], + }, + webTools: { + search: async () => [ + { + title: "OpenAI Newsroom | Product", + snippet: "Product release listing page.", + url: "https://openai.com/news/product-releases/", + }, + { + title: "Introducing GPT-5", + snippet: "OpenAI product release post.", + url: "https://openai.com/index/introducing-gpt-5/", + }, + ], + fetch: async () => ({ + title: "OpenAI Newsroom | Product", + text: "Product release listing page.", + }), + }, + agentRunner: async ({ tools }) => { + const searchWeb = tools.search_web as ToolLike< + { query: string }, + { results?: unknown[] } + >; + await searchWeb.execute({ query: "OpenAI product releases" }); + return { + rows: [{ + cells: { + "Post Title": "OpenAI Newsroom | Product", + "Post URL": 
"https://openai.com/news/product-releases/", + }, + sourceUrls: ["https://openai.com/news/product-releases/"], + evidence: [{ + columnName: "Post Title", + sourceUrl: "https://openai.com/news/product-releases/", + quote: "OpenAI Newsroom | Product", + }], + }], + validationIssues: [ + "Individual article URLs are not present in the transcript; only the listing page URL is available.", + ], + }; + }, + }); + + assert.equal(result.rows.length, 1); + assert.equal(result.rows[0]?.cells["Post Title"], "Introducing GPT-5"); + assert.equal( + result.rows[0]?.cells["Post URL"], + "https://openai.com/index/introducing-gpt-5/" + ); + assert.deepEqual(result.validationIssues, []); +}); + +test("populate runtime shortcuts explicit URL title rows without agent call", async () => { + let agentCalls = 0; + const result = await runPopulateRuntime({ + context: { + datasetId: "docs-pages", + datasetName: "OpenAI API docs pages", + description: + "make a table from these public OpenAI API docs pages with only page title and page URL: https://developers.openai.com/api/docs/mcp", + columns: [ + { + name: "Page URL", + type: "url" as const, + description: "Page URL.", + }, + { + name: "Page Title", + type: "text" as const, + description: "Page title.", + }, + ], + }, + webTools: { + search: async () => [], + fetch: async () => ({ + title: "Building MCP servers for ChatGPT Apps and API integrations", + text: "Building MCP servers for ChatGPT Apps and API integrations\nMCP and Connectors", + }), + }, + agentRunner: async () => { + agentCalls += 1; + }, + }); + + assert.equal(agentCalls, 0); + assert.equal(result.rows.length, 1); + assert.equal( + result.rows[0]?.cells["Page Title"], + "Building MCP servers for ChatGPT Apps and API integrations" + ); + assert.equal( + result.rows[0]?.cells["Page URL"], + "https://developers.openai.com/api/docs/mcp" + ); + assert.deepEqual(result.validationIssues, []); + assert.equal(result.metrics.agentRuns, 0); +}); + +test("populate runtime does not 
build deterministic rows outside explicit URL scope", async () => { + const result = await runPopulateRuntime({ + context: { + datasetId: "product-releases", + datasetName: "OpenAI product releases", + description: + "find OpenAI product release articles from https://openai.com/news/product-releases/ with post title and post URL", + columns: [ + { + name: "Post Title", + type: "text" as const, + description: "Post title.", + }, + { + name: "Post URL", + type: "url" as const, + description: "Post URL.", + }, + ], + }, + webTools: { + search: async () => [ + { + title: "Building MCP servers for ChatGPT Apps and API integrations", + snippet: "OpenAI developer docs.", + url: "https://developers.openai.com/api/docs/mcp", + }, + ], + fetch: async (input) => { + if (input.url === "https://openai.com/news/product-releases/") { + throw new Error("fetch failed"); + } + return { + title: "Building MCP servers for ChatGPT Apps and API integrations", + text: "Building MCP servers for ChatGPT Apps and API integrations", + }; + }, + }, + agentRunner: async ({ tools }) => { + const searchWeb = tools.search_web as ToolLike< + { query: string }, + { results?: unknown[] } + >; + await searchWeb.execute({ query: "OpenAI product releases" }); + return { + rows: [{ + cells: { + "Post Title": "OpenAI Newsroom | Product", + "Post URL": "https://openai.com/news/product-releases/", + }, + sourceUrls: ["https://openai.com/news/product-releases/"], + evidence: [{ + columnName: "Post Title", + sourceUrl: "https://openai.com/news/product-releases/", + quote: "OpenAI Newsroom | Product", + }], + }], + validationIssues: [ + "Individual article URLs are not present in the transcript; only the listing page URL is available.", + ], + }; + }, + }); + + assert.equal(result.rows.length, 0); + assert.match( + result.validationIssues.join("\n"), + /Mastra populate runtime returned no rows/ + ); +}); + test("populate runtime rejects structured fallback rows without source-backed evidence", async () => { 
const result = await runPopulateRuntime({ context, diff --git a/backend/test/populate-self-healing-command.test.ts b/backend/test/populate-self-healing-command.test.ts index 1baf0f1..89727bc 100644 --- a/backend/test/populate-self-healing-command.test.ts +++ b/backend/test/populate-self-healing-command.test.ts @@ -5,6 +5,7 @@ import type { DatasetContext } from "../src/pipeline/populate.js"; import type { PopulateRecipeRuntime } from "../src/pipeline/populate-self-healing.js"; import type { RunSelfHealingPopulateResult } from "../src/pipeline/populate-self-healing-runner.js"; import { + DEFAULT_COMMIT_ROW_LIMIT_PER_HOUR, parsePopulateSelfHealingCliArgs, runPopulateSelfHealingCli, } from "../src/pipeline/populate-self-healing-command.js"; @@ -46,6 +47,21 @@ test("self-healing CLI parses dataset-id mode", () => { }); }); +test("self-healing CLI parses commit row limit override", () => { + assert.deepEqual(parsePopulateSelfHealingCliArgs([ + "--dataset-id", + "dataset-ai-posts", + "--commit", + "--commit-row-limit-per-hour", + "250", + ]), { + datasetId: "dataset-ai-posts", + shouldReadStdin: false, + shouldCommitRows: true, + commitRowLimitPerHour: 250, + }); +}); + test("self-healing CLI rejects dataset-id mixed with context input", () => { assert.throws( () => parsePopulateSelfHealingCliArgs([ @@ -240,6 +256,11 @@ test("self-healing CLI dataset-id commit loads context and creates writer", asyn assert.equal(input.store, undefined); assert.equal(input.recipeStoreDirectory, ".bigset/populate-recipes"); assert.ok(input.rowWriter); + assert.equal( + input.commitRowLimit?.maxRowsPerWindow, + DEFAULT_COMMIT_ROW_LIMIT_PER_HOUR + ); + assert.equal(input.commitRowLimit?.windowMs, 60 * 60 * 1_000); return successfulResult(input.context.datasetId); }, }); @@ -446,8 +467,10 @@ function rejectedResult(datasetId: string): RunSelfHealingPopulateResult { validationIssues: ["Still no evidence."], productionValidation: { ...baseRun(datasetId).productionValidation, + state: 
"rejected", isValid: false, score: 0, + safeRowCount: 0, criticalIssues: ["Still no evidence."], }, }, @@ -480,15 +503,19 @@ function baseRun(datasetId: string): RunSelfHealingPopulateResult["selectedRun"] completedAt: "2026-05-22T00:00:01.000Z", runtimeMs: 1_000, productionValidation: { + state: "accepted_full", isValid: true, score: 1, rowCount: 1, + safeRowCount: 1, requestedCellCompletenessRatio: 1, sourceUrlCoverageRatio: 1, evidenceCoverageRatio: 1, expectedEntityCoverageRatio: 1, expectedEntities: [], missingExpectedEntities: [], + coveragePolicy: "partial_allowed", + targetSource: "public web sources", criticalIssues: [], warnings: [], }, diff --git a/backend/test/populate-self-healing-runner.test.ts b/backend/test/populate-self-healing-runner.test.ts index b63c4c0..8767bb4 100644 --- a/backend/test/populate-self-healing-runner.test.ts +++ b/backend/test/populate-self-healing-runner.test.ts @@ -17,6 +17,8 @@ import { } from "../src/pipeline/populate-self-healing.js"; import { diagnosticRunForTick, + FileSystemPopulateDatasetRowCommitLimiter, + InMemoryPopulateDatasetRowCommitLimiter, runSelfHealingPopulate, validationIssuesForSelfHealingTick, type PopulateDatasetRowWriter, @@ -71,7 +73,7 @@ test("self-healing runner commits rows only after a successful tick", async () = assert.equal(result.committedRows?.insertedRowCount, 1); assert.equal(writer.replaceCalls.length, 1); assert.equal(writer.replaceCalls[0]?.datasetId, context.datasetId); - assert.equal(writer.replaceCalls[0]?.rows[0]?.cells.entity_name, "OpenAI"); + assert.equal(writer.replaceCalls[0]?.rows[0]?.cells.entity_name, "OpenAI 1"); }); test("self-healing runner requires a row writer before runtime work when committing", async () => { @@ -97,6 +99,142 @@ test("self-healing runner requires a row writer before runtime work when committ assert.equal(runtimeCalls, 0); }); +test("self-healing runner records committed rows against the hourly cap", async () => { + const store = new 
InMemoryPopulateRecipeStore(); + const generatedRecipe = recipe({ recipeId: "generated-v1" }); + const writer = new FakePopulateDatasetRowWriter(); + const limiter = new InMemoryPopulateDatasetRowCommitLimiter(); + const now = new Date("2026-05-22T00:30:00.000Z"); + + const result = await runSelfHealingPopulate({ + context, + store, + runtime: new FakePopulateRecipeRuntime({ + "generated-v1": validRunWithRows(generatedRecipe, 2), + }), + author: new FakeRecipeAuthor({ generatedRecipe }), + rowWriter: writer, + shouldCommitRows: true, + commitRowLimit: { + maxRowsPerWindow: 100, + windowMs: 60 * 60 * 1_000, + now: () => now, + limiter, + }, + }); + + assert.equal(result.success, true); + assert.equal(result.committedRows?.insertedRowCount, 2); + assert.equal(result.commitLimit?.remainingRowsInWindow, 100); + assert.equal(await limiter.committedRowCount({ + datasetId: context.datasetId, + since: new Date("2026-05-21T23:30:00.000Z"), + now, + }), 2); +}); + +test("self-healing runner skips runtime when commit cap is exhausted", async () => { + const limiter = new InMemoryPopulateDatasetRowCommitLimiter(); + const now = new Date("2026-05-22T00:30:00.000Z"); + let runtimeCalls = 0; + const writer = new FakePopulateDatasetRowWriter(); + await reserveExistingRows({ limiter, now, rowCount: 100 }); + + const result = await runSelfHealingPopulate({ + context, + store: new InMemoryPopulateRecipeStore(), + runtime: { + async runRecipe(input) { + runtimeCalls += 1; + return validRun(input.recipe); + }, + }, + author: new FakeRecipeAuthor({ + generatedRecipe: recipe({ recipeId: "generated-v1" }), + }), + rowWriter: writer, + shouldCommitRows: true, + commitRowLimit: { + maxRowsPerWindow: 100, + windowMs: 60 * 60 * 1_000, + now: () => now, + limiter, + }, + }); + + assert.equal(result.success, false); + assert.equal(result.action, "commit_rate_limited"); + assert.equal(result.tick, undefined); + assert.equal(result.commitLimit?.remainingRowsInWindow, 0); + 
assert.match(result.validationIssues.join("\n"), /Commit row cap exceeded/); + assert.equal(runtimeCalls, 0); + assert.equal(writer.replaceCalls.length, 0); +}); + +test("self-healing runner blocks commit when selected rows exceed remaining cap", async () => { + const store = new InMemoryPopulateRecipeStore(); + const limiter = new InMemoryPopulateDatasetRowCommitLimiter(); + const generatedRecipe = recipe({ recipeId: "generated-v1" }); + const writer = new FakePopulateDatasetRowWriter(); + const now = new Date("2026-05-22T00:30:00.000Z"); + await reserveExistingRows({ limiter, now, rowCount: 99 }); + + const result = await runSelfHealingPopulate({ + context, + store, + runtime: new FakePopulateRecipeRuntime({ + "generated-v1": validRunWithRows(generatedRecipe, 2), + }), + author: new FakeRecipeAuthor({ generatedRecipe }), + rowWriter: writer, + shouldCommitRows: true, + commitRowLimit: { + maxRowsPerWindow: 100, + windowMs: 60 * 60 * 1_000, + now: () => now, + limiter, + }, + }); + + assert.equal(result.success, false); + assert.equal(result.action, "commit_rate_limited"); + assert.equal(result.selectedRun?.rows.length, 2); + assert.equal(result.commitLimit?.requestedRowCount, 2); + assert.equal(result.commitLimit?.remainingRowsInWindow, 1); + assert.equal(writer.replaceCalls.length, 0); +}); + +test("filesystem row commit limiter reserves atomically for concurrent calls", async () => { + const rootDirectory = await mkdtemp(join(tmpdir(), "bigset-row-cap-")); + const limiter = new FileSystemPopulateDatasetRowCommitLimiter(rootDirectory); + const now = new Date("2026-05-22T00:30:00.000Z"); + const reserve = () => limiter.reserveCommit({ + datasetId: context.datasetId, + rowCount: 60, + since: new Date(now.getTime() - 60 * 60 * 1_000), + now, + maxRowsPerWindow: 100, + }); + + const reservations = await Promise.all([reserve(), reserve()]); + const allowed = reservations.filter((reservation) => + reservation.decision.isAllowed + ); + const denied = 
reservations.filter((reservation) => + !reservation.decision.isAllowed + ); + + assert.equal(allowed.length, 1); + assert.equal(denied.length, 1); + assert.equal(denied[0]?.decision.remainingRowsInWindow, 40); + await allowed[0]?.confirm({ rowCount: 60 }); + assert.equal(await limiter.committedRowCount({ + datasetId: context.datasetId, + since: new Date(now.getTime() - 60 * 60 * 1_000), + now, + }), 60); +}); + test("self-healing runner commits healthy active reruns", async () => { const store = new InMemoryPopulateRecipeStore(); const activeRecipe = recipe({ recipeId: "active-v1", status: "active" }); @@ -173,6 +311,35 @@ test("self-healing runner does not clear or insert rows when candidate is reject assert.match(result.validationIssues.join("\n"), /Still no evidence/); }); +test("self-healing runner commits partial safe rows without promoting recipe", async () => { + const store = new InMemoryPopulateRecipeStore(); + const generatedRecipe = recipe({ recipeId: "generated-v1" }); + const writer = new FakePopulateDatasetRowWriter(); + + const result = await runSelfHealingPopulate({ + context, + store, + runtime: new FakePopulateRecipeRuntime({ + "generated-v1": partialRun( + generatedRecipe, + "Missing expected entities: Anthropic." 
+ ), + }), + author: new FakeRecipeAuthor({ generatedRecipe }), + rowWriter: writer, + shouldCommitRows: true, + }); + const snapshot = await store.loadSnapshot(context.datasetId); + + assert.equal(result.success, true); + assert.equal(result.action, "candidate_rejected"); + assert.equal(result.validationState, "accepted_partial"); + assert.equal(result.committedRows?.insertedRowCount, 1); + assert.equal(writer.replaceCalls.length, 1); + assert.equal(writer.replaceCalls[0]?.rows[0]?.cells.entity_name, "OpenAI 1"); + assert.equal(snapshot.recipes[0]?.status, "rejected"); +}); + test("filesystem store lets the runner reuse an active recipe across calls", async () => { const rootDirectory = await mkdtemp(join(tmpdir(), "bigset-populate-runner-")); const store = new FileSystemPopulateRecipeStore(rootDirectory); @@ -241,23 +408,30 @@ function recipe(input: { } function validRun(recipe: PopulateRecipe): PopulateRecipeRunResult { + return validRunWithRows(recipe, 1); +} + +function validRunWithRows( + recipe: PopulateRecipe, + rowCount: number +): PopulateRecipeRunResult { return runResult({ recipe, - rows: [{ + rows: Array.from({ length: rowCount }, (_, index) => ({ cells: { - entity_name: "OpenAI", - latest_post_title: "Release notes from OpenAI", + entity_name: `OpenAI ${index + 1}`, + latest_post_title: `Release notes from OpenAI ${index + 1}`, source_url: "https://openai.com/news", - evidence_quote: "Release notes from OpenAI", + evidence_quote: `Release notes from OpenAI ${index + 1}`, }, sourceUrls: ["https://openai.com/news"], evidence: [{ columnName: "latest_post_title", sourceUrl: "https://openai.com/news", - quote: "Release notes from OpenAI", + quote: `Release notes from OpenAI ${index + 1}`, }], needsReview: true, - }], + })), isValid: true, score: 1, }); @@ -274,6 +448,29 @@ function invalidRun(recipe: PopulateRecipe, issue: string): PopulateRecipeRunRes }); } +function partialRun(recipe: PopulateRecipe, issue: string): PopulateRecipeRunResult { + const run 
= runResult({ + recipe, + rows: validRunWithRows(recipe, 1).rows, + validationIssues: [issue], + criticalIssues: [issue], + isValid: false, + score: 0.75, + }); + return { + ...run, + runStatus: "succeeded", + productionValidation: { + ...run.productionValidation, + state: "accepted_partial", + safeRowCount: run.rows.length, + expectedEntityCoverageRatio: 0.5, + expectedEntities: ["OpenAI", "Anthropic"], + missingExpectedEntities: ["Anthropic"], + }, + }; +} + function runResult(input: { recipe: PopulateRecipe; rows: PopulateRecipeRunResult["rows"]; @@ -304,15 +501,19 @@ function runResult(input: { completedAt: "2026-05-22T00:00:01.000Z", runtimeMs: 1_000, productionValidation: { + state: input.isValid ? "accepted_full" : "rejected", isValid: input.isValid, score: input.score, rowCount: input.rows.length, + safeRowCount: input.isValid ? input.rows.length : 0, requestedCellCompletenessRatio: input.score, sourceUrlCoverageRatio: input.score, evidenceCoverageRatio: input.score, expectedEntityCoverageRatio: input.score, expectedEntities: [], missingExpectedEntities: [], + coveragePolicy: "partial_allowed", + targetSource: "public web sources", criticalIssues: input.criticalIssues ?? [], warnings: input.validationIssues ?? 
[], }, @@ -363,3 +564,19 @@ class FakePopulateDatasetRowWriter implements PopulateDatasetRowWriter { }; } } + +async function reserveExistingRows(input: { + limiter: InMemoryPopulateDatasetRowCommitLimiter; + now: Date; + rowCount: number; +}): Promise { + const reservation = await input.limiter.reserveCommit({ + datasetId: context.datasetId, + rowCount: input.rowCount, + since: new Date(input.now.getTime() - 60 * 60 * 1_000), + now: input.now, + maxRowsPerWindow: 100, + }); + assert.equal(reservation.decision.isAllowed, true); + await reservation.confirm({ rowCount: input.rowCount }); +} diff --git a/backend/test/populate-self-healing.test.ts b/backend/test/populate-self-healing.test.ts index e1be40d..1d54666 100644 --- a/backend/test/populate-self-healing.test.ts +++ b/backend/test/populate-self-healing.test.ts @@ -3,6 +3,7 @@ import { mkdtemp } from "node:fs/promises"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { test } from "node:test"; +import * as ts from "typescript"; import { createPopulateRecipe, @@ -17,6 +18,10 @@ import type { PopulateRecipeRunResult, PopulateRecipeRuntime, } from "../src/pipeline/populate-self-healing.js"; +import { + createPlaywrightScriptArtifact, + populateRuntimeResultFromAgentCompatibleResult, +} from "../src/pipeline/populate-browser-action-box.js"; import type { DatasetContext } from "../src/pipeline/populate.js"; const context: DatasetContext = { @@ -108,6 +113,253 @@ test("Mastra populate recipe runtime maps populate rows into a healthy recipe ru assert.equal(run.debug?.selectedRowSource, "insert_row"); assert.ok(run.artifacts.some((artifact) => artifact.kind === "source-transcript")); assert.ok(run.artifacts.some((artifact) => artifact.kind === "captured-rows")); + const traceArtifact = run.artifacts.find((artifact) => + artifact.kind === "process-trace" + ); + assert.ok(traceArtifact); + const trace = JSON.parse(traceArtifact.content); + assert.equal(trace.runtime, "mastra-injected"); + 
assert.deepEqual(trace.searchQueries, ["OpenAI latest blog"]); + assert.deepEqual(trace.fetchedUrls, ["https://openai.com/news"]); + assert.equal(trace.selectedRowSource, "insert_row"); + const readinessArtifact = run.artifacts.find((artifact) => + artifact.kind === "playwright-candidate-readiness" + ); + assert.ok(readinessArtifact); + const readiness = JSON.parse(readinessArtifact.content); + assert.equal(readiness.status, "not_ready"); + assert.match(readiness.reasons.join("\n"), /no actionable browser steps/i); + assert.equal( + run.artifacts.some((artifact) => artifact.kind === "playwright-candidate-script"), + false + ); +}); + +test("Mastra populate recipe runtime emits Playwright candidate script for ready browser traces", async () => { + const runtime = new MastraPopulateRecipeRuntime({ + runPopulate: async () => ({ + rows: validRows(), + validationIssues: [], + usage: emptyUsage(), + metrics: { + ...emptyMetrics(), + browserCalls: 1, + }, + debug: { + capturedRows: [], + capturedSources: [], + selectedRowSource: "collection_pipeline", + notes: [], + processTrace: { + runtime: "collection", + searchQueries: ["OpenAI pricing docs"], + fetchedUrls: ["https://openai.com/news"], + sourceArtifacts: [{ + url: "https://openai.com/news", + status: "succeeded", + source: "collection", + }], + selectedRowSource: "collection_pipeline", + notes: [], + steps: [{ + kind: "browser", + label: "open-news-link", + status: "succeeded", + input: { + phase: "initial", + }, + browserAction: { + action: "click", + url: "https://openai.com/news", + selector: "a[href*='/news']", + targetText: "News", + }, + }], + }, + }, + }), + }); + + const run = await runtime.runRecipe({ + recipe: recipe({ recipeId: "recipe-v1" }), + context, + }); + + const readinessArtifact = run.artifacts.find((artifact) => + artifact.kind === "playwright-candidate-readiness" + ); + assert.ok(readinessArtifact); + assert.equal(JSON.parse(readinessArtifact.content).status, "ready"); + + const scriptArtifact = 
run.artifacts.find((artifact) => + artifact.kind === "playwright-candidate-script" + ); + assert.ok(scriptArtifact); + assert.match(scriptArtifact.content, /export async function runDatasetRecipe/); + assert.match(scriptArtifact.content, /a\[href\*='\/news'\]/); + assert.match(scriptArtifact.content, /clickTarget/); + assert.match(scriptArtifact.content, /https:\/\/openai\.com\/news/); + assert.ok(scriptArtifact.content.length <= 20_000); + assertJavaScriptModuleParses(scriptArtifact.content); +}); + +test("Mastra populate recipe runtime keeps Playwright candidate scripts complete under artifact cap", async () => { + const longText = "x".repeat(2_000); + const runtime = new MastraPopulateRecipeRuntime({ + runPopulate: async () => ({ + rows: validRows(), + validationIssues: [], + usage: emptyUsage(), + metrics: emptyMetrics(), + debug: { + capturedRows: [], + capturedSources: [], + selectedRowSource: "collection_pipeline", + notes: [], + processTrace: { + runtime: "collection", + searchQueries: [], + fetchedUrls: ["https://example.com/catalog"], + sourceArtifacts: [{ + url: "https://example.com/catalog", + status: "succeeded", + source: "collection", + }], + selectedRowSource: "collection_pipeline", + notes: [], + steps: Array.from({ length: 10 }, (_, index) => ({ + kind: "browser" as const, + label: `long-ready-action-${index}-${longText}`, + status: "succeeded" as const, + input: { + phase: "initial", + }, + browserAction: { + action: "click" as const, + url: `https://example.com/catalog/${index}`, + selector: `[data-long="${longText}-${index}"]`, + targetText: `${longText}-${index}`, + valueDescription: `${longText}-${index}`, + }, + })), + }, + }, + }), + }); + + const run = await runtime.runRecipe({ + recipe: recipe({ recipeId: "recipe-v1" }), + context, + }); + + const readinessArtifact = run.artifacts.find((artifact) => + artifact.kind === "playwright-candidate-readiness" + ); + assert.ok(readinessArtifact); + 
assert.equal(JSON.parse(readinessArtifact.content).status, "ready"); + + const scriptArtifact = run.artifacts.find((artifact) => + artifact.kind === "playwright-candidate-script" + ); + assert.ok(scriptArtifact); + assert.ok(scriptArtifact.content.length <= 20_000); + assert.match(scriptArtifact.content, /Omitted 5 lower-priority browser actions/); + assertJavaScriptModuleParses(scriptArtifact.content); +}); + +test("Mastra populate recipe runtime replays promoted Playwright script before agent spend", async () => { + let replayCalls = 0; + let populateCalls = 0; + const promotedScript = createPlaywrightScriptArtifact({ + sourceUrl: "https://openai.com/news", + datasetGoalPrompt: context.description, + datasetSchema: { + columns: context.columns.map((column) => ({ + name: column.name, + required: true, + })), + }, + code: "export async function runDatasetRecipe() { return { records: [] }; }", + status: "promoted", + createdAt: "2026-05-24T00:00:00.000Z", + }); + const runtime = new MastraPopulateRecipeRuntime({ + runPopulate: async () => { + populateCalls += 1; + throw new Error("normal populate should not run before replay"); + }, + browserActionBox: { + async replay(input) { + replayCalls += 1; + const agentCompatibleResult = { + records: [{ + entity_name: "OpenAI", + latest_post_title: "Release notes from OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + evidence: [{ + field: "latest_post_title", + url: "https://openai.com/news", + quote: "Release notes from OpenAI", + }], + }], + }; + return { + agentCompatibleResult, + runtimeResult: populateRuntimeResultFromAgentCompatibleResult({ + agentCompatibleResult, + datasetSchema: input.datasetSchema, + sourceUrl: input.sourceUrl, + replayTrace: { + status: "succeeded", + startedAt: "2026-05-24T00:00:00.000Z", + completedAt: "2026-05-24T00:00:01.000Z", + scriptId: input.currentPlaywrightScript.scriptId, + sourceUrl: input.sourceUrl, + diagnostics: [], + steps: [{ + 
kind: "browser", + label: "playwright-replay", + status: "succeeded", + }], + }, + diagnosticArtifacts: [{ + kind: "playwright-replay-result", + label: "populate-playwright-replay-result", + content: JSON.stringify({ replayStatus: "replay_succeeded" }), + }], + }), + trace: { + status: "succeeded", + startedAt: "2026-05-24T00:00:00.000Z", + completedAt: "2026-05-24T00:00:01.000Z", + scriptId: input.currentPlaywrightScript.scriptId, + sourceUrl: input.sourceUrl, + diagnostics: [], + steps: [], + }, + replayStatus: "replay_succeeded", + diagnostics: [], + }; + }, + }, + }); + + const run = await runtime.runRecipe({ + recipe: recipe({ + recipeId: "recipe-v1", + playwrightScript: promotedScript, + }), + context, + }); + + assert.equal(replayCalls, 1); + assert.equal(populateCalls, 0); + assert.equal(run.runStatus, "succeeded"); + assert.equal(run.productionValidation.isValid, true); + assert.ok(run.artifacts.some((artifact) => + artifact.kind === "playwright-replay-result" + )); }); test("Mastra populate recipe runtime keeps supplemental fetch misses non-blocking", async () => { @@ -133,7 +385,156 @@ test("Mastra populate recipe runtime keeps supplemental fetch misses non-blockin assert.match(run.productionValidation.warnings.join("\n"), /timeout/); }); -test("Mastra populate recipe runtime blocks missing expected entities", async () => { +test("Mastra populate recipe runtime treats nullable missing cells as non-blocking", async () => { + const runtime = new MastraPopulateRecipeRuntime({ + runPopulate: async () => ({ + rows: validRows(), + validationIssues: [ + "Author field could not be determined from the available source transcript.", + ], + usage: emptyUsage(), + metrics: emptyMetrics(), + }), + }); + + const run = await runtime.runRecipe({ + recipe: recipe({ recipeId: "recipe-v1" }), + context: { + ...context, + columns: [ + ...context.columns, + { + name: "author", + type: "text", + description: "Author when the source names one.", + nullable: true, + }, + ], + 
}, + }); + + assert.equal(run.runStatus, "succeeded"); + assert.equal(run.productionValidation.isValid, true); + assert.deepEqual(run.productionValidation.criticalIssues, []); + assert.equal(run.productionValidation.requestedCellCompletenessRatio, 0.8); +}); + +test("Mastra populate recipe runtime rejects approximated required cells", async () => { + const runtime = new MastraPopulateRecipeRuntime({ + runPopulate: async () => ({ + rows: validRows(), + validationIssues: [ + "Actual post titles and canonical URLs are not present in the captured transcript, so category+date combinations are used as best approximations. These rows require manual review.", + ], + usage: emptyUsage(), + metrics: emptyMetrics(), + }), + }); + + const run = await runtime.runRecipe({ + recipe: recipe({ recipeId: "recipe-v1" }), + context: { + ...context, + columns: [ + ...context.columns, + { + name: "author", + type: "text", + description: "Author when the source names one.", + nullable: true, + }, + ], + }, + }); + + assert.equal(run.runStatus, "failed"); + assert.equal(run.productionValidation.isValid, false); + assert.match( + run.productionValidation.criticalIssues.join("\n"), + /best approximations/ + ); +}); + +test("Mastra populate recipe runtime rejects evidence disconnected from row sources", async () => { + const runtime = new MastraPopulateRecipeRuntime({ + runPopulate: async () => ({ + rows: validRows().map((row) => ({ + ...row, + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://example.com/unrelated", + quote: "Release notes from OpenAI", + }], + })), + validationIssues: [], + usage: emptyUsage(), + metrics: emptyMetrics(), + }), + }); + + const run = await runtime.runRecipe({ + recipe: recipe({ recipeId: "recipe-v1" }), + context, + }); + + assert.equal(run.runStatus, "failed"); + assert.equal(run.productionValidation.isValid, false); + assert.match( + run.productionValidation.criticalIssues.join("\n"), + /does not match a row source URL/ + ); +}); + 
+test("process trace artifacts stay parseable when trace content is large", async () => { + const runtime = new MastraPopulateRecipeRuntime({ + runPopulate: async () => ({ + rows: validRows(), + validationIssues: [], + usage: emptyUsage(), + metrics: emptyMetrics(), + debug: { + capturedRows: [], + capturedSources: [], + selectedRowSource: "collection_pipeline", + notes: [], + processTrace: { + runtime: "collection", + searchQueries: Array.from({ length: 125 }, (_, index) => + `query-${index}-${"x".repeat(1_000)}` + ), + fetchedUrls: [], + sourceArtifacts: [], + selectedRowSource: "collection_pipeline", + notes: ["n".repeat(1_000)], + steps: Array.from({ length: 125 }, (_, index) => ({ + kind: "search" as const, + label: `collection-search-query-${index}`, + status: "succeeded" as const, + input: { query: "x".repeat(1_000) }, + })), + }, + }, + }), + }); + + const run = await runtime.runRecipe({ + recipe: recipe({ recipeId: "recipe-v1" }), + context, + }); + const traceArtifact = run.artifacts.find((artifact) => + artifact.kind === "process-trace" + ); + + assert.ok(traceArtifact); + assert.ok(traceArtifact.content.length <= 20_000); + const parsedTrace = JSON.parse(traceArtifact.content); + assert.equal(parsedTrace.truncated, true); + assert.ok(parsedTrace.steps.length > 0); + assert.ok(parsedTrace.steps.length <= 100); + assert.match(parsedTrace.searchQueries[0], /\[truncated\]/); +}); + +test("Mastra populate recipe runtime marks missing expected entities as partial", async () => { const runtime = new MastraPopulateRecipeRuntime({ runPopulate: async () => ({ rows: [{ @@ -166,8 +567,10 @@ test("Mastra populate recipe runtime blocks missing expected entities", async () }, }); - assert.equal(run.runStatus, "failed"); + assert.equal(run.runStatus, "succeeded"); + assert.equal(run.productionValidation.state, "accepted_partial"); assert.equal(run.productionValidation.isValid, false); + assert.equal(run.productionValidation.safeRowCount, 1); 
assert.deepEqual(run.productionValidation.expectedEntities, [ "OpenAI", "Anthropic", @@ -314,6 +717,69 @@ test("self-healing service repairs a failed active recipe and promotes the candi assert.equal(snapshot.recipes.find((item) => item.recipeId === "repair-v2")?.status, "active"); }); +test("self-healing service does not run a second recipe repair after bounded Playwright repair rejects", async () => { + const store = new InMemoryPopulateRecipeStore(); + const promotedScript = createPlaywrightScriptArtifact({ + sourceUrl: "https://openai.com/news", + datasetGoalPrompt: context.description, + datasetSchema: { + columns: context.columns.map((column) => ({ + name: column.name, + required: true, + })), + }, + code: "export async function runDatasetRecipe() { throw new Error('stale selector'); }", + status: "promoted", + createdAt: "2026-05-24T00:00:00.000Z", + }); + const activeRecipe = { + ...recipe({ + recipeId: "active-playwright-broken", + status: "active", + playwrightScript: promotedScript, + }), + lastValidationScore: 1, + }; + await store.saveRecipe(activeRecipe); + const author = new FakeRecipeAuthor(); + const service = new SelfHealingPopulateRecipeService({ + store, + runtime: new FakePopulateRecipeRuntime({ + "active-playwright-broken": runResult({ + recipe: activeRecipe, + rows: [], + validationIssues: ["BrowserActionBox repair_rejected: locator timed out"], + criticalIssues: ["BrowserActionBox repair_rejected: locator timed out"], + isValid: false, + score: 0, + artifacts: [{ + kind: "playwright-repair-diagnostic", + label: "populate-playwright-repair-diagnostic", + content: JSON.stringify({ + replayStatus: "repair_rejected", + diagnostics: ["locator timed out"], + }), + }], + }), + }), + author, + }); + + const result = await service.tick({ datasetId: context.datasetId, context }); + const snapshot = await store.loadSnapshot(context.datasetId); + + assert.equal(result.action, "candidate_rejected"); + assert.equal(author.repairCalls, 0); + 
assert.match(result.rejectionReasons.join("\n"), /keeping prior active script/i); + assert.equal( + snapshot.recipes.find((item) => + item.recipeId === "active-playwright-broken" + )?.status, + "active" + ); + assert.equal(snapshot.recipes.some((item) => item.recipeId === "repair-v2"), false); +}); + test("self-healing service rejects valid repairs below active recipe baseline", async () => { const store = new InMemoryPopulateRecipeStore(); const activeRecipe = { @@ -370,7 +836,7 @@ test("file store reloads populate recipes and run records", async () => { const service = new SelfHealingPopulateRecipeService({ store, runtime: new FakePopulateRecipeRuntime({ - "persisted-v1": validRun(generatedRecipe), + "persisted-v1": validRun(generatedRecipe, 1, [processTraceArtifact()]), }), author: new FakeRecipeAuthor({ generatedRecipe }), }); @@ -384,6 +850,11 @@ test("file store reloads populate recipes and run records", async () => { assert.equal(snapshot.recipes[0]?.status, "active"); assert.equal(snapshot.runRecords.length, 1); assert.equal(snapshot.runRecords[0]?.runStatus, "succeeded"); + assert.equal(snapshot.runRecords[0]?.artifacts[0]?.kind, "process-trace"); + assert.match( + snapshot.runRecords[0]?.artifacts[0]?.content ?? "", + /collection-search-query/ + ); }); interface ToolLike { @@ -395,25 +866,34 @@ function recipe(input: { version?: number; status?: PopulateRecipe["status"]; runtimeInstructions?: string; + playwrightScript?: PopulateRecipe["playwrightScript"]; }): PopulateRecipe { - return createPopulateRecipe({ - recipeId: input.recipeId, - datasetId: context.datasetId, - version: input.version ?? 1, - status: input.status, - sourceDescription: context.description, - requestedColumns: context.columns.map((column) => column.name), - runtimeInstructions: input.runtimeInstructions, - createdAt: "2026-05-22T00:00:00.000Z", - }); + return { + ...createPopulateRecipe({ + recipeId: input.recipeId, + datasetId: context.datasetId, + version: input.version ?? 
1, + status: input.status, + sourceDescription: context.description, + requestedColumns: context.columns.map((column) => column.name), + runtimeInstructions: input.runtimeInstructions, + createdAt: "2026-05-22T00:00:00.000Z", + }), + playwrightScript: input.playwrightScript, + }; } -function validRun(recipe: PopulateRecipe, score = 1): PopulateRecipeRunResult { +function validRun( + recipe: PopulateRecipe, + score = 1, + artifacts: PopulateRecipeRunResult["artifacts"] = [] +): PopulateRecipeRunResult { return runResult({ recipe, rows: validRows(), isValid: true, score, + artifacts, }); } @@ -435,6 +915,7 @@ function runResult(input: { criticalIssues?: string[]; isValid: boolean; score: number; + artifacts?: PopulateRecipeRunResult["artifacts"]; }): PopulateRecipeRunResult { return { rows: input.rows, @@ -458,19 +939,48 @@ function runResult(input: { completedAt: "2026-05-22T00:00:01.000Z", runtimeMs: 1_000, productionValidation: { + state: input.isValid ? "accepted_full" : "rejected", isValid: input.isValid, score: input.score, rowCount: input.rows.length, + safeRowCount: input.isValid ? input.rows.length : 0, requestedCellCompletenessRatio: input.score, sourceUrlCoverageRatio: input.score, evidenceCoverageRatio: input.score, expectedEntityCoverageRatio: input.score, expectedEntities: [], missingExpectedEntities: [], + coveragePolicy: "partial_allowed", + targetSource: "public web sources", criticalIssues: input.criticalIssues ?? [], warnings: input.validationIssues ?? [], }, - artifacts: [], + artifacts: input.artifacts ?? 
[], + }; +} + +function processTraceArtifact(): PopulateRecipeRunResult["artifacts"][number] { + return { + kind: "process-trace", + label: "populate-process-trace", + content: JSON.stringify({ + runtime: "collection", + searchQueries: ["OpenAI latest blog"], + fetchedUrls: ["https://openai.com/news"], + sourceArtifacts: [{ + url: "https://openai.com/news", + status: "succeeded", + source: "collection", + }], + selectedRowSource: "collection_pipeline", + notes: [], + steps: [{ + kind: "search", + label: "collection-search-query", + status: "succeeded", + input: { query: "OpenAI latest blog" }, + }], + }), }; } @@ -514,6 +1024,22 @@ function emptyMetrics(): PopulateRecipeRunResult["metrics"] { }; } +function assertJavaScriptModuleParses(source: string): void { + const sourceFile = ts.createSourceFile( + "playwright-candidate-script.mjs", + source, + ts.ScriptTarget.ES2022, + true, + ts.ScriptKind.JS + ); + assert.deepEqual( + sourceFile.parseDiagnostics.map((diagnostic) => + ts.flattenDiagnosticMessageText(diagnostic.messageText, "\n") + ), + [] + ); +} + class FakePopulateRecipeRuntime implements PopulateRecipeRuntime { constructor(private readonly runsByRecipeId: Record) {} diff --git a/backend/test/populate-server.test.ts b/backend/test/populate-server.test.ts index 99e63f2..cbc6b63 100644 --- a/backend/test/populate-server.test.ts +++ b/backend/test/populate-server.test.ts @@ -2,6 +2,7 @@ import assert from "node:assert/strict"; import { test } from "node:test"; import { createBigSetServer } from "../src/server.js"; +import { DEFAULT_COMMIT_ROW_LIMIT_PER_HOUR } from "../src/pipeline/populate-self-healing-command.js"; import type { DatasetContext } from "../src/pipeline/populate.js"; import type { PopulateRecipeRuntime } from "../src/pipeline/populate-self-healing.js"; import type { RunSelfHealingPopulateResult } from "../src/pipeline/populate-self-healing-runner.js"; @@ -55,6 +56,11 @@ test("POST /populate passes selected runtime into self-healing runner", async 
() assert.equal(input.shouldCommitRows, true); assert.equal(input.recipeStoreDirectory, ".bigset/populate-recipes"); assert.ok(input.rowWriter); + assert.equal( + input.commitRowLimit?.maxRowsPerWindow, + DEFAULT_COMMIT_ROW_LIMIT_PER_HOUR + ); + assert.equal(input.commitRowLimit?.windowMs, 60 * 60 * 1_000); return successfulResult(input.context.datasetId); }, }); @@ -105,15 +111,19 @@ function successfulResult(datasetId: string): RunSelfHealingPopulateResult { completedAt: "2026-05-22T00:00:01.000Z", runtimeMs: 1_000, productionValidation: { + state: "accepted_full", isValid: true, score: 1, rowCount: 1, + safeRowCount: 1, requestedCellCompletenessRatio: 1, sourceUrlCoverageRatio: 1, evidenceCoverageRatio: 1, expectedEntityCoverageRatio: 1, expectedEntities: [], missingExpectedEntities: [], + coveragePolicy: "partial_allowed", + targetSource: "public web sources", criticalIssues: [], warnings: [], }, diff --git a/backend/test/tinyfish-agent-run.test.ts b/backend/test/tinyfish-agent-run.test.ts new file mode 100644 index 0000000..b3c3fc6 --- /dev/null +++ b/backend/test/tinyfish-agent-run.test.ts @@ -0,0 +1,162 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + pollTinyfishAgentUntilDone, + tinyfishAgentRunResultFromRun, +} from "../BigSet_Data_Collection_Agent/src/integrations/tinyfish-agent.js"; + +test("TinyFish run normalization keeps safe provenance without streaming URL", () => { + const normalized = tinyfishAgentRunResultFromRun({ + run_id: "run-1", + status: "COMPLETED", + goal: "Extract rows.", + created_at: "2026-05-23T00:00:00Z", + started_at: "2026-05-23T00:00:01Z", + finished_at: "2026-05-23T00:00:02Z", + num_of_steps: 3, + result: { + records: [], + }, + error: null, + streaming_url: "STREAMING_URL_SHOULD_NOT_BE_STORED", + recording_url: "RECORDING_URL_SHOULD_NOT_BE_STORED", + capture_artifacts: [{ + type: "screenshot", + url: "CAPTURE_ARTIFACT_URL_SHOULD_NOT_BE_STORED", + }], + browser_config: { + 
proxy_enabled: true, + proxy_country_code: null, + }, + } as never); + + assert.equal(normalized.agent_step_count, 3); + assert.equal(normalized.has_streaming_url, true); + assert.equal(normalized.has_recording_url, true); + assert.equal(normalized.capture_artifact_count, 1); + assert.deepEqual(normalized.result_keys, ["records"]); + assert.equal( + JSON.stringify(normalized).includes("STREAMING_URL_SHOULD_NOT_BE_STORED"), + false + ); + assert.equal( + JSON.stringify(normalized).includes("RECORDING_URL_SHOULD_NOT_BE_STORED"), + false + ); + assert.equal( + JSON.stringify(normalized).includes("CAPTURE_ARTIFACT_URL_SHOULD_NOT_BE_STORED"), + false + ); +}); + +test("TinyFish run normalization converts documented run steps to browser actions", () => { + const normalized = tinyfishAgentRunResultFromRun({ + run_id: "run-2", + status: "COMPLETED", + goal: "Extract rows.", + created_at: "2026-05-23T00:00:00Z", + started_at: "2026-05-23T00:00:01Z", + finished_at: "2026-05-23T00:00:02Z", + num_of_steps: 4, + result: { + records: [], + }, + error: null, + streaming_url: null, + steps: [{ + type: "navigate", + url: "https://example.com/products", + status: "completed", + }, { + action: "click", + current_url: "https://example.com/products", + target: { + selector: "button[data-category='tools']", + text: "Tools", + }, + outcome: "success", + }, { + type: "type", + current_url: "https://example.com/products", + selector: "input[name='password']", + value: "secret-password", + status: "completed", + }], + } as never); + + assert.deepEqual(normalized.browser_actions, [{ + action: "navigate", + url: "https://example.com/products", + selector: undefined, + target_text: undefined, + status: "succeeded", + error: undefined, + phase: "agent-step", + label: "navigate", + value_description: undefined, + }, { + action: "click", + url: "https://example.com/products", + selector: "button[data-category='tools']", + target_text: "Tools", + status: "succeeded", + error: undefined, + phase: 
"agent-step", + label: undefined, + value_description: undefined, + }, { + action: "type", + url: "https://example.com/products", + selector: "input[name='password']", + target_text: undefined, + status: "succeeded", + error: undefined, + phase: "agent-step", + label: "type", + value_description: "redacted typed value (15 chars)", + }]); + assert.equal(JSON.stringify(normalized).includes("secret-password"), false); +}); + +test("TinyFish agent poll timeout races hung run reads and cancels promptly", async () => { + const startedAt = Date.now(); + let pollAttempts = 0; + let cancelAttempts = 0; + + const result = await pollTinyfishAgentUntilDone("run-hung-poll", { + pollTimeoutMs: 40, + pollIntervalMs: 1, + requestTimeoutMs: 10, + readRun: async (_runId, { signal }) => { + pollAttempts += 1; + await new Promise((_resolve, reject) => { + signal.addEventListener( + "abort", + () => + reject( + signal.reason instanceof Error + ? signal.reason + : new Error("poll aborted"), + ), + { once: true }, + ); + }); + }, + cancelRun: async (_runId, { signal }) => { + cancelAttempts += 1; + assert.equal(signal.aborted, false); + }, + }); + + assert.equal(result.status, "TIMEOUT"); + assert.equal(result.run_id, "run-hung-poll"); + assert.match(result.error ?? "", /timed out after 40ms/); + assert.match( + result.error ?? 
"", + /last poll error: TinyFish Agent poll run-hung-poll timed out after \d+ms/, + ); + assert.equal(cancelAttempts, 1); + assert.ok(pollAttempts >= 1); + assert.ok(Date.now() - startedAt < 300); +}); diff --git a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md index a4e0cc7..c9bbd16 100644 --- a/benchmarks/dataset-agent/README.md +++ b/benchmarks/dataset-agent/README.md @@ -59,7 +59,7 @@ source-evidence misses: ```bash COLLECTION_AGENT_ENABLE_AGENT=true \ -COLLECTION_AGENT_POLL_TIMEOUT_MS=480000 \ +COLLECTION_AGENT_POLL_TIMEOUT_MS=1200000 \ COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts \ BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts \ node benchmarks/dataset-agent/run-benchmark.mjs \ @@ -68,19 +68,125 @@ node benchmarks/dataset-agent/run-benchmark.mjs \ --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs' ``` -Latest `mcp-docs-pages` Agent-enabled canary evidence: +Latest `mcp-docs-pages` Agent-enabled canary evidence, rescored with the +rejected-candidate gate: -- artifact: `benchmark-results/collection-agent-canary-mcp-20260523-001` -- status: failed, not blocked -- rows/evidence: 3 rows, 12 evidence quotes, 10 source URLs -- cost: about `$0.053552` -- signal: Agent runs complete and claim support reaches `1.0`, but domain - accuracy stays `0.667`; next fix is source/domain coherence, not more Agent - plumbing. 
+- artifact: `benchmark-results/collection-agent-provenance-mcp-20260523-001` +- status: failed with `failureCategory: "capability_gate"` +- rows/evidence: 3 rows, 5 evidence quotes, 5 source URLs +- score: factual accuracy `1.0`, entity coverage `1.0`, domain accuracy `1.0`, + claim support `1.0` +- Agent signal: 1 Agent run reported 20 steps, but emitted no explicit + `browser_actions` +- self-healing signal: `selfHealingAction: "candidate_rejected"` +- Playwright signal: `playwrightCandidateStatus: "not_ready"` with zero + replayable browser steps +- conclusion: rows are useful debug evidence, but not a promotable cron recipe. + Next fix is producer-side browser action emission plus a promoted + self-healing run, not a Playwright compiler yet. App and CLI collection-runtime runs use the same runner shape, but load it from `POPULATE_COLLECTION_RUNNER_MODULE` when `POPULATE_AGENT_RUNTIME=collection`. +Self-healing run records now include a `process-trace` artifact when a runtime +exposes trace data and a `playwright-candidate-readiness` artifact that says +whether the trace is grounded enough for Playwright replay. When that readiness +status is `ready`, the run also emits a `playwright-candidate-script` artifact +that exports `runDatasetRecipe(context)`. Search and fetch URLs alone are not +enough. The readiness gate expects real browser actions such as URL transitions, +selectors, target text, or redacted input descriptions before any script can be +emitted. + +Collection runners can feed those actions through explicit report fields such +as `browser_actions` or `agent_browser_actions`. BigSet maps only those explicit +actions into `browser` trace steps; it does not infer selectors or clicks from +URLs, source outcomes, or prose diagnostics. The collection TinyFish Agent goal +now explicitly asks the Agent to return `agent_browser_actions` next to +`records`, so browser replay evidence starts at the producer contract. 
+ +Mapping is mechanical: + +- `target_text` / `targetText` -> `browserAction.targetText` +- `value_description` / `valueDescription` -> `browserAction.valueDescription` +- `status` -> `step.status` +- `error` -> `step.error` +- `phase` -> `step.input.phase` +- unknown action strings -> `browserAction.action = "unknown"` + +When both action arrays are present in the same report scope, BigSet preserves +array order by appending `browser_actions` first and `agent_browser_actions` +second. This is an ingestion contract for a future collection producer or Agent +canary; it does not mean the current vendored pipeline already emits +browser actions. + +When TinyFish Agent result JSON includes explicit `browser_actions` or +`agent_browser_actions`, the vendored runner preserves those arrays in +`agent_runs_*.json` and phase-scoped `run_report.json` fields. Generic +`actions` arrays are ignored because they are not browser-specific enough to +replay honestly. + +The collection self-healing adapter also prints a compact `diagnostics` object +to stdout so benchmark artifacts can answer the Playwright readiness question +without committing raw run folders: + +```json +{ + "diagnostics": { + "selfHealingAction": "generated_initial_recipe", + "artifactKinds": ["process-trace", "playwright-candidate-readiness"], + "processTrace": { + "runtime": "collection", + "stepCount": 12, + "browserStepCount": 1, + "sourceUrlCount": 4 + }, + "playwrightCandidateReadiness": { + "status": "ready", + "browserStepCount": 1, + "sourceUrlCount": 4 + } + } +} +``` + +`summary.json` carries the same high-signal fields on each lane result: +`selfHealingAction`, `selfHealingArtifactKinds`, `processTraceStepCount`, +`processTraceBrowserStepCount`, `playwrightCandidateStatus`, +`playwrightCandidateBrowserStepCount`, and +`playwrightCandidateSourceUrlCount`. Use those fields to verify whether an +Agent canary actually emitted browser actions before starting a Playwright +compiler. 
+ +If `selfHealingAction` is `candidate_rejected`, the benchmark marks the lane as +`failureCategory: "capability_gate"` even when the diagnostic rows score well. +Rejected candidates are useful for debugging, but they are not promotable cron +recipes. + +Agent canaries also preserve safe provenance from the TinyFish run payload: +reported step count, whether a streaming URL existed, and top-level result +keys. Raw `streaming_url` values are never persisted. If Agent returns rows but +the polled run payload has no explicit `browser_actions`, diagnostics include +that distinction so `not_ready` means "no replayable action trace", not "the +Agent did no browser work." + +For browser-action canaries, add `--require-playwright-ready` to make the +benchmark fail with `failureCategory: "capability_gate"` unless the +`playwright-candidate-readiness` artifact is `ready`. This gate uses the +readiness artifact, not raw browser step counts, so it still requires +actionable browser steps, source anchors, and no Agent-disabled diagnostic. + +```bash +COLLECTION_AGENT_ENABLE_AGENT=true \ +COLLECTION_AGENT_POLL_TIMEOUT_MS=1200000 \ +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts \ +BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts \ +node benchmarks/dataset-agent/run-benchmark.mjs \ + --require-playwright-ready \ + --prompt-ids mcp-docs-pages \ + --timeout-ms 900000 \ + --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs' +``` + ## Verify Self-Healing Stack Use this before asking someone else to migrate a new collection agent into the @@ -102,10 +208,9 @@ bash scripts/verify-self-healing-stack.sh --convex-push --dataset-id --commit ``` -The live benchmark and dataset smoke expect required env vars to already be -exported in the shell. 
They print only missing key names and never print secret -values. The `--convex-push` mode still uses the existing `make convex-push` -target, which requires `frontend/.env.local`. +The live benchmark and dataset smoke load root `.env` when present. They print +only missing key names and never print secret values. The `--convex-push` mode +uses `make convex-push`, which also reads root `.env`. ## Benchmark Env diff --git a/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs b/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs index c9480ba..5888e67 100644 --- a/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs +++ b/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs @@ -2,6 +2,8 @@ import { pathToFileURL } from "node:url"; import { resolve } from "node:path"; +import { selfHealingDiagnosticsFromTick } from "./self-healing-output.mjs"; + const prompt = requiredEnv("BIGSET_BENCHMARK_PROMPT"); const promptId = process.env.BIGSET_BENCHMARK_PROMPT_ID ?? "benchmark-prompt"; const promptQuality = process.env.BIGSET_BENCHMARK_PROMPT_QUALITY ?? "unknown"; @@ -87,6 +89,7 @@ const service = new SelfHealingPopulateRecipeService({ }); const tick = await service.tick({ datasetId: context.datasetId, context }); const result = diagnosticRunForTick(tick); +const diagnostics = selfHealingDiagnosticsFromTick({ tick, run: result }); console.log(JSON.stringify({ rows: result?.rows ?? [], @@ -95,7 +98,16 @@ console.log(JSON.stringify({ ...minimumColumnIssues(result?.rows ?? []), ], usage: result?.usage ?? emptyUsage(), - metrics: result?.metrics ?? emptyMetrics(), + metrics: { + ...(result?.metrics ?? emptyMetrics()), + processTraceStepCount: diagnostics.processTrace?.stepCount ?? 0, + processTraceBrowserStepCount: diagnostics.processTrace?.browserStepCount ?? 0, + playwrightCandidateBrowserStepCount: + diagnostics.playwrightCandidateReadiness?.browserStepCount ?? 
/**
 * Builds the compact `diagnostics` object for a self-healing tick.
 *
 * Reads `tick.action`, `run.recipeId` and `run.artifacts`. A missing or
 * non-array `artifacts` value is treated as an empty list.
 *
 * @param {object} input
 * @param {object} [input.tick] - Tick result; only `action` is read.
 * @param {object} [input.run] - Run result; `recipeId` and `artifacts` are read.
 * @returns {object} `selfHealingAction`, `recipeId`, `artifactKinds` (string
 *   kinds only, in artifact order), plus `processTrace` and
 *   `playwrightCandidateReadiness`, each `undefined` when the matching
 *   artifact is absent or unusable.
 */
export function selfHealingDiagnosticsFromTick({ tick, run }) {
  const artifacts = Array.isArray(run?.artifacts) ? run.artifacts : [];
  const processTrace = processTraceSummaryFromArtifacts(artifacts);
  const playwrightCandidateReadiness = playwrightReadinessFromArtifacts(artifacts);

  return {
    selfHealingAction: tick?.action,
    recipeId: run?.recipeId,
    artifactKinds: artifacts
      .map((artifact) => artifact?.kind)
      .filter((kind) => typeof kind === "string"),
    processTrace,
    playwrightCandidateReadiness,
  };
}

/**
 * Summarizes the `process-trace` artifact into counts.
 *
 * @param {Array} artifacts - Run artifacts; entries may be nullish.
 * @returns {object|undefined} Count summary, or `undefined` when there is no
 *   usable `process-trace` artifact.
 */
function processTraceSummaryFromArtifacts(artifacts) {
  const trace = parsedJsonArtifact(artifacts, "process-trace");
  if (!trace) {
    return undefined;
  }
  const steps = Array.isArray(trace.steps) ? trace.steps : [];
  const sourceArtifacts = Array.isArray(trace.sourceArtifacts)
    ? trace.sourceArtifacts
    : [];
  const fetchedUrls = Array.isArray(trace.fetchedUrls) ? trace.fetchedUrls : [];
  const searchQueries = Array.isArray(trace.searchQueries)
    ? trace.searchQueries
    : [];
  const notes = Array.isArray(trace.notes)
    ? trace.notes.filter((note) => typeof note === "string")
    : [];

  return {
    runtime: typeof trace.runtime === "string" ? trace.runtime : "unknown",
    stepCount: steps.length,
    browserStepCount: steps.filter((step) => step?.kind === "browser").length,
    // Distinct http(s) URLs across fetched URLs and *succeeded* source
    // artifacts; non-string and non-http entries are dropped before counting.
    sourceUrlCount: new Set([
      ...fetchedUrls,
      ...sourceArtifacts
        .filter((artifact) => artifact?.status === "succeeded")
        .map((artifact) => artifact?.url),
    ].filter((url) => typeof url === "string" && /^https?:\/\//i.test(url))).size,
    searchQueryCount: searchQueries.length,
    fetchedUrlCount: fetchedUrls.length,
    // Cap the notes so the printed diagnostics stay compact.
    notes: notes.slice(0, 10),
  };
}

/**
 * Normalizes the `playwright-candidate-readiness` artifact.
 *
 * @param {Array} artifacts - Run artifacts; entries may be nullish.
 * @returns {object|undefined} `{ status, reasons, browserStepCount,
 *   sourceUrlCount }`, or `undefined` when there is no usable readiness
 *   artifact. Any status other than `"ready"` is reported as `"not_ready"`.
 */
function playwrightReadinessFromArtifacts(artifacts) {
  const readiness = parsedJsonArtifact(artifacts, "playwright-candidate-readiness");
  if (!readiness) {
    return undefined;
  }
  return {
    status: readiness.status === "ready" ? "ready" : "not_ready",
    reasons: Array.isArray(readiness.reasons)
      ? readiness.reasons.filter((reason) => typeof reason === "string")
      : [],
    browserStepCount: numberValue(readiness.browserStepCount),
    sourceUrlCount: numberValue(readiness.sourceUrlCount),
  };
}

/**
 * Parses the JSON content of the first artifact with the given kind.
 *
 * Returns `undefined` when no such artifact exists, its content is not a
 * string, the content is not valid JSON, or the parsed value is not a plain
 * JSON object. Rejecting arrays and primitives keeps a malformed payload
 * (for example `[]` or `"text"`) from being summarized as an empty but
 * "present" artifact.
 *
 * @param {Array} artifacts - Run artifacts; entries may be nullish.
 * @param {string} kind - Artifact kind to look up.
 * @returns {object|undefined} The parsed object, or `undefined`.
 */
function parsedJsonArtifact(artifacts, kind) {
  const artifact = artifacts.find((candidate) => candidate?.kind === kind);
  if (!artifact || typeof artifact.content !== "string") {
    return undefined;
  }
  let parsed;
  try {
    parsed = JSON.parse(artifact.content);
  } catch {
    // Best effort by design: an unparseable artifact is reported as missing.
    return undefined;
  }
  const isJsonObject =
    typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
  return isJsonObject ? parsed : undefined;
}

/**
 * Coerces a value with `Number()` and falls back to 0 when the result is not
 * finite.
 *
 * @param {*} value - Value to coerce.
 * @returns {number} The coerced number, or 0.
 */
function numberValue(value) {
  return Number.isFinite(Number(value)) ? Number(value) : 0;
}
null + : firstString([ + selfHealingActionGateReason({ diagnostics: normalized.diagnostics }), + playwrightReadinessGateReason({ + diagnostics: normalized.diagnostics, + requirePlaywrightReady: input.config.requirePlaywrightReady, + }), + ]); + const status = benchmarkStatusForOutcome({ + execution, + parsedPayload, + answerKeyScore, + infraBlockerReason, + capabilityGateReason, + }); const promptRunDirectory = join( input.runDirectory, @@ -597,9 +608,12 @@ async function runSystemPrompt(input) { expectedStress: input.promptDefinition.expectedStress, answerKey: answerKeyForPrompt(input.promptDefinition), status, - failureCategory: status === "ok" ? undefined : ( - infraBlockerReason ? "infra" : answerKeyScore.failureCategory - ), + failureCategory: failureCategoryForOutcome({ + status, + infraBlockerReason, + capabilityGateReason, + answerKeyScore, + }), factualAccuracyScore: answerKeyScore.factualAccuracyScore, entityCoverageRatio: answerKeyScore.entityCoverageRatio, domainAccuracyRatio: answerKeyScore.domainAccuracyRatio, @@ -627,6 +641,18 @@ async function runSystemPrompt(input) { needsReviewCount: validation.needsReviewCount, validationIssueCount: normalized.validationIssues.length, validationIssues: normalized.validationIssues, + selfHealingAction: normalized.diagnostics.selfHealingAction, + selfHealingArtifactKinds: normalized.diagnostics.artifactKinds, + processTraceStepCount: normalized.diagnostics.processTrace?.stepCount, + processTraceBrowserStepCount: + normalized.diagnostics.processTrace?.browserStepCount, + playwrightCandidateStatus: + normalized.diagnostics.playwrightCandidateReadiness?.status, + playwrightCandidateBrowserStepCount: + normalized.diagnostics.playwrightCandidateReadiness?.browserStepCount, + playwrightCandidateSourceUrlCount: + normalized.diagnostics.playwrightCandidateReadiness?.sourceUrlCount, + diagnostics: normalized.diagnostics, usage, searchCallCount: normalized.metrics.searchCallCount, fetchCallCount: 
normalized.metrics.fetchCallCount, @@ -645,6 +671,7 @@ async function runSystemPrompt(input) { validation, answerKeyScore, infraBlockerReason, + capabilityGateReason, minRequiredCompleteness: input.config.minRequiredCompleteness, validationIssues: normalized.validationIssues, }), @@ -746,6 +773,7 @@ function parseArgs(args) { tinyFishAgentStepUsd: 0.015, minRequiredCompleteness: 0.75, minFactualAccuracy: defaultMinimumFactualAccuracy, + requirePlaywrightReady: false, }; for (let index = 0; index < args.length; index += 1) { @@ -785,6 +813,8 @@ function parseArgs(args) { } else if (arg === "--min-factual-accuracy") { config.minFactualAccuracy = nonNegativeNumber(value, config.minFactualAccuracy); index += 1; + } else if (arg === "--require-playwright-ready") { + config.requirePlaywrightReady = true; } else if (arg === "--help" || arg === "-h") { printHelpAndExit(); } else { @@ -930,7 +960,7 @@ function extractLastJsonObject(value) { return null; } -function normalizePayload(payload) { +export function normalizePayload(payload) { const rows = arrayValue( payload?.rows ?? payload?.data ?? @@ -943,10 +973,12 @@ function normalizePayload(payload) { ); const metrics = payload?.metrics ?? payload?.benchmarkMetrics ?? {}; const usage = normalizeUsage(payload?.usage ?? metrics.usage ?? metrics); + const diagnostics = objectValue(payload?.diagnostics); return { rows, validationIssues, + diagnostics, usage, metrics: { searchCallCount: numberValue(metrics.searchCallCount ?? 
metrics.searchCalls), @@ -958,6 +990,78 @@ function normalizePayload(payload) { }; } +export function playwrightReadinessGateReason({ + diagnostics, + requirePlaywrightReady, +}) { + if (!requirePlaywrightReady) { + return null; + } + const readiness = diagnostics?.playwrightCandidateReadiness; + if (!readiness || typeof readiness !== "object") { + return "Playwright readiness gate failed: missing playwrightCandidateReadiness diagnostics."; + } + const reasons = stringArrayValue(readiness.reasons); + if (readiness.status !== "ready") { + return [ + "Playwright readiness gate failed:", + reasons.length > 0 + ? reasons.join("; ") + : `status is ${String(readiness.status ?? "missing")}.`, + ].join(" "); + } + if (numberValue(readiness.browserStepCount) <= 0) { + return "Playwright readiness gate failed: no actionable browser steps."; + } + if (numberValue(readiness.sourceUrlCount) <= 0) { + return "Playwright readiness gate failed: no source URLs to anchor replay."; + } + return null; +} + +export function selfHealingActionGateReason({ diagnostics }) { + if (diagnostics?.selfHealingAction !== "candidate_rejected") { + return null; + } + return "Self-healing gate failed: candidate recipe was rejected; rows came from a diagnostic run, not a promoted recipe."; +} + +export function benchmarkStatusForOutcome({ + execution, + parsedPayload, + answerKeyScore, + infraBlockerReason, + capabilityGateReason, +}) { + if (infraBlockerReason) { + return "blocked"; + } + if (capabilityGateReason) { + return "failed"; + } + return execution.exitCode === 0 && parsedPayload && answerKeyScore.passed + ? 
"ok" + : "failed"; +} + +export function failureCategoryForOutcome({ + status, + infraBlockerReason, + capabilityGateReason, + answerKeyScore, +}) { + if (status === "ok") { + return undefined; + } + if (infraBlockerReason) { + return "infra"; + } + if (capabilityGateReason) { + return "capability_gate"; + } + return answerKeyScore.failureCategory; +} + function normalizeUsage(value) { return { promptTokens: numberValue(value?.promptTokens ?? value?.inputTokens ?? value?.prompt_tokens), @@ -1027,7 +1131,7 @@ function evaluateRows({ rows, promptDefinition }) { }; } -async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { +export async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { const previousSummary = JSON.parse(await readFile(join(runDirectory, "summary.json"), "utf8")); const promptsById = new Map(prompts.map((promptDefinition) => [ promptDefinition.id, @@ -1075,11 +1179,22 @@ async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { parsedPayload: usablePayload, normalized, }); - const status = infraBlockerReason - ? "blocked" - : execution.exitCode === 0 && usablePayload && answerKeyScore.passed - ? "ok" - : "failed"; + const capabilityGateReason = infraBlockerReason + ? null + : firstString([ + selfHealingActionGateReason({ diagnostics: normalized.diagnostics }), + playwrightReadinessGateReason({ + diagnostics: normalized.diagnostics, + requirePlaywrightReady: config.requirePlaywrightReady, + }), + ]); + const status = benchmarkStatusForOutcome({ + execution, + parsedPayload: usablePayload, + answerKeyScore, + infraBlockerReason, + capabilityGateReason, + }); rescoredLaneResults.push({ ...laneResult, @@ -1089,9 +1204,12 @@ async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { expectedStress: promptDefinition.expectedStress, answerKey: answerKeyForPrompt(promptDefinition), status, - failureCategory: status === "ok" ? undefined : ( - infraBlockerReason ? 
"infra" : answerKeyScore.failureCategory - ), + failureCategory: failureCategoryForOutcome({ + status, + infraBlockerReason, + capabilityGateReason, + answerKeyScore, + }), factualAccuracyScore: answerKeyScore.factualAccuracyScore, entityCoverageRatio: answerKeyScore.entityCoverageRatio, domainAccuracyRatio: answerKeyScore.domainAccuracyRatio, @@ -1116,6 +1234,32 @@ async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { needsReviewCount: validation.needsReviewCount, validationIssueCount: normalized.validationIssues.length, validationIssues: normalized.validationIssues, + selfHealingAction: normalized.diagnostics.selfHealingAction, + selfHealingArtifactKinds: normalized.diagnostics.artifactKinds, + processTraceStepCount: normalized.diagnostics.processTrace?.stepCount, + processTraceBrowserStepCount: + normalized.diagnostics.processTrace?.browserStepCount, + playwrightCandidateStatus: + normalized.diagnostics.playwrightCandidateReadiness?.status, + playwrightCandidateBrowserStepCount: + normalized.diagnostics.playwrightCandidateReadiness?.browserStepCount, + playwrightCandidateSourceUrlCount: + normalized.diagnostics.playwrightCandidateReadiness?.sourceUrlCount, + diagnostics: normalized.diagnostics, + usage: normalized.usage, + searchCallCount: normalized.metrics.searchCallCount, + fetchCallCount: normalized.metrics.fetchCallCount, + browserCallCount: normalized.metrics.browserCallCount, + agentRunCount: normalized.metrics.agentRunCount, + agentStepCount: normalized.metrics.agentStepCount, + estimatedModelCostUsd: estimateModelCostUsd(normalized.usage, config), + estimatedTinyFishAgentCostUsd: roundUsd( + normalized.metrics.agentStepCount * config.tinyFishAgentStepUsd + ), + estimatedTotalCostUsd: roundUsd( + estimateModelCostUsd(normalized.usage, config) + + normalized.metrics.agentStepCount * config.tinyFishAgentStepUsd + ), errorMessage: status === "ok" ? 
undefined : failureReason({ @@ -1124,6 +1268,7 @@ async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { validation, answerKeyScore, infraBlockerReason, + capabilityGateReason, minRequiredCompleteness: config.minRequiredCompleteness, validationIssues: normalized.validationIssues, }), @@ -1638,6 +1783,7 @@ export function failureReason({ validation, answerKeyScore, infraBlockerReason, + capabilityGateReason, minRequiredCompleteness, validationIssues = [], }) { @@ -1645,6 +1791,7 @@ export function failureReason({ if (execution.timedOut) return "Command timed out."; if (execution.exitCode !== 0) return `Command exited ${execution.exitCode}.`; if (!parsedPayload) return "No parseable JSON object found in stdout."; + if (capabilityGateReason) return capabilityGateReason; const capabilityDiagnostic = capabilityDiagnosticReason(validationIssues); if (capabilityDiagnostic) return capabilityDiagnostic; if (answerKeyScore?.failureCategory === "clarification") { @@ -1681,6 +1828,13 @@ function arrayValue(value) { return Array.isArray(value) ? value : []; } +function objectValue(value) { + if (!value || Array.isArray(value) || typeof value !== "object") { + return {}; + } + return value; +} + function stringArrayValue(value) { if (Array.isArray(value)) { return value.filter((item) => typeof item === "string"); @@ -1691,6 +1845,10 @@ function stringArrayValue(value) { return []; } +function firstString(values) { + return values.find((value) => typeof value === "string" && value.length > 0) ?? null; +} + function singleStringArray(value) { return typeof value === "string" ? 
[value] : []; } @@ -1786,6 +1944,12 @@ node benchmarks/dataset-agent/run-benchmark.mjs \\ Rescore existing artifacts without spending credits: node benchmarks/dataset-agent/run-benchmark.mjs --rescore-dir benchmark-results/ +Require self-healing Playwright readiness for browser-action canaries: +node benchmarks/dataset-agent/run-benchmark.mjs \\ + --require-playwright-ready \\ + --prompt-ids mcp-docs-pages \\ + --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs' + Agent command contract: - stdout should contain a JSON object. - Preferred shape: { "rows": [], "validationIssues": [], "usage": {}, "metrics": {} } diff --git a/benchmarks/dataset-agent/run-benchmark.test.mjs b/benchmarks/dataset-agent/run-benchmark.test.mjs index 773557a..534f6c2 100644 --- a/benchmarks/dataset-agent/run-benchmark.test.mjs +++ b/benchmarks/dataset-agent/run-benchmark.test.mjs @@ -1,11 +1,21 @@ import assert from "node:assert/strict"; +import { mkdir, mkdtemp, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; import { test } from "node:test"; import { + benchmarkStatusForOutcome, + failureCategoryForOutcome, failureReason, findInfrastructureBlockerReason, + normalizePayload, + playwrightReadinessGateReason, + rescoreBenchmarkRun, scoreBenchmarkRows, + selfHealingActionGateReason, } from "./run-benchmark.mjs"; +import { selfHealingDiagnosticsFromTick } from "./adapters/self-healing-output.mjs"; const passingValidation = { rowCount: 1, @@ -177,3 +187,319 @@ test("domain scoring counts product, careers, and docs URL cells", () => { assert.equal(score.domainAccuracyRatio, 1, `${item.label} domain`); } }); + +test("self-healing diagnostics summarize trace and readiness artifacts", () => { + const diagnostics = selfHealingDiagnosticsFromTick({ + tick: { action: "generated_initial_recipe" }, + run: { + recipeId: "recipe-v1", + 
artifacts: [ + { + kind: "process-trace", + content: JSON.stringify({ + runtime: "collection", + searchQueries: ["example"], + fetchedUrls: ["https://example.com"], + sourceArtifacts: [{ + url: "https://example.com", + status: "succeeded", + }], + steps: [ + { kind: "search" }, + { kind: "browser" }, + ], + }), + }, + { + kind: "playwright-candidate-readiness", + content: JSON.stringify({ + status: "ready", + reasons: [], + browserStepCount: 1, + sourceUrlCount: 1, + }), + }, + ], + }, + }); + const normalized = normalizePayload({ + rows: [], + validationIssues: [], + diagnostics, + }); + + assert.equal(normalized.diagnostics.selfHealingAction, "generated_initial_recipe"); + assert.deepEqual(normalized.diagnostics.artifactKinds, [ + "process-trace", + "playwright-candidate-readiness", + ]); + assert.equal(normalized.diagnostics.processTrace.runtime, "collection"); + assert.equal(normalized.diagnostics.processTrace.stepCount, 2); + assert.equal(normalized.diagnostics.processTrace.browserStepCount, 1); + assert.equal( + normalized.diagnostics.playwrightCandidateReadiness.status, + "ready" + ); +}); + +test("Playwright readiness gate fails otherwise passing benchmark output", () => { + const capabilityGateReason = playwrightReadinessGateReason({ + requirePlaywrightReady: true, + diagnostics: notReadyDiagnostics(), + }); + const answerKeyScore = { passed: true, failureCategory: undefined }; + const status = benchmarkStatusForOutcome({ + execution: { exitCode: 0 }, + parsedPayload: { rows: passingRows() }, + answerKeyScore, + infraBlockerReason: null, + capabilityGateReason, + }); + + assert.equal(status, "failed"); + assert.match(capabilityGateReason, /no actionable browser steps/i); + assert.equal(failureCategoryForOutcome({ + status, + infraBlockerReason: null, + capabilityGateReason, + answerKeyScore, + }), "capability_gate"); + assert.equal(failureReason({ + execution: { exitCode: 0, timedOut: false }, + parsedPayload: { rows: passingRows() }, + validation: 
passingValidation, + answerKeyScore, + infraBlockerReason: null, + capabilityGateReason, + minRequiredCompleteness: 0.75, + }), capabilityGateReason); +}); + +test("Playwright readiness gate does not override infrastructure blockers", () => { + const infraBlockerReason = "Infrastructure/auth/credits blocker."; + const capabilityGateReason = null; + const answerKeyScore = { passed: true, failureCategory: undefined }; + const status = benchmarkStatusForOutcome({ + execution: { exitCode: 0 }, + parsedPayload: null, + answerKeyScore, + infraBlockerReason, + capabilityGateReason, + }); + + assert.equal(status, "blocked"); + assert.equal(failureCategoryForOutcome({ + status, + infraBlockerReason, + capabilityGateReason, + answerKeyScore, + }), "infra"); +}); + +test("self-healing rejection gate fails otherwise passing benchmark output", () => { + const capabilityGateReason = selfHealingActionGateReason({ + diagnostics: { + selfHealingAction: "candidate_rejected", + }, + }); + const answerKeyScore = { passed: true, failureCategory: undefined }; + const status = benchmarkStatusForOutcome({ + execution: { exitCode: 0 }, + parsedPayload: { rows: passingRows() }, + answerKeyScore, + infraBlockerReason: null, + capabilityGateReason, + }); + + assert.equal(status, "failed"); + assert.match(capabilityGateReason, /candidate recipe was rejected/i); + assert.equal(failureCategoryForOutcome({ + status, + infraBlockerReason: null, + capabilityGateReason, + answerKeyScore, + }), "capability_gate"); + assert.equal(failureReason({ + execution: { exitCode: 0, timedOut: false }, + parsedPayload: { rows: passingRows() }, + validation: passingValidation, + answerKeyScore, + infraBlockerReason: null, + capabilityGateReason, + minRequiredCompleteness: 0.75, + }), capabilityGateReason); +}); + +test("self-healing rejection gate does not override infrastructure blockers", () => { + const infraBlockerReason = "Infrastructure/auth/credits blocker."; + const capabilityGateReason = null; + const 
answerKeyScore = { passed: true, failureCategory: undefined }; + const status = benchmarkStatusForOutcome({ + execution: { exitCode: 0 }, + parsedPayload: { + rows: passingRows(), + diagnostics: { + selfHealingAction: "candidate_rejected", + }, + }, + answerKeyScore, + infraBlockerReason, + capabilityGateReason, + }); + + assert.equal(status, "blocked"); + assert.equal(failureCategoryForOutcome({ + status, + infraBlockerReason, + capabilityGateReason, + answerKeyScore, + }), "infra"); +}); + +test("rescore applies Playwright readiness gate semantics", async () => { + const runDirectory = await mkdtemp(join(tmpdir(), "bigset-benchmark-rescore-")); + const artifactDirectory = join(runDirectory, "collection-self-heal", "01-gate-prompt"); + await mkdir(artifactDirectory, { recursive: true }); + + const parsedPayload = { + rows: passingRows(), + validationIssues: [], + diagnostics: notReadyDiagnostics(), + }; + await writeFile( + join(runDirectory, "summary.json"), + JSON.stringify({ + laneResults: [{ + system: "collection-self-heal", + promptId: "gate-prompt", + promptQuality: "good", + artifactDirectory, + exitCode: 0, + timedOut: false, + }], + }) + ); + await writeFile( + join(artifactDirectory, "parsed-output.json"), + JSON.stringify(parsedPayload) + ); + await writeFile(join(artifactDirectory, "stdout.txt"), JSON.stringify(parsedPayload)); + await writeFile(join(artifactDirectory, "stderr.txt"), ""); + + const rescored = await rescoreBenchmarkRun({ + runDirectory, + prompts: [{ + id: "gate-prompt", + quality: "good", + persona: "developer", + prompt: "Find official docs.", + expectedStress: "Browser action gate.", + requiredColumns: ["entity_name", "source_url"], + }], + config: { + promptIds: null, + minRequiredCompleteness: 0.75, + minFactualAccuracy: 0.75, + requirePlaywrightReady: true, + inputUsdPer1M: 0.05, + outputUsdPer1M: 0.5, + tinyFishAgentStepUsd: 0.015, + }, + }); + + assert.equal(rescored.laneResults[0].status, "failed"); + 
assert.equal(rescored.laneResults[0].failureCategory, "capability_gate"); + assert.match(rescored.laneResults[0].errorMessage, /no actionable browser steps/i); + assert.equal(rescored.laneResults[0].playwrightCandidateStatus, "not_ready"); +}); + +test("rescore applies self-healing rejection gate semantics", async () => { + const runDirectory = await mkdtemp(join(tmpdir(), "bigset-benchmark-rescore-")); + const artifactDirectory = join(runDirectory, "collection-self-heal", "01-rejected-prompt"); + await mkdir(artifactDirectory, { recursive: true }); + + const parsedPayload = { + rows: passingRows(), + validationIssues: [], + diagnostics: { + selfHealingAction: "candidate_rejected", + }, + }; + await writeFile( + join(runDirectory, "summary.json"), + JSON.stringify({ + laneResults: [{ + system: "collection-self-heal", + promptId: "rejected-prompt", + promptQuality: "good", + artifactDirectory, + exitCode: 0, + timedOut: false, + }], + }) + ); + await writeFile( + join(artifactDirectory, "parsed-output.json"), + JSON.stringify(parsedPayload) + ); + await writeFile(join(artifactDirectory, "stdout.txt"), JSON.stringify(parsedPayload)); + await writeFile(join(artifactDirectory, "stderr.txt"), ""); + + const rescored = await rescoreBenchmarkRun({ + runDirectory, + prompts: [{ + id: "rejected-prompt", + quality: "good", + persona: "developer", + prompt: "Find official docs.", + expectedStress: "Self-healing rejection gate.", + requiredColumns: ["entity_name", "source_url"], + }], + config: { + promptIds: null, + minRequiredCompleteness: 0.75, + minFactualAccuracy: 0.75, + requirePlaywrightReady: false, + inputUsdPer1M: 0.05, + outputUsdPer1M: 0.5, + tinyFishAgentStepUsd: 0.015, + }, + }); + + assert.equal(rescored.laneResults[0].status, "failed"); + assert.equal(rescored.laneResults[0].failureCategory, "capability_gate"); + assert.match(rescored.laneResults[0].errorMessage, /candidate recipe was rejected/i); + assert.equal(rescored.laneResults[0].selfHealingAction, 
"candidate_rejected"); +}); + +function passingRows() { + return [{ + cells: { + entity_name: "Example", + source_url: "https://example.com/docs", + }, + sourceUrls: ["https://example.com/docs"], + evidence: [{ + columnName: "entity_name", + sourceUrl: "https://example.com/docs", + quote: "Example docs", + }], + }]; +} + +function notReadyDiagnostics() { + return { + playwrightCandidateReadiness: { + status: "not_ready", + reasons: ["Trace has no actionable browser steps with URL/selector/target data."], + browserStepCount: 0, + sourceUrlCount: 1, + }, + processTrace: { + runtime: "collection", + stepCount: 3, + browserStepCount: 0, + sourceUrlCount: 1, + }, + }; +} diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 05ab9c7..8dcb087 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -20,16 +20,27 @@ services: build: context: ./backend dockerfile: Dockerfile.dev + env_file: + - .env ports: - "3501:3501" volumes: - ./backend/src:/app/src + - ./backend/BigSet_Data_Collection_Agent:/app/BigSet_Data_Collection_Agent - populate_recipe_data:/app/.bigset environment: CLIENT_ORIGIN: http://localhost:3500 CONVEX_URL: http://convex:3210 PORT: 3501 POPULATE_RECIPE_STORE_DIR: /app/.bigset/populate-recipes + POPULATE_AGENT_RUNTIME: ${POPULATE_AGENT_RUNTIME:-} + POPULATE_COLLECTION_RUNNER_MODULE: ${POPULATE_COLLECTION_RUNNER_MODULE:-} + COLLECTION_AGENT_PIPELINE_MODULE: ${COLLECTION_AGENT_PIPELINE_MODULE:-} + COLLECTION_AGENT_ENABLE_TRIAGE: ${COLLECTION_AGENT_ENABLE_TRIAGE:-} + COLLECTION_AGENT_ENABLE_AGENT: ${COLLECTION_AGENT_ENABLE_AGENT:-} + COLLECTION_AGENT_POLL_TIMEOUT_MS: ${COLLECTION_AGENT_POLL_TIMEOUT_MS:-} + AGENT_POLL_TIMEOUT_MS: ${AGENT_POLL_TIMEOUT_MS:-} + AGENT_REQUEST_TIMEOUT_MS: ${AGENT_REQUEST_TIMEOUT_MS:-} CONVEX_SELF_HOSTED_ADMIN_KEY: ${CONVEX_SELF_HOSTED_ADMIN_KEY:-} CLERK_SECRET_KEY: ${CLERK_SECRET_KEY:-} CLERK_PUBLISHABLE_KEY: ${NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY:-} @@ -43,6 +54,8 @@ services: build: context: ./backend 
dockerfile: Dockerfile.mastra + env_file: + - .env ports: - "4111:4111" volumes: @@ -54,6 +67,8 @@ services: CONVEX_URL: http://convex:3210 CONVEX_SELF_HOSTED_ADMIN_KEY: ${CONVEX_SELF_HOSTED_ADMIN_KEY:-} TINYFISH_API_KEY: ${TINYFISH_API_KEY:-} + AGENT_POLL_TIMEOUT_MS: ${AGENT_POLL_TIMEOUT_MS:-} + AGENT_REQUEST_TIMEOUT_MS: ${AGENT_REQUEST_TIMEOUT_MS:-} depends_on: convex: condition: service_healthy @@ -62,6 +77,8 @@ services: build: context: ./frontend dockerfile: Dockerfile.dev + env_file: + - .env ports: - "3500:3500" volumes: diff --git a/docs/assets/bigset-populate-before-state.svg b/docs/assets/bigset-populate-before-state.svg new file mode 100644 index 0000000..43f8a12 --- /dev/null +++ b/docs/assets/bigset-populate-before-state.svg @@ -0,0 +1 @@ +

Backend path reached

Browser Convex calls

Signed-in BigSet browser

frontend shows generic retry error

rows stay 0

Prompt
'yc companies hiring right now'

Generated schema
9 requested columns

Dataset page
Yc Companies Hiring

Clear & Populate

datasets.create
signed-in user

datasets.get + datasetRows.list
signed-in user

POST /populate
frontend sends auth header

convex.query(api.datasets.get)
backend dataset lookup

Convex sees caller
anonymous

BLOCKER
[authz] deny op=read
reason=anonymous_private
Dataset not found

Not Mastra yet
Not validation yet
No row write attempted

\ No newline at end of file diff --git a/docs/assets/bigset-populate-target-loop.svg b/docs/assets/bigset-populate-target-loop.svg new file mode 100644 index 0000000..31174d7 --- /dev/null +++ b/docs/assets/bigset-populate-target-loop.svg @@ -0,0 +1 @@ +

Replay / repair

First run

Source planning

no

yes

yes

no

no

yes

pass

fail

Frontend prompt + dataset page

Backend /populate

Rank sources
triage fetched pages

Browser-heavy page?

TinyFish Search / Fetch

BrowserActionBox
TinyFish Agent + trace recorder

Normalize trace
into browser actions

Draft Playwright script
when readiness is ready

Promoted Playwright script exists

Replay runner configured?

Replay through BrowserActionBox

Diagnostic only
no fake rows

Replay failed?

One bounded repair attempt

Same validation gate

Accepted rows + evidence

Reject run
no direct row writes

\ No newline at end of file diff --git a/docs/assets/bigset-populate-validation-gate.svg b/docs/assets/bigset-populate-validation-gate.svg new file mode 100644 index 0000000..0252bb7 --- /dev/null +++ b/docs/assets/bigset-populate-validation-gate.svg @@ -0,0 +1 @@ +

Replay diagnostics

Bad run path

Good run path

Validation checks

yes

no

yes

no

Candidate extraction run
rows + sources + evidence + trace

Nonzero row count

Requested columns filled

Source URL coverage

Evidence quote coverage

Expected entities covered
when prompt names them

No critical runtime issue

All required checks pass?

Promote recipe
trusted reusable plan

Commit rows safely
Convex datasetRows

Surface result in UI
count, source, evidence

Reject candidate

Write no fake rows

Save rejection reason
for debugging and repair

Explicit browser actions exist?

Replay readiness: ready

Generate draft Playwright script

No script
not enough browser-action data

\ No newline at end of file diff --git a/docs/assets/bigset-self-healing-data-collection.png b/docs/assets/bigset-self-healing-data-collection.png new file mode 100644 index 0000000..22bba04 Binary files /dev/null and b/docs/assets/bigset-self-healing-data-collection.png differ diff --git a/docs/assets/bigset-self-healing-plain-english.svg b/docs/assets/bigset-self-healing-plain-english.svg new file mode 100644 index 0000000..74fc6c9 --- /dev/null +++ b/docs/assets/bigset-self-healing-plain-english.svg @@ -0,0 +1,118 @@ + + BigSet self-healing data collection flow + Plain-English diagram showing how a BigSet prompt becomes sourced dataset rows, how bad output is rejected, and which browser replay pieces are still future work. + + + + + + + + + + + + + + + BigSet self-healing data collection + Goal: user prompt in the app -> real sourced rows in the dataset, with bad output blocked. + + + 1. User prompt + Ask BigSet for a table + + + 2. Frontend + Create dataset shell + + + 3. Backend /populate + Start data collection + + + 4. Search + fetch + Find real sources + + + 5. Fill rows + Use fetched pages + Use browser data when needed + + + + + + + + Browser helper + TinyFish clicks pages + when fetch is not enough + + + Draft replay script + Save browser steps + Not trusted cron yet + + + + + + + SELF-HEALING LAYER + + Check output + Rows + sources + evidence + + + Good output? + Matches required table + + + + + + Good: save + write rows + Save extraction plan + Write rows to DB with safety throttle + + + Bad: reject + repair + Do not save fake success + Search/fetch missing cells + + + + + + + FUTURE REPLAY + Scheduled browser rerun + Cheap rerun after script is proven + Script repair + If site changes, live agent regenerates script + + + + + STATUS + Built now: validation, reject bad runs, save good extraction plans, row-write safety throttle, draft browser replay script generation. 
+ Still next: full Mastra app integration, scheduled replay, script auto-repair, production-scale proof. + diff --git a/docs/data-collection-agent-migration-plan.md b/docs/data-collection-agent-migration-plan.md index 2bb1847..178a617 100644 --- a/docs/data-collection-agent-migration-plan.md +++ b/docs/data-collection-agent-migration-plan.md @@ -18,7 +18,8 @@ the collection pipeline is migrated into BigSet. a runner module from `POPULATE_COLLECTION_RUNNER_MODULE`. - PR #41 adds a `collection-self-heal` benchmark lane that wraps the collection runtime inside `SelfHealingPopulateRecipeService`. This is the benchmark - socket Meteor can use once the real collection runner is available. + socket a collection-runtime producer can use once the real collection runner + is available. - PR #43 ports the real vendored collection pipeline behind `runCollectionPopulatePipeline(input)`, so the collection benchmark lane now runs the BigSet-wrapped collection runner instead of a fake injected runner. @@ -28,9 +29,22 @@ the collection pipeline is migrated into BigSet. without injecting answer-key URLs at runtime. - PR #46 surfaces no-Agent browser/form/detail follow-up as a safe capability diagnostic instead of hiding it as generic bad data or infra failure. +- PR #47-#52 document and improve collection benchmark evidence, source + coherence, official-source support, and URL-like source evidence. PR #52 fixes + the `official_website` / `company_website` / `product_url` scoring class. +- PR #53-#60 add the self-healing process trace, Playwright readiness artifact, + explicit browser-action ingestion contract, Agent provenance diagnostics, + readiness benchmark gate, and rejected-candidate benchmark gate. +- PR #60 is the current top of the draft self-healing/collection stack. It makes + `selfHealingAction: "candidate_rejected"` fail benchmark scoring with + `failureCategory: "capability_gate"`, even when diagnostic rows match answer + keys. 
+- This branch adds a commit-path row cap for self-healing writes. Commit mode + defaults to a configurable safety throttle and can be overridden with + `POPULATE_COMMIT_ROW_LIMIT_PER_HOUR` or `--commit-row-limit-per-hour`. - `feat/data-collection-agent-v14` is no longer the branch to build on directly. It was the source of the collection pipeline port. New work should branch on - top of the current draft stack, not edit Meteor's branch or the dirty main + top of the current draft stack, not edit the external collection branch or the dirty main checkout. ## Target Shape @@ -63,6 +77,8 @@ The current layer: - stores active recipes and run records in a filesystem recipe store on the durable app/commit path +- persists each run's artifacts on the run record, including a structured + `process-trace` artifact when the runtime exposes one - reruns the active recipe when one exists - generates an initial recipe when no active recipe exists - repairs a failed active recipe through `DefaultPopulateRecipeAuthor` @@ -72,6 +88,7 @@ The current layer: - promotes a repaired recipe only if it is valid and does not score below the active recipe baseline - commits rows only after a successful tick, using one Convex atomic replace +- enforces a configurable per-dataset hourly row cap before committing rows - supports a CLI path for cron/live smoke via `populate:self-heal --dataset-id` Dry-run and benchmark paths intentionally use in-memory stores so they do not @@ -84,12 +101,40 @@ The current layer now can: - run the real vendored collection pipeline through that same boundary - preserve `recipe.runtimeInstructions`, required columns, and benchmark metadata through the collection runner +- expose structured trace data for both Mastra and collection runs: + `runtime`, `searchQueries`, `fetchedUrls`, `sourceArtifacts`, + `selectedRowSource`, `notes`, and ordered `steps` +- expose a `playwright-candidate-readiness` artifact that explains whether the + trace is grounded enough to 
compile a Playwright replay script +- emit a `playwright-candidate-script` artifact with + `runDatasetRecipe(context)` when readiness is `ready` +- represent browser actions in the trace contract when a future Agent/canary + records URL transitions, selectors, target text, or redacted input + descriptions +- ingest explicit collection runner `browser_actions` / + `agent_browser_actions` report fields into `browser` trace steps without + inferring missing clicks, selectors, or form inputs from source URLs +- preserve explicit `browser_actions` from TinyFish Agent results in + `agent_runs_*.json`, `run_report.initial.agent_browser_actions`, repair-loop + `agent_browser_actions`, without duplicating them into top-level report fields +- map browser action reports mechanically: `target_text` to `targetText`, + `value_description` to `valueDescription`, `status` to the trace-step status, + `error` to the trace-step error, `phase` to `step.input.phase`, and unknown + action names to `browserAction.action = "unknown"` - emit a capability diagnostic when no-Agent mode sees pages that need browser, form, or detail-page follow-up The current layer does not yet: -- generate Playwright scripts as a durable production recipe +- promote Playwright scripts as durable production recipes +- run cron from compiled Playwright scripts +- repair or promote Playwright scripts; repair still changes durable runtime + instructions only +- compile search/fetch-only traces into Playwright; traces must include + actionable browser steps before the script compiler is allowed to emit a + candidate +- infer browser selectors, clicks, or form values from source outcomes; the + collection runner or Agent canary must emit those as explicit action fields - run a green live Convex canary in this local environment - prove Agent-enabled collection quality on a full real benchmark - prove the collection runtime should replace Mastra as the default app runtime @@ -98,8 +143,8 @@ The current layer does not 
yet: 1. Branch from the top of the self-healing stack. - For new collection-runner or benchmark work, base on - `codex/collection-capability-diagnostics` unless that PR has been - superseded. + `codex/benchmark-self-healing-action-gate` unless that PR has been + superseded by a newer reviewed stack tip. - Do not edit `main`, the dirty local checkout, or `feat/data-collection-agent-v14` directly. @@ -153,6 +198,24 @@ The current layer does not yet: - 2-prompt real benchmark - 1-prompt Agent-enabled capability canary for prompts that need browser or detail follow-up + - browser-step trace canary that records URL transitions, selectors/targets, + and redacted form-input descriptions before any Playwright compiler is + enabled + - confirm the canary emits explicit `agent_browser_actions` or equivalent + fields in the collection report; source outcomes alone are not enough + - check `summary.json` for `playwrightCandidateStatus`, + `processTraceBrowserStepCount`, and + `playwrightCandidateBrowserStepCount` so the canary proves browser-action + provenance, not only row/evidence quality + - run browser-action canaries with `--require-playwright-ready` so row + quality cannot hide missing replayable browser-action provenance + - inspect Agent run provenance fields (`agent_step_count`, + `has_streaming_url`, and `result_keys`) when readiness fails; these fields + prove browser work happened without persisting raw streaming URLs or + pretending selectors/clicks exist + - treat `selfHealingAction: "candidate_rejected"` as a capability failure + even if diagnostic rows score well; rejected rows are debug output, not a + promotable self-healing recipe - full benchmark only after the 2-prompt run is not obviously broken - live `--dataset-id` dry-run only after Convex/env prerequisites are ready - `--commit` only on a throwaway dataset first @@ -193,10 +256,13 @@ Before any merge: follow-up - live dataset commit is tested only on a throwaway dataset - backend build does not 
depend on `frontend/convex/_generated` +- commit-mode row caps block Convex writes before the cap is exceeded and skip + runtime work when the cap is already exhausted -## Meteor Handoff Shape +## Collection Runtime Handoff Shape -Meteor does not need to rebuild the self-healing wrapper. The socket is now: +The collection-runtime owner does not need to rebuild the self-healing wrapper. +The socket is now: ```text runCollectionPopulatePipeline(CollectionPopulatePipelineInput) @@ -209,6 +275,23 @@ collection runner ignores `recipeInstructions`, repaired recipes cannot change future behavior. If it ignores `requiredColumns` or benchmark metadata, the benchmark can stop measuring the same task. +For the Playwright handoff, the collection runtime can optionally emit `browser_actions` and +`agent_browser_actions` in the collection report. BigSet preserves each array's +order and appends `browser_actions` before `agent_browser_actions` when both are +present in the same report scope. This is a wrapper ingestion contract only; the +current vendored pipeline is not claimed to emit those fields yet. + +The TinyFish Agent goal generator now asks the Agent itself to include +`agent_browser_actions` beside `records` in its result JSON. That makes the +producer responsible for ordered navigation/click/type/extract actions instead +of asking the self-healing layer to infer browser behavior after the fact. + +If TinyFish Agent result JSON includes explicit `browser_actions` or +`agent_browser_actions`, the vendored runner now carries those arrays into the +saved Agent run records and phase-scoped run report fields. Generic `actions` +arrays are intentionally ignored because they are not a browser-specific +contract. 
+ The real benchmark command after a runner module exists is: ```bash @@ -224,7 +307,8 @@ Agent explicitly enabled: ```bash COLLECTION_AGENT_ENABLE_AGENT=true \ -COLLECTION_AGENT_POLL_TIMEOUT_MS=480000 \ +COLLECTION_AGENT_POLL_TIMEOUT_MS=1200000 \ +AGENT_REQUEST_TIMEOUT_MS=15000 \ COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts \ BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts \ node benchmarks/dataset-agent/run-benchmark.mjs \ @@ -243,35 +327,43 @@ That is not a pass, but it is useful: it tells us the next benchmark should turn Agent on and measure whether browser/detail follow-up fixes the source evidence miss. -Agent-enabled `mcp-docs-pages` evidence from the stack-handoff branch: - -- artifact: `benchmark-results/collection-agent-canary-mcp-20260523-001` -- result: 3 rows, 12 evidence quotes, 10 source URLs, 3 Agent runs -- cost: about `$0.053552` -- status: failed, not blocked -- score: factual accuracy `0.933`, entity coverage `1.0`, claim support `1.0`, - domain accuracy `0.667` -- conclusion: Agent/browser follow-up runs successfully and improves claim - support, but source/domain evidence still misses. The next code target is - source coherence: keep each row's docs URL/evidence/source URLs aligned with - that entity's official docs domain instead of merging discovery/blog/course - evidence across vendors. 
+Latest Agent-enabled `mcp-docs-pages` evidence from the provenance diagnostics +branch, rescored with the rejected-candidate gate: + +- artifact: `benchmark-results/collection-agent-provenance-mcp-20260523-001` +- result: 3 rows, 5 evidence quotes, 5 source URLs, 1 Agent run, 20 reported + Agent steps +- cost: about `$0.307769` +- score: factual accuracy `1.0`, entity coverage `1.0`, domain accuracy `1.0`, + claim support `1.0` +- self-healing action: `candidate_rejected` +- Playwright readiness: `not_ready`, with zero replayable browser steps +- status after PR #60 rescore: failed with `failureCategory: + "capability_gate"` +- conclusion: the collection Agent can collect useful rows for this prompt, but + the self-healing layer correctly refuses to treat a rejected diagnostic run as + a promotable cron recipe. TinyFish reported browser work happened, but the + exposed run payload still did not contain explicit replayable browser actions. ## Next Engineering Move -Create a fresh branch from `codex/collection-capability-diagnostics` and fix -source coherence before running the full benchmark: - -1. Keep `COLLECTION_AGENT_ENABLE_AGENT=false` as the default. -2. Add focused tests around record merge/source selection so a row does not gain - evidence for a populated field from another record unless the incoming row - value supports the existing value. -3. Tighten docs/official-source selection so docs prompts prefer docs/developers - pages over blogs, news, courses, directories, or third-party discovery pages. -4. Re-run the Agent-enabled `mcp-docs-pages` canary. -5. If domain accuracy reaches `1.0`, run the 4-prompt focused benchmark from - PR #45. -6. Run the full prompt pack only after the focused benchmark is not obviously +Create fresh branches from the current rollup/producer stack. Do not edit +`main`, the external collection branch, or the dirty local checkout. + +1. Ask the migrated collection agent to emit explicit action traces. 
+ - Preferred fields are `browser_actions` or `agent_browser_actions`. + - Each action should include at least URL or selector/target text plus safe, + redacted value descriptions for form inputs. + - Do not build a Playwright compiler against search/fetch-only traces. +2. Re-run the Agent-enabled `mcp-docs-pages` canary with: + - `COLLECTION_AGENT_ENABLE_AGENT=true` + - `--require-playwright-ready` + - PR #60's rejected-candidate gate +3. When that canary produces `selfHealingAction` other than + `candidate_rejected` and `playwrightCandidateStatus: "ready"`, inspect the + `playwright-candidate-script` artifact and promote the script runner/cron + contract behind a separate gate. +4. Run the full prompt pack only after the focused canaries are not obviously broken. When testing the real app or CLI path, set: @@ -280,13 +372,16 @@ When testing the real app or CLI path, set: POPULATE_AGENT_RUNTIME=collection POPULATE_COLLECTION_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts +POPULATE_COMMIT_ROW_LIMIT_PER_HOUR=1000 ``` The BigSet runner keeps TinyFish Agent/browser calls disabled unless `COLLECTION_AGENT_ENABLE_AGENT=true`. This makes cron and benchmark reruns cheap and repeatable first. Agent-enabled runs should also set `COLLECTION_AGENT_POLL_TIMEOUT_MS` or `AGENT_POLL_TIMEOUT_MS` so a browser run -cannot outlive the benchmark/job budget. +cannot outlive the benchmark/job budget. `AGENT_REQUEST_TIMEOUT_MS` caps each +TinyFish Agent queue/poll/cancel HTTP request so one hung `runs.get` call cannot +outlive the app-level poll timeout. 
Do not switch the default runtime from Mastra to collection until the self-healing-wrapped collection benchmark has better evidence than the current diff --git a/docs/self-healing-data-collection-flow.html b/docs/self-healing-data-collection-flow.html new file mode 100644 index 0000000..6ff5c73 --- /dev/null +++ b/docs/self-healing-data-collection-flow.html @@ -0,0 +1,332 @@ + + + + + + BigSet PR67 Self-Healing Flow + + + +
+
+
+

BigSet PR67 Self-Healing Flow

+

+ Public-safe note for the in-app populate path. It covers source planning, + TinyFish trace normalization, and the BrowserActionBox replay loop that still + goes through the same validation gate before any row write. +

+
+
+ Source planning in backend + TinyFish trace canary + Chromium replay runner +
+
+ +
+
+ Backend +

+ `/populate` now runs source planning, ranking, and triage in + `backend/src/pipeline/populate-source-planner.ts` before browser work starts. +

+
+
+ Traces +

+ `backend/src/pipeline/populate-tinyfish-trace-recorder.ts` normalizes TinyFish + raw run steps and artifacts into replayable browser actions when the trace has + enough signal. +

+
+
+ Replay +

+ `backend/src/pipeline/populate-browser-action-box.ts` covers first run, + replay, and one bounded repair attempt. The backend Docker image includes + `playwright-core` and system Chromium for local replay. +

+
+
+ +
+

Trust Shell

+

+ Validation is mandatory. Replay or repair can promote a script, but their output still + flows through the same validation gate before any row is written. +

+
+
+ First run +

+ BrowserActionBox records the TinyFish trace, normalizes browser actions, and + drafts a Playwright script when readiness is `ready`. +

+
+
+ Replay +

+ A promoted Playwright script can be replayed through local Chromium when the + source URL, schema, and goal match. A failed replay gets one bounded repair attempt, + and when replay is disabled the run still records diagnostics instead of fake rows. +

+
+
+ Artifacts +

+ Public artifacts now include `tinyfish-trace`, + `playwright-replay-result`, `playwright-repair-diagnostic`, + `playwright-repaired-script`, and `validation-result`. +

+
+
+
+ +
+
+
+
+

BrowserActionBox Flow

+

Source planning feeds first-run tracing, then replay or one repair attempt goes back through the same validation gate.

+
+ Open SVG +
+
+ BrowserActionBox first-run and second-run populate flow diagram +
+
+ +
+
+
+

Validation Gate

+

Rows must have useful cells, sources, evidence, and expected-entity coverage before any row is written.

+
+ Open SVG +
+
+ Self-healing validation gate diagram +
+
+
+
+ + diff --git a/docs/self-healing-data-collection-flow.md b/docs/self-healing-data-collection-flow.md new file mode 100644 index 0000000..74e04cf --- /dev/null +++ b/docs/self-healing-data-collection-flow.md @@ -0,0 +1,114 @@ +# BigSet PR67 Self-Healing Flow + +Public-safe note for the in-app populate path. No transcripts, no private links, no secrets. + +## Overview + +1. The frontend sends the dataset prompt and context to backend `/populate`. +2. Backend loads the dataset, then runs source planning, ranking, and triage in `backend/src/pipeline/populate-source-planner.ts`. +3. TinyFish trace normalization in `backend/src/pipeline/populate-tinyfish-trace-recorder.ts` turns agent runs into replayable browser actions when the trace supports it. +4. BrowserActionBox in `backend/src/pipeline/populate-browser-action-box.ts` handles first run, replay, and one bounded repair attempt. +5. Validation still decides `accepted_full`, `accepted_partial`, or `rejected`. +6. Replay and repair reuse the same validation path. They do not write rows directly. +7. Only validated results can be promoted or written. + +## What PR67 proves now + +- Source planning, ranking, and triage now live in the backend. +- TinyFish traces normalize into the same runtime shape used by populate. +- A real TinyFish canary now captures raw run steps, artifacts, evidence-backed rows, replayable browser actions, and a draft Playwright script. +- BrowserActionBox replay and one-shot repair are wired into the self-healing contract and covered by fixture tests. +- Replay and repair do not create rows on their own. +- Public artifacts now include `tinyfish-trace`, `playwright-replay-result`, `playwright-repair-diagnostic`, `playwright-repaired-script`, and `validation-result`. + +## Honest Proof Status + +| Area | Status | +| --- | --- | +| Source planning/ranking/triage | Implemented, unit-tested, and exercised through `/populate`. 
| +| TinyFish first-run trace | Implemented, unit-tested, and proven with a real local canary. | +| Draft script generation | Implemented and proven when TinyFish exposes replayable browser actions. | +| Replay before Agent spend | Wired, unit-tested, and proven with local Chromium replay. | +| Repair and promotion | Wired, unit-tested, and proven with one forced-failure repair canary. | +| Default server replay runtime | Docker backend includes `playwright-core` plus system Chromium; replay can be disabled with `POPULATE_ENABLE_PLAYWRIGHT_REPLAY=false`. | + +## Phase 2 BrowserActionBox + +- First run records the TinyFish trace, normalizes browser actions, and drafts a Playwright script when readiness allows. +- Replay uses a promoted script first when the source URL, schema, and goal match. +- Repair gets one bounded retry, and any promoted repaired script still goes back through validation. +- Readiness is `ready` only when the trace includes explicit replayable browser actions. +- The local Playwright runner executes the generated script with a bounded timeout, then extracts Agent-compatible rows from the resulting DOM if the script itself returns no rows. +- If replay is disabled or unavailable, replay is diagnostic-only and normal populate can continue through the trusted path. +- The trust shell stays intact: validation is mandatory, and replay or repair never write fake rows. + +## What works now / What is next / How to verify + +### What works now + +- Backend source planner, trace recorder, BrowserActionBox contract, and validation gate. +- Accepted rows and evidence still show in the UI. +- Rejected runs still write no fake rows. +- Real TinyFish first-run canary produces rows, evidence, run steps, artifacts, replayable actions, and a draft script. +- Real replay canary runs the draft script through local Chromium before spending TinyFish Agent again. 
+- Forced repair canary retargets a broken generated script and promotes the repaired script only after replay validates. + +### What is next + +- Improve extraction specificity for source families beyond title/link/evidence-style pages. +- Replace the deterministic repair helper with an LLM repair agent once the product wants broader selector repair. + +### How to verify + +1. Start the local stack with `make dev`. +2. Run a populate flow on a browser-heavy page. +3. Confirm source URL, evidence, and validation state appear. +4. Confirm replay and repair artifacts show only when browser actions exist. +5. Confirm rejected runs write no rows. + +## Diagram + +Rendered diagram: [bigset-populate-target-loop.svg](assets/bigset-populate-target-loop.svg) + +Mermaid source: [self-healing-data-collection-flow.mmd](self-healing-data-collection-flow.mmd) + +```mermaid +flowchart LR + prompt["Frontend prompt + dataset page"] + populate["Backend /populate"] + + subgraph planning["Source planning"] + planner["Rank sources\ntriage fetched pages"] + browser{"Browser-heavy page?"} + fetch["TinyFish Search / Fetch"] + end + + subgraph first["First run"] + box["BrowserActionBox\nTinyFish Agent + trace recorder"] + normalize["Normalize trace\ninto browser actions"] + draft["Draft Playwright script\nwhen readiness is ready"] + end + + subgraph replay["Replay / repair"] + promoted["Promoted Playwright script exists"] + runner{"Replay runner configured?"} + replayRun["Replay through BrowserActionBox"] + diagnostic["Diagnostic only\nno fake rows"] + repair{"Replay failed?"} + repairBox["One bounded repair attempt"] + validate["Same validation gate"] + write["Accepted rows + evidence"] + reject["Reject run\nno direct row writes"] + end + + prompt --> populate --> planner --> fetch --> browser + browser -- no --> validate + browser -- yes --> box --> normalize --> draft --> validate + promoted --> runner + runner -- yes --> replayRun --> repair + runner -- no --> diagnostic --> validate + 
repair -- no --> validate + repair -- yes --> repairBox --> validate + validate -- pass --> write + validate -- fail --> reject +``` diff --git a/docs/self-healing-data-collection-flow.mmd b/docs/self-healing-data-collection-flow.mmd new file mode 100644 index 0000000..f608825 --- /dev/null +++ b/docs/self-healing-data-collection-flow.mmd @@ -0,0 +1,38 @@ +flowchart LR + prompt["Frontend prompt + dataset page"] + populate["Backend /populate"] + + subgraph planning["Source planning"] + planner["Rank sources\ntriage fetched pages"] + browser{"Browser-heavy page?"} + fetch["TinyFish Search / Fetch"] + end + + subgraph first["First run"] + box["BrowserActionBox\nTinyFish Agent + trace recorder"] + normalize["Normalize trace\ninto browser actions"] + draft["Draft Playwright script\nwhen readiness is ready"] + end + + subgraph replay["Replay / repair"] + promoted["Promoted Playwright script exists"] + runner{"Replay runner configured?"} + replayRun["Replay through BrowserActionBox"] + diagnostic["Diagnostic only\nno fake rows"] + repair{"Replay failed?"} + repairBox["One bounded repair attempt"] + validate["Same validation gate"] + write["Accepted rows + evidence"] + reject["Reject run\nno direct row writes"] + end + + prompt --> populate --> planner --> fetch --> browser + browser -- no --> validate + browser -- yes --> box --> normalize --> draft --> validate + promoted --> runner + runner -- yes --> replayRun --> repair + runner -- no --> diagnostic --> validate + repair -- no --> validate + repair -- yes --> repairBox --> validate + validate -- pass --> write + validate -- fail --> reject diff --git a/frontend/.env.example b/frontend/.env.example deleted file mode 100644 index 912ced3..0000000 --- a/frontend/.env.example +++ /dev/null @@ -1,19 +0,0 @@ -# Convex (self-hosted) -NEXT_PUBLIC_CONVEX_URL=http://127.0.0.1:3210 -CONVEX_SELF_HOSTED_URL=http://127.0.0.1:3210 -CONVEX_SELF_HOSTED_ADMIN_KEY= - -# Clerk — create a free app at https://dashboard.clerk.com -# 1. 
Create a Clerk application -# 2. Go to JWT Templates → enable the "Convex" template -# 3. Copy your keys below -NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_test_... -CLERK_SECRET_KEY=sk_test_... -CLERK_JWT_ISSUER_DOMAIN=https://your-app.clerk.accounts.dev - -# Backend API (Fastify) -NEXT_PUBLIC_BACKEND_URL=http://localhost:3501 - -# PostHog (optional — leave blank to disable analytics entirely in local dev) -NEXT_PUBLIC_POSTHOG_KEY= -NEXT_PUBLIC_POSTHOG_HOST=https://us.i.posthog.com diff --git a/frontend/.gitignore b/frontend/.gitignore index c712c9b..5da59f6 100644 --- a/frontend/.gitignore +++ b/frontend/.gitignore @@ -30,9 +30,8 @@ yarn-debug.log* yarn-error.log* .pnpm-debug.log* -# env files (can opt-in for committing if needed) +# env files .env* -!.env.example # package manager: this project uses bun (bun.lock is the source of truth). # Reject npm/yarn lockfiles so they don't drift from bun's resolution. diff --git a/frontend/README.md b/frontend/README.md index 883bcf2..e3e0449 100644 --- a/frontend/README.md +++ b/frontend/README.md @@ -9,7 +9,9 @@ bun install bun dev --port 3500 ``` -Opens on [localhost:3500](http://localhost:3500). Expects the backend running on 3501 (auth requests are proxied via Next.js rewrites). +Opens on [localhost:3500](http://localhost:3500). Package scripts load root +`.env` before starting Next.js. The supported full-stack dev path is still +`make dev` from the repo root. 
## Key Paths diff --git a/frontend/app/dataset/[id]/page.tsx b/frontend/app/dataset/[id]/page.tsx index 3a158ae..c7ea293 100644 --- a/frontend/app/dataset/[id]/page.tsx +++ b/frontend/app/dataset/[id]/page.tsx @@ -3,24 +3,42 @@ import { useParams } from "next/navigation"; import Link from "next/link"; import { useEffect, useMemo, useRef, useState } from "react"; -import { useQuery, useConvexAuth } from "convex/react"; +import { useMutation, useQuery, useConvexAuth } from "convex/react"; import { useAuth } from "@clerk/nextjs"; import { api } from "@/convex/_generated/api"; import type { Id } from "@/convex/_generated/dataModel"; import { DatasetTable } from "@/components/table"; import { useSelection } from "@/components/table/use-selection"; import { ThemeToggle } from "@/components/ThemeToggle"; -import { StatusBadge } from "@/components/dataset/StatusBadge"; +import { + StatusBadge, + type DatasetStatus, +} from "@/components/dataset/StatusBadge"; import { downloadCSV, downloadXLSX } from "@/lib/export"; -import { populate } from "@/lib/backend"; +import { + PopulateApiError, + populate, + type PopulateRunSummary, +} from "@/lib/backend"; import { EVENTS, captureException, track } from "@/lib/analytics"; +type PopulateStatus = + | { state: "idle" } + | { state: "running"; startedAt: number } + | { state: "accepted"; summary: PopulateRunSummary } + | { state: "rejected"; message: string; summary?: PopulateRunSummary } + | { state: "failed"; message: string; summary?: PopulateRunSummary }; + export default function DatasetPage() { const params = useParams(); const { isLoading: authLoading } = useConvexAuth(); const { userId, getToken } = useAuth(); const [exporting, setExporting] = useState<"csv" | "xlsx" | null>(null); const [populating, setPopulating] = useState(false); + const [populateStatus, setPopulateStatus] = useState({ + state: "idle", + }); + const [clockNow, setClockNow] = useState(() => Date.now()); const datasetId = params.id as Id<"datasets">; const 
dataset = useQuery( @@ -31,6 +49,7 @@ export default function DatasetPage() { api.datasetRows.listByDataset, authLoading ? "skip" : { datasetId }, ); + const updateDatasetStatus = useMutation(api.datasets.updateStatus); const rowIds = useMemo(() => (rows ?? []).map((r) => r._id), [rows]); const selection = useSelection(rowIds); @@ -51,6 +70,25 @@ export default function DatasetPage() { } }, [dataset, userId]); + useEffect(() => { + if (populateStatus.state !== "running") { + return; + } + const intervalId = window.setInterval(() => { + setClockNow(Date.now()); + }, 1_000); + return () => window.clearInterval(intervalId); + }, [populateStatus.state]); + + async function setDatasetStatusSafely(status: "live" | "paused") { + if (!dataset) return; + try { + await updateDatasetStatus({ id: dataset._id, status }); + } catch (err) { + console.warn("[populate] failed to update dataset status", err); + } + } + async function handleExport(format: "csv" | "xlsx") { if (!dataset || !rows || exporting) return; @@ -93,24 +131,61 @@ export default function DatasetPage() { async function handlePopulate() { if (!dataset || populating) return; + const startedAt = Date.now(); + setClockNow(startedAt); setPopulating(true); + setPopulateStatus({ state: "running", startedAt }); try { const token = await getToken(); if (!token) throw new Error("Not authenticated"); - await populate( + const response = await populate( dataset._id, dataset.name, dataset.description, dataset.columns, token, ); + setPopulateStatus({ state: "accepted", summary: response.result }); + await setDatasetStatusSafely( + (response.result.committedRows?.insertedRowCount ?? 0) > 0 + ? "live" + : "paused", + ); track(EVENTS.DATASET_POPULATED, { datasetId: dataset._id, column_count: dataset.columns.length, + committed_row_count: response.result.committedRows?.insertedRowCount ?? 0, }); } catch (err) { + const message = err instanceof Error + ? 
err.message + : "Failed to populate dataset."; + const summary = err instanceof PopulateApiError ? err.result : undefined; + if (err instanceof PopulateApiError && summary?.success === false) { + console.warn("[populate] rejected", { + status: err.status, + action: summary.action, + validationState: summary.validationState, + validationIssues: summary.validationIssues, + rejectionReasons: summary.rejectionReasons, + }); + setPopulateStatus({ + state: "rejected", + message, + summary, + }); + await setDatasetStatusSafely("paused"); + return; + } + console.error("[populate] failed", err); + setPopulateStatus({ + state: "failed", + message, + summary, + }); + await setDatasetStatusSafely("paused"); captureException(err, { operation: "dataset_populate", datasetId: dataset._id, @@ -133,6 +208,10 @@ export default function DatasetPage() { // the "Dataset not found" UI. const exportDisabled = exporting !== null || rows.length === 0; + const trustSummary = trustSummaryForRows(rows); + const elapsedSeconds = populateStatus.state === "running" + ? Math.max(0, Math.floor((Math.max(clockNow, populateStatus.startedAt) - populateStatus.startedAt) / 1_000)) + : 0; const csvLabel = exporting === "csv" ? "Exporting…" @@ -145,6 +224,11 @@ export default function DatasetPage() { : selectedCount > 0 ? `Export XLSX (${selectedCount})` : "Export XLSX"; + const displayStatus = statusForCurrentPopulateState({ + storedStatus: dataset.status, + isPopulateRequestOpen: populateStatus.state === "running", + rowCount: rows.length, + }); return (
@@ -158,7 +242,7 @@ export default function DatasetPage() {

{dataset.name}

- +
@@ -219,6 +303,12 @@ export default function DatasetPage() {
+ + ); } + +function statusForCurrentPopulateState({ + storedStatus, + isPopulateRequestOpen, + rowCount, +}: { + storedStatus: DatasetStatus; + isPopulateRequestOpen: boolean; + rowCount: number; +}): DatasetStatus { + if (isPopulateRequestOpen) { + return "building"; + } + if (storedStatus === "building") { + return rowCount > 0 ? "live" : "paused"; + } + return storedStatus; +} + +function trustSummaryForRows(rows: NonNullable>[]) { + const sourceUrls = uniqueStrings( + rows.flatMap((row) => Array.isArray(row.sources) ? row.sources : []), + ); + const evidence = rows + .flatMap((row) => Array.isArray(row.evidence) ? row.evidence : []) + .filter((item) => + typeof item?.sourceUrl === "string" && + typeof item?.quote === "string" && + item.sourceUrl && + item.quote + ); + return { + sourceUrls, + evidence, + }; +} + +function uniqueStrings(values: string[]): string[] { + return Array.from(new Set(values.filter(Boolean))); +} + +function PopulateTrustStrip({ + populateStatus, + trustSummary, + elapsedSeconds, +}: { + populateStatus: PopulateStatus; + trustSummary: { + sourceUrls: string[]; + evidence: Array<{ + columnName?: string; + sourceUrl: string; + quote: string; + }>; + }; + elapsedSeconds: number; +}) { + const summary = "summary" in populateStatus + ? populateStatus.summary + : undefined; + const statusTone = + populateStatus.state === "accepted" + ? summary?.productionValidation?.state === "accepted_partial" + ? "text-amber-700 border-amber-600/20 bg-amber-600/[0.04]" + : "text-emerald-700 border-emerald-600/20 bg-emerald-600/[0.04]" + : populateStatus.state === "rejected" || populateStatus.state === "failed" + ? "text-red-700 border-red-600/20 bg-red-600/[0.04]" + : "text-muted border-border bg-surface"; + const firstEvidence = trustSummary.evidence[0] ?? summary?.sampleRows + .flatMap((row) => row.evidence)[0]; + const sourceUrls = trustSummary.sourceUrls.length > 0 + ? 
trustSummary.sourceUrls + : uniqueStrings(summary?.sampleRows.flatMap((row) => row.sourceUrls) ?? []); + + if ( + populateStatus.state === "idle" && + sourceUrls.length === 0 && + !firstEvidence + ) { + return null; + } + + return ( +
+
+ + {populateStatusLabel(populateStatus, summary)} + + {populateStatus.state === "running" && ( + + waiting for backend response · {elapsedSeconds}s elapsed + + )} + {summary?.productionValidation && ( + + validation {validationStateLabel(summary.productionValidation.state)} + {" · "} + score {summary.productionValidation.score.toFixed(2)} + + )} + {summary?.validationIssues?.[0] && ( + + {summary.validationIssues[0]} + + )} + {summary?.rejectionReasons?.[0] && ( + + {summary.rejectionReasons[0]} + + )} + {sourceUrls.slice(0, 3).map((sourceUrl) => ( + + {sourceUrl} + + ))} + {firstEvidence && ( + + evidence: {firstEvidence.quote} + + )} +
+
+ ); +} + +function populateStatusLabel( + populateStatus: PopulateStatus, + summary?: PopulateRunSummary, +): string { + if (populateStatus.state === "running") { + return "Populate request open"; + } + if (populateStatus.state === "accepted") { + const rowCount = summary?.committedRows?.insertedRowCount ?? + summary?.rowCount ?? + 0; + if (summary?.productionValidation?.state === "accepted_partial") { + return `Accepted partial ${rowCount} rows`; + } + return `Accepted full ${rowCount} rows`; + } + if (populateStatus.state === "rejected") { + return "Rejected: no rows written"; + } + if (populateStatus.state === "failed") { + return populateStatus.message; + } + return "Populate evidence"; +} + +function validationStateLabel( + state: NonNullable["state"], +): string { + if (state === "accepted_full") { + return "accepted full"; + } + if (state === "accepted_partial") { + return "accepted partial"; + } + return "rejected"; +} diff --git a/frontend/app/dataset/new/page.tsx b/frontend/app/dataset/new/page.tsx index 1333798..c72f224 100644 --- a/frontend/app/dataset/new/page.tsx +++ b/frontend/app/dataset/new/page.tsx @@ -17,6 +17,7 @@ interface ProposedColumn { name: string; type: ColumnType; description: string; + nullable: boolean; } type Cadence = "30m" | "6h" | "12h" | "daily" | "weekly"; @@ -61,6 +62,7 @@ function mapBackendColumn(col: InferredColumn, index: number): ProposedColumn { name: col.display_name, type: BACKEND_TYPE_MAP[col.type], description: col.retrieval_hint, + nullable: col.nullable, }; } @@ -162,7 +164,13 @@ export default function NewDatasetPage() { function handleAddColumn() { setColumns((prev) => [ ...prev, - { id: String(Date.now()), name: "New Column", type: "text", description: "" }, + { + id: String(Date.now()), + name: "New Column", + type: "text", + description: "", + nullable: false, + }, ]); } @@ -180,6 +188,7 @@ export default function NewDatasetPage() { name: c.name, type: c.type, description: c.description || undefined, + 
nullable: c.nullable, })), }); } catch (err) { diff --git a/frontend/components/dataset/DatasetCard.tsx b/frontend/components/dataset/DatasetCard.tsx index 2d362ae..064ccde 100644 --- a/frontend/components/dataset/DatasetCard.tsx +++ b/frontend/components/dataset/DatasetCard.tsx @@ -10,6 +10,7 @@ export interface DatasetCardData { cadence: string; columns: { name: string; type: string }[]; previewRows: Record[]; + rowCount?: number; visibility?: "public" | "private"; } @@ -49,7 +50,7 @@ export function DatasetCard({ dataset }: { dataset: DatasetCardData }) { {dataset.cadence} - {dataset.previewRows?.length ?? 0} rows + {dataset.rowCount ?? dataset.previewRows?.length ?? 0} rows diff --git a/frontend/components/table/types.ts b/frontend/components/table/types.ts index 903ea6f..06a6859 100644 --- a/frontend/components/table/types.ts +++ b/frontend/components/table/types.ts @@ -4,6 +4,7 @@ export interface DatasetColumn { name: string; type: ColumnType; description?: string; + nullable?: boolean; } export interface DatasetMeta { @@ -19,4 +20,10 @@ export interface DatasetRow { _id: string; _creationTime: number; data: Record; + sources?: string[]; + evidence?: Array<{ + columnName: string; + sourceUrl: string; + quote: string; + }>; } diff --git a/frontend/convex/datasetRows.ts b/frontend/convex/datasetRows.ts index a6420a8..87be52c 100644 --- a/frontend/convex/datasetRows.ts +++ b/frontend/convex/datasetRows.ts @@ -47,9 +47,15 @@ export const insert = internalMutation({ datasetId: v.id("datasets"), data: v.record(v.string(), v.any()), sources: v.optional(v.array(v.string())), + evidence: v.optional(v.array(v.object({ + columnName: v.string(), + sourceUrl: v.string(), + quote: v.string(), + }))), }, handler: async (ctx, args) => { await consumeQuotaForDataset(ctx, args.datasetId, 1); + await ctx.db.patch(args.datasetId, { status: "live" }); return await ctx.db.insert("datasetRows", args); }, }); @@ -93,6 +99,7 @@ export const clearByDataset = internalMutation({ for 
(const row of rows) { await ctx.db.delete(row._id); } + await ctx.db.patch(args.datasetId, { status: "paused" }); return rows.length; }, }); @@ -104,6 +111,16 @@ export const get = internalQuery({ }, }); +export const listForSystemPopulate = internalQuery({ + args: { datasetId: v.id("datasets") }, + handler: async (ctx, args) => { + return await ctx.db + .query("datasetRows") + .withIndex("by_dataset", (q) => q.eq("datasetId", args.datasetId)) + .collect(); + }, +}); + export const remove = internalMutation({ args: { id: v.id("datasetRows") }, handler: async (ctx, args) => { @@ -135,6 +152,9 @@ export const insertBatch = internalMutation({ data, }); } + if (args.rows.length > 0) { + await ctx.db.patch(args.datasetId, { status: "live" }); + } }, }); @@ -144,6 +164,11 @@ export const replaceByDataset = internalMutation({ rows: v.array(v.object({ data: v.record(v.string(), v.any()), sources: v.optional(v.array(v.string())), + evidence: v.optional(v.array(v.object({ + columnName: v.string(), + sourceUrl: v.string(), + quote: v.string(), + }))), })), }, handler: async (ctx, args) => { @@ -165,8 +190,12 @@ export const replaceByDataset = internalMutation({ datasetId: args.datasetId, data: row.data, sources: row.sources, + evidence: row.evidence, }); } + await ctx.db.patch(args.datasetId, { + status: args.rows.length > 0 ? 
"live" : "paused", + }); return { clearedRowCount: existingRows.length, diff --git a/frontend/convex/datasets.ts b/frontend/convex/datasets.ts index b240948..1432a28 100644 --- a/frontend/convex/datasets.ts +++ b/frontend/convex/datasets.ts @@ -20,6 +20,7 @@ const columnValidator = v.object({ v.literal("date"), ), description: v.optional(v.string()), + nullable: v.optional(v.boolean()), }); const PREVIEW_ROW_COUNT = 5; @@ -28,10 +29,17 @@ async function attachPreview(ctx: QueryCtx, dataset: Doc<"datasets">) { const rows = await ctx.db .query("datasetRows") .withIndex("by_dataset", (q) => q.eq("datasetId", dataset._id)) - .take(PREVIEW_ROW_COUNT); + .collect(); + const truthfulStatus = dataset.status === "building" + ? rows.length > 0 + ? "live" + : "paused" + : dataset.status; return { ...dataset, - previewRows: rows.map((r) => r.data), + status: truthfulStatus, + rowCount: rows.length, + previewRows: rows.slice(0, PREVIEW_ROW_COUNT).map((r) => r.data), }; } @@ -108,7 +116,7 @@ export const create = mutation({ return await ctx.db.insert("datasets", { ...args, ownerId: identity.subject, - status: "building", + status: "paused", visibility: "private", }); }, diff --git a/frontend/convex/schema.ts b/frontend/convex/schema.ts index 68cd55f..2244cdd 100644 --- a/frontend/convex/schema.ts +++ b/frontend/convex/schema.ts @@ -32,6 +32,7 @@ export default defineSchema({ v.literal("date") ), description: v.optional(v.string()), + nullable: v.optional(v.boolean()), }) ), }) @@ -43,6 +44,11 @@ export default defineSchema({ datasetId: v.id("datasets"), data: v.record(v.string(), v.any()), sources: v.optional(v.array(v.string())), + evidence: v.optional(v.array(v.object({ + columnName: v.string(), + sourceUrl: v.string(), + quote: v.string(), + }))), scrapeScript: v.optional(v.string()), }).index("by_dataset", ["datasetId"]), diff --git a/frontend/lib/backend.ts b/frontend/lib/backend.ts index c1e7142..6513691 100644 --- a/frontend/lib/backend.ts +++ b/frontend/lib/backend.ts @@ 
-21,11 +21,70 @@ export interface PopulateColumn { name: string; type: "text" | "number" | "boolean" | "url" | "date"; description?: string; + nullable?: boolean; } export interface PopulateResult { success: boolean; - result: unknown; + result: PopulateRunSummary; +} + +export interface PopulateRunSummary { + action: string; + datasetId: string; + success: boolean; + validationState?: "accepted_full" | "accepted_partial" | "rejected"; + committedRows?: { + clearedRowCount?: number; + insertedRowCount: number; + }; + rejectionReasons: string[]; + validationIssues: string[]; + rowCount: number; + sampleRows: Array<{ + cells: Record; + sourceUrls: string[]; + evidence: Array<{ + columnName: string; + sourceUrl: string; + quote: string; + }>; + needsReview: boolean; + }>; + productionValidation?: { + state: "accepted_full" | "accepted_partial" | "rejected"; + isValid: boolean; + score: number; + rowCount: number; + safeRowCount: number; + requestedCellCompletenessRatio: number; + sourceUrlCoverageRatio: number; + evidenceCoverageRatio: number; + expectedEntityCoverageRatio: number; + expectedEntities: string[]; + missingExpectedEntities: string[]; + coveragePolicy: "partial_allowed" | "full_required"; + targetSource: string; + criticalIssues: string[]; + warnings: string[]; + }; + metrics?: Record; +} + +export class PopulateApiError extends Error { + readonly status: number; + readonly result?: PopulateRunSummary; + + constructor(input: { + message: string; + status: number; + result?: PopulateRunSummary; + }) { + super(input.message); + this.name = "PopulateApiError"; + this.status = input.status; + this.result = input.result; + } } const BACKEND_URL = @@ -47,7 +106,11 @@ export async function inferSchema( if (!res.ok) { const body = await res.json().catch(() => null); const message = body?.error || `Backend error (${res.status})`; - throw new Error(message); + throw new PopulateApiError({ + message, + status: res.status, + result: body?.result, + }); } return 
res.json(); @@ -72,7 +135,11 @@ export async function populate( if (!res.ok) { const body = await res.json().catch(() => null); const message = body?.error || `Backend error (${res.status})`; - throw new Error(message); + throw new PopulateApiError({ + message, + status: res.status, + result: body?.result, + }); } return res.json(); diff --git a/frontend/package.json b/frontend/package.json index d7d7a0c..5dc165c 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -3,10 +3,10 @@ "version": "0.1.0", "private": true, "scripts": { - "dev": "next dev", - "build": "next build", - "start": "next start", - "lint": "eslint" + "dev": "node ../scripts/with-root-env.mjs next dev", + "build": "node ../scripts/with-root-env.mjs next build", + "start": "node ../scripts/with-root-env.mjs next start", + "lint": "node ../scripts/with-root-env.mjs eslint" }, "dependencies": { "@clerk/nextjs": "^7.3.7", diff --git a/makefiles/Makefile b/makefiles/Makefile index 633df80..ce72c59 100644 --- a/makefiles/Makefile +++ b/makefiles/Makefile @@ -1,8 +1,8 @@ -.PHONY: all dev down clean convex-push convex-env verify-self-healing +.PHONY: all dev validate-dev-env down clean convex-push convex-env seed-public-datasets verify-self-healing all: dev -dev: +dev: validate-dev-env docker compose -f docker-compose.dev.yml up --build -d @echo "Waiting for Convex to be healthy..." @for i in $$(seq 1 120); do \ @@ -18,20 +18,61 @@ dev: @echo " Mastra Studio: http://localhost:4111" docker compose -f docker-compose.dev.yml logs -f +validate-dev-env: + @test -f .env || { echo "Error: .env not found. 
Run: cp .env.example .env"; exit 1; } + @check_env() { \ + key="$$1"; placeholder="$$2"; \ + value="$$(grep "^$$key=" .env | cut -d= -f2-)"; \ + if [[ -z "$$value" || "$$value" == "$$placeholder" || "$$value" == *"..."* ]]; then \ + echo "Error: $$key must be set in .env"; \ + exit 1; \ + fi; \ + }; \ + check_env NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY pk_test_...; \ + check_env CLERK_SECRET_KEY sk_test_...; \ + check_env CLERK_JWT_ISSUER_DOMAIN https://your-app.clerk.accounts.dev; \ + check_env OPENROUTER_API_KEY sk-or-...; \ + check_env TINYFISH_API_KEY "" + convex-env: - @test -f frontend/.env.local || { echo "Error: frontend/.env.local not found"; exit 1; } - @grep -q CLERK_JWT_ISSUER_DOMAIN frontend/.env.local || { echo "Error: CLERK_JWT_ISSUER_DOMAIN not set in frontend/.env.local"; exit 1; } - @grep -q CONVEX_SELF_HOSTED_ADMIN_KEY frontend/.env.local || { echo "Error: CONVEX_SELF_HOSTED_ADMIN_KEY not set in frontend/.env.local"; exit 1; } - @cd frontend && npx convex env set CLERK_JWT_ISSUER_DOMAIN "$$(grep CLERK_JWT_ISSUER_DOMAIN .env.local | cut -d= -f2-)" \ - --url http://127.0.0.1:3210 \ - --admin-key "$$(grep CONVEX_SELF_HOSTED_ADMIN_KEY .env.local | cut -d= -f2-)" + @test -f .env || { echo "Error: .env not found. Run: cp .env.example .env"; exit 1; } + @issuer="$$(grep '^CLERK_JWT_ISSUER_DOMAIN=' .env | cut -d= -f2-)"; \ + admin_key="$$(grep '^CONVEX_SELF_HOSTED_ADMIN_KEY=' .env | cut -d= -f2-)"; \ + if [[ -z "$$issuer" || "$$issuer" == "https://your-app.clerk.accounts.dev" ]]; then \ + echo "Error: CLERK_JWT_ISSUER_DOMAIN must be your Clerk issuer URL in .env"; \ + exit 1; \ + fi; \ + if [[ -z "$$admin_key" || "$$admin_key" == \ [...args]"); + process.exit(2); +} + +const child = spawn(command, args, { + env: process.env, + shell: process.platform === "win32", + stdio: "inherit", +}); + +child.on("exit", (code, signal) => { + if (signal) { + process.kill(process.pid, signal); + return; + } + process.exit(code ?? 1); +});