From ca903668b65e5f4b80f4e0fc36b18b067ff9b6d4 Mon Sep 17 00:00:00 2001 From: Edward Tran Date: Fri, 22 May 2026 22:13:03 +0700 Subject: [PATCH 1/2] Port collection pipeline runner into self-healing path --- .../src/acquisition/link-follow.ts | 114 +++ .../src/agents/agent-goal.ts | 64 ++ .../src/agents/benchmark-spec.ts | 94 +++ .../src/agents/dataset-spec.ts | 191 +++++ .../src/agents/extract-from-agent.ts | 82 +++ .../src/agents/extract.ts | 289 ++++++++ .../src/agents/repair-diagnosis.ts | 80 +++ .../src/agents/repair-queries.ts | 108 +++ .../src/agents/source-triage.ts | 100 +++ .../src/config.ts | 114 +++ .../src/coverage/analyze.ts | 116 ++++ .../src/export/csv-compiler.ts | 199 ++++++ .../src/export/select-results.ts | 47 ++ .../src/integrations/openrouter.ts | 2 + .../src/integrations/tinyfish-agent.ts | 232 +++++++ .../src/integrations/tinyfish.ts | 70 ++ .../src/llm/complete-json.ts | 93 +++ .../src/llm/provider.ts | 23 + .../src/llm/usage.ts | 57 ++ .../src/memory/fingerprint.ts | 6 + .../src/memory/index.ts | 26 + .../src/memory/scored-aggregates.ts | 481 +++++++++++++ .../src/memory/search-pagination.ts | 184 +++++ .../src/memory/store.ts | 125 ++++ .../src/memory/types.ts | 101 +++ .../src/memory/workflow-memory.ts | 208 ++++++ .../src/merge/records.ts | 153 ++++ .../src/models/quality.ts | 79 +++ .../src/models/schemas.ts | 214 ++++++ .../src/models/source-status.ts | 24 + .../src/orchestrator/acquisition.ts | 260 +++++++ .../src/orchestrator/pipeline.ts | 652 ++++++++++++++++++ .../src/orchestrator/process-pages.ts | 415 +++++++++++ .../src/orchestrator/repair-loop.ts | 280 ++++++++ .../src/quality/build-report.ts | 238 +++++++ .../src/quality/field-confidence.ts | 72 ++ .../src/quality/index.ts | 8 + .../src/quality/score-record.ts | 176 +++++ .../src/queue/domain-throttle.ts | 63 ++ .../src/queue/pools.ts | 73 ++ .../src/queue/rate-limiter.ts | 41 ++ .../src/queue/retry.ts | 55 ++ .../src/queue/task-queue.ts | 79 +++ .../src/storage/run-loader.ts 
| 90 +++ .../src/storage/run-store.ts | 99 +++ .../src/utils/concurrency.ts | 26 + .../src/utils/url.ts | 20 + backend/package-lock.json | 13 + backend/package.json | 1 + .../src/pipeline/collection-agent-runner.ts | 311 +++++++++ backend/test/collection-agent-runner.test.ts | 140 ++++ benchmarks/dataset-agent/run-benchmark.mjs | 6 + 52 files changed, 6794 insertions(+) create mode 100644 backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/extract-from-agent.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/extract.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/repair-diagnosis.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/repair-queries.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/config.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/coverage/analyze.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/export/csv-compiler.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/export/select-results.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/integrations/openrouter.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish-agent.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/llm/complete-json.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/llm/provider.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/llm/usage.ts create mode 100644 
backend/BigSet_Data_Collection_Agent/src/memory/fingerprint.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/memory/index.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/memory/scored-aggregates.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/memory/search-pagination.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/memory/store.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/memory/types.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/memory/workflow-memory.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/merge/records.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/models/quality.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/models/schemas.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/models/source-status.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/orchestrator/repair-loop.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/quality/build-report.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/quality/field-confidence.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/quality/index.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/quality/score-record.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/queue/domain-throttle.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/queue/pools.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/queue/rate-limiter.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/queue/retry.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/queue/task-queue.ts create mode 100644 
backend/BigSet_Data_Collection_Agent/src/storage/run-loader.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/storage/run-store.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/utils/concurrency.ts create mode 100644 backend/BigSet_Data_Collection_Agent/src/utils/url.ts create mode 100644 backend/src/pipeline/collection-agent-runner.ts create mode 100644 backend/test/collection-agent-runner.test.ts diff --git a/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts b/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts new file mode 100644 index 0000000..bebc418 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts @@ -0,0 +1,114 @@ +import type { FetchedPage } from "../models/schemas.js"; +import type { WorkflowMemory } from "../memory/types.js"; +import { domainMemoryBoost } from "../memory/workflow-memory.js"; +import { getDomain, normalizeUrl } from "../utils/url.js"; + +const SKIP_HOST = + /(?:facebook|twitter|x\.com|instagram|youtube|tiktok|pinterest|reddit\.com\/r\/|linkedin\.com\/in\/|accounts\.google|login|signin|signup|register|cookie|privacy|terms|cdn\.|static\.|fonts\.)/i; +const SKIP_EXT = /\.(?:pdf|zip|png|jpe?g|gif|svg|webp|css|js|woff2?|xml|mp4|mp3)(?:\?|$)/i; +const POSITIVE_PATH = + /\/(?:company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i; +const NEGATIVE_PATH = + /\/(?:tag|tags|category|categories|author|feed|rss|search|wp-admin|wp-content)(?:\/|$|\?)/i; + +export interface LinkFollowOptions { + pages: FetchedPage[]; + excludeUrls: Set; + focusFields?: string[]; + maxTotal: number; + maxPerSource: number; + memory?: WorkflowMemory; +} + +function pathTokensFromFields(fields?: string[]): string[] { + if (!fields?.length) return []; + return fields + .flatMap((field) => + field + .split(/[_\s-]+/) + .map((part) => part.toLowerCase()) + .filter((part) => part.length 
> 3), + ) + .slice(0, 12); +} + +function scoreLink( + link: string, + sourceDomain: string, + focusTokens: string[], + memory?: WorkflowMemory, +): number { + let score = 0; + + try { + const parsed = new URL(link); + const host = parsed.hostname.toLowerCase(); + const path = `${parsed.pathname}${parsed.search}`.toLowerCase(); + + if (SKIP_HOST.test(host) || SKIP_EXT.test(path)) return -1000; + if (NEGATIVE_PATH.test(path)) score -= 2; + if (POSITIVE_PATH.test(path)) score += 4; + + const linkDomain = getDomain(link); + if (linkDomain === sourceDomain) score += 3; + else if (linkDomain.endsWith(`.${sourceDomain}`) || sourceDomain.endsWith(`.${linkDomain}`)) { + score += 2; + } + + for (const token of focusTokens) { + if (path.includes(token)) score += 2; + } + + if (memory) score += domainMemoryBoost(memory, linkDomain); + + if (path.length > 120) score -= 1; + if (parsed.hash.length > 1) score -= 1; + } catch { + return -1000; + } + + return score; +} + +/** Pick outbound links from high-value pages using URL heuristics only. */ +export function selectOutboundLinksToFollow( + options: LinkFollowOptions, +): string[] { + const focusTokens = pathTokensFromFields(options.focusFields); + const selected: string[] = []; + const selectedSet = new Set(); + + const pagesWithLinks = options.pages + .filter((page) => !page.error && page.outbound_links && page.outbound_links.length > 0) + .sort((a, b) => (b.outbound_links?.length ?? 0) - (a.outbound_links?.length ?? 0)); + + for (const page of pagesWithLinks) { + const sourceUrl = normalizeUrl(page.final_url || page.url); + const sourceDomain = getDomain(sourceUrl); + let perSource = 0; + + const ranked = [...(page.outbound_links ?? 
[])] + .map((link) => ({ + link, + score: scoreLink(link, sourceDomain, focusTokens, options.memory), + })) + .filter((item) => item.score > 0) + .sort((a, b) => b.score - a.score); + + for (const { link } of ranked) { + if (selected.length >= options.maxTotal) return selected; + if (perSource >= options.maxPerSource) break; + + const normalized = normalizeUrl(link); + if (options.excludeUrls.has(normalized)) continue; + if (selectedSet.has(normalized)) continue; + if (normalized === sourceUrl) continue; + + selectedSet.add(normalized); + selected.push(link); + perSource += 1; + } + } + + return selected; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts b/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts new file mode 100644 index 0000000..e84ad75 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts @@ -0,0 +1,64 @@ +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import { agentGoalSchema, type AgentGoal } from "../models/schemas.js"; +import type { DatasetSpec, SourceTriageResult } from "../models/schemas.js"; + +const AGENT_GOAL_SYSTEM = `You are the Navigation Task Agent for a web data collection pipeline. + +Write a Tinyfish Agent goal: a clear natural-language instruction for browser automation on the given URL. + +The agent must navigate the site and return structured JSON with extracted data matching the dataset schema. + +Rules: +- Be specific about what to click, search, filter, or paginate. +- State the exact JSON shape to return: { "records": [ { column_name: value, ... } ] } +- Include column names from the schema in the goal. +- For forms: describe fields to fill and how to submit. +- For detail follow-up: explain how to open each item and which fields to collect. +- Limit scope (e.g. first 25 rows) to keep runs reliable. 
+- Do not invent data; extract only what is visible on the site. +- When workflow_memory is provided, reuse goal patterns from agent_goal_stats_top (high avg_completeness/confidence); avoid domains in domain_stats_weak unless diagnosis says otherwise. +- If latest_diagnosis.prefer_tinyfish_agent or agent_strategy_notes exist, follow them. +- Return ONLY JSON with fields: goal, rationale`; + +export async function generateAgentGoal(options: { + userPrompt: string; + spec: DatasetSpec; + triage: SourceTriageResult; + focusFields?: string[]; + memory?: WorkflowMemory; +}): Promise { + const columnList = options.spec.columns + .map((c) => `${c.name} (${c.type}${c.required ? ", required" : ""})`) + .join(", "); + + return completeJson({ + label: `agent_goal:${options.triage.final_url}`, + schema: agentGoalSchema, + messages: [ + { role: "system", content: AGENT_GOAL_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + user_prompt: options.userPrompt, + triage_status: options.triage.status, + triage_reasoning: options.triage.reasoning, + suggested_action: options.triage.suggested_action, + page_url: options.triage.final_url, + page_title: options.triage.title, + row_grain: options.spec.row_grain, + columns: columnList, + focus_fields: options.focusFields ?? [], + extraction_hints: options.spec.extraction_hints, + workflow_memory: options.memory + ? 
memoryContextForAgents(options.memory) + : undefined, + output_shape: { goal: "string", rationale: "string" }, + }), + }, + ], + }); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts b/backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts new file mode 100644 index 0000000..288f540 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts @@ -0,0 +1,94 @@ +import type { ColumnDef, DatasetSpec } from "../models/schemas.js"; +import { normalizeSpecColumnOrder } from "./dataset-spec.js"; + +/** Benchmark harness fields from prompts.json (via env in adapters). */ +export interface BenchmarkSpecContext { + promptId?: string; + promptQuality?: string; + persona?: string; + expectedStress?: string; + requiredColumns: string[]; +} + +export function hasBenchmarkRequiredColumns( + context?: BenchmarkSpecContext, +): context is BenchmarkSpecContext & { requiredColumns: string[] } { + return Boolean(context?.requiredColumns?.length); +} + +/** Parse comma-separated column names (CLI flag or benchmark env). */ +export function parseRequiredColumns(value: string): string[] { + const columns = value + .split(",") + .map((name) => name.trim()) + .filter(Boolean); + if (columns.length === 0) { + throw new Error( + "Required columns must include at least one non-empty column name.", + ); + } + return columns; +} + +/** + * Ensures every benchmark-required column name exists on the spec as required. + * Types and descriptions come from the dataset-spec LLM when present; otherwise + * minimal placeholders (no per-column name heuristics). 
+ */ +export function mergeSpecWithBenchmarkRequiredColumns( + spec: DatasetSpec, + context: BenchmarkSpecContext, +): DatasetSpec { + const requiredColumns = context.requiredColumns; + const columnsByName = new Map(spec.columns.map((column) => [column.name, column])); + + const requiredColumnDefs: ColumnDef[] = requiredColumns.map((name) => { + const existing = columnsByName.get(name); + if (existing) { + return { ...existing, required: true }; + } + return { + name, + type: "string", + description: name, + required: true, + }; + }); + + const optionalExtras = spec.columns.filter( + (column) => !requiredColumns.includes(column.name), + ); + + const columns = [...requiredColumnDefs, ...optionalExtras]; + const columnNames = new Set(columns.map((column) => column.name)); + + const isEntityLikeColumn = (name: string): boolean => + /(entity|company|organization|business|restaurant|bakery|provider|product|name|title)/i.test( + name, + ); + + const dedupeKey = + requiredColumns.find( + (name) => columnNames.has(name) && isEntityLikeColumn(name), + ) ?? + spec.dedupe_keys.find((key) => columnNames.has(key)) ?? + requiredColumns.find((name) => columnNames.has(name)) ?? + spec.dedupe_keys[0]; + + const extractionHints = [ + spec.extraction_hints, + `Benchmark required columns (use as exact row keys): ${requiredColumns.join(", ")}.`, + context.expectedStress + ? `Benchmark stress note: ${context.expectedStress}` + : undefined, + ] + .filter(Boolean) + .join("\n"); + + return normalizeSpecColumnOrder({ + ...spec, + columns, + dedupe_keys: dedupeKey ? 
[dedupeKey] : spec.dedupe_keys, + extraction_hints: extractionHints, + }); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts b/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts new file mode 100644 index 0000000..eda4a25 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts @@ -0,0 +1,191 @@ +import { completeJson } from "../integrations/openrouter.js"; +import type { WorkflowMemory } from "../memory/types.js"; +import { + datasetSpecSchema, + type ColumnDef, + type DatasetSpec, +} from "../models/schemas.js"; +import { + hasBenchmarkRequiredColumns, + mergeSpecWithBenchmarkRequiredColumns, + type BenchmarkSpecContext, +} from "./benchmark-spec.js"; + +const DATASET_SPEC_SYSTEM = `You are the Dataset Spec Agent for a web data collection pipeline. + +Given a user's data gathering prompt, produce a JSON object that defines: +- what each CSV row represents (row_grain) +- column names, types, and which are required +- dedupe_keys: exactly ONE column name that identifies a unique row (the main entity field, e.g. entity_name or restaurant_name — used as primary key for merge/repair) +- search_queries: diverse web search strings to find sources (use site: operators when helpful) +- extraction_hints: guidance for downstream extraction + +Rules: +- columns[].name must be snake_case +- types must be one of: string, number, boolean, date +- Column order: list every required column first (see ordering below), then optional columns. Do not bury required fields after optional metadata. +- Required columns (required: true): + - The single dedupe_keys field must be required: true. + - Every column that the user_prompt explicitly or clearly implies they want per row (e.g. "who's hiring" → is_hiring; "still active" → is_active; "funding amount" → funding column) must be required: true. 
+ - Do NOT mark only the entity name/identifier as required while leaving core intent fields optional — that blocks the repair loop from filling sparse rows. + - Optional (required: false) only for nice-to-have extras the user did not ask for (e.g. logo_url when they only care about hiring status). +- Required column ordering within columns[]: + 1. the dedupe_keys field first + 2. other required intent fields (what the user asked to collect) + 3. optional fields last +- For type "number", embed the measurement unit in the column name using snake_case + (e.g. funding_amount_usd(millions), employee_count, market_cap_million_usd, growth_rate_percent). + Choose units that match the user's intent; describe the unit in columns[].description when helpful. + Do not use bare numeric names like "amount", "price", or "funding" without a unit, for example, if the + numeric value is in millions, use "funding_amount_million_usd" instead of "funding_amount_usd". +- search_queries should be specific, varied (5-8 queries), and likely to surface pages with list/table data +- Temporal relevance for search_queries: + - Use the provided current_date / current_year when a query needs a time anchor (e.g. "2026", "latest", "recent"). + - Do NOT default to past years (e.g. 2024) unless the user_prompt explicitly names that year or date range. + - If the user says "recent", "current", "latest", or implies up-to-date data, anchor queries to current_year. + - If the user gives no time constraint, prefer evergreen queries OR current_year only when recency clearly matters for the dataset. + - If the user specifies a year or date (e.g. "in 2024", "Q1 2023"), use exactly what they asked for. 
+- target_row_count should reflect the user's implied or stated goal
+- Return ONLY JSON, no markdown`;
+
+/**
+ * Date anchors passed to the dataset-spec prompt.
+ *
+ * Both values are derived in UTC: `toISOString()` always reports UTC, so the
+ * year is read with `getUTCFullYear()` to keep `current_year` equal to the
+ * year shown in `current_date` (a local-time year can differ from it around
+ * New Year).
+ *
+ * @returns `current_date` as `YYYY-MM-DD` and `current_year` as a number.
+ */
+function currentTimeContext(): { current_date: string; current_year: number } {
+  const now = new Date();
+  return {
+    current_date: now.toISOString().slice(0, 10),
+    current_year: now.getUTCFullYear(),
+  };
+}
+
+/**
+ * Ensure exactly one valid dedupe key exists on the spec.
+ *
+ * Keeps `dedupe_keys[0]` when it names an existing column; otherwise falls
+ * back to the first required column, then the first column. When no key can
+ * be determined the spec is returned unchanged.
+ */
+export function normalizeDedupeKey(spec: DatasetSpec): DatasetSpec {
+  const columnNames = new Set(spec.columns.map((column) => column.name));
+  let key = spec.dedupe_keys[0];
+
+  if (!key || !columnNames.has(key)) {
+    const firstRequired = spec.columns.find((column) => column.required);
+    key = firstRequired?.name ?? spec.columns[0]?.name ?? key;
+  }
+
+  if (!key) {
+    return spec;
+  }
+
+  return { ...spec, dedupe_keys: [key] };
+}
+
+/**
+ * Enforce required-first column order even if the model returns a different order.
+ *
+ * Resulting order: the first dedupe key (forced to `required: true`), then the
+ * remaining required columns, then every other column, each group keeping the
+ * order it had in `spec.columns`.
+ */
+export function normalizeSpecColumnOrder(spec: DatasetSpec): DatasetSpec {
+  const byName = new Map(spec.columns.map((col) => [col.name, col]));
+  const ordered: ColumnDef[] = [];
+  const used = new Set<string>();
+
+  // Only the first dedupe key is promoted; a key with no matching column is skipped.
+  for (const key of spec.dedupe_keys.slice(0, 1)) {
+    const col = byName.get(key);
+    if (!col || used.has(key)) continue;
+    ordered.push({ ...col, required: true });
+    used.add(key);
+  }
+
+  for (const col of spec.columns) {
+    if (used.has(col.name) || !col.required) continue;
+    ordered.push(col);
+    used.add(col.name);
+  }
+
+  for (const col of spec.columns) {
+    if (used.has(col.name)) continue;
+    ordered.push(col);
+    used.add(col.name);
+  }
+
+  return { ...spec, columns: ordered };
+}
+
+export async function generateDatasetSpec(
+  prompt: string,
+  targetRows: number,
+  priorMemory?: WorkflowMemory | null,
+  benchmark?: BenchmarkSpecContext,
+): Promise<DatasetSpec> {
+  const { current_date, current_year } = currentTimeContext();
+
+  const spec = await completeJson({
+    label: "dataset_spec",
+    schema: datasetSpecSchema,
+    messages: [
+      { role: "system", content:
DATASET_SPEC_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + user_prompt: prompt, + target_row_count: targetRows, + current_date, + current_year, + prior_workflow_memory: + priorMemory && priorMemory.prompt_fingerprint + ? { + query_stats_top: [...priorMemory.query_stats] + .filter((q) => q.record_count > 0) + .slice(-8), + domain_stats_top: [...priorMemory.domain_stats] + .filter((d) => d.record_count > 0) + .slice(-8), + domain_stats_weak: [...priorMemory.domain_stats] + .filter( + (d) => + d.fetch_failures > 0 || + (d.record_count > 0 && d.avg_completeness < 0.5), + ) + .slice(-6), + dedupe_keys: priorMemory.dedupe_keys, + strategy_notes: priorMemory.strategy_notes.slice(-5), + } + : undefined, + column_order_note: + "required columns first: dedupe_keys in order, then other required intent fields, then optional", + benchmark_context: hasBenchmarkRequiredColumns(benchmark) + ? { + prompt_id: benchmark.promptId, + prompt_quality: benchmark.promptQuality, + persona: benchmark.persona, + expected_stress: benchmark.expectedStress, + required_columns: benchmark.requiredColumns, + instruction: + "When required_columns is present, columns[].name MUST use those exact snake_case names as the core schema (all required: true). You may add optional extra columns only if they do not replace or rename required_columns. 
Align search_queries and extraction_hints to satisfy the user_prompt and expected_stress.", + } + : undefined, + output_shape: { + intent_summary: "string", + target_row_count: "number", + row_grain: "string", + columns: [ + { + name: "string (snake_case)", + type: "string | number | boolean | date", + description: "string", + required: + "boolean — true for dedupe_keys and every field the user_prompt asks to collect per row", + }, + ], + dedupe_keys: ["string — exactly one primary entity column name"], + search_queries: ["string"], + extraction_hints: "string", + }, + }), + }, + ], + }); + + let normalized = normalizeDedupeKey( + normalizeSpecColumnOrder({ + ...spec, + target_row_count: targetRows, + }), + ); + + if (hasBenchmarkRequiredColumns(benchmark)) { + normalized = mergeSpecWithBenchmarkRequiredColumns(normalized, benchmark); + } + + return normalized; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/extract-from-agent.ts b/backend/BigSet_Data_Collection_Agent/src/agents/extract-from-agent.ts new file mode 100644 index 0000000..eba28c1 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/extract-from-agent.ts @@ -0,0 +1,82 @@ +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js"; +import { + buildLlmExtractionResultSchema, + finalizeExtractedRecords, + type LlmExtractionRecord, +} from "./extract.js"; + +/** + * Parses one Tinyfish agent result JSON per call (see process-pages.ts agent branch). + * Not used for fetched-page markdown; that path uses extractFromPage. + */ + +const EXTRACT_AGENT_SYSTEM = `You are the Extraction Agent parsing output from a Tinyfish browser automation run. + +Convert the agent result JSON into dataset records matching the schema. + +Rules: +- Only include facts present in the agent result. Do not invent values. 
+- row keys must match spec column names exactly. +- For number columns, numeric values only (unit is in the column name). +- evidence: field, quote, and url for fields you populated when you have a supporting quote (url = where that quote was found; use page_url when from this page). Not required for every column. +- Do not return source_urls. +- extraction_confidence (0–1) per record when possible. +- Provenance URL columns: set per row to the URL where that row's data came from (use page_url when appropriate). +- If the agent result has no usable rows, return an empty records array. +- Return ONLY JSON`; + +export async function extractFromAgentResult(options: { + spec: DatasetSpec; + pageUrl: string; + agentResult: Record | null; + focusFields?: string[]; + memory?: WorkflowMemory; +}): Promise { + if (!options.agentResult || Object.keys(options.agentResult).length === 0) { + return []; + } + + const result = await completeJson({ + label: `extract_agent:${options.pageUrl}`, + schema: buildLlmExtractionResultSchema(options.spec), + messages: [ + { role: "system", content: EXTRACT_AGENT_SYSTEM }, + { + role: "user", + content: JSON.stringify({ + dataset_spec: { + intent_summary: options.spec.intent_summary, + row_grain: options.spec.row_grain, + columns: options.spec.columns, + }, + page_url: options.pageUrl, + agent_result: options.agentResult, + focus_fields: options.focusFields ?? [], + workflow_memory: options.memory + ? 
memoryContextForAgents(options.memory) + : undefined, + output_shape: { + records: [ + { + row: { column_name: "value or null" }, + evidence: [{ field: "column_name", url: "string", quote: "string" }], + extraction_confidence: "0-1 number", + }, + ], + }, + }), + }, + ], + }); + + return finalizeExtractedRecords( + result.records as LlmExtractionRecord[], + options.pageUrl, + options.spec, + ); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts b/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts new file mode 100644 index 0000000..2055102 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts @@ -0,0 +1,289 @@ +import { z } from "zod"; +import { config } from "../config.js"; +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import { + extractedRecordSchema, + fieldEvidenceSchema, + type ColumnDef, + type DatasetSpec, + type ExtractedRecord, + type FetchedPage, +} from "../models/schemas.js"; + +/** + * Extraction is always one source per LLM call in process-pages.ts: + * - extractFromPage: one fetched page's markdown per call (parallelized per page). + * - extractFromAgentResult: one Tinyfish agent JSON payload per call (separate module). + * + * LLM returns row + sparse evidence + extraction_confidence; code attaches evidence URLs + * and source_urls. Provenance URL columns come from the LLM row values per record. 
+ */ + +const llmFieldEvidenceSchema = fieldEvidenceSchema + .omit({ url: true }) + .extend({ url: z.string().optional() }); + +export type LlmExtractionRecord = { + row: Record; + evidence: z.infer[]; + extraction_confidence?: number; +}; + +function columnValueSchema( + column: ColumnDef, +): z.ZodType { + switch (column.type) { + case "number": + return z.union([z.number(), z.null()]); + case "boolean": + return z.union([z.boolean(), z.null()]); + default: + return z.union([z.string(), z.null()]); + } +} + +/** Explicit column keys so AI SDK structured output guides the model to populate row fields. */ +export function buildLlmExtractionResultSchema(spec: DatasetSpec) { + const rowShape: Record = {}; + for (const column of spec.columns) { + rowShape[column.name] = columnValueSchema(column); + } + + const llmExtractionRecordSchema = z.object({ + row: z.object(rowShape), + evidence: z.array(llmFieldEvidenceSchema), + extraction_confidence: z.number().min(0).max(1).optional(), + }); + + return z.object({ + records: z.array(llmExtractionRecordSchema), + notes: z.string().optional(), + }); +} + +const EXTRACTION_SYSTEM = `You are the Extraction Agent for a web data collection pipeline. + +Extract structured records from the provided page content according to the dataset specification. + +Rules: +- Only extract facts supported by the page text. Do not invent data. +- row keys must match spec column names exactly. +- For columns with type "number", store numeric values only (no unit text in the value; the unit is already in the column name). +- Use null for unknown values. +- Return multiple records if the page lists multiple entities matching row_grain. +- If the page has no relevant data, return an empty records array. +- evidence: include field, quote, and url for fields you populated when you have a supporting quote (url = where that quote was found; use the page URL when from this page). Not required for every column. +- Do not return source_urls on the record. 
+- extraction_confidence (0–1): how confident you are this row is accurate.
+- Provenance URL columns (e.g. source_url, evidence_url, or columns described as where data was found): set each row's value to the URL where that row's facts came from — use the provided page URL when all fields for that row are from this page, or a more specific URL only if clearly stated on the page.
+- Do not copy unrelated URLs into provenance columns (e.g. do not set source_url to the page URL when pricing_page_url already holds the pricing URL and source_url should cite where you read the plan).
+- Return ONLY JSON`;
+
+/** Cap page text at `config.maxPageChars`, appending a `[truncated]` marker when it was cut. */
+function truncatePageText(text: string): string {
+  if (text.length <= config.maxPageChars) return text;
+  return `${text.slice(0, config.maxPageChars)}\n\n[truncated]`;
+}
+
+/** True for `null`, `undefined`, and the empty string; any other value counts as populated. */
+function isEmpty(value: unknown): boolean {
+  return value === null || value === undefined || value === "";
+}
+
+/**
+ * Best-effort conversion of an evidence quote into a value for `column`.
+ *
+ * - boolean: keyword heuristics; negative phrases are checked before positive ones.
+ * - number: the whole quote (commas removed) must parse as a finite number.
+ * - anything else: the trimmed quote itself.
+ *
+ * @returns the coerced value, or `null` when the quote is blank or cannot be
+ *   interpreted for the column's type.
+ */
+function coerceEvidenceToColumnValue(
+  column: ColumnDef,
+  quote: string,
+): string | number | boolean | null {
+  const trimmed = quote.trim();
+  if (!trimmed) return null;
+
+  switch (column.type) {
+    case "boolean": {
+      const lower = trimmed.toLowerCase();
+      // Negative phrases must be tested first: several of them contain a
+      // positive keyword ("not hiring" contains "hiring", "no open positions"
+      // contains "open positions"), so testing the positive pattern first
+      // turned negated quotes into `true` and left "not hiring" unreachable.
+      if (
+        /\b(false|no|not hiring|no careers|does not contain|lack of|without)\b/.test(
+          lower,
+        )
+      ) {
+        return false;
+      }
+      if (
+        /\b(true|yes|active|hiring|looking for|open roles|open positions|join us|join our team|we(?:'re| are) hiring|see open roles)\b/.test(
+          lower,
+        )
+      ) {
+        return true;
+      }
+      return null;
+    }
+    case "number": {
+      const parsed = Number(trimmed.replace(/,/g, ""));
+      return Number.isFinite(parsed) ? parsed : null;
+    }
+    default:
+      return trimmed;
+  }
+}
+
+/**
+ * Fill empty row fields from evidence quotes, mutating `row` in place.
+ *
+ * An evidence item is applied only when the row's value for that field is
+ * empty, the field names a column in `spec`, and the quote coerces to a
+ * non-null value for that column's type.
+ */
+function hydrateRowFromEvidence(
+  row: Record<string, unknown>,
+  evidence: Array<{ field: string; quote: string }>,
+  spec: DatasetSpec,
+): void {
+  const columnByName = new Map(spec.columns.map((column) => [column.name, column]));
+
+  for (const item of evidence) {
+    if (isEmpty(row[item.field])) {
+      const column = columnByName.get(item.field);
+      if (!column) continue;
+      const value = coerceEvidenceToColumnValue(column, item.quote);
+      if (value !== null) {
+        row[item.field] = value;
+      }
+    }
+  }
+}
+
+/** Columns meant to hold a citation URL for where row data was found (not content URLs). */
+export function isProvenanceUrlColumn(column: ColumnDef): boolean {
+  const name = column.name.toLowerCase();
+  if (name === "source_url" || name === "evidence_url") {
+    return true;
+  }
+  if (name.endsWith("_source_url")) {
+    return true;
+  }
+  // Otherwise the name must mention both "source" and "url" and the
+  // description must describe the column as evidence/provenance/"where".
+  const description = column.description.toLowerCase();
+  return (
+    name.includes("source") &&
+    name.includes("url") &&
+    (description.includes("evidence") ||
+      description.includes("provenance") ||
+      description.includes("where"))
+  );
+}
+
+/** All spec columns that `isProvenanceUrlColumn` accepts, in spec order. */
+function provenanceUrlColumns(spec: DatasetSpec): ColumnDef[] {
+  return spec.columns.filter(isProvenanceUrlColumn);
+}
+
+/**
+ * Distinct source URLs for a record: `pageUrl` first, then every evidence URL
+ * that starts with "http", in first-seen order.
+ */
+function collectSourceUrls(
+  pageUrl: string,
+  evidence: Array<{ url?: string }>,
+): string[] {
+  const urls = new Set([pageUrl]);
+  for (const item of evidence) {
+    if (item.url?.startsWith("http")) {
+      urls.add(item.url);
+    }
+  }
+  return [...urls];
+}
+
+/** Attach evidence URLs and source_urls; keep LLM row and provenance values.
/**
 * Turn one raw LLM extraction record into a validated `ExtractedRecord`.
 *
 * Works on a shallow copy of the row: empty cells are back-filled from the
 * evidence quotes, evidence items without a usable URL fall back to `pageUrl`,
 * and required provenance URL columns that are still empty are set to
 * `pageUrl`. The assembled object is validated by `extractedRecordSchema`.
 */
export function finalizeExtractedRecord(
  record: LlmExtractionRecord,
  pageUrl: string,
  spec: DatasetSpec,
): ExtractedRecord {
  const hydratedRow = { ...record.row };
  hydrateRowFromEvidence(hydratedRow, record.evidence, spec);

  const citedEvidence = record.evidence.map((item) => {
    const trimmedUrl = item.url?.trim();
    return {
      field: item.field,
      quote: item.quote,
      url: trimmedUrl ? trimmedUrl : pageUrl,
    };
  });

  // Only required provenance columns are defaulted; optional ones stay as the LLM left them.
  for (const provenanceColumn of provenanceUrlColumns(spec)) {
    if (!provenanceColumn.required) continue;
    if (!isEmpty(hydratedRow[provenanceColumn.name])) continue;
    hydratedRow[provenanceColumn.name] = pageUrl;
  }

  const candidate: Record<string, unknown> = {
    row: hydratedRow,
    evidence: citedEvidence,
    source_urls: collectSourceUrls(pageUrl, citedEvidence),
  };
  if (record.extraction_confidence !== undefined) {
    candidate.extraction_confidence = record.extraction_confidence;
  }

  return extractedRecordSchema.parse(candidate);
}
Use null only when the page truly lacks that information.", + } + : {}), + workflow_memory: options.memory + ? memoryContextForAgents(options.memory) + : undefined, + output_shape: { + records: [ + { + row: { column_name: "value or null" }, + evidence: [{ field: "column_name", url: "string", quote: "string" }], + extraction_confidence: "0-1 number", + }, + ], + notes: "optional string", + }, + }), + }, + ], + }); + + return finalizeExtractedRecords( + result.records as LlmExtractionRecord[], + pageUrl, + spec, + ); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/repair-diagnosis.ts b/backend/BigSet_Data_Collection_Agent/src/agents/repair-diagnosis.ts new file mode 100644 index 0000000..be77e5e --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/repair-diagnosis.ts @@ -0,0 +1,80 @@ +import type { CoverageReport } from "../coverage/analyze.js"; +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import { repairDiagnosisSchema, type RepairDiagnosis } from "../memory/types.js"; +import type { DatasetSpec } from "../models/schemas.js"; +import type { SourcesReport } from "../models/quality.js"; + +const DIAGNOSIS_SYSTEM = `You are the Repair Diagnosis Agent for a web data collection pipeline. + +A repair loop just finished (or is about to start). Analyze workflow memory, coverage gaps, and source outcomes to explain what failed and how the next search/fetch/agent pass should change. + +Rules: +- Be specific and actionable — cite domains, query patterns, and triage/agent failures from memory when relevant. +- recommended_search_patterns: concrete query templates or angles (not duplicates of failed_queries). +- domains_to_prioritize: hosts that previously yielded records or match the missing fields. +- domains_to_avoid: hosts that failed fetch, blocked, or returned no usable rows. 
/**
 * Request a repair diagnosis from the LLM via `completeJson`, using
 * `repairDiagnosisSchema` as the response schema.
 *
 * The user message is a JSON document with the user prompt, loop counters,
 * a trimmed dataset spec, coverage figures, a sample of failed sources
 * (at most 20, error text cut to 120 characters), the workflow memory
 * context and the expected output shape.
 */
export async function generateRepairDiagnosis(options: {
  userPrompt: string;
  spec: DatasetSpec;
  coverage: CoverageReport;
  memory: WorkflowMemory;
  sources?: SourcesReport;
  repairLoop: number;
  maxRepairLoops: number;
}): Promise<RepairDiagnosis> {
  const { spec, coverage, sources } = options;

  const sourceFailuresSample = sources
    ? sources.failed.slice(0, 20).map((failure) => ({
        url: failure.url,
        outcome: failure.outcome,
        error: failure.error?.slice(0, 120),
      }))
    : [];

  const userPayload = {
    user_prompt: options.userPrompt,
    repair_loop: options.repairLoop,
    max_repair_loops: options.maxRepairLoops,
    dataset_spec: {
      intent_summary: spec.intent_summary,
      row_grain: spec.row_grain,
      columns: spec.columns,
      dedupe_keys: spec.dedupe_keys,
    },
    coverage: {
      total_records: coverage.total_records,
      complete_count: coverage.complete_count,
      partial_count: coverage.partial_count,
      required_columns: coverage.required_columns,
      field_gaps: coverage.field_gaps,
    },
    source_failures_sample: sourceFailuresSample,
    workflow_memory: memoryContextForAgents(options.memory),
    output_shape: {
      summary: "string",
      likely_causes: ["string"],
      recommended_search_patterns: ["string"],
      domains_to_prioritize: ["string"],
      domains_to_avoid: ["string"],
      prefer_tinyfish_agent: "boolean",
      agent_strategy_notes: "optional string",
      extraction_notes: "optional string",
    },
  };

  return completeJson({
    label: `repair_diagnosis:loop${options.repairLoop}`,
    schema: repairDiagnosisSchema,
    messages: [
      { role: "system", content: DIAGNOSIS_SYSTEM },
      { role: "user", content: JSON.stringify(userPayload) },
    ],
  });
}
b/backend/BigSet_Data_Collection_Agent/src/agents/repair-queries.ts new file mode 100644 index 0000000..441778b --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/repair-queries.ts @@ -0,0 +1,108 @@ +import { z } from "zod"; +import type { CoverageReport } from "../coverage/analyze.js"; +import { completeJson } from "../integrations/openrouter.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import type { RepairDiagnosis } from "../memory/types.js"; +import type { DatasetSpec } from "../models/schemas.js"; + +const repairQueriesSchema = z.object({ + repair_queries: z.array(z.string()).min(1), + rationale: z.string(), +}); + +export type RepairQueriesResult = z.infer; + +function buildRepairQueriesSystem(maxQueries: number): string { + const minQueries = Math.min(2, maxQueries); + return `You are the Coverage & Query Planning Agent for a web data collection pipeline. + +After an initial extraction pass, some required fields are still missing. Generate targeted web search queries to find pages that can fill those gaps. + +Rules: +- Return between ${minQueries} and ${maxQueries} repair_queries (the user message includes max_queries — use as many distinct queries as needed, up to that limit). +- Prefer more queries when multiple fields or example rows need coverage (e.g. one query angle per missing field or per entity in example_rows). +- Each query should aim at a different source angle (company site, press release, database, registry, news). +- Include entity names or attributes from example_rows when available. +- Do NOT repeat or lightly rephrase queries already in prior_search_queries. +- Temporal rules (same as initial search): + - Use current_year / current_date when recency matters unless the user_prompt names a specific year. + - Do not default to outdated years. +- Prefer queries likely to return factual detail pages, not generic listicles. 
/**
 * Current date (`YYYY-MM-DD`) and year for inclusion in LLM prompts.
 *
 * Both values are taken in UTC so they always agree: `toISOString()` is UTC,
 * and pairing it with the local-time `getFullYear()` could report a year that
 * differs from the one in `current_date` near New Year.
 */
function currentTimeContext(): { current_date: string; current_year: number } {
  const now = new Date();
  return {
    current_date: now.toISOString().slice(0, 10),
    current_year: now.getUTCFullYear(),
  };
}
options.memory?.repair_loop_count ?? 0, + repair_diagnosis: options.diagnosis, + workflow_memory: options.memory + ? memoryContextForAgents(options.memory) + : undefined, + output_shape: { + repair_queries: ["string"], + rationale: "string", + }, + }), + }, + ], + }); + + return { + ...result, + repair_queries: result.repair_queries.slice(0, options.maxQueries), + }; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts new file mode 100644 index 0000000..6c9e219 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts @@ -0,0 +1,100 @@ +import { config } from "../config.js"; +import { completeJson } from "../integrations/openrouter.js"; +import { sourceStatusSchema } from "../models/source-status.js"; +import { + memoryContextForAgents, + type WorkflowMemory, +} from "../memory/index.js"; +import { + sourceTriageResultSchema, + type DatasetSpec, + type FetchedPage, + type SourceTriageResult, +} from "../models/schemas.js"; + +const TRIAGE_SYSTEM = `You are the Source Triage Agent for a web data collection pipeline. + +Classify each fetched web page to decide how the pipeline should process it. + +Status definitions: +- extract_now: Page already contains a usable list/table or enough inline data to extract rows directly. +- requires_navigation: Data exists but requires clicking through menus, pagination, tabs, or multi-step browsing. +- requires_form_submission: Data requires filling and submitting a search/filter form. +- requires_detail_page_followup: Page is an index; each item needs opening a detail page to get full fields. +- irrelevant: Page is unrelated to the dataset intent. +- duplicate: Page largely repeats data already covered (same listings, mirror content). +- blocked: Login wall, CAPTCHA, access denied, or bot block. +- low_value: Related but unlikely to yield useful rows (thin content, ads-only, generic homepage). 
/** Cap page text at `config.maxPageChars`, appending a `[truncated]` marker when it was cut. */
function truncate(text: string): string {
  const limit = config.maxPageChars;
  return text.length > limit
    ? `${text.slice(0, limit)}\n\n[truncated]`
    : text;
}
/** Plain decimal literal: optional sign, digits with optional fraction, optional exponent. */
const STRICT_DECIMAL = /^[+-]?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/i;

/**
 * Parse a whole env string as a decimal number.
 *
 * Unlike `parseInt`/`parseFloat`, which stop at the first unparseable
 * character ("1e3" -> 1, "2O" -> 2), the entire trimmed string must be a
 * number; anything else yields NaN so the caller reports it.
 */
function parseStrictNumber(raw: string): number {
  const text = raw.trim();
  return STRICT_DECIMAL.test(text) ? Number(text) : Number.NaN;
}

/**
 * Read env var `name` as a number in [0, 1].
 * Returns `fallback` when unset or empty; throws on any other invalid value.
 */
function readFloat(name: string, fallback: number): number {
  const raw = process.env[name];
  if (!raw) return fallback;
  const value = parseStrictNumber(raw);
  if (Number.isNaN(value) || value < 0 || value > 1) {
    throw new Error(`Invalid ${name}: expected number 0–1, got "${raw}"`);
  }
  return value;
}

/**
 * Read env var `name` as a number in [0, 2].
 * Returns `undefined` when unset or empty; throws on any other invalid value.
 */
function readOptionalFloat(name: string): number | undefined {
  const raw = process.env[name];
  if (raw === undefined || raw === "") return undefined;
  const value = parseStrictNumber(raw);
  if (Number.isNaN(value) || value < 0 || value > 2) {
    throw new Error(`Invalid ${name}: expected number 0–2, got "${raw}"`);
  }
  return value;
}

/**
 * Read env var `name` as a positive integer.
 * Returns `fallback` when unset or empty; throws when the value is not a
 * whole number greater than zero (fractions and trailing garbage are rejected
 * rather than silently truncated).
 */
function readInt(name: string, fallback: number): number {
  const raw = process.env[name];
  if (!raw) return fallback;
  const value = parseStrictNumber(raw);
  if (!Number.isInteger(value) || value <= 0) {
    throw new Error(`Invalid ${name}: expected positive integer, got "${raw}"`);
  }
  return value;
}
historical queries to re-run on the next Search API page during repair. */ + maxRepairSearchPaginationQueries: readInt( + "MAX_REPAIR_SEARCH_PAGINATION_QUERIES", + 2, + ), + /** Highest Search API page index (API allows 0–10). */ + maxSearchPage: readInt("MAX_SEARCH_PAGE", 10), + enableRepairLinkFollow: readBool("ENABLE_REPAIR_LINK_FOLLOW", true), + maxRepairLinkUrls: readInt("MAX_REPAIR_LINK_URLS", 8), + maxLinksPerSourcePage: readInt("MAX_LINKS_PER_SOURCE_PAGE", 3), + enableTriage: readBool("ENABLE_TRIAGE", true), + enableTinyfishAgent: readBool("ENABLE_TINYFISH_AGENT", true), + maxAgentRunsPerPhase: readInt("MAX_AGENT_RUNS_PER_PHASE", 5), + agentConcurrency: readInt("AGENT_CONCURRENCY", 2), + /** Parallel `/run-async` queue submissions per agent phase. */ + agentQueueConcurrency: readInt("AGENT_QUEUE_CONCURRENCY", 10), + /** Parallel `runs.get` polls while agent jobs execute on Tinyfish. */ + agentPollConcurrency: readInt("AGENT_POLL_CONCURRENCY", 10), + agentPollIntervalMs: readInt("AGENT_POLL_INTERVAL_MS", 3000), + agentPollTimeoutMs: readInt("AGENT_POLL_TIMEOUT_MS", 1_200_000), + triageConcurrency: readInt("TRIAGE_CONCURRENCY", 5), + enableQualityScoring: readBool("ENABLE_QUALITY_SCORING", true), + /** results.csv only includes rows with all required fields, ranked by quality. 
/**
 * Fail fast when a required API key is absent from `config`.
 *
 * @throws Error listing every missing environment variable name.
 */
export function assertConfig(): void {
  const requiredSettings: Array<[string, string]> = [
    ["TINYFISH_API_KEY", config.tinyfishApiKey],
    ["OPENROUTER_API_KEY", config.openRouterApiKey],
  ];

  const missing = requiredSettings
    .filter(([, value]) => !value)
    .map(([envName]) => envName);

  if (missing.length === 0) return;

  throw new Error(
    `Missing required environment variables: ${missing.join(", ")}. Copy .env.example to .env and fill in values.`,
  );
}
/**
 * Summarize how well `records` cover the required columns of `spec`.
 *
 * - `field_gaps` has one entry per required column with at least one missing
 *   value (or one per required column when there are no records at all),
 *   including up to five example rows lacking that field.
 * - `complete_count` / `partial_count` count rows with all / not all required
 *   fields present, so together they always equal `total_records`.
 * - `partial_record_ids` lists canonical ids of partial rows; rows for which
 *   `canonicalRecordId` returns nothing are counted but have no id to list.
 */
export function analyzeCoverage(
  spec: DatasetSpec,
  records: ExtractedRecord[],
): CoverageReport {
  const requiredColumns = spec.columns.filter((col) => col.required);
  const noRecords = records.length === 0;

  const fieldGaps: FieldGap[] = requiredColumns
    .map((col) => {
      const missingRecords = records.filter((record) =>
        isEmpty(record.row[col.name]),
      );

      return {
        column: col.name,
        description: col.description,
        missing_count: missingRecords.length,
        // With no records every required field counts as fully missing.
        missing_pct: noRecords ? 1 : missingRecords.length / records.length,
        example_rows: missingRecords.slice(0, 5).map((record) => record.row),
      };
    })
    .filter((gap) => gap.missing_count > 0 || noRecords);

  // Every retained gap is either non-empty or exists because nothing was
  // collected yet, so any gap at all warrants a repair pass.
  const shouldRepair = fieldGaps.length > 0;

  const partialRecordIds: string[] = [];
  let completeCount = 0;
  let partialCount = 0;

  for (const record of records) {
    const missingRequired = requiredColumns.some((col) =>
      isEmpty(record.row[col.name]),
    );
    if (!missingRequired) {
      completeCount += 1;
      continue;
    }
    // Count the row even when it has no canonical id; deriving the count from
    // the id list would under-report partial rows.
    partialCount += 1;
    const id = canonicalRecordId(record, spec);
    if (id) partialRecordIds.push(id);
  }

  return {
    total_records: records.length,
    required_columns: requiredColumns.map((col) => col.name),
    field_gaps: fieldGaps,
    should_repair: shouldRepair,
    complete_count: completeCount,
    partial_count: partialCount,
    partial_record_ids: partialRecordIds,
  };
}
/** Quote a CSV field when it contains a quote, comma or line break; inner quotes are doubled. */
function escapeCsv(value: string): string {
  const needsQuoting = /[",\n\r]/.test(value);
  return needsQuoting ? `"${value.replace(/"/g, '""')}"` : value;
}

/** Render a row value as cell text: null/undefined become "", booleans "true"/"false". */
function cellValue(value: unknown): string {
  if (value == null) return "";
  if (typeof value === "boolean") return value ? "true" : "false";
  return String(value);
}

/** Fixed quality columns appended after the data and source columns. */
const QUALITY_COLUMNS = [
  "record_id",
  "record_status",
  "needs_review",
  "completeness_pct",
  "confidence_score",
  "missing_required_fields",
  "review_reasons",
] as const;

/** Header names `<column>_confidence`, one per required column, in spec order. */
function fieldConfidenceColumns(spec: DatasetSpec): string[] {
  const headers: string[] = [];
  for (const col of spec.columns) {
    if (col.required) headers.push(`${col.name}_confidence`);
  }
  return headers;
}

/**
 * Cells for the quality columns plus per-field confidence columns.
 * Without a quality entry every one of those cells is empty.
 */
function qualityCells(
  quality: RecordQuality | undefined,
  spec: DatasetSpec,
): string[] {
  const confidenceHeaders = fieldConfidenceColumns(spec);
  if (!quality) {
    const blanks: string[] = [];
    const width = QUALITY_COLUMNS.length + confidenceHeaders.length;
    for (let i = 0; i < width; i += 1) blanks.push("");
    return blanks;
  }

  const summary = [
    quality.record_id,
    quality.record_status,
    quality.needs_review ? "true" : "false",
    String(quality.completeness_pct),
    String(quality.confidence_score),
    quality.missing_required_fields.join("; "),
    quality.review_reasons.join("; "),
  ].map((text) => escapeCsv(text));

  const confidences: string[] = [];
  for (const col of spec.columns) {
    if (!col.required) continue;
    const score = quality.field_confidences[col.name];
    confidences.push(escapeCsv(score !== undefined ? String(score) : ""));
  }

  return [...summary, ...confidences];
}

/**
 * Write `records` to `path` as CSV: spec columns, then `primary_source_url`
 * and `all_source_urls` (joined with " | "), then the quality columns when a
 * quality map is supplied. The file always ends with a newline.
 */
export async function writeResultsCsv(
  path: string,
  spec: DatasetSpec,
  records: ExtractedRecord[],
  qualityByRecordId?: Map<string, RecordQuality>,
): Promise<void> {
  const columnNames = spec.columns.map((c) => c.name);

  const header: string[] = [
    ...columnNames,
    "primary_source_url",
    "all_source_urls",
  ];
  if (qualityByRecordId !== undefined) {
    header.push(...QUALITY_COLUMNS, ...fieldConfidenceColumns(spec));
  }

  const dataLines = records.map((record) => {
    const cells = columnNames.map((name) =>
      escapeCsv(cellValue(record.row[name])),
    );
    cells.push(
      escapeCsv(record.source_urls[0] ?? ""),
      escapeCsv(record.source_urls.join(" | ")),
    );

    if (qualityByRecordId !== undefined) {
      const recordId = canonicalRecordId(record, spec);
      const quality = recordId ? qualityByRecordId.get(recordId) : undefined;
      cells.push(...qualityCells(quality, spec));
    }

    return cells.join(",");
  });

  const lines = [header.map((name) => escapeCsv(name)).join(","), ...dataLines];
  await writeFile(path, `${lines.join("\n")}\n`, "utf8");
}
/**
 * Write four quality-segmented CSVs under `root`: `records_complete.csv`,
 * `records_partial.csv`, `records_low_confidence.csv` (by `record_status`)
 * and `records_needing_review.csv` (by `needs_review`).
 *
 * Records without a canonical id or without a quality entry appear in none of
 * the files. A record may appear in a status file and in the review file.
 */
export async function writeSegmentedRecordCsvs(
  root: string,
  spec: DatasetSpec,
  records: ExtractedRecord[],
  qualities: RecordQuality[],
): Promise<void> {
  const qualityById = new Map<string, RecordQuality>();
  for (const quality of qualities) {
    qualityById.set(quality.record_id, quality);
  }

  const segments: Array<{
    file: string;
    rows: ExtractedRecord[];
    accepts: (quality: RecordQuality) => boolean;
  }> = [
    {
      file: "records_complete.csv",
      rows: [],
      accepts: (quality) => quality.record_status === "complete",
    },
    {
      file: "records_partial.csv",
      rows: [],
      accepts: (quality) => quality.record_status === "partial",
    },
    {
      file: "records_low_confidence.csv",
      rows: [],
      accepts: (quality) => quality.record_status === "low_confidence",
    },
    {
      file: "records_needing_review.csv",
      rows: [],
      accepts: (quality) => quality.needs_review === true,
    },
  ];

  // Single pass: resolve each record's quality once, then offer it to every segment.
  for (const record of records) {
    const id = canonicalRecordId(record, spec);
    const quality = id ? qualityById.get(id) : undefined;
    if (!quality) continue;
    for (const segment of segments) {
      if (segment.accepts(quality)) segment.rows.push(record);
    }
  }

  for (const segment of segments) {
    await writeResultsCsv(join(root, segment.file), spec, segment.rows, qualityById);
  }
}
/** A value is empty when it is null, undefined or the empty string. */
function isEmpty(value: unknown): boolean {
  return value == null || value === "";
}

/** True when every required column of `spec` has a non-empty value in `record.row`. */
export function hasAllRequiredFields(
  spec: DatasetSpec,
  record: ExtractedRecord,
): boolean {
  for (const col of spec.columns) {
    if (col.required && isEmpty(record.row[col.name])) {
      return false;
    }
  }
  return true;
}
/**
 * Pick the records shown in the primary results view.
 *
 * A record qualifies when all required fields are present, it has a canonical
 * id, and its quality entry reports no missing required fields. Qualifying
 * records are ordered by completeness (highest first), then by confidence
 * (highest first). The input array is not modified.
 */
export function selectVisualizationRecords(
  spec: DatasetSpec,
  records: ExtractedRecord[],
  qualityById: Map<string, RecordQuality>,
): ExtractedRecord[] {
  const ranked: Array<{ record: ExtractedRecord; quality: RecordQuality }> = [];

  for (const record of records) {
    if (!hasAllRequiredFields(spec, record)) continue;
    const id = canonicalRecordId(record, spec);
    const quality = id ? qualityById.get(id) : undefined;
    if (quality === undefined) continue;
    if (quality.missing_required_fields.length !== 0) continue;
    ranked.push({ record, quality });
  }

  ranked.sort((left, right) => {
    const byCompleteness =
      right.quality.completeness_pct - left.quality.completeness_pct;
    if (right.quality.completeness_pct !== left.quality.completeness_pct) {
      return byCompleteness;
    }
    return right.quality.confidence_score - left.quality.confidence_score;
  });

  return ranked.map((entry) => entry.record);
}
/**
 * Flatten an SDK `Run` into the pipeline's result shape.
 * A failed run without an error message gets the generic "Agent run failed".
 */
function runToResult(run: Run): TinyfishAgentRunResult {
  let error: string | null = run.error?.message ?? null;
  if (error === null && run.status === RunStatus.FAILED) {
    error = "Agent run failed";
  }

  const payload = (run.result as Record<string, unknown> | null) ?? null;

  return {
    run_id: run.run_id,
    status: run.status,
    result: payload,
    error,
  };
}
*/ +export async function cancelTinyfishAgentRun(runId: string): Promise { + if (!runId.trim()) return; + + try { + await withRetry( + async () => { + const response = await fetch( + `${TINYFISH_API_BASE}/v1/runs/${encodeURIComponent(runId)}/cancel`, + { + method: "POST", + headers: { + "X-API-Key": config.tinyfishApiKey, + "Content-Type": "application/json", + }, + }, + ); + + if (!response.ok) { + const body = await response.text(); + throw new Error( + `Cancel failed (${response.status})${body ? `: ${body.slice(0, 200)}` : ""}`, + ); + } + }, + { + maxRetries: 1, + baseDelayMs: config.retryBaseDelayMs, + label: `agent.cancel:${runId}`, + }, + ); + } catch { + // Cancel is best-effort — polling timeout still reports failure. + } +} + +/** Submit a run via `/run-async` (returns immediately with run_id). */ +export async function queueTinyfishAgent( + url: string, + goal: string, +): Promise { + const response = await withRetry( + () => getClient().agent.queue({ url, goal }), + { + maxRetries: config.maxRetries, + baseDelayMs: config.retryBaseDelayMs, + label: `agent.queue:${url}`, + }, + ); + + if (response.error) { + return { run_id: null, error: response.error.message }; + } + + if (!response.run_id) { + return { run_id: null, error: "Failed to queue agent run (no run_id)" }; + } + + return { run_id: response.run_id, error: null }; +} + +/** Poll `runs.get` until the run reaches a terminal status or times out. 
*/ +export async function pollTinyfishAgentUntilDone( + runId: string, +): Promise { + const startedAt = Date.now(); + let lastStatus = RunStatus.PENDING; + + while (true) { + const run = await withRetry( + () => getClient().runs.get(runId), + { + maxRetries: config.maxRetries, + baseDelayMs: config.retryBaseDelayMs, + label: `agent.poll:${runId}`, + }, + ); + + lastStatus = run.status; + + if (TERMINAL_STATUSES.has(run.status)) { + return runToResult(run); + } + + if (Date.now() - startedAt >= config.agentPollTimeoutMs) { + await cancelTinyfishAgentRun(runId); + + try { + const finalRun = await getClient().runs.get(runId); + if (TERMINAL_STATUSES.has(finalRun.status)) { + const result = runToResult(finalRun); + if (finalRun.status === RunStatus.CANCELLED) { + return { + ...result, + error: + result.error ?? + `Agent run cancelled after ${config.agentPollTimeoutMs}ms (was ${lastStatus})`, + }; + } + return result; + } + } catch { + // Fall through to TIMEOUT result below. + } + + return { + run_id: runId, + status: "TIMEOUT", + result: null, + error: `Agent run timed out after ${config.agentPollTimeoutMs}ms (last status: ${lastStatus}); cancel requested`, + }; + } + + await sleep(config.agentPollIntervalMs); + } +} + +/** + * Queue then poll — drop-in replacement for the old synchronous `/run` helper. + */ +export async function runTinyfishAgent( + url: string, + goal: string, +): Promise { + const queued = await queueTinyfishAgent(url, goal); + if (queued.error || !queued.run_id) { + return { + run_id: null, + status: RunStatus.FAILED, + result: null, + error: queued.error ?? "Failed to queue agent run", + }; + } + return pollTinyfishAgentUntilDone(queued.run_id); +} + +/** + * Queue all jobs quickly, then poll in parallel — better overlap than sync `/run` waves. 
+ */ +export async function runTinyfishAgentsBatch( + jobs: TinyfishAgentJob[], +): Promise { + if (jobs.length === 0) return []; + + const queued = await mapWithConcurrency( + jobs, + config.agentQueueConcurrency, + async (job) => { + const queueResult = await queueTinyfishAgent(job.url, job.goal); + return { job, ...queueResult }; + }, + ); + + const results: TinyfishAgentRunResult[] = new Array(jobs.length); + + const pollTargets: { index: number; run_id: string }[] = []; + for (let index = 0; index < queued.length; index++) { + const item = queued[index]!; + if (item.error || !item.run_id) { + results[index] = { + run_id: null, + status: RunStatus.FAILED, + result: null, + error: item.error ?? "Failed to queue agent run", + }; + continue; + } + pollTargets.push({ index, run_id: item.run_id }); + } + + await mapWithConcurrency( + pollTargets, + config.agentPollConcurrency, + async ({ index, run_id }) => { + results[index] = await pollTinyfishAgentUntilDone(run_id); + }, + ); + + return results; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish.ts b/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish.ts new file mode 100644 index 0000000..c11948a --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/integrations/tinyfish.ts @@ -0,0 +1,70 @@ +import { TinyFish } from "@tiny-fish/sdk"; +import { config } from "../config.js"; +import type { FetchedPage, SourceCandidate } from "../models/schemas.js"; + +let client: TinyFish | null = null; + +function getClient(): TinyFish { + if (!client) { + client = new TinyFish({ apiKey: config.tinyfishApiKey }); + } + return client; +} + +export async function searchWeb( + query: string, + page = 0, +): Promise { + const response = await getClient().search.query({ query, page }); + return response.results.map((result) => ({ + url: result.url, + title: result.title, + snippet: result.snippet, + site_name: result.site_name, + query, + position: result.position, + search_page: page, + 
})); +} + +export async function fetchPages( + urls: string[], + options?: { includeLinks?: boolean }, +): Promise { + if (urls.length === 0) return []; + + const response = await getClient().fetch.getContents({ + urls, + format: "markdown", + links: options?.includeLinks ?? false, + }); + + const pages: FetchedPage[] = response.results.map((page) => ({ + url: page.url, + final_url: page.final_url ?? page.url, + title: page.title ?? "", + description: page.description ?? undefined, + text: typeof page.text === "string" ? page.text : JSON.stringify(page.text), + outbound_links: page.links, + })); + + for (const err of response.errors) { + pages.push({ + url: err.url, + final_url: err.url, + title: "", + text: "", + error: err.error, + }); + } + + return pages; +} + +export function chunkUrls(urls: string[], size: number): string[][] { + const chunks: string[][] = []; + for (let i = 0; i < urls.length; i += size) { + chunks.push(urls.slice(i, i + size)); + } + return chunks; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/llm/complete-json.ts b/backend/BigSet_Data_Collection_Agent/src/llm/complete-json.ts new file mode 100644 index 0000000..bed77f2 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/llm/complete-json.ts @@ -0,0 +1,93 @@ +import { generateText, Output } from "ai"; +import type { z } from "zod"; + +import { config } from "../config.js"; +import { getOpenRouterLimiter } from "../queue/pools.js"; +import { getOpenRouterChatModel } from "./provider.js"; +import { recordLanguageModelUsage } from "./usage.js"; + +export interface LlmMessage { + role: "system" | "user" | "assistant"; + content: string; +} + +type ConversationMessage = { + role: "user" | "assistant"; + content: string; +}; + +function splitPromptMessages(messages: LlmMessage[]): { + system?: string; + messages: ConversationMessage[]; +} { + const systemParts: string[] = []; + const conversation: ConversationMessage[] = []; + + for (const message of messages) { + if 
(message.role === "system") { + systemParts.push(message.content); + continue; + } + conversation.push({ role: message.role, content: message.content }); + } + + return { + system: systemParts.length > 0 ? systemParts.join("\n\n") : undefined, + messages: conversation, + }; +} + +/** + * Structured JSON completion via Vercel AI SDK (`generateText` + `Output.object`). + * Token usage is recorded into the current `runWithLlmUsageScope` when active. + */ +export async function completeJson(options: { + messages: LlmMessage[]; + schema: z.ZodType; + label: string; + maxRetries?: number; +}): Promise { + const maxRetries = options.maxRetries ?? 2; + let messages = [...options.messages]; + let lastError: unknown; + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + await getOpenRouterLimiter().acquire(); + + const { system, messages: conversation } = splitPromptMessages(messages); + + try { + const result = await generateText({ + model: getOpenRouterChatModel(), + ...(system ? { system } : {}), + messages: conversation, + output: Output.object({ schema: options.schema }), + ...(config.openRouterTemperature !== undefined + ? { temperature: config.openRouterTemperature } + : {}), + }); + + recordLanguageModelUsage(result.usage); + return result.output as T; + } catch (error) { + lastError = error; + if (attempt < maxRetries) { + messages = [ + ...messages, + { + role: "user", + content: `Your JSON was invalid for ${options.label}. Error: ${ + error instanceof Error ? error.message : String(error) + }. Return only valid JSON matching the requested schema.`, + }, + ]; + } + } + } + + throw new Error( + `${options.label} failed after ${maxRetries + 1} attempts: ${ + lastError instanceof Error ? 
lastError.message : String(lastError) + }`, + ); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/llm/provider.ts b/backend/BigSet_Data_Collection_Agent/src/llm/provider.ts new file mode 100644 index 0000000..078e514 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/llm/provider.ts @@ -0,0 +1,23 @@ +import { createOpenRouter } from "@openrouter/ai-sdk-provider"; + +import { config } from "../config.js"; + +let openRouterProvider: ReturnType | null = null; + +function getOpenRouterProvider(): ReturnType { + if (!openRouterProvider) { + openRouterProvider = createOpenRouter({ + apiKey: config.openRouterApiKey, + headers: { + "HTTP-Referer": config.openRouterSiteUrl, + "X-Title": config.openRouterAppName, + }, + }); + } + return openRouterProvider; +} + +/** OpenRouter chat model via the official AI SDK provider (not OpenAI-compatible shim). */ +export function getOpenRouterChatModel() { + return getOpenRouterProvider().chat(config.openRouterModel); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/llm/usage.ts b/backend/BigSet_Data_Collection_Agent/src/llm/usage.ts new file mode 100644 index 0000000..5f27740 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/llm/usage.ts @@ -0,0 +1,57 @@ +import { AsyncLocalStorage } from "node:async_hooks"; +import type { LanguageModelUsage } from "ai"; + +export interface LlmUsageTotals { + promptTokens: number; + completionTokens: number; + totalTokens: number; + callCount: number; +} + +const storage = new AsyncLocalStorage(); + +export function emptyLlmUsage(): LlmUsageTotals { + return { + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + callCount: 0, + }; +} + +/** Run pipeline (or other work) with a scoped LLM usage accumulator. 
*/ +export async function runWithLlmUsageScope( + fn: () => Promise, +): Promise<{ result: T; usage: LlmUsageTotals }> { + const usage = emptyLlmUsage(); + const result = await storage.run(usage, fn); + return { result, usage: { ...usage } }; +} + +export function getCurrentLlmUsage(): LlmUsageTotals { + return storage.getStore() ?? emptyLlmUsage(); +} + +export function recordLanguageModelUsage(usage: LanguageModelUsage | undefined): void { + const totals = storage.getStore(); + if (!totals || !usage) { + return; + } + + const promptTokens = usage.inputTokens ?? 0; + const completionTokens = usage.outputTokens ?? 0; + totals.promptTokens += promptTokens; + totals.completionTokens += completionTokens; + totals.totalTokens += usage.totalTokens ?? promptTokens + completionTokens; + totals.callCount += 1; +} + +export function toDatasetAgentUsage( + usage: LlmUsageTotals, +): { promptTokens: number; completionTokens: number; totalTokens: number } { + return { + promptTokens: usage.promptTokens, + completionTokens: usage.completionTokens, + totalTokens: usage.totalTokens, + }; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/fingerprint.ts b/backend/BigSet_Data_Collection_Agent/src/memory/fingerprint.ts new file mode 100644 index 0000000..7d49854 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/memory/fingerprint.ts @@ -0,0 +1,6 @@ +import { createHash } from "node:crypto"; + +export function promptFingerprint(prompt: string): string { + const normalized = prompt.trim().toLowerCase().replace(/\s+/g, " "); + return createHash("sha256").update(normalized).digest("hex").slice(0, 16); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/index.ts b/backend/BigSet_Data_Collection_Agent/src/memory/index.ts new file mode 100644 index 0000000..4dec404 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/memory/index.ts @@ -0,0 +1,26 @@ +export { promptFingerprint } from "./fingerprint.js"; +export { + createWorkflowMemory, + 
domainMemoryBoost, + memoryContextForAgents, + mergePersistentMemory, + recordCoverageGaps, + recordDiagnosis, + recordPhaseInMemory, + snapshotExtractionSchema, +} from "./workflow-memory.js"; +export { loadPersistentMemory, savePersistentMemory, saveRunMemory } from "./store.js"; +export { + aggregateQueryStatsByText, + effectiveWeightedQuality, + planRepairSearches, + type SearchPlan, +} from "./search-pagination.js"; +export type { + AgentGoalMemoryEntry, + DomainMemoryEntry, + QueryMemoryEntry, + RepairDiagnosis, + WorkflowMemory, +} from "./types.js"; +export { repairDiagnosisSchema, workflowMemorySchema } from "./types.js"; diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/scored-aggregates.ts b/backend/BigSet_Data_Collection_Agent/src/memory/scored-aggregates.ts new file mode 100644 index 0000000..5d873a7 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/memory/scored-aggregates.ts @@ -0,0 +1,481 @@ +import type { + AgentRunRecord, + DatasetSpec, + ExtractedRecord, + SourceCandidate, + SourceTriageResult, +} from "../models/schemas.js"; +import { agentExtractedUrls, triageByUrl } from "../quality/index.js"; +import { scoreRecord, type ScoreRecordContext } from "../quality/score-record.js"; +import { getDomain, normalizeUrl } from "../utils/url.js"; +import { recomputeWeightedQuality } from "./search-pagination.js"; +import type { + AgentGoalMemoryEntry, + DomainMemoryEntry, + QueryMemoryEntry, + QueryPageBreakdown, + WorkflowMemory, +} from "./types.js"; + +export interface RecordMetrics { + completeness: number; + confidence: number; +} + +function rollingAvg(current: number, count: number, value: number): number { + if (count <= 0) return value; + return (current * count + value) / (count + 1); +} + +export function metricsForRecord( + spec: DatasetSpec, + record: ExtractedRecord, + context: ScoreRecordContext, +): RecordMetrics { + const quality = scoreRecord(spec, record, context, "memory"); + return { + completeness: 
quality.completeness_pct, + confidence: quality.confidence_score, + }; +} + +export function buildUrlToQueryMap( + candidates: SourceCandidate[], +): Map { + const map = new Map(); + for (const candidate of candidates) { + map.set(normalizeUrl(candidate.url), candidate.query); + } + return map; +} + +function getOrCreateQueryEntry( + memory: WorkflowMemory, + query: string, + phase: string, + repairLoop: number, +): QueryMemoryEntry { + let entry = memory.query_stats.find( + (item) => item.query === query && item.phase === phase, + ); + if (!entry) { + entry = { + query, + phase, + repair_loop: repairLoop, + urls_produced: 0, + urls_with_records: 0, + record_count: 0, + avg_completeness: 0, + avg_confidence: 0, + search_page: 0, + weighted_quality: 0, + page_breakdown: [], + }; + memory.query_stats.push(entry); + } + return entry; +} + +function getOrCreatePageSlice( + entry: QueryMemoryEntry, + page: number, +): QueryPageBreakdown { + let slice = entry.page_breakdown.find((item) => item.page === page); + if (!slice) { + slice = { + page, + urls_produced: 0, + urls_with_records: 0, + record_count: 0, + avg_completeness: 0, + avg_confidence: 0, + }; + entry.page_breakdown.push(slice); + } + return slice; +} + +function applyMetricsToPageSlice( + slice: QueryPageBreakdown, + metrics: RecordMetrics, +): void { + slice.avg_completeness = rollingAvg( + slice.avg_completeness, + slice.record_count, + metrics.completeness, + ); + slice.avg_confidence = rollingAvg( + slice.avg_confidence, + slice.record_count, + metrics.confidence, + ); + slice.record_count += 1; +} + +function getOrCreateDomainEntry( + memory: WorkflowMemory, + domain: string, + repairLoop: number, +): DomainMemoryEntry { + let entry = memory.domain_stats.find((item) => item.domain === domain); + if (!entry) { + entry = { + domain, + record_count: 0, + fetch_failures: 0, + avg_completeness: 0, + avg_confidence: 0, + last_repair_loop: repairLoop, + }; + memory.domain_stats.push(entry); + } + return entry; 
+} + +function applyMetricsToDomain( + entry: DomainMemoryEntry, + metrics: RecordMetrics, + repairLoop: number, +): void { + entry.avg_completeness = rollingAvg( + entry.avg_completeness, + entry.record_count, + metrics.completeness, + ); + entry.avg_confidence = rollingAvg( + entry.avg_confidence, + entry.record_count, + metrics.confidence, + ); + entry.record_count += 1; + entry.last_repair_loop = repairLoop; +} + +function applyMetricsToQuery( + entry: QueryMemoryEntry, + metrics: RecordMetrics, + searchPage = 0, +): void { + entry.avg_completeness = rollingAvg( + entry.avg_completeness, + entry.record_count, + metrics.completeness, + ); + entry.avg_confidence = rollingAvg( + entry.avg_confidence, + entry.record_count, + metrics.confidence, + ); + entry.record_count += 1; + entry.search_page = Math.max(entry.search_page ?? 0, searchPage); + + const slice = getOrCreatePageSlice(entry, searchPage); + applyMetricsToPageSlice(slice, metrics); + recomputeWeightedQuality(entry); +} + +export function attributeRecordsToMemory(options: { + memory: WorkflowMemory; + spec: DatasetSpec; + phase: string; + repairLoop: number; + queries: string[]; + candidates: SourceCandidate[]; + records: ExtractedRecord[]; + failedUrls: string[]; + agentRuns: AgentRunRecord[]; + triageResults: SourceTriageResult[]; +}): void { + const { + memory, + spec, + phase, + repairLoop, + queries, + candidates, + records, + failedUrls, + agentRuns, + triageResults, + } = options; + + const urlToQuery = buildUrlToQueryMap(candidates); + const context: ScoreRecordContext = { + triageByUrl: triageByUrl(triageResults), + agentExtractedUrls: agentExtractedUrls(agentRuns), + }; + + const candidateUrlsByQuery = new Map>(); + const candidateUrlsByQueryPage = new Map>>(); + const urlToSearchPage = new Map(); + + for (const candidate of candidates) { + const normalized = normalizeUrl(candidate.url); + const page = candidate.search_page ?? 
0; + urlToSearchPage.set(normalized, page); + + if (!candidateUrlsByQuery.has(candidate.query)) { + candidateUrlsByQuery.set(candidate.query, new Set()); + } + candidateUrlsByQuery.get(candidate.query)!.add(normalized); + + if (!candidateUrlsByQueryPage.has(candidate.query)) { + candidateUrlsByQueryPage.set(candidate.query, new Map()); + } + const byPage = candidateUrlsByQueryPage.get(candidate.query)!; + if (!byPage.has(page)) byPage.set(page, new Set()); + byPage.get(page)!.add(normalized); + } + + for (const query of queries) { + const entry = getOrCreateQueryEntry(memory, query, phase, repairLoop); + const urls = candidateUrlsByQuery.get(query); + if (urls) entry.urls_produced += urls.size; + + const byPage = candidateUrlsByQueryPage.get(query); + if (byPage) { + for (const [page, pageUrls] of byPage) { + const slice = getOrCreatePageSlice(entry, page); + slice.urls_produced += pageUrls.size; + entry.search_page = Math.max(entry.search_page ?? 0, page); + } + } + } + + const urlsWithRecordsByQuery = new Map>(); + const urlsWithRecordsByQueryPage = new Map>>(); + + for (const record of records) { + const metrics = metricsForRecord(spec, record, context); + const queriesHit = new Set(); + const domainsHit = new Set(); + + const attributeUrl = (rawUrl: string) => { + const normalized = normalizeUrl(rawUrl); + const domain = getDomain(rawUrl); + + if (!domainsHit.has(domain)) { + domainsHit.add(domain); + applyMetricsToDomain( + getOrCreateDomainEntry(memory, domain, repairLoop), + metrics, + repairLoop, + ); + } + + const query = urlToQuery.get(normalized); + if (query) { + if (!urlsWithRecordsByQuery.has(query)) { + urlsWithRecordsByQuery.set(query, new Set()); + } + urlsWithRecordsByQuery.get(query)!.add(normalized); + queriesHit.add(query); + + const page = urlToSearchPage.get(normalized) ?? 
0; + if (!urlsWithRecordsByQueryPage.has(query)) { + urlsWithRecordsByQueryPage.set(query, new Map()); + } + const byPage = urlsWithRecordsByQueryPage.get(query)!; + if (!byPage.has(page)) byPage.set(page, new Set()); + byPage.get(page)!.add(normalized); + } + }; + + for (const sourceUrl of record.source_urls) { + attributeUrl(sourceUrl); + } + for (const item of record.evidence) { + attributeUrl(item.url); + } + + for (const query of queriesHit) { + let searchPage: number | undefined; /* undefined = no URL matched this query yet; 0 is a real page index, not a sentinel */ + for (const sourceUrl of record.source_urls) { + const normalized = normalizeUrl(sourceUrl); + if (urlToQuery.get(normalized) === query) { + searchPage = urlToSearchPage.get(normalized) ?? 0; + break; + } + } + if (searchPage === undefined) { /* fall back to evidence URLs only when no source URL matched */ + for (const item of record.evidence) { + const normalized = normalizeUrl(item.url); + if (urlToQuery.get(normalized) === query) { + searchPage = urlToSearchPage.get(normalized) ?? 0; + break; + } + } + } + applyMetricsToQuery( + getOrCreateQueryEntry(memory, query, phase, repairLoop), + metrics, + searchPage ?? 0, + ); + } + } + + for (const [query, urls] of urlsWithRecordsByQuery) { + const entry = getOrCreateQueryEntry(memory, query, phase, repairLoop); + entry.urls_with_records = Math.max(entry.urls_with_records, urls.size); + + const byPage = urlsWithRecordsByQueryPage.get(query); + if (byPage) { + for (const [page, pageUrls] of byPage) { + const slice = getOrCreatePageSlice(entry, page); + slice.urls_with_records = Math.max(slice.urls_with_records, pageUrls.size); + } + } + recomputeWeightedQuality(entry); + } + + for (const url of failedUrls) { + const entry = getOrCreateDomainEntry(memory, getDomain(url), repairLoop); + entry.fetch_failures += 1; + entry.last_repair_loop = repairLoop; + } + + for (const run of agentRuns) { + const normalizedUrl = normalizeUrl(run.url); + const domain = getDomain(run.url); + + if (run.records_extracted > 0 && run.goal) { + const matching = records.filter((record) => + record.source_urls.some((u) => normalizeUrl(u) === 
normalizedUrl), + ); + + let goalEntry = memory.agent_goal_stats.find( + (item) => item.url === run.url && item.goal === run.goal, + ); + if (!goalEntry) { + goalEntry = { + url: run.url, + goal: run.goal, + repair_loop: repairLoop, + record_count: 0, + avg_completeness: 0, + avg_confidence: 0, + }; + memory.agent_goal_stats.push(goalEntry); + } + + for (const record of matching) { + const metrics = metricsForRecord(spec, record, context); + goalEntry.avg_completeness = rollingAvg( + goalEntry.avg_completeness, + goalEntry.record_count, + metrics.completeness, + ); + goalEntry.avg_confidence = rollingAvg( + goalEntry.avg_confidence, + goalEntry.record_count, + metrics.confidence, + ); + goalEntry.record_count += 1; + } + } else { + const domainEntry = getOrCreateDomainEntry(memory, domain, repairLoop); + domainEntry.fetch_failures += 1; + } + } + + capMemoryLists(memory); +} + +function capMemoryLists(memory: WorkflowMemory): void { + if (memory.query_stats.length > 80) { + memory.query_stats.splice(0, memory.query_stats.length - 80); + } + if (memory.domain_stats.length > 50) { + memory.domain_stats.sort((a, b) => b.record_count - a.record_count); + memory.domain_stats = memory.domain_stats.slice(0, 50); + } + if (memory.agent_goal_stats.length > 40) { + memory.agent_goal_stats = memory.agent_goal_stats + .filter((item) => item.record_count > 0) + .slice(-40); + } +} + +export function mergeQueryEntry( + target: QueryMemoryEntry, + source: QueryMemoryEntry, +): void { + const totalRecords = target.record_count + source.record_count; + if (totalRecords > 0) { + target.avg_completeness = + (target.avg_completeness * target.record_count + + source.avg_completeness * source.record_count) / + totalRecords; + target.avg_confidence = + (target.avg_confidence * target.record_count + + source.avg_confidence * source.record_count) / + totalRecords; + } + target.record_count = totalRecords; + target.urls_produced += source.urls_produced; + target.urls_with_records += 
source.urls_with_records; + target.repair_loop = Math.max(target.repair_loop, source.repair_loop); + target.search_page = Math.max( + target.search_page ?? 0, + source.search_page ?? 0, + ); + + for (const slice of source.page_breakdown ?? []) { + const targetSlice = getOrCreatePageSlice(target, slice.page); + const combinedRecords = targetSlice.record_count + slice.record_count; + if (combinedRecords > 0) { + targetSlice.avg_completeness = + (targetSlice.avg_completeness * targetSlice.record_count + + slice.avg_completeness * slice.record_count) / + combinedRecords; + targetSlice.avg_confidence = + (targetSlice.avg_confidence * targetSlice.record_count + + slice.avg_confidence * slice.record_count) / + combinedRecords; + } + targetSlice.record_count = combinedRecords; + targetSlice.urls_produced += slice.urls_produced; + targetSlice.urls_with_records += slice.urls_with_records; + } + recomputeWeightedQuality(target); +} + +export function mergeDomainEntry( + target: DomainMemoryEntry, + source: DomainMemoryEntry, +): void { + const totalRecords = target.record_count + source.record_count; + if (totalRecords > 0) { + target.avg_completeness = + (target.avg_completeness * target.record_count + + source.avg_completeness * source.record_count) / + totalRecords; + target.avg_confidence = + (target.avg_confidence * target.record_count + + source.avg_confidence * source.record_count) / + totalRecords; + } + target.record_count = totalRecords; + target.fetch_failures += source.fetch_failures; + target.last_repair_loop = Math.max(target.last_repair_loop, source.last_repair_loop); +} + +export function mergeAgentGoalEntry( + target: AgentGoalMemoryEntry, + source: AgentGoalMemoryEntry, +): void { + const totalRecords = target.record_count + source.record_count; + if (totalRecords > 0) { + target.avg_completeness = + (target.avg_completeness * target.record_count + + source.avg_completeness * source.record_count) / + totalRecords; + target.avg_confidence = + 
(target.avg_confidence * target.record_count + + source.avg_confidence * source.record_count) / + totalRecords; + } + target.record_count = totalRecords; + target.repair_loop = Math.max(target.repair_loop, source.repair_loop); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/search-pagination.ts b/backend/BigSet_Data_Collection_Agent/src/memory/search-pagination.ts new file mode 100644 index 0000000..67c9e9e --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/memory/search-pagination.ts @@ -0,0 +1,184 @@ +import { config } from "../config.js"; +import type { QueryMemoryEntry, WorkflowMemory } from "./types.js"; + +export interface SearchPlan { + /** Base query string sent to the Search API. */ + query: string; + /** Search API page index (0-based, max 10). */ + page: number; +} + +/** Front pages count more toward recurring-search ranking. */ +const PAGE_WEIGHTS = [1.0, 0.75, 0.5, 0.35, 0.25, 0.2, 0.15, 0.12, 0.1, 0.08, 0.05]; + +export function pageWeight(page: number): number { + if (page < 0) return 0.05; + return PAGE_WEIGHTS[page] ?? 0.05; +} + +export function effectiveWeightedQuality(entry: QueryMemoryEntry): number { + if (entry.weighted_quality > 0) return entry.weighted_quality; + if (entry.record_count <= 0) return 0; + return (entry.avg_completeness + entry.avg_confidence) / 2; +} + +export function recomputeWeightedQuality(entry: QueryMemoryEntry): void { + const breakdown = entry.page_breakdown ?? []; + if (breakdown.length === 0) { + entry.weighted_quality = + entry.record_count > 0 + ? (entry.avg_completeness + entry.avg_confidence) / 2 + : 0; + return; + } + + let numerator = 0; + let denominator = 0; + for (const slice of breakdown) { + if (slice.record_count <= 0) continue; + const w = pageWeight(slice.page) * slice.record_count; + const q = (slice.avg_completeness + slice.avg_confidence) / 2; + numerator += w * q; + denominator += w; + } + entry.weighted_quality = denominator > 0 ? 
numerator / denominator : 0; +} + +/** Roll up stats for the same query text across phases. Page slices are cloned when an aggregate is seeded, so the in-place merge further down never writes back into `memory.query_stats`. */ +export function aggregateQueryStatsByText( + memory: WorkflowMemory, +): Map { + const map = new Map(); + + for (const item of memory.query_stats) { + const existing = map.get(item.query); + if (!existing) { + map.set(item.query, { + ...item, + phases: [item.phase], + search_page: item.search_page ?? 0, + weighted_quality: item.weighted_quality ?? 0, + page_breakdown: (item.page_breakdown ?? []).map((slice) => ({ ...slice })), /* copy each slice: a plain array spread would share the slice objects with memory */ + }); + continue; + } + + existing.phases.push(item.phase); + existing.record_count += item.record_count; + existing.urls_produced += item.urls_produced; + existing.urls_with_records += item.urls_with_records; + existing.search_page = Math.max( + existing.search_page ?? 0, + item.search_page ?? 0, + ); + existing.repair_loop = Math.max(existing.repair_loop, item.repair_loop); + + const totalRecords = existing.record_count; + if (totalRecords > 0) { + const prevCount = totalRecords - item.record_count; + if (prevCount > 0) { + existing.avg_completeness = + (existing.avg_completeness * prevCount + + item.avg_completeness * item.record_count) / + totalRecords; + existing.avg_confidence = + (existing.avg_confidence * prevCount + + item.avg_confidence * item.record_count) / + totalRecords; + } else { + existing.avg_completeness = item.avg_completeness; + existing.avg_confidence = item.avg_confidence; + } + } + + for (const slice of item.page_breakdown ?? 
[]) { + const target = existing.page_breakdown!.find((p) => p.page === slice.page); + if (!target) { + existing.page_breakdown!.push({ ...slice }); + } else { + const combined = target.record_count + slice.record_count; + if (combined > 0) { + target.avg_completeness = + (target.avg_completeness * target.record_count + + slice.avg_completeness * slice.record_count) / + combined; + target.avg_confidence = + (target.avg_confidence * target.record_count + + slice.avg_confidence * slice.record_count) / + combined; + } + target.record_count = combined; + target.urls_produced += slice.urls_produced; + target.urls_with_records += slice.urls_with_records; + } + } + recomputeWeightedQuality(existing); + } + + return map; +} + +/** New repair queries at page 0; top historical queries at the next page. */ +export function planRepairSearches( + memory: WorkflowMemory, + newQueries: string[], +): SearchPlan[] { + const plans: SearchPlan[] = []; + const seen = new Set(); + + for (const raw of newQueries) { + const query = raw.trim(); + if (!query || seen.has(query)) continue; + seen.add(query); + plans.push({ query, page: 0 }); + } + + const aggregated = aggregateQueryStatsByText(memory); + const top = [...aggregated.values()] + .filter((item) => item.record_count > 0) + .sort( + (a, b) => effectiveWeightedQuality(b) - effectiveWeightedQuality(a), + ) + .slice(0, config.maxRepairSearchPaginationQueries); + + for (const entry of top) { + const nextPage = (entry.search_page ?? 0) + 1; + if (nextPage > config.maxSearchPage) continue; + if (seen.has(entry.query)) continue; + seen.add(entry.query); + plans.push({ query: entry.query, page: nextPage }); + } + + return plans; +} + +/** After a repair search pass, persist the highest page used per query. 
*/
+export function markSearchPagesUsed(
+  memory: WorkflowMemory,
+  plans: SearchPlan[],
+  phase: string,
+  repairLoop: number,
+): void {
+  for (const plan of plans) {
+    let entry = memory.query_stats.find(
+      (item) => item.query === plan.query && item.phase === phase,
+    );
+    if (!entry) {
+      // First time this query is seen in this phase: start an empty aggregate.
+      entry = {
+        query: plan.query,
+        phase,
+        repair_loop: repairLoop,
+        urls_produced: 0,
+        urls_with_records: 0,
+        record_count: 0,
+        avg_completeness: 0,
+        avg_confidence: 0,
+        search_page: plan.page,
+        weighted_quality: 0,
+        page_breakdown: [],
+      };
+      memory.query_stats.push(entry);
+    }
+    entry.search_page = Math.max(entry.search_page ?? 0, plan.page);
+  }
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/store.ts b/backend/BigSet_Data_Collection_Agent/src/memory/store.ts
new file mode 100644
index 0000000..a8c75f7
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/memory/store.ts
@@ -0,0 +1,125 @@
+import { mkdir, readFile, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+import { workflowMemorySchema, type WorkflowMemory } from "./types.js";
+
+/** Path of the persisted memory file for a prompt fingerprint. */
+export function globalMemoryPath(memoryDir: string, fingerprint: string): string {
+  return join(memoryDir, `${fingerprint}.json`);
+}
+
+/** Migrate v1.1 coarse memory format to scored stats (best-effort). */
+function migrateLegacyMemory(raw: Record<string, unknown>): WorkflowMemory {
+  const base = workflowMemorySchema.parse({
+    prompt_fingerprint: raw.prompt_fingerprint,
+    user_prompt: raw.user_prompt,
+    repair_loop_count: raw.repair_loop_count ?? 0,
+    query_stats: [],
+    domain_stats: [],
+    agent_goal_stats: [],
+    extraction_schema: raw.extraction_schema,
+    dedupe_keys: raw.dedupe_keys ?? [],
+    diagnoses: raw.diagnoses ?? [],
+    strategy_notes: raw.strategy_notes ?? [],
+    last_missing_fields: raw.last_missing_fields,
+  });
+
+  const successfulDomains = raw.successful_domains as string[] | undefined;
+  const failedDomains = raw.failed_domains as string[] | undefined;
+
+  // The legacy format only recorded success/failure, so scores are placeholders.
+  for (const domain of successfulDomains ?? []) {
+    base.domain_stats.push({
+      domain,
+      record_count: 1,
+      fetch_failures: 0,
+      avg_completeness: 0.7,
+      avg_confidence: 0.7,
+      last_repair_loop: 0,
+    });
+  }
+  for (const domain of failedDomains ?? []) {
+    base.domain_stats.push({
+      domain,
+      record_count: 0,
+      fetch_failures: 1,
+      avg_completeness: 0,
+      avg_confidence: 0,
+      last_repair_loop: 0,
+    });
+  }
+
+  const successfulQueries = raw.successful_queries as
+    | { query: string; phase: string; repair_loop: number }[]
+    | undefined;
+  for (const item of successfulQueries ?? []) {
+    base.query_stats.push({
+      query: item.query,
+      phase: item.phase,
+      repair_loop: item.repair_loop,
+      urls_produced: 1,
+      urls_with_records: 1,
+      record_count: 1,
+      avg_completeness: 0.7,
+      avg_confidence: 0.7,
+      search_page: 0,
+      weighted_quality: 0.7,
+      page_breakdown: [],
+    });
+  }
+
+  for (const query of (raw.failed_queries as string[] | undefined) ?? []) {
+    base.query_stats.push({
+      query,
+      phase: "legacy",
+      repair_loop: 0,
+      urls_produced: 1,
+      urls_with_records: 0,
+      record_count: 0,
+      avg_completeness: 0,
+      avg_confidence: 0,
+      search_page: 0,
+      weighted_quality: 0,
+      page_breakdown: [],
+    });
+  }
+
+  return base;
+}
+
+/**
+ * Load the persisted memory for a fingerprint.
+ *
+ * Files that already carry a `query_stats` array are validated as-is; anything
+ * else goes through the legacy migration. Best-effort: a read, JSON or schema
+ * failure yields `null` rather than an exception.
+ */
+export async function loadPersistentMemory(
+  memoryDir: string,
+  fingerprint: string,
+): Promise<WorkflowMemory | null> {
+  try {
+    const raw = JSON.parse(
+      await readFile(globalMemoryPath(memoryDir, fingerprint), "utf8"),
+    ) as Record<string, unknown>;
+
+    if (Array.isArray(raw.query_stats)) {
+      return workflowMemorySchema.parse(raw);
+    }
+
+    return migrateLegacyMemory(raw);
+  } catch {
+    return null;
+  }
+}
+
+/** Write the memory as pretty-printed JSON, creating `memoryDir` if needed. */
+export async function savePersistentMemory(
+  memoryDir: string,
+  memory: WorkflowMemory,
+): Promise<void> {
+  await mkdir(memoryDir, { recursive: true });
+  await writeFile(
+    globalMemoryPath(memoryDir, memory.prompt_fingerprint),
+    `${JSON.stringify(memory, null, 2)}\n`,
+    "utf8",
+  );
+}
+
+/** Write the memory to `workflow_memory.json` under `runRoot`; returns the path. */
+export async function saveRunMemory(
+  runRoot: string,
+  memory: WorkflowMemory,
+): Promise<string> {
+  const path = join(runRoot, "workflow_memory.json");
+  await writeFile(path, `${JSON.stringify(memory, null, 2)}\n`, "utf8");
+  return path;
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/types.ts b/backend/BigSet_Data_Collection_Agent/src/memory/types.ts
new file mode 100644
index 0000000..893b658
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/memory/types.ts
@@ -0,0 +1,101 @@
+import { z } from "zod";
+
+export const queryPageBreakdownSchema = z.object({
+  page: z.number().int().min(0),
+  urls_produced: z.number().int().nonnegative(),
+  urls_with_records: z.number().int().nonnegative(),
+  record_count: z.number().int().nonnegative(),
+  avg_completeness: z.number().min(0).max(1),
+  avg_confidence: z.number().min(0).max(1),
+});
+
+export type QueryPageBreakdown = z.infer<typeof queryPageBreakdownSchema>;
+
+/** Rolling aggregate for a search query based on records from URLs it surfaced.
*/
+export const queryMemoryEntrySchema = z.object({
+  query: z.string(),
+  phase: z.string(),
+  repair_loop: z.number(),
+  urls_produced: z.number().int().nonnegative(),
+  urls_with_records: z.number().int().nonnegative(),
+  record_count: z.number().int().nonnegative(),
+  avg_completeness: z.number().min(0).max(1),
+  avg_confidence: z.number().min(0).max(1),
+  /** Last Search API page index used for this query (0-based). */
+  search_page: z.number().int().min(0).default(0),
+  /** Page-weighted quality for recurring search (earlier pages weigh more). */
+  weighted_quality: z.number().min(0).max(1).default(0),
+  page_breakdown: z.array(queryPageBreakdownSchema).default([]),
+});
+
+export type QueryMemoryEntry = z.infer<typeof queryMemoryEntrySchema>;
+
+/** Rolling aggregate for a hostname from records attributed to that domain. */
+export const domainMemoryEntrySchema = z.object({
+  domain: z.string(),
+  record_count: z.number().int().nonnegative(),
+  fetch_failures: z.number().int().nonnegative(),
+  avg_completeness: z.number().min(0).max(1),
+  avg_confidence: z.number().min(0).max(1),
+  last_repair_loop: z.number().int().nonnegative(),
+});
+
+export type DomainMemoryEntry = z.infer<typeof domainMemoryEntrySchema>;
+
+/** Rolling aggregate for a Tinyfish Agent goal from records on that URL.
*/
+export const agentGoalMemoryEntrySchema = z.object({
+  url: z.string(),
+  goal: z.string(),
+  repair_loop: z.number(),
+  record_count: z.number().int().nonnegative(),
+  avg_completeness: z.number().min(0).max(1),
+  avg_confidence: z.number().min(0).max(1),
+});
+
+export type AgentGoalMemoryEntry = z.infer<typeof agentGoalMemoryEntrySchema>;
+
+/** Column list, dedupe keys and row grain captured from a dataset spec. */
+export const extractionSchemaSnapshotSchema = z.object({
+  columns: z.array(
+    z.object({
+      name: z.string(),
+      type: z.string(),
+      required: z.boolean(),
+    }),
+  ),
+  dedupe_keys: z.array(z.string()),
+  row_grain: z.string(),
+});
+
+export const repairDiagnosisSchema = z.object({
+  summary: z.string(),
+  likely_causes: z.array(z.string()),
+  recommended_search_patterns: z.array(z.string()),
+  domains_to_prioritize: z.array(z.string()),
+  domains_to_avoid: z.array(z.string()),
+  prefer_tinyfish_agent: z.boolean(),
+  agent_strategy_notes: z.string().optional(),
+  extraction_notes: z.string().optional(),
+});
+
+export type RepairDiagnosis = z.infer<typeof repairDiagnosisSchema>;
+
+export const workflowMemorySchema = z.object({
+  prompt_fingerprint: z.string(),
+  user_prompt: z.string(),
+  repair_loop_count: z.number(),
+  query_stats: z.array(queryMemoryEntrySchema),
+  domain_stats: z.array(domainMemoryEntrySchema),
+  agent_goal_stats: z.array(agentGoalMemoryEntrySchema),
+  extraction_schema: extractionSchemaSnapshotSchema.optional(),
+  dedupe_keys: z.array(z.string()),
+  diagnoses: z.array(
+    z.object({
+      repair_loop: z.number(),
+      diagnosis: repairDiagnosisSchema,
+    }),
+  ),
+  strategy_notes: z.array(z.string()),
+  last_missing_fields: z.array(z.string()).optional(),
+});
+
+export type WorkflowMemory = z.infer<typeof workflowMemorySchema>;
diff --git a/backend/BigSet_Data_Collection_Agent/src/memory/workflow-memory.ts b/backend/BigSet_Data_Collection_Agent/src/memory/workflow-memory.ts
new file mode 100644
index 0000000..559d91f
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/memory/workflow-memory.ts
@@ -0,0 +1,208 @@
+import type { CoverageReport } from "../coverage/analyze.js";
+import type {
+  AgentRunRecord,
+  DatasetSpec,
+  ExtractedRecord,
+  SourceCandidate,
+  SourceTriageResult,
+} from "../models/schemas.js";
+import { promptFingerprint } from "./fingerprint.js";
+import { effectiveWeightedQuality } from "./search-pagination.js";
+import {
+  attributeRecordsToMemory,
+  mergeAgentGoalEntry,
+  mergeDomainEntry,
+  mergeQueryEntry,
+} from "./scored-aggregates.js";
+import type {
+  RepairDiagnosis,
+  WorkflowMemory,
+} from "./types.js";
+
+/**
+ * Create an empty workflow memory for a prompt.
+ *
+ * @param userPrompt Prompt text; also used to derive the fingerprint.
+ * @param spec Optional dataset spec whose dedupe keys and schema are recorded.
+ */
+export function createWorkflowMemory(
+  userPrompt: string,
+  spec?: DatasetSpec,
+): WorkflowMemory {
+  return {
+    prompt_fingerprint: promptFingerprint(userPrompt),
+    user_prompt: userPrompt,
+    repair_loop_count: 0,
+    query_stats: [],
+    domain_stats: [],
+    agent_goal_stats: [],
+    // Copy so the memory never shares an array with the spec.
+    dedupe_keys: spec ? [...spec.dedupe_keys] : [],
+    extraction_schema: spec ? snapshotExtractionSchema(spec) : undefined,
+    diagnoses: [],
+    strategy_notes: [],
+  };
+}
+
+/** Reduce a dataset spec to the row grain, dedupe keys and column name/type/required. */
+export function snapshotExtractionSchema(
+  spec: DatasetSpec,
+): WorkflowMemory["extraction_schema"] {
+  return {
+    row_grain: spec.row_grain,
+    dedupe_keys: [...spec.dedupe_keys],
+    columns: spec.columns.map((col) => ({
+      name: col.name,
+      type: col.type,
+      required: col.required,
+    })),
+  };
+}
+
+/** Forward one phase's results to `attributeRecordsToMemory`. */
+export function recordPhaseInMemory(options: {
+  memory: WorkflowMemory;
+  spec: DatasetSpec;
+  phase: string;
+  repairLoop: number;
+  queries: string[];
+  candidates: SourceCandidate[];
+  records: ExtractedRecord[];
+  failedUrls: string[];
+  agentRuns: AgentRunRecord[];
+  triageResults: SourceTriageResult[];
+}): void {
+  attributeRecordsToMemory(options);
+}
+
+/**
+ * Append a diagnosis for a repair loop. A non-empty summary is also added to
+ * `strategy_notes`, which is trimmed to its 30 most recent entries.
+ */
+export function recordDiagnosis(
+  memory: WorkflowMemory,
+  repairLoop: number,
+  diagnosis: RepairDiagnosis,
+): void {
+  memory.diagnoses.push({ repair_loop: repairLoop, diagnosis });
+  if (diagnosis.summary) {
+    memory.strategy_notes.push(`[loop ${repairLoop}] ${diagnosis.summary}`);
+  }
+  if (memory.strategy_notes.length > 30) {
+    memory.strategy_notes.splice(0, memory.strategy_notes.length - 30);
+  }
+}
+
+export function
recordCoverageGaps(
+  memory: WorkflowMemory,
+  coverage: CoverageReport,
+): void {
+  memory.last_missing_fields = coverage.field_gaps.map((gap) => gap.column);
+}
+
+/**
+ * Fold previously persisted memory into `base` (mutated and returned).
+ *
+ * Nothing is merged when `prior` is null or belongs to another prompt
+ * fingerprint. Query, domain and agent-goal entries are merged into matching
+ * entries or appended as copies; strategy notes are appended when not present.
+ */
+export function mergePersistentMemory(
+  base: WorkflowMemory,
+  prior: WorkflowMemory | null,
+): WorkflowMemory {
+  if (!prior || prior.prompt_fingerprint !== base.prompt_fingerprint) {
+    return base;
+  }
+
+  for (const source of prior.query_stats) {
+    const target = base.query_stats.find(
+      (item) => item.query === source.query && item.phase === source.phase,
+    );
+    if (target) mergeQueryEntry(target, source);
+    else {
+      // Copy the nested page breakdown too, so later in-place updates on
+      // `base` cannot leak back into `prior`.
+      base.query_stats.push({
+        ...source,
+        page_breakdown: (source.page_breakdown ?? []).map((slice) => ({
+          ...slice,
+        })),
+      });
+    }
+  }
+
+  for (const source of prior.domain_stats) {
+    const target = base.domain_stats.find((item) => item.domain === source.domain);
+    if (target) mergeDomainEntry(target, source);
+    else base.domain_stats.push({ ...source });
+  }
+
+  for (const source of prior.agent_goal_stats) {
+    const target = base.agent_goal_stats.find(
+      (item) => item.url === source.url && item.goal === source.goal,
+    );
+    if (target) mergeAgentGoalEntry(target, source);
+    else base.agent_goal_stats.push({ ...source });
+  }
+
+  for (const note of prior.strategy_notes) {
+    if (!base.strategy_notes.includes(note)) {
+      base.strategy_notes.push(note);
+    }
+  }
+
+  return base;
+}
+
+/** Queries that produced records, best effective weighted quality first. */
+function topQueries(memory: WorkflowMemory, limit: number) {
+  return [...memory.query_stats]
+    .filter((item) => item.record_count > 0)
+    .sort(
+      (a, b) => effectiveWeightedQuality(b) - effectiveWeightedQuality(a),
+    )
+    .slice(0, limit);
+}
+
+/** The last `limit` queries (in stored order) that returned URLs but no records. */
+function weakQueries(memory: WorkflowMemory, limit: number) {
+  return [...memory.query_stats]
+    .filter((item) => item.urls_produced > 0 && item.record_count === 0)
+    .slice(-limit);
+}
+
+/** Domains that produced records, highest completeness + confidence first. */
+function topDomains(memory: WorkflowMemory, limit: number) {
+  return [...memory.domain_stats]
+    .filter((item) => item.record_count > 0)
+    .sort(
+      (a, b) =>
+        b.avg_completeness + b.avg_confidence - (a.avg_completeness + a.avg_confidence),
+    )
+    // The list is sorted best-first, so the top entries are at the head.
+    // `slice(-limit)` returned the lowest-scoring domains instead.
+    .slice(0, limit);
+}
+
+/**
+ * Domains with fetch failures, or with records whose average completeness is
+ * below 0.5, most fetch failures first.
+ */
+function weakDomains(memory: WorkflowMemory, limit: number) {
+  return [...memory.domain_stats]
+    .filter(
+      (item) =>
+        item.fetch_failures > 0 ||
+        (item.record_count > 0 && item.avg_completeness < 0.5),
+    )
+    .sort((a, b) => b.fetch_failures - a.fetch_failures)
+    // Sorted worst-first: keep the head. `slice(-limit)` kept the domains with
+    // the fewest failures.
+    .slice(0, limit);
+}
+
+/** Agent goals that produced records, highest completeness + confidence first. */
+function topAgentGoals(memory: WorkflowMemory, limit: number) {
+  return [...memory.agent_goal_stats]
+    .filter((item) => item.record_count > 0)
+    .sort(
+      (a, b) =>
+        b.avg_completeness + b.avg_confidence - (a.avg_completeness + a.avg_confidence),
+    )
+    // Sorted best-first: keep the head, not the tail.
+    .slice(0, limit);
+}
+
+/** Compact context injected into LLM agent calls. */
+export function memoryContextForAgents(memory: WorkflowMemory): Record<string, unknown> {
+  return {
+    repair_loop_count: memory.repair_loop_count,
+    query_stats_top: topQueries(memory, 12),
+    query_stats_weak: weakQueries(memory, 10),
+    domain_stats_top: topDomains(memory, 15),
+    domain_stats_weak: weakDomains(memory, 12),
+    agent_goal_stats_top: topAgentGoals(memory, 6),
+    extraction_schema: memory.extraction_schema,
+    dedupe_keys: memory.dedupe_keys,
+    last_missing_fields: memory.last_missing_fields,
+    strategy_notes: memory.strategy_notes.slice(-8),
+    latest_diagnosis:
+      memory.diagnoses.length > 0
+        ? memory.diagnoses[memory.diagnoses.length - 1]!.diagnosis
+        : undefined,
+  };
+}
+
+/**
+ * Ranking adjustment for a domain based on remembered results.
+ *
+ * @returns 0 for an unknown domain, -4 for a domain with fetch failures and no
+ *   records, otherwise `(mean(avg_completeness, avg_confidence) - 0.5) * 4`.
+ */
+export function domainMemoryBoost(
+  memory: WorkflowMemory,
+  domain: string,
+): number {
+  const stats = memory.domain_stats.find((item) => item.domain === domain);
+  if (!stats) return 0;
+
+  if (stats.record_count === 0 && stats.fetch_failures > 0) {
+    return -4;
+  }
+
+  const qualityScore = (stats.avg_completeness + stats.avg_confidence) / 2;
+  return (qualityScore - 0.5) * 4;
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/merge/records.ts b/backend/BigSet_Data_Collection_Agent/src/merge/records.ts
new file mode 100644
index 0000000..995af2d
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/merge/records.ts
@@ -0,0 +1,153 @@
+import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js";
+
+/** Stringify, trim and lower-case a cell value; null/undefined become "". */
+function normalizeValue(value: unknown): string {
+  if (value === null || value === undefined) return "";
+  return String(value).trim().toLowerCase();
+}
+
+/** Normalize entity names for stable primary-key matching. */
+export function normalizePrimaryKey(value: unknown): string {
+  return normalizeValue(value)
+    .replace(/\s+/g, " ")
+    // NOTE(review): the character class arrived as two identical straight
+    // quotes plus a backtick, which makes the apostrophe part a no-op; it is
+    // assumed the original listed the curly quotes U+2018/U+2019 - confirm.
+    .replace(/[\u2018\u2019`]/g, "'");
+}
+
+/** Join the normalized values of `keys` with "||". */
+export function recordDedupeKey(
+  record: ExtractedRecord,
+  keys: string[],
+): string {
+  return keys.map((key) => normalizeValue(record.row[key])).join("||");
+}
+
+/** True when a composite key is empty or consists only of empty parts. */
+function isEmptyCompositeKey(key: string, keyCount: number): boolean {
+  return !key || key === Array.from({ length: keyCount }, () => "").join("||");
+}
+
+/**
+ * Primary identity column: first dedupe key, or first column whose name suggests a name/title.
+ */
+export function getPrimaryKeyColumn(spec: DatasetSpec): string | null {
+  if (spec.dedupe_keys.length > 0) {
+    return spec.dedupe_keys[0]!;
+  }
+
+  const nameLike = spec.columns.find((col) =>
+    /(name|title|company|organization|entity)/i.test(col.name),
+  );
+  return nameLike?.name ?? spec.columns[0]?.name ?? null;
+}
+
+/** Normalized value of the primary key column, or "" when there is no column. */
+export function getPrimaryKeyValue(
+  record: ExtractedRecord,
+  spec: DatasetSpec,
+): string {
+  const column = getPrimaryKeyColumn(spec);
+  if (!column) return "";
+  return normalizePrimaryKey(record.row[column]);
+}
+
+/**
+ * Canonical row id: primary key when present, otherwise full composite dedupe key.
+ */
+export function canonicalRecordId(
+  record: ExtractedRecord,
+  spec: DatasetSpec,
+): string | null {
+  const primary = getPrimaryKeyValue(record, spec);
+  if (primary) {
+    return `pk:${primary}`;
+  }
+
+  const composite = recordDedupeKey(record, spec.dedupe_keys);
+  if (!isEmptyCompositeKey(composite, spec.dedupe_keys.length)) {
+    return `dk:${composite}`;
+  }
+
+  return null;
+}
+
+export interface MergeResult {
+  records: ExtractedRecord[];
+  unkeyed: ExtractedRecord[];
+}
+
+/**
+ * Collapse records that share a canonical id, merging later rows into earlier
+ * ones with `mergePair`. Records without an id are returned in `unkeyed`.
+ */
+export function mergeRecords(
+  spec: DatasetSpec,
+  records: ExtractedRecord[],
+): MergeResult {
+  const seen = new Map<string, ExtractedRecord>();
+  const unkeyed: ExtractedRecord[] = [];
+
+  for (const record of records) {
+    const id = canonicalRecordId(record, spec);
+    if (!id) {
+      unkeyed.push(record);
+      continue;
+    }
+
+    const existing = seen.get(id);
+    if (!existing) {
+      seen.set(id, record);
+      continue;
+    }
+
+    seen.set(id, mergePair(existing, record, spec));
+  }
+
+  return { records: [...seen.values()], unkeyed };
+}
+
+/**
+ * Merge repair-pass rows into an existing dataset.
+ * Rows with the same primary key (e.g. restaurant name) update in place; new keys add rows.
+ */
+export function mergeRepairIntoExisting(
+  spec: DatasetSpec,
+  existing: ExtractedRecord[],
+  repairRecords: ExtractedRecord[],
+): MergeResult {
+  return mergeRecords(spec, [...existing, ...repairRecords]);
+}
+
+/**
+ * Merge two records describing the same entity.
+ *
+ * `a` wins wherever it has a value; `b` only fills spec columns that are
+ * null, undefined or "" in `a`. Evidence from `b` is added for fields `a` has
+ * no evidence for, source URLs are unioned, and the higher extraction
+ * confidence is kept (omitted when neither record has a positive value).
+ */
+export function mergePair(
+  a: ExtractedRecord,
+  b: ExtractedRecord,
+  spec: DatasetSpec,
+): ExtractedRecord {
+  const row: ExtractedRecord["row"] = { ...a.row };
+
+  for (const col of spec.columns) {
+    const current = row[col.name];
+    const incoming = b.row[col.name];
+    const currentEmpty =
+      current === null || current === undefined || current === "";
+    const incomingFilled =
+      incoming !== null && incoming !== undefined && incoming !== "";
+
+    if (currentEmpty && incomingFilled) {
+      row[col.name] = incoming ?? null;
+    }
+  }
+
+  const evidence = [...a.evidence];
+  const evidenceFields = new Set(evidence.map((e) => e.field));
+  for (const item of b.evidence) {
+    if (!evidenceFields.has(item.field)) {
+      evidence.push(item);
+    }
+  }
+
+  const extractionConfidence = Math.max(
+    a.extraction_confidence ?? 0,
+    b.extraction_confidence ?? 0,
+  );
+
+  return {
+    row,
+    evidence,
+    source_urls: [...new Set([...a.source_urls, ...b.source_urls])],
+    ...(extractionConfidence > 0
+      ? { extraction_confidence: extractionConfidence }
+      : {}),
+  };
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/models/quality.ts b/backend/BigSet_Data_Collection_Agent/src/models/quality.ts
new file mode 100644
index 0000000..ffd496a
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/models/quality.ts
@@ -0,0 +1,79 @@
+import { z } from "zod";
+
+export const recordStatusSchema = z.enum([
+  "complete",
+  "partial",
+  "low_confidence",
+]);
+
+export type RecordStatus = z.infer<typeof recordStatusSchema>;
+
+export const recordQualitySchema = z.object({
+  record_id: z.string(),
+  record_status: recordStatusSchema,
+  needs_review: z.boolean(),
+  completeness_pct: z.number().min(0).max(1),
+  /** Mean confidence across required fields (from per-field source signals).
*/
+  confidence_score: z.number().min(0).max(1),
+  field_confidences: z.record(z.string(), z.number().min(0).max(1)).default({}),
+  missing_required_fields: z.array(z.string()),
+  missing_optional_fields: z.array(z.string()),
+  fields_without_evidence: z.array(z.string()),
+  review_reasons: z.array(z.string()),
+});
+
+export type RecordQuality = z.infer<typeof recordQualitySchema>;
+
+export const qualityBucketSchema = z.object({
+  count: z.number().int().nonnegative(),
+  record_ids: z.array(z.string()),
+});
+
+export type QualityBucket = z.infer<typeof qualityBucketSchema>;
+
+export const qualityReportSchema = z.object({
+  total_records: z.number().int().nonnegative(),
+  unkeyed_records: z.number().int().nonnegative(),
+  complete: qualityBucketSchema,
+  partial: qualityBucketSchema,
+  low_confidence: qualityBucketSchema,
+  needs_review: qualityBucketSchema,
+  records: z.array(recordQualitySchema),
+});
+
+export type QualityReport = z.infer<typeof qualityReportSchema>;
+
+export const sourceOutcomeTypeSchema = z.enum([
+  "success",
+  "fetch_failed",
+  "skipped",
+  "extract_failed",
+  "agent_failed",
+  "agent_deferred",
+  "no_records",
+]);
+
+export type SourceOutcomeType = z.infer<typeof sourceOutcomeTypeSchema>;
+
+export const sourceOutcomeSchema = z.object({
+  url: z.string(),
+  phase: z.enum(["initial", "repair"]),
+  outcome: sourceOutcomeTypeSchema,
+  triage_status: z.string().optional(),
+  triage_confidence: z.number().optional(),
+  source_data_confidence: z.number().optional(),
+  expected_yield: z.string().optional(),
+  error: z.string().optional(),
+  records_extracted: z.number().optional(),
+});
+
+export type SourceOutcome = z.infer<typeof sourceOutcomeSchema>;
+
+export const sourcesReportSchema = z.object({
+  total: z.number().int().nonnegative(),
+  failed: z.array(sourceOutcomeSchema),
+  by_outcome: z.record(z.string(), z.number()),
+  outcomes: z.array(sourceOutcomeSchema),
+});
+
+export type SourcesReport = z.infer<typeof sourcesReportSchema>;
diff --git a/backend/BigSet_Data_Collection_Agent/src/models/schemas.ts b/backend/BigSet_Data_Collection_Agent/src/models/schemas.ts
new file mode 100644
index 0000000..fe1a059
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/models/schemas.ts
@@ -0,0 +1,214 @@
+import { z } from "zod";
+import { repairDiagnosisSchema } from "../memory/types.js";
+import { qualityReportSchema, sourcesReportSchema } from "./quality.js";
+import { sourceStatusSchema } from "./source-status.js";
+
+export const columnSchema = z.object({
+  name: z.string().min(1),
+  type: z.enum(["string", "number", "boolean", "date"]),
+  description: z.string(),
+  required: z.boolean(),
+});
+
+export const datasetSpecSchema = z.object({
+  intent_summary: z.string(),
+  target_row_count: z.number().int().positive(),
+  row_grain: z.string(),
+  columns: z.array(columnSchema).min(1),
+  // Arrays are cut down to their first element before validation, so a spec
+  // always ends up with exactly one dedupe key.
+  dedupe_keys: z.preprocess(
+    (value) => (Array.isArray(value) ? value.slice(0, 1) : value),
+    z.array(z.string()).length(1),
+  ),
+  search_queries: z.array(z.string()).min(1),
+  extraction_hints: z.string(),
+});
+
+export type ColumnDef = z.infer<typeof columnSchema>;
+export type DatasetSpec = z.infer<typeof datasetSpecSchema>;
+
+export const fieldEvidenceSchema = z.object({
+  field: z.string(),
+  url: z.string(),
+  quote: z.string(),
+});
+
+export const extractedRecordSchema = z.object({
+  row: z.record(z.string(), z.union([z.string(), z.number(), z.boolean(), z.null()])),
+  evidence: z.array(fieldEvidenceSchema),
+  source_urls: z.array(z.string()),
+  /** LLM-estimated confidence that row values are accurate (0–1). */
+  extraction_confidence: z.number().min(0).max(1).optional(),
+});
+
+export type FieldEvidence = z.infer<typeof fieldEvidenceSchema>;
+export type ExtractedRecord = z.infer<typeof extractedRecordSchema>;
+
+export const extractionResultSchema = z.object({
+  records: z.array(extractedRecordSchema),
+  notes: z.string().optional(),
+});
+
+export type ExtractionResult = z.infer<typeof extractionResultSchema>;
+
+export const sourceCandidateSchema = z.object({
+  url: z.string().url(),
+  title: z.string(),
+  snippet: z.string(),
+  site_name: z.string().optional(),
+  query: z.string(),
+  position: z.number().optional(),
+  /** Search API page (0-based) that produced this candidate. */
+  search_page: z.number().int().min(0).optional(),
+});
+
+export type SourceCandidate = z.infer<typeof sourceCandidateSchema>;
+
+export const fetchedPageSchema = z.object({
+  url: z.string(),
+  final_url: z.string(),
+  title: z.string(),
+  description: z.string().optional(),
+  text: z.string(),
+  error: z.string().optional(),
+  /** Outbound links when Fetch API was called with links: true. */
+  outbound_links: z.array(z.string()).optional(),
+});
+
+export type FetchedPage = z.infer<typeof fetchedPageSchema>;
+
+export const expectedYieldSchema = z.enum(["complete", "partial", "none"]);
+
+export const sourceTriageResultSchema = z.object({
+  url: z.string(),
+  final_url: z.string(),
+  title: z.string(),
+  status: sourceStatusSchema,
+  /** Confidence in triage classification (routing). */
+  confidence: z.number().min(0).max(1),
+  /** Expected accuracy/completeness of data if extracted from this page. */
+  source_data_confidence: z.number().min(0).max(1),
+  /** Likely yield: full rows, partial rows, or none. */
+  expected_yield: expectedYieldSchema,
+  reasoning: z.string(),
+  suggested_action: z.string().optional(),
+});
+
+export type SourceTriageResult = z.infer<typeof sourceTriageResultSchema>;
+
+export const agentGoalSchema = z.object({
+  goal: z.string(),
+  rationale: z.string(),
+});
+
+export type AgentGoal = z.infer<typeof agentGoalSchema>;
+
+export const agentRunRecordSchema = z.object({
+  url: z.string(),
+  status: sourceStatusSchema,
+  run_id: z.string().nullable(),
+  agent_status: z.string(),
+  goal: z.string(),
+  records_extracted: z.number(),
+  error: z.string().optional(),
+});
+
+export type AgentRunRecord = z.infer<typeof agentRunRecordSchema>;
+
+export const triageSummarySchema = z.object({
+  pages_triaged: z.number(),
+  by_status: z.record(z.string(), z.number()),
+  extract_now: z.number(),
+  agent_candidates: z.number(),
+  agent_dispatched: z.number(),
+  agent_deferred: z.number(),
+  agent_succeeded: z.number(),
+  agent_failed: z.number(),
+  skipped: z.number(),
+  records_from_extract: z.number(),
+  records_from_agent: z.number(),
+});
+
+export type TriageSummary = z.infer<typeof triageSummarySchema>;
+
+/** Counters shared by the initial phase, each repair loop and the run totals. */
+const phaseStatsSchema = z.object({
+  search_queries_executed: z.number(),
+  search_pages_paginated: z.number().optional(),
+  search_results_collected: z.number(),
+  unique_urls_selected: z.number(),
+  pages_fetched: z.number(),
+  pages_failed: z.number(),
+  raw_records_extracted: z.number(),
+  triage: triageSummarySchema.optional(),
+});
+
+export const llmUsageReportSchema = z.object({
+  prompt_tokens: z.number().int().nonnegative(),
+  completion_tokens: z.number().int().nonnegative(),
+  total_tokens: z.number().int().nonnegative(),
+  call_count: z.number().int().nonnegative(),
+});
+
+export const repairLoopReportSchema = z.object({
+  loop_index: z.number().int().positive(),
+  diagnosis_summary: z.string().optional(),
+  repair_queries: z.array(z.string()),
+  rationale: z.string().optional(),
+  missing_fields: z.array(z.string()),
+  records_before: z.number(),
+  records_after: z.number(),
+  fields_filled: z.record(z.string(), z.number()),
+  partial_count_before: z.number().optional(),
+  partial_count_after: z.number().optional(),
+  stats: phaseStatsSchema,
+});
+
+export type RepairLoopReport = z.infer<typeof repairLoopReportSchema>;
+
+export const repairReportSchema = z.object({
+  attempted: z.boolean(),
+  total_loops: z.number().int().nonnegative(),
+  loops: z.array(repairLoopReportSchema),
+  skipped_reason: z.string().optional(),
+  missing_fields: z.array(z.string()),
+  repair_queries: z.array(z.string()),
+  rationale: z.string().optional(),
+  records_before: z.number(),
+  records_after: z.number(),
+  fields_filled: z.record(z.string(), z.number()),
+  stats: phaseStatsSchema,
+  last_diagnosis: repairDiagnosisSchema.optional(),
+});
+
+export const runReportSchema = z.object({
+  run_id: z.string(),
+  /** Set when this run is a recurring refresh of a prior run. */
+  refreshed_from_run_id: z.string().optional(),
+  refresh_in_place: z.boolean().optional(),
+  prompt: z.string(),
+  target_rows: z.number(),
+  started_at: z.string(),
+  finished_at: z.string(),
+  duration_ms: z.number(),
+  dataset_spec: datasetSpecSchema,
+  stats: phaseStatsSchema.extend({
+    records_after_merge: z.number(),
+    visualization_records: z.number().optional(),
+  }),
+  initial: phaseStatsSchema.extend({
+    search_queries: z.array(z.string()),
+    fetched_urls: z.array(z.string()),
+    failed_urls: z.array(z.string()),
+  }),
+  repair: repairReportSchema,
+  search_queries: z.array(z.string()),
+  fetched_urls: z.array(z.string()),
+  failed_urls: z.array(z.string()),
+  errors: z.array(z.string()),
+  quality: qualityReportSchema.optional(),
+  sources: sourcesReportSchema.optional(),
+  llm_usage: llmUsageReportSchema.optional(),
+});
+
+export type RunReport = z.infer<typeof runReportSchema>;
+
+export type { QualityReport, RecordQuality, SourcesReport, SourceOutcome } from "./quality.js";
diff --git a/backend/BigSet_Data_Collection_Agent/src/models/source-status.ts b/backend/BigSet_Data_Collection_Agent/src/models/source-status.ts
new file mode 100644
index 0000000..e25afd5
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/models/source-status.ts
@@ -0,0 +1,24 @@
+import { z } from "zod";
+
+export const sourceStatusSchema = z.enum([
+  "extract_now",
+  "requires_navigation",
+  "requires_form_submission",
+  "requires_detail_page_followup",
+  "irrelevant",
+  "duplicate",
+  "blocked",
+  "low_value",
+]);
+
+export type SourceStatus = z.infer<typeof sourceStatusSchema>;
+
+/** Statuses for which `statusNeedsAgent` returns true. */
+export const AGENT_STATUSES: SourceStatus[] = [
+  "requires_navigation",
+  "requires_form_submission",
+  "requires_detail_page_followup",
+];
+
+/** True when `status` is one of `AGENT_STATUSES`. */
+export function statusNeedsAgent(status: SourceStatus): boolean {
+  return AGENT_STATUSES.includes(status);
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts
new file mode 100644
index
0000000..ca169e0 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts @@ -0,0 +1,260 @@ +import { selectOutboundLinksToFollow } from "../acquisition/link-follow.js"; +import { config } from "../config.js"; +import { chunkUrls, fetchPages, searchWeb } from "../integrations/tinyfish.js"; +import { domainMemoryBoost, type WorkflowMemory } from "../memory/index.js"; +import type { SearchPlan } from "../memory/search-pagination.js"; +import { getPrimaryKeyValue } from "../merge/records.js"; +import { createFetchQueue, createSearchQueue } from "../queue/pools.js"; +import type { + AgentRunRecord, + DatasetSpec, + ExtractedRecord, + FetchedPage, + SourceCandidate, + SourceTriageResult, + TriageSummary, +} from "../models/schemas.js"; +import { saveFetchedPage, type RunPaths } from "../storage/run-store.js"; +import { + processFetchedPages, + type AgentDeferredEntry, +} from "./process-pages.js"; +import { getDomain, normalizeUrl } from "../utils/url.js"; + +export interface AcquisitionResult { + candidates: SourceCandidate[]; + fetchedUrls: string[]; + failedUrls: string[]; + fetchedPages: FetchedPage[]; + records: ExtractedRecord[]; + pagesFetched: number; + triage: TriageSummary; + triageResults: SourceTriageResult[]; + agentRuns: AgentRunRecord[]; + agentDeferred: AgentDeferredEntry[]; +} + +function rankCandidates( + candidates: SourceCandidate[], + excludeUrls: Set, + limit: number, + memory?: WorkflowMemory, +): string[] { + const byUrl = new Map< + string, + { url: string; score: number; domain: string } + >(); + + for (const candidate of candidates) { + const url = normalizeUrl(candidate.url); + if (excludeUrls.has(url)) continue; + + const domain = getDomain(url); + let score = byUrl.get(url)?.score ?? 
0; + score += 1; + if (candidate.title.length > 10) score += 0.5; + if (candidate.snippet.length > 40) score += 0.5; + if (memory) score += domainMemoryBoost(memory, domain); + byUrl.set(url, { url, score, domain }); + } + + const domainsSeen = new Set(); + return [...byUrl.values()] + .sort((a, b) => b.score - a.score) + .filter((item) => { + if (domainsSeen.has(item.domain)) return false; + domainsSeen.add(item.domain); + return true; + }) + .map((item) => item.url) + .slice(0, limit); +} + +export async function runAcquisitionPhase(options: { + label: string; + userPrompt: string; + spec: DatasetSpec; + queries: string[]; + /** When set, runs Search with per-query page indices (repair pagination). */ + searches?: SearchPlan[]; + paths: RunPaths; + errors: string[]; + excludeUrls: Set; + maxResultsPerQuery: number; + maxUrlsToFetch: number; + pageIndexStart: number; + focusFields?: string[]; + knownEntityKeys?: string[]; + enableTriage?: boolean; + enableTinyfishAgent?: boolean; + memory?: WorkflowMemory; + forceAgent?: boolean; + /** Fetch outbound links from high-value pages (repair). */ + enableLinkFollow?: boolean; + log: (stage: string, message: string) => void; +}): Promise { + const searchQueue = createSearchQueue(); + const fetchQueue = createFetchQueue(); + + const searches: SearchPlan[] = + options.searches ?? + options.queries.map((query) => ({ query, page: 0 })); + + options.log( + options.label, + `Running ${searches.length} searches (parallel, concurrency=${config.searchConcurrency})...`, + ); + + const searchBatches = await searchQueue.runAll( + searches, + async (plan) => { + try { + const results = await searchWeb(plan.query, plan.page); + return results.slice(0, options.maxResultsPerQuery).map((result) => ({ + ...result, + query: plan.query, + search_page: plan.page, + })); + } catch (error) { + const msg = `Search failed for "${plan.query}" (page ${plan.page}): ${ + error instanceof Error ? 
error.message : String(error) + }`; + options.errors.push(msg); + options.log(options.label, `WARN ${msg}`); + return [] as SourceCandidate[]; + } + }, + ); + const candidates: SourceCandidate[] = searchBatches.flat(); + + const urlsToFetch = rankCandidates( + candidates, + options.excludeUrls, + options.maxUrlsToFetch, + options.memory, + ); + + const fetchWithLinks = options.enableLinkFollow ?? false; + const urlChunks = chunkUrls(urlsToFetch, config.fetchBatchSize); + + options.log( + options.label, + `Fetching ${urlsToFetch.length} URLs in ${urlChunks.length} parallel batches (concurrency=${config.fetchConcurrency})${fetchWithLinks ? " with outbound links" : ""}...`, + ); + + const fetchChunk = async (chunk: string[], includeLinks: boolean) => { + try { + return await fetchPages(chunk, { includeLinks }); + } catch (error) { + const msg = `Fetch batch failed: ${ + error instanceof Error ? error.message : String(error) + }`; + options.errors.push(msg); + options.log(options.label, `WARN ${msg}`); + return chunk.map((url) => ({ + url, + final_url: url, + title: "", + text: "", + error: msg, + })); + } + }; + + let fetchedPages: FetchedPage[] = + urlChunks.length > 0 + ? 
( + await fetchQueue.runAll( + urlChunks, + (chunk) => fetchChunk(chunk, fetchWithLinks), + (chunk) => chunk.map((url) => getDomain(url)), + ) + ).flat() + : []; + + if (fetchWithLinks && fetchedPages.length > 0) { + const linkUrls = selectOutboundLinksToFollow({ + pages: fetchedPages, + excludeUrls: options.excludeUrls, + focusFields: options.focusFields, + maxTotal: config.maxRepairLinkUrls, + maxPerSource: config.maxLinksPerSourcePage, + memory: options.memory, + }).filter((url) => !urlsToFetch.includes(normalizeUrl(url))); + + if (linkUrls.length > 0) { + const linkChunks = chunkUrls(linkUrls, config.fetchBatchSize); + options.log( + options.label, + `Following ${linkUrls.length} high-relevance outbound links...`, + ); + const linkPages = ( + await fetchQueue.runAll( + linkChunks, + (chunk) => fetchChunk(chunk, false), + (chunk) => chunk.map((url) => getDomain(url)), + ) + ).flat(); + fetchedPages = [...fetchedPages, ...linkPages]; + } + } + + let pageIndex = options.pageIndexStart; + for (const page of fetchedPages) { + await saveFetchedPage(options.paths, page, pageIndex); + pageIndex += 1; + } + + const failedUrls = fetchedPages + .filter((page) => page.error) + .map((page) => page.url); + + const processed = await processFetchedPages({ + label: options.label, + userPrompt: options.userPrompt, + spec: options.spec, + pages: fetchedPages, + paths: options.paths, + errors: options.errors, + focusFields: options.focusFields, + knownEntityKeys: options.knownEntityKeys, + enableTriage: options.enableTriage, + enableTinyfishAgent: + options.enableTinyfishAgent ?? + (options.forceAgent ? 
/**
 * Collect the distinct primary-key values found in a list of records.
 *
 * Every record is passed through `getPrimaryKeyValue(record, spec)`; records
 * for which that call yields a falsy value are ignored.
 *
 * @param spec - Dataset specification forwarded to `getPrimaryKeyValue`.
 * @param records - Records to scan.
 * @returns The unique keys, in the order they were first encountered.
 */
export function entityKeysFromRecords(
  spec: DatasetSpec,
  records: ExtractedRecord[],
): string[] {
  // The element type must be explicit: a bare `new Set()` is inferred as
  // `Set<unknown>`, which cannot be spread into the declared `string[]`.
  const keys = new Set<string>();
  for (const record of records) {
    const pk = getPrimaryKeyValue(record, spec);
    if (pk) keys.add(pk);
  }
  return [...keys];
}
/** Caller-supplied settings for one pipeline run. */
export interface PipelineOptions {
  /**
   * User request text. Handed to `createWorkflowMemory`, to
   * `generateDatasetSpec`, and to the acquisition / repair phases as the
   * user prompt.
   */
  prompt: string;
  /**
   * Row target handed to `generateDatasetSpec`. Merged output is truncated
   * to twice this value.
   */
  targetRows: number;
  /** Directory passed to `createRunStore` together with the run id. */
  outputDir: string;
  /**
   * Directory for persistent workflow memory. When omitted, the sibling
   * `memory` directory next to `outputDir` is used.
   */
  memoryDir?: string;
  /** Overrides `config.enableRepairLoop` when set. */
  enableRepair?: boolean;
  /** Overrides `config.enableTriage` when set. */
  enableTriage?: boolean;
  /** Overrides `config.enableTinyfishAgent` when set. */
  enableTinyfishAgent?: boolean;
  /** Recurring refresh: baseline run to merge into (in-place by primary key). */
  refreshFrom?: LoadedRun;
  /** Overwrite the source run directory (same run_id). */
  refreshInPlace?: boolean;
  /** When refreshing, re-fetch URLs already seen in the source run. */
  refetchUrls?: boolean;
  /** Override pipeline logging (benchmark adapters should log to stderr). */
  onLog?: (stage: string, message: string) => void;
  /** Set when invoked from the dataset-agent benchmark harness. */
  benchmark?: BenchmarkSpecContext;
}
/** Everything a completed pipeline run returns to its caller. */
export interface PipelineResult {
  runId: string;
  paths: RunPaths;
  report: RunReport;
  recordCount: number;
  records: ExtractedRecord[];
  visualizationRecords: ExtractedRecord[];
  llmUsage: LlmUsageTotals;
}

// Active log sink. Starts as a console writer; kept as `let` because the
// pipeline entry point reassigns it for each run.
let pipelineLog: (stage: string, message: string) => void = (stage, message) => {
  console.log(`[${stage}] ${message}`);
};

/**
 * Forward a message to whichever sink `pipelineLog` currently points at.
 * The indirection lets this function be handed around as a stable reference
 * while the sink itself is swapped.
 */
function log(stage: string, message: string): void {
  pipelineLog(stage, message);
}

/**
 * Convert an acquisition result into the flat counters stored in the run
 * report.
 *
 * @param acquisition - The subset of an acquisition result that is counted.
 * @param queryCount - Number of search queries executed for the phase.
 * @returns Counter object; `triage` is passed through by reference.
 */
function phaseStatsFromAcquisition(
  acquisition: {
    candidates: { length: number };
    fetchedUrls: string[];
    failedUrls: string[];
    records: ExtractedRecord[];
    pagesFetched: number;
    triage: import("../models/schemas.js").TriageSummary;
  },
  queryCount: number,
) {
  const { candidates, fetchedUrls, failedUrls, records, pagesFetched, triage } =
    acquisition;
  return {
    search_queries_executed: queryCount,
    search_results_collected: candidates.length,
    unique_urls_selected: fetchedUrls.length,
    pages_fetched: pagesFetched,
    pages_failed: failedUrls.length,
    raw_records_extracted: records.length,
    triage,
  };
}

/** Build a fresh, all-zero repair stats object (new nested objects each call). */
function emptyRepairStats(): RunReport["repair"]["stats"] {
  return {
    search_queries_executed: 0,
    search_results_collected: 0,
    unique_urls_selected: 0,
    pages_fetched: 0,
    pages_failed: 0,
    raw_records_extracted: 0,
    triage: {
      pages_triaged: 0,
      by_status: {},
      extract_now: 0,
      agent_candidates: 0,
      agent_dispatched: 0,
      agent_deferred: 0,
      agent_succeeded: 0,
      agent_failed: 0,
      skipped: 0,
      records_from_extract: 0,
      records_from_agent: 0,
    },
  };
}

/**
 * Sum the six top-level counters across all repair loops.
 *
 * The nested `triage` block is not accumulated; it keeps the zeroed values
 * produced by `emptyRepairStats`.
 *
 * @param loops - Per-loop reports whose `stats` are added together.
 * @returns Totals over every loop (all zeros for an empty list).
 */
function aggregateRepairStats(
  loops: RunReport["repair"]["loops"],
): RunReport["repair"]["stats"] {
  const summedFields = [
    "search_queries_executed",
    "search_results_collected",
    "unique_urls_selected",
    "pages_fetched",
    "pages_failed",
    "raw_records_extracted",
  ] as const;

  const totals = emptyRepairStats();
  loops.forEach((loop) => {
    summedFields.forEach((field) => {
      totals[field] += loop.stats[field];
    });
  });
  return totals;
}
/**
 * Resolve the directory used for persistent workflow memory.
 *
 * @param options - Pipeline options.
 * @returns `options.memoryDir` when provided, otherwise the `memory`
 *   directory that sits next to `options.outputDir`.
 */
function memoryDirFor(options: PipelineOptions): string {
  return options.memoryDir ?? join(options.outputDir, "..", "memory");
}

/**
 * Public entry point: run the pipeline inside an LLM usage scope and attach
 * the usage totals reported by that scope to the result.
 *
 * @param options - Pipeline options, forwarded unchanged.
 * @returns The pipeline result with `llmUsage` filled in.
 */
export async function runPipeline(
  options: PipelineOptions,
): Promise<PipelineResult> {
  const { result, usage } = await runWithLlmUsageScope(() =>
    executeRunPipeline(options),
  );
  return { ...result, llmUsage: usage };
}
/**
 * Run one full collection pass: spec, initial acquisition, merge, optional
 * repair loops, optional quality scoring, exports, and the run report.
 *
 * When `options.refreshFrom` is set, the baseline run's spec and records are
 * reused and newly acquired records are merged into them instead of
 * generating a new spec.
 *
 * Side effects: reassigns the module-level `pipelineLog` sink, and writes
 * run artefacts through the storage and export helpers.
 *
 * @param options - Pipeline options.
 * @returns The run result without `llmUsage`; the caller attaches that.
 */
async function executeRunPipeline(
  options: PipelineOptions,
): Promise<Omit<PipelineResult, "llmUsage">> {
  pipelineLog =
    options.onLog ?? ((stage, message) => console.log(`[${stage}] ${message}`));
  assertConfig();

  const enableRepair = options.enableRepair ?? config.enableRepairLoop;
  const enableTriage = options.enableTriage ?? config.enableTriage;
  const enableTinyfishAgent =
    options.enableTinyfishAgent ?? config.enableTinyfishAgent;
  const useMemory = config.enableWorkflowMemory;
  const startedAt = new Date();
  const refreshSource = options.refreshFrom;
  const inPlaceRefresh = Boolean(refreshSource && options.refreshInPlace);
  // An in-place refresh reuses the baseline run id; otherwise a new short id.
  const runId =
    inPlaceRefresh && refreshSource
      ? refreshSource.runId
      : randomUUID().slice(0, 8);
  const paths = await createRunStore(options.outputDir, runId);
  const errors: string[] = [];
  const fetchedUrlSet = new Set<string>();
  // Unless asked to re-fetch, URLs from the baseline run are excluded up front.
  if (refreshSource && !options.refetchUrls) {
    for (const url of refreshSource.report.fetched_urls) {
      fetchedUrlSet.add(normalizeUrl(url));
    }
  }
  let pageIndex = 0;
  const targetRowCap = options.targetRows * 2;

  log(
    "init",
    refreshSource
      ? `refresh run_id=${runId} from=${refreshSource.runId} in_place=${inPlaceRefresh} output=${paths.root}`
      : `run_id=${runId} output=${paths.root}`,
  );

  let memory: WorkflowMemory = createWorkflowMemory(options.prompt);
  if (refreshSource?.memory) {
    memory = mergePersistentMemory(memory, refreshSource.memory);
    log(
      "memory",
      `Loaded workflow memory from run ${refreshSource.runId} (${refreshSource.memory.query_stats.length} query stats)`,
    );
  }
  if (useMemory) {
    const prior = await loadPersistentMemory(
      memoryDirFor(options),
      memory.prompt_fingerprint,
    );
    memory = mergePersistentMemory(memory, prior);
    if (prior && !refreshSource?.memory) {
      log(
        "memory",
        `Loaded prior workflow memory (${prior.query_stats.length} query stats, ${prior.domain_stats.length} domain stats)`,
      );
    }
  }

  let spec: DatasetSpec;
  let baselineRecords: ExtractedRecord[] = [];

  if (refreshSource) {
    spec = refreshSource.spec;
    baselineRecords = refreshSource.records;
    memory.extraction_schema = snapshotExtractionSchema(spec);
    memory.dedupe_keys = spec.dedupe_keys;
    memory.repair_loop_count = 0;
    await saveDatasetSpec(paths, spec);
    log(
      "refresh",
      `Baseline ${baselineRecords.length} records — new search with prior diagnostics/memory`,
    );
  } else {
    log("spec", "Generating dataset specification...");
    spec = await generateDatasetSpec(
      options.prompt,
      options.targetRows,
      useMemory ? memory : null,
      options.benchmark,
    );
    memory.extraction_schema = snapshotExtractionSchema(spec);
    memory.dedupe_keys = spec.dedupe_keys;
    await saveDatasetSpec(paths, spec);
  }

  const initialQueries = spec.search_queries.slice(0, config.maxSearchQueries);

  const initialAcquisition = await runAcquisitionPhase({
    label: refreshSource ? "refresh" : "initial",
    userPrompt: options.prompt,
    spec,
    queries: initialQueries,
    paths,
    errors,
    excludeUrls: fetchedUrlSet,
    maxResultsPerQuery: config.maxResultsPerQuery,
    maxUrlsToFetch: config.maxUrlsToFetch,
    pageIndexStart: pageIndex,
    enableTriage,
    enableTinyfishAgent,
    memory: useMemory ? memory : undefined,
    log,
  });

  recordPhaseInMemory({
    memory,
    spec,
    phase: refreshSource ? "refresh" : "initial",
    repairLoop: 0,
    queries: initialQueries,
    candidates: initialAcquisition.candidates,
    records: initialAcquisition.records,
    failedUrls: initialAcquisition.failedUrls,
    agentRuns: initialAcquisition.agentRuns,
    triageResults: initialAcquisition.triageResults,
  });

  if (initialAcquisition.triage.agent_dispatched > 0) {
    log(
      "triage",
      `Initial: ${initialAcquisition.triage.extract_now} extract_now, ` +
        `${initialAcquisition.triage.agent_succeeded}/${initialAcquisition.triage.agent_dispatched} agent runs succeeded`,
    );
  }

  for (const url of initialAcquisition.fetchedUrls) {
    fetchedUrlSet.add(normalizeUrl(url));
  }
  pageIndex += initialAcquisition.pagesFetched;

  await saveSourceCandidates(paths, initialAcquisition.candidates);

  // Refresh runs merge new records into the baseline; fresh runs merge only
  // what was just acquired.
  const mergeResult = refreshSource
    ? mergeRepairIntoExisting(
        spec,
        baselineRecords,
        initialAcquisition.records,
      )
    : mergeRecords(spec, initialAcquisition.records);
  let mergedRecords = mergeResult.records.slice(0, targetRowCap);
  let benchmarkVisualizationRecords = mergedRecords;
  let unkeyedRecords = mergeResult.unkeyed;

  let coverage: CoverageReport = analyzeCoverage(spec, mergedRecords);
  recordCoverageGaps(memory, coverage);
  await saveJson(join(paths.root, "coverage_initial.json"), coverage);

  // Writes the CSV and the evidence JSONL for one record set as a pair.
  const writeExports = async (
    csvPath: string,
    evidencePath: string,
    records: ExtractedRecord[],
    qualityById?: ReturnType<typeof qualityMapFromReport>,
  ) => {
    await writeResultsCsv(csvPath, spec, records, qualityById);
    await writeEvidenceJsonl(evidencePath, spec, records, qualityById);
  };

  log("export", `Writing init_results.csv (${mergedRecords.length} records)...`);
  await writeExports(paths.initResultsPath, paths.initEvidencePath, mergedRecords);

  const allSearchQueries = [...initialQueries];
  const allFailedUrls = [...initialAcquisition.failedUrls];
  const recordsBeforeRepair = mergedRecords;

  const repairReport: RunReport["repair"] = {
    attempted: false,
    total_loops: 0,
    loops: [],
    missing_fields: [],
    repair_queries: [],
    records_before: mergedRecords.length,
    records_after: mergedRecords.length,
    fields_filled: {},
    stats: emptyRepairStats(),
  };

  const repairAcquisitions: typeof initialAcquisition[] = [];

  if (!enableRepair) {
    repairReport.skipped_reason = "repair_disabled";
    log("repair", "Skipped (disabled)");
  } else if (!coverage.should_repair) {
    repairReport.skipped_reason = "no_missing_required_fields";
    log(
      "repair",
      `Skipped (coverage satisfied) — required=[${coverage.required_columns.join(", ")}]`,
    );
  } else {
    repairReport.attempted = true;
    repairReport.records_before = recordsBeforeRepair.length;
    repairReport.missing_fields = coverage.field_gaps.map((gap) => gap.column);

    const repairResult = await runRepairLoops({
      ctx: {
        userPrompt: options.prompt,
        spec,
        paths,
        errors,
        memory,
        fetchedUrlSet,
        allSearchQueries,
        allFailedUrls,
        enableTriage,
        enableTinyfishAgent,
        targetRowCap,
        log,
      },
      recordsBeforeRepair,
      initialCoverage: coverage,
      pageIndexStart: pageIndex,
    });

    mergedRecords = repairResult.mergedRecords;
    unkeyedRecords = [...unkeyedRecords, ...repairResult.unkeyedRecords];
    coverage = repairResult.coverage;
    repairAcquisitions.push(...repairResult.repairAcquisitions);

    repairReport.total_loops = repairResult.loops.length;
    repairReport.loops = repairResult.loops;
    repairReport.last_diagnosis = repairResult.lastDiagnosis;
    repairReport.records_after = mergedRecords.length;
    repairReport.repair_queries = repairResult.loops.flatMap((loop) => loop.repair_queries);
    repairReport.rationale = repairResult.lastDiagnosis?.summary;
    // Per-field fill counts are summed across loops.
    repairReport.fields_filled = repairResult.loops.reduce(
      (acc, loop) => {
        for (const [key, value] of Object.entries(loop.fields_filled)) {
          acc[key] = (acc[key] ?? 0) + value;
        }
        return acc;
      },
      {} as Record<string, number>,
    );
    repairReport.stats = aggregateRepairStats(repairResult.loops);
    // Overwrites the pre-repair list with whatever is still missing.
    repairReport.missing_fields = coverage.field_gaps.map((gap) => gap.column);

    if (repairResult.loops.length > 0) {
      log(
        "export",
        `Writing repair_results.csv (${mergedRecords.length} records after ${repairResult.loops.length} repair loop(s))...`,
      );
      await writeExports(
        paths.repairResultsPath,
        paths.repairEvidencePath,
        mergedRecords,
      );
    }
  }

  if (useMemory) {
    await saveRunMemory(paths.root, memory);
    await savePersistentMemory(memoryDirFor(options), memory);
    log("memory", `Saved workflow memory (repair_loops=${memory.repair_loop_count})`);
  }

  // Both stay undefined when quality scoring is disabled.
  let qualityReport: RunReport["quality"];
  let sourcesReport: RunReport["sources"];

  if (config.enableQualityScoring) {
    log("quality", "Scoring records and building source outcomes...");

    const allTriage = [
      ...initialAcquisition.triageResults,
      ...repairAcquisitions.flatMap((a) => a.triageResults),
    ];
    const allAgentRuns = [
      ...initialAcquisition.agentRuns,
      ...repairAcquisitions.flatMap((a) => a.agentRuns),
    ];

    const scoreContext = {
      triageByUrl: triageByUrl(allTriage),
      agentExtractedUrls: agentExtractedUrls(allAgentRuns),
    };

    qualityReport = buildQualityReport(
      spec,
      mergedRecords,
      scoreContext,
      unkeyedRecords.length,
    );

    const initialSources = buildSourcesReport({
      phase: "initial",
      fetchedPages: initialAcquisition.fetchedPages,
      fetchedUrls: initialAcquisition.fetchedUrls,
      triageResults: initialAcquisition.triageResults,
      agentRuns: initialAcquisition.agentRuns,
      agentDeferred: initialAcquisition.agentDeferred,
    });

    const repairSourcesList = repairAcquisitions.map((acquisition) =>
      buildSourcesReport({
        phase: "repair",
        fetchedPages: acquisition.fetchedPages,
        fetchedUrls: acquisition.fetchedUrls,
        triageResults: acquisition.triageResults,
        agentRuns: acquisition.agentRuns,
        agentDeferred: acquisition.agentDeferred,
      }),
    );

    sourcesReport = repairSourcesList.reduce(
      (acc, report) => mergeSourcesReports(acc, report),
      initialSources,
    );

    await saveJson(join(paths.root, "quality_report.json"), qualityReport);
    await saveJson(join(paths.root, "sources_outcomes.json"), sourcesReport);

    if (unkeyedRecords.length > 0) {
      await writeUnkeyedRecordsJsonl(
        join(paths.root, "records_unkeyed.jsonl"),
        unkeyedRecords,
      );
    }

    await writeSegmentedRecordCsvs(
      paths.root,
      spec,
      mergedRecords,
      qualityReport.records,
    );

    const qualityById = qualityMapFromReport(qualityReport.records);
    benchmarkVisualizationRecords = config.enableSelectiveResults
      ? selectVisualizationRecords(spec, mergedRecords, qualityById)
      : mergedRecords;

    log(
      "quality",
      `complete=${qualityReport.complete.count} partial=${qualityReport.partial.count} ` +
        `low_confidence=${qualityReport.low_confidence.count} needs_review=${qualityReport.needs_review.count} ` +
        `visualization=${benchmarkVisualizationRecords.length}`,
    );

    if (config.enableSelectiveResults) {
      // Selective mode: full set goes to results_full, the subset to results.
      log(
        "export",
        `Writing results_full.csv (${mergedRecords.length} records)...`,
      );
      await writeExports(
        paths.resultsFullPath,
        paths.evidenceFullPath,
        mergedRecords,
        qualityById,
      );
      log(
        "export",
        `Writing results.csv (${benchmarkVisualizationRecords.length} selective records)...`,
      );
      await writeExports(
        paths.resultsPath,
        paths.evidencePath,
        benchmarkVisualizationRecords,
        qualityById,
      );
    } else {
      log("export", `Writing results.csv (${mergedRecords.length} records)...`);
      await writeExports(
        paths.resultsPath,
        paths.evidencePath,
        mergedRecords,
        qualityById,
      );
    }
  } else {
    log("export", `Writing results.csv (${mergedRecords.length} records)...`);
    await writeExports(paths.resultsPath, paths.evidencePath, mergedRecords);
  }

  const finishedAt = new Date();
  const initialStats = phaseStatsFromAcquisition(
    initialAcquisition,
    initialQueries.length,
  );

  const visualizationCount = benchmarkVisualizationRecords.length;

  const llmUsage = getCurrentLlmUsage();

  const report: RunReport = {
    run_id: runId,
    ...(refreshSource
      ? {
          refreshed_from_run_id: refreshSource.runId,
          refresh_in_place: inPlaceRefresh,
        }
      : {}),
    prompt: options.prompt,
    target_rows: options.targetRows,
    started_at: startedAt.toISOString(),
    finished_at: finishedAt.toISOString(),
    duration_ms: finishedAt.getTime() - startedAt.getTime(),
    dataset_spec: spec,
    // Run-level counters are the initial phase plus the repair totals;
    // `triage` comes from the initial phase via the spread.
    stats: {
      ...initialStats,
      search_queries_executed:
        initialStats.search_queries_executed +
        repairReport.stats.search_queries_executed,
      search_results_collected:
        initialStats.search_results_collected +
        repairReport.stats.search_results_collected,
      unique_urls_selected:
        initialStats.unique_urls_selected +
        repairReport.stats.unique_urls_selected,
      pages_fetched:
        initialStats.pages_fetched + repairReport.stats.pages_fetched,
      pages_failed:
        initialStats.pages_failed + repairReport.stats.pages_failed,
      raw_records_extracted:
        initialStats.raw_records_extracted +
        repairReport.stats.raw_records_extracted,
      records_after_merge: mergedRecords.length,
      visualization_records: visualizationCount,
    },
    initial: {
      ...initialStats,
      search_queries: initialQueries,
      fetched_urls: initialAcquisition.fetchedUrls,
      failed_urls: initialAcquisition.failedUrls,
    },
    repair: repairReport,
    search_queries: allSearchQueries,
    fetched_urls: [...fetchedUrlSet],
    failed_urls: allFailedUrls,
    errors,
    quality: qualityReport,
    sources: sourcesReport,
    llm_usage: {
      prompt_tokens: llmUsage.promptTokens,
      completion_tokens: llmUsage.completionTokens,
      total_tokens: llmUsage.totalTokens,
      call_count: llmUsage.callCount,
    },
  };

  await saveRunReport(paths, report);

  log("done", `results → ${paths.resultsPath}`);
  return {
    runId,
    paths,
    report,
    recordCount: mergedRecords.length,
    records: mergedRecords,
    visualizationRecords: benchmarkVisualizationRecords,
  };
}
/** @returns The `runs` directory under the current working directory. */
export function defaultRunsDir(): string {
  return join(process.cwd(), "runs");
}

/** @returns The `memory` directory under the current working directory. */
export function defaultMemoryDir(): string {
  return join(process.cwd(), "memory");
}

/**
 * Re-run the pipeline on top of a previously stored run.
 *
 * The stored run is loaded with `loadRunForRefresh`; its prompt is reused,
 * and its target row count is reused unless `options.targetRows` is given.
 *
 * @param options - Id of the run to refresh, where runs are stored, and
 *   optional overrides forwarded to `runPipeline`.
 * @returns The result of the refresh run.
 * @throws Error when the stored run contains no records.
 */
export async function runRefreshPipeline(options: {
  fromRunId: string;
  outputDir: string;
  memoryDir?: string;
  targetRows?: number;
  inPlace?: boolean;
  refetchUrls?: boolean;
  enableRepair?: boolean;
  enableTriage?: boolean;
  enableTinyfishAgent?: boolean;
}): Promise<PipelineResult> {
  const loaded = await loadRunForRefresh(options.outputDir, options.fromRunId);
  // A refresh merges into existing records, so an empty baseline is rejected.
  if (loaded.records.length === 0) {
    throw new Error(
      `Run ${options.fromRunId} has no records in evidence.jsonl — cannot refresh`,
    );
  }

  return runPipeline({
    prompt: loaded.report.prompt,
    targetRows: options.targetRows ?? loaded.report.target_rows,
    outputDir: options.outputDir,
    memoryDir: options.memoryDir,
    enableRepair: options.enableRepair,
    enableTriage: options.enableTriage,
    enableTinyfishAgent: options.enableTinyfishAgent,
    refreshFrom: loaded,
    refreshInPlace: options.inPlace,
    refetchUrls: options.refetchUrls,
  });
}
/** A page that qualified for an agent run but was not dispatched. */
export interface AgentDeferredEntry {
  url: string;
  status: SourceStatus;
}

/** Output of `processFetchedPages`. */
export interface ProcessPagesResult {
  /** Records from direct extraction and from agent runs combined. */
  records: ExtractedRecord[];
  /** One entry per triaged page; empty when triage is disabled. */
  triageResults: SourceTriageResult[];
  /** One entry per agent run that was attempted. */
  agentRuns: AgentRunRecord[];
  /** Agent candidates that were left out of this phase's agent runs. */
  agentDeferred: AgentDeferredEntry[];
  /** Counters describing what happened to the pages. */
  summary: TriageSummary;
}

/** Create a triage summary with every counter at zero and no statuses. */
function emptySummary(): TriageSummary {
  const blank: TriageSummary = {
    pages_triaged: 0,
    by_status: {},
    extract_now: 0,
    agent_candidates: 0,
    agent_dispatched: 0,
    agent_deferred: 0,
    agent_succeeded: 0,
    agent_failed: 0,
    skipped: 0,
    records_from_extract: 0,
    records_from_agent: 0,
  };
  return blank;
}

/**
 * Increment the per-status tally, starting from zero for a status that has
 * not been seen yet.
 *
 * @param summary - Summary to mutate in place.
 * @param status - Status whose count is increased by one.
 */
function bumpStatus(summary: TriageSummary, status: SourceStatus): void {
  const seenSoFar = summary.by_status[status] ?? 0;
  summary.by_status[status] = seenSoFar + 1;
}
/**
 * Turn fetched pages into extracted records.
 *
 * Pages with an error or with blank text are dropped first. With triage
 * disabled every remaining page is extracted directly. With triage enabled
 * each page is triaged and then either extracted directly, queued for a
 * Tinyfish agent run, or skipped. Agent runs are capped by
 * `config.maxAgentRunsPerPhase`; candidates over the cap are reported as
 * deferred.
 *
 * Failures are recorded rather than thrown: messages are appended to
 * `options.errors`, and a triage failure falls back to direct extraction.
 *
 * Files written under `options.paths.root`: `triage_<label>.json` whenever
 * triage runs, and `agent_runs_<label>.json` when at least one agent run was
 * recorded.
 *
 * @param options - Pages plus the context needed to triage and extract them.
 * @returns Records, triage results, agent run log, deferred candidates and
 *   summary counters.
 */
export async function processFetchedPages(options: {
  label: string;
  userPrompt: string;
  spec: DatasetSpec;
  pages: FetchedPage[];
  paths: RunPaths;
  errors: string[];
  focusFields?: string[];
  knownEntityKeys?: string[];
  enableTriage?: boolean;
  enableTinyfishAgent?: boolean;
  memory?: WorkflowMemory;
  log: (stage: string, message: string) => void;
}): Promise<ProcessPagesResult> {
  const triageEnabled = options.enableTriage ?? config.enableTriage;
  const agentEnabled = options.enableTinyfishAgent ?? config.enableTinyfishAgent;
  const summary = emptySummary();
  const records: ExtractedRecord[] = [];
  const agentRuns: AgentRunRecord[] = [];
  const knownKeys = new Set<string>(options.knownEntityKeys ?? []);

  const successfulPages = options.pages.filter(
    (page) => !page.error && page.text.trim().length > 0,
  );

  if (successfulPages.length === 0) {
    return {
      records: [],
      triageResults: [],
      agentRuns: [],
      agentDeferred: [],
      summary,
    };
  }

  const extractionQueue = createExtractionQueue();

  // Shared by both direct-extraction paths: a failure is recorded in
  // `options.errors` and the page contributes no records.
  const extractOrRecordError = async (
    page: FetchedPage,
  ): Promise<ExtractedRecord[]> => {
    try {
      return await extractFromPage(options.spec, page, {
        focusFields: options.focusFields,
        memory: options.memory,
      });
    } catch (error) {
      const msg = `Extraction failed for ${page.final_url || page.url}: ${
        error instanceof Error ? error.message : String(error)
      }`;
      options.errors.push(msg);
      return [] as ExtractedRecord[];
    }
  };

  if (!triageEnabled) {
    options.log(
      options.label,
      `Triage disabled — extracting all pages (parallel, concurrency=${config.extractionConcurrency})...`,
    );
    const extracted = await extractionQueue.runAll(
      successfulPages,
      (page) => extractOrRecordError(page),
      (page) => [getDomain(page.final_url || page.url)],
    );
    const flat = extracted.flat();
    summary.pages_triaged = successfulPages.length;
    summary.extract_now = successfulPages.length;
    summary.records_from_extract = flat.length;
    return {
      records: flat,
      triageResults: [],
      agentRuns: [],
      agentDeferred: [],
      summary,
    };
  }

  const triageQueue = createTriageQueue();

  options.log(
    options.label,
    `Triaging ${successfulPages.length} pages (parallel, concurrency=${config.triageConcurrency})...`,
  );

  const triageResults = await triageQueue.runAll(
    successfulPages,
    async (page) => {
      try {
        return await triagePage({
          userPrompt: options.userPrompt,
          spec: options.spec,
          page,
          knownEntityKeys: [...knownKeys],
          memory: options.memory,
        });
      } catch (error) {
        const pageUrl = page.final_url || page.url;
        const msg = `Triage failed for ${pageUrl}: ${
          error instanceof Error ? error.message : String(error)
        }`;
        options.errors.push(msg);
        options.log(options.label, `WARN ${msg}`);
        // Fail open: a page that could not be triaged is still extracted.
        return {
          url: page.url,
          final_url: pageUrl,
          title: page.title,
          status: "extract_now" as const,
          confidence: 0.3,
          source_data_confidence: 0.35,
          expected_yield: "partial" as const,
          reasoning: "Triage failed; falling back to direct extraction.",
        };
      }
    },
    (page) => [getDomain(page.final_url || page.url)],
  );

  summary.pages_triaged = triageResults.length;
  await saveJson(
    join(options.paths.root, `triage_${options.label}.json`),
    triageResults,
  );

  const pageByUrl = new Map(
    successfulPages.map((page) => [page.final_url || page.url, page]),
  );

  const extractPages: { page: FetchedPage; triage: SourceTriageResult }[] = [];
  const agentQueue: { page: FetchedPage; triage: SourceTriageResult }[] = [];

  for (const triage of triageResults) {
    bumpStatus(summary, triage.status);

    const page = pageByUrl.get(triage.final_url) ?? pageByUrl.get(triage.url);
    if (!page) continue;

    if (triage.status === "extract_now") {
      summary.extract_now += 1;
      extractPages.push({ page, triage });
    } else if (statusNeedsAgent(triage.status)) {
      summary.agent_candidates += 1;
      if (agentEnabled) {
        agentQueue.push({ page, triage });
      } else {
        options.log(
          options.label,
          `Agent disabled — fallback extract for ${triage.final_url} [${triage.status}]`,
        );
        extractPages.push({ page, triage });
      }
    } else {
      summary.skipped += 1;
      options.log(
        options.label,
        `Skip ${triage.final_url} [${triage.status}]: ${triage.reasoning.slice(0, 80)}`,
      );
    }
  }

  if (extractPages.length > 0) {
    options.log(
      options.label,
      `Direct extraction on ${extractPages.length} pages (parallel, concurrency=${config.extractionConcurrency})...`,
    );
    const extracted = await extractionQueue.runAll(
      extractPages,
      ({ page }) => extractOrRecordError(page),
      ({ page }) => [getDomain(page.final_url || page.url)],
    );
    for (const batch of extracted) {
      for (const record of batch) {
        records.push(record);
        const pk = getPrimaryKeyValue(record, options.spec);
        if (pk) knownKeys.add(pk);
      }
    }
    // Only direct-extraction records exist in `records` at this point.
    summary.records_from_extract = records.length;
  }

  const agentBudget = agentEnabled ? config.maxAgentRunsPerPhase : 0;
  const toRun = agentQueue.slice(0, agentBudget);
  const deferredEntries: AgentDeferredEntry[] = agentQueue
    .slice(agentBudget)
    .map(({ page, triage }) => ({
      url: triage.final_url || page.url,
      status: triage.status,
    }));

  if (deferredEntries.length > 0) {
    options.log(
      options.label,
      `Agent budget: running ${toRun.length}/${agentQueue.length} (${deferredEntries.length} deferred)`,
    );
  }

  summary.agent_dispatched = toRun.length;
  summary.agent_deferred = deferredEntries.length;

  if (toRun.length > 0) {
    options.log(
      options.label,
      `Tinyfish Agent on ${toRun.length} pages (async queue + poll, queue=${config.agentQueueConcurrency}, poll=${config.agentPollConcurrency})...`,
    );

    const agentGoalQueue = createAgentQueue();

    // Step 1: generate one goal per page; a failure is carried as `goalError`.
    const jobsWithGoals = await agentGoalQueue.runAll(
      toRun,
      async ({ page, triage }) => {
        const pageUrl = triage.final_url || page.url;
        try {
          const agentGoal = await generateAgentGoal({
            userPrompt: options.userPrompt,
            spec: options.spec,
            triage,
            focusFields: options.focusFields,
            memory: options.memory,
          });
          return { page, triage, pageUrl, goal: agentGoal.goal, goalError: null as string | null };
        } catch (error) {
          const msg = error instanceof Error ? error.message : String(error);
          options.errors.push(`Agent goal failed for ${pageUrl}: ${msg}`);
          return { page, triage, pageUrl, goal: "", goalError: msg };
        }
      },
      ({ page }) => [getDomain(page.final_url || page.url)],
    );

    // Step 2: submit only jobs that have a goal. `queueJobIndices[i]` maps
    // batch position i back to its index in `jobsWithGoals`.
    const queueJobs: { url: string; goal: string }[] = [];
    const queueJobIndices: number[] = [];

    for (let index = 0; index < jobsWithGoals.length; index++) {
      const job = jobsWithGoals[index]!;
      if (job.goalError) {
        summary.agent_failed += 1;
        agentRuns.push({
          url: job.pageUrl,
          status: job.triage.status,
          run_id: null,
          agent_status: "FAILED",
          goal: "",
          records_extracted: 0,
          error: job.goalError,
        });
        continue;
      }
      queueJobs.push({ url: job.pageUrl, goal: job.goal });
      queueJobIndices.push(index);
    }

    const agentRunResults = await runTinyfishAgentsBatch(queueJobs);

    const jobsToExtract = queueJobIndices.map((jobIndex, batchIndex) => ({
      job: jobsWithGoals[jobIndex]!,
      run: agentRunResults[batchIndex]!,
    }));

    // Step 3: extract records from each agent result.
    await extractionQueue.runAll(
      jobsToExtract,
      async ({ job, run }) => {
        const pageUrl = job.pageUrl;

        if (run.error || !run.result) {
          summary.agent_failed += 1;
          agentRuns.push({
            url: pageUrl,
            status: job.triage.status,
            run_id: run.run_id,
            agent_status: run.status,
            goal: job.goal,
            records_extracted: 0,
            error: run.error ?? "No result returned",
          });
          options.log(
            options.label,
            `WARN Agent failed ${pageUrl}: ${run.error ?? "no result"}`,
          );
          return;
        }

        try {
          const agentRecords = await extractFromAgentResult({
            spec: options.spec,
            pageUrl,
            agentResult: run.result,
            focusFields: options.focusFields,
            memory: options.memory,
          });

          summary.agent_succeeded += 1;
          for (const record of agentRecords) {
            records.push(record);
            const pk = getPrimaryKeyValue(record, options.spec);
            if (pk) knownKeys.add(pk);
          }
          summary.records_from_agent += agentRecords.length;

          agentRuns.push({
            url: pageUrl,
            status: job.triage.status,
            run_id: run.run_id,
            agent_status: run.status,
            goal: job.goal,
            records_extracted: agentRecords.length,
          });

          options.log(
            options.label,
            `Agent OK ${pageUrl} → ${agentRecords.length} records`,
          );
        } catch (error) {
          summary.agent_failed += 1;
          const msg = error instanceof Error ? error.message : String(error);
          options.errors.push(`Agent extract failed for ${pageUrl}: ${msg}`);
          agentRuns.push({
            url: pageUrl,
            status: job.triage.status,
            run_id: run.run_id,
            agent_status: run.status,
            goal: job.goal,
            records_extracted: 0,
            error: msg,
          });
        }
      },
      ({ job }) => [getDomain(job.pageUrl)],
    );
  }

  if (agentRuns.length > 0) {
    await saveJson(
      join(options.paths.root, `agent_runs_${options.label}.json`),
      agentRuns,
    );
  }

  return {
    records,
    triageResults,
    agentRuns,
    agentDeferred: deferredEntries,
    summary,
  };
}
+import { config } from "../config.js";
+import type { RepairLoopReport } from "../models/schemas.js";
+import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js";
+import {
+  recordCoverageGaps,
+  recordDiagnosis,
+  recordPhaseInMemory,
+  type WorkflowMemory,
+} from "../memory/index.js";
+import {
+  markSearchPagesUsed,
+  planRepairSearches,
+} from "../memory/search-pagination.js";
+import { mergeRepairIntoExisting } from "../merge/records.js";
+import type { SourcesReport } from "../models/quality.js";
+import { buildSourcesReport } from "../quality/index.js";
+import { saveJson, type RunPaths } from "../storage/run-store.js";
+import { normalizeUrl } from "../utils/url.js";
+import {
+  entityKeysFromRecords,
+  runAcquisitionPhase,
+  type AcquisitionResult,
+} from "./acquisition.js";
+
+/**
+ * Mutable, run-wide state shared with the repair loop.
+ *
+ * `runRepairLoops` mutates `memory`, `fetchedUrlSet`, `allSearchQueries` and
+ * `allFailedUrls` in place; `errors` is handed to the acquisition phase.
+ */
+export interface RepairLoopContext {
+  userPrompt: string;
+  spec: DatasetSpec;
+  paths: RunPaths;
+  errors: string[];
+  memory: WorkflowMemory;
+  /** Normalized URLs already fetched; excluded from repair fetches and extended after each loop. */
+  fetchedUrlSet: Set<string>;
+  /** Every query issued so far; passed as prior queries and extended with each loop's new queries. */
+  allSearchQueries: string[];
+  /** Extended with the failed URLs of each repair acquisition. */
+  allFailedUrls: string[];
+  enableTriage: boolean;
+  enableTinyfishAgent: boolean;
+  /** Merged records are truncated to this many rows after every loop. */
+  targetRowCap: number;
+  log: (stage: string, message: string) => void;
+}
+
+/** Outcome of `runRepairLoops`. */
+export interface RepairLoopRunResult {
+  mergedRecords: ExtractedRecord[];
+  /** Unkeyed records reported by the merge step, accumulated across loops. */
+  unkeyedRecords: ExtractedRecord[];
+  /** Coverage after the last executed loop (the initial coverage if none ran). */
+  coverage: CoverageReport;
+  loops: RepairLoopReport[];
+  /** Diagnosis produced by the last executed loop; absent when no loop ran. */
+  lastDiagnosis?: import("../memory/types.js").RepairDiagnosis;
+  repairAcquisitions: AcquisitionResult[];
+  sourcesReports: SourcesReport[];
+}
+
+/**
+ * Repeatedly diagnose coverage gaps, search/fetch/extract again, and merge the
+ * new records into the existing dataset.
+ *
+ * Loops while `coverage.should_repair` is true and
+ * `ctx.memory.repair_loop_count` is below `config.maxRepairLoops`. The counter
+ * lives in workflow memory, so loops already recorded there count against the
+ * budget. Each loop writes `repair_diagnosis_<n>.json`,
+ * `repair_queries_<n>.json` and `coverage_repair_<n>.json` under
+ * `ctx.paths.root`.
+ *
+ * @param options.ctx                 Shared run state (mutated, see {@link RepairLoopContext}).
+ * @param options.recordsBeforeRepair Records produced before any repair.
+ * @param options.initialCoverage     Coverage of `recordsBeforeRepair`.
+ * @param options.pageIndexStart      First page index handed to the acquisition phase.
+ * @returns Merged records, final coverage and per-loop reports.
+ */
+export async function runRepairLoops(options: {
+  ctx: RepairLoopContext;
+  recordsBeforeRepair: ExtractedRecord[];
+  initialCoverage: CoverageReport;
+  pageIndexStart: number;
+}): Promise<RepairLoopRunResult> {
+  const { ctx } = options;
+  let mergedRecords = options.recordsBeforeRepair;
+  let unkeyedRecords: ExtractedRecord[] = [];
+  let coverage = options.initialCoverage;
+  let pageIndex = options.pageIndexStart;
+
+  const loops: RepairLoopReport[] = [];
+  const repairAcquisitions: AcquisitionResult[] = [];
+  const sourcesReports: SourcesReport[] = [];
+  let lastDiagnosis: import("../memory/types.js").RepairDiagnosis | undefined;
+
+  recordCoverageGaps(ctx.memory, coverage);
+
+  // Nothing to repair: return the inputs untouched.
+  if (!coverage.should_repair) {
+    return {
+      mergedRecords,
+      unkeyedRecords,
+      coverage,
+      loops,
+      repairAcquisitions,
+      sourcesReports,
+    };
+  }
+
+  while (
+    coverage.should_repair &&
+    ctx.memory.repair_loop_count < config.maxRepairLoops
+  ) {
+    const loopIndex = ctx.memory.repair_loop_count + 1;
+    ctx.memory.repair_loop_count = loopIndex;
+
+    // Snapshot the pre-loop state so the loop report can show before/after.
+    const recordsBeforeLoop = mergedRecords;
+    const partialBefore = coverage.partial_count;
+
+    ctx.log(
+      "repair",
+      `Loop ${loopIndex}/${config.maxRepairLoops} — missing: ${coverage.field_gaps.map((g) => g.column).join(", ")}`,
+    );
+
+    const diagnosis = await generateRepairDiagnosis({
+      userPrompt: ctx.userPrompt,
+      spec: ctx.spec,
+      coverage,
+      memory: ctx.memory,
+      repairLoop: loopIndex,
+      maxRepairLoops: config.maxRepairLoops,
+    });
+    lastDiagnosis = diagnosis;
+    recordDiagnosis(ctx.memory, loopIndex, diagnosis);
+
+    await saveJson(
+      join(ctx.paths.root, `repair_diagnosis_${loopIndex}.json`),
+      diagnosis,
+    );
+
+    const repairPlan = await generateRepairQueries({
+      userPrompt: ctx.userPrompt,
+      spec: ctx.spec,
+      coverage,
+      priorSearchQueries: ctx.allSearchQueries,
+      maxQueries: config.maxRepairQueries,
+      memory: ctx.memory,
+      diagnosis,
+      repairLoop: loopIndex,
+    });
+
+    const repairSearches = planRepairSearches(
+      ctx.memory,
+      repairPlan.repair_queries,
+    );
+    const paginatedCount = repairSearches.filter((plan) => plan.page > 0).length;
+
+    await saveJson(join(ctx.paths.root, `repair_queries_${loopIndex}.json`), {
+      ...repairPlan,
+      repair_searches: repairSearches,
+    });
+
+    ctx.log(
+      "repair",
+      `Loop ${loopIndex}: ${repairSearches.length} searches (${repairPlan.repair_queries.length} new, ${paginatedCount} paginated) — ${diagnosis.summary.slice(0, 100)}`,
+    );
+
+    // The diagnosis may ask for the browser agent, but only if it is enabled.
+    const preferAgent =
+      diagnosis.prefer_tinyfish_agent && ctx.enableTinyfishAgent;
+
+    const acquisition = await runAcquisitionPhase({
+      label: `repair_${loopIndex}`,
+      userPrompt: ctx.userPrompt,
+      spec: ctx.spec,
+      queries: repairSearches.map((plan) => plan.query),
+      searches: repairSearches,
+      paths: ctx.paths,
+      errors: ctx.errors,
+      excludeUrls: ctx.fetchedUrlSet,
+      maxResultsPerQuery: config.maxRepairResultsPerQuery,
+      maxUrlsToFetch: config.maxRepairUrlsToFetch,
+      pageIndexStart: pageIndex,
+      focusFields: coverage.field_gaps.map((gap) => gap.column),
+      knownEntityKeys: entityKeysFromRecords(ctx.spec, recordsBeforeLoop),
+      enableTriage: ctx.enableTriage,
+      enableTinyfishAgent: ctx.enableTinyfishAgent,
+      memory: ctx.memory,
+      forceAgent: preferAgent,
+      enableLinkFollow: config.enableRepairLinkFollow,
+      log: ctx.log,
+    });
+
+    markSearchPagesUsed(
+      ctx.memory,
+      repairSearches,
+      `repair_${loopIndex}`,
+      loopIndex,
+    );
+
+    repairAcquisitions.push(acquisition);
+    pageIndex += acquisition.pagesFetched;
+
+    recordPhaseInMemory({
+      memory: ctx.memory,
+      spec: ctx.spec,
+      phase: `repair_${loopIndex}`,
+      repairLoop: loopIndex,
+      queries: repairSearches.map((plan) => plan.query),
+      candidates: acquisition.candidates,
+      records: acquisition.records,
+      failedUrls: acquisition.failedUrls,
+      agentRuns: acquisition.agentRuns,
+      triageResults: acquisition.triageResults,
+    });
+
+    // Fold this loop's activity into the run-wide bookkeeping.
+    for (const url of acquisition.fetchedUrls) {
+      ctx.fetchedUrlSet.add(normalizeUrl(url));
+    }
+    ctx.allSearchQueries.push(...repairPlan.repair_queries);
+    ctx.allFailedUrls.push(...acquisition.failedUrls);
+
+    sourcesReports.push(
+      buildSourcesReport({
+        phase: "repair",
+        fetchedPages: acquisition.fetchedPages,
+        fetchedUrls: acquisition.fetchedUrls,
+        triageResults: acquisition.triageResults,
+        agentRuns: acquisition.agentRuns,
+        agentDeferred: acquisition.agentDeferred,
+      }),
+    );
+
+    const mergeResult = mergeRepairIntoExisting(
+      ctx.spec,
+      recordsBeforeLoop,
+      acquisition.records,
+    );
+    mergedRecords = mergeResult.records.slice(0, ctx.targetRowCap);
+    unkeyedRecords = [...unkeyedRecords, ...mergeResult.unkeyed];
+
+    const coverageAfter = analyzeCoverage(ctx.spec, mergedRecords);
+    await saveJson(
+      join(ctx.paths.root, `coverage_repair_${loopIndex}.json`),
+      coverageAfter,
+    );
+
+    // Gaps are measured against the columns that were missing BEFORE this loop.
+    const fieldsFilled = countFilledGaps(
+      ctx.spec,
+      recordsBeforeLoop,
+      mergedRecords,
+      coverage.field_gaps.map((gap) => gap.column),
+    );
+
+    loops.push({
+      loop_index: loopIndex,
+      diagnosis_summary: diagnosis.summary,
+      repair_queries: repairPlan.repair_queries,
+      rationale: repairPlan.rationale,
+      missing_fields: coverage.field_gaps.map((gap) => gap.column),
+      records_before: recordsBeforeLoop.length,
+      records_after: mergedRecords.length,
+      fields_filled: fieldsFilled,
+      partial_count_before: partialBefore,
+      partial_count_after: coverageAfter.partial_count,
+      stats: {
+        search_queries_executed: repairSearches.length,
+        search_pages_paginated: paginatedCount,
+        search_results_collected: acquisition.candidates.length,
+        unique_urls_selected: acquisition.fetchedUrls.length,
+        pages_fetched: acquisition.pagesFetched,
+        pages_failed: acquisition.failedUrls.length,
+        raw_records_extracted: acquisition.records.length,
+        triage: acquisition.triage,
+      },
+    });
+
+    ctx.log(
+      "repair",
+      `Loop ${loopIndex} done — ${mergedRecords.length} records, partial ${partialBefore} → ${coverageAfter.partial_count}`,
+    );
+
+    coverage = coverageAfter;
+    recordCoverageGaps(ctx.memory, coverage);
+  }
+
+  if (coverage.should_repair && ctx.memory.repair_loop_count >= config.maxRepairLoops) {
+    ctx.log(
+      "repair",
+      `Stopped after ${config.maxRepairLoops} repair loops (gaps remain)`,
+    );
+  }
+
+  return {
+    mergedRecords,
+    unkeyedRecords,
+    coverage,
+    loops,
+    lastDiagnosis,
+    repairAcquisitions,
+    sourcesReports,
+  };
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/quality/build-report.ts b/backend/BigSet_Data_Collection_Agent/src/quality/build-report.ts
new file mode 100644
index 0000000..5f1442e
--- /dev/null
+++
b/backend/BigSet_Data_Collection_Agent/src/quality/build-report.ts
@@ -0,0 +1,238 @@
+import type {
+  QualityBucket,
+  QualityReport,
+  SourceOutcome,
+  SourcesReport,
+} from "../models/quality.js";
+import type {
+  AgentRunRecord,
+  DatasetSpec,
+  ExtractedRecord,
+  FetchedPage,
+  SourceTriageResult,
+} from "../models/schemas.js";
+import { statusNeedsAgent } from "../models/source-status.js";
+import { normalizeUrl } from "../utils/url.js";
+import { scoreRecords, type ScoreRecordContext } from "./score-record.js";
+
+/** Wrap a list of record ids as a `{ count, record_ids }` bucket. */
+function bucket(recordIds: string[]): QualityBucket {
+  return { count: recordIds.length, record_ids: recordIds };
+}
+
+/**
+ * Score every record and group the record ids by status.
+ *
+ * A record lands in exactly one of `complete` / `partial` / `low_confidence`
+ * and may additionally appear in `needs_review`.
+ *
+ * @param unkeyedCount Number of unkeyed records, copied into the report as-is.
+ */
+export function buildQualityReport(
+  spec: DatasetSpec,
+  records: ExtractedRecord[],
+  context: ScoreRecordContext,
+  unkeyedCount: number,
+): QualityReport {
+  const scored = scoreRecords(spec, records, context);
+
+  const completeIds: string[] = [];
+  const partialIds: string[] = [];
+  const lowConfidenceIds: string[] = [];
+  const reviewIds: string[] = [];
+
+  for (const quality of scored) {
+    if (quality.record_status === "complete") completeIds.push(quality.record_id);
+    if (quality.record_status === "partial") partialIds.push(quality.record_id);
+    if (quality.record_status === "low_confidence") {
+      lowConfidenceIds.push(quality.record_id);
+    }
+    if (quality.needs_review) reviewIds.push(quality.record_id);
+  }
+
+  return {
+    total_records: records.length,
+    unkeyed_records: unkeyedCount,
+    complete: bucket(completeIds),
+    partial: bucket(partialIds),
+    low_confidence: bucket(lowConfidenceIds),
+    needs_review: bucket(reviewIds),
+    records: scored,
+  };
+}
+
+/**
+ * Index triage results by normalized URL.
+ *
+ * Each result is stored under both its `final_url` and its `url`; when two
+ * results share a key the later one wins.
+ */
+export function triageByUrl(
+  triageResults: SourceTriageResult[],
+): Map<string, SourceTriageResult> {
+  const map = new Map<string, SourceTriageResult>();
+  for (const triage of triageResults) {
+    map.set(normalizeUrl(triage.final_url), triage);
+    map.set(normalizeUrl(triage.url), triage);
+  }
+  return map;
+}
+
+/** Normalized URLs of agent runs that extracted at least one record without error. */
+export function agentExtractedUrls(
+  agentRuns: AgentRunRecord[],
+): Set<string> {
+  return new Set(
+    agentRuns
+      .filter((run) => run.records_extracted > 0 && !run.error)
+      .map((run) => normalizeUrl(run.url)),
+  );
+}
+
+/** Triage statuses reported as "skipped" for a successfully fetched page. */
+const SKIPPED_STATUSES = new Set<string>([
+  "irrelevant",
+  "duplicate",
+  "blocked",
+  "low_value",
+]);
+
+/** Outcomes listed under `SourcesReport.failed`. */
+const FAILED_OUTCOMES: ReadonlySet<string> = new Set([
+  "fetch_failed",
+  "skipped",
+  "agent_failed",
+  "agent_deferred",
+  "no_records",
+]);
+
+export interface BuildSourcesOptions {
+  phase: "initial" | "repair";
+  fetchedPages: FetchedPage[];
+  /** Accepted for caller convenience; not read by `buildSourcesReport`. */
+  fetchedUrls: string[];
+  triageResults: SourceTriageResult[];
+  agentRuns: AgentRunRecord[];
+  agentDeferred: { url: string; status: string }[];
+}
+
+/** Tally outcomes and pick out the failed ones; shared by build and merge. */
+function summarizeOutcomes(outcomes: SourceOutcome[]): SourcesReport {
+  const byOutcome: Record<string, number> = {};
+  for (const item of outcomes) {
+    byOutcome[item.outcome] = (byOutcome[item.outcome] ?? 0) + 1;
+  }
+
+  return {
+    total: outcomes.length,
+    failed: outcomes.filter((item) => FAILED_OUTCOMES.has(item.outcome)),
+    by_outcome: byOutcome,
+    outcomes,
+  };
+}
+
+/**
+ * Build the per-source outcome report for one acquisition phase.
+ *
+ * Outcomes are emitted in this order: fetched pages (fetch failures and
+ * triage-skipped pages), deferred agent jobs, agent runs, and finally triage
+ * results whose URL has no outcome yet.
+ */
+export function buildSourcesReport(
+  options: BuildSourcesOptions,
+): SourcesReport {
+  const outcomes: SourceOutcome[] = [];
+  const triageMap = triageByUrl(options.triageResults);
+
+  for (const page of options.fetchedPages) {
+    const url = normalizeUrl(page.final_url || page.url);
+    const triage = triageMap.get(url);
+
+    if (page.error) {
+      outcomes.push({
+        url,
+        phase: options.phase,
+        outcome: "fetch_failed",
+        error: page.error,
+        triage_status: triage?.status,
+        triage_confidence: triage?.confidence,
+        source_data_confidence: triage?.source_data_confidence,
+        expected_yield: triage?.expected_yield,
+      });
+      continue;
+    }
+
+    if (triage && SKIPPED_STATUSES.has(triage.status)) {
+      outcomes.push({
+        url,
+        phase: options.phase,
+        outcome: "skipped",
+        triage_status: triage.status,
+        triage_confidence: triage.confidence,
+        source_data_confidence: triage.source_data_confidence,
+        expected_yield: triage.expected_yield,
+        // The triage reasoning doubles as the human-readable skip reason.
+        error: triage.reasoning.slice(0, 200),
+      });
+    }
+  }
+
+  for (const deferred of options.agentDeferred) {
+    outcomes.push({
+      url: normalizeUrl(deferred.url),
+      phase: options.phase,
+      outcome: "agent_deferred",
+      triage_status: deferred.status,
+      error: "Exceeded MAX_AGENT_RUNS_PER_PHASE budget",
+    });
+  }
+
+  for (const run of options.agentRuns) {
+    const url = normalizeUrl(run.url);
+    if (run.error || run.agent_status === "FAILED" || run.agent_status === "TIMEOUT") {
+      outcomes.push({
+        url,
+        phase: options.phase,
+        outcome: "agent_failed",
+        triage_status: run.status,
+        error: run.error ?? run.agent_status,
+        records_extracted: run.records_extracted,
+      });
+    } else if (run.records_extracted === 0) {
+      outcomes.push({
+        url,
+        phase: options.phase,
+        outcome: "no_records",
+        triage_status: run.status,
+        records_extracted: 0,
+      });
+    } else {
+      outcomes.push({
+        url,
+        phase: options.phase,
+        outcome: "success",
+        triage_status: run.status,
+        records_extracted: run.records_extracted,
+      });
+    }
+  }
+
+  // Fallback: triaged sources that produced no outcome above.
+  const outcomeUrls = new Set(outcomes.map((item) => item.url));
+  for (const triage of options.triageResults) {
+    const url = normalizeUrl(triage.final_url);
+    if (outcomeUrls.has(url)) continue;
+
+    if (triage.status === "extract_now") {
+      outcomes.push({
+        url,
+        phase: options.phase,
+        outcome: "success",
+        triage_status: triage.status,
+        triage_confidence: triage.confidence,
+        source_data_confidence: triage.source_data_confidence,
+        expected_yield: triage.expected_yield,
+      });
+      // Remember the URL so a second triage row for the same final URL
+      // (e.g. two candidates redirecting to one page) is not counted twice.
+      outcomeUrls.add(url);
+    } else if (statusNeedsAgent(triage.status)) {
+      outcomes.push({
+        url,
+        phase: options.phase,
+        outcome: "no_records",
+        triage_status: triage.status,
+        triage_confidence: triage.confidence,
+        source_data_confidence: triage.source_data_confidence,
+        expected_yield: triage.expected_yield,
+        error: "Agent path did not yield records",
+      });
+      outcomeUrls.add(url);
+    }
+  }
+
+  return summarizeOutcomes(outcomes);
+}
+
+/**
+ * Concatenate the outcomes of the initial and (optional) repair reports and
+ * recompute the totals.
+ */
+export function mergeSourcesReports(
+  initial: SourcesReport,
+  repair: SourcesReport | null,
+): SourcesReport {
+  return summarizeOutcomes([...initial.outcomes, ...(repair?.outcomes ?? [])]);
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/quality/field-confidence.ts b/backend/BigSet_Data_Collection_Agent/src/quality/field-confidence.ts
new file mode 100644
index 0000000..790afef
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/quality/field-confidence.ts
@@ -0,0 +1,72 @@
+import type { DatasetSpec, ExtractedRecord, SourceTriageResult } from "../models/schemas.js";
+import type { ScoreRecordContext } from "./score-record.js";
+
+/** True for `null`, `undefined` and the empty string; `0` and `false` count as filled. */
+function isEmpty(value: unknown): boolean {
+  return value === null || value === undefined || value === "";
+}
+
+/**
+ * Confidence for one populated field from its evidence URL and row-level signals.
+ *
+ * - No evidence for the field: extraction confidence scaled by 0.72 when any
+ *   source URL was agent-extracted, otherwise by 0.78.
+ * - With evidence: mean over evidence items of
+ *   `0.7 * source + 0.15 * routing + 0.15 * extraction`, where missing triage
+ *   data defaults to 0.65 (source) and 0.7 (routing).
+ *
+ * The result is clamped to [0, 1]. Extraction confidence defaults to 0.85.
+ *
+ * NOTE(review): lookups use `item.url` / `source_urls` as stored on the
+ * record, whereas the `triageByUrl()` and `agentExtractedUrls()` builders key
+ * by `normalizeUrl(...)`. Confirm record URLs are already normalized, or the
+ * defaults above will be used silently.
+ */
+export function confidenceForField(
+  fieldName: string,
+  record: ExtractedRecord,
+  context: ScoreRecordContext,
+): number {
+  const extraction = record.extraction_confidence ?? 0.85;
+  const evidenceForField = record.evidence.filter((item) => item.field === fieldName);
+
+  if (evidenceForField.length === 0) {
+    const fromAgent = record.source_urls.some((url) =>
+      context.agentExtractedUrls.has(url),
+    );
+    return Math.min(1, Math.max(0, extraction * (fromAgent ? 0.72 : 0.78)));
+  }
+
+  const urlScores = evidenceForField
+    .map((item) => {
+      const triage = context.triageByUrl.get(item.url);
+      const source = triage?.source_data_confidence ?? 0.65;
+      const routing = triage?.confidence ?? 0.7;
+      return source * 0.7 + routing * 0.15 + extraction * 0.15;
+    })
+    // Drop NaN/Infinity produced by malformed confidence values.
+    .filter((value) => Number.isFinite(value));
+
+  if (urlScores.length === 0) {
+    return Math.min(1, Math.max(0, extraction * 0.8));
+  }
+
+  return Math.min(
+    1,
+    Math.max(0, urlScores.reduce((sum, value) => sum + value, 0) / urlScores.length),
+  );
+}
+
+/**
+ * Confidence per populated column, rounded to three decimals.
+ * Empty columns are omitted from the result.
+ */
+export function computeFieldConfidences(
+  spec: DatasetSpec,
+  record: ExtractedRecord,
+  context: ScoreRecordContext,
+): Record<string, number> {
+  const out: Record<string, number> = {};
+  for (const col of spec.columns) {
+    if (isEmpty(record.row[col.name])) continue;
+    const score = confidenceForField(col.name, record, context);
+    out[col.name] = Math.round(score * 1000) / 1000;
+  }
+  return out;
+}
+
+/**
+ * Mean of the available field confidences, rounded to three decimals.
+ *
+ * Columns without an entry in `fieldConfidences` are ignored rather than
+ * counted as zero; returns 0 when no selected column has a score.
+ *
+ * @param requiredOnly When true (default) only required columns are averaged.
+ */
+export function aggregateRecordConfidence(
+  spec: DatasetSpec,
+  fieldConfidences: Record<string, number>,
+  requiredOnly = true,
+): number {
+  const columns = spec.columns.filter((col) =>
+    requiredOnly ? col.required : true,
+  );
+  const scores = columns
+    .map((col) => fieldConfidences[col.name])
+    .filter((value): value is number => value !== undefined);
+
+  if (scores.length === 0) return 0;
+  const mean = scores.reduce((sum, value) => sum + value, 0) / scores.length;
+  return Math.round(mean * 1000) / 1000;
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/quality/index.ts b/backend/BigSet_Data_Collection_Agent/src/quality/index.ts
new file mode 100644
index 0000000..a15fd78
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/quality/index.ts
@@ -0,0 +1,8 @@
+export {
+  agentExtractedUrls,
+  buildQualityReport,
+  buildSourcesReport,
+  mergeSourcesReports,
+  triageByUrl,
+} from "./build-report.js";
+export { scoreRecord, scoreRecords, type ScoreRecordContext } from "./score-record.js";
diff --git a/backend/BigSet_Data_Collection_Agent/src/quality/score-record.ts b/backend/BigSet_Data_Collection_Agent/src/quality/score-record.ts
new file mode 100644
index 0000000..cdefa1f
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/quality/score-record.ts
@@ -0,0
+1,176 @@
+import { config } from "../config.js";
+import { canonicalRecordId } from "../merge/records.js";
+import type { RecordQuality, RecordStatus } from "../models/quality.js";
+import type { DatasetSpec, ExtractedRecord, SourceTriageResult } from "../models/schemas.js";
+import {
+  aggregateRecordConfidence,
+  computeFieldConfidences,
+} from "./field-confidence.js";
+
+/** Lookup tables consulted while scoring records. */
+export interface ScoreRecordContext {
+  /** Triage result per source URL. */
+  triageByUrl: Map<string, SourceTriageResult>;
+  /** URLs whose records came from a browser-agent run. */
+  agentExtractedUrls: Set<string>;
+}
+
+/** True for `null`, `undefined` and the empty string; `0` and `false` count as filled. */
+function isEmpty(value: unknown): boolean {
+  return value === null || value === undefined || value === "";
+}
+
+/**
+ * Share of populated columns that have at least one evidence entry.
+ * A record with no populated columns reports a ratio of 1.
+ */
+function evidenceCoverage(
+  spec: DatasetSpec,
+  record: ExtractedRecord,
+): { ratio: number; fieldsWithoutEvidence: string[] } {
+  const nonNullFields = spec.columns.filter((col) => !isEmpty(record.row[col.name]));
+  if (nonNullFields.length === 0) {
+    return { ratio: 1, fieldsWithoutEvidence: [] };
+  }
+
+  const evidenced = new Set(record.evidence.map((item) => item.field));
+  const fieldsWithoutEvidence = nonNullFields
+    .filter((col) => !evidenced.has(col.name))
+    .map((col) => col.name);
+
+  const ratio =
+    (nonNullFields.length - fieldsWithoutEvidence.length) / nonNullFields.length;
+
+  return { ratio, fieldsWithoutEvidence };
+}
+
+/**
+ * Lowest `source_data_confidence` among the record's triaged source URLs;
+ * 0.65 when none of them has a triage result.
+ *
+ * NOTE(review): URLs are looked up as stored on the record — confirm they
+ * match the keys used when the map was built.
+ */
+function minSourceConfidence(
+  record: ExtractedRecord,
+  triageByUrl: Map<string, SourceTriageResult>,
+): number {
+  const scores = record.source_urls
+    .map((url) => triageByUrl.get(url)?.source_data_confidence)
+    .filter((value): value is number => value !== undefined);
+
+  if (scores.length === 0) return 0.65;
+  return Math.min(...scores);
+}
+
+/**
+ * Score one record: completeness, confidence, status and review reasons.
+ *
+ * Status rules, in order: any missing required field → `partial`; confidence
+ * below `config.qualityLowConfidenceThreshold` or any populated field without
+ * evidence → `low_confidence`; otherwise `complete`.
+ *
+ * @param recordId Identifier copied into the result.
+ */
+export function scoreRecord(
+  spec: DatasetSpec,
+  record: ExtractedRecord,
+  context: ScoreRecordContext,
+  recordId: string,
+): RecordQuality {
+  const requiredColumns = spec.columns.filter((col) => col.required);
+  const optionalColumns = spec.columns.filter((col) => !col.required);
+
+  const missingRequired = requiredColumns
+    .filter((col) => isEmpty(record.row[col.name]))
+    .map((col) => col.name);
+  const missingOptional = optionalColumns
+    .filter((col) => isEmpty(record.row[col.name]))
+    .map((col) => col.name);
+
+  // Completeness is measured over required columns; specs without any
+  // required column fall back to the share of all columns that are filled.
+  const completenessPct =
+    requiredColumns.length > 0
+      ? (requiredColumns.length - missingRequired.length) / requiredColumns.length
+      : spec.columns.filter((col) => !isEmpty(record.row[col.name])).length /
+        Math.max(spec.columns.length, 1);
+
+  const { ratio: evidenceRatio, fieldsWithoutEvidence } = evidenceCoverage(
+    spec,
+    record,
+  );
+  const sourceConfidence = minSourceConfidence(record, context.triageByUrl);
+  const extractionConfidence = record.extraction_confidence ?? 0.85;
+  const fieldConfidences = computeFieldConfidences(spec, record, context);
+
+  const requiredFieldConfidence = aggregateRecordConfidence(
+    spec,
+    fieldConfidences,
+    true,
+  );
+  const legacyBlend = Math.min(
+    1,
+    Math.max(
+      0,
+      completenessPct * 0.35 +
+        sourceConfidence * 0.25 +
+        extractionConfidence * 0.25 +
+        evidenceRatio * 0.15,
+    ),
+  );
+  // Prefer the per-field score; use the blended score only when there are no
+  // required columns or no populated fields to score.
+  const confidenceScore =
+    requiredColumns.length > 0 && Object.keys(fieldConfidences).length > 0
+      ? requiredFieldConfidence
+      : legacyBlend;
+
+  const reviewReasons: string[] = [];
+  if (missingRequired.length > 0) {
+    reviewReasons.push(
+      `missing required fields: ${missingRequired.join(", ")}`,
+    );
+  }
+  if (fieldsWithoutEvidence.length > 0) {
+    reviewReasons.push(
+      `fields without evidence: ${fieldsWithoutEvidence.join(", ")}`,
+    );
+  }
+  if (sourceConfidence < config.qualitySourceConfidenceThreshold) {
+    reviewReasons.push(
+      `low source data confidence (${sourceConfidence.toFixed(2)})`,
+    );
+  }
+  if (extractionConfidence < config.qualityExtractionConfidenceThreshold) {
+    reviewReasons.push(
+      `low extraction confidence (${extractionConfidence.toFixed(2)})`,
+    );
+  }
+
+  const fromAgent = record.source_urls.some((url) =>
+    context.agentExtractedUrls.has(url),
+  );
+  if (fromAgent && extractionConfidence < 0.8) {
+    reviewReasons.push("browser agent extraction — verify manually");
+  }
+
+  let recordStatus: RecordStatus;
+  if (missingRequired.length > 0) {
+    recordStatus = "partial";
+  } else if (
+    confidenceScore < config.qualityLowConfidenceThreshold ||
+    fieldsWithoutEvidence.length > 0
+  ) {
+    recordStatus = "low_confidence";
+  } else {
+    recordStatus = "complete";
+  }
+
+  const needsReview =
+    recordStatus === "partial" ||
+    recordStatus === "low_confidence" ||
+    confidenceScore < config.qualityReviewThreshold;
+
+  return {
+    record_id: recordId,
+    record_status: recordStatus,
+    needs_review: needsReview,
+    completeness_pct: Math.round(completenessPct * 1000) / 1000,
+    confidence_score: Math.round(confidenceScore * 1000) / 1000,
+    field_confidences: fieldConfidences,
+    missing_required_fields: missingRequired,
+    missing_optional_fields: missingOptional,
+    fields_without_evidence: fieldsWithoutEvidence,
+    review_reasons: reviewReasons,
+  };
+}
+
+/**
+ * Score every record. Records without a canonical id get an
+ * `unkeyed:<first 80 chars of the JSON row>` identifier.
+ */
+export function scoreRecords(
+  spec: DatasetSpec,
+  records: ExtractedRecord[],
+  context: ScoreRecordContext,
+): RecordQuality[] {
+  return records.map((record) => {
+    const recordId =
+      canonicalRecordId(record, spec) ??
+      `unkeyed:${JSON.stringify(record.row).slice(0, 80)}`;
+    return scoreRecord(spec, record, context, recordId);
+  });
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/queue/domain-throttle.ts b/backend/BigSet_Data_Collection_Agent/src/queue/domain-throttle.ts
new file mode 100644
index 0000000..8efb7a8
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/queue/domain-throttle.ts
@@ -0,0 +1,63 @@
+/**
+ * Limits concurrent work per domain (e.g. max 2 fetches on yelp.com at once).
+ */
+export class DomainThrottle {
+  /** Number of slots currently held, per domain. */
+  private readonly active = new Map<string, number>();
+  /** Pending acquire attempts, per domain, in arrival order. */
+  private readonly waiters = new Map<string, Array<() => void>>();
+
+  constructor(private readonly maxPerDomain: number) {}
+
+  /**
+   * Wait for a free slot on `domain` and return a function that releases it.
+   *
+   * An empty domain is not throttled and yields a no-op release. Calling the
+   * release function more than once has no further effect.
+   */
+  async acquire(domain: string): Promise<() => void> {
+    if (!domain) {
+      return () => undefined;
+    }
+
+    await new Promise<void>((resolve) => {
+      const tryAcquire = (): void => {
+        const count = this.active.get(domain) ?? 0;
+        if (count < this.maxPerDomain) {
+          this.active.set(domain, count + 1);
+          resolve();
+          return;
+        }
+        // No slot free: park this attempt until a holder releases.
+        const queue = this.waiters.get(domain) ?? [];
+        queue.push(tryAcquire);
+        this.waiters.set(domain, queue);
+      };
+      tryAcquire();
+    });
+
+    let released = false;
+    return () => {
+      if (released) return;
+      released = true;
+      const count = (this.active.get(domain) ?? 1) - 1;
+      if (count <= 0) {
+        this.active.delete(domain);
+      } else {
+        this.active.set(domain, count);
+      }
+      const queue = this.waiters.get(domain);
+      if (queue && queue.length > 0) {
+        const next = queue.shift()!;
+        // Drop drained queues so the map does not keep one empty array per
+        // domain that ever contended; `next` re-creates it if it must wait.
+        if (queue.length === 0) this.waiters.delete(domain);
+        next();
+      }
+    };
+  }
+
+  /**
+   * Run `fn` while holding one slot on each distinct, non-empty domain.
+   *
+   * Domains are acquired in sorted order so that concurrent callers needing
+   * overlapping domains cannot deadlock on each other. Slots are released in
+   * reverse order, also when `fn` or an acquire rejects.
+   */
+  async withDomains<T>(domains: string[], fn: () => Promise<T>): Promise<T> {
+    const unique = [...new Set(domains.filter(Boolean))].sort();
+    const releases: Array<() => void> = [];
+
+    try {
+      for (const domain of unique) {
+        releases.push(await this.acquire(domain));
+      }
+      return await fn();
+    } finally {
+      for (const release of releases.reverse()) {
+        release();
+      }
+    }
+  }
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/queue/pools.ts b/backend/BigSet_Data_Collection_Agent/src/queue/pools.ts
new file mode 100644
index 0000000..05aefc2
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/queue/pools.ts
@@ -0,0 +1,73 @@
+import { config } from "../config.js";
+import { DomainThrottle } from "./domain-throttle.js";
+import { RateLimiter } from "./rate-limiter.js";
+import { TaskQueue } from "./task-queue.js";
+
+// Lazily created singletons shared by every queue built in this module.
+let sharedDomainThrottle: DomainThrottle | null = null;
+let openRouterLimiter: RateLimiter | null = null;
+
+/** Process-wide per-domain throttle, shared by the fetch and agent queues. */
+export function getSharedDomainThrottle(): DomainThrottle {
+  if (!sharedDomainThrottle) {
+    sharedDomainThrottle = new DomainThrottle(config.maxConcurrentPerDomain);
+  }
+  return sharedDomainThrottle;
+}
+
+/** Process-wide OpenRouter limiter, shared by the triage and extraction queues. */
+export function getOpenRouterLimiter(): RateLimiter {
+  if (!openRouterLimiter) {
+    openRouterLimiter = new RateLimiter(config.openRouterRpm, 60_000);
+  }
+  return openRouterLimiter;
+}
+
+// Retry settings applied to every queue; read from config once at module load.
+const defaultRetry = {
+  maxRetries: config.maxRetries,
+  retryBaseDelayMs: config.retryBaseDelayMs,
+};
+
+/** Queue for search calls, with its own per-minute limiter. */
+export function createSearchQueue(): TaskQueue {
+  return new TaskQueue({
+    name: "search",
+    concurrency: config.searchConcurrency,
+    rateLimiter: new RateLimiter(config.tinyfishSearchRpm, 60_000),
+    ...defaultRetry,
+  });
+}
+
+/** Queue for page fetches: own per-minute limiter plus the shared domain throttle. */
+export function createFetchQueue(): TaskQueue {
+  return new TaskQueue({
+    name: "fetch",
+    concurrency: config.fetchConcurrency,
+    rateLimiter: new RateLimiter(config.tinyfishFetchRpm, 60_000),
+    domainThrottle: getSharedDomainThrottle(),
+    ...defaultRetry,
+  });
+}
+
+/** Queue for triage calls, limited by the shared OpenRouter limiter. */
+export function createTriageQueue(): TaskQueue {
+  return new TaskQueue({
+    name: "triage",
+    concurrency: config.triageConcurrency,
+    rateLimiter: getOpenRouterLimiter(),
+    ...defaultRetry,
+  });
+}
+
+/** Queue for extraction calls, limited by the shared OpenRouter limiter. */
+export function createExtractionQueue(): TaskQueue {
+  return new TaskQueue({
+    name: "extract",
+    concurrency: config.extractionConcurrency,
+    rateLimiter: getOpenRouterLimiter(),
+    ...defaultRetry,
+  });
+}
+
+/** Queue for browser-agent runs: own per-minute limiter plus the shared domain throttle. */
+export function createAgentQueue(): TaskQueue {
+  return new TaskQueue({
+    name: "agent",
+    concurrency: config.agentConcurrency,
+    rateLimiter: new RateLimiter(config.tinyfishAgentRpm, 60_000),
+    domainThrottle: getSharedDomainThrottle(),
+    ...defaultRetry,
+  });
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/queue/rate-limiter.ts b/backend/BigSet_Data_Collection_Agent/src/queue/rate-limiter.ts
new file mode 100644
index 0000000..a3c46af
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/queue/rate-limiter.ts
@@ -0,0 +1,41 @@
+import { sleep } from "./retry.js";
+
+/**
+ * Token-bucket style limiter: at most `maxRequests` starts per `intervalMs`.
+ */ +export class RateLimiter { + private tokens: number; + private lastRefillAt: number; + + constructor( + private readonly maxRequests: number, + private readonly intervalMs: number, + ) { + this.tokens = maxRequests; + this.lastRefillAt = Date.now(); + } + + private refill(): void { + const now = Date.now(); + const elapsed = now - this.lastRefillAt; + if (elapsed < this.intervalMs) return; + + const periods = Math.floor(elapsed / this.intervalMs); + this.tokens = Math.min( + this.maxRequests, + this.tokens + periods * this.maxRequests, + ); + this.lastRefillAt += periods * this.intervalMs; + } + + async acquire(): Promise { + while (true) { + this.refill(); + if (this.tokens > 0) { + this.tokens -= 1; + return; + } + await sleep(Math.min(250, this.intervalMs)); + } + } +} diff --git a/backend/BigSet_Data_Collection_Agent/src/queue/retry.ts b/backend/BigSet_Data_Collection_Agent/src/queue/retry.ts new file mode 100644 index 0000000..dd9e8e5 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/queue/retry.ts @@ -0,0 +1,55 @@ +export function isRetryableError(error: unknown): boolean { + if (error && typeof error === "object" && "status" in error) { + const status = (error as { status: number }).status; + if (status === 429 || status === 502 || status === 503 || status === 504) { + return true; + } + } + + const message = + error instanceof Error + ? error.message + : typeof error === "string" + ? 
error + : JSON.stringify(error); + + return /429|502|503|504|timeout|timed out|ECONNRESET|ETIMEDOUT|rate limit|temporarily unavailable/i.test( + message, + ); +} + +export async function sleep(ms: number): Promise { + await new Promise((resolve) => setTimeout(resolve, ms)); +} + +export async function withRetry( + fn: () => Promise, + options: { + maxRetries: number; + baseDelayMs: number; + label?: string; + }, +): Promise { + let lastError: unknown; + + for (let attempt = 0; attempt <= options.maxRetries; attempt++) { + try { + return await fn(); + } catch (error) { + lastError = error; + if (!isRetryableError(error) || attempt >= options.maxRetries) { + throw error; + } + const delay = options.baseDelayMs * 2 ** attempt; + const label = options.label ? ` (${options.label})` : ""; + console.warn( + `[retry]${label} attempt ${attempt + 1}/${options.maxRetries} failed, retrying in ${delay}ms: ${ + error instanceof Error ? error.message : String(error) + }`, + ); + await sleep(delay); + } + } + + throw lastError; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/queue/task-queue.ts b/backend/BigSet_Data_Collection_Agent/src/queue/task-queue.ts new file mode 100644 index 0000000..e3327a0 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/queue/task-queue.ts @@ -0,0 +1,79 @@ +import type { DomainThrottle } from "./domain-throttle.js"; +import type { RateLimiter } from "./rate-limiter.js"; +import { withRetry } from "./retry.js"; + +export interface TaskQueueOptions { + name: string; + concurrency: number; + maxRetries?: number; + retryBaseDelayMs?: number; + rateLimiter?: RateLimiter; + domainThrottle?: DomainThrottle; +} + +export class TaskQueue { + private readonly maxRetries: number; + private readonly retryBaseDelayMs: number; + + constructor(private readonly options: TaskQueueOptions) { + this.maxRetries = options.maxRetries ?? 0; + this.retryBaseDelayMs = options.retryBaseDelayMs ?? 
1000; + } + + /** + * Run handler for each item with bounded concurrency, optional rate limit, + * per-domain throttle, and retries on transient failures. + */ + async runAll( + items: T[], + handler: (item: T, index: number) => Promise, + getDomains?: (item: T) => string[], + ): Promise { + if (items.length === 0) return []; + + const results = new Array(items.length); + let nextIndex = 0; + + const runOne = async (index: number, item: T): Promise => { + const execute = async (): Promise => { + if (this.options.rateLimiter) { + await this.options.rateLimiter.acquire(); + } + + const runHandler = () => handler(item, index); + + if (this.options.domainThrottle && getDomains) { + const domains = getDomains(item); + return this.options.domainThrottle.withDomains(domains, runHandler); + } + + return runHandler(); + }; + + const wrapped = () => + withRetry(execute, { + maxRetries: this.maxRetries, + baseDelayMs: this.retryBaseDelayMs, + label: `${this.options.name}#${index}`, + }); + + results[index] = await wrapped(); + }; + + async function worker(): Promise { + while (true) { + const index = nextIndex; + nextIndex += 1; + if (index >= items.length) return; + await runOne(index, items[index]!); + } + } + + const workers = Array.from( + { length: Math.min(this.options.concurrency, items.length) }, + () => worker(), + ); + await Promise.all(workers); + return results; + } +} diff --git a/backend/BigSet_Data_Collection_Agent/src/storage/run-loader.ts b/backend/BigSet_Data_Collection_Agent/src/storage/run-loader.ts new file mode 100644 index 0000000..e857630 --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/storage/run-loader.ts @@ -0,0 +1,90 @@ +import { readFile } from "node:fs/promises"; +import { join } from "node:path"; +import { workflowMemorySchema, type WorkflowMemory } from "../memory/types.js"; +import { + datasetSpecSchema, + extractedRecordSchema, + runReportSchema, + type DatasetSpec, + type ExtractedRecord, + type RunReport, +} from 
/**
 * Load extracted records from a JSONL evidence file.
 *
 * Each non-blank line is parsed as JSON and validated with
 * `extractedRecordSchema`. Missing `evidence` / `source_urls` default to `[]`.
 *
 * @param path Path of the `.jsonl` file to read.
 * @returns The records that parsed and validated. Returns `[]` when the file
 *   cannot be read (for example, it does not exist).
 */
export async function loadRecordsFromEvidence(
  path: string,
): Promise<ExtractedRecord[]> {
  let raw: string;
  try {
    raw = await readFile(path, "utf8");
  } catch {
    // An absent or unreadable evidence file is an expected state for callers,
    // which treat an empty result as "nothing to load".
    return [];
  }

  const records: ExtractedRecord[] = [];
  const lines = raw.split("\n");

  for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
    const line = lines[lineIndex]!;
    if (line.trim().length === 0) continue;

    // Parse each line on its own. Previously one malformed or schema-invalid
    // line threw into a single outer catch and every valid record was lost.
    try {
      const parsed = JSON.parse(line) as {
        row: ExtractedRecord["row"];
        evidence: ExtractedRecord["evidence"];
        source_urls: string[];
        extraction_confidence?: number;
      };
      records.push(
        extractedRecordSchema.parse({
          row: parsed.row,
          evidence: parsed.evidence ?? [],
          source_urls: parsed.source_urls ?? [],
          extraction_confidence: parsed.extraction_confidence,
        }),
      );
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(
        `[run-loader] skipping invalid evidence line ${lineIndex + 1} in ${path}: ${reason}`,
      );
    }
  }

  return records;
}
/**
 * Create the on-disk layout for one run and return every artifact path.
 *
 * Only `<baseDir>/<runId>/pages` is created (recursively, which also creates
 * the run root); the files named in the result are not written here.
 *
 * @param baseDir Directory that holds all runs.
 * @param runId Name of the run's sub-directory.
 * @returns Paths for every artifact of the run.
 */
export async function createRunStore(
  baseDir: string,
  runId: string,
): Promise<RunPaths> {
  const root = join(baseDir, runId);
  const pagesDir = join(root, "pages");
  await mkdir(pagesDir, { recursive: true });

  const inRoot = (fileName: string): string => join(root, fileName);

  return {
    runId,
    root,
    pagesDir,
    specPath: inRoot("dataset_spec.json"),
    candidatesPath: inRoot("source_candidates.json"),
    resultsPath: inRoot("results.csv"),
    resultsFullPath: inRoot("results_full.csv"),
    evidencePath: inRoot("evidence.jsonl"),
    evidenceFullPath: inRoot("evidence_full.jsonl"),
    initResultsPath: inRoot("init_results.csv"),
    initEvidencePath: inRoot("init_evidence.jsonl"),
    repairResultsPath: inRoot("repair_results.csv"),
    repairEvidencePath: inRoot("repair_evidence.jsonl"),
    reportPath: inRoot("run_report.json"),
  };
}

/**
 * Write `data` as 2-space-indented JSON followed by a trailing newline.
 *
 * @param path Destination file; its directory must already exist.
 * @param data Any JSON-serialisable value.
 */
export async function saveJson(path: string, data: unknown): Promise<void> {
  const serialized = JSON.stringify(data, null, 2);
  await writeFile(path, `${serialized}\n`, "utf8");
}

/** Persist the dataset spec to `paths.specPath`. */
export async function saveDatasetSpec(
  paths: RunPaths,
  spec: DatasetSpec,
): Promise<void> {
  await saveJson(paths.specPath, spec);
}

/** Persist the source candidates to `paths.candidatesPath`. */
export async function saveSourceCandidates(
  paths: RunPaths,
  candidates: SourceCandidate[],
): Promise<void> {
  await saveJson(paths.candidatesPath, candidates);
}

/**
 * Persist one fetched page as `<slug>.meta.json` (metadata) and `<slug>.md`
 * (page text) inside `paths.pagesDir`.
 *
 * @param paths Run layout returned by `createRunStore`.
 * @param page Fetched page; a falsy `text` is written as an empty file.
 * @param index Page position; zero-padded to at least three digits for the slug.
 */
export async function saveFetchedPage(
  paths: RunPaths,
  page: FetchedPage,
  index: number,
): Promise<void> {
  const slug = String(index).padStart(3, "0");
  const { url, final_url, title, description, error } = page;

  await saveJson(join(paths.pagesDir, `${slug}.meta.json`), {
    url,
    final_url,
    title,
    description,
    error,
  });
  await writeFile(join(paths.pagesDir, `${slug}.md`), page.text || "", "utf8");
}

/** Persist the run report to `paths.reportPath`. */
export async function saveRunReport(
  paths: RunPaths,
  report: RunReport,
): Promise<void> {
  await saveJson(paths.reportPath, report);
}
/**
 * Map `fn` over `items` with at most `concurrency` calls in flight and
 * return the results in input order.
 *
 * @param items Values to process; an empty array resolves to `[]`.
 * @param concurrency Maximum number of simultaneous `fn` calls. Values below
 *   1 (or NaN) are treated as 1; fractional values are floored.
 * @param fn Async mapper receiving the item and its index.
 * @returns Results positioned at the same index as their input item.
 * @throws Rejects with the first error thrown by `fn`.
 */
export async function mapWithConcurrency<T, R>(
  items: T[],
  concurrency: number,
  fn: (item: T, index: number) => Promise<R>,
): Promise<R[]> {
  if (items.length === 0) return [];

  const results = new Array<R>(items.length);

  // With concurrency <= 0 or NaN the old code started zero workers and
  // resolved with an array of holes without ever calling `fn`.
  const requested = Math.floor(concurrency);
  const workerCount = Math.min(items.length, requested >= 1 ? requested : 1);

  let cursor = 0;
  const worker = async (): Promise<void> => {
    // Claiming an index is synchronous, so workers never share one.
    while (cursor < items.length) {
      const index = cursor;
      cursor += 1;
      results[index] = await fn(items[index]!, index);
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}

/**
 * Canonicalise a URL: drop the fragment and a single trailing slash from a
 * non-root path.
 *
 * @param url Candidate URL string.
 * @returns The serialised URL, or the trimmed input when it is not parseable.
 */
export function normalizeUrl(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return url.trim();
  }

  parsed.hash = "";
  const { pathname } = parsed;
  if (pathname.length > 1 && pathname.endsWith("/")) {
    parsed.pathname = pathname.slice(0, -1);
  }
  return parsed.toString();
}

/**
 * Extract the hostname of a URL without a leading `www.`.
 *
 * @param url Candidate URL string.
 * @returns The hostname, or the input unchanged when it is not parseable.
 */
export function getDomain(url: string): string {
  try {
    const { hostname } = new URL(url);
    return hostname.startsWith("www.") ? hostname.slice("www.".length) : hostname;
  } catch {
    return url;
  }
}
"https://github.com/sponsors/tannerlinsley" } }, + "node_modules/@tiny-fish/sdk": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/@tiny-fish/sdk/-/sdk-0.0.8.tgz", + "integrity": "sha512-GTIpIDcwYuCbtd1xcgf0JD81wbPWGY0mxiab9VepT1allNUfVvjWCKT1n8RypsrzXne39j5Ez3ILDBE4ZwlApQ==", + "dependencies": { + "p-retry": "^7.1.1", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/@types/babel__traverse": { "version": "7.28.0", "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", diff --git a/backend/package.json b/backend/package.json index f282ae5..f7784a9 100644 --- a/backend/package.json +++ b/backend/package.json @@ -16,6 +16,7 @@ "@fastify/cors": "^11.0.0", "@mastra/core": "^1.36.0", "@openrouter/ai-sdk-provider": "^2.9.0", + "@tiny-fish/sdk": "^0.0.8", "ai": "^6.0.0", "convex": "^1.39.1", "dotenv": "^16.4.0", diff --git a/backend/src/pipeline/collection-agent-runner.ts b/backend/src/pipeline/collection-agent-runner.ts new file mode 100644 index 0000000..19aaa29 --- /dev/null +++ b/backend/src/pipeline/collection-agent-runner.ts @@ -0,0 +1,311 @@ +import { mkdtemp } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join, resolve } from "node:path"; +import { pathToFileURL } from "node:url"; + +import type { + CollectionPopulatePipelineInput, + CollectionPopulatePipelineRunner, +} from "./populate-collection-runtime.js"; +import type { + PopulateCellValue, + PopulateRuntimeResult, +} from "./populate-runtime.js"; + +type CollectionPipelineModule = { + runPipeline(input: CollectionPipelineOptions): Promise; +}; + +interface CollectionPipelineOptions { + prompt: string; + targetRows: number; + outputDir: string; + memoryDir?: string; + enableRepair?: boolean; + enableTriage?: boolean; + enableTinyfishAgent?: boolean; + benchmark?: { + promptId?: string; + promptQuality?: string; + persona?: string; + expectedStress?: string; + requiredColumns: string[]; + }; + onLog?: 
(stage: string, message: string) => void; +} + +interface CollectionPipelineResult { + report: { + errors?: string[]; + dataset_spec?: CollectionDatasetSpec; + stats?: CollectionPhaseStats; + initial?: CollectionPhaseStats; + repair?: { + stats?: CollectionPhaseStats; + }; + quality?: { + records?: CollectionRecordQuality[]; + }; + llm_usage?: { + prompt_tokens?: number; + completion_tokens?: number; + total_tokens?: number; + }; + }; + records?: CollectionExtractedRecord[]; + visualizationRecords?: CollectionExtractedRecord[]; + llmUsage?: { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; + }; +} + +interface CollectionDatasetSpec { + columns?: Array<{ name: string }>; + dedupe_keys?: string[]; +} + +interface CollectionPhaseStats { + search_queries_executed?: number; + pages_fetched?: number; + triage?: { + agent_dispatched?: number; + agent_succeeded?: number; + agent_failed?: number; + }; +} + +interface CollectionExtractedRecord { + row?: Record; + source_urls?: string[]; + evidence?: Array<{ + field?: string; + url?: string; + quote?: string; + }>; +} + +interface CollectionRecordQuality { + record_id?: string; + needs_review?: boolean; +} + +export const runCollectionPopulatePipeline: CollectionPopulatePipelineRunner = + async (input) => { + const outputDir = await mkdtemp(join(tmpdir(), "bigset-collection-")); + const pipeline = await loadCollectionPipelineModule(); + const result = await pipeline.runPipeline({ + prompt: input.prompt, + targetRows: input.targetRows, + outputDir, + memoryDir: join(outputDir, "memory"), + enableRepair: boolEnv("COLLECTION_AGENT_ENABLE_REPAIR", false), + enableTriage: boolEnv("COLLECTION_AGENT_ENABLE_TRIAGE", true), + enableTinyfishAgent: boolEnv("COLLECTION_AGENT_ENABLE_AGENT", true), + benchmark: benchmarkContextFromInput(input), + onLog: (stage, message) => { + console.error(`[collection:${stage}] ${message}`); + }, + }); + + return collectionPipelineResultToPopulateRuntimeResult({ + pipeline: 
/**
 * Dynamically import the collection pipeline module.
 *
 * The specifier comes from `COLLECTION_AGENT_PIPELINE_MODULE`, falling back
 * to the vendored `pipeline.ts` resolved relative to this file. Specifiers
 * starting with `.` or `/` are treated as filesystem paths (resolved against
 * the current working directory) and converted to `file:` URLs; anything
 * else is passed to `import()` unchanged.
 *
 * @returns The loaded module, which is checked to export `runPipeline`.
 * @throws Error when the module does not export a `runPipeline` function,
 *   or whatever `import()` rejects with.
 */
async function loadCollectionPipelineModule(): Promise<CollectionPipelineModule> {
  // `||` instead of `??`: a variable that is set but empty (or blank) must
  // fall back to the default rather than reach `import("")`.
  const moduleSpecifier =
    process.env.COLLECTION_AGENT_PIPELINE_MODULE?.trim() ||
    new URL(
      "../../BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts",
      import.meta.url
    ).href;

  const looksLikePath =
    moduleSpecifier.startsWith(".") || moduleSpecifier.startsWith("/");
  const moduleUrl = looksLikePath
    ? pathToFileURL(resolve(moduleSpecifier)).href
    : moduleSpecifier;

  const loadedModule = await import(moduleUrl);
  if (typeof loadedModule.runPipeline !== "function") {
    throw new Error(
      `${moduleSpecifier} must export runPipeline(options).`
    );
  }
  return loadedModule as CollectionPipelineModule;
}

/**
 * Build the `benchmark` option for the pipeline from the populate input.
 *
 * @param input Populate pipeline input.
 * @returns `undefined` when no required columns are given, otherwise the
 *   prompt metadata together with the required columns.
 */
function benchmarkContextFromInput(input: CollectionPopulatePipelineInput) {
  const { promptId, promptQuality, persona, expectedStress, requiredColumns } =
    input;
  if (requiredColumns.length === 0) {
    return undefined;
  }
  return { promptId, promptQuality, persona, expectedStress, requiredColumns };
}
/**
 * Choose which record list becomes the output rows: the non-empty
 * `visualizationRecords` when present, otherwise `records` (or `[]`).
 */
function selectOutputRecords(
  pipeline: CollectionPipelineResult
): CollectionExtractedRecord[] {
  const preferred = pipeline.visualizationRecords;
  if (preferred !== undefined && preferred.length > 0) {
    return preferred;
  }
  return pipeline.records ?? [];
}

/**
 * Convert one extracted record into a populate row.
 *
 * - `cells` is a copy of `record.row` with every required column present
 *   (`null` when the record did not provide it).
 * - `sourceUrls` keeps only distinct http(s) URLs.
 * - `evidence` keeps items that have both a field name and a quote; an item
 *   without its own URL falls back to the first source URL, then to `""`.
 * - `needsReview` comes from the quality entry matching the record's
 *   canonical id, defaulting to `false`.
 */
function collectionRecordToPopulateRow(input: {
  record: CollectionExtractedRecord;
  spec?: CollectionDatasetSpec;
  requiredColumns: string[];
  qualityById: Map<string, CollectionRecordQuality>;
}) {
  const { record, spec, requiredColumns, qualityById } = input;

  // NOTE(review): the value type is assumed to be PopulateCellValue (the type
  // this file imports from populate-runtime); confirm against the repository.
  const cells: Record<string, PopulateCellValue> = {
    ...(record.row ?? {}),
  };
  for (const columnName of requiredColumns) {
    if (cells[columnName] === undefined) {
      cells[columnName] = null;
    }
  }

  const sourceUrls = uniqueHttpUrls(record.source_urls ?? []);
  const fallbackUrl = sourceUrls[0] || "";

  const evidence = [];
  for (const item of record.evidence ?? []) {
    const columnName = item.field ?? "";
    const quote = item.quote ?? "";
    if (!columnName || !quote) continue;
    evidence.push({
      columnName,
      sourceUrl: item.url || fallbackUrl,
      quote,
    });
  }

  const recordId = canonicalRecordId(record, spec);
  const quality = recordId ? qualityById.get(recordId) : undefined;

  return {
    cells,
    sourceUrls,
    evidence,
    needsReview: quality?.needs_review ?? false,
  };
}

/**
 * Index quality entries by `record_id`, skipping entries without one.
 * When ids repeat, the later entry wins.
 */
function qualityByRecordId(
  records: CollectionRecordQuality[] = []
): Map<string, CollectionRecordQuality> {
  const byId = new Map<string, CollectionRecordQuality>();
  for (const entry of records) {
    if (entry.record_id) {
      byId.set(entry.record_id, entry);
    }
  }
  return byId;
}
/**
 * Pick the token usage to report: the run-scoped `llmUsage` when it carries a
 * non-zero total, otherwise the report-level `llm_usage`. Missing numbers
 * become 0.
 */
function usageFromPipeline(pipeline: CollectionPipelineResult) {
  const scoped = pipeline.llmUsage;
  if (scoped?.totalTokens) {
    return {
      promptTokens: scoped.promptTokens ?? 0,
      completionTokens: scoped.completionTokens ?? 0,
      totalTokens: scoped.totalTokens,
    };
  }

  const fromReport = pipeline.report.llm_usage;
  return {
    promptTokens: fromReport?.prompt_tokens ?? 0,
    completionTokens: fromReport?.completion_tokens ?? 0,
    totalTokens: fromReport?.total_tokens ?? 0,
  };
}

/**
 * Derive call/agent metrics from the pipeline report.
 *
 * Agent counts are the sum of the initial and repair phase triage blocks.
 * `stats.triage` is used only when neither phase block exists, so a report
 * that carries both an aggregate and a repair block is not counted twice.
 * `agentRuns` equals the number of dispatched agents; it is 0 when none ran.
 */
function metricsFromReport(report: CollectionPipelineResult["report"]) {
  const stats = report.stats ?? {};
  const initialTriage = report.initial?.triage;
  const repairTriage = report.repair?.stats?.triage;

  const hasPhaseBreakdown =
    initialTriage !== undefined || repairTriage !== undefined;
  const phases = hasPhaseBreakdown
    ? [initialTriage ?? {}, repairTriage ?? {}]
    : [stats.triage ?? {}];

  let agentDispatched = 0;
  let agentSteps = 0;
  for (const triage of phases) {
    agentDispatched += numberValue(triage.agent_dispatched);
    agentSteps +=
      numberValue(triage.agent_succeeded) + numberValue(triage.agent_failed);
  }

  return {
    searchCalls: numberValue(stats.search_queries_executed),
    fetchCalls: numberValue(stats.pages_fetched),
    browserCalls: agentDispatched,
    agentRuns: agentDispatched,
    agentSteps,
  };
}

/** Keep distinct string entries that start with `http://` or `https://`, in first-seen order. */
function uniqueHttpUrls(urls: string[]): string[] {
  const seen = new Set<string>();
  for (const url of urls) {
    if (typeof url === "string" && /^https?:\/\//i.test(url)) {
      seen.add(url);
    }
  }
  return Array.from(seen);
}

/**
 * Normalise a primary-key value for matching: stringify, trim, lower-case
 * and collapse whitespace runs to one space. `null`/`undefined` give `""`.
 */
function normalizePrimaryKey(value: unknown): string {
  if (value === null || value === undefined) {
    return "";
  }
  return String(value).trim().toLowerCase().replace(/\s+/g, " ");
}

/** Return `value` when it is a finite number, otherwise 0. */
function numberValue(value: unknown): number {
  return typeof value === "number" && Number.isFinite(value) ? value : 0;
}

/**
 * Read a boolean flag from the environment.
 *
 * @param name Environment variable name.
 * @param fallback Value used when the variable is unset, empty or blank.
 * @returns `true` for `1`, `true`, `yes` or `on` (case-insensitive, ignoring
 *   surrounding whitespace); `false` for any other non-blank value.
 */
function boolEnv(name: string, fallback: boolean): boolean {
  // Trim first: values such as "true " should not silently read as false.
  const normalized = process.env[name]?.trim().toLowerCase();
  if (!normalized) {
    return fallback;
  }
  return ["1", "true", "yes", "on"].includes(normalized);
}
+ assert.equal(result.rows[0]?.needsReview, true); + assert.deepEqual(result.validationIssues, []); + assert.deepEqual(result.usage, { + promptTokens: 11, + completionTokens: 7, + totalTokens: 18, + }); + assert.equal(result.metrics.searchCalls, 2); + assert.equal(result.metrics.fetchCalls, 3); + assert.equal(result.metrics.browserCalls, 1); + } finally { + if (previousModule === undefined) { + delete process.env.COLLECTION_AGENT_PIPELINE_MODULE; + } else { + process.env.COLLECTION_AGENT_PIPELINE_MODULE = previousModule; + } + } +}); + +function fakeCollectionPipelineModuleUrl(): string { + const source = ` + export async function runPipeline(options) { + if (!options.prompt.includes("Durable recipe instructions")) { + throw new Error("recipe instructions missing from prompt"); + } + if (!options.memoryDir || !options.memoryDir.includes("memory")) { + throw new Error("isolated memory dir missing"); + } + if (options.benchmark?.promptId !== "latest-ai-blog-posts") { + throw new Error("prompt id missing from benchmark context"); + } + if (options.benchmark?.persona !== "technical operator") { + throw new Error("persona missing from benchmark context"); + } + if (options.benchmark?.requiredColumns?.join(",") !== "entity_name,source_url,evidence_quote") { + throw new Error("required columns missing from benchmark context"); + } + return { + report: { + errors: [], + dataset_spec: { + columns: [{ name: "entity_name" }], + dedupe_keys: ["entity_name"], + }, + stats: { + search_queries_executed: 2, + pages_fetched: 3, + triage: { + agent_dispatched: 1, + agent_succeeded: 1, + agent_failed: 0, + }, + }, + initial: { + triage: { + agent_dispatched: 1, + agent_succeeded: 1, + agent_failed: 0, + }, + }, + repair: { + stats: { + triage: { + agent_dispatched: 0, + agent_succeeded: 0, + agent_failed: 0, + }, + }, + }, + quality: { + records: [{ record_id: "pk:openai", needs_review: true }], + }, + llm_usage: { + prompt_tokens: 1, + completion_tokens: 1, + total_tokens: 2, + }, + 
}, + records: [{ + row: { + entity_name: "OpenAI", + source_url: "https://openai.com/news", + evidence_quote: options.benchmark.persona, + }, + source_urls: ["https://openai.com/news"], + evidence: [{ + field: "entity_name", + url: "https://openai.com/news", + quote: options.benchmark.expectedStress, + }], + }], + llmUsage: { + promptTokens: 11, + completionTokens: 7, + totalTokens: 18, + }, + }; + } + `; + return `data:text/javascript,${encodeURIComponent(source)}`; +} diff --git a/benchmarks/dataset-agent/run-benchmark.mjs b/benchmarks/dataset-agent/run-benchmark.mjs index 2de1099..a84fa63 100755 --- a/benchmarks/dataset-agent/run-benchmark.mjs +++ b/benchmarks/dataset-agent/run-benchmark.mjs @@ -1582,6 +1582,12 @@ function failureReason({ if (answerKeyScore.failureCategory === "source_evidence") { return `Source/domain evidence failed; factual accuracy ${answerKeyScore.factualAccuracyScore}, domain accuracy ${answerKeyScore.domainAccuracyRatio}.`; } + if (answerKeyScore.entityCoverageRatio < 1) { + return `Entity coverage ${answerKeyScore.entityCoverageRatio} below required coverage; missing entities: ${answerKeyScore.missingExpectedEntities.join(", ") || "none"}.`; + } + if (answerKeyScore.claimSupportRatio < 1) { + return `Claim support ${answerKeyScore.claimSupportRatio} below required support; missing required claim text for: ${answerKeyScore.missingExpectedEntities.join(", ") || "none"}.`; + } return `Factual accuracy ${answerKeyScore.factualAccuracyScore} below ${answerKeyScore.minimumScore}; missing entities: ${answerKeyScore.missingExpectedEntities.join(", ") || "none"}.`; } return "Benchmark failed."; From d476174e1d64edd6959a7e177b6435da357b8be5 Mon Sep 17 00:00:00 2001 From: Edward Tran Date: Fri, 22 May 2026 22:35:34 +0700 Subject: [PATCH 2/2] Harden collection runner wiring --- .../src/pipeline/collection-agent-runner.ts | 20 +++++++++---------- backend/test/collection-agent-runner.test.ts | 10 ++++++---- benchmarks/dataset-agent/README.md | 11 
++++++---- benchmarks/dataset-agent/run-benchmark.mjs | 15 +++++++++----- docs/data-collection-agent-migration-plan.md | 2 ++ 5 files changed, 34 insertions(+), 24 deletions(-) diff --git a/backend/src/pipeline/collection-agent-runner.ts b/backend/src/pipeline/collection-agent-runner.ts index 19aaa29..67b7eba 100644 --- a/backend/src/pipeline/collection-agent-runner.ts +++ b/backend/src/pipeline/collection-agent-runner.ts @@ -116,12 +116,12 @@ export const runCollectionPopulatePipeline: CollectionPopulatePipelineRunner = }; async function loadCollectionPipelineModule(): Promise { - const moduleSpecifier = - process.env.COLLECTION_AGENT_PIPELINE_MODULE ?? - new URL( - "../../BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts", - import.meta.url - ).href; + const moduleSpecifier = process.env.COLLECTION_AGENT_PIPELINE_MODULE; + if (!moduleSpecifier) { + throw new Error( + "COLLECTION_AGENT_PIPELINE_MODULE must point to the collection pipeline module exporting runPipeline(options)." + ); + } const moduleUrl = moduleSpecifier.startsWith(".") || moduleSpecifier.startsWith("/") ? pathToFileURL(resolve(moduleSpecifier)).href : moduleSpecifier; @@ -263,10 +263,8 @@ function usageFromPipeline(pipeline: CollectionPipelineResult) { function metricsFromReport(report: CollectionPipelineResult["report"]) { const stats = report.stats ?? {}; const initialTriage = report.initial?.triage ?? {}; - const statsTriage = stats.triage ?? {}; const repairTriage = report.repair?.stats?.triage ?? {}; const agentDispatched = - numberValue(statsTriage.agent_dispatched) || numberValue(initialTriage.agent_dispatched) + numberValue(repairTriage.agent_dispatched); @@ -274,10 +272,10 @@ function metricsFromReport(report: CollectionPipelineResult["report"]) { searchCalls: numberValue(stats.search_queries_executed), fetchCalls: numberValue(stats.pages_fetched), browserCalls: agentDispatched, - agentRuns: agentDispatched > 0 ? 
agentDispatched : 1, + agentRuns: agentDispatched, agentSteps: - numberValue(statsTriage.agent_succeeded) + - numberValue(statsTriage.agent_failed) + + numberValue(initialTriage.agent_succeeded) + + numberValue(initialTriage.agent_failed) + numberValue(repairTriage.agent_succeeded) + numberValue(repairTriage.agent_failed), }; diff --git a/backend/test/collection-agent-runner.test.ts b/backend/test/collection-agent-runner.test.ts index aaf9e0a..0d68cc6 100644 --- a/backend/test/collection-agent-runner.test.ts +++ b/backend/test/collection-agent-runner.test.ts @@ -46,7 +46,9 @@ test("collection agent runner maps vendored pipeline output into populate runtim }); assert.equal(result.metrics.searchCalls, 2); assert.equal(result.metrics.fetchCalls, 3); - assert.equal(result.metrics.browserCalls, 1); + assert.equal(result.metrics.browserCalls, 3); + assert.equal(result.metrics.agentRuns, 3); + assert.equal(result.metrics.agentSteps, 3); } finally { if (previousModule === undefined) { delete process.env.COLLECTION_AGENT_PIPELINE_MODULE; @@ -100,9 +102,9 @@ function fakeCollectionPipelineModuleUrl(): string { repair: { stats: { triage: { - agent_dispatched: 0, - agent_succeeded: 0, - agent_failed: 0, + agent_dispatched: 2, + agent_succeeded: 1, + agent_failed: 1, }, }, }, diff --git a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md index 94525f4..e9a56d7 100644 --- a/benchmarks/dataset-agent/README.md +++ b/benchmarks/dataset-agent/README.md @@ -29,16 +29,19 @@ That means collection results are scored after the same recipe generation, repair, validation, and promotion path as the app runtime. 
```bash +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts \ +BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts \ node benchmarks/dataset-agent/run-benchmark.mjs \ --prompt-ids latest-ai-blog-posts,saas-pricing-pages \ --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs' ``` Real collection benchmark runs require `OPENROUTER_API_KEY`, -`TINYFISH_API_KEY`, and `BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE` loaded in -the shell. The runner module must export `runCollectionPopulatePipeline(input)` -or a default runner that accepts `CollectionPopulatePipelineInput` and returns a -`PopulateRuntimeResult`. +`TINYFISH_API_KEY`, `BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE`, and +`COLLECTION_AGENT_PIPELINE_MODULE` loaded in the shell. The benchmark runner +module must export `runCollectionPopulatePipeline(input)` or a default runner +that accepts `CollectionPopulatePipelineInput` and returns a +`PopulateRuntimeResult`. The pipeline module must export `runPipeline(options)`. App and CLI collection-runtime runs use the same runner shape, but load it from `POPULATE_COLLECTION_RUNNER_MODULE` when `POPULATE_AGENT_RUNTIME=collection`. 
diff --git a/benchmarks/dataset-agent/run-benchmark.mjs b/benchmarks/dataset-agent/run-benchmark.mjs index a84fa63..552a311 100755 --- a/benchmarks/dataset-agent/run-benchmark.mjs +++ b/benchmarks/dataset-agent/run-benchmark.mjs @@ -605,6 +605,7 @@ async function runSystemPrompt(input) { abstentionScore: answerKeyScore.abstentionScore, matchedExpectedEntities: answerKeyScore.matchedExpectedEntities, missingExpectedEntities: answerKeyScore.missingExpectedEntities, + missingClaimSupportEntities: answerKeyScore.missingClaimSupportEntities, latencyMs: Date.now() - startedAt, exitCode: execution.exitCode, timedOut: execution.timedOut, @@ -1093,6 +1094,7 @@ async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { abstentionScore: answerKeyScore.abstentionScore, matchedExpectedEntities: answerKeyScore.matchedExpectedEntities, missingExpectedEntities: answerKeyScore.missingExpectedEntities, + missingClaimSupportEntities: answerKeyScore.missingClaimSupportEntities, rowCount: validation.rowCount, nonEmptyCellCount: validation.nonEmptyCellCount, totalExpectedCellCount: validation.totalExpectedCellCount, @@ -1137,6 +1139,7 @@ function scoreBenchmarkRows(input) { const expectedEntities = answerKey.expectedEntities ?? 
[]; const matchedExpectedEntities = []; const missingExpectedEntities = []; + const missingClaimSupportEntities = []; let expectedEntityDomainMatches = 0; let expectedEntityClaimMatches = 0; @@ -1157,11 +1160,12 @@ function scoreBenchmarkRows(input) { if (rowsToCheck.some((row) => rowHasAllowedDomain(row, expectedEntity.allowedSourceDomains))) { expectedEntityDomainMatches += 1; } - if ( - !expectedEntity.requiredText?.length || - rowsToCheck.some((row) => textContainsAny(rowSearchText(row), expectedEntity.requiredText)) - ) { + const hasRequiredClaimText = !expectedEntity.requiredText?.length || + rowsToCheck.some((row) => textContainsAny(rowSearchText(row), expectedEntity.requiredText)); + if (hasRequiredClaimText) { expectedEntityClaimMatches += 1; + } else { + missingClaimSupportEntities.push(expectedEntity.label ?? expectedEntity.id); } } @@ -1241,6 +1245,7 @@ function scoreBenchmarkRows(input) { abstentionScore, matchedExpectedEntities, missingExpectedEntities, + missingClaimSupportEntities, minimumScore, }; } @@ -1586,7 +1591,7 @@ function failureReason({ return `Entity coverage ${answerKeyScore.entityCoverageRatio} below required coverage; missing entities: ${answerKeyScore.missingExpectedEntities.join(", ") || "none"}.`; } if (answerKeyScore.claimSupportRatio < 1) { - return `Claim support ${answerKeyScore.claimSupportRatio} below required support; missing required claim text for: ${answerKeyScore.missingExpectedEntities.join(", ") || "none"}.`; + return `Claim support ${answerKeyScore.claimSupportRatio} below required support; missing required claim text for: ${(answerKeyScore.missingClaimSupportEntities ?? 
[]).join(", ") || "none"}.`; } return `Factual accuracy ${answerKeyScore.factualAccuracyScore} below ${answerKeyScore.minimumScore}; missing entities: ${answerKeyScore.missingExpectedEntities.join(", ") || "none"}.`; } diff --git a/docs/data-collection-agent-migration-plan.md b/docs/data-collection-agent-migration-plan.md index 6531984..7110815 100644 --- a/docs/data-collection-agent-migration-plan.md +++ b/docs/data-collection-agent-migration-plan.md @@ -198,6 +198,7 @@ benchmark can stop measuring the same task. The real benchmark command after a runner module exists is: ```bash +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts \ BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts \ node benchmarks/dataset-agent/run-benchmark.mjs \ --prompt-ids latest-ai-blog-posts,saas-pricing-pages \ @@ -234,6 +235,7 @@ When testing the real app or CLI path, set: ```bash POPULATE_AGENT_RUNTIME=collection POPULATE_COLLECTION_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts +COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts ``` Do not switch the default runtime from Mastra to collection until the