diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts index ef81d2a..4009569 100644 --- a/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts +++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts @@ -31,6 +31,7 @@ import { join } from "node:path"; export interface AgentDeferredEntry { url: string; status: SourceStatus; + reason: "agent_budget" | "agent_disabled"; } export interface ProcessPagesResult { @@ -216,6 +217,7 @@ export async function processFetchedPages(options: { const extractPages: { page: FetchedPage; triage: SourceTriageResult }[] = []; const agentQueue: { page: FetchedPage; triage: SourceTriageResult }[] = []; + const agentDisabledDeferredEntries: AgentDeferredEntry[] = []; for (const triage of triageResults) { bumpStatus(summary, triage.status); @@ -241,6 +243,11 @@ export async function processFetchedPages(options: { extractPages.push({ page, triage }); } else if (sourcePolicy.requiresOfficialSource) { summary.skipped += 1; + agentDisabledDeferredEntries.push({ + url: triage.final_url || page.url, + status: triage.status, + reason: "agent_disabled", + }); options.log( options.label, `Agent disabled — skip navigation-only official source ${triage.final_url} [${triage.status}]`, @@ -296,17 +303,21 @@ export async function processFetchedPages(options: { const agentBudget = agentEnabled ? 
config.maxAgentRunsPerPhase : 0; const toRun = agentQueue.slice(0, agentBudget); - const deferredEntries: AgentDeferredEntry[] = agentQueue - .slice(agentBudget) - .map(({ page, triage }) => ({ - url: triage.final_url || page.url, - status: triage.status, - })); + const deferredEntries: AgentDeferredEntry[] = [ + ...agentDisabledDeferredEntries, + ...agentQueue + .slice(agentBudget) + .map(({ page, triage }) => ({ + url: triage.final_url || page.url, + status: triage.status, + reason: "agent_budget" as const, + })), + ]; if (deferredEntries.length > 0) { options.log( options.label, - `Agent budget: running ${toRun.length}/${agentQueue.length} (${deferredEntries.length} deferred)`, + `Agent capability: running ${toRun.length}/${agentQueue.length} (${deferredEntries.length} deferred)`, ); } diff --git a/backend/BigSet_Data_Collection_Agent/src/quality/build-report.ts b/backend/BigSet_Data_Collection_Agent/src/quality/build-report.ts index 5f1442e..dac45d9 100644 --- a/backend/BigSet_Data_Collection_Agent/src/quality/build-report.ts +++ b/backend/BigSet_Data_Collection_Agent/src/quality/build-report.ts @@ -86,7 +86,11 @@ export interface BuildSourcesOptions { fetchedUrls: string[]; triageResults: SourceTriageResult[]; agentRuns: AgentRunRecord[]; - agentDeferred: { url: string; status: string }[]; + agentDeferred: { + url: string; + status: string; + reason?: "agent_budget" | "agent_disabled"; + }[]; } export function buildSourcesReport( @@ -133,7 +137,9 @@ export function buildSourcesReport( phase: options.phase, outcome: "agent_deferred", triage_status: deferred.status, - error: "Exceeded MAX_AGENT_RUNS_PER_PHASE budget", + error: deferred.reason === "agent_disabled" + ? 
"TinyFish Agent disabled for browser/form/detail follow-up" + : "Exceeded MAX_AGENT_RUNS_PER_PHASE budget", }); } diff --git a/backend/src/pipeline/collection-agent-runner.ts b/backend/src/pipeline/collection-agent-runner.ts index bb9d90b..9321a06 100644 --- a/backend/src/pipeline/collection-agent-runner.ts +++ b/backend/src/pipeline/collection-agent-runner.ts @@ -47,6 +47,7 @@ interface CollectionPipelineResult { quality?: { records?: CollectionRecordQuality[]; }; + sources?: CollectionSourcesReport; llm_usage?: { prompt_tokens?: number; completion_tokens?: number; @@ -92,6 +93,21 @@ interface CollectionRecordQuality { needs_review?: boolean; } +interface CollectionSourcesReport { + outcomes?: CollectionSourceOutcome[]; +} + +interface CollectionSourceOutcome { + outcome?: string; + triage_status?: string; +} + +const AGENT_REQUIRED_TRIAGE_STATUSES = new Set([ + "requires_navigation", + "requires_form_submission", + "requires_detail_page_followup", +]); + const DEFAULT_COLLECTION_AGENT_POLL_TIMEOUT_MS = 480_000; export const runCollectionPopulatePipeline: CollectionPopulatePipelineRunner = @@ -119,6 +135,7 @@ export const runCollectionPopulatePipeline: CollectionPopulatePipelineRunner = return collectionPipelineResultToPopulateRuntimeResult({ pipeline: result, requiredColumns: input.requiredColumns, + enableTinyfishAgent, }); }; @@ -157,6 +174,7 @@ function benchmarkContextFromInput(input: CollectionPopulatePipelineInput) { function collectionPipelineResultToPopulateRuntimeResult(input: { pipeline: CollectionPipelineResult; requiredColumns: string[]; + enableTinyfishAgent: boolean; }): PopulateRuntimeResult { const records = selectOutputRecords(input.pipeline); const qualityById = qualityByRecordId(input.pipeline.report.quality?.records); @@ -168,11 +186,16 @@ function collectionPipelineResultToPopulateRuntimeResult(input: { qualityById, }) ); + const capabilityDiagnostics = capabilityDiagnosticsFromReport({ + report: input.pipeline.report, + enableTinyfishAgent: 
input.enableTinyfishAgent, + }); return { rows, validationIssues: [ ...(input.pipeline.report.errors ?? []), + ...capabilityDiagnostics, ...(rows.length === 0 ? ["No rows returned from collection pipeline."] : []), ], usage: usageFromPipeline(input.pipeline), @@ -180,6 +203,44 @@ function collectionPipelineResultToPopulateRuntimeResult(input: { }; } +/** Returns one capability-diagnostic message when the TinyFish Agent is disabled and source outcomes carry unresolved agent-required triage statuses; otherwise an empty list. */ +function capabilityDiagnosticsFromReport(input: { + report: CollectionPipelineResult["report"]; + enableTinyfishAgent: boolean; +}): string[] { + if (input.enableTinyfishAgent) { + return []; + } + const agentRequiredOutcomes = (input.report.sources?.outcomes ?? []).filter( + isAgentRequiredSourceOutcome + ); + if (agentRequiredOutcomes.length === 0) { + return []; + } + + const statusCounts = new Map<string, number>(); + for (const outcome of agentRequiredOutcomes) { + const status = outcome.triage_status; + statusCounts.set(status, (statusCounts.get(status) ?? 0) + 1); + } + const statusSummary = Array.from(statusCounts.entries()) + .map(([status, count]) => `${status}=${count}`) + .join(", "); + + return [ + `Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for ${agentRequiredOutcomes.length} page(s) (${statusSummary}). 
Enable COLLECTION_AGENT_ENABLE_AGENT=true for live navigation.`, + ]; +} + +/** Type guard: true for outcomes whose triage_status is in AGENT_REQUIRED_TRIAGE_STATUSES and whose outcome is not "success". */ +function isAgentRequiredSourceOutcome(outcome: CollectionSourceOutcome): outcome is CollectionSourceOutcome & { triage_status: string } { + return ( + typeof outcome.triage_status === "string" && + AGENT_REQUIRED_TRIAGE_STATUSES.has(outcome.triage_status) && + outcome.outcome !== "success" + ); +} + function selectOutputRecords( pipeline: CollectionPipelineResult ): CollectionExtractedRecord[] { diff --git a/backend/test/collection-agent-runner.test.ts b/backend/test/collection-agent-runner.test.ts index 5c16465..1b88c6e 100644 --- a/backend/test/collection-agent-runner.test.ts +++ b/backend/test/collection-agent-runner.test.ts @@ -83,6 +83,54 @@ test("collection agent runner requires explicit Agent opt-in and caps poll timeo } }); +test("collection agent runner surfaces Agent-required capability diagnostics from source outcomes", async () => { + const previousEnv = snapshotEnv([ + "AGENT_POLL_TIMEOUT_MS", + "COLLECTION_AGENT_ENABLE_AGENT", + "COLLECTION_AGENT_PIPELINE_MODULE", + "COLLECTION_AGENT_POLL_TIMEOUT_MS", + ]); + delete process.env.AGENT_POLL_TIMEOUT_MS; + delete process.env.COLLECTION_AGENT_ENABLE_AGENT; + delete process.env.COLLECTION_AGENT_POLL_TIMEOUT_MS; + process.env.COLLECTION_AGENT_PIPELINE_MODULE = fakeCollectionPipelineModuleUrl({ + expectedCalls: [{ agentEnabled: false }], + sources: { + outcomes: [ + { + outcome: "agent_deferred", + triage_status: "requires_navigation", + }, + { + outcome: "no_records", + triage_status: "requires_form_submission", + }, + { + outcome: "success", + triage_status: "requires_detail_page_followup", + }, + ], + }, + }); + + try { + const result = await runCollectionPopulatePipeline(collectionPipelineInput()); + const diagnostic = result.validationIssues.join("\n"); + + assert.equal(result.rows.length, 1); + assert.match(diagnostic, /Capability diagnostic: TinyFish Agent disabled/); + assert.match(diagnostic, /2 page\(s\)/); + assert.match(diagnostic, /requires_navigation=1/); + 
assert.match(diagnostic, /requires_form_submission=1/); + assert.doesNotMatch( + diagnostic, + /failed|missing|no rows|not found|invented|invalid/i + ); + } finally { + restoreEnv(previousEnv); + } +}); + function collectionPipelineInput() { return { datasetId: "dataset-ai-posts", @@ -116,6 +164,7 @@ function fakeCollectionPipelineModuleUrl(input: { agentEnabled: boolean; pollTimeoutMs?: number; }>; + sources?: unknown; }): string { const source = ` const moduleLoadPollTimeoutMs = process.env.AGENT_POLL_TIMEOUT_MS ?? null; @@ -187,6 +236,7 @@ function fakeCollectionPipelineModuleUrl(input: { quality: { records: [{ record_id: "pk:openai", needs_review: true }], }, + sources: ${JSON.stringify(input.sources ?? { outcomes: [] })}, llm_usage: { prompt_tokens: 1, completion_tokens: 1, diff --git a/backend/test/populate-collection-runtime.test.ts b/backend/test/populate-collection-runtime.test.ts index a9fd9e8..f195bc2 100644 --- a/backend/test/populate-collection-runtime.test.ts +++ b/backend/test/populate-collection-runtime.test.ts @@ -121,6 +121,57 @@ test("collection runtime threads recipe instructions into the collection prompt" assert.equal(run.rows[0]?.cells.entity_name, "OpenAI"); }); +test("collection runtime treats capability diagnostics as non-fatal warnings for healthy rows", async () => { + const runtime = new CollectionPopulateRecipeRuntime({ + targetRows: 3, + runPipeline: async () => ({ + rows: [{ + cells: { + entity_name: "OpenAI", + latest_post_title: "Release notes from OpenAI", + source_url: "https://openai.com/news", + evidence_quote: "Release notes from OpenAI", + }, + sourceUrls: ["https://openai.com/news"], + evidence: [{ + columnName: "latest_post_title", + sourceUrl: "https://openai.com/news", + quote: "Release notes from OpenAI", + }], + needsReview: false, + }], + validationIssues: [ + "Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 2 page(s) (requires_navigation=1, 
requires_form_submission=1). Enable COLLECTION_AGENT_ENABLE_AGENT=true for live navigation.", + ], + usage: { + promptTokens: 11, + completionTokens: 7, + totalTokens: 18, + }, + metrics: { + searchCalls: 1, + fetchCalls: 1, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }, + }), + }); + + const run = await runtime.runRecipe({ + recipe: collectionRecipe(), + context, + }); + + assert.equal(run.runStatus, "succeeded"); + assert.equal(run.productionValidation.isValid, true); + assert.deepEqual(run.productionValidation.criticalIssues, []); + assert.match( + run.productionValidation.warnings.join("\n"), + /Capability diagnostic: TinyFish Agent disabled/ + ); +}); + test("collection pipeline input builder trims empty recipe instructions", () => { const input = collectionPipelineInputFromRecipe({ recipe: collectionRecipe({ runtimeInstructions: " " }), diff --git a/benchmarks/dataset-agent/run-benchmark.mjs b/benchmarks/dataset-agent/run-benchmark.mjs index 552a311..4dfc58b 100755 --- a/benchmarks/dataset-agent/run-benchmark.mjs +++ b/benchmarks/dataset-agent/run-benchmark.mjs @@ -1,7 +1,7 @@ #!/usr/bin/env node import { spawn } from "node:child_process"; import { mkdir, readFile, writeFile } from "node:fs/promises"; -import { dirname, join } from "node:path"; +import { dirname, join, resolve } from "node:path"; import { fileURLToPath } from "node:url"; const scriptDir = dirname(fileURLToPath(import.meta.url)); @@ -515,7 +515,9 @@ const answerKeysByPromptId = { }, }; -await main(); +if (process.argv[1] && resolve(process.argv[1]) === fileURLToPath(import.meta.url)) { + await main(); +} async function runSystemPrompt(input) { const startedAt = Date.now(); @@ -643,6 +645,7 @@ async function runSystemPrompt(input) { answerKeyScore, infraBlockerReason, minRequiredCompleteness: input.config.minRequiredCompleteness, + validationIssues: normalized.validationIssues, }), }; } @@ -1119,6 +1122,7 @@ async function rescoreBenchmarkRun({ runDirectory, prompts, config }) { 
answerKeyScore, infraBlockerReason, minRequiredCompleteness: config.minRequiredCompleteness, + validationIssues: normalized.validationIssues, }), }); } @@ -1350,7 +1354,7 @@ function failureCategoryForScore(input) { return "factual_accuracy"; } -function findInfrastructureBlockerReason({ execution, parsedPayload, normalized }) { +export function findInfrastructureBlockerReason({ execution, parsedPayload, normalized }) { const combinedText = [ execution.stderr, execution.stdout, @@ -1360,17 +1364,19 @@ function findInfrastructureBlockerReason({ execution, parsedPayload, normalized if (execution.timedOut) return "Command timed out."; const blockerPatterns = [ - "authentication failed", - "active subscription", - "insufficient credits", - "not enough credits", - "api key", - "tinyfish_api_key", - "quota", - "rate limit", - "benchmark deadline", + /authentication failed/, + /active subscription/, + /insufficient credits/, + /not enough credits/, + /(?:missing|required|invalid|not configured|not set|unset)[^.]{0,80}api[_ -]?key/, + /api[_ -]?key[^.]{0,80}(?:missing|required|invalid|not configured|not set|unset)/, + /tinyfish_api_key/, + /openrouter_api_key/, + /quota exceeded/, + /rate[_ -]?limit[_ -]?exceeded/, + /benchmark deadline/, ]; - return blockerPatterns.some((pattern) => combinedText.includes(pattern)) + return blockerPatterns.some((pattern) => pattern.test(combinedText)) ? "Infrastructure/auth/credits blocker." : null; } @@ -1562,18 +1568,21 @@ function identityKey(cells, row) { return identityParts[0] ?? 
null; } -function failureReason({ +export function failureReason({ execution, parsedPayload, validation, answerKeyScore, infraBlockerReason, minRequiredCompleteness, + validationIssues = [], }) { if (infraBlockerReason) return infraBlockerReason; if (execution.timedOut) return "Command timed out."; if (execution.exitCode !== 0) return `Command exited ${execution.exitCode}.`; if (!parsedPayload) return "No parseable JSON object found in stdout."; + const capabilityDiagnostic = capabilityDiagnosticReason(validationIssues); + if (capabilityDiagnostic) return capabilityDiagnostic; if (answerKeyScore?.failureCategory === "clarification") { return `Clarification/abstention score ${answerKeyScore.abstentionScore} below required threshold.`; } @@ -1598,6 +1607,13 @@ function failureReason({ return "Benchmark failed."; } +/** Returns the first string issue that starts with "capability diagnostic:" (case-insensitive), or null when there is none or the input is not an array. */ +function capabilityDiagnosticReason(validationIssues) { + return arrayValue(validationIssues).find((issue) => + typeof issue === "string" && /^capability diagnostic:/i.test(issue) + ) ?? null; +} + function arrayValue(value) { return Array.isArray(value) ? value : []; } diff --git a/benchmarks/dataset-agent/run-benchmark.test.mjs b/benchmarks/dataset-agent/run-benchmark.test.mjs new file mode 100644 index 0000000..cdc0eff --- /dev/null +++ b/benchmarks/dataset-agent/run-benchmark.test.mjs @@ -0,0 +1,74 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + failureReason, + findInfrastructureBlockerReason, +} from "./run-benchmark.mjs"; + +test("benchmark failure reason prefers capability diagnostic over generic zero rows", () => { + const diagnostic = "Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 2 page(s) (requires_navigation=1, requires_form_submission=1). 
Enable COLLECTION_AGENT_ENABLE_AGENT=true for live navigation."; + + const reason = failureReason({ + execution: { + timedOut: false, + exitCode: 0, + }, + parsedPayload: { + rows: [], + validationIssues: [diagnostic], + }, + validation: { + rowCount: 0, + sourceUrlCount: 0, + evidenceQuoteCount: 0, + requiredCellCompletenessRatio: 0, + }, + answerKeyScore: null, + infraBlockerReason: null, + minRequiredCompleteness: 0.75, + validationIssues: [diagnostic], + }); + + assert.equal(reason, diagnostic); +}); + +test("infrastructure blocker detection ignores ordinary API-key documentation text", () => { + const reason = findInfrastructureBlockerReason({ + execution: { + timedOut: false, + stderr: "The documentation page covers general API key setup and SDK usage.", + stdout: "", + }, + parsedPayload: { + rows: [{ + cells: { + summary: "Covers API key setup for developers.", + }, + }], + }, + normalized: { + validationIssues: [ + "Capability diagnostic: TinyFish Agent disabled; triage requested browser/form/detail follow-up for 1 page(s) (requires_navigation=1). Enable COLLECTION_AGENT_ENABLE_AGENT=true for live navigation.", + ], + }, + }); + + assert.equal(reason, null); +}); + +test("infrastructure blocker detection still catches missing API key configuration", () => { + const reason = findInfrastructureBlockerReason({ + execution: { + timedOut: false, + stderr: "Missing OPENROUTER_API_KEY.", + stdout: "", + }, + parsedPayload: null, + normalized: { + validationIssues: [], + }, + }); + + assert.equal(reason, "Infrastructure/auth/credits blocker."); +});