From 0f7c48e55e1619556d0ef59dc88d162657300ef1 Mon Sep 17 00:00:00 2001 From: Edward Tran Date: Fri, 22 May 2026 23:51:01 +0700 Subject: [PATCH] Improve collection source targeting --- .../src/acquisition/link-follow.ts | 2 +- .../src/agents/dataset-spec.ts | 3 + .../src/agents/source-policy.ts | 266 ++++++++++++++++++ .../src/agents/source-triage.ts | 10 +- .../src/orchestrator/acquisition.ts | 12 +- .../src/orchestrator/process-pages.ts | 45 +++ backend/test/collection-source-policy.test.ts | 182 ++++++++++++ 7 files changed, 517 insertions(+), 3 deletions(-) create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts create mode 100644 backend/test/collection-source-policy.test.ts diff --git a/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts b/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts index bebc418..b8316d7 100644 --- a/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts +++ b/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts @@ -7,7 +7,7 @@ const SKIP_HOST = /(?:facebook|twitter|x\.com|instagram|youtube|tiktok|pinterest|reddit\.com\/r\/|linkedin\.com\/in\/|accounts\.google|login|signin|signup|register|cookie|privacy|terms|cdn\.|static\.|fonts\.)/i; const SKIP_EXT = /\.(?:pdf|zip|png|jpe?g|gif|svg|webp|css|js|woff2?|xml|mp4|mp3)(?:\?|$)/i; const POSITIVE_PATH = - /\/(?:company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i; + /\/(?:blog|news|docs|documentation|pricing|billing|investor|investors|earnings|financial|reports|press|release|releases|mcp|model-context-protocol|agents|company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i; const NEGATIVE_PATH = /\/(?:tag|tags|category|categories|author|feed|rss|search|wp-admin|wp-content)(?:\/|$|\?)/i; diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts b/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts index eda4a25..be1f489 100644 --- a/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts +++ b/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts @@ -10,6 +10,7 @@ import { mergeSpecWithBenchmarkRequiredColumns, type BenchmarkSpecContext, } from "./benchmark-spec.js"; +import { applyPromptSourcePolicyToSpec } from "./source-policy.js"; const DATASET_SPEC_SYSTEM = `You are the Dataset Spec Agent for a web data collection pipeline. @@ -183,6 +184,8 @@ export async function generateDatasetSpec( }), ); + normalized = applyPromptSourcePolicyToSpec(normalized, prompt); + if (hasBenchmarkRequiredColumns(benchmark)) { normalized = mergeSpecWithBenchmarkRequiredColumns(normalized, benchmark); } diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts new file mode 100644 index 0000000..703109f --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts @@ -0,0 +1,266 @@ +import type { DatasetSpec, SourceCandidate, SourceTriageResult } from "../models/schemas.js"; +import { getDomain } from "../utils/url.js"; + +export interface PromptSourceEntity { + name: string; + primaryToken: string; + domainTokens: string[]; +} + +export interface PromptSourcePolicy { + requiresOfficialSource: boolean; + entities: PromptSourceEntity[]; + searchPhrases: string[]; + hint?: string; +} + +const ENTITY_STOPWORDS = new Set([ + "a", + "an", + "and", + "company", + "companies", + "corp", + "corporation", + "for", + "from", + "inc", + "llc", + "ltd", + "of", + "official", + "page", + "pages", + "the", +]); + +const ENTITY_LIST_INTRODUCER = /\b(?:for|from)\s+([^?.;:]+)/gi; +const ENTITY_LIST_CUTOFF = + /\b(?:collect|find|include|give|make|show|table|with|need|return|list|shown)\b/i; +const GENERIC_HOSTED_DOMAIN = + /(?:^|\.)((github|gitlab)\.(io|com)|gitbook\.io|readthedocs\.io|notion\.site|medium\.com|substack\.com)$/i; + +function taskTextFromPrompt(prompt: string): string { + const taskLine = prompt.match(/^Task:\s*(.+)$/im)?.[1]; + return taskLine?.trim() || prompt; +} + +function uniqueStrings(values: string[]): string[] { + return [...new Set(values.map((value) => value.trim()).filter(Boolean))]; +} + +function tokenize(value: string): string[] { + return value + .toLowerCase() + .replace(/[^a-z0-9]+/g, " ") + .split(/\s+/) + .filter((token) => token.length >= 2 && !ENTITY_STOPWORDS.has(token)); +} + +function looksLikeEntityName(value: string): boolean { + const trimmed = value.trim(); + if (!trimmed || trimmed.length > 60) return false; + if (/^(?:and|or|the|official|latest|recent|current)$/i.test(trimmed)) { + return false; + } + return /[A-Z]/.test(trimmed[0] ?? "") || /[a-z][A-Z]/.test(trimmed); +} + +function splitEntityList(value: string): string[] { + const beforeVerb = value.split(ENTITY_LIST_CUTOFF)[0] ?? value; + const nestedFrom = beforeVerb.match(/\bfrom\s+(.+)$/i)?.[1]; + const entitySegment = nestedFrom ?? beforeVerb; + return entitySegment + .replace(/\s+and\s+/gi, ",") + .split(",") + .map((part) => part.trim().replace(/^and\s+/i, "").replace(/[.?!]$/g, "")) + .filter(looksLikeEntityName); +} + +function extractExplicitEntities(prompt: string): PromptSourceEntity[] { + const names: string[] = []; + for (const match of prompt.matchAll(ENTITY_LIST_INTRODUCER)) { + names.push(...splitEntityList(match[1] ?? "")); + } + + return uniqueStrings(names).map((name) => { + const domainTokens = tokenize(name); + return { + name, + primaryToken: domainTokens.at(-1) ?? name.toLowerCase(), + domainTokens, + }; + }); +} + +function searchPhrasesForPrompt(prompt: string): string[] { + const lower = prompt.toLowerCase(); + const phrases: string[] = []; + + if (lower.includes("pricing")) { + phrases.push("official pricing page", "billing pricing"); + } + if (lower.includes("investor relations") || lower.includes("earnings release")) { + phrases.push("reports quarterly results", "investor relations earnings release"); + } + if (lower.includes("mcp")) { + phrases.push("MCP connector docs", "model context protocol docs"); + } else if (lower.includes("docs") || lower.includes("documentation")) { + phrases.push("official docs"); + } + if (lower.includes("blog post") || lower.includes("blog posts")) { + phrases.push("official blog latest post"); + } + if (lower.includes("official website") || lower.includes("official websites")) { + phrases.push("official website"); + } + if (lower.includes("official") && phrases.length === 0) { + phrases.push("official source"); + } + + return uniqueStrings(phrases); +} + +export function derivePromptSourcePolicy(prompt: string): PromptSourcePolicy { + const taskText = taskTextFromPrompt(prompt); + const entities = extractExplicitEntities(taskText); + const searchPhrases = searchPhrasesForPrompt(taskText); + const lower = taskText.toLowerCase(); + const asksForCanonicalSource = + searchPhrases.length > 0 || + lower.includes("source url") || + lower.includes("source page"); + const requiresOfficialSource = + entities.length > 0 && + asksForCanonicalSource && + (lower.includes("official") || + lower.includes("pricing") || + lower.includes("investor relations") || + lower.includes("earnings release") || + lower.includes("docs") || + lower.includes("documentation") || + lower.includes("blog post")); + + const hint = requiresOfficialSource + ? [ + "Prompt source policy: user requested canonical/official sources for named entities.", + `Named entities: ${entities.map((entity) => entity.name).join(", ")}.`, + "Use official entity-owned domains for source_url, evidence, pricing/docs/blog/IR URLs, and required facts.", + "Use third-party pages only for discovery; do not use them as evidence when an official entity-owned page is available.", + ].join("\n") + : undefined; + + return { requiresOfficialSource, entities, searchPhrases, hint }; +} + +export function promptSourceSearchQueries(policy: PromptSourcePolicy): string[] { + if (!policy.requiresOfficialSource || policy.entities.length === 0) { + return []; + } + + const phrases = policy.searchPhrases.length + ? policy.searchPhrases + : ["official source"]; + + return uniqueStrings( + policy.entities.flatMap((entity) => + phrases.map((phrase) => `${entity.name} ${phrase}`), + ), + ); +} + +export function applyPromptSourcePolicyToSpec( + spec: DatasetSpec, + prompt: string, +): DatasetSpec { + const policy = derivePromptSourcePolicy(prompt); + if (!policy.requiresOfficialSource) { + return spec; + } + + return { + ...spec, + search_queries: uniqueStrings([ + ...promptSourceSearchQueries(policy), + ...spec.search_queries, + ]), + extraction_hints: [spec.extraction_hints, policy.hint] + .filter(Boolean) + .join("\n"), + }; +} + +export function urlMatchesPromptSourcePolicy( + url: string, + policy: PromptSourcePolicy, +): boolean { + if (!policy.requiresOfficialSource) return true; + const domain = getDomain(url).toLowerCase(); + if (GENERIC_HOSTED_DOMAIN.test(domain)) { + return false; + } + return policy.entities.some((entity) => domain.includes(entity.primaryToken)); +} + +export function sourceCandidatePolicyBoost( + candidate: SourceCandidate, + policy: PromptSourcePolicy, +): number { + if (!policy.requiresOfficialSource) return 0; + + const searchableText = [ + candidate.url, + candidate.title, + candidate.snippet, + candidate.site_name, + ] + .join(" ") + .toLowerCase(); + const matchedEntity = policy.entities.some((entity) => + entity.domainTokens.some((token) => searchableText.includes(token)), + ); + const matchedDomain = urlMatchesPromptSourcePolicy(candidate.url, policy); + const officialLanguage = + /\b(official|pricing|docs|documentation|investor relations|earnings|blog)\b/.test( + searchableText, + ); + + if (matchedDomain && matchedEntity && officialLanguage) return 5; + if (matchedDomain && matchedEntity) return 4; + if (matchedDomain) return 3; + if (matchedEntity && officialLanguage) return 1; + return -2; +} + +export function applyPromptSourcePolicyToTriageResult( + result: SourceTriageResult, + policy: PromptSourcePolicy, +): SourceTriageResult { + if ( + !policy.requiresOfficialSource || + ![ + "extract_now", + "requires_navigation", + "requires_form_submission", + "requires_detail_page_followup", + ].includes(result.status) || + urlMatchesPromptSourcePolicy(result.final_url || result.url, policy) + ) { + return result; + } + + const domain = getDomain(result.final_url || result.url); + return { + ...result, + status: "low_value", + source_data_confidence: Math.min(result.source_data_confidence, 0.3), + expected_yield: "none", + reasoning: + `Prompt asks for official/canonical sources for named entities; ${domain} ` + + `does not match ${policy.entities.map((entity) => entity.name).join(", ")}. ` + + `Original triage: ${result.reasoning}`, + suggested_action: + result.suggested_action ?? + "Search/fetch the named entity's official domain instead of extracting this third-party page.", + }; +} diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts index 6c9e219..68939e5 100644 --- a/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts +++ b/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts @@ -11,6 +11,10 @@ import { type FetchedPage, type SourceTriageResult, } from "../models/schemas.js"; +import { + applyPromptSourcePolicyToTriageResult, + derivePromptSourcePolicy, +} from "./source-policy.js"; const TRIAGE_SYSTEM = `You are the Source Triage Agent for a web data collection pipeline. @@ -90,11 +94,15 @@ export async function triagePage(options: { ], }); - return { + const normalizedResult = { ...result, url: options.page.url, final_url: pageUrl, title: options.page.title || result.title, status: sourceStatusSchema.parse(result.status), }; + return applyPromptSourcePolicyToTriageResult( + normalizedResult, + derivePromptSourcePolicy(options.userPrompt), + ); } diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts index 6dd748c..aa24bfb 100644 --- a/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts +++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts @@ -5,6 +5,11 @@ import { domainMemoryBoost, type WorkflowMemory } from "../memory/index.js"; import type { SearchPlan } from "../memory/search-pagination.js"; import { getPrimaryKeyValue } from "../merge/records.js"; import { createFetchQueue, createSearchQueue } from "../queue/pools.js"; +import { + derivePromptSourcePolicy, + sourceCandidatePolicyBoost, + type PromptSourcePolicy, +} from "../agents/source-policy.js"; import type { AgentRunRecord, DatasetSpec, @@ -39,6 +44,7 @@ function rankCandidates( excludeUrls: Set, limit: number, memory?: WorkflowMemory, + sourcePolicy?: PromptSourcePolicy, ): string[] { const byUrl = new Map< string, @@ -55,6 +61,7 @@ function rankCandidates( if (candidate.title.length > 10) score += 0.5; if (candidate.snippet.length > 40) score += 0.5; if (memory) score += domainMemoryBoost(memory, domain); + if (sourcePolicy) score += sourceCandidatePolicyBoost(candidate, sourcePolicy); byUrl.set(url, { url, score, domain }); } @@ -127,15 +134,18 @@ export async function runAcquisitionPhase(options: { }, ); const candidates: SourceCandidate[] = searchBatches.flat(); + const sourcePolicy = derivePromptSourcePolicy(options.userPrompt); const urlsToFetch = rankCandidates( candidates, options.excludeUrls, options.maxUrlsToFetch, options.memory, + sourcePolicy, ); - const fetchWithLinks = options.enableLinkFollow ?? false; + const fetchWithLinks = + options.enableLinkFollow ?? sourcePolicy.requiresOfficialSource; const urlChunks = chunkUrls(urlsToFetch, config.fetchBatchSize); options.log( diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts index 99e2e52..ef81d2a 100644 --- a/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts +++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts @@ -2,6 +2,7 @@ import { generateAgentGoal } from "../agents/agent-goal.js"; import { extractFromAgentResult } from "../agents/extract-from-agent.js"; import { extractFromPage } from "../agents/extract.js"; import { triagePage } from "../agents/source-triage.js"; +import { derivePromptSourcePolicy } from "../agents/source-policy.js"; import { config } from "../config.js"; import { runTinyfishAgentsBatch } from "../integrations/tinyfish-agent.js"; import type { WorkflowMemory } from "../memory/index.js"; @@ -60,6 +61,34 @@ function bumpStatus(summary: TriageSummary, status: SourceStatus): void { summary.by_status[status] = (summary.by_status[status] ?? 0) + 1; } +function shouldFallbackExtractOfficialNavigation( + url: string, + status: SourceStatus, +): boolean { + if ( + status !== "requires_navigation" && + status !== "requires_detail_page_followup" + ) { + return false; + } + + try { + const parsed = new URL(url); + const path = `${parsed.pathname}${parsed.search}`.toLowerCase(); + if ( + path === "/" || + /(?:login|signin|signup|default\.aspx|home)(?:\/|$|\?)/.test(path) + ) { + return false; + } + return /(?:pricing|billing|docs|documentation|mcp|model-context-protocol|earnings|press-release|quarterly|results|news|blog)/.test( + path, + ); + } catch { + return false; + } +} + export async function processFetchedPages(options: { label: string; userPrompt: string; @@ -81,6 +110,7 @@ export async function processFetchedPages(options: { const records: ExtractedRecord[] = []; const agentRuns: AgentRunRecord[] = []; const knownKeys = new Set(options.knownEntityKeys ?? []); + const sourcePolicy = derivePromptSourcePolicy(options.userPrompt); const successfulPages = options.pages.filter( (page) => !page.error && page.text.trim().length > 0, @@ -200,6 +230,21 @@ export async function processFetchedPages(options: { summary.agent_candidates += 1; if (agentEnabled) { agentQueue.push({ page, triage }); + } else if ( + sourcePolicy.requiresOfficialSource && + shouldFallbackExtractOfficialNavigation(triage.final_url, triage.status) + ) { + options.log( + options.label, + `Agent disabled — intent-path fallback extract for ${triage.final_url} [${triage.status}]`, + ); + extractPages.push({ page, triage }); + } else if (sourcePolicy.requiresOfficialSource) { + summary.skipped += 1; + options.log( + options.label, + `Agent disabled — skip navigation-only official source ${triage.final_url} [${triage.status}]`, + ); } else { options.log( options.label, diff --git a/backend/test/collection-source-policy.test.ts b/backend/test/collection-source-policy.test.ts new file mode 100644 index 0000000..c2079a0 --- /dev/null +++ b/backend/test/collection-source-policy.test.ts @@ -0,0 +1,182 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + applyPromptSourcePolicyToSpec, + applyPromptSourcePolicyToTriageResult, + derivePromptSourcePolicy, + promptSourceSearchQueries, + sourceCandidatePolicyBoost, + urlMatchesPromptSourcePolicy, +} from "../BigSet_Data_Collection_Agent/src/agents/source-policy.js"; +import type { + DatasetSpec, + SourceCandidate, + SourceTriageResult, +} from "../BigSet_Data_Collection_Agent/src/models/schemas.js"; + +test("prompt source policy derives official queries from the user's prompt", () => { + const policy = derivePromptSourcePolicy( + "For Stripe, Paddle, and Chargebee, collect the official pricing page URL and the plan names or starting prices shown on the page.", + ); + + assert.equal(policy.requiresOfficialSource, true); + assert.deepEqual( + policy.entities.map((entity) => entity.name), + ["Stripe", "Paddle", "Chargebee"], + ); + assert.deepEqual(promptSourceSearchQueries(policy).slice(0, 3), [ + "Stripe official pricing page", + "Stripe billing pricing", + "Paddle official pricing page", + ]); +}); + +test("prompt source policy ignores generic durable recipe source wording", () => { + const policy = derivePromptSourcePolicy( + [ + "Dataset: benchmark_latest-ai-blog-posts", + "Task: Can you make me a table of the latest blog posts from OpenAI, Anthropic, and Google DeepMind? I need title, publish date, and URL.", + "", + "Durable recipe instructions:", + "Prefer official docs, pricing, blog, product, or company pages over third-party summaries.", + ].join("\n"), + ); + + const queries = promptSourceSearchQueries(policy); + + assert.deepEqual(queries, [ + "OpenAI official blog latest post", + "Anthropic official blog latest post", + "Google DeepMind official blog latest post", + ]); +}); + +test("prompt source policy adds official-source guidance without benchmark answer keys", () => { + const spec: DatasetSpec = { + intent_summary: "Collect pricing pages.", + target_row_count: 3, + row_grain: "one row per company", + columns: [ + { + name: "entity_name", + type: "string", + description: "Company.", + required: true, + }, + { + name: "pricing_page_url", + type: "string", + description: "Official pricing URL.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: ["SaaS pricing pages"], + extraction_hints: "Extract plan names.", + }; + + const updated = applyPromptSourcePolicyToSpec( + spec, + "For Stripe and Paddle, collect the official pricing page URL.", + ); + + assert.equal(updated.search_queries[0], "Stripe official pricing page"); + assert.equal(updated.search_queries[1], "Stripe billing pricing"); + assert.equal(updated.search_queries[2], "Paddle official pricing page"); + assert.match(updated.extraction_hints, /Prompt source policy/); + assert.match(updated.extraction_hints, /Stripe, Paddle/); +}); + +test("prompt source policy prefers entity-owned domains over third-party proof", () => { + const policy = derivePromptSourcePolicy( + "Find the latest investor relations earnings release page for Apple, Microsoft, and Nvidia.", + ); + + assert.equal( + urlMatchesPromptSourcePolicy("https://investor.apple.com/newsroom/", policy), + true, + ); + assert.equal( + urlMatchesPromptSourcePolicy("https://finance.yahoo.com/quote/AAPL", policy), + false, + ); + assert.equal( + urlMatchesPromptSourcePolicy("https://cloud.google.com/blog/topics/threat-intelligence", { + ...derivePromptSourcePolicy( + "Can you make me a table of the latest blog posts from OpenAI, Anthropic, and Google DeepMind?", + ), + }), + false, + ); + assert.equal( + urlMatchesPromptSourcePolicy( + "https://openai.github.io/openai-agents-python/mcp/", + derivePromptSourcePolicy( + "I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare.", + ), + ), + false, + ); +}); + +test("prompt source policy downgrades third-party extraction triage", () => { + const policy = derivePromptSourcePolicy( + "For Stripe, Paddle, and Chargebee, collect the official pricing page URL and plan names.", + ); + const triage: SourceTriageResult = { + url: "https://www.trustradius.com/products/paddle/pricing", + final_url: "https://www.trustradius.com/products/paddle/pricing", + title: "Paddle Pricing", + status: "extract_now", + confidence: 0.9, + source_data_confidence: 0.8, + expected_yield: "complete", + reasoning: "Page lists pricing information.", + }; + + const updated = applyPromptSourcePolicyToTriageResult(triage, policy); + + assert.equal(updated.status, "low_value"); + assert.equal(updated.expected_yield, "none"); + assert.match(updated.reasoning, /official\/canonical sources/); +}); + +test("prompt source policy boosts official candidates", () => { + const policy = derivePromptSourcePolicy( + [ + "Dataset: benchmark_mcp-docs-pages", + "Task: I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare. Give me title, URL, and what each page covers.", + "", + "Durable recipe instructions:", + "Prefer official docs, pricing, blog, product, or company pages over third-party summaries.", + ].join("\n"), + ); + assert.deepEqual( + policy.entities.map((entity) => entity.name), + ["Anthropic", "OpenAI", "Cloudflare"], + ); + assert.deepEqual(promptSourceSearchQueries(policy).slice(0, 4), [ + "Anthropic MCP connector docs", + "Anthropic model context protocol docs", + "OpenAI MCP connector docs", + "OpenAI model context protocol docs", + ]); + const official: SourceCandidate = { + url: "https://developers.cloudflare.com/agents/model-context-protocol/", + title: "MCP servers", + snippet: "Official Cloudflare docs for MCP server setup.", + query: "Cloudflare official docs MCP server setup", + }; + const thirdParty: SourceCandidate = { + url: "https://example.com/cloudflare-mcp-guide", + title: "Cloudflare MCP guide", + snippet: "A blog guide to Cloudflare MCP.", + query: "Cloudflare official docs MCP server setup", + }; + + assert.ok( + sourceCandidatePolicyBoost(official, policy) > + sourceCandidatePolicyBoost(thirdParty, policy), + ); +});