Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ const SKIP_HOST =
/(?:facebook|twitter|x\.com|instagram|youtube|tiktok|pinterest|reddit\.com\/r\/|linkedin\.com\/in\/|accounts\.google|login|signin|signup|register|cookie|privacy|terms|cdn\.|static\.|fonts\.)/i;
const SKIP_EXT = /\.(?:pdf|zip|png|jpe?g|gif|svg|webp|css|js|woff2?|xml|mp4|mp3)(?:\?|$)/i;
const POSITIVE_PATH =
/\/(?:company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i;
/\/(?:blog|news|docs|documentation|pricing|billing|investor|investors|earnings|financial|reports|press|release|releases|mcp|model-context-protocol|agents|company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i;
const NEGATIVE_PATH =
/\/(?:tag|tags|category|categories|author|feed|rss|search|wp-admin|wp-content)(?:\/|$|\?)/i;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
mergeSpecWithBenchmarkRequiredColumns,
type BenchmarkSpecContext,
} from "./benchmark-spec.js";
import { applyPromptSourcePolicyToSpec } from "./source-policy.js";

const DATASET_SPEC_SYSTEM = `You are the Dataset Spec Agent for a web data collection pipeline.

Expand Down Expand Up @@ -183,6 +184,8 @@ export async function generateDatasetSpec(
}),
);

normalized = applyPromptSourcePolicyToSpec(normalized, prompt);

if (hasBenchmarkRequiredColumns(benchmark)) {
normalized = mergeSpecWithBenchmarkRequiredColumns(normalized, benchmark);
}
Expand Down
266 changes: 266 additions & 0 deletions backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
import type { DatasetSpec, SourceCandidate, SourceTriageResult } from "../models/schemas.js";
import { getDomain } from "../utils/url.js";

export interface PromptSourceEntity {
name: string;
primaryToken: string;
domainTokens: string[];
}

export interface PromptSourcePolicy {
requiresOfficialSource: boolean;
entities: PromptSourceEntity[];
searchPhrases: string[];
hint?: string;
}

const ENTITY_STOPWORDS = new Set([
"a",
"an",
"and",
"company",
"companies",
"corp",
"corporation",
"for",
"from",
"inc",
"llc",
"ltd",
"of",
"official",
"page",
"pages",
"the",
]);

const ENTITY_LIST_INTRODUCER = /\b(?:for|from)\s+([^?.;:]+)/gi;
const ENTITY_LIST_CUTOFF =
/\b(?:collect|find|include|give|make|show|table|with|need|return|list|shown)\b/i;
const GENERIC_HOSTED_DOMAIN =
/(?:^|\.)((github|gitlab)\.(io|com)|gitbook\.io|readthedocs\.io|notion\.site|medium\.com|substack\.com)$/i;

function taskTextFromPrompt(prompt: string): string {
const taskLine = prompt.match(/^Task:\s*(.+)$/im)?.[1];
return taskLine?.trim() || prompt;
}

function uniqueStrings(values: string[]): string[] {
return [...new Set(values.map((value) => value.trim()).filter(Boolean))];
}

function tokenize(value: string): string[] {
return value
.toLowerCase()
.replace(/[^a-z0-9]+/g, " ")
.split(/\s+/)
.filter((token) => token.length >= 2 && !ENTITY_STOPWORDS.has(token));
}

function looksLikeEntityName(value: string): boolean {
const trimmed = value.trim();
if (!trimmed || trimmed.length > 60) return false;
if (/^(?:and|or|the|official|latest|recent|current)$/i.test(trimmed)) {
return false;
}
return /[A-Z]/.test(trimmed[0] ?? "") || /[a-z][A-Z]/.test(trimmed);
}

function splitEntityList(value: string): string[] {
const beforeVerb = value.split(ENTITY_LIST_CUTOFF)[0] ?? value;
const nestedFrom = beforeVerb.match(/\bfrom\s+(.+)$/i)?.[1];
const entitySegment = nestedFrom ?? beforeVerb;
return entitySegment
.replace(/\s+and\s+/gi, ",")
.split(",")
.map((part) => part.trim().replace(/^and\s+/i, "").replace(/[.?!]$/g, ""))
.filter(looksLikeEntityName);
}

function extractExplicitEntities(prompt: string): PromptSourceEntity[] {
const names: string[] = [];
for (const match of prompt.matchAll(ENTITY_LIST_INTRODUCER)) {
names.push(...splitEntityList(match[1] ?? ""));
}

return uniqueStrings(names).map((name) => {
const domainTokens = tokenize(name);
return {
name,
primaryToken: domainTokens.at(-1) ?? name.toLowerCase(),
domainTokens,
};
});
}

function searchPhrasesForPrompt(prompt: string): string[] {
const lower = prompt.toLowerCase();
const phrases: string[] = [];

if (lower.includes("pricing")) {
phrases.push("official pricing page", "billing pricing");
}
if (lower.includes("investor relations") || lower.includes("earnings release")) {
phrases.push("reports quarterly results", "investor relations earnings release");
}
if (lower.includes("mcp")) {
phrases.push("MCP connector docs", "model context protocol docs");
} else if (lower.includes("docs") || lower.includes("documentation")) {
phrases.push("official docs");
}
if (lower.includes("blog post") || lower.includes("blog posts")) {
phrases.push("official blog latest post");
}
if (lower.includes("official website") || lower.includes("official websites")) {
phrases.push("official website");
}
if (lower.includes("official") && phrases.length === 0) {
phrases.push("official source");
}

return uniqueStrings(phrases);
}

export function derivePromptSourcePolicy(prompt: string): PromptSourcePolicy {
const taskText = taskTextFromPrompt(prompt);
const entities = extractExplicitEntities(taskText);
const searchPhrases = searchPhrasesForPrompt(taskText);
const lower = taskText.toLowerCase();
const asksForCanonicalSource =
searchPhrases.length > 0 ||
lower.includes("source url") ||
lower.includes("source page");
const requiresOfficialSource =
entities.length > 0 &&
asksForCanonicalSource &&
(lower.includes("official") ||
lower.includes("pricing") ||
lower.includes("investor relations") ||
lower.includes("earnings release") ||
lower.includes("docs") ||
lower.includes("documentation") ||
lower.includes("blog post"));

const hint = requiresOfficialSource
? [
"Prompt source policy: user requested canonical/official sources for named entities.",
`Named entities: ${entities.map((entity) => entity.name).join(", ")}.`,
"Use official entity-owned domains for source_url, evidence, pricing/docs/blog/IR URLs, and required facts.",
"Use third-party pages only for discovery; do not use them as evidence when an official entity-owned page is available.",
].join("\n")
: undefined;

return { requiresOfficialSource, entities, searchPhrases, hint };
}

export function promptSourceSearchQueries(policy: PromptSourcePolicy): string[] {
if (!policy.requiresOfficialSource || policy.entities.length === 0) {
return [];
}

const phrases = policy.searchPhrases.length
? policy.searchPhrases
: ["official source"];

return uniqueStrings(
policy.entities.flatMap((entity) =>
phrases.map((phrase) => `${entity.name} ${phrase}`),
),
);
}

export function applyPromptSourcePolicyToSpec(
spec: DatasetSpec,
prompt: string,
): DatasetSpec {
const policy = derivePromptSourcePolicy(prompt);
if (!policy.requiresOfficialSource) {
return spec;
}

return {
...spec,
search_queries: uniqueStrings([
...promptSourceSearchQueries(policy),
...spec.search_queries,
]),
extraction_hints: [spec.extraction_hints, policy.hint]
.filter(Boolean)
.join("\n"),
};
}

export function urlMatchesPromptSourcePolicy(
url: string,
policy: PromptSourcePolicy,
): boolean {
if (!policy.requiresOfficialSource) return true;
const domain = getDomain(url).toLowerCase();
if (GENERIC_HOSTED_DOMAIN.test(domain)) {
return false;
}
return policy.entities.some((entity) => domain.includes(entity.primaryToken));
}

export function sourceCandidatePolicyBoost(
candidate: SourceCandidate,
policy: PromptSourcePolicy,
): number {
if (!policy.requiresOfficialSource) return 0;

const searchableText = [
candidate.url,
candidate.title,
candidate.snippet,
candidate.site_name,
]
.join(" ")
.toLowerCase();
const matchedEntity = policy.entities.some((entity) =>
entity.domainTokens.some((token) => searchableText.includes(token)),
);
const matchedDomain = urlMatchesPromptSourcePolicy(candidate.url, policy);
const officialLanguage =
/\b(official|pricing|docs|documentation|investor relations|earnings|blog)\b/.test(
searchableText,
);

if (matchedDomain && matchedEntity && officialLanguage) return 5;
if (matchedDomain && matchedEntity) return 4;
if (matchedDomain) return 3;
if (matchedEntity && officialLanguage) return 1;
return -2;
}

export function applyPromptSourcePolicyToTriageResult(
result: SourceTriageResult,
policy: PromptSourcePolicy,
): SourceTriageResult {
if (
!policy.requiresOfficialSource ||
![
"extract_now",
"requires_navigation",
"requires_form_submission",
"requires_detail_page_followup",
].includes(result.status) ||
urlMatchesPromptSourcePolicy(result.final_url || result.url, policy)
) {
return result;
}

const domain = getDomain(result.final_url || result.url);
return {
...result,
status: "low_value",
source_data_confidence: Math.min(result.source_data_confidence, 0.3),
expected_yield: "none",
reasoning:
`Prompt asks for official/canonical sources for named entities; ${domain} ` +
`does not match ${policy.entities.map((entity) => entity.name).join(", ")}. ` +
`Original triage: ${result.reasoning}`,
suggested_action:
result.suggested_action ??
"Search/fetch the named entity's official domain instead of extracting this third-party page.",
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ import {
type FetchedPage,
type SourceTriageResult,
} from "../models/schemas.js";
import {
applyPromptSourcePolicyToTriageResult,
derivePromptSourcePolicy,
} from "./source-policy.js";

const TRIAGE_SYSTEM = `You are the Source Triage Agent for a web data collection pipeline.

Expand Down Expand Up @@ -90,11 +94,15 @@ export async function triagePage(options: {
],
});

return {
const normalizedResult = {
...result,
url: options.page.url,
final_url: pageUrl,
title: options.page.title || result.title,
status: sourceStatusSchema.parse(result.status),
};
return applyPromptSourcePolicyToTriageResult(
normalizedResult,
derivePromptSourcePolicy(options.userPrompt),
);
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ import { domainMemoryBoost, type WorkflowMemory } from "../memory/index.js";
import type { SearchPlan } from "../memory/search-pagination.js";
import { getPrimaryKeyValue } from "../merge/records.js";
import { createFetchQueue, createSearchQueue } from "../queue/pools.js";
import {
derivePromptSourcePolicy,
sourceCandidatePolicyBoost,
type PromptSourcePolicy,
} from "../agents/source-policy.js";
import type {
AgentRunRecord,
DatasetSpec,
Expand Down Expand Up @@ -39,6 +44,7 @@ function rankCandidates(
excludeUrls: Set<string>,
limit: number,
memory?: WorkflowMemory,
sourcePolicy?: PromptSourcePolicy,
): string[] {
const byUrl = new Map<
string,
Expand All @@ -55,6 +61,7 @@ function rankCandidates(
if (candidate.title.length > 10) score += 0.5;
if (candidate.snippet.length > 40) score += 0.5;
if (memory) score += domainMemoryBoost(memory, domain);
if (sourcePolicy) score += sourceCandidatePolicyBoost(candidate, sourcePolicy);
byUrl.set(url, { url, score, domain });
}

Expand Down Expand Up @@ -127,15 +134,18 @@ export async function runAcquisitionPhase(options: {
},
);
const candidates: SourceCandidate[] = searchBatches.flat();
const sourcePolicy = derivePromptSourcePolicy(options.userPrompt);

const urlsToFetch = rankCandidates(
candidates,
options.excludeUrls,
options.maxUrlsToFetch,
options.memory,
sourcePolicy,
);

const fetchWithLinks = options.enableLinkFollow ?? false;
const fetchWithLinks =
options.enableLinkFollow ?? sourcePolicy.requiresOfficialSource;
const urlChunks = chunkUrls(urlsToFetch, config.fetchBatchSize);

options.log(
Expand Down
Loading