From 0f7c48e55e1619556d0ef59dc88d162657300ef1 Mon Sep 17 00:00:00 2001
From: Edward Tran <giaphutran012@gmail.com>
Date: Fri, 22 May 2026 23:51:01 +0700
Subject: [PATCH] Improve collection source targeting

---
 .../src/acquisition/link-follow.ts            |   2 +-
 .../src/agents/dataset-spec.ts                |   3 +
 .../src/agents/source-policy.ts               | 266 ++++++++++++++++++
 .../src/agents/source-triage.ts               |  10 +-
 .../src/orchestrator/acquisition.ts           |  12 +-
 .../src/orchestrator/process-pages.ts         |  45 +++
 backend/test/collection-source-policy.test.ts | 182 ++++++++++++
 7 files changed, 517 insertions(+), 3 deletions(-)
 create mode 100644 backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts
 create mode 100644 backend/test/collection-source-policy.test.ts

diff --git a/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts b/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts
index bebc418..b8316d7 100644
--- a/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts
+++ b/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts
@@ -7,7 +7,7 @@ const SKIP_HOST =
   /(?:facebook|twitter|x\.com|instagram|youtube|tiktok|pinterest|reddit\.com\/r\/|linkedin\.com\/in\/|accounts\.google|login|signin|signup|register|cookie|privacy|terms|cdn\.|static\.|fonts\.)/i;
 const SKIP_EXT = /\.(?:pdf|zip|png|jpe?g|gif|svg|webp|css|js|woff2?|xml|mp4|mp3)(?:\?|$)/i;
 const POSITIVE_PATH =
-  /\/(?:company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i;
+  /\/(?:blog|news|docs|documentation|pricing|billing|investor|investors|earnings|financial|reports|press|release|releases|mcp|model-context-protocol|agents|company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i;
 const NEGATIVE_PATH =
   /\/(?:tag|tags|category|categories|author|feed|rss|search|wp-admin|wp-content)(?:\/|$|\?)/i;
 
diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts b/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts
index eda4a25..be1f489 100644
--- a/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts
+++ b/backend/BigSet_Data_Collection_Agent/src/agents/dataset-spec.ts
@@ -10,6 +10,7 @@ import {
   mergeSpecWithBenchmarkRequiredColumns,
   type BenchmarkSpecContext,
 } from "./benchmark-spec.js";
+import { applyPromptSourcePolicyToSpec } from "./source-policy.js";
 
 const DATASET_SPEC_SYSTEM = `You are the Dataset Spec Agent for a web data collection pipeline.
 
@@ -183,6 +184,8 @@ export async function generateDatasetSpec(
     }),
   );
 
+  normalized = applyPromptSourcePolicyToSpec(normalized, prompt);
+
   if (hasBenchmarkRequiredColumns(benchmark)) {
     normalized = mergeSpecWithBenchmarkRequiredColumns(normalized, benchmark);
   }
diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts
new file mode 100644
index 0000000..703109f
--- /dev/null
+++ b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts
@@ -0,0 +1,266 @@
+import type { DatasetSpec, SourceCandidate, SourceTriageResult } from "../models/schemas.js";
+import { getDomain } from "../utils/url.js";
+
+export interface PromptSourceEntity {
+  name: string;
+  primaryToken: string;
+  domainTokens: string[];
+}
+
+export interface PromptSourcePolicy {
+  requiresOfficialSource: boolean;
+  entities: PromptSourceEntity[];
+  searchPhrases: string[];
+  hint?: string;
+}
+
+const ENTITY_STOPWORDS = new Set([
+  "a",
+  "an",
+  "and",
+  "company",
+  "companies",
+  "corp",
+  "corporation",
+  "for",
+  "from",
+  "inc",
+  "llc",
+  "ltd",
+  "of",
+  "official",
+  "page",
+  "pages",
+  "the",
+]);
+
+const ENTITY_LIST_INTRODUCER = /\b(?:for|from)\s+([^?.;:]+)/gi;
+const ENTITY_LIST_CUTOFF =
+  /\b(?:collect|find|include|give|make|show|table|with|need|return|list|shown)\b/i;
+const GENERIC_HOSTED_DOMAIN =
+  /(?:^|\.)((github|gitlab)\.(io|com)|gitbook\.io|readthedocs\.io|notion\.site|medium\.com|substack\.com)$/i;
+
+function taskTextFromPrompt(prompt: string): string {
+  const taskLine = prompt.match(/^Task:\s*(.+)$/im)?.[1];
+  return taskLine?.trim() || prompt;
+}
+
+function uniqueStrings(values: string[]): string[] {
+  return [...new Set(values.map((value) => value.trim()).filter(Boolean))];
+}
+
+function tokenize(value: string): string[] {
+  return value
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, " ")
+    .split(/\s+/)
+    .filter((token) => token.length >= 2 && !ENTITY_STOPWORDS.has(token));
+}
+
+function looksLikeEntityName(value: string): boolean {
+  const trimmed = value.trim();
+  if (!trimmed || trimmed.length > 60) return false;
+  if (/^(?:and|or|the|official|latest|recent|current)$/i.test(trimmed)) {
+    return false;
+  }
+  return /[A-Z]/.test(trimmed[0] ?? "") || /[a-z][A-Z]/.test(trimmed);
+}
+
+function splitEntityList(value: string): string[] {
+  const beforeVerb = value.split(ENTITY_LIST_CUTOFF)[0] ?? value;
+  const nestedFrom = beforeVerb.match(/\bfrom\s+(.+)$/i)?.[1];
+  const entitySegment = nestedFrom ?? beforeVerb;
+  return entitySegment
+    .replace(/\s+and\s+/gi, ",")
+    .split(",")
+    .map((part) => part.trim().replace(/^and\s+/i, "").replace(/[.?!]$/g, ""))
+    .filter(looksLikeEntityName);
+}
+
+function extractExplicitEntities(prompt: string): PromptSourceEntity[] {
+  const names: string[] = [];
+  for (const match of prompt.matchAll(ENTITY_LIST_INTRODUCER)) {
+    names.push(...splitEntityList(match[1] ?? ""));
+  }
+
+  return uniqueStrings(names).map((name) => {
+    const domainTokens = tokenize(name);
+    return {
+      name,
+      primaryToken: domainTokens.at(-1) ?? name.toLowerCase(),
+      domainTokens,
+    };
+  });
+}
+
+function searchPhrasesForPrompt(prompt: string): string[] {
+  const lower = prompt.toLowerCase();
+  const phrases: string[] = [];
+
+  if (lower.includes("pricing")) {
+    phrases.push("official pricing page", "billing pricing");
+  }
+  if (lower.includes("investor relations") || lower.includes("earnings release")) {
+    phrases.push("reports quarterly results", "investor relations earnings release");
+  }
+  if (lower.includes("mcp")) {
+    phrases.push("MCP connector docs", "model context protocol docs");
+  } else if (lower.includes("docs") || lower.includes("documentation")) {
+    phrases.push("official docs");
+  }
+  if (lower.includes("blog post") || lower.includes("blog posts")) {
+    phrases.push("official blog latest post");
+  }
+  if (lower.includes("official website") || lower.includes("official websites")) {
+    phrases.push("official website");
+  }
+  if (lower.includes("official") && phrases.length === 0) {
+    phrases.push("official source");
+  }
+
+  return uniqueStrings(phrases);
+}
+
+export function derivePromptSourcePolicy(prompt: string): PromptSourcePolicy {
+  const taskText = taskTextFromPrompt(prompt);
+  const entities = extractExplicitEntities(taskText);
+  const searchPhrases = searchPhrasesForPrompt(taskText);
+  const lower = taskText.toLowerCase();
+  const asksForCanonicalSource =
+    searchPhrases.length > 0 ||
+    lower.includes("source url") ||
+    lower.includes("source page");
+  const requiresOfficialSource =
+    entities.length > 0 &&
+    asksForCanonicalSource &&
+    (lower.includes("official") ||
+      lower.includes("pricing") ||
+      lower.includes("investor relations") ||
+      lower.includes("earnings release") ||
+      lower.includes("docs") ||
+      lower.includes("documentation") ||
+      lower.includes("blog post"));
+
+  const hint = requiresOfficialSource
+    ? [
+        "Prompt source policy: user requested canonical/official sources for named entities.",
+        `Named entities: ${entities.map((entity) => entity.name).join(", ")}.`,
+        "Use official entity-owned domains for source_url, evidence, pricing/docs/blog/IR URLs, and required facts.",
+        "Use third-party pages only for discovery; do not use them as evidence when an official entity-owned page is available.",
+      ].join("\n")
+    : undefined;
+
+  return { requiresOfficialSource, entities, searchPhrases, hint };
+}
+
+export function promptSourceSearchQueries(policy: PromptSourcePolicy): string[] {
+  if (!policy.requiresOfficialSource || policy.entities.length === 0) {
+    return [];
+  }
+
+  const phrases = policy.searchPhrases.length
+    ? policy.searchPhrases
+    : ["official source"];
+
+  return uniqueStrings(
+    policy.entities.flatMap((entity) =>
+      phrases.map((phrase) => `${entity.name} ${phrase}`),
+    ),
+  );
+}
+
+export function applyPromptSourcePolicyToSpec(
+  spec: DatasetSpec,
+  prompt: string,
+): DatasetSpec {
+  const policy = derivePromptSourcePolicy(prompt);
+  if (!policy.requiresOfficialSource) {
+    return spec;
+  }
+
+  return {
+    ...spec,
+    search_queries: uniqueStrings([
+      ...promptSourceSearchQueries(policy),
+      ...spec.search_queries,
+    ]),
+    extraction_hints: [spec.extraction_hints, policy.hint]
+      .filter(Boolean)
+      .join("\n"),
+  };
+}
+
+export function urlMatchesPromptSourcePolicy(
+  url: string,
+  policy: PromptSourcePolicy,
+): boolean {
+  if (!policy.requiresOfficialSource) return true;
+  const domain = getDomain(url).toLowerCase();
+  if (GENERIC_HOSTED_DOMAIN.test(domain)) {
+    return false;
+  }
+  return policy.entities.some((entity) => domain.includes(entity.primaryToken));
+}
+
+export function sourceCandidatePolicyBoost(
+  candidate: SourceCandidate,
+  policy: PromptSourcePolicy,
+): number {
+  if (!policy.requiresOfficialSource) return 0;
+
+  const searchableText = [
+    candidate.url,
+    candidate.title,
+    candidate.snippet,
+    candidate.site_name,
+  ]
+    .join(" ")
+    .toLowerCase();
+  const matchedEntity = policy.entities.some((entity) =>
+    entity.domainTokens.some((token) => searchableText.includes(token)),
+  );
+  const matchedDomain = urlMatchesPromptSourcePolicy(candidate.url, policy);
+  const officialLanguage =
+    /\b(official|pricing|docs|documentation|investor relations|earnings|blog)\b/.test(
+      searchableText,
+    );
+
+  if (matchedDomain && matchedEntity && officialLanguage) return 5;
+  if (matchedDomain && matchedEntity) return 4;
+  if (matchedDomain) return 3;
+  if (matchedEntity && officialLanguage) return 1;
+  return -2;
+}
+
+export function applyPromptSourcePolicyToTriageResult(
+  result: SourceTriageResult,
+  policy: PromptSourcePolicy,
+): SourceTriageResult {
+  if (
+    !policy.requiresOfficialSource ||
+    ![
+      "extract_now",
+      "requires_navigation",
+      "requires_form_submission",
+      "requires_detail_page_followup",
+    ].includes(result.status) ||
+    urlMatchesPromptSourcePolicy(result.final_url || result.url, policy)
+  ) {
+    return result;
+  }
+
+  const domain = getDomain(result.final_url || result.url);
+  return {
+    ...result,
+    status: "low_value",
+    source_data_confidence: Math.min(result.source_data_confidence, 0.3),
+    expected_yield: "none",
+    reasoning:
+      `Prompt asks for official/canonical sources for named entities; ${domain} ` +
+      `does not match ${policy.entities.map((entity) => entity.name).join(", ")}. ` +
+      `Original triage: ${result.reasoning}`,
+    suggested_action:
+      result.suggested_action ??
+      "Search/fetch the named entity's official domain instead of extracting this third-party page.",
+  };
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts
index 6c9e219..68939e5 100644
--- a/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts
+++ b/backend/BigSet_Data_Collection_Agent/src/agents/source-triage.ts
@@ -11,6 +11,10 @@ import {
   type FetchedPage,
   type SourceTriageResult,
 } from "../models/schemas.js";
+import {
+  applyPromptSourcePolicyToTriageResult,
+  derivePromptSourcePolicy,
+} from "./source-policy.js";
 
 const TRIAGE_SYSTEM = `You are the Source Triage Agent for a web data collection pipeline.
 
@@ -90,11 +94,15 @@ export async function triagePage(options: {
     ],
   });
 
-  return {
+  const normalizedResult = {
     ...result,
     url: options.page.url,
     final_url: pageUrl,
     title: options.page.title || result.title,
     status: sourceStatusSchema.parse(result.status),
   };
+  return applyPromptSourcePolicyToTriageResult(
+    normalizedResult,
+    derivePromptSourcePolicy(options.userPrompt),
+  );
 }
diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts
index 6dd748c..aa24bfb 100644
--- a/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts
+++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts
@@ -5,6 +5,11 @@ import { domainMemoryBoost, type WorkflowMemory } from "../memory/index.js";
 import type { SearchPlan } from "../memory/search-pagination.js";
 import { getPrimaryKeyValue } from "../merge/records.js";
 import { createFetchQueue, createSearchQueue } from "../queue/pools.js";
+import {
+  derivePromptSourcePolicy,
+  sourceCandidatePolicyBoost,
+  type PromptSourcePolicy,
+} from "../agents/source-policy.js";
 import type {
   AgentRunRecord,
   DatasetSpec,
@@ -39,6 +44,7 @@ function rankCandidates(
   excludeUrls: Set<string>,
   limit: number,
   memory?: WorkflowMemory,
+  sourcePolicy?: PromptSourcePolicy,
 ): string[] {
   const byUrl = new Map<
     string,
@@ -55,6 +61,7 @@ function rankCandidates(
     if (candidate.title.length > 10) score += 0.5;
     if (candidate.snippet.length > 40) score += 0.5;
     if (memory) score += domainMemoryBoost(memory, domain);
+    if (sourcePolicy) score += sourceCandidatePolicyBoost(candidate, sourcePolicy);
     byUrl.set(url, { url, score, domain });
   }
 
@@ -127,15 +134,18 @@ export async function runAcquisitionPhase(options: {
     },
   );
   const candidates: SourceCandidate[] = searchBatches.flat();
+  const sourcePolicy = derivePromptSourcePolicy(options.userPrompt);
 
   const urlsToFetch = rankCandidates(
     candidates,
     options.excludeUrls,
     options.maxUrlsToFetch,
     options.memory,
+    sourcePolicy,
   );
 
-  const fetchWithLinks = options.enableLinkFollow ?? false;
+  const fetchWithLinks =
+    options.enableLinkFollow ?? sourcePolicy.requiresOfficialSource;
   const urlChunks = chunkUrls(urlsToFetch, config.fetchBatchSize);
 
   options.log(
diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts
index 99e2e52..ef81d2a 100644
--- a/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts
+++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/process-pages.ts
@@ -2,6 +2,7 @@ import { generateAgentGoal } from "../agents/agent-goal.js";
 import { extractFromAgentResult } from "../agents/extract-from-agent.js";
 import { extractFromPage } from "../agents/extract.js";
 import { triagePage } from "../agents/source-triage.js";
+import { derivePromptSourcePolicy } from "../agents/source-policy.js";
 import { config } from "../config.js";
 import { runTinyfishAgentsBatch } from "../integrations/tinyfish-agent.js";
 import type { WorkflowMemory } from "../memory/index.js";
@@ -60,6 +61,34 @@ function bumpStatus(summary: TriageSummary, status: SourceStatus): void {
   summary.by_status[status] = (summary.by_status[status] ?? 0) + 1;
 }
 
+function shouldFallbackExtractOfficialNavigation(
+  url: string,
+  status: SourceStatus,
+): boolean {
+  if (
+    status !== "requires_navigation" &&
+    status !== "requires_detail_page_followup"
+  ) {
+    return false;
+  }
+
+  try {
+    const parsed = new URL(url);
+    const path = `${parsed.pathname}${parsed.search}`.toLowerCase();
+    if (
+      path === "/" ||
+      /(?:login|signin|signup|default\.aspx|home)(?:\/|$|\?)/.test(path)
+    ) {
+      return false;
+    }
+    return /(?:pricing|billing|docs|documentation|mcp|model-context-protocol|earnings|press-release|quarterly|results|news|blog)/.test(
+      path,
+    );
+  } catch {
+    return false;
+  }
+}
+
 export async function processFetchedPages(options: {
   label: string;
   userPrompt: string;
@@ -81,6 +110,7 @@ export async function processFetchedPages(options: {
   const records: ExtractedRecord[] = [];
   const agentRuns: AgentRunRecord[] = [];
   const knownKeys = new Set(options.knownEntityKeys ?? []);
+  const sourcePolicy = derivePromptSourcePolicy(options.userPrompt);
 
   const successfulPages = options.pages.filter(
     (page) => !page.error && page.text.trim().length > 0,
@@ -200,6 +230,21 @@ export async function processFetchedPages(options: {
       summary.agent_candidates += 1;
       if (agentEnabled) {
         agentQueue.push({ page, triage });
+      } else if (
+        sourcePolicy.requiresOfficialSource &&
+        shouldFallbackExtractOfficialNavigation(triage.final_url, triage.status)
+      ) {
+        options.log(
+          options.label,
+          `Agent disabled — intent-path fallback extract for ${triage.final_url} [${triage.status}]`,
+        );
+        extractPages.push({ page, triage });
+      } else if (sourcePolicy.requiresOfficialSource) {
+        summary.skipped += 1;
+        options.log(
+          options.label,
+          `Agent disabled — skip navigation-only official source ${triage.final_url} [${triage.status}]`,
+        );
       } else {
         options.log(
           options.label,
diff --git a/backend/test/collection-source-policy.test.ts b/backend/test/collection-source-policy.test.ts
new file mode 100644
index 0000000..c2079a0
--- /dev/null
+++ b/backend/test/collection-source-policy.test.ts
@@ -0,0 +1,182 @@
+import assert from "node:assert/strict";
+import { test } from "node:test";
+
+import {
+  applyPromptSourcePolicyToSpec,
+  applyPromptSourcePolicyToTriageResult,
+  derivePromptSourcePolicy,
+  promptSourceSearchQueries,
+  sourceCandidatePolicyBoost,
+  urlMatchesPromptSourcePolicy,
+} from "../BigSet_Data_Collection_Agent/src/agents/source-policy.js";
+import type {
+  DatasetSpec,
+  SourceCandidate,
+  SourceTriageResult,
+} from "../BigSet_Data_Collection_Agent/src/models/schemas.js";
+
+test("prompt source policy derives official queries from the user's prompt", () => {
+  const policy = derivePromptSourcePolicy(
+    "For Stripe, Paddle, and Chargebee, collect the official pricing page URL and the plan names or starting prices shown on the page.",
+  );
+
+  assert.equal(policy.requiresOfficialSource, true);
+  assert.deepEqual(
+    policy.entities.map((entity) => entity.name),
+    ["Stripe", "Paddle", "Chargebee"],
+  );
+  assert.deepEqual(promptSourceSearchQueries(policy).slice(0, 3), [
+    "Stripe official pricing page",
+    "Stripe billing pricing",
+    "Paddle official pricing page",
+  ]);
+});
+
+test("prompt source policy ignores generic durable recipe source wording", () => {
+  const policy = derivePromptSourcePolicy(
+    [
+      "Dataset: benchmark_latest-ai-blog-posts",
+      "Task: Can you make me a table of the latest blog posts from OpenAI, Anthropic, and Google DeepMind? I need title, publish date, and URL.",
+      "",
+      "Durable recipe instructions:",
+      "Prefer official docs, pricing, blog, product, or company pages over third-party summaries.",
+    ].join("\n"),
+  );
+
+  const queries = promptSourceSearchQueries(policy);
+
+  assert.deepEqual(queries, [
+    "OpenAI official blog latest post",
+    "Anthropic official blog latest post",
+    "Google DeepMind official blog latest post",
+  ]);
+});
+
+test("prompt source policy adds official-source guidance without benchmark answer keys", () => {
+  const spec: DatasetSpec = {
+    intent_summary: "Collect pricing pages.",
+    target_row_count: 3,
+    row_grain: "one row per company",
+    columns: [
+      {
+        name: "entity_name",
+        type: "string",
+        description: "Company.",
+        required: true,
+      },
+      {
+        name: "pricing_page_url",
+        type: "string",
+        description: "Official pricing URL.",
+        required: true,
+      },
+    ],
+    dedupe_keys: ["entity_name"],
+    search_queries: ["SaaS pricing pages"],
+    extraction_hints: "Extract plan names.",
+  };
+
+  const updated = applyPromptSourcePolicyToSpec(
+    spec,
+    "For Stripe and Paddle, collect the official pricing page URL.",
+  );
+
+  assert.equal(updated.search_queries[0], "Stripe official pricing page");
+  assert.equal(updated.search_queries[1], "Stripe billing pricing");
+  assert.equal(updated.search_queries[2], "Paddle official pricing page");
+  assert.match(updated.extraction_hints, /Prompt source policy/);
+  assert.match(updated.extraction_hints, /Stripe, Paddle/);
+});
+
+test("prompt source policy prefers entity-owned domains over third-party proof", () => {
+  const policy = derivePromptSourcePolicy(
+    "Find the latest investor relations earnings release page for Apple, Microsoft, and Nvidia.",
+  );
+
+  assert.equal(
+    urlMatchesPromptSourcePolicy("https://investor.apple.com/newsroom/", policy),
+    true,
+  );
+  assert.equal(
+    urlMatchesPromptSourcePolicy("https://finance.yahoo.com/quote/AAPL", policy),
+    false,
+  );
+  assert.equal(
+    urlMatchesPromptSourcePolicy("https://cloud.google.com/blog/topics/threat-intelligence", {
+      ...derivePromptSourcePolicy(
+        "Can you make me a table of the latest blog posts from OpenAI, Anthropic, and Google DeepMind?",
+      ),
+    }),
+    false,
+  );
+  assert.equal(
+    urlMatchesPromptSourcePolicy(
+      "https://openai.github.io/openai-agents-python/mcp/",
+      derivePromptSourcePolicy(
+        "I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare.",
+      ),
+    ),
+    false,
+  );
+});
+
+test("prompt source policy downgrades third-party extraction triage", () => {
+  const policy = derivePromptSourcePolicy(
+    "For Stripe, Paddle, and Chargebee, collect the official pricing page URL and plan names.",
+  );
+  const triage: SourceTriageResult = {
+    url: "https://www.trustradius.com/products/paddle/pricing",
+    final_url: "https://www.trustradius.com/products/paddle/pricing",
+    title: "Paddle Pricing",
+    status: "extract_now",
+    confidence: 0.9,
+    source_data_confidence: 0.8,
+    expected_yield: "complete",
+    reasoning: "Page lists pricing information.",
+  };
+
+  const updated = applyPromptSourcePolicyToTriageResult(triage, policy);
+
+  assert.equal(updated.status, "low_value");
+  assert.equal(updated.expected_yield, "none");
+  assert.match(updated.reasoning, /official\/canonical sources/);
+});
+
+test("prompt source policy boosts official candidates", () => {
+  const policy = derivePromptSourcePolicy(
+    [
+      "Dataset: benchmark_mcp-docs-pages",
+      "Task: I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare. Give me title, URL, and what each page covers.",
+      "",
+      "Durable recipe instructions:",
+      "Prefer official docs, pricing, blog, product, or company pages over third-party summaries.",
+    ].join("\n"),
+  );
+  assert.deepEqual(
+    policy.entities.map((entity) => entity.name),
+    ["Anthropic", "OpenAI", "Cloudflare"],
+  );
+  assert.deepEqual(promptSourceSearchQueries(policy).slice(0, 4), [
+    "Anthropic MCP connector docs",
+    "Anthropic model context protocol docs",
+    "OpenAI MCP connector docs",
+    "OpenAI model context protocol docs",
+  ]);
+  const official: SourceCandidate = {
+    url: "https://developers.cloudflare.com/agents/model-context-protocol/",
+    title: "MCP servers",
+    snippet: "Official Cloudflare docs for MCP server setup.",
+    query: "Cloudflare official docs MCP server setup",
+  };
+  const thirdParty: SourceCandidate = {
+    url: "https://example.com/cloudflare-mcp-guide",
+    title: "Cloudflare MCP guide",
+    snippet: "A blog guide to Cloudflare MCP.",
+    query: "Cloudflare official docs MCP server setup",
+  };
+
+  assert.ok(
+    sourceCandidatePolicyBoost(official, policy) >
+      sourceCandidatePolicyBoost(thirdParty, policy),
+  );
+});