diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts b/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts index 2055102..bab859d 100644 --- a/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts +++ b/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts @@ -13,6 +13,7 @@ import { type ExtractedRecord, type FetchedPage, } from "../models/schemas.js"; +import { deriveRecordSourceUrls } from "../records/source-urls.js"; /** * Extraction is always one source per LLM call in process-pages.ts: @@ -169,19 +170,6 @@ function provenanceUrlColumns(spec: DatasetSpec): ColumnDef[] { return spec.columns.filter(isProvenanceUrlColumn); } -function collectSourceUrls( - pageUrl: string, - evidence: Array<{ url?: string }>, -): string[] { - const urls = new Set([pageUrl]); - for (const item of evidence) { - if (item.url?.startsWith("http")) { - urls.add(item.url); - } - } - return [...urls]; -} - /** Attach evidence URLs and source_urls; keep LLM row and provenance values. */ export function finalizeExtractedRecord( record: LlmExtractionRecord, @@ -203,7 +191,12 @@ export function finalizeExtractedRecord( } } - const source_urls = collectSourceUrls(pageUrl, evidence); + const source_urls = deriveRecordSourceUrls({ + spec, + row, + evidence, + fallbackUrls: [pageUrl], + }); return extractedRecordSchema.parse({ row, diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts index 703109f..1ea3b54 100644 --- a/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts +++ b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts @@ -1,4 +1,10 @@ -import type { DatasetSpec, SourceCandidate, SourceTriageResult } from "../models/schemas.js"; +import type { + DatasetSpec, + ExtractedRecord, + SourceCandidate, + SourceTriageResult, +} from "../models/schemas.js"; +import { scoreDocsUrlForOfficialSource } from "../records/source-urls.js"; import { getDomain } from "../utils/url.js"; export interface PromptSourceEntity { @@ -121,6 +127,32 @@ function searchPhrasesForPrompt(prompt: string): string[] { return uniqueStrings(phrases); } +function wantsDocsSource(policy: PromptSourcePolicy): boolean { + return policy.searchPhrases.some((phrase) => + /\b(?:docs|documentation|mcp|model context protocol)\b/i.test(phrase), + ); +} + +function isWeakDocsSurface(url: string): boolean { + return /\b(?:blog|news|course|academy|directory|skilljar)\b/i.test(url); +} + +function preferredDocsHost(entity: PromptSourceEntity): string { + const primary = entity.primaryToken.toLowerCase(); + if (primary === "openai") return "developers.openai.com"; + if (primary === "cloudflare") return "developers.cloudflare.com"; + if (primary === "anthropic") return "platform.claude.com"; + return `docs.${primary}.com`; +} + +function officialDomainAliasesForEntity(entity: PromptSourceEntity): string[] { + const primary = entity.primaryToken.toLowerCase(); + if (primary === "anthropic") { + return ["docs.anthropic.com", "platform.claude.com"]; + } + return []; +} + export function derivePromptSourcePolicy(prompt: string): PromptSourcePolicy { const taskText = taskTextFromPrompt(prompt); const entities = extractExplicitEntities(taskText); @@ -161,11 +193,21 @@ export function promptSourceSearchQueries(policy: PromptSourcePolicy): string[] const phrases = policy.searchPhrases.length ? policy.searchPhrases : ["official source"]; + const primaryPhrase = phrases[0] ?? "official source"; + const siteQualifiedDocsQueries = wantsDocsSource(policy) + ? policy.entities.map( + (entity) => + `${entity.name} ${primaryPhrase} site:${preferredDocsHost(entity)}`, + ) + : []; return uniqueStrings( - policy.entities.flatMap((entity) => - phrases.map((phrase) => `${entity.name} ${phrase}`), - ), + [ + ...siteQualifiedDocsQueries, + ...policy.entities.flatMap((entity) => + phrases.map((phrase) => `${entity.name} ${phrase}`), + ), + ], ); } @@ -199,7 +241,32 @@ export function urlMatchesPromptSourcePolicy( if (GENERIC_HOSTED_DOMAIN.test(domain)) { return false; } - return policy.entities.some((entity) => domain.includes(entity.primaryToken)); + return policy.entities.some( + (entity) => urlMatchesEntitySourcePolicy(url, entity, policy), + ); +} + +function urlMatchesEntitySourcePolicy( + url: string, + entity: PromptSourceEntity, + policy: PromptSourcePolicy, +): boolean { + const domain = getDomain(url).toLowerCase(); + if (GENERIC_HOSTED_DOMAIN.test(domain)) { + return false; + } + const entityOwnedDomain = + domain.includes(entity.primaryToken) || + officialDomainAliasesForEntity(entity).some((alias) => + domain.endsWith(alias), + ); + if (!entityOwnedDomain) { + return false; + } + if (wantsDocsSource(policy) && isWeakDocsSurface(url)) { + return false; + } + return true; } export function sourceCandidatePolicyBoost( @@ -224,9 +291,20 @@ export function sourceCandidatePolicyBoost( /\b(official|pricing|docs|documentation|investor relations|earnings|blog)\b/.test( searchableText, ); + const docsSurface = + wantsDocsSource(policy) && + /(?:^|\/\/)(?:docs|developers)\.|\/(?:docs|documentation|guides|api\/docs|agents)(?:\/|$)/.test( + searchableText, + ); + const weakDocsSurface = + wantsDocsSource(policy) && + /\b(?:blog|news|course|academy|directory|skilljar)\b/.test(searchableText); - if (matchedDomain && matchedEntity && officialLanguage) return 5; - if (matchedDomain && matchedEntity) return 4; + if (matchedDomain && matchedEntity && docsSurface) return 7; + if (matchedDomain && matchedEntity && officialLanguage) { + return weakDocsSurface ? 2 : 5; + } + if (matchedDomain && matchedEntity) return weakDocsSurface ? 1 : 4; if (matchedDomain) return 3; if (matchedEntity && officialLanguage) return 1; return -2; @@ -264,3 +342,79 @@ export function applyPromptSourcePolicyToTriageResult( "Search/fetch the named entity's official domain instead of extracting this third-party page.", }; } + +export function recordMatchesPromptSourcePolicy( + record: ExtractedRecord, + spec: DatasetSpec, + policy: PromptSourcePolicy, +): boolean { + if (!policy.requiresOfficialSource) { + return true; + } + + const entity = matchingPromptEntityForRecord(record, spec, policy); + if (!entity) { + return true; + } + + const urls = urlsForRecordSourcePolicy(record, spec); + if (urls.length === 0) { + return false; + } + + return urls.some((url) => urlMatchesEntitySourcePolicy(url, entity, policy)); +} + +function matchingPromptEntityForRecord( + record: ExtractedRecord, + spec: DatasetSpec, + policy: PromptSourcePolicy, +): PromptSourceEntity | null { + const primaryColumn = + spec.dedupe_keys[0] ?? + spec.columns.find((column) => + /(name|title|company|organization|entity)/i.test(column.name), + )?.name; + const primaryValue = String( + primaryColumn ? record.row[primaryColumn] ?? "" : "", + ).toLowerCase(); + const rowText = Object.values(record.row).join(" ").toLowerCase(); + + return ( + policy.entities.find((entity) => { + const name = entity.name.toLowerCase(); + return ( + primaryValue.includes(name) || + primaryValue.includes(entity.primaryToken) || + rowText.includes(name) + ); + }) ?? null + ); +} + +function urlsForRecordSourcePolicy( + record: ExtractedRecord, + spec: DatasetSpec, +): string[] { + const urls = new Set(); + for (const url of record.source_urls) { + if (isHttpUrl(url)) urls.add(url.trim()); + } + for (const column of spec.columns) { + if (!isUrlLikeColumnName(column.name)) continue; + const value = record.row[column.name]; + if (isHttpUrl(value)) urls.add(value.trim()); + } + return [...urls].sort((a, b) => { + return scoreDocsUrlForOfficialSource(b) - scoreDocsUrlForOfficialSource(a); + }); +} + +function isHttpUrl(value: unknown): value is string { + return typeof value === "string" && /^https?:\/\//i.test(value.trim()); +} + +function isUrlLikeColumnName(name: string): boolean { + const lower = name.toLowerCase(); + return lower === "url" || lower.endsWith("_url") || lower.includes("url"); +} diff --git a/backend/BigSet_Data_Collection_Agent/src/merge/records.ts b/backend/BigSet_Data_Collection_Agent/src/merge/records.ts index 995af2d..5773ce3 100644 --- a/backend/BigSet_Data_Collection_Agent/src/merge/records.ts +++ b/backend/BigSet_Data_Collection_Agent/src/merge/records.ts @@ -1,10 +1,30 @@ import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js"; +import { + deriveRecordSourceUrls, + scoreDocsUrlForOfficialSource, +} from "../records/source-urls.js"; function normalizeValue(value: unknown): string { if (value === null || value === undefined) return ""; return String(value).trim().toLowerCase(); } +function isEmpty(value: unknown): boolean { + return value === null || value === undefined || value === ""; +} + +function normalizeComparableValue(value: unknown): string { + return normalizeValue(value) + .replace(/https?:\/\/(?:www\.)?/g, "") + .replace(/[/#?]+$/g, "") + .replace(/\s+/g, " "); +} + +function valuesMatch(a: unknown, b: unknown): boolean { + if (isEmpty(a) || isEmpty(b)) return false; + return normalizeComparableValue(a) === normalizeComparableValue(b); +} + /** Normalize entity names for stable primary-key matching. */ export function normalizePrimaryKey(value: unknown): string { return normalizeValue(value) @@ -115,27 +135,58 @@ export function mergePair( spec: DatasetSpec, ): ExtractedRecord { const row: Record = { ...a.row }; + const fieldsFilledFromIncoming = new Set(); + let replacedDocsUrlFromIncoming = false; for (const col of spec.columns) { const current = row[col.name]; const incoming = b.row[col.name]; - const currentEmpty = - current === null || current === undefined || current === ""; - const incomingFilled = - incoming !== null && incoming !== undefined && incoming !== ""; + const currentEmpty = isEmpty(current); + const incomingFilled = !isEmpty(incoming); if (currentEmpty && incomingFilled) { row[col.name] = incoming ?? null; + fieldsFilledFromIncoming.add(col.name); + } else if (incomingFilled && shouldReplaceCell(col.name, current, incoming)) { + row[col.name] = incoming ?? null; + fieldsFilledFromIncoming.add(col.name); + replacedDocsUrlFromIncoming ||= isDocsUrlColumn(col.name); + } + } + + if (replacedDocsUrlFromIncoming) { + for (const col of spec.columns) { + const incoming = b.row[col.name]; + if ( + isDocsCompanionColumn(col.name) && + !isEmpty(incoming) && + !spec.dedupe_keys.includes(col.name) + ) { + row[col.name] = incoming ?? null; + fieldsFilledFromIncoming.add(col.name); + } } } - const evidence = [...a.evidence]; + const evidence = a.evidence.filter((item) => + valuesMatch(row[item.field], a.row[item.field]), + ); const evidenceFields = new Set(evidence.map((e) => e.field)); for (const item of b.evidence) { - if (!evidenceFields.has(item.field)) { + if ( + !evidenceFields.has(item.field) && + shouldMergeIncomingEvidence({ + field: item.field, + mergedRow: row, + incomingRow: b.row, + fieldsFilledFromIncoming, + }) + ) { evidence.push(item); + evidenceFields.add(item.field); } } + const coherentEvidence = filterEvidenceForRetainedDocsUrl(spec, row, evidence); const extractionConfidence = Math.max( a.extraction_confidence ?? 0, @@ -144,10 +195,141 @@ export function mergePair( return { row, - evidence, - source_urls: [...new Set([...a.source_urls, ...b.source_urls])], + evidence: coherentEvidence, + source_urls: deriveRecordSourceUrls({ + spec, + row, + evidence: coherentEvidence, + fallbackUrls: coherentEvidence.length > 0 ? [] : a.source_urls, + }), ...(extractionConfidence > 0 ? { extraction_confidence: extractionConfidence } : {}), }; } + +function shouldMergeIncomingEvidence(input: { + field: string; + mergedRow: Record; + incomingRow: Record; + fieldsFilledFromIncoming: Set; +}): boolean { + if ( + isDocsUrlColumn(input.field) && + !urlsReferenceSamePage( + input.incomingRow[input.field], + input.mergedRow[input.field], + ) + ) { + return false; + } + if (input.fieldsFilledFromIncoming.has(input.field)) { + return true; + } + return valuesMatch(input.mergedRow[input.field], input.incomingRow[input.field]); +} + +function shouldReplaceCell( + columnName: string, + current: string | number | boolean | null | undefined, + incoming: string | number | boolean | null | undefined, +): boolean { + if (!isDocsUrlColumn(columnName)) { + return false; + } + return ( + scoreDocsUrlForOfficialSource(incoming) > + scoreDocsUrlForOfficialSource(current) + ); +} + +function isDocsUrlColumn(columnName: string): boolean { + const lower = columnName.toLowerCase(); + return ( + lower === "docs_url" || + lower.endsWith("_docs_url") || + (lower.includes("docs") && lower.includes("url")) + ); +} + +function isDocsCompanionColumn(columnName: string): boolean { + const lower = columnName.toLowerCase(); + return ( + lower === "summary" || + lower === "description" || + lower === "docs_title" || + (lower.includes("docs") && lower.includes("title")) + ); +} + +function filterEvidenceForRetainedDocsUrl( + spec: DatasetSpec, + row: Record, + evidence: ExtractedRecord["evidence"], +): ExtractedRecord["evidence"] { + const retainedDocsUrl = bestRetainedDocsUrl(spec, row); + if (!retainedDocsUrl) { + return evidence; + } + + return evidence.filter((item) => { + if (isDocsUrlColumn(item.field)) { + return urlsReferenceSamePage(item.url, row[item.field]); + } + + if ( + isDocsCompanionColumn(item.field) || + spec.dedupe_keys.includes(item.field) + ) { + return sourceUrlSupportsRetainedDocsUrl(item.url, retainedDocsUrl); + } + + return true; + }); +} + +function bestRetainedDocsUrl( + spec: DatasetSpec, + row: Record, +): string | null { + let bestUrl: string | null = null; + let bestScore = 0; + for (const col of spec.columns) { + if (!isDocsUrlColumn(col.name)) continue; + const value = row[col.name]; + const score = scoreDocsUrlForOfficialSource(value); + if (typeof value === "string" && score > bestScore) { + bestUrl = value; + bestScore = score; + } + } + return bestScore >= 4 ? bestUrl : null; +} + +function sourceUrlSupportsRetainedDocsUrl( + evidenceUrl: unknown, + retainedDocsUrl: string, +): boolean { + if (urlsReferenceSamePage(evidenceUrl, retainedDocsUrl)) { + return true; + } + return ( + sameHostname(evidenceUrl, retainedDocsUrl) && + scoreDocsUrlForOfficialSource(evidenceUrl) >= 4 + ); +} + +function urlsReferenceSamePage(a: unknown, b: unknown): boolean { + if (isEmpty(a) || isEmpty(b)) return false; + return normalizeComparableValue(a) === normalizeComparableValue(b); +} + +function sameHostname(a: unknown, b: unknown): boolean { + try { + const aHost = new URL(String(a)).hostname.replace(/^www\./, ""); + const bHost = new URL(String(b)).hostname.replace(/^www\./, ""); + return aHost === bHost; + } catch { + return false; + } +} diff --git a/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts b/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts index aa24bfb..a879312 100644 --- a/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts +++ b/backend/BigSet_Data_Collection_Agent/src/orchestrator/acquisition.ts @@ -7,6 +7,7 @@ import { getPrimaryKeyValue } from "../merge/records.js"; import { createFetchQueue, createSearchQueue } from "../queue/pools.js"; import { derivePromptSourcePolicy, + recordMatchesPromptSourcePolicy, sourceCandidatePolicyBoost, type PromptSourcePolicy, } from "../agents/source-policy.js"; @@ -237,6 +238,18 @@ export async function runAcquisitionPhase(options: { memory: options.memory, log: options.log, }); + const records = sourcePolicy.requiresOfficialSource + ? processed.records.filter((record) => + recordMatchesPromptSourcePolicy(record, options.spec, sourcePolicy), + ) + : processed.records; + const droppedRecords = processed.records.length - records.length; + if (droppedRecords > 0) { + options.log( + options.label, + `Dropped ${droppedRecords} record(s) that lacked entity-owned source URLs`, + ); + } const allFetchedUrls = [ ...new Set([ @@ -250,7 +263,7 @@ export async function runAcquisitionPhase(options: { fetchedUrls: allFetchedUrls, failedUrls, fetchedPages, - records: processed.records, + records, pagesFetched: fetchedPages.length, triage: processed.summary, triageResults: processed.triageResults, diff --git a/backend/BigSet_Data_Collection_Agent/src/records/source-urls.ts b/backend/BigSet_Data_Collection_Agent/src/records/source-urls.ts new file mode 100644 index 0000000..f193ffc --- /dev/null +++ b/backend/BigSet_Data_Collection_Agent/src/records/source-urls.ts @@ -0,0 +1,54 @@ +import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js"; + +function isHttpUrl(value: unknown): value is string { + return typeof value === "string" && /^https?:\/\//i.test(value.trim()); +} + +function isUrlLikeColumnName(name: string): boolean { + const lower = name.toLowerCase(); + return lower === "url" || lower.endsWith("_url") || lower.includes("url"); +} + +export function deriveRecordSourceUrls(input: { + spec: DatasetSpec; + row: ExtractedRecord["row"]; + evidence: ExtractedRecord["evidence"]; + fallbackUrls?: string[]; +}): string[] { + const urls = new Set(); + for (const item of input.evidence) { + if (isHttpUrl(item.url)) { + urls.add(item.url.trim()); + } + } + + for (const column of input.spec.columns) { + if (!isUrlLikeColumnName(column.name)) continue; + const value = input.row[column.name]; + if (isHttpUrl(value)) { + urls.add(value.trim()); + } + } + + for (const url of input.fallbackUrls ?? []) { + if (isHttpUrl(url)) { + urls.add(url.trim()); + } + } + + return [...urls]; +} + +export function scoreDocsUrlForOfficialSource(value: unknown): number { + if (!isHttpUrl(value)) return 0; + const normalized = value.toLowerCase(); + let score = 1; + if (/^https:\/\/(?:docs|developers)\./.test(normalized)) score += 4; + if (/\/(?:docs|documentation|guides|api\/docs|agents|model-context-protocol|mcp)(?:\/|$|\?)/.test(normalized)) { + score += 3; + } + if (/\b(?:blog|news|course|academy|directory|skilljar)\b/.test(normalized)) { + score -= 4; + } + return score; +} diff --git a/backend/test/collection-record-merge.test.ts b/backend/test/collection-record-merge.test.ts new file mode 100644 index 0000000..c2bfd50 --- /dev/null +++ b/backend/test/collection-record-merge.test.ts @@ -0,0 +1,476 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { + mergePair, + mergeRecords, +} from "../BigSet_Data_Collection_Agent/src/merge/records.js"; +import type { + DatasetSpec, + ExtractedRecord, +} from "../BigSet_Data_Collection_Agent/src/models/schemas.js"; + +const docsSpec: DatasetSpec = { + intent_summary: "Official MCP docs pages.", + target_row_count: 3, + row_grain: "one row per vendor", + columns: [ + { + name: "entity_name", + type: "string", + description: "Vendor name.", + required: true, + }, + { + name: "docs_title", + type: "string", + description: "Docs page title.", + required: true, + }, + { + name: "docs_url", + type: "string", + description: "Official docs page URL.", + required: true, + }, + { + name: "summary", + type: "string", + description: "What the page covers.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: ["MCP docs"], + extraction_hints: "Prefer official docs pages.", +}; + +test("collection record merge does not attach evidence from conflicting duplicate rows", () => { + const officialRecord = record({ + row: { + entity_name: "Cloudflare", + docs_title: "Connect to an MCP server", + docs_url: "https://developers.cloudflare.com/agents/guides/connect-mcp-client/", + summary: "Official docs for connecting an MCP client.", + }, + evidence: [ + evidence( + "summary", + "https://developers.cloudflare.com/agents/guides/connect-mcp-client/", + "Connect to an MCP server." + ), + ], + sourceUrls: [ + "https://developers.cloudflare.com/agents/guides/connect-mcp-client/", + ], + }); + const blogRecord = record({ + row: { + entity_name: "Cloudflare", + docs_title: "Code Mode: the better way to use MCP", + docs_url: "https://blog.cloudflare.com/code-mode/", + summary: "Blog post about code mode.", + }, + evidence: [ + evidence( + "docs_title", + "https://blog.cloudflare.com/code-mode/", + "Code Mode: the better way to use MCP" + ), + evidence( + "docs_url", + "https://blog.cloudflare.com/code-mode/", + "https://blog.cloudflare.com/code-mode/" + ), + ], + sourceUrls: ["https://blog.cloudflare.com/code-mode/"], + }); + + const merged = mergePair(officialRecord, blogRecord, docsSpec); + + assert.equal( + merged.row.docs_url, + "https://developers.cloudflare.com/agents/guides/connect-mcp-client/" + ); + assert.deepEqual( + merged.evidence.map((item) => item.url), + ["https://developers.cloudflare.com/agents/guides/connect-mcp-client/"] + ); + assert.deepEqual(merged.source_urls, [ + "https://developers.cloudflare.com/agents/guides/connect-mcp-client/", + ]); +}); + +test("collection record merge keeps incoming evidence when it fills a missing field", () => { + const partialRecord = record({ + row: { + entity_name: "OpenAI", + docs_title: "MCP and Connectors", + docs_url: null, + summary: "OpenAI MCP docs.", + }, + evidence: [ + evidence( + "summary", + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + "remote MCP servers and connectors" + ), + ], + sourceUrls: [ + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ], + }); + const urlRecord = record({ + row: { + entity_name: "OpenAI", + docs_title: "MCP and Connectors", + docs_url: "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + summary: null, + }, + evidence: [ + evidence( + "docs_url", + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp" + ), + ], + sourceUrls: [ + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ], + }); + + const merged = mergePair(partialRecord, urlRecord, docsSpec); + + assert.equal( + merged.row.docs_url, + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp" + ); + assert.deepEqual( + merged.evidence.map((item) => item.field), + ["summary", "docs_url"] + ); + assert.deepEqual(merged.source_urls, [ + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ]); +}); + +test("collection record merge keeps same-value supplemental evidence", () => { + const merged = mergeRecords(docsSpec, [ + record({ + row: { + entity_name: "Anthropic", + docs_title: "Model Context Protocol connector", + docs_url: "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + summary: "Connector docs.", + }, + evidence: [ + evidence( + "summary", + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + "MCP connector" + ), + ], + sourceUrls: [ + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + ], + }), + record({ + row: { + entity_name: "Anthropic", + docs_title: "Model Context Protocol connector", + docs_url: "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + summary: "Connector docs.", + }, + evidence: [ + evidence( + "docs_title", + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + "Model Context Protocol connector" + ), + ], + sourceUrls: [ + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + ], + }), + ]).records; + + assert.equal(merged.length, 1); + assert.deepEqual( + merged[0]?.evidence.map((item) => item.field), + ["summary", "docs_title"] + ); +}); + +test("collection record merge replaces weak docs URLs with stronger docs surfaces", () => { + const merged = mergePair( + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Code Mode: the better way to use MCP", + docs_url: "https://blog.cloudflare.com/code-mode/", + summary: "Blog post about MCP code mode.", + }, + evidence: [ + evidence( + "docs_url", + "https://blog.cloudflare.com/code-mode/", + "https://blog.cloudflare.com/code-mode/" + ), + ], + sourceUrls: ["https://blog.cloudflare.com/code-mode/"], + }), + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Model Context Protocol", + docs_url: "https://developers.cloudflare.com/agents/model-context-protocol/", + summary: "Official docs for Cloudflare MCP servers.", + }, + evidence: [ + evidence( + "docs_title", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "Model Context Protocol" + ), + evidence( + "docs_url", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "https://developers.cloudflare.com/agents/model-context-protocol/" + ), + evidence( + "summary", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "MCP servers" + ), + ], + sourceUrls: [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + ], + }), + docsSpec, + ); + + assert.equal( + merged.row.docs_url, + "https://developers.cloudflare.com/agents/model-context-protocol/" + ); + assert.equal(merged.row.docs_title, "Model Context Protocol"); + assert.equal(merged.row.summary, "Official docs for Cloudflare MCP servers."); + assert.deepEqual( + merged.evidence.map((item) => item.field), + ["docs_title", "docs_url", "summary"] + ); + assert.deepEqual( + merged.evidence.map((item) => item.url), + [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "https://developers.cloudflare.com/agents/model-context-protocol/", + ] + ); + assert.deepEqual(merged.source_urls, [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + ]); +}); + +test("collection record merge drops docs URL evidence from unrelated source pages", () => { + const merged = mergePair( + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Docs for agents", + docs_url: null, + summary: null, + }, + evidence: [], + sourceUrls: [], + }), + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Model Context Protocol", + docs_url: "https://developers.cloudflare.com/agents/model-context-protocol/", + summary: "Official docs for Cloudflare MCP servers.", + }, + evidence: [ + evidence( + "docs_url", + "https://developers.openai.com/api/docs", + "https://developers.cloudflare.com/agents/model-context-protocol/" + ), + evidence( + "summary", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "MCP servers" + ), + ], + sourceUrls: [ + "https://developers.openai.com/api/docs", + "https://developers.cloudflare.com/agents/model-context-protocol/", + ], + }), + docsSpec, + ); + + assert.equal( + merged.row.docs_url, + "https://developers.cloudflare.com/agents/model-context-protocol/" + ); + assert.deepEqual( + merged.evidence.map((item) => item.field), + ["summary"] + ); + assert.deepEqual(merged.source_urls, [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + ]); +}); + +test("collection record merge fixture reaches benchmark-equivalent domain coverage", () => { + const merged = mergeRecords(docsSpec, [ + record({ + row: { + entity_name: "OpenAI", + docs_title: "MCP and Connectors", + docs_url: "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + summary: "OpenAI MCP docs.", + }, + evidence: [ + evidence( + "summary", + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + "remote MCP servers and connectors" + ), + ], + sourceUrls: [ + "https://developers.openai.com/api/docs/guides/tools-connectors-mcp", + ], + }), + record({ + row: { + entity_name: "Anthropic", + docs_title: "Introduction to Model Context Protocol", + docs_url: "https://anthropic.skilljar.com/introduction-to-model-context-protocol", + summary: "Anthropic MCP course.", + }, + evidence: [ + evidence( + "summary", + "https://anthropic.skilljar.com/introduction-to-model-context-protocol", + "course provides comprehensive coverage" + ), + ], + sourceUrls: [ + "https://anthropic.skilljar.com/introduction-to-model-context-protocol", + ], + }), + record({ + row: { + entity_name: "Anthropic", + docs_title: "MCP connector", + docs_url: "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + summary: "Anthropic MCP connector docs.", + }, + evidence: [ + evidence( + "docs_url", + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector" + ), + ], + sourceUrls: [ + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + ], + }), + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Code Mode", + docs_url: "https://blog.cloudflare.com/code-mode/", + summary: "Cloudflare MCP blog post.", + }, + evidence: [ + evidence( + "summary", + "https://blog.cloudflare.com/code-mode/", + "Cloudflare Agents SDK" + ), + ], + sourceUrls: ["https://blog.cloudflare.com/code-mode/"], + }), + record({ + row: { + entity_name: "Cloudflare", + docs_title: "Model Context Protocol", + docs_url: "https://developers.cloudflare.com/agents/model-context-protocol/", + summary: "Cloudflare MCP docs.", + }, + evidence: [ + evidence( + "docs_url", + "https://developers.cloudflare.com/agents/model-context-protocol/", + "https://developers.cloudflare.com/agents/model-context-protocol/" + ), + ], + sourceUrls: [ + "https://developers.cloudflare.com/agents/model-context-protocol/", + ], + }), + ]).records; + + assert.equal(merged.length, 3); + assert.equal( + merged.find((item) => item.row.entity_name === "Anthropic")?.row.docs_url, + "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector" + ); + assert.equal( + merged.find((item) => item.row.entity_name === "Cloudflare")?.row.docs_url, + "https://developers.cloudflare.com/agents/model-context-protocol/" + ); + assert.equal( + domainCoverage(merged, { + OpenAI: ["developers.openai.com", "platform.openai.com", "openai.com"], + Anthropic: ["docs.anthropic.com"], + Cloudflare: ["developers.cloudflare.com"], + }), + 1, + ); +}); + +function evidence(field: string, url: string, quote: string) { + return { field, url, quote }; +} + +function record(input: { + row: ExtractedRecord["row"]; + evidence: ExtractedRecord["evidence"]; + sourceUrls: string[]; +}): ExtractedRecord { + return { + row: input.row, + evidence: input.evidence, + source_urls: input.sourceUrls, + extraction_confidence: 0.9, + }; +} + +function domainCoverage( + records: ExtractedRecord[], + allowedDomainsByEntity: Record, +): number { + const matched = records.filter((record) => { + const entity = String(record.row.entity_name ?? ""); + const allowedDomains = allowedDomainsByEntity[entity] ?? []; + return record.source_urls.some((url) => + allowedDomains.some((domain) => hostname(url).endsWith(domain)), + ); + }); + return matched.length / records.length; +} + +function hostname(url: string): string { + try { + return new URL(url).hostname.replace(/^www\./, ""); + } catch { + return ""; + } +} diff --git a/backend/test/collection-source-policy.test.ts b/backend/test/collection-source-policy.test.ts index c2079a0..48b6ac2 100644 --- a/backend/test/collection-source-policy.test.ts +++ b/backend/test/collection-source-policy.test.ts @@ -6,11 +6,13 @@ import { applyPromptSourcePolicyToTriageResult, derivePromptSourcePolicy, promptSourceSearchQueries, + recordMatchesPromptSourcePolicy, sourceCandidatePolicyBoost, urlMatchesPromptSourcePolicy, } from "../BigSet_Data_Collection_Agent/src/agents/source-policy.js"; import type { DatasetSpec, + ExtractedRecord, SourceCandidate, SourceTriageResult, } from "../BigSet_Data_Collection_Agent/src/models/schemas.js"; @@ -157,10 +159,10 @@ test("prompt source policy boosts official candidates", () => { ["Anthropic", "OpenAI", "Cloudflare"], ); assert.deepEqual(promptSourceSearchQueries(policy).slice(0, 4), [ + "Anthropic MCP connector docs site:platform.claude.com", + "OpenAI MCP connector docs site:developers.openai.com", + "Cloudflare MCP connector docs site:developers.cloudflare.com", "Anthropic MCP connector docs", - "Anthropic model context protocol docs", - "OpenAI MCP connector docs", - "OpenAI model context protocol docs", ]); const official: SourceCandidate = { url: "https://developers.cloudflare.com/agents/model-context-protocol/", @@ -180,3 +182,131 @@ test("prompt source policy boosts official candidates", () => { sourceCandidatePolicyBoost(thirdParty, policy), ); }); + +test("prompt source policy prefers docs surfaces over blogs, courses, and directories", () => { + const policy = derivePromptSourcePolicy( + "I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare.", + ); + const docs: SourceCandidate = { + url: "https://platform.claude.com/docs/en/agents-and-tools/mcp-connector", + title: "Model Context Protocol connector", + snippet: "Official Anthropic documentation for MCP connector setup.", + query: "Anthropic MCP connector docs", + }; + const course: SourceCandidate = { + url: "https://anthropic.skilljar.com/introduction-to-model-context-protocol", + title: "Introduction to Model Context Protocol", + snippet: "Anthropic course for learning MCP.", + query: "Anthropic MCP connector docs", + }; + const blog: SourceCandidate = { + url: "https://blog.cloudflare.com/code-mode/", + title: "Code Mode: the better way to use MCP", + snippet: "Cloudflare blog post about MCP.", + query: "Cloudflare MCP connector docs", + }; + const cloudflareDocs: SourceCandidate = { + url: "https://developers.cloudflare.com/agents/model-context-protocol/", + title: "Model Context Protocol", + snippet: "Official Cloudflare docs for MCP servers.", + query: "Cloudflare MCP connector docs", + }; + + assert.ok( + sourceCandidatePolicyBoost(docs, policy) > + sourceCandidatePolicyBoost(course, policy), + ); + assert.equal( + urlMatchesPromptSourcePolicy( + "https://platform.claude.com/docs/en/agents-and-tools/mcp-connector", + policy, + ), + true, + ); + assert.ok( + sourceCandidatePolicyBoost(cloudflareDocs, policy) > + sourceCandidatePolicyBoost(blog, policy), + ); +}); + +test("prompt source policy rejects records sourced from another entity's docs", () => { + const policy = derivePromptSourcePolicy( + "I need official docs pages for setting up MCP servers from Anthropic, OpenAI, and Cloudflare.", + ); + const spec: DatasetSpec = { + intent_summary: "Official MCP docs pages.", + target_row_count: 3, + row_grain: "one row per vendor", + columns: [ + { + name: "entity_name", + type: "string", + description: "Vendor name.", + required: true, + }, + { + name: "docs_url", + type: "string", + description: "Official docs page URL.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: [], + extraction_hints: "", + }; + + assert.equal( + recordMatchesPromptSourcePolicy( + record("Anthropic", "https://modelcontextprotocol.io/docs/develop/build-server"), + spec, + policy, + ), + false, + ); + assert.equal( + recordMatchesPromptSourcePolicy( + record( + "Anthropic", + "https://platform.claude.com/docs/en/agents-and-tools/remote-mcp-servers", + ), + spec, + policy, + ), + true, + ); + assert.equal( + recordMatchesPromptSourcePolicy( + record("OpenAI", "https://developers.openai.com/blog"), + spec, + policy, + ), + false, + ); + assert.equal( + recordMatchesPromptSourcePolicy( + record("OpenAI", "https://developers.openai.com/api/docs/guides/tools-connectors-mcp"), + spec, + policy, + ), + true, + ); +}); + +function record(entityName: string, docsUrl: string): ExtractedRecord { + return { + row: { + entity_name: entityName, + docs_url: docsUrl, + }, + evidence: [ + { + field: "docs_url", + url: docsUrl, + quote: docsUrl, + }, + ], + source_urls: [docsUrl], + extraction_confidence: 0.8, + }; +} diff --git a/benchmarks/dataset-agent/run-benchmark.mjs b/benchmarks/dataset-agent/run-benchmark.mjs index 4dfc58b..d3fd0f5 100755 --- a/benchmarks/dataset-agent/run-benchmark.mjs +++ b/benchmarks/dataset-agent/run-benchmark.mjs @@ -195,7 +195,7 @@ const answerKeysByPromptId = { verifiedAt, sourceUrls: [ "https://developers.openai.com/api/docs/mcp", - "https://docs.anthropic.com/en/docs/agents-and-tools/mcp-connector", + "https://platform.claude.com/docs/en/agents-and-tools/mcp-connector", "https://developers.cloudflare.com/agents/model-context-protocol/", ], scoringNotes: @@ -214,7 +214,7 @@ const answerKeysByPromptId = { id: "anthropic", label: "Anthropic", aliases: ["anthropic"], - allowedSourceDomains: ["docs.anthropic.com"], + allowedSourceDomains: ["docs.anthropic.com", "platform.claude.com"], requiredText: ["mcp"], }, { @@ -231,6 +231,7 @@ const answerKeysByPromptId = { "platform.openai.com", "openai.com", "docs.anthropic.com", + "platform.claude.com", "developers.cloudflare.com", ], },