diff --git a/ui/__tests__/scientific-evidence.test.ts b/ui/__tests__/scientific-evidence.test.ts new file mode 100644 index 0000000..c00cbf9 --- /dev/null +++ b/ui/__tests__/scientific-evidence.test.ts @@ -0,0 +1,137 @@ +import { + type ChromaQueryLike, + buildScientificEvidencePayload, +} from '@/utils/server/scientific-evidence'; + +import { describe, expect, it } from 'vitest'; + +describe('buildScientificEvidencePayload', () => { + it('deduplicates duplicate chunks and emits stable citation keys/source manifest', () => { + const queryResult: ChromaQueryLike = { + ids: [['id-1', 'id-2', 'id-3']], + documents: [['Alpha finding', 'Alpha finding', 'Beta finding']], + metadatas: [ + [ + { + title: 'Paper A', + source: '/tmp/paper-a.pdf', + page: 2, + chunkIndex: 0, + }, + { + title: 'Paper A', + source: '/tmp/paper-a.pdf', + page: 2, + chunkIndex: 0, + }, + { + title: 'Paper A', + source: '/tmp/paper-a.pdf', + page: 3, + chunkIndex: 1, + }, + ], + ], + distances: [[0.01, 0.01, 0.02]], + }; + + const first = buildScientificEvidencePayload(queryResult); + const second = buildScientificEvidencePayload(queryResult); + + expect(first.citations).toHaveLength(2); + expect(first.citations.map((citation) => citation.key)).toEqual( + second.citations.map((citation) => citation.key), + ); + expect(first.citations.map((citation) => citation.key)).toEqual([ + expect.stringMatching(/^SRC-[0-9A-F]{8}$/), + expect.stringMatching(/^SRC-[0-9A-F]{8}$/), + ]); + expect(first.sourceManifest).toHaveLength(1); + expect(first.sourceManifest[0].sourceId).toEqual( + expect.stringMatching(/^DOC-[0-9A-F]{8}$/), + ); + expect(first.sourceManifest[0].citationKeys).toHaveLength(2); + }); + + it('normalizes ragged metadata and prefers citation-friendly sources', () => { + const queryResult: ChromaQueryLike = { + ids: [['id-1']], + documents: [[' Content with spaces ']], + metadatas: [ + [ + { + title: 123, + filename: 'research.pdf', + sourcePath: '/var/tmp/research.pdf', + pageNumber: '5', + chunk_index: '2', + chunk_id: 'c-2', + }, + ], + ], + distances: [[0.1]], + }; + + const payload = buildScientificEvidencePayload(queryResult); + expect(payload.citations).toHaveLength(1); + expect(payload.citations[0].title).toBe('123'); + expect(payload.citations[0].source).toBe('research.pdf'); + expect(payload.sourceManifest[0].source).toBe('research.pdf'); + expect(payload.evidenceContext).not.toContain('/var/tmp/research.pdf'); + expect(payload.citations[0].page).toBe(5); + expect(payload.citations[0].chunkIndex).toBe(2); + expect(payload.citations[0].chunkId).toBe('c-2'); + expect(payload.citations[0].content).toBe('Content with spaces'); + }); + + it('bounds evidence context length and truncates safely', () => { + const queryResult: ChromaQueryLike = { + ids: [['id-1']], + documents: [['A'.repeat(200)]], + metadatas: [[{ title: 'Large Chunk', source: 'source.pdf', page: 1 }]], + distances: [[0.2]], + }; + + const payload = buildScientificEvidencePayload(queryResult, { + maxChunkChars: 20, + maxEvidenceChars: 60, + }); + + expect(payload.citations[0].content.length).toBeLessThanOrEqual(20); + expect(payload.evidenceContext.length).toBeLessThanOrEqual(60); + expect(payload.evidenceContext.endsWith('...')).toBe(true); + }); + + it('truncates safely when max chars is shorter than the ellipsis', () => { + const queryResult: ChromaQueryLike = { + ids: [['id-1']], + documents: [['A'.repeat(200)]], + metadatas: [[{ title: 'Large Chunk', source: 'source.pdf', page: 1 }]], + distances: [[0.2]], + }; + + const payload = buildScientificEvidencePayload(queryResult, { + maxChunkChars: 2, + maxEvidenceChars: 2, + }); + + expect(payload.citations[0].content).toBe('..'); + expect(payload.citations[0].content.length).toBeLessThanOrEqual(2); + expect(payload.evidenceContext).toBe('..'); + expect(payload.evidenceContext.length).toBeLessThanOrEqual(2); + }); + + it('handles ragged/null chroma arrays without throwing', () => { + const raggedResult: ChromaQueryLike = { + ids: [['id-1', 'id-2'], []], + documents: [['Chunk one', null], []], + metadatas: [[null], []], + distances: null, + }; + + const payload = buildScientificEvidencePayload(raggedResult); + expect(payload.citations).toHaveLength(1); + expect(payload.citations[0].source).toBe('unknown-source'); + expect(payload.sourceManifest).toHaveLength(1); + }); +}); diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..4f869cf 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -1,25 +1,84 @@ -import type { NextApiRequest, NextApiResponse } from "next"; -import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; +import type { NextApiRequest, NextApiResponse } from 'next'; -export default async function handler(req: NextApiRequest, res: NextApiResponse) { +import { buildScientificEvidencePayload } from '@/utils/server/scientific-evidence'; + +import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; + +const DEFAULT_RESULTS = 8; +const MAX_RESULTS = 20; +const DEFAULT_EVIDENCE_CHARS = 12000; +const MAX_EVIDENCE_CHARS = 30000; + +function parseBoundedInteger( + value: unknown, + defaultValue: number, + maxValue: number, +): number { + const parsed = + typeof value === 'number' + ? value + : typeof value === 'string' + ? Number(value) + : NaN; + + if (!Number.isFinite(parsed) || parsed <= 0) { + return defaultValue; + } + + return Math.min(Math.floor(parsed), maxValue); +} + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse, +) { try { + if (req.method !== 'POST') { + return res.status(405).json({ error: 'Only POST is supported' }); + } + const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); const query = req.body.input; + if (typeof query !== 'string' || query.trim().length === 0) { + return res.status(400).json({ error: 'input is required' }); + } + + const nResults = parseBoundedInteger( + req.body.nResults, + DEFAULT_RESULTS, + MAX_RESULTS, + ); + const maxEvidenceChars = parseBoundedInteger( + req.body.maxEvidenceChars, + DEFAULT_EVIDENCE_CHARS, + MAX_EVIDENCE_CHARS, + ); const embedder = new TransformersEmbeddingFunction(); - const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder }); + const collection = await client.getOrCreateCollection({ + name: 'default-collection', + embeddingFunction: embedder, + }); + + const results = await collection.query({ + nResults, + queryTexts: [query.trim()], + }); - // query the collection - const results = await collection.query({ - nResults: 4, - queryTexts: [query] - }) + const evidence = buildScientificEvidencePayload(results, { + maxEvidenceChars, + }); - res.status(200).json(results); + res.status(200).json({ + ...results, + evidenceContext: evidence.evidenceContext, + sourceManifest: evidence.sourceManifest, + citations: evidence.citations, + }); } catch (error) { if (error instanceof Error) { console.error('Error message:', error.message); @@ -29,4 +88,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) } res.status(500).json({ error: 'An unexpected error occurred :(' }); } -} \ No newline at end of file +} diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..8a3caf2 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -3,8 +3,7 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; - +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import path from 'path'; import { v4 as uuidv4 } from 'uuid'; @@ -18,37 +17,44 @@ export default async function handler( req: NextApiRequest, res: NextApiResponse, ) { - try { - if (req.method !== 'POST') { - return res.status(405).end(); - } + if (req.method !== 'POST') { + return res.status(405).end(); + } - const form = new IncomingForm(); - form.parse(req, async (err, fields, files) => { + const form = new IncomingForm(); + form.parse(req, async (err, fields, files) => { + try { if (err) { return res.status(400).json({ error: 'Failed to upload file' }); } + const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf; + if (!pdfFile?.filepath) { + return res.status(400).json({ error: 'A PDF file is required' }); + } + const publicSource = + asNonEmptyString(pdfFile.originalFilename) ?? 'uploaded-document.pdf'; + const client = new ChromaClient({ path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const loader = new PDFLoader(files.pdf[0].filepath); + const loader = new PDFLoader(pdfFile.filepath); const originalDocs = await loader.load(); - console.log(JSON.stringify(originalDocs)); - - const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 500, chunkOverlap: 100, - }); + }); const docs = await splitter.splitDocuments(originalDocs); - + // Process the documents and perform other logic - const { ids, metadatas, documentContents } = processDocuments(docs); + const { ids, metadatas, documentContents } = processDocuments( + docs, + publicSource, + ); const embedder = new TransformersEmbeddingFunction(); const collection = await client.getOrCreateCollection({ @@ -66,39 +72,175 @@ export default async function handler( message: 'Documents processed successfully', documentCount: ids.length, }); - }); - } catch (error) { - console.error(error); - res - .status(500) - .json({ message: 'An error occurred while processing the documents' }); + } catch (error) { + console.error(error); + res + .status(500) + .json({ message: 'An error occurred while processing the documents' }); + } + }); +} + +type PrimitiveMetadata = Record; + +type LoadedDocument = { + pageContent: string; + metadata?: unknown; +}; + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function getPrimitive( + record: Record, + key: string, +): string | number | boolean | null { + const value = record[key]; + if ( + typeof value === 'string' || + typeof value === 'number' || + typeof value === 'boolean' + ) { + return value; + } + + return null; +} + +function asNonEmptyString(value: unknown): string | null { + if (typeof value === 'string') { + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : null; + } + + if (typeof value === 'number' || typeof value === 'boolean') { + return String(value); } + + return null; } -function processDocuments(docs: any) { - const ids = []; - const metadatas = []; - const documentContents = []; +function asNumber(value: unknown): number | null { + if (typeof value === 'number' && Number.isFinite(value)) { + return value; + } + + if (typeof value === 'string') { + const parsed = Number(value); + if (Number.isFinite(parsed)) { + return parsed; + } + } + + return null; +} + +function getPageFromMetadata(metadata: Record): number | null { + const directPage = asNumber( + getPrimitive(metadata, 'page') ?? getPrimitive(metadata, 'pageNumber'), + ); + if (directPage !== null) { + return directPage; + } + + const loc = metadata.loc; + if (!isRecord(loc)) { + return null; + } + + return asNumber(getPrimitive(loc, 'pageNumber') ?? getPrimitive(loc, 'page')); +} - for (const document of docs) { - // Generate an ID for each document, or use some existing unique identifier - const id = uuidv4(); - ids.push(id); +function getPdfInfoPrimitive( + metadata: Record, + key: string, +): string | number | boolean | null { + const pdf = metadata.pdf; + if (!isRecord(pdf)) { + return null; + } - const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; + const info = pdf.info; + if (!isRecord(info)) { + return null; + } - const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle; + return getPrimitive(info, key); +} - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info +function processDocuments(docs: LoadedDocument[], publicSource?: string) { + const ids: string[] = []; + const metadatas: PrimitiveMetadata[] = []; + const documentContents: string[] = []; + + for (let index = 0; index < docs.length; index += 1) { + const document = docs[index]; + const metadata = isRecord(document.metadata) ? document.metadata : {}; + + const sourceForCitation = + asNonEmptyString(getPrimitive(metadata, 'filename')) ?? + asNonEmptyString(getPrimitive(metadata, 'fileName')) ?? + asNonEmptyString(getPrimitive(metadata, 'originalFilename')) ?? + publicSource ?? + asNonEmptyString(getPrimitive(metadata, 'source')) ?? + asNonEmptyString(getPrimitive(metadata, 'sourcePath')) ?? + `document-${index + 1}.pdf`; + const filename = path.basename(sourceForCitation.replace(/\\/g, '/')); + const fallbackTitle = + filename.length > 0 ? filename : `Document ${index + 1}`; + const titleFromMetadata = + asNonEmptyString(getPrimitive(metadata, 'title')) ?? + asNonEmptyString(getPrimitive(metadata, 'documentTitle')) ?? + asNonEmptyString(getPdfInfoPrimitive(metadata, 'Title')); + const title = titleFromMetadata ?? fallbackTitle; + const page = getPageFromMetadata(metadata); + const chunkIndex = + asNumber( + getPrimitive(metadata, 'chunkIndex') ?? + getPrimitive(metadata, 'chunk_index'), + ) ?? index; + + const generatedId = uuidv4(); + const chunkId = + asNonEmptyString( + getPrimitive(metadata, 'chunkId') ?? getPrimitive(metadata, 'chunk_id'), + ) ?? `${filename}:${page ?? 'na'}:${chunkIndex}`; + const documentId = + asNonEmptyString( + getPrimitive(metadata, 'documentId') ?? + getPrimitive(metadata, 'document_id'), + ) ?? generatedId; + + const metadataToStore: PrimitiveMetadata = { + title, + source: fallbackTitle, + filename, + chunkIndex, + chunkId, + documentId, }; - metadatas.push(metadata); - // Add the page content to the documents array + if (page !== null) { + metadataToStore.page = page; + } + + const optionalPdfInfoFields = [ + 'Author', + 'Subject', + 'Keywords', + 'Creator', + 'Producer', + ]; + for (const field of optionalPdfInfoFields) { + const value = getPdfInfoPrimitive(metadata, field); + if (value !== null) { + metadataToStore[`pdf${field}`] = value; + } + } + + ids.push(generatedId); + metadatas.push(metadataToStore); documentContents.push(document.pageContent); } diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..25de16f 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -1,6 +1,6 @@ import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const'; import { OpenAIError, OpenAIStream } from '@/utils/server'; -import { codeBlock, oneLine } from 'common-tags' +import type { ScientificSourceManifestEntry } from '@/utils/server/scientific-evidence'; import { ChatBody, Message } from '@/types/chat'; @@ -9,46 +9,67 @@ import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module import tiktokenModel from '@dqbd/tiktoken/encoders/cl100k_base.json'; import { Tiktoken, init } from '@dqbd/tiktoken/lite/init'; +import { codeBlock, oneLine } from 'common-tags'; export const config = { runtime: 'edge', }; -// Function to fetch and format documents -async function fetchAndFormatDocuments(lastMessageContent: string) { +type FetchDocumentsResponse = { + evidenceContext?: string; + sourceManifest?: ScientificSourceManifestEntry[]; +}; + +function formatSourceManifest( + sourceManifest: ScientificSourceManifestEntry[], +): string { + return sourceManifest + .map((source, index) => { + return `${index + 1}. ${source.title} (${ + source.source + }) -> keys: ${source.citationKeys.join(', ')}`; + }) + .join('\n'); +} + +async function fetchScientificEvidence( + req: Request, + lastMessageContent: string, +) { try { - console.log("fetching documents") - const response = await fetch('http://localhost:3000/api/fetch-documents', { + const fetchDocumentsUrl = new URL( + '/api/fetch-documents', + req.url, + ).toString(); + const response = await fetch(fetchDocumentsUrl, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ input: lastMessageContent }), + body: JSON.stringify({ + input: lastMessageContent, + nResults: 8, + maxEvidenceChars: 12000, + }), }); - + if (!response.ok) { throw new Error(`Error fetching documents: ${response.statusText}`); } - const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`; - }).join(''); - - console.log(result); - - return result; - + const data = (await response.json()) as FetchDocumentsResponse; + return { + evidenceContext: + typeof data.evidenceContext === 'string' ? data.evidenceContext : '', + sourceManifest: Array.isArray(data.sourceManifest) + ? data.sourceManifest + : [], + }; } catch (error) { - console.error('Error fetching and formatting documents:', error); - throw error; // You may want to throw a more specific error object here + console.error('Error fetching scientific evidence:', error); + throw error; } } - - - - const handler = async (req: Request): Promise => { - try { const { model, messages, key, prompt, temperature } = (await req.json()) as ChatBody; @@ -85,8 +106,11 @@ const handler = async (req: Request): Promise => { const lastMessage = messages[messages.length - 1]; - const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content); - + const { evidenceContext, sourceManifest } = await fetchScientificEvidence( + req, + lastMessage.content, + ); + let temperatureToUse = temperature; if (temperatureToUse == null) { temperatureToUse = DEFAULT_TEMPERATURE; @@ -97,22 +121,27 @@ const handler = async (req: Request): Promise => { let tokenCount = prompt_tokens.length; let messagesToSend: Message[] = []; - encoding.free(); console.log(model, promptToSend, temperatureToUse, key, messagesToSend); - - messagesToSend = [ + messagesToSend = [ + { + role: 'user', + content: codeBlock` + Here is the evidence context: + ${evidenceContext} + `, + }, { - role: "user", + role: 'user', content: codeBlock` - Here is the relevant documentation: - ${relevantDocuments} + Here is the source manifest: + ${formatSourceManifest(sourceManifest)} `, }, { - role: "user", + role: 'user', content: codeBlock` ${oneLine` Answer my next question using only the above documentation. @@ -130,24 +159,26 @@ const handler = async (req: Request): Promise => { - Prefer splitting your response into multiple paragraphs. `} ${oneLine` - - Output as markdown with citations based on the documentation. + - Cite claims inline with the provided citation keys (format: [SRC-XXXXXXXX]). + `} + ${oneLine` + - Only cite keys that appear in the source manifest/evidence context. `} `, }, { - role: "user", + role: 'user', content: codeBlock` Here is my question: ${oneLine`${lastMessage.content}`} `, }, - ] - + ]; const stream = await OpenAIStream( model, promptToSend, - 0, + temperatureToUse, key, messagesToSend, ); diff --git a/ui/utils/server/scientific-evidence.ts b/ui/utils/server/scientific-evidence.ts new file mode 100644 index 0000000..3298d8e --- /dev/null +++ b/ui/utils/server/scientific-evidence.ts @@ -0,0 +1,360 @@ +type Primitive = string | number | boolean; + +type UnknownRecord = Record; + +export type ChromaQueryLike = { + ids: string[][]; + documents: (string | null)[][]; + metadatas: (Record | null)[][]; + distances: null | number[][]; +}; + +export type ScientificCitation = { + key: string; + sourceId: string; + title: string; + source: string; + page: number | null; + chunkIndex: number | null; + chunkId: string | null; + documentId: string | null; + distance: number | null; + content: string; +}; + +export type ScientificSourceManifestEntry = { + sourceId: string; + title: string; + source: string; + citationKeys: string[]; + documentIds: string[]; +}; + +export type ScientificEvidencePayload = { + citations: ScientificCitation[]; + sourceManifest: ScientificSourceManifestEntry[]; + evidenceContext: string; +}; + +export type ScientificEvidenceOptions = { + maxEvidenceChars?: number; + maxChunkChars?: number; +}; + +const DEFAULT_MAX_EVIDENCE_CHARS = 12000; +const DEFAULT_MAX_CHUNK_CHARS = 1200; + +const TITLE_KEYS = ['title', 'documentTitle', 'document_title', 'pdfTitle']; +const SOURCE_KEYS = [ + 'sourceLabel', + 'publicSource', + 'publicIdentifier', + 'originalFilename', + 'filename', + 'fileName', + 'source', + 'sourcePath', + 'source_path', +]; +const PAGE_KEYS = ['page', 'pageNumber', 'page_number']; +const CHUNK_INDEX_KEYS = ['chunkIndex', 'chunk_index']; +const CHUNK_ID_KEYS = ['chunkId', 'chunk_id']; +const DOCUMENT_ID_KEYS = ['documentId', 'document_id', 'id']; + +function isRecord(value: unknown): value is UnknownRecord { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function asPrimitive(value: unknown): Primitive | undefined { + if ( + typeof value === 'string' || + typeof value === 'number' || + typeof value === 'boolean' + ) { + return value; + } + + return undefined; +} + +function firstPrimitive( + record: UnknownRecord | null, + keys: string[], +): Primitive | undefined { + if (!record) { + return undefined; + } + + for (const key of keys) { + const value = asPrimitive(record[key]); + if (value !== undefined) { + return value; + } + } + + return undefined; +} + +function toCleanString(value: unknown): string | null { + if (typeof value === 'string') { + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : null; + } + + if (typeof value === 'number' || typeof value === 'boolean') { + return String(value); + } + + return null; +} + +function toCitationSource(value: unknown): string | null { + const source = toCleanString(value); + if (!source) { + return null; + } + + const [withoutQuery] = source.split(/[?#]/); + const normalizedPath = withoutQuery.replace(/\\/g, '/'); + const pathParts = normalizedPath.split('/').filter(Boolean); + + return pathParts[pathParts.length - 1] ?? source; +} + +function toNumberOrNull(value: unknown): number | null { + if (typeof value === 'number' && Number.isFinite(value)) { + return value; + } + + if (typeof value === 'string') { + const parsed = Number(value); + if (Number.isFinite(parsed)) { + return parsed; + } + } + + return null; +} + +function normalizeMetadata(rawMetadata: unknown): { + title: string; + source: string; + page: number | null; + chunkIndex: number | null; + chunkId: string | null; + documentId: string | null; +} { + const metadata = isRecord(rawMetadata) ? rawMetadata : null; + + const title = + toCleanString(firstPrimitive(metadata, TITLE_KEYS)) ?? 'Untitled Source'; + const source = + toCitationSource(firstPrimitive(metadata, SOURCE_KEYS)) ?? 'unknown-source'; + const page = toNumberOrNull(firstPrimitive(metadata, PAGE_KEYS)); + const chunkIndex = toNumberOrNull(firstPrimitive(metadata, CHUNK_INDEX_KEYS)); + const chunkId = toCleanString(firstPrimitive(metadata, CHUNK_ID_KEYS)); + const documentId = toCleanString(firstPrimitive(metadata, DOCUMENT_ID_KEYS)); + + return { + title, + source, + page, + chunkIndex, + chunkId, + documentId, + }; +} + +function collapseWhitespace(content: string): string { + return content.replace(/\s+/g, ' ').trim(); +} + +function safeTruncate(text: string, maxChars: number): string { + const maxLength = Math.max(0, Math.floor(maxChars)); + + if (maxLength <= 0) { + return ''; + } + + if (text.length <= maxLength) { + return text; + } + + const ellipsis = '...'; + if (maxLength <= ellipsis.length) { + return ellipsis.slice(0, maxLength); + } + + const limit = maxLength - ellipsis.length; + const truncated = text.slice(0, limit).trimEnd(); + return `${truncated}${ellipsis}`; +} + +function hashString(value: string): string { + let hash = 0x811c9dc5; + for (let i = 0; i < value.length; i += 1) { + hash ^= value.charCodeAt(i); + hash += + (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); + } + + return (hash >>> 0).toString(16).toUpperCase().padStart(8, '0'); +} + +function buildCitationKey( + source: string, + page: number | null, + chunkIndex: number | null, + chunkId: string | null, + normalizedContent: string, +): string { + const keySeed = [ + source.toLowerCase(), + page ?? 'na', + chunkIndex ?? 'na', + (chunkId ?? '').toLowerCase(), + normalizedContent, + ].join('|'); + + return `SRC-${hashString(keySeed)}`; +} + +function buildSourceId(source: string, title: string): string { + return `DOC-${hashString(`${source.toLowerCase()}|${title.toLowerCase()}`)}`; +} + +export function buildScientificEvidencePayload( + results: ChromaQueryLike, + options: ScientificEvidenceOptions = {}, +): ScientificEvidencePayload { + const maxEvidenceChars = + options.maxEvidenceChars ?? DEFAULT_MAX_EVIDENCE_CHARS; + const maxChunkChars = options.maxChunkChars ?? DEFAULT_MAX_CHUNK_CHARS; + + const citations: ScientificCitation[] = []; + const seenCitationKeys = new Set(); + const sourceManifestMap = new Map(); + + const documentsByQuery = Array.isArray(results.documents) + ? results.documents + : []; + const metadatasByQuery = Array.isArray(results.metadatas) + ? results.metadatas + : []; + const idsByQuery = Array.isArray(results.ids) ? results.ids : []; + const distancesByQuery = Array.isArray(results.distances) + ? results.distances + : []; + + for ( + let queryIndex = 0; + queryIndex < documentsByQuery.length; + queryIndex += 1 + ) { + const documents = Array.isArray(documentsByQuery[queryIndex]) + ? documentsByQuery[queryIndex] + : []; + const metadatas = Array.isArray(metadatasByQuery[queryIndex]) + ? metadatasByQuery[queryIndex] + : []; + const ids = Array.isArray(idsByQuery[queryIndex]) + ? idsByQuery[queryIndex] + : []; + const distances = Array.isArray(distancesByQuery[queryIndex]) + ? distancesByQuery[queryIndex] + : []; + + for (let index = 0; index < documents.length; index += 1) { + const rawContent = documents[index]; + if (typeof rawContent !== 'string') { + continue; + } + + const normalizedContent = collapseWhitespace(rawContent); + if (!normalizedContent) { + continue; + } + + const metadata = normalizeMetadata(metadatas[index] ?? null); + const documentId = toCleanString(ids[index]) ?? metadata.documentId; + const citationKey = buildCitationKey( + metadata.source, + metadata.page, + metadata.chunkIndex, + metadata.chunkId, + normalizedContent, + ); + + if (seenCitationKeys.has(citationKey)) { + continue; + } + seenCitationKeys.add(citationKey); + + const sourceId = buildSourceId(metadata.source, metadata.title); + const distance = + typeof distances[index] === 'number' ? distances[index] : null; + + citations.push({ + key: citationKey, + sourceId, + title: metadata.title, + source: metadata.source, + page: metadata.page, + chunkIndex: metadata.chunkIndex, + chunkId: metadata.chunkId, + documentId, + distance, + content: safeTruncate(normalizedContent, maxChunkChars), + }); + + const existingSource = sourceManifestMap.get(sourceId); + if (!existingSource) { + sourceManifestMap.set(sourceId, { + sourceId, + title: metadata.title, + source: metadata.source, + citationKeys: [citationKey], + documentIds: documentId ? [documentId] : [], + }); + } else { + existingSource.citationKeys.push(citationKey); + if (documentId && !existingSource.documentIds.includes(documentId)) { + existingSource.documentIds.push(documentId); + } + } + } + } + + const evidenceLines: string[] = []; + let usedChars = 0; + for (const citation of citations) { + const headerParts = [ + `[${citation.key}]`, + `Title: ${citation.title}`, + `Source: ${citation.source}`, + `Page: ${citation.page ?? 'n/a'}`, + ]; + + if (citation.chunkIndex !== null) { + headerParts.push(`Chunk: ${citation.chunkIndex}`); + } + + const block = `${headerParts.join(' | ')}\n${citation.content}\n`; + if (usedChars + block.length > maxEvidenceChars) { + const remaining = maxEvidenceChars - usedChars; + if (remaining > 0) { + evidenceLines.push(safeTruncate(block, remaining)); + } + break; + } + + evidenceLines.push(block); + usedChars += block.length; + } + + return { + citations, + sourceManifest: Array.from(sourceManifestMap.values()), + evidenceContext: evidenceLines.join('\n'), + }; +}