Reconstructed from a whitespace-collapsed copy of this patch. Tokens on context
and removed lines are unchanged; indentation, blank-line placement and prose
line wraps inside context were inferred from the hunk line counts, so verify
them against the target tree (or use `git apply --ignore-whitespace`).
The two new-file hunks were revised and recounted; their `index` blob ids no
longer describe the added content.

diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts
new file mode 100644
index 0000000..83e9f3f
--- /dev/null
+++ b/ui/__tests__/scientific-rag.test.ts
@@ -0,0 +1,91 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  buildScientificMetadata,
+  citationSlug,
+  clampRetrievedDocumentCount,
+  detectScientificSection,
+  formatRetrievedDocument,
+  normalizeTitle,
+} from '@/utils/server/scientific-rag';
+
+describe('scientific RAG helpers', () => {
+  it('detects scientific sections from chunk headings', () => {
+    expect(detectScientificSection('Abstract\nWe study retrieval quality.')).toBe('abstract');
+    expect(detectScientificSection('2. Methods\nWe collected samples.')).toBe('methods');
+    expect(detectScientificSection('RESULTS\nAccuracy improved.')).toBe('results');
+    expect(detectScientificSection('A paragraph without a heading.')).toBe('body');
+  });
+
+  it('prefers the heading that appears first in the chunk', () => {
+    expect(detectScientificSection('3. Results\nSummary of findings.')).toBe('results');
+  });
+
+  it('builds stable readable citation metadata', () => {
+    const metadata = buildScientificMetadata(
+      {
+        pageContent: 'Introduction\nThis paper evaluates citation stability.',
+        metadata: {
+          loc: { pageNumber: 3 },
+          pdf: { info: { Title: 'Scientific RAG: Stable Citations.pdf' } },
+          source: '/uploads/scientific-rag.pdf',
+        },
+      },
+      'fallback.pdf',
+      7,
+      1,
+    );
+
+    expect(metadata).toMatchObject({
+      title: 'Scientific RAG: Stable Citations',
+      page: 3,
+      source: '/uploads/scientific-rag.pdf',
+      section: 'introduction',
+      chunkIndex: 7,
+      pageChunkIndex: 1,
+      citationKey: 'scientific-rag-stable-citations:p3:c2',
+    });
+  });
+
+  it('only strips real file extensions from titles', () => {
+    expect(normalizeTitle('paper_v2.pdf')).toBe('paper v2');
+    expect(normalizeTitle('Transformers vs. RNNs')).toBe('Transformers vs. RNNs');
+  });
+
+  it('keeps citation slugs deterministic and bounded', () => {
+    expect(citationSlug(' A Very_Long Scientific Paper!!! ')).toBe(
+      'a-very-long-scientific-paper',
+    );
+    expect(citationSlug('')).toBe('document');
+    expect(citationSlug(`${'x'.repeat(47)} tail`)).toBe('x'.repeat(47));
+  });
+
+  it('clamps retrieval result counts to a useful range', () => {
+    expect(clampRetrievedDocumentCount(undefined)).toBe(6);
+    expect(clampRetrievedDocumentCount(0)).toBe(1);
+    expect(clampRetrievedDocumentCount(99)).toBe(12);
+    expect(clampRetrievedDocumentCount(4.9)).toBe(4);
+  });
+
+  it('formats retrieved chunks with citation and distance context', () => {
+    expect(
+      formatRetrievedDocument({
+        content: 'The model retrieved a grounded answer.',
+        metadata: {
+          title: 'Grounded RAG',
+          page: 4,
+          section: 'results',
+          citationKey: 'grounded-rag:p4:c1',
+        },
+        distance: 0.012345,
+        index: 0,
+      }),
+    ).toContain('[grounded-rag:p4:c1] Grounded RAG, page 4, section results, distance 0.0123');
+  });
+
+  it('tolerates missing content and metadata from the vector store', () => {
+    expect(
+      formatRetrievedDocument({ content: null, metadata: null, index: 1 }),
+    ).toBe('[source-2:punknown:c2] Source 2, page unknown, section body\n');
+  });
+});
diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts
index 9304e48..75bd8fc 100644
--- a/ui/pages/api/fetch-documents.ts
+++ b/ui/pages/api/fetch-documents.ts
@@ -1,23 +1,31 @@
 import type { NextApiRequest, NextApiResponse } from "next";
 import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
 
+import { clampRetrievedDocumentCount } from '@/utils/server/scientific-rag';
+
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
   try {
+    if (req.method !== 'POST') {
+      return res.setHeader('Allow', 'POST').status(405).end();
+    }
+
     const client = new ChromaClient({
-      path: "http://chroma-server:8000",
+      path: process.env.CHROMA_PATH || "http://chroma-server:8000",
     });
 
     const query = req.body.input;
+    if (typeof query !== 'string' || query.trim().length === 0) {
+      return res.status(400).json({ error: 'Missing document query' });
+    }
 
     const embedder = new TransformersEmbeddingFunction();
 
     const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });
 
-    // query the collection
-    const results = await collection.query({
-      nResults: 4,
-      queryTexts: [query]
-    })
+    const results = await collection.query({
+      nResults: clampRetrievedDocumentCount(req.body.nResults),
+      queryTexts: [query],
+    });
 
     res.status(200).json(results);
   } catch (error) {
@@ -29,4 +37,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
     }
     res.status(500).json({ error: 'An unexpected error occurred :(' });
   }
-}
\ No newline at end of file
+}
diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts
index 532a635..626152c 100644
--- a/ui/pages/api/inject-documents.ts
+++ b/ui/pages/api/inject-documents.ts
@@ -3,11 +3,17 @@ import type { NextApiRequest, NextApiResponse } from 'next';
 import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
 import { IncomingForm } from 'formidable';
 import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 
 import path from 'path';
 import { v4 as uuidv4 } from 'uuid';
 
+import {
+  SCIENTIFIC_TEXT_SEPARATORS,
+  buildScientificMetadata,
+  type ScientificDocument,
+} from '@/utils/server/scientific-rag';
+
 export const config = {
   api: {
     bodyParser: false,
@@ -33,17 +39,20 @@ export default async function handler(
       path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
     });
 
-    const loader = new PDFLoader(files.pdf[0].filepath);
-
-    const originalDocs = await loader.load();
+    const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf;
+    if (!pdfFile) {
+      return res.status(400).json({ error: 'Missing PDF file' });
+    }
 
-    console.log(JSON.stringify(originalDocs));
+    const loader = new PDFLoader(pdfFile.filepath);
+    const originalDocs = await loader.load();
 
 
     const splitter = new RecursiveCharacterTextSplitter({
-      chunkSize: 500,
-      chunkOverlap: 100,
-  });
+      chunkSize: 900,
+      chunkOverlap: 180,
+      separators: SCIENTIFIC_TEXT_SEPARATORS,
+    });
 
     const docs = await splitter.splitDocuments(originalDocs);
 
@@ -75,30 +84,31 @@ export default async function handler(
   }
 }
 
-function processDocuments(docs: any) {
-  const ids = [];
+function processDocuments(docs: ScientificDocument[]) {
+  const ids: string[] = [];
   const metadatas = [];
-  const documentContents = [];
+  const documentContents: string[] = [];
+  const pageChunkCounts = new Map<string, number>();
 
-  for (const document of docs) {
-    // Generate an ID for each document, or use some existing unique identifier
+  for (let index = 0; index < docs.length; index += 1) {
+    const document = docs[index];
     const id = uuidv4();
     ids.push(id);
 
-    const fallbackTitle = path.basename(document.metadata.source);
-    const titleFromMetadata = document.metadata.pdf.info.Title;
-
-    const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;
-
-
-    const metadata = {
-      title: title,
-      page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
-      source: document.metadata.source, // Define this function to extract verse info
-    };
+    const fallbackTitle = path.basename(document.metadata.source ?? 'document.pdf');
+    const page = document.metadata.loc?.pageNumber ?? 'unknown';
+    const pageKey = `${document.metadata.source ?? fallbackTitle}:${page}`;
+    const pageChunkIndex = pageChunkCounts.get(pageKey) ?? 0;
+    pageChunkCounts.set(pageKey, pageChunkIndex + 1);
+
+    const metadata = buildScientificMetadata(
+      document,
+      fallbackTitle,
+      index,
+      pageChunkIndex,
+    );
     metadatas.push(metadata);
 
-    // Add the page content to the documents array
     documentContents.push(document.pageContent);
   }
 
diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts
index ce84d67..14c7778 100644
--- a/ui/pages/api/rag-chat.ts
+++ b/ui/pages/api/rag-chat.ts
@@ -1,8 +1,9 @@
 import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const';
 import { OpenAIError, OpenAIStream } from '@/utils/server';
-import { codeBlock, oneLine } from 'common-tags'
+import { codeBlock, oneLine } from 'common-tags';
 
 import { ChatBody, Message } from '@/types/chat';
+import { formatRetrievedDocument } from '@/utils/server/scientific-rag';
 
 // @ts-expect-error
 import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module';
@@ -15,13 +16,15 @@ export const config = {
 };
 
 // Function to fetch and format documents
-async function fetchAndFormatDocuments(lastMessageContent: string) {
+async function fetchAndFormatDocuments(
+  baseUrl: string,
+  lastMessageContent: string,
+) {
   try {
-    console.log("fetching documents")
-    const response = await fetch('http://localhost:3000/api/fetch-documents', {
+    const response = await fetch(`${baseUrl}/api/fetch-documents`, {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ input: lastMessageContent }),
+      body: JSON.stringify({ input: lastMessageContent, nResults: 6 }),
     });
 
     if (!response.ok) {
@@ -30,10 +33,13 @@ async function fetchAndFormatDocuments(lastMessageContent: string) {
 
     const data = await response.json();
     const result = data.metadatas[0].map((metadata: any, index: number) => {
-      return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`;
-    }).join('');
-
-    console.log(result);
+      return formatRetrievedDocument({
+        content: data.documents[0][index],
+        metadata,
+        distance: data.distances?.[0]?.[index],
+        index,
+      });
+    }).join('\n\n---\n\n');
 
     return result;
 
@@ -64,7 +70,7 @@ const handler = async (req: Request): Promise<Response> => {
       ${oneLine`
         You are a very enthusiastic AI assistant who loves to help people!
         Given the following information from
-        relevant documentation, answer the user's question using
+        relevant scientific documentation, answer the user's question using
         only that information, outputted in markdown format.
       `}
 
@@ -75,7 +81,7 @@ const handler = async (req: Request): Promise<Response> => {
       `}
 
       ${oneLine`
-        Always include citations from the documentation.
+        Every factual claim must include citation keys from the documentation.
       `}
     `;
 
@@ -85,7 +91,10 @@ const handler = async (req: Request): Promise<Response> => {
 
     const lastMessage = messages[messages.length - 1];
 
-    const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content);
+    const relevantDocuments = await fetchAndFormatDocuments(
+      new URL(req.url).origin,
+      lastMessage.content,
+    );
 
     let temperatureToUse = temperature;
     if (temperatureToUse == null) {
@@ -100,9 +109,6 @@ const handler = async (req: Request): Promise<Response> => {
 
     encoding.free();
 
-    console.log(model, promptToSend, temperatureToUse, key, messagesToSend);
-
-
     messagesToSend = [
       {
         role: "user",
@@ -121,6 +127,14 @@ const handler = async (req: Request): Promise<Response> => {
           ${oneLine`
             - Do not make up answers that are not provided in the documentation.
           `}
+          ${oneLine`
+            - Cite sources using the exact citation keys shown in square brackets,
+            for example [paper-title:p3:c2].
+          `}
+          ${oneLine`
+            - Prefer sources with lower retrieval distance when multiple sources
+            contain similar information.
+          `}
           ${oneLine`
             - If you are unsure and the answer is not explicitly written
             in the documentation context, say
diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts
new file mode 100644
index 0000000..9e95bfe
--- /dev/null
+++ b/ui/utils/server/scientific-rag.ts
@@ -0,0 +1,212 @@
+/**
+ * Helpers for ingesting scientific PDFs into the vector store and for
+ * formatting retrieved chunks with stable, human-readable citation keys.
+ */
+
+/** A loaded PDF chunk; only the metadata fields these helpers read are typed. */
+export type ScientificDocument = {
+  pageContent: string;
+  metadata: {
+    loc?: {
+      pageNumber?: number;
+    };
+    pdf?: {
+      info?: {
+        Title?: string;
+      };
+    };
+    source?: string;
+    [key: string]: unknown;
+  };
+};
+
+/** Metadata stored alongside every ingested chunk. */
+export type ScientificChunkMetadata = {
+  title: string;
+  page: number | string;
+  source: string;
+  section: string;
+  chunkIndex: number;
+  pageChunkIndex: number;
+  citationKey: string;
+};
+
+/**
+ * One retrieved chunk as consumed by `formatRetrievedDocument`.
+ * `content` and `metadata` are nullable because the Chroma client types
+ * query-result entries as nullable.
+ */
+export type RetrievedScientificDocument = {
+  content: string | null;
+  metadata: Partial<ScientificChunkMetadata> | null;
+  distance?: number;
+  index: number;
+};
+
+/** Section labels paired with the heading pattern that identifies them. */
+const SECTION_PATTERNS: Array<[string, RegExp]> = [
+  ['abstract', /^\s*(?:abstract|summary)\b/im],
+  ['introduction', /^\s*(?:\d+\.?\s*)?introduction\b/im],
+  ['background', /^\s*(?:\d+\.?\s*)?background\b/im],
+  ['methods', /^\s*(?:\d+\.?\s*)?(?:methods|methodology|materials and methods|experimental setup)\b/im],
+  ['results', /^\s*(?:\d+\.?\s*)?(?:results|evaluation|experiments?)\b/im],
+  ['discussion', /^\s*(?:\d+\.?\s*)?discussion\b/im],
+  ['limitations', /^\s*(?:\d+\.?\s*)?limitations?\b/im],
+  ['conclusion', /^\s*(?:\d+\.?\s*)?(?:conclusion|conclusions|future work)\b/im],
+  ['references', /^\s*(?:references|bibliography)\b/im],
+];
+
+/** Splitter separators: section headings first, then generic text boundaries. */
+export const SCIENTIFIC_TEXT_SEPARATORS = [
+  '\nAbstract',
+  '\nABSTRACT',
+  '\nIntroduction',
+  '\nINTRODUCTION',
+  '\nBackground',
+  '\nMethods',
+  '\nMETHODS',
+  '\nMaterials and Methods',
+  '\nResults',
+  '\nRESULTS',
+  '\nDiscussion',
+  '\nDISCUSSION',
+  '\nConclusion',
+  '\nCONCLUSION',
+  '\nReferences',
+  '\n\n',
+  '\n',
+  '. ',
+  ' ',
+  '',
+];
+
+/**
+ * Returns the label of the section heading that occurs earliest in `content`,
+ * or `'body'` when no known heading starts a line.
+ */
+export function detectScientificSection(content: string): string {
+  let bestSection = 'body';
+  let bestIndex = Number.POSITIVE_INFINITY;
+
+  for (const [section, pattern] of SECTION_PATTERNS) {
+    const match = pattern.exec(content);
+    // Strict `<` keeps table order as the tie-breaker for equal positions.
+    if (match !== null && match.index < bestIndex) {
+      bestSection = section;
+      bestIndex = match.index;
+    }
+  }
+
+  return bestSection;
+}
+
+/**
+ * Produces a display title: trims, drops a trailing file extension, turns
+ * `_`/`-` runs into spaces, collapses whitespace and caps at 140 characters.
+ */
+export function normalizeTitle(title: string): string {
+  return title
+    .trim()
+    // Only a short letter-led suffix counts as an extension, so "vs. RNNs"
+    // or "v2.0" are not cut off at their last dot.
+    .replace(/\.[A-Za-z][A-Za-z0-9]{1,4}$/, '')
+    .replace(/[_-]+/g, ' ')
+    .replace(/\s+/g, ' ')
+    .slice(0, 140)
+    .trimEnd();
+}
+
+/**
+ * Lower-case, hyphen-separated slug of at most 48 characters;
+ * `'document'` when nothing usable remains.
+ */
+export function citationSlug(title: string): string {
+  const slug = normalizeTitle(title)
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, '-')
+    .replace(/^-+/, '')
+    .slice(0, 48)
+    // Trim after truncating so a cut at a word boundary leaves no dangling '-'.
+    .replace(/-+$/, '');
+
+  return slug || 'document';
+}
+
+/** Builds `<slug>:p<page>:c<n>`, with `n` the 1-based chunk number on the page. */
+export function buildCitationKey(
+  title: string,
+  page: number | string,
+  pageChunkIndex: number,
+): string {
+  return `${citationSlug(title)}:p${page}:c${pageChunkIndex + 1}`;
+}
+
+/**
+ * Derives the stored metadata for one chunk.
+ *
+ * @param document - Chunk produced by the text splitter.
+ * @param fallbackTitle - Title used when the PDF info has no non-blank `Title`.
+ * @param chunkIndex - Position of the chunk within the whole document.
+ * @param pageChunkIndex - Zero-based position of the chunk within its page.
+ */
+export function buildScientificMetadata(
+  document: ScientificDocument,
+  fallbackTitle: string,
+  chunkIndex: number,
+  pageChunkIndex: number,
+): ScientificChunkMetadata {
+  const titleFromMetadata = document.metadata.pdf?.info?.Title;
+  const title = normalizeTitle(
+    titleFromMetadata && titleFromMetadata.trim().length > 0
+      ? titleFromMetadata
+      : fallbackTitle,
+  );
+  const page = document.metadata.loc?.pageNumber ?? 'unknown';
+
+  return {
+    title,
+    page,
+    source: document.metadata.source ?? fallbackTitle,
+    section: detectScientificSection(document.pageContent),
+    chunkIndex,
+    pageChunkIndex,
+    citationKey: buildCitationKey(title, page, pageChunkIndex),
+  };
+}
+
+/** Truncates a numeric request to an integer in [1, 12]; anything else yields 6. */
+export function clampRetrievedDocumentCount(value: unknown): number {
+  if (typeof value !== 'number' || !Number.isFinite(value)) {
+    return 6;
+  }
+
+  return Math.min(12, Math.max(1, Math.trunc(value)));
+}
+
+/**
+ * Renders a retrieved chunk as a `[citationKey] title, page, section[, distance]`
+ * header line followed by the trimmed chunk text.
+ */
+export function formatRetrievedDocument({
+  content,
+  metadata,
+  distance,
+  index,
+}: RetrievedScientificDocument): string {
+  const meta: Partial<ScientificChunkMetadata> = metadata ?? {};
+  const title = meta.title ?? `Source ${index + 1}`;
+  const page = meta.page ?? 'unknown';
+  const section = meta.section ?? 'body';
+  const citationKey =
+    meta.citationKey ??
+    buildCitationKey(meta.title ?? `source-${index + 1}`, page, index);
+  const distanceText =
+    typeof distance === 'number' && Number.isFinite(distance)
+      ? `, distance ${distance.toFixed(4)}`
+      : '';
+
+  return [
+    `[${citationKey}] ${title}, page ${page}, section ${section}${distanceText}`,
+    (content ?? '').trim(),
+  ].join('\n');
+}