From cf379f104c78c787b89d02e3082094ffde4d1235 Mon Sep 17 00:00:00 2001 From: Vinzz2303 Date: Wed, 13 May 2026 20:54:17 +0700 Subject: [PATCH 1/4] feat: improve scientific RAG citations --- ui/__tests__/scientific-rag.test.ts | 66 ++++++++++++ ui/pages/api/fetch-documents.ts | 25 +++-- ui/pages/api/inject-documents.ts | 39 ++++--- ui/pages/api/rag-chat.ts | 34 +++--- ui/utils/server/scientific-rag.ts | 154 ++++++++++++++++++++++++++++ 5 files changed, 274 insertions(+), 44 deletions(-) create mode 100644 ui/__tests__/scientific-rag.test.ts create mode 100644 ui/utils/server/scientific-rag.ts diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts new file mode 100644 index 0000000..015fd76 --- /dev/null +++ b/ui/__tests__/scientific-rag.test.ts @@ -0,0 +1,66 @@ +import { + buildCitationKey, + buildScientificMetadata, + detectScientificSection, + formatRetrievedDocument, +} from '@/utils/server/scientific-rag'; +import { describe, expect, it } from 'vitest'; + +describe('scientific-rag helpers', () => { + it('detects scientific sections from document chunks', () => { + expect(detectScientificSection('Abstract\nThis paper studies retrieval.')).toBe( + 'abstract', + ); + expect(detectScientificSection('METHODS\nWe used a benchmark.')).toBe( + 'methods', + ); + }); + + it('builds stable citation keys', () => { + expect( + buildCitationKey({ + title: 'Scientific RAG for Papers!', + page: 4, + chunkIndex: 2, + }), + ).toBe('scientific-rag-for-papers:p4:c3'); + }); + + it('builds metadata with title fallback and section', () => { + const metadata = buildScientificMetadata( + { + pageContent: 'Results\nThe model improved citation accuracy.', + metadata: { + loc: { pageNumber: 7 }, + pdf: { info: { Title: '' } }, + source: '/tmp/paper.pdf', + }, + }, + 'paper.pdf', + 0, + ); + + expect(metadata).toMatchObject({ + title: 'paper.pdf', + page: 7, + section: 'results', + citationKey: 'paper-pdf:p7:c1', + }); + }); + + it('formats retrieved documents with citation metadata', () => { + expect( + formatRetrievedDocument({ + content: 'Citation-aware answer context.', + metadata: { + title: 'Paper', + page: 2, + section: 'discussion', + citationKey: 'paper:p2:c1', + }, + distance: 0.12345, + index: 0, + }), + ).toContain('Source 1 [paper:p2:c1]'); + }); +}); diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..fc95101 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -1,23 +1,28 @@ -import type { NextApiRequest, NextApiResponse } from "next"; -import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; +import type { NextApiRequest, NextApiResponse } from 'next'; +import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); const query = req.body.input; + const nResults = Math.min(Number(req.body.nResults ?? 6), 10); const embedder = new TransformersEmbeddingFunction(); - const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder }); + const collection = await client.getOrCreateCollection({ + name: 'default-collection', + embeddingFunction: embedder, + }); - // query the collection - const results = await collection.query({ - nResults: 4, - queryTexts: [query] - }) + // query the collection + const results = await collection.query({ + nResults, + queryTexts: [query], + include: ['documents', 'metadatas', 'distances'] as any, + }); res.status(200).json(results); } catch (error) { @@ -29,4 +34,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) } res.status(500).json({ error: 'An unexpected error occurred :(' }); } -} \ No newline at end of file +} diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..8ce9295 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -3,11 +3,17 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import path from 'path'; import { v4 as uuidv4 } from 'uuid'; +import { + SCIENTIFIC_TEXT_SEPARATORS, + buildScientificMetadata, + type ScientificDocument, +} from '@/utils/server/scientific-rag'; + export const config = { api: { bodyParser: false, @@ -37,13 +43,11 @@ export default async function handler( const originalDocs = await loader.load(); - console.log(JSON.stringify(originalDocs)); - - const splitter = new RecursiveCharacterTextSplitter({ - chunkSize: 500, - chunkOverlap: 100, - }); + chunkSize: 900, + chunkOverlap: 180, + separators: SCIENTIFIC_TEXT_SEPARATORS, + }); const docs = await splitter.splitDocuments(originalDocs); @@ -75,27 +79,20 @@ export default async function handler( } } -function processDocuments(docs: any) { - const ids = []; +function processDocuments(docs: ScientificDocument[]) { + const ids: string[] = []; const metadatas = []; - const documentContents = []; + const documentContents: string[] = []; - for (const document of docs) { + for (let index = 0; index < docs.length; index += 1) { + const document = docs[index]; // Generate an ID for each document, or use some existing unique identifier const id = uuidv4(); ids.push(id); - const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; - - const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle; + const fallbackTitle = path.basename(document.metadata.source ?? 'document.pdf'); - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info - }; + const metadata = buildScientificMetadata(document, fallbackTitle, index); metadatas.push(metadata); // Add the page content to the documents array diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..2a3c98c 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -1,8 +1,9 @@ import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const'; import { OpenAIError, OpenAIStream } from '@/utils/server'; -import { codeBlock, oneLine } from 'common-tags' +import { codeBlock, oneLine } from 'common-tags'; import { ChatBody, Message } from '@/types/chat'; +import { formatRetrievedDocument } from '@/utils/server/scientific-rag'; // @ts-expect-error import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module'; @@ -17,11 +18,10 @@ export const config = { // Function to fetch and format documents async function fetchAndFormatDocuments(lastMessageContent: string) { try { - console.log("fetching documents") const response = await fetch('http://localhost:3000/api/fetch-documents', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ input: lastMessageContent }), + body: JSON.stringify({ input: lastMessageContent, nResults: 6 }), }); if (!response.ok) { @@ -30,16 +30,19 @@ async function fetchAndFormatDocuments(lastMessageContent: string) { const data = await response.json(); const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`; - }).join(''); - - console.log(result); + return formatRetrievedDocument({ + content: data.documents[0][index], + metadata, + distance: data.distances?.[0]?.[index], + index, + }); + }).join('\n\n---\n\n'); return result; } catch (error) { console.error('Error fetching and formatting documents:', error); - throw error; // You may want to throw a more specific error object here + throw error; } } @@ -64,7 +67,7 @@ const handler = async (req: Request): Promise => { ${oneLine` You are a very enthusiastic AI assistant who loves to help people! Given the following information from - relevant documentation, answer the user's question using + relevant scientific documentation, answer the user's question using only that information, outputted in markdown format. `} @@ -75,7 +78,7 @@ const handler = async (req: Request): Promise => { `} ${oneLine` - Always include citations from the documentation. + Every factual claim must include citation keys from the documentation. `} `; @@ -100,9 +103,6 @@ const handler = async (req: Request): Promise => { encoding.free(); - console.log(model, promptToSend, temperatureToUse, key, messagesToSend); - - messagesToSend = [ { role: "user", @@ -121,6 +121,14 @@ const handler = async (req: Request): Promise => { ${oneLine` - Do not make up answers that are not provided in the documentation. `} + ${oneLine` + - Cite sources using the exact citation keys shown in square brackets, + for example [paper-title:p3:c2]. + `} + ${oneLine` + - Prefer sources with lower retrieval distance when multiple sources + contain similar information. + `} ${oneLine` - If you are unsure and the answer is not explicitly written in the documentation context, say diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts new file mode 100644 index 0000000..47d4711 --- /dev/null +++ b/ui/utils/server/scientific-rag.ts @@ -0,0 +1,154 @@ +export type ScientificDocument = { + pageContent: string; + metadata: { + loc?: { + pageNumber?: number; + }; + pdf?: { + info?: { + Title?: string; + }; + }; + source?: string; + [key: string]: unknown; + }; +}; + +export type ScientificChunkMetadata = { + title: string; + page: number | string; + source: string; + section: string; + chunkIndex: number; + citationKey: string; +}; + +const SCIENTIFIC_SECTIONS = [ + 'abstract', + 'introduction', + 'background', + 'methods', + 'methodology', + 'materials and methods', + 'results', + 'discussion', + 'limitations', + 'conclusion', + 'references', +]; + +export const SCIENTIFIC_TEXT_SEPARATORS = [ + '\nAbstract', + '\nABSTRACT', + '\nIntroduction', + '\nINTRODUCTION', + '\nMethods', + '\nMETHODS', + '\nMaterials and Methods', + '\nResults', + '\nRESULTS', + '\nDiscussion', + '\nDISCUSSION', + '\nConclusion', + '\nCONCLUSION', + '\nReferences', + '\nREFERENCES', + '\n\n', + '\n', + '. ', + ' ', + '', +]; + +export const normalizeTitle = ( + titleFromMetadata: string | undefined, + fallbackTitle: string, +) => { + const title = titleFromMetadata?.trim(); + + return title && title.length > 0 ? title : fallbackTitle; +}; + +export const detectScientificSection = (content: string) => { + const firstLines = content + .split('\n') + .slice(0, 8) + .join(' ') + .toLowerCase(); + + for (const section of SCIENTIFIC_SECTIONS) { + const sectionRegex = new RegExp(`\\b${section}\\b`, 'i'); + + if (sectionRegex.test(firstLines)) { + return section; + } + } + + return 'body'; +}; + +export const buildCitationKey = ({ + title, + page, + chunkIndex, +}: { + title: string; + page: number | string; + chunkIndex: number; +}) => { + const slug = title + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/(^-|-$)/g, '') + .slice(0, 40); + + return `${slug || 'document'}:p${page}:c${chunkIndex + 1}`; +}; + +export const buildScientificMetadata = ( + document: ScientificDocument, + fallbackTitle: string, + chunkIndex: number, +): ScientificChunkMetadata => { + const title = normalizeTitle(document.metadata.pdf?.info?.Title, fallbackTitle); + const page = document.metadata.loc?.pageNumber ?? 'unknown'; + const section = detectScientificSection(document.pageContent); + + return { + title, + page, + source: document.metadata.source ?? fallbackTitle, + section, + chunkIndex, + citationKey: buildCitationKey({ title, page, chunkIndex }), + }; +}; + +export const formatRetrievedDocument = ({ + content, + metadata, + distance, + index, +}: { + content: string; + metadata: Partial; + distance?: number; + index: number; +}) => { + const citationKey = metadata.citationKey ?? `source-${index + 1}`; + const page = metadata.page ?? 'unknown'; + const section = metadata.section ?? 'body'; + const scoreLine = + typeof distance === 'number' ? `Distance: ${distance.toFixed(4)}\n` : ''; + + return [ + `Source ${index + 1} [${citationKey}]`, + `Title: ${metadata.title ?? 'Untitled'}`, + `Page: ${page}`, + `Section: ${section}`, + scoreLine.trim(), + `Content: ${content}`, + ] + .filter(Boolean) + .join('\n'); +}; From 9fd0f8b30950e0a6e19ec5e471986e6c258f2f5e Mon Sep 17 00:00:00 2001 From: Vinzz2303 Date: Thu, 14 May 2026 22:31:15 +0700 Subject: [PATCH 2/4] fix: harden scientific rag retrieval citations --- ui/__tests__/scientific-rag.test.ts | 24 +++++++++++++++++++++++- ui/pages/api/fetch-documents.ts | 15 +++++++++++++-- ui/pages/api/inject-documents.ts | 14 ++++++++++++-- ui/pages/api/rag-chat.ts | 13 ++++++++++--- ui/utils/server/scientific-rag.ts | 25 +++++++++++++++++++++---- 5 files changed, 79 insertions(+), 12 deletions(-) diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts index 015fd76..2e59411 100644 --- a/ui/__tests__/scientific-rag.test.ts +++ b/ui/__tests__/scientific-rag.test.ts @@ -21,7 +21,7 @@ describe('scientific-rag helpers', () => { buildCitationKey({ title: 'Scientific RAG for Papers!', page: 4, - chunkIndex: 2, + pageChunkIndex: 2, }), ).toBe('scientific-rag-for-papers:p4:c3'); }); @@ -48,6 +48,28 @@ describe('scientific-rag helpers', () => { }); }); + it('can build page-local citation keys when global chunk order differs', () => { + const metadata = buildScientificMetadata( + { + pageContent: 'Discussion\nThe citation key should be local to a page.', + metadata: { + loc: { pageNumber: 9 }, + pdf: { info: { Title: 'Long Paper' } }, + source: '/tmp/long-paper.pdf', + }, + }, + 'long-paper.pdf', + 14, + 1, + ); + + expect(metadata).toMatchObject({ + chunkIndex: 14, + pageChunkIndex: 1, + citationKey: 'long-paper:p9:c2', + }); + }); + it('formats retrieved documents with citation metadata', () => { expect( formatRetrievedDocument({ diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index fc95101..08decf3 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -3,12 +3,23 @@ import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { + if (req.method !== 'POST') { + return res.status(405).end(); + } + const client = new ChromaClient({ path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const query = req.body.input; - const nResults = Math.min(Number(req.body.nResults ?? 6), 10); + const query = typeof req.body.input === 'string' ? req.body.input.trim() : ''; + const requestedResults = Number(req.body.nResults ?? 6); + const nResults = Number.isFinite(requestedResults) + ? Math.min(Math.max(Math.trunc(requestedResults), 1), 10) + : 6; + + if (!query) { + return res.status(400).json({ error: 'Missing retrieval query' }); + } const embedder = new TransformersEmbeddingFunction(); diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 8ce9295..daa9ad1 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -83,6 +83,7 @@ function processDocuments(docs: ScientificDocument[]) { const ids: string[] = []; const metadatas = []; const documentContents: string[] = []; + const pageChunkCounts = new Map(); for (let index = 0; index < docs.length; index += 1) { const document = docs[index]; @@ -91,8 +92,17 @@ function processDocuments(docs: ScientificDocument[]) { ids.push(id); const fallbackTitle = path.basename(document.metadata.source ?? 'document.pdf'); - - const metadata = buildScientificMetadata(document, fallbackTitle, index); + const page = document.metadata.loc?.pageNumber ?? 'unknown'; + const pageKey = `${document.metadata.source ?? fallbackTitle}:${page}`; + const pageChunkIndex = pageChunkCounts.get(pageKey) ?? 0; + pageChunkCounts.set(pageKey, pageChunkIndex + 1); + + const metadata = buildScientificMetadata( + document, + fallbackTitle, + index, + pageChunkIndex, + ); metadatas.push(metadata); // Add the page content to the documents array diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index 2a3c98c..5c5859c 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -16,9 +16,12 @@ export const config = { }; // Function to fetch and format documents -async function fetchAndFormatDocuments(lastMessageContent: string) { +async function fetchAndFormatDocuments( + baseUrl: string, + lastMessageContent: string, +) { try { - const response = await fetch('http://localhost:3000/api/fetch-documents', { + const response = await fetch(`${baseUrl}/api/fetch-documents`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ input: lastMessageContent, nResults: 6 }), @@ -88,7 +91,11 @@ const handler = async (req: Request): Promise => { const lastMessage = messages[messages.length - 1]; - const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content); + const baseUrl = new URL(req.url).origin; + const relevantDocuments = await fetchAndFormatDocuments( + baseUrl, + lastMessage.content, + ); let temperatureToUse = temperature; if (temperatureToUse == null) { diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts index 47d4711..4c26a76 100644 --- a/ui/utils/server/scientific-rag.ts +++ b/ui/utils/server/scientific-rag.ts @@ -20,6 +20,7 @@ export type ScientificChunkMetadata = { source: string; section: string; chunkIndex: number; + pageChunkIndex: number; citationKey: string; }; @@ -30,7 +31,10 @@ const SCIENTIFIC_SECTIONS = [ 'methods', 'methodology', 'materials and methods', + 'experimental setup', + 'experiments', 'results', + 'evaluation', 'discussion', 'limitations', 'conclusion', @@ -45,8 +49,15 @@ export const SCIENTIFIC_TEXT_SEPARATORS = [ '\nMethods', '\nMETHODS', '\nMaterials and Methods', + '\nMATERIALS AND METHODS', + '\nExperimental Setup', + '\nEXPERIMENTAL SETUP', + '\nExperiments', + '\nEXPERIMENTS', '\nResults', '\nRESULTS', + '\nEvaluation', + '\nEVALUATION', '\nDiscussion', '\nDISCUSSION', '\nConclusion', @@ -90,11 +101,11 @@ export const detectScientificSection = (content: string) => { export const buildCitationKey = ({ title, page, - chunkIndex, + pageChunkIndex, }: { title: string; page: number | string; - chunkIndex: number; + pageChunkIndex: number; }) => { const slug = title .toLowerCase() @@ -102,13 +113,14 @@ export const buildCitationKey = ({ .replace(/(^-|-$)/g, '') .slice(0, 40); - return `${slug || 'document'}:p${page}:c${chunkIndex + 1}`; + return `${slug || 'document'}:p${page}:c${pageChunkIndex + 1}`; }; export const buildScientificMetadata = ( document: ScientificDocument, fallbackTitle: string, chunkIndex: number, + pageChunkIndex = chunkIndex, ): ScientificChunkMetadata => { const title = normalizeTitle(document.metadata.pdf?.info?.Title, fallbackTitle); const page = document.metadata.loc?.pageNumber ?? 'unknown'; @@ -120,7 +132,8 @@ export const buildScientificMetadata = ( source: document.metadata.source ?? fallbackTitle, section, chunkIndex, - citationKey: buildCitationKey({ title, page, chunkIndex }), + pageChunkIndex, + citationKey: buildCitationKey({ title, page, pageChunkIndex }), }; }; @@ -138,6 +151,7 @@ export const formatRetrievedDocument = ({ const citationKey = metadata.citationKey ?? `source-${index + 1}`; const page = metadata.page ?? 'unknown'; const section = metadata.section ?? 'body'; + const pageChunkIndex = metadata.pageChunkIndex; const scoreLine = typeof distance === 'number' ? `Distance: ${distance.toFixed(4)}\n` : ''; @@ -146,6 +160,9 @@ export const formatRetrievedDocument = ({ `Title: ${metadata.title ?? 'Untitled'}`, `Page: ${page}`, `Section: ${section}`, + typeof pageChunkIndex === 'number' + ? `Page chunk: ${pageChunkIndex + 1}` + : '', scoreLine.trim(), `Content: ${content}`, ] From cef4416cbbcd8c85ad70ce83f1402ba44de50434 Mon Sep 17 00:00:00 2001 From: cerredz <110927971+cerredz@users.noreply.github.com> Date: Sat, 16 May 2026 02:41:07 -0400 Subject: [PATCH 3/4] Fix scientific section specificity Merge follow-up regression for Materials and Methods section detection. --- ui/__tests__/scientific-rag.test.ts | 3 +++ ui/utils/server/scientific-rag.ts | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts index 2e59411..6e5decf 100644 --- a/ui/__tests__/scientific-rag.test.ts +++ b/ui/__tests__/scientific-rag.test.ts @@ -14,6 +14,9 @@ describe('scientific-rag helpers', () => { expect(detectScientificSection('METHODS\nWe used a benchmark.')).toBe( 'methods', ); + expect( + detectScientificSection('Materials and Methods\nWe collected samples.'), + ).toBe('materials and methods'); }); it('builds stable citation keys', () => { diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts index 4c26a76..644c25a 100644 --- a/ui/utils/server/scientific-rag.ts +++ b/ui/utils/server/scientific-rag.ts @@ -28,9 +28,9 @@ const SCIENTIFIC_SECTIONS = [ 'abstract', 'introduction', 'background', - 'methods', - 'methodology', 'materials and methods', + 'methodology', + 'methods', 'experimental setup', 'experiments', 'results', From 7bd30ba69bcf522f70b24bb2a4dca0f2a1dc3e2c Mon Sep 17 00:00:00 2001 From: Vinzz2303 Date: Sun, 17 May 2026 22:55:25 +0700 Subject: [PATCH 4/4] fix: harden scientific rag empty retrieval handling --- ui/__tests__/scientific-rag.test.ts | 14 +++++++++ ui/pages/api/inject-documents.ts | 11 +++++-- ui/pages/api/rag-chat.ts | 15 +++------- ui/utils/server/scientific-rag.ts | 45 +++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 13 deletions(-) diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts index 6e5decf..8e17da3 100644 --- a/ui/__tests__/scientific-rag.test.ts +++ b/ui/__tests__/scientific-rag.test.ts @@ -3,6 +3,7 @@ import { buildScientificMetadata, detectScientificSection, formatRetrievedDocument, + formatRetrievedDocuments, } from '@/utils/server/scientific-rag'; import { describe, expect, it } from 'vitest'; @@ -88,4 +89,17 @@ describe('scientific-rag helpers', () => { }), ).toContain('Source 1 [paper:p2:c1]'); }); + + it('formats Chroma retrieval results defensively', () => { + expect( + formatRetrievedDocuments({ + documents: [['Document context.']], + metadatas: [[{ citationKey: 'paper:p1:c1', title: 'Paper', page: 1 }]], + distances: [[0.42]], + }), + ).toContain('Source 1 [paper:p1:c1]'); + + expect(formatRetrievedDocuments({ documents: [[]] })).toBe(''); + expect(formatRetrievedDocuments({ documents: undefined })).toBe(''); + }); }); diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index daa9ad1..1dbf39e 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -39,7 +39,14 @@ export default async function handler( path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const loader = new PDFLoader(files.pdf[0].filepath); + const pdf = files.pdf; + const pdfFile = Array.isArray(pdf) ? pdf[0] : pdf; + + if (!pdfFile || typeof pdfFile.filepath !== 'string') { + return res.status(400).json({ error: 'Missing PDF upload' }); + } + + const loader = new PDFLoader(pdfFile.filepath); const originalDocs = await loader.load(); @@ -50,7 +57,7 @@ export default async function handler( }); const docs = await splitter.splitDocuments(originalDocs); - + // Process the documents and perform other logic const { ids, metadatas, documentContents } = processDocuments(docs); diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index 5c5859c..5d42eb6 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -3,7 +3,7 @@ import { OpenAIError, OpenAIStream } from '@/utils/server'; import { codeBlock, oneLine } from 'common-tags'; import { ChatBody, Message } from '@/types/chat'; -import { formatRetrievedDocument } from '@/utils/server/scientific-rag'; +import { formatRetrievedDocuments } from '@/utils/server/scientific-rag'; // @ts-expect-error import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module'; @@ -32,16 +32,9 @@ async function fetchAndFormatDocuments( } const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return formatRetrievedDocument({ - content: data.documents[0][index], - metadata, - distance: data.distances?.[0]?.[index], - index, - }); - }).join('\n\n---\n\n'); - - return result; + const result = formatRetrievedDocuments(data); + + return result || 'No relevant documents were retrieved.'; } catch (error) { console.error('Error fetching and formatting documents:', error); diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts index 644c25a..eb36655 100644 --- a/ui/utils/server/scientific-rag.ts +++ b/ui/utils/server/scientific-rag.ts @@ -169,3 +169,48 @@ export const formatRetrievedDocument = ({ .filter(Boolean) .join('\n'); }; + +export const formatRetrievedDocuments = (data: { + documents?: unknown; + metadatas?: unknown; + distances?: unknown; +}) => { + const documents = Array.isArray(data.documents) + ? (data.documents[0] as unknown) + : undefined; + const metadatas = Array.isArray(data.metadatas) + ? (data.metadatas[0] as unknown) + : undefined; + const distances = Array.isArray(data.distances) + ? (data.distances[0] as unknown) + : undefined; + + if (!Array.isArray(documents) || documents.length === 0) { + return ''; + } + + return documents + .map((content, index) => { + if (typeof content !== 'string' || content.trim().length === 0) { + return ''; + } + + const metadata = + Array.isArray(metadatas) && typeof metadatas[index] === 'object' + ? (metadatas[index] as Partial) + : {}; + const distance = + Array.isArray(distances) && typeof distances[index] === 'number' + ? distances[index] + : undefined; + + return formatRetrievedDocument({ + content, + metadata, + distance, + index, + }); + }) + .filter(Boolean) + .join('\n\n---\n\n'); +};