diff --git a/ISAAC-497-PR-REPORT.md b/ISAAC-497-PR-REPORT.md new file mode 100644 index 0000000..8c9fe01 --- /dev/null +++ b/ISAAC-497-PR-REPORT.md @@ -0,0 +1,69 @@ +# ISAAC-497 PR Report + +## Summary + +This patch improves the existing uploaded-document RAG path so it behaves more like a scientific/research workflow: + +- stores stronger citation metadata for uploaded PDF chunks +- formats retrieved sources with stable bracketed source IDs like `[S1]` +- includes page, DOI, year, and retrieval distance when available +- makes retrieval depth configurable through `RAG_TOP_K` or request body `nResults` +- updates the RAG prompt so factual claims must cite retrieved scientific sources +- adds request validation for document upload and retrieval API routes + +## Files Changed + +- `ui/utils/server/scientific-rag.ts` +- `ui/pages/api/inject-documents.ts` +- `ui/pages/api/fetch-documents.ts` +- `ui/pages/api/rag-chat.ts` + +## Validation + +I could not run the full Next.js build in this workspace because dependencies are not installed. Recommended maintainer validation: + +```bash +cd ui +npm install +npm run lint +npm run build +``` + +Manual validation path: + +1. Start Chroma and the UI with the existing Docker flow. +2. Upload a research PDF. +3. Ask a question that requires evidence from the PDF. +4. Confirm the answer cites source IDs like `[S1]`. +5. Confirm the retrieved context includes page metadata and DOI/year when available. + +## Suggested PR Title + +Improve scientific RAG citations and retrieval metadata + +## Suggested PR Body + +This PR addresses part of ISAAC-497 by strengthening the current uploaded-document RAG pipeline for scientific workflows. + +Changes included: + +- Added a small scientific RAG utility for citation metadata extraction and retrieved-source formatting. +- Preserved source metadata during PDF ingestion, including title, page, source path, source type, chunk index, citation key, DOI, author, and year when available. 
+- Made Chroma retrieval configurable through `RAG_TOP_K` or request `nResults`. +- Included retrieval distances in the formatted source context. +- Updated the RAG prompt to require bracketed source citations like `[S1]` for factual claims. +- Added basic API method/file validation around upload and retrieval routes. + +This is intentionally scoped as an incremental improvement to the existing Chroma/LangChain implementation rather than a full framework rewrite. It should make the current RAG behavior easier to validate and extend toward the broader ISAAC-497 goals around scientific document management, AI access to uploaded documents, performance tuning, and reliable citations. + +Validation note: I was not able to run the full build in my local workspace because dependencies were not installed there. Recommended validation is `cd ui && npm install && npm run lint && npm run build`. + +## Suggested Maintainer Comment + +Hi Isaac team, I prepared an initial implementation for ISAAC-497 focused on the existing uploaded-document RAG path. + +It improves scientific citation handling, stores richer PDF chunk metadata, formats retrieved context with stable source IDs like `[S1]`, exposes configurable retrieval depth, and updates the answer prompt so factual claims must cite retrieved sources. + +If this direction matches the bounty expectations, I can continue with the next slice: Semantic Scholar reference ingestion/unification and a small retrieval evaluation harness. + +For bounty payout, I understand this should go through Algora after the PR is reviewed and merged. 
diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..533c435 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -3,11 +3,16 @@ import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { + if (req.method !== 'POST') { + return res.status(405).end(); + } + const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || "http://chroma-server:8000", }); const query = req.body.input; + const nResults = Number(req.body.nResults || process.env.RAG_TOP_K || 6); const embedder = new TransformersEmbeddingFunction(); @@ -15,8 +20,9 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) // query the collection const results = await collection.query({ - nResults: 4, - queryTexts: [query] + nResults, + queryTexts: [query], + include: ["documents", "metadatas", "distances"], }) res.status(200).json(results); @@ -29,4 +35,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) } res.status(500).json({ error: 'An unexpected error occurred :(' }); } -} \ No newline at end of file +} diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..efbf186 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -5,6 +5,8 @@ import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import { extractScientificMetadata } from '@/utils/server/scientific-rag'; + import path from 'path'; import { v4 as uuidv4 } from 'uuid'; @@ -33,7 +35,13 @@ export default async function handler( path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const loader = new PDFLoader(files.pdf[0].filepath); + const uploadedPdf = 
Array.isArray(files.pdf) ? files.pdf[0] : files.pdf; + + if (!uploadedPdf) { + return res.status(400).json({ error: 'A PDF file is required' }); + } + + const loader = new PDFLoader(uploadedPdf.filepath); const originalDocs = await loader.load(); @@ -80,22 +88,18 @@ function processDocuments(docs: any) { const metadatas = []; const documentContents = []; - for (const document of docs) { + for (const [chunkIndex, document] of docs.entries()) { // Generate an ID for each document, or use some existing unique identifier const id = uuidv4(); ids.push(id); const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; - - const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle; - - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info - }; + const metadata = extractScientificMetadata( + document.pageContent, + document.metadata, + fallbackTitle, + chunkIndex, + ); metadatas.push(metadata); // Add the page content to the documents array diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..5ff87e9 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -3,6 +3,7 @@ import { OpenAIError, OpenAIStream } from '@/utils/server'; import { codeBlock, oneLine } from 'common-tags' import { ChatBody, Message } from '@/types/chat'; +import { formatRetrievedDocuments } from '@/utils/server/scientific-rag'; // @ts-expect-error import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module'; @@ -29,9 +30,7 @@ async function fetchAndFormatDocuments(lastMessageContent: string) { } const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, 
Content: ${data.documents[0][index]}\n`; - }).join(''); + const result = formatRetrievedDocuments(data); console.log(result); @@ -75,7 +74,8 @@ const handler = async (req: Request): Promise => { `} ${oneLine` - Always include citations from the documentation. + Always include citations using the bracketed source IDs, for example [S1]. + For scientific claims, prefer sources with page, DOI, year, or author metadata. `} `; @@ -107,7 +107,7 @@ const handler = async (req: Request): Promise => { { role: "user", content: codeBlock` - Here is the relevant documentation: + Here are the retrieved scientific sources: ${relevantDocuments} `, }, @@ -130,7 +130,8 @@ const handler = async (req: Request): Promise => { - Prefer splitting your response into multiple paragraphs. `} ${oneLine` - - Output as markdown with citations based on the documentation. + - Output as markdown with citations based on the bracketed source IDs. + - Cite every factual claim with one or more source IDs such as [S1]. `} `, },
diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts
new file mode 100644
index 0000000..4871cb0
--- /dev/null
+++ b/ui/utils/server/scientific-rag.ts
@@ -0,0 +1,138 @@
+/**
+ * Helpers for the uploaded-document RAG path: citation metadata for ingested
+ * chunks and a cited source list for retrieved chunks.
+ */
+
+// Loader and Chroma metadata are untyped bags, hence the `any` values.
+type LooseMetadata = Record<string, any>;
+
+/** The parts of a Chroma query response that the formatter reads. */
+type ChromaQueryResults = {
+  documents?: (string | null)[][];
+  metadatas?: (LooseMetadata | null)[][];
+  distances?: (number | null)[][];
+};
+
+/** Citation metadata stored alongside every ingested chunk. */
+export type ScientificDocumentMetadata = {
+  title: string;
+  page?: number | string;
+  source?: string;
+  sourceType: 'uploaded_pdf' | 'semantic_scholar' | 'unknown';
+  /** `S<chunkIndex + 1>`: a position within one upload, not unique across uploads. */
+  citationKey: string;
+  chunkIndex: number;
+  doi?: string;
+  authors?: string;
+  year?: string;
+  semanticScholarId?: string;
+};
+
+const DOI_PATTERN = /\b10\.\d{4,9}\/[-._;()/:A-Z0-9]+\b/i;
+const YEAR_PATTERN = /\b(19|20)\d{2}\b/;
+// PDF Info dates look like "D:20230115103000+01'00'"; the year is the first four digits.
+const PDF_DATE_YEAR_PATTERN = /^\s*(?:D:)?(\d{4})/;
+
+/** Collapses each run of whitespace to one space and trims both ends. */
+export const normalizeWhitespace = (value: string) =>
+  value.replace(/\s+/g, ' ').trim();
+
+/** Returns the first candidate that is non-empty once normalized, or '' if none is. */
+const firstNonEmpty = (...candidates: unknown[]): string => {
+  for (const candidate of candidates) {
+    const text = candidate == null ? '' : normalizeWhitespace(String(candidate));
+    if (text) {
+      return text;
+    }
+  }
+  return '';
+};
+
+/** Renders `, <label><value>` for a present metadata value and '' for a missing one. */
+const describeField = (value: unknown, label = ''): string =>
+  value == null || value === '' ? '' : `, ${label}${String(value)}`;
+
+/**
+ * Builds the citation metadata stored with one ingested chunk.
+ *
+ * @param content - Chunk text; searched for a DOI and a year when the PDF info has none.
+ * @param rawMetadata - Loader metadata; `pdf.info`, `loc.pageNumber`, `source`, `title`,
+ *   `authors` and `semanticScholarId` are read when present.
+ * @param fallbackTitle - Title used when neither the PDF info nor the metadata has one.
+ * @param chunkIndex - Zero-based position of the chunk within the current upload.
+ * @returns Metadata whose optional fields are omitted when no value was found.
+ */
+export const extractScientificMetadata = (
+  content: string,
+  rawMetadata: LooseMetadata,
+  fallbackTitle: string,
+  chunkIndex: number,
+): ScientificDocumentMetadata => {
+  const raw = rawMetadata ?? {};
+  const pdfInfo = raw.pdf?.info ?? {};
+  const text = content ?? '';
+
+  const title = firstNonEmpty(pdfInfo.Title, raw.title, fallbackTitle) || 'Untitled document';
+  const doi = firstNonEmpty(pdfInfo.DOI, text.match(DOI_PATTERN)?.[0]);
+  // CreationDate is a whole PDF timestamp, so keep only its year rather than the raw string.
+  const creationYear = String(pdfInfo.CreationDate ?? '').match(PDF_DATE_YEAR_PATTERN)?.[1];
+  const year = firstNonEmpty(creationYear, text.match(YEAR_PATTERN)?.[0]);
+  const authors = firstNonEmpty(pdfInfo.Author, raw.authors);
+  const source = String(raw.source || '');
+  const semanticScholarId = raw.semanticScholarId ? String(raw.semanticScholarId) : '';
+  const sourceType = semanticScholarId ? 'semantic_scholar' : source ? 'uploaded_pdf' : 'unknown';
+  const page = raw.loc?.pageNumber;
+
+  return {
+    title,
+    ...(page == null ? {} : { page }),
+    source,
+    sourceType,
+    citationKey: `S${chunkIndex + 1}`,
+    chunkIndex,
+    ...(doi ? { doi } : {}),
+    ...(authors ? { authors } : {}),
+    ...(year ? { year } : {}),
+    ...(semanticScholarId ? { semanticScholarId } : {}),
+  };
+};
+
+/**
+ * Formats the first query's rows of a Chroma response as a cited source list.
+ *
+ * Source IDs follow the position in the rendered list (`[S1]`, `[S2]`, ...), which keeps
+ * them unique within one prompt. The `citationKey` stored at ingestion is a per-upload
+ * chunk position that chunks from different uploads can share, so it is not used here.
+ *
+ * @param results - Chroma query response with optional documents, metadatas and distances.
+ * @returns One block per non-empty document, joined by blank lines; '' when there is none.
+ */
+export const formatRetrievedDocuments = (results: ChromaQueryResults): string => {
+  const documents = results?.documents?.[0] ?? [];
+  const metadatas = results?.metadatas?.[0] ?? [];
+  const distances = results?.distances?.[0] ?? [];
+  const blocks: string[] = [];
+
+  documents.forEach((content, index) => {
+    const text = typeof content === 'string' ? normalizeWhitespace(content) : '';
+    if (!text) {
+      return; // Empty rows are skipped without consuming a source ID.
+    }
+
+    const metadata = metadatas[index] ?? {};
+    const distance = distances[index];
+    const header = [
+      `[S${blocks.length + 1}] ${firstNonEmpty(metadata.title) || 'Untitled document'}`,
+      describeField(metadata.year),
+      describeField(metadata.page, 'page '),
+      describeField(metadata.doi, 'DOI '),
+      typeof distance === 'number' && Number.isFinite(distance)
+        ? `, retrieval distance ${distance.toFixed(4)}`
+        : '',
+    ].join('');
+
+    blocks.push(`${header}\n${text}\n`);
+  });
+
+  return blocks.join('\n');
+};