aietal · Yongzie · May 14, 2026
diff --git a/ISAAC-497-PR-REPORT.md b/ISAAC-497-PR-REPORT.md
@@ -0,0 +1,69 @@
+# ISAAC-497 PR Report
+
+## Summary
+
+This patch improves the existing uploaded-document RAG path so it behaves more like a scientific/research workflow:
+
+- stores stronger citation metadata for uploaded PDF chunks
+- formats retrieved sources with stable bracketed source IDs like `[S1]`
+- includes page, DOI, year, and retrieval distance when available
+- makes retrieval depth configurable through `RAG_TOP_K` or request body `nResults`
+- updates the RAG prompt so factual claims must cite retrieved scientific sources
+- adds basic request validation (HTTP method and uploaded-file checks) to the document upload and retrieval API routes
+
+## Files Changed
+
+- `ui/utils/server/scientific-rag.ts`
+- `ui/pages/api/inject-documents.ts`
+- `ui/pages/api/fetch-documents.ts`
+- `ui/pages/api/rag-chat.ts`
+
+## Validation
+
+I could not run the full Next.js build in this workspace because dependencies were not installed. Recommended maintainer validation:
+
+```bash
+cd ui
+npm install
+npm run lint
+npm run build
+```
+
+Manual validation path:
+
+1. Start Chroma and the UI with the existing Docker flow.
+2. Upload a research PDF.
+3. Ask a question that requires evidence from the PDF.
+4. Confirm the answer cites source IDs like `[S1]`.
+5. Confirm the retrieved context includes page metadata and DOI/year when available.
+
+## Suggested PR Title
+
+Improve scientific RAG citations and retrieval metadata
+
+## Suggested PR Body
+
+This PR addresses part of ISAAC-497 by strengthening the current uploaded-document RAG pipeline for scientific workflows.
+
+Changes included:
+
+- Added a small scientific RAG utility for citation metadata extraction and retrieved-source formatting.
+- Preserved source metadata during PDF ingestion, including title, page, source path, source type, chunk index, citation key, DOI, author, and year when available.
+- Made Chroma retrieval configurable through `RAG_TOP_K` or request `nResults`.
+- Included retrieval distances in the formatted source context.
+- Updated the RAG prompt to require bracketed source citations like `[S1]` for factual claims.
+- Added basic API method/file validation around upload and retrieval routes.
+
+This is intentionally scoped as an incremental improvement to the existing Chroma/LangChain implementation rather than a full framework rewrite. It should make the current RAG behavior easier to validate and extend toward the broader ISAAC-497 goals around scientific document management, AI access to uploaded documents, performance tuning, and reliable citations.
+
+Validation note: I was not able to run the full build in my local workspace because dependencies were not installed there. Recommended validation is `cd ui && npm install && npm run lint && npm run build`.
+
+## Suggested Maintainer Comment
+
+Hi Isaac team, I prepared an initial implementation for ISAAC-497 focused on the existing uploaded-document RAG path.
+
+It improves scientific citation handling, stores richer PDF chunk metadata, formats retrieved context with stable source IDs like `[S1]`, exposes configurable retrieval depth, and updates the answer prompt so factual claims must cite retrieved sources.
+
+If this direction matches the bounty expectations, I can continue with the next slice: Semantic Scholar reference ingestion/unification and a small retrieval evaluation harness.
+
+For bounty payout, I understand this should go through Algora after the PR is reviewed and merged.
diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts
@@ -3,20 +3,26 @@ import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
   try {
+    if (req.method !== 'POST') {
+      return res.status(405).end();
+    }
+
     const client = new ChromaClient({
-      path: "http://chroma-server:8000",
+      path: process.env.CHROMA_PATH || "http://chroma-server:8000",
     });
 
     const query = req.body.input;
+    const nResults = Number(req.body.nResults || process.env.RAG_TOP_K || 6);
 
     const embedder = new TransformersEmbeddingFunction();
 
     const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });
 
   // query the collection
   const results = await collection.query({
-      nResults: 4, 
-      queryTexts: [query]
+      nResults,
+      queryTexts: [query],
+      include: ["documents", "metadatas", "distances"],
   }) 
 
     res.status(200).json(results);
@@ -29,4 +35,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
     }
     res.status(500).json({ error: 'An unexpected error occurred :(' });
   }
-}
+}
diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts
@@ -5,6 +5,8 @@ import { IncomingForm } from 'formidable';
 import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
 
+import { extractScientificMetadata } from '@/utils/server/scientific-rag';
+
 import path from 'path';
 import { v4 as uuidv4 } from 'uuid';
 
@@ -33,7 +35,13 @@ export default async function handler(
         path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
       });
 
-      const loader = new PDFLoader(files.pdf[0].filepath);
+      const uploadedPdf = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf;
+
+      if (!uploadedPdf) {
+        return res.status(400).json({ error: 'A PDF file is required' });
+      }
+
+      const loader = new PDFLoader(uploadedPdf.filepath);
 
       const originalDocs = await loader.load();
 
@@ -80,22 +88,18 @@ function processDocuments(docs: any) {
   const metadatas = [];
   const documentContents = [];
 
-  for (const document of docs) {
+  for (const [chunkIndex, document] of docs.entries()) {
     // Generate an ID for each document, or use some existing unique identifier
     const id = uuidv4();
     ids.push(id);
 
     const fallbackTitle = path.basename(document.metadata.source);
-    const titleFromMetadata = document.metadata.pdf.info.Title;
-
-    const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;
-
-
-    const metadata = {
-      title: title,
-      page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
-      source: document.metadata.source, // Define this function to extract verse info
-    };
+    const metadata = extractScientificMetadata(
+      document.pageContent,
+      document.metadata,
+      fallbackTitle,
+      chunkIndex,
+    );
     metadatas.push(metadata);
 
     // Add the page content to the documents array

diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts
@@ -3,6 +3,7 @@ import { OpenAIError, OpenAIStream } from '@/utils/server';
 import { codeBlock, oneLine } from 'common-tags'
 
 import { ChatBody, Message } from '@/types/chat';
+import { formatRetrievedDocuments } from '@/utils/server/scientific-rag';
 
 // @ts-expect-error
 import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module';
@@ -29,9 +30,7 @@ async function fetchAndFormatDocuments(lastMessageContent: string) {
     }
 
     const data = await response.json();
-    const result = data.metadatas[0].map((metadata: any, index: number) => {
-      return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`;
-    }).join('');
+    const result = formatRetrievedDocuments(data);
 
     console.log(result);
 
@@ -75,7 +74,8 @@ const handler = async (req: Request): Promise<Response> => {
     `}
 
     ${oneLine`
-      Always include citations from the documentation.
+      Always include citations using the bracketed source IDs, for example [S1].
+      For scientific claims, prefer sources with page, DOI, year, or author metadata.
     `}
   `;
 
@@ -107,7 +107,7 @@ const handler = async (req: Request): Promise<Response> => {
       {
         role: "user",
         content: codeBlock`
-          Here is the relevant documentation:
+          Here are the retrieved scientific sources:
           ${relevantDocuments}
         `,
       },
@@ -130,7 +130,8 @@ const handler = async (req: Request): Promise<Response> => {
             - Prefer splitting your response into multiple paragraphs.
           `}
           ${oneLine`
-            - Output as markdown with citations based on the documentation.
+            - Output as markdown with citations based on the bracketed source IDs.
+            - Cite every factual claim with one or more source IDs such as [S1].
           `}
         `,
       },

diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts
@@ -0,0 +1,87 @@
+/**
+ * Subset of a Chroma query response consumed by `formatRetrievedDocuments`.
+ * Every field is a nested array; this file reads only the first inner array
+ * (index 0) of each. All fields are optional and individual entries may be null.
+ */
+type ChromaQueryResults = {
+  documents?: (string | null)[][];
+  metadatas?: (Record<string, unknown> | null)[][];
+  distances?: (number | null)[][];
+};
+
+/** Citation metadata produced for a document chunk by `extractScientificMetadata`. */
+export type ScientificDocumentMetadata = {
+  // Document title; 'Untitled document' when no other title is available.
+  title: string;
+  // Taken from the loader's `loc.pageNumber` when present.
+  page?: number | string;
+  // Raw `source` value from the loader metadata (empty string when absent).
+  source?: string;
+  // 'semantic_scholar' when a semanticScholarId exists, 'uploaded_pdf' when a
+  // source is set, otherwise 'unknown'.
+  sourceType: 'uploaded_pdf' | 'semantic_scholar' | 'unknown';
+  // Bracket-citation ID of the form `S<chunkIndex + 1>`.
+  citationKey: string;
+  // Zero-based index of the chunk as passed in by the caller.
+  chunkIndex: number;
+  // DOI from the PDF Info dictionary or the first DOI-like match in the chunk text.
+  doi?: string;
+  // Author string from the PDF Info dictionary or the raw metadata.
+  authors?: string;
+  // Year (or date) string derived from the PDF Info CreationDate or the chunk text.
+  year?: string;
+  // Copied from the raw metadata when present.
+  semanticScholarId?: string;
+};
+
+// DOI matcher: "10." + 4-9 digits + "/" + a suffix of letters, digits and
+// -._;()/: characters, case-insensitive, delimited by word boundaries.
+const DOI_PATTERN = /\b10\.\d{4,9}\/[-._;()/:A-Z0-9]+\b/i;
+// Four-digit number beginning with 19 or 20, delimited by word boundaries.
+const YEAR_PATTERN = /\b(19|20)\d{2}\b/;
+
+/**
+ * Collapses every run of whitespace in `value` into a single space and removes
+ * leading and trailing whitespace.
+ */
+export const normalizeWhitespace = (value: string) => {
+  // Splitting on whitespace runs leaves empty strings at the edges; drop them.
+  const words = value.split(/\s+/).filter((word) => word.length > 0);
+  return words.join(' ');
+};
+
+// PDF Info date strings look like "D:20230115093000+01'00'"; capture only the
+// leading four-digit year so the raw timestamp is never stored as `year`.
+const PDF_DATE_YEAR_PATTERN = /^\s*(?:D:)?((?:19|20)\d{2})/;
+
+/**
+ * Builds the citation metadata stored alongside one document chunk.
+ *
+ * Values from the PDF Info dictionary (`rawMetadata.pdf.info`) take precedence.
+ * DOI and year fall back to the first pattern match in the chunk text, which is
+ * a heuristic: it may pick up a cited work's DOI or an unrelated year.
+ *
+ * @param content - Text of the chunk, searched for a DOI and a year.
+ * @param rawMetadata - Loader metadata; `pdf.info`, `title`, `authors`,
+ *   `source`, `loc.pageNumber` and `semanticScholarId` are read when present.
+ * @param fallbackTitle - Title used when neither the PDF Info dictionary nor
+ *   the raw metadata provides one.
+ * @param chunkIndex - Zero-based index of the chunk, used for `citationKey`.
+ * @returns Metadata in which `doi`, `authors`, `year` and `semanticScholarId`
+ *   are omitted when no value was found.
+ */
+export const extractScientificMetadata = (
+  content: string,
+  rawMetadata: Record<string, any>,
+  fallbackTitle: string,
+  chunkIndex: number,
+): ScientificDocumentMetadata => {
+  // Tolerate loaders that hand over no metadata or no text at all.
+  const meta: Record<string, any> = rawMetadata ?? {};
+  const pdfInfo = meta.pdf?.info ?? {};
+  const text = content ?? '';
+
+  // The trailing `|| ''` keeps a missing fallback from becoming the title "undefined".
+  const title =
+    normalizeWhitespace(String(pdfInfo.Title || meta.title || fallbackTitle || '')) ||
+    'Untitled document';
+  const doi = normalizeWhitespace(String(pdfInfo.DOI || text.match(DOI_PATTERN)?.[0] || ''));
+  // CreationDate is a full PDF timestamp, so reduce it to its year before use.
+  const creationYear = String(pdfInfo.CreationDate ?? '').match(PDF_DATE_YEAR_PATTERN)?.[1];
+  const year = creationYear || text.match(YEAR_PATTERN)?.[0] || '';
+  const authors = normalizeWhitespace(String(pdfInfo.Author || meta.authors || ''));
+  const source = String(meta.source || '');
+  const sourceType: ScientificDocumentMetadata['sourceType'] = meta.semanticScholarId
+    ? 'semantic_scholar'
+    : source
+      ? 'uploaded_pdf'
+      : 'unknown';
+  const page = meta.loc?.pageNumber;
+
+  return {
+    title,
+    page,
+    source,
+    sourceType,
+    // NOTE: derived from chunkIndex only, so keys repeat across separate uploads.
+    citationKey: `S${chunkIndex + 1}`,
+    chunkIndex,
+    ...(doi ? { doi } : {}),
+    ...(authors ? { authors } : {}),
+    ...(year ? { year } : {}),
+    ...(meta.semanticScholarId
+      ? { semanticScholarId: String(meta.semanticScholarId) }
+      : {}),
+  };
+};
+
+/**
+ * Renders the first result list of a Chroma query as a prompt-ready block of
+ * sources, one entry per non-empty chunk.
+ *
+ * Each entry is a header line
+ * `[key] title, authors, year, page N, DOI X, retrieval distance D`
+ * (parts without a value are omitted) followed by the whitespace-normalized
+ * chunk text. Entries are separated by a blank line.
+ *
+ * Citation keys are unique within the returned text: a stored `citationKey`
+ * is used when it has not appeared yet, otherwise the next free `S<n>` key is
+ * assigned. Stored keys are per-upload chunk indices, so chunks from different
+ * uploads can otherwise share the same key.
+ *
+ * @param results - Query response; only index 0 of each field is read.
+ * @returns The formatted source list, or an empty string when nothing was retrieved.
+ */
+export const formatRetrievedDocuments = (results: ChromaQueryResults): string => {
+  const documents = results?.documents?.[0] ?? [];
+  const metadatas = results?.metadatas?.[0] ?? [];
+  const distances = results?.distances?.[0] ?? [];
+
+  const usedKeys = new Set<string>();
+  const entries: string[] = [];
+
+  documents.forEach((content, index) => {
+    if (!content) {
+      return;
+    }
+
+    // Skip chunks that contain nothing but whitespace instead of emitting a bare header.
+    const body = normalizeWhitespace(content);
+    if (!body) {
+      return;
+    }
+
+    const metadata: Record<string, unknown> = metadatas[index] ?? {};
+
+    // Keep the stored key when possible, but never emit the same key twice.
+    let citationKey = String(metadata.citationKey || `S${index + 1}`);
+    for (let candidate = index + 1; usedKeys.has(citationKey); candidate += 1) {
+      citationKey = `S${candidate}`;
+    }
+    usedKeys.add(citationKey);
+
+    const title = String(metadata.title || 'Untitled document');
+    const authors = metadata.authors ? `, ${metadata.authors}` : '';
+    const year = metadata.year ? `, ${metadata.year}` : '';
+    // Explicit null/empty check so a page value of 0 is still shown.
+    const hasPage =
+      metadata.page !== undefined && metadata.page !== null && metadata.page !== '';
+    const page = hasPage ? `, page ${metadata.page}` : '';
+    const doi = metadata.doi ? `, DOI ${metadata.doi}` : '';
+    const rawDistance = distances[index];
+    const distance =
+      typeof rawDistance === 'number' && Number.isFinite(rawDistance)
+        ? `, retrieval distance ${rawDistance.toFixed(4)}`
+        : '';
+
+    entries.push(
+      `[${citationKey}] ${title}${authors}${year}${page}${doi}${distance}\n${body}\n`,
+    );
+  });
+
+  return entries.join('\n');
+};