aietal · kkudumu · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/ui/__tests__/rag-ingest.test.ts b/ui/__tests__/rag-ingest.test.ts
@@ -0,0 +1,171 @@
+import {
+  buildDocumentMetadata,
+  buildSourceHash,
+  getFirstDoi,
+  getPdfTitle,
+  getPublicationYear,
+  getSourceName,
+  prepareDocumentsForChroma,
+} from '@/utils/server/rag-ingest';
+
+import { describe, expect, it } from 'vitest';
+
+// Covers the pure helpers that turn loaded PDF chunks into Chroma payloads.
+describe('RAG document ingestion helpers', () => {
+  it('uses PDF title metadata when present', () => {
+    const document = {
+      pageContent: 'Findings text',
+      metadata: {
+        source: '/tmp/fallback.pdf',
+        loc: { pageNumber: 3 },
+        pdf: { info: { Title: '  Trial Protocol  ' } },
+      },
+    };
+
+    expect(getPdfTitle(document)).toBe('Trial Protocol');
+    expect(buildDocumentMetadata(document, 2)).toEqual({
+      title: 'Trial Protocol',
+      page: 3,
+      source: '/tmp/fallback.pdf',
+      chunk: 2,
+      contentLength: 13,
+      sourceHash: buildSourceHash(document),
+      doi: '',
+      publicationYear: 0,
+    });
+  });
+
+  it('falls back to the source file name when PDF title metadata is missing', () => {
+    const document = {
+      pageContent: 'Methods text',
+      metadata: {
+        source: '/uploads/research-paper.pdf',
+        loc: { pageNumber: 7 },
+      },
+    };
+
+    expect(getPdfTitle(document)).toBeUndefined();
+    expect(getSourceName(document)).toBe('research-paper.pdf');
+    expect(buildDocumentMetadata(document, 0).title).toBe('research-paper.pdf');
+  });
+
+  it('handles missing metadata without throwing', () => {
+    const metadata = buildDocumentMetadata({ pageContent: 'Abstract text' }, 0);
+
+    expect(metadata).toEqual({
+      title: 'uploaded-document',
+      page: 0,
+      source: 'uploaded-document',
+      chunk: 0,
+      contentLength: 13,
+      sourceHash: buildSourceHash({ pageContent: 'Abstract text' }),
+      doi: '',
+      publicationYear: 0,
+    });
+  });
+
+  it('extracts citation metadata and stable source hashes for retrieval', () => {
+    const document = {
+      pageContent:
+        'Rivera et al. 2025 reported supporting evidence in DOI 10.1016/j.watres.2025.120001.',
+      metadata: {
+        source: '/uploads/flood-study.pdf',
+        loc: { pageNumber: 12 },
+      },
+    };
+    const metadata = buildDocumentMetadata(document, 4);
+
+    expect(getFirstDoi(`${document.pageContent}.`)).toBe(
+      '10.1016/j.watres.2025.120001',
+    );
+    expect(getPublicationYear(document.pageContent)).toBe(2025);
+    expect(metadata).toMatchObject({
+      title: 'flood-study.pdf',
+      page: 12,
+      source: '/uploads/flood-study.pdf',
+      chunk: 4,
+      doi: '10.1016/j.watres.2025.120001',
+      publicationYear: 2025,
+    });
+    expect(metadata.sourceHash).toHaveLength(16);
+    expect(metadata.sourceHash).toBe(buildSourceHash(document));
+  });
+
+  it('returns undefined when text has no DOI or publication year', () => {
+    expect(getFirstDoi('No identifier in this passage')).toBeUndefined();
+    expect(getPublicationYear('No date in this passage')).toBeUndefined();
+  });
+
+  it('strips trailing sentence punctuation from a DOI', () => {
+    expect(getFirstDoi('See 10.1000/abc123; and others')).toBe(
+      '10.1000/abc123',
+    );
+  });
+
+  it('skips blank chunks and assigns dense chunk indices', () => {
+    let nextId = 0;
+    const prepared = prepareDocumentsForChroma(
+      [
+        { pageContent: '  ' },
+        { pageContent: ' First chunk ', metadata: { source: 'first.pdf' } },
+        { pageContent: '\n\n' },
+        { pageContent: 'Second chunk', metadata: { source: 'second.pdf' } },
+      ],
+      () => `doc-${++nextId}`,
+    );
+
+    expect(prepared).toEqual({
+      ids: ['doc-1', 'doc-2'],
+      metadatas: [
+        {
+          title: 'first.pdf',
+          page: 0,
+          source: 'first.pdf',
+          chunk: 0,
+          contentLength: 11,
+          sourceHash: buildSourceHash({
+            pageContent: 'First chunk',
+            metadata: { source: 'first.pdf' },
+          }),
+          doi: '',
+          publicationYear: 0,
+        },
+        {
+          title: 'second.pdf',
+          page: 0,
+          source: 'second.pdf',
+          chunk: 1,
+          contentLength: 12,
+          sourceHash: buildSourceHash({
+            pageContent: 'Second chunk',
+            metadata: { source: 'second.pdf' },
+          }),
+          doi: '',
+          publicationYear: 0,
+        },
+      ],
+      documentContents: ['First chunk', 'Second chunk'],
+    });
+    // One id per kept chunk: the two blank chunks must not consume ids.
+    expect(nextId).toBe(2);
+  });
+
+  it('returns an empty prepared payload when every chunk is blank', () => {
+    let idCalls = 0;
+    const prepared = prepareDocumentsForChroma(
+      [{ pageContent: '  ' }, { pageContent: '\n\n' }],
+      () => {
+        idCalls += 1;
+        return 'unused';
+      },
+    );
+
+    expect(prepared).toEqual({
+      ids: [],
+      metadatas: [],
+      documentContents: [],
+    });
+    // Blank chunks must be skipped before an id is requested.
+    expect(idCalls).toBe(0);
+  });
+});
diff --git a/ui/docs/rag-ingest-demo.mp4 b/ui/docs/rag-ingest-demo.mp4
diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts
@@ -1,11 +1,11 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
 
+import { prepareDocumentsForChroma } from '@/utils/server/rag-ingest';
+
 import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
 import { IncomingForm } from 'formidable';
 import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
-
-import path from 'path';
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { v4 as uuidv4 } from 'uuid';
 
 export const config = {
@@ -29,26 +29,40 @@ export default async function handler(
         return res.status(400).json({ error: 'Failed to upload file' });
       }
 
+      const pdfFiles = Array.isArray(files.pdf)
+        ? files.pdf
+        : [files.pdf].filter(Boolean);
+
+      if (!pdfFiles.length) {
+        return res.status(400).json({ error: 'Missing PDF file upload' });
+      }
+
       const client = new ChromaClient({
         path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
       });
 
-      const loader = new PDFLoader(files.pdf[0].filepath);
+      const loader = new PDFLoader(pdfFiles[0].filepath);
 
       const originalDocs = await loader.load();
 
-      console.log(JSON.stringify(originalDocs));
-
-
       const splitter = new RecursiveCharacterTextSplitter({
         chunkSize: 500,
         chunkOverlap: 100,
-      });      
+      });
 
       const docs = await splitter.splitDocuments(originalDocs);
- 
+
       // Process the documents and perform other logic
-      const { ids, metadatas, documentContents } = processDocuments(docs);
+      const { ids, metadatas, documentContents } = prepareDocumentsForChroma(
+        docs,
+        uuidv4,
+      );
+
+      if (!documentContents.length) {
+        return res
+          .status(400)
+          .json({ error: 'PDF did not contain ingestible text' });
+      }
 
       const embedder = new TransformersEmbeddingFunction();
       const collection = await client.getOrCreateCollection({
@@ -74,33 +88,3 @@ export default async function handler(
       .json({ message: 'An error occurred while processing the documents' });
   }
 }
-
-function processDocuments(docs: any) {
-  const ids = [];
-  const metadatas = [];
-  const documentContents = [];
-
-  for (const document of docs) {
-    // Generate an ID for each document, or use some existing unique identifier
-    const id = uuidv4();
-    ids.push(id);
-
-    const fallbackTitle = path.basename(document.metadata.source);
-    const titleFromMetadata = document.metadata.pdf.info.Title;
-
-    const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;
-
-
-    const metadata = {
-      title: title,
-      page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
-      source: document.metadata.source, // Define this function to extract verse info
-    };
-    metadatas.push(metadata);
-
-    // Add the page content to the documents array
-    documentContents.push(document.pageContent);
-  }
-
-  return { ids, metadatas, documentContents };
-}
diff --git a/ui/utils/server/rag-ingest.ts b/ui/utils/server/rag-ingest.ts
@@ -0,0 +1,241 @@
+import { createHash } from 'crypto';
+import path from 'path';
+
+/** Fallback `title`/`source` for documents without a usable source. */
+const FALLBACK_SOURCE = 'uploaded-document';
+
+/**
+ * DOI-like token: `10.`, a 4-9 digit prefix, `/`, then a run of suffix
+ * characters. Matching is case-insensitive.
+ */
+const DOI_PATTERN = /10\.\d{4,9}\/[-._;()/:A-Z0-9]+/i;
+
+/** Standalone four-digit number beginning with 19 or 20. */
+const YEAR_PATTERN = /\b(?:19|20)\d{2}\b/;
+
+/**
+ * Structural view of a loaded document chunk. Every field is optional so the
+ * helpers in this module can be called on partially populated documents.
+ */
+export interface RagLoadedDocument {
+  /** Text of the chunk. */
+  pageContent?: string;
+  metadata?: {
+    /** Source path or name of the originating file. */
+    source?: string;
+    loc?: {
+      /** Page number reported for the chunk. */
+      pageNumber?: number;
+    };
+    pdf?: {
+      info?: {
+        /** Title recorded in the PDF metadata. */
+        Title?: string;
+      };
+    };
+  };
+}
+
+/**
+ * Flat metadata record built for each stored chunk. Values are primitives
+ * only (see the index signature); absent values are stored as `''` or `0`
+ * instead of `undefined`.
+ */
+export interface RagDocumentMetadata {
+  [key: string]: string | number | boolean;
+  /** PDF title when present, otherwise the source file name. */
+  title: string;
+  /** Page number of the chunk, or `0` when unknown. */
+  page: number;
+  /** Trimmed source path, or `'uploaded-document'` when missing or blank. */
+  source: string;
+  /** Index supplied by the caller for this chunk. */
+  chunk: number;
+  /** `length` of the chunk text. */
+  contentLength: number;
+  /** 16-character hex digest derived from source and page. */
+  sourceHash: string;
+  /** First DOI found in the chunk text, or `''`. */
+  doi: string;
+  /** First year-like number found in the chunk text, or `0`. */
+  publicationYear: number;
+}
+
+/** Parallel arrays: index `i` of each array describes the same chunk. */
+export interface PreparedRagDocuments {
+  ids: string[];
+  metadatas: RagDocumentMetadata[];
+  documentContents: string[];
+}
+
+/** Produces the identifier for one stored chunk. */
+export type IdFactory = () => string;
+
+/** Trimmed `metadata.source`, or `undefined` when missing or blank. */
+function getNormalizedSource(document: RagLoadedDocument): string | undefined {
+  return document.metadata?.source?.trim() || undefined;
+}
+
+/** True when `value` contains more `)` than `(`. */
+function hasUnbalancedClosingParen(value: string): boolean {
+  // `split` yields one more piece than there are separators, so comparing
+  // piece counts compares the number of each parenthesis.
+  return value.split(')').length > value.split('(').length;
+}
+
+/**
+ * Returns the trimmed PDF title from `metadata.pdf.info.Title`.
+ *
+ * @returns The title, or `undefined` when it is missing or blank.
+ */
+export function getPdfTitle(document: RagLoadedDocument): string | undefined {
+  return document.metadata?.pdf?.info?.Title?.trim() || undefined;
+}
+
+/**
+ * Returns the file-name portion of the document's source path.
+ *
+ * @returns The base name, or `'uploaded-document'` when the source is
+ * missing, blank, or has no base name.
+ */
+export function getSourceName(document: RagLoadedDocument): string {
+  const source = getNormalizedSource(document);
+
+  if (!source) {
+    return FALLBACK_SOURCE;
+  }
+
+  return path.basename(source) || FALLBACK_SOURCE;
+}
+
+/**
+ * Extracts the first DOI-like token from `text`.
+ *
+ * Sentence punctuation directly after the DOI is removed, as is a closing
+ * parenthesis that has no opening partner inside the DOI (for example the
+ * `)` in `(doi:10.1000/xyz)`). Balanced parentheses are kept because they
+ * can be part of the identifier itself.
+ *
+ * @returns The DOI, or `undefined` when `text` contains none.
+ */
+export function getFirstDoi(text: string): string | undefined {
+  const match = text.match(DOI_PATTERN);
+
+  if (!match) {
+    return undefined;
+  }
+
+  let doi = match[0];
+
+  // Repeat until stable: dropping a `)` can expose more punctuation and the
+  // other way round, e.g. `10.1000/xyz.).`.
+  for (;;) {
+    let trimmed = doi.replace(/[.,;:]+$/, '');
+
+    if (trimmed.endsWith(')') && hasUnbalancedClosingParen(trimmed)) {
+      trimmed = trimmed.slice(0, -1);
+    }
+
+    if (trimmed === doi) {
+      return doi;
+    }
+
+    doi = trimmed;
+  }
+}
+
+/**
+ * Returns the first standalone 19xx/20xx number in `text` as a year.
+ *
+ * This is a heuristic: any matching four-digit number counts, including one
+ * embedded in a DOI suffix.
+ *
+ * @returns The year, or `undefined` when no candidate is found.
+ */
+export function getPublicationYear(text: string): number | undefined {
+  const match = text.match(YEAR_PATTERN);
+
+  return match ? Number(match[0]) : undefined;
+}
+
+/**
+ * Builds a short identifier for the document's source/page pair.
+ *
+ * Only the source and page number are hashed, so chunks from the same page
+ * of the same source share one hash.
+ *
+ * @returns The first 16 hex characters of the SHA-256 of `source:page`.
+ */
+export function buildSourceHash(document: RagLoadedDocument): string {
+  const source = getNormalizedSource(document) ?? FALLBACK_SOURCE;
+  const page = document.metadata?.loc?.pageNumber ?? 0;
+
+  return createHash('sha256')
+    .update(`${source}:${page}`)
+    .digest('hex')
+    .slice(0, 16);
+}
+
+/**
+ * Builds the metadata record for one chunk.
+ *
+ * @param document - Chunk to describe; missing fields fall back to defaults.
+ * @param chunk - Index to record for the chunk.
+ * @returns Metadata with `''`/`0` in place of values that were not found.
+ */
+export function buildDocumentMetadata(
+  document: RagLoadedDocument,
+  chunk: number,
+): RagDocumentMetadata {
+  const content = document.pageContent ?? '';
+
+  return {
+    title: getPdfTitle(document) ?? getSourceName(document),
+    page: document.metadata?.loc?.pageNumber ?? 0,
+    // Normalised exactly as in getSourceName/buildSourceHash so a blank
+    // source cannot yield an empty `source` next to a fallback `title`.
+    source: getNormalizedSource(document) ?? FALLBACK_SOURCE,
+    chunk,
+    contentLength: content.length,
+    sourceHash: buildSourceHash(document),
+    doi: getFirstDoi(content) ?? '',
+    publicationYear: getPublicationYear(content) ?? 0,
+  };
+}
+
+/**
+ * Converts document chunks into the parallel arrays used for storage.
+ *
+ * Chunks whose text is missing or whitespace-only are skipped before an id is
+ * requested, so `idFactory` runs once per kept chunk and chunk indices have
+ * no gaps. Stored text is trimmed.
+ *
+ * @param docs - Chunks in the order they should be stored.
+ * @param idFactory - Called once for each kept chunk to obtain its id.
+ * @returns Ids, metadata and text for the kept chunks, in input order.
+ */
+export function prepareDocumentsForChroma(
+  docs: readonly RagLoadedDocument[],
+  idFactory: IdFactory,
+): PreparedRagDocuments {
+  const ids: string[] = [];
+  const metadatas: RagDocumentMetadata[] = [];
+  const documentContents: string[] = [];
+
+  for (const document of docs) {
+    const pageContent = document.pageContent?.trim();
+
+    if (!pageContent) {
+      continue;
+    }
+
+    // Number chunks by how many have been kept so far, not by input position.
+    const chunk = documentContents.length;
+
+    ids.push(idFactory());
+    metadatas.push(buildDocumentMetadata({ ...document, pageContent }, chunk));
+    documentContents.push(pageContent);
+  }
+
+  return { ids, metadatas, documentContents };
+}