diff --git a/ui/__tests__/rag-ingest.test.ts b/ui/__tests__/rag-ingest.test.ts
new file mode 100644
--- /dev/null
+++ b/ui/__tests__/rag-ingest.test.ts
@@ -0,0 +1,170 @@
+import {
+  buildDocumentMetadata,
+  buildSourceHash,
+  getFirstDoi,
+  getPdfTitle,
+  getPublicationYear,
+  getSourceName,
+  prepareDocumentsForChroma,
+} from '@/utils/server/rag-ingest';
+
+import { describe, expect, it } from 'vitest';
+
+describe('RAG document ingestion helpers', () => {
+  it('uses PDF title metadata when present', () => {
+    const document = {
+      pageContent: 'Findings text',
+      metadata: {
+        source: '/tmp/fallback.pdf',
+        loc: { pageNumber: 3 },
+        pdf: { info: { Title: ' Trial Protocol ' } },
+      },
+    };
+
+    expect(getPdfTitle(document)).toBe('Trial Protocol');
+    expect(buildDocumentMetadata(document, 2)).toEqual({
+      title: 'Trial Protocol',
+      page: 3,
+      source: '/tmp/fallback.pdf',
+      chunk: 2,
+      contentLength: 13,
+      sourceHash: buildSourceHash(document),
+      doi: '',
+      publicationYear: 0,
+    });
+  });
+
+  it('falls back to the source file name when PDF title metadata is missing', () => {
+    const document = {
+      pageContent: 'Methods text',
+      metadata: {
+        source: '/uploads/research-paper.pdf',
+        loc: { pageNumber: 7 },
+      },
+    };
+
+    expect(getPdfTitle(document)).toBeUndefined();
+    expect(getSourceName(document)).toBe('research-paper.pdf');
+    expect(buildDocumentMetadata(document, 0).title).toBe('research-paper.pdf');
+  });
+
+  it('handles missing metadata without throwing', () => {
+    const metadata = buildDocumentMetadata({ pageContent: 'Abstract text' }, 0);
+
+    expect(metadata).toEqual({
+      title: 'uploaded-document',
+      page: 0,
+      source: 'uploaded-document',
+      chunk: 0,
+      contentLength: 13,
+      sourceHash: buildSourceHash({ pageContent: 'Abstract text' }),
+      doi: '',
+      publicationYear: 0,
+    });
+  });
+
+  it('extracts citation metadata and stable source hashes for retrieval', () => {
+    const document = {
+      pageContent:
+        'Rivera et al. 2025 reported supporting evidence in DOI 10.1016/j.watres.2025.120001.',
+      metadata: {
+        source: '/uploads/flood-study.pdf',
+        loc: { pageNumber: 12 },
+      },
+    };
+    const metadata = buildDocumentMetadata(document, 4);
+
+    expect(getFirstDoi(`${document.pageContent}.`)).toBe(
+      '10.1016/j.watres.2025.120001',
+    );
+    expect(getPublicationYear(document.pageContent)).toBe(2025);
+    expect(metadata).toMatchObject({
+      title: 'flood-study.pdf',
+      page: 12,
+      source: '/uploads/flood-study.pdf',
+      chunk: 4,
+      doi: '10.1016/j.watres.2025.120001',
+      publicationYear: 2025,
+    });
+    expect(metadata.sourceHash).toHaveLength(16);
+    expect(metadata.sourceHash).toBe(buildSourceHash(document));
+  });
+
+  it('strips sentence parentheses but keeps balanced ones in DOIs', () => {
+    expect(getFirstDoi('(see doi 10.1000/xyz123).')).toBe('10.1000/xyz123');
+    expect(getFirstDoi('10.1016/S0140-6736(20)30183-5.')).toBe(
+      '10.1016/S0140-6736(20)30183-5',
+    );
+    expect(getFirstDoi('no identifier here')).toBeUndefined();
+  });
+
+  it('falls back consistently when the source is blank', () => {
+    const blank = buildDocumentMetadata(
+      { pageContent: 'Body', metadata: { source: '   ' } },
+      0,
+    );
+
+    expect(blank.title).toBe('uploaded-document');
+    expect(blank.source).toBe('uploaded-document');
+    expect(blank.sourceHash).toBe(buildSourceHash({ pageContent: 'Body' }));
+  });
+
+  it('skips blank chunks and assigns dense chunk indices', () => {
+    let nextId = 0;
+    const prepared = prepareDocumentsForChroma(
+      [
+        { pageContent: ' ' },
+        { pageContent: ' First chunk ', metadata: { source: 'first.pdf' } },
+        { pageContent: '\n\n' },
+        { pageContent: 'Second chunk', metadata: { source: 'second.pdf' } },
+      ],
+      () => `doc-${++nextId}`,
+    );
+
+    expect(prepared).toEqual({
+      ids: ['doc-1', 'doc-2'],
+      metadatas: [
+        {
+          title: 'first.pdf',
+          page: 0,
+          source: 'first.pdf',
+          chunk: 0,
+          contentLength: 11,
+          sourceHash: buildSourceHash({
+            pageContent: 'First chunk',
+            metadata: { source: 'first.pdf' },
+          }),
+          doi: '',
+          publicationYear: 0,
+        },
+        {
+          title: 'second.pdf',
+          page: 0,
+          source: 'second.pdf',
+          chunk: 1,
+          contentLength: 12,
+          sourceHash: buildSourceHash({
+            pageContent: 'Second chunk',
+            metadata: { source: 'second.pdf' },
+          }),
+          doi: '',
+          publicationYear: 0,
+        },
+      ],
+      documentContents: ['First chunk', 'Second chunk'],
+    });
+  });
+
+  it('returns an empty prepared payload when every chunk is blank', () => {
+    const prepared = prepareDocumentsForChroma(
+      [{ pageContent: ' ' }, { pageContent: '\n\n' }],
+      () => 'unused',
+    );
+
+    expect(prepared).toEqual({
+      ids: [],
+      metadatas: [],
+      documentContents: [],
+    });
+  });
+});
diff --git a/ui/docs/rag-ingest-demo.mp4 b/ui/docs/rag-ingest-demo.mp4
new file mode 100644
index 0000000..70a21ea
Binary files /dev/null and b/ui/docs/rag-ingest-demo.mp4 differ
diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts
index 532a635..c2654df 100644
--- a/ui/pages/api/inject-documents.ts
+++ b/ui/pages/api/inject-documents.ts
@@ -1,11 +1,11 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
 
+import { prepareDocumentsForChroma } from '@/utils/server/rag-ingest';
+
 import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
 import { IncomingForm } from 'formidable';
 import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
-
-import path from 'path';
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { v4 as uuidv4 } from 'uuid';
 
 export const config = {
@@ -29,26 +29,40 @@ export default async function handler(
         return res.status(400).json({ error: 'Failed to upload file' });
       }
 
+      const pdfFiles = Array.isArray(files.pdf)
+        ? files.pdf
+        : [files.pdf].filter(Boolean);
+
+      if (!pdfFiles.length) {
+        return res.status(400).json({ error: 'Missing PDF file upload' });
+      }
+
       const client = new ChromaClient({
         path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
       });
 
-      const loader = new PDFLoader(files.pdf[0].filepath);
+      const loader = new PDFLoader(pdfFiles[0].filepath);
 
       const originalDocs = await loader.load();
 
-      console.log(JSON.stringify(originalDocs));
-
-
       const splitter = new RecursiveCharacterTextSplitter({
         chunkSize: 500,
         chunkOverlap: 100,
-    });
+      });
 
       const docs = await splitter.splitDocuments(originalDocs);
-      
+
       // Process the documents and perform other logic
-      const { ids, metadatas, documentContents } = processDocuments(docs);
+      const { ids, metadatas, documentContents } = prepareDocumentsForChroma(
+        docs,
+        uuidv4,
+      );
+
+      if (!documentContents.length) {
+        return res
+          .status(400)
+          .json({ error: 'PDF did not contain ingestible text' });
+      }
 
       const embedder = new TransformersEmbeddingFunction();
       const collection = await client.getOrCreateCollection({
@@ -74,33 +88,3 @@ export default async function handler(
       .json({ message: 'An error occurred while processing the documents' });
   }
 }
-
-function processDocuments(docs: any) {
-  const ids = [];
-  const metadatas = [];
-  const documentContents = [];
-
-  for (const document of docs) {
-    // Generate an ID for each document, or use some existing unique identifier
-    const id = uuidv4();
-    ids.push(id);
-
-    const fallbackTitle = path.basename(document.metadata.source);
-    const titleFromMetadata = document.metadata.pdf.info.Title;
-
-    const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;
-
-
-    const metadata = {
-      title: title,
-      page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
-      source: document.metadata.source, // Define this function to extract verse info
-    };
-    metadatas.push(metadata);
-
-    // Add the page content to the documents array
-    documentContents.push(document.pageContent);
-  }
-
-  return { ids, metadatas, documentContents };
-}
diff --git a/ui/utils/server/rag-ingest.ts b/ui/utils/server/rag-ingest.ts
new file mode 100644
--- /dev/null
+++ b/ui/utils/server/rag-ingest.ts
@@ -0,0 +1,191 @@
+import { createHash } from 'crypto';
+import path from 'path';
+
+/** Placeholder used when a document carries no usable source path. */
+const FALLBACK_SOURCE = 'uploaded-document';
+
+/** DOI shape `10.<registrant>/<suffix>`; the suffix may contain parentheses. */
+const DOI_PATTERN = /10\.\d{4,9}\/[-._;()/:A-Z0-9]+/i;
+
+/** Sentence punctuation that commonly trails a DOI in running text. */
+const TRAILING_PUNCTUATION = /[.,;:]+$/;
+
+/** First standalone four-digit token of the form 19xx or 20xx. */
+const YEAR_PATTERN = /\b(?:19|20)\d{2}\b/;
+
+/** Minimal shape of a loaded/split PDF chunk; every field may be absent. */
+export interface RagLoadedDocument {
+  pageContent?: string;
+  metadata?: {
+    source?: string;
+    loc?: {
+      pageNumber?: number;
+    };
+    pdf?: {
+      info?: {
+        Title?: string;
+      };
+    };
+  };
+}
+
+/**
+ * Flat metadata record stored next to each chunk. Missing values are encoded
+ * as `''` / `0` so every field is always present with a primitive value.
+ */
+export interface RagDocumentMetadata {
+  [key: string]: string | number | boolean;
+  title: string;
+  page: number;
+  source: string;
+  chunk: number;
+  contentLength: number;
+  sourceHash: string;
+  doi: string;
+  publicationYear: number;
+}
+
+/** Parallel arrays (same length, same order) describing the kept chunks. */
+export interface PreparedRagDocuments {
+  ids: string[];
+  metadatas: RagDocumentMetadata[];
+  documentContents: string[];
+}
+
+/** Produces one unique id per call. */
+export type IdFactory = () => string;
+
+/** Trimmed source path of the document, or the fallback when blank/missing. */
+function resolveSource(document: RagLoadedDocument): string {
+  return document.metadata?.source?.trim() || FALLBACK_SOURCE;
+}
+
+/** Counts occurrences of a single character in `text`. */
+function countChar(text: string, char: string): number {
+  return text.split(char).length - 1;
+}
+
+/**
+ * Returns the trimmed PDF `Title` metadata, or `undefined` when it is missing
+ * or blank.
+ */
+export function getPdfTitle(document: RagLoadedDocument): string | undefined {
+  const title = document.metadata?.pdf?.info?.Title?.trim();
+
+  return title ? title : undefined;
+}
+
+/**
+ * Returns the base file name of the document source, falling back to
+ * `'uploaded-document'` when the source is missing, blank, or has no base name.
+ */
+export function getSourceName(document: RagLoadedDocument): string {
+  return path.basename(resolveSource(document)) || FALLBACK_SOURCE;
+}
+
+/**
+ * Returns the first DOI-looking token in `text`, or `undefined` when there is
+ * none. Trailing sentence punctuation and an unmatched closing parenthesis
+ * (as in "(doi 10.1000/xyz).") are stripped; balanced parentheses inside the
+ * DOI are kept.
+ */
+export function getFirstDoi(text: string): string | undefined {
+  const match = text.match(DOI_PATTERN);
+
+  if (!match) {
+    return undefined;
+  }
+
+  let doi = match[0].replace(TRAILING_PUNCTUATION, '');
+
+  // A ")" without a matching "(" belongs to the surrounding sentence.
+  while (doi.endsWith(')') && countChar(doi, ')') > countChar(doi, '(')) {
+    doi = doi.slice(0, -1).replace(TRAILING_PUNCTUATION, '');
+  }
+
+  return doi;
+}
+
+/**
+ * Returns the first standalone 19xx/20xx number in `text`, or `undefined`.
+ * This is a heuristic: any such number matches, including one inside a DOI.
+ */
+export function getPublicationYear(text: string): number | undefined {
+  const match = text.match(YEAR_PATTERN);
+
+  return match ? Number(match[0]) : undefined;
+}
+
+/**
+ * Returns a 16-hex-character SHA-256 prefix of `<source>:<page>`. Chunks cut
+ * from the same page of the same source share the hash.
+ */
+export function buildSourceHash(document: RagLoadedDocument): string {
+  const page = document.metadata?.loc?.pageNumber ?? 0;
+
+  return createHash('sha256')
+    .update(`${resolveSource(document)}:${page}`)
+    .digest('hex')
+    .slice(0, 16);
+}
+
+/**
+ * Builds the metadata record for one chunk.
+ *
+ * @param document - Chunk to describe; missing fields fall back to defaults.
+ * @param chunk - Index of the chunk among the chunks being stored.
+ * @returns Metadata with `''` for a missing DOI and `0` for a missing year.
+ */
+export function buildDocumentMetadata(
+  document: RagLoadedDocument,
+  chunk: number,
+): RagDocumentMetadata {
+  const content = document.pageContent ?? '';
+
+  return {
+    title: getPdfTitle(document) ?? getSourceName(document),
+    page: document.metadata?.loc?.pageNumber ?? 0,
+    // Same normalised value that feeds `title` and `sourceHash`.
+    source: resolveSource(document),
+    chunk,
+    contentLength: content.length,
+    sourceHash: buildSourceHash(document),
+    doi: getFirstDoi(content) ?? '',
+    publicationYear: getPublicationYear(content) ?? 0,
+  };
+}
+
+/**
+ * Converts split documents into the parallel arrays used for storage.
+ *
+ * Chunks whose trimmed content is empty are skipped; kept chunks are stored
+ * trimmed and numbered densely from 0 in input order.
+ *
+ * @param docs - Split documents, in order.
+ * @param idFactory - Called once per kept chunk to obtain its id.
+ */
+export function prepareDocumentsForChroma(
+  docs: readonly RagLoadedDocument[],
+  idFactory: IdFactory,
+): PreparedRagDocuments {
+  const ids: string[] = [];
+  const metadatas: RagDocumentMetadata[] = [];
+  const documentContents: string[] = [];
+
+  for (const document of docs) {
+    const pageContent = document.pageContent?.trim();
+
+    if (!pageContent) {
+      continue;
+    }
+
+    // Index among kept chunks, so skipped blanks leave no gaps.
+    const chunk = documentContents.length;
+
+    ids.push(idFactory());
+    metadatas.push(buildDocumentMetadata({ ...document, pageContent }, chunk));
+    documentContents.push(pageContent);
+  }
+
+  return { ids, metadatas, documentContents };
+}