Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions ui/__tests__/rag-ingest.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import {
buildDocumentMetadata,
buildSourceHash,
getFirstDoi,
getPdfTitle,
getPublicationYear,
getSourceName,
prepareDocumentsForChroma,
} from '@/utils/server/rag-ingest';

import { describe, expect, it } from 'vitest';

// Unit tests for the helpers imported from '@/utils/server/rag-ingest'.
// Every case feeds plain object literals to the helpers, so no PDF file,
// embedding model or Chroma server is involved.
describe('RAG document ingestion helpers', () => {
// The Title value is padded with spaces; the helpers are expected to hand it
// back trimmed and to prefer it over the source path for `title`.
it('uses PDF title metadata when present', () => {
const document = {
pageContent: 'Findings text',
metadata: {
source: '/tmp/fallback.pdf',
loc: { pageNumber: 3 },
pdf: { info: { Title: ' Trial Protocol ' } },
},
};

expect(getPdfTitle(document)).toBe('Trial Protocol');
expect(buildDocumentMetadata(document, 2)).toEqual({
title: 'Trial Protocol',
page: 3,
source: '/tmp/fallback.pdf',
chunk: 2,
// 'Findings text'.length
contentLength: 13,
sourceHash: buildSourceHash(document),
// No DOI or year appears in the page content, so the placeholders are expected.
doi: '',
publicationYear: 0,
});
});

// With no `pdf` entry in the metadata, the title is expected to be the last
// path segment of `source`.
it('falls back to the source file name when PDF title metadata is missing', () => {
const document = {
pageContent: 'Methods text',
metadata: {
source: '/uploads/research-paper.pdf',
loc: { pageNumber: 7 },
},
};

expect(getPdfTitle(document)).toBeUndefined();
expect(getSourceName(document)).toBe('research-paper.pdf');
expect(buildDocumentMetadata(document, 0).title).toBe('research-paper.pdf');
});

// A document carrying only `pageContent` must still produce a full metadata
// object, with 'uploaded-document' standing in for both title and source.
it('handles missing metadata without throwing', () => {
const metadata = buildDocumentMetadata({ pageContent: 'Abstract text' }, 0);

expect(metadata).toEqual({
title: 'uploaded-document',
page: 0,
source: 'uploaded-document',
chunk: 0,
// 'Abstract text'.length
contentLength: 13,
sourceHash: buildSourceHash({ pageContent: 'Abstract text' }),
doi: '',
publicationYear: 0,
});
});

// The page content mentions both a year and a DOI, and ends with a sentence
// period directly after the DOI.
it('extracts citation metadata and stable source hashes for retrieval', () => {
const document = {
pageContent:
'Rivera et al. 2025 reported supporting evidence in DOI 10.1016/j.watres.2025.120001.',
metadata: {
source: '/uploads/flood-study.pdf',
loc: { pageNumber: 12 },
},
};
const metadata = buildDocumentMetadata(document, 4);

// An extra '.' is appended so the input ends in "..": more than one trailing
// punctuation character has to be stripped from the match.
expect(getFirstDoi(`${document.pageContent}.`)).toBe(
'10.1016/j.watres.2025.120001',
);
expect(getPublicationYear(document.pageContent)).toBe(2025);
// toMatchObject: contentLength and sourceHash are deliberately not listed here.
expect(metadata).toMatchObject({
title: 'flood-study.pdf',
page: 12,
source: '/uploads/flood-study.pdf',
chunk: 4,
doi: '10.1016/j.watres.2025.120001',
publicationYear: 2025,
});
// The hash is expected to be 16 characters long and to equal a second,
// independent call on the same document.
expect(metadata.sourceHash).toHaveLength(16);
expect(metadata.sourceHash).toBe(buildSourceHash(document));
});

// Inputs 0 and 2 are whitespace-only. The two survivors are expected to get
// chunk indices 0 and 1 (not their original positions 1 and 3), trimmed
// content, and ids taken in order from the injected counter-based factory.
it('skips blank chunks and assigns dense chunk indices', () => {
let nextId = 0;
const prepared = prepareDocumentsForChroma(
[
{ pageContent: ' ' },
{ pageContent: ' First chunk ', metadata: { source: 'first.pdf' } },
{ pageContent: '\n\n' },
{ pageContent: 'Second chunk', metadata: { source: 'second.pdf' } },
],
() => `doc-${++nextId}`,
);

expect(prepared).toEqual({
ids: ['doc-1', 'doc-2'],
metadatas: [
{
title: 'first.pdf',
page: 0,
source: 'first.pdf',
chunk: 0,
// Length of the trimmed text 'First chunk'.
contentLength: 11,
sourceHash: buildSourceHash({
pageContent: 'First chunk',
metadata: { source: 'first.pdf' },
}),
doi: '',
publicationYear: 0,
},
{
title: 'second.pdf',
page: 0,
source: 'second.pdf',
chunk: 1,
contentLength: 12,
sourceHash: buildSourceHash({
pageContent: 'Second chunk',
metadata: { source: 'second.pdf' },
}),
doi: '',
publicationYear: 0,
},
],
documentContents: ['First chunk', 'Second chunk'],
});
});

// When nothing survives the blank filter, all three arrays are expected to be empty.
it('returns an empty prepared payload when every chunk is blank', () => {
const prepared = prepareDocumentsForChroma(
[{ pageContent: ' ' }, { pageContent: '\n\n' }],
() => 'unused',
);

expect(prepared).toEqual({
ids: [],
metadatas: [],
documentContents: [],
});
});
});
Binary file added ui/docs/rag-ingest-demo.mp4
Binary file not shown.
64 changes: 24 additions & 40 deletions ui/pages/api/inject-documents.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import type { NextApiRequest, NextApiResponse } from 'next';

import { prepareDocumentsForChroma } from '@/utils/server/rag-ingest';

import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
import { IncomingForm } from 'formidable';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

import path from 'path';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { v4 as uuidv4 } from 'uuid';

export const config = {
Expand All @@ -29,26 +29,40 @@ export default async function handler(
return res.status(400).json({ error: 'Failed to upload file' });
}

const pdfFiles = Array.isArray(files.pdf)
? files.pdf
: [files.pdf].filter(Boolean);

if (!pdfFiles.length) {
return res.status(400).json({ error: 'Missing PDF file upload' });
}

const client = new ChromaClient({
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const loader = new PDFLoader(files.pdf[0].filepath);
const loader = new PDFLoader(pdfFiles[0].filepath);

const originalDocs = await loader.load();

console.log(JSON.stringify(originalDocs));


const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 500,
chunkOverlap: 100,
});
});

const docs = await splitter.splitDocuments(originalDocs);

// Process the documents and perform other logic
const { ids, metadatas, documentContents } = processDocuments(docs);
const { ids, metadatas, documentContents } = prepareDocumentsForChroma(
docs,
uuidv4,
);

if (!documentContents.length) {
return res
.status(400)
.json({ error: 'PDF did not contain ingestible text' });
}

const embedder = new TransformersEmbeddingFunction();
const collection = await client.getOrCreateCollection({
Expand All @@ -74,33 +88,3 @@ export default async function handler(
.json({ message: 'An error occurred while processing the documents' });
}
}

/**
 * Turns split PDF chunks into the three index-aligned arrays passed on to the
 * vector store: one id, one metadata record and one text body per chunk.
 *
 * Missing metadata is tolerated instead of throwing: a chunk without
 * `metadata.pdf.info.Title` (or with a blank one) is titled after the base
 * name of its source path, a chunk without `metadata.source` uses
 * 'uploaded-document', and a chunk without `metadata.loc.pageNumber` is
 * reported as page 0.
 *
 * @param docs - Iterable of chunk objects carrying `pageContent` and,
 *   optionally, `metadata.source`, `metadata.loc.pageNumber` and
 *   `metadata.pdf.info.Title`.
 * @returns `{ ids, metadatas, documentContents }`, where index `i` of every
 *   array describes the same chunk.
 */
function processDocuments(docs: any) {
  const ids: string[] = [];
  const metadatas: { title: string; page: number; source: string }[] = [];
  const documentContents: string[] = [];

  for (const document of docs) {
    // Every chunk gets a freshly generated unique id.
    ids.push(uuidv4());

    const source: string = document?.metadata?.source ?? 'uploaded-document';
    const fallbackTitle = path.basename(source) || 'uploaded-document';

    // Only a non-blank string counts as a usable title; anything else falls
    // back to the file name so the record never ends up with an empty title.
    const rawTitle: unknown = document?.metadata?.pdf?.info?.Title;
    const trimmedTitle = typeof rawTitle === 'string' ? rawTitle.trim() : '';

    metadatas.push({
      title: trimmedTitle.length > 0 ? trimmedTitle : fallbackTitle,
      page: document?.metadata?.loc?.pageNumber ?? 0,
      source,
    });

    // Keep the arrays aligned even if a chunk has no text at all.
    documentContents.push(document?.pageContent ?? '');
  }

  return { ids, metadatas, documentContents };
}
119 changes: 119 additions & 0 deletions ui/utils/server/rag-ingest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import { createHash } from 'crypto';
import path from 'path';

/**
 * Structural view of a loaded document chunk, limited to the fields the
 * helpers in this module read. Every field is optional, so callers may pass
 * partially populated objects.
 */
export interface RagLoadedDocument {
/** Text of the chunk. */
pageContent?: string;
metadata?: {
/** Path or name of the originating file. */
source?: string;
loc?: {
/** Page the chunk was taken from; helpers default it to 0 when absent. */
pageNumber?: number;
};
pdf?: {
info?: {
/** Title embedded in the PDF, if any. */
Title?: string;
};
};
};
}

/**
 * Flat metadata record produced for one chunk by `buildDocumentMetadata`.
 * All values are primitives.
 */
export interface RagDocumentMetadata {
// NOTE(review): the index signature is presumably here so the record is
// assignable to the vector store's generic metadata type — confirm.
[key: string]: string | number | boolean;
/** PDF title when available, otherwise the source file name. */
title: string;
/** Page number of the chunk; 0 when the loader supplied none. */
page: number;
/** Source path as given in the document metadata, or 'uploaded-document'. */
source: string;
/** Index of the chunk as passed to `buildDocumentMetadata`. */
chunk: number;
/** Length of the chunk's text in UTF-16 code units (`String.prototype.length`). */
contentLength: number;
/** 16-character hex digest derived from source and page (see `buildSourceHash`). */
sourceHash: string;
/** First DOI found in the chunk text, or '' when none was found. */
doi: string;
/** First year found in the chunk text, or 0 when none was found. */
publicationYear: number;
}

/**
 * Result of `prepareDocumentsForChroma`: three arrays of equal length in
 * which index `i` of each array describes the same retained chunk.
 */
export interface PreparedRagDocuments {
/** One id per retained chunk, produced by the caller-supplied factory. */
ids: string[];
/** One metadata record per retained chunk. */
metadatas: RagDocumentMetadata[];
/** Trimmed text of each retained chunk. */
documentContents: string[];
}

/**
 * Zero-argument id generator. `prepareDocumentsForChroma` calls it once for
 * each chunk it keeps; injecting it lets callers supply deterministic ids.
 */
export type IdFactory = () => string;

/**
 * Reads the title embedded in a document's PDF metadata.
 *
 * @param document - Document whose `metadata.pdf.info.Title` may be absent.
 * @returns The title with surrounding whitespace removed, or `undefined`
 *   when the title is missing, empty, or whitespace-only.
 */
export function getPdfTitle(document: RagLoadedDocument): string | undefined {
  const trimmedTitle = document.metadata?.pdf?.info?.Title?.trim();

  // An empty string after trimming is reported the same way as no title.
  if (!trimmedTitle) {
    return undefined;
  }

  return trimmedTitle;
}

/**
 * Derives a display name from a document's source path.
 *
 * @param document - Document whose `metadata.source` may be absent.
 * @returns The last path segment of the trimmed source, or
 *   'uploaded-document' when the source is missing, blank, or has an empty
 *   base name.
 */
export function getSourceName(document: RagLoadedDocument): string {
  const trimmedSource = document.metadata?.source?.trim();
  const baseName = trimmedSource ? path.basename(trimmedSource) : '';

  return baseName || 'uploaded-document';
}

/**
 * Finds the first DOI-looking token in free text.
 *
 * The match must not start in the middle of a longer number (so `210.5000/3`
 * is not read as a DOI). Punctuation that belongs to the surrounding sentence
 * is removed from the end of the match: trailing `.`, `,`, `;`, `:` and a
 * closing parenthesis that has no opening partner inside the DOI. Balanced
 * parentheses are kept because they are legal in DOI suffixes, e.g.
 * `10.1016/0021-9991(89)90183-6`.
 *
 * @param text - Text to scan.
 * @returns The first DOI found, or `undefined` when the text contains none.
 */
export function getFirstDoi(text: string): string | undefined {
  const match = text.match(/(?<!\d)10\.\d{4,9}\/[-._;()/:A-Z0-9]+/i);

  if (!match) {
    return undefined;
  }

  let doi = match[0];

  // Peel sentence punctuation off the end until nothing more comes off; a
  // citation such as "(doi:10.1000/xyz)." needs both ")" and "." removed.
  for (;;) {
    let next = doi.replace(/[.,;:]+$/, '');

    if (next.endsWith(')')) {
      const opening = next.split('(').length - 1;
      const closing = next.split(')').length - 1;

      if (closing > opening) {
        next = next.slice(0, -1);
      }
    }

    if (next === doi) {
      return doi;
    }

    doi = next;
  }
}

/**
 * Finds the first standalone four-digit year between 1900 and 2099 in free
 * text.
 *
 * @param text - Text to scan.
 * @returns The year as a number, or `undefined` when no such token exists.
 */
export function getPublicationYear(text: string): number | undefined {
  const found = /\b(19|20)\d{2}\b/.exec(text);

  if (found === null) {
    return undefined;
  }

  return Number(found[0]);
}

/**
 * Produces a short, deterministic fingerprint for a document's origin.
 *
 * The fingerprint is the first 16 hex characters of the SHA-256 digest of
 * `"<source>:<page>"`, where a missing source becomes 'uploaded-document'
 * and a missing page number becomes 0. The page content is not part of the
 * input.
 *
 * @param document - Document whose source and page number identify it.
 * @returns A 16-character lowercase hex string.
 */
export function buildSourceHash(document: RagLoadedDocument): string {
  const details = document.metadata;
  const sourceKey = details?.source ?? 'uploaded-document';
  const pageKey = details?.loc?.pageNumber ?? 0;

  const hasher = createHash('sha256');
  hasher.update(`${sourceKey}:${pageKey}`);

  return hasher.digest('hex').substring(0, 16);
}

/**
 * Assembles the flat metadata record stored alongside one chunk.
 *
 * @param document - Chunk to describe; missing fields fall back to defaults
 *   (title and source 'uploaded-document', page 0, empty text).
 * @param chunk - Index recorded in the `chunk` field.
 * @returns A record in which `doi` is '' and `publicationYear` is 0 when the
 *   chunk text contains no DOI or year respectively.
 */
export function buildDocumentMetadata(
  document: RagLoadedDocument,
  chunk: number,
): RagDocumentMetadata {
  const text = document.pageContent ?? '';
  const detectedDoi = getFirstDoi(text);
  const detectedYear = getPublicationYear(text);
  const details = document.metadata;

  const record: RagDocumentMetadata = {
    // Prefer the embedded PDF title; otherwise name the chunk after its file.
    title: getPdfTitle(document) ?? getSourceName(document),
    page: details?.loc?.pageNumber ?? 0,
    source: details?.source ?? 'uploaded-document',
    chunk,
    contentLength: text.length,
    sourceHash: buildSourceHash(document),
    // Placeholders keep every field a primitive instead of undefined.
    doi: detectedDoi ?? '',
    publicationYear: detectedYear ?? 0,
  };

  return record;
}

/**
 * Converts loaded chunks into the index-aligned id / metadata / content
 * arrays used for insertion into the vector store.
 *
 * Chunks whose text is missing or whitespace-only are dropped. Each retained
 * chunk is stored with its text trimmed and receives a chunk index equal to
 * the number of chunks retained before it, so indices have no gaps.
 *
 * @param docs - Chunks to prepare, in order.
 * @param idFactory - Called once per retained chunk to obtain its id.
 * @returns The three arrays; all are empty when no chunk has usable text.
 */
export function prepareDocumentsForChroma(
  docs: RagLoadedDocument[],
  idFactory: IdFactory,
): PreparedRagDocuments {
  const prepared: PreparedRagDocuments = {
    ids: [],
    metadatas: [],
    documentContents: [],
  };

  docs.forEach((doc) => {
    const text = doc.pageContent?.trim();

    if (text) {
      // The count of chunks kept so far doubles as the next dense index.
      const position = prepared.documentContents.length;

      prepared.ids.push(idFactory());
      prepared.metadatas.push(
        buildDocumentMetadata({ ...doc, pageContent: text }, position),
      );
      prepared.documentContents.push(text);
    }
  });

  return prepared;
}