Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions ui/__tests__/scientific-rag.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import { describe, expect, it } from 'vitest';

import {
buildScientificMetadata,
citationSlug,
clampRetrievedDocumentCount,
detectScientificSection,
formatRetrievedDocument,
} from '@/utils/server/scientific-rag';

/**
 * Test suite for the helpers imported from `@/utils/server/scientific-rag`:
 * section detection, citation metadata construction, slug generation,
 * retrieval-count clamping, and formatting of retrieved chunks.
 */
describe('scientific RAG helpers', () => {
  it('detects scientific sections from chunk headings', () => {
    // Each entry pairs a chunk of text with the section label expected for it.
    // The last entry has no heading and is expected to map to 'body'.
    const headingCases: Array<[string, string]> = [
      ['Abstract\nWe study retrieval quality.', 'abstract'],
      ['2. Methods\nWe collected samples.', 'methods'],
      ['RESULTS\nAccuracy improved.', 'results'],
      ['A paragraph without a heading.', 'body'],
    ];

    for (const [chunkText, expectedSection] of headingCases) {
      expect(detectScientificSection(chunkText)).toBe(expectedSection);
    }
  });

  it('builds stable readable citation metadata', () => {
    // Fixture shaped like a loaded PDF chunk: page location, embedded PDF
    // title, and the path the file was read from.
    const pdfChunk = {
      pageContent: 'Introduction\nThis paper evaluates citation stability.',
      metadata: {
        loc: { pageNumber: 3 },
        pdf: { info: { Title: 'Scientific RAG: Stable Citations.pdf' } },
        source: '/uploads/scientific-rag.pdf',
      },
    };
    const fallbackTitle = 'fallback.pdf';
    const chunkIndex = 7;
    const pageChunkIndex = 1;

    const metadata = buildScientificMetadata(
      pdfChunk,
      fallbackTitle,
      chunkIndex,
      pageChunkIndex,
    );

    // The expected title drops the '.pdf' suffix, and a pageChunkIndex of 1
    // is expected to surface as 'c2' in the citation key.
    expect(metadata).toMatchObject({
      title: 'Scientific RAG: Stable Citations',
      page: 3,
      source: '/uploads/scientific-rag.pdf',
      section: 'introduction',
      chunkIndex: 7,
      pageChunkIndex: 1,
      citationKey: 'scientific-rag-stable-citations:p3:c2',
    });
  });

  it('keeps citation slugs deterministic and bounded', () => {
    // Surrounding whitespace, underscores and punctuation are expected to
    // collapse into a lowercase hyphen-separated slug.
    const noisyTitle = ' A Very_Long Scientific Paper!!! ';
    expect(citationSlug(noisyTitle)).toBe('a-very-long-scientific-paper');

    // An empty title is expected to fall back to the literal 'document'.
    const emptyTitle = '';
    expect(citationSlug(emptyTitle)).toBe('document');
  });

  it('clamps retrieval result counts to a useful range', () => {
    // [requested count, expected count]: a missing value yields 6, values
    // outside the range are pinned to 1 and 12, and 4.9 is expected as 4.
    const clampCases: Array<[number | undefined, number]> = [
      [undefined, 6],
      [0, 1],
      [99, 12],
      [4.9, 4],
    ];

    for (const [requested, expectedCount] of clampCases) {
      expect(clampRetrievedDocumentCount(requested)).toBe(expectedCount);
    }
  });

  it('formats retrieved chunks with citation and distance context', () => {
    const formatted = formatRetrievedDocument({
      content: 'The model retrieved a grounded answer.',
      metadata: {
        title: 'Grounded RAG',
        page: 4,
        section: 'results',
        citationKey: 'grounded-rag:p4:c1',
      },
      distance: 0.012345,
      index: 0,
    });

    // Only the header portion is pinned; the distance 0.012345 is expected to
    // be rendered as 0.0123.
    const expectedHeader =
      '[grounded-rag:p4:c1] Grounded RAG, page 4, section results, distance 0.0123';
    expect(formatted).toContain(expectedHeader);
  });
});
22 changes: 15 additions & 7 deletions ui/pages/api/fetch-documents.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,31 @@
import type { NextApiRequest, NextApiResponse } from "next";
import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";

import { clampRetrievedDocumentCount } from '@/utils/server/scientific-rag';

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
if (req.method !== 'POST') {
return res.status(405).end();
}

const client = new ChromaClient({
path: "http://chroma-server:8000",
path: process.env.CHROMA_PATH || "http://chroma-server:8000",
});

const query = req.body.input;
if (typeof query !== 'string' || query.trim().length === 0) {
return res.status(400).json({ error: 'Missing document query' });
}

const embedder = new TransformersEmbeddingFunction();

const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });

// query the collection
const results = await collection.query({
nResults: 4,
queryTexts: [query]
})
const results = await collection.query({
nResults: clampRetrievedDocumentCount(req.body.nResults),
queryTexts: [query],
});

res.status(200).json(results);
} catch (error) {
Expand All @@ -29,4 +37,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}
res.status(500).json({ error: 'An unexpected error occurred :(' });
}
}
}
60 changes: 35 additions & 25 deletions ui/pages/api/inject-documents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,17 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
import { IncomingForm } from 'formidable';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';

import path from 'path';
import { v4 as uuidv4 } from 'uuid';

import {
SCIENTIFIC_TEXT_SEPARATORS,
buildScientificMetadata,
type ScientificDocument,
} from '@/utils/server/scientific-rag';

export const config = {
api: {
bodyParser: false,
Expand All @@ -33,17 +39,20 @@ export default async function handler(
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const loader = new PDFLoader(files.pdf[0].filepath);

const originalDocs = await loader.load();
const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf;
if (!pdfFile) {
return res.status(400).json({ error: 'Missing PDF file' });
}

console.log(JSON.stringify(originalDocs));
const loader = new PDFLoader(pdfFile.filepath);

const originalDocs = await loader.load();

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 500,
chunkOverlap: 100,
});
chunkSize: 900,
chunkOverlap: 180,
separators: SCIENTIFIC_TEXT_SEPARATORS,
});

const docs = await splitter.splitDocuments(originalDocs);

Expand Down Expand Up @@ -75,30 +84,31 @@ export default async function handler(
}
}

function processDocuments(docs: any) {
const ids = [];
function processDocuments(docs: ScientificDocument[]) {
const ids: string[] = [];
const metadatas = [];
const documentContents = [];
const documentContents: string[] = [];
const pageChunkCounts = new Map<string, number>();

for (const document of docs) {
// Generate an ID for each document, or use some existing unique identifier
for (let index = 0; index < docs.length; index += 1) {
const document = docs[index];
const id = uuidv4();
ids.push(id);

const fallbackTitle = path.basename(document.metadata.source);
const titleFromMetadata = document.metadata.pdf.info.Title;

const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;


const metadata = {
title: title,
page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
source: document.metadata.source, // Define this function to extract verse info
};
const fallbackTitle = path.basename(document.metadata.source ?? 'document.pdf');
const page = document.metadata.loc?.pageNumber ?? 'unknown';
const pageKey = `${document.metadata.source ?? fallbackTitle}:${page}`;
const pageChunkIndex = pageChunkCounts.get(pageKey) ?? 0;
pageChunkCounts.set(pageKey, pageChunkIndex + 1);

const metadata = buildScientificMetadata(
document,
fallbackTitle,
index,
pageChunkIndex,
);
metadatas.push(metadata);

// Add the page content to the documents array
documentContents.push(document.pageContent);
}

Expand Down
44 changes: 29 additions & 15 deletions ui/pages/api/rag-chat.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const';
import { OpenAIError, OpenAIStream } from '@/utils/server';
import { codeBlock, oneLine } from 'common-tags'
import { codeBlock, oneLine } from 'common-tags';

import { ChatBody, Message } from '@/types/chat';
import { formatRetrievedDocument } from '@/utils/server/scientific-rag';

// @ts-expect-error
import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module';
Expand All @@ -15,13 +16,15 @@ export const config = {
};

// Function to fetch and format documents
async function fetchAndFormatDocuments(lastMessageContent: string) {
async function fetchAndFormatDocuments(
baseUrl: string,
lastMessageContent: string,
) {
try {
console.log("fetching documents")
const response = await fetch('http://localhost:3000/api/fetch-documents', {
const response = await fetch(`${baseUrl}/api/fetch-documents`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ input: lastMessageContent }),
body: JSON.stringify({ input: lastMessageContent, nResults: 6 }),
});

if (!response.ok) {
Expand All @@ -30,10 +33,13 @@ async function fetchAndFormatDocuments(lastMessageContent: string) {

const data = await response.json();
const result = data.metadatas[0].map((metadata: any, index: number) => {
return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`;
}).join('');

console.log(result);
return formatRetrievedDocument({
content: data.documents[0][index],
metadata,
distance: data.distances?.[0]?.[index],
index,
});
}).join('\n\n---\n\n');

return result;

Expand Down Expand Up @@ -64,7 +70,7 @@ const handler = async (req: Request): Promise<Response> => {
${oneLine`
You are a very enthusiastic AI assistant who loves
to help people! Given the following information from
relevant documentation, answer the user's question using
relevant scientific documentation, answer the user's question using
only that information, outputted in markdown format.
`}

Expand All @@ -75,7 +81,7 @@ const handler = async (req: Request): Promise<Response> => {
`}

${oneLine`
Always include citations from the documentation.
Every factual claim must include citation keys from the documentation.
`}
`;

Expand All @@ -85,7 +91,10 @@ const handler = async (req: Request): Promise<Response> => {

const lastMessage = messages[messages.length - 1];

const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content);
const relevantDocuments = await fetchAndFormatDocuments(
new URL(req.url).origin,
lastMessage.content,
);

let temperatureToUse = temperature;
if (temperatureToUse == null) {
Expand All @@ -100,9 +109,6 @@ const handler = async (req: Request): Promise<Response> => {

encoding.free();

console.log(model, promptToSend, temperatureToUse, key, messagesToSend);


messagesToSend = [
{
role: "user",
Expand All @@ -121,6 +127,14 @@ const handler = async (req: Request): Promise<Response> => {
${oneLine`
- Do not make up answers that are not provided in the documentation.
`}
${oneLine`
- Cite sources using the exact citation keys shown in square brackets,
for example [paper-title:p3:c2].
`}
${oneLine`
- Prefer sources with lower retrieval distance when multiple sources
contain similar information.
`}
${oneLine`
- If you are unsure and the answer is not explicitly written
in the documentation context, say
Expand Down
Loading