Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 16 additions & 12 deletions ui/pages/api/inject-documents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
import { IncomingForm } from 'formidable';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { detectSection, generateCitationKey, SCIENTIFIC_SEPARATORS } from '@/utils/server/scientific-rag';

import path from 'path';
import { v4 as uuidv4 } from 'uuid';
Expand Down Expand Up @@ -37,12 +38,10 @@ export default async function handler(

const originalDocs = await loader.load();

console.log(JSON.stringify(originalDocs));


const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 500,
chunkOverlap: 100,
chunkSize: 600,
chunkOverlap: 120,
separators: SCIENTIFIC_SEPARATORS,
});

const docs = await splitter.splitDocuments(originalDocs);
Expand Down Expand Up @@ -79,26 +78,31 @@ function processDocuments(docs: any) {
const ids = [];
const metadatas = [];
const documentContents = [];
let currentSection = 'INTRODUCTION';

for (const document of docs) {
// Generate an ID for each document, or use some existing unique identifier
for (let i = 0; i < docs.length; i++) {
const document = docs[i];
const id = uuidv4();
ids.push(id);

const fallbackTitle = path.basename(document.metadata.source);
const titleFromMetadata = document.metadata.pdf.info.Title;
const titleFromMetadata = document.metadata.pdf?.info?.Title;

const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;

currentSection = detectSection(document.pageContent, currentSection);

const metadata = {
title: title,
page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
source: document.metadata.source, // Define this function to extract verse info
page: document.metadata.loc?.pageNumber || 0,
source: document.metadata.source,
section: currentSection,
citationKey: '',
};
metadatas.push(metadata);

// Add the page content to the documents array
metadata.citationKey = generateCitationKey(metadata, i);

metadatas.push(metadata);
documentContents.push(document.pageContent);
}

Expand Down
29 changes: 17 additions & 12 deletions ui/pages/api/rag-chat.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const';
import { OpenAIError, OpenAIStream } from '@/utils/server';
import { codeBlock, oneLine } from 'common-tags'
import { formatScientificContext } from '@/utils/server/scientific-rag';

import { ChatBody, Message } from '@/types/chat';

Expand Down Expand Up @@ -29,17 +30,22 @@ async function fetchAndFormatDocuments(lastMessageContent: string) {
}

const data = await response.json();
const result = data.metadatas[0].map((metadata: any, index: number) => {
return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`;
}).join('');

// Chroma returns data.metadatas[0] and data.documents[0]
// and data.distances[0] if available
const result = formatScientificContext(
data.documents[0],
data.metadatas[0],
data.distances ? data.distances[0] : undefined
);

console.log(result);

return result;

} catch (error) {
console.error('Error fetching and formatting documents:', error);
throw error; // You may want to throw a more specific error object here
throw error;
}
}

Expand All @@ -62,20 +68,19 @@ const handler = async (req: Request): Promise<Response> => {

let promptToSend = codeBlock`
${oneLine`
You are a very enthusiastic AI assistant who loves
to help people! Given the following information from
relevant documentation, answer the user's question using
only that information, outputted in markdown format.
You are a specialized Scientific Research Assistant. Given the following
excerpts from scientific papers, your goal is to answer the user's question
accurately and with rigorous citations.
`}

${oneLine`
If you are unsure
and the answer is not explicitly written in the documentation, say
"Sorry, I don't know how to help with that."
Use only the provided context. If the answer is not in the context, say
"I'm sorry, the provided research documents do not contain enough information to answer this question."
`}

${oneLine`
Always include citations from the documentation.
CRITICAL: You must cite every factual claim using the citation keys provided in the context
(e.g., [[DocName:p1:c2]]). Place citations immediately after the claim they support.
`}
`;

Expand Down
75 changes: 75 additions & 0 deletions ui/utils/server/scientific-rag.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { Document } from 'langchain/document';

/**
 * Metadata attached to a single chunk of a scientific document.
 *
 * Required members come first, followed by the optional bibliographic ones.
 */
export interface ScientificMetadata {
  /** Title of the document the chunk was taken from. */
  title: string;
  /** Identifier of the originating document (presumably a file path; confirm against the ingestion code). */
  source: string;
  /** Page number the chunk was found on. */
  page: number;
  /** Key used to cite this chunk, e.g. the value returned by `generateCitationKey`. */
  citationKey: string;
  /** Section heading the chunk falls under, e.g. the value returned by `detectSection`. */
  section?: string;
  /** Author names, when known. */
  authors?: string[];
  /** Digital Object Identifier of the document, when known. */
  doi?: string;
}

/** Canonical (upper-case) section headings recognised by `detectSection`. */
const SCIENTIFIC_SECTIONS = [
  'ABSTRACT',
  'INTRODUCTION',
  'METHODS',
  'METHODOLOGY',
  'RESULTS',
  'DISCUSSION',
  'CONCLUSION',
  'REFERENCES',
  'ACKNOWLEDGEMENTS',
] as const;

/**
 * Detects which scientific section a piece of text belongs to.
 *
 * Matching is case-insensitive. A heading is recognised when the text starts
 * with it, or when it stands alone on a line (surrounding whitespace and CRLF
 * line endings are tolerated, and the heading may be the final line).
 *
 * When the text contains several headings, the one that appears LAST wins:
 * that is the section the text ends in, and therefore the correct value to
 * carry forward as `currentSection` for the next chunk.
 *
 * @param text - Chunk text to inspect.
 * @param currentSection - Section in effect before this chunk.
 * @returns The last heading found in `text`, or `currentSection` when the
 *   text contains no recognised heading.
 */
export function detectSection(text: string, currentSection: string): string {
  const lines = text.toUpperCase().trim().split(/\r?\n/);
  let detected = currentSection;

  lines.forEach((rawLine, index) => {
    const line = rawLine.trim();
    // The opening line only needs to start with a heading, because extracted
    // text often fuses the heading with the first sentence. Later lines must
    // consist of the heading alone.
    const heading = SCIENTIFIC_SECTIONS.find((section) =>
      index === 0 ? line.startsWith(section) : line === section,
    );
    if (heading !== undefined) {
      detected = heading;
    }
  });

  return detected;
}

/**
 * Generates a citation key of the form `<ShortTitle>:p<page>:c<chunkIndex>`.
 *
 * The short title is the first 15 characters of `metadata.title` with
 * whitespace removed. Colons and square brackets are removed as well, because
 * they are the delimiters of the `[[Title:pN:cM]]` citation syntax and would
 * make the key ambiguous. If nothing usable remains (missing, non-string or
 * blank title) the short title falls back to `Doc`.
 *
 * @param metadata - Object carrying optional `title` and `page` members.
 * @param chunkIndex - Index of the chunk, appended after `c`.
 * @returns The citation key, without the surrounding `[[ ]]`.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- callers pass loosely-typed metadata
export function generateCitationKey(metadata: any, chunkIndex: number): string {
  const rawTitle: string = typeof metadata?.title === 'string' ? metadata.title : '';
  // Truncate first, then strip, so keys for ordinary titles stay unchanged.
  const shortTitle = rawTitle.substring(0, 15).replace(/[\s:\[\]]+/g, '') || 'Doc';
  const page = metadata?.page || 0;
  return `${shortTitle}:p${page}:c${chunkIndex}`;
}

/**
 * Separators for splitting scientific text, ordered from most to least
 * specific: stand-alone upper-case section headings first, then paragraph
 * breaks, line breaks, sentence ends, spaces, and finally the empty string.
 *
 * Every heading listed in `SCIENTIFIC_SECTIONS` has a separator here, so any
 * section that can be detected can also act as a split point.
 *
 * NOTE(review): headings are matched literally and in upper case only;
 * title-case headings such as "Introduction" will not act as split points.
 */
export const SCIENTIFIC_SEPARATORS: string[] = [
  '\nREFERENCES\n',
  '\nABSTRACT\n',
  '\nINTRODUCTION\n',
  '\nMETHODS\n',
  '\nMETHODOLOGY\n',
  '\nRESULTS\n',
  '\nDISCUSSION\n',
  '\nCONCLUSION\n',
  '\nACKNOWLEDGEMENTS\n',
  '\n\n',
  '\n',
  '. ',
  ' ',
  '',
];

/**
 * Formats retrieved documents for the prompt, including citation keys and,
 * when available, a relevance figure derived from the retrieval distance.
 *
 * Each entry is rendered as
 * `[[key]] (Title: …, Page: … Section: … [Relevance: …])\nContent: …\n`
 * and entries are joined with `\n---\n`.
 *
 * Missing data is tolerated instead of throwing or printing `undefined`/`NaN`:
 * - a null/missing metadata entry is treated as empty;
 * - a missing `citationKey` (e.g. chunks stored before keys existed) falls
 *   back to `Source<N>` with a 1-based N;
 * - a missing title renders as `Unknown`, a missing page as `0`;
 * - a missing or non-finite distance omits the relevance part.
 *
 * NOTE(review): relevance is computed as `1 - distance`, which presumably
 * assumes distances in [0, 1]; confirm against the collection's distance
 * metric, since larger distances would yield negative values.
 *
 * @param documents - Chunk texts, in ranking order.
 * @param metadatas - Metadata objects parallel to `documents`.
 * @param distances - Optional distances parallel to `documents`.
 * @returns The formatted context block, or `''` when there are no documents.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- metadata comes from an untyped JSON response
export function formatScientificContext(documents: string[], metadatas: any[], distances?: number[]): string {
  return (documents ?? [])
    .map((doc, i) => {
      const meta = metadatas?.[i] ?? {};
      const citationKey = meta.citationKey || `Source${i + 1}`;

      const distance = distances?.[i];
      const distStr =
        typeof distance === 'number' && Number.isFinite(distance)
          ? ` [Relevance: ${(1 - distance).toFixed(2)}]`
          : '';
      const sectionStr = meta.section ? ` Section: ${meta.section}` : '';

      const header = `[[${citationKey}]] (Title: ${meta.title ?? 'Unknown'}, Page: ${meta.page ?? 0}${sectionStr}${distStr})`;
      return `${header}\nContent: ${doc ?? ''}\n`;
    })
    .join('\n---\n');
}