diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..8a6ab18 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -4,6 +4,7 @@ import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import { detectSection, generateCitationKey, SCIENTIFIC_SEPARATORS } from '@/utils/server/scientific-rag'; import path from 'path'; import { v4 as uuidv4 } from 'uuid'; @@ -37,12 +38,10 @@ export default async function handler( const originalDocs = await loader.load(); - console.log(JSON.stringify(originalDocs)); - - const splitter = new RecursiveCharacterTextSplitter({ - chunkSize: 500, - chunkOverlap: 100, + chunkSize: 600, + chunkOverlap: 120, + separators: SCIENTIFIC_SEPARATORS, }); const docs = await splitter.splitDocuments(originalDocs); @@ -79,26 +78,31 @@ function processDocuments(docs: any) { const ids = []; const metadatas = []; const documentContents = []; + let currentSection = 'INTRODUCTION'; - for (const document of docs) { - // Generate an ID for each document, or use some existing unique identifier + for (let i = 0; i < docs.length; i++) { + const document = docs[i]; const id = uuidv4(); ids.push(id); const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; + const titleFromMetadata = document.metadata.pdf?.info?.Title; const title = titleFromMetadata && titleFromMetadata.length > 0 ? 
titleFromMetadata : fallbackTitle; + currentSection = detectSection(document.pageContent, currentSection); const metadata = { title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info + page: document.metadata.loc?.pageNumber || 0, + source: document.metadata.source, + section: currentSection, + citationKey: '', }; - metadatas.push(metadata); - // Add the page content to the documents array + metadata.citationKey = generateCitationKey(metadata, i); + + metadatas.push(metadata); documentContents.push(document.pageContent); } diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..1f844fc 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -1,6 +1,7 @@ import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const'; import { OpenAIError, OpenAIStream } from '@/utils/server'; import { codeBlock, oneLine } from 'common-tags' +import { formatScientificContext } from '@/utils/server/scientific-rag'; import { ChatBody, Message } from '@/types/chat'; @@ -29,9 +30,14 @@ async function fetchAndFormatDocuments(lastMessageContent: string) { } const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`; - }).join(''); + + // Chroma returns data.metadatas[0] and data.documents[0] + // and data.distances[0] if available + const result = formatScientificContext( + data.documents[0], + data.metadatas[0], + data.distances ? 
data.distances[0] : undefined + ); console.log(result); @@ -39,7 +45,7 @@ async function fetchAndFormatDocuments(lastMessageContent: string) { } catch (error) { console.error('Error fetching and formatting documents:', error); - throw error; // You may want to throw a more specific error object here + throw error; } } @@ -62,20 +68,19 @@ const handler = async (req: Request): Promise => { let promptToSend = codeBlock` ${oneLine` - You are a very enthusiastic AI assistant who loves - to help people! Given the following information from - relevant documentation, answer the user's question using - only that information, outputted in markdown format. + You are a specialized Scientific Research Assistant. Given the following + excerpts from scientific papers, your goal is to answer the user's question + accurately and with rigorous citations. `} ${oneLine` - If you are unsure - and the answer is not explicitly written in the documentation, say - "Sorry, I don't know how to help with that." + Use only the provided context. If the answer is not in the context, say + "I'm sorry, the provided research documents do not contain enough information to answer this question." `} ${oneLine` - Always include citations from the documentation. + CRITICAL: You must cite every factual claim using the citation keys provided in the context + (e.g., [[DocName:p1:c2]]). Place citations immediately after the claim they support. 
/**
 * ui/utils/server/scientific-rag.ts (introduced as a new file by this patch).
 *
 * Patch context: in the collapsed diff, this module was preceded on the same
 * line by the last two context lines of the ui/pages/api/rag-chat.ts hunk
 * (the closing of its prompt template literals) and by this file's diff header.
 *
 * Helpers for the scientific RAG flow:
 *  - ingestion (inject-documents.ts) uses SCIENTIFIC_SEPARATORS, detectSection
 *    and generateCitationKey;
 *  - retrieval (rag-chat.ts) uses formatScientificContext to build the prompt
 *    context.
 */

// Type-only import: erased on emit. No declaration below references it yet.
import type { Document } from 'langchain/document';

/**
 * Shape of the metadata attached to each chunk. The ingestion code in this
 * patch fills title, page, source, section and citationKey; `authors` and
 * `doi` are declared here but not populated by any code in this patch.
 */
export interface ScientificMetadata {
  title: string;
  authors?: string[];
  doi?: string;
  section?: string;
  page: number;
  citationKey: string;
  source: string;
}

/** Canonical (upper-case) section names that detectSection can return. */
const SCIENTIFIC_SECTIONS = [
  'ABSTRACT',
  'INTRODUCTION',
  'METHODS',
  'METHODOLOGY',
  'RESULTS',
  'DISCUSSION',
  'CONCLUSION',
  'REFERENCES',
  'ACKNOWLEDGEMENTS',
];

/** Set form of SCIENTIFIC_SECTIONS for whole-line lookups. */
const SECTION_NAMES: ReadonlySet<string> = new Set(SCIENTIFIC_SECTIONS);

/**
 * Narrows an untyped value to a plain property bag; anything that is not a
 * non-null object becomes an empty record so callers can read fields safely.
 */
function toRecord(value: unknown): Record<string, unknown> {
  return typeof value === 'object' && value !== null
    ? (value as Record<string, unknown>)
    : {};
}

/**
 * Detects which scientific section a piece of text belongs to.
 *
 * A heading is recognised (case-insensitively) when
 *  - the trimmed text starts with a section name, or
 *  - a line of the text, once trimmed, is exactly a section name.
 *
 * When several headings occur in the same text, the one appearing last in
 * the text wins, because the caller threads the result into the next chunk
 * as `currentSection`.
 *
 * Known limitation (unchanged): the start-of-text check is a plain prefix
 * match, so a body sentence that begins with e.g. "Results ..." is also
 * treated as a heading.
 *
 * @param text - Chunk content to inspect.
 * @param currentSection - Section carried over from the previous chunk.
 * @returns The detected section name, or `currentSection` when no heading
 *   is found (including empty or non-string input).
 */
export function detectSection(text: string, currentSection: string): string {
  if (typeof text !== 'string') {
    return currentSection;
  }

  const upperText = text.toUpperCase().trim();
  if (upperText.length === 0) {
    return currentSection;
  }

  let detected = currentSection;

  // Heading at the very start of the chunk (prefix match).
  const leading = SCIENTIFIC_SECTIONS.find((section) =>
    upperText.startsWith(section),
  );
  if (leading !== undefined) {
    detected = leading;
  }

  // Headings on their own line. Scanning in text order means a later heading
  // overrides an earlier one; trimming each line tolerates "\r\n" endings
  // and padding, and a heading on the final line needs no trailing newline.
  for (const rawLine of upperText.split('\n')) {
    const line = rawLine.trim();
    if (SECTION_NAMES.has(line)) {
      detected = line;
    }
  }

  return detected;
}

/**
 * Generates a citation key of the form `<shortTitle>:p<page>:c<chunkIndex>`.
 *
 * `shortTitle` is the first 15 characters of the title with whitespace
 * removed. The characters `[`, `]` and `:` are removed as well, since they
 * delimit the `[[key]]` / `a:b:c` syntax the key is embedded in. If nothing
 * usable remains (missing, non-string, or blank title) it falls back to
 * `Doc`. A missing or falsy page becomes `0`.
 *
 * @param metadata - Object carrying `title` and `page`; other fields are
 *   ignored. Null or undefined is tolerated.
 * @param chunkIndex - Index of the chunk, appended as the `c` component.
 * @returns The citation key string.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- signature kept for existing callers; narrowed below
export function generateCitationKey(metadata: any, chunkIndex: number): string {
  const fields = toRecord(metadata);
  const rawTitle = typeof fields.title === 'string' ? fields.title : '';
  // Truncate first, then strip, so keys for ordinary titles are unchanged.
  const shortTitle =
    rawTitle.substring(0, 15).replace(/[\s\[\]:]+/g, '') || 'Doc';
  const page = fields.page || 0;
  return `${shortTitle}:p${page}:c${chunkIndex}`;
}

/**
 * Separators handed to the text splitter, most specific first: upper-case
 * section headings on their own line, then paragraph, line, sentence, word
 * and character boundaries. Every name in SCIENTIFIC_SECTIONS has a
 * corresponding heading separator.
 */
export const SCIENTIFIC_SEPARATORS: string[] = [
  '\nREFERENCES\n',
  '\nABSTRACT\n',
  '\nINTRODUCTION\n',
  '\nMETHODS\n',
  '\nMETHODOLOGY\n',
  '\nRESULTS\n',
  '\nDISCUSSION\n',
  '\nCONCLUSION\n',
  '\nACKNOWLEDGEMENTS\n',
  '\n\n',
  '\n',
  '. ',
  ' ',
  '',
];

/**
 * Formats retrieved documents for the prompt, including citation keys and,
 * when available, a relevance figure.
 *
 * Each entry is rendered as
 * `[[key]] (Title: T, Page: P[ Section: S][ [Relevance: R]])\nContent: ...\n`
 * and entries are joined with `\n---\n`.
 *
 * Robustness:
 *  - a missing or null metadata entry is rendered with title `Unknown` and
 *    page `0` instead of throwing;
 *  - an entry without a stored `citationKey` gets one generated from its
 *    title/page and its position in this result list, so the model never
 *    sees `[[undefined]]`. Such a key is only unique within this context
 *    string;
 *  - the relevance tag is emitted only for finite numeric distances.
 *
 * NOTE(review): relevance is printed as `1 - distance`, which presumably
 * assumes a distance in [0, 1]; with other distance functions the figure can
 * be negative. Confirm against the collection's configuration.
 *
 * @param documents - Retrieved chunk texts.
 * @param metadatas - Metadata entries parallel to `documents`.
 * @param distances - Optional distances parallel to `documents`.
 * @returns The formatted context, or an empty string when there are no
 *   documents.
 */
export function formatScientificContext(
  documents: ReadonlyArray<string | null> | null | undefined,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped JSON from the vector store; narrowed per entry
  metadatas: any[],
  distances?: ReadonlyArray<number | null> | null,
): string {
  return (documents ?? [])
    .map((doc, i) => {
      const meta = toRecord(metadatas?.[i]);

      const storedKey = meta.citationKey;
      const citationKey =
        typeof storedKey === 'string' && storedKey.length > 0
          ? storedKey
          : generateCitationKey(meta, i);

      const title = meta.title ?? 'Unknown';
      const page = meta.page ?? 0;
      const sectionStr = meta.section ? ` Section: ${meta.section}` : '';

      const distance = distances?.[i];
      const distStr =
        typeof distance === 'number' && Number.isFinite(distance)
          ? ` [Relevance: ${(1 - distance).toFixed(2)}]`
          : '';

      return `[[${citationKey}]] (Title: ${title}, Page: ${page}${sectionStr}${distStr})\nContent: ${doc ?? ''}\n`;
    })
    .join('\n---\n');
}