From 5444c6449dabd94ef2292297dae41b0fcada3ba4 Mon Sep 17 00:00:00 2001 From: ridzkyy Date: Fri, 15 May 2026 21:18:01 +0700 Subject: [PATCH] feat: add scientific research context to rag --- ui/__tests__/scientific-rag.test.ts | 113 ++++++++++++ ui/pages/api/fetch-documents.ts | 51 ++++-- ui/pages/api/fetch-research.ts | 62 +++++++ ui/pages/api/inject-documents.ts | 40 ++--- ui/pages/api/rag-chat.ts | 94 ++++++---- ui/utils/server/scientific-rag.ts | 263 ++++++++++++++++++++++++++++ 6 files changed, 555 insertions(+), 68 deletions(-) create mode 100644 ui/__tests__/scientific-rag.test.ts create mode 100644 ui/pages/api/fetch-research.ts create mode 100644 ui/utils/server/scientific-rag.ts diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts new file mode 100644 index 0000000..b1fabbe --- /dev/null +++ b/ui/__tests__/scientific-rag.test.ts @@ -0,0 +1,113 @@ +import { + buildScientificMetadata, + buildSemanticScholarSearchUrl, + clampResultLimit, + createCitationKey, + detectScientificSection, + formatChromaResults, + formatResearchPapers, + normalizeSemanticScholarPaper, +} from '@/utils/server/scientific-rag'; + +import { describe, expect, it } from 'vitest'; + +describe('scientific RAG helpers', () => { + it('detects common scientific sections from chunk text', () => { + expect( + detectScientificSection('Abstract\nWe test a retrieval method.'), + ).toBe('abstract'); + expect(detectScientificSection('The Methods section explains setup.')).toBe( + 'methods', + ); + expect(detectScientificSection('Plain paragraph without a heading.')).toBe( + 'body', + ); + }); + + it('builds stable local citation metadata', () => { + const metadata = buildScientificMetadata( + { + metadata: { + loc: { pageNumber: 3 }, + pdf: { info: { Title: 'Attention Is All You Need' } }, + source: '/tmp/paper.pdf', + }, + pageContent: 'Results show improved BLEU.', + }, + 1, + 'paper.pdf', + ); + + expect(metadata).toMatchObject({ + chunk: 1, + page: 3, + section: 'results', + title: 'Attention Is All You Need', + }); + expect(metadata.citationKey).toBe('attention-is-all-you-need:p3:c2'); + }); + + it('formats Chroma results with citation keys and distances', () => { + const formatted = formatChromaResults({ + distances: [[0.123456]], + documents: [['This paper introduces a scientific benchmark.']], + metadatas: [ + [ + { + citationKey: 'benchmark:p2:c1', + page: 2, + section: 'abstract', + title: 'Benchmark Paper', + }, + ], + ], + }); + + expect(formatted).toContain('Local Source 1 [benchmark:p2:c1]'); + expect(formatted).toContain('Distance: 0.1235'); + expect(formatted).toContain( + 'This paper introduces a scientific benchmark.', + ); + }); + + it('normalizes and formats Semantic Scholar papers', () => { + const paper = normalizeSemanticScholarPaper( + { + abstract: 'A retrieval pipeline for scientific papers.', + authors: [{ name: 'Ada Lovelace' }, { name: 'Grace Hopper' }], + citationCount: 42, + externalIds: { DOI: '10.1234/example.paper' }, + isOpenAccess: true, + title: 'Scientific RAG', + url: 'https://example.test/paper', + venue: 'Journal of Tests', + year: 2026, + }, + 0, + ); + + expect(paper.citationKey).toBe('paper:10-1234-example-paper'); + + const formatted = formatResearchPapers([paper]); + expect(formatted).toContain( + 'Research Source 1 [paper:10-1234-example-paper]', + ); + expect(formatted).toContain('Authors: Ada Lovelace, Grace Hopper'); + expect(formatted).toContain('Open Access: yes'); + }); + + it('builds bounded Semantic Scholar search URLs', () => { + expect(clampResultLimit('99', 4, 6)).toBe(6); + expect(clampResultLimit('0', 4, 6)).toBe(1); + + const url = buildSemanticScholarSearchUrl('graph neural networks', 99); + + expect(url.searchParams.get('query')).toBe('graph neural networks'); + expect(url.searchParams.get('limit')).toBe('8'); + expect(url.searchParams.get('fields')).toContain('abstract'); + }); + + it('falls back to generated citation keys when metadata is incomplete', () => { + expect(createCitationKey({}, 2)).toBe('source-3:p-unknown:c3'); + }); +}); diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..e13e3e1 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -1,25 +1,50 @@ -import type { NextApiRequest, NextApiResponse } from "next"; -import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; +import type { NextApiRequest, NextApiResponse } from 'next'; -export default async function handler(req: NextApiRequest, res: NextApiResponse) { +import { + clampResultLimit, + formatChromaResults, +} from '@/utils/server/scientific-rag'; + +import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse, +) { try { + if (req.method !== 'POST') { + return res.status(405).json({ error: 'Method not allowed' }); + } + const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const query = req.body.input; + const query = + typeof req.body.input === 'string' ? req.body.input.trim() : ''; + const nResults = clampResultLimit(req.body.nResults, 6, 10); + + if (!query) { + return res.status(400).json({ error: 'Missing input query' }); + } const embedder = new TransformersEmbeddingFunction(); - const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder }); + const collection = await client.getOrCreateCollection({ + name: 'default-collection', + embeddingFunction: embedder, + }); - // query the collection - const results = await collection.query({ - nResults: 4, - queryTexts: [query] - }) + // query the collection + const results = await collection.query({ + nResults, + queryTexts: [query], + }); - res.status(200).json(results); + res.status(200).json({ + ...results, + formatted: formatChromaResults(results), + }); } catch (error) { if (error instanceof Error) { console.error('Error message:', error.message); @@ -29,4 +54,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) } res.status(500).json({ error: 'An unexpected error occurred :(' }); } -} \ No newline at end of file +} diff --git a/ui/pages/api/fetch-research.ts b/ui/pages/api/fetch-research.ts new file mode 100644 index 0000000..bd2ada4 --- /dev/null +++ b/ui/pages/api/fetch-research.ts @@ -0,0 +1,62 @@ +import type { NextApiRequest, NextApiResponse } from 'next'; + +import { + buildSemanticScholarSearchUrl, + clampResultLimit, + formatResearchPapers, + normalizeSemanticScholarPaper, +} from '@/utils/server/scientific-rag'; + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse, +) { + if (req.method !== 'POST') { + return res.status(405).json({ error: 'Method not allowed' }); + } + + const query = typeof req.body.input === 'string' ? req.body.input.trim() : ''; + const limit = clampResultLimit(req.body.limit, 4, 6); + + if (!query) { + return res.status(400).json({ error: 'Missing input query' }); + } + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 4000); + + try { + const response = await fetch(buildSemanticScholarSearchUrl(query, limit), { + headers: { + Accept: 'application/json', + ...(process.env.SEMANTIC_SCHOLAR_API_KEY && { + 'x-api-key': process.env.SEMANTIC_SCHOLAR_API_KEY, + }), + }, + signal: controller.signal, + }); + + if (!response.ok) { + return res.status(502).json({ + error: `Semantic Scholar returned ${response.status}`, + }); + } + + const payload = await response.json(); + const papers = (payload.data ?? []).map(normalizeSemanticScholarPaper); + + return res.status(200).json({ + formatted: formatResearchPapers(papers), + papers, + }); + } catch (error) { + const message = + error instanceof Error + ? error.message + : 'Unable to fetch research papers'; + + return res.status(502).json({ error: message }); + } finally { + clearTimeout(timeout); + } +} diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..a232679 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -1,10 +1,14 @@ import type { NextApiRequest, NextApiResponse } from 'next'; +import { + SCIENTIFIC_TEXT_SEPARATORS, + buildScientificMetadata, +} from '@/utils/server/scientific-rag'; + import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; - +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import path from 'path'; import { v4 as uuidv4 } from 'uuid'; @@ -33,20 +37,24 @@ export default async function handler( path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const loader = new PDFLoader(files.pdf[0].filepath); + const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf; - const originalDocs = await loader.load(); + if (!pdfFile?.filepath) { + return res.status(400).json({ error: 'Missing PDF file' }); + } - console.log(JSON.stringify(originalDocs)); + const loader = new PDFLoader(pdfFile.filepath); + const originalDocs = await loader.load(); const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 500, chunkOverlap: 100, - }); + separators: SCIENTIFIC_TEXT_SEPARATORS, + }); const docs = await splitter.splitDocuments(originalDocs); - + // Process the documents and perform other logic const { ids, metadatas, documentContents } = processDocuments(docs); @@ -80,23 +88,15 @@ function processDocuments(docs: any) { const metadatas = []; const documentContents = []; - for (const document of docs) { + for (const [index, document] of docs.entries()) { // Generate an ID for each document, or use some existing unique identifier const id = uuidv4(); ids.push(id); - const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; - - const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle; - - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info - }; - metadatas.push(metadata); + const fallbackTitle = path.basename( + document.metadata?.source ?? 'document', + ); + metadatas.push(buildScientificMetadata(document, index, fallbackTitle)); // Add the page content to the documents array documentContents.push(document.pageContent); diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..276968a 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -1,6 +1,5 @@ import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const'; import { OpenAIError, OpenAIStream } from '@/utils/server'; -import { codeBlock, oneLine } from 'common-tags' import { ChatBody, Message } from '@/types/chat'; @@ -9,46 +8,39 @@ import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module import tiktokenModel from '@dqbd/tiktoken/encoders/cl100k_base.json'; import { Tiktoken, init } from '@dqbd/tiktoken/lite/init'; +import { codeBlock, oneLine } from 'common-tags'; export const config = { runtime: 'edge', }; -// Function to fetch and format documents -async function fetchAndFormatDocuments(lastMessageContent: string) { +async function fetchFormattedContext( + baseUrl: string, + path: string, + input: string, + body: Record = {}, +) { try { - console.log("fetching documents") - const response = await fetch('http://localhost:3000/api/fetch-documents', { + const response = await fetch(`${baseUrl}${path}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ input: lastMessageContent }), + body: JSON.stringify({ input, ...body }), }); - + if (!response.ok) { - throw new Error(`Error fetching documents: ${response.statusText}`); + console.warn(`Skipping ${path}: ${response.statusText}`); + return ''; } const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`; - }).join(''); - - console.log(result); - - return result; - + return typeof data.formatted === 'string' ? data.formatted : ''; } catch (error) { - console.error('Error fetching and formatting documents:', error); - throw error; // You may want to throw a more specific error object here + console.warn(`Skipping ${path}:`, error); + return ''; } } - - - - const handler = async (req: Request): Promise => { - try { const { model, messages, key, prompt, temperature } = (await req.json()) as ChatBody; @@ -84,9 +76,36 @@ const handler = async (req: Request): Promise => { } const lastMessage = messages[messages.length - 1]; + const baseUrl = new URL(req.url).origin; + + const [localDocuments, researchPapers] = await Promise.all([ + fetchFormattedContext( + baseUrl, + '/api/fetch-documents', + lastMessage.content, + { + nResults: 6, + }, + ), + fetchFormattedContext( + baseUrl, + '/api/fetch-research', + lastMessage.content, + { + limit: 4, + }, + ), + ]); + + const evidenceBlocks = [ + localDocuments + ? `LOCAL DOCUMENT SOURCES\n${localDocuments}` + : 'LOCAL DOCUMENT SOURCES\nNo matching local documents were retrieved.', + researchPapers + ? `EXTERNAL RESEARCH SOURCES\n${researchPapers}` + : 'EXTERNAL RESEARCH SOURCES\nNo matching external research papers were retrieved.', + ].join('\n\n'); - const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content); - let temperatureToUse = temperature; if (temperatureToUse == null) { temperatureToUse = DEFAULT_TEMPERATURE; @@ -97,22 +116,20 @@ const handler = async (req: Request): Promise => { let tokenCount = prompt_tokens.length; let messagesToSend: Message[] = []; - encoding.free(); console.log(model, promptToSend, temperatureToUse, key, messagesToSend); - - messagesToSend = [ + messagesToSend = [ { - role: "user", + role: 'user', content: codeBlock` - Here is the relevant documentation: - ${relevantDocuments} + Here is the available scientific evidence: + ${evidenceBlocks} `, }, { - role: "user", + role: 'user', content: codeBlock` ${oneLine` Answer my next question using only the above documentation. @@ -121,6 +138,14 @@ const handler = async (req: Request): Promise => { ${oneLine` - Do not make up answers that are not provided in the documentation. `} + ${oneLine` + - Cite local documents with their [citation key] and cite research + papers with their [paper:*] key. + `} + ${oneLine` + - Prefer local uploaded documents when they directly answer the + question; use external papers only to add research context. + `} ${oneLine` - If you are unsure and the answer is not explicitly written in the documentation context, say @@ -135,14 +160,13 @@ const handler = async (req: Request): Promise => { `, }, { - role: "user", + role: 'user', content: codeBlock` Here is my question: ${oneLine`${lastMessage.content}`} `, }, - ] - + ]; const stream = await OpenAIStream( model, diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts new file mode 100644 index 0000000..670f0f8 --- /dev/null +++ b/ui/utils/server/scientific-rag.ts @@ -0,0 +1,263 @@ +export type ChromaMetadata = { + citationKey?: string; + chunk?: number; + page?: number; + section?: string; + source?: string; + title?: string; +}; + +export type ChromaQueryResults = { + documents?: Array> | null; + distances?: Array> | null; + metadatas?: Array> | null; +}; + +export type SemanticScholarAuthor = { + name?: string; +}; + +export type SemanticScholarPaper = { + abstract?: string | null; + authors?: SemanticScholarAuthor[]; + citationCount?: number | null; + externalIds?: Record | null; + isOpenAccess?: boolean; + openAccessPdf?: { url?: string | null } | null; + paperId?: string; + title?: string | null; + url?: string | null; + venue?: string | null; + year?: number | null; +}; + +export type NormalizedResearchPaper = { + abstract: string; + authors: string; + citationCount?: number; + citationKey: string; + identifier?: string; + isOpenAccess: boolean; + title: string; + url?: string; + venue?: string; + year?: number; +}; + +export const SCIENTIFIC_TEXT_SEPARATORS = [ + '\nAbstract', + '\nIntroduction', + '\nBackground', + '\nMethods', + '\nMethodology', + '\nMaterials and Methods', + '\nResults', + '\nDiscussion', + '\nConclusion', + '\nReferences', + '\n\n', + '\n', + '. ', + ' ', + '', +]; + +const SCIENTIFIC_SECTIONS = [ + 'abstract', + 'introduction', + 'background', + 'methods', + 'methodology', + 'materials and methods', + 'results', + 'discussion', + 'conclusion', + 'references', +]; + +export function clampResultLimit(value: unknown, fallback = 4, max = 8) { + const parsed = typeof value === 'number' ? value : Number(value); + + if (!Number.isFinite(parsed)) { + return fallback; + } + + return Math.min(Math.max(Math.floor(parsed), 1), max); +} + +export function detectScientificSection(text: string) { + const normalized = text.toLowerCase(); + const section = SCIENTIFIC_SECTIONS.find((name) => normalized.includes(name)); + + return section ?? 'body'; +} + +export function sanitizeCitationPart(value: unknown, fallback: string) { + const raw = + typeof value === 'string' && value.trim().length > 0 ? value : fallback; + + return ( + raw + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + .slice(0, 48) || fallback + ); +} + +export function createCitationKey(metadata: ChromaMetadata, index: number) { + if (metadata.citationKey) { + return metadata.citationKey; + } + + const title = sanitizeCitationPart( + metadata.title ?? metadata.source, + `source-${index + 1}`, + ); + const page = metadata.page ? `p${metadata.page}` : 'p-unknown'; + const chunk = + typeof metadata.chunk === 'number' + ? `c${metadata.chunk + 1}` + : `c${index + 1}`; + + return `${title}:${page}:${chunk}`; +} + +export function buildScientificMetadata( + document: { + metadata?: { + loc?: { pageNumber?: number }; + pdf?: { info?: { Title?: string } }; + source?: string; + }; + pageContent: string; + }, + index: number, + fallbackTitle: string, +): ChromaMetadata { + const titleFromMetadata = document.metadata?.pdf?.info?.Title; + const title = + titleFromMetadata && titleFromMetadata.trim().length > 0 + ? titleFromMetadata + : fallbackTitle; + const metadata = { + chunk: index, + page: document.metadata?.loc?.pageNumber, + section: detectScientificSection(document.pageContent), + source: document.metadata?.source, + title, + }; + + return { + ...metadata, + citationKey: createCitationKey(metadata, index), + }; +} + +export function formatChromaResults(results: ChromaQueryResults) { + const documents = results.documents?.[0] ?? []; + const metadatas = results.metadatas?.[0] ?? []; + const distances = results.distances?.[0] ?? []; + + return documents + .map((content, index) => { + if (!content || content.trim().length === 0) { + return ''; + } + + const metadata = metadatas[index] ?? {}; + const citationKey = createCitationKey(metadata, index); + const distance = distances[index]; + const score = + typeof distance === 'number' + ? `, Distance: ${distance.toFixed(4)}` + : ''; + + return [ + `Local Source ${index + 1} [${citationKey}]`, + `Title: ${metadata.title ?? 'Unknown'}`, + `Page: ${metadata.page ?? 'Unknown'}`, + `Section: ${metadata.section ?? 'body'}${score}`, + `Content: ${content}`, + ].join('\n'); + }) + .filter(Boolean) + .join('\n\n'); +} + +export function buildSemanticScholarSearchUrl(query: string, limit: number) { + const url = new URL('https://api.semanticscholar.org/graph/v1/paper/search'); + + url.searchParams.set('query', query); + url.searchParams.set('limit', String(clampResultLimit(limit))); + url.searchParams.set( + 'fields', + [ + 'title', + 'abstract', + 'year', + 'authors', + 'url', + 'venue', + 'citationCount', + 'externalIds', + 'isOpenAccess', + 'openAccessPdf', + ].join(','), + ); + + return url; +} + +export function normalizeSemanticScholarPaper( + paper: SemanticScholarPaper, + index: number, +): NormalizedResearchPaper { + const title = paper.title?.trim() || `Untitled paper ${index + 1}`; + const externalIds = paper.externalIds ?? {}; + const identifier = + externalIds.DOI ?? externalIds.ArXiv ?? externalIds.PubMed ?? paper.paperId; + const citationKey = `paper:${sanitizeCitationPart( + identifier ?? title, + `paper-${index + 1}`, + )}`; + const authors = + paper.authors + ?.map((author) => author.name) + .filter((name): name is string => Boolean(name)) + .slice(0, 4) + .join(', ') || 'Unknown authors'; + + return { + abstract: paper.abstract?.trim() || 'No abstract available.', + authors, + citationCount: paper.citationCount ?? undefined, + citationKey, + identifier, + isOpenAccess: Boolean(paper.isOpenAccess || paper.openAccessPdf?.url), + title, + url: paper.openAccessPdf?.url ?? paper.url ?? undefined, + venue: paper.venue ?? undefined, + year: paper.year ?? undefined, + }; +} + +export function formatResearchPapers(papers: NormalizedResearchPaper[]) { + return papers + .map((paper, index) => + [ + `Research Source ${index + 1} [${paper.citationKey}]`, + `Title: ${paper.title}`, + `Authors: ${paper.authors}`, + `Year: ${paper.year ?? 'Unknown'}`, + `Venue: ${paper.venue ?? 'Unknown'}`, + `Citations: ${paper.citationCount ?? 'Unknown'}`, + `Open Access: ${paper.isOpenAccess ? 'yes' : 'unknown/no'}`, + paper.url ? `URL: ${paper.url}` : undefined, + `Abstract: ${paper.abstract}`, + ] + .filter(Boolean) + .join('\n'), + ) + .join('\n\n'); +}