Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions ui/__tests__/scientific-rag.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import {
buildScientificMetadata,
buildSemanticScholarSearchUrl,
clampResultLimit,
createCitationKey,
detectScientificSection,
formatChromaResults,
formatResearchPapers,
normalizeSemanticScholarPaper,
} from '@/utils/server/scientific-rag';

import { describe, expect, it } from 'vitest';

// Vitest suite covering the helpers imported from '@/utils/server/scientific-rag'.
// Each case feeds a small hand-written fixture to one helper and pins the
// exact strings / values it is expected to produce.
describe('scientific RAG helpers', () => {
  it('detects common scientific sections from chunk text', () => {
    // Chunk text paired with the section label the detector should report.
    const sectionCases = [
      { text: 'Abstract\nWe test a retrieval method.', section: 'abstract' },
      { text: 'The Methods section explains setup.', section: 'methods' },
      { text: 'Plain paragraph without a heading.', section: 'body' },
    ];

    for (const { text, section } of sectionCases) {
      expect(detectScientificSection(text)).toBe(section);
    }
  });

  it('builds stable local citation metadata', () => {
    // Fixture carrying the page number, PDF title and source path fields.
    const pdfChunk = {
      metadata: {
        loc: { pageNumber: 3 },
        pdf: { info: { Title: 'Attention Is All You Need' } },
        source: '/tmp/paper.pdf',
      },
      pageContent: 'Results show improved BLEU.',
    };
    const chunkIndex = 1;

    const metadata = buildScientificMetadata(pdfChunk, chunkIndex, 'paper.pdf');

    const expectedFields = {
      chunk: 1,
      page: 3,
      section: 'results',
      title: 'Attention Is All You Need',
    };
    expect(metadata).toMatchObject(expectedFields);
    // Index 1 is expected to appear as the `c2` suffix of the citation key.
    expect(metadata.citationKey).toBe('attention-is-all-you-need:p3:c2');
  });

  it('formats Chroma results with citation keys and distances', () => {
    const chunkText = 'This paper introduces a scientific benchmark.';
    const sourceMetadata = {
      citationKey: 'benchmark:p2:c1',
      page: 2,
      section: 'abstract',
      title: 'Benchmark Paper',
    };

    // One query with a single hit: every field is a one-element nested array.
    const formatted = formatChromaResults({
      distances: [[0.123456]],
      documents: [[chunkText]],
      metadatas: [[sourceMetadata]],
    });

    // The raw distance 0.123456 is expected to be shown as 0.1235.
    const expectedFragments = [
      'Local Source 1 [benchmark:p2:c1]',
      'Distance: 0.1235',
      chunkText,
    ];
    for (const fragment of expectedFragments) {
      expect(formatted).toContain(fragment);
    }
  });

  it('normalizes and formats Semantic Scholar papers', () => {
    const rawPaper = {
      abstract: 'A retrieval pipeline for scientific papers.',
      authors: [{ name: 'Ada Lovelace' }, { name: 'Grace Hopper' }],
      citationCount: 42,
      externalIds: { DOI: '10.1234/example.paper' },
      isOpenAccess: true,
      title: 'Scientific RAG',
      url: 'https://example.test/paper',
      venue: 'Journal of Tests',
      year: 2026,
    };

    const paper = normalizeSemanticScholarPaper(rawPaper, 0);

    // The expected key is the DOI with '.' and '/' replaced by '-'.
    expect(paper.citationKey).toBe('paper:10-1234-example-paper');

    const formatted = formatResearchPapers([paper]);
    const expectedFragments = [
      'Research Source 1 [paper:10-1234-example-paper]',
      'Authors: Ada Lovelace, Grace Hopper',
      'Open Access: yes',
    ];
    for (const fragment of expectedFragments) {
      expect(formatted).toContain(fragment);
    }
  });

  it('builds bounded Semantic Scholar search URLs', () => {
    // Out-of-range requests are expected to be pulled back to the max (6) / 1.
    const aboveMax = clampResultLimit('99', 4, 6);
    expect(aboveMax).toBe(6);
    const belowMin = clampResultLimit('0', 4, 6);
    expect(belowMin).toBe(1);

    const { searchParams } = buildSemanticScholarSearchUrl(
      'graph neural networks',
      99,
    );

    expect(searchParams.get('query')).toBe('graph neural networks');
    // A requested limit of 99 is expected to be reduced to 8 in the URL.
    expect(searchParams.get('limit')).toBe('8');
    expect(searchParams.get('fields')).toContain('abstract');
  });

  it('falls back to generated citation keys when metadata is incomplete', () => {
    // Empty metadata at index 2 should produce placeholder title/page parts.
    const generatedKey = createCitationKey({}, 2);

    expect(generatedKey).toBe('source-3:p-unknown:c3');
  });
});
51 changes: 38 additions & 13 deletions ui/pages/api/fetch-documents.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,50 @@
import type { NextApiRequest, NextApiResponse } from "next";
import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
import type { NextApiRequest, NextApiResponse } from 'next';

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
import {
clampResultLimit,
formatChromaResults,
} from '@/utils/server/scientific-rag';

import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';

export default async function handler(
req: NextApiRequest,
res: NextApiResponse,
) {
try {
if (req.method !== 'POST') {
return res.status(405).json({ error: 'Method not allowed' });
}

const client = new ChromaClient({
path: "http://chroma-server:8000",
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const query = req.body.input;
const query =
typeof req.body.input === 'string' ? req.body.input.trim() : '';
const nResults = clampResultLimit(req.body.nResults, 6, 10);

if (!query) {
return res.status(400).json({ error: 'Missing input query' });
}

const embedder = new TransformersEmbeddingFunction();

const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });
const collection = await client.getOrCreateCollection({
name: 'default-collection',
embeddingFunction: embedder,
});

// query the collection
const results = await collection.query({
nResults: 4,
queryTexts: [query]
})
// query the collection
const results = await collection.query({
nResults,
queryTexts: [query],
});

res.status(200).json(results);
res.status(200).json({
...results,
formatted: formatChromaResults(results),
});
} catch (error) {
if (error instanceof Error) {
console.error('Error message:', error.message);
Expand All @@ -29,4 +54,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}
res.status(500).json({ error: 'An unexpected error occurred :(' });
}
}
}
62 changes: 62 additions & 0 deletions ui/pages/api/fetch-research.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import type { NextApiRequest, NextApiResponse } from 'next';

import {
buildSemanticScholarSearchUrl,
clampResultLimit,
formatResearchPapers,
normalizeSemanticScholarPaper,
} from '@/utils/server/scientific-rag';

/** Upper bound, in milliseconds, on the Semantic Scholar request (fetch + body read). */
const SEMANTIC_SCHOLAR_TIMEOUT_MS = 4000;

/**
 * API route handler that searches Semantic Scholar for papers matching a query.
 *
 * Expects a POST whose body carries `input` (the search string) and an
 * optional `limit`, which is passed through `clampResultLimit(…, 4, 6)`.
 *
 * Responses:
 * - 200 `{ formatted, papers }` — normalized papers plus their formatted text.
 * - 400 when `input` is missing, not a string, or blank.
 * - 405 for any method other than POST.
 * - 502 when the upstream call fails, times out, or returns a non-OK status.
 */
export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse,
) {
  if (req.method !== 'POST') {
    return res.status(405).json({ error: 'Method not allowed' });
  }

  // The body may be missing or null (e.g. a JSON `null` payload); optional
  // chaining keeps that on the 400 path instead of throwing outside the try.
  const rawInput = req.body?.input;
  const query = typeof rawInput === 'string' ? rawInput.trim() : '';
  const limit = clampResultLimit(req.body?.limit, 4, 6);

  if (!query) {
    return res.status(400).json({ error: 'Missing input query' });
  }

  const headers: Record<string, string> = { Accept: 'application/json' };
  const apiKey = process.env.SEMANTIC_SCHOLAR_API_KEY;
  if (apiKey) {
    // Only send the key header when one is configured.
    headers['x-api-key'] = apiKey;
  }

  const controller = new AbortController();
  const timeout = setTimeout(
    () => controller.abort(),
    SEMANTIC_SCHOLAR_TIMEOUT_MS,
  );

  try {
    const response = await fetch(buildSemanticScholarSearchUrl(query, limit), {
      headers,
      signal: controller.signal,
    });

    if (!response.ok) {
      return res.status(502).json({
        error: `Semantic Scholar returned ${response.status}`,
      });
    }

    const payload = await response.json();
    // Treat a null payload or a non-array `data` field as "no results" rather
    // than letting `.map` throw and leak an internal TypeError to the client.
    const rawPapers = Array.isArray(payload?.data) ? payload.data : [];
    const papers = rawPapers.map(normalizeSemanticScholarPaper);

    return res.status(200).json({
      formatted: formatResearchPapers(papers),
      papers,
    });
  } catch (error) {
    // Our own timer is the only thing that aborts this signal, so an aborted
    // signal here means the request exceeded SEMANTIC_SCHOLAR_TIMEOUT_MS.
    if (controller.signal.aborted) {
      return res
        .status(502)
        .json({ error: 'Semantic Scholar request timed out' });
    }

    const message =
      error instanceof Error
        ? error.message
        : 'Unable to fetch research papers';

    return res.status(502).json({ error: message });
  } finally {
    // Always release the timer so it cannot fire after the response is sent.
    clearTimeout(timeout);
  }
}
40 changes: 20 additions & 20 deletions ui/pages/api/inject-documents.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import type { NextApiRequest, NextApiResponse } from 'next';

import {
SCIENTIFIC_TEXT_SEPARATORS,
buildScientificMetadata,
} from '@/utils/server/scientific-rag';

import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
import { IncomingForm } from 'formidable';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import path from 'path';
import { v4 as uuidv4 } from 'uuid';

Expand Down Expand Up @@ -33,20 +37,24 @@ export default async function handler(
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const loader = new PDFLoader(files.pdf[0].filepath);
const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf;

const originalDocs = await loader.load();
if (!pdfFile?.filepath) {
return res.status(400).json({ error: 'Missing PDF file' });
}

console.log(JSON.stringify(originalDocs));
const loader = new PDFLoader(pdfFile.filepath);

const originalDocs = await loader.load();

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 500,
chunkOverlap: 100,
});
separators: SCIENTIFIC_TEXT_SEPARATORS,
});

const docs = await splitter.splitDocuments(originalDocs);

// Process the documents and perform other logic
const { ids, metadatas, documentContents } = processDocuments(docs);

Expand Down Expand Up @@ -80,23 +88,15 @@ function processDocuments(docs: any) {
const metadatas = [];
const documentContents = [];

for (const document of docs) {
for (const [index, document] of docs.entries()) {
// Generate an ID for each document, or use some existing unique identifier
const id = uuidv4();
ids.push(id);

const fallbackTitle = path.basename(document.metadata.source);
const titleFromMetadata = document.metadata.pdf.info.Title;

const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;


const metadata = {
title: title,
page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
source: document.metadata.source, // Define this function to extract verse info
};
metadatas.push(metadata);
const fallbackTitle = path.basename(
document.metadata?.source ?? 'document',
);
metadatas.push(buildScientificMetadata(document, index, fallbackTitle));

// Add the page content to the documents array
documentContents.push(document.pageContent);
Expand Down
Loading