From 5444c6449dabd94ef2292297dae41b0fcada3ba4 Mon Sep 17 00:00:00 2001
From: ridzkyy <ridzkyguntur12@gmail.com>
Date: Fri, 15 May 2026 21:18:01 +0700
Subject: [PATCH] feat: add scientific research context to rag

---
 ui/__tests__/scientific-rag.test.ts | 113 ++++++++++++
 ui/pages/api/fetch-documents.ts     |  51 ++++--
 ui/pages/api/fetch-research.ts      |  62 +++++++
 ui/pages/api/inject-documents.ts    |  40 ++---
 ui/pages/api/rag-chat.ts            |  94 ++++++----
 ui/utils/server/scientific-rag.ts   | 263 ++++++++++++++++++++++++++++
 6 files changed, 555 insertions(+), 68 deletions(-)
 create mode 100644 ui/__tests__/scientific-rag.test.ts
 create mode 100644 ui/pages/api/fetch-research.ts
 create mode 100644 ui/utils/server/scientific-rag.ts

diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts
new file mode 100644
index 0000000..b1fabbe
--- /dev/null
+++ b/ui/__tests__/scientific-rag.test.ts
@@ -0,0 +1,113 @@
+import {
+  buildScientificMetadata,
+  buildSemanticScholarSearchUrl,
+  clampResultLimit,
+  createCitationKey,
+  detectScientificSection,
+  formatChromaResults,
+  formatResearchPapers,
+  normalizeSemanticScholarPaper,
+} from '@/utils/server/scientific-rag';
+
+import { describe, expect, it } from 'vitest';
+
+describe('scientific RAG helpers', () => {
+  it('detects common scientific sections from chunk text', () => {
+    expect(
+      detectScientificSection('Abstract\nWe test a retrieval method.'),
+    ).toBe('abstract');
+    expect(detectScientificSection('The Methods section explains setup.')).toBe(
+      'methods',
+    );
+    expect(detectScientificSection('Plain paragraph without a heading.')).toBe(
+      'body',
+    );
+  });
+
+  it('builds stable local citation metadata', () => {
+    const metadata = buildScientificMetadata(
+      {
+        metadata: {
+          loc: { pageNumber: 3 },
+          pdf: { info: { Title: 'Attention Is All You Need' } },
+          source: '/tmp/paper.pdf',
+        },
+        pageContent: 'Results show improved BLEU.',
+      },
+      1,
+      'paper.pdf',
+    );
+
+    expect(metadata).toMatchObject({
+      chunk: 1,
+      page: 3,
+      section: 'results',
+      title: 'Attention Is All You Need',
+    });
+    expect(metadata.citationKey).toBe('attention-is-all-you-need:p3:c2');
+  });
+
+  it('formats Chroma results with citation keys and distances', () => {
+    const formatted = formatChromaResults({
+      distances: [[0.123456]],
+      documents: [['This paper introduces a scientific benchmark.']],
+      metadatas: [
+        [
+          {
+            citationKey: 'benchmark:p2:c1',
+            page: 2,
+            section: 'abstract',
+            title: 'Benchmark Paper',
+          },
+        ],
+      ],
+    });
+
+    expect(formatted).toContain('Local Source 1 [benchmark:p2:c1]');
+    expect(formatted).toContain('Distance: 0.1235');
+    expect(formatted).toContain(
+      'This paper introduces a scientific benchmark.',
+    );
+  });
+
+  it('normalizes and formats Semantic Scholar papers', () => {
+    const paper = normalizeSemanticScholarPaper(
+      {
+        abstract: 'A retrieval pipeline for scientific papers.',
+        authors: [{ name: 'Ada Lovelace' }, { name: 'Grace Hopper' }],
+        citationCount: 42,
+        externalIds: { DOI: '10.1234/example.paper' },
+        isOpenAccess: true,
+        title: 'Scientific RAG',
+        url: 'https://example.test/paper',
+        venue: 'Journal of Tests',
+        year: 2026,
+      },
+      0,
+    );
+
+    expect(paper.citationKey).toBe('paper:10-1234-example-paper');
+
+    const formatted = formatResearchPapers([paper]);
+    expect(formatted).toContain(
+      'Research Source 1 [paper:10-1234-example-paper]',
+    );
+    expect(formatted).toContain('Authors: Ada Lovelace, Grace Hopper');
+    expect(formatted).toContain('Open Access: yes');
+  });
+
+  it('builds bounded Semantic Scholar search URLs', () => {
+    expect(clampResultLimit('99', 4, 6)).toBe(6);
+    expect(clampResultLimit('0', 4, 6)).toBe(1);
+
+    const url = buildSemanticScholarSearchUrl('graph neural networks', 99);
+
+    expect(url.searchParams.get('query')).toBe('graph neural networks');
+    expect(url.searchParams.get('limit')).toBe('8');
+    expect(url.searchParams.get('fields')).toContain('abstract');
+  });
+
+  it('falls back to generated citation keys when metadata is incomplete', () => {
+    expect(createCitationKey({}, 2)).toBe('source-3:p-unknown:c3');
+  });
+});
diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts
index 9304e48..e13e3e1 100644
--- a/ui/pages/api/fetch-documents.ts
+++ b/ui/pages/api/fetch-documents.ts
@@ -1,25 +1,50 @@
-import type { NextApiRequest, NextApiResponse } from "next";
-import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
+import type { NextApiRequest, NextApiResponse } from 'next';
 
-export default async function handler(req: NextApiRequest, res: NextApiResponse) {
+import {
+  clampResultLimit,
+  formatChromaResults,
+} from '@/utils/server/scientific-rag';
+
+import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
+
+export default async function handler(
+  req: NextApiRequest,
+  res: NextApiResponse,
+) {
   try {
+    if (req.method !== 'POST') {
+      return res.status(405).json({ error: 'Method not allowed' });
+    }
+
     const client = new ChromaClient({
-      path: "http://chroma-server:8000",
+      path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
     });
 
-    const query = req.body.input;
+    const query =
+      typeof req.body.input === 'string' ? req.body.input.trim() : '';
+    const nResults = clampResultLimit(req.body.nResults, 6, 10);
+
+    if (!query) {
+      return res.status(400).json({ error: 'Missing input query' });
+    }
 
     const embedder = new TransformersEmbeddingFunction();
 
-    const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });
+    const collection = await client.getOrCreateCollection({
+      name: 'default-collection',
+      embeddingFunction: embedder,
+    });
 
-  // query the collection
-  const results = await collection.query({
-      nResults: 4, 
-      queryTexts: [query]
-  }) 
+    // query the collection
+    const results = await collection.query({
+      nResults,
+      queryTexts: [query],
+    });
 
-    res.status(200).json(results);
+    res.status(200).json({
+      ...results,
+      formatted: formatChromaResults(results),
+    });
   } catch (error) {
     if (error instanceof Error) {
       console.error('Error message:', error.message);
@@ -29,4 +54,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
     }
     res.status(500).json({ error: 'An unexpected error occurred :(' });
   }
-}
\ No newline at end of file
+}
diff --git a/ui/pages/api/fetch-research.ts b/ui/pages/api/fetch-research.ts
new file mode 100644
index 0000000..bd2ada4
--- /dev/null
+++ b/ui/pages/api/fetch-research.ts
@@ -0,0 +1,62 @@
+import type { NextApiRequest, NextApiResponse } from 'next';
+
+import {
+  buildSemanticScholarSearchUrl,
+  clampResultLimit,
+  formatResearchPapers,
+  normalizeSemanticScholarPaper,
+} from '@/utils/server/scientific-rag';
+
+export default async function handler(
+  req: NextApiRequest,
+  res: NextApiResponse,
+) {
+  if (req.method !== 'POST') {
+    return res.status(405).json({ error: 'Method not allowed' });
+  }
+
+  const query = typeof req.body.input === 'string' ? req.body.input.trim() : '';
+  const limit = clampResultLimit(req.body.limit, 4, 6);
+
+  if (!query) {
+    return res.status(400).json({ error: 'Missing input query' });
+  }
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), 4000);
+
+  try {
+    const response = await fetch(buildSemanticScholarSearchUrl(query, limit), {
+      headers: {
+        Accept: 'application/json',
+        ...(process.env.SEMANTIC_SCHOLAR_API_KEY && {
+          'x-api-key': process.env.SEMANTIC_SCHOLAR_API_KEY,
+        }),
+      },
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      return res.status(502).json({
+        error: `Semantic Scholar returned ${response.status}`,
+      });
+    }
+
+    const payload = await response.json();
+    const papers = (payload.data ?? []).map(normalizeSemanticScholarPaper);
+
+    return res.status(200).json({
+      formatted: formatResearchPapers(papers),
+      papers,
+    });
+  } catch (error) {
+    const message =
+      error instanceof Error
+        ? error.message
+        : 'Unable to fetch research papers';
+
+    return res.status(502).json({ error: message });
+  } finally {
+    clearTimeout(timeout);
+  }
+}
diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts
index 532a635..a232679 100644
--- a/ui/pages/api/inject-documents.ts
+++ b/ui/pages/api/inject-documents.ts
@@ -1,10 +1,14 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
 
+import {
+  SCIENTIFIC_TEXT_SEPARATORS,
+  buildScientificMetadata,
+} from '@/utils/server/scientific-rag';
+
 import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
 import { IncomingForm } from 'formidable';
 import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
-
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import path from 'path';
 import { v4 as uuidv4 } from 'uuid';
 
@@ -33,20 +37,24 @@ export default async function handler(
         path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
       });
 
-      const loader = new PDFLoader(files.pdf[0].filepath);
+      const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf;
 
-      const originalDocs = await loader.load();
+      if (!pdfFile?.filepath) {
+        return res.status(400).json({ error: 'Missing PDF file' });
+      }
 
-      console.log(JSON.stringify(originalDocs));
+      const loader = new PDFLoader(pdfFile.filepath);
 
+      const originalDocs = await loader.load();
 
       const splitter = new RecursiveCharacterTextSplitter({
         chunkSize: 500,
         chunkOverlap: 100,
-      });      
+        separators: SCIENTIFIC_TEXT_SEPARATORS,
+      });
 
       const docs = await splitter.splitDocuments(originalDocs);
- 
+
       // Process the documents and perform other logic
       const { ids, metadatas, documentContents } = processDocuments(docs);
 
@@ -80,23 +88,15 @@ function processDocuments(docs: any) {
   const metadatas = [];
   const documentContents = [];
 
-  for (const document of docs) {
+  for (const [index, document] of docs.entries()) {
     // Generate an ID for each document, or use some existing unique identifier
     const id = uuidv4();
     ids.push(id);
 
-    const fallbackTitle = path.basename(document.metadata.source);
-    const titleFromMetadata = document.metadata.pdf.info.Title;
-
-    const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;
-
-  
-    const metadata = {
-      title: title,
-      page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
-      source: document.metadata.source, // Define this function to extract verse info
-    };
-    metadatas.push(metadata);
+    const fallbackTitle = path.basename(
+      document.metadata?.source ?? 'document',
+    );
+    metadatas.push(buildScientificMetadata(document, index, fallbackTitle));
 
     // Add the page content to the documents array
     documentContents.push(document.pageContent);
diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts
index ce84d67..276968a 100644
--- a/ui/pages/api/rag-chat.ts
+++ b/ui/pages/api/rag-chat.ts
@@ -1,6 +1,5 @@
 import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const';
 import { OpenAIError, OpenAIStream } from '@/utils/server';
-import { codeBlock, oneLine } from 'common-tags'
 
 import { ChatBody, Message } from '@/types/chat';
 
@@ -9,46 +8,39 @@ import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module
 
 import tiktokenModel from '@dqbd/tiktoken/encoders/cl100k_base.json';
 import { Tiktoken, init } from '@dqbd/tiktoken/lite/init';
+import { codeBlock, oneLine } from 'common-tags';
 
 export const config = {
   runtime: 'edge',
 };
 
-// Function to fetch and format documents
-async function fetchAndFormatDocuments(lastMessageContent: string) {
+async function fetchFormattedContext(
+  baseUrl: string,
+  path: string,
+  input: string,
+  body: Record<string, unknown> = {},
+) {
   try {
-    console.log("fetching documents")
-    const response = await fetch('http://localhost:3000/api/fetch-documents', {
+    const response = await fetch(`${baseUrl}${path}`, {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ input: lastMessageContent }),
+      body: JSON.stringify({ input, ...body }),
     });
-    
+
     if (!response.ok) {
-      throw new Error(`Error fetching documents: ${response.statusText}`);
+      console.warn(`Skipping ${path}: ${response.statusText}`);
+      return '';
     }
 
     const data = await response.json();
-    const result = data.metadatas[0].map((metadata: any, index: number) => {
-      return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`;
-    }).join('');
-
-    console.log(result);
-
-    return result;
-
+    return typeof data.formatted === 'string' ? data.formatted : '';
   } catch (error) {
-    console.error('Error fetching and formatting documents:', error);
-    throw error; // You may want to throw a more specific error object here
+    console.warn(`Skipping ${path}:`, error);
+    return '';
   }
 }
 
-
-
-
-
 const handler = async (req: Request): Promise<Response> => {
-
   try {
     const { model, messages, key, prompt, temperature } =
       (await req.json()) as ChatBody;
@@ -84,9 +76,36 @@ const handler = async (req: Request): Promise<Response> => {
     }
 
     const lastMessage = messages[messages.length - 1];
+    const baseUrl = new URL(req.url).origin;
+
+    const [localDocuments, researchPapers] = await Promise.all([
+      fetchFormattedContext(
+        baseUrl,
+        '/api/fetch-documents',
+        lastMessage.content,
+        {
+          nResults: 6,
+        },
+      ),
+      fetchFormattedContext(
+        baseUrl,
+        '/api/fetch-research',
+        lastMessage.content,
+        {
+          limit: 4,
+        },
+      ),
+    ]);
+
+    const evidenceBlocks = [
+      localDocuments
+        ? `LOCAL DOCUMENT SOURCES\n${localDocuments}`
+        : 'LOCAL DOCUMENT SOURCES\nNo matching local documents were retrieved.',
+      researchPapers
+        ? `EXTERNAL RESEARCH SOURCES\n${researchPapers}`
+        : 'EXTERNAL RESEARCH SOURCES\nNo matching external research papers were retrieved.',
+    ].join('\n\n');
 
-    const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content);
-    
     let temperatureToUse = temperature;
     if (temperatureToUse == null) {
       temperatureToUse = DEFAULT_TEMPERATURE;
@@ -97,22 +116,20 @@ const handler = async (req: Request): Promise<Response> => {
     let tokenCount = prompt_tokens.length;
     let messagesToSend: Message[] = [];
 
-
     encoding.free();
 
     console.log(model, promptToSend, temperatureToUse, key, messagesToSend);
 
-  
-  messagesToSend = [
+    messagesToSend = [
       {
-        role: "user",
+        role: 'user',
         content: codeBlock`
-          Here is the relevant documentation:
-          ${relevantDocuments}
+          Here is the available scientific evidence:
+          ${evidenceBlocks}
         `,
       },
       {
-        role: "user",
+        role: 'user',
         content: codeBlock`
           ${oneLine`
             Answer my next question using only the above documentation.
@@ -121,6 +138,14 @@ const handler = async (req: Request): Promise<Response> => {
           ${oneLine`
             - Do not make up answers that are not provided in the documentation.
           `}
+          ${oneLine`
+            - Cite local documents with their [citation key] and cite research
+            papers with their [paper:*] key.
+          `}
+          ${oneLine`
+            - Prefer local uploaded documents when they directly answer the
+            question; use external papers only to add research context.
+          `}
           ${oneLine`
             - If you are unsure and the answer is not explicitly written
             in the documentation context, say
@@ -135,14 +160,13 @@ const handler = async (req: Request): Promise<Response> => {
         `,
       },
       {
-        role: "user",
+        role: 'user',
         content: codeBlock`
           Here is my question:
           ${oneLine`${lastMessage.content}`}
       `,
       },
-    ]
-
+    ];
 
     const stream = await OpenAIStream(
       model,
diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts
new file mode 100644
index 0000000..670f0f8
--- /dev/null
+++ b/ui/utils/server/scientific-rag.ts
@@ -0,0 +1,263 @@
+export type ChromaMetadata = {
+  citationKey?: string;
+  chunk?: number;
+  page?: number;
+  section?: string;
+  source?: string;
+  title?: string;
+};
+
+export type ChromaQueryResults = {
+  documents?: Array<Array<string | null>> | null;
+  distances?: Array<Array<number | null>> | null;
+  metadatas?: Array<Array<ChromaMetadata | null>> | null;
+};
+
+export type SemanticScholarAuthor = {
+  name?: string;
+};
+
+export type SemanticScholarPaper = {
+  abstract?: string | null;
+  authors?: SemanticScholarAuthor[];
+  citationCount?: number | null;
+  externalIds?: Record<string, string | undefined> | null;
+  isOpenAccess?: boolean;
+  openAccessPdf?: { url?: string | null } | null;
+  paperId?: string;
+  title?: string | null;
+  url?: string | null;
+  venue?: string | null;
+  year?: number | null;
+};
+
+export type NormalizedResearchPaper = {
+  abstract: string;
+  authors: string;
+  citationCount?: number;
+  citationKey: string;
+  identifier?: string;
+  isOpenAccess: boolean;
+  title: string;
+  url?: string;
+  venue?: string;
+  year?: number;
+};
+
+export const SCIENTIFIC_TEXT_SEPARATORS = [
+  '\nAbstract',
+  '\nIntroduction',
+  '\nBackground',
+  '\nMethods',
+  '\nMethodology',
+  '\nMaterials and Methods',
+  '\nResults',
+  '\nDiscussion',
+  '\nConclusion',
+  '\nReferences',
+  '\n\n',
+  '\n',
+  '. ',
+  ' ',
+  '',
+];
+
+const SCIENTIFIC_SECTIONS = [
+  'abstract',
+  'introduction',
+  'background',
+  'materials and methods', // first match wins, so this must precede 'methods'
+  'methods',
+  'methodology',
+  'results',
+  'discussion',
+  'conclusion',
+  'references',
+];
+
+export function clampResultLimit(value: unknown, fallback = 4, max = 8) {
+  // Number(null) and Number('') are 0, which would clamp to 1, not `fallback`.
+  const blank = value == null || (typeof value === 'string' && !value.trim());
+  const parsed = blank ? NaN : Number(value);
+  if (!Number.isFinite(parsed)) {
+    return fallback;
+  }
+  return Math.min(Math.max(Math.floor(parsed), 1), max);
+}
+
+export function detectScientificSection(text: string) {
+  const normalized = text.toLowerCase();
+  const section = SCIENTIFIC_SECTIONS.find((name) => normalized.includes(name));
+
+  return section ?? 'body';
+}
+
+export function sanitizeCitationPart(value: unknown, fallback: string) {
+  const raw =
+    typeof value === 'string' && value.trim().length > 0 ? value : fallback;
+
+  return (
+    raw
+      .toLowerCase()
+      .replace(/[^a-z0-9]+/g, '-')
+      .slice(0, 48) // truncate first so a cut at a separator is trimmed below
+      .replace(/^-+|-+$/g, '') || fallback
+  );
+}
+
+export function createCitationKey(metadata: ChromaMetadata, index: number) {
+  if (metadata.citationKey) {
+    return metadata.citationKey;
+  }
+
+  const title = sanitizeCitationPart(
+    metadata.title ?? metadata.source,
+    `source-${index + 1}`,
+  );
+  const page = metadata.page ? `p${metadata.page}` : 'p-unknown';
+  const chunk =
+    typeof metadata.chunk === 'number'
+      ? `c${metadata.chunk + 1}`
+      : `c${index + 1}`;
+
+  return `${title}:${page}:${chunk}`;
+}
+
+export function buildScientificMetadata(
+  document: {
+    metadata?: {
+      loc?: { pageNumber?: number };
+      pdf?: { info?: { Title?: string } };
+      source?: string;
+    };
+    pageContent: string;
+  },
+  index: number,
+  fallbackTitle: string,
+): ChromaMetadata {
+  const titleFromMetadata = document.metadata?.pdf?.info?.Title;
+  const title =
+    titleFromMetadata && titleFromMetadata.trim().length > 0
+      ? titleFromMetadata
+      : fallbackTitle;
+  const metadata = {
+    chunk: index,
+    page: document.metadata?.loc?.pageNumber,
+    section: detectScientificSection(document.pageContent),
+    source: document.metadata?.source,
+    title,
+  };
+
+  return {
+    ...metadata,
+    citationKey: createCitationKey(metadata, index),
+  };
+}
+
+export function formatChromaResults(results: ChromaQueryResults) {
+  const documents = results.documents?.[0] ?? [];
+  const metadatas = results.metadatas?.[0] ?? [];
+  const distances = results.distances?.[0] ?? [];
+
+  return documents
+    .map((content, index) => {
+      if (!content || content.trim().length === 0) {
+        return '';
+      }
+
+      const metadata = metadatas[index] ?? {};
+      const citationKey = createCitationKey(metadata, index);
+      const distance = distances[index];
+      const score =
+        typeof distance === 'number'
+          ? `, Distance: ${distance.toFixed(4)}`
+          : '';
+
+      return [
+        `Local Source ${index + 1} [${citationKey}]`,
+        `Title: ${metadata.title ?? 'Unknown'}`,
+        `Page: ${metadata.page ?? 'Unknown'}`,
+        `Section: ${metadata.section ?? 'body'}${score}`,
+        `Content: ${content}`,
+      ].join('\n');
+    })
+    .filter(Boolean)
+    .join('\n\n');
+}
+
+export function buildSemanticScholarSearchUrl(query: string, limit: number) {
+  const url = new URL('https://api.semanticscholar.org/graph/v1/paper/search');
+
+  url.searchParams.set('query', query);
+  url.searchParams.set('limit', String(clampResultLimit(limit)));
+  url.searchParams.set(
+    'fields',
+    [
+      'title',
+      'abstract',
+      'year',
+      'authors',
+      'url',
+      'venue',
+      'citationCount',
+      'externalIds',
+      'isOpenAccess',
+      'openAccessPdf',
+    ].join(','),
+  );
+
+  return url;
+}
+
+export function normalizeSemanticScholarPaper(
+  paper: SemanticScholarPaper,
+  index: number,
+): NormalizedResearchPaper {
+  const title = paper.title?.trim() || `Untitled paper ${index + 1}`;
+  const externalIds = paper.externalIds ?? {};
+  const identifier =
+    externalIds.DOI ?? externalIds.ArXiv ?? externalIds.PubMed ?? paper.paperId;
+  const citationKey = `paper:${sanitizeCitationPart(
+    identifier ?? title,
+    `paper-${index + 1}`,
+  )}`;
+  const authors =
+    paper.authors
+      ?.map((author) => author.name)
+      .filter((name): name is string => Boolean(name))
+      .slice(0, 4)
+      .join(', ') || 'Unknown authors';
+
+  return {
+    abstract: paper.abstract?.trim() || 'No abstract available.',
+    authors,
+    citationCount: paper.citationCount ?? undefined,
+    citationKey,
+    identifier,
+    isOpenAccess: Boolean(paper.isOpenAccess || paper.openAccessPdf?.url),
+    title,
+    url: paper.openAccessPdf?.url || paper.url || undefined, // '' falls through
+    venue: paper.venue ?? undefined,
+    year: paper.year ?? undefined,
+  };
+}
+
+export function formatResearchPapers(papers: NormalizedResearchPaper[]) {
+  return papers
+    .map((paper, index) =>
+      [
+        `Research Source ${index + 1} [${paper.citationKey}]`,
+        `Title: ${paper.title}`,
+        `Authors: ${paper.authors}`,
+        `Year: ${paper.year ?? 'Unknown'}`,
+        `Venue: ${paper.venue ?? 'Unknown'}`,
+        `Citations: ${paper.citationCount ?? 'Unknown'}`,
+        `Open Access: ${paper.isOpenAccess ? 'yes' : 'unknown/no'}`,
+        paper.url ? `URL: ${paper.url}` : undefined,
+        `Abstract: ${paper.abstract}`,
+      ]
+        .filter(Boolean)
+        .join('\n'),
+    )
+    .join('\n\n');
+}