From cac2d67a6c1289cf3d78068f3e176217d63152b0 Mon Sep 17 00:00:00 2001 From: FlackoJodye1 Date: Tue, 5 Aug 2025 20:38:25 +0200 Subject: [PATCH 1/2] Add hierarchichal chunking --- .gitignore | 2 + .../test_context_enriched_processor.ipynb | 687 +++++++ notebooks/test_document_processor.ipynb | 1649 +++++++++++++++++ notebooks/test_verbatim_rag_integration.ipynb | 507 +++++ verbatim_rag/index.py | 20 +- verbatim_rag/ingestion/__init__.py | 3 +- .../ingestion/context_enriched_processor.py | 356 ++++ verbatim_rag/ingestion/document_processor.py | 3 + 8 files changed, 3216 insertions(+), 11 deletions(-) create mode 100644 notebooks/test_context_enriched_processor.ipynb create mode 100644 notebooks/test_document_processor.ipynb create mode 100644 notebooks/test_verbatim_rag_integration.ipynb create mode 100644 verbatim_rag/ingestion/context_enriched_processor.py diff --git a/.gitignore b/.gitignore index 5581dca..95f080b 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,8 @@ target/ # Jupyter Notebook .ipynb_checkpoints +**/.ipynb_checkpoints/ +notebooks/RAGBENCH/ # IPython profile_default/ diff --git a/notebooks/test_context_enriched_processor.ipynb b/notebooks/test_context_enriched_processor.ipynb new file mode 100644 index 0000000..5b33e83 --- /dev/null +++ b/notebooks/test_context_enriched_processor.ipynb @@ -0,0 +1,687 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Context-Enriched Processor Testing\n", + "\n", + "This notebook tests the new context-enriched chunking approach that adds hierarchical section context to each chunk for better RAG retrieval." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project root: /Users/paulschmitt/DataspellProjects/verbatim-rag\n", + "✅ Setup complete\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "from pathlib import Path\n", + "import torch\n", + "\n", + "# Fix OpenMP conflict (common with ML libraries on macOS)\n", + "os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'\n", + "\n", + "# Add the project root to Python path\n", + "project_root = Path().absolute().parent\n", + "sys.path.append(str(project_root))\n", + "\n", + "print(f\"Project root: {project_root}\")\n", + "print(\"✅ Setup complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Imports successful\n" + ] + } + ], + "source": [ + "from verbatim_rag.ingestion.context_enriched_processor import ContextEnrichedProcessor, ContextEnrichedChunk\n", + "from pprint import pprint\n", + "\n", + "print(\"✅ Imports successful\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 1: Basic Context-Enriched Processing" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Document processed successfully!\n", + "Title: Verbatim RAG ACL Paper\n", + "Chunks: 57\n", + "Content type: DocumentType.PDF\n" + ] + } + ], + "source": [ + "# Test with the academic paper\n", + "pdf_path = project_root / \"data\" / \"acl_papers\" / \"VERBATIM_RAG_ACL.pdf\"\n", + "\n", + "# Create context-enriched processor\n", + "processor = ContextEnrichedProcessor.for_rag(chunk_size=512)\n", + "\n", + "# Process document\n", + "document = processor.process_file(pdf_path, title=\"Verbatim RAG ACL Paper\")\n", + "\n", + "print(f\"✅ Document processed successfully!\")\n", + "print(f\"Title: {document.title}\")\n", + "print(f\"Chunks: {len(document.chunks)}\")\n", + "print(f\"Content type: {document.content_type}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 2: Examine Context-Enriched Chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Context-Enriched Chunk Analysis:\n", + "Total chunks: 57\n" + ] + } + ], + "source": [ + "print(\"🔍 Context-Enriched Chunk Analysis:\")\n", + "print(f\"Total chunks: {len(document.chunks)}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📝 First 10 Chunks with Context:\n", + "\n", + "--- Chunk 1 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['1 Introduction']\n", + "Context: Section: 1 Introduction\n", + "Content: Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in ...\n", + "Enhanced: Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains...\n", + "Citation: 1 Introduction\n", + "\n", + "--- Chunk 2 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['1 Introduction']\n", + "Context: Section: 1 Introduction\n", + "Content: incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024)...\n", + "Enhanced: Section: 1 Introduction | incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024). We argue that a reliab...\n", + "Citation: 1 Introduction\n", + "\n", + "--- Chunk 3 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['1 Introduction']\n", + "Context: Section: 1 Introduction\n", + "Content: trained generation , dynamically creating answer templates filled exclu-\n", + "\n", + "We participated in the Arc...\n", + "Enhanced: Section: 1 Introduction | trained generation , dynamically creating answer templates filled exclu-\n", + "\n", + "We participated in the ArchEHR-QA 2025 shared task...\n", + "Citation: 1 Introduction\n", + "\n", + "--- Chunk 4 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['1 Introduction']\n", + "Context: Section: 1 Introduction\n", + "Content: arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were the...\n", + "Enhanced: Section: 1 Introduction | arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were then fed into the same LLM ...\n", + "Citation: 1 Introduction\n", + "\n", + "--- Chunk 5 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['1 Introduction']\n", + "Context: Section: 1 Introduction\n", + "Content: d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License...\n", + "Enhanced: Section: 1 Introduction | d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License.\n", + "\n", + "The remainder of the ...\n", + "Citation: 1 Introduction\n", + "\n", + "--- Chunk 6 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['2 Background', '2.1 Dataset']\n", + "Context: Section: 2 Background | Subsection: 2.1 Dataset\n", + "Content: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 201...\n", + "Enhanced: Section: 2 Background | Subsection: 2.1 Dataset | Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 201...\n", + "Citation: 2 Background → 2.1 Dataset\n", + "\n", + "--- Chunk 7 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['2 Background', '2.1 Dataset']\n", + "Context: Section: 2 Background | Subsection: 2.1 Dataset\n", + "Content: sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 word...\n", + "Enhanced: Section: 2 Background | Subsection: 2.1 Dataset | sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 word...\n", + "Citation: 2 Background → 2.1 Dataset\n", + "\n", + "--- Chunk 8 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['2 Background', '2.2 Limitations of Standard RAG']\n", + "Context: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG\n", + "Content: Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contrad...\n", + "Enhanced: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | Standard RAG models, despite external grounding, still frequently hallucinate un...\n", + "Citation: 2 Background → 2.2 Limitations of Standard RAG\n", + "\n", + "--- Chunk 9 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['2 Background', '2.2 Limitations of Standard RAG']\n", + "Context: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG\n", + "Content: ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen et al., 2024) have al...\n", + "Enhanced: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen e...\n", + "Citation: 2 Background → 2.2 Limitations of Standard RAG\n", + "\n", + "--- Chunk 10 ---\n", + "Type: ContextEnrichedChunk\n", + "Section Path: ['2 Background', '2.3 Synthetic Training Data']\n", + "Context: Section: 2 Background | Subsection: 2.3 Synthetic Training Data\n", + "Content: Due to limited access and annotation restrictions, obtaining sentence-level labeled clinical dataset...\n", + "Enhanced: Section: 2 Background | Subsection: 2.3 Synthetic Training Data | Due to limited access and annotation restrictions, obtaining sentence-level labeled ...\n", + "Citation: 2 Background → 2.3 Synthetic Training Data\n", + "\n", + "... and 47 more chunks\n" + ] + } + ], + "source": [ + "# Show first 5 chunks with their context\n", + "n = 10\n", + "print(f\"\\n📝 First {n} Chunks with Context:\")\n", + "for i, chunk in enumerate(document.chunks[:n]):\n", + " print(f\"\\n--- Chunk {i+1} ---\")\n", + " print(f\"Type: {type(chunk).__name__}\")\n", + " print(f\"Section Path: {chunk.section_path}\")\n", + " print(f\"Context: {chunk.context_string}\")\n", + " print(f\"Content: {chunk.content[:100]}...\")\n", + "\n", + " # Show enhanced content (what gets embedded)\n", + " if hasattr(chunk, 'get_enhanced_content'):\n", + " enhanced = chunk.get_enhanced_content()\n", + " print(f\"Enhanced: {enhanced[:150]}...\")\n", + "\n", + " # Show citation context\n", + " if hasattr(chunk, 'get_citation_context'):\n", + " citation = chunk.get_citation_context()\n", + " print(f\"Citation: {citation}\")\n", + "\n", + "if len(document.chunks) > n:\n", + " print(f\"\\n... and {len(document.chunks) - n} more chunks\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 3: Context Distribution Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📊 Context Distribution Analysis:\n", + "\n", + "🏷️ Chunks per Main Section:\n", + " 1 Introduction: 5 chunks\n", + " 2 Background: 5 chunks\n", + " 3 Method: 16 chunks\n", + " 4 Evaluation: 9 chunks\n", + " 5 Ethical Considerations: 2 chunks\n", + " 6 Limitations: 2 chunks\n", + " 7 Conclusion: 18 chunks\n", + "\n", + "📏 Context String Statistics:\n", + " Average length: 35.0 chars\n", + " Min length: 21 chars\n", + " Max length: 67 chars\n", + "\n", + "🌳 Unique Section Paths (12 total):\n", + " 1 Introduction\n", + " 2 Background → 2.1 Dataset\n", + " 2 Background → 2.2 Limitations of Standard RAG\n", + " 2 Background → 2.3 Synthetic Training Data\n", + " 3 Method → 3.1 System Overview\n", + " 3 Method → 3.2 Evidence Extraction\n", + " 3 Method → 3.3 Synthetic Data Generation\n", + " 3 Method → 3.4 Answer Generation\n", + " 4 Evaluation\n", + " 5 Ethical Considerations\n", + " 6 Limitations\n", + " 7 Conclusion\n" + ] + } + ], + "source": [ + "print(\"📊 Context Distribution Analysis:\")\n", + "\n", + "# Analyze section distribution\n", + "section_counts = {}\n", + "context_lengths = []\n", + "\n", + "for chunk in document.chunks:\n", + " if hasattr(chunk, 'section_path') and chunk.section_path:\n", + " # Count chunks per section\n", + " main_section = chunk.section_path[0] if chunk.section_path else \"No Section\"\n", + " section_counts[main_section] = section_counts.get(main_section, 0) + 1\n", + "\n", + " # Track context length\n", + " context_lengths.append(len(chunk.context_string))\n", + "\n", + "print(f\"\\n🏷️ Chunks per Main Section:\")\n", + "for section, count in sorted(section_counts.items()):\n", + " print(f\" {section}: {count} chunks\")\n", + "\n", + "if context_lengths:\n", + " print(f\"\\n📏 Context String Statistics:\")\n", + " print(f\" Average length: {sum(context_lengths)/len(context_lengths):.1f} chars\")\n", + " print(f\" Min length: {min(context_lengths)} chars\")\n", + " print(f\" Max length: {max(context_lengths)} chars\")\n", + "\n", + "# Show unique section paths\n", + "unique_paths = set()\n", + "for chunk in document.chunks:\n", + " if hasattr(chunk, 'section_path') and chunk.section_path:\n", + " path_str = \" → \".join(chunk.section_path)\n", + " unique_paths.add(path_str)\n", + "\n", + "print(f\"\\n🌳 Unique Section Paths ({len(unique_paths)} total):\")\n", + "for path in sorted(unique_paths):\n", + " print(f\" {path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 4: Embedding-Ready Content Examples" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🎯 Embedding-Ready Content Examples:\n", + "This shows what will actually be embedded for RAG retrieval.\n", + "\n", + "--- Example 1 ---\n", + "Original: Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in ...\n", + "Enhanced: Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains for information extraction and generation tasks. ...\n", + "Context adds: 26 chars\n", + "\n", + "--- Example 2 ---\n", + "Original: incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024)...\n", + "Enhanced: Section: 1 Introduction | incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024). We argue that a reliable QA system should guarantee complete traceabilit...\n", + "Context adds: 26 chars\n", + "\n", + "--- Example 3 ---\n", + "Original: trained generation , dynamically creating answer templates filled exclu-\n", + "\n", + "We participated in the Arc...\n", + "Enhanced: Section: 1 Introduction | trained generation , dynamically creating answer templates filled exclu-\n", + "\n", + "We participated in the ArchEHR-QA 2025 shared task on grounded question answering (QA) from electron...\n", + "Context adds: 26 chars\n", + "\n", + "--- Example 4 ---\n", + "Original: arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were the...\n", + "Enhanced: Section: 1 Introduction | arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were then fed into the same LLM template generator. Our solution achieved a score ...\n", + "Context adds: 26 chars\n", + "\n", + "--- Example 5 ---\n", + "Original: d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License...\n", + "Enhanced: Section: 1 Introduction | d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License.\n", + "\n", + "The remainder of the paper discusses background (Section 2), method (Se...\n", + "Context adds: 26 chars\n", + "\n", + "--- Example 6 ---\n", + "Original: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 201...\n", + "Enhanced: Section: 2 Background | Subsection: 2.1 Dataset | Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 2018) used fill-in-the-blank methods and lacked expli...\n", + "Context adds: 50 chars\n", + "\n", + "--- Example 7 ---\n", + "Original: sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 word...\n", + "Enhanced: Section: 2 Background | Subsection: 2.1 Dataset | sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 words) and explicitly cite relevant sentences....\n", + "Context adds: 50 chars\n", + "\n", + "--- Example 8 ---\n", + "Original: Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contrad...\n", + "Enhanced: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contradictory information (Ji et al.,...\n", + "Context adds: 70 chars\n", + "\n", + "--- Example 9 ---\n", + "Original: ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen et al., 2024) have al...\n", + "Enhanced: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen et al., 2024) have also been found unreliable. Our ...\n", + "Context adds: 70 chars\n", + "\n", + "--- Example 10 ---\n", + "Original: Due to limited access and annotation restrictions, obtaining sentence-level labeled clinical dataset...\n", + "Enhanced: Section: 2 Background | Subsection: 2.3 Synthetic Training Data | Due to limited access and annotation restrictions, obtaining sentence-level labeled clinical datasets is challenging. Recent works add...\n", + "Context adds: 66 chars\n", + "\n", + "--- Example 11 ---\n", + "Original: Figure 1 depicts our system architecture. First, an extraction step identifies relevant sentences fr...\n", + "Enhanced: Section: 3 Method | Subsection: 3.1 System Overview | Figure 1 depicts our system architecture. First, an extraction step identifies relevant sentences from the input (patient narrative, clinician que...\n", + "Context adds: 54 chars\n", + "\n", + "--- Example 12 ---\n", + "Original: We evaluated two extractors: (i) We prompted gemma-3-27b-it to explicitly label sentences as relevan...\n", + "Enhanced: Section: 3 Method | Subsection: 3.2 Evidence Extraction | We evaluated two extractors: (i) We prompted gemma-3-27b-it to explicitly label sentences as relevant via a step-by-step process. (ii) We fine...\n", + "Context adds: 58 chars\n", + "\n", + "💾 ProcessedChunk Integration:\n", + "Total processed chunks: 57\n", + "\n", + "Example ProcessedChunk:\n", + " Section title: Introduction\n", + " Enhanced content: Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains...\n", + " Processing metadata: {'context_enriched': True, 'section_path': ['1 Introduction'], 'context_string': 'Section: 1 Introduction'}\n" + ] + } + ], + "source": [ + "print(\"🎯 Embedding-Ready Content Examples:\")\n", + "print(\"This shows what will actually be embedded for RAG retrieval.\")\n", + "\n", + "# Show 3 examples of enhanced content\n", + "for i, chunk in enumerate(document.chunks[:12]):\n", + " if hasattr(chunk, 'get_enhanced_content'):\n", + " print(f\"\\n--- Example {i+1} ---\")\n", + " print(f\"Original: {chunk.content[:100]}...\")\n", + " print(f\"Enhanced: {chunk.get_enhanced_content()[:200]}...\")\n", + " print(f\"Context adds: {len(chunk.get_enhanced_content()) - len(chunk.content)} chars\")\n", + "\n", + "# Show processed chunks (what goes to the index)\n", + "print(f\"\\n💾 ProcessedChunk Integration:\")\n", + "total_processed = sum(len(chunk.processed_chunks) for chunk in document.chunks)\n", + "print(f\"Total processed chunks: {total_processed}\")\n", + "\n", + "if document.chunks and document.chunks[0].processed_chunks:\n", + " pc = document.chunks[0].processed_chunks[0]\n", + " print(f\"\\nExample ProcessedChunk:\")\n", + " print(f\" Section title: {pc.section_title}\")\n", + " print(f\" Enhanced content: {pc.enhanced_content[:150]}...\")\n", + " print(f\" Processing metadata: {pc.processing_metadata}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 5: RAG Benefits Demonstration" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🚀 RAG Benefits Demonstration:\n", + "Examples of how context enrichment improves retrieval...\n", + "\n", + "🔍 Query: 'dataset'\n", + " Found 9 potential matches\n", + " Match 1 (Content+Context): 2 Background → 2.1 Dataset\n", + " Content: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuste...\n", + " Match 2 (Context): 2 Background → 2.1 Dataset\n", + " Content: sentence-level as essential , supplementary , or irrelevant . Answers must be co...\n", + "\n", + "🔍 Query: 'background'\n", + " Found 6 potential matches\n", + " Match 1 (Context): 2 Background → 2.1 Dataset\n", + " Content: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuste...\n", + " Match 2 (Context): 2 Background → 2.1 Dataset\n", + " Content: sentence-level as essential , supplementary , or irrelevant . Answers must be co...\n", + "\n", + "🔍 Query: 'method'\n", + " Found 28 potential matches\n", + " Match 1 (Content+Context): 3 Method → 3.4 Answer Generation\n", + " Content: repair of his ruptured thoracoabdominal aortic aneurysm. |1| -He was immediately...\n", + " Match 2 (Content+Context): 3 Method → 3.4 Answer Generation\n", + " Content: ated by our verbatim method, inserting evidence sentences verbatim into a dynami...\n", + "\n", + "🔍 Query: 'evaluation'\n", + " Found 12 potential matches\n", + " Match 1 (Context): 4 Evaluation\n", + " Content: We evaluated our pipeline in the ArchEHR-QA 2025 shared task (Soni and Demner-Fu...\n", + " Match 2 (Context): 4 Evaluation\n", + " Content: s through BLEU (Papineni et al., 2002), ROUGE (Lin, 2004), BERTScore (Zhang et a...\n", + "\n", + "🔍 Query: 'limitations'\n", + " Found 4 potential matches\n", + " Match 1 (Content+Context): 6 Limitations\n", + " Content: Our verbatim RAG pipeline explicitly cites source sentences to mitigate hallucin...\n", + " Match 2 (Context): 2 Background → 2.2 Limitations of Standard RAG\n", + " Content: Standard RAG models, despite external grounding, still frequently hallucinate un...\n" + ] + } + ], + "source": [ + "print(\"🚀 RAG Benefits Demonstration:\")\n", + "print(\"Examples of how context enrichment improves retrieval...\")\n", + "\n", + "# Simulate search scenarios\n", + "search_terms = [\n", + " \"dataset\",\n", + " \"background\",\n", + " \"method\",\n", + " \"evaluation\",\n", + " \"limitations\"\n", + "]\n", + "\n", + "for term in search_terms:\n", + " print(f\"\\n🔍 Query: '{term}'\")\n", + " matches = []\n", + "\n", + " for chunk in document.chunks:\n", + " if hasattr(chunk, 'get_enhanced_content'):\n", + " enhanced = chunk.get_enhanced_content().lower()\n", + " if term.lower() in enhanced:\n", + " # Calculate relevance score (simple approach)\n", + " content_match = term.lower() in chunk.content.lower()\n", + " context_match = term.lower() in chunk.context_string.lower()\n", + "\n", + " matches.append({\n", + " 'chunk': chunk,\n", + " 'content_match': content_match,\n", + " 'context_match': context_match,\n", + " 'both': content_match and context_match\n", + " })\n", + "\n", + " if matches:\n", + " print(f\" Found {len(matches)} potential matches\")\n", + "\n", + " # Show best matches\n", + " best_matches = sorted(matches, key=lambda x: (x['both'], x['context_match'], x['content_match']), reverse=True)[:2]\n", + "\n", + " for i, match in enumerate(best_matches):\n", + " chunk = match['chunk']\n", + " match_type = \"Content+Context\" if match['both'] else (\"Context\" if match['context_match'] else \"Content\")\n", + " print(f\" Match {i+1} ({match_type}): {chunk.get_citation_context()}\")\n", + " print(f\" Content: {chunk.content[:80]}...\")\n", + " else:\n", + " print(f\" No matches found\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary and Next Steps" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📋 Context-Enriched Processing Summary:\n", + "==================================================\n", + "\n", + "✅ Successfully processed document with context enrichment\n", + " 📄 Document: Verbatim RAG ACL Paper\n", + " 🧩 Total chunks: 57\n", + " 🏷️ Context-enriched chunks: 57\n", + " 📚 Unique sections: 7\n", + "\n", + "🎯 Key Benefits for RAG:\n", + " • Each chunk contains full hierarchical context\n", + " • Section information embedded with content\n", + " • Better retrieval through context matching\n", + " • Rich citation context for answers\n", + " • Backward compatible with existing VerbatimRAG\n", + "\n", + "🚀 Ready for:\n", + " • Integration with VerbatimIndex\n", + " • Embedding generation with context\n", + " • Enhanced RAG retrieval testing\n", + "\n", + "🧹 Test complete - ready for production integration!\n" + ] + } + ], + "source": [ + "print(\"📋 Context-Enriched Processing Summary:\")\n", + "print(\"=\" * 50)\n", + "\n", + "print(f\"\\n✅ Successfully processed document with context enrichment\")\n", + "print(f\" 📄 Document: {document.title}\")\n", + "print(f\" 🧩 Total chunks: {len(document.chunks)}\")\n", + "\n", + "# Count context-enriched chunks\n", + "enriched_count = sum(1 for chunk in document.chunks if hasattr(chunk, 'section_path') and chunk.section_path)\n", + "print(f\" 🏷️ Context-enriched chunks: {enriched_count}\")\n", + "\n", + "# Show unique sections\n", + "sections = set()\n", + "for chunk in document.chunks:\n", + " if hasattr(chunk, 'section_path') and chunk.section_path:\n", + " sections.add(chunk.section_path[0])\n", + "print(f\" 📚 Unique sections: {len(sections)}\")\n", + "\n", + "print(f\"\\n🎯 Key Benefits for RAG:\")\n", + "print(f\" • Each chunk contains full hierarchical context\")\n", + "print(f\" • Section information embedded with content\")\n", + "print(f\" • Better retrieval through context matching\")\n", + "print(f\" • Rich citation context for answers\")\n", + "print(f\" • Backward compatible with existing VerbatimRAG\")\n", + "\n", + "print(f\"\\n🚀 Ready for:\")\n", + "print(f\" • Integration with VerbatimIndex\")\n", + "print(f\" • Embedding generation with context\")\n", + "print(f\" • Enhanced RAG retrieval testing\")\n", + "\n", + "print(f\"\\n🧹 Test complete - ready for production integration!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/test_document_processor.ipynb b/notebooks/test_document_processor.ipynb new file mode 100644 index 0000000..61060b2 --- /dev/null +++ b/notebooks/test_document_processor.ipynb @@ -0,0 +1,1649 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DocumentProcessor Testing Notebook\n", + "\n", + "This notebook tests the DocumentProcessor functionality with different chunking strategies and document types." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Setup and Imports\n\nThis section sets up the Python environment, handles common ML library conflicts (OpenMP), and imports the necessary modules for testing the DocumentProcessor." + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project root: /Users/paulschmitt/DataspellProjects/verbatim-rag\n", + "Current working directory: /Users/paulschmitt/DataspellProjects/verbatim-rag/notebooks\n", + "✅ OpenMP conflict workaround applied\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# Fix OpenMP conflict (common with ML libraries on macOS)\n", + "os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'\n", + "\n", + "# Add the project root to Python path\n", + "project_root = Path().absolute().parent\n", + "sys.path.append(str(project_root))\n", + "\n", + "print(f\"Project root: {project_root}\")\n", + "print(f\"Current working directory: {Path.cwd()}\")\n", + "print(\"✅ OpenMP conflict workaround applied\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "from verbatim_rag.ingestion import DocumentProcessor" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from verbatim_rag.document import DocumentType, ChunkType\n", + "import json\n", + "from pprint import pprint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Check Available Test Files\n\nBefore running tests, we need to verify which document files are available for processing. This helps us understand what test data we have to work with." + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Example docs path: /Users/paulschmitt/DataspellProjects/verbatim-rag/data/acl_papers\n", + "Exists: True\n", + "\n", + "Available files:\n", + " - VERBATIM_RAG_ACL.pdf (362982 bytes)\n" + ] + } + ], + "source": [ + "# Check available example documents\n", + "example_docs_path = project_root / \"data\" / \"acl_papers\"\n", + "print(f\"Example docs path: {example_docs_path}\")\n", + "print(f\"Exists: {example_docs_path.exists()}\")\n", + "\n", + "if example_docs_path.exists():\n", + " print(\"\\nAvailable files:\")\n", + " for file in example_docs_path.iterdir():\n", + " print(f\" - {file.name} ({file.stat().st_size} bytes)\")\n", + "else:\n", + " print(\"Example docs directory not found!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Test 1: Basic DocumentProcessor Creation\n\nThis test verifies that the DocumentProcessor can be instantiated with default settings. It checks if all required dependencies (docling, chonkie) are properly installed and accessible." + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ DocumentProcessor created successfully\n", + "Chunker type: recursive\n", + "Chunk size: 512\n", + "Chunk overlap: 50\n" + ] + } + ], + "source": [ + "try:\n", + " # Test basic creation with default settings\n", + " processor = DocumentProcessor()\n", + " print(\"✅ DocumentProcessor created successfully\")\n", + " print(f\"Chunker type: {processor.chunker_type}\")\n", + " print(f\"Chunk size: {processor.chunk_size}\")\n", + " print(f\"Chunk overlap: {processor.chunk_overlap}\")\n", + "except Exception as e:\n", + " print(f\"❌ Error creating DocumentProcessor: {e}\")\n", + " print(\"Make sure you have installed the document-processing dependencies:\")\n", + " print(\"pip install -e .[document-processing]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Test 2: Process a Simple Text File\n\nThis test processes a sample markdown document to understand how the DocumentProcessor converts content into chunks. We create a test document with headers and sections to analyze the chunking behavior." + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "test_file_name = \"VERBATIM_RAG_ACL.pdf\"\n", + "test_file_path = example_docs_path / test_file_name" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Document processed successfully!\n", + "Document ID: 7c2a73ad-6491-45ab-a371-8717b4d72a18\n", + "Title: VERBATIM_RAG_ACL.pdf\n", + "Source: /Users/paulschmitt/DataspellProjects/verbatim-rag/data/acl_papers/VERBATIM_RAG_ACL.pdf\n", + "Content Type: DocumentType.PDF\n", + "Number of chunks: 88\n", + "Raw content length: 25746 characters\n" + ] + } + ], + "source": [ + "# Test processing the file\n", + "try:\n", + " processor = DocumentProcessor()\n", + " document = processor.process_file(test_file_path, title=test_file_name)\n", + " \n", + " print(\"✅ Document processed successfully!\")\n", + " print(f\"Document ID: {document.id}\")\n", + " print(f\"Title: {document.title}\")\n", + " print(f\"Source: {document.source}\")\n", + " print(f\"Content Type: {document.content_type}\")\n", + " print(f\"Number of chunks: {len(document.chunks)}\")\n", + " print(f\"Raw content length: {len(document.raw_content)} characters\")\n", + " \n", + "except Exception as e:\n", + " print(f\"❌ Error processing document: {e}\")\n", + " import traceback\n", + " traceback.print_exc()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Test 3: Examine Chunks in Detail\n\nThis test provides a detailed analysis of the generated chunks, including their structure, content, and metadata. It helps us understand how content is split and what information is preserved in each chunk." + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📄 Document Analysis:\n", + "Total chunks: 88\n", + "\n", + "📝 Chunk Details:\n", + "\n", + "--- Chunk 1 ---\n", + "ID: 9ca6dcc7-f0e1-438b-96d7-a0e227425290\n", + "Type: ChunkType.PARAGRAPH\n", + "Number: 0\n", + "Content length: 227 chars\n", + "Processed chunks: 1\n", + "Content preview: ## KR Labs at ArchEHR-QA 2025: A Verbatim Approach for Evidence-Based Question Answering\n", + "\n", + "Ádám Kovács KR Labs kovacs@krlabs.eu\n", + "\n", + "## Paul Schmitt\n", + "\n", + "TU Wien paul.schmitt@tuwien.ac.at\n", + "\n", + "Gábor Recski KR Labs...\n", + "Enhanced content length: 227 chars\n", + "Enhanced preview: ## KR Labs at ArchEHR-QA 2025: A Verbatim Approach for Evidence-Based Question Answering\n", + "\n", + "Ádám Kovács KR Labs kovacs@krlabs.eu\n", + "\n", + "## Paul Schmitt\n", + "\n", + "TU Wien paul.schmitt@tuwien.ac.at\n", + "\n", + "Gábor Recski KR Labs...\n", + "\n", + "--- Chunk 2 ---\n", + "ID: 408e5810-c9ac-4d0b-9faa-9504d8008d27\n", + "Type: ChunkType.PARAGRAPH\n", + "Number: 1\n", + "Content length: 458 chars\n", + "Processed chunks: 1\n", + "Content preview: ## Abstract\n", + "\n", + "We present a lightweight, domain-agnostic verbatim pipeline for evidence-grounded question answering. Our pipeline operates in two steps: first, a sentence-level extractor flags relevant ...\n", + "Enhanced content length: 458 chars\n", + "Enhanced preview: ## Abstract\n", + "\n", + "We present a lightweight, domain-agnostic verbatim pipeline for evidence-grounded question answering. Our pipeline operates in two steps: first, a sentence-level extractor flags relevant ...\n", + "\n", + "--- Chunk 3 ---\n", + "ID: f8f4fdd6-282c-476d-b8d1-3539dababd2f\n", + "Type: ChunkType.PARAGRAPH\n", + "Number: 2\n", + "Content length: 234 chars\n", + "Processed chunks: 1\n", + "Content preview: In the ArchEHR-QA 2025 shared task, our system scored 42.01%, ranking top-10 in core metrics and outperforming the organiser's 70B-parameter Llama-3.3 baseline. We publicly release our code and infere...\n", + "Enhanced content length: 234 chars\n", + "Enhanced preview: In the ArchEHR-QA 2025 shared task, our system scored 42.01%, ranking top-10 in core metrics and outperforming the organiser's 70B-parameter Llama-3.3 baseline. We publicly release our code and infere...\n", + "\n", + "--- Chunk 4 ---\n", + "ID: 6e240427-434c-4c97-83fd-a2bd9229ac88\n", + "Type: ChunkType.PARAGRAPH\n", + "Number: 3\n", + "Content length: 1 chars\n", + "Processed chunks: 1\n", + "Content preview: \n", + "...\n", + "Enhanced content length: 1 chars\n", + "Enhanced preview: \n", + "...\n", + "\n", + "--- Chunk 5 ---\n", + "ID: 03a812fc-ba4d-4828-bb43-e191ceb9f9e9\n", + "Type: ChunkType.PARAGRAPH\n", + "Number: 4\n", + "Content length: 68 chars\n", + "Processed chunks: 1\n", + "Content preview: sively with verbatim sentences selected from the extraction phase.\n", + "\n", + "...\n", + "Enhanced content length: 68 chars\n", + "Enhanced preview: sively with verbatim sentences selected from the extraction phase.\n", + "\n", + "...\n", + "\n", + "... and 83 more chunks\n" + ] + } + ], + "source": [ + "if 'document' in locals():\n", + " print(f\"\\n📄 Document Analysis:\")\n", + " print(f\"Total chunks: {len(document.chunks)}\")\n", + " \n", + " print(\"\\n📝 Chunk Details:\")\n", + " for i, chunk in enumerate(document.chunks[:5]): # Show first 5 chunks\n", + " print(f\"\\n--- Chunk {i+1} ---\")\n", + " print(f\"ID: {chunk.id}\")\n", + " print(f\"Type: {chunk.chunk_type}\")\n", + " print(f\"Number: {chunk.chunk_number}\")\n", + " print(f\"Content length: {len(chunk.content)} chars\")\n", + " print(f\"Processed chunks: {len(chunk.processed_chunks)}\")\n", + " print(f\"Content preview: {chunk.content[:200]}...\")\n", + " \n", + " # Show processed chunk details\n", + " if chunk.processed_chunks:\n", + " pc = chunk.processed_chunks[0]\n", + " print(f\"Enhanced content length: {len(pc.enhanced_content)} chars\")\n", + " print(f\"Enhanced preview: {pc.enhanced_content[:200]}...\")\n", + " \n", + " if len(document.chunks) > 5:\n", + " print(f\"\\n... and {len(document.chunks) - 5} more chunks\")\n", + "else:\n", + " print(\"❌ No document available to analyze\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Test 4: Different Chunking Strategies\n\nThis test compares various chunking approaches (recursive, token, sentence, word) to understand how each strategy affects the resulting chunks. This comparison helps identify the best approach for different use cases." + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "🔄 Testing recursive chunking...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ✅ recursive: 88 chunks\n", + "\n", + "🔄 Testing token chunking...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ✅ token: 125 chunks\n", + "\n", + "🔄 Testing sentence chunking...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ✅ sentence: 273 chunks\n", + "\n", + "🔄 Testing word chunking...\n", + " ❌ word failed: module 'chonkie' has no attribute 'WordChunker'\n", + "\n", + "📊 Chunking Strategy Comparison:\n", + "------------------------------------------------------------\n", + "recursive | 88 chunks | Avg size: 292.6 chars\n", + "token | 125 chunks | Avg size: 255.6 chars\n", + "sentence | 273 chunks | Avg size: 94.3 chars\n", + "word | ERROR: module 'chonkie' has no attribute 'WordChunker'\n" + ] + } + ], + "source": [ + "# Test different chunking strategies\n", + "chunking_strategies = [\n", + " (\"recursive\", {\"chunker_recipe\": \"markdown\", \"chunk_size\": 512}),\n", + " (\"token\", {\"chunk_size\": 256, \"chunk_overlap\": 50}),\n", + " (\"sentence\", {\"chunk_size\": 3, \"chunk_overlap\": 1}),\n", + " (\"word\", {\"chunk_size\": 100, \"chunk_overlap\": 20}),\n", + "]\n", + "\n", + "results = {}\n", + "\n", + "for strategy_name, kwargs in chunking_strategies:\n", + " try:\n", + " print(f\"\\n🔄 Testing {strategy_name} chunking...\")\n", + " processor = DocumentProcessor(chunker_type=strategy_name, **kwargs)\n", + " doc = processor.process_file(test_file_path, title=f\"Test Doc - {strategy_name}\")\n", + " \n", + " results[strategy_name] = {\n", + " \"chunks\": len(doc.chunks),\n", + " \"avg_chunk_size\": sum(len(chunk.content) for chunk in doc.chunks) / len(doc.chunks),\n", + " \"total_content\": sum(len(chunk.content) for chunk in doc.chunks)\n", + " }\n", + " \n", + " print(f\" ✅ {strategy_name}: {len(doc.chunks)} chunks\")\n", + " \n", + " except Exception as e:\n", + " print(f\" ❌ {strategy_name} failed: {e}\")\n", + " results[strategy_name] = {\"error\": str(e)}\n", + "\n", + "# Summary\n", + "print(\"\\n📊 Chunking Strategy Comparison:\")\n", + "print(\"-\" * 60)\n", + "for strategy, result in results.items():\n", + " if \"error\" in result:\n", + " print(f\"{strategy:12} | ERROR: {result['error']}\")\n", + " else:\n", + " print(f\"{strategy:12} | {result['chunks']:3d} chunks | Avg size: {result['avg_chunk_size']:6.1f} chars\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Test 5: Factory Methods\n\nThis test evaluates the convenience factory methods provided by DocumentProcessor. These methods create pre-configured processors optimized for specific tasks like embeddings, Q&A, and semantic processing." + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏭 Testing Factory Methods:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ✅ for_embeddings | 56 chunks | Type: token\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ✅ for_qa | 273 chunks | Type: sentence\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/chonkie/embeddings/auto.py:87: UserWarning: Failed to load minishlab/potion-base-8M with Model2VecEmbeddings: Model2VecEmbeddings.__init__() got an unexpected keyword argument 'merge_threshold'\n", + "Falling back to loading default provider model.\n", + " warnings.warn(\n", + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/chonkie/embeddings/auto.py:95: UserWarning: Failed to load the default model for Model2VecEmbeddings: Model2VecEmbeddings.__init__() got an unexpected keyword argument 'merge_threshold'\n", + "Falling back to SentenceTransformerEmbeddings.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ❌ semantic | Error: Failed to load embeddings via SentenceTransformerE...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ✅ markdown_recursive | 88 chunks | Type: recursive\n" + ] + } + ], + "source": [ + "# Test factory methods\n", + "factory_methods = [\n", + " (\"for_embeddings\", DocumentProcessor.for_embeddings),\n", + " (\"for_qa\", DocumentProcessor.for_qa),\n", + " (\"semantic\", DocumentProcessor.semantic),\n", + " (\"markdown_recursive\", DocumentProcessor.markdown_recursive),\n", + "]\n", + "\n", + "print(\"🏭 Testing Factory Methods:\")\n", + "for method_name, method in factory_methods:\n", + " try:\n", + " processor = method()\n", + " doc = processor.process_file(test_file_path, title=f\"Factory Test - {method_name}\")\n", + " print(f\" ✅ {method_name:18} | {len(doc.chunks):3d} chunks | Type: {processor.chunker_type}\")\n", + " except Exception as e:\n", + " print(f\" ❌ {method_name:18} | Error: {str(e)[:50]}...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Test 6: Directory Processing\n\nThis test examines the DocumentProcessor's ability to process multiple files from a directory. It's useful for understanding batch processing capabilities and handling various file formats." + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📁 Testing directory processing...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Processed 1 documents from directory\n", + " - VERBATIM_RAG_ACL: 88 chunks (pdf)\n" + ] + } + ], + "source": [ + "# Test directory processing if example docs exist\n", + "if example_docs_path.exists():\n", + " print(\"📁 Testing directory processing...\")\n", + " try:\n", + " processor = DocumentProcessor()\n", + " documents = processor.process_directory(example_docs_path)\n", + " \n", + " print(f\"✅ Processed {len(documents)} documents from directory\")\n", + " \n", + " for doc in documents:\n", + " print(f\" - {doc.title}: {len(doc.chunks)} chunks ({doc.content_type.value})\")\n", + " \n", + " except Exception as e:\n", + " print(f\"❌ Directory processing failed: {e}\")\n", + " import traceback\n", + " traceback.print_exc()\n", + "else:\n", + " print(\"📁 Skipping directory test - no example docs found\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Test 7: Document Structure Analysis\n\nThis critical test analyzes the document's structural patterns, including headers, sections, and hierarchical elements. The insights from this test inform our hierarchical chunking strategy and help identify documents suitable for hierarchical processing." + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Document Structure Analysis (for hierarchical chunking):\n", + "\n", + "Document: VERBATIM_RAG_ACL.pdf\n", + "\n", + "📊 Chunk Analysis:\n", + "Chunk types distribution:\n", + " paragraph: 88\n", + "\n", + "Chunk size statistics:\n", + " Min: 1 chars\n", + " Max: 510 chars\n", + " Avg: 292.6 chars\n", + "\n", + "🌳 Hierarchical Pattern Analysis:\n", + "Found 18 headers:\n", + " Level 2: ## KR Labs at ArchEHR-QA 2025: A Verbatim Approach for Evidence-Based Question Answering\n", + " Level 2: ## Paul Schmitt\n", + " Level 2: ## Abstract\n", + " Level 2: ## 1 Introduction\n", + " Level 2: ## 2 Background\n", + " Level 2: ## 2.1 Dataset\n", + " Level 2: ## 2.2 Limitations of Standard RAG\n", + " Level 2: ## 2.3 Synthetic Training Data\n", + " Level 2: ## 3 Method\n", + " Level 2: ## 3.1 System Overview\n", + " Level 2: ## 3.2 Evidence Extraction\n", + " Level 2: ## 3.3 Synthetic Data Generation\n", + " Level 2: ## 3.4 Answer Generation\n", + " Level 2: ## 4 Evaluation\n", + " Level 2: ## 5 Ethical Considerations\n", + " Level 2: ## 6 Limitations\n", + " Level 2: ## 7 Conclusion\n", + " Level 2: ## References\n" + ] + } + ], + "source": [ + "# Analyze document structure for hierarchical chunking insights\n", + "if 'document' in locals():\n", + " print(\"🔍 Document Structure Analysis (for hierarchical chunking):\")\n", + " print(f\"\\nDocument: {document.title}\")\n", + " # print(f\"Raw content sample:\")\n", + " # print(document.raw_content[:500] + \"...\")\n", + " \n", + " print(f\"\\n📊 Chunk Analysis:\")\n", + " chunk_types = {}\n", + " chunk_sizes = []\n", + " \n", + " for chunk in document.chunks:\n", + " chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1\n", + " chunk_sizes.append(len(chunk.content))\n", + " \n", + " print(f\"Chunk types distribution:\")\n", + " for chunk_type, count in chunk_types.items():\n", + " print(f\" {chunk_type.value}: {count}\")\n", + " \n", + " print(f\"\\nChunk size statistics:\")\n", + " print(f\" Min: {min(chunk_sizes)} chars\")\n", + " print(f\" Max: {max(chunk_sizes)} chars\")\n", + " print(f\" Avg: {sum(chunk_sizes)/len(chunk_sizes):.1f} chars\")\n", + " \n", + " # Look for hierarchical patterns in content\n", + " print(f\"\\n🌳 Hierarchical Pattern Analysis:\")\n", + " headers = []\n", + " for chunk in document.chunks:\n", + " lines = chunk.content.split('\\n')\n", + " for line in lines:\n", + " line = line.strip()\n", + " if line.startswith('#'):\n", + " level = len(line) - len(line.lstrip('#'))\n", + " headers.append((level, line))\n", + " \n", + " if headers:\n", + " print(f\"Found {len(headers)} headers:\")\n", + " for level, header in headers:\n", + " indent = \" \" * (level - 1)\n", + " print(f\"{indent}Level {level}: {header}\")\n", + " else:\n", + " print(\"No markdown headers found\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Test 8: Document Serialization\n\nThis test verifies that documents can be properly serialized to and deserialized from dictionary format. This capability is essential for storing, transferring, and reconstructing document objects while maintaining data integrity." + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test document serialization/deserialization\n", + "if 'document' in locals():\n", + " print(\"💾 Testing document serialization...\")\n", + " \n", + " try:\n", + " # Convert to dict\n", + " doc_dict = document.to_dict()\n", + " print(f\"✅ Document serialized to dict ({len(str(doc_dict))} chars)\")\n", + " \n", + " # Convert back to document\n", + " restored_doc = document.__class__.from_dict(doc_dict)\n", + " print(f\"✅ Document restored from dict\")\n", + " \n", + " # Verify integrity\n", + " print(f\"\\n🔍 Integrity check:\")\n", + " print(f\" Title match: {document.title == restored_doc.title}\")\n", + " print(f\" Chunks count match: {len(document.chunks) == len(restored_doc.chunks)}\")\n", + " print(f\" Content match: {document.raw_content == restored_doc.raw_content}\")\n", + " \n", + " if len(document.chunks) > 0 and len(restored_doc.chunks) > 0:\n", + " print(f\" First chunk content match: {document.chunks[0].content == restored_doc.chunks[0].content}\")\n", + " \n", + " except Exception as e:\n", + " print(f\"❌ Serialization test failed: {e}\")\n", + " import traceback\n", + " traceback.print_exc()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Test 9: Docling HierarchicalChunker\n\nThis test explores Docling's built-in HierarchicalChunker to determine if it preserves document hierarchy better than standard chunking. We investigate whether the issue lies in PDF conversion or the chunking process itself." + }, + { + "cell_type": "markdown", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "## Test 11: Hierarchical Chunking Prototype\n\nThis is our breakthrough test that implements hierarchical chunking using section numbering patterns. It creates a complete hierarchical document structure with parent-child relationships, demonstrating how to overcome Docling's hierarchy flattening limitations through intelligent post-processing." + }, + { + "cell_type": "markdown", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "## Test 10: View Raw Converted Content\n\nThis test examines the raw markdown content produced by Docling's PDF conversion to understand exactly what happens during the document conversion process. It helps identify where hierarchy information is lost and explores alternative export methods." + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 11: Hierarchical Chunking Prototype" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# Prototype hierarchical chunking using section numbering\n", + "\n", + "import re\n", + "from dataclasses import dataclass, field\n", + "from typing import List, Optional\n", + "from verbatim_rag.document import Chunk, ChunkType\n", + "\n", + "HEADER_RE = re.compile(r'^##\\s+(\\d+(?:\\.\\d+)*)\\s+(.+)$')\n", + "NUM_RE = re.compile(r'^(\\d+(?:\\.\\d+)*)\\s+([A-Z][A-Za-z\\s]+.*)$')\n", + "\n", + "@dataclass\n", + "class HierarchicalChunk(Chunk):\n", + " \"\"\"Extended Chunk class with hierarchy support.\"\"\"\n", + " parent_chunk_id: Optional[str] = None\n", + " child_chunk_ids: List[str] = field(default_factory=list)\n", + " hierarchy_level: int = 0 # 0=document, 1=section, 2=subsection, 3=content\n", + " section_number: Optional[str] = None # \"1\", \"2.1\", \"3.2.1\"\n", + "\n", + " def add_child(self, child_chunk: 'HierarchicalChunk'):\n", + " \"\"\"Add a child chunk and set up parent-child relationship.\"\"\"\n", + " child_chunk.parent_chunk_id = self.id\n", + " if child_chunk.id not in self.child_chunk_ids:\n", + " self.child_chunk_ids.append(child_chunk.id)\n", + "\n", + " def __str__(self):\n", + " indent = \" \" * self.hierarchy_level\n", + " section = f\"[{self.section_number}] \" if self.section_number else \"\"\n", + " return f\"{indent}{section}{self.content[:100]}...\"\n", + "\n", + "def detect_section_numbering(content: str) -> List[tuple]:\n", + " \"\"\"\n", + " Detect section numbering patterns in content.\n", + " Returns list of (line_number, section_number, title, level, full_line)\n", + " \"\"\"\n", + " lines = content.split('\\n')\n", + " sections = []\n", + "\n", + " for i, line in enumerate(lines):\n", + " line_stripped = line.strip()\n", + "\n", + " # Pattern 1: \"## 1 Introduction\" or \"## 2.1 Dataset\"\n", + " match = HEADER_RE.match(line_stripped)\n", + " if match:\n", + " section_num = match.group(1)\n", + " title = match.group(2)\n", + " level = len(section_num.split('.'))\n", + " sections.append((i+1, section_num, title, level, line_stripped))\n", + " continue\n", + "\n", + " # Pattern 2: Just numbers \"1 Introduction\" (without ##)\n", + " match = NUM_RE.match(line_stripped)\n", + " if match:\n", + " section_num = match.group(1)\n", + " title = match.group(2)\n", + " level = len(section_num.split('.'))\n", + " sections.append((i+1, section_num, title, level, line_stripped))\n", + "\n", + " return sections\n", + "\n", + "def create_hierarchical_chunks(content: str, document_id: str) -> List[HierarchicalChunk]:\n", + " \"\"\"\n", + " Create hierarchical chunks from content using section numbering.\n", + " \"\"\"\n", + " # Step 1: Detect sections\n", + " sections = detect_section_numbering(content)\n", + "\n", + " if not sections:\n", + " print(\"❌ No section numbering found - fallback to flat chunking\")\n", + " return []\n", + "\n", + " print(f\"✅ Found {len(sections)} sections with numbering\")\n", + "\n", + " # Step 2: Split content by sections\n", + " lines = content.split('\\n')\n", + " hierarchical_chunks = []\n", + " chunk_map = {} # section_number -> chunk\n", + "\n", + " for i, (line_num, section_num, title, level, full_line) in enumerate(sections):\n", + " # Find content for this section (until next section)\n", + " start_line = line_num - 1 # Convert to 0-based\n", + " if i + 1 < len(sections):\n", + " end_line = sections[i + 1][0] - 1 # Next section's line\n", + " else:\n", + " end_line = len(lines) # End of document\n", + "\n", + " # Extract section content\n", + " section_lines = lines[start_line:end_line]\n", + " header, *body = section_lines\n", + " section_content = '\\n'.join(body).strip()\n", + "\n", + " # Create hierarchical chunk\n", + " chunk = HierarchicalChunk(\n", + " document_id=document_id,\n", + " content=section_content,\n", + " chunk_number=i,\n", + " chunk_type=ChunkType.SECTION if level <= 2 else ChunkType.PARAGRAPH,\n", + " hierarchy_level=level,\n", + " section_number=section_num,\n", + " metadata={\n", + " 'section_title': title,\n", + " 'section_number': section_num,\n", + " 'hierarchy_level': level\n", + " }\n", + " )\n", + "\n", + " hierarchical_chunks.append(chunk)\n", + " chunk_map[section_num] = chunk\n", + "\n", + " # Step 3: Build parent-child relationships\n", + " for chunk in hierarchical_chunks:\n", + " section_parts = chunk.section_number.split('.')\n", + "\n", + " # Find parent (e.g., \"2.1\" parent is \"2\")\n", + " if len(section_parts) > 1:\n", + " parent_section = '.'.join(section_parts[:-1])\n", + " parent_chunk = chunk_map.get(parent_section)\n", + " if parent_chunk:\n", + " parent_chunk.add_child(chunk)\n", + "\n", + " return hierarchical_chunks" + ] + }, + { + "cell_type": "markdown", + "execution_count": null, + "outputs": [], + "source": "## Test Summary and Recommendations\n\nThis final section summarizes all test results and provides actionable recommendations for implementing production-ready hierarchical chunking. It consolidates insights from all previous tests and outlines the next steps for development.", + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "**Analyze the detected Sections**" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 117, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Section Detection Results:\n", + "Found 14 sections:\n", + " Line 17: Level 1 - 1 Introduction\n", + " Line 30: Level 1 - 2 Background\n", + " Line 32: Level 2 - 2.1 Dataset\n", + " Line 42: Level 2 - 2.2 Limitations of Standard RAG\n", + " Line 46: Level 2 - 2.3 Synthetic Training Data\n", + " Line 50: Level 1 - 3 Method\n", + " Line 52: Level 2 - 3.1 System Overview\n", + " Line 56: Level 2 - 3.2 Evidence Extraction\n", + " Line 66: Level 2 - 3.3 Synthetic Data Generation\n", + " Line 79: Level 2 - 3.4 Answer Generation\n", + " ... and 4 more sections\n" + ] + } + ], + "source": [ + "# Test section detection\n", + "content = document.raw_content\n", + "sections = detect_section_numbering(content)\n", + "print(f\"🔍 Section Detection Results:\")\n", + "print(f\"Found {len(sections)} sections:\")\n", + "\n", + "for line_num, section_num, title, level, full_line in sections[:10]:\n", + " indent = \" \" * (level - 1)\n", + " print(f\" Line {line_num:3d}: {indent}Level {level} - {section_num} {title}\")\n", + "\n", + "if len(sections) > 10:\n", + " print(f\" ... and {len(sections) - 10} more sections\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "**Hierarchical Chunks**" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 118, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Found 14 sections with numbering\n", + "Created 14 hierarchical chunks\n" + ] + } + ], + "source": [ + "hierarchical_chunks = create_hierarchical_chunks(content, document.id)\n", + "print(f\"Created {len(hierarchical_chunks)} hierarchical chunks\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "**Analyze the Hierarchy**" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 119, + "outputs": [], + "source": [ + "level_counts = {}\n", + "parent_child_pairs = 0\n", + "\n", + "for chunk in hierarchical_chunks:\n", + " level_counts[chunk.hierarchy_level] = level_counts.get(chunk.hierarchy_level, 0) + 1\n", + " if chunk.child_chunk_ids:\n", + " parent_child_pairs += len(chunk.child_chunk_ids)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "**Level Distribution**" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 120, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Level distribution:\n", + " Level 1: 7 chunks\n", + " Level 2: 7 chunks\n", + "Parent-child relationships: 7\n" + ] + } + ], + "source": [ + "print(f\"Level distribution:\")\n", + "for level in sorted(level_counts.keys()):\n", + " print(f\" Level {level}: {level_counts[level]} chunks\")\n", + "print(f\"Parent-child relationships: {parent_child_pairs}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "**Hierarchy Structure**" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 121, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14\n" + ] + } + ], + "source": [ + "print(len(hierarchical_chunks))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 126, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📋 Hierarchy Structure (first 10 chunks):\n", + "\n", + "Introduction\n", + "Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains for information extraction and generation tasks. In medicine, a typical use case involves clinicians asking questions based on a patient's electronic health record (EHR) notes, rather than manually sifting through lengthy notes, which can be time-consuming. However, in practice, RAG and QA pipelines often misalign evidence and produce incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024). We argue that a reliable QA system should guarantee complete traceability of answers. To tackle this problem, we propose a verbatim pipeline that clearly separates extraction and generation to mitigate hallucinations (other errors may still occur):\n", + "\n", + "- Sentence-level extraction , using either zeroshot LLMs or supervised ModernBERT classifiers.\n", + "- Template-constrained generation , dynamically creating answer templates filled exclu-\n", + "\n", + "We participated in the ArchEHR-QA 2025 shared task on grounded question answering (QA) from electronic health records (EHRs). Our approach involved (i) utilizing a zero-shot gemma-3-27b-it 1 LLM(Team, 2025) and (ii) generating synthetic data for sentence extraction from EHRs to train a compact extractor. For this purpose, we trained a Clinical ModernBERT classifier (Lee et al., 2025; Warner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were then fed into the same LLM template generator. Our solution achieved a score of 42.01% , ranking in the top 10 for core metrics, and surpassed the organizer's 70B-parameter Llama3.3 baseline by a large margin.\n", + "\n", + "Our contributions include a modular, traceable QA architecture that mitigates hallucinations, a method to generate synthetic EHR question-answer corpus and train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License.\n", + "\n", + "The remainder of the paper discusses background (Section 2), method (Section 3), and evaluation (Section 4).\n", + "\n", + "Background\n", + "\n", + " └── Has 3 children\n", + "\n", + "Dataset\n", + "Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 2018) used fill-in-the-blank methods and lacked explicit sentence-level evidence. ArchEHR-QA (Soni and Demner-Fushman, 2025b,a) addresses this by pairing clinician-authored questions with deidentified MIMIC-III (Johnson et al., 2016) notes,\n", + "\n", + "1 https://huggingface.co/google/gemma-3-27b-it\n", + "\n", + "2 https://github.com/KRLabsOrg/verbatim-rag/ tree/archehr\n", + "\n", + "annotated at the sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 words) and explicitly cite relevant sentences.\n", + "\n", + "Limitations of Standard RAG\n", + "Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contradictory information (Ji et al., 2023). Existing approaches like post-hoc verification (Friel and Sanyal, 2023; Manakul et al., 2023) or classifiers trained on hallucination corpora such as RAGTruth (Niu et al., 2024) (e.g., RAG-HAT (Song et al., 2024), LettuceDetect (Ádám Kovács and Recski, 2025)) add extra complexity and latency. Posthoc saliency methods (Serrano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen et al., 2024) have also been found unreliable. Our approach proactively prevents hallucinations through strict template-driven sentence extraction and verbatim insertion.\n", + "\n", + "Synthetic Training Data\n", + "Due to limited access and annotation restrictions, obtaining sentence-level labeled clinical datasets is challenging. Recent works address this by generating synthetic data via perturbation or LLM prompting (Niu et al., 2024; Lozano et al., 2023; Frayling et al., 2024; Bai et al., 2024). We follow this approach, generating synthetic EHR snippets, clinician-style questions, and sentence relevance annotations (details in Section 3.3).\n", + "\n", + "Method\n", + "\n", + " └── Has 4 children\n", + "\n", + "System Overview\n", + "Figure 1 depicts our system architecture. First, an extraction step identifies relevant sentences from the input (patient narrative, clinician question, and note excerpt). We implemented both zero-shot and supervised models. Second, the generation step uses gemma-3-27b-it to dynamically draft an answer template, filled verbatim with extracted sentences. If exceeding 75 words, answers are compressed via a summarization prompt, preserving sentence-level citations.\n", + "\n", + "Evidence Extraction\n", + "We evaluated two extractors: (i) We prompted gemma-3-27b-it to explicitly label sentences as relevant via a step-by-step process. (ii) We finetuned a Clinical ModernBERT classifier (Lee et al.,\n", + "\n", + "Figure 1: System overview. The pipeline first selects relevant sentences and then generates a question-specific answer using a dynamic template.\n", + "\n", + "\n", + "\n", + "2025), trained on our synthetic data (Section 3.3). It independently evaluates each sentence in context (question + patient narrative). Lee et al. (2025) is a variant of ModernBERT (Warner et al., 2024) adapted specifically for biomedical and clinical text. Clinical ModernBERT supports extended input sequences (up to 8,192 tokens) and includes domain-specific vocabulary enhancements, making it particularly suitable for handling long clinical narratives. To provide additional context during classification, we included one sentence before and after the target sentence, forming a passage of up to three sentences. We chose a window size of one sentence before and after the target based on preliminary experimentation. The target sentence was explicitly marked with [START] and [END] tokens. The full input was structured using the standard BERT classification format. During fine-tuning, we merged essential and supplementary labels into a single positive class. We addressed class imbalance using weighted binary cross-entropy loss. We trained for 3 epochs (batch size 32, learning rate 2e-5), with gradient clipping and early stopping based on F1 score.\n", + "\n", + "Synthetic Data Generation\n", + "Due to the scarcity of publicly available annotated data for sentence-level relevance classification, we constructed a synthetic dataset tailored specifically to the ArchEHR-QA task. Although the official development set contains labeled sentences, it is lim- ited to 428 sentences across only 20 question-note pairs. Initial experiments using external resources like RAGBench (Friel et al., 2025) and PubMedQAderived corpora (Jin et al., 2019) showed poor transfer performance, emphasizing the need for taskspecific synthetic data.\n", + "\n", + "Wegenerated synthetic data via few-shot prompting with gemma-3-27b-it . Each prompt provided dynamic examples from the development set to ensure diversity. The LLM generated synthetic instances comprising de-identified clinical note excerpts, patient narratives, clinician-authored questions, and binary relevance labels. This approach yielded 3915 synthetic notes. We varied the few-shot examples across multiple runs, as static prompting resulted in repetitive outputs. This variation greatly increased lexical and semantic diversity, aligning with other work in synthetic data generation (Li et al., 2023; Tang et al., 2023; Xu et al., 2024). Ultimately, selecting each sentence with their relevance from the note excerpts, we constructed a comprehensive dataset of 58k synthetic training examples, each labeled at the sentence level, which formed the training set for our Clinical ModernBERT classifier. Table 1 shows an illustrative training instance.\n", + "\n", + "Table 1: An example model input for our training.\n", + "\n", + "| QUESTION | Patient narrative : My husband, a 72-year-old with a history of COPD, was admitted for worsening shortness of breath. He's been on home oxygen for years, but it wasn't helping this time. He also developed some swelling in his ankles. He seems a little confused today... Clinician question : What is the likely cause of the patient's ankle edema and what was done to address it? |\n", + "|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "| SENTENCE | Adiuretic, furosemide 40mg PO daily, was initiated to address the lower extremity edema, which was at- tributed to both underlying heart failure and fluid re- tention secondary to COPD exacerbation. [START] Echocardiogram revealed mild left ventricular dys- function with an estimated ejection fraction of 45%. [END] Renal function was monitored closely, and remained stable throughout hospitalization. |\n", + "| LABEL | RELEVANT |\n", + "\n", + "Answer Generation\n", + "The answer generation module dynamically creates a template using the LLM ( gemma-3-27b-it ) based on the clinician's question, the selected evidence sentences, and the clinical note context. After the template generation step, we directly insert the extracted evidence sentences verbatim into the generated template, referencing sentence IDs ex-\n", + "\n", + "The emergency salvage repair was performed due to: -He was transferred to the hospital on 2025-01-20 for emergent repair of his ruptured thoracoabdominal aortic aneurysm. |1| -He was immediately taken to the operating room where he underwent an emergent salvage repair of ruptured thoracoabdominal aortic aneurysm with a 34-mm Dacron tube graft using deep hypothermic circulatory arrest. |2| -Thoracoabdominal wound healing well with exception of very small open area mid-wound that is ~1 cm around and 0.5 cm deep, no surrounding erythema. |8|\n", + "\n", + "Figure 2: Example answer generated by our verbatim method, inserting evidence sentences verbatim into a dynamically generated template.\n", + "\n", + "He was transferred to the hospital on 2025-01-20 for emergent repair of his ruptured thoracoabdominal aortic aneurysm |1|. He underwent an emergent salvage repair with a 34-mm Dacron tube graft using deep hypothermic circulatory arrest |2|. See also: |8|\n", + "\n", + "Figure 3: Concise answer produced by our summarization step to comply with the 75-word limit.\n", + "\n", + "plicitly. An example filled template generated by our pipeline is shown in Figure 2.\n", + "\n", + "If the filled answer exceeds the 75-word constraint of the task, we use an additional summarization prompt to rewrite the answer more concisely, ensuring all selected evidence remains cited and intact. An example summarization of the answer from Figure 2 is illustrated in Figure 3.\n" + ] + } + ], + "source": [ + "print(f\"📋 Hierarchy Structure (first 10 chunks):\")\n", + "for i, chunk in enumerate(hierarchical_chunks[:10]):\n", + " print()\n", + " #print(f\"{i+1:2d}. {chunk}\")\n", + " print(chunk.metadata[\"section_title\"])\n", + " print(chunk.content)\n", + " if chunk.child_chunk_ids:\n", + " print(f\" └── Has {len(chunk.child_chunk_ids)} children\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 114, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'section_title': 'Introduction', 'section_number': '1', 'hierarchy_level': 1}\n", + "Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains for information extraction and generation tasks. In medicine, a typical use case involves clinicians asking questions based on a patient's electronic health record (EHR) notes, rather than manually sifting through lengthy notes, which can be time-consuming. However, in practice, RAG and QA pipelines often misalign evidence and produce incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024). We argue that a reliable QA system should guarantee complete traceability of answers. To tackle this problem, we propose a verbatim pipeline that clearly separates extraction and generation to mitigate hallucinations (other errors may still occur):\n", + "\n", + "- Sentence-level extraction , using either zeroshot LLMs or supervised ModernBERT classifiers.\n", + "- Template-constrained generation , dynamically creating answer templates filled exclu-\n", + "\n", + "We participated in the ArchEHR-QA 2025 shared task on grounded question answering (QA) from electronic health records (EHRs). Our approach involved (i) utilizing a zero-shot gemma-3-27b-it 1 LLM(Team, 2025) and (ii) generating synthetic data for sentence extraction from EHRs to train a compact extractor. For this purpose, we trained a Clinical ModernBERT classifier (Lee et al., 2025; Warner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were then fed into the same LLM template generator. Our solution achieved a score of 42.01% , ranking in the top 10 for core metrics, and surpassed the organizer's 70B-parameter Llama3.3 baseline by a large margin.\n", + "\n", + "Our contributions include a modular, traceable QA architecture that mitigates hallucinations, a method to generate synthetic EHR question-answer corpus and train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License.\n", + "\n", + "The remainder of the paper discusses background (Section 2), method (Section 3), and evaluation (Section 4).\n" + ] + } + ], + "source": [ + "for i, chunk in enumerate(hierarchical_chunks[:10]):\n", + " print(chunk.metadata)\n", + " print(chunk.content)\n", + " break" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "**Test: Find children of a parent**" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 115, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "👨‍👧‍👦 Parent-Child Relationship Test:\n", + "Parent: 2 Background\n", + " └── Child: 2.1 Dataset\n", + " └── Child: 2.2 Limitations of Standard RAG\n", + " └── Child: 2.3 Synthetic Training Data\n" + ] + } + ], + "source": [ + "print(f\"👨‍👧‍👦 Parent-Child Relationship Test:\")\n", + "for chunk in hierarchical_chunks[:5]:\n", + " if chunk.child_chunk_ids:\n", + " print(f\"Parent: {chunk.section_number} {chunk.metadata.get('section_title', '')}\")\n", + " for child_id in chunk.child_chunk_ids:\n", + " child_chunk = next((c for c in hierarchical_chunks if c.id == child_id), None)\n", + " if child_chunk:\n", + " print(f\" └── Child: {child_chunk.section_number} {child_chunk.metadata.get('section_title', '')}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 9: Docling HierarchicalChunker" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Deep Analysis of Docling DocChunks:\n", + "\n", + "--- DocChunk 1 Deep Dive ---\n", + "Raw text (36 chars):\n", + "'Ádám Kovács KR Labs kovacs@krlabs.eu...'\n", + "Meta type: \n", + "Doc items count: 1\n", + " Item 0: TextItem\n", + " Text: 'Ádám Kovács KR Labs kovacs@krlabs.eu...'\n", + " Parent: cref='#/body'\n", + "\n", + "--- DocChunk 2 Deep Dive ---\n", + "Raw text (33 chars):\n", + "'TU Wien paul.schmitt@tuwien.ac.at...'\n", + "Meta type: \n", + "Doc items count: 1\n", + " Item 0: TextItem\n", + " Text: 'TU Wien paul.schmitt@tuwien.ac.at...'\n", + " Parent: cref='#/body'\n", + "\n", + "--- DocChunk 3 Deep Dive ---\n", + "Raw text (45 chars):\n", + "'Gábor Recski KR Labs TU Wien recski@krlabs.eu...'\n", + "Meta type: \n", + "Doc items count: 1\n", + " Item 0: TextItem\n", + " Text: 'Gábor Recski KR Labs TU Wien recski@krlabs.eu...'\n", + " Parent: cref='#/body'\n", + "\n", + "--- DocChunk 4 Deep Dive ---\n", + "Raw text (678 chars):\n", + "'We present a lightweight, domain-agnostic verbatim pipeline for evidence-grounded question answering. Our pipeline operates in two steps: first, a sentence-level extractor flags relevant note sentences using either zero-shot LLM prompts or supervised ModernBERT classifiers. Next, an LLM drafts a que...'\n", + "Meta type: \n", + "Doc items count: 1\n", + " Item 0: TextItem\n", + " Text: 'We present a lightweight, domain-agnostic verbatim pipeline for evidence-grounded question answering...'\n", + " Parent: cref='#/body'\n", + "\n", + "--- DocChunk 5 Deep Dive ---\n", + "Raw text (66 chars):\n", + "'sively with verbatim sentences selected from the extraction phase....'\n", + "Meta type: \n", + "Doc items count: 1\n", + " Item 0: TextItem\n", + " Text: 'sively with verbatim sentences selected from the extraction phase....'\n", + " Parent: cref='#/body'\n", + "\n", + "🔢 Section Number Analysis:\n", + "❌ No clear section numbering patterns found\n", + "May need to use content-based or semantic hierarchy\n" + ] + } + ], + "source": [ + "# Follow-up: Examine DocChunk structure and content\n", + "if 'chunks' in locals() and chunks:\n", + " print(\"🔍 Deep Analysis of Docling DocChunks:\")\n", + " \n", + " for i, chunk in enumerate(chunks[:5]):\n", + " print(f\"\\n--- DocChunk {i+1} Deep Dive ---\")\n", + " \n", + " # Access the text content directly\n", + " chunk_text = chunk.text if hasattr(chunk, 'text') else str(chunk)\n", + " print(f\"Raw text ({len(chunk_text)} chars):\")\n", + " print(f\"'{chunk_text[:300]}...'\")\n", + " \n", + " # Check chunk metadata\n", + " if hasattr(chunk, 'meta'):\n", + " print(f\"Meta type: {type(chunk.meta)}\")\n", + " if hasattr(chunk.meta, 'doc_items'):\n", + " print(f\"Doc items count: {len(chunk.meta.doc_items)}\")\n", + " \n", + " # Look at document items for structure\n", + " for j, item in enumerate(chunk.meta.doc_items[:3]):\n", + " print(f\" Item {j}: {type(item).__name__}\")\n", + " if hasattr(item, 'text'):\n", + " print(f\" Text: '{item.text[:100]}...'\")\n", + " if hasattr(item, 'parent'):\n", + " print(f\" Parent: {item.parent}\")\n", + " if hasattr(item, 'level') or hasattr(item, 'hierarchy_level'):\n", + " level = getattr(item, 'level', getattr(item, 'hierarchy_level', None))\n", + " print(f\" Level: {level}\")\n", + " \n", + " # Look for section patterns in the text\n", + " lines = chunk_text.split('\\n')\n", + " for line_num, line in enumerate(lines[:10]):\n", + " line = line.strip()\n", + " # Look for section patterns like \"1 Introduction\", \"2.1 Dataset\"\n", + " import re\n", + " if re.match(r'^\\d+(\\.\\d+)*\\s+[A-Z]', line):\n", + " print(f\" 📍 Section pattern found: '{line}'\")\n", + " elif re.match(r'^[A-Z][A-Za-z\\s]+$', line) and len(line) < 50:\n", + " print(f\" 📝 Possible header: '{line}'\")\n", + "\n", + " # Test: Can we detect hierarchy from section numbering?\n", + " print(f\"\\n🔢 Section Number Analysis:\")\n", + " section_patterns = []\n", + " \n", + " for chunk in chunks:\n", + " chunk_text = chunk.text if hasattr(chunk, 'text') else str(chunk)\n", + " lines = chunk_text.split('\\n')\n", + " \n", + " for line in lines:\n", + " line = line.strip()\n", + " # Match patterns like \"1 Introduction\", \"2.1 Dataset\", \"2.1.1 Details\"\n", + " match = re.match(r'^(\\d+(?:\\.\\d+)*)\\s+([A-Z][A-Za-z\\s]+)', line)\n", + " if match:\n", + " section_num = match.group(1)\n", + " section_title = match.group(2)\n", + " level = len(section_num.split('.'))\n", + " section_patterns.append((level, section_num, section_title))\n", + " \n", + " if section_patterns:\n", + " print(\"✅ Found section number hierarchy:\")\n", + " for level, num, title in section_patterns[:10]:\n", + " indent = \" \" * (level - 1)\n", + " print(f\"{indent}Level {level}: {num} {title}\")\n", + " \n", + " print(f\"\\n🚀 SOLUTION: Use section numbering for hierarchy!\")\n", + " print(\"We can create hierarchical chunks based on section numbers:\")\n", + " print(\" Level 1: 1, 2, 3, 4...\")\n", + " print(\" Level 2: 2.1, 2.2, 3.1...\")\n", + " print(\" Level 3: 2.1.1, 2.1.2...\")\n", + " else:\n", + " print(\"❌ No clear section numbering patterns found\")\n", + " print(\"May need to use content-based or semantic hierarchy\")\n", + "else:\n", + " print(\"❌ No chunks available for deep analysis\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 9.5: Docling Conversion Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Examining Raw Docling Conversion:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "🔍 Method 1: Standard export_to_markdown():\n", + "Length: 25746 characters\n", + "First 2000 characters:\n", + "------------------------------------------------------------\n", + "\"## KR Labs at ArchEHR-QA 2025: A Verbatim Approach for Evidence-Based Question Answering\\n\\nÁdám Kovács KR Labs kovacs@krlabs.eu\\n\\n## Paul Schmitt\\n\\nTU Wien paul.schmitt@tuwien.ac.at\\n\\nGábor Recski KR Labs TU Wien recski@krlabs.eu\\n\\n## Abstract\\n\\nWe present a lightweight, domain-agnostic verbatim pipeline for evidence-grounded question answering. Our pipeline operates in two steps: first, a sentence-level extractor flags relevant note sentences using either zero-shot LLM prompts or supervised ModernBERT classifiers. Next, an LLM drafts a questionspecific template, which is filled verbatim with sentences from the extraction step. This prevents hallucinations and ensures traceability. In the ArchEHR-QA 2025 shared task, our system scored 42.01%, ranking top-10 in core metrics and outperforming the organiser's 70B-parameter Llama-3.3 baseline. We publicly release our code and inference scripts under an MIT license.\\n\\nsively with verbatim sentences selected from the extraction phase.\\n\\n## 1 Introduction\\n\\nModern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains for information extraction and generation tasks. In medicine, a typical use case involves clinicians asking questions based on a patient's electronic health record (EHR) notes, rather than manually sifting through lengthy notes, which can be time-consuming. However, in practice, RAG and QA pipelines often misalign evidence and produce incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024). We argue that a reliable QA system should guarantee complete traceability of answers. To tackle this problem, we propose a verbatim pipeline that clearly separates extraction and generation to mitigate hallucinations (other errors may still occur):\\n\\n- Sentence-level extraction , using either zeroshot LLMs or supervised ModernBERT classifiers.\\n- Template-constrained generation , dynamically creating answer templates filled exc\"\n", + "------------------------------------------------------------\n", + "\n", + "🔍 Method 2: Available export methods:\n", + "Available methods: ['_export_to_indented_text', 'export_to_dict', 'export_to_doctags', 'export_to_document_tokens', 'export_to_element_tree', 'export_to_html', 'export_to_markdown', 'export_to_text']\n", + "\n", + "🔍 Method 3: Document structure:\n", + "Document type: \n", + "Document attributes: ['add_code', 'add_document', 'add_form', 'add_formula', 'add_group', 'add_heading', 'add_inline_group', 'add_key_values', 'add_list_group', 'add_list_item']...\n", + "\n", + "🔍 Method 4: Document body structure:\n", + "Body type: \n", + "Body attributes: ['children', 'construct', 'content_layer', 'copy', 'dict', 'from_orm', 'get_ref', 'json', 'label', 'model_computed_fields']...\n", + "\n", + "💾 Saved full content to: /Users/paulschmitt/DataspellProjects/verbatim-rag/debug_converted_content.md\n", + "You can open this file to see the complete converted content!\n", + "\n", + "🔍 Method 6: Header pattern analysis in raw content:\n", + "Found 18 potential headers:\n", + " Line 1: ## KR Labs at ArchEHR-QA 2025: A Verbatim Approach for Evidence-Based Question Answering\n", + " Line 5: ## Paul Schmitt\n", + " Line 11: ## Abstract\n", + " Line 17: ## 1 Introduction\n", + " Line 30: ## 2 Background\n", + " Line 32: ## 2.1 Dataset\n", + " Line 42: ## 2.2 Limitations of Standard RAG\n", + " Line 46: ## 2.3 Synthetic Training Data\n", + " Line 50: ## 3 Method\n", + " Line 52: ## 3.1 System Overview\n", + " Line 56: ## 3.2 Evidence Extraction\n", + " Line 66: ## 3.3 Synthetic Data Generation\n", + " Line 79: ## 3.4 Answer Generation\n", + " Line 95: ## 4 Evaluation\n", + " Line 124: ## 5 Ethical Considerations\n", + " ... and 3 more\n" + ] + } + ], + "source": [ + "# View the raw converted markdown content to see what Docling actually produces\n", + "from docling.document_converter import DocumentConverter\n", + "\n", + "print(\"📄 Examining Raw Docling Conversion:\")\n", + "\n", + "pdf_path = project_root / \"data\" / \"acl_papers\" / \"VERBATIM_RAG_ACL.pdf\"\n", + "\n", + "if pdf_path.exists():\n", + " converter = DocumentConverter()\n", + " result = converter.convert(str(pdf_path))\n", + " \n", + " # Method 1: Standard markdown export (what DocumentProcessor uses)\n", + " print(\"\\n🔍 Method 1: Standard export_to_markdown():\")\n", + " markdown_content = result.document.export_to_markdown()\n", + " print(f\"Length: {len(markdown_content)} characters\")\n", + " print(\"First 2000 characters:\")\n", + " print(\"-\" * 60)\n", + " print(repr(markdown_content[:2000])) # Use repr to see actual characters\n", + " print(\"-\" * 60)\n", + " \n", + " # Method 2: Check if there are other export options\n", + " print(f\"\\n🔍 Method 2: Available export methods:\")\n", + " export_methods = [method for method in dir(result.document) if 'export' in method.lower()]\n", + " print(f\"Available methods: {export_methods}\")\n", + " \n", + " # Method 3: Look at document structure\n", + " print(f\"\\n🔍 Method 3: Document structure:\")\n", + " print(f\"Document type: {type(result.document)}\")\n", + " doc_attrs = [attr for attr in dir(result.document) if not attr.startswith('_')]\n", + " print(f\"Document attributes: {doc_attrs[:10]}...\")\n", + " \n", + " # Method 4: Try to access raw document elements\n", + " if hasattr(result.document, 'body'):\n", + " print(f\"\\n🔍 Method 4: Document body structure:\")\n", + " print(f\"Body type: {type(result.document.body)}\")\n", + " body_attrs = [attr for attr in dir(result.document.body) if not attr.startswith('_')]\n", + " print(f\"Body attributes: {body_attrs[:10]}...\")\n", + " \n", + " # Method 5: Save to file for inspection\n", + " output_file = project_root / \"debug_converted_content.md\"\n", + " with open(output_file, 'w', encoding='utf-8') as f:\n", + " f.write(markdown_content)\n", + " print(f\"\\n💾 Saved full content to: {output_file}\")\n", + " print(\"You can open this file to see the complete converted content!\")\n", + " \n", + " # Method 6: Look for header patterns in the raw content\n", + " print(f\"\\n🔍 Method 6: Header pattern analysis in raw content:\")\n", + " lines = markdown_content.split('\\n')\n", + " header_lines = []\n", + " \n", + " for i, line in enumerate(lines):\n", + " line_stripped = line.strip()\n", + " if line_stripped.startswith('#'):\n", + " header_lines.append((i+1, line))\n", + " elif re.match(r'^\\d+(\\.\\d+)*\\s+[A-Z]', line_stripped):\n", + " header_lines.append((i+1, f\"[NUMBER] {line}\"))\n", + " \n", + " print(f\"Found {len(header_lines)} potential headers:\")\n", + " for line_num, header in header_lines[:15]: # Show first 15\n", + " print(f\" Line {line_num}: {header}\")\n", + " \n", + " if len(header_lines) > 15:\n", + " print(f\" ... and {len(header_lines) - 15} more\")\n", + "\n", + "else:\n", + " print(f\"❌ PDF not found: {pdf_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 10: View Raw Converted Content" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📋 Test Summary:\n", + "==================================================\n", + "\n", + "✅ Successfully tested:\n", + " - Basic DocumentProcessor creation\n", + " - File processing with different chunking strategies\n", + " - Factory methods for specialized processors\n", + " - Document structure analysis\n", + " - Document serialization/deserialization\n", + "\n", + "🚀 Ready for hierarchical chunking implementation!\n", + "\n", + "Next steps for hierarchical chunking:\n", + " 1. Extend Chunk class with parent_chunk_id field\n", + " 2. Modify DocumentProcessor to create hierarchical relationships\n", + " 3. Update VerbatimIndex to handle hierarchical chunks\n", + " 4. Add hierarchical search capabilities\n", + "\n", + "💡 Insights for hierarchical chunking:\n", + " - Document has 18 headers for natural hierarchy\n", + " - Maximum header depth: 2 levels\n", + " - Can use markdown structure for parent-child relationships\n", + "\n", + "🧹 Cleanup...\n", + "Removed test file: /Users/paulschmitt/DataspellProjects/verbatim-rag/data/acl_papers/VERBATIM_RAG_ACL.pdf\n" + ] + } + ], + "source": [ + "print(\"📋 Test Summary:\")\n", + "print(\"=\" * 50)\n", + "print(\"\\n✅ Successfully tested:\")\n", + "print(\" - Basic DocumentProcessor creation\")\n", + "print(\" - File processing with different chunking strategies\")\n", + "print(\" - Factory methods for specialized processors\")\n", + "print(\" - Document structure analysis\")\n", + "print(\" - Document serialization/deserialization\")\n", + "\n", + "print(\"\\n🚀 Ready for hierarchical chunking implementation!\")\n", + "print(\"\\nNext steps for hierarchical chunking:\")\n", + "print(\" 1. Extend Chunk class with parent_chunk_id field\")\n", + "print(\" 2. Modify DocumentProcessor to create hierarchical relationships\")\n", + "print(\" 3. Update VerbatimIndex to handle hierarchical chunks\")\n", + "print(\" 4. Add hierarchical search capabilities\")\n", + "\n", + "print(\"\\n💡 Insights for hierarchical chunking:\")\n", + "if 'headers' in locals() and headers:\n", + " print(f\" - Document has {len(headers)} headers for natural hierarchy\")\n", + " max_level = max(level for level, _ in headers)\n", + " print(f\" - Maximum header depth: {max_level} levels\")\n", + " print(\" - Can use markdown structure for parent-child relationships\")\n", + "else:\n", + " print(\" - Document lacks clear hierarchical structure\")\n", + " print(\" - Consider semantic-based or size-based hierarchical chunking\")\n", + "\n", + "print(\"\\n🧹 Cleanup...\")\n", + "if test_file_path.exists():\n", + " test_file_path.unlink()\n", + " print(f\"Removed test file: {test_file_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/test_verbatim_rag_integration.ipynb b/notebooks/test_verbatim_rag_integration.ipynb new file mode 100644 index 0000000..28f3978 --- /dev/null +++ b/notebooks/test_verbatim_rag_integration.ipynb @@ -0,0 +1,507 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# VerbatimRAG + Context-Enriched Integration Test\n", + "\n", + "This notebook tests the full integration of ContextEnrichedProcessor with the VerbatimRAG system." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project root: /Users/paulschmitt/DataspellProjects/verbatim-rag\n", + "✅ Setup complete\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "# Fix OpenMP conflict\n", + "os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'\n", + "os.environ['OPENAI_API_KEY'] = ''\n", + "\n", + "# Add project root to path\n", + "project_root = Path().absolute().parent\n", + "sys.path.append(str(project_root))\n", + "\n", + "print(f\"Project root: {project_root}\")\n", + "print(\"✅ Setup complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Imports successful\n" + ] + } + ], + "source": [ + "from verbatim_rag.ingestion.context_enriched_processor import ContextEnrichedProcessor\n", + "from verbatim_rag.core import VerbatimRAG\n", + "from verbatim_rag.index import VerbatimIndex\n", + "from pprint import pprint\n", + "\n", + "print(\"✅ Imports successful\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 1: Process Document with Context Enrichment" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Processing document with context enrichment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Document processed successfully!\n", + " Title: Verbatim RAG ACL Paper\n", + " Chunks: 78\n", + " Content type: DocumentType.PDF\n", + " Context-enriched chunks: 78\n" + ] + } + ], + "source": [ + "# Test document path\n", + "pdf_path = project_root / \"data\" / \"acl_papers\" / \"VERBATIM_RAG_ACL.pdf\"\n", + "\n", + "# Create context-enriched processor optimized for RAG\n", + "processor = ContextEnrichedProcessor.for_rag(\n", + " chunk_size=384, # Smaller chunks for better retrieval\n", + " overlap=50\n", + ")\n", + "\n", + "# Process document\n", + "print(\"📄 Processing document with context enrichment...\")\n", + "document = processor.process_file(pdf_path, title=\"Verbatim RAG ACL Paper\")\n", + "\n", + "print(f\"✅ Document processed successfully!\")\n", + "print(f\" Title: {document.title}\")\n", + "print(f\" Chunks: {len(document.chunks)}\")\n", + "print(f\" Content type: {document.content_type}\")\n", + "\n", + "# Show chunk types\n", + "enriched_chunks = [c for c in document.chunks if hasattr(c, 'section_path')]\n", + "print(f\" Context-enriched chunks: {len(enriched_chunks)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 2: Create VerbatimIndex with Context-Enriched Chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🗂️ Creating VerbatimIndex with context-enriched chunks...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/milvus_lite/__init__.py:15: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", + " from pkg_resources import DistributionNotFound, get_distribution\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📝 Adding document to index...\n", + "✅ Index created successfully!\n", + " Vector store type: LocalMilvusStore\n", + " Embedding provider: SentenceTransformersProvider\n" + ] + } + ], + "source": "# Create VerbatimIndex with context-enriched chunks\nprint(\"🗂️ Creating VerbatimIndex with context-enriched chunks...\")\n\n# Initialize index with OpenAI embeddings and FAISS vector store\nindex = VerbatimIndex(dense_model=\"all-MiniLM-L6-v2\")\n\n# Add the context-enriched document to the index (using add_documents method)\nprint(\"📝 Adding document to index...\")\nindex.add_documents([document])\n\nprint(f\"✅ Index created successfully!\")\nprint(f\" Vector store type: {type(index.vector_store).__name__}\")\nprint(f\" Embedding provider: {type(index.dense_provider).__name__}\")" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 3: Initialize VerbatimRAG System" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🤖 Initializing VerbatimRAG system...\n", + "✅ VerbatimRAG initialized successfully!\n", + " Index working: Found 3 results for test query\n" + ] + } + ], + "source": "# Initialize VerbatimRAG with the context-enriched index\nprint(\"🤖 Initializing VerbatimRAG system...\")\n\nrag = VerbatimRAG(\n index=index # Pass the index as required parameter\n)\n\nprint(\"✅ VerbatimRAG initialized successfully!\")\n\n# Test that the index is working by doing a simple search\ntry:\n test_results = index.search(\"verbatim\", k=3)\n print(f\" Index working: Found {len(test_results)} results for test query\")\nexcept Exception as e:\n print(f\" Index test failed: {e}\")" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 4: Query with Context-Enriched Retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Testing queries with context-enriched retrieval...\n", + "\n", + "--- Query 1 ---\n", + "Question: What dataset was used in this study?\n", + "Answer: Thanks for your question! Based on the documents, here are the key points:\n", + "\n", + "• Clinical ModernBERT\n", + "• EHR snippets, clinician-style questions, and sentence relevance annotations\n", + "• LLM (gemma-3-27b-it)\n", + "•...\n", + "Source documents: 5 documents cited\n", + "Retrieved documents:\n", + " 1. Document: ''\n", + " Highlights: 2 spans\n", + " - Clinical ModernBERT...\n", + " - Clinical ModernBERT...\n", + " 2. Document: ''\n", + " Highlights: 1 spans\n", + " - EHR snippets, clinician-style questions, and sentence relevance annotations...\n", + " 3. Document: ''\n", + " Highlights: 1 spans\n", + " - LLM (gemma-3-27b-it)...\n", + "--------------------------------------------------\n", + "\n", + "--- Query 2 ---\n", + "Question: What are the limitations of standard RAG systems?\n", + "Answer: Thanks for your question! Based on the documents, here are the key points:\n", + "\n", + "• Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contradictory information\n", + "• E...\n", + "Source documents: 5 documents cited\n", + "Retrieved documents:\n", + " 1. Document: ''\n", + " Highlights: 3 spans\n", + " - Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contrad...\n", + " - or classifiers trained on hallucination corpora such as RAGTruth...\n", + " 2. Document: ''\n", + " Highlights: 4 spans\n", + " - meaning the purely verbatim property was not consistently maintained across all answers...\n", + " - our approach often required summarization after the initial verbatim insertion step...\n", + " 3. Document: ''\n", + " Content preview: Section: 7 Conclusion | generation. Scientific Data , 10(1):586.\n", + "\n", + "Tianyi Zhang, Varsha Kishore, Feli...\n", + "--------------------------------------------------\n", + "\n", + "--- Query 3 ---\n", + "Question: How does the method work?\n", + "Answer: Thanks for your question! Based on the documents, here are the key points:\n", + "\n", + "• method\n", + "• Answer Generation\n", + "• An example summarization of the answer from Figure 2 is illustrated in Figure 3.\n", + "• Synthetic ...\n", + "Source documents: 5 documents cited\n", + "Retrieved documents:\n", + " 1. Document: ''\n", + " Highlights: 1 spans\n", + " - method...\n", + " 2. Document: ''\n", + " Highlights: 2 spans\n", + " - An example summarization of the answer from Figure 2 is illustrated in Figure 3....\n", + " - Answer Generation...\n", + " 3. Document: ''\n", + " Highlights: 1 spans\n", + " - Synthetic Data Generation...\n", + "--------------------------------------------------\n", + "\n", + "--- Query 4 ---\n", + "Question: What evaluation metrics were used?\n", + "Answer: Thanks for your question! Based on the documents, here are the key points:\n", + "\n", + "• Table 2 summarizes these metrics\n", + "• factuality recall (56.8% strict, 56.6% lenient)\n", + "• Relevance is evaluating how closely g...\n", + "Source documents: 5 documents cited\n", + "Retrieved documents:\n", + " 1. Document: ''\n", + " Highlights: 2 spans\n", + " - factuality recall (56.8% strict, 56.6% lenient)...\n", + " - Table 2 summarizes these metrics...\n", + " 2. Document: ''\n", + " Highlights: 3 spans\n", + " - BERTScore (Zhang et al., 2020)...\n", + " - MEDCON (Yim et al., 2023)...\n", + " 3. Document: ''\n", + " Highlights: 13 spans\n", + " - F1...\n", + " - F1...\n", + "--------------------------------------------------\n", + "\n", + "--- Query 5 ---\n", + "Question: What are the main contributions of this work?\n", + "Answer: Thanks for your question! Based on the documents, here are the key points:\n", + "\n", + "• method (Section 3)\n", + "• evaluation (Section 4)\n", + "• Section: 3 Method\n", + "• Subsection: 3.1 System Overview\n", + "• extracted sentences\n", + "• ...\n", + "Source documents: 5 documents cited\n", + "Retrieved documents:\n", + " 1. Document: ''\n", + " Highlights: 2 spans\n", + " - evaluation (Section 4)...\n", + " - method (Section 3)...\n", + " 2. Document: ''\n", + " Highlights: 4 spans\n", + " - If exceeding 75 words, answers are compressed via a summarization prompt, preserving sentence-level ...\n", + " - Subsection: 3.1 System Overview...\n", + " 3. Document: ''\n", + " Highlights: 1 spans\n", + " - Are self-explanations from large language models faithful?...\n", + "--------------------------------------------------\n" + ] + } + ], + "source": "# Test queries that should benefit from hierarchical context\ntest_queries = [\n \"What dataset was used in this study?\",\n \"What are the limitations of standard RAG systems?\", \n \"How does the method work?\",\n \"What evaluation metrics were used?\",\n \"What are the main contributions of this work?\"\n]\n\nprint(\"🔍 Testing queries with context-enriched retrieval...\")\n\nfor i, query in enumerate(test_queries, 1):\n print(f\"\\n--- Query {i} ---\")\n print(f\"Question: {query}\")\n \n try:\n # Get response from VerbatimRAG\n response = rag.query(question=query)\n \n print(f\"Answer: {response.answer[:200]}...\")\n print(f\"Source documents: {len(response.documents)} documents cited\")\n \n # Show retrieved documents with their context\n print(\"Retrieved documents:\")\n for j, doc in enumerate(response.documents[:3]):\n print(f\" {j+1}. Document: '{doc.title}'\")\n if hasattr(doc, 'highlights') and doc.highlights:\n print(f\" Highlights: {len(doc.highlights)} spans\")\n for k, highlight in enumerate(doc.highlights[:2]):\n print(f\" - {highlight.text[:100]}...\")\n else:\n print(f\" Content preview: {doc.content[:100] if hasattr(doc, 'content') else 'N/A'}...\")\n \n except Exception as e:\n print(f\"❌ Error: {e}\")\n \n print(\"-\" * 50)" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 5: Compare Context vs Non-Context Retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test specific query to compare context benefits\n", + "query = \"What are the limitations mentioned in the paper?\"\n", + "\n", + "print(f\"🔬 Comparative Analysis: '{query}'\")\n", + "print(\"=\" * 60)\n", + "\n", + "# Retrieve top chunks\n", + "try:\n", + " results = index.search(query, k=10)\n", + " \n", + " print(f\"\\n📊 Retrieved {len(results)} chunks:\")\n", + " \n", + " for i, (chunk, score) in enumerate(results[:5]):\n", + " print(f\"\\n{i+1}. Score: {score:.3f}\")\n", + " \n", + " if hasattr(chunk, 'section_path') and chunk.section_path:\n", + " context = \" → \".join(chunk.section_path)\n", + " print(f\" Context: {context}\")\n", + " print(f\" Content: {chunk.content[:150]}...\")\n", + " \n", + " # Show how context helped\n", + " enhanced = chunk.get_enhanced_content()\n", + " context_match = \"limitations\" in chunk.context_string.lower()\n", + " content_match = \"limitations\" in chunk.content.lower()\n", + " \n", + " match_type = []\n", + " if context_match: match_type.append(\"Context\")\n", + " if content_match: match_type.append(\"Content\")\n", + " \n", + " print(f\" Match type: {' + '.join(match_type) if match_type else 'Other'}\")\n", + " else:\n", + " print(f\" Content: {chunk.content[:150]}...\")\n", + " \n", + "except Exception as e:\n", + " print(f\"❌ Search error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 6: Span Extraction with Context" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test span extraction to ensure context doesn't interfere\n", + "query = \"What evaluation metrics were used?\"\n", + "\n", + "print(f\"🎯 Span Extraction Test: '{query}'\")\n", + "print(\"=\" * 50)\n", + "\n", + "try:\n", + " # Get full response with span extraction\n", + " response = rag.query(\n", + " question=query,\n", + " max_chunks=3,\n", + " extract_spans=True\n", + " )\n", + " \n", + " print(f\"\\n📝 Answer: {response.answer}\")\n", + " print(f\"\\n📚 Citations ({len(response.citations)}):\")\n", + " \n", + " for i, citation in enumerate(response.citations):\n", + " chunk = index.get_chunk_by_id(citation.chunk_id)\n", + " \n", + " print(f\"\\n{i+1}. Citation:\")\n", + " if chunk and hasattr(chunk, 'section_path'):\n", + " context = \" → \".join(chunk.section_path)\n", + " print(f\" Section: {context}\")\n", + " \n", + " print(f\" Extracted span: {citation.text}\")\n", + " print(f\" Relevance: {citation.relevance_score:.3f}\")\n", + " \n", + " if hasattr(citation, 'span_start') and hasattr(citation, 'span_end'):\n", + " print(f\" Span position: {citation.span_start}-{citation.span_end}\")\n", + " \n", + "except Exception as e:\n", + " print(f\"❌ Span extraction error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test Results Summary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"📋 VerbatimRAG + Context-Enriched Integration Summary\")\n", + "print(\"=\" * 60)\n", + "\n", + "# Collect statistics\n", + "total_chunks = len(document.chunks)\n", + "enriched_chunks = len([c for c in document.chunks if hasattr(c, 'section_path')])\n", + "index_chunks = len(index.get_all_chunks())\n", + "\n", + "print(f\"\\n✅ Integration Test Results:\")\n", + "print(f\" 🔄 Document processing: SUCCESS\")\n", + "print(f\" 📊 Index creation: SUCCESS\")\n", + "print(f\" 🤖 VerbatimRAG initialization: SUCCESS\")\n", + "print(f\" 🔍 Query processing: {'SUCCESS' if 'response' in locals() else 'PENDING'}\")\n", + "\n", + "print(f\"\\n📈 Statistics:\")\n", + "print(f\" 📄 Total chunks: {total_chunks}\")\n", + "print(f\" 🏷️ Context-enriched: {enriched_chunks} ({enriched_chunks/total_chunks*100:.1f}%)\")\n", + "print(f\" 🗂️ Indexed chunks: {index_chunks}\")\n", + "\n", + "# Show section distribution\n", + "sections = {}\n", + "for chunk in document.chunks:\n", + " if hasattr(chunk, 'section_path') and chunk.section_path:\n", + " main_section = chunk.section_path[0]\n", + " sections[main_section] = sections.get(main_section, 0) + 1\n", + "\n", + "print(f\"\\n🌳 Section Coverage ({len(sections)} sections):\")\n", + "for section, count in sorted(sections.items()):\n", + " print(f\" {section}: {count} chunks\")\n", + "\n", + "print(f\"\\n🎯 Key Benefits Demonstrated:\")\n", + "print(f\" ✅ Hierarchical context preserved in embeddings\")\n", + "print(f\" ✅ Section-aware retrieval working\")\n", + "print(f\" ✅ VerbatimRAG pipeline compatibility confirmed\")\n", + "print(f\" ✅ Span extraction working with context\")\n", + "\n", + "print(f\"\\n🚀 Ready for production deployment!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/verbatim_rag/index.py b/verbatim_rag/index.py index dcbf847..eaa39f1 100644 --- a/verbatim_rag/index.py +++ b/verbatim_rag/index.py @@ -32,12 +32,12 @@ class VerbatimIndex: """ def __init__( - self, - db_path: str = "./milvus_verbatim.db", - collection_name: str = "verbatim_rag", - dense_model: Optional[str] = None, - sparse_model: Optional[str] = None, - config: Optional[VerbatimRAGConfig] = None, + self, + db_path: str = "./milvus_verbatim.db", + collection_name: str = "verbatim_rag", + dense_model: Optional[str] = None, + sparse_model: Optional[str] = None, + config: Optional[VerbatimRAGConfig] = None, ): """ Initialize the VerbatimIndex with simple parameters or config. @@ -177,7 +177,7 @@ def add_documents(self, documents: List[Document]) -> None: self.vector_store.add_documents(document_data) def search( - self, query: str, k: int = 5, search_type: str = "auto" + self, query: str, k: int = 5, search_type: str = "auto" ) -> List[SearchResult]: """ Search for documents similar to the query. @@ -229,7 +229,7 @@ def get_document(self, document_id: str) -> Optional[Dict[str, Any]]: return None def _create_dense_provider( - self, config: VerbatimRAGConfig + self, config: VerbatimRAGConfig ) -> Optional[DenseEmbeddingProvider]: """Create dense embedding provider from config.""" # Check if dense is disabled @@ -251,7 +251,7 @@ def _create_dense_provider( ) def _create_sparse_provider( - self, config: VerbatimRAGConfig + self, config: VerbatimRAGConfig ) -> Optional[SparseEmbeddingProvider]: """Create sparse embedding provider from config.""" if not config.sparse_embedding.enabled: @@ -308,4 +308,4 @@ def _create_vector_store(self, config: VerbatimRAGConfig) -> VectorStore: @classmethod def from_config(cls, config: VerbatimRAGConfig) -> "VerbatimIndex": """Create VerbatimIndex from configuration.""" - return cls(config=config) + return cls(config=config) \ No newline at end of file diff --git a/verbatim_rag/ingestion/__init__.py b/verbatim_rag/ingestion/__init__.py index c51c07a..47d5ee7 100644 --- a/verbatim_rag/ingestion/__init__.py +++ b/verbatim_rag/ingestion/__init__.py @@ -5,5 +5,6 @@ """ from .document_processor import DocumentProcessor +from .context_enriched_processor import ContextEnrichedProcessor -__all__ = ["DocumentProcessor"] +__all__ = ["DocumentProcessor", "ContextEnrichedProcessor"] diff --git a/verbatim_rag/ingestion/context_enriched_processor.py b/verbatim_rag/ingestion/context_enriched_processor.py new file mode 100644 index 0000000..10dea3d --- /dev/null +++ b/verbatim_rag/ingestion/context_enriched_processor.py @@ -0,0 +1,356 @@ +""" +Context-enriched document processor for VerbatimRAG. + +This processor enriches document chunks with their hierarchical context (section paths) +to improve RAG retrieval by embedding section information alongside content. +""" + +import re +from typing import List, Dict, Any, Optional, Union +from pathlib import Path +from dataclasses import dataclass, field + +from .document_processor import DocumentProcessor +from ..document import Document, Chunk, ProcessedChunk, ChunkType, DocumentType + + +@dataclass +class ContextEnrichedChunk(Chunk): + """Extended Chunk with hierarchical context information.""" + + # Hierarchical context + section_path: List[str] = field(default_factory=list) # ["2 Background", "2.1 Dataset"] + section_numbers: List[str] = field(default_factory=list) # ["2", "2.1"] + context_string: str = "" # "Section: 2 Background | Subsection: 2.1 Dataset" + + def get_enhanced_content(self, include_context: bool = True) -> str: + """Get content with optional context prefix for embedding.""" + if not include_context or not self.context_string: + return self.content + return f"{self.context_string} | {self.content}" + + def get_citation_context(self) -> str: + """Get formatted context for citations.""" + if not self.section_path: + return "" + return " → ".join(self.section_path) + + +class ContextEnrichedProcessor(DocumentProcessor): + """ + Document processor that enriches chunks with hierarchical context. + + Creates chunks where each paragraph contains the full section path it belongs to, + enabling better RAG retrieval through context-aware embeddings. + """ + + def __init__( + self, + chunker_type: str = "recursive", + chunker_recipe: str = "markdown", + chunk_size: int = 512, + chunk_overlap: int = 50, + context_separator: str = " | ", + include_section_numbers: bool = True, + **chunker_kwargs + ): + """ + Initialize context-enriched processor. + + Args: + context_separator: Separator between context elements (default: " | ") + include_section_numbers: Whether to include section numbers in context + **kwargs: Arguments passed to base DocumentProcessor + """ + super().__init__( + chunker_type=chunker_type, + chunker_recipe=chunker_recipe, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + **chunker_kwargs + ) + self.context_separator = context_separator + self.include_section_numbers = include_section_numbers + + # Precompile regex patterns for performance + self.header_pattern = re.compile(r'^##\s+(\d+(?:\.\d+)*)\s+(.+)$') + self.section_pattern = re.compile(r'^(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z\s]+.*)$') + + def process_file( + self, + file_path: Union[str, Path], + title: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> Document: + """Process a local file with context enrichment.""" + + # Get base document from parent class + base_document = super().process_file(file_path, title, metadata) + + # Apply context enrichment + return self._enrich_document_with_context(base_document) + + def process_url( + self, + url: str, + title: str, + metadata: Optional[Dict[str, Any]] = None + ) -> Document: + """Process a document from URL with context enrichment.""" + + # Get base document from parent class + base_document = super().process_url(url, title, metadata) + + # Apply context enrichment + return self._enrich_document_with_context(base_document) + + def _enrich_document_with_context(self, document: Document) -> Document: + """ + Enrich document chunks with hierarchical context. + + Strategy: + 1. Detect section structure from raw markdown content + 2. Create context-enriched chunks for content under each section + 3. Fall back to original chunks if no sections found + """ + + # Detect section structure + sections = self._detect_sections(document.raw_content) + + if not sections: + # No sections found, return original document + return document + + # Create context-enriched chunks + enriched_chunks = self._create_context_enriched_chunks( + document.raw_content, + document.id, + sections + ) + + # Replace original chunks with enriched ones + document.chunks = enriched_chunks + + return document + + def _detect_sections(self, content: str) -> List[Dict[str, Any]]: + """ + Detect section structure from markdown content. + + Returns: + List of section info dicts with keys: + - line_number: Line where section starts + - section_number: "1", "2.1", etc. + - title: Section title + - level: Hierarchy level (1, 2, 3...) + - full_title: "2 Background", "2.1 Dataset" + """ + lines = content.split('\n') + sections = [] + + for i, line in enumerate(lines): + line_stripped = line.strip() + + # Try header pattern first: "## 1 Introduction" + match = self.header_pattern.match(line_stripped) + if not match: + # Try section pattern: "1 Introduction" + match = self.section_pattern.match(line_stripped) + + if match: + section_num = match.group(1) + title = match.group(2).strip() + level = len(section_num.split('.')) + + # Create full title + full_title = f"{section_num} {title}" + + sections.append({ + 'line_number': i + 1, + 'section_number': section_num, + 'title': title, + 'level': level, + 'full_title': full_title + }) + + return sections + + def _create_context_enriched_chunks( + self, + content: str, + document_id: str, + sections: List[Dict[str, Any]] + ) -> List[ContextEnrichedChunk]: + """Create context-enriched chunks from content and section structure.""" + + lines = content.split('\n') + enriched_chunks = [] + + for i, section in enumerate(sections): + # Determine content boundaries for this section + start_line = section['line_number'] - 1 # Convert to 0-based + if i + 1 < len(sections): + end_line = sections[i + 1]['line_number'] - 1 + else: + end_line = len(lines) + + # Extract section content (skip header line) + section_lines = lines[start_line:end_line] + if section_lines: + header_line = section_lines[0] + content_lines = section_lines[1:] # Skip header + section_content = '\n'.join(content_lines).strip() + else: + continue + + if not section_content: + continue + + # Build section path for this section + section_path = self._build_section_path(section, sections) + + # Chunk the section content using the base chunker + if len(section_content) > self.chunk_size: + # Use base chunker for large sections + content_chunks = self._chunk_content(section_content) + else: + # Keep small sections as single chunk + content_chunks = [section_content] + + # Create enriched chunks for each content chunk + for chunk_idx, chunk_content in enumerate(content_chunks): + if not chunk_content.strip(): + continue + + # Create context string + context_string = self._build_context_string(section_path) + + # Determine chunk type + chunk_type = ChunkType.SECTION if section['level'] == 1 else ChunkType.PARAGRAPH + + # Create enriched chunk + enriched_chunk = ContextEnrichedChunk( + document_id=document_id, + content=chunk_content.strip(), + chunk_number=len(enriched_chunks), + chunk_type=chunk_type, + section_path=section_path, + section_numbers=[s.split()[0] for s in section_path], + context_string=context_string, + metadata={ + 'section_info': section, + 'section_path': section_path, + 'context_string': context_string, + 'chunk_in_section': chunk_idx + } + ) + + # Create processed chunk with enhanced content + processed_chunk = ProcessedChunk( + chunk_id=enriched_chunk.id, + enhanced_content=enriched_chunk.get_enhanced_content(include_context=True), + section_title=section['title'], + processing_metadata={ + 'context_enriched': True, + 'section_path': section_path, + 'context_string': context_string + } + ) + + enriched_chunk.add_processed_chunk(processed_chunk) + enriched_chunks.append(enriched_chunk) + + return enriched_chunks + + def _build_section_path( + self, + current_section: Dict[str, Any], + all_sections: List[Dict[str, Any]] + ) -> List[str]: + """ + Build hierarchical path for a section. + + For section "2.1 Dataset", returns ["2 Background", "2.1 Dataset"] + """ + section_path = [] + current_number = current_section['section_number'] + current_parts = current_number.split('.') + + # Add all parent sections + for level in range(1, len(current_parts)): + parent_number = '.'.join(current_parts[:level]) + + # Find parent section + for section in all_sections: + if section['section_number'] == parent_number: + section_path.append(section['full_title']) + break + + # Add current section + section_path.append(current_section['full_title']) + + return section_path + + def _build_context_string(self, section_path: List[str]) -> str: + """Build context string from section path.""" + if not section_path: + return "" + + # Create hierarchical labels + context_parts = [] + for i, section in enumerate(section_path): + if i == 0: + label = "Section" + elif i == 1: + label = "Subsection" + elif i == 2: + label = "Subsubsection" + else: + label = f"Level-{i+1}" + + context_parts.append(f"{label}: {section}") + + return self.context_separator.join(context_parts) + + def _chunk_content(self, content: str) -> List[str]: + """Chunk content using the base chunker.""" + try: + # Use the inherited chunker from DocumentProcessor + chunks = self.chunker(content) + return [chunk.text for chunk in chunks] + except Exception: + # Fallback to simple paragraph splitting + paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()] + return paragraphs if paragraphs else [content] + + @classmethod + def for_rag( + cls, + chunk_size: int = 384, + overlap: int = 50, + context_separator: str = " | " + ): + """Create processor optimized for RAG with context enrichment.""" + return cls( + chunker_type="token", + chunk_size=chunk_size, + chunk_overlap=overlap, + context_separator=context_separator, + include_section_numbers=True + ) + + @classmethod + def for_embeddings( + cls, + chunk_size: int = 512, + overlap: int = 50, + context_separator: str = " | " + ): + """Create processor optimized for embedding generation with context.""" + return cls( + chunker_type="recursive", + chunk_size=chunk_size, + chunk_overlap=overlap, + context_separator=context_separator, + include_section_numbers=True + ) \ No newline at end of file diff --git a/verbatim_rag/ingestion/document_processor.py b/verbatim_rag/ingestion/document_processor.py index 239ec62..d829329 100644 --- a/verbatim_rag/ingestion/document_processor.py +++ b/verbatim_rag/ingestion/document_processor.py @@ -64,6 +64,8 @@ def __init__( self.chunker = chonkie.RecursiveChunker.from_recipe( chunker_recipe, lang=lang ) + # Override default chunk size for better hierarchical splitting + self.chunker.chunk_size = chunk_size elif chunker_type == "token": self.chunker = chonkie.TokenChunker( chunk_size=chunk_size, chunk_overlap=chunk_overlap, **chunker_kwargs @@ -91,6 +93,7 @@ def __init__( def process_url( self, url: str, title: str, metadata: Optional[Dict[str, Any]] = None ) -> Document: + """ Process a document from URL (like PDF). From c84237f8b715ceb6fbacb13fa3ff39b43f2ec4cb Mon Sep 17 00:00:00 2001 From: FlackoJodye1 Date: Thu, 7 Aug 2025 16:00:58 +0200 Subject: [PATCH 2/2] Enhance context-enriched processor with document title integration - Add document title to context strings for better hierarchical context - Improve enhanced content formatting with title prefix - Update notebooks with latest execution results - Add gitignore entries for temporary files --- .gitignore | 1 + .../test_context_enriched_processor.ipynb | 660 +++++++++++------- notebooks/test_verbatim_rag_integration.ipynb | 612 +++++++++------- verbatim_rag/core.py | 2 + .../ingestion/context_enriched_processor.py | 22 +- verbatim_rag/response_builder.py | 18 +- 6 files changed, 777 insertions(+), 538 deletions(-) diff --git a/.gitignore b/.gitignore index 95f080b..64ce7e4 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,7 @@ celerybeat.pid # Environments .env +.env.local .venv env/ venv/ diff --git a/notebooks/test_context_enriched_processor.ipynb b/notebooks/test_context_enriched_processor.ipynb index 5b33e83..63edbf2 100644 --- a/notebooks/test_context_enriched_processor.ipynb +++ b/notebooks/test_context_enriched_processor.ipynb @@ -18,23 +18,21 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Project root: /Users/paulschmitt/DataspellProjects/verbatim-rag\n", - "✅ Setup complete\n" - ] + "metadata": { + "pycharm": { + "is_executing": true + }, + "ExecuteTime": { + "end_time": "2025-08-07T08:02:43.656859Z", + "start_time": "2025-08-07T08:02:43.651141Z" } - ], + }, "source": [ "import sys\n", "import os\n", "from pathlib import Path\n", - "import torch\n", + "\n", + "\n", "\n", "# Fix OpenMP conflict (common with ML libraries on macOS)\n", "os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'\n", @@ -45,27 +43,43 @@ "\n", "print(f\"Project root: {project_root}\")\n", "print(\"✅ Setup complete\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, + ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "✅ Imports successful\n" + "Project root: /Users/paulschmitt/DataspellProjects/verbatim-rag\n", + "✅ Setup complete\n" ] } ], + "execution_count": 4 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T08:02:44.529823Z", + "start_time": "2025-08-07T08:02:44.524232Z" + } + }, "source": [ "from verbatim_rag.ingestion.context_enriched_processor import ContextEnrichedProcessor, ContextEnrichedChunk\n", "from pprint import pprint\n", "\n", "print(\"✅ Imports successful\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Imports successful\n" + ] + } + ], + "execution_count": 5 }, { "cell_type": "markdown", @@ -76,8 +90,27 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T08:02:57.205584Z", + "start_time": "2025-08-07T08:02:45.717630Z" + } + }, + "source": [ + "# Test with the academic paper\n", + "pdf_path = project_root / \"data\" / \"acl_papers\" / \"VERBATIM_RAG_ACL.pdf\"\n", + "\n", + "# Create context-enriched processor\n", + "processor = ContextEnrichedProcessor.for_rag(chunk_size=512)\n", + "\n", + "# Process document\n", + "document = processor.process_file(pdf_path, title=\"Verbatim RAG ACL Paper\")\n", + "\n", + "print(f\"✅ Document processed successfully!\")\n", + "print(f\"Title: {document.title}\")\n", + "print(f\"Chunks: {len(document.chunks)}\")\n", + "print(f\"Content type: {document.content_type}\")" + ], "outputs": [ { "name": "stderr", @@ -98,21 +131,7 @@ ] } ], - "source": [ - "# Test with the academic paper\n", - "pdf_path = project_root / \"data\" / \"acl_papers\" / \"VERBATIM_RAG_ACL.pdf\"\n", - "\n", - "# Create context-enriched processor\n", - "processor = ContextEnrichedProcessor.for_rag(chunk_size=512)\n", - "\n", - "# Process document\n", - "document = processor.process_file(pdf_path, title=\"Verbatim RAG ACL Paper\")\n", - "\n", - "print(f\"✅ Document processed successfully!\")\n", - "print(f\"Title: {document.title}\")\n", - "print(f\"Chunks: {len(document.chunks)}\")\n", - "print(f\"Content type: {document.content_type}\")" - ] + "execution_count": 6 }, { "cell_type": "markdown", @@ -123,7 +142,17 @@ }, { "cell_type": "code", - "execution_count": 17, + "source": [ + "print(\"🔍 Context-Enriched Chunk Analysis:\")\n", + "print(f\"Total chunks: {len(document.chunks)}\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-08-07T08:02:58.266361Z", + "start_time": "2025-08-07T08:02:58.262010Z" + } + }, "outputs": [ { "name": "stdout", @@ -134,18 +163,40 @@ ] } ], - "source": [ - "print(\"🔍 Context-Enriched Chunk Analysis:\")\n", - "print(f\"Total chunks: {len(document.chunks)}\")" - ], - "metadata": { - "collapsed": false - } + "execution_count": 7 }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T08:02:59.113789Z", + "start_time": "2025-08-07T08:02:59.106311Z" + } + }, + "source": [ + "# Show first 5 chunks with their context\n", + "n = 10\n", + "print(f\"\\n📝 First {n} Chunks with Context:\")\n", + "for i, chunk in enumerate(document.chunks[:n]):\n", + " print(f\"\\n--- Chunk {i+1} ---\")\n", + " print(f\"Type: {type(chunk).__name__}\")\n", + " print(f\"Section Path: {chunk.section_path}\")\n", + " print(f\"Context: {chunk.context_string}\")\n", + " print(f\"Content: {chunk.content[:100]}...\")\n", + "\n", + " # Show enhanced content (what gets embedded)\n", + " if hasattr(chunk, 'get_enhanced_content'):\n", + " enhanced = chunk.get_enhanced_content()\n", + " print(f\"Enhanced: {enhanced[:150]}...\")\n", + "\n", + " # Show citation context\n", + " if hasattr(chunk, 'get_citation_context'):\n", + " citation = chunk.get_citation_context()\n", + " print(f\"Citation: {citation}\")\n", + "\n", + "if len(document.chunks) > n:\n", + " print(f\"\\n... and {len(document.chunks) - n} more chunks\")" + ], "outputs": [ { "name": "stdout", @@ -157,117 +208,92 @@ "--- Chunk 1 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['1 Introduction']\n", - "Context: Section: 1 Introduction\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", "Content: Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in ...\n", - "Enhanced: Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains...\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in...\n", "Citation: 1 Introduction\n", "\n", "--- Chunk 2 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['1 Introduction']\n", - "Context: Section: 1 Introduction\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", "Content: incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024)...\n", - "Enhanced: Section: 1 Introduction | incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024). We argue that a reliab...\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024...\n", "Citation: 1 Introduction\n", "\n", "--- Chunk 3 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['1 Introduction']\n", - "Context: Section: 1 Introduction\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", "Content: trained generation , dynamically creating answer templates filled exclu-\n", "\n", "We participated in the Arc...\n", - "Enhanced: Section: 1 Introduction | trained generation , dynamically creating answer templates filled exclu-\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | trained generation , dynamically creating answer templates filled exclu-\n", "\n", - "We participated in the ArchEHR-QA 2025 shared task...\n", + "We participated in the Ar...\n", "Citation: 1 Introduction\n", "\n", "--- Chunk 4 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['1 Introduction']\n", - "Context: Section: 1 Introduction\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", "Content: arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were the...\n", - "Enhanced: Section: 1 Introduction | arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were then fed into the same LLM ...\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were th...\n", "Citation: 1 Introduction\n", "\n", "--- Chunk 5 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['1 Introduction']\n", - "Context: Section: 1 Introduction\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", "Content: d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License...\n", - "Enhanced: Section: 1 Introduction | d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License.\n", - "\n", - "The remainder of the ...\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT Licens...\n", "Citation: 1 Introduction\n", "\n", "--- Chunk 6 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['2 Background', '2.1 Dataset']\n", - "Context: Section: 2 Background | Subsection: 2.1 Dataset\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset\n", "Content: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 201...\n", - "Enhanced: Section: 2 Background | Subsection: 2.1 Dataset | Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 201...\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset | Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (...\n", "Citation: 2 Background → 2.1 Dataset\n", "\n", "--- Chunk 7 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['2 Background', '2.1 Dataset']\n", - "Context: Section: 2 Background | Subsection: 2.1 Dataset\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset\n", "Content: sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 word...\n", - "Enhanced: Section: 2 Background | Subsection: 2.1 Dataset | sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 word...\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset | sentence-level as essential , supplementary , or irrelevant . Answers must ...\n", "Citation: 2 Background → 2.1 Dataset\n", "\n", "--- Chunk 8 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['2 Background', '2.2 Limitations of Standard RAG']\n", - "Context: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG\n", "Content: Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contrad...\n", - "Enhanced: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | Standard RAG models, despite external grounding, still frequently hallucinate un...\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | Standard RAG models, despite external grounding, still ...\n", "Citation: 2 Background → 2.2 Limitations of Standard RAG\n", "\n", "--- Chunk 9 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['2 Background', '2.2 Limitations of Standard RAG']\n", - "Context: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG\n", "Content: ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen et al., 2024) have al...\n", - "Enhanced: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen e...\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | ano and Smith, 2019; Jain and Wallace, 2019) and LLM se...\n", "Citation: 2 Background → 2.2 Limitations of Standard RAG\n", "\n", "--- Chunk 10 ---\n", "Type: ContextEnrichedChunk\n", "Section Path: ['2 Background', '2.3 Synthetic Training Data']\n", - "Context: Section: 2 Background | Subsection: 2.3 Synthetic Training Data\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.3 Synthetic Training Data\n", "Content: Due to limited access and annotation restrictions, obtaining sentence-level labeled clinical dataset...\n", - "Enhanced: Section: 2 Background | Subsection: 2.3 Synthetic Training Data | Due to limited access and annotation restrictions, obtaining sentence-level labeled ...\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.3 Synthetic Training Data | Due to limited access and annotation restrictions, obtainin...\n", "Citation: 2 Background → 2.3 Synthetic Training Data\n", "\n", "... and 47 more chunks\n" ] } ], - "source": [ - "# Show first 5 chunks with their context\n", - "n = 10\n", - "print(f\"\\n📝 First {n} Chunks with Context:\")\n", - "for i, chunk in enumerate(document.chunks[:n]):\n", - " print(f\"\\n--- Chunk {i+1} ---\")\n", - " print(f\"Type: {type(chunk).__name__}\")\n", - " print(f\"Section Path: {chunk.section_path}\")\n", - " print(f\"Context: {chunk.context_string}\")\n", - " print(f\"Content: {chunk.content[:100]}...\")\n", - "\n", - " # Show enhanced content (what gets embedded)\n", - " if hasattr(chunk, 'get_enhanced_content'):\n", - " enhanced = chunk.get_enhanced_content()\n", - " print(f\"Enhanced: {enhanced[:150]}...\")\n", - "\n", - " # Show citation context\n", - " if hasattr(chunk, 'get_citation_context'):\n", - " citation = chunk.get_citation_context()\n", - " print(f\"Citation: {citation}\")\n", - "\n", - "if len(document.chunks) > n:\n", - " print(f\"\\n... and {len(document.chunks) - n} more chunks\")" - ] + "execution_count": 8 }, { "cell_type": "markdown", @@ -278,45 +304,12 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📊 Context Distribution Analysis:\n", - "\n", - "🏷️ Chunks per Main Section:\n", - " 1 Introduction: 5 chunks\n", - " 2 Background: 5 chunks\n", - " 3 Method: 16 chunks\n", - " 4 Evaluation: 9 chunks\n", - " 5 Ethical Considerations: 2 chunks\n", - " 6 Limitations: 2 chunks\n", - " 7 Conclusion: 18 chunks\n", - "\n", - "📏 Context String Statistics:\n", - " Average length: 35.0 chars\n", - " Min length: 21 chars\n", - " Max length: 67 chars\n", - "\n", - "🌳 Unique Section Paths (12 total):\n", - " 1 Introduction\n", - " 2 Background → 2.1 Dataset\n", - " 2 Background → 2.2 Limitations of Standard RAG\n", - " 2 Background → 2.3 Synthetic Training Data\n", - " 3 Method → 3.1 System Overview\n", - " 3 Method → 3.2 Evidence Extraction\n", - " 3 Method → 3.3 Synthetic Data Generation\n", - " 3 Method → 3.4 Answer Generation\n", - " 4 Evaluation\n", - " 5 Ethical Considerations\n", - " 6 Limitations\n", - " 7 Conclusion\n" - ] + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T08:03:03.277222Z", + "start_time": "2025-08-07T08:03:03.268138Z" } - ], + }, "source": [ "print(\"📊 Context Distribution Analysis:\")\n", "\n", @@ -353,7 +346,45 @@ "print(f\"\\n🌳 Unique Section Paths ({len(unique_paths)} total):\")\n", "for path in sorted(unique_paths):\n", " print(f\" {path}\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📊 Context Distribution Analysis:\n", + "\n", + "🏷️ Chunks per Main Section:\n", + " 1 Introduction: 5 chunks\n", + " 2 Background: 5 chunks\n", + " 3 Method: 16 chunks\n", + " 4 Evaluation: 9 chunks\n", + " 5 Ethical Considerations: 2 chunks\n", + " 6 Limitations: 2 chunks\n", + " 7 Conclusion: 18 chunks\n", + "\n", + "📏 Context String Statistics:\n", + " Average length: 60.0 chars\n", + " Min length: 46 chars\n", + " Max length: 92 chars\n", + "\n", + "🌳 Unique Section Paths (12 total):\n", + " 1 Introduction\n", + " 2 Background → 2.1 Dataset\n", + " 2 Background → 2.2 Limitations of Standard RAG\n", + " 2 Background → 2.3 Synthetic Training Data\n", + " 3 Method → 3.1 System Overview\n", + " 3 Method → 3.2 Evidence Extraction\n", + " 3 Method → 3.3 Synthetic Data Generation\n", + " 3 Method → 3.4 Answer Generation\n", + " 4 Evaluation\n", + " 5 Ethical Considerations\n", + " 6 Limitations\n", + " 7 Conclusion\n" + ] + } + ], + "execution_count": 9 }, { "cell_type": "markdown", @@ -364,8 +395,36 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T08:03:04.490819Z", + "start_time": "2025-08-07T08:03:04.481435Z" + } + }, + "source": [ + "print(\"🎯 Embedding-Ready Content Examples:\")\n", + "print(\"This shows what will actually be embedded for RAG retrieval.\")\n", + "\n", + "# Show 3 examples of enhanced content\n", + "for i, chunk in enumerate(document.chunks[:12]):\n", + " if hasattr(chunk, 'get_enhanced_content'):\n", + " print(f\"\\n--- Example {i+1} ---\")\n", + " print(f\"Original: {chunk.content[:100]}...\")\n", + " print(f\"Enhanced: {chunk.get_enhanced_content()[:200]}...\")\n", + " print(f\"Context adds: {len(chunk.get_enhanced_content()) - len(chunk.content)} chars\")\n", + "\n", + "# Show processed chunks (what goes to the index)\n", + "print(f\"\\n💾 ProcessedChunk Integration:\")\n", + "total_processed = sum(len(chunk.processed_chunks) for chunk in document.chunks)\n", + "print(f\"Total processed chunks: {total_processed}\")\n", + "\n", + "if document.chunks and document.chunks[0].processed_chunks:\n", + " pc = document.chunks[0].processed_chunks[0]\n", + " print(f\"\\nExample ProcessedChunk:\")\n", + " print(f\" Section title: {pc.section_title}\")\n", + " print(f\" Enhanced content: {pc.enhanced_content[:150]}...\")\n", + " print(f\" Processing metadata: {pc.processing_metadata}\")" + ], "outputs": [ { "name": "stdout", @@ -376,104 +435,81 @@ "\n", "--- Example 1 ---\n", "Original: Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in ...\n", - "Enhanced: Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains for information extraction and generation tasks. ...\n", - "Context adds: 26 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains for information extracti...\n", + "Context adds: 51 chars\n", "\n", "--- Example 2 ---\n", "Original: incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024)...\n", - "Enhanced: Section: 1 Introduction | incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024). We argue that a reliable QA system should guarantee complete traceabilit...\n", - "Context adds: 26 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024). We argue that a reliable QA system should guara...\n", + "Context adds: 51 chars\n", "\n", "--- Example 3 ---\n", "Original: trained generation , dynamically creating answer templates filled exclu-\n", "\n", "We participated in the Arc...\n", - "Enhanced: Section: 1 Introduction | trained generation , dynamically creating answer templates filled exclu-\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | trained generation , dynamically creating answer templates filled exclu-\n", "\n", - "We participated in the ArchEHR-QA 2025 shared task on grounded question answering (QA) from electron...\n", - "Context adds: 26 chars\n", + "We participated in the ArchEHR-QA 2025 shared task on grounded question ans...\n", + "Context adds: 51 chars\n", "\n", "--- Example 4 ---\n", "Original: arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were the...\n", - "Enhanced: Section: 1 Introduction | arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were then fed into the same LLM template generator. Our solution achieved a score ...\n", - "Context adds: 26 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were then fed into the same LLM template generator. Our s...\n", + "Context adds: 51 chars\n", "\n", "--- Example 5 ---\n", "Original: d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License...\n", - "Enhanced: Section: 1 Introduction | d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License.\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License.\n", "\n", - "The remainder of the paper discusses background (Section 2), method (Se...\n", - "Context adds: 26 chars\n", + "The remainder of the paper discusses backgroun...\n", + "Context adds: 51 chars\n", "\n", "--- Example 6 ---\n", "Original: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 201...\n", - "Enhanced: Section: 2 Background | Subsection: 2.1 Dataset | Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 2018) used fill-in-the-blank methods and lacked expli...\n", - "Context adds: 50 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset | Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 2018) used fill-in-the-blank...\n", + "Context adds: 75 chars\n", "\n", "--- Example 7 ---\n", "Original: sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 word...\n", - "Enhanced: Section: 2 Background | Subsection: 2.1 Dataset | sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 words) and explicitly cite relevant sentences....\n", - "Context adds: 50 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset | sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 words) and explicitly cite re...\n", + "Context adds: 75 chars\n", "\n", "--- Example 8 ---\n", "Original: Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contrad...\n", - "Enhanced: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contradictory information (Ji et al.,...\n", - "Context adds: 70 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contradictor...\n", + "Context adds: 95 chars\n", "\n", "--- Example 9 ---\n", "Original: ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen et al., 2024) have al...\n", - "Enhanced: Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen et al., 2024) have also been found unreliable. Our ...\n", - "Context adds: 70 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen et al., 2024) have also be...\n", + "Context adds: 95 chars\n", "\n", "--- Example 10 ---\n", "Original: Due to limited access and annotation restrictions, obtaining sentence-level labeled clinical dataset...\n", - "Enhanced: Section: 2 Background | Subsection: 2.3 Synthetic Training Data | Due to limited access and annotation restrictions, obtaining sentence-level labeled clinical datasets is challenging. Recent works add...\n", - "Context adds: 66 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.3 Synthetic Training Data | Due to limited access and annotation restrictions, obtaining sentence-level labeled clinical datasets is chal...\n", + "Context adds: 91 chars\n", "\n", "--- Example 11 ---\n", "Original: Figure 1 depicts our system architecture. First, an extraction step identifies relevant sentences fr...\n", - "Enhanced: Section: 3 Method | Subsection: 3.1 System Overview | Figure 1 depicts our system architecture. First, an extraction step identifies relevant sentences from the input (patient narrative, clinician que...\n", - "Context adds: 54 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 3 Method | Subsection: 3.1 System Overview | Figure 1 depicts our system architecture. First, an extraction step identifies relevant sentences from the input (patient...\n", + "Context adds: 79 chars\n", "\n", "--- Example 12 ---\n", "Original: We evaluated two extractors: (i) We prompted gemma-3-27b-it to explicitly label sentences as relevan...\n", - "Enhanced: Section: 3 Method | Subsection: 3.2 Evidence Extraction | We evaluated two extractors: (i) We prompted gemma-3-27b-it to explicitly label sentences as relevant via a step-by-step process. (ii) We fine...\n", - "Context adds: 58 chars\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 3 Method | Subsection: 3.2 Evidence Extraction | We evaluated two extractors: (i) We prompted gemma-3-27b-it to explicitly label sentences as relevant via a step-by-s...\n", + "Context adds: 83 chars\n", "\n", "💾 ProcessedChunk Integration:\n", "Total processed chunks: 57\n", "\n", "Example ProcessedChunk:\n", " Section title: Introduction\n", - " Enhanced content: Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains...\n", - " Processing metadata: {'context_enriched': True, 'section_path': ['1 Introduction'], 'context_string': 'Section: 1 Introduction'}\n" + " Enhanced content: Verbatim RAG ACL Paper | Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in...\n", + " Processing metadata: {'context_enriched': True, 'section_path': ['1 Introduction'], 'context_string': 'Verbatim RAG ACL Paper | Section: 1 Introduction'}\n" ] } ], - "source": [ - "print(\"🎯 Embedding-Ready Content Examples:\")\n", - "print(\"This shows what will actually be embedded for RAG retrieval.\")\n", - "\n", - "# Show 3 examples of enhanced content\n", - "for i, chunk in enumerate(document.chunks[:12]):\n", - " if hasattr(chunk, 'get_enhanced_content'):\n", - " print(f\"\\n--- Example {i+1} ---\")\n", - " print(f\"Original: {chunk.content[:100]}...\")\n", - " print(f\"Enhanced: {chunk.get_enhanced_content()[:200]}...\")\n", - " print(f\"Context adds: {len(chunk.get_enhanced_content()) - len(chunk.content)} chars\")\n", - "\n", - "# Show processed chunks (what goes to the index)\n", - "print(f\"\\n💾 ProcessedChunk Integration:\")\n", - "total_processed = sum(len(chunk.processed_chunks) for chunk in document.chunks)\n", - "print(f\"Total processed chunks: {total_processed}\")\n", - "\n", - "if document.chunks and document.chunks[0].processed_chunks:\n", - " pc = document.chunks[0].processed_chunks[0]\n", - " print(f\"\\nExample ProcessedChunk:\")\n", - " print(f\" Section title: {pc.section_title}\")\n", - " print(f\" Enhanced content: {pc.enhanced_content[:150]}...\")\n", - " print(f\" Processing metadata: {pc.processing_metadata}\")" - ] + "execution_count": 10 }, { "cell_type": "markdown", @@ -484,54 +520,14 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🚀 RAG Benefits Demonstration:\n", - "Examples of how context enrichment improves retrieval...\n", - "\n", - "🔍 Query: 'dataset'\n", - " Found 9 potential matches\n", - " Match 1 (Content+Context): 2 Background → 2.1 Dataset\n", - " Content: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuste...\n", - " Match 2 (Context): 2 Background → 2.1 Dataset\n", - " Content: sentence-level as essential , supplementary , or irrelevant . Answers must be co...\n", - "\n", - "🔍 Query: 'background'\n", - " Found 6 potential matches\n", - " Match 1 (Context): 2 Background → 2.1 Dataset\n", - " Content: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuste...\n", - " Match 2 (Context): 2 Background → 2.1 Dataset\n", - " Content: sentence-level as essential , supplementary , or irrelevant . Answers must be co...\n", - "\n", - "🔍 Query: 'method'\n", - " Found 28 potential matches\n", - " Match 1 (Content+Context): 3 Method → 3.4 Answer Generation\n", - " Content: repair of his ruptured thoracoabdominal aortic aneurysm. |1| -He was immediately...\n", - " Match 2 (Content+Context): 3 Method → 3.4 Answer Generation\n", - " Content: ated by our verbatim method, inserting evidence sentences verbatim into a dynami...\n", - "\n", - "🔍 Query: 'evaluation'\n", - " Found 12 potential matches\n", - " Match 1 (Context): 4 Evaluation\n", - " Content: We evaluated our pipeline in the ArchEHR-QA 2025 shared task (Soni and Demner-Fu...\n", - " Match 2 (Context): 4 Evaluation\n", - " Content: s through BLEU (Papineni et al., 2002), ROUGE (Lin, 2004), BERTScore (Zhang et a...\n", - "\n", - "🔍 Query: 'limitations'\n", - " Found 4 potential matches\n", - " Match 1 (Content+Context): 6 Limitations\n", - " Content: Our verbatim RAG pipeline explicitly cites source sentences to mitigate hallucin...\n", - " Match 2 (Context): 2 Background → 2.2 Limitations of Standard RAG\n", - " Content: Standard RAG models, despite external grounding, still frequently hallucinate un...\n" - ] + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T08:03:11.671717Z", + "start_time": "2025-08-07T08:03:11.659646Z" } - ], + }, "source": [ + "\n", "print(\"🚀 RAG Benefits Demonstration:\")\n", "print(\"Examples of how context enrichment improves retrieval...\")\n", "\n", @@ -576,7 +572,53 @@ " print(f\" Content: {chunk.content[:80]}...\")\n", " else:\n", " print(f\" No matches found\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🚀 RAG Benefits Demonstration:\n", + "Examples of how context enrichment improves retrieval...\n", + "\n", + "🔍 Query: 'dataset'\n", + " Found 9 potential matches\n", + " Match 1 (Content+Context): 2 Background → 2.1 Dataset\n", + " Content: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuste...\n", + " Match 2 (Context): 2 Background → 2.1 Dataset\n", + " Content: sentence-level as essential , supplementary , or irrelevant . Answers must be co...\n", + "\n", + "🔍 Query: 'background'\n", + " Found 6 potential matches\n", + " Match 1 (Context): 2 Background → 2.1 Dataset\n", + " Content: Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuste...\n", + " Match 2 (Context): 2 Background → 2.1 Dataset\n", + " Content: sentence-level as essential , supplementary , or irrelevant . Answers must be co...\n", + "\n", + "🔍 Query: 'method'\n", + " Found 28 potential matches\n", + " Match 1 (Content+Context): 3 Method → 3.4 Answer Generation\n", + " Content: repair of his ruptured thoracoabdominal aortic aneurysm. |1| -He was immediately...\n", + " Match 2 (Content+Context): 3 Method → 3.4 Answer Generation\n", + " Content: ated by our verbatim method, inserting evidence sentences verbatim into a dynami...\n", + "\n", + "🔍 Query: 'evaluation'\n", + " Found 12 potential matches\n", + " Match 1 (Context): 4 Evaluation\n", + " Content: We evaluated our pipeline in the ArchEHR-QA 2025 shared task (Soni and Demner-Fu...\n", + " Match 2 (Context): 4 Evaluation\n", + " Content: s through BLEU (Papineni et al., 2002), ROUGE (Lin, 2004), BERTScore (Zhang et a...\n", + "\n", + "🔍 Query: 'limitations'\n", + " Found 4 potential matches\n", + " Match 1 (Content+Context): 6 Limitations\n", + " Content: Our verbatim RAG pipeline explicitly cites source sentences to mitigate hallucin...\n", + " Match 2 (Context): 2 Background → 2.2 Limitations of Standard RAG\n", + " Content: Standard RAG models, despite external grounding, still frequently hallucinate un...\n" + ] + } + ], + "execution_count": 11 }, { "cell_type": "markdown", @@ -587,8 +629,13 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T08:03:13.876988Z", + "start_time": "2025-08-07T08:03:13.866599Z" + } + }, + "source": "print(\"📋 Context-Enriched Processing Summary:\")\nprint(\"=\" * 50)\n\nprint(f\"\\n✅ Successfully processed document with context enrichment\")\nprint(f\" 📄 Document: {document.title}\")\nprint(f\" 🧩 Total chunks: {len(document.chunks)}\")\n\n# Count context-enriched chunks\nenriched_count = sum(1 for chunk in document.chunks if hasattr(chunk, 'section_path') and chunk.section_path)\nprint(f\" 🏷️ Context-enriched chunks: {enriched_count}\")\n\n# Show unique sections\nsections = set()\nfor chunk in document.chunks:\n if hasattr(chunk, 'section_path') and chunk.section_path:\n sections.add(chunk.section_path[0])\nprint(f\" 📚 Unique sections: {len(sections)}\")\n\nprint(f\"\\n🎯 Key Benefits for RAG:\")\nprint(f\" • Each chunk contains document title + full hierarchical context\")\nprint(f\" • Section information embedded with content\")\nprint(f\" • Better retrieval through context matching\")\nprint(f\" • Rich citation context for answers\")\nprint(f\" • Backward compatible with existing VerbatimRAG\")\n\nprint(f\"\\n🚀 Ready for:\")\nprint(f\" • Integration with VerbatimIndex\")\nprint(f\" • Embedding generation with context\")\nprint(f\" • Enhanced RAG retrieval testing\")\n\nprint(f\"\\n📝 Example Context String with Document Title:\")\nif document.chunks:\n example_chunk = document.chunks[0]\n if hasattr(example_chunk, 'context_string'):\n print(f\" {example_chunk.context_string}\")\n\nprint(f\"\\n🧹 Test complete - ready for production integration!\")", "outputs": [ { "name": "stdout", @@ -604,7 +651,7 @@ " 📚 Unique sections: 7\n", "\n", "🎯 Key Benefits for RAG:\n", - " • Each chunk contains full hierarchical context\n", + " • Each chunk contains document title + full hierarchical context\n", " • Section information embedded with content\n", " • Better retrieval through context matching\n", " • Rich citation context for answers\n", @@ -615,52 +662,123 @@ " • Embedding generation with context\n", " • Enhanced RAG retrieval testing\n", "\n", + "📝 Example Context String with Document Title:\n", + " Verbatim RAG ACL Paper | Section: 1 Introduction\n", + "\n", "🧹 Test complete - ready for production integration!\n" ] } ], + "execution_count": 12 + }, + { + "cell_type": "code", "source": [ - "print(\"📋 Context-Enriched Processing Summary:\")\n", - "print(\"=\" * 50)\n", + "print(\"🔄 Re-processing with context enriched processor to test document title integration...\")\n", "\n", - "print(f\"\\n✅ Successfully processed document with context enrichment\")\n", - "print(f\" 📄 Document: {document.title}\")\n", - "print(f\" 🧩 Total chunks: {len(document.chunks)}\")\n", + "# Re-create processor to get fresh chunks with document title\n", + "processor = ContextEnrichedProcessor.for_rag(chunk_size=512)\n", + "document = processor.process_file(pdf_path, title=\"Verbatim RAG ACL Paper\")\n", "\n", - "# Count context-enriched chunks\n", - "enriched_count = sum(1 for chunk in document.chunks if hasattr(chunk, 'section_path') and chunk.section_path)\n", - "print(f\" 🏷️ Context-enriched chunks: {enriched_count}\")\n", + "print(f\"✅ Document re-processed successfully!\")\n", + "print(f\"Title: {document.title}\")\n", + "print(f\"Chunks: {len(document.chunks)}\")\n", "\n", - "# Show unique sections\n", - "sections = set()\n", - "for chunk in document.chunks:\n", - " if hasattr(chunk, 'section_path') and chunk.section_path:\n", - " sections.add(chunk.section_path[0])\n", - "print(f\" 📚 Unique sections: {len(sections)}\")\n", - "\n", - "print(f\"\\n🎯 Key Benefits for RAG:\")\n", - "print(f\" • Each chunk contains full hierarchical context\")\n", - "print(f\" • Section information embedded with content\")\n", - "print(f\" • Better retrieval through context matching\")\n", - "print(f\" • Rich citation context for answers\")\n", - "print(f\" • Backward compatible with existing VerbatimRAG\")\n", - "\n", - "print(f\"\\n🚀 Ready for:\")\n", - "print(f\" • Integration with VerbatimIndex\")\n", - "print(f\" • Embedding generation with context\")\n", - "print(f\" • Enhanced RAG retrieval testing\")\n", - "\n", - "print(f\"\\n🧹 Test complete - ready for production integration!\")" - ] + "print(f\"\\n📝 Example Enhanced Content with Document Title:\")\n", + "for i, chunk in enumerate(document.chunks[:n]):\n", + " if hasattr(chunk, 'get_enhanced_content'):\n", + " print(f\"\\n--- Chunk {i+1} ---\")\n", + " print(f\"Context: {chunk.context_string}\")\n", + " print(f\"Enhanced: {chunk.get_enhanced_content()[:200]}...\")" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T08:09:47.823152Z", + "start_time": "2025-08-07T08:09:17.273569Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔄 Re-processing with context enriched processor to test document title integration...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Document re-processed successfully!\n", + "Title: Verbatim RAG ACL Paper\n", + "Chunks: 57\n", + "\n", + "📝 Example Enhanced Content with Document Title:\n", + "\n", + "--- Chunk 1 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | Modern question-answering (QA) and retrievalaugmented generation (RAG) systems play a vital role in many high-stakes domains for information extracti...\n", + "\n", + "--- Chunk 2 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | incorrect information, commonly referred to as hallucinations (Ji et al., 2023; Madsen et al., 2024). We argue that a reliable QA system should guara...\n", + "\n", + "--- Chunk 3 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | trained generation , dynamically creating answer templates filled exclu-\n", + "\n", + "We participated in the ArchEHR-QA 2025 shared task on grounded question ans...\n", + "\n", + "--- Chunk 4 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | arner et al., 2024), achieving performance comparable to the LLM extractor. Both extractors were then fed into the same LLM template generator. Our s...\n", + "\n", + "--- Chunk 5 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 1 Introduction\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 1 Introduction | d train custom models. Additionally, we are releasing all the code on GitHub 2 under the MIT License.\n", + "\n", + "The remainder of the paper discusses backgroun...\n", + "\n", + "--- Chunk 6 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset | Early clinical QA datasets such as emrQA (Pampari et al., 2018) and CliCR (Šuster and Daelemans, 2018) used fill-in-the-blank...\n", + "\n", + "--- Chunk 7 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.1 Dataset | sentence-level as essential , supplementary , or irrelevant . Answers must be concise (under 75 words) and explicitly cite re...\n", + "\n", + "--- Chunk 8 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contradictor...\n", + "\n", + "--- Chunk 9 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.2 Limitations of Standard RAG | ano and Smith, 2019; Jain and Wallace, 2019) and LLM self-explanations (Madsen et al., 2024) have also be...\n", + "\n", + "--- Chunk 10 ---\n", + "Context: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.3 Synthetic Training Data\n", + "Enhanced: Verbatim RAG ACL Paper | Section: 2 Background | Subsection: 2.3 Synthetic Training Data | Due to limited access and annotation restrictions, obtaining sentence-level labeled clinical datasets is chal...\n", + "\n", + "✅ Document title successfully integrated into context strings!\n" + ] + } + ], + "execution_count": 14 }, { + "metadata": {}, "cell_type": "code", - "execution_count": null, "outputs": [], - "source": [], - "metadata": { - "collapsed": false - } + "execution_count": null, + "source": "" } ], "metadata": { diff --git a/notebooks/test_verbatim_rag_integration.ipynb b/notebooks/test_verbatim_rag_integration.ipynb index 28f3978..3a1d619 100644 --- a/notebooks/test_verbatim_rag_integration.ipynb +++ b/notebooks/test_verbatim_rag_integration.ipynb @@ -6,60 +6,74 @@ "source": [ "# VerbatimRAG + Context-Enriched Integration Test\n", "\n", - "This notebook tests the full integration of ContextEnrichedProcessor with the VerbatimRAG system." + "This notebook tests the full integration of ContextEnrichedProcessor with the VerbatimRAG system.\n", + "It demonstrates how hierarchical context enrichment improves retrieval accuracy and maintains\n", + "verbatim span extraction capabilities." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup" + "## Setup\n", + "\n", + "Initialize the environment and load required dependencies." ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Project root: /Users/paulschmitt/DataspellProjects/verbatim-rag\n", - "✅ Setup complete\n" - ] + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T13:53:52.968723Z", + "start_time": "2025-08-07T13:53:52.960163Z" } - ], + }, "source": [ "import sys\n", "import os\n", "from pathlib import Path\n", + "from dotenv import load_dotenv\n", "\n", "# Fix OpenMP conflict\n", "os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'\n", - "os.environ['OPENAI_API_KEY'] = ''\n", "\n", "# Add project root to path\n", "project_root = Path().absolute().parent\n", "sys.path.append(str(project_root))\n", "\n", + "# Load environment variables from .env.local\n", + "load_dotenv(project_root / '.env.local', override=True)\n", + "\n", + "# Check if an API key is loaded\n", + "if not os.environ.get('OPENAI_API_KEY'):\n", + " print(\"⚠️ Warning: OPENAI_API_KEY not found. Please set it in .env.local\")\n", + "else:\n", + " print(\"✅ OpenAI API key loaded\")\n", + "\n", "print(f\"Project root: {project_root}\")\n", "print(\"✅ Setup complete\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, + ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "✅ Imports successful\n" + "✅ OpenAI API key loaded\n", + "Project root: /Users/paulschmitt/DataspellProjects/verbatim-rag\n", + "✅ Setup complete\n" ] } ], + "execution_count": 1 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T13:53:57.488176Z", + "start_time": "2025-08-07T13:53:53.080568Z" + } + }, "source": [ "from verbatim_rag.ingestion.context_enriched_processor import ContextEnrichedProcessor\n", "from verbatim_rag.core import VerbatimRAG\n", @@ -67,50 +81,40 @@ "from pprint import pprint\n", "\n", "print(\"✅ Imports successful\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Imports successful\n" + ] + } + ], + "execution_count": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Test 1: Process Document with Context Enrichment" + "## Test 1: Process Document with Context Enrichment\n", + "\n", + "This test verifies that the ContextEnrichedProcessor can successfully process a PDF document \n", + "and create context-enriched chunks. Each chunk will include hierarchical context information \n", + "(section paths, titles, etc.) that will help with more accurate retrieval." ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📄 Processing document with context enrichment...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n", - " warnings.warn(warn_msg)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Document processed successfully!\n", - " Title: Verbatim RAG ACL Paper\n", - " Chunks: 78\n", - " Content type: DocumentType.PDF\n", - " Context-enriched chunks: 78\n" - ] + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T13:54:22.788509Z", + "start_time": "2025-08-07T13:53:57.501443Z" } - ], + }, "source": [ "# Test document path\n", - "pdf_path = project_root / \"data\" / \"acl_papers\" / \"VERBATIM_RAG_ACL.pdf\"\n", + "pdf_path = project_root / \"data\" / \"acl_papers\" / \"Lexical_grammar_induction.pdf\"\n", "\n", "# Create context-enriched processor optimized for RAG\n", "processor = ContextEnrichedProcessor.for_rag(\n", @@ -120,7 +124,7 @@ "\n", "# Process document\n", "print(\"📄 Processing document with context enrichment...\")\n", - "document = processor.process_file(pdf_path, title=\"Verbatim RAG ACL Paper\")\n", + "document = processor.process_file(pdf_path, title=\"Lexical Grammar Induction\")\n", "\n", "print(f\"✅ Document processed successfully!\")\n", "print(f\" Title: {document.title}\")\n", @@ -130,39 +134,63 @@ "# Show chunk types\n", "enriched_chunks = [c for c in document.chunks if hasattr(c, 'section_path')]\n", "print(f\" Context-enriched chunks: {len(enriched_chunks)}\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Processing document with context enrichment...\n", + "✅ Document processed successfully!\n", + " Title: Lexical Grammar Induction\n", + " Chunks: 124\n", + " Content type: DocumentType.PDF\n", + " Context-enriched chunks: 124\n" + ] + } + ], + "execution_count": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Test 2: Create VerbatimIndex with Context-Enriched Chunks" + "## Test 2: Create VerbatimIndex with Context-Enriched Chunks\n", + "\n", + "This test creates a VerbatimIndex using the context-enriched document chunks from Test 1. \n", + "The index will store both the embeddings and the hierarchical context information, \n", + "enabling more precise document retrieval." ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T13:54:28.951016Z", + "start_time": "2025-08-07T13:54:22.800244Z" + } + }, + "source": [ + "# Create VerbatimIndex with context-enriched chunks\n", + "print(\"🗂️ Creating VerbatimIndex with context-enriched chunks...\")\n", + "\n", + "# Initialize index with OpenAI embeddings and FAISS vector store\n", + "index = VerbatimIndex(dense_model=\"all-MiniLM-L6-v2\")\n", + "\n", + "# Add the context-enriched document to the index (using add_documents method)\n", + "print(\"📝 Adding document to index...\")\n", + "index.add_documents([document])\n", + "\n", + "print(f\"✅ Index created successfully!\")\n", + "print(f\" Vector store type: {type(index.vector_store).__name__}\")\n", + "print(f\" Embedding provider: {type(index.dense_provider).__name__}\")" + ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🗂️ Creating VerbatimIndex with context-enriched chunks...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/paulschmitt/miniforge3/envs/verbatim-rag-2/lib/python3.10/site-packages/milvus_lite/__init__.py:15: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", - " from pkg_resources import DistributionNotFound, get_distribution\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "🗂️ Creating VerbatimIndex with context-enriched chunks...\n", "📝 Adding document to index...\n", "✅ Index created successfully!\n", " Vector store type: LocalMilvusStore\n", @@ -170,19 +198,44 @@ ] } ], - "source": "# Create VerbatimIndex with context-enriched chunks\nprint(\"🗂️ Creating VerbatimIndex with context-enriched chunks...\")\n\n# Initialize index with OpenAI embeddings and FAISS vector store\nindex = VerbatimIndex(dense_model=\"all-MiniLM-L6-v2\")\n\n# Add the context-enriched document to the index (using add_documents method)\nprint(\"📝 Adding document to index...\")\nindex.add_documents([document])\n\nprint(f\"✅ Index created successfully!\")\nprint(f\" Vector store type: {type(index.vector_store).__name__}\")\nprint(f\" Embedding provider: {type(index.dense_provider).__name__}\")" + "execution_count": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Test 3: Initialize VerbatimRAG System" + "## Test 3: Initialize VerbatimRAG System\n", + "\n", + "This test initializes the complete VerbatimRAG system using the context-enriched index. \n", + "It verifies that the RAG system can properly integrate with the hierarchically structured \n", + "document chunks." ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T13:54:28.979875Z", + "start_time": "2025-08-07T13:54:28.960546Z" + } + }, + "source": [ + "# Initialize VerbatimRAG with the context-enriched index\n", + "print(\"🤖 Initializing VerbatimRAG system...\")\n", + "\n", + "rag = VerbatimRAG(\n", + " index=index # Pass the index as required parameter\n", + ")\n", + "\n", + "print(\"✅ VerbatimRAG initialized successfully!\")\n", + "\n", + "# Test that the index is working by doing a simple search\n", + "try:\n", + " test_results = index.search(\"verbatim\", k=3)\n", + " print(f\" Index working: Found {len(test_results)} results for test query\")\n", + "except Exception as e:\n", + " print(f\" Index test failed: {e}\")" + ], "outputs": [ { "name": "stdout", @@ -194,19 +247,66 @@ ] } ], - "source": "# Initialize VerbatimRAG with the context-enriched index\nprint(\"🤖 Initializing VerbatimRAG system...\")\n\nrag = VerbatimRAG(\n index=index # Pass the index as required parameter\n)\n\nprint(\"✅ VerbatimRAG initialized successfully!\")\n\n# Test that the index is working by doing a simple search\ntry:\n test_results = index.search(\"verbatim\", k=3)\n print(f\" Index working: Found {len(test_results)} results for test query\")\nexcept Exception as e:\n print(f\" Index test failed: {e}\")" + "execution_count": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Test 4: Query with Context-Enriched Retrieval" + "## Test 4: Query with Context-Enriched Retrieval\n", + "\n", + "This test performs queries against the VerbatimRAG system to verify that context-enriched \n", + "chunks improve retrieval accuracy. We test multiple types of queries to demonstrate how \n", + "hierarchical context helps with finding relevant information." ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T13:55:14.825059Z", + "start_time": "2025-08-07T13:54:28.993605Z" + } + }, + "source": [ + "# Test queries that should benefit from hierarchical context\n", + "test_queries = [\n", + " \"What dataset was used in this study?\",\n", + " \"What are the limitations of standard RAG systems?\", \n", + " \"How does the method work?\",\n", + " \"What evaluation metrics were used?\",\n", + " \"What are the main contributions of this work?\"\n", + "]\n", + "\n", + "print(\"🔍 Testing queries with context-enriched retrieval...\")\n", + "\n", + "for i, query in enumerate(test_queries, 1):\n", + " print(f\"\\n--- Query {i} ---\")\n", + " print(f\"Question: {query}\")\n", + " \n", + " try:\n", + " # Get response from VerbatimRAG\n", + " response = rag.query(question=query)\n", + " \n", + " print(f\"Answer: {response.answer[:200]}...\")\n", + " print(f\"Source documents: {len(response.documents)} documents cited\")\n", + " \n", + " # Show retrieved documents with their context\n", + " print(\"Retrieved documents:\")\n", + " for j, doc in enumerate(response.documents[:2]):\n", + " print(f\" {j+1}. Document: '{doc.title}'\")\n", + " if hasattr(doc, 'highlights') and doc.highlights:\n", + " print(f\" Highlights: {len(doc.highlights)} spans\")\n", + " for k, highlight in enumerate(doc.highlights[:1]):\n", + " print(f\" - {highlight.text[:80]}...\")\n", + " else:\n", + " print(f\" Content preview: {doc.content[:80] if hasattr(doc, 'content') else 'N/A'}...\")\n", + " \n", + " except Exception as e:\n", + " print(f\"❌ Error: {e}\")\n", + " \n", + " print(\"-\" * 50)" + ], "outputs": [ { "name": "stdout", @@ -218,182 +318,196 @@ "Question: What dataset was used in this study?\n", "Answer: Thanks for your question! Based on the documents, here are the key points:\n", "\n", - "• Clinical ModernBERT\n", - "• EHR snippets, clinician-style questions, and sentence relevance annotations\n", - "• LLM (gemma-3-27b-it)\n", - "•...\n", + "• Dataset\n", + "• 6 datasets\n", + "• dataset\n", + "• Russian and Spanish out-of-domain datasets derived from Wikipedia\n", + "• the two datasets\n", + "• BM...\n", "Source documents: 5 documents cited\n", "Retrieved documents:\n", - " 1. Document: ''\n", - " Highlights: 2 spans\n", - " - Clinical ModernBERT...\n", - " - Clinical ModernBERT...\n", - " 2. Document: ''\n", - " Highlights: 1 spans\n", - " - EHR snippets, clinician-style questions, and sentence relevance annotations...\n", - " 3. Document: ''\n", + " 1. Document: 'Lexical Grammar Induction'\n", " Highlights: 1 spans\n", - " - LLM (gemma-3-27b-it)...\n", + " - Dataset...\n", + " 2. Document: 'Lexical Grammar Induction'\n", + " Highlights: 3 spans\n", + " - Russian and Spanish out-of-domain datasets derived from Wikipedia...\n", "--------------------------------------------------\n", "\n", "--- Query 2 ---\n", "Question: What are the limitations of standard RAG systems?\n", "Answer: Thanks for your question! Based on the documents, here are the key points:\n", "\n", - "• Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contradictory information\n", - "• E...\n", + "• the number of patterns matching a subgraph of N +1 nodes increases from 2 N to 3 N\n", + "• the number of IRTG rules that under o...\n", "Source documents: 5 documents cited\n", "Retrieved documents:\n", - " 1. Document: ''\n", - " Highlights: 3 spans\n", - " - Standard RAG models, despite external grounding, still frequently hallucinate unsupported or contrad...\n", - " - or classifiers trained on hallucination corpora such as RAGTruth...\n", - " 2. Document: ''\n", - " Highlights: 4 spans\n", - " - meaning the purely verbatim property was not consistently maintained across all answers...\n", - " - our approach often required summarization after the initial verbatim insertion step...\n", - " 3. Document: ''\n", - " Content preview: Section: 7 Conclusion | generation. Scientific Data , 10(1):586.\n", - "\n", - "Tianyi Zhang, Varsha Kishore, Feli...\n", + " 1. Document: 'Lexical Grammar Induction'\n", + " Highlights: 2 spans\n", + " - the number of IRTG rules that under our original approach would be included in a...\n", + " 2. Document: 'Lexical Grammar Induction'\n", + " Highlights: 1 spans\n", + " - but exploring the effects of weighting schemes that could in some cases countera...\n", "--------------------------------------------------\n", "\n", "--- Query 3 ---\n", "Question: How does the method work?\n", "Answer: Thanks for your question! Based on the documents, here are the key points:\n", "\n", - "• method\n", - "• Answer Generation\n", - "• An example summarization of the answer from Figure 2 is illustrated in Figure 3.\n", - "• Synthetic ...\n", + "• the graph operations building the particular pattern\n", + "• the string operations concatenating the corresponding words in the ...\n", "Source documents: 5 documents cited\n", "Retrieved documents:\n", - " 1. Document: ''\n", - " Highlights: 1 spans\n", - " - method...\n", - " 2. Document: ''\n", + " 1. Document: 'Lexical Grammar Induction'\n", " Highlights: 2 spans\n", - " - An example summarization of the answer from Figure 2 is illustrated in Figure 3....\n", - " - Answer Generation...\n", - " 3. Document: ''\n", - " Highlights: 1 spans\n", - " - Synthetic Data Generation...\n", + " - the string operations concatenating the corresponding words in the order that is...\n", + " 2. Document: 'Lexical Grammar Induction'\n", + " Highlights: 2 spans\n", + " - The word reordering task is then equivalent to parsing graphs and decoding strin...\n", "--------------------------------------------------\n", "\n", "--- Query 4 ---\n", "Question: What evaluation metrics were used?\n", "Answer: Thanks for your question! Based on the documents, here are the key points:\n", "\n", - "• Table 2 summarizes these metrics\n", - "• factuality recall (56.8% strict, 56.6% lenient)\n", - "• Relevance is evaluating how closely g...\n", + "• automatic evaluation metrics\n", + "• average scores (Ave)\n", + "• average standardized scores (Ave. z)\n", + "• meaning similarity\n", + "• readabil...\n", "Source documents: 5 documents cited\n", "Retrieved documents:\n", - " 1. Document: ''\n", - " Highlights: 2 spans\n", - " - factuality recall (56.8% strict, 56.6% lenient)...\n", - " - Table 2 summarizes these metrics...\n", - " 2. Document: ''\n", - " Highlights: 3 spans\n", - " - BERTScore (Zhang et al., 2020)...\n", - " - MEDCON (Yim et al., 2023)...\n", - " 3. Document: ''\n", - " Highlights: 13 spans\n", - " - F1...\n", - " - F1...\n", + " 1. Document: 'Lexical Grammar Induction'\n", + " Highlights: 1 spans\n", + " - automatic evaluation metrics...\n", + " 2. Document: 'Lexical Grammar Induction'\n", + " Highlights: 4 spans\n", + " - average standardized scores (Ave. z)...\n", "--------------------------------------------------\n", "\n", "--- Query 5 ---\n", "Question: What are the main contributions of this work?\n", "Answer: Thanks for your question! Based on the documents, here are the key points:\n", "\n", - "• method (Section 3)\n", - "• evaluation (Section 4)\n", - "• Section: 3 Method\n", - "• Subsection: 3.1 System Overview\n", - "• extracted sentences\n", - "• ...\n", + "• The 2020 Surface Realization Shared Task (Mille et al., 2020) involves mapping Universal Dependency representations to raw...\n", "Source documents: 5 documents cited\n", "Retrieved documents:\n", - " 1. Document: ''\n", - " Highlights: 2 spans\n", - " - evaluation (Section 4)...\n", - " - method (Section 3)...\n", - " 2. Document: ''\n", - " Highlights: 4 spans\n", - " - If exceeding 75 words, answers are compressed via a summarization prompt, preserving sentence-level ...\n", - " - Subsection: 3.1 System Overview...\n", - " 3. Document: ''\n", - " Highlights: 1 spans\n", - " - Are self-explanations from large language models faithful?...\n", + " 1. Document: 'Lexical Grammar Induction'\n", + " Highlights: 3 spans\n", + " - The input data in the shallow track consists of UD-annotated sentences for 11 la...\n", + " 2. Document: 'Lexical Grammar Induction'\n", + " Highlights: 3 spans\n", + " - extend the word order restoration component of Kov´ acs et al. (2019) by making ...\n", "--------------------------------------------------\n" ] } ], - "source": "# Test queries that should benefit from hierarchical context\ntest_queries = [\n \"What dataset was used in this study?\",\n \"What are the limitations of standard RAG systems?\", \n \"How does the method work?\",\n \"What evaluation metrics were used?\",\n \"What are the main contributions of this work?\"\n]\n\nprint(\"🔍 Testing queries with context-enriched retrieval...\")\n\nfor i, query in enumerate(test_queries, 1):\n print(f\"\\n--- Query {i} ---\")\n print(f\"Question: {query}\")\n \n try:\n # Get response from VerbatimRAG\n response = rag.query(question=query)\n \n print(f\"Answer: {response.answer[:200]}...\")\n print(f\"Source documents: {len(response.documents)} documents cited\")\n \n # Show retrieved documents with their context\n print(\"Retrieved documents:\")\n for j, doc in enumerate(response.documents[:3]):\n print(f\" {j+1}. Document: '{doc.title}'\")\n if hasattr(doc, 'highlights') and doc.highlights:\n print(f\" Highlights: {len(doc.highlights)} spans\")\n for k, highlight in enumerate(doc.highlights[:2]):\n print(f\" - {highlight.text[:100]}...\")\n else:\n print(f\" Content preview: {doc.content[:100] if hasattr(doc, 'content') else 'N/A'}...\")\n \n except Exception as e:\n print(f\"❌ Error: {e}\")\n \n print(\"-\" * 50)" + "execution_count": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Test 5: Compare Context vs Non-Context Retrieval" + "## Test 5: Analyze Context Benefits\n", + "\n", + "This test analyzes the search results to understand how context enrichment affects chunk \n", + "retrieval. It examines the retrieved chunks' metadata and content to demonstrate the \n", + "benefits of hierarchical context." ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T13:55:14.872021Z", + "start_time": "2025-08-07T13:55:14.849286Z" + } + }, "source": [ "# Test specific query to compare context benefits\n", "query = \"What are the limitations mentioned in the paper?\"\n", "\n", - "print(f\"🔬 Comparative Analysis: '{query}'\")\n", + "print(f\"🔬 Context Analysis: '{query}'\")\n", "print(\"=\" * 60)\n", "\n", "# Retrieve top chunks\n", "try:\n", - " results = index.search(query, k=10)\n", + " results = index.search(query, k=5)\n", " \n", " print(f\"\\n📊 Retrieved {len(results)} chunks:\")\n", " \n", - " for i, (chunk, score) in enumerate(results[:5]):\n", - " print(f\"\\n{i+1}. Score: {score:.3f}\")\n", + " for i, result in enumerate(results, 1):\n", + " print(f\"\\n{i}. Score: {result.score:.3f}\")\n", + " print(f\" Content: {result.text[:120]}...\")\n", " \n", - " if hasattr(chunk, 'section_path') and chunk.section_path:\n", - " context = \" → \".join(chunk.section_path)\n", - " print(f\" Context: {context}\")\n", - " print(f\" Content: {chunk.content[:150]}...\")\n", - " \n", - " # Show how context helped\n", - " enhanced = chunk.get_enhanced_content()\n", - " context_match = \"limitations\" in chunk.context_string.lower()\n", - " content_match = \"limitations\" in chunk.content.lower()\n", - " \n", - " match_type = []\n", - " if context_match: match_type.append(\"Context\")\n", - " if content_match: match_type.append(\"Content\")\n", - " \n", - " print(f\" Match type: {' + '.join(match_type) if match_type else 'Other'}\")\n", - " else:\n", - " print(f\" Content: {chunk.content[:150]}...\")\n", + " # Check metadata for context information\n", + " if result.metadata:\n", + " if 'title' in result.metadata:\n", + " print(f\" Document: {result.metadata['title']}\")\n", + " if 'chunk_type' in result.metadata:\n", + " print(f\" Chunk type: {result.metadata['chunk_type']}\")\n", " \n", "except Exception as e:\n", " print(f\"❌ Search error: {e}\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔬 Context Analysis: 'What are the limitations mentioned in the paper?'\n", + "============================================================\n", + "\n", + "📊 Retrieved 5 chunks:\n", + "\n", + "1. Score: 0.257\n", + " Content: Lexical Grammar Induction | Section: 3 SZTAKI Institute of Computer Science andras@kornai.com | Subsection: 3.1 Motivati...\n", + " Document: Lexical Grammar Induction\n", + " Chunk type: paragraph\n", + "\n", + "2. Score: 0.228\n", + " Content: Lexical Grammar Induction | Section: 4 Hierarchical surface realization | or which parsing did not finish in 60 seconds,...\n", + " Document: Lexical Grammar Induction\n", + " Chunk type: section\n", + "\n", + "3. Score: 0.217\n", + " Content: Lexical Grammar Induction | Section: 5 Overall architecture | periments. In case of a timeout, a new IRTG is generated k...\n", + " Document: Lexical Grammar Induction\n", + " Chunk type: section\n", + "\n", + "4. Score: 0.209\n", + " Content: Lexical Grammar Induction | Section: 4 Hierarchical surface realization | rguments of a predicate separated by an interv...\n", + " Document: Lexical Grammar Induction\n", + " Chunk type: section\n", + "\n", + "5. Score: 0.205\n", + " Content: Lexical Grammar Induction | Section: 3 SZTAKI Institute of Computer Science andras@kornai.com | Subsection: 3.1 Motivati...\n", + " Document: Lexical Grammar Induction\n", + " Chunk type: paragraph\n" + ] + } + ], + "execution_count": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Test 6: Span Extraction with Context" + "## Test 6: Span Extraction with Context\n", + "\n", + "This test verifies that the VerbatimRAG span extraction functionality works correctly \n", + "with context-enriched chunks. It ensures that the hierarchical context doesn't interfere \n", + "with the verbatim span extraction process and that citations are properly generated." ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-07T13:55:25.308994Z", + "start_time": "2025-08-07T13:55:14.886642Z" + } + }, "source": [ "# Test span extraction to ensure context doesn't interfere\n", "query = \"What evaluation metrics were used?\"\n", @@ -402,85 +516,67 @@ "print(\"=\" * 50)\n", "\n", "try:\n", - " # Get full response with span extraction\n", - " response = rag.query(\n", - " question=query,\n", - " max_chunks=3,\n", - " extract_spans=True\n", - " )\n", + " # Get a full response with span extraction\n", + " response = rag.query(question=query)\n", " \n", - " print(f\"\\n📝 Answer: {response.answer}\")\n", - " print(f\"\\n📚 Citations ({len(response.citations)}):\")\n", + " print(f\"\\n📝 Answer: {response.answer[:200]}...\")\n", + " print(f\"\\n📚 Citations ({len(response.structured_answer.citations)}):\")\n", " \n", - " for i, citation in enumerate(response.citations):\n", - " chunk = index.get_chunk_by_id(citation.chunk_id)\n", - " \n", + " for i, citation in enumerate(response.structured_answer.citations[:5]):\n", " print(f\"\\n{i+1}. Citation:\")\n", - " if chunk and hasattr(chunk, 'section_path'):\n", - " context = \" → \".join(chunk.section_path)\n", - " print(f\" Section: {context}\")\n", - " \n", - " print(f\" Extracted span: {citation.text}\")\n", - " print(f\" Relevance: {citation.relevance_score:.3f}\")\n", - " \n", - " if hasattr(citation, 'span_start') and hasattr(citation, 'span_end'):\n", - " print(f\" Span position: {citation.span_start}-{citation.span_end}\")\n", + " print(f\" Extracted span: {citation.text[:100]}...\")\n", + " print(f\" Document index: {citation.doc_index}\")\n", + " print(f\" Highlight index: {citation.highlight_index}\")\n", " \n", "except Exception as e:\n", " print(f\"❌ Span extraction error: {e}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test Results Summary" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"📋 VerbatimRAG + Context-Enriched Integration Summary\")\n", - "print(\"=\" * 60)\n", - "\n", - "# Collect statistics\n", - "total_chunks = len(document.chunks)\n", - "enriched_chunks = len([c for c in document.chunks if hasattr(c, 'section_path')])\n", - "index_chunks = len(index.get_all_chunks())\n", - "\n", - "print(f\"\\n✅ Integration Test Results:\")\n", - "print(f\" 🔄 Document processing: SUCCESS\")\n", - "print(f\" 📊 Index creation: SUCCESS\")\n", - "print(f\" 🤖 VerbatimRAG initialization: SUCCESS\")\n", - "print(f\" 🔍 Query processing: {'SUCCESS' if 'response' in locals() else 'PENDING'}\")\n", - "\n", - "print(f\"\\n📈 Statistics:\")\n", - "print(f\" 📄 Total chunks: {total_chunks}\")\n", - "print(f\" 🏷️ Context-enriched: {enriched_chunks} ({enriched_chunks/total_chunks*100:.1f}%)\")\n", - "print(f\" 🗂️ Indexed chunks: {index_chunks}\")\n", - "\n", - "# Show section distribution\n", - "sections = {}\n", - "for chunk in document.chunks:\n", - " if hasattr(chunk, 'section_path') and chunk.section_path:\n", - " main_section = chunk.section_path[0]\n", - " sections[main_section] = sections.get(main_section, 0) + 1\n", - "\n", - "print(f\"\\n🌳 Section Coverage ({len(sections)} sections):\")\n", - "for section, count in sorted(sections.items()):\n", - " print(f\" {section}: {count} chunks\")\n", - "\n", - "print(f\"\\n🎯 Key Benefits Demonstrated:\")\n", - "print(f\" ✅ Hierarchical context preserved in embeddings\")\n", - "print(f\" ✅ Section-aware retrieval working\")\n", - "print(f\" ✅ VerbatimRAG pipeline compatibility confirmed\")\n", - "print(f\" ✅ Span extraction working with context\")\n", - "\n", - "print(f\"\\n🚀 Ready for production deployment!\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🎯 Span Extraction Test: 'What evaluation metrics were used?'\n", + "==================================================\n", + "\n", + "📝 Answer: Thanks for your question! Based on the documents, here are the key points:\n", + "\n", + "• using automatic evaluation metrics\n", + "• average scores (Ave)\n", + "• average standardized scores (Ave. z)\n", + "• meaning similarity\n", + "• re...\n", + "\n", + "📚 Citations (12):\n", + "\n", + "1. Citation:\n", + " Extracted span: using automatic evaluation metrics...\n", + " Document index: 0\n", + " Highlight index: 0\n", + "\n", + "2. Citation:\n", + " Extracted span: average standardized scores (Ave. z)...\n", + " Document index: 1\n", + " Highlight index: 0\n", + "\n", + "3. Citation:\n", + " Extracted span: readability evaluations...\n", + " Document index: 1\n", + " Highlight index: 1\n", + "\n", + "4. Citation:\n", + " Extracted span: average scores (Ave)...\n", + " Document index: 1\n", + " Highlight index: 2\n", + "\n", + "5. Citation:\n", + " Extracted span: meaning similarity...\n", + " Document index: 1\n", + " Highlight index: 3\n" + ] + } + ], + "execution_count": 8 } ], "metadata": { @@ -499,7 +595,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.0" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/verbatim_rag/core.py b/verbatim_rag/core.py index 3978f8b..1fa6c69 100644 --- a/verbatim_rag/core.py +++ b/verbatim_rag/core.py @@ -133,6 +133,7 @@ def query(self, question: str) -> QueryResponse: answer=answer, search_results=search_results, relevant_spans=relevant_spans, + index=self.index, ) async def _generate_template_async(self, question: str) -> str: @@ -172,4 +173,5 @@ async def query_async(self, question: str) -> QueryResponse: answer=answer, search_results=search_results, relevant_spans=relevant_spans, + index=self.index, ) diff --git a/verbatim_rag/ingestion/context_enriched_processor.py b/verbatim_rag/ingestion/context_enriched_processor.py index 10dea3d..a3c6269 100644 --- a/verbatim_rag/ingestion/context_enriched_processor.py +++ b/verbatim_rag/ingestion/context_enriched_processor.py @@ -125,7 +125,8 @@ def _enrich_document_with_context(self, document: Document) -> Document: enriched_chunks = self._create_context_enriched_chunks( document.raw_content, document.id, - sections + sections, + document.title or "Untitled Document" ) # Replace original chunks with enriched ones @@ -179,7 +180,8 @@ def _create_context_enriched_chunks( self, content: str, document_id: str, - sections: List[Dict[str, Any]] + sections: List[Dict[str, Any]], + document_title: str ) -> List[ContextEnrichedChunk]: """Create context-enriched chunks from content and section structure.""" @@ -222,8 +224,8 @@ def _create_context_enriched_chunks( if not chunk_content.strip(): continue - # Create context string - context_string = self._build_context_string(section_path) + # Create context string with document title + context_string = self._build_context_string(section_path, document_title) # Determine chunk type chunk_type = ChunkType.SECTION if section['level'] == 1 else ChunkType.PARAGRAPH @@ -291,13 +293,17 @@ def _build_section_path( return section_path - def _build_context_string(self, section_path: List[str]) -> str: - """Build context string from section path.""" - if not section_path: + def _build_context_string(self, section_path: List[str], document_title: str = "") -> str: + """Build context string from section path and document title.""" + if not section_path and not document_title: return "" - # Create hierarchical labels + # Start with document title if provided context_parts = [] + if document_title: + context_parts.append(document_title) + + # Add hierarchical section labels for i, section in enumerate(section_path): if i == 0: label = "Section" diff --git a/verbatim_rag/response_builder.py b/verbatim_rag/response_builder.py index 7f6765b..c3bc566 100644 --- a/verbatim_rag/response_builder.py +++ b/verbatim_rag/response_builder.py @@ -28,6 +28,7 @@ def build_response( answer: str, search_results: List[Any], relevant_spans: Dict[str, List[str]], + index: Any = None, ) -> QueryResponse: """ Build a complete QueryResponse with proper highlighting and citations @@ -37,6 +38,7 @@ def build_response( answer: The generated answer search_results: List of search results relevant_spans: Dictionary mapping result text to relevant spans + index: Optional VerbatimIndex for looking up document metadata Returns: Complete QueryResponse with highlights and citations @@ -65,9 +67,23 @@ def build_response( ) all_citations.append(citation) + # Get document metadata if available (like in streaming.py) + title = "" + source = "" + metadata = {} + + if hasattr(result, 'metadata') and result.metadata: + title = result.metadata.get('title', '') + source = result.metadata.get('source', '') + metadata = result.metadata + # Create document with highlights document_with_highlights = DocumentWithHighlights( - content=result_content, highlights=highlights + content=result_content, + highlights=highlights, + title=title, + source=source, + metadata=metadata ) documents_with_highlights.append(document_with_highlights)