Pulling in docling-core, docling, pymupdf fixes (#1313)

jamesbraza · web-flow · commit 4c9050fe13b0 · 2026-03-10T17:13:07.000-07:00
diff --git a/packages/paper-qa-docling/pyproject.toml b/packages/paper-qa-docling/pyproject.toml
@@ -19,8 +19,7 @@ classifiers = [
 ]
 dependencies = [
     "docling-core>=2",  # Pin for v2 with DocItem, TextItem, etc.
-    "docling-parse<5",  # Downpin for https://github.com/docling-project/docling-parse/issues/226
-    "docling>=2",  # Pin for v2 introducing PdfPipelineOptions
+    "docling>=2.74",  # Pin for moving to DoclingParseDocumentBackend in https://github.com/docling-project/docling/pull/2872
     "paper-qa",
 ]
 description = "PaperQA readers implemented using Docling"
@@ -38,7 +37,6 @@ requires-python = ">=3.11"
 [project.optional-dependencies]
 dev = [
     "docling-ibm-models[opencv-python-headless]>=3.10.0",  # Lower pin and specify opencv after https://github.com/docling-project/docling-ibm-models/pull/130
-    "docling>=2.63",  # Pin for StandardPdfPipeline timeout fix
     "fhlmi>=0.39",  # Pin for bytes_to_string
     "paper-qa>=5.23",  # Pin for PDFParserFn
     "pytest-asyncio",
diff --git a/packages/paper-qa-docling/src/paperqa_docling/reader.py b/packages/paper-qa-docling/src/paperqa_docling/reader.py
@@ -8,7 +8,7 @@
 from typing import TYPE_CHECKING, Any, cast
 
 import docling
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import DEFAULT_PAGE_RANGE
@@ -43,7 +43,7 @@ def parse_pdf_to_pages(  # noqa: PLR0912
     pipeline_cls: type = StandardPdfPipeline,
     dpi: int | None = None,
     custom_pipeline_options: Mapping[str, Any] | None = None,
-    backend: "type[AbstractDocumentBackend]" = DoclingParseV4DocumentBackend,
+    backend: "type[AbstractDocumentBackend]" = DoclingParseDocumentBackend,
     **_,
 ) -> ParsedText:
     """Parse a PDF.
@@ -62,7 +62,7 @@ def parse_pdf_to_pages(  # noqa: PLR0912
         page_range: Optional start_page or two-tuple of inclusive (start_page, end_page)
             to parse only specific pages, where pages are one-indexed.
             Leaving as the default of None will parse all pages.
-        backend: PDF backend class to use for parsing, defaults to docling-parse v4.
+        backend: PDF backend class to use for parsing, defaults to docling-parse.
         **_: Thrown away kwargs.
     """
     path = Path(path)
diff --git a/packages/paper-qa-pymupdf/pyproject.toml b/packages/paper-qa-pymupdf/pyproject.toml
@@ -35,7 +35,6 @@ requires-python = ">=3.11"
 
 [project.optional-dependencies]
 dev = [
-    "PyMuPDF<1.27",  # Downpin for typing bug in https://github.com/pymupdf/PyMuPDF/issues/4903
     "fhlmi>=0.39",  # Pin for bytes_to_string
     "paper-qa>=5.23",  # Pin for PDFParserFn
     "pytest-asyncio",
diff --git a/packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py b/packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
@@ -8,6 +8,7 @@
 from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
 from paperqa.utils import ImpossibleParsingError, clean_invalid_unicode
 from pydantic import JsonValue
+from pymupdf.table import find_tables
 
 
 def setup_pymupdf_python_logging() -> None:
@@ -218,7 +219,7 @@ def parse_pdf_to_pages(
                         )
 
                     # Capture tables
-                    for table_i, table in enumerate(t for t in page.find_tables()):
+                    for table_i, table in enumerate(find_tables(page)):
                         pix = page.get_pixmap(clip=table.bbox, dpi=dpi)
                         media_metadata = {
                             "bbox": tuple(table.bbox),
diff --git a/uv.lock b/uv.lock