Skip to content

Commit 4c9050f

Browse files
authored
Pulling in docling-core, docling, pymupdf fixes (#1313)
1 parent 438982e commit 4c9050f

File tree

5 files changed

+43
-47
lines changed

5 files changed

+43
-47
lines changed

packages/paper-qa-docling/pyproject.toml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,7 @@ classifiers = [
1919
]
2020
dependencies = [
2121
"docling-core>=2", # Pin for v2 with DocItem, TextItem, etc.
22-
"docling-parse<5", # Downpin for https://github.com/docling-project/docling-parse/issues/226
23-
"docling>=2", # Pin for v2 introducing PdfPipelineOptions
22+
"docling>=2.74", # Pin for moving to DoclingParseDocumentBackend in https://github.com/docling-project/docling/pull/2872
2423
"paper-qa",
2524
]
2625
description = "PaperQA readers implemented using Docling"
@@ -38,7 +37,6 @@ requires-python = ">=3.11"
3837
[project.optional-dependencies]
3938
dev = [
4039
"docling-ibm-models[opencv-python-headless]>=3.10.0", # Lower pin and specify opencv after https://github.com/docling-project/docling-ibm-models/pull/130
41-
"docling>=2.63", # Pin for StandardPdfPipeline timeout fix
4240
"fhlmi>=0.39", # Pin for bytes_to_string
4341
"paper-qa>=5.23", # Pin for PDFParserFn
4442
"pytest-asyncio",

packages/paper-qa-docling/src/paperqa_docling/reader.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from typing import TYPE_CHECKING, Any, cast
99

1010
import docling
11-
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
11+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
1212
from docling.datamodel.base_models import ConversionStatus
1313
from docling.datamodel.pipeline_options import PdfPipelineOptions
1414
from docling.datamodel.settings import DEFAULT_PAGE_RANGE
@@ -43,7 +43,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
4343
pipeline_cls: type = StandardPdfPipeline,
4444
dpi: int | None = None,
4545
custom_pipeline_options: Mapping[str, Any] | None = None,
46-
backend: "type[AbstractDocumentBackend]" = DoclingParseV4DocumentBackend,
46+
backend: "type[AbstractDocumentBackend]" = DoclingParseDocumentBackend,
4747
**_,
4848
) -> ParsedText:
4949
"""Parse a PDF.
@@ -62,7 +62,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
6262
page_range: Optional start_page or two-tuple of inclusive (start_page, end_page)
6363
to parse only specific pages, where pages are one-indexed.
6464
Leaving as the default of None will parse all pages.
65-
backend: PDF backend class to use for parsing, defaults to docling-parse v4.
65+
backend: PDF backend class to use for parsing, defaults to docling-parse.
6666
**_: Thrown away kwargs.
6767
"""
6868
path = Path(path)

packages/paper-qa-pymupdf/pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ requires-python = ">=3.11"
3535

3636
[project.optional-dependencies]
3737
dev = [
38-
"PyMuPDF<1.27", # Downpin for typing bug in https://github.com/pymupdf/PyMuPDF/issues/4903
3938
"fhlmi>=0.39", # Pin for bytes_to_string
4039
"paper-qa>=5.23", # Pin for PDFParserFn
4140
"pytest-asyncio",

packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
99
from paperqa.utils import ImpossibleParsingError, clean_invalid_unicode
1010
from pydantic import JsonValue
11+
from pymupdf.table import find_tables
1112

1213

1314
def setup_pymupdf_python_logging() -> None:
@@ -218,7 +219,7 @@ def parse_pdf_to_pages(
218219
)
219220

220221
# Capture tables
221-
for table_i, table in enumerate(t for t in page.find_tables()):
222+
for table_i, table in enumerate(find_tables(page)):
222223
pix = page.get_pixmap(clip=table.bbox, dpi=dpi)
223224
media_metadata = {
224225
"bbox": tuple(table.bbox),

0 commit comments

Comments
 (0)