Skip to content

Commit 8ef7dd6

Browse files
authored
♻️ harmonize getting page count from a local input source (#364)
1 parent 0ec0344 commit 8ef7dd6

File tree

68 files changed

+389
-317
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

68 files changed

+389
-317
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ repos:
2727
- id: gitleaks
2828

2929
- repo: https://github.com/PyCQA/pylint
30-
rev: v3.3.1
30+
rev: v3.3.9
3131
hooks:
3232
- id: pylint
3333
name: pylint

examples/auto_invoice_splitter_extraction_example.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
def parse_invoice(file_path):
1212
input_source = PathInput(file_path)
1313

14-
if input_source.is_pdf() and input_source.count_doc_pages() > 1:
14+
if input_source.is_pdf() and input_source.page_count > 1:
1515
parse_multi_page(input_source)
1616
else:
1717
parse_single_page(input_source)

mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ def extract_receipts(
2424
raise MindeeError(
2525
"No possible receipts candidates found for MultiReceipts extraction."
2626
)
27-
for page_id in range(input_source.count_doc_pages()):
27+
for page_id in range(input_source.page_count):
2828
receipt_positions = [
2929
receipt.bounding_box
3030
for receipt in inference.pages[page_id].prediction.receipts

mindee/input/sources/local_input_source.py

Lines changed: 22 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ class LocalInputSource:
3636
file_mimetype: str
3737
input_type: InputType
3838
filepath: Optional[str]
39+
_page_count: Optional[int] = None
3940

4041
def __init__(self, input_type: InputType):
4142
self.input_type = input_type
@@ -100,17 +101,25 @@ def is_pdf(self) -> bool:
100101
""":return: True if the file is a PDF."""
101102
return self.file_mimetype == "application/pdf"
102103

103-
def count_doc_pages(self) -> int:
104+
@property
105+
def page_count(self) -> int:
104106
"""
105-
Count the pages in the PDF.
107+
Count the pages in the document.
106108
107-
:return: the number of pages.
109+
:return: The number of pages.
108110
"""
109-
if self.is_pdf():
110-
self.file_object.seek(0)
111-
pdf = pdfium.PdfDocument(self.file_object)
112-
return len(pdf)
113-
return 1
111+
if self._page_count is None:
112+
if self.is_pdf():
113+
self.file_object.seek(0)
114+
pdf = pdfium.PdfDocument(self.file_object)
115+
self._page_count = len(pdf)
116+
else:
117+
self._page_count = 1
118+
return self._page_count
119+
120+
def count_doc_pages(self) -> int:
121+
"""Deprecated. Use ``page_count`` instead."""
122+
return self.page_count
114123

115124
def apply_page_options(self, page_options: PageOptions) -> None:
116125
"""Apply cut and merge options on multipage documents."""
@@ -131,10 +140,10 @@ def process_pdf(
131140
"""Run any required processing on a PDF file."""
132141
if self.is_pdf_empty():
133142
raise MindeeSourceError(f"PDF pages are empty in: {self.filename}")
134-
pages_count = self.count_doc_pages()
135-
if on_min_pages > pages_count:
143+
page_count = self.page_count
144+
if on_min_pages > page_count:
136145
return
137-
all_pages = list(range(pages_count))
146+
all_pages = list(range(page_count))
138147
if behavior == KEEP_ONLY:
139148
pages_to_keep = set()
140149
for page_id in page_indexes:
@@ -161,7 +170,7 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
161170
"""
162171
Create a new PDF from pages and set it to ``file_object``.
163172
164-
:param page_numbers: List of pages number to use for merging in the original PDF.
173+
:param page_numbers: List of page numbers to use for merging in the original PDF.
165174
:return: None
166175
"""
167176
self.file_object.seek(0)
@@ -172,6 +181,7 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
172181
bytes_io = io.BytesIO()
173182
new_pdf.save(bytes_io)
174183
self.file_object = bytes_io
184+
self._page_count = len(new_pdf)
175185

176186
def is_pdf_empty(self) -> bool:
177187
"""

pyproject.toml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -44,15 +44,15 @@ Changelog = "https://github.com/mindee/mindee-api-python/blob/main/CHANGELOG.md"
4444

4545
[project.optional-dependencies]
4646
lint = [
47-
"pylint==3.3.1",
48-
"pre-commit~=3.2.2",
49-
"types-pytz>=2023.3",
47+
"pylint==3.3.9",
48+
"pre-commit~=3.6.0",
49+
"types-pytz>=2024.2",
5050
"types-requests>=2.31",
5151
]
5252
test = [
5353
"toml~=0.10.2",
5454
"pytest~=7.4",
55-
"pytest-cov~=4.1",
55+
"pytest-cov~=5.0",
5656
]
5757
docs = [
5858
"sphinx~=5.3",

tests/extraction/test_image_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
from mindee.extraction.common.image_extractor import extract_multiple_images_from_source
77
from mindee.input.sources.path_input import PathInput
88
from mindee.product.barcode_reader.barcode_reader_v1 import BarcodeReaderV1
9-
from tests.test_inputs import PRODUCT_DATA_DIR
9+
from tests.utils import PRODUCT_DATA_DIR
1010

1111

1212
@pytest.fixture

tests/extraction/test_invoice_splitter_auto_extraction.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,7 @@
99
from mindee.product.invoice.invoice_v4 import InvoiceV4
1010
from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
1111
from tests.product import get_id, get_version
12-
from tests.test_inputs import PRODUCT_DATA_DIR
13-
from tests.utils import levenshtein_ratio
12+
from tests.utils import PRODUCT_DATA_DIR, levenshtein_ratio
1413

1514

1615
@pytest.fixture

tests/extraction/test_multi_receipts_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
from mindee.product.multi_receipts_detector.multi_receipts_detector_v1 import (
1111
MultiReceiptsDetectorV1,
1212
)
13-
from tests.test_inputs import PRODUCT_DATA_DIR
13+
from tests.utils import PRODUCT_DATA_DIR
1414

1515

1616
@pytest.fixture

tests/extraction/test_pdf_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from mindee.product.invoice_splitter.invoice_splitter_v1_document import (
99
InvoiceSplitterV1Document,
1010
)
11-
from tests.test_inputs import PRODUCT_DATA_DIR
11+
from tests.utils import PRODUCT_DATA_DIR
1212

1313

1414
@pytest.fixture

tests/extras/test_extras_integration.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
from mindee import Client
44
from mindee.product.international_id.international_id_v2 import InternationalIdV2
55
from mindee.product.invoice.invoice_v4 import InvoiceV4
6-
from tests.product import PRODUCT_DATA_DIR
6+
from tests.utils import PRODUCT_DATA_DIR
77

88

99
@pytest.fixture

0 commit comments

Comments
 (0)