Skip to content

Commit a6c8ac4

Browse files
authored
Fix O(n) memory growth in PDF conversion by calling page.close() after each page (#1612)
* Fix O(n) memory growth in PDF conversion by calling page.close() after each page
* Refactor PDF memory optimization tests for improved readability and consistency
* Add memory benchmarking tests for PDF conversion with page.close() fix
* Remove unnecessary blank lines in PDF memory optimization tests for cleaner code
* Bump version to 0.1.6b2 in __about__.py
* Update PDF conversion tests to include mimetype in StreamInfo
1 parent c6308dc commit a6c8ac4

File tree

3 files changed

+385
-19
lines changed

3 files changed

+385
-19
lines changed
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
22
#
33
# SPDX-License-Identifier: MIT
4-
__version__ = "0.1.6b1"
4+
__version__ = "0.1.6b2"

packages/markitdown/src/markitdown/converters/_pdf_converter.py

Lines changed: 20 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -536,39 +536,41 @@ def convert(
536536

537537
assert isinstance(file_stream, io.IOBase)
538538

539-
markdown_chunks: list[str] = []
540-
541539
# Read file stream into BytesIO for compatibility with pdfplumber
542540
pdf_bytes = io.BytesIO(file_stream.read())
543541

544542
try:
545-
# Track how many pages are form-style vs plain text
546-
form_pages = 0
547-
plain_pages = 0
543+
# Single pass: check every page for form-style content.
544+
# Pages with tables/forms get rich extraction; plain-text
545+
# pages are collected separately. page.close() is called
546+
# after each page to free pdfplumber's cached objects and
547+
# keep memory usage constant regardless of page count.
548+
markdown_chunks: list[str] = []
549+
form_page_count = 0
550+
plain_page_indices: list[int] = []
548551

549552
with pdfplumber.open(pdf_bytes) as pdf:
550-
for page in pdf.pages:
551-
# Try form-style word position extraction
553+
for page_idx, page in enumerate(pdf.pages):
552554
page_content = _extract_form_content_from_words(page)
553555

554-
# If extraction returns None, this page is not form-style
555-
if page_content is None:
556-
plain_pages += 1
557-
# Extract text using pdfplumber's basic extraction for this page
556+
if page_content is not None:
557+
form_page_count += 1
558+
if page_content.strip():
559+
markdown_chunks.append(page_content)
560+
else:
561+
plain_page_indices.append(page_idx)
558562
text = page.extract_text()
559563
if text and text.strip():
560564
markdown_chunks.append(text.strip())
561-
else:
562-
form_pages += 1
563-
if page_content.strip():
564-
markdown_chunks.append(page_content)
565565

566-
# If most pages are plain text, use pdfminer for better text handling
567-
if plain_pages > form_pages and plain_pages > 0:
566+
page.close() # Free cached page data immediately
567+
568+
# If no pages had form-style content, use pdfminer for
569+
# the whole document (better text spacing for prose).
570+
if form_page_count == 0:
568571
pdf_bytes.seek(0)
569572
markdown = pdfminer.high_level.extract_text(pdf_bytes)
570573
else:
571-
# Build markdown from chunks
572574
markdown = "\n\n".join(markdown_chunks).strip()
573575

574576
except Exception:

0 commit comments

Comments
 (0)