Fix O(n) memory growth in PDF conversion by calling page.close() after each page (#1612)

lesyk · web-flow · commit a6c8ac46a684 · 2026-03-16T10:35:24.000-07:00
* Fix O(n) memory growth in PDF conversion by calling page.close() after each page

* Refactor PDF memory optimization tests for improved readability and consistency

* Add memory benchmarking tests for PDF conversion with page.close() fix

* Remove unnecessary blank lines in PDF memory optimization tests for cleaner code

* Bump version to 0.1.6b2 in __about__.py

* Update PDF conversion tests to include mimetype in StreamInfo
diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.6b1"
+__version__ = "0.1.6b2"
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -536,39 +536,41 @@ def convert(
 
         assert isinstance(file_stream, io.IOBase)
 
-        markdown_chunks: list[str] = []
-
         # Read file stream into BytesIO for compatibility with pdfplumber
         pdf_bytes = io.BytesIO(file_stream.read())
 
         try:
-            # Track how many pages are form-style vs plain text
-            form_pages = 0
-            plain_pages = 0
+            # Single pass: check every page for form-style content.
+            # Pages with tables/forms get rich extraction; plain-text
+            # pages use basic text extraction and their indices are
+            # recorded. page.close() is called after each page to free
+            # pdfplumber's cached objects so they do not pile up per page.
+            markdown_chunks: list[str] = []
+            form_page_count = 0
+            plain_page_indices: list[int] = []
 
             with pdfplumber.open(pdf_bytes) as pdf:
-                for page in pdf.pages:
-                    # Try form-style word position extraction
+                for page_idx, page in enumerate(pdf.pages):
                     page_content = _extract_form_content_from_words(page)
 
-                    # If extraction returns None, this page is not form-style
-                    if page_content is None:
-                        plain_pages += 1
-                        # Extract text using pdfplumber's basic extraction for this page
+                    if page_content is not None:
+                        form_page_count += 1
+                        if page_content.strip():
+                            markdown_chunks.append(page_content)
+                    else:
+                        plain_page_indices.append(page_idx)
                         text = page.extract_text()
                         if text and text.strip():
                             markdown_chunks.append(text.strip())
-                    else:
-                        form_pages += 1
-                        if page_content.strip():
-                            markdown_chunks.append(page_content)
 
-            # If most pages are plain text, use pdfminer for better text handling
-            if plain_pages > form_pages and plain_pages > 0:
+                    page.close()  # Free cached page data immediately
+
+            # If no pages had form-style content, use pdfminer for
+            # the whole document (better text spacing for prose).
+            if form_page_count == 0:
                 pdf_bytes.seek(0)
                 markdown = pdfminer.high_level.extract_text(pdf_bytes)
             else:
-                # Build markdown from chunks
                 markdown = "\n\n".join(markdown_chunks).strip()
 
         except Exception:
diff --git a/packages/markitdown/tests/test_pdf_memory.py b/packages/markitdown/tests/test_pdf_memory.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,4 +1,4 @@` |
| `1` | `1` | `# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>` |
| `2` | `2` | `#` |
| `3` | `3` | `# SPDX-License-Identifier: MIT` |
| `4` |  | `-__version__ = "0.1.6b1"` |
|  | `4` | `+__version__ = "0.1.6b2"` |