Resolve merge conflicts and add LLM fallback for non-searchable PDFs

gjmveloso · gjmveloso · commit 6742995a097d · 2026-04-07T14:42:42.000-04:00
- Remove the unresolved Git conflict markers that were committed by mistake in
  the previous commits
- Add llm_caption import and two prompt constants (_PDF_IMAGE_LLM_PROMPT,
  _PDF_FULL_LLM_PROMPT) to avoid inline prompt strings
- Add _collect_lt_images() and _get_lt_image_data() helpers for extracting
  JPEG/JPEG2000 image data from pdfminer LTImage objects; use pdfminer's own
  LITERALS_DCT_DECODE / LITERALS_JPX_DECODE for filter comparison instead of
  fragile PSLiteral string conversion
- When no form pages are detected, discard the pdfplumber chunks and use
  pdfminer extract_text for prose quality; then, only if llm_client and
  llm_model are provided, do a second pass with extract_pages to find LTFigure
  elements containing embedded JPEG/JPEG2000 images and caption each one via
  the LLM
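- Make the whole-document LLM call a last-resort fallback: when neither text
  extraction nor image captioning yields any output (e.g. fully non-searchable
  PDFs with no captionable images), send the entire PDF to the LLM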
- Guard the _merge_partial_numbering_lines call against a None return from
  llm_caption, and return an empty string as the markdown in that case
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -6,6 +6,7 @@
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from ._llm_caption import llm_caption
 
 # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
 PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
@@ -74,6 +75,23 @@ def _merge_partial_numbering_lines(text: str) -> str:
 
 ACCEPTED_FILE_EXTENSIONS = [".pdf"]
 
+_PDF_IMAGE_LLM_PROMPT = (
+    "You are an advanced document extraction AI. Extract and reproduce all text "
+    "visible in this image exactly as it appears. Do not translate or generate new "
+    "content. Preserve the original structure including sections, titles, headers, "
+    "tables, lists, and code snippets in Markdown format. Only output valid Markdown."
+)
+
+_PDF_FULL_LLM_PROMPT = (
+    "You are an advanced document extraction AI. Your task is to analyze the provided "
+    "document, understand its content and context, and produce a perfectly structured "
+    "Markdown document from the text within it. Do not translate or generate new text. "
+    "Retain the structure of the original content, ensuring that sections, titles, "
+    "headers and important details are clearly separated. If the document contains any "
+    "tables, lists and code snippets format them correctly to preserve their original "
+    "meaning. Only a valid Markdown-formatted output is allowed."
+)
+
 
 def _to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str:
     """Convert a 2D list (rows/columns) into a nicely aligned Markdown table.
@@ -117,6 +135,51 @@ def fmt_row(row: list[str]) -> str:
     return "\n".join(md)
 
 
+def _collect_lt_images(element: Any) -> list[Any]:
+    """Collect LTImage objects from *element*, recursing through nested LTFigures."""
+    try:
+        from pdfminer.layout import LTFigure, LTImage
+    except ImportError:
+        return []  # pdfminer is not importable, so there is nothing to inspect
+
+    images = []
+    if isinstance(element, LTImage):
+        images.append(element)
+    elif isinstance(element, LTFigure):
+        for child in element:  # figures may nest, so descend into each child
+            images.extend(_collect_lt_images(child))
+    return images  # stays empty for any element that is neither type
+
+
+def _get_lt_image_data(lt_image: Any) -> tuple[bytes, str] | None:
+    """
+    Extract the raw stream bytes and a MIME type from a pdfminer LTImage.
+
+    Returns (bytes, "image/jpeg") or (bytes, "image/jp2") for the first DCT/JPX
+    filter whose raw stream data is non-empty. Returns None when no such filter
+    is found or when any exception is raised while inspecting the stream.
+
+    NOTE(review): filters are checked one at a time, so a chained stream (e.g.
+    Flate + DCT) presumably returns bytes that are not a bare JPEG -- confirm.
+    """
+    try:
+        from pdfminer.pdftypes import LITERALS_DCT_DECODE, LITERALS_JPX_DECODE
+
+        stream = lt_image.stream
+        for f, _ in stream.get_filters():  # second item of each pair is unused
+            if f in LITERALS_DCT_DECODE:  # compare with pdfminer's own literals
+                data = stream.get_rawdata()
+                if data:
+                    return data, "image/jpeg"
+            elif f in LITERALS_JPX_DECODE:
+                data = stream.get_rawdata()
+                if data:
+                    return data, "image/jp2"
+    except Exception:  # best-effort: any failure is treated as "no usable image"
+        pass
+    return None
+
+
 def _extract_form_content_from_words(page: Any) -> str | None:
     """
     Extract form-style content from a PDF page by analyzing word positions.
@@ -530,15 +593,10 @@ def convert(
                     extension=".pdf",
                     feature="pdf",
                 )
-<<<<<<< HEAD
             ) from _dependency_exc_info[1].with_traceback(
-=======
-            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
->>>>>>> 117ffa2 (Extending LLM usage for PDFs where the extracted text was empty with pdfminer)
                 _dependency_exc_info[2]
             )  # type: ignore[union-attr]
 
-<<<<<<< HEAD
         assert isinstance(file_stream, io.IOBase)
 
         # Read file stream into BytesIO for compatibility with pdfplumber
@@ -570,46 +628,75 @@ def convert(
 
                     page.close()  # Free cached page data immediately
 
-            # If no pages had form-style content, use pdfminer for
-            # the whole document (better text spacing for prose).
+            # If no pages had form-style content, discard pdfplumber results and
+            # use pdfminer for the whole document (better text spacing for prose).
             if form_page_count == 0:
+                markdown_chunks = []
                 pdf_bytes.seek(0)
-                markdown = pdfminer.high_level.extract_text(pdf_bytes)
-            else:
-                markdown = "\n\n".join(markdown_chunks).strip()
+                text = pdfminer.high_level.extract_text(pdf_bytes)
+                if text and text.strip():
+                    markdown_chunks.append(text)
+
+                # Second pass: scan for LTFigure elements containing embedded
+                # images and caption them with the LLM when available. This
+                # handles PDFs with mixed content (extractable text + images).
+                llm_client = kwargs.get("llm_client")
+                llm_model = kwargs.get("llm_model")
+                if llm_client and llm_model:
+                    from pdfminer.layout import LTFigure
+
+                    pdf_bytes.seek(0)
+                    for page_layout in pdfminer.high_level.extract_pages(pdf_bytes):
+                        for element in page_layout:
+                            # LTImage is always a child of LTFigure, never a
+                            # direct child of LTPage (PDFPageAggregator wraps
+                            # every image in begin_figure/end_figure).
+                            if isinstance(element, LTFigure):
+                                for lt_img in _collect_lt_images(element):
+                                    img_data = _get_lt_image_data(lt_img)
+                                    if img_data:
+                                        img_bytes, img_mime = img_data
+                                        ext = (
+                                            ".jpg"
+                                            if img_mime == "image/jpeg"
+                                            else ".jp2"
+                                        )
+                                        caption = llm_caption(
+                                            io.BytesIO(img_bytes),
+                                            StreamInfo(
+                                                mimetype=img_mime, extension=ext
+                                            ),
+                                            client=llm_client,
+                                            model=llm_model,
+                                            prompt=_PDF_IMAGE_LLM_PROMPT,
+                                        )
+                                        if caption:
+                                            markdown_chunks.append(caption)
+
+            markdown = "\n\n".join(markdown_chunks).strip()
 
         except Exception:
             # Fallback if pdfplumber fails
             pdf_bytes.seek(0)
             markdown = pdfminer.high_level.extract_text(pdf_bytes)
 
-        # Fallback if still empty
-        if not markdown:
-=======
-        cur_pos = file_stream.tell()
-        markdown = pdfminer.high_level.extract_text(file_stream)
-        if markdown.strip() == "":
->>>>>>> c83bacc (- Prompt improvements for non-Gemini models)
-            # Try to leverage LLM OCR capabilities when PDF is not searchable
+        # Last-resort fallback: send entire PDF to LLM when no text could be
+        # extracted at all (e.g. fully scanned PDFs with no recognized images).
+        if not markdown or not markdown.strip():
             llm_client = kwargs.get("llm_client")
             llm_model = kwargs.get("llm_model")
             if llm_client and llm_model:
-                file_stream.seek(cur_pos)
-                llm_prompt = """You are an advanced document extraction AI. Your task is to analyze the provided
-                document, understand its content and context, and produce a perfectly structured Markdown document
-                from the text within it. Do not translate neither generate new text. Retain the structure of the
-                original content, ensuring that sections, titles, headers and important details are clearly separated.
-                If the image contains any tables, lists and code snippets format them correctly to preserve their
-                original meaning. Only a valid Markdown-formatted output is allowed."""
+                pdf_bytes.seek(0)
                 markdown = llm_caption(
-                    file_stream,
+                    pdf_bytes,
                     stream_info,
                     client=llm_client,
                     model=llm_model,
-                    prompt=llm_prompt,
+                    prompt=_PDF_FULL_LLM_PROMPT,
                 )
 
         # Post-process to merge MasterFormat-style partial numbering with following text
-        markdown = _merge_partial_numbering_lines(markdown)
+        if markdown:
+            markdown = _merge_partial_numbering_lines(markdown)
 
-        return DocumentConverterResult(markdown=markdown)
+        return DocumentConverterResult(markdown=markdown or "")