Skip to content

Commit 6742995

Browse files
committed
Resolve merge conflicts and add LLM fallback for non-searchable PDFs
- Resolve merge conflicts that were baked into the previous commits
- Add llm_caption import and two prompt constants (_PDF_IMAGE_LLM_PROMPT, _PDF_FULL_LLM_PROMPT) to avoid inline prompt strings
- Add _collect_lt_images() and _get_lt_image_data() helpers for extracting JPEG/JPEG2000 image data from pdfminer LTImage objects; use pdfminer's own LITERALS_DCT_DECODE / LITERALS_JPX_DECODE for filter comparison instead of fragile PSLiteral string conversion
- When no form pages are detected, use pdfminer extract_text for prose quality, then do a second pass with extract_pages to find LTFigure elements containing embedded images and caption each one via the LLM
- Add last-resort whole-document LLM fallback for fully non-searchable PDFs where no captionable images were found
- Guard _merge_partial_numbering_lines call against None return from llm_caption
1 parent a4c6046 commit 6742995

File tree

1 file changed

+116
-29
lines changed

1 file changed

+116
-29
lines changed

packages/markitdown/src/markitdown/converters/_pdf_converter.py

Lines changed: 116 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from .._base_converter import DocumentConverter, DocumentConverterResult
77
from .._stream_info import StreamInfo
88
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
9+
from ._llm_caption import llm_caption
910

1011
# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
1112
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
@@ -74,6 +75,23 @@ def _merge_partial_numbering_lines(text: str) -> str:
7475

7576
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
7677

78+
_PDF_IMAGE_LLM_PROMPT = (
79+
"You are an advanced document extraction AI. Extract and reproduce all text "
80+
"visible in this image exactly as it appears. Do not translate or generate new "
81+
"content. Preserve the original structure including sections, titles, headers, "
82+
"tables, lists, and code snippets in Markdown format. Only output valid Markdown."
83+
)
84+
85+
_PDF_FULL_LLM_PROMPT = (
86+
"You are an advanced document extraction AI. Your task is to analyze the provided "
87+
"document, understand its content and context, and produce a perfectly structured "
88+
"Markdown document from the text within it. Do not translate or generate new text. "
89+
"Retain the structure of the original content, ensuring that sections, titles, "
90+
"headers and important details are clearly separated. If the document contains any "
91+
"tables, lists and code snippets format them correctly to preserve their original "
92+
"meaning. Only a valid Markdown-formatted output is allowed."
93+
)
94+
7795

7896
def _to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str:
7997
"""Convert a 2D list (rows/columns) into a nicely aligned Markdown table.
@@ -117,6 +135,51 @@ def fmt_row(row: list[str]) -> str:
117135
return "\n".join(md)
118136

119137

138+
def _collect_lt_images(element: Any) -> list[Any]:
139+
"""Recursively collect LTImage objects from a pdfminer layout element."""
140+
try:
141+
from pdfminer.layout import LTFigure, LTImage
142+
except ImportError:
143+
return []
144+
145+
images = []
146+
if isinstance(element, LTImage):
147+
images.append(element)
148+
elif isinstance(element, LTFigure):
149+
for child in element:
150+
images.extend(_collect_lt_images(child))
151+
return images
152+
153+
154+
def _get_lt_image_data(lt_image: Any) -> tuple[bytes, str] | None:
155+
"""
156+
Extract raw bytes and MIME type from a pdfminer LTImage.
157+
158+
Returns (bytes, mime_type) for JPEG and JPEG2000 images whose compressed
159+
stream bytes can be sent directly to an LLM vision API. Returns None for
160+
other formats (e.g. raw bitmap, JBIG2) that cannot be used as-is.
161+
162+
Uses pdfminer's own filter literals for comparison to avoid fragile
163+
string conversion of PSLiteral objects.
164+
"""
165+
try:
166+
from pdfminer.pdftypes import LITERALS_DCT_DECODE, LITERALS_JPX_DECODE
167+
168+
stream = lt_image.stream
169+
for f, _ in stream.get_filters():
170+
if f in LITERALS_DCT_DECODE:
171+
data = stream.get_rawdata()
172+
if data:
173+
return data, "image/jpeg"
174+
elif f in LITERALS_JPX_DECODE:
175+
data = stream.get_rawdata()
176+
if data:
177+
return data, "image/jp2"
178+
except Exception:
179+
pass
180+
return None
181+
182+
120183
def _extract_form_content_from_words(page: Any) -> str | None:
121184
"""
122185
Extract form-style content from a PDF page by analyzing word positions.
@@ -530,15 +593,10 @@ def convert(
530593
extension=".pdf",
531594
feature="pdf",
532595
)
533-
<<<<<<< HEAD
534596
) from _dependency_exc_info[1].with_traceback(
535-
=======
536-
) from _dependency_exc_info[1].with_traceback( # type: ignore[union-attr]
537-
>>>>>>> 117ffa2 (Extending LLM usage for PDFs where the extracted text was empty with pdfminer)
538597
_dependency_exc_info[2]
539598
) # type: ignore[union-attr]
540599

541-
<<<<<<< HEAD
542600
assert isinstance(file_stream, io.IOBase)
543601

544602
# Read file stream into BytesIO for compatibility with pdfplumber
@@ -570,46 +628,75 @@ def convert(
570628

571629
page.close() # Free cached page data immediately
572630

573-
# If no pages had form-style content, use pdfminer for
574-
# the whole document (better text spacing for prose).
631+
# If no pages had form-style content, discard pdfplumber results and
632+
# use pdfminer for the whole document (better text spacing for prose).
575633
if form_page_count == 0:
634+
markdown_chunks = []
576635
pdf_bytes.seek(0)
577-
markdown = pdfminer.high_level.extract_text(pdf_bytes)
578-
else:
579-
markdown = "\n\n".join(markdown_chunks).strip()
636+
text = pdfminer.high_level.extract_text(pdf_bytes)
637+
if text and text.strip():
638+
markdown_chunks.append(text)
639+
640+
# Second pass: scan for LTFigure elements containing embedded
641+
# images and caption them with the LLM when available. This
642+
# handles PDFs with mixed content (extractable text + images).
643+
llm_client = kwargs.get("llm_client")
644+
llm_model = kwargs.get("llm_model")
645+
if llm_client and llm_model:
646+
from pdfminer.layout import LTFigure
647+
648+
pdf_bytes.seek(0)
649+
for page_layout in pdfminer.high_level.extract_pages(pdf_bytes):
650+
for element in page_layout:
651+
# LTImage is always a child of LTFigure, never a
652+
# direct child of LTPage (PDFPageAggregator wraps
653+
# every image in begin_figure/end_figure).
654+
if isinstance(element, LTFigure):
655+
for lt_img in _collect_lt_images(element):
656+
img_data = _get_lt_image_data(lt_img)
657+
if img_data:
658+
img_bytes, img_mime = img_data
659+
ext = (
660+
".jpg"
661+
if img_mime == "image/jpeg"
662+
else ".jp2"
663+
)
664+
caption = llm_caption(
665+
io.BytesIO(img_bytes),
666+
StreamInfo(
667+
mimetype=img_mime, extension=ext
668+
),
669+
client=llm_client,
670+
model=llm_model,
671+
prompt=_PDF_IMAGE_LLM_PROMPT,
672+
)
673+
if caption:
674+
markdown_chunks.append(caption)
675+
676+
markdown = "\n\n".join(markdown_chunks).strip()
580677

581678
except Exception:
582679
# Fallback if pdfplumber fails
583680
pdf_bytes.seek(0)
584681
markdown = pdfminer.high_level.extract_text(pdf_bytes)
585682

586-
# Fallback if still empty
587-
if not markdown:
588-
=======
589-
cur_pos = file_stream.tell()
590-
markdown = pdfminer.high_level.extract_text(file_stream)
591-
if markdown.strip() == "":
592-
>>>>>>> c83bacc (- Prompt improvements for non-Gemini models)
593-
# Try to leverage LLM OCR capabilities when PDF is not searchable
683+
# Last-resort fallback: send entire PDF to LLM when no text could be
684+
# extracted at all (e.g. fully scanned PDFs with no recognized images).
685+
if not markdown or not markdown.strip():
594686
llm_client = kwargs.get("llm_client")
595687
llm_model = kwargs.get("llm_model")
596688
if llm_client and llm_model:
597-
file_stream.seek(cur_pos)
598-
llm_prompt = """You are an advanced document extraction AI. Your task is to analyze the provided
599-
document, understand its content and context, and produce a perfectly structured Markdown document
600-
from the text within it. Do not translate neither generate new text. Retain the structure of the
601-
original content, ensuring that sections, titles, headers and important details are clearly separated.
602-
If the image contains any tables, lists and code snippets format them correctly to preserve their
603-
original meaning. Only a valid Markdown-formatted output is allowed."""
689+
pdf_bytes.seek(0)
604690
markdown = llm_caption(
605-
file_stream,
691+
pdf_bytes,
606692
stream_info,
607693
client=llm_client,
608694
model=llm_model,
609-
prompt=llm_prompt,
695+
prompt=_PDF_FULL_LLM_PROMPT,
610696
)
611697

612698
# Post-process to merge MasterFormat-style partial numbering with following text
613-
markdown = _merge_partial_numbering_lines(markdown)
699+
if markdown:
700+
markdown = _merge_partial_numbering_lines(markdown)
614701

615-
return DocumentConverterResult(markdown=markdown)
702+
return DocumentConverterResult(markdown=markdown or "")

0 commit comments

Comments
 (0)