|
6 | 6 | from .._base_converter import DocumentConverter, DocumentConverterResult |
7 | 7 | from .._stream_info import StreamInfo |
8 | 8 | from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE |
| 9 | +from ._llm_caption import llm_caption |
9 | 10 |
|
10 | 11 | # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10") |
11 | 12 | PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$") |
@@ -74,6 +75,23 @@ def _merge_partial_numbering_lines(text: str) -> str: |
74 | 75 |
|
75 | 76 | ACCEPTED_FILE_EXTENSIONS = [".pdf"] |
76 | 77 |
|
# Prompt passed to llm_caption() for each embedded image extracted from a PDF
# page: asks the model to transcribe the visible text verbatim as Markdown.
_PDF_IMAGE_LLM_PROMPT = (
    "You are an advanced document extraction AI. Extract and reproduce all text "
    "visible in this image exactly as it appears. Do not translate or generate new "
    "content. Preserve the original structure including sections, titles, headers, "
    "tables, lists, and code snippets in Markdown format. Only output valid Markdown."
)

# Prompt passed to llm_caption() by the last-resort fallback that hands the
# entire PDF to the model when no text could be extracted at all.
_PDF_FULL_LLM_PROMPT = (
    "You are an advanced document extraction AI. Your task is to analyze the provided "
    "document, understand its content and context, and produce a perfectly structured "
    "Markdown document from the text within it. Do not translate or generate new text. "
    "Retain the structure of the original content, ensuring that sections, titles, "
    "headers and important details are clearly separated. If the document contains any "
    "tables, lists and code snippets format them correctly to preserve their original "
    "meaning. Only a valid Markdown-formatted output is allowed."
)
| 94 | + |
77 | 95 |
|
78 | 96 | def _to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str: |
79 | 97 | """Convert a 2D list (rows/columns) into a nicely aligned Markdown table. |
@@ -117,6 +135,51 @@ def fmt_row(row: list[str]) -> str: |
117 | 135 | return "\n".join(md) |
118 | 136 |
|
119 | 137 |
|
| 138 | +def _collect_lt_images(element: Any) -> list[Any]: |
| 139 | + """Recursively collect LTImage objects from a pdfminer layout element.""" |
| 140 | + try: |
| 141 | + from pdfminer.layout import LTFigure, LTImage |
| 142 | + except ImportError: |
| 143 | + return [] |
| 144 | + |
| 145 | + images = [] |
| 146 | + if isinstance(element, LTImage): |
| 147 | + images.append(element) |
| 148 | + elif isinstance(element, LTFigure): |
| 149 | + for child in element: |
| 150 | + images.extend(_collect_lt_images(child)) |
| 151 | + return images |
| 152 | + |
| 153 | + |
| 154 | +def _get_lt_image_data(lt_image: Any) -> tuple[bytes, str] | None: |
| 155 | + """ |
| 156 | + Extract raw bytes and MIME type from a pdfminer LTImage. |
| 157 | +
|
| 158 | + Returns (bytes, mime_type) for JPEG and JPEG2000 images whose compressed |
| 159 | + stream bytes can be sent directly to an LLM vision API. Returns None for |
| 160 | + other formats (e.g. raw bitmap, JBIG2) that cannot be used as-is. |
| 161 | +
|
| 162 | + Uses pdfminer's own filter literals for comparison to avoid fragile |
| 163 | + string conversion of PSLiteral objects. |
| 164 | + """ |
| 165 | + try: |
| 166 | + from pdfminer.pdftypes import LITERALS_DCT_DECODE, LITERALS_JPX_DECODE |
| 167 | + |
| 168 | + stream = lt_image.stream |
| 169 | + for f, _ in stream.get_filters(): |
| 170 | + if f in LITERALS_DCT_DECODE: |
| 171 | + data = stream.get_rawdata() |
| 172 | + if data: |
| 173 | + return data, "image/jpeg" |
| 174 | + elif f in LITERALS_JPX_DECODE: |
| 175 | + data = stream.get_rawdata() |
| 176 | + if data: |
| 177 | + return data, "image/jp2" |
| 178 | + except Exception: |
| 179 | + pass |
| 180 | + return None |
| 181 | + |
| 182 | + |
120 | 183 | def _extract_form_content_from_words(page: Any) -> str | None: |
121 | 184 | """ |
122 | 185 | Extract form-style content from a PDF page by analyzing word positions. |
@@ -530,15 +593,10 @@ def convert( |
530 | 593 | extension=".pdf", |
531 | 594 | feature="pdf", |
532 | 595 | ) |
533 | | -<<<<<<< HEAD |
534 | 596 | ) from _dependency_exc_info[1].with_traceback( |
535 | | -======= |
536 | | - ) from _dependency_exc_info[1].with_traceback( # type: ignore[union-attr] |
537 | | ->>>>>>> 117ffa2 (Extending LLM usage for PDFs where the extracted text was empty with pdfminer) |
538 | 597 | _dependency_exc_info[2] |
539 | 598 | ) # type: ignore[union-attr] |
540 | 599 |
|
541 | | -<<<<<<< HEAD |
542 | 600 | assert isinstance(file_stream, io.IOBase) |
543 | 601 |
|
544 | 602 | # Read file stream into BytesIO for compatibility with pdfplumber |
@@ -570,46 +628,75 @@ def convert( |
570 | 628 |
|
571 | 629 | page.close() # Free cached page data immediately |
572 | 630 |
|
573 | | - # If no pages had form-style content, use pdfminer for |
574 | | - # the whole document (better text spacing for prose). |
| 631 | + # If no pages had form-style content, discard pdfplumber results and |
| 632 | + # use pdfminer for the whole document (better text spacing for prose). |
575 | 633 | if form_page_count == 0: |
| 634 | + markdown_chunks = [] |
576 | 635 | pdf_bytes.seek(0) |
577 | | - markdown = pdfminer.high_level.extract_text(pdf_bytes) |
578 | | - else: |
579 | | - markdown = "\n\n".join(markdown_chunks).strip() |
| 636 | + text = pdfminer.high_level.extract_text(pdf_bytes) |
| 637 | + if text and text.strip(): |
| 638 | + markdown_chunks.append(text) |
| 639 | + |
| 640 | + # Second pass: scan for LTFigure elements containing embedded |
| 641 | + # images and caption them with the LLM when available. This |
| 642 | + # handles PDFs with mixed content (extractable text + images). |
| 643 | + llm_client = kwargs.get("llm_client") |
| 644 | + llm_model = kwargs.get("llm_model") |
| 645 | + if llm_client and llm_model: |
| 646 | + from pdfminer.layout import LTFigure |
| 647 | + |
| 648 | + pdf_bytes.seek(0) |
| 649 | + for page_layout in pdfminer.high_level.extract_pages(pdf_bytes): |
| 650 | + for element in page_layout: |
| 651 | + # LTImage is always a child of LTFigure, never a |
| 652 | + # direct child of LTPage (PDFPageAggregator wraps |
| 653 | + # every image in begin_figure/end_figure). |
| 654 | + if isinstance(element, LTFigure): |
| 655 | + for lt_img in _collect_lt_images(element): |
| 656 | + img_data = _get_lt_image_data(lt_img) |
| 657 | + if img_data: |
| 658 | + img_bytes, img_mime = img_data |
| 659 | + ext = ( |
| 660 | + ".jpg" |
| 661 | + if img_mime == "image/jpeg" |
| 662 | + else ".jp2" |
| 663 | + ) |
| 664 | + caption = llm_caption( |
| 665 | + io.BytesIO(img_bytes), |
| 666 | + StreamInfo( |
| 667 | + mimetype=img_mime, extension=ext |
| 668 | + ), |
| 669 | + client=llm_client, |
| 670 | + model=llm_model, |
| 671 | + prompt=_PDF_IMAGE_LLM_PROMPT, |
| 672 | + ) |
| 673 | + if caption: |
| 674 | + markdown_chunks.append(caption) |
| 675 | + |
| 676 | + markdown = "\n\n".join(markdown_chunks).strip() |
580 | 677 |
|
581 | 678 | except Exception: |
582 | 679 | # Fallback if pdfplumber fails |
583 | 680 | pdf_bytes.seek(0) |
584 | 681 | markdown = pdfminer.high_level.extract_text(pdf_bytes) |
585 | 682 |
|
586 | | - # Fallback if still empty |
587 | | - if not markdown: |
588 | | -======= |
589 | | - cur_pos = file_stream.tell() |
590 | | - markdown = pdfminer.high_level.extract_text(file_stream) |
591 | | - if markdown.strip() == "": |
592 | | ->>>>>>> c83bacc (- Prompt improvements for non-Gemini models) |
593 | | - # Try to leverage LLM OCR capabilities when PDF is not searchable |
| 683 | + # Last-resort fallback: send entire PDF to LLM when no text could be |
| 684 | + # extracted at all (e.g. fully scanned PDFs with no recognized images). |
| 685 | + if not markdown or not markdown.strip(): |
594 | 686 | llm_client = kwargs.get("llm_client") |
595 | 687 | llm_model = kwargs.get("llm_model") |
596 | 688 | if llm_client and llm_model: |
597 | | - file_stream.seek(cur_pos) |
598 | | - llm_prompt = """You are an advanced document extraction AI. Your task is to analyze the provided |
599 | | - document, understand its content and context, and produce a perfectly structured Markdown document |
600 | | - from the text within it. Do not translate neither generate new text. Retain the structure of the |
601 | | - original content, ensuring that sections, titles, headers and important details are clearly separated. |
602 | | - If the image contains any tables, lists and code snippets format them correctly to preserve their |
603 | | - original meaning. Only a valid Markdown-formatted output is allowed.""" |
| 689 | + pdf_bytes.seek(0) |
604 | 690 | markdown = llm_caption( |
605 | | - file_stream, |
| 691 | + pdf_bytes, |
606 | 692 | stream_info, |
607 | 693 | client=llm_client, |
608 | 694 | model=llm_model, |
609 | | - prompt=llm_prompt, |
| 695 | + prompt=_PDF_FULL_LLM_PROMPT, |
610 | 696 | ) |
611 | 697 |
|
612 | 698 | # Post-process to merge MasterFormat-style partial numbering with following text |
613 | | - markdown = _merge_partial_numbering_lines(markdown) |
| 699 | + if markdown: |
| 700 | + markdown = _merge_partial_numbering_lines(markdown) |
614 | 701 |
|
615 | | - return DocumentConverterResult(markdown=markdown) |
| 702 | + return DocumentConverterResult(markdown=markdown or "") |
0 commit comments