diff --git a/README.md b/README.md index aa2f58bb8..e26df4844 100644 --- a/README.md +++ b/README.md @@ -82,12 +82,29 @@ Or use `-o` to specify the output file: markitdown path-to-file.pdf -o document.md ``` +For DOCX files with merged or nested tables, use HTML table output to preserve +structure that Markdown pipe tables cannot represent: + +```bash +markitdown report.docx --docx-table-format html > report.md +``` + You can also pipe content: ```bash cat path-to-file.pdf | markitdown ``` +### Python API + +```python +from markitdown import MarkItDown + +md = MarkItDown(docx_table_format="html") +result = md.convert("report.docx") +print(result.text_content) +``` + ### Optional Dependencies MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example: diff --git a/packages/markitdown/README.md b/packages/markitdown/README.md index bedcba183..b34c524f2 100644 --- a/packages/markitdown/README.md +++ b/packages/markitdown/README.md @@ -32,13 +32,20 @@ pip install -e packages/markitdown[all] markitdown path-to-file.pdf > document.md ``` +For DOCX files with merged or nested tables, use HTML table output to preserve +structure that Markdown pipe tables cannot represent: + +```bash +markitdown report.docx --docx-table-format html > report.md +``` + ### Python API ```python from markitdown import MarkItDown -md = MarkItDown() -result = md.convert("test.xlsx") +md = MarkItDown(docx_table_format="html") +result = md.convert("test.docx") print(result.text_content) ``` diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index ccb44b64b..f1e1b6613 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -138,6 +138,13 @@ def main(): help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", ) + parser.add_argument( + "--docx-table-format", + choices=("markdown", "html"), + default="markdown", + help="Format to use for DOCX tables. Use 'html' to preserve merged and nested table structure.", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -209,7 +216,9 @@ def main(): _exit_with_error("Filename is required when using Document Intelligence.") markitdown = MarkItDown( - enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint + enable_plugins=args.use_plugins, + docintel_endpoint=args.endpoint, + docx_table_format=args.docx_table_format, ) elif args.use_cu: if args.cu_endpoint is None: @@ -240,9 +249,16 @@ def main(): _exit_with_error(f"Unknown file type: {name}") cu_kwargs["cu_file_types"] = cu_types - markitdown = MarkItDown(enable_plugins=args.use_plugins, **cu_kwargs) + markitdown = MarkItDown( + enable_plugins=args.use_plugins, + docx_table_format=args.docx_table_format, + **cu_kwargs, + ) else: - markitdown = MarkItDown(enable_plugins=args.use_plugins) + markitdown = MarkItDown( + enable_plugins=args.use_plugins, + docx_table_format=args.docx_table_format, + ) if args.filename is None: result = markitdown.convert_stream( diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f6aa4df0e..b1eb5ede7 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -192,7 +192,12 @@ def enable_builtins(self, **kwargs) -> None: self.register_converter(WikipediaConverter()) self.register_converter(YouTubeConverter()) self.register_converter(BingSerpConverter()) - self.register_converter(DocxConverter()) + self.register_converter( + DocxConverter( + docx_table_format=kwargs.get("docx_table_format", "markdown"), + docx_markdownify_options=kwargs.get("docx_markdownify_options"), + ) + ) self.register_converter(XlsxConverter()) self.register_converter(XlsConverter()) self.register_converter(PptxConverter()) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 3975107b1..f624786db 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -2,7 +2,7 @@ import io from warnings import warn -from typing import BinaryIO, Any +from typing import BinaryIO, Any, Optional from ._html_converter import HtmlConverter from ..converter_utils.docx.pre_process import pre_process_docx @@ -27,15 +27,34 @@ ACCEPTED_FILE_EXTENSIONS = [".docx"] +DOCX_TABLE_FORMAT_MARKDOWN = "markdown" +DOCX_TABLE_FORMAT_HTML = "html" +DOCX_TABLE_FORMATS = {DOCX_TABLE_FORMAT_MARKDOWN, DOCX_TABLE_FORMAT_HTML} + + +def _validate_docx_table_format(docx_table_format: str) -> str: + if docx_table_format not in DOCX_TABLE_FORMATS: + raise ValueError( + "docx_table_format must be one of: " + ", ".join(sorted(DOCX_TABLE_FORMATS)) + ) + return docx_table_format + class DocxConverter(HtmlConverter): """ Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ - def __init__(self): + def __init__( + self, + *, + docx_table_format: str = DOCX_TABLE_FORMAT_MARKDOWN, + docx_markdownify_options: Optional[dict[str, Any]] = None, + ): super().__init__() self._html_converter = HtmlConverter() + self._docx_table_format = _validate_docx_table_format(docx_table_format) + self._docx_markdownify_options = dict(docx_markdownify_options or {}) def accepts( self, @@ -75,9 +94,22 @@ def convert( _dependency_exc_info[2] ) + docx_table_format = _validate_docx_table_format( + kwargs.pop("docx_table_format", self._docx_table_format) + ) + docx_markdownify_options = dict(self._docx_markdownify_options) + docx_markdownify_options.update( + kwargs.pop("docx_markdownify_options", {}) or {} + ) + if docx_table_format == DOCX_TABLE_FORMAT_HTML: + docx_markdownify_options["_preserve_html_tables"] = True + else: + docx_markdownify_options.pop("_preserve_html_tables", None) + style_map = kwargs.get("style_map", None) + html_converter_kwargs = {**kwargs, **docx_markdownify_options} pre_process_stream = pre_process_docx(file_stream) return self._html_converter.convert_string( mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, - **kwargs, + **html_converter_kwargs, ) diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index 19e8a2984..26df4be24 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -16,6 +16,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): """ def __init__(self, **options: Any): + self._preserve_html_tables = bool(options.pop("_preserve_html_tables", False)) options["heading_style"] = options.get("heading_style", markdownify.ATX) options["keep_data_uris"] = options.get("keep_data_uris", False) # Explicitly cast options to the expected type if necessary @@ -122,5 +123,19 @@ def convert_input( return "[x] " if el.has_attr("checked") else "[ ] " return "" + def convert_table( + self, + el: Any, + text: str, + parent_tags: Any, + **kwargs: Any, + ) -> str: + """Optionally preserve raw HTML tables for structures Markdown cannot represent.""" + + if self._preserve_html_tables: + return "\n\n" + str(el) + "\n\n" + + return super().convert_table(el, text, parent_tags) # type: ignore + def convert_soup(self, soup: Any) -> str: return super().convert_soup(soup) # type: ignore diff --git a/packages/markitdown/tests/test_docx_tables.py b/packages/markitdown/tests/test_docx_tables.py new file mode 100644 index 000000000..7d4a2de13 --- /dev/null +++ b/packages/markitdown/tests/test_docx_tables.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 -m pytest +import locale +import subprocess +import zipfile +from pathlib import Path + +import pytest + +from markitdown import MarkItDown + + +def _write_docx_with_complex_tables(path: Path) -> None: + document_xml = """ + + + + Quarterly research note + + + + + Audited + + + + + + + Segment Revenue + + + + Q1 + 1200 + + + + + + Outer Cell + + + Nested KPI + + + + + + + + + + + +""" + with zipfile.ZipFile(path, "w") as docx: + docx.writestr( + "[Content_Types].xml", + """ + + + + + +""", + ) + docx.writestr( + "_rels/.rels", + """ + + + +""", + ) + docx.writestr("word/document.xml", document_xml) + + +@pytest.fixture() +def complex_tables_docx(tmp_path: Path) -> Path: + docx_path = tmp_path / "complex-tables.docx" + _write_docx_with_complex_tables(docx_path) + return docx_path + + +def test_docx_tables_default_to_markdown(complex_tables_docx: Path) -> None: + result = MarkItDown().convert(complex_tables_docx) + + assert "Segment Revenue" in result.markdown + assert "Nested KPI" in result.markdown + assert " None: + result = MarkItDown(docx_table_format="html").convert(complex_tables_docx) + + assert " None: + markitdown = MarkItDown(docx_table_format="markdown") + + result = markitdown.convert(complex_tables_docx, docx_table_format="html") + + assert " None: + with pytest.raises(ValueError, match="docx_table_format"): + MarkItDown(docx_table_format="invalid") + + +def test_docx_markdownify_options_are_forwarded(complex_tables_docx: Path) -> None: + result = MarkItDown(docx_markdownify_options={"strong_em_symbol": "_"}).convert( + complex_tables_docx + ) + + assert "__Audited__" in result.markdown + + +def test_cli_docx_table_format_html(complex_tables_docx: Path) -> None: + result = subprocess.run( + [ + "python", + "-m", + "markitdown", + "--docx-table-format", + "html", + str(complex_tables_docx), + ], + capture_output=True, + text=False, + ) + stdout = result.stdout.decode(locale.getpreferredencoding()) + + assert result.returncode == 0, result.stderr.decode("utf-8", errors="replace") + assert "