diff --git a/README.md b/README.md
index aa2f58bb8..e26df4844 100644
--- a/README.md
+++ b/README.md
@@ -82,12 +82,29 @@ Or use `-o` to specify the output file:
markitdown path-to-file.pdf -o document.md
```
+For DOCX files with merged or nested tables, use HTML table output to preserve
+structure that Markdown pipe tables cannot represent:
+
+```bash
+markitdown report.docx --docx-table-format html > report.md
+```
+
You can also pipe content:
```bash
cat path-to-file.pdf | markitdown
```
+### Python API
+
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown(docx_table_format="html")
+result = md.convert("report.docx")
+print(result.text_content)
+```
+
### Optional Dependencies
MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
diff --git a/packages/markitdown/README.md b/packages/markitdown/README.md
index bedcba183..b34c524f2 100644
--- a/packages/markitdown/README.md
+++ b/packages/markitdown/README.md
@@ -32,13 +32,20 @@ pip install -e packages/markitdown[all]
markitdown path-to-file.pdf > document.md
```
+For DOCX files with merged or nested tables, use HTML table output to preserve
+structure that Markdown pipe tables cannot represent:
+
+```bash
+markitdown report.docx --docx-table-format html > report.md
+```
+
### Python API
```python
from markitdown import MarkItDown
-md = MarkItDown()
-result = md.convert("test.xlsx")
+md = MarkItDown(docx_table_format="html")
+result = md.convert("test.docx")
print(result.text_content)
```
diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index ccb44b64b..f1e1b6613 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -138,6 +138,13 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)
+ parser.add_argument(
+ "--docx-table-format",
+ choices=("markdown", "html"),
+ default="markdown",
+ help="Format to use for DOCX tables. Use 'html' to preserve merged and nested table structure.",
+ )
+
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
@@ -209,7 +216,9 @@ def main():
_exit_with_error("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(
- enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
+ enable_plugins=args.use_plugins,
+ docintel_endpoint=args.endpoint,
+ docx_table_format=args.docx_table_format,
)
elif args.use_cu:
if args.cu_endpoint is None:
@@ -240,9 +249,16 @@ def main():
_exit_with_error(f"Unknown file type: {name}")
cu_kwargs["cu_file_types"] = cu_types
- markitdown = MarkItDown(enable_plugins=args.use_plugins, **cu_kwargs)
+ markitdown = MarkItDown(
+ enable_plugins=args.use_plugins,
+ docx_table_format=args.docx_table_format,
+ **cu_kwargs,
+ )
else:
- markitdown = MarkItDown(enable_plugins=args.use_plugins)
+ markitdown = MarkItDown(
+ enable_plugins=args.use_plugins,
+ docx_table_format=args.docx_table_format,
+ )
if args.filename is None:
result = markitdown.convert_stream(
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f6aa4df0e..b1eb5ede7 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -192,7 +192,12 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
- self.register_converter(DocxConverter())
+ self.register_converter(
+ DocxConverter(
+ docx_table_format=kwargs.get("docx_table_format", "markdown"),
+ docx_markdownify_options=kwargs.get("docx_markdownify_options"),
+ )
+ )
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 3975107b1..f624786db 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -2,7 +2,7 @@
import io
from warnings import warn
-from typing import BinaryIO, Any
+from typing import BinaryIO, Any, Optional
from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
@@ -27,15 +27,34 @@
ACCEPTED_FILE_EXTENSIONS = [".docx"]
+DOCX_TABLE_FORMAT_MARKDOWN = "markdown"
+DOCX_TABLE_FORMAT_HTML = "html"
+DOCX_TABLE_FORMATS = {DOCX_TABLE_FORMAT_MARKDOWN, DOCX_TABLE_FORMAT_HTML}
+
+
+def _validate_docx_table_format(docx_table_format: str) -> str:
+    """Return ``docx_table_format`` unchanged if it is a supported table format.
+
+    Args:
+        docx_table_format: Requested output format for DOCX tables; must be a
+            member of ``DOCX_TABLE_FORMATS``.
+
+    Returns:
+        The validated value, so the call can be used inline in an assignment.
+
+    Raises:
+        ValueError: If the value is not a string or is not a supported format.
+    """
+    # Check the type first: an unhashable value (e.g. a list) would otherwise
+    # escape the set-membership test as a TypeError instead of a ValueError.
+    if (
+        not isinstance(docx_table_format, str)
+        or docx_table_format not in DOCX_TABLE_FORMATS
+    ):
+        allowed = ", ".join(sorted(DOCX_TABLE_FORMATS))
+        # Name the rejected value so a misconfiguration is easy to trace.
+        raise ValueError(
+            f"docx_table_format must be one of: {allowed}; "
+            f"got {docx_table_format!r}"
+        )
+    return docx_table_format
+
class DocxConverter(HtmlConverter):
"""
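-    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
+    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.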
"""
-    def __init__(self):
+    def __init__(
+        self,
+        *,
+        docx_table_format: str = DOCX_TABLE_FORMAT_MARKDOWN,
+        docx_markdownify_options: Optional[dict[str, Any]] = None,
+    ):
+        """
+        Create a DOCX converter with per-instance defaults.
+
+        Args:
+            docx_table_format: Table format stored on the instance. It is
+                checked by ``_validate_docx_table_format``, which raises
+                ``ValueError`` for values outside ``DOCX_TABLE_FORMATS``.
+            docx_markdownify_options: Optional mapping of options stored on
+                the instance; ``None`` is treated as an empty mapping.
+        """
         super().__init__()
         self._html_converter = HtmlConverter()
+        self._docx_table_format = _validate_docx_table_format(docx_table_format)
+        # Shallow-copy so later changes to the caller's dict do not leak into this instance.
+        self._docx_markdownify_options = dict(docx_markdownify_options or {})
def accepts(
self,
@@ -75,9 +94,22 @@ def convert(
_dependency_exc_info[2]
)
+ docx_table_format = _validate_docx_table_format(
+ kwargs.pop("docx_table_format", self._docx_table_format)
+ )
+ docx_markdownify_options = dict(self._docx_markdownify_options)
+ docx_markdownify_options.update(
+ kwargs.pop("docx_markdownify_options", {}) or {}
+ )
+ if docx_table_format == DOCX_TABLE_FORMAT_HTML:
+ docx_markdownify_options["_preserve_html_tables"] = True
+ else:
+ docx_markdownify_options.pop("_preserve_html_tables", None)
+
style_map = kwargs.get("style_map", None)
+ html_converter_kwargs = {**kwargs, **docx_markdownify_options}
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
- **kwargs,
+ **html_converter_kwargs,
)
diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 19e8a2984..26df4be24 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -16,6 +16,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
"""
def __init__(self, **options: Any):
+ self._preserve_html_tables = bool(options.pop("_preserve_html_tables", False))
options["heading_style"] = options.get("heading_style", markdownify.ATX)
options["keep_data_uris"] = options.get("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary
@@ -122,5 +123,19 @@ def convert_input(
return "[x] " if el.has_attr("checked") else "[ ] "
return ""
+    def convert_table(
+        self,
+        el: Any,
+        text: str,
+        parent_tags: Any,
+        **kwargs: Any,
+    ) -> str:
+        """
+        Render a table element, optionally keeping it as raw HTML.
+
+        When this converter was created with ``_preserve_html_tables`` enabled,
+        the element's string form (``str(el)``) is returned surrounded by blank
+        lines, for structures Markdown cannot represent. Otherwise rendering is
+        delegated to the parent class. Extra keyword arguments are accepted but
+        not forwarded.
+        """
+
+        if not self._preserve_html_tables:
+            return super().convert_table(el, text, parent_tags)  # type: ignore
+
+        raw_table = str(el)
+        return "\n\n" + raw_table + "\n\n"
+
def convert_soup(self, soup: Any) -> str:
return super().convert_soup(soup) # type: ignore
diff --git a/packages/markitdown/tests/test_docx_tables.py b/packages/markitdown/tests/test_docx_tables.py
new file mode 100644
index 000000000..7d4a2de13
--- /dev/null
+++ b/packages/markitdown/tests/test_docx_tables.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3 -m pytest
+import locale
+import subprocess
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from markitdown import MarkItDown
+
+
+def _write_docx_with_complex_tables(path: Path) -> None:
+ document_xml = """
+