microsoft · gyx09212214-prog · Jun 12, 2026
diff --git a/README.md b/README.md
@@ -82,12 +82,29 @@ Or use `-o` to specify the output file:
 markitdown path-to-file.pdf -o document.md
 ```
 
+For DOCX files with merged or nested tables, use HTML table output to preserve
+structure that Markdown pipe tables cannot represent:
+
+```bash
+markitdown report.docx --docx-table-format html > report.md
+```
+
 You can also pipe content:
 
 ```bash
 cat path-to-file.pdf | markitdown
 ```
 
+### Python API
+
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown(docx_table_format="html")
+result = md.convert("report.docx")
+print(result.text_content)
+```
+
 ### Optional Dependencies
 MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
 

diff --git a/packages/markitdown/README.md b/packages/markitdown/README.md
@@ -32,13 +32,20 @@ pip install -e packages/markitdown[all]
 markitdown path-to-file.pdf > document.md
 ```
 
+For DOCX files with merged or nested tables, use HTML table output to preserve
+structure that Markdown pipe tables cannot represent:
+
+```bash
+markitdown report.docx --docx-table-format html > report.md
+```
+
 ### Python API
 
 ```python
 from markitdown import MarkItDown
 
-md = MarkItDown()
-result = md.convert("test.xlsx")
+md = MarkItDown(docx_table_format="html")
+result = md.convert("test.docx")
 print(result.text_content)
 ```
 

diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
@@ -138,6 +138,13 @@ def main():
         help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
     )
 
+    parser.add_argument(
+        "--docx-table-format",
+        choices=("markdown", "html"),
+        default="markdown",
+        help="Format to use for DOCX tables. Use 'html' to preserve merged and nested table structure.",
+    )
+
     parser.add_argument("filename", nargs="?")
     args = parser.parse_args()
 
@@ -209,7 +216,9 @@ def main():
             _exit_with_error("Filename is required when using Document Intelligence.")
 
         markitdown = MarkItDown(
-            enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
+            enable_plugins=args.use_plugins,
+            docintel_endpoint=args.endpoint,
+            docx_table_format=args.docx_table_format,
         )
     elif args.use_cu:
         if args.cu_endpoint is None:
@@ -240,9 +249,16 @@ def main():
                     _exit_with_error(f"Unknown file type: {name}")
             cu_kwargs["cu_file_types"] = cu_types
 
-        markitdown = MarkItDown(enable_plugins=args.use_plugins, **cu_kwargs)
+        markitdown = MarkItDown(
+            enable_plugins=args.use_plugins,
+            docx_table_format=args.docx_table_format,
+            **cu_kwargs,
+        )
     else:
-        markitdown = MarkItDown(enable_plugins=args.use_plugins)
+        markitdown = MarkItDown(
+            enable_plugins=args.use_plugins,
+            docx_table_format=args.docx_table_format,
+        )
 
     if args.filename is None:
         result = markitdown.convert_stream(

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -192,7 +192,12 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(WikipediaConverter())
             self.register_converter(YouTubeConverter())
             self.register_converter(BingSerpConverter())
-            self.register_converter(DocxConverter())
+            self.register_converter(
+                DocxConverter(
+                    docx_table_format=kwargs.get("docx_table_format", "markdown"),
+                    docx_markdownify_options=kwargs.get("docx_markdownify_options"),
+                )
+            )
             self.register_converter(XlsxConverter())
             self.register_converter(XlsConverter())
             self.register_converter(PptxConverter())

diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -2,7 +2,7 @@
 import io
 from warnings import warn
 
-from typing import BinaryIO, Any
+from typing import BinaryIO, Any, Optional
 
 from ._html_converter import HtmlConverter
 from ..converter_utils.docx.pre_process import pre_process_docx
@@ -27,15 +27,34 @@
 
 ACCEPTED_FILE_EXTENSIONS = [".docx"]
 
+DOCX_TABLE_FORMAT_MARKDOWN = "markdown"  # default: tables go through the regular Markdown table conversion
+DOCX_TABLE_FORMAT_HTML = "html"  # tables are passed through as raw HTML markup
+DOCX_TABLE_FORMATS = {DOCX_TABLE_FORMAT_MARKDOWN, DOCX_TABLE_FORMAT_HTML}  # every value accepted for docx_table_format
+
+
+def _validate_docx_table_format(docx_table_format: str) -> str:
+    if docx_table_format not in DOCX_TABLE_FORMATS:
+        raise ValueError(
+            "docx_table_format must be one of: " + ", ".join(sorted(DOCX_TABLE_FORMATS))
+        )
+    return docx_table_format
+
 
 class DocxConverter(HtmlConverter):
     """
     Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
     """
 
-    def __init__(self):
+    def __init__(
+        self,
+        *,
+        docx_table_format: str = DOCX_TABLE_FORMAT_MARKDOWN,  # instance default; convert() accepts a per-call override
+        docx_markdownify_options: Optional[dict[str, Any]] = None,  # extra options merged into the HTML converter kwargs
+    ):
         super().__init__()
         self._html_converter = HtmlConverter()
+        self._docx_table_format = _validate_docx_table_format(docx_table_format)  # raises ValueError on unsupported values
+        self._docx_markdownify_options = dict(docx_markdownify_options or {})  # shallow copy; None becomes {}
 
     def accepts(
         self,
@@ -75,9 +94,22 @@ def convert(
                 _dependency_exc_info[2]
             )
 
+        docx_table_format = _validate_docx_table_format(
+            kwargs.pop("docx_table_format", self._docx_table_format)
+        )
+        docx_markdownify_options = dict(self._docx_markdownify_options)
+        docx_markdownify_options.update(
+            kwargs.pop("docx_markdownify_options", {}) or {}
+        )
+        if docx_table_format == DOCX_TABLE_FORMAT_HTML:
+            docx_markdownify_options["_preserve_html_tables"] = True
+        else:
+            docx_markdownify_options.pop("_preserve_html_tables", None)
+
         style_map = kwargs.get("style_map", None)
+        html_converter_kwargs = {**kwargs, **docx_markdownify_options}
         pre_process_stream = pre_process_docx(file_stream)
         return self._html_converter.convert_string(
             mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
-            **kwargs,
+            **html_converter_kwargs,
         )
diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -16,6 +16,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
     """
 
     def __init__(self, **options: Any):
+        self._preserve_html_tables = bool(options.pop("_preserve_html_tables", False))
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
         options["keep_data_uris"] = options.get("keep_data_uris", False)
         # Explicitly cast options to the expected type if necessary
@@ -122,5 +123,19 @@ def convert_input(
             return "[x] " if el.has_attr("checked") else "[ ] "
         return ""
 
+    def convert_table(
+        self,
+        el: Any,
+        text: str,
+        parent_tags: Any,
+        **kwargs: Any,
+    ) -> str:
+        """Optionally preserve raw HTML tables for structures Markdown cannot represent."""
+
+        if self._preserve_html_tables:
+            return "\n\n" + str(el) + "\n\n"
+
+        return super().convert_table(el, text, parent_tags)  # type: ignore
+
     def convert_soup(self, soup: Any) -> str:
         return super().convert_soup(soup)  # type: ignore
diff --git a/packages/markitdown/tests/test_docx_tables.py b/packages/markitdown/tests/test_docx_tables.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3 -m pytest
+import locale
+import subprocess
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from markitdown import MarkItDown
+
+
+def _write_docx_with_complex_tables(path: Path) -> None:  # minimal DOCX: bold paragraph, a gridSpan=2 table, a table nested in a cell
+    document_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:r><w:t>Quarterly research note</w:t></w:r>
+    </w:p>
+    <w:p>
+      <w:r>
+        <w:rPr><w:b/></w:rPr>
+        <w:t>Audited</w:t>
+      </w:r>
+    </w:p>
+    <w:tbl>
+      <w:tr>
+        <w:tc>
+          <w:tcPr><w:gridSpan w:val="2"/></w:tcPr>
+          <w:p><w:r><w:t>Segment Revenue</w:t></w:r></w:p>
+        </w:tc>
+      </w:tr>
+      <w:tr>
+        <w:tc><w:p><w:r><w:t>Q1</w:t></w:r></w:p></w:tc>
+        <w:tc><w:p><w:r><w:t>1200</w:t></w:r></w:p></w:tc>
+      </w:tr>
+    </w:tbl>
+    <w:tbl>
+      <w:tr>
+        <w:tc>
+          <w:p><w:r><w:t>Outer Cell</w:t></w:r></w:p>
+          <w:tbl>
+            <w:tr>
+              <w:tc><w:p><w:r><w:t>Nested KPI</w:t></w:r></w:p></w:tc>
+            </w:tr>
+          </w:tbl>
+        </w:tc>
+      </w:tr>
+    </w:tbl>
+    <w:sectPr>
+      <w:pgSz w:w="12240" w:h="15840"/>
+      <w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440"/>
+    </w:sectPr>
+  </w:body>
+</w:document>
+"""
+    with zipfile.ZipFile(path, "w") as docx:  # write the three package parts into a ZIP archive at ``path``
+        docx.writestr(
+            "[Content_Types].xml",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+</Types>
+""",
+        )
+        docx.writestr(
+            "_rels/.rels",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+</Relationships>
+""",
+        )
+        docx.writestr("word/document.xml", document_xml)  # main document part, the target named in _rels/.rels
+
+
+@pytest.fixture()
+def complex_tables_docx(tmp_path: Path) -> Path:
+    docx_path = tmp_path / "complex-tables.docx"
+    _write_docx_with_complex_tables(docx_path)
+    return docx_path
+
+
+def test_docx_tables_default_to_markdown(complex_tables_docx: Path) -> None:
+    result = MarkItDown().convert(complex_tables_docx)
+
+    assert "Segment Revenue" in result.markdown
+    assert "Nested KPI" in result.markdown
+    assert "<table" not in result.markdown
+
+
+def test_docx_html_table_format_preserves_complex_table_structure(
+    complex_tables_docx: Path,
+) -> None:
+    result = MarkItDown(docx_table_format="html").convert(complex_tables_docx)
+
+    assert "<table" in result.markdown
+    assert 'colspan="2"' in result.markdown
+    assert "Segment Revenue" in result.markdown
+    assert "Nested KPI" in result.markdown
+
+
+def test_docx_table_format_can_be_overridden_per_conversion(
+    complex_tables_docx: Path,
+) -> None:
+    markitdown = MarkItDown(docx_table_format="markdown")
+
+    result = markitdown.convert(complex_tables_docx, docx_table_format="html")
+
+    assert "<table" in result.markdown
+    assert 'colspan="2"' in result.markdown
+
+
+def test_docx_table_format_rejects_invalid_value() -> None:
+    with pytest.raises(ValueError, match="docx_table_format"):  # the message must name the offending option
+        MarkItDown(docx_table_format="invalid")  # must fail at construction, not on a later convert()
+
+
+def test_docx_markdownify_options_are_forwarded(complex_tables_docx: Path) -> None:
+    result = MarkItDown(docx_markdownify_options={"strong_em_symbol": "_"}).convert(
+        complex_tables_docx
+    )
+
+    assert "__Audited__" in result.markdown
+
+
+def test_cli_docx_table_format_html(complex_tables_docx: Path) -> None:
+    result = subprocess.run(
+        [
+            "python",
+            "-m",
+            "markitdown",
+            "--docx-table-format",
+            "html",
+            str(complex_tables_docx),
+        ],
+        capture_output=True,
+        text=False,
+    )
+    stdout = result.stdout.decode(locale.getpreferredencoding())
+
+    assert result.returncode == 0, result.stderr.decode("utf-8", errors="replace")
+    assert "<table" in stdout
+    assert 'colspan="2"' in stdout