Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,29 @@ Or use `-o` to specify the output file:
markitdown path-to-file.pdf -o document.md
```

For DOCX files with merged or nested tables, use HTML table output to preserve
structure that Markdown pipe tables cannot represent:

```bash
markitdown report.docx --docx-table-format html > report.md
```

You can also pipe content:

```bash
cat path-to-file.pdf | markitdown
```

### Python API

```python
from markitdown import MarkItDown

md = MarkItDown(docx_table_format="html")
result = md.convert("report.docx")
print(result.text_content)
```

### Optional Dependencies
MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:

Expand Down
11 changes: 9 additions & 2 deletions packages/markitdown/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,20 @@ pip install -e packages/markitdown[all]
markitdown path-to-file.pdf > document.md
```

For DOCX files with merged or nested tables, use HTML table output to preserve
structure that Markdown pipe tables cannot represent:

```bash
markitdown report.docx --docx-table-format html > report.md
```

### Python API

```python
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("test.xlsx")
md = MarkItDown(docx_table_format="html")
result = md.convert("test.docx")
print(result.text_content)
```

Expand Down
22 changes: 19 additions & 3 deletions packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,13 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)

parser.add_argument(
"--docx-table-format",
choices=("markdown", "html"),
default="markdown",
help="Format to use for DOCX tables. Use 'html' to preserve merged and nested table structure.",
)

parser.add_argument("filename", nargs="?")
args = parser.parse_args()

Expand Down Expand Up @@ -209,7 +216,9 @@ def main():
_exit_with_error("Filename is required when using Document Intelligence.")

markitdown = MarkItDown(
enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
enable_plugins=args.use_plugins,
docintel_endpoint=args.endpoint,
docx_table_format=args.docx_table_format,
)
elif args.use_cu:
if args.cu_endpoint is None:
Expand Down Expand Up @@ -240,9 +249,16 @@ def main():
_exit_with_error(f"Unknown file type: {name}")
cu_kwargs["cu_file_types"] = cu_types

markitdown = MarkItDown(enable_plugins=args.use_plugins, **cu_kwargs)
markitdown = MarkItDown(
enable_plugins=args.use_plugins,
docx_table_format=args.docx_table_format,
**cu_kwargs,
)
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)
markitdown = MarkItDown(
enable_plugins=args.use_plugins,
docx_table_format=args.docx_table_format,
)

if args.filename is None:
result = markitdown.convert_stream(
Expand Down
7 changes: 6 additions & 1 deletion packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,12 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(
DocxConverter(
docx_table_format=kwargs.get("docx_table_format", "markdown"),
docx_markdownify_options=kwargs.get("docx_markdownify_options"),
)
)
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
Expand Down
38 changes: 35 additions & 3 deletions packages/markitdown/src/markitdown/converters/_docx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import io
from warnings import warn

from typing import BinaryIO, Any
from typing import BinaryIO, Any, Optional

from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
Expand All @@ -27,15 +27,34 @@

ACCEPTED_FILE_EXTENSIONS = [".docx"]

DOCX_TABLE_FORMAT_MARKDOWN = "markdown"
DOCX_TABLE_FORMAT_HTML = "html"
DOCX_TABLE_FORMATS = {DOCX_TABLE_FORMAT_MARKDOWN, DOCX_TABLE_FORMAT_HTML}


def _validate_docx_table_format(docx_table_format: str) -> str:
    """Return ``docx_table_format`` unchanged if it is a supported table format.

    Args:
        docx_table_format: Candidate format name. It must be one of the
            values in ``DOCX_TABLE_FORMATS``; matching is case-sensitive.

    Returns:
        The validated format name, exactly as it was passed in.

    Raises:
        ValueError: If the value is not a string or is not a supported format.
    """
    # Check the type first: an unhashable value (e.g. a list) would otherwise
    # escape as an unrelated TypeError from the set membership test below.
    if (
        not isinstance(docx_table_format, str)
        or docx_table_format not in DOCX_TABLE_FORMATS
    ):
        raise ValueError(
            "docx_table_format must be one of: "
            + ", ".join(sorted(DOCX_TABLE_FORMATS))
            + f"; got {docx_table_format!r}"
        )
    return docx_table_format


class DocxConverter(HtmlConverter):
"""
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""

def __init__(self):
def __init__(
self,
*,
docx_table_format: str = DOCX_TABLE_FORMAT_MARKDOWN,
docx_markdownify_options: Optional[dict[str, Any]] = None,
):
super().__init__()
self._html_converter = HtmlConverter()
self._docx_table_format = _validate_docx_table_format(docx_table_format)
self._docx_markdownify_options = dict(docx_markdownify_options or {})

def accepts(
self,
Expand Down Expand Up @@ -75,9 +94,22 @@ def convert(
_dependency_exc_info[2]
)

docx_table_format = _validate_docx_table_format(
kwargs.pop("docx_table_format", self._docx_table_format)
)
docx_markdownify_options = dict(self._docx_markdownify_options)
docx_markdownify_options.update(
kwargs.pop("docx_markdownify_options", {}) or {}
)
if docx_table_format == DOCX_TABLE_FORMAT_HTML:
docx_markdownify_options["_preserve_html_tables"] = True
else:
docx_markdownify_options.pop("_preserve_html_tables", None)

style_map = kwargs.get("style_map", None)
html_converter_kwargs = {**kwargs, **docx_markdownify_options}
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
**kwargs,
**html_converter_kwargs,
)
15 changes: 15 additions & 0 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
"""

def __init__(self, **options: Any):
self._preserve_html_tables = bool(options.pop("_preserve_html_tables", False))
options["heading_style"] = options.get("heading_style", markdownify.ATX)
options["keep_data_uris"] = options.get("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary
Expand Down Expand Up @@ -122,5 +123,19 @@ def convert_input(
return "[x] " if el.has_attr("checked") else "[ ] "
return ""

def convert_table(
    self,
    el: Any,
    text: str,
    parent_tags: Any,
    **kwargs: Any,
) -> str:
    """Render a table, emitting its raw HTML when table preservation is enabled.

    When ``self._preserve_html_tables`` is true, the element's own HTML
    serialization is returned surrounded by blank lines and ``text`` is not
    used. Otherwise the base class renders the table. Extra keyword
    arguments are accepted but not forwarded.
    """
    if not self._preserve_html_tables:
        return super().convert_table(el, text, parent_tags)  # type: ignore

    raw_table_html = str(el)
    return f"\n\n{raw_table_html}\n\n"

def convert_soup(self, soup: Any) -> str:
    """Convert ``soup`` by delegating unchanged to the base class implementation."""
    converted = super().convert_soup(soup)  # type: ignore
    return converted
145 changes: 145 additions & 0 deletions packages/markitdown/tests/test_docx_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env python3 -m pytest
import locale
import subprocess
import sys
import zipfile
from pathlib import Path

import pytest

from markitdown import MarkItDown


def _write_docx_with_complex_tables(path: Path) -> None:
    """Write a small DOCX archive containing a merged-cell and a nested table.

    Three parts are written to ``path``: the content-types manifest, the
    package relationships, and ``word/document.xml``. The document body has
    two paragraphs (the second one bold), a table whose first row is a single
    cell with ``gridSpan`` 2, and a table whose only cell holds another table.
    """
    content_types_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>
"""
    package_rels_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>
"""
    document_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>Quarterly research note</w:t></w:r>
</w:p>
<w:p>
<w:r>
<w:rPr><w:b/></w:rPr>
<w:t>Audited</w:t>
</w:r>
</w:p>
<w:tbl>
<w:tr>
<w:tc>
<w:tcPr><w:gridSpan w:val="2"/></w:tcPr>
<w:p><w:r><w:t>Segment Revenue</w:t></w:r></w:p>
</w:tc>
</w:tr>
<w:tr>
<w:tc><w:p><w:r><w:t>Q1</w:t></w:r></w:p></w:tc>
<w:tc><w:p><w:r><w:t>1200</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
<w:tbl>
<w:tr>
<w:tc>
<w:p><w:r><w:t>Outer Cell</w:t></w:r></w:p>
<w:tbl>
<w:tr>
<w:tc><w:p><w:r><w:t>Nested KPI</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
</w:tc>
</w:tr>
</w:tbl>
<w:sectPr>
<w:pgSz w:w="12240" w:h="15840"/>
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440"/>
</w:sectPr>
</w:body>
</w:document>
"""
    # (archive member name, XML payload), listed in the order they are written.
    package_parts = (
        ("[Content_Types].xml", content_types_xml),
        ("_rels/.rels", package_rels_xml),
        ("word/document.xml", document_xml),
    )
    with zipfile.ZipFile(path, "w") as archive:
        for member_name, payload in package_parts:
            archive.writestr(member_name, payload)


@pytest.fixture()
def complex_tables_docx(tmp_path: Path) -> Path:
    """Write the complex-table DOCX into ``tmp_path`` and return its location."""
    target = tmp_path / "complex-tables.docx"
    _write_docx_with_complex_tables(target)
    return target


def test_docx_tables_default_to_markdown(complex_tables_docx: Path) -> None:
    """With no table option set, the cell text is present and no ``<table`` tag is emitted."""
    markdown = MarkItDown().convert(complex_tables_docx).markdown

    for cell_text in ("Segment Revenue", "Nested KPI"):
        assert cell_text in markdown
    assert "<table" not in markdown


def test_docx_html_table_format_preserves_complex_table_structure(
    complex_tables_docx: Path,
) -> None:
    """Constructing with ``docx_table_format="html"`` keeps the table tag, colspan, and cell text."""
    converter = MarkItDown(docx_table_format="html")
    markdown = converter.convert(complex_tables_docx).markdown

    for fragment in ("<table", 'colspan="2"', "Segment Revenue", "Nested KPI"):
        assert fragment in markdown


def test_docx_table_format_can_be_overridden_per_conversion(
    complex_tables_docx: Path,
) -> None:
    """A per-call ``docx_table_format="html"`` wins over a converter built with ``"markdown"``."""
    converter = MarkItDown(docx_table_format="markdown")

    markdown = converter.convert(
        complex_tables_docx, docx_table_format="html"
    ).markdown

    for fragment in ("<table", 'colspan="2"'):
        assert fragment in markdown


def test_docx_table_format_rejects_invalid_value() -> None:
    """Constructing ``MarkItDown`` with an unknown table format raises ``ValueError``."""
    bad_options = {"docx_table_format": "invalid"}
    with pytest.raises(ValueError, match="docx_table_format"):
        MarkItDown(**bad_options)


def test_docx_markdownify_options_are_forwarded(complex_tables_docx: Path) -> None:
    """With ``strong_em_symbol`` set to ``"_"``, the bold run is rendered as ``__Audited__``."""
    markdownify_options = {"strong_em_symbol": "_"}
    converter = MarkItDown(docx_markdownify_options=markdownify_options)

    markdown = converter.convert(complex_tables_docx).markdown

    assert "__Audited__" in markdown


def test_cli_docx_table_format_html(complex_tables_docx: Path) -> None:
    """Running the CLI with ``--docx-table-format html`` prints HTML tables to stdout."""
    # Launch the CLI with the interpreter that is running the tests: a bare
    # "python" may be absent from PATH or resolve to a different environment
    # that does not have markitdown installed.
    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "markitdown",
            "--docx-table-format",
            "html",
            str(complex_tables_docx),
        ],
        capture_output=True,
        text=False,
    )

    # Check the exit status before decoding stdout so that a failed run
    # reports the captured stderr rather than a decode or content error.
    assert result.returncode == 0, result.stderr.decode("utf-8", errors="replace")

    stdout = result.stdout.decode(locale.getpreferredencoding())
    assert "<table" in stdout
    assert 'colspan="2"' in stdout