microsoft · li5435945-ship-it · Jun 10, 2026
diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
 [project.optional-dependencies]
 all = [
   "python-pptx",
+  "unword",
   "mammoth~=1.11.0",
   "pandas",
   "openpyxl",
@@ -51,6 +52,7 @@ all = [
   "azure-identity",
 ]
 pptx = ["python-pptx"]
+doc = ["unword"]
 docx = ["mammoth~=1.11.0", "lxml"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -29,6 +29,7 @@
     BingSerpConverter,
     PdfConverter,
     DocxConverter,
+    DocConverter,
     XlsxConverter,
     XlsConverter,
     PptxConverter,
@@ -193,6 +194,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(YouTubeConverter())
             self.register_converter(BingSerpConverter())
             self.register_converter(DocxConverter())
+            self.register_converter(DocConverter())
             self.register_converter(XlsxConverter())
             self.register_converter(XlsConverter())
             self.register_converter(PptxConverter())

diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -11,6 +11,7 @@
 from ._bing_serp_converter import BingSerpConverter
 from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
+from ._doc_converter import DocConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter
@@ -38,6 +39,7 @@
     "BingSerpConverter",
     "PdfConverter",
     "DocxConverter",
+    "DocConverter",
     "XlsxConverter",
     "XlsConverter",
     "PptxConverter",

diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
+#
+# SPDX-License-Identifier: MIT
+
+import sys
+from typing import BinaryIO, Any
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+# unword is an optional dependency (installed by the "doc" extra) that this
+# converter needs; an import failure is saved here and reported by convert().
+_dependency_exc_info = None
+try:
+    import unword
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/msword",
+    "application/vnd.ms-word",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".doc"]
+
+
+class DocConverter(DocumentConverter):
+    """
+    Converts legacy DOC files (OLE/CFB format) to Markdown using the unword library.
+    No external dependencies such as LibreOffice or MS Word are required.
+
+    Output is the document's body text followed by any textbox contents.
+    """
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """Return True for a `.doc` extension or a legacy Word MIME type."""
+        extension = (stream_info.extension or "").lower()
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        # Compare the bare type (parameters dropped) exactly: a prefix test also
+        # claims OOXML types such as application/vnd.ms-word.document.macroenabled.12.
+        mimetype = (stream_info.mimetype or "").lower().split(";", 1)[0].strip()
+        return mimetype in ACCEPTED_MIME_TYPE_PREFIXES
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """
+        Read the whole stream, parse it with unword, and join the body text and
+        textbox contents with blank lines.
+
+        Raises MissingDependencyException when unword could not be imported.
+        """
+        # Check: the dependencies
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".doc",
+                    feature="doc",
+                )
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
+                _dependency_exc_info[2]
+            )
+
+        data = file_stream.read()
+        doc = unword.parse_doc(data)
+
+        # Body text first, then every textbox. Each piece is stripped *before* the
+        # emptiness test so a whitespace-only body cannot add leading blank lines.
+        candidates = [doc.body_text or ""]
+        candidates.extend(doc.textboxes or [])
+        stripped = (text.strip() for text in candidates)
+        markdown = "\n\n".join(text for text in stripped if text)
+
+        return DocumentConverterResult(markdown=markdown)
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
@@ -31,6 +31,20 @@ class FileTestVector(object):
             "data:image/png;base64,iVBORw0KGgoAAAANSU",
         ],
     ),
+    FileTestVector(
+        filename="test.doc",
+        mimetype="application/msword",
+        charset=None,
+        url=None,
+        must_include=[
+            "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+            "d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+            "# Abstract",
+            "# Introduction",
+            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+        ],
+        must_not_include=[],
+    ),
     FileTestVector(
         filename="test.xlsx",
         mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",

diff --git a/packages/markitdown/tests/test_files/test.doc b/packages/markitdown/tests/test_files/test.doc
diff --git a/packages/markitdown/tests/test_files/test_sample.doc b/packages/markitdown/tests/test_files/test_sample.doc