Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
[project.optional-dependencies]
all = [
"python-pptx",
"unword",
"mammoth~=1.11.0",
"pandas",
"openpyxl",
Expand All @@ -51,6 +52,7 @@ all = [
"azure-identity",
]
pptx = ["python-pptx"]
doc = ["unword"]
docx = ["mammoth~=1.11.0", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
BingSerpConverter,
PdfConverter,
DocxConverter,
DocConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
Expand Down Expand Up @@ -193,6 +194,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(DocConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._doc_converter import DocConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
Expand Down Expand Up @@ -38,6 +39,7 @@
"BingSerpConverter",
"PdfConverter",
"DocxConverter",
"DocConverter",
"XlsxConverter",
"XlsConverter",
"PptxConverter",
Expand Down
89 changes: 89 additions & 0 deletions packages/markitdown/src/markitdown/converters/_doc_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT

import sys
from typing import BinaryIO, Any

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# `unword` is an optional extra. Attempt the import eagerly, but defer any
# failure report: record the exception info here and leave the name unset-safe.
try:
    import unword
except ImportError:
    # Keep the exception and its traceback for later reporting
    _dependency_exc_info = sys.exc_info()
else:
    _dependency_exc_info = None

# MIME type prefixes associated with this converter.
# NOTE(review): a prefix match on "application/vnd.ms-word" also covers the
# macro-enabled OOXML types (e.g. application/vnd.ms-word.document.macroEnabled.12)
# -- confirm that is intended.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/msword",
    "application/vnd.ms-word",
]

# File extensions associated with this converter.
ACCEPTED_FILE_EXTENSIONS = [".doc"]


class DocConverter(DocumentConverter):
    """
    Converts legacy DOC files (OLE/CFB format) to Markdown using the unword library.
    No external dependencies such as LibreOffice or MS Word are required.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Decide whether this converter should handle the stream.

        Returns True when the (lower-cased) extension is listed in
        ACCEPTED_FILE_EXTENSIONS, or when the (lower-cased) MIME type starts
        with one of ACCEPTED_MIME_TYPE_PREFIXES. The stream itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple, so all prefixes are tested in one call.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Parse the stream with unword and return its text as Markdown.

        The document's `body_text` comes first, followed by each entry of
        `textboxes`; every piece is stripped, empty pieces are dropped, and the
        remainder is joined with blank lines.

        Raises:
            MissingDependencyException: if `unword` could not be imported; the
                original ImportError is chained as the cause.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".doc",
                    feature="doc",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        doc = unword.parse_doc(file_stream.read())

        # Combine body text and textbox content.
        # Strip *before* testing for emptiness so that a whitespace-only body
        # is skipped just like a whitespace-only textbox; otherwise an empty
        # first part would leave a stray leading "\n\n" in the output.
        # assumes doc.textboxes is an iterable of strings (or empty/None) -- TODO confirm
        markdown_parts = []
        for raw_text in (doc.body_text, *(doc.textboxes or ())):
            text = raw_text.strip() if raw_text else ""
            if text:
                markdown_parts.append(text)

        markdown = "\n\n".join(markdown_parts)

        return DocumentConverterResult(markdown=markdown)
14 changes: 14 additions & 0 deletions packages/markitdown/tests/_test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ class FileTestVector(object):
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
),
FileTestVector(
filename="test.doc",
mimetype="application/msword",
charset=None,
url=None,
must_include=[
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
],
must_not_include=[],
),
FileTestVector(
filename="test.xlsx",
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
Expand Down
Binary file added packages/markitdown/tests/test_files/test.doc
Binary file not shown.
Binary file not shown.