Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,14 @@ all = [
"azure-ai-documentintelligence",
"azure-ai-contentunderstanding>=1.2.0b1",
"azure-identity",
"striprtf",
]
pptx = ["python-pptx"]
docx = ["mammoth~=1.11.0", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
rtf = ["striprtf"]
outlook = ["olefile"]
audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"]
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
DocumentIntelligenceConverter,
ContentUnderstandingConverter,
CsvConverter,
RtfConverter,
)

from ._base_converter import DocumentConverter, DocumentConverterResult
Expand Down Expand Up @@ -203,6 +204,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())
self.register_converter(RtfConverter())

# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
)
from ._epub_converter import EpubConverter
from ._csv_converter import CsvConverter
from ._rtf_converter import RtfConverter

__all__ = [
"PlainTextConverter",
Expand All @@ -51,4 +52,5 @@
"ContentUnderstandingFileType",
"EpubConverter",
"CsvConverter",
"RtfConverter",
]
86 changes: 86 additions & 0 deletions packages/markitdown/src/markitdown/converters/_rtf_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import sys

from typing import BinaryIO, Any

from charset_normalizer import from_bytes

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# striprtf is an optional dependency of the package but is required by this
# converter. Attempt the import at module load and remember any failure, so
# the error can be reported when a conversion is actually attempted.
try:
    from striprtf.striprtf import rtf_to_text
except ImportError:
    # Keep the exception type, value and traceback for later re-raising
    _dependency_exc_info = sys.exc_info()
else:
    # Import succeeded: nothing to report
    _dependency_exc_info = None


# File extensions (lower-case, including the leading dot) handled by the
# RTF converter.
ACCEPTED_FILE_EXTENSIONS = [".rtf"]

# MIME type prefixes handled by the RTF converter. They are matched with
# str.startswith, so parameters following the type (e.g. "; charset=...")
# do not prevent a match.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/rtf",
    "application/x-rtf",
    "text/rtf",
    "text/richtext",
]


class RtfConverter(DocumentConverter):
    """
    Converts RTF (Rich Text Format) files to Markdown. RTF formatting control
    words are stripped and the underlying text content is preserved.

    Control-word stripping is delegated to the optional ``striprtf`` package.
    The result is the plain text it returns; no Markdown markup is
    reconstructed from the RTF formatting.
    """

    def __init__(self):
        super().__init__()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Decide whether this converter should handle the stream.

        Returns True when the extension in ``stream_info`` is one of
        ACCEPTED_FILE_EXTENSIONS, or when its MIME type starts with one of
        ACCEPTED_MIME_TYPE_PREFIXES. Both comparisons are case-insensitive.
        The contents of ``file_stream`` are not inspected.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple of candidates, so one call covers
        # every accepted prefix.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert an RTF stream to plain text wrapped in a conversion result.

        The stream is read in full and decoded with ``stream_info.charset``
        when one is given (strictly, so a wrong charset raises from
        ``bytes.decode``). Otherwise the encoding is detected with
        charset_normalizer.

        Returns:
            A DocumentConverterResult whose markdown is the stripped text
            with leading and trailing whitespace removed.

        Raises:
            MissingDependencyException: if ``striprtf`` could not be imported.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".rtf",
                    feature="rtf",
                )
            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # RTF is an ASCII-based format, but may declare a code page for any
        # non-ASCII bytes. Decode defensively so we always hand a str to
        # striprtf, which performs the actual control-word stripping.
        raw_bytes = file_stream.read()
        if stream_info.charset:
            rtf_content = raw_bytes.decode(stream_info.charset)
        else:
            best_match = from_bytes(raw_bytes).best()
            if best_match is not None:
                rtf_content = str(best_match)
            else:
                # Detection can fail, in which case best() returns None and
                # str(None) would turn the whole document into the literal
                # text "None". Fall back to a lossy decode instead.
                # NOTE(review): Windows-1252 is chosen as a common RTF ANSI
                # code page -- confirm this is the preferred fallback.
                rtf_content = raw_bytes.decode("cp1252", errors="replace")

        text = rtf_to_text(rtf_content)

        return DocumentConverterResult(markdown=text.strip())
8 changes: 8 additions & 0 deletions packages/markitdown/tests/test_files/test.rtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{\rtf1\ansi\ansicpg1252\deff0{\fonttbl{\f0 Helvetica;}}
{\b\fs36 RTF Test Document 8f14e45f}\par
\par
This is a plain paragraph c4ca4238 with some {\b bold a87ff679} text.\par
\par
{\i Italic line e4da3b7f} appears here.\par
A second paragraph 1679091c with more content.\par
}
41 changes: 41 additions & 0 deletions packages/markitdown/tests/test_rtf_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python3 -m pytest
import os

from markitdown import MarkItDown, StreamInfo

# Text fragments that must appear in the converted output. They are the
# visible text of test_files/test.rtf with the RTF markup removed.
RTF_TEST_STRINGS = [
    "RTF Test Document 8f14e45f",
    "This is a plain paragraph c4ca4238 with some bold a87ff679 text.",
    "Italic line e4da3b7f appears here.",
    "A second paragraph 1679091c with more content.",
]

# Directory holding the fixture documents, located next to this test module.
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")


def test_rtf_converter_local() -> None:
    """Convert the RTF fixture by path; text must survive, markup must not."""
    converter = MarkItDown()
    fixture_path = os.path.join(TEST_FILES_DIR, "test.rtf")
    markdown = converter.convert(fixture_path).markdown

    # Every expected fragment has to be present in the output.
    missing = [expected for expected in RTF_TEST_STRINGS if expected not in markdown]
    assert not missing

    # Control words must not leak into the output.
    for control_word in ("\\rtf1", "\\par"):
        assert control_word not in markdown


def test_rtf_converter_stream() -> None:
    """Convert the RTF fixture from an open binary stream plus a StreamInfo hint."""
    converter = MarkItDown()
    fixture_path = os.path.join(TEST_FILES_DIR, "test.rtf")
    with open(fixture_path, "rb") as stream:
        # The stream carries no file name, so the format is supplied explicitly.
        hint = StreamInfo(extension=".rtf", mimetype="application/rtf")
        result = converter.convert(stream, stream_info=hint)

    markdown = result.markdown
    for expected in RTF_TEST_STRINGS:
        assert expected in markdown


# Allow running this file directly as a script, without pytest.
if __name__ == "__main__":
    for _run_test in (test_rtf_converter_local, test_rtf_converter_stream):
        _run_test()
    print("All RTF converter tests passed.")