Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,8 @@ def convert(

# Local path or url
if isinstance(source, str):
if (
source.startswith("http:")
or source.startswith("https:")
or source.startswith("file:")
or source.startswith("data:")
):
scheme = urlparse(source.strip()).scheme.lower()
if scheme in ("http", "https", "file", "data"):
# Rename the url argument to mock_url
# (Deprecated -- use stream_info)
_kwargs = {k: v for k, v in kwargs.items()}
Expand Down Expand Up @@ -437,9 +433,10 @@ def convert_uri(
**kwargs: Any,
) -> DocumentConverterResult:
uri = uri.strip()
scheme = urlparse(uri).scheme.lower()

# File URIs
if uri.startswith("file:"):
if scheme == "file":
netloc, path = file_uri_to_path(uri)
if netloc and netloc != "localhost":
raise ValueError(
Expand All @@ -453,7 +450,7 @@ def convert_uri(
**kwargs,
)
# Data URIs
elif uri.startswith("data:"):
elif scheme == "data":
mimetype, attributes, data = parse_data_uri(uri)

base_guess = StreamInfo(
Expand All @@ -471,7 +468,7 @@ def convert_uri(
**kwargs,
)
# HTTP/HTTPS URIs
elif uri.startswith("http:") or uri.startswith("https:"):
elif scheme in ("http", "https"):
response = self._requests_session.get(uri, stream=True)
response.raise_for_status()
return self.convert_response(
Expand Down
2 changes: 1 addition & 1 deletion packages/markitdown/src/markitdown/_uri_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:


def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
if not uri.startswith("data:"):
if not uri.lower().startswith("data:"):
raise ValueError("Not a data URI")

header, _, data = uri.partition(",")
Expand Down
19 changes: 19 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,25 @@ def test_file_uris() -> None:
assert netloc == "localhost"
assert path == "/path/to/file.txt"

# URI schemes are case-insensitive
file_uri = "FILE:///path/to/file.txt"
netloc, path = file_uri_to_path(file_uri)
assert netloc is None
assert path == "/path/to/file.txt"


def test_convert_case_insensitive_uri_schemes(tmp_path) -> None:
markitdown = MarkItDown()

data_result = markitdown.convert("DATA:text/plain;base64,SGVsbG8sIFdvcmxkIQ==")
assert data_result.markdown == "Hello, World!"

text_file = tmp_path / "hello.txt"
text_file.write_text("Hello from file", encoding="utf-8")

file_result = markitdown.convert(text_file.as_uri().replace("file:", "FILE:", 1))
assert file_result.markdown == "Hello from file"

# Test file URI with query parameters
file_uri = "file:///path/to/file.txt?param=value"
netloc, path = file_uri_to_path(file_uri)
Expand Down