From ec4607292b9254b52fd60d35331831b05be3d554 Mon Sep 17 00:00:00 2001
From: Lucas Ma <7184042+pony-maggie@users.noreply.github.com>
Date: Mon, 15 Jun 2026 08:27:01 +0800
Subject: [PATCH] fix: handle URI schemes case-insensitively

Compare the URI scheme case-insensitively in convert(), convert_uri()
and parse_data_uri(), so that inputs such as "FILE:///path" and
"DATA:text/plain;base64,..." are routed the same way as their
lowercase forms.

convert() and convert_uri() now derive the scheme with urlparse() and
compare it in lowercase. parse_data_uri() lowercases only the first
five characters of the URI, so the payload is not copied just to test
the "data:" prefix.
---
Review notes (text in this section is not part of the commit message):

* Line breaks and leading whitespace in this patch were reconstructed
  from Python structure. If a hunk fails to apply because of context
  whitespace, retry with --ignore-whitespace.
* The test hunk appears to insert
  test_convert_case_insensitive_uri_schemes() inside test_file_uris():
  the trailing context ("Test file URI with query parameters") ends up
  after the new function. Presumably those checks still run, but under
  the new test's name. Consider moving the new function below the end of
  test_file_uris() in a follow-up.
* urlparse() can raise ValueError for strings with a malformed netloc
  (for example unmatched square brackets after "//"). The previous
  startswith() checks never raised. Confirm this is acceptable for the
  "local path or url" input of convert().
* The blob hashes on the "index" lines are the ones from the original
  submission.

 .../markitdown/src/markitdown/_markitdown.py  | 15 ++++++---------
 .../markitdown/src/markitdown/_uri_utils.py   |  2 +-
 packages/markitdown/tests/test_module_misc.py | 19 +++++++++++++++++++
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f6aa4df0e..ab9528534 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -288,12 +288,8 @@ def convert(
 
         # Local path or url
         if isinstance(source, str):
-            if (
-                source.startswith("http:")
-                or source.startswith("https:")
-                or source.startswith("file:")
-                or source.startswith("data:")
-            ):
+            scheme = urlparse(source.strip()).scheme.lower()
+            if scheme in ("http", "https", "file", "data"):
                 # Rename the url argument to mock_url
                 # (Deprecated -- use stream_info)
                 _kwargs = {k: v for k, v in kwargs.items()}
@@ -437,9 +433,10 @@ def convert_uri(
         **kwargs: Any,
     ) -> DocumentConverterResult:
         uri = uri.strip()
+        scheme = urlparse(uri).scheme.lower()
 
         # File URIs
-        if uri.startswith("file:"):
+        if scheme == "file":
             netloc, path = file_uri_to_path(uri)
             if netloc and netloc != "localhost":
                 raise ValueError(
@@ -453,7 +450,7 @@ def convert_uri(
                 **kwargs,
             )
         # Data URIs
-        elif uri.startswith("data:"):
+        elif scheme == "data":
             mimetype, attributes, data = parse_data_uri(uri)
 
             base_guess = StreamInfo(
@@ -471,7 +468,7 @@ def convert_uri(
                 **kwargs,
             )
         # HTTP/HTTPS URIs
-        elif uri.startswith("http:") or uri.startswith("https:"):
+        elif scheme in ("http", "https"):
             response = self._requests_session.get(uri, stream=True)
             response.raise_for_status()
             return self.convert_response(
diff --git a/packages/markitdown/src/markitdown/_uri_utils.py b/packages/markitdown/src/markitdown/_uri_utils.py
index 603da63e9..861a83116 100644
--- a/packages/markitdown/src/markitdown/_uri_utils.py
+++ b/packages/markitdown/src/markitdown/_uri_utils.py
@@ -17,7 +17,7 @@ def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
 
 
 def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
-    if not uri.startswith("data:"):
+    if uri[:5].lower() != "data:":
         raise ValueError("Not a data URI")
 
     header, _, data = uri.partition(",")
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..aba9e81e8 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -239,6 +239,25 @@ def test_file_uris() -> None:
     assert netloc == "localhost"
     assert path == "/path/to/file.txt"
 
+    # URI schemes are case-insensitive
+    file_uri = "FILE:///path/to/file.txt"
+    netloc, path = file_uri_to_path(file_uri)
+    assert netloc is None
+    assert path == "/path/to/file.txt"
+
+
+def test_convert_case_insensitive_uri_schemes(tmp_path) -> None:
+    markitdown = MarkItDown()
+
+    data_result = markitdown.convert("DATA:text/plain;base64,SGVsbG8sIFdvcmxkIQ==")
+    assert data_result.markdown == "Hello, World!"
+
+    text_file = tmp_path / "hello.txt"
+    text_file.write_text("Hello from file", encoding="utf-8")
+
+    file_result = markitdown.convert(text_file.as_uri().replace("file:", "FILE:", 1))
+    assert file_result.markdown == "Hello from file"
+
     # Test file URI with query parameters
     file_uri = "file:///path/to/file.txt?param=value"
     netloc, path = file_uri_to_path(file_uri)