Skip to content

Commit e928b43

Browse files
authored
convert_url renamed to convert_uri, and now handles data and file URIs (#1153)
1 parent 2ffe6ea commit e928b43

File tree

4 files changed

+251
-22
lines changed

4 files changed

+251
-22
lines changed

packages/markitdown/src/markitdown/_markitdown.py

Lines changed: 75 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import codecs
2121

2222
from ._stream_info import StreamInfo
23+
from ._uri_utils import parse_data_uri, file_uri_to_path
2324

2425
from .converters import (
2526
PlainTextConverter,
@@ -242,9 +243,10 @@ def convert(
242243
# Local path or url
243244
if isinstance(source, str):
244245
if (
245-
source.startswith("http://")
246-
or source.startswith("https://")
247-
or source.startswith("file://")
246+
source.startswith("http:")
247+
or source.startswith("https:")
248+
or source.startswith("file:")
249+
or source.startswith("data:")
248250
):
249251
# Rename the url argument to mock_url
250252
# (Deprecated -- use stream_info)
@@ -253,7 +255,7 @@ def convert(
253255
_kwargs["mock_url"] = _kwargs["url"]
254256
del _kwargs["url"]
255257

256-
return self.convert_url(source, stream_info=stream_info, **_kwargs)
258+
return self.convert_uri(source, stream_info=stream_info, **_kwargs)
257259
else:
258260
return self.convert_local(source, stream_info=stream_info, **kwargs)
259261
# Path object
@@ -363,23 +365,81 @@ def convert_url(
363365
url: str,
364366
*,
365367
stream_info: Optional[StreamInfo] = None,
366-
file_extension: Optional[str] = None, # Deprecated -- use stream_info
367-
mock_url: Optional[
368-
str
369-
] = None, # Mock the request as if it came from a different URL
368+
file_extension: Optional[str] = None,
369+
mock_url: Optional[str] = None,
370370
**kwargs: Any,
371-
) -> DocumentConverterResult: # TODO: fix kwargs type
372-
# Send a HTTP request to the URL
373-
response = self._requests_session.get(url, stream=True)
374-
response.raise_for_status()
375-
return self.convert_response(
376-
response,
371+
) -> DocumentConverterResult:
372+
"""Alias for convert_uri()"""
373+
# convert_url will likely be deprecated in the future in favor of convert_uri
374+
return self.convert_uri(
375+
url,
377376
stream_info=stream_info,
378377
file_extension=file_extension,
379-
url=mock_url,
378+
mock_url=mock_url,
380379
**kwargs,
381380
)
382381

382+
def convert_uri(
383+
self,
384+
uri: str,
385+
*,
386+
stream_info: Optional[StreamInfo] = None,
387+
file_extension: Optional[str] = None, # Deprecated -- use stream_info
388+
mock_url: Optional[
389+
str
390+
] = None, # Mock the request as if it came from a different URL
391+
**kwargs: Any,
392+
) -> DocumentConverterResult:
393+
uri = uri.strip()
394+
395+
# File URIs
396+
if uri.startswith("file:"):
397+
netloc, path = file_uri_to_path(uri)
398+
if netloc and netloc != "localhost":
399+
raise ValueError(
400+
f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
401+
)
402+
return self.convert_local(
403+
path,
404+
stream_info=stream_info,
405+
file_extension=file_extension,
406+
url=mock_url,
407+
**kwargs,
408+
)
409+
# Data URIs
410+
elif uri.startswith("data:"):
411+
mimetype, attributes, data = parse_data_uri(uri)
412+
413+
base_guess = StreamInfo(
414+
mimetype=mimetype,
415+
charset=attributes.get("charset"),
416+
)
417+
if stream_info is not None:
418+
base_guess = base_guess.copy_and_update(stream_info)
419+
420+
return self.convert_stream(
421+
io.BytesIO(data),
422+
stream_info=base_guess,
423+
file_extension=file_extension,
424+
url=mock_url,
425+
**kwargs,
426+
)
427+
# HTTP/HTTPS URIs
428+
elif uri.startswith("http:") or uri.startswith("https:"):
429+
response = self._requests_session.get(uri, stream=True)
430+
response.raise_for_status()
431+
return self.convert_response(
432+
response,
433+
stream_info=stream_info,
434+
file_extension=file_extension,
435+
url=mock_url,
436+
**kwargs,
437+
)
438+
else:
439+
raise ValueError(
440+
f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
441+
)
442+
383443
def convert_response(
384444
self,
385445
response: requests.Response,
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import base64
2+
import os
3+
from typing import Tuple, Dict
4+
from urllib.request import url2pathname
5+
from urllib.parse import urlparse, unquote_to_bytes
6+
7+
8+
def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
9+
"""Convert a file URI to a local file path"""
10+
parsed = urlparse(file_uri)
11+
if parsed.scheme != "file":
12+
raise ValueError(f"Not a file URL: {file_uri}")
13+
14+
netloc = parsed.netloc if parsed.netloc else None
15+
path = os.path.abspath(url2pathname(parsed.path))
16+
return netloc, path
17+
18+
19+
def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
20+
if not uri.startswith("data:"):
21+
raise ValueError("Not a data URI")
22+
23+
header, _, data = uri.partition(",")
24+
if not _:
25+
raise ValueError("Malformed data URI, missing ',' separator")
26+
27+
meta = header[5:] # Strip 'data:'
28+
parts = meta.split(";")
29+
30+
is_base64 = False
31+
# Ends with base64?
32+
if parts[-1] == "base64":
33+
parts.pop()
34+
is_base64 = True
35+
36+
mime_type = None # Normally this would default to text/plain but we won't assume
37+
if len(parts) and len(parts[0]) > 0:
38+
# First part is the mime type
39+
mime_type = parts.pop(0)
40+
41+
attributes: Dict[str, str] = {}
42+
for part in parts:
43+
# Handle key=value pairs in the middle
44+
if "=" in part:
45+
key, value = part.split("=", 1)
46+
attributes[key] = value
47+
elif len(part) > 0:
48+
attributes[part] = ""
49+
50+
content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
51+
52+
return mime_type, attributes, content

packages/markitdown/tests/test_module_misc.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import openai
66
import pytest
77

8+
from markitdown._uri_utils import parse_data_uri, file_uri_to_path
9+
810
from markitdown import (
911
MarkItDown,
1012
UnsupportedFormatException,
@@ -176,6 +178,79 @@ def test_stream_info_operations() -> None:
176178
assert updated_stream_info.url == "url.1"
177179

178180

181+
def test_data_uris() -> None:
    """Check parse_data_uri on base64 and percent-encoded data URIs."""
    hello = b"Hello, World!"
    # (data URI, expected mime type, expected attributes)
    cases = [
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", "text/plain", {}),
        ("data:base64,SGVsbG8sIFdvcmxkIQ==", None, {}),
        (
            "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ==",
            "text/plain",
            {"charset": "utf-8"},
        ),
        ("data:,Hello%2C%20World%21", None, {}),
        ("data:text/plain,Hello%2C%20World%21", "text/plain", {}),
        (
            "data:text/plain;charset=utf-8,Hello%2C%20World%21",
            "text/plain",
            {"charset": "utf-8"},
        ),
    ]

    for data_uri, expected_mime, expected_attributes in cases:
        mime_type, attributes, data = parse_data_uri(data_uri)
        if expected_mime is None:
            assert mime_type is None
        else:
            assert mime_type == expected_mime
        assert len(attributes) == len(expected_attributes)
        for key, value in expected_attributes.items():
            assert attributes[key] == value
        assert data == hello
220+
221+
222+
def test_file_uris() -> None:
    """Check file_uri_to_path on the supported spellings of a file URI."""
    expected_path = "/path/to/file.txt"
    # (file URI, expected netloc)
    cases = [
        # Empty host
        ("file:///path/to/file.txt", None),
        # No host
        ("file:/path/to/file.txt", None),
        # localhost
        ("file://localhost/path/to/file.txt", "localhost"),
        # Query parameters
        ("file:///path/to/file.txt?param=value", None),
        # Fragment
        ("file:///path/to/file.txt#fragment", None),
    ]

    for file_uri, expected_netloc in cases:
        netloc, path = file_uri_to_path(file_uri)
        if expected_netloc is None:
            assert netloc is None
        else:
            assert netloc == expected_netloc
        assert path == expected_path
252+
253+
179254
def test_docx_comments() -> None:
180255
markitdown = MarkItDown()
181256

@@ -314,6 +389,8 @@ def test_markitdown_llm() -> None:
314389
"""Runs this file's tests from the command line."""
315390
for test in [
316391
test_stream_info_operations,
392+
test_data_uris,
393+
test_file_uris,
317394
test_docx_comments,
318395
test_input_as_strings,
319396
test_markitdown_remote,

packages/markitdown/tests/test_module_vectors.py

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import time
44
import pytest
55
import codecs
6+
import base64
67

8+
from pathlib import Path
79

810
if __name__ == "__main__":
911
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
@@ -108,8 +110,8 @@ def test_convert_stream_without_hints(test_vector):
108110
reason="do not run tests that query external urls",
109111
)
110112
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
111-
def test_convert_url(test_vector):
112-
"""Test the conversion of a stream with no stream info."""
113+
def test_convert_http_uri(test_vector):
114+
"""Test the conversion of an HTTP:// or HTTPS:// URI."""
113115
markitdown = MarkItDown()
114116

115117
time.sleep(1) # Ensure we don't hit rate limits
@@ -124,8 +126,44 @@ def test_convert_url(test_vector):
124126
assert string not in result.markdown
125127

126128

129+
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_file_uri(test_vector):
    """Test the conversion of a file:// URI."""
    markitdown = MarkItDown()

    # Build the file URI for the vector's fixture on disk.
    fixture_path = Path(os.path.join(TEST_FILES_DIR, test_vector.filename))
    file_uri = fixture_path.as_uri()

    result = markitdown.convert(file_uri, url=test_vector.url)

    for expected in test_vector.must_include:
        assert expected in result.markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in result.markdown
142+
143+
144+
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_data_uri(test_vector):
    """Test the conversion of a data URI."""
    markitdown = MarkItDown()

    # Embed the fixture's bytes, base64-encoded, in a data URI that carries
    # the vector's mimetype.
    fixture_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    with open(fixture_path, "rb") as stream:
        encoded = base64.b64encode(stream.read()).decode("utf-8")
    data_uri = f"data:{test_vector.mimetype};base64,{encoded}"

    result = markitdown.convert(data_uri, url=test_vector.url)

    for expected in test_vector.must_include:
        assert expected in result.markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in result.markdown
163+
164+
127165
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
128-
def test_convert_with_data_uris(test_vector):
166+
def test_convert_keep_data_uris(test_vector):
129167
"""Test API functionality when keep_data_uris is enabled"""
130168
markitdown = MarkItDown()
131169

@@ -143,7 +181,7 @@ def test_convert_with_data_uris(test_vector):
143181

144182

145183
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
146-
def test_convert_stream_with_data_uris(test_vector):
184+
def test_convert_stream_keep_data_uris(test_vector):
147185
"""Test the conversion of a stream with no stream info."""
148186
markitdown = MarkItDown()
149187

@@ -175,7 +213,9 @@ def test_convert_stream_with_data_uris(test_vector):
175213
test_convert_local,
176214
test_convert_stream_with_hints,
177215
test_convert_stream_without_hints,
178-
test_convert_url,
216+
test_convert_http_uri,
217+
test_convert_file_uri,
218+
test_convert_data_uri,
179219
]:
180220
for test_vector in GENERAL_TEST_VECTORS:
181221
print(
@@ -186,8 +226,8 @@ def test_convert_stream_with_data_uris(test_vector):
186226

187227
# Data URI tests
188228
for test_function in [
189-
test_convert_with_data_uris,
190-
test_convert_stream_with_data_uris,
229+
test_convert_keep_data_uris,
230+
test_convert_stream_keep_data_uris,
191231
]:
192232
for test_vector in DATA_URI_TEST_VECTORS:
193233
print(

0 commit comments

Comments
 (0)