diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index b1169270..2e9965a6 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -244,7 +244,14 @@ def convert( or source.startswith("https://") or source.startswith("file://") ): - return self.convert_url(source, stream_info=stream_info, *kwargs) + # Rename the url argument to mock_url + # (Deprecated -- use stream_info) + _kwargs = {k: v for k, v in kwargs.items()} + if "url" in _kwargs: + _kwargs["mock_url"] = _kwargs["url"] + del _kwargs["url"] + + return self.convert_url(source, stream_info=stream_info, **_kwargs) else: return self.convert_local(source, stream_info=stream_info, **kwargs) # Path object @@ -350,12 +357,26 @@ def convert_stream( return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs) def convert_url( - self, url: str, **kwargs: Any + self, + url: str, + *, + stream_info: Optional[StreamInfo] = None, + file_extension: Optional[str] = None, # Deprecated -- use stream_info + mock_url: Optional[ + str + ] = None, # Mock the request as if it came from a different URL + **kwargs: Any, ) -> DocumentConverterResult: # TODO: fix kwargs type # Send a HTTP request to the URL response = self._requests_session.get(url, stream=True) response.raise_for_status() - return self.convert_response(response, **kwargs) + return self.convert_response( + response, + stream_info=stream_info, + file_extension=file_extension, + url=mock_url, + **kwargs, + ) def convert_response( self, @@ -660,10 +681,12 @@ def _get_stream_info_guesses( return guesses - def _normalize_charset(self, charset: str) -> str: + def _normalize_charset(self, charset: str | None) -> str | None: """ Normalize a charset string to a canonical form. 
""" + if charset is None: + return None try: return codecs.lookup(charset).name except LookupError: diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py new file mode 100644 index 00000000..5d2b2fc1 --- /dev/null +++ b/packages/markitdown/tests/_test_vectors.py @@ -0,0 +1,214 @@ +import dataclasses +from typing import List + + +@dataclasses.dataclass(frozen=True, kw_only=True) +class FileTestVector(object): + filename: str + mimetype: str | None + charset: str | None + url: str | None + must_include: List[str] + must_not_include: List[str] + + +GENERAL_TEST_VECTORS = [ + FileTestVector( + filename="test.docx", + mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + charset=None, + url=None, + must_include=[ + "314b0a30-5b04-470b-b9f7-eed2c2bec74a", + "49e168b7-d2ae-407f-a055-2167576f39a1", + "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", + "# Abstract", + "# Introduction", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + ], + must_not_include=[], + ), + FileTestVector( + filename="test.xlsx", + mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + charset=None, + url=None, + must_include=[ + "## 09060124-b5e7-4717-9d07-3c046eb", + "6ff4173b-42a5-4784-9b19-f49caff4d93d", + "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", + ], + must_not_include=[], + ), + FileTestVector( + filename="test.xls", + mimetype="application/vnd.ms-excel", + charset=None, + url=None, + must_include=[ + "## 09060124-b5e7-4717-9d07-3c046eb", + "6ff4173b-42a5-4784-9b19-f49caff4d93d", + "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", + ], + must_not_include=[], + ), + FileTestVector( + filename="test.pptx", + mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation", + charset=None, + url=None, + must_include=[ + "2cdda5c8-e50e-4db4-b5f0-9722a649f455", + "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", + "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", + 
"1b92870d-e3b5-4e65-8153-919f4ff45592", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title + "2003", # chart value + ], + must_not_include=[], + ), + FileTestVector( + filename="test_outlook_msg.msg", + mimetype="application/vnd.ms-outlook", + charset=None, + url=None, + must_include=[ + "# Email Message", + "**From:** test.sender@example.com", + "**To:** test.recipient@example.com", + "**Subject:** Test Email Message", + "## Content", + "This is the body of the test email message", + ], + must_not_include=[], + ), + FileTestVector( + filename="test.pdf", + mimetype="application/pdf", + charset=None, + url=None, + must_include=[ + "While there is contemporaneous exploration of multi-agent approaches" + ], + must_not_include=[], + ), + FileTestVector( + filename="test_blog.html", + mimetype="text/html", + charset="utf-8", + url="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math", + must_include=[ + "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? 
What about inference parameters?", + "an example where high cost can easily prevent a generic complex", + ], + must_not_include=[], + ), + FileTestVector( + filename="test_wikipedia.html", + mimetype="text/html", + charset="utf-8", + url="https://en.wikipedia.org/wiki/Microsoft", + must_include=[ + "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", + 'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")', + ], + must_not_include=[ + "You are encouraged to create an account and log in", + "154 languages", + "move to sidebar", + ], + ), + FileTestVector( + filename="test_serp.html", + mimetype="text/html", + charset="utf-8", + url="https://www.bing.com/search?q=microsoft+wikipedia", + must_include=[ + "](https://en.wikipedia.org/wiki/Microsoft", + "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond", + "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox", + ], + must_not_include=[ + "https://www.bing.com/ck/a?!&&p=", + "data:image/svg+xml,%3Csvg%20width%3D", + ], + ), + FileTestVector( + filename="test_mskanji.csv", + mimetype="text/csv", + charset="cp932", + url=None, + must_include=[ + "名前,年齢,住所", + "佐藤太郎,30,東京", + "三木英子,25,大阪", + "髙橋淳,35,名古屋", + ], + must_not_include=[], + ), + FileTestVector( + filename="test.json", + mimetype="application/json", + charset="ascii", + url=None, + must_include=[ + "5b64c88c-b3c3-4510-bcb8-da0b200602d8", + "9700dc99-6685-40b4-9a3a-5e406dcb37f3", + ], + must_not_include=[], + ), + FileTestVector( + filename="test_rss.xml", + mimetype="text/xml", + charset="utf-8", + url=None, + must_include=[ + "# The Official Microsoft Blog", + "## Ignite 2024: Why nearly 70% of the Fortune 500 now use Microsoft 365 Copilot", + "In the case of AI, it is absolutely true that the industry is moving incredibly fast", + ], + must_not_include=[" None: - result = subprocess.run( - ["python", "-m", "markitdown", 
"--version"], capture_output=True, text=True - ) - - assert result.returncode == 0, f"CLI exited with error: {result.stderr}" - assert __version__ in result.stdout, f"Version not found in output: {result.stdout}" - - -def test_invalid_flag(shared_tmp_dir) -> None: - result = subprocess.run( - ["python", "-m", "markitdown", "--foobar"], capture_output=True, text=True - ) - - assert result.returncode != 0, f"CLI exited with error: {result.stderr}" - assert ( - "unrecognized arguments" in result.stderr - ), f"Expected 'unrecognized arguments' to appear in STDERR" - assert "SYNTAX" in result.stderr, f"Expected 'SYNTAX' to appear in STDERR" - - -def test_output_to_stdout(shared_tmp_dir) -> None: - # DOC X - result = subprocess.run( - ["python", "-m", "markitdown", os.path.join(TEST_FILES_DIR, "test.docx")], - capture_output=True, - text=True, - ) - - assert result.returncode == 0, f"CLI exited with error: {result.stderr}" - for test_string in DOCX_TEST_STRINGS: - assert ( - test_string in result.stdout - ), f"Expected string not found in output: {test_string}" - - -def test_output_to_file(shared_tmp_dir) -> None: - # DOC X, flag -o at the end - docx_output_file_1 = os.path.join(shared_tmp_dir, "test_docx_1.md") - result = subprocess.run( - [ - "python", - "-m", - "markitdown", - os.path.join(TEST_FILES_DIR, "test.docx"), - "-o", - docx_output_file_1, - ], - capture_output=True, - text=True, - ) - - assert result.returncode == 0, f"CLI exited with error: {result.stderr}" - assert os.path.exists( - docx_output_file_1 - ), f"Output file not created: {docx_output_file_1}" - - with open(docx_output_file_1, "r") as f: - output = f.read() - for test_string in DOCX_TEST_STRINGS: - assert ( - test_string in output - ), f"Expected string not found in output: {test_string}" - - # DOC X, flag -o at the beginning - docx_output_file_2 = os.path.join(shared_tmp_dir, "test_docx_2.md") - result = subprocess.run( - [ - "python", - "-m", - "markitdown", - "-o", - docx_output_file_2, - 
os.path.join(TEST_FILES_DIR, "test.docx"), - ], - capture_output=True, - text=True, - ) - - assert result.returncode == 0, f"CLI exited with error: {result.stderr}" - assert os.path.exists( - docx_output_file_2 - ), f"Output file not created: {docx_output_file_2}" - - with open(docx_output_file_2, "r") as f: - output = f.read() - for test_string in DOCX_TEST_STRINGS: - assert ( - test_string in output - ), f"Expected string not found in output: {test_string}" - - -if __name__ == "__main__": - """Runs this file's tests from the command line.""" - import tempfile - - with tempfile.TemporaryDirectory() as tmp_dir: - test_version(tmp_dir) - test_invalid_flag(tmp_dir) - test_output_to_stdout(tmp_dir) - test_output_to_file(tmp_dir) - print("All tests passed!") diff --git a/packages/markitdown/tests/test_cli_misc.py b/packages/markitdown/tests/test_cli_misc.py new file mode 100644 index 00000000..345d5cc3 --- /dev/null +++ b/packages/markitdown/tests/test_cli_misc.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 -m pytest +import subprocess +import pytest +from markitdown import __version__ + +# This file contains CLI tests that are not directly tested by the FileTestVectors. +# This includes things like help messages, version numbers, and invalid flags. 
+ + +def test_version() -> None: + result = subprocess.run( + ["python", "-m", "markitdown", "--version"], capture_output=True, text=True + ) + + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + assert __version__ in result.stdout, f"Version not found in output: {result.stdout}" + + +def test_invalid_flag() -> None: + result = subprocess.run( + ["python", "-m", "markitdown", "--foobar"], capture_output=True, text=True + ) + + assert result.returncode != 0, f"CLI exited with error: {result.stderr}" + assert ( + "unrecognized arguments" in result.stderr + ), f"Expected 'unrecognized arguments' to appear in STDERR" + assert "SYNTAX" in result.stderr, f"Expected 'SYNTAX' to appear in STDERR" + + +if __name__ == "__main__": + """Runs this file's tests from the command line.""" + test_version() + test_invalid_flag() + print("All tests passed!") diff --git a/packages/markitdown/tests/test_cli_vectors.py b/packages/markitdown/tests/test_cli_vectors.py new file mode 100644 index 00000000..b2f068c7 --- /dev/null +++ b/packages/markitdown/tests/test_cli_vectors.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 -m pytest +import os +import time +import pytest +import subprocess +import locale +from typing import List + +if __name__ == "__main__": + from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector +else: + from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector + +from markitdown import ( + MarkItDown, + UnsupportedFormatException, + FileConversionException, + StreamInfo, +) + +skip_remote = ( + True if os.environ.get("GITHUB_ACTIONS") else False +) # Don't run these tests in CI + +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") +TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files" + + +# Prepare CLI test vectors (remove vectors that require mocking the url) +CLI_TEST_VECTORS: List[FileTestVector] = [] +for test_vector in
GENERAL_TEST_VECTORS: + if test_vector.url is not None: + continue + CLI_TEST_VECTORS.append(test_vector) + + +@pytest.fixture(scope="session") +def shared_tmp_dir(tmp_path_factory): + return tmp_path_factory.mktemp("pytest_tmp") + + +@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +def test_output_to_stdout(shared_tmp_dir, test_vector) -> None: + """Test that the CLI outputs to stdout correctly.""" + + result = subprocess.run( + [ + "python", + "-m", + "markitdown", + os.path.join(TEST_FILES_DIR, test_vector.filename), + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + for test_string in test_vector.must_include: + assert test_string in result.stdout + for test_string in test_vector.must_not_include: + assert test_string not in result.stdout + + +@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +def test_output_to_file(shared_tmp_dir, test_vector) -> None: + """Test that the CLI outputs to a file correctly.""" + + output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output") + result = subprocess.run( + [ + "python", + "-m", + "markitdown", + "-o", + output_file, + os.path.join(TEST_FILES_DIR, test_vector.filename), + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + assert os.path.exists(output_file), f"Output file not created: {output_file}" + + with open(output_file, "r") as f: + output_data = f.read() + for test_string in test_vector.must_include: + assert test_string in output_data + for test_string in test_vector.must_not_include: + assert test_string not in output_data + + os.remove(output_file) + assert not os.path.exists(output_file), f"Output file not deleted: {output_file}" + + +@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None: + """Test that the CLI reads from stdin correctly.""" + +
test_input = b"" + with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: + test_input = stream.read() + + result = subprocess.run( + [ + "python", + "-m", + "markitdown", + os.path.join(TEST_FILES_DIR, test_vector.filename), + ], + input=test_input, + capture_output=True, + text=False, + ) + + stdout = result.stdout.decode(locale.getpreferredencoding()) + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + for test_string in test_vector.must_include: + assert test_string in stdout + for test_string in test_vector.must_not_include: + assert test_string not in stdout + + +@pytest.mark.skipif( + skip_remote, + reason="do not run tests that query external urls", +) +@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) +def test_convert_url(shared_tmp_dir, test_vector): + """Test that the CLI converts a file fetched from a remote URL.""" + # Note: shared_tmp_dir is not used here, but is needed to match the signature + + markitdown = MarkItDown() + + time.sleep(1) # Ensure we don't hit rate limits + result = subprocess.run( + ["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename], + capture_output=True, + text=False, + ) + + stdout = result.stdout.decode(locale.getpreferredencoding()) + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + for test_string in test_vector.must_include: + assert test_string in stdout + for test_string in test_vector.must_not_include: + assert test_string not in stdout + + +if __name__ == "__main__": + import sys + import tempfile + + """Runs this file's tests from the command line.""" + + with tempfile.TemporaryDirectory() as tmp_dir: + for test_function in [ + test_output_to_stdout, + test_output_to_file, + test_input_from_stdin_without_hints, + test_convert_url, + ]: + for test_vector in CLI_TEST_VECTORS: + print( + f"Running {test_function.__name__} on {test_vector.filename}...", + end="", + ) + test_function(tmp_dir, test_vector) + print("OK") + print("All
tests passed!") diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_module_misc.py similarity index 51% rename from packages/markitdown/tests/test_markitdown.py rename to packages/markitdown/tests/test_module_misc.py index f76ff8cd..4079107f 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -3,9 +3,7 @@ import os import shutil import openai - import pytest -import requests from markitdown import ( MarkItDown, @@ -14,6 +12,10 @@ StreamInfo, ) +# This file contains module tests that are not directly tested by the FileTestVectors. +# This includes things like helper functions and runtime conversion options +# (e.g., LLM clients, exiftool path, transcription services, etc.) + skip_remote = ( True if os.environ.get("GITHUB_ACTIONS") else False ) # Don't run these tests in CI @@ -59,36 +61,6 @@ "the model we're going to be using today is GPT 3.5 turbo", # From the transcript ] -XLSX_TEST_STRINGS = [ - "## 09060124-b5e7-4717-9d07-3c046eb", - "6ff4173b-42a5-4784-9b19-f49caff4d93d", - "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", -] - -XLS_TEST_STRINGS = [ - "## 09060124-b5e7-4717-9d07-3c046eb", - "6ff4173b-42a5-4784-9b19-f49caff4d93d", - "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", -] - -DOCX_TEST_STRINGS = [ - "314b0a30-5b04-470b-b9f7-eed2c2bec74a", - "49e168b7-d2ae-407f-a055-2167576f39a1", - "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", - "# Abstract", - "# Introduction", - "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", -] - -MSG_TEST_STRINGS = [ - "# Email Message", - "**From:** test.sender@example.com", - "**To:** test.recipient@example.com", - "**Subject:** Test Email Message", - "## Content", - "This is the body of the test email message", -] - DOCX_COMMENT_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", "49e168b7-d2ae-407f-a055-2167576f39a1", @@ -100,65 +72,24 @@ "Yet another comment in the doc. 
55yiyi-asd09", ] -PPTX_TEST_STRINGS = [ - "2cdda5c8-e50e-4db4-b5f0-9722a649f455", - "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", - "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", - "1b92870d-e3b5-4e65-8153-919f4ff45592", - "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", - "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title - "2003", # chart value -] - BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" BLOG_TEST_STRINGS = [ "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?", "an example where high cost can easily prevent a generic complex", ] - -RSS_TEST_STRINGS = [ - "The Official Microsoft Blog", - "In the case of AI, it is absolutely true that the industry is moving incredibly fast", -] - - -WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft" -WIKIPEDIA_TEST_STRINGS = [ - "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", - 'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")', -] -WIKIPEDIA_TEST_EXCLUDES = [ - "You are encouraged to create an account and log in", - "154 languages", - "move to sidebar", -] - -SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia" -SERP_TEST_STRINGS = [ - "](https://en.wikipedia.org/wiki/Microsoft", - "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond", - "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox", -] -SERP_TEST_EXCLUDES = [ - "https://www.bing.com/ck/a?!&&p=", - "data:image/svg+xml,%3Csvg%20width%3D", -] - -CSV_CP932_TEST_STRINGS = [ - "名前,年齢,住所", - "佐藤太郎,30,東京", - "三木英子,25,大阪", - "髙橋淳,35,名古屋", -] - LLM_TEST_STRINGS = [ "5bda1dd6", ] -JSON_TEST_STRINGS = [ - 
"5b64c88c-b3c3-4510-bcb8-da0b200602d8", - "9700dc99-6685-40b4-9a3a-5e406dcb37f3", +PPTX_TEST_STRINGS = [ + "2cdda5c8-e50e-4db4-b5f0-9722a649f455", + "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", + "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", + "1b92870d-e3b5-4e65-8153-919f4ff45592", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title + "2003", # chart value ] @@ -245,95 +176,9 @@ def test_stream_info_operations() -> None: assert updated_stream_info.url == "url.1" -def test_stream_info_guesses() -> None: - """Test StreamInfo guesses based on stream content.""" - - test_tuples = [ - ( - os.path.join(TEST_FILES_DIR, "test.xlsx"), - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ), - ( - os.path.join(TEST_FILES_DIR, "test.docx"), - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ), - ( - os.path.join(TEST_FILES_DIR, "test.pptx"), - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ), - (os.path.join(TEST_FILES_DIR, "test.xls"), "application/vnd.ms-excel"), - ] - - markitdown = MarkItDown() - for file_path, expected_mimetype in test_tuples: - with open(file_path, "rb") as f: - guesses = markitdown._get_stream_info_guesses( - f, - StreamInfo( - filename=os.path.basename(file_path), - local_path=file_path, - extension=os.path.splitext(file_path)[1], - ), - ) - assert len(guesses) > 0 - assert guesses[0].mimetype == expected_mimetype - assert guesses[0].extension == os.path.splitext(file_path)[1] - - -@pytest.mark.skipif( - skip_remote, - reason="do not run tests that query external urls", -) -def test_markitdown_remote() -> None: +def test_docx_comments() -> None: markitdown = MarkItDown() - # By URL - result = markitdown.convert(PDF_TEST_URL) - for test_string in PDF_TEST_STRINGS: - assert test_string in result.text_content - - # By stream - response = requests.get(PDF_TEST_URL) - result = markitdown.convert_stream( - 
io.BytesIO(response.content), file_extension=".pdf", url=PDF_TEST_URL - ) - for test_string in PDF_TEST_STRINGS: - assert test_string in result.text_content - - # Youtube - result = markitdown.convert(YOUTUBE_TEST_URL) - for test_string in YOUTUBE_TEST_STRINGS: - assert test_string in result.text_content - - -def test_markitdown_local() -> None: - markitdown = MarkItDown() - - # Test PDF processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pdf")) - validate_strings(result, PDF_TEST_STRINGS) - - # Test XLSX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) - validate_strings(result, XLSX_TEST_STRINGS) - - # Test XLS processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls")) - for test_string in XLS_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test DOCX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) - validate_strings(result, DOCX_TEST_STRINGS) - - # Test DOCX processing, with comments - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), - style_map="comment-reference => ", - ) - validate_strings(result, DOCX_COMMENT_TEST_STRINGS) - # Test DOCX processing, with comments and setting style_map on init markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") result = markitdown_with_style_map.convert( @@ -341,53 +186,9 @@ def test_markitdown_local() -> None: ) validate_strings(result, DOCX_COMMENT_TEST_STRINGS) - # Test PPTX processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) - validate_strings(result, PPTX_TEST_STRINGS) - # Test HTML processing - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL - ) - validate_strings(result, BLOG_TEST_STRINGS) - - # Test Wikipedia processing - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, 
"test_wikipedia.html"), url=WIKIPEDIA_TEST_URL - ) - text_content = result.text_content.replace("\\", "") - validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES) - - # Test Bing processing - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL - ) - text_content = result.text_content.replace("\\", "") - validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES) - - # Test RSS processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml")) - text_content = result.text_content.replace("\\", "") - for test_string in RSS_TEST_STRINGS: - assert test_string in text_content - - # Test MSG (Outlook email) processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) - validate_strings(result, MSG_TEST_STRINGS) - - # Test non-UTF-8 encoding - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) - validate_strings(result, CSV_CP932_TEST_STRINGS) - - # Test JSON processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json")) - validate_strings(result, JSON_TEST_STRINGS) - - # # Test ZIP file processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) - validate_strings(result, DOCX_TEST_STRINGS) - validate_strings(result, XLSX_TEST_STRINGS) - validate_strings(result, BLOG_TEST_STRINGS) +def test_input_as_strings() -> None: + markitdown = MarkItDown() # Test input from a stream input_data = b"

Test

" @@ -400,84 +201,22 @@ def test_markitdown_local() -> None: assert "# Test" in result.text_content -def test_markitdown_streams() -> None: +@pytest.mark.skipif( + skip_remote, + reason="do not run tests that query external urls", +) +def test_markitdown_remote() -> None: markitdown = MarkItDown() - # Test PDF processing - with open(os.path.join(TEST_FILES_DIR, "test.pdf"), "rb") as f: - result = markitdown.convert(f, file_extension=".pdf") - validate_strings(result, PDF_TEST_STRINGS) - - # Test XLSX processing - with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f: - result = markitdown.convert(f, file_extension=".xlsx") - validate_strings(result, XLSX_TEST_STRINGS) - - # Test XLS processing - with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f: - result = markitdown.convert(f, file_extension=".xls") - for test_string in XLS_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") - assert test_string in text_content - - # Test DOCX processing - with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f: - result = markitdown.convert(f, file_extension=".docx") - validate_strings(result, DOCX_TEST_STRINGS) - - # Test DOCX processing, with comments - with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f: - result = markitdown.convert( - f, - file_extension=".docx", - style_map="comment-reference => ", - ) - validate_strings(result, DOCX_COMMENT_TEST_STRINGS) + # By URL + result = markitdown.convert(PDF_TEST_URL) + for test_string in PDF_TEST_STRINGS: + assert test_string in result.text_content - # Test DOCX processing, with comments and setting style_map on init - markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") - with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f: - result = markitdown_with_style_map.convert(f, file_extension=".docx") - validate_strings(result, DOCX_COMMENT_TEST_STRINGS) - - # Test PPTX processing - with open(os.path.join(TEST_FILES_DIR, 
"test.pptx"), "rb") as f: - result = markitdown.convert(f, file_extension=".pptx") - validate_strings(result, PPTX_TEST_STRINGS) - - # Test HTML processing - with open(os.path.join(TEST_FILES_DIR, "test_blog.html"), "rb") as f: - result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL) - validate_strings(result, BLOG_TEST_STRINGS) - - # Test Wikipedia processing - with open(os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rb") as f: - result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL) - text_content = result.text_content.replace("\\", "") - validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES) - - # Test Bing processing - with open(os.path.join(TEST_FILES_DIR, "test_serp.html"), "rb") as f: - result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL) - text_content = result.text_content.replace("\\", "") - validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES) - - # Test RSS processing - with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f: - result = markitdown.convert(f, file_extension=".xml") - text_content = result.text_content.replace("\\", "") - for test_string in RSS_TEST_STRINGS: - assert test_string in text_content - - # Test MSG (Outlook email) processing - with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f: - result = markitdown.convert(f, file_extension=".msg") - validate_strings(result, MSG_TEST_STRINGS) - - # Test JSON processing - with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f: - result = markitdown.convert(f, file_extension=".json") - validate_strings(result, JSON_TEST_STRINGS) + # Youtube + result = markitdown.convert(YOUTUBE_TEST_URL) + for test_string in YOUTUBE_TEST_STRINGS: + assert test_string in result.text_content @pytest.mark.skipif( @@ -573,13 +312,17 @@ def test_markitdown_llm() -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" - 
test_stream_info_operations() - test_stream_info_guesses() - test_markitdown_remote() - test_markitdown_local() - test_markitdown_streams() - test_speech_transcription() - test_exceptions() - test_markitdown_exiftool() - test_markitdown_llm() + for test in [ + test_stream_info_operations, + test_docx_comments, + test_input_as_strings, + test_markitdown_remote, + test_speech_transcription, + test_exceptions, + test_markitdown_exiftool, + test_markitdown_llm, + ]: + print(f"Running {test.__name__}...", end="") + test() + print("OK") print("All tests passed!") diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py new file mode 100644 index 00000000..873be753 --- /dev/null +++ b/packages/markitdown/tests/test_module_vectors.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 -m pytest +import os +import time +import pytest +import codecs + + +if __name__ == "__main__": + from _test_vectors import GENERAL_TEST_VECTORS +else: + from ._test_vectors import GENERAL_TEST_VECTORS + +from markitdown import ( + MarkItDown, + UnsupportedFormatException, + FileConversionException, + StreamInfo, +) + +skip_remote = ( + True if os.environ.get("GITHUB_ACTIONS") else False +) # Don't run these tests in CI + +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") +TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files" + + +@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) +def test_guess_stream_info(test_vector): + """Test the ability to guess stream info.""" + markitdown = MarkItDown() + + local_path = os.path.join(TEST_FILES_DIR, test_vector.filename) + expected_extension = os.path.splitext(test_vector.filename)[1] + + with open(local_path, "rb") as stream: + guesses = markitdown._get_stream_info_guesses( + stream, + base_guess=StreamInfo( + filename=os.path.basename(test_vector.filename), + local_path=local_path, + 
extension=expected_extension, + ), + ) + + # For some limited exceptions, we can't guarantee the exact + # mimetype or extension, so we'll special-case them here. + if test_vector.filename in [ + "test_outlook_msg.msg", + "test_mskanji.csv", # See: https://github.com/google/magika/issues/983 + ]: + return + + assert guesses[0].mimetype == test_vector.mimetype + assert guesses[0].extension == expected_extension + assert guesses[0].charset == test_vector.charset + + +@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) +def test_convert_local(test_vector): + """Test the conversion of a local file.""" + markitdown = MarkItDown() + + result = markitdown.convert( + os.path.join(TEST_FILES_DIR, test_vector.filename), url=test_vector.url + ) + for string in test_vector.must_include: + assert string in result.markdown + for string in test_vector.must_not_include: + assert string not in result.markdown + + +@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) +def test_convert_stream_with_hints(test_vector): + """Test the conversion of a stream with full stream info.""" + markitdown = MarkItDown() + + stream_info = StreamInfo( + extension=os.path.splitext(test_vector.filename)[1], + mimetype=test_vector.mimetype, + charset=test_vector.charset, + ) + + with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: + result = markitdown.convert( + stream, stream_info=stream_info, url=test_vector.url + ) + for string in test_vector.must_include: + assert string in result.markdown + for string in test_vector.must_not_include: + assert string not in result.markdown + + +@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) +def test_convert_stream_without_hints(test_vector): + """Test the conversion of a stream with no stream info.""" + markitdown = MarkItDown() + + # For some limited exceptions, we can't guarantee the exact + # mimetype or extension, so we'll special-case them here. 
+ if test_vector.filename in [ + # This appears to be a subtle bug in magika. + # See: https://github.com/google/magika/issues/983 + "test_mskanji.csv", + ]: + return + + with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: + result = markitdown.convert(stream, url=test_vector.url) + for string in test_vector.must_include: + assert string in result.markdown + for string in test_vector.must_not_include: + assert string not in result.markdown + + +@pytest.mark.skipif( + skip_remote, + reason="do not run tests that query external urls", +) +@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) +def test_convert_url(test_vector): + """Test the conversion of a file fetched from a remote URL.""" + markitdown = MarkItDown() + + time.sleep(1) # Ensure we don't hit rate limits + + result = markitdown.convert( + TEST_FILES_URL + "/" + test_vector.filename, + url=test_vector.url, # Mock where this file would be found + ) + for string in test_vector.must_include: + assert string in result.markdown + for string in test_vector.must_not_include: + assert string not in result.markdown + + +if __name__ == "__main__": + import sys + + """Runs this file's tests from the command line.""" + for test_function in [ + test_guess_stream_info, + test_convert_local, + test_convert_stream_with_hints, + test_convert_stream_without_hints, + test_convert_url, + ]: + for test_vector in GENERAL_TEST_VECTORS: + print( + f"Running {test_function.__name__} on {test_vector.filename}...", end="" + ) + test_function(test_vector) + print("OK") + print("All tests passed!")