From b085ad894a23a34e3a4190d26bdba22f6a1d726b Mon Sep 17 00:00:00 2001 From: Nadin Chernova Date: Mon, 20 Oct 2025 18:23:17 -0700 Subject: [PATCH 1/3] Add tests for pdf converter --- .../markitdown/tests/test_pdf_converter.py | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 packages/markitdown/tests/test_pdf_converter.py diff --git a/packages/markitdown/tests/test_pdf_converter.py b/packages/markitdown/tests/test_pdf_converter.py new file mode 100644 index 00000000..dd44326d --- /dev/null +++ b/packages/markitdown/tests/test_pdf_converter.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 -m pytest + +import io +import os +import pytest +from unittest.mock import Mock, patch +import sys + +#import MarkItDown framework, pdf converter +from markitdown import MarkItDown, StreamInfo +from markitdown.converters._pdf_converter import ( + PdfConverter, + ACCEPTED_MIME_TYPE_PREFIXES, + ACCEPTED_FILE_EXTENSIONS, +) +from markitdown._exceptions import MissingDependencyException + +# Paths and setup for locating test assets +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") +PDF_TEST_FILE = os.path.join(TEST_FILES_DIR, "test.pdf") + +# Unit tests for PdfConverter.accepts() +# To verify that converter correctly recognizes PDF files based on extension and MIME types, rejecting invalid ones +class TestPdfConverterAccepts: + + #Test that .pdf extension is accepted. + def test_accepts_pdf_extension(self): + converter = PdfConverter() + stream_info = StreamInfo(extension=".pdf") + assert converter.accepts(io.BytesIO(), stream_info) is True + + #Test that .PDF extension is accepted (case insensitive). + def test_accepts_pdf_extension_uppercase(self): + converter = PdfConverter() + stream_info = StreamInfo(extension=".PDF") + assert converter.accepts(io.BytesIO(), stream_info) is True + + #Test that application/pdf mimetype is accepted. 
+ def test_accepts_application_pdf_mimetype(self): + converter = PdfConverter() + stream_info = StreamInfo(mimetype="application/pdf") + assert converter.accepts(io.BytesIO(), stream_info) is True + + #Test that application/pdf with charset is accepted. + def test_accepts_application_pdf_with_charset(self): + converter = PdfConverter() + stream_info = StreamInfo(mimetype="application/pdf; charset=utf-8") + assert converter.accepts(io.BytesIO(), stream_info) is True + + #Test that application/x-pdf mimetype is accepted + def test_accepts_application_x_pdf_mimetype(self): + converter = PdfConverter() + stream_info = StreamInfo(mimetype="application/x-pdf") + assert converter.accepts(io.BytesIO(), stream_info) is True + + #Test that mimetype matching is case insensitive. + def test_accepts_mimetype_case_insensitive(self): + converter = PdfConverter() + stream_info = StreamInfo(mimetype="APPLICATION/PDF") + assert converter.accepts(io.BytesIO(), stream_info) is True + + #Test that non-PDF extensions are rejected. + def test_rejects_wrong_extension(self): + converter = PdfConverter() + stream_info = StreamInfo(extension=".txt") + assert converter.accepts(io.BytesIO(), stream_info) is False + + #Test that non-PDF mimetypes are rejected. + def test_rejects_wrong_mimetype(self): + converter = PdfConverter() + stream_info = StreamInfo(mimetype="text/plain") + assert converter.accepts(io.BytesIO(), stream_info) is False + + #Test that empty StreamInfo is rejected. + def test_rejects_empty_stream_info(self): + converter = PdfConverter() + stream_info = StreamInfo() + assert converter.accepts(io.BytesIO(), stream_info) is False + +# Unit tests for PdfConverter.convert() +class TestPdfConverterConvert: + + #Test that MissingDependencyException is raised when pdfminer is not available. 
+ def test_convert_missing_dependency(self): + # Mock the dependency check to simulate missing pdfminer + with patch("markitdown.converters._pdf_converter._dependency_exc_info") as mock_exc_info: + # Create a fake ImportError + try: + raise ImportError("No module named 'pdfminer'") + except ImportError: + mock_exc_info.__bool__ = Mock(return_value=True) + mock_exc_info.__getitem__ = Mock(side_effect=lambda x: sys.exc_info()[x]) + + converter = PdfConverter() + stream_info = StreamInfo(extension=".pdf") + + with pytest.raises(MissingDependencyException) as exc_info: + converter.convert(io.BytesIO(b"fake pdf content"), stream_info) + + # Check the exception message + assert "pdf" in str(exc_info.value).lower() + assert ".pdf" in str(exc_info.value) + + +#Tests for constants to ensure accepted extensions and MIME types are correctly defined and non-empty +class TestPdfConverterConstants: + #Test that ACCEPTED_MIME_TYPE_PREFIXES contains expected values. + def test_accepted_mime_type_prefixes(self): + assert "application/pdf" in ACCEPTED_MIME_TYPE_PREFIXES + assert "application/x-pdf" in ACCEPTED_MIME_TYPE_PREFIXES + assert len(ACCEPTED_MIME_TYPE_PREFIXES) >= 2 + + #Test that ACCEPTED_FILE_EXTENSIONS contains expected values. 
+ def test_accepted_file_extensions(self): + assert ".pdf" in ACCEPTED_FILE_EXTENSIONS + assert len(ACCEPTED_FILE_EXTENSIONS) >= 1 + +# Allows this file to be executed directly with python test_pdf_converter.py +# Runs tests in verbose mode +if __name__ == "__main__": + """Runs this file's tests from the command line.""" + pytest.main([__file__, "-v"]) From 44974c5121619ba48ef7df4906c71bb0e3842623 Mon Sep 17 00:00:00 2001 From: ffreyli Date: Mon, 20 Oct 2025 21:38:07 -0700 Subject: [PATCH 2/3] Created _test_uri_utils.py with Unit Tests for the methods in _uri_utils.py --- packages/markitdown/tests/_test_uri_utils.py | 29 ++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 packages/markitdown/tests/_test_uri_utils.py diff --git a/packages/markitdown/tests/_test_uri_utils.py b/packages/markitdown/tests/_test_uri_utils.py new file mode 100644 index 00000000..098ead31 --- /dev/null +++ b/packages/markitdown/tests/_test_uri_utils.py @@ -0,0 +1,29 @@ +import pytest + +import markitdown._uri_utils + + +class TestUriUtils: + + def test_file_uri_to_path(self): + assert markitdown._uri_utils.file_uri_to_path("file://markitdown/tests/test_files/test.docx") == ("markitdown", "/tests/test_files/test.docx") + + def test_file_uri_to_path_raises_error(self): + with pytest.raises(ValueError): + markitdown._uri_utils.file_uri_to_path("https://google.com/") + + @pytest.mark.parametrize("uri, expected", [("data:,Hello%2C%20World%21", (None, {}, b"Hello, World!")), + ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", ("text/plain", {}, b"Hello, World!")), + ("data:text/plain;first=Hello;second=World,attributes%20with%20%3D", ("text/plain", {"first":"Hello", "second":"World"}, b"attributes with =")), + ("data:text/plain;test_attribute,empty%20attribute", ("text/plain", {'test_attribute': ''}, b"empty attribute")), + ]) + def test_parse_data_uri(self, uri, expected): + assert markitdown._uri_utils.parse_data_uri(uri) == expected + + def 
test_parse_data_uri_raises_error_not_data_uri(self): + with pytest.raises(ValueError): + markitdown._uri_utils.parse_data_uri("https://google.com/") + + def test_parse_data_uri_raises_error_malformed_uri(self): + with pytest.raises(ValueError): + markitdown._uri_utils.parse_data_uri("data:Hello%2C%20World%21") \ No newline at end of file From 578612c4d8b3477798cfc0f9eccaa0686ddb9644 Mon Sep 17 00:00:00 2001 From: Lucian Petriuc Date: Mon, 20 Oct 2025 23:10:33 -0700 Subject: [PATCH 3/3] added test_csv_converter.py --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index aa4abd38..d3ad5526 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,4 @@ cython_debug/ src/.DS_Store .DS_Store .cursorrules +/.vs