Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@
# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")

PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
PRIORITY_GENERIC_FILE_FORMAT = 10.0


_plugins: Union[None | List[Any]] = None


Expand Down Expand Up @@ -103,6 +99,23 @@ def __init__(
# Register the converters
self._page_converters: List[DocumentConverter] = []

# Note: We have tight control over the order of built-in converters, but
# plugins can register converters in any order. A converter's .priority
# reasserts some control over the order of converters.
#
# Priorities work as follows. By default, most converters get priority
# DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
# is the PlainTextConverter, which gets priority DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT (== 10),
# with lower values being tried first (i.e., higher priority).
#
# Just prior to conversion, the converters are sorted by priority, using
# a stable sort. This means that converters with the same priority will
# remain in the same order, with the most recently registered converters
# appearing first.
#
# Plugins can register converters with any priority, to appear before or
# after the built-ins. For example, a plugin with priority 9 will run
# before the PlainTextConverter, but after the built-in converters.
if (
enable_builtins is None or enable_builtins
): # Default to True when not specified
Expand All @@ -123,6 +136,8 @@ def enable_builtins(self, **kwargs) -> None:
self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")
if self._exiftool_path is None:
self._exiftool_path = os.getenv("EXIFTOOL_PATH")

# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
Expand Down Expand Up @@ -349,11 +364,10 @@ def _convert(
_kwargs["_parent_converters"] = self._page_converters

# If we hit an error log it and keep trying
# try:
if True:
try:
res = converter.convert(local_path, **_kwargs)
# except Exception:
# error_trace = ("\n\n" + traceback.format_exc()).strip()
except Exception:
error_trace = ("\n\n" + traceback.format_exc()).strip()

if res is not None:
# Normalize the content
Expand Down
10 changes: 9 additions & 1 deletion packages/markitdown/src/markitdown/converters/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,15 @@ def __init__(self, title: Union[str, None] = None, text_content: str = ""):
class DocumentConverter:
"""Abstract superclass of all DocumentConverters."""

def __init__(self, priority: float = 0.0):
# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
)
PRIORITY_GENERIC_FILE_FORMAT = (
10.0 # Near catch-all converters for mimetypes like text/*, etc.
)

def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
self._priority = priority

def convert(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
NOTE: It is better to use the Bing API
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):

def __init__(
self,
*,
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
endpoint: str,
api_version: str = "2024-07-31-preview",
):
super().__init__(priority=priority)

self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
DocumentConverterResult,
)

from ._base import DocumentConverter
from ._html_converter import HtmlConverter


Expand All @@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter


Expand All @@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an image
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
class IpynbConverter(DocumentConverter):
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
Abstract class for multi-modal media (e.g., images and audio)
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)

def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
Expand All @@ -27,10 +32,10 @@ def _get_metadata(self, local_path, exiftool_path=None):

return None
else:
try:
if True:
result = subprocess.run(
[exiftool_path, "-json", local_path], capture_output=True, text=True
).stdout
return json.loads(result)[0]
except Exception:
return None
# except Exception:
# return None
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import tempfile
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings

Expand Down Expand Up @@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a MP3
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
- Email body content
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter

# Optional Transcription support
Expand All @@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd

from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter


Expand All @@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
class YouTubeConverter(DocumentConverter):
"""Handle YouTube specially, focusing on the video title, description, and transcript."""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
- Cleans up temporary files after processing
"""

def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down
6 changes: 3 additions & 3 deletions packages/markitdown/tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:

if __name__ == "__main__":
"""Runs this file's tests from the command line."""
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_remote()
test_markitdown_local()
test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm()
print("All tests passed!")