Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,8 @@ classifiers = [
dependencies = [
"beautifulsoup4",
"requests",
"mammoth",
"markdownify~=0.14.1",
"numpy",
"python-pptx",
"pandas",
"openpyxl",
"xlrd",
Expand All @@ -46,6 +44,14 @@ dependencies = [
"azure-identity"
]

# Optional extras for format-specific converters. Each extra pulls in the
# third-party parser that the matching converter imports.
[project.optional-dependencies]
# `all` bundles every optional parser listed in the per-format extras below;
# keep the two in sync when adding a new extra.
all = [
"python-pptx",
"mammoth"
]
# Per-format extras, e.g. `pip install markitdown[pptx]`.
pptx = ["python-pptx"]
docx = ["mammoth"]

[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"
Issues = "https://github.com/microsoft/markitdown/issues"
Expand All @@ -57,6 +63,12 @@ path = "src/markitdown/__about__.py"
[project.scripts]
markitdown = "markitdown.__main__:main"

# Enable the `all` extra in the default and test environments so that every
# optional converter dependency is installed there.
[tool.hatch.envs.default]
features = ["all"]

[tool.hatch.envs.hatch-test]
features = ["all"]

[tool.hatch.envs.types]
extra-dependencies = [
"mypy>=1.0.0",
Expand Down
4 changes: 2 additions & 2 deletions packages/markitdown/src/markitdown/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ._markitdown import MarkItDown
from ._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
MissingDependencyException,
FailedConversionAttempt,
FileConversionException,
UnsupportedFormatException,
Expand All @@ -19,7 +19,7 @@
"DocumentConverter",
"DocumentConverterResult",
"MarkItDownException",
"ConverterPrerequisiteException",
"MissingDependencyException",
"FailedConversionAttempt",
"FileConversionException",
"UnsupportedFormatException",
Expand Down
17 changes: 9 additions & 8 deletions packages/markitdown/src/markitdown/_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@ class MarkItDownException(Exception):
pass


class ConverterPrerequisiteException(MarkItDownException):
class MissingDependencyException(MarkItDownException):
"""
Thrown when instantiating a DocumentConverter in cases where
a required library or dependency is not installed, an API key
is not set, or some other prerequisite is not met.

This is not necessarily a fatal error. If thrown during
MarkItDown's plugin loading phase, the converter will simply be
skipped, and a warning will be issued.
Converters shipped with MarkItDown may depend on optional
dependencies. This exception is thrown when a converter's
convert() method is called, but the required dependency is not
installed. This is not necessarily a fatal error, as the converter
will simply be skipped (an error will bubble up only if no other
suitable converter is found).

Error messages should clearly indicate which dependency is missing.
"""

pass
Expand Down
1 change: 0 additions & 1 deletion packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from ._exceptions import (
FileConversionException,
UnsupportedFormatException,
ConverterPrerequisiteException,
FailedConversionAttempt,
)

Expand Down
27 changes: 25 additions & 2 deletions packages/markitdown/src/markitdown/converters/_docx_converter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
from typing import Union
import sys

import mammoth
from typing import Union

from ._base import (
DocumentConverterResult,
)

from ._base import DocumentConverter
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException

# mammoth is an optional dependency, but this converter cannot work without
# it. Attempt the import at module load and, rather than failing here, record
# any ImportError so it can be reported when a conversion is attempted.
_dependency_exc_info = None
try:
import mammoth
except ImportError:
# Keep the full (type, value, traceback) triple from sys.exc_info() so the
# original import failure can be chained onto the error raised later.
_dependency_exc_info = sys.exc_info()


class DocxConverter(HtmlConverter):
Expand All @@ -26,6 +36,19 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
if extension.lower() != ".docx":
return None

# Load the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
f"""{type(self).__name__} recognized the input as a potential .docx file, but the dependencies needed to read .docx files have not been installed. To resolve this error, include the optional dependency [docx] or [all] when installing MarkItDown. For example:

* pip install markitdown[docx]
* pip install markitdown[all]
* pip install markitdown[pptx, docx, ...]
* etc."""
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback

result = None
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None)
Expand Down
26 changes: 24 additions & 2 deletions packages/markitdown/src/markitdown/converters/_pptx_converter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
import base64
import pptx
import re
import html
import sys

from typing import Union

from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException

# python-pptx (imported as `pptx`) is an optional dependency, but this
# converter cannot work without it. Attempt the import at module load and,
# rather than failing here, record any ImportError so it can be reported when
# a conversion is attempted.
_dependency_exc_info = None
try:
import pptx
except ImportError:
# Keep the full (type, value, traceback) triple from sys.exc_info() so the
# original import failure can be chained onto the error raised later.
_dependency_exc_info = sys.exc_info()


class PptxConverter(HtmlConverter):
Expand Down Expand Up @@ -54,9 +64,21 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
if extension.lower() != ".pptx":
return None

md_content = ""
# Load the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
f"""{type(self).__name__} recognized the input as a potential .pptx file, but the dependencies needed to read .pptx files have not been installed. To resolve this error, include the optional dependency [pptx] or [all] when installing MarkItDown. For example:

* pip install markitdown[pptx]
* pip install markitdown[all]
* pip install markitdown[pptx, docx, ...]
* etc."""
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback

presentation = pptx.Presentation(local_path)
md_content = ""
slide_num = 0
for slide in presentation.slides:
slide_num += 1
Expand Down