Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
.vscode

.idea
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import BinaryIO, Union
import base64
import mimetypes
from .._stream_info import StreamInfo
from _stream_info import StreamInfo


def llm_caption(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import mimetypes
import zipfile
from io import BytesIO
from typing import BinaryIO
Expand All @@ -6,6 +7,8 @@
from bs4 import BeautifulSoup, Tag

from .math.omml import OMML_NS, oMath2Latex
from ..._stream_info import StreamInfo
from ..._llm_caption import llm_caption

MATH_ROOT_TEMPLATE = "".join(
(
Expand Down Expand Up @@ -115,7 +118,64 @@ def _pre_process_math(content: bytes) -> bytes:
return str(soup).encode()


def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
def _pre_process_images(
    content: bytes,
    rels_content: bytes,
    files: dict,
    llm_client,
    llm_model,
    prompt=None
) -> bytes:
    """
    Finds images in the document and replaces them with their description
    generated by an LLM.

    Every ``a:blip`` element carrying an ``r:embed`` relationship id is
    resolved through the relationships part to an entry in ``files``. The
    image bytes are passed to ``llm_caption`` and the enclosing ``w:drawing``
    (or the ``a:blip`` itself when there is no such ancestor) is replaced by
    a ``w:r``/``w:t`` run holding the caption text.

    Args:
        content (bytes): XML of the document part to process.
        rels_content (bytes): XML of the relationships part for that document part.
        files (dict): Mapping of archive member name to its raw bytes.
        llm_client: Client object forwarded to ``llm_caption``.
        llm_model: Model identifier forwarded to ``llm_caption``.
        prompt (str | None): Caption prompt; a default is used when missing or blank.

    Returns:
        bytes: The re-serialized XML with images replaced by captions.
    """
    # Function-scope import so this block does not depend on the file's import header.
    import posixpath

    soup = BeautifulSoup(content.decode(), features="xml")
    rels_soup = BeautifulSoup(rels_content.decode(), features="xml")

    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    # An image referenced several times is captioned only once.
    captions = {}

    for blip_tag in soup.find_all("a:blip"):
        rid = blip_tag.get("r:embed")
        if not rid:
            continue

        rel_tag = rels_soup.find("Relationship", {"Id": rid})
        if rel_tag is None:
            continue

        target = rel_tag.get("Target")
        if not target:
            # A relationship without a Target cannot be resolved to a file.
            continue

        # Targets are resolved against the "word/" directory. They may be
        # relative ("media/image1.png", "../media/image1.png") or absolute
        # ("/word/media/image1.png"); normalize all of them to the slash-free
        # prefix form used by zip member names.
        image_path = posixpath.normpath(posixpath.join("word", target)).lstrip("/")
        if image_path not in files:
            continue

        # Prefer replacing the whole drawing; fall back to the blip itself.
        anchor = blip_tag.find_parent("w:drawing")
        if anchor is None:
            anchor = blip_tag
        if anchor.parent is None:
            # The enclosing drawing was already replaced via an earlier blip;
            # replace_with() on a detached element would raise.
            continue

        description = captions.get(image_path)
        if description is None:
            image_bytes: bytes = files[image_path]
            content_type, _ = mimetypes.guess_type(image_path)
            if not content_type:
                content_type = "application/octet-stream"
            stream_info = StreamInfo(
                mimetype=content_type,
                extension=posixpath.splitext(image_path)[1],
                filename=posixpath.basename(image_path),
            )
            description = llm_caption(
                file_stream=BytesIO(image_bytes),
                stream_info=stream_info,
                client=llm_client,
                model=llm_model,
                prompt=prompt
            ) or "Image could not be described"
            captions[image_path] = description

        placeholder_run = soup.new_tag("w:r")
        placeholder_text = soup.new_tag("w:t")
        placeholder_text.string = description
        placeholder_run.append(placeholder_text)
        anchor.replace_with(placeholder_run)

    return str(soup).encode()


def pre_process_docx(input_docx: BinaryIO, llm_client=None, llm_model=None, llm_prompt=None) -> BinaryIO:
"""
Pre-processes a DOCX file with provided steps.

Expand All @@ -125,6 +185,9 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:

Args:
input_docx (BinaryIO): A binary input stream representing the DOCX file.
llm_client (Any): LLM client to use for generating descriptions.
llm_model (str): LLM model to use for generating descriptions.
llm_prompt (str): Prompt to use for generating descriptions.

Returns:
BinaryIO: A binary output stream representing the processed DOCX file.
Expand All @@ -136,21 +199,35 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
"word/footnotes.xml",
"word/endnotes.xml",
]
llm_for_images = llm_client is not None and llm_model is not None
with zipfile.ZipFile(input_docx, mode="r") as zip_input:
files = {name: zip_input.read(name) for name in zip_input.namelist()}
with zipfile.ZipFile(output_docx, mode="w") as zip_output:
zip_output.comment = zip_input.comment

for name, content in files.items():
if name in pre_process_enable_files:
try:
# Pre-process the content
updated_content = _pre_process_math(content)
# In the future, if there are more pre-processing steps, they can be added here
if llm_for_images:
rels_name = f"word/_rels/{name.split('/')[-1]}.rels"
if rels_name in files:
updated_content = _pre_process_images(
updated_content,
files[rels_name],
files,
llm_client=llm_client,
llm_model=llm_model,
prompt=llm_prompt
)
zip_output.writestr(name, updated_content)
except Exception:
# If there is an error in processing the content, write the original content
zip_output.writestr(name, content)
else:
zip_output.writestr(name, content)

output_docx.seek(0)
return output_docx
25 changes: 15 additions & 10 deletions packages/markitdown/src/markitdown/converters/_docx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,18 @@
import mammoth
import mammoth.docx.files


def mammoth_files_open(self, uri):
    """
    Replacement for ``mammoth.docx.files.Files.open``.

    Ignores ``uri``, emits a warning that r:link resources are not
    processed, and hands back an empty in-memory binary stream.
    """
    message = "DOCX: processing of r:link resources (e.g., linked images) is disabled."
    warn(message)
    empty_stream = io.BytesIO()
    return empty_stream


mammoth.docx.files.Files.open = mammoth_files_open

except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()


ACCEPTED_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
Expand All @@ -45,10 +46,10 @@ def __init__(self):
self._html_converter = HtmlConverter()

def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
Expand All @@ -63,10 +64,10 @@ def accepts(
return False

def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check: the dependencies
if _dependency_exc_info is not None:
Expand All @@ -83,7 +84,11 @@ def convert(
)

style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream)

llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
llm_prompt = kwargs.get("llm_prompt")
pre_process_stream = pre_process_docx(file_stream, llm_client, llm_model, llm_prompt)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
**kwargs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
import html

from typing import BinaryIO, Any
from operator import attrgetter

from ._html_converter import HtmlConverter
from ._llm_caption import llm_caption
from .._llm_caption import llm_caption
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
Expand Down
Binary file not shown.