microsoft · carlodek · Sep 30, 2025 · Oct 1, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,5 @@
 .vscode
-
+.idea
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/...src/markitdown/converters/_llm_caption.py → ...markitdown/src/markitdown/_llm_caption.py b/...src/markitdown/converters/_llm_caption.py → ...markitdown/src/markitdown/_llm_caption.py
@@ -1,7 +1,7 @@
 from typing import BinaryIO, Union
 import base64
 import mimetypes
-from .._stream_info import StreamInfo
+from ._stream_info import StreamInfo
 
 
 def llm_caption(

diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@@ -1,3 +1,4 @@
+import mimetypes
 import zipfile
 from io import BytesIO
 from typing import BinaryIO
@@ -6,6 +7,8 @@
 from bs4 import BeautifulSoup, Tag
 
 from .math.omml import OMML_NS, oMath2Latex
+from ..._stream_info import StreamInfo
+from ..._llm_caption import llm_caption
 
 MATH_ROOT_TEMPLATE = "".join(
     (
@@ -115,7 +118,64 @@ def _pre_process_math(content: bytes) -> bytes:
     return str(soup).encode()
 
 
-def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
+def _pre_process_images(
+        content: bytes,
+        rels_content: bytes,
+        files: dict,
+        llm_client,
+        llm_model,
+        prompt=None
+) -> bytes:
+    """
+    Replaces embedded images in a DOCX XML part with LLM-generated captions.
+
+    Args:
+        content (bytes): XML of the part to process (e.g. word/footnotes.xml).
+        rels_content (bytes): XML of the matching .rels part (r:embed id -> Target).
+        files (dict): Archive member name -> raw bytes for the whole DOCX.
+        llm_client (Any): Client forwarded to llm_caption.
+        llm_model (str): Model name forwarded to llm_caption.
+        prompt (str): Caption prompt; a default is used when None or blank.
+
+    Returns:
+        bytes: The re-serialized XML with each resolvable image replaced by a
+        text run; images whose target is not in `files` are left untouched.
+    """
+    import posixpath  # local import: archive member names always use "/"
+
+    soup = BeautifulSoup(content.decode(), features="xml")
+    rels_soup = BeautifulSoup(rels_content.decode(), features="xml")
+    if prompt is None or prompt.strip() == "":
+        prompt = "Write a detailed caption for this image."
+    for blip_tag in soup.find_all("a:blip"):
+        rid = blip_tag.get("r:embed")
+        rel_tag = rels_soup.find("Relationship", {"Id": rid}) if rid else None
+        target = rel_tag.get("Target") if rel_tag else None
+        if not target:
+            continue
+        # Target may be "media/x.png", "../media/x.png" or "/word/media/x.png".
+        image_path = posixpath.normpath(posixpath.join("word", target)).lstrip("/")
+        if image_path not in files:
+            continue
+        stream_info = StreamInfo(
+            mimetype=mimetypes.guess_type(image_path)[0] or "application/octet-stream",
+            extension=posixpath.splitext(image_path)[1],
+            filename=posixpath.basename(image_path),
+        )
+        description = llm_caption(
+            file_stream=BytesIO(files[image_path]), stream_info=stream_info,
+            client=llm_client, model=llm_model, prompt=prompt,
+        ) or "Image could not be described"
+        placeholder_run = soup.new_tag("w:r")
+        placeholder_text = soup.new_tag("w:t")
+        placeholder_text.string = description
+        placeholder_run.append(placeholder_text)
+        # Swap out the whole <w:drawing> when present, otherwise just the blip.
+        (blip_tag.find_parent("w:drawing") or blip_tag).replace_with(placeholder_run)
+    return str(soup).encode()
+
+
+def pre_process_docx(input_docx: BinaryIO, llm_client=None, llm_model=None, llm_prompt=None) -> BinaryIO:
     """
     Pre-processes a DOCX file with provided steps.
 
@@ -125,6 +185,9 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
 
     Args:
         input_docx (BinaryIO): A binary input stream representing the DOCX file.
+        llm_client (Any): LLM client to use for generating descriptions.
+        llm_model (str): LLM model to use for generating descriptions.
+        llm_prompt (str): Prompt to use for generating descriptions.
 
     Returns:
         BinaryIO: A binary output stream representing the processed DOCX file.
@@ -136,21 +199,35 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
         "word/footnotes.xml",
         "word/endnotes.xml",
     ]
+    llm_for_images = llm_client is not None and llm_model is not None
     with zipfile.ZipFile(input_docx, mode="r") as zip_input:
         files = {name: zip_input.read(name) for name in zip_input.namelist()}
         with zipfile.ZipFile(output_docx, mode="w") as zip_output:
             zip_output.comment = zip_input.comment
+
             for name, content in files.items():
                 if name in pre_process_enable_files:
                     try:
                         # Pre-process the content
                         updated_content = _pre_process_math(content)
                         # In the future, if there are more pre-processing steps, they can be added here
+                        if llm_for_images:
+                            rels_name = f"word/_rels/{name.split('/')[-1]}.rels"
+                            if rels_name in files:
+                                updated_content = _pre_process_images(
+                                    updated_content,
+                                    files[rels_name],
+                                    files,
+                                    llm_client=llm_client,
+                                    llm_model=llm_model,
+                                    prompt=llm_prompt
+                                )
                         zip_output.writestr(name, updated_content)
                     except Exception:
                         # If there is an error in processing the content, write the original content
                         zip_output.writestr(name, content)
                 else:
                     zip_output.writestr(name, content)
+
     output_docx.seek(0)
     return output_docx
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -17,17 +17,18 @@
     import mammoth
     import mammoth.docx.files
 
+
     def mammoth_files_open(self, uri):
         warn("DOCX: processing of r:link resources (e.g., linked images) is disabled.")
         return io.BytesIO(b"")
 
+
     mammoth.docx.files.Files.open = mammoth_files_open
 
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
 
-
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 ]
@@ -45,10 +46,10 @@ def __init__(self):
         self._html_converter = HtmlConverter()
 
     def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+            self,
+            file_stream: BinaryIO,
+            stream_info: StreamInfo,
+            **kwargs: Any,  # Options to pass to the converter
     ) -> bool:
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
@@ -63,10 +64,10 @@ def accepts(
         return False
 
     def convert(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+            self,
+            file_stream: BinaryIO,
+            stream_info: StreamInfo,
+            **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
         # Check: the dependencies
         if _dependency_exc_info is not None:
@@ -83,7 +84,11 @@ def convert(
             )
 
         style_map = kwargs.get("style_map", None)
-        pre_process_stream = pre_process_docx(file_stream)
+
+        llm_client = kwargs.get("llm_client")
+        llm_model = kwargs.get("llm_model")
+        llm_prompt = kwargs.get("llm_prompt")
+        pre_process_stream = pre_process_docx(file_stream, llm_client, llm_model, llm_prompt)
         return self._html_converter.convert_string(
             mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
             **kwargs,

diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -6,10 +6,9 @@
 import html
 
 from typing import BinaryIO, Any
-from operator import attrgetter
 
 from ._html_converter import HtmlConverter
-from ._llm_caption import llm_caption
+from .._llm_caption import llm_caption
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

diff --git a/packages/markitdown/tests/test_files/docx_with_image_test.docx b/packages/markitdown/tests/test_files/docx_with_image_test.docx