From 4ec59822231557614b6c6b10f682afa3af29f837 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Tue, 11 Feb 2025 16:50:36 -0500 Subject: [PATCH 1/4] added priority flag to doc intel converter constructor --- .../src/markitdown/converters/_doc_intel_converter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 94acc9f3..55973a75 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -1,4 +1,5 @@ from typing import Any, Union +import re # Azure imports from azure.ai.documentintelligence import DocumentIntelligenceClient @@ -24,6 +25,7 @@ def __init__( self, endpoint: str, api_version: str = "2024-07-31-preview", + priority: float = 0.0 ): self.endpoint = endpoint self.api_version = api_version @@ -32,6 +34,7 @@ def __init__( api_version=self.api_version, credential=DefaultAzureCredential(), ) + self._priority = priority def convert( self, local_path: str, **kwargs: Any From 923d3fbcae4a0b7ac789dbaf0ec120ebacac6455 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Tue, 11 Feb 2025 16:51:15 -0500 Subject: [PATCH 2/4] fixed analysis features bug for docx --- .../src/markitdown/converters/_doc_intel_converter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 55973a75..faf26e79 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -61,8 +61,8 @@ def convert( with open(local_path, "rb") as f: file_bytes = f.read() - # Certain document analysis features are not availiable for filetypes (.xlsx, .pptx, .html) - if extension.lower() in [".xlsx", 
".pptx", ".html"]: + # Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx) + if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]: analysis_features = [] else: analysis_features = [ From 3adb40e49bdb03a0e2503b6d2f98e6e43541e612 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Tue, 11 Feb 2025 17:04:18 -0500 Subject: [PATCH 3/4] formatting --- .../src/markitdown/converters/_doc_intel_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index faf26e79..68001490 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -25,7 +25,7 @@ def __init__( self, endpoint: str, api_version: str = "2024-07-31-preview", - priority: float = 0.0 + priority: float = 0.0, ): self.endpoint = endpoint self.api_version = api_version From 18d326ff7ddd8186d7d85b5d6097e0a8928a24a9 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Tue, 11 Feb 2025 17:36:55 -0500 Subject: [PATCH 4/4] removed duplicate priority argument --- .../markitdown/src/markitdown/converters/_doc_intel_converter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 807a0624..ed8aabfb 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -27,7 +27,6 @@ def __init__( priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, endpoint: str, api_version: str = "2024-07-31-preview", - priority: float = 0.0, ): super().__init__(priority=priority)