"""Engine option helpers for object-detection runtimes."""

from __future__ import annotations

from typing import List, Literal

from pydantic import Field

from docling.models.inference_engines.object_detection.base import (
    BaseObjectDetectionEngineOptions,
    ObjectDetectionEngineType,
)


class OnnxRuntimeObjectDetectionEngineOptions(BaseObjectDetectionEngineOptions):
    """Runtime configuration for ONNX Runtime based object-detection models.

    Preprocessing parameters come from HuggingFace preprocessor configs,
    not from these options.
    """

    # Literal pin: fixes this options class to the ONNXRUNTIME engine so the
    # factory (and any tagged-union parsing) can dispatch on engine_type.
    engine_type: Literal[ObjectDetectionEngineType.ONNXRUNTIME] = (
        ObjectDetectionEngineType.ONNXRUNTIME
    )

    # Default export name; a model's EngineModelConfig.extra_config may
    # override it via the "model_filename" key (see the ONNX engine).
    model_filename: str = Field(
        default="model.onnx",
        description="Filename of the ONNX export inside the model repository",
    )

    # default_factory keeps the list per-instance (no shared mutable default).
    # Anything other than the bare CPU default takes precedence over the
    # accelerator device when the engine resolves providers.
    providers: List[str] = Field(
        default_factory=lambda: ["CPUExecutionProvider"],
        description="Ordered list of ONNX Runtime execution providers to try",
    )


class TransformersObjectDetectionEngineOptions(BaseObjectDetectionEngineOptions):
    """Runtime configuration for Transformers-based object-detection models."""

    # Literal pin: fixes this options class to the TRANSFORMERS engine.
    engine_type: Literal[ObjectDetectionEngineType.TRANSFORMERS] = (
        ObjectDetectionEngineType.TRANSFORMERS
    )

    # None delegates dtype selection to the engine/model default.
    torch_dtype: str | None = Field(
        default=None,
        description="PyTorch dtype for model inference (e.g., 'float32', 'float16', 'bfloat16')",
    )
# --- docling/datamodel/pipeline_options.py (additions) ---


class LayoutObjectDetectionOptions(ObjectDetectionStagePresetMixin, BaseLayoutOptions):
    """Options for layout detection using object-detection runtimes."""

    # NOTE(review): presumably the discriminator used when (de)serializing
    # layout option variants — confirm against BaseLayoutOptions usage.
    kind: ClassVar[str] = "layout_object_detection"

    create_orphan_clusters: Annotated[
        bool,
        Field(
            description=(
                "Create clusters for orphaned elements not assigned to any structure. When True, isolated text or "
                "elements are grouped into their own clusters. Recommended for complete document coverage."
            )
        ),
    ] = False

    # Deep-copy the spec out of the shared preset constant so mutating one
    # options instance never leaks into OBJECT_DETECTION_LAYOUT_HERON.
    model_spec: ObjectDetectionModelSpec = Field(
        default_factory=lambda: stage_model_specs.OBJECT_DETECTION_LAYOUT_HERON.model_spec.model_copy(
            deep=True
        ),
        description="Object-detection model specification for layout analysis",
    )

    # No default: callers choose an engine explicitly, or go through
    # from_preset(), which fills this in from the preset's default engine.
    engine_options: BaseObjectDetectionEngineOptions = Field(
        description="Runtime configuration for the object-detection engine",
    )


# Make the Heron preset reachable via LayoutObjectDetectionOptions.from_preset().
LayoutObjectDetectionOptions.register_preset(
    stage_model_specs.OBJECT_DETECTION_LAYOUT_HERON
)


# --- docling/datamodel/stage_model_specs.py (additions) ---

# =============================================================================
# OBJECT DETECTION MODEL SPECIFICATION
# =============================================================================


class ObjectDetectionModelSpec(BaseModel):
    """Specification for an object detection model.

    Simpler than VlmModelSpec - no prompts, no preprocessing params.
    Preprocessing comes from HuggingFace preprocessor configs.
    Model files are assumed to be at the root of the HuggingFace repo.
    """

    name: str = Field(description="Human-readable model name")

    repo_id: str = Field(description="Default HuggingFace repository ID")

    revision: str = Field(default="main", description="Default model revision")

    # Per-engine repo/revision/extra_config overrides; absent engines fall
    # back to the top-level repo_id/revision.
    engine_overrides: Dict["ObjectDetectionEngineType", EngineModelConfig] = Field(
        default_factory=dict,
        description="Engine-specific configuration overrides",
    )

    def get_engine_config(
        self, engine_type: "ObjectDetectionEngineType"
    ) -> EngineModelConfig:
        """Get EngineModelConfig for a specific object-detection engine.

        Args:
            engine_type: The engine type being requested

        Returns:
            EngineModelConfig populated with repo/revision and engine overrides
        """
        override = self.engine_overrides.get(engine_type)
        if override is not None:
            # merge_with fills any fields the override leaves unset from the
            # spec-level repo_id/revision.
            return override.merge_with(self.repo_id, self.revision)
        return EngineModelConfig(repo_id=self.repo_id, revision=self.revision)

    def get_repo_id(self, engine_type: "ObjectDetectionEngineType") -> str:
        """Get repository ID for specific engine.

        Args:
            engine_type: The engine type

        Returns:
            Repository ID (with engine override if applicable)
        """
        override = self.engine_overrides.get(engine_type)
        if override and override.repo_id:
            return override.repo_id
        return self.repo_id

    def get_revision(self, engine_type: "ObjectDetectionEngineType") -> str:
        """Get revision for specific engine.

        Args:
            engine_type: The engine type

        Returns:
            Model revision (with engine override if applicable)
        """
        override = self.engine_overrides.get(engine_type)
        if override and override.revision:
            return override.revision
        return self.revision


# =============================================================================
# STAGE PRESET SYSTEM
# =============================================================================


class ObjectDetectionStagePreset(BaseModel):
    """Preset definition for object detection-powered stages."""

    preset_id: str = Field(description="Preset identifier")
    name: str = Field(description="Human-readable preset name")
    description: str = Field(description="Description of this preset")
    model_spec: ObjectDetectionModelSpec = Field(
        description="Object detection model specification"
    )
    default_engine_type: ObjectDetectionEngineType = Field(
        default=ObjectDetectionEngineType.ONNXRUNTIME,
        description="Default inference engine to use",
    )
    stage_options: Dict[str, Any] = Field(
        default_factory=dict, description="Additional stage-specific defaults"
    )


class ObjectDetectionStagePresetMixin:
    """Mixin to enable preset loading for object detection stages.

    Each concrete subclass gets its own isolated preset registry (see
    __init_subclass__), so presets registered on one stage options class
    never leak into another.
    """

    _presets: ClassVar[Dict[str, ObjectDetectionStagePreset]]

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # Fresh registry per subclass — deliberately not shared with the mixin.
        cls._presets = {}

    @classmethod
    def register_preset(cls, preset: ObjectDetectionStagePreset) -> None:
        # First registration wins; duplicates are logged, not raised.
        # NOTE(review): presumably tolerates repeated module imports — confirm.
        if preset.preset_id not in cls._presets:
            cls._presets[preset.preset_id] = preset
        else:
            _log.error(
                f"Preset '{preset.preset_id}' already registered for {cls.__name__}"
            )

    @classmethod
    def get_preset(cls, preset_id: str) -> ObjectDetectionStagePreset:
        # KeyError includes the valid ids to make typos easy to diagnose.
        if preset_id not in cls._presets:
            raise KeyError(
                f"Preset '{preset_id}' not found for {cls.__name__}. "
                f"Available presets: {list(cls._presets.keys())}"
            )
        return cls._presets[preset_id]

    @classmethod
    def list_presets(cls) -> List[ObjectDetectionStagePreset]:
        return list(cls._presets.values())

    @classmethod
    def list_preset_ids(cls) -> List[str]:
        return list(cls._presets.keys())

    @classmethod
    def get_preset_info(cls) -> List[Dict[str, str]]:
        """Summarize registered presets as plain string dictionaries."""
        return [
            {
                "preset_id": p.preset_id,
                "name": p.name,
                "description": p.description,
                "model": p.model_spec.name,
                "default_engine": p.default_engine_type.value,
            }
            for p in cls._presets.values()
        ]

    @classmethod
    def from_preset(
        cls,
        preset_id: str,
        engine_options: Optional["BaseObjectDetectionEngineOptions"] = None,
        **overrides: Any,
    ):
        """Build a stage-options instance from a registered preset.

        Args:
            preset_id: Identifier of a preset previously passed to register_preset
            engine_options: Explicit engine options; None selects defaults for
                the preset's default_engine_type
            **overrides: Attribute overrides applied to the built instance

        Raises:
            KeyError: If the preset id is unknown
            ValueError: If the preset's default engine type has no default options
        """
        # Function-scope import: object_detection_engine_options imports from
        # this package's engine base, so importing at module level would be
        # circular at load time.
        from docling.datamodel.object_detection_engine_options import (
            OnnxRuntimeObjectDetectionEngineOptions,
            TransformersObjectDetectionEngineOptions,
        )

        preset = cls.get_preset(preset_id)

        if engine_options is None:
            if preset.default_engine_type == ObjectDetectionEngineType.ONNXRUNTIME:
                engine_options = OnnxRuntimeObjectDetectionEngineOptions()
            elif preset.default_engine_type == ObjectDetectionEngineType.TRANSFORMERS:
                engine_options = TransformersObjectDetectionEngineOptions()
            else:
                raise ValueError(
                    f"Unsupported engine type {preset.default_engine_type} for presets"
                )

        instance = cls(  # type: ignore[call-arg]
            model_spec=preset.model_spec,
            engine_options=engine_options,
            **preset.stage_options,
        )

        # NOTE(review): setattr bypasses pydantic validation unless the model
        # enables validate_assignment — confirm overrides are trusted input.
        for key, value in overrides.items():
            setattr(instance, key, value)

        return instance
# --- docling/datamodel/stage_model_specs.py (additions, continued) ---

# -----------------------------------------------------------------------------
# OBJECT DETECTION PRESETS
# -----------------------------------------------------------------------------

OBJECT_DETECTION_LAYOUT_HERON = ObjectDetectionStagePreset(
    preset_id="layout_heron_default",
    name="Layout Heron",
    description="RT-DETR layout-heron model (ResNet50)",
    model_spec=ObjectDetectionModelSpec(
        name="layout_heron",
        # Transformers loads the safetensors repo; the ONNX engine is pointed
        # at a separate pre-exported repo via the override below.
        repo_id="docling-project/docling-layout-heron",
        revision="main",
        engine_overrides={
            ObjectDetectionEngineType.ONNXRUNTIME: EngineModelConfig(
                repo_id="docling-project/docling-layout-heron-onnx",
                extra_config={"model_filename": "model.onnx"},
            )
        },
    ),
    default_engine_type=ObjectDetectionEngineType.TRANSFORMERS,
)


# --- docling/models/inference_engines/object_detection/__init__.py ---

"""Object detection inference engines."""

from docling.models.inference_engines.object_detection.base import (
    BaseObjectDetectionEngine,
    BaseObjectDetectionEngineOptions,
    ObjectDetectionEngineInput,
    ObjectDetectionEngineOutput,
    ObjectDetectionEngineType,
)
from docling.models.inference_engines.object_detection.factory import (
    create_object_detection_engine,
)

__all__ = [
    "BaseObjectDetectionEngine",
    "BaseObjectDetectionEngineOptions",
    "ObjectDetectionEngineInput",
    "ObjectDetectionEngineOutput",
    "ObjectDetectionEngineType",
    "create_object_detection_engine",
]
engines.""" + +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from enum import Enum +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from PIL.Image import Image +from pydantic import BaseModel, ConfigDict, Field + +if TYPE_CHECKING: + from docling.datamodel.stage_model_specs import EngineModelConfig + +_log = logging.getLogger(__name__) + + +class ObjectDetectionEngineType(str, Enum): + """Supported inference engine types for object-detection models.""" + + ONNXRUNTIME = "onnxruntime" + TRANSFORMERS = "transformers" + + +class BaseObjectDetectionEngineOptions(BaseModel): + """Base configuration shared across object-detection engines.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + engine_type: ObjectDetectionEngineType = Field( + description="Type of inference engine to use", + ) + + score_threshold: float = Field( + default=0.3, + description="Minimum confidence score to keep a detection (0.0 to 1.0)", + ) + + +class ObjectDetectionEngineInput(BaseModel): + """Generic input accepted by every object-detection engine.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + image: Image = Field(description="PIL image to run inference on") + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Optional metadata that is echoed back in the output", + ) + + +class ObjectDetectionEngineOutput(BaseModel): + """Output returned by object-detection engines.""" + + label_ids: List[int] = Field( + default_factory=list, + description="Predicted class indices", + ) + scores: List[float] = Field( + default_factory=list, + description="Confidence scores for the predictions", + ) + bboxes: List[List[float]] = Field( + default_factory=list, + description="Bounding boxes as [x_min, y_min, x_max, y_max] in pixels", + ) + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Optional metadata echoed back from the input or engine", + ) + + +class 
_log = logging.getLogger(__name__)


class BaseObjectDetectionEngine(ABC):
    """Abstract base-class for object-detection engines.

    Concrete engines implement initialize(), predict_batch() and
    get_label_mapping(); single-input prediction and lazy initialization
    are provided here on top of those hooks.
    """

    def __init__(
        self,
        options: BaseObjectDetectionEngineOptions,
        model_config: Optional[EngineModelConfig] = None,
    ) -> None:
        """Initialize the engine.

        Args:
            options: Engine-specific configuration options
            model_config: Model configuration (repo_id, revision, extra_config)
        """
        self.options = options
        self.model_config = model_config
        # Concrete initialize() implementations flip this once ready.
        self._initialized = False

    @abstractmethod
    def initialize(self) -> None:
        """Initialize engine resources (load models, allocate buffers, etc.)."""

    @abstractmethod
    def predict_batch(
        self, input_batch: List[ObjectDetectionEngineInput]
    ) -> List[ObjectDetectionEngineOutput]:
        """Run inference on a batch of inputs."""

    @abstractmethod
    def get_label_mapping(self) -> Dict[int, str]:
        """Get the label mapping for this model.

        Returns:
            Dictionary mapping label IDs to label names
        """

    def predict(
        self, input_data: ObjectDetectionEngineInput
    ) -> ObjectDetectionEngineOutput:
        """Run inference on a single input via the batch path."""
        if not self._initialized:
            _log.debug("Initializing %s for single prediction", type(self).__name__)
            self.initialize()

        return self.predict_batch([input_data])[0]

    def __call__(
        self,
        input_data: ObjectDetectionEngineInput | List[ObjectDetectionEngineInput],
    ) -> ObjectDetectionEngineOutput | List[ObjectDetectionEngineOutput]:
        """Dispatch to batch or single prediction based on the argument type."""
        if not self._initialized:
            _log.debug("Initializing %s for call", type(self).__name__)
            self.initialize()

        wants_batch = isinstance(input_data, list)
        return self.predict_batch(input_data) if wants_batch else self.predict(input_data)
def create_object_detection_engine(
    *,
    options: BaseObjectDetectionEngineOptions,
    model_spec: Optional[ObjectDetectionModelSpec] = None,
    accelerator_options: AcceleratorOptions,
    artifacts_path: Optional[Union[Path, str]] = None,
) -> BaseObjectDetectionEngine:
    """Factory to create object detection engines.

    Args:
        options: Engine-specific options
        model_spec: Model specification used to derive engine configuration
        accelerator_options: Hardware accelerator configuration
        artifacts_path: Optional path to local model artifacts root

    Returns:
        Engine instance (call .initialize() before use)

    Raises:
        ValueError: If the options type does not match its engine_type, or the
            engine type is unknown
    """
    # Derive the per-engine model configuration up front when a spec is given.
    model_config: Optional[EngineModelConfig] = (
        None if model_spec is None else model_spec.get_engine_config(options.engine_type)
    )

    engine_type = options.engine_type

    if engine_type == ObjectDetectionEngineType.ONNXRUNTIME:
        # Function-scope imports keep onnxruntime an optional dependency.
        from docling.datamodel.object_detection_engine_options import (
            OnnxRuntimeObjectDetectionEngineOptions,
        )
        from docling.models.inference_engines.object_detection.onnxruntime_engine import (
            OnnxRuntimeObjectDetectionEngine,
        )

        if not isinstance(options, OnnxRuntimeObjectDetectionEngineOptions):
            raise ValueError(
                f"Expected OnnxRuntimeObjectDetectionEngineOptions, got {type(options)}"
            )

        return OnnxRuntimeObjectDetectionEngine(
            options=options,
            model_config=model_config,
            artifacts_path=artifacts_path,
            accelerator_options=accelerator_options,
        )

    if engine_type == ObjectDetectionEngineType.TRANSFORMERS:
        # Function-scope imports keep torch/transformers optional dependencies.
        from docling.datamodel.object_detection_engine_options import (
            TransformersObjectDetectionEngineOptions,
        )
        from docling.models.inference_engines.object_detection.transformers_engine import (
            TransformersObjectDetectionEngine,
        )

        if not isinstance(options, TransformersObjectDetectionEngineOptions):
            raise ValueError(
                f"Expected TransformersObjectDetectionEngineOptions, got {type(options)}"
            )

        return TransformersObjectDetectionEngine(
            options=options,
            model_config=model_config,
            accelerator_options=accelerator_options,
            artifacts_path=artifacts_path,
        )

    raise ValueError(f"Unknown engine type: {engine_type}")
class HfObjectDetectionEngineBase(
    BaseObjectDetectionEngine, HuggingFaceModelDownloadMixin
):
    """Base class for object-detection engines that load HF artifacts and configs."""

    def __init__(
        self,
        *,
        options: BaseObjectDetectionEngineOptions,
        model_config: Optional[EngineModelConfig] = None,
        accelerator_options: AcceleratorOptions,
        artifacts_path: Optional[Union[Path, str]] = None,
    ) -> None:
        """Initialize shared HuggingFace engine state.

        Args:
            options: Engine-specific configuration options
            model_config: Model configuration; must be present with a repo_id
            accelerator_options: Hardware accelerator configuration
            artifacts_path: Optional local artifacts root; None means download

        Raises:
            ValueError: If model_config is missing or has no repo_id
        """
        if model_config is None or model_config.repo_id is None:
            raise ValueError(
                f"{type(self).__name__} requires model_config with repo_id"
            )

        super().__init__(options=options, model_config=model_config)
        self.options: BaseObjectDetectionEngineOptions = options
        self._model_config: EngineModelConfig = model_config
        self._repo_id: str = model_config.repo_id
        self._accelerator_options = accelerator_options
        # Normalize to Path, keeping None as "no local artifacts configured".
        self._artifacts_path = (
            artifacts_path if artifacts_path is None else Path(artifacts_path)
        )
        # Populated by subclasses during initialize().
        self._processor: Optional[BaseImageProcessor] = None
        self._id_to_label: Dict[int, str] = {}

    def _resolve_model_folder(self, repo_id: str, revision: str) -> Path:
        """Resolve model folder from artifacts_path or HF download."""

        def download_wrapper(download_repo_id: str, download_revision: str) -> Path:
            _log.info(
                "Downloading object-detection model from HuggingFace: %s@%s",
                download_repo_id,
                download_revision,
            )
            return self.download_models(
                repo_id=download_repo_id,
                revision=download_revision,
                local_dir=None,
                force=False,
                progress=False,
            )

        return resolve_model_artifacts_path(
            repo_id=repo_id,
            revision=revision,
            artifacts_path=self._artifacts_path,
            download_fn=download_wrapper,
        )

    def _load_preprocessor(self, model_folder: Path) -> BaseImageProcessor:
        """Load HuggingFace image processor from model folder.

        Raises:
            FileNotFoundError: If preprocessor_config.json is absent
            RuntimeError: If the processor fails to load (original error chained)
        """
        preprocessor_config = model_folder / "preprocessor_config.json"
        if not preprocessor_config.exists():
            raise FileNotFoundError(
                f"Image processor config not found: {preprocessor_config}"
            )

        try:
            from transformers import AutoImageProcessor

            _log.debug("Loading image processor from %s", model_folder)
            return AutoImageProcessor.from_pretrained(str(model_folder))
        except Exception as exc:
            # Chain explicitly so the original traceback is preserved as the
            # cause rather than as ambient "during handling" context.
            raise RuntimeError(
                f"Failed to load image processor from {model_folder}: {exc}"
            ) from exc

    def _load_label_mapping(self, model_folder: Path) -> Dict[int, str]:
        """Load label mapping from HuggingFace model config.

        Raises:
            RuntimeError: If the config cannot be read (original error chained)
        """
        try:
            from transformers import AutoConfig

            config = AutoConfig.from_pretrained(str(model_folder))
            # id2label keys may be strings in JSON configs; normalize to int.
            return {
                int(label_id): label_name
                for label_id, label_name in config.id2label.items()
            }
        except Exception as exc:
            raise RuntimeError(
                f"Failed to load label mapping from model config at {model_folder}: {exc}"
            ) from exc

    def get_label_mapping(self) -> Dict[int, str]:
        """Get the label mapping for this model."""
        return self._id_to_label

    def _build_output(
        self,
        *,
        input_item: ObjectDetectionEngineInput,
        labels: Iterable[Any],
        scores: Iterable[Any],
        boxes: Iterable[Sequence[Any]],
        apply_score_threshold: bool = False,
    ) -> ObjectDetectionEngineOutput:
        """Build standard engine output from raw detection iterables.

        Args:
            input_item: Input whose metadata is echoed into the output
            labels: Per-detection class ids (scalar-like values)
            scores: Per-detection confidences (scalar-like values)
            boxes: Per-detection coordinate sequences
            apply_score_threshold: Drop detections below options.score_threshold
        """
        label_ids: list[int] = []
        output_scores: list[float] = []
        bboxes: list[list[float]] = []

        for label, score, box in zip(labels, scores, boxes):
            score_float = self._as_float(score)
            if apply_score_threshold and score_float < self.options.score_threshold:
                continue

            label_ids.append(self._as_int(label))
            output_scores.append(score_float)
            bboxes.append([self._as_float(value) for value in box])

        return ObjectDetectionEngineOutput(
            label_ids=label_ids,
            scores=output_scores,
            bboxes=bboxes,
            metadata=input_item.metadata.copy(),
        )

    @staticmethod
    def _as_float(value: Any) -> float:
        """Coerce a scalar-like value (number, 1-element ndarray/tensor) to float.

        Raises:
            TypeError: If the value is multi-element or of an unsupported type
        """
        if isinstance(value, Real):
            return float(value)

        if isinstance(value, np.ndarray):
            if value.size != 1:
                raise TypeError(
                    f"Expected scalar-like ndarray with size 1, got shape={value.shape}"
                )
            return float(value.reshape(-1)[0])

        # torch is optional at this layer (the ONNX path never needs it); fall
        # through to the TypeError below instead of raising ImportError.
        try:
            import torch
        except ImportError:
            torch = None

        if torch is not None and isinstance(value, torch.Tensor):
            if value.numel() != 1:
                raise TypeError(
                    f"Expected scalar-like tensor with one element, got shape={tuple(value.shape)}"
                )
            return float(value.item())

        raise TypeError(f"Unsupported score/box value type: {type(value)!r}")

    @staticmethod
    def _as_int(value: Any) -> int:
        """Coerce a scalar-like value (integer, 1-element ndarray/tensor) to int.

        Raises:
            TypeError: If the value is multi-element or of an unsupported type
        """
        if isinstance(value, Integral):
            return int(value)

        if isinstance(value, np.ndarray):
            if value.size != 1:
                raise TypeError(
                    f"Expected scalar-like ndarray with size 1, got shape={value.shape}"
                )
            return int(value.reshape(-1)[0])

        # Same optional-torch guard as _as_float.
        try:
            import torch
        except ImportError:
            torch = None

        if torch is not None and isinstance(value, torch.Tensor):
            if value.numel() != 1:
                raise TypeError(
                    f"Expected scalar-like tensor with one element, got shape={tuple(value.shape)}"
                )
            return int(value.item())

        raise TypeError(f"Unsupported label value type: {type(value)!r}")
class OnnxRuntimeObjectDetectionEngine(HfObjectDetectionEngineBase):
    """ONNX Runtime engine for object detection models.

    Uses HuggingFace AutoImageProcessor for preprocessing to ensure
    consistency with transformers-based models. This is the source of truth
    for preprocessing parameters.
    """

    def __init__(
        self,
        *,
        options: OnnxRuntimeObjectDetectionEngineOptions,
        model_config: Optional[EngineModelConfig] = None,
        accelerator_options: AcceleratorOptions,
        artifacts_path: Optional[Union[Path, str]] = None,
    ):
        """Initialize the ONNX Runtime engine.

        Args:
            options: ONNX Runtime-specific runtime options
            model_config: Model configuration (repo_id, revision, extra_config)
            accelerator_options: Hardware accelerator configuration
            artifacts_path: Path to cached model artifacts
        """
        super().__init__(
            options=options,
            model_config=model_config,
            accelerator_options=accelerator_options,
            artifacts_path=artifacts_path,
        )
        self.options: OnnxRuntimeObjectDetectionEngineOptions = options
        # Both populated by initialize().
        self._session: Optional[ort.InferenceSession] = None
        self._model_path: Optional[Path] = None

    def _resolve_model_artifacts(self) -> tuple[Path, Path]:
        """Resolve model artifacts from artifacts_path or HF download.

        Returns:
            Tuple of (model_folder, model_path)

        Raises:
            FileNotFoundError: If the ONNX file is missing from the model folder
        """
        repo_id = self._repo_id
        revision = self._model_config.revision or "main"

        model_filename = self._resolve_model_filename()
        model_folder = self._resolve_model_folder(
            repo_id=repo_id,
            revision=str(revision),
        )
        model_path = model_folder / model_filename

        if not model_path.exists():
            raise FileNotFoundError(
                f"ONNX model file '{model_filename}' not found: {model_path}"
            )

        return model_folder, model_path

    def _resolve_model_filename(self) -> str:
        """Determine which ONNX filename to load."""
        # Engine-level default, unless the model config pins a specific file
        # via extra_config["model_filename"].
        filename = self.options.model_filename
        extra_filename = self._model_config.extra_config.get("model_filename")
        if extra_filename and isinstance(extra_filename, str):
            filename = extra_filename
        return filename

    def initialize(self) -> None:
        """Initialize ONNX session and preprocessor."""
        import onnxruntime as ort

        _log.info("Initializing ONNX Runtime object-detection engine")

        # Resolve model folder and model path in one step
        model_folder, self._model_path = self._resolve_model_artifacts()

        _log.debug(f"Using ONNX model at {self._model_path}")

        # Load preprocessor (source of truth for preprocessing)
        self._processor = self._load_preprocessor(model_folder)
        _log.debug(f"Loaded preprocessor with size: {self._processor.size}")  # type: ignore[attr-defined]

        # Load label mapping from config
        self._id_to_label = self._load_label_mapping(model_folder)
        _log.debug(f"Loaded label mapping with {len(self._id_to_label)} labels")

        # Create ONNX session
        sess_options = ort.SessionOptions()
        sess_options.intra_op_num_threads = self._accelerator_options.num_threads
        providers = self._resolve_providers()

        self._session = ort.InferenceSession(
            str(self._model_path),
            sess_options=sess_options,
            providers=providers,
        )

        # Marks the engine ready for the lazy-init checks in the base class.
        self._initialized = True
        _log.info(
            f"ONNX Runtime engine ready (providers={self._session.get_providers()})"
        )

    def _resolve_providers(self) -> List[str]:
        """Resolve ONNX Runtime providers from accelerator and engine options."""
        # Any explicitly configured provider list (beyond the bare CPU default)
        # wins over the accelerator device setting.
        configured_providers = self.options.providers or ["CPUExecutionProvider"]
        if configured_providers != ["CPUExecutionProvider"]:
            return configured_providers

        device = decide_device(
            self._accelerator_options.device,
            supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
        )

        if device.startswith("cuda"):
            # Keep CPU as fallback in case the CUDA provider cannot load.
            return ["CUDAExecutionProvider", "CPUExecutionProvider"]

        if device != AcceleratorDevice.CPU.value:
            _log.warning(
                "Unsupported ONNX device '%s' for object detection. Falling back to CPU provider.",
                device,
            )
        return ["CPUExecutionProvider"]

    def predict_batch(
        self, input_batch: List[ObjectDetectionEngineInput]
    ) -> List[ObjectDetectionEngineOutput]:
        """Run inference on a batch of inputs.

        Args:
            input_batch: List of input images with metadata

        Returns:
            List of detection outputs

        Raises:
            RuntimeError: If the engine is not initialized, or the model
                returns fewer than three output tensors
        """
        if not input_batch:
            return []
        if self._session is None or self._processor is None:
            raise RuntimeError("Engine not initialized. Call initialize() first.")

        # Preprocess images using HF processor (source of truth)
        images = [item.image.convert("RGB") for item in input_batch]
        inputs = self._processor(images=images, return_tensors="np")

        # Get original sizes for post-processing
        orig_sizes = np.array(
            [[img.width, img.height] for img in images], dtype=np.int64
        )

        # Run ONNX inference
        # NOTE(review): input names "images"/"orig_target_sizes" and the
        # [labels, boxes, scores] output order are assumed from the RT-DETR
        # export convention — confirm against the actual ONNX graph.
        output_tensors = self._session.run(
            None,
            {
                "images": inputs["pixel_values"],
                "orig_target_sizes": orig_sizes,
            },
        )

        if len(output_tensors) < 3:
            raise RuntimeError(
                "Expected ONNX model to return at least 3 outputs: "
                "[labels, boxes, scores]"
            )

        labels_batch, boxes_batch, scores_batch = output_tensors[:3]

        batch_outputs: List[ObjectDetectionEngineOutput] = []
        for idx, input_item in enumerate(input_batch):
            batch_outputs.append(
                self._build_output(
                    input_item=input_item,
                    labels=labels_batch[idx],
                    scores=scores_batch[idx],
                    boxes=boxes_batch[idx],
                    # Score filtering happens here; boxes are already in
                    # original-image pixels per orig_target_sizes.
                    apply_score_threshold=True,
                )
            )

        return batch_outputs
class TransformersObjectDetectionEngine(HfObjectDetectionEngineBase):
    """Transformers engine for object detection models.

    Uses HuggingFace Transformers and PyTorch for inference.
    Supports AutoModelForObjectDetection-compatible models.
    """

    def __init__(
        self,
        *,
        options: TransformersObjectDetectionEngineOptions,
        model_config: Optional[EngineModelConfig] = None,
        accelerator_options: AcceleratorOptions,
        artifacts_path: Optional[Union[Path, str]] = None,
    ):
        """Initialize the Transformers engine.

        Args:
            options: Transformers-specific runtime options
            model_config: Model configuration (repo_id, revision, extra_config)
            accelerator_options: Hardware accelerator configuration
            artifacts_path: Path to cached model artifacts
        """
        super().__init__(
            options=options,
            model_config=model_config,
            accelerator_options=accelerator_options,
            artifacts_path=artifacts_path,
        )
        self.options: TransformersObjectDetectionEngineOptions = options
        # Both are populated lazily by initialize().
        self._model: Optional[AutoModelForObjectDetection] = None
        self._device: Optional[torch.device] = None

    def _resolve_device(self) -> torch.device:
        """Resolve PyTorch device from accelerator options."""
        import torch

        device_str = decide_device(
            self._accelerator_options.device,
            supported_devices=[
                AcceleratorDevice.CPU,
                AcceleratorDevice.CUDA,
                AcceleratorDevice.MPS,
            ],
        )

        # Map the resolved accelerator string to a PyTorch device.
        if device_str.startswith("cuda"):
            return torch.device(device_str)
        elif device_str == AcceleratorDevice.MPS.value:
            return torch.device("mps")
        else:
            return torch.device("cpu")

    def _resolve_torch_dtype(self) -> Optional[torch.dtype]:
        """Resolve PyTorch dtype from options or model config.

        Returns:
            The mapped torch dtype, or None to let Transformers pick
            automatically (also the fallback for unknown dtype strings).
        """
        import torch

        # Priority: options > model_config > None (auto)
        dtype_str = self.options.torch_dtype or self._model_config.torch_dtype

        if dtype_str is None:
            return None

        dtype_map = {
            "float32": torch.float32,
            "float16": torch.float16,
            "bfloat16": torch.bfloat16,
        }

        dtype = dtype_map.get(dtype_str)
        if dtype is None:
            # Lazy %-args so the message is only formatted when emitted.
            _log.warning(
                "Unknown torch_dtype '%s', using auto dtype detection", dtype_str
            )
        return dtype

    def initialize(self) -> None:
        """Initialize PyTorch model and preprocessor.

        Raises:
            RuntimeError: If the model cannot be loaded from the resolved folder.
        """
        import torch
        from transformers import AutoModelForObjectDetection

        _log.info("Initializing Transformers object-detection engine")

        revision = self._model_config.revision or "main"
        model_folder = self._resolve_model_folder(
            repo_id=self._repo_id,
            revision=revision,
        )

        _log.debug("Using model at %s", model_folder)

        # Resolve device and dtype
        self._device = self._resolve_device()
        torch_dtype = self._resolve_torch_dtype()

        # Set num_threads for CPU inference
        if self._device.type == "cpu":
            torch.set_num_threads(self._accelerator_options.num_threads)

        # Load preprocessor (source of truth for preprocessing)
        self._processor = self._load_preprocessor(model_folder)
        _log.debug("Loaded preprocessor with size: %s", self._processor.size)  # type: ignore[attr-defined]

        # Load label mapping from config
        self._id_to_label = self._load_label_mapping(model_folder)
        _log.debug("Loaded label mapping with %d labels", len(self._id_to_label))

        # Load model
        _log.debug("Loading model from %s to device %s", model_folder, self._device)
        try:
            self._model = AutoModelForObjectDetection.from_pretrained(
                str(model_folder),
                torch_dtype=torch_dtype,
            )
            self._model.to(self._device)  # type: ignore[union-attr]
            self._model.eval()  # type: ignore[union-attr]
        except Exception as e:
            # Chain the original exception so the root cause stays visible
            # in the traceback instead of being flattened into the message.
            raise RuntimeError(f"Failed to load model from {model_folder}: {e}") from e

        self._initialized = True
        _log.info(
            "Transformers engine ready (device=%s, dtype=%s)",
            self._device,
            self._model.dtype,  # type: ignore[union-attr]
        )

    def predict_batch(
        self, input_batch: List[ObjectDetectionEngineInput]
    ) -> List[ObjectDetectionEngineOutput]:
        """Run inference on a batch of inputs.

        Args:
            input_batch: List of input images with metadata

        Returns:
            List of detection outputs

        Raises:
            RuntimeError: If called before initialize().
        """
        import torch

        if not input_batch:
            return []
        if self._model is None or self._processor is None:
            raise RuntimeError("Engine not initialized. Call initialize() first.")

        # Preprocess images using HF processor
        images = [item.image.convert("RGB") for item in input_batch]
        inputs = self._processor(images=images, return_tensors="pt").to(self._device)

        # Target sizes are (height, width), as expected by HF post-processing.
        target_sizes = torch.tensor(
            [[img.height, img.width] for img in images], device=self._device
        )

        # Run inference
        with torch.inference_mode():
            outputs = self._model(**inputs)  # type: ignore[operator]

        # Post-process using HuggingFace processor
        results = self._processor.post_process_object_detection(  # type: ignore[attr-defined]
            outputs,
            target_sizes=target_sizes,  # type: ignore[arg-type]
            threshold=self.options.score_threshold,
        )

        # Convert to our output format
        batch_outputs: List[ObjectDetectionEngineOutput] = []
        for input_item, result in zip(input_batch, results):
            batch_outputs.append(
                self._build_output(
                    input_item=input_item,
                    labels=result["labels"],
                    scores=result["scores"],
                    boxes=result["boxes"],
                )
            )

        return batch_outputs
class LayoutObjectDetectionModel(BaseLayoutModel):
    """Layout detection using the generic object-detection inference engines."""

    def __init__(
        self,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        options: LayoutObjectDetectionOptions,
    ) -> None:
        """Create the stage and eagerly initialize the underlying engine.

        Args:
            artifacts_path: Path to cached model artifacts, if any.
            accelerator_options: Hardware accelerator configuration.
            options: Stage options, including engine selection and model spec.
        """
        self.options = options

        self.engine: BaseObjectDetectionEngine = create_object_detection_engine(
            options=options.engine_options,
            model_spec=self.options.model_spec,
            artifacts_path=artifacts_path,
            accelerator_options=accelerator_options,
        )
        self.engine.initialize()

        # Convert engine's string labels to DocItemLabel enums
        self._label_map = self._build_label_map()

    def _build_label_map(self) -> Dict[int, DocItemLabel]:
        """Build label mapping from engine's label names to DocItemLabel enums.

        Raises:
            RuntimeError: If labels don't match DocItemLabel enum.
        """
        id_to_label_str = self.engine.get_label_mapping()
        label_map = {}

        for label_id, label_name in id_to_label_str.items():
            # Convert label name to uppercase to match DocItemLabel enum convention
            label_enum_name = label_name.upper()
            try:
                label_map[label_id] = DocItemLabel[label_enum_name]
            except KeyError as err:
                # Chain the KeyError so the failing enum lookup is preserved.
                raise RuntimeError(
                    f"Label '{label_name}' (ID {label_id}) from model config "
                    f"does not match any DocItemLabel enum value."
                ) from err

        return label_map

    @classmethod
    def get_options_type(cls) -> type[LayoutObjectDetectionOptions]:
        """Return the options class consumed by this stage."""
        return LayoutObjectDetectionOptions

    def predict_layout(
        self,
        conv_res: ConversionResult,
        pages: Sequence[Page],
    ) -> Sequence[LayoutPrediction]:
        """Run layout detection over *pages* and attach predictions to them.

        Args:
            conv_res: Conversion result used for timing and confidence scores.
            pages: Pages to process; invalid/imageless pages keep any
                existing (possibly empty) prediction.

        Returns:
            One LayoutPrediction per input page, in order.
        """
        pages = list(pages)
        predictions: list[LayoutPrediction] = []

        for page in pages:
            assert page._backend is not None
            if not page._backend.is_valid():
                # Keep whatever prediction is already attached (or an empty one).
                existing_prediction = page.predictions.layout or LayoutPrediction()
                page.predictions.layout = existing_prediction
                predictions.append(existing_prediction)
                continue

            page_image = page.get_image(scale=1.0)
            if page_image is None:
                empty_prediction = page.predictions.layout or LayoutPrediction()
                page.predictions.layout = empty_prediction
                predictions.append(empty_prediction)
                continue

            with TimeRecorder(conv_res, "layout"):
                engine_input = ObjectDetectionEngineInput(
                    image=page_image,
                    metadata={"page_no": page.page_no},
                )
                engine_output = self.engine.predict(engine_input)

                clusters = self._predictions_to_clusters(
                    page=page,
                    image=page_image,
                    engine_output=engine_output,
                )

                processed_clusters, processed_cells = LayoutPostprocessor(
                    page=page,
                    clusters=clusters,
                    options=self.options,
                ).postprocess()

                layout_prediction = LayoutPrediction(clusters=processed_clusters)
                page.predictions.layout = layout_prediction

                # Page-level layout confidence = mean cluster confidence.
                if processed_clusters:
                    layout_scores = [c.confidence for c in processed_clusters]
                    conv_res.confidence.pages[page.page_no].layout_score = float(
                        np.mean(layout_scores)
                    )
                else:
                    conv_res.confidence.pages[page.page_no].layout_score = 0.0

                if processed_cells:
                    ocr_scores = [c.confidence for c in processed_cells if c.from_ocr]
                    if ocr_scores:
                        conv_res.confidence.pages[page.page_no].ocr_score = float(
                            np.mean(ocr_scores)
                        )

            predictions.append(layout_prediction)

        return predictions

    def _predictions_to_clusters(
        self,
        page: Page,
        image: Image.Image,
        engine_output: ObjectDetectionEngineOutput,
    ) -> List[Cluster]:
        """Convert raw engine detections into page-coordinate clusters.

        Boxes are scaled from image pixels to page coordinates. Label IDs
        missing from the map fall back to DocItemLabel.TEXT (note that
        _build_label_map raises on unknown labels at init, so this fallback
        should not normally trigger).
        """
        assert page.size is not None
        scale_x = page.size.width / image.width
        scale_y = page.size.height / image.height

        clusters: List[Cluster] = []
        for idx, (label_id, score, bbox_coords) in enumerate(
            zip(engine_output.label_ids, engine_output.scores, engine_output.bboxes)
        ):
            label = self._label_map.get(label_id, DocItemLabel.TEXT)
            bbox = BoundingBox(
                l=bbox_coords[0] * scale_x,
                t=bbox_coords[1] * scale_y,
                r=bbox_coords[2] * scale_x,
                b=bbox_coords[3] * scale_y,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            clusters.append(
                Cluster(
                    id=idx,
                    label=label,
                    confidence=score,
                    bbox=bbox,
                    cells=[],
                )
            )
        return clusters
def is_onnxruntime_available() -> bool:
    """Return True when onnxruntime can be imported in this environment."""
    import importlib.util

    # Probe the module finder instead of importing the (heavy) runtime:
    # same answer for availability, but no import side effects or load cost.
    return importlib.util.find_spec("onnxruntime") is not None
def run_with_engine(engine_name: str, engine_options, input_doc_path: str):
    """Run layout detection with the specified engine."""
    banner = "=" * 80
    _log.info(banner)
    _log.info(f"Running layout detection with {engine_name} engine")
    _log.info(banner + "\n")

    # Pipeline configuration shared by both engine runs.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    pipeline_options.images_scale = 2.0
    pipeline_options.accelerator_options = AcceleratorOptions(
        device=AcceleratorDevice.AUTO
    )

    # Swap in the requested object-detection engine via the layout preset.
    layout_options = LayoutObjectDetectionOptions.from_preset("layout_heron_default")
    layout_options.engine_options = engine_options
    pipeline_options.layout_options = layout_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options),
        }
    )

    result = converter.convert(input_doc_path)

    # Persist the result under an engine-specific filename.
    output_filename = f"layout_object_detection_{engine_name.lower()}.html"
    result.document.save_as_html(output_filename, image_mode=ImageRefMode.EMBEDDED)
    _log.info(f"✓ Saved output to {output_filename}")

    return result


def main():
    """Run the same layout task with the ONNX Runtime and Transformers engines."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    logging.getLogger("docling").setLevel(logging.INFO)

    # Sample PDF from the test data (path relative to repo root).
    input_doc_path = "tests/data/pdf/2206.01062.pdf"

    # Run 1: ONNX Runtime engine, only when the runtime is importable.
    if is_onnxruntime_available():
        run_with_engine(
            "ONNX", OnnxRuntimeObjectDetectionEngineOptions(), input_doc_path
        )
    else:
        _log.warning(
            "Skipping ONNX engine run: onnxruntime is not available for Python %d.%d. "
            "Use Python < 3.14 and install `docling[onnxruntime]`.",
            sys.version_info.major,
            sys.version_info.minor,
        )

    # Run 2: Transformers engine (PyTorch + HuggingFace, automatic device pick).
    run_with_engine(
        "Transformers", TransformersObjectDetectionEngineOptions(), input_doc_path
    )


if __name__ == "__main__":
    main()
requires-dist = [ { name = "ocrmac", marker = "sys_platform == 'darwin'", specifier = ">=1.0.0,<2.0.0" }, { name = "ocrmac", marker = "sys_platform == 'darwin' and extra == 'ocrmac'", specifier = ">=1.0.0,<2.0.0" }, { name = "onnxruntime", marker = "python_full_version < '3.14' and extra == 'rapidocr'", specifier = ">=1.7.0,<2.0.0" }, + { name = "onnxruntime", marker = "python_full_version < '3.14' and sys_platform == 'darwin' and extra == 'onnxruntime'", specifier = "<1.24" }, + { name = "onnxruntime-gpu", marker = "(python_full_version < '3.14' and sys_platform == 'linux' and extra == 'onnxruntime') or (python_full_version < '3.14' and sys_platform == 'win32' and extra == 'onnxruntime')", specifier = "<1.24" }, { name = "openai-whisper", marker = "extra == 'asr'", specifier = ">=20250625" }, { name = "openpyxl", specifier = ">=3.1.5,<4.0.0" }, { name = "pandas", specifier = ">=2.1.4,<3.0.0" }, @@ -1071,7 +1077,7 @@ requires-dist = [ { name = "transformers", marker = "extra == 'vlm'", specifier = ">=4.46.0,<5.0.0" }, { name = "typer", specifier = ">=0.12.5,<0.22.0" }, ] -provides-extras = ["easyocr", "tesserocr", "ocrmac", "vlm", "rapidocr", "asr"] +provides-extras = ["easyocr", "tesserocr", "ocrmac", "vlm", "rapidocr", "onnxruntime", "asr"] [package.metadata.requires-dev] constraints = [ @@ -3683,6 +3689,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/ca/862b1e7a639460f0ca25fd5b6135fb42cf9deea86d398a92e44dfda2279d/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145", size = 17394184, upload-time = "2025-10-22T03:47:08.127Z" }, ] +[[package]] +name = "onnxruntime-gpu" +version = "1.23.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "coloredlogs" }, + { name = "flatbuffers" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < 
'3.11'" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "packaging" }, + { name = "protobuf" }, + { name = "sympy" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/ae/39283748c68a96be4f5f8a9561e0e3ca92af1eae6c2b1c07fb1da5f65cd1/onnxruntime_gpu-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18de50c6c8eea50acc405ea13d299aec593e46478d7a22cd32cdbbdf7c42899d", size = 300525411, upload-time = "2025-10-22T16:56:08.415Z" }, + { url = "https://files.pythonhosted.org/packages/21/c9/47abd3ec1f34498224d2a8f5cc4d1445eb5cc7dee8e3644b1a972619c0d2/onnxruntime_gpu-1.23.2-cp310-cp310-win_amd64.whl", hash = "sha256:deba091e15357355aa836fd64c6c4ac97dd0c4609c38b08a69675073ea46b321", size = 244505340, upload-time = "2025-10-27T22:47:43.215Z" }, + { url = "https://files.pythonhosted.org/packages/43/a4/e3d7fbe32b44e814ae24ed642f05fac5d96d120efd82db7a7cac936e85a9/onnxruntime_gpu-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d76d1ac7a479ecc3ac54482eea4ba3b10d68e888a0f8b5f420f0bdf82c5eec59", size = 300525715, upload-time = "2025-10-22T16:56:19.928Z" }, + { url = "https://files.pythonhosted.org/packages/a9/5c/dba7c009e73dcce02e7f714574345b5e607c5c75510eb8d7bef682b45e5d/onnxruntime_gpu-1.23.2-cp311-cp311-win_amd64.whl", hash = "sha256:054282614c2fc9a4a27d74242afbae706a410f1f63cc35bc72f99709029a5ba4", size = 244506823, upload-time = "2025-10-22T16:55:09.526Z" }, + { url = "https://files.pythonhosted.org/packages/6c/d9/b7140a4f1615195938c7e358c0804bb84271f0d6886b5cbf105c6cb58aae/onnxruntime_gpu-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f2d1f720685d729b5258ec1b36dee1de381b8898189908c98cbeecdb2f2b5c2", size = 300509596, upload-time = "2025-10-22T16:56:31.728Z" }, + { url = 
"https://files.pythonhosted.org/packages/87/da/2685c79e5ea587beddebe083601fead0bdf3620bc2f92d18756e7de8a636/onnxruntime_gpu-1.23.2-cp312-cp312-win_amd64.whl", hash = "sha256:fe925a84b00e291e0ad3fac29bfd8f8e06112abc760cdc82cb711b4f3935bd95", size = 244508327, upload-time = "2025-10-22T16:55:19.397Z" }, + { url = "https://files.pythonhosted.org/packages/03/05/40d561636e4114b54aa06d2371bfbca2d03e12cfdf5d4b85814802f18a75/onnxruntime_gpu-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1e8f75af5da07329d0c3a5006087f4051d8abd133b4be7c9bae8cdab7bea4c26", size = 300515567, upload-time = "2025-10-22T16:56:43.794Z" }, + { url = "https://files.pythonhosted.org/packages/b6/3b/418300438063d403384c79eaef1cb13c97627042f2247b35a887276a355a/onnxruntime_gpu-1.23.2-cp313-cp313-win_amd64.whl", hash = "sha256:7f1b3f49e5e126b99e23ec86b4203db41c2a911f6165f7624f2bc8267aaca767", size = 244507535, upload-time = "2025-10-22T16:55:28.532Z" }, + { url = "https://files.pythonhosted.org/packages/b8/dc/80b145e3134d7eba31309b3299a2836e37c76e4c419a261ad9796f8f8d65/onnxruntime_gpu-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20959cd4ae358aab6579ab9123284a7b1498f7d51ec291d429a5edc26511306f", size = 300525759, upload-time = "2025-10-22T16:56:56.925Z" }, +] + [[package]] name = "openai-whisper" version = "20250625" @@ -6386,6 +6417,10 @@ dependencies = [ { name = "typing-extensions" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/ea/304cf7afb744aa626fa9855245526484ee55aba610d9973a0521c552a843/torch-2.10.0-1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:c37fc46eedd9175f9c81814cc47308f1b42cfe4987e532d4b423d23852f2bf63", size = 79411450, upload-time = "2026-02-06T17:37:35.75Z" }, + { url = "https://files.pythonhosted.org/packages/25/d8/9e6b8e7df981a1e3ea3907fd5a74673e791da483e8c307f0b6ff012626d0/torch-2.10.0-1-cp311-none-macosx_11_0_arm64.whl", hash = 
"sha256:f699f31a236a677b3118bc0a3ef3d89c0c29b5ec0b20f4c4bf0b110378487464", size = 79423460, upload-time = "2026-02-06T17:37:39.657Z" }, + { url = "https://files.pythonhosted.org/packages/c9/2f/0b295dd8d199ef71e6f176f576473d645d41357b7b8aa978cc6b042575df/torch-2.10.0-1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6abb224c2b6e9e27b592a1c0015c33a504b00a0e0938f1499f7f514e9b7bfb5c", size = 79498197, upload-time = "2026-02-06T17:37:27.627Z" }, + { url = "https://files.pythonhosted.org/packages/a4/1b/af5fccb50c341bd69dc016769503cb0857c1423fbe9343410dfeb65240f2/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7350f6652dfd761f11f9ecb590bfe95b573e2961f7a242eccb3c8e78348d26fe", size = 79498248, upload-time = "2026-02-06T17:37:31.982Z" }, { url = "https://files.pythonhosted.org/packages/0c/1a/c61f36cfd446170ec27b3a4984f072fd06dab6b5d7ce27e11adb35d6c838/torch-2.10.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5276fa790a666ee8becaffff8acb711922252521b28fbce5db7db5cf9cb2026d", size = 145992962, upload-time = "2026-01-21T16:24:14.04Z" }, { url = "https://files.pythonhosted.org/packages/b5/60/6662535354191e2d1555296045b63e4279e5a9dbad49acf55a5d38655a39/torch-2.10.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aaf663927bcd490ae971469a624c322202a2a1e68936eb952535ca4cd3b90444", size = 915599237, upload-time = "2026-01-21T16:23:25.497Z" }, { url = "https://files.pythonhosted.org/packages/40/b8/66bbe96f0d79be2b5c697b2e0b187ed792a15c6c4b8904613454651db848/torch-2.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:a4be6a2a190b32ff5c8002a0977a25ea60e64f7ba46b1be37093c141d9c49aeb", size = 113720931, upload-time = "2026-01-21T16:24:23.743Z" }, @@ -6627,12 +6662,19 @@ name = "triton" version = "3.6.0" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/44/ba/b1b04f4b291a3205d95ebd24465de0e5bf010a2df27a4e58a9b5f039d8f2/triton-3.6.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", 
hash = "sha256:6c723cfb12f6842a0ae94ac307dba7e7a44741d720a40cf0e270ed4a4e3be781", size = 175972180, upload-time = "2026-01-20T16:15:53.664Z" }, { url = "https://files.pythonhosted.org/packages/8c/f7/f1c9d3424ab199ac53c2da567b859bcddbb9c9e7154805119f8bd95ec36f/triton-3.6.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6550fae429e0667e397e5de64b332d1e5695b73650ee75a6146e2e902770bea", size = 188105201, upload-time = "2026-01-20T16:00:29.272Z" }, + { url = "https://files.pythonhosted.org/packages/0f/2c/96f92f3c60387e14cc45aed49487f3486f89ea27106c1b1376913c62abe4/triton-3.6.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49df5ef37379c0c2b5c0012286f80174fcf0e073e5ade1ca9a86c36814553651", size = 176081190, upload-time = "2026-01-20T16:16:00.523Z" }, { url = "https://files.pythonhosted.org/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3", size = 188214640, upload-time = "2026-01-20T16:00:35.869Z" }, + { url = "https://files.pythonhosted.org/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4", size = 176128243, upload-time = "2026-01-20T16:16:07.857Z" }, { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" }, + { url = 
"https://files.pythonhosted.org/packages/3c/12/34d71b350e89a204c2c7777a9bba0dcf2f19a5bfdd70b57c4dbc5ffd7154/triton-3.6.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448e02fe6dc898e9e5aa89cf0ee5c371e99df5aa5e8ad976a80b93334f3494fd", size = 176133521, upload-time = "2026-01-20T16:16:13.321Z" }, { url = "https://files.pythonhosted.org/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9", size = 188289450, upload-time = "2026-01-20T16:00:49.136Z" }, + { url = "https://files.pythonhosted.org/packages/ce/4e/41b0c8033b503fd3cfcd12392cdd256945026a91ff02452bef40ec34bee7/triton-3.6.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1722e172d34e32abc3eb7711d0025bb69d7959ebea84e3b7f7a341cd7ed694d6", size = 176276087, upload-time = "2026-01-20T16:16:18.989Z" }, { url = "https://files.pythonhosted.org/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f", size = 188400296, upload-time = "2026-01-20T16:00:56.042Z" }, + { url = "https://files.pythonhosted.org/packages/49/55/5ecf0dcaa0f2fbbd4420f7ef227ee3cb172e91e5fede9d0ecaddc43363b4/triton-3.6.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5523241e7d1abca00f1d240949eebdd7c673b005edbbce0aca95b8191f1d43", size = 176138577, upload-time = "2026-01-20T16:16:25.426Z" }, { url = "https://files.pythonhosted.org/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803", size = 188273063, upload-time = 
"2026-01-20T16:01:07.278Z" }, + { url = "https://files.pythonhosted.org/packages/48/db/56ee649cab5eaff4757541325aca81f52d02d4a7cd3506776cad2451e060/triton-3.6.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b3a97e8ed304dfa9bd23bb41ca04cdf6b2e617d5e782a8653d616037a5d537d", size = 176274804, upload-time = "2026-01-20T16:16:31.528Z" }, { url = "https://files.pythonhosted.org/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7", size = 188397994, upload-time = "2026-01-20T16:01:14.236Z" }, ]