Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## [v2.73.1](https://github.com/docling-project/docling/releases/tag/v2.73.1) - 2026-02-13

### Fix

* **asciidoc:** Handle commas in image alt text ([#2983](https://github.com/docling-project/docling/issues/2983)) ([`86b6912`](https://github.com/docling-project/docling/commit/86b691204d2e4c2a54c99d80063e2dd5b5428168))
* Use timezone-aware datetime ([#2947](https://github.com/docling-project/docling/issues/2947)) ([`e2870f9`](https://github.com/docling-project/docling/commit/e2870f94ed78caeb6db9d735b5a73fa80e5e2104))
* Add failed pages to DoclingDocument for page break consistency ([#2939](https://github.com/docling-project/docling/issues/2939)) ([`1f91482`](https://github.com/docling-project/docling/commit/1f914826bb07c32766e7db37f86baec3ea772a11))

## [v2.73.0](https://github.com/docling-project/docling/releases/tag/v2.73.0) - 2026-02-11

### Feature
Expand Down
10 changes: 7 additions & 3 deletions docling/backend/asciidoc_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,10 +414,14 @@ def _parse_picture(line):

# Extract optional attributes (alt text, width, height, alignment)
if attributes:
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
alt_parts = [attributes[0].strip()] if attributes[0] else [""]
for attr in attributes[1:]:
key, value = attr.split("=")
picture_info[key.strip()] = value.strip()
if "=" in attr:
key, value = attr.split("=", 1)
picture_info[key.strip()] = value.strip()
else:
alt_parts.append(attr.strip())
picture_info["alt"] = ", ".join(alt_parts)

return picture_info

Expand Down
48 changes: 46 additions & 2 deletions docling/datamodel/image_classification_engine_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from __future__ import annotations

from typing import List, Literal
from typing import Any, Dict, List, Literal, Optional

from pydantic import Field
from pydantic import AnyUrl, Field

from docling.datamodel.settings import default_compile_model
from docling.models.inference_engines.image_classification.base import (
Expand Down Expand Up @@ -49,3 +49,47 @@ class TransformersImageClassificationEngineOptions(
default_factory=default_compile_model,
description="Whether to compile the model with torch.compile() for better performance.",
)


class ApiKserveV2ImageClassificationEngineOptions(BaseImageClassificationEngineOptions):
    """Runtime configuration for remote KServe v2 inference."""

    # NOTE: the class docstring above is kept verbatim on purpose; pydantic
    # surfaces it as the model description in generated JSON schemas.

    # Pinned to the API_KSERVE_V2 member; the Literal type admits no other value.
    engine_type: Literal[ImageClassificationEngineType.API_KSERVE_V2] = (
        ImageClassificationEngineType.API_KSERVE_V2
    )

    # Required field: no default is declared, so a URL must always be supplied.
    url: AnyUrl = Field(
        description=(
            "Base URL of the KServe v2 server "
            "(e.g., 'http://localhost:8000'). "
            "The full endpoint path is constructed automatically "
            "as /v2/models/{model_name}[/versions/{version}]/infer."
        ),
    )

    # Optional remote model identifier; None means "not specified here".
    model_name: Optional[str] = Field(
        None,
        description=(
            "Remote model name registered in the KServe v2 endpoint. If omitted, "
            "a repo_id-derived default is used."
        ),
    )

    # Optional version selector; None means "not specified here".
    model_version: Optional[str] = Field(
        None,
        description=(
            "Optional model version. "
            "If omitted, the server default is used."
        ),
    )

    # default_factory gives every instance its own fresh dict.
    headers: Dict[str, str] = Field(
        description="Optional HTTP headers for authentication/routing.",
        default_factory=dict,
    )

    # Request timeout, expressed in seconds per the field description.
    timeout: float = Field(
        60.0,
        description="HTTP request timeout in seconds.",
    )

    # Free-form mapping; default_factory gives every instance its own fresh dict.
    request_parameters: Dict[str, Any] = Field(
        description="Optional top-level KServe v2 infer request parameters.",
        default_factory=dict,
    )
48 changes: 46 additions & 2 deletions docling/datamodel/object_detection_engine_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from __future__ import annotations

from typing import List, Literal
from typing import Any, Dict, List, Literal, Optional

from pydantic import Field
from pydantic import AnyUrl, Field

from docling.datamodel.settings import default_compile_model
from docling.models.inference_engines.object_detection.base import (
Expand Down Expand Up @@ -51,3 +51,47 @@ class TransformersObjectDetectionEngineOptions(BaseObjectDetectionEngineOptions)
default_factory=default_compile_model,
description="Whether to compile the model with torch.compile() for better performance.",
)


class ApiKserveV2ObjectDetectionEngineOptions(BaseObjectDetectionEngineOptions):
    """Runtime configuration for remote KServe v2 inference."""

    # NOTE: the class docstring above is kept verbatim on purpose; pydantic
    # surfaces it as the model description in generated JSON schemas.

    # Pinned to the API_KSERVE_V2 member; the Literal type admits no other value.
    engine_type: Literal[ObjectDetectionEngineType.API_KSERVE_V2] = (
        ObjectDetectionEngineType.API_KSERVE_V2
    )

    # Required field: no default is declared, so a URL must always be supplied.
    url: AnyUrl = Field(
        description=(
            "Base URL of the KServe v2 server "
            "(e.g., 'http://localhost:8000'). "
            "The full endpoint path is constructed automatically "
            "as /v2/models/{model_name}[/versions/{version}]/infer."
        ),
    )

    # Optional remote model identifier; None means "not specified here".
    model_name: Optional[str] = Field(
        None,
        description=(
            "Remote model name registered in the KServe v2 endpoint. If omitted, "
            "a repo_id-derived default is used."
        ),
    )

    # Optional version selector; None means "not specified here".
    model_version: Optional[str] = Field(
        None,
        description=(
            "Optional model version. "
            "If omitted, the server default is used."
        ),
    )

    # default_factory gives every instance its own fresh dict.
    headers: Dict[str, str] = Field(
        description="Optional HTTP headers for authentication/routing.",
        default_factory=dict,
    )

    # Request timeout, expressed in seconds per the field description.
    timeout: float = Field(
        60.0,
        description="HTTP request timeout in seconds.",
    )

    # Free-form mapping; default_factory gives every instance its own fresh dict.
    request_parameters: Dict[str, Any] = Field(
        description="Optional top-level KServe v2 infer request parameters.",
        default_factory=dict,
    )
18 changes: 8 additions & 10 deletions docling/datamodel/picture_classification_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,20 @@
from pydantic import BaseModel, Field

from docling.datamodel import stage_model_specs
from docling.datamodel.image_classification_engine_options import (
BaseImageClassificationEngineOptions,
TransformersImageClassificationEngineOptions,
)
from docling.datamodel.stage_model_specs import (
ImageClassificationModelSpec,
ImageClassificationStagePresetMixin,
)
from docling.models.inference_engines.image_classification.base import (
ImageClassificationEngineOptionsMixin,
)


class DocumentPictureClassifierOptions(ImageClassificationStagePresetMixin, BaseModel):
class DocumentPictureClassifierOptions(
ImageClassificationStagePresetMixin,
ImageClassificationEngineOptionsMixin,
BaseModel,
):
"""Options for configuring the DocumentPictureClassifier stage."""

kind: ClassVar[str] = "document_picture_classifier"
Expand All @@ -29,11 +32,6 @@ class DocumentPictureClassifierOptions(ImageClassificationStagePresetMixin, Base
description="Image-classification model specification for picture classification.",
)

engine_options: BaseImageClassificationEngineOptions = Field(
default_factory=TransformersImageClassificationEngineOptions,
description="Runtime configuration for the image-classification engine.",
)

@property
def repo_id(self) -> str:
return self.model_spec.get_repo_id(self.engine_options.engine_type)
Expand Down
34 changes: 14 additions & 20 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
BaseModel,
ConfigDict,
Field,
field_validator,
)
from typing_extensions import deprecated

Expand Down Expand Up @@ -58,6 +59,10 @@
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
VlmModelType,
)
from docling.models.inference_engines.object_detection.base import (
ObjectDetectionEngineOptionsMixin,
)
from docling.models.inference_engines.vlm.base import VlmEngineOptionsMixin

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -639,7 +644,7 @@ def repo_cache_folder(self) -> str:


class PictureDescriptionVlmEngineOptions(
StagePresetMixin, PictureDescriptionBaseOptions
StagePresetMixin, VlmEngineOptionsMixin, PictureDescriptionBaseOptions
):
"""Configuration for VLM runtime-based picture description.

Expand Down Expand Up @@ -667,9 +672,6 @@ class PictureDescriptionVlmEngineOptions(
model_spec: VlmModelSpec = Field(
description="Model specification with runtime-specific overrides"
)
engine_options: BaseVlmEngineOptions = Field(
description="Runtime configuration (transformers, mlx, api, etc.)"
)
prompt: Annotated[
str,
Field(
Expand Down Expand Up @@ -715,7 +717,7 @@ class PictureDescriptionVlmEngineOptions(
"""


class VlmConvertOptions(StagePresetMixin, BaseModel):
class VlmConvertOptions(StagePresetMixin, VlmEngineOptionsMixin, BaseModel):
"""Configuration for VLM-based document conversion.

This stage uses vision-language models to convert document pages to
Expand All @@ -738,10 +740,6 @@ class VlmConvertOptions(StagePresetMixin, BaseModel):
description="Model specification with runtime-specific overrides"
)

engine_options: BaseVlmEngineOptions = Field(
description="Runtime configuration (transformers, mlx, api, etc.)"
)

scale: float = Field(
default=2.0, description="Image scaling factor for preprocessing"
)
Expand All @@ -759,7 +757,7 @@ class VlmConvertOptions(StagePresetMixin, BaseModel):
)


class CodeFormulaVlmOptions(StagePresetMixin, BaseModel):
class CodeFormulaVlmOptions(StagePresetMixin, VlmEngineOptionsMixin, BaseModel):
"""Configuration for VLM-based code and formula extraction.

This stage uses vision-language models to extract code blocks and
Expand All @@ -778,10 +776,6 @@ class CodeFormulaVlmOptions(StagePresetMixin, BaseModel):
description="Model specification with runtime-specific overrides"
)

engine_options: BaseVlmEngineOptions = Field(
description="Runtime configuration (transformers, mlx, api, etc.)"
)

scale: float = Field(
default=2.0, description="Image scaling factor for preprocessing"
)
Expand Down Expand Up @@ -977,7 +971,7 @@ class ConvertPipelineOptions(PipelineOptions):
Field(
description=(
"Configuration for picture classification model/runtime. "
"Supports selecting transformers or onnxruntime inference engines."
"Supports selecting transformers, onnxruntime, or remote api_kserve_v2 inference engines."
)
),
] = _default_picture_classification_options
Expand Down Expand Up @@ -1119,7 +1113,11 @@ class LayoutOptions(BaseLayoutOptions):
] = DOCLING_LAYOUT_HERON


class LayoutObjectDetectionOptions(ObjectDetectionStagePresetMixin, BaseLayoutOptions):
class LayoutObjectDetectionOptions(
ObjectDetectionStagePresetMixin,
ObjectDetectionEngineOptionsMixin,
BaseLayoutOptions,
):
"""Options for layout detection using object-detection runtimes."""

kind: ClassVar[str] = "layout_object_detection"
Expand All @@ -1141,10 +1139,6 @@ class LayoutObjectDetectionOptions(ObjectDetectionStagePresetMixin, BaseLayoutOp
description="Object-detection model specification for layout analysis",
)

engine_options: BaseObjectDetectionEngineOptions = Field(
description="Runtime configuration for the object-detection engine",
)


LayoutObjectDetectionOptions.register_preset(
stage_model_specs.OBJECT_DETECTION_LAYOUT_HERON
Expand Down
19 changes: 19 additions & 0 deletions docling/datamodel/stage_model_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,7 @@ def from_preset(
**overrides: Any,
):
from docling.datamodel.object_detection_engine_options import (
ApiKserveV2ObjectDetectionEngineOptions,
OnnxRuntimeObjectDetectionEngineOptions,
TransformersObjectDetectionEngineOptions,
)
Expand All @@ -713,6 +714,13 @@ def from_preset(
engine_options = OnnxRuntimeObjectDetectionEngineOptions()
elif preset.default_engine_type == ObjectDetectionEngineType.TRANSFORMERS:
engine_options = TransformersObjectDetectionEngineOptions()
elif preset.default_engine_type == ObjectDetectionEngineType.API_KSERVE_V2:
raise ValueError(
f"Preset '{preset_id}' uses API_KSERVE_V2 engine which requires explicit "
"engine_options with a 'url' parameter. Please provide "
"engine_options=ApiKserveV2ObjectDetectionEngineOptions(url='...') "
"when calling from_preset()."
)
else:
raise ValueError(
f"Unsupported engine type {preset.default_engine_type} for presets"
Expand Down Expand Up @@ -804,6 +812,7 @@ def from_preset(
**overrides: Any,
):
from docling.datamodel.image_classification_engine_options import (
ApiKserveV2ImageClassificationEngineOptions,
OnnxRuntimeImageClassificationEngineOptions,
TransformersImageClassificationEngineOptions,
)
Expand All @@ -817,6 +826,16 @@ def from_preset(
preset.default_engine_type == ImageClassificationEngineType.TRANSFORMERS
):
engine_options = TransformersImageClassificationEngineOptions()
elif (
preset.default_engine_type
== ImageClassificationEngineType.API_KSERVE_V2
):
raise ValueError(
f"Preset '{preset_id}' uses API_KSERVE_V2 engine which requires explicit "
"engine_options with a 'url' parameter. Please provide "
"engine_options=ApiKserveV2ImageClassificationEngineOptions(url='...') "
"when calling from_preset()."
)
else:
raise ValueError(
f"Unsupported engine type {preset.default_engine_type} for presets"
Expand Down
2 changes: 2 additions & 0 deletions docling/experimental/models/table_crops_layout_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def __init__(
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
options: TableCropsLayoutOptions,
enable_remote_services: bool = False,
):
_ = enable_remote_services
self.options = options
self.artifacts_path = artifacts_path
self.accelerator_options = accelerator_options
Expand Down
3 changes: 2 additions & 1 deletion docling/models/inference_engines/common/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Shared inference-engine utilities."""

from docling.models.inference_engines.common.hf_vision_base import HfVisionModelMixin
from docling.models.inference_engines.common.kserve_v2_http import KserveV2HttpClient

__all__ = ["HfVisionModelMixin"]
__all__ = ["HfVisionModelMixin", "KserveV2HttpClient"]
Loading
Loading