docling-project · PeterStaar-IBM · Mar 9, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/docling_core/transforms/chunker/hybrid_chunker.py b/docling_core/transforms/chunker/hybrid_chunker.py
@@ -3,7 +3,7 @@
 import warnings
 from collections.abc import Iterable, Iterator
 from functools import cached_property
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, cast
 
 from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
 
@@ -258,7 +258,8 @@ def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer: Ba
             segments = line_chunker.chunk_text(lines=body_lines)
         else:
             sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
-            segments = sem_chunker.chunk(doc_chunk.text)
+            sem_segments = sem_chunker(doc_chunk.text)
+            segments = cast(list[str], sem_segments)
         return segments
 
     def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):

diff --git a/docling_core/transforms/chunker/tokenizer/huggingface.py b/docling_core/transforms/chunker/tokenizer/huggingface.py
@@ -2,7 +2,7 @@
 
 import json
 from os import PathLike
-from typing import Optional, Union
+from typing import Any, Optional, Union
 
 from huggingface_hub import hf_hub_download
 from pydantic import ConfigDict, model_validator
@@ -58,7 +58,7 @@ def from_pretrained(
         **kwargs,
     ) -> Self:
         """Create tokenizer from model name."""
-        my_kwargs = {
+        my_kwargs: dict[str, Any] = {
             "tokenizer": AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name, **kwargs),
         }
         if max_tokens is not None:

diff --git a/pyproject.toml b/pyproject.toml
@@ -42,12 +42,12 @@ dependencies = [
     'jsonschema (>=4.16.0,<5.0.0)',
     'pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)',
     'jsonref (>=1.1.0,<2.0.0)',
-    'tabulate (>=0.9.0,<0.10.0)',
+    'tabulate (>=0.9.0,<0.11.0)',
     'pandas (>=2.1.4,<4.0.0)',
     'pillow (>=10.0.0,<13.0.0)',
     'pyyaml (>=5.1,<7.0.0)',
     'typing-extensions (>=4.12.2,<5.0.0)',
-    'typer (>=0.12.5,<0.22.0)',
+    'typer (>=0.12.5,<0.25.0)',
     'latex2mathml (>=3.77.0,<4.0.0)',
     "defusedxml (>=0.7.1, <0.8.0)",
 ]
@@ -67,19 +67,19 @@ docling-view = "docling_core.cli.view:app"
 [project.optional-dependencies]
 chunking = [
     # common:
-    'semchunk (>=2.2.0,<3.0.0)',
+    'semchunk (>=2.2.0,<4.0.0)',
     'tree-sitter (>=0.25.0,<0.27.0)',
     'tree-sitter-python >=0.23.6',
     'tree-sitter-c >=0.23.4',
     'tree-sitter-javascript >=0.23.1',
     'tree-sitter-typescript >=0.23.2',
 
     # specific:
-    'transformers (>=4.34.0,<5.0.0)',
+    'transformers (>=4.34.0,<6.0.0)',
 ]
 chunking-openai = [
     # common:
-    'semchunk (>=2.2.0,<3.0.0)',
+    'semchunk (>=2.2.0,<4.0.0)',
     'tree-sitter (>=0.25.0,<0.27.0)',
     'tree-sitter-python >=0.23.6',
     'tree-sitter-c >=0.23.4',

diff --git a/test/data/chunker/0d_out_chunks.json b/test/data/chunker/0d_out_chunks.json
@@ -638,7 +638,7 @@
         }
     },
     {
-        "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page",
+        "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset.",
         "meta": {
             "doc_items": [
                 "#/texts/513"
@@ -650,7 +650,7 @@
         }
     },
     {
-        "text": "in a typical timeframe of 20s to 60s, depending on its complexity.",
+        "text": "With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.",
         "meta": {
             "doc_items": [
                 "#/texts/513"
@@ -703,7 +703,7 @@
         }
     },
     {
-        "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and",
+        "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture .",
         "meta": {
             "doc_items": [
                 "#/texts/523"
@@ -715,7 +715,7 @@
         }
     },
     {
-        "text": "Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.\nof row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric",
+        "text": "This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.",
         "meta": {
             "doc_items": [
                 "#/texts/523",
@@ -725,7 +725,18 @@
                 "#/texts/529",
                 "#/texts/530",
                 "#/texts/531",
-                "#/texts/532",
+                "#/texts/532"
+            ],
+            "headings": [
+                "Docling Technical Report",
+                "Baselines for Object Detection"
+            ]
+        }
+    },
+    {
+        "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric",
+        "meta": {
+            "doc_items": [
                 "#/texts/533"
             ],
             "headings": [