Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docling_core/transforms/chunker/hybrid_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import warnings
from collections.abc import Iterable, Iterator
from functools import cached_property
from typing import Any, Optional, Union
from typing import Any, Optional, Union, cast

from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator

Expand Down Expand Up @@ -258,7 +258,8 @@ def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer: Ba
segments = line_chunker.chunk_text(lines=body_lines)
else:
sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
segments = sem_chunker.chunk(doc_chunk.text)
sem_segments = sem_chunker(doc_chunk.text)
segments = cast(list[str], sem_segments)
return segments

def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
Expand Down
4 changes: 2 additions & 2 deletions docling_core/transforms/chunker/tokenizer/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
from os import PathLike
from typing import Optional, Union
from typing import Any, Optional, Union

from huggingface_hub import hf_hub_download
from pydantic import ConfigDict, model_validator
Expand Down Expand Up @@ -58,7 +58,7 @@ def from_pretrained(
**kwargs,
) -> Self:
"""Create tokenizer from model name."""
my_kwargs = {
my_kwargs: dict[str, Any] = {
"tokenizer": AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name, **kwargs),
}
if max_tokens is not None:
Expand Down
10 changes: 5 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ dependencies = [
'jsonschema (>=4.16.0,<5.0.0)',
'pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)',
'jsonref (>=1.1.0,<2.0.0)',
'tabulate (>=0.9.0,<0.10.0)',
'tabulate (>=0.9.0,<0.11.0)',
'pandas (>=2.1.4,<4.0.0)',
'pillow (>=10.0.0,<13.0.0)',
'pyyaml (>=5.1,<7.0.0)',
'typing-extensions (>=4.12.2,<5.0.0)',
'typer (>=0.12.5,<0.22.0)',
'typer (>=0.12.5,<0.25.0)',
'latex2mathml (>=3.77.0,<4.0.0)',
"defusedxml (>=0.7.1, <0.8.0)",
]
Expand All @@ -67,19 +67,19 @@ docling-view = "docling_core.cli.view:app"
[project.optional-dependencies]
chunking = [
# common:
'semchunk (>=2.2.0,<3.0.0)',
'semchunk (>=2.2.0,<4.0.0)',
'tree-sitter (>=0.25.0,<0.27.0)',
'tree-sitter-python >=0.23.6',
'tree-sitter-c >=0.23.4',
'tree-sitter-javascript >=0.23.1',
'tree-sitter-typescript >=0.23.2',

# specific:
'transformers (>=4.34.0,<5.0.0)',
'transformers (>=4.34.0,<6.0.0)',
]
chunking-openai = [
# common:
'semchunk (>=2.2.0,<3.0.0)',
'semchunk (>=2.2.0,<4.0.0)',
'tree-sitter (>=0.25.0,<0.27.0)',
'tree-sitter-python >=0.23.6',
'tree-sitter-c >=0.23.4',
Expand Down
21 changes: 16 additions & 5 deletions test/data/chunker/0d_out_chunks.json
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@
}
},
{
"text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page",
"text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset.",
"meta": {
"doc_items": [
"#/texts/513"
Expand All @@ -650,7 +650,7 @@
}
},
{
"text": "in a typical timeframe of 20s to 60s, depending on its complexity.",
"text": "With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.",
"meta": {
"doc_items": [
"#/texts/513"
Expand Down Expand Up @@ -703,7 +703,7 @@
}
},
{
"text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and",
"text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture .",
"meta": {
"doc_items": [
"#/texts/523"
Expand All @@ -715,7 +715,7 @@
}
},
{
"text": "Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.\nof row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric",
"text": "This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.",
"meta": {
"doc_items": [
"#/texts/523",
Expand All @@ -725,7 +725,18 @@
"#/texts/529",
"#/texts/530",
"#/texts/531",
"#/texts/532",
"#/texts/532"
],
"headings": [
"Docling Technical Report",
"Baselines for Object Detection"
]
}
},
{
"text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric",
"meta": {
"doc_items": [
"#/texts/533"
],
"headings": [
Expand Down
Loading
Loading