Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14']
python-version: ['3.10', '3.11', '3.12', '3.13', '3.14']
steps:
- uses: actions/checkout@v4
- name: Cache Hugging Face models
Expand Down
4 changes: 2 additions & 2 deletions docling_core/experimental/idoctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -1316,7 +1316,7 @@ def _serialize_single_item(
if (
(isinstance(item, CodeItem) and ContentType.TEXT_CODE in params.content_types)
or (isinstance(item, FormulaItem) and ContentType.TEXT_FORMULA in params.content_types)
or (not isinstance(item, (CodeItem, FormulaItem)) and ContentType.TEXT_OTHER in params.content_types)
or (not isinstance(item, CodeItem | FormulaItem) and ContentType.TEXT_OTHER in params.content_types)
):
# Check if we should serialize a single inline group child instead of text
if len(item.children) > 0 and isinstance((first_child := item.children[0].resolve(doc)), InlineGroup):
Expand Down Expand Up @@ -1754,7 +1754,7 @@ def serialize(
**kwargs: Any,
) -> SerializationResult:
"""Serialize unsupported nodes by concatenating their textual parts."""
if isinstance(item, (ListGroup, InlineGroup)):
if isinstance(item, ListGroup | InlineGroup):
parts = doc_serializer.get_parts(item=item, **kwargs)
text_res = "\n".join([p.text for p in parts if p.text])
return create_ser_result(text=text_res, span_source=parts)
Expand Down
40 changes: 27 additions & 13 deletions docling_core/transforms/chunker/code_chunking/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,15 @@ def _get_tree_sitter_language(language: CodeLanguageLabel):
CodeLanguageLabel.JAVASCRIPT: lambda: Lang(ts_js.language()),
CodeLanguageLabel.C: lambda: Lang(ts_c.language()),
}
if sys.version_info >= (3, 10):
try:
import tree_sitter_java_orchard as ts_java

language_map[CodeLanguageLabel.JAVA] = lambda: Lang(ts_java.language())
except ImportError:
_logger.warning(
"Code chunking for Java cannot be enabled because tree-sitter-java-orchard is missing. "
"Please install it via `pip install tree-sitter-java-orchard`."
)
try:
import tree_sitter_java_orchard as ts_java

language_map[CodeLanguageLabel.JAVA] = lambda: Lang(ts_java.language())
except ImportError:
_logger.warning(
"Code chunking for Java cannot be enabled because tree-sitter-java-orchard is missing. "
"Please install it via `pip install tree-sitter-java-orchard`."
)

factory = language_map.get(language)
return factory() if factory else None
Expand Down Expand Up @@ -157,6 +156,21 @@ def _to_str(node: Node) -> str:
def _query_tree(language, tree: Tree, query: str):
"""Query a tree-sitter tree with the given query string."""
if not language:
return []
q = language.query(query)
return q.captures(tree.root_node)
return {}
from tree_sitter import Query, QueryCursor

q = Query(language, query)
cursor = QueryCursor(q)
matches = list(cursor.matches(tree.root_node))

# Combine all captures from all matches into a single dict
# Old API returned: {"capture_name": [node1, node2, ...]}
# New API returns: [(pattern_idx, {"capture_name": [node1, ...]}), ...]
combined_captures: dict[str, list[Node]] = {}
for _, captures_dict in matches:
for capture_name, nodes in captures_dict.items():
if capture_name not in combined_captures:
combined_captures[capture_name] = []
combined_captures[capture_name].extend(nodes)

return combined_captures
4 changes: 2 additions & 2 deletions docling_core/transforms/chunker/hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def chunk(
for item, level in dl_doc.iterate_items(with_groups=True):
if item.self_ref in excluded_refs:
continue
if isinstance(item, (TitleItem, SectionHeaderItem)):
if isinstance(item, TitleItem | SectionHeaderItem):
level = item.level if isinstance(item, SectionHeaderItem) else 0

# prepare to remove shadowed headings as they just went out of scope
Expand Down Expand Up @@ -185,7 +185,7 @@ def chunk(
heading_by_level[level] = item

continue
elif isinstance(item, (ListGroup, InlineGroup, DocItem)) and item.self_ref not in visited:
elif isinstance(item, ListGroup | InlineGroup | DocItem) and item.self_ref not in visited:
if self.code_chunking_strategy is not None and isinstance(item, CodeItem):
yield from self.code_chunking_strategy.chunk_code_item(
item=item,
Expand Down
2 changes: 1 addition & 1 deletion docling_core/transforms/chunker/hybrid_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def _make_chunk_from_doc_items(
res_text
for doc_item in doc_items
if (res_text := doc_serializer.serialize(item=doc_item).text)
and not isinstance(doc_item, (TitleItem, SectionHeaderItem))
and not isinstance(doc_item, TitleItem | SectionHeaderItem)
]
)
)
Expand Down
2 changes: 1 addition & 1 deletion docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def _iterate_items(
traverse_pictures=traverse_pictures,
):
if add_page_breaks:
if isinstance(item, (ListGroup, InlineGroup)) and item.self_ref not in my_visited:
if isinstance(item, ListGroup | InlineGroup) and item.self_ref not in my_visited:
# if group starts with new page, yield page break before group node
my_visited.add(item.self_ref)
for it, _ in _iterate_items(
Expand Down
8 changes: 4 additions & 4 deletions docling_core/transforms/serializer/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,12 @@ def serialize(
post_processed = True
else:
text = item.text
if not isinstance(item, (CodeItem, FormulaItem)):
if not isinstance(item, CodeItem | FormulaItem):
text = html.escape(text, quote=False)
text = text.replace("\n", "<br>")

# Prepare the HTML based on item type
if isinstance(item, (TitleItem, SectionHeaderItem)):
if isinstance(item, TitleItem | SectionHeaderItem):
section_level = min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1
text = get_html_tag_with_text_direction(html_tag=f"h{section_level}", text=text)

Expand Down Expand Up @@ -843,7 +843,7 @@ def serialize(
for ann in item.get_annotations():
if isinstance(
ann,
(PictureClassificationData, DescriptionAnnotation, PictureMoleculeData),
PictureClassificationData | DescriptionAnnotation | PictureMoleculeData,
):
if ann_text := _get_annotation_text(ann):
text_dir = get_text_direction(ann_text)
Expand Down Expand Up @@ -1055,7 +1055,7 @@ def serialize_captions(

if (
item.self_ref not in excluded_refs
and isinstance(item, (PictureItem, TableItem))
and isinstance(item, PictureItem | TableItem)
and _should_use_legacy_annotations(params=params, item=item)
):
ann_res = self.serialize_annotations(
Expand Down
8 changes: 2 additions & 6 deletions docling_core/transforms/serializer/latex.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def serialize(
text = item.text
post_process = True

if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
if isinstance(item, ListItem | TitleItem | SectionHeaderItem):
# For list items, defer environment wrapping to list serializer
if isinstance(item, ListItem):
if post_process:
Expand Down Expand Up @@ -253,11 +253,7 @@ def serialize(
for ann in item.get_annotations():
if isinstance(
ann,
(
PictureClassificationData,
DescriptionAnnotation,
PictureMoleculeData,
),
PictureClassificationData | DescriptionAnnotation | PictureMoleculeData,
):
if ann_text := _get_annotation_text(ann):
# Ensure each line of the annotation is prefixed with '%'
Expand Down
8 changes: 2 additions & 6 deletions docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def serialize(
text = f"- [x] {text}"
if item.label == DocItemLabel.CHECKBOX_UNSELECTED:
text = f"- [ ] {text}"
if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
if isinstance(item, ListItem | TitleItem | SectionHeaderItem):
if not has_inline_repr:
# case where processing/formatting should be applied first (in inner scope)
text = doc_serializer.post_process(
Expand Down Expand Up @@ -320,11 +320,7 @@ def serialize(
for ann in item.get_annotations():
if isinstance(
ann,
(
PictureClassificationData,
DescriptionAnnotation,
PictureMoleculeData,
),
PictureClassificationData | DescriptionAnnotation | PictureMoleculeData,
):
if ann_text := _get_annotation_text(ann):
ann_res = create_ser_result(
Expand Down
12 changes: 5 additions & 7 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -1855,12 +1855,10 @@ def _migrate_annotations_to_meta(self) -> Self:
)
elif not isinstance(
ann,
(
PictureClassificationData,
DescriptionAnnotation,
PictureMoleculeData,
PictureTabularChartData,
),
PictureClassificationData
| DescriptionAnnotation
| PictureMoleculeData
| PictureTabularChartData,
) and not hasattr(
self.meta,
MetaUtils.create_meta_field_name(
Expand Down Expand Up @@ -2639,7 +2637,7 @@ def _append_item(self, *, item: NodeItem, parent_ref: RefItem) -> RefItem:

self.form_items.append(item)

elif isinstance(item, (ListGroup, InlineGroup)):
elif isinstance(item, ListGroup | InlineGroup):
item_label = "groups"
item_index = len(self.groups)

Expand Down
2 changes: 1 addition & 1 deletion docling_core/types/legacy_doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ def export_to_markdown(
if item is None:
continue

if isinstance(item, (Table, Figure)) and item.text and item.obj_type in main_text_labels:
if isinstance(item, Table | Figure) and item.text and item.obj_type in main_text_labels:
embedded_captions.add(item.text)

# serialize document to markdown
Expand Down
6 changes: 3 additions & 3 deletions docling_core/utils/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f

embedded_captions = set()
for ix, (item, level) in enumerate(doc.iterate_items(doc.body)):
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
if isinstance(item, TableItem | PictureItem) and len(item.captions) > 0:
caption = item.caption_text(doc)
if caption:
embedded_captions.add(caption)
Expand All @@ -148,7 +148,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
if isinstance(item, DocItem):
item_type = item.label

if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
if isinstance(item, TextItem | ListItem | SectionHeaderItem):
if isinstance(item, ListItem) and item.marker:
text = f"{item.marker} {item.text}"
else:
Expand Down Expand Up @@ -422,7 +422,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
if item is None:
continue

if isinstance(item, (DsSchemaTable, Figure)) and item.text:
if isinstance(item, DsSchemaTable | Figure) and item.text:
embedded_captions[item.text] = ix

# build lookup from floating objects to their caption item
Expand Down
27 changes: 13 additions & 14 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,13 @@ classifiers = [
"Topic :: Software Development :: Libraries :: Python Modules",
"Typing :: Typed",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
]
requires-python = '>=3.9,<4.0'
requires-python = '>=3.10,<4.0'
dependencies = [
'jsonschema (>=4.16.0,<5.0.0)',
'pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)',
Expand Down Expand Up @@ -68,23 +67,23 @@ docling-view = "docling_core.cli.view:app"
chunking = [
# common:
'semchunk (>=2.2.0,<3.0.0)',
'tree-sitter (>=0.23.2,<0.25.0)',
'tree-sitter-python (>=0.23.6,<0.25.0)',
'tree-sitter-c (>=0.23.4,<0.23.5)',
'tree-sitter-javascript (>=0.23.1,<0.25.0)',
'tree-sitter-typescript (>=0.23.2,<0.25.0)',
'tree-sitter (>=0.25.0,<0.27.0)',
'tree-sitter-python >=0.23.6',
'tree-sitter-c >=0.23.4',
'tree-sitter-javascript >=0.23.1',
'tree-sitter-typescript >=0.23.2',

# specific:
'transformers (>=4.34.0,<5.0.0)',
]
chunking-openai = [
# common:
'semchunk (>=2.2.0,<3.0.0)',
'tree-sitter (>=0.23.2,<0.25.0)',
'tree-sitter-python (>=0.23.6,<0.25.0)',
'tree-sitter-c (>=0.23.4,<0.23.5)',
'tree-sitter-javascript (>=0.23.1,<0.25.0)',
'tree-sitter-typescript (>=0.23.2,<0.25.0)',
'tree-sitter (>=0.25.0,<0.27.0)',
'tree-sitter-python >=0.23.6',
'tree-sitter-c >=0.23.4',
'tree-sitter-javascript >=0.23.1',
'tree-sitter-typescript >=0.23.2',

# specific:
'tiktoken (>=0.9.0,<0.13.0)',
Expand Down Expand Up @@ -131,7 +130,7 @@ namespaces = true
"*" = ["*.json"]

[tool.ruff]
target-version = "py39"
target-version = "py310"
line-length = 120
respect-gitignore = true
exclude = [
Expand Down Expand Up @@ -205,7 +204,7 @@ pretty = true
no_implicit_optional = true
namespace_packages = true
show_error_codes = true
python_version = "3.9"
python_version = "3.10"
plugins = ["pydantic.mypy"]
exclude = "(^|/)test/data/.*"

Expand Down
Loading
Loading