Skip to content

Commit 9a42a3c

Browse files
feat: updating the idoctags serializer (#450)
* feat: updating the idoctags serializer Signed-off-by: Peter Staar <taa@zurich.ibm.com> * feat: adding IDocTagsToken class Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix the get_special_tokens Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix the get_special_tokens (2) Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the doctags serializer: if the no_content flag is true, no double list_item, no content in tables and captions Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the list serializer in the doctags Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added missing document serializer Signed-off-by: Peter Staar <taa@zurich.ibm.com> * removed the prints Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added tests for content suppression Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added extra tests for serialization Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added extra tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * ran pre-commit Signed-off-by: Peter Staar <taa@zurich.ibm.com> * deal with differences between xml serialization in mac and linux Signed-off-by: Peter Staar <taa@zurich.ibm.com> * revert to old doctags serialization Signed-off-by: Peter Staar <taa@zurich.ibm.com> * revert to old doctags serialization (2) Signed-off-by: Peter Staar <taa@zurich.ibm.com> * revert to old doctags serialization (3) Signed-off-by: Peter Staar <taa@zurich.ibm.com> * revert to old doctags serialization (4) Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixing the tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed all the tests, except the no-content with lists Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the no-content in lists with doctags Signed-off-by: Peter Staar <taa@zurich.ibm.com> * removed the prints Signed-off-by: Peter Staar <taa@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com>
1 parent d2fb47f commit 9a42a3c

13 files changed

Lines changed: 2960 additions & 32 deletions

docling_core/experimental/idoctags.py

Lines changed: 270 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
"""Define classes for DocTags serialization."""
22

33
from enum import Enum
4-
from typing import Any, Final, Optional
4+
from typing import Any, Final, Optional, Tuple
55
from xml.dom.minidom import parseString
66

77
from pydantic import BaseModel
88
from typing_extensions import override
99

1010
from docling_core.transforms.serializer.base import (
1111
BaseDocSerializer,
12+
BaseListSerializer,
1213
BaseMetaSerializer,
1314
BasePictureSerializer,
1415
BaseTableSerializer,
@@ -28,6 +29,8 @@
2829
DescriptionMetaField,
2930
DocItem,
3031
DoclingDocument,
32+
ListGroup,
33+
ListItem,
3134
MetaFieldName,
3235
MoleculeMetaField,
3336
NodeItem,
@@ -38,7 +41,10 @@
3841
TabularChartMetaField,
3942
)
4043
from docling_core.types.doc.labels import DocItemLabel
41-
from docling_core.types.doc.tokens import DocumentToken
44+
from docling_core.types.doc.tokens import (
45+
_CodeLanguageToken,
46+
_PictureClassificationToken,
47+
)
4248

4349
DOCTAGS_VERSION: Final = "1.0.0"
4450

@@ -61,6 +67,127 @@ class IDocTagsTableToken(str, Enum):
6167
OTSL_RHED = "<rhed/>" # - row header cell,
6268
OTSL_SROW = "<srow/>" # - section row cell
6369

70+
@classmethod
def get_special_tokens(
    cls,
):
    """Collect every special token used for table (OTSL) serialization.

    The result starts with the ``<otsl>``/``</otsl>`` container tags and is
    followed by the value of each member of this enum, in definition order.
    """
    container_tags: list[str] = ["<otsl>", "</otsl>"]
    member_tokens = [f"{member.value}" for member in cls]
    return container_tags + member_tokens
83+
84+
85+
class IDocTagsToken(str, Enum):
    """Tag names used by the IDocTags serialization.

    Member values are bare tag names (without angle brackets). Members whose
    value ends with ``_`` are prefixes that get a numeric suffix appended
    (location index, section-header level) rather than standalone tags.
    """

    # Prefixes: completed with a number when the tag is built.
    _LOC_PREFIX = "loc_"
    _SECTION_HEADER_PREFIX = "section_header_level_"

    DOCUMENT = "doctag"
    VERSION = "version"

    OTSL = "otsl"
    ORDERED_LIST = "ordered_list"
    UNORDERED_LIST = "unordered_list"

    PAGE_BREAK = "page_break"

    CAPTION = "caption"
    FOOTNOTE = "footnote"
    FORMULA = "formula"
    LIST_ITEM = "list_item"
    PAGE_FOOTER = "page_footer"
    PAGE_HEADER = "page_header"
    PICTURE = "picture"
    SECTION_HEADER = "section_header"
    TABLE = "table"
    TEXT = "text"
    TITLE = "title"
    DOCUMENT_INDEX = "document_index"
    CODE = "code"
    CHECKBOX_SELECTED = "checkbox_selected"
    CHECKBOX_UNSELECTED = "checkbox_unselected"
    FORM = "form"
    EMPTY_VALUE = "empty_value"  # used for empty value fields in fillable forms

    @classmethod
    def get_special_tokens(
        cls,
        *,
        page_dimension: Tuple[int, int] = (500, 500),
        include_location_tokens: bool = True,
        include_code_class: bool = False,
        include_picture_class: bool = False,
    ) -> list[str]:
        """Return all special document tokens as literal tag strings.

        Args:
            page_dimension: Page size; one ``<loc_i/>`` token is generated for
                each ``i`` below the larger of the two values.
            include_location_tokens: Whether to append the location tokens.
            include_code_class: Whether to append the code-language tokens.
            include_picture_class: Whether to append the picture-classification
                tokens.

        Returns:
            Opening/closing tags for every non-prefix member, then the
            section-header level tags (levels 0-5), the table tokens, and the
            optional picture/code/location tokens, in that order.
        """
        special_tokens: list[str] = []
        for token in cls:
            # Prefix members (value ends with "_") are not tags on their own.
            if not token.value.endswith("_"):
                special_tokens.append(f"<{token.value}>")
                special_tokens.append(f"</{token.value}>")

        header_prefix = cls._SECTION_HEADER_PREFIX.value
        for i in range(6):
            special_tokens += [
                f"<{header_prefix}{i}>",
                f"</{header_prefix}{i}>",
            ]

        special_tokens.extend(IDocTagsTableToken.get_special_tokens())

        if include_picture_class:
            special_tokens.extend([t.value for t in _PictureClassificationToken])

        if include_code_class:
            special_tokens.extend([t.value for t in _CodeLanguageToken])

        if include_location_tokens:
            # Adding dynamically generated location-tokens
            loc_prefix = cls._LOC_PREFIX.value
            for i in range(0, max(page_dimension[0], page_dimension[1])):
                special_tokens.append(f"<{loc_prefix}{i}/>")

        return special_tokens

    @classmethod
    def create_token_name_from_doc_item_label(cls, label: str, level: int = 1) -> str:
        """Get the tag name corresponding to the passed doc item label.

        Args:
            label: A ``DocItemLabel`` (or its string value).
            level: Heading level; only used for section headers, where it is
                appended to the section-header prefix.

        Returns:
            The bare tag name (no angle brackets).

        Raises:
            RuntimeError: If the label is a ``DocItemLabel`` without a mapping.
        """
        doc_token_by_item_label = {
            DocItemLabel.CAPTION: IDocTagsToken.CAPTION,
            DocItemLabel.FOOTNOTE: IDocTagsToken.FOOTNOTE,
            DocItemLabel.FORMULA: IDocTagsToken.FORMULA,
            DocItemLabel.LIST_ITEM: IDocTagsToken.LIST_ITEM,
            DocItemLabel.PAGE_FOOTER: IDocTagsToken.PAGE_FOOTER,
            DocItemLabel.PAGE_HEADER: IDocTagsToken.PAGE_HEADER,
            DocItemLabel.PICTURE: IDocTagsToken.PICTURE,
            DocItemLabel.TABLE: IDocTagsToken.TABLE,
            DocItemLabel.TEXT: IDocTagsToken.TEXT,
            DocItemLabel.TITLE: IDocTagsToken.TITLE,
            DocItemLabel.DOCUMENT_INDEX: IDocTagsToken.DOCUMENT_INDEX,
            DocItemLabel.CODE: IDocTagsToken.CODE,
            DocItemLabel.CHECKBOX_SELECTED: IDocTagsToken.CHECKBOX_SELECTED,
            DocItemLabel.CHECKBOX_UNSELECTED: IDocTagsToken.CHECKBOX_UNSELECTED,
            DocItemLabel.FORM: IDocTagsToken.FORM,
            # Fallback mappings for labels without dedicated tokens in IDocTagsToken
            DocItemLabel.KEY_VALUE_REGION: IDocTagsToken.TEXT,
            DocItemLabel.PARAGRAPH: IDocTagsToken.TEXT,
            DocItemLabel.REFERENCE: IDocTagsToken.TEXT,
            DocItemLabel.CHART: IDocTagsToken.PICTURE,
        }

        res: str
        if label == DocItemLabel.SECTION_HEADER:
            # Use ``.value`` explicitly: formatting a ``(str, Enum)`` member in
            # an f-string yields "IDocTagsToken._SECTION_HEADER_PREFIX" on
            # Python 3.11+, not the underlying string.
            res = f"{IDocTagsToken._SECTION_HEADER_PREFIX.value}{level}"
        else:
            try:
                res = doc_token_by_item_label[DocItemLabel(label)].value
            except KeyError as e:
                raise RuntimeError(f"Unexpected DocItemLabel: {label}") from e
        return res
190+
64191

65192
class IDocTagsParams(DocTagsParams):
66193
"""DocTags-specific serialization parameters."""
@@ -69,6 +196,136 @@ class IDocTagsParams(DocTagsParams):
69196
pretty_indentation: Optional[str] = 2 * " "
70197

71198

199+
class IDocTagsListSerializer(BaseModel, BaseListSerializer):
    """List serializer that renders ``ListGroup`` nodes as IDocTags markup."""

    indent: int = 4

    @override
    def serialize(
        self,
        *,
        item: ListGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs: Any,
    ) -> SerializationResult:
        """Render a ``ListGroup`` as an ``<ordered_list>``/``<unordered_list>``.

        Each ``ListItem`` child is serialized through ``doc_serializer`` and
        wrapped in ``<list_item>``. A ``ListGroup`` found either directly under
        this group or under one of its list items is serialized on its own and
        placed as a sibling, never inside a ``<list_item>`` wrapper. List items
        are wrapped even when their serialized text is empty, so the structure
        survives when content is suppressed.

        Args:
            item: The list group to serialize.
            doc_serializer: Document-level serializer used for child nodes.
            doc: The document used to resolve child references.
            list_level: Current nesting depth (0-based).
            is_inline_scope: Whether serialization happens in an inline context.
            visited: Refs already handled; updated in place as nodes are emitted.
            **kwargs: Extra parameters, used to build ``IDocTagsParams`` and
                forwarded to ``doc_serializer``.

        Returns:
            A ``SerializationResult`` with the list markup (empty text if
            nothing was emitted) and the child results as span sources.
        """
        seen = visited if visited is not None else set()
        params = IDocTagsParams(**kwargs)

        span_sources: list[SerializationResult] = []
        fragments: list[str] = []
        excluded = doc_serializer.get_excluded_refs(**kwargs)

        def already_handled(node) -> bool:
            # A node is skipped if it was emitted before or explicitly excluded.
            return node.self_ref in seen or node.self_ref in excluded

        def serialize_child(node) -> SerializationResult:
            # Children are always serialized one nesting level deeper.
            return doc_serializer.serialize(
                item=node,
                list_level=list_level + 1,
                is_inline_scope=is_inline_scope,
                visited=seen,
                **kwargs,
            )

        def emit_nested_list(group: ListGroup) -> None:
            # Nested lists become siblings: no <list_item> wrapper around them.
            if already_handled(group):
                return
            seen.add(group.self_ref)
            nested_res = serialize_child(group)
            if nested_res.text:
                fragments.append(nested_res.text)
            span_sources.append(nested_res)

        for child_ref in item.children:
            child = child_ref.resolve(doc)

            if isinstance(child, ListGroup):
                emit_nested_list(child)
                continue

            # Only ListItem children are rendered besides nested list groups.
            if not isinstance(child, ListItem) or already_handled(child):
                continue

            seen.add(child.self_ref)

            # The item's own content, wrapped here into <list_item>.
            item_res = serialize_child(child)
            span_sources.append(item_res)
            fragments.append(
                _wrap(
                    text=f"{item_res.text}",
                    wrap_tag=IDocTagsToken.LIST_ITEM.value,
                )
            )

            # Lists hanging off this item follow it at the same level.
            for sub_ref in child.children:
                sub = sub_ref.resolve(doc)
                if isinstance(sub, ListGroup):
                    emit_nested_list(sub)

        delim = _get_delim(params=params)
        if not fragments:
            return create_ser_result(text="", span_source=span_sources)

        inner = f"{delim.join(fragments)}{delim}"
        if item.first_item_is_enumerated(doc):
            list_tag = IDocTagsToken.ORDERED_LIST.value
        else:
            list_tag = IDocTagsToken.UNORDERED_LIST.value
        return create_ser_result(
            text=_wrap(text=inner, wrap_tag=list_tag),
            span_source=span_sources,
        )
327+
328+
72329
class IDocTagsMetaSerializer(BaseModel, BaseMetaSerializer):
73330
"""DocTags-specific meta serializer."""
74331

@@ -187,6 +444,8 @@ def serialize(
187444
otsl_content = temp_table.export_to_otsl(
188445
temp_doc,
189446
add_cell_location=False,
447+
# Suppress chart cell text if global content is off
448+
add_cell_text=params.add_content,
190449
self_closing=params.do_self_closing,
191450
table_token=IDocTagsTableToken,
192451
)
@@ -200,7 +459,7 @@ def serialize(
200459

201460
text_res = "".join([r.text for r in res_parts])
202461
if text_res:
203-
token = DocumentToken.create_token_name_from_doc_item_label(
462+
token = IDocTagsToken.create_token_name_from_doc_item_label(
204463
label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE,
205464
)
206465
text_res = _wrap(text=text_res, wrap_tag=token)
@@ -238,12 +497,16 @@ def serialize_doc(
238497
text_res = delim.join([p.text for p in parts if p.text])
239498

240499
if self.params.add_page_break:
241-
page_sep = f"<{DocumentToken.PAGE_BREAK.value}{'/' if self.params.do_self_closing else ''}>"
500+
page_sep = f"<{IDocTagsToken.PAGE_BREAK.value}{'/' if self.params.do_self_closing else ''}>"
242501
for full_match, _, _ in self._get_page_breaks(text=text_res):
243502
text_res = text_res.replace(full_match, page_sep)
244503

245-
wrap_tag = DocumentToken.DOCUMENT.value
246-
text_res = f"<{wrap_tag}><version>{DOCTAGS_VERSION}</version>{text_res}{delim}</{wrap_tag}>"
504+
tmp = f"<{IDocTagsToken.DOCUMENT.value}>"
505+
tmp += f"<{IDocTagsToken.VERSION.value}>{DOCTAGS_VERSION}</{IDocTagsToken.VERSION.value}>"
506+
tmp += f"{text_res}"
507+
tmp += f"</{IDocTagsToken.DOCUMENT.value}>"
508+
509+
text_res = tmp
247510

248511
if self.params.pretty_indentation and (
249512
my_root := parseString(text_res).documentElement
@@ -252,4 +515,5 @@ def serialize_doc(
252515
text_res = "\n".join(
253516
[line for line in text_res.split("\n") if line.strip()]
254517
)
518+
255519
return create_ser_result(text=text_res, span_source=parts)

docling_core/transforms/serializer/common.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,10 @@ def get_parts(
470470
parts: list[SerializationResult] = []
471471
my_visited: set[str] = visited if visited is not None else set()
472472
params = self.params.merge_with_patch(patch=kwargs)
473+
add_content = True
474+
475+
if hasattr(params, "add_content"):
476+
add_content = getattr(params, "add_content")
473477

474478
for node, lvl in _iterate_items(
475479
node=item,
@@ -489,7 +493,7 @@ def get_parts(
489493
visited=my_visited,
490494
**(dict(level=lvl) | kwargs),
491495
)
492-
if part.text:
496+
if len(part.text.strip()) > 0 or (not add_content):
493497
parts.append(part)
494498

495499
return parts

0 commit comments

Comments
 (0)