11"""Define classes for DocTags serialization."""
22
33from enum import Enum
4- from typing import Any , Final , Optional
4+ from typing import Any , Final , Optional , Tuple
55from xml .dom .minidom import parseString
66
77from pydantic import BaseModel
88from typing_extensions import override
99
1010from docling_core .transforms .serializer .base import (
1111 BaseDocSerializer ,
12+ BaseListSerializer ,
1213 BaseMetaSerializer ,
1314 BasePictureSerializer ,
1415 BaseTableSerializer ,
2829 DescriptionMetaField ,
2930 DocItem ,
3031 DoclingDocument ,
32+ ListGroup ,
33+ ListItem ,
3134 MetaFieldName ,
3235 MoleculeMetaField ,
3336 NodeItem ,
3841 TabularChartMetaField ,
3942)
4043from docling_core .types .doc .labels import DocItemLabel
41- from docling_core .types .doc .tokens import DocumentToken
44+ from docling_core .types .doc .tokens import (
45+ _CodeLanguageToken ,
46+ _PictureClassificationToken ,
47+ )
4248
4349DOCTAGS_VERSION : Final = "1.0.0"
4450
@@ -61,6 +67,127 @@ class IDocTagsTableToken(str, Enum):
6167 OTSL_RHED = "<rhed/>" # - row header cell,
6268 OTSL_SROW = "<srow/>" # - section row cell
6369
@classmethod
def get_special_tokens(cls) -> list[str]:
    """Return all table-related special tokens.

    Returns:
        A new list that starts with the opening and closing OTSL tags
        (``<otsl>``, ``</otsl>``) followed by the value of every member of
        this enum, in iteration order.
    """
    # Member values are already complete tag strings, so they are used as-is.
    return ["<otsl>", "</otsl>", *(token.value for token in cls)]
83+
84+
class IDocTagsToken(str, Enum):
    """Tag names used by the IDocTags serialization format.

    Member values are bare tag names without angle brackets. The two
    underscore-prefixed members are prefixes for families of numbered tags
    rather than complete tag names; their values end with ``_``, which is how
    ``get_special_tokens`` filters them out of the plain open/close tag list.
    """

    # Prefixes for numbered tag families (not complete tag names)
    _LOC_PREFIX = "loc_"
    _SECTION_HEADER_PREFIX = "section_header_level_"

    DOCUMENT = "doctag"
    VERSION = "version"

    OTSL = "otsl"
    ORDERED_LIST = "ordered_list"
    UNORDERED_LIST = "unordered_list"

    PAGE_BREAK = "page_break"

    CAPTION = "caption"
    FOOTNOTE = "footnote"
    FORMULA = "formula"
    LIST_ITEM = "list_item"
    PAGE_FOOTER = "page_footer"
    PAGE_HEADER = "page_header"
    PICTURE = "picture"
    SECTION_HEADER = "section_header"
    TABLE = "table"
    TEXT = "text"
    TITLE = "title"
    DOCUMENT_INDEX = "document_index"
    CODE = "code"
    CHECKBOX_SELECTED = "checkbox_selected"
    CHECKBOX_UNSELECTED = "checkbox_unselected"
    FORM = "form"
    EMPTY_VALUE = "empty_value"  # used for empty value fields in fillable forms

    @classmethod
    def get_special_tokens(
        cls,
        *,
        page_dimension: Tuple[int, int] = (500, 500),
        include_location_tokens: bool = True,
        include_code_class: bool = False,
        include_picture_class: bool = False,
    ) -> list[str]:
        """Return the special tokens of the IDocTags vocabulary.

        Args:
            page_dimension: Pair of page extents; the larger of the two is the
                number of location tokens generated (indices start at 0).
            include_location_tokens: Whether to append the ``<loc_N/>`` tokens.
            include_code_class: Whether to append the values of
                ``_CodeLanguageToken``.
            include_picture_class: Whether to append the values of
                ``_PictureClassificationToken``.

        Returns:
            A new list with, in order: an opening and closing tag for every
            member whose value does not end with ``_``, opening and closing
            section-header tags for levels 0 through 5, the tokens returned by
            ``IDocTagsTableToken.get_special_tokens()``, and then the optional
            picture-class, code-class and location tokens.
        """
        special_tokens: list[str] = []
        for token in cls:
            # Values ending in "_" are prefixes, not complete tag names.
            if not token.value.endswith("_"):
                special_tokens += [f"<{token.value}>", f"</{token.value}>"]

        section_prefix = cls._SECTION_HEADER_PREFIX.value
        for level in range(6):
            special_tokens += [
                f"<{section_prefix}{level}>",
                f"</{section_prefix}{level}>",
            ]

        special_tokens.extend(IDocTagsTableToken.get_special_tokens())

        if include_picture_class:
            special_tokens.extend(t.value for t in _PictureClassificationToken)

        if include_code_class:
            special_tokens.extend(t.value for t in _CodeLanguageToken)

        if include_location_tokens:
            # Location tokens are generated dynamically, one per index below
            # the larger page extent.
            loc_prefix = cls._LOC_PREFIX.value
            loc_count = max(page_dimension[0], page_dimension[1])
            special_tokens.extend(f"<{loc_prefix}{i}/>" for i in range(loc_count))

        return special_tokens

    @classmethod
    def create_token_name_from_doc_item_label(cls, label: str, level: int = 1) -> str:
        """Return the tag name corresponding to a document item label.

        Args:
            label: The item label, compared against and converted to
                ``DocItemLabel``.
            level: Heading level appended to the section-header prefix; only
                used when ``label`` is the section-header label.

        Returns:
            The bare tag name (no angle brackets).

        Raises:
            RuntimeError: If ``label`` converts to a ``DocItemLabel`` that has
                no entry in the mapping below.

        NOTE(review): a string that is not a valid ``DocItemLabel`` value
        presumably makes ``DocItemLabel(label)`` raise its own error (not a
        ``RuntimeError``), since only ``KeyError`` is translated here.
        """
        if label == DocItemLabel.SECTION_HEADER:
            # ``.value`` is required: on Python 3.11+ formatting a
            # ``(str, Enum)`` member in an f-string yields
            # "IDocTagsToken._SECTION_HEADER_PREFIX" instead of the value.
            return f"{cls._SECTION_HEADER_PREFIX.value}{level}"

        doc_token_by_item_label = {
            DocItemLabel.CAPTION: IDocTagsToken.CAPTION,
            DocItemLabel.FOOTNOTE: IDocTagsToken.FOOTNOTE,
            DocItemLabel.FORMULA: IDocTagsToken.FORMULA,
            DocItemLabel.LIST_ITEM: IDocTagsToken.LIST_ITEM,
            DocItemLabel.PAGE_FOOTER: IDocTagsToken.PAGE_FOOTER,
            DocItemLabel.PAGE_HEADER: IDocTagsToken.PAGE_HEADER,
            DocItemLabel.PICTURE: IDocTagsToken.PICTURE,
            DocItemLabel.TABLE: IDocTagsToken.TABLE,
            DocItemLabel.TEXT: IDocTagsToken.TEXT,
            DocItemLabel.TITLE: IDocTagsToken.TITLE,
            DocItemLabel.DOCUMENT_INDEX: IDocTagsToken.DOCUMENT_INDEX,
            DocItemLabel.CODE: IDocTagsToken.CODE,
            DocItemLabel.CHECKBOX_SELECTED: IDocTagsToken.CHECKBOX_SELECTED,
            DocItemLabel.CHECKBOX_UNSELECTED: IDocTagsToken.CHECKBOX_UNSELECTED,
            DocItemLabel.FORM: IDocTagsToken.FORM,
            # Fallback mappings for labels without dedicated tokens in IDocTagsToken
            DocItemLabel.KEY_VALUE_REGION: IDocTagsToken.TEXT,
            DocItemLabel.PARAGRAPH: IDocTagsToken.TEXT,
            DocItemLabel.REFERENCE: IDocTagsToken.TEXT,
            DocItemLabel.CHART: IDocTagsToken.PICTURE,
        }

        try:
            return doc_token_by_item_label[DocItemLabel(label)].value
        except KeyError as e:
            raise RuntimeError(f"Unexpected DocItemLabel: {label}") from e
190+
64191
65192class IDocTagsParams (DocTagsParams ):
66193 """DocTags-specific serialization parameters."""
@@ -69,6 +196,136 @@ class IDocTagsParams(DocTagsParams):
69196 pretty_indentation : Optional [str ] = 2 * " "
70197
71198
class IDocTagsListSerializer(BaseModel, BaseListSerializer):
    """List serializer producing IDocTags list markup."""

    indent: int = 4

    @override
    def serialize(
        self,
        *,
        item: ListGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize a ``ListGroup`` into an ordered/unordered list element.

        Children are handled explicitly:

        * a ``ListItem`` child is serialized through ``doc_serializer`` and its
          text is wrapped in ``<list_item>`` (even when that text is empty);
        * a ``ListGroup`` found directly under this group, or under one of its
          list items, is serialized through ``doc_serializer`` and appended as
          a sibling, never wrapped in ``<list_item>``;
        * any other child type is ignored, as are children whose ref is
          already in ``visited`` or reported as excluded by ``doc_serializer``.

        Args:
            item: The list group to serialize.
            doc_serializer: Document-level serializer used for the children.
            doc: Document used to resolve child references.
            list_level: Current nesting depth; children are serialized with
                ``list_level + 1``.
            is_inline_scope: Forwarded unchanged to the child serialization.
            visited: Refs of already visited items; updated in place with each
                child serialized here. A fresh set is used when ``None``.
            **kwargs: Used to build ``IDocTagsParams`` and forwarded to
                ``doc_serializer``.

        Returns:
            A ``SerializationResult`` whose text is the wrapped list, or an
            empty string when no child contributed any fragment. All child
            results are passed along as span sources.
        """
        seen = set() if visited is None else visited
        params = IDocTagsParams(**kwargs)

        span_sources: list[SerializationResult] = []
        fragments: list[str] = []
        skip_refs = doc_serializer.get_excluded_refs(**kwargs)

        def _blocked(ref: str) -> bool:
            # A node is skipped when already visited or explicitly excluded.
            return ref in seen or ref in skip_refs

        def _render(node) -> SerializationResult:
            # Mark the node as visited *before* delegating, then record its
            # result as a span source.
            seen.add(node.self_ref)
            rendered = doc_serializer.serialize(
                item=node,
                list_level=list_level + 1,
                is_inline_scope=is_inline_scope,
                visited=seen,
                **kwargs,
            )
            span_sources.append(rendered)
            return rendered

        def _render_group(group: ListGroup) -> None:
            # Nested lists are emitted as-is (no <list_item> wrapper) and only
            # contribute a fragment when they produced text.
            rendered = _render(group)
            if rendered.text:
                fragments.append(rendered.text)

        for ref in item.children:
            node = ref.resolve(doc)

            if isinstance(node, ListGroup):
                # List group directly under this list group: sibling output.
                if not _blocked(node.self_ref):
                    _render_group(node)
            elif isinstance(node, ListItem) and not _blocked(node.self_ref):
                item_res = _render(node)
                fragments.append(
                    _wrap(
                        text=f"{item_res.text}",
                        wrap_tag=IDocTagsToken.LIST_ITEM.value,
                    )
                )

                # Lists hanging off this list item follow its <list_item>
                # element at the same level.
                for sub_ref in node.children:
                    sub_node = sub_ref.resolve(doc)
                    if isinstance(sub_node, ListGroup) and not _blocked(
                        sub_node.self_ref
                    ):
                        _render_group(sub_node)

        delim = _get_delim(params=params)
        if not fragments:
            return create_ser_result(text="", span_source=span_sources)

        joined = delim.join(fragments)
        if item.first_item_is_enumerated(doc):
            list_tag = IDocTagsToken.ORDERED_LIST.value
        else:
            list_tag = IDocTagsToken.UNORDERED_LIST.value
        wrapped = _wrap(text=f"{joined}{delim}", wrap_tag=list_tag)
        return create_ser_result(text=wrapped, span_source=span_sources)
327+
328+
72329class IDocTagsMetaSerializer (BaseModel , BaseMetaSerializer ):
73330 """DocTags-specific meta serializer."""
74331
@@ -187,6 +444,8 @@ def serialize(
187444 otsl_content = temp_table .export_to_otsl (
188445 temp_doc ,
189446 add_cell_location = False ,
447+ # Suppress chart cell text if global content is off
448+ add_cell_text = params .add_content ,
190449 self_closing = params .do_self_closing ,
191450 table_token = IDocTagsTableToken ,
192451 )
@@ -200,7 +459,7 @@ def serialize(
200459
201460 text_res = "" .join ([r .text for r in res_parts ])
202461 if text_res :
203- token = DocumentToken .create_token_name_from_doc_item_label (
462+ token = IDocTagsToken .create_token_name_from_doc_item_label (
204463 label = DocItemLabel .CHART if is_chart else DocItemLabel .PICTURE ,
205464 )
206465 text_res = _wrap (text = text_res , wrap_tag = token )
@@ -238,12 +497,16 @@ def serialize_doc(
238497 text_res = delim .join ([p .text for p in parts if p .text ])
239498
240499 if self .params .add_page_break :
241- page_sep = f"<{ DocumentToken .PAGE_BREAK .value } { '/' if self .params .do_self_closing else '' } >"
500+ page_sep = f"<{ IDocTagsToken .PAGE_BREAK .value } { '/' if self .params .do_self_closing else '' } >"
242501 for full_match , _ , _ in self ._get_page_breaks (text = text_res ):
243502 text_res = text_res .replace (full_match , page_sep )
244503
245- wrap_tag = DocumentToken .DOCUMENT .value
246- text_res = f"<{ wrap_tag } ><version>{ DOCTAGS_VERSION } </version>{ text_res } { delim } </{ wrap_tag } >"
504+ tmp = f"<{ IDocTagsToken .DOCUMENT .value } >"
505+ tmp += f"<{ IDocTagsToken .VERSION .value } >{ DOCTAGS_VERSION } </{ IDocTagsToken .VERSION .value } >"
506+ tmp += f"{ text_res } "
507+ tmp += f"</{ IDocTagsToken .DOCUMENT .value } >"
508+
509+ text_res = tmp
247510
248511 if self .params .pretty_indentation and (
249512 my_root := parseString (text_res ).documentElement
@@ -252,4 +515,5 @@ def serialize_doc(
252515 text_res = "\n " .join (
253516 [line for line in text_res .split ("\n " ) if line .strip ()]
254517 )
518+
255519 return create_ser_result (text = text_res , span_source = parts )
0 commit comments