fix: rich table triplet serialization (#425)

Vdaleke · ceberam · web-flow · commit c566268e0a3f · 2026-02-26T11:39:34.000+01:00
* fix: rich table triplet serialization

Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Signed-off-by: Vdaleke <vdalekesmirnov@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* refactor: remove kwargs from 'export_to_dataframe' signature

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Vdaleke <vdalekesmirnov@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -65,7 +65,11 @@ def serialize(
             parts.append(cap_res)
 
         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
-            table_df = item.export_to_dataframe(doc)
+            table_df = item._export_to_dataframe_with_options(
+                doc,
+                doc_serializer=doc_serializer,
+                **kwargs,
+            )
             if table_df.shape[0] >= 1 and table_df.shape[1] >= 1:
                 # Handle single-column tables
                 if table_df.shape[1] == 1:
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -2174,6 +2174,16 @@ def _migrate_annotations_to_meta(self) -> Self:
 
     def export_to_dataframe(self, doc: Optional["DoclingDocument"] = None) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
+
+        return self._export_to_dataframe_with_options(doc=doc)
+
+    def _export_to_dataframe_with_options(
+        self,
+        doc: Optional["DoclingDocument"] = None,
+        **kwargs: Any,
+    ) -> pd.DataFrame:
+        """Export the table as a Pandas DataFrame with contextual named arguments."""
+
         if doc is None:
             _logger.warning("Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated.")
 
@@ -2203,13 +2213,13 @@ def export_to_dataframe(self, doc: Optional["DoclingDocument"] = None) -> pd.Dat
             columns = ["" for _ in range(self.data.num_cols)]
             for i in range(num_headers):
                 for j, cell in enumerate(self.data.grid[i]):
-                    col_name = cell._get_text(doc=doc)
+                    col_name = cell._get_text(doc=doc, **kwargs)
                     if columns[j] != "":
                         col_name = f".{col_name}"
                     columns[j] += col_name
 
         # Create table data
-        table_data = [[cell._get_text(doc=doc) for cell in row] for row in self.data.grid[num_headers:]]
+        table_data = [[cell._get_text(doc=doc, **kwargs) for cell in row] for row in self.data.grid[num_headers:]]
 
         # Create DataFrame
         table = pd.DataFrame(table_data, columns=columns)
diff --git a/test/data/chunker/0c_out_chunks.json b/test/data/chunker/0c_out_chunks.json
@@ -0,0 +1,39 @@
+{
+    "root": [
+        {
+            "text": "cell 0,0, 1 = cell 0,1. cell 1,0, 1 = <em><p>text in italic</p></em>. <ul>\n<li>list item 1</li>\n<li>list item 2</li>\n</ul>, 1 = cell 2,1. cell 3,0, 1 = inner cell 0,0, 1 = inner cell 0,1. inner cell 0,0, 2 = inner cell 0,2. inner cell 1,0, 1 = inner cell 1,1. inner cell 1,0, 2 = inner cell 1,2. <p>Some text in a generic group.</p>\n<p>More text in the group.</p>, 1 = cell 4,1",
+            "meta": {
+                "schema_name": "docling_core.transforms.chunker.DocMeta",
+                "version": "1.0.0",
+                "doc_items": [
+                    {
+                        "self_ref": "#/tables/0",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [
+                            {
+                                "$ref": "#/texts/1"
+                            },
+                            {
+                                "$ref": "#/groups/0"
+                            },
+                            {
+                                "$ref": "#/tables/1"
+                            },
+                            {
+                                "$ref": "#/groups/1"
+                            }
+                        ],
+                        "content_layer": "body",
+                        "label": "table",
+                        "prov": []
+                    }
+                ],
+                "headings": [
+                    "Rich tables"
+                ]
+            }
+        }
+    ]
+}
diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py
@@ -8,6 +8,7 @@
     DocChunk,
     TripletTableSerializer,
 )
+from docling_core.transforms.serializer.html import HTMLDocSerializer
 from docling_core.transforms.serializer.markdown import MarkdownParams, MarkdownTableSerializer
 from docling_core.types.doc import DocItemLabel, DoclingDocument, PictureItem, TableData, TextItem
 
@@ -179,3 +180,27 @@ def test_triplet_table_serializer_single_column():
     expected = "Country = Italy. Country = Canada. Country = Switzerland"
     assert result.text == expected, f"Expected '{expected}', got '{result.text}'"
 
+def test_chunk_rich_table_custom_serializer(rich_table_doc: DoclingDocument):
+    doc = rich_table_doc
+
+    class MySerializerProvider(ChunkingSerializerProvider):
+        def get_serializer(self, doc: DoclingDocument):
+            return HTMLDocSerializer(
+                doc=doc,
+                table_serializer=TripletTableSerializer(),
+            )
+
+    chunker = HierarchicalChunker(
+        merge_list_items=True,
+        serializer_provider=MySerializerProvider(),
+    )
+
+    chunks = chunker.chunk(dl_doc=doc)
+    act_data = dict(
+        root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
+    )
+
+    _process(
+        act_data=act_data,
+        exp_path_str="test/data/chunker/0c_out_chunks.json",
+    )