Skip to content

Commit c566268

Browse files
Vdaleke and ceberam authored
fix: rich table triplet serialization (#425)
* fix: rich table triplet serialization Co-authored-by: Cesar Berrospi Ramis <[email protected]> Signed-off-by: Vdaleke <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> * refactor: remove kwargs from 'export_to_dataframe' signature Signed-off-by: Cesar Berrospi Ramis <[email protected]> --------- Signed-off-by: Vdaleke <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> Co-authored-by: Cesar Berrospi Ramis <[email protected]>
1 parent 73b0757 commit c566268

4 files changed

Lines changed: 81 additions & 3 deletions

File tree

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,11 @@ def serialize(
6565
parts.append(cap_res)
6666

6767
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
68-
table_df = item.export_to_dataframe(doc)
68+
table_df = item._export_to_dataframe_with_options(
69+
doc,
70+
doc_serializer=doc_serializer,
71+
**kwargs,
72+
)
6973
if table_df.shape[0] >= 1 and table_df.shape[1] >= 1:
7074
# Handle single-column tables
7175
if table_df.shape[1] == 1:

docling_core/types/doc/document.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2174,6 +2174,16 @@ def _migrate_annotations_to_meta(self) -> Self:
21742174

21752175
def export_to_dataframe(self, doc: Optional["DoclingDocument"] = None) -> pd.DataFrame:
21762176
"""Export the table as a Pandas DataFrame."""
2177+
2178+
return self._export_to_dataframe_with_options(doc=doc)
2179+
2180+
def _export_to_dataframe_with_options(
2181+
self,
2182+
doc: Optional["DoclingDocument"] = None,
2183+
**kwargs: Any,
2184+
) -> pd.DataFrame:
2185+
"""Export the table as a Pandas DataFrame with contextual named arguments."""
2186+
21772187
if doc is None:
21782188
_logger.warning("Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated.")
21792189

@@ -2203,13 +2213,13 @@ def export_to_dataframe(self, doc: Optional["DoclingDocument"] = None) -> pd.Dat
22032213
columns = ["" for _ in range(self.data.num_cols)]
22042214
for i in range(num_headers):
22052215
for j, cell in enumerate(self.data.grid[i]):
2206-
col_name = cell._get_text(doc=doc)
2216+
col_name = cell._get_text(doc=doc, **kwargs)
22072217
if columns[j] != "":
22082218
col_name = f".{col_name}"
22092219
columns[j] += col_name
22102220

22112221
# Create table data
2212-
table_data = [[cell._get_text(doc=doc) for cell in row] for row in self.data.grid[num_headers:]]
2222+
table_data = [[cell._get_text(doc=doc, **kwargs) for cell in row] for row in self.data.grid[num_headers:]]
22132223

22142224
# Create DataFrame
22152225
table = pd.DataFrame(table_data, columns=columns)
test/data/chunker/0c_out_chunks.json

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
{
2+
"root": [
3+
{
4+
"text": "cell 0,0, 1 = cell 0,1. cell 1,0, 1 = <em><p>text in italic</p></em>. <ul>\n<li>list item 1</li>\n<li>list item 2</li>\n</ul>, 1 = cell 2,1. cell 3,0, 1 = inner cell 0,0, 1 = inner cell 0,1. inner cell 0,0, 2 = inner cell 0,2. inner cell 1,0, 1 = inner cell 1,1. inner cell 1,0, 2 = inner cell 1,2. <p>Some text in a generic group.</p>\n<p>More text in the group.</p>, 1 = cell 4,1",
5+
"meta": {
6+
"schema_name": "docling_core.transforms.chunker.DocMeta",
7+
"version": "1.0.0",
8+
"doc_items": [
9+
{
10+
"self_ref": "#/tables/0",
11+
"parent": {
12+
"$ref": "#/body"
13+
},
14+
"children": [
15+
{
16+
"$ref": "#/texts/1"
17+
},
18+
{
19+
"$ref": "#/groups/0"
20+
},
21+
{
22+
"$ref": "#/tables/1"
23+
},
24+
{
25+
"$ref": "#/groups/1"
26+
}
27+
],
28+
"content_layer": "body",
29+
"label": "table",
30+
"prov": []
31+
}
32+
],
33+
"headings": [
34+
"Rich tables"
35+
]
36+
}
37+
}
38+
]
39+
}

test/test_hierarchical_chunker.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
DocChunk,
99
TripletTableSerializer,
1010
)
11+
from docling_core.transforms.serializer.html import HTMLDocSerializer
1112
from docling_core.transforms.serializer.markdown import MarkdownParams, MarkdownTableSerializer
1213
from docling_core.types.doc import DocItemLabel, DoclingDocument, PictureItem, TableData, TextItem
1314

@@ -179,3 +180,27 @@ def test_triplet_table_serializer_single_column():
179180
expected = "Country = Italy. Country = Canada. Country = Switzerland"
180181
assert result.text == expected, f"Expected '{expected}', got '{result.text}'"
181182

183+
def test_chunk_rich_table_custom_serializer(rich_table_doc: DoclingDocument):
184+
doc = rich_table_doc
185+
186+
class MySerializerProvider(ChunkingSerializerProvider):
187+
def get_serializer(self, doc: DoclingDocument):
188+
return HTMLDocSerializer(
189+
doc=doc,
190+
table_serializer=TripletTableSerializer(),
191+
)
192+
193+
chunker = HierarchicalChunker(
194+
merge_list_items=True,
195+
serializer_provider=MySerializerProvider(),
196+
)
197+
198+
chunks = chunker.chunk(dl_doc=doc)
199+
act_data = dict(
200+
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
201+
)
202+
203+
_process(
204+
act_data=act_data,
205+
exp_path_str="test/data/chunker/0c_out_chunks.json",
206+
)

0 commit comments

Comments
 (0)