docling-project · vagenas · Aug 27, 2025 · Aug 22, 2025 · Aug 25, 2025 · Aug 26, 2025
diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
@@ -359,6 +359,7 @@ def serialize(
                 item=item,
                 doc_serializer=self,
                 doc=self.doc,
+                visited=my_visited,
                 **my_kwargs,
             )
         elif isinstance(item, PictureItem):

diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py
@@ -157,6 +157,7 @@ def serialize(
         item: TableItem,
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
+        visited: Optional[set[str]] = None,
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed item."""
@@ -179,6 +180,7 @@ def serialize(
                 add_cell_text=params.add_table_cell_text,
                 xsize=params.xsize,
                 ysize=params.ysize,
+                visited=visited,
             )
             res_parts.append(create_ser_result(text=otsl_text, span_source=item))
 

diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py
@@ -65,6 +65,7 @@
     PictureItem,
     PictureMoleculeData,
     PictureTabularChartData,
+    RichTableCell,
     SectionHeaderItem,
     TableCell,
     TableItem,
@@ -356,6 +357,7 @@ def serialize(
 
         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
             body = ""
+            span_source: Union[DocItem, list[SerializationResult]] = []
 
             for i in range(nrows):
                 body += "<tr>"
@@ -376,7 +378,16 @@ def serialize(
                     if colstart != j:
                         continue
 
-                    content = html.escape(cell.text.strip())
+                    if isinstance(cell, RichTableCell):
+                        ser_res = doc_serializer.serialize(
+                            item=cell.ref.resolve(doc=doc), **kwargs
+                        )
+                        content = ser_res.text
+                        span_source = [ser_res]
+                    else:
+                        content = html.escape(cell.text.strip())
+                        span_source = item
+
                     celltag = "td"
                     if cell.column_header or cell.row_header or cell.row_section:
                         celltag = "th"
@@ -396,7 +407,7 @@ def serialize(
 
             if body:
                 body = f"<tbody>{body}</tbody>"
-                res_parts.append(create_ser_result(text=body, span_source=item))
+                res_parts.append(create_ser_result(text=body, span_source=span_source))
 
         text_res = "".join([r.text for r in res_parts])
         text_res = f"<table>{text_res}</table>" if text_res else ""

diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py
@@ -55,6 +55,7 @@
     PictureItem,
     PictureMoleculeData,
     PictureTabularChartData,
+    RichTableCell,
     SectionHeaderItem,
     TableItem,
     TextItem,
@@ -320,7 +321,13 @@ def serialize(
                 [
                     # make sure that md tables are not broken
                     # due to newline chars in the text
-                    col.text.replace("\n", " ")
+                    (
+                        doc_serializer.serialize(
+                            item=col.ref.resolve(doc=doc), **kwargs
+                        ).text
+                        if isinstance(col, RichTableCell)
+                        else col.text
+                    ).replace("\n", " ")
                     for col in row
                 ]
                 for row in item.data.grid

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -34,7 +34,7 @@
     validate_call,
 )
 from tabulate import tabulate
-from typing_extensions import Annotated, Self, deprecated
+from typing_extensions import Annotated, Self, deprecated, override
 
 from docling_core.search.package import VERSION_PATTERN
 from docling_core.types.base import _JSON_POINTER_REGEX
@@ -60,7 +60,7 @@
 
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
-CURRENT_VERSION: Final = "1.5.0"
+CURRENT_VERSION: Final = "1.6.0"
 
 DEFAULT_EXPORT_LABELS = {
     DocItemLabel.TITLE,
@@ -325,7 +325,7 @@ def from_dict_format(cls, data: Any) -> Any:
                 in data
             ):
                 return data
-            text = data["bbox"].get("token", "")
+            text = data.get("bbox", {}).get("token", "")
             if not len(text):
                 text_cells = data.pop("text_cell_bboxes", None)
                 if text_cells:
@@ -337,11 +337,37 @@ def from_dict_format(cls, data: Any) -> Any:
 
         return data
 
+    def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
+        return self.text  # plain cell: text is stored inline, doc/kwargs are unused
+
+
+class RichTableCell(TableCell):
+    """Table cell whose text is rendered from the document item referenced by ``ref``."""
+
+    ref: "RefItem"  # reference to the document item providing this cell's content
+
+    @override
+    def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
+        from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
+
+        if doc is not None:
+            doc_serializer = MarkdownDocSerializer(doc=doc)
+            ser_res = doc_serializer.serialize(item=self.ref.resolve(doc=doc), **kwargs)
+            return ser_res.text
+        else:
+            return "<!-- rich cell -->"  # placeholder: ref cannot be resolved without doc
+
+
+AnyTableCell = Annotated[
+    Union[RichTableCell, TableCell],  # order matters: RichTableCell (requires `ref`) first
+    Field(union_mode="left_to_right"),  # validate against members in declaration order
+]
+
 
 class TableData(BaseModel):  # TBD
     """BaseTableData."""
 
-    table_cells: List[TableCell] = []
+    table_cells: List[AnyTableCell] = []
     num_rows: int = 0
     num_cols: int = 0
 
@@ -380,7 +406,9 @@ def grid(
 
         return table_data
 
-    def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
+    def remove_rows(
+        self, indices: List[int], doc: Optional["DoclingDocument"] = None
+    ) -> List[List[TableCell]]:
         """Remove rows from the table by their indices.
 
         :param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
@@ -392,6 +420,7 @@ def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
 
         indices = sorted(indices, reverse=True)
 
+        refs_to_remove = []
         all_removed_cells = []
         for row_index in indices:
             if row_index < 0 or row_index >= self.num_rows:
@@ -403,6 +432,10 @@ def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
             end_idx = start_idx + self.num_cols
             removed_cells = self.table_cells[start_idx:end_idx]
 
+            for cell in removed_cells:
+                if isinstance(cell, RichTableCell):
+                    refs_to_remove.append(cell.ref)
+
             # Remove the cells from the table
             self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]
 
@@ -417,26 +450,37 @@ def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
 
             all_removed_cells.append(removed_cells)
 
+        if refs_to_remove:
+            if doc is None:
+                _logger.warning(
+                    "When table contains rich cells, `doc` argument must be provided, "
+                    "otherwise rich cell content will be left dangling."
+                )
+            else:
+                doc._delete_items(refs_to_remove)
+
         return all_removed_cells
 
-    def pop_row(self) -> List[TableCell]:
+    def pop_row(self, doc: Optional["DoclingDocument"] = None) -> List[TableCell]:
         """Remove and return the last row from the table.
 
         :returns: List[TableCell]: A list of TableCell objects representing the popped row.
         """
         if self.num_rows == 0:
             raise IndexError("Cannot pop from an empty table.")
 
-        return self.remove_row(self.num_rows - 1)
+        return self.remove_row(self.num_rows - 1, doc=doc)  # doc enables rich-cell cleanup
 
-    def remove_row(self, row_index: int) -> List[TableCell]:
+    def remove_row(
+        self, row_index: int, doc: Optional["DoclingDocument"] = None
+    ) -> List[TableCell]:
         """Remove a row from the table by its index.
 
         :param row_index: int: The index of the row to remove. (Starting from 0)
 
         :returns: List[TableCell]: A list of TableCell objects representing the removed row.
         """
-        return self.remove_rows([row_index])[0]
+        return self.remove_rows([row_index], doc=doc)[0]  # [0]: the single removed row
 
     def insert_rows(
         self, row_index: int, rows: List[List[str]], after: bool = False
@@ -1509,8 +1553,15 @@ class TableItem(FloatingItem):
 
     annotations: List[TableAnnotationType] = []
 
-    def export_to_dataframe(self) -> pd.DataFrame:
+    def export_to_dataframe(
+        self, doc: Optional["DoclingDocument"] = None
+    ) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
+        if doc is None:
+            _logger.warning(
+                "Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated."
+            )
+
         if self.data.num_rows == 0 or self.data.num_cols == 0:
             return pd.DataFrame()
 
@@ -1539,14 +1590,15 @@ def export_to_dataframe(self) -> pd.DataFrame:
             columns = ["" for _ in range(self.data.num_cols)]
             for i in range(num_headers):
                 for j, cell in enumerate(self.data.grid[i]):
-                    col_name = cell.text
+                    col_name = cell._get_text(doc=doc)
                     if columns[j] != "":
                         col_name = f".{col_name}"
                     columns[j] += col_name
 
         # Create table data
         table_data = [
-            [cell.text for cell in row] for row in self.data.grid[num_headers:]
+            [cell._get_text(doc=doc) for cell in row]
+            for row in self.data.grid[num_headers:]
         ]
 
         # Create DataFrame
@@ -1577,7 +1629,7 @@ def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
 
                     # make sure that md tables are not broken
                     # due to newline chars in the text
-                    text = col.text
+                    text = col._get_text(doc=doc)
                     text = text.replace("\n", " ")
                     tmp.append(text)
 
@@ -1623,6 +1675,7 @@ def export_to_otsl(
         add_cell_text: bool = True,
         xsize: int = 500,
         ysize: int = 500,
+        **kwargs: Any,
     ) -> str:
         """Export the table as OTSL."""
         # Possible OTSL tokens...
@@ -1652,7 +1705,7 @@ def export_to_otsl(
         for i in range(nrows):
             for j in range(ncols):
                 cell: TableCell = self.data.grid[i][j]
-                content = cell.text.strip()
+                content = cell._get_text(doc=doc, **kwargs).strip()
                 rowspan, rowstart = (
                     cell.row_span,
                     cell.start_row_offset_idx,
@@ -2304,6 +2357,15 @@ def _update_breadth_first_with_lookup(
                 refs_to_be_deleted=refs_to_be_deleted,
                 lookup=lookup,
             )
+            if isinstance(node, TableItem):
+                for cell in node.data.table_cells:
+                    if isinstance(cell, RichTableCell):
+                        path = cell.ref._split_ref_to_path()
+                        cell.ref = self._update_ref_with_lookup(
+                            item_label=path[1],
+                            item_index=int(path[2]),
+                            lookup=lookup,
+                        )
 
         # Update the self_ref reference
         if node.parent is not None:
@@ -3945,16 +4007,22 @@ def num_pages(self):
         """num_pages."""
         return len(self.pages.values())
 
-    def validate_tree(self, root) -> bool:
+    def validate_tree(self, root: NodeItem) -> bool:
         """validate_tree."""
-        res = []
         for child_ref in root.children:
             child = child_ref.resolve(self)
-            if child.parent.resolve(self) != root:
+            if child.parent.resolve(self) != root or not self.validate_tree(child):  # NOTE(review): child.parent is not None-checked
                 return False
-            res.append(self.validate_tree(child))
 
-        return all(res) or len(res) == 0
+        if isinstance(root, TableItem):  # additionally check items referenced by rich cells
+            for cell in root.data.table_cells:
+                if isinstance(cell, RichTableCell) and (
+                    (par_ref := cell.ref.resolve(self).parent) is None
+                    or par_ref.resolve(self) != root  # referenced item must be parented by this table
+                ):
+                    return False
+
+        return True
 
     def iterate_items(
         self,
@@ -3963,7 +4031,7 @@ def iterate_items(
         traverse_pictures: bool = False,
         page_no: Optional[int] = None,
         included_content_layers: Optional[set[ContentLayer]] = None,
-        _level: int = 0,  # fixed parameter, carries through the node nesting level
+        _level: int = 0,  # deprecated
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
         """Iterate elements with level."""
         for item, stack in self._iterate_items_with_stack(
@@ -5324,7 +5392,9 @@ def get_text(text: str, max_text_len: int):
                         grid.append([])
                         for j, cell in enumerate(row):
                             if j < 10:
-                                text = get_text(text=cell.text, max_text_len=16)
+                                text = get_text(
+                                    cell._get_text(doc=self), max_text_len=16
+                                )
                                 grid[-1].append(text)
 
                     result.append("\n" + tabulate(grid) + "\n")
@@ -5588,6 +5658,16 @@ def index(self, doc: "DoclingDocument") -> None:
                                         )
                                         break
 
+                            # update rich table cells references:
+                            if isinstance(parent_item, TableItem):
+                                for cell in parent_item.data.table_cells:
+                                    if (
+                                        isinstance(cell, RichTableCell)
+                                        and cell.ref.cref == item.self_ref
+                                    ):
+                                        cell.ref.cref = new_cref
+                                        break
+
                         elif num_components == 2 and path_components[1] == "body":
                             parent_item = self._body
                         else:
@@ -5676,6 +5756,18 @@ def validate_group(doc: DoclingDocument, item: GroupItem):
             elif isinstance(item, ListItem):
                 validate_list_item(self, item)
 
+    def add_table_cell(self, table_item: TableItem, cell: TableCell) -> None:
+        """Append a cell to the table; a rich cell's item must be a child of that table."""
+        if isinstance(cell, RichTableCell):  # only rich cells reference another doc item
+            item = cell.ref.resolve(doc=self)
+            if isinstance(item, NodeItem) and (
+                (not item.parent) or item.parent.cref != table_item.self_ref
+            ):
+                raise ValueError(
+                    f"Trying to add cell with another parent {item.parent} to {table_item.self_ref}"
+                )
+        table_item.data.table_cells.append(cell)
+
 
 # deprecated aliases (kept for backwards compatibility):
 BasePictureData = BaseAnnotation

diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py
@@ -252,7 +252,7 @@ def _make_spans(cell: TableCell, table_item: TableItem):
 
                             spans = list(_make_spans(cell, item))
                             table_data[i][j] = GlmTableCell(
-                                text=cell.text,
+                                text=cell._get_text(doc=doc),
                                 bbox=(
                                     cell.bbox.as_tuple()
                                     if cell.bbox is not None