Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ def serialize(
item=item,
doc_serializer=self,
doc=self.doc,
visited=my_visited,
**my_kwargs,
)
elif isinstance(item, PictureItem):
Expand Down
2 changes: 2 additions & 0 deletions docling_core/transforms/serializer/doctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def serialize(
item: TableItem,
doc_serializer: BaseDocSerializer,
doc: DoclingDocument,
visited: Optional[set[str]] = None,
**kwargs: Any,
) -> SerializationResult:
"""Serializes the passed item."""
Expand All @@ -179,6 +180,7 @@ def serialize(
add_cell_text=params.add_table_cell_text,
xsize=params.xsize,
ysize=params.ysize,
visited=visited,
)
res_parts.append(create_ser_result(text=otsl_text, span_source=item))

Expand Down
15 changes: 13 additions & 2 deletions docling_core/transforms/serializer/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
PictureItem,
PictureMoleculeData,
PictureTabularChartData,
RichTableCell,
SectionHeaderItem,
TableCell,
TableItem,
Expand Down Expand Up @@ -356,6 +357,7 @@ def serialize(

if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
body = ""
span_source: Union[DocItem, list[SerializationResult]] = []

for i in range(nrows):
body += "<tr>"
Expand All @@ -376,7 +378,16 @@ def serialize(
if colstart != j:
continue

content = html.escape(cell.text.strip())
if isinstance(cell, RichTableCell):
ser_res = doc_serializer.serialize(
item=cell.ref.resolve(doc=doc), **kwargs
)
content = ser_res.text
span_source = [ser_res]
else:
content = html.escape(cell.text.strip())
span_source = item

celltag = "td"
if cell.column_header or cell.row_header or cell.row_section:
celltag = "th"
Expand All @@ -396,7 +407,7 @@ def serialize(

if body:
body = f"<tbody>{body}</tbody>"
res_parts.append(create_ser_result(text=body, span_source=item))
res_parts.append(create_ser_result(text=body, span_source=span_source))

text_res = "".join([r.text for r in res_parts])
text_res = f"<table>{text_res}</table>" if text_res else ""
Expand Down
9 changes: 8 additions & 1 deletion docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
PictureItem,
PictureMoleculeData,
PictureTabularChartData,
RichTableCell,
SectionHeaderItem,
TableItem,
TextItem,
Expand Down Expand Up @@ -320,7 +321,13 @@ def serialize(
[
# make sure that md tables are not broken
# due to newline chars in the text
col.text.replace("\n", " ")
(
doc_serializer.serialize(
item=col.ref.resolve(doc=doc), **kwargs
).text
if isinstance(col, RichTableCell)
else col.text
).replace("\n", " ")
for col in row
]
for row in item.data.grid
Expand Down
134 changes: 113 additions & 21 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
validate_call,
)
from tabulate import tabulate
from typing_extensions import Annotated, Self, deprecated
from typing_extensions import Annotated, Self, deprecated, override

from docling_core.search.package import VERSION_PATTERN
from docling_core.types.base import _JSON_POINTER_REGEX
Expand All @@ -60,7 +60,7 @@

Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
CURRENT_VERSION: Final = "1.5.0"
CURRENT_VERSION: Final = "1.6.0"

DEFAULT_EXPORT_LABELS = {
DocItemLabel.TITLE,
Expand Down Expand Up @@ -325,7 +325,7 @@ def from_dict_format(cls, data: Any) -> Any:
in data
):
return data
text = data["bbox"].get("token", "")
text = data.get("bbox", {}).get("token", "")
if not len(text):
text_cells = data.pop("text_cell_bboxes", None)
if text_cells:
Expand All @@ -337,11 +337,37 @@ def from_dict_format(cls, data: Any) -> Any:

return data

def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
return self.text


class RichTableCell(TableCell):
    """RichTableCell."""

    # Reference to the document item that holds this cell's content.
    ref: "RefItem"

    @override
    def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
        """Return the cell text by serializing the referenced item to Markdown.

        :param doc: document against which ``ref`` is resolved; when it is
            ``None`` a fixed placeholder string is returned instead.
        :param kwargs: forwarded unchanged to the serializer's ``serialize``.
        :returns: the serialized text of the referenced item, or the
            placeholder when no document was supplied.
        """
        # Function-scope import, as in the original.
        # NOTE(review): presumably this avoids a circular import with the
        # serializer module -- confirm.
        from docling_core.transforms.serializer.markdown import MarkdownDocSerializer

        # Without a document the reference cannot be resolved.
        if doc is None:
            return "<!-- rich cell -->"

        md_serializer = MarkdownDocSerializer(doc=doc)
        referenced_item = self.ref.resolve(doc=doc)
        serialized = md_serializer.serialize(item=referenced_item, **kwargs)
        return serialized.text


# Cell type accepted wherever a table cell is stored. With
# ``union_mode="left_to_right"`` the union members are tried in the order
# listed, so ``RichTableCell`` is attempted before the plain ``TableCell``.
AnyTableCell = Annotated[
Union[RichTableCell, TableCell],
Field(union_mode="left_to_right"),
]


class TableData(BaseModel): # TBD
"""BaseTableData."""

table_cells: List[TableCell] = []
table_cells: List[AnyTableCell] = []
num_rows: int = 0
num_cols: int = 0

Expand Down Expand Up @@ -380,7 +406,9 @@ def grid(

return table_data

def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
def remove_rows(
self, indices: List[int], doc: Optional["DoclingDocument"] = None
) -> List[List[TableCell]]:
"""Remove rows from the table by their indices.

:param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
Expand All @@ -392,6 +420,7 @@ def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:

indices = sorted(indices, reverse=True)

refs_to_remove = []
all_removed_cells = []
for row_index in indices:
if row_index < 0 or row_index >= self.num_rows:
Expand All @@ -403,6 +432,10 @@ def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
end_idx = start_idx + self.num_cols
removed_cells = self.table_cells[start_idx:end_idx]

for cell in removed_cells:
if isinstance(cell, RichTableCell):
refs_to_remove.append(cell.ref)

# Remove the cells from the table
self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]

Expand All @@ -417,26 +450,37 @@ def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:

all_removed_cells.append(removed_cells)

if refs_to_remove:
if doc is None:
_logger.warning(
"When table contains rich cells, `doc` argument must be provided, "
"otherwise rich cell content will be left dangling."
)
else:
doc._delete_items(refs_to_remove)

return all_removed_cells

def pop_row(self) -> List[TableCell]:
def pop_row(self, doc: Optional["DoclingDocument"] = None) -> List[TableCell]:
"""Remove and return the last row from the table.

:returns: List[TableCell]: A list of TableCell objects representing the popped row.
"""
if self.num_rows == 0:
raise IndexError("Cannot pop from an empty table.")

return self.remove_row(self.num_rows - 1)
return self.remove_row(self.num_rows - 1, doc=doc)

def remove_row(self, row_index: int) -> List[TableCell]:
def remove_row(
self, row_index: int, doc: Optional["DoclingDocument"] = None
) -> List[TableCell]:
"""Remove a row from the table by its index.

:param row_index: int: The index of the row to remove. (Starting from 0)

:returns: List[TableCell]: A list of TableCell objects representing the removed row.
"""
return self.remove_rows([row_index])[0]
return self.remove_rows([row_index], doc=doc)[0]

def insert_rows(
self, row_index: int, rows: List[List[str]], after: bool = False
Expand Down Expand Up @@ -1509,8 +1553,15 @@ class TableItem(FloatingItem):

annotations: List[TableAnnotationType] = []

def export_to_dataframe(self) -> pd.DataFrame:
def export_to_dataframe(
self, doc: Optional["DoclingDocument"] = None
) -> pd.DataFrame:
"""Export the table as a Pandas DataFrame."""
if doc is None:
_logger.warning(
"Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated."
)

if self.data.num_rows == 0 or self.data.num_cols == 0:
return pd.DataFrame()

Expand Down Expand Up @@ -1539,14 +1590,15 @@ def export_to_dataframe(self) -> pd.DataFrame:
columns = ["" for _ in range(self.data.num_cols)]
for i in range(num_headers):
for j, cell in enumerate(self.data.grid[i]):
col_name = cell.text
col_name = cell._get_text(doc=doc)
if columns[j] != "":
col_name = f".{col_name}"
columns[j] += col_name

# Create table data
table_data = [
[cell.text for cell in row] for row in self.data.grid[num_headers:]
[cell._get_text(doc=doc) for cell in row]
for row in self.data.grid[num_headers:]
]

# Create DataFrame
Expand Down Expand Up @@ -1577,7 +1629,7 @@ def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:

# make sure that md tables are not broken
# due to newline chars in the text
text = col.text
text = col._get_text(doc=doc)
text = text.replace("\n", " ")
tmp.append(text)

Expand Down Expand Up @@ -1623,6 +1675,7 @@ def export_to_otsl(
add_cell_text: bool = True,
xsize: int = 500,
ysize: int = 500,
**kwargs: Any,
) -> str:
"""Export the table as OTSL."""
# Possible OTSL tokens...
Expand Down Expand Up @@ -1652,7 +1705,7 @@ def export_to_otsl(
for i in range(nrows):
for j in range(ncols):
cell: TableCell = self.data.grid[i][j]
content = cell.text.strip()
content = cell._get_text(doc=doc, **kwargs).strip()
rowspan, rowstart = (
cell.row_span,
cell.start_row_offset_idx,
Expand Down Expand Up @@ -2304,6 +2357,15 @@ def _update_breadth_first_with_lookup(
refs_to_be_deleted=refs_to_be_deleted,
lookup=lookup,
)
if isinstance(node, TableItem):
for cell in node.data.table_cells:
if isinstance(cell, RichTableCell):
path = cell.ref._split_ref_to_path()
cell.ref = self._update_ref_with_lookup(
item_label=path[1],
item_index=int(path[2]),
lookup=lookup,
)

# Update the self_ref reference
if node.parent is not None:
Expand Down Expand Up @@ -3945,16 +4007,22 @@ def num_pages(self):
"""num_pages."""
return len(self.pages.values())

def validate_tree(self, root) -> bool:
def validate_tree(self, root: NodeItem) -> bool:
"""validate_tree."""
res = []
for child_ref in root.children:
child = child_ref.resolve(self)
if child.parent.resolve(self) != root:
if child.parent.resolve(self) != root or not self.validate_tree(child):
return False
res.append(self.validate_tree(child))

return all(res) or len(res) == 0
if isinstance(root, TableItem):
for cell in root.data.table_cells:
if isinstance(cell, RichTableCell) and (
(par_ref := cell.ref.resolve(self).parent) is None
or par_ref.resolve(self) != root
):
return False

return True

def iterate_items(
self,
Expand All @@ -3963,7 +4031,7 @@ def iterate_items(
traverse_pictures: bool = False,
page_no: Optional[int] = None,
included_content_layers: Optional[set[ContentLayer]] = None,
_level: int = 0, # fixed parameter, carries through the node nesting level
_level: int = 0, # deprecated
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
"""Iterate elements with level."""
for item, stack in self._iterate_items_with_stack(
Expand Down Expand Up @@ -5324,7 +5392,9 @@ def get_text(text: str, max_text_len: int):
grid.append([])
for j, cell in enumerate(row):
if j < 10:
text = get_text(text=cell.text, max_text_len=16)
text = get_text(
cell._get_text(doc=self), max_text_len=16
)
grid[-1].append(text)

result.append("\n" + tabulate(grid) + "\n")
Expand Down Expand Up @@ -5588,6 +5658,16 @@ def index(self, doc: "DoclingDocument") -> None:
)
break

# update rich table cells references:
if isinstance(parent_item, TableItem):
for cell in parent_item.data.table_cells:
if (
isinstance(cell, RichTableCell)
and cell.ref.cref == item.self_ref
):
cell.ref.cref = new_cref
break

elif num_components == 2 and path_components[1] == "body":
parent_item = self._body
else:
Expand Down Expand Up @@ -5676,6 +5756,18 @@ def validate_group(doc: DoclingDocument, item: GroupItem):
elif isinstance(item, ListItem):
validate_list_item(self, item)

def add_table_cell(self, table_item: TableItem, cell: TableCell) -> None:
    """Append ``cell`` to the cells of ``table_item``.

    A ``RichTableCell`` is checked first: its ``ref`` is resolved against this
    document, and if the result is a ``NodeItem`` whose parent is missing or
    does not point at ``table_item``, the cell is rejected.

    :param table_item: table that receives the cell.
    :param cell: cell to append to ``table_item.data.table_cells``.
    :raises ValueError: if a rich cell references a node that is not parented
        by ``table_item``.
    """
    if isinstance(cell, RichTableCell):
        item = cell.ref.resolve(doc=self)
        if isinstance(item, NodeItem):
            parent_ref = item.parent
            # The referenced node must already be a child of the target table.
            if not parent_ref or parent_ref.cref != table_item.self_ref:
                raise ValueError(
                    f"Trying to add cell with another parent {item.parent} to {table_item.self_ref}"
                )

    table_item.data.table_cells.append(cell)


# deprecated aliases (kept for backwards compatibility):
BasePictureData = BaseAnnotation
Expand Down
2 changes: 1 addition & 1 deletion docling_core/utils/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def _make_spans(cell: TableCell, table_item: TableItem):

spans = list(_make_spans(cell, item))
table_data[i][j] = GlmTableCell(
text=cell.text,
text=cell._get_text(doc=doc),
bbox=(
cell.bbox.as_tuple()
if cell.bbox is not None
Expand Down
Loading
Loading