Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docling_core/types/doc/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ class GlmTableCell(TableCell):
class BaseCell(AliasModel):
"""Base cell."""

# FIXME: we need to check why we have bounding_box (this should be in prov)
bounding_box: Optional[BoundingBoxContainer] = Field(
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
)
Expand All @@ -152,6 +153,11 @@ class Table(BaseCell):
model: Optional[str] = None


# FIXME: let's add some figure specific data-types later
class Figure(BaseCell):
    """A document figure; currently adds no fields beyond those of BaseCell."""


class BaseText(AliasModel):
"""Base model for text objects."""

Expand Down
293 changes: 282 additions & 11 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
"""Models for the Docling Document data type."""

from datetime import datetime
from typing import Generic, Optional, Union
from enum import Enum
from typing import Generic, Optional, Tuple, Union

from pydantic import (
AnyHttpUrl,
Expand Down Expand Up @@ -35,6 +36,7 @@
BaseCell,
BaseText,
BitmapObject,
Figure,
PageDimensions,
PageReference,
Ref,
Expand Down Expand Up @@ -275,7 +277,7 @@ class MinimalDocument(
main_text: Optional[list[Union[Ref, BaseText]]] = Field(
default=None, alias="main-text"
)
figures: Optional[list[BaseCell]] = None
figures: Optional[list[Figure]] = None
tables: Optional[list[Table]] = None


Expand Down Expand Up @@ -345,6 +347,107 @@ def from_dict(cls, data):
return data


class DocumentToken(Enum):
    """LLM-friendly special tokens used to serialize a Document.

    Each member value is an XML-like begin/end tag; additional tokens
    (table rows/cols, pages, locations, section-header levels) are
    generated dynamically by :meth:`get_special_tokens`.
    """

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ):
        """Return the full list of special document tokens.

        Args:
            max_rows: maximum number of table rows to generate tags for.
            max_cols: maximum number of table columns to generate tags for.
            max_pages: maximum (1-based) page number to generate tags for.
            page_dimension: (width, height) of the quantized location grid.

        Returns:
            list[str]: all static enum token values plus the dynamically
            generated row/col, section-header, page, and location tokens.
        """
        special_tokens = [token.value for token in cls]

        # Dynamically generated table row and column tokens.
        for i in range(max_rows):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(max_cols):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        # Section headers support nesting levels 0-5.
        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # Page tokens: page numbers are 1-based in provenance data, so the
        # range must include max_pages itself; 0 is kept for completeness.
        for i in range(max_pages + 1):
            special_tokens.append(f"<page_{i}>")

        # Location tokens: get_location_token clamps into [0, rnorm] and can
        # therefore emit the endpoint (e.g. "<loc_100>"), so the range must
        # be inclusive of the maximum dimension.
        for i in range(max(page_dimension[0], page_dimension[1]) + 1):
            special_tokens.append(f"<loc_{i}>")

        return special_tokens

    @staticmethod
    def get_page_token(page: int):
        """Return the token for the given (1-based) page number."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100):
        """Quantize a normalized coordinate to a location token.

        Args:
            val: coordinate normalized to [0, 1] (out-of-range values are
                clamped).
            rnorm: resolution of the quantization grid.

        Returns:
            str: a "<loc_N>" token with N in [0, rnorm].
        """
        val_ = round(rnorm * val)

        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"


class ExportedCCSDocument(
MinimalDocument,
Generic[
Expand Down Expand Up @@ -427,6 +530,14 @@ def export_to_markdown(
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
"table",
],
strict_text: bool = False,
) -> str:
r"""Serialize to Markdown.

Expand Down Expand Up @@ -461,12 +572,7 @@ def export_to_markdown(
continue

item_type = item.obj_type
if isinstance(item, BaseText) and item_type in {
"title",
"subtitle-level-1",
"paragraph",
"caption",
}:
if isinstance(item, BaseText) and item_type in main_text_labels:
text = item.text

# ignore repeated text
Expand All @@ -477,20 +583,31 @@ def export_to_markdown(

# first title match
if item_type == "title" and not has_title:
markdown_text = f"# {text}"
if strict_text:
markdown_text = f"{text}"
else:
markdown_text = f"# {text}"
has_title = True

# secondary titles
elif item_type in {"title", "subtitle-level-1"} or (
has_title and item_type == "title"
):
markdown_text = f"## {text}"
if strict_text:
markdown_text = f"{text}"
else:
markdown_text = f"## {text}"

# normal text
else:
markdown_text = text

elif isinstance(item, Table) and item.data:
elif (
isinstance(item, Table)
and item.data
and item_type in main_text_labels
and not strict_text
):
table = []
for row in item.data:
tmp = []
Expand Down Expand Up @@ -518,3 +635,157 @@ def export_to_markdown(

result = delim.join(md_texts)
return result

def export_to_document_tokens(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    page_tagging: bool = True,
    location_tagging: bool = True,
    location_dimensions: Tuple[int, int] = (100, 100),
    add_new_line: bool = True,
) -> str:
    r"""Export the document content to a DocumentToken format.

    Operates on a slice of the document's main_text as defined through arguments
    main_text_start and main_text_stop; defaulting to the whole main_text.

    Args:
        delim (str, optional): The delimiter used to separate text blocks in the
            exported XML. Default is two newline characters ("\n\n").
        main_text_start (int, optional): The starting index of the main text to
            be included in the XML. Default is 0 (the beginning of the text).
        main_text_stop (Optional[int], optional): The stopping index of the main
            text. If set to None, the export includes text up to the end.
            Default is None.
        main_text_labels (Optional[list[str]], optional): The text labels that
            categorize the different sections of the document (e.g., "title",
            "subtitle-level-1", "paragraph", "caption"). If None, defaults to
            ["title", "subtitle-level-1", "paragraph", "caption", "table",
            "figure"].
        page_tagging (bool, optional): Whether to include a page token in each
            item's location tag. Default is True.
        location_tagging (bool, optional): Determines whether to include
            location-based tagging in the XML. If True, the exported XML will
            contain information about the locations of the text elements.
            Default is True.
        location_dimensions (Tuple[int, int], optional): Specifies the dimensions
            (width and height) for the location tagging, if enabled.
            Default is (100, 100).
        add_new_line (bool, optional): Whether to add new line characters after
            each text block. If True, a new line is added after each block of
            text in the XML. Default is True.

    Returns:
        str: The content of the document formatted as an XML string.
    """
    # None sentinel instead of a mutable default argument (shared list
    # across calls is a classic Python pitfall).
    if main_text_labels is None:
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ]

    xml_str = DocumentToken.BEG_DOCUMENT.value

    new_line = "\n" if add_new_line else ""

    if self.main_text is not None:
        for orig_item in self.main_text[main_text_start:main_text_stop]:

            # Dereference Ref items into their target objects.
            item = (
                self._resolve_ref(orig_item)
                if isinstance(orig_item, Ref)
                else orig_item
            )

            if item is None:
                continue

            prov = item.prov

            loc_str = ""  # default: no location tag
            if (
                location_tagging
                and self.page_dimensions is not None
                and prov is not None
                and len(prov) > 0
            ):

                page = prov[0].page
                # Page numbers are 1-based; page_dimensions is 0-indexed.
                page_dim = self.page_dimensions[page - 1]

                page_w = float(page_dim.width)
                page_h = float(page_dim.height)

                # Normalize the bbox corners to page-relative coordinates.
                x0 = float(prov[0].bbox[0]) / page_w
                y0 = float(prov[0].bbox[1]) / page_h
                x1 = float(prov[0].bbox[2]) / page_w
                y1 = float(prov[0].bbox[3]) / page_h

                page_tok = ""
                if page_tagging:
                    page_tok = DocumentToken.get_page_token(page=page)

                # min/max ordering guarantees a well-formed box even if the
                # stored bbox corners are swapped.
                x0_tok = DocumentToken.get_location_token(
                    val=min(x0, x1), rnorm=location_dimensions[0]
                )
                y0_tok = DocumentToken.get_location_token(
                    val=min(y0, y1), rnorm=location_dimensions[1]
                )
                x1_tok = DocumentToken.get_location_token(
                    val=max(x0, x1), rnorm=location_dimensions[0]
                )
                y1_tok = DocumentToken.get_location_token(
                    val=max(y0, y1), rnorm=location_dimensions[1]
                )

                # Assemble the location tag.
                loc_str = f"{DocumentToken.BEG_LOCATION.value}"
                loc_str += f"{page_tok}"
                loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
                loc_str += f"{DocumentToken.END_LOCATION.value}"

            item_type = item.obj_type
            if isinstance(item, BaseText) and (item_type in main_text_labels):
                text = item.text

                xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"

            elif isinstance(item, Table) and (item_type in main_text_labels):

                xml_str += f"<{item_type}>{loc_str}"

                # Optional table caption.
                if item.text is not None and len(item.text) > 0:
                    xml_str += f"{DocumentToken.BEG_CAPTION.value}"
                    xml_str += (
                        f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                    )

                # Table body as <row_i>/<col_j> cells.
                if item.data is not None and len(item.data) > 0:
                    for i, row in enumerate(item.data):
                        xml_str += f"<row_{i}>"
                        for j, col in enumerate(row):
                            text = col.text
                            xml_str += f"<col_{j}>{text}</col_{j}>"

                        xml_str += f"</row_{i}>{new_line}"

                xml_str += f"</{item_type}>{new_line}"

            elif isinstance(item, Figure) and (item_type in main_text_labels):

                xml_str += f"<{item_type}>{loc_str}"

                # Optional figure caption.
                if item.text is not None and len(item.text) > 0:
                    xml_str += f"{DocumentToken.BEG_CAPTION.value}"
                    xml_str += (
                        f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                    )

                xml_str += f"</{item_type}>{new_line}"

    xml_str += DocumentToken.END_DOCUMENT.value

    return xml_str
Loading