Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docling_core/types/doc/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ class GlmTableCell(TableCell):
class BaseCell(AliasModel):
"""Base cell."""

# FIXME: we need to check why we have bounding_box (this should be in prov)
bounding_box: Optional[BoundingBoxContainer] = Field(
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
)
Expand All @@ -152,6 +153,11 @@ class Table(BaseCell):
model: Optional[str] = None


# FIXME: let's add some figure specific data-types later
class Figure(BaseCell):
    """A document figure; currently adds no fields beyond those of BaseCell."""


class BaseText(AliasModel):
"""Base model for text objects."""

Expand Down
293 changes: 282 additions & 11 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
"""Models for the Docling Document data type."""

from datetime import datetime
from typing import Generic, Optional, Union
from enum import Enum
from typing import Generic, Optional, Tuple, Union

from pydantic import (
AnyHttpUrl,
Expand Down Expand Up @@ -35,6 +36,7 @@
BaseCell,
BaseText,
BitmapObject,
Figure,
PageDimensions,
PageReference,
Ref,
Expand Down Expand Up @@ -275,7 +277,7 @@ class MinimalDocument(
main_text: Optional[list[Union[Ref, BaseText]]] = Field(
default=None, alias="main-text"
)
figures: Optional[list[BaseCell]] = None
figures: Optional[list[Figure]] = None
tables: Optional[list[Table]] = None


Expand Down Expand Up @@ -345,6 +347,107 @@ def from_dict(cls, data):
return data


class DocumentToken(Enum):
    """LLM-friendly special tokens used to serialize a Document.

    Each member value is an XML-like begin/end tag; additional tokens
    (table rows/cols, pages, locations, section-header levels) are
    generated dynamically by :meth:`get_special_tokens`.
    """

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ):
        """Return the full list of special document tokens.

        Args:
            max_rows: maximum number of table rows to generate tags for.
            max_cols: maximum number of table columns to generate tags for.
            max_pages: maximum (1-based) page number to generate tags for.
            page_dimension: (width, height) of the quantized location grid.

        Returns:
            list[str]: all static enum token values plus the dynamically
            generated row/col, section-header, page, and location tokens.
        """
        special_tokens = [token.value for token in cls]

        # Dynamically generated table row and column tokens.
        for i in range(max_rows):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(max_cols):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        # Section headers support nesting levels 0-5.
        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # Page tokens: page numbers are 1-based in provenance data, so the
        # range must include max_pages itself; 0 is kept for completeness.
        for i in range(max_pages + 1):
            special_tokens.append(f"<page_{i}>")

        # Location tokens: get_location_token clamps into [0, rnorm] and can
        # therefore emit the endpoint (e.g. "<loc_100>"), so the range must
        # be inclusive of the maximum dimension.
        for i in range(max(page_dimension[0], page_dimension[1]) + 1):
            special_tokens.append(f"<loc_{i}>")

        return special_tokens

    @staticmethod
    def get_page_token(page: int):
        """Return the token for the given (1-based) page number."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100):
        """Quantize a normalized coordinate to a location token.

        Args:
            val: coordinate normalized to [0, 1] (out-of-range values are
                clamped).
            rnorm: resolution of the quantization grid.

        Returns:
            str: a "<loc_N>" token with N in [0, rnorm].
        """
        val_ = round(rnorm * val)

        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"


class ExportedCCSDocument(
MinimalDocument,
Generic[
Expand Down Expand Up @@ -427,6 +530,14 @@ def export_to_markdown(
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
"table",
],
strict_text: bool = False,
) -> str:
r"""Serialize to Markdown.

Expand Down Expand Up @@ -461,12 +572,7 @@ def export_to_markdown(
continue

item_type = item.obj_type
if isinstance(item, BaseText) and item_type in {
"title",
"subtitle-level-1",
"paragraph",
"caption",
}:
if isinstance(item, BaseText) and item_type in main_text_labels:
text = item.text

# ignore repeated text
Expand All @@ -477,20 +583,31 @@ def export_to_markdown(

# first title match
if item_type == "title" and not has_title:
markdown_text = f"# {text}"
if strict_text:
markdown_text = f"{text}"
else:
markdown_text = f"# {text}"
has_title = True

# secondary titles
elif item_type in {"title", "subtitle-level-1"} or (
has_title and item_type == "title"
):
markdown_text = f"## {text}"
if strict_text:
markdown_text = f"{text}"
else:
markdown_text = f"## {text}"

# normal text
else:
markdown_text = text

elif isinstance(item, Table) and item.data:
elif (
isinstance(item, Table)
and item.data
and item_type in main_text_labels
and not strict_text
):
table = []
for row in item.data:
tmp = []
Expand Down Expand Up @@ -518,3 +635,157 @@ def export_to_markdown(

result = delim.join(md_texts)
return result

def export_to_document_tokens(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    page_tagging: bool = True,
    location_tagging: bool = True,
    location_dimensions: Tuple[int, int] = (100, 100),
    add_new_line: bool = True,
) -> str:
    r"""Export the document content to a DocumentToken format.

    Operates on a slice of the document's main_text as defined through arguments
    main_text_start and main_text_stop; defaulting to the whole main_text.

    Args:
        delim (str, optional): The delimiter used to separate text blocks in the
            exported XML. Default is two newline characters ("\n\n").
        main_text_start (int, optional): The starting index of the main text to
            be included in the XML. Default is 0 (the beginning of the text).
        main_text_stop (Optional[int], optional): The stopping index of the main
            text. If set to None, the export includes text up to the end.
            Default is None.
        main_text_labels (Optional[list[str]], optional): The text labels that
            categorize the different sections of the document (e.g., "title",
            "subtitle-level-1", "paragraph", "caption"). If None, defaults to
            ["title", "subtitle-level-1", "paragraph", "caption", "table",
            "figure"].
        page_tagging (bool, optional): Whether to include a page token in each
            item's location tag. Default is True.
        location_tagging (bool, optional): Determines whether to include
            location-based tagging in the XML. If True, the exported XML will
            contain information about the locations of the text elements.
            Default is True.
        location_dimensions (Tuple[int, int], optional): Specifies the dimensions
            (width and height) for the location tagging, if enabled.
            Default is (100, 100).
        add_new_line (bool, optional): Whether to add new line characters after
            each text block. If True, a new line is added after each block of
            text in the XML. Default is True.

    Returns:
        str: The content of the document formatted as an XML string.
    """
    # None sentinel instead of a mutable default argument (shared list
    # across calls is a classic Python pitfall).
    if main_text_labels is None:
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ]

    xml_str = DocumentToken.BEG_DOCUMENT.value

    new_line = "\n" if add_new_line else ""

    if self.main_text is not None:
        for orig_item in self.main_text[main_text_start:main_text_stop]:

            # Dereference Ref items into their target objects.
            item = (
                self._resolve_ref(orig_item)
                if isinstance(orig_item, Ref)
                else orig_item
            )

            if item is None:
                continue

            prov = item.prov

            loc_str = ""  # default: no location tag
            if (
                location_tagging
                and self.page_dimensions is not None
                and prov is not None
                and len(prov) > 0
            ):

                page = prov[0].page
                # Page numbers are 1-based; page_dimensions is 0-indexed.
                page_dim = self.page_dimensions[page - 1]

                page_w = float(page_dim.width)
                page_h = float(page_dim.height)

                # Normalize the bbox corners to page-relative coordinates.
                x0 = float(prov[0].bbox[0]) / page_w
                y0 = float(prov[0].bbox[1]) / page_h
                x1 = float(prov[0].bbox[2]) / page_w
                y1 = float(prov[0].bbox[3]) / page_h

                page_tok = ""
                if page_tagging:
                    page_tok = DocumentToken.get_page_token(page=page)

                # min/max ordering guarantees a well-formed box even if the
                # stored bbox corners are swapped.
                x0_tok = DocumentToken.get_location_token(
                    val=min(x0, x1), rnorm=location_dimensions[0]
                )
                y0_tok = DocumentToken.get_location_token(
                    val=min(y0, y1), rnorm=location_dimensions[1]
                )
                x1_tok = DocumentToken.get_location_token(
                    val=max(x0, x1), rnorm=location_dimensions[0]
                )
                y1_tok = DocumentToken.get_location_token(
                    val=max(y0, y1), rnorm=location_dimensions[1]
                )

                # Assemble the location tag.
                loc_str = f"{DocumentToken.BEG_LOCATION.value}"
                loc_str += f"{page_tok}"
                loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
                loc_str += f"{DocumentToken.END_LOCATION.value}"

            item_type = item.obj_type
            if isinstance(item, BaseText) and (item_type in main_text_labels):
                text = item.text

                xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"

            elif isinstance(item, Table) and (item_type in main_text_labels):

                xml_str += f"<{item_type}>{loc_str}"

                # Optional table caption.
                if item.text is not None and len(item.text) > 0:
                    xml_str += f"{DocumentToken.BEG_CAPTION.value}"
                    xml_str += (
                        f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                    )

                # Table body as <row_i>/<col_j> cells.
                if item.data is not None and len(item.data) > 0:
                    for i, row in enumerate(item.data):
                        xml_str += f"<row_{i}>"
                        for j, col in enumerate(row):
                            text = col.text
                            xml_str += f"<col_{j}>{text}</col_{j}>"

                        xml_str += f"</row_{i}>{new_line}"

                xml_str += f"</{item_type}>{new_line}"

            elif isinstance(item, Figure) and (item_type in main_text_labels):

                xml_str += f"<{item_type}>{loc_str}"

                # Optional figure caption.
                if item.text is not None and len(item.text) > 0:
                    xml_str += f"{DocumentToken.BEG_CAPTION.value}"
                    xml_str += (
                        f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                    )

                xml_str += f"</{item_type}>{new_line}"

    xml_str += DocumentToken.END_DOCUMENT.value

    return xml_str
Loading