Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 129 additions & 61 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5683,69 +5683,137 @@ def validate_misplaced_list_items(self):
)
return self

class _DocIndex(BaseModel):
"""A document merge buffer."""

groups: list[GroupItem] = []
texts: list[TextItem] = []
pictures: list[PictureItem] = []
tables: list[TableItem] = []
key_value_items: list[KeyValueItem] = []
form_items: list[FormItem] = []

pages: dict[int, PageItem] = {}

_body: Optional[GroupItem] = None
_max_page: int = 0
_names: list[str] = []

def get_item_list(self, key: str) -> list[NodeItem]:
return getattr(self, key)

def index(self, doc: "DoclingDocument") -> None:

orig_ref_to_new_ref: dict[str, str] = {}
page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0

if self._body is None:
self._body = GroupItem(**doc.body.model_dump(exclude={"children"}))

self._names.append(doc.name)

# collect items in traversal order
for item, _ in doc.iterate_items(
with_groups=True,
traverse_pictures=True,
included_content_layers={c for c in ContentLayer},
):
key = item.self_ref.split("/")[1]
is_body = key == "body"
new_cref = (
"#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}"
)
# register cref mapping:
orig_ref_to_new_ref[item.self_ref] = new_cref

if not is_body:
new_item = copy.deepcopy(item)
new_item.children = []

# put item in the right list
self.get_item_list(key).append(new_item)

# update item's self reference
new_item.self_ref = new_cref

if isinstance(new_item, DocItem):
# update page numbers
# NOTE other prov sources (e.g. GraphCell) currently not covered
for prov in new_item.prov:
prov.page_no += page_delta

if item.parent:
# set item's parent
new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
new_item.parent = RefItem(cref=new_parent_cref)

# add item to parent's children
path_components = new_parent_cref.split("/")
num_components = len(path_components)
if num_components == 3:
_, parent_key, parent_index_str = path_components
parent_index = int(parent_index_str)
parent_item = self.get_item_list(parent_key)[parent_index]

# update captions field (not possible in iterate_items order):
if isinstance(parent_item, FloatingItem):
for cap_it, cap in enumerate(parent_item.captions):
if cap.cref == item.self_ref:
parent_item.captions[cap_it] = RefItem(
cref=new_cref
)
break

elif num_components == 2 and path_components[1] == "body":
parent_item = self._body
else:
raise RuntimeError(
f"Unsupported ref format: {new_parent_cref}"
)
parent_item.children.append(RefItem(cref=new_cref))

# update pages
new_max_page = None
for page_nr in doc.pages:
new_page = copy.deepcopy(doc.pages[page_nr])
new_page_nr = page_nr + page_delta
new_page.page_no = new_page_nr
self.pages[new_page_nr] = new_page
if new_max_page is None or new_page_nr > new_max_page:
new_max_page = new_page_nr
if new_max_page is not None:
self._max_page = new_max_page

def get_name(self) -> str:
return " + ".join(self._names)

def _update_from_index(self, doc_index: "_DocIndex") -> None:
if doc_index._body is not None:
self.body = doc_index._body
self.groups = doc_index.groups
self.texts = doc_index.texts
self.pictures = doc_index.pictures
self.tables = doc_index.tables
self.key_value_items = doc_index.key_value_items
self.form_items = doc_index.form_items
self.pages = doc_index.pages
self.name = doc_index.get_name()

def _normalize_references(self) -> None:
"""Normalize ref numbering by ordering node items as per iterate_items()."""
new_body = GroupItem(**self.body.model_dump(exclude={"children"}))

item_lists: dict[str, list[NodeItem]] = {
"groups": [],
"texts": [],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
}
orig_ref_to_new_ref: dict[str, str] = {}
doc_index = DoclingDocument._DocIndex()
doc_index.index(doc=self)
self._update_from_index(doc_index)

# collect items in traversal order
for item, _ in self.iterate_items(
with_groups=True,
traverse_pictures=True,
included_content_layers={c for c in ContentLayer},
):
key = item.self_ref.split("/")[1]
is_body = key == "body"
new_cref = "#/body" if is_body else f"#/{key}/{len(item_lists[key])}"
# register cref mapping:
orig_ref_to_new_ref[item.self_ref] = new_cref

if not is_body:
new_item = copy.deepcopy(item)
new_item.children = []

# put item in the right list
item_lists[key].append(new_item)

# update item's self reference
new_item.self_ref = new_cref

if item.parent:
# set item's parent
new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
new_item.parent = RefItem(cref=new_parent_cref)

# add item to parent's children
path_components = new_parent_cref.split("/")
num_components = len(path_components)
parent_node: NodeItem
if num_components == 3:
_, parent_key, parent_index_str = path_components
parent_index = int(parent_index_str)
parent_node = item_lists[parent_key][parent_index]
elif num_components == 2 and path_components[1] == "body":
parent_node = new_body
else:
raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
parent_node.children.append(RefItem(cref=new_cref))

# update document
self.groups = item_lists["groups"] # type: ignore
self.texts = item_lists["texts"] # type: ignore
self.pictures = item_lists["pictures"] # type: ignore
self.tables = item_lists["tables"] # type: ignore
self.key_value_items = item_lists["key_value_items"] # type: ignore
self.form_items = item_lists["form_items"] # type: ignore
self.body = new_body
@classmethod
def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
"""Concatenate multiple documents into a single document."""
doc_index = DoclingDocument._DocIndex()
for doc in docs:
doc_index.index(doc=doc)

res_doc = DoclingDocument(name=" + ".join([doc.name for doc in docs]))
res_doc._update_from_index(doc_index)
return res_doc


# deprecated aliases (kept for backwards compatibility):
Expand Down
Loading
Loading