Skip to content

Commit 1d969d4

Browse files
authored
fix: fix document re-indexing (#510)
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
1 parent 2793dda commit 1d969d4

File tree

3 files changed

+33
-16
lines changed

3 files changed

+33
-16
lines changed

docling_core/types/doc/document.py

Lines changed: 24 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -6185,6 +6185,10 @@ def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) ->
61856185

61866186
self._names.append(doc.name)
61876187

6188+
# record starting indices so post-processing only touches new items
6189+
post_processing_keys = ["texts", "pictures", "tables", "key_value_items", "form_items"]
6190+
start_indices = {k: len(self.get_item_list(k)) for k in post_processing_keys}
6191+
61886192
# collect items in traversal order
61896193
for item, _ in doc._iterate_items_with_stack(
61906194
with_groups=True,
@@ -6242,13 +6246,6 @@ def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) ->
62426246
parent_index = int(parent_index_str)
62436247
parent_item = self.get_item_list(parent_key)[parent_index]
62446248

6245-
# update captions field (not possible in iterate_items order):
6246-
if isinstance(parent_item, FloatingItem):
6247-
for cap_it, cap in enumerate(parent_item.captions):
6248-
if cap.cref == item.self_ref:
6249-
parent_item.captions[cap_it] = RefItem(cref=new_cref)
6250-
break
6251-
62526249
# update rich table cells references:
62536250
if isinstance(parent_item, TableItem):
62546251
for cell in parent_item.data.table_cells:
@@ -6262,6 +6259,26 @@ def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) ->
62626259
raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
62636260
parent_item.children.append(RefItem(cref=new_cref))
62646261

6262+
# rewrite FloatingItem explicit refs starting from start_indices to avoid corrupting items from prior calls
6263+
for key in post_processing_keys:
6264+
for idx_item in self.get_item_list(key)[start_indices[key] :]:
6265+
if isinstance(idx_item, FloatingItem):
6266+
idx_item.captions = [
6267+
RefItem(cref=orig_ref_to_new_ref[cap.cref])
6268+
for cap in idx_item.captions
6269+
if cap.cref in orig_ref_to_new_ref
6270+
]
6271+
idx_item.references = [
6272+
RefItem(cref=orig_ref_to_new_ref[ref.cref])
6273+
for ref in idx_item.references
6274+
if ref.cref in orig_ref_to_new_ref
6275+
]
6276+
idx_item.footnotes = [
6277+
RefItem(cref=orig_ref_to_new_ref[fn.cref])
6278+
for fn in idx_item.footnotes
6279+
if fn.cref in orig_ref_to_new_ref
6280+
]
6281+
62656282
# update pages
62666283
new_max_page = None
62676284
for page_nr in doc.pages:

test/data/doc/concatenated.html

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ <h2>1 Introduction</h2>
109109
<p>Copyright © 2025, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.</p>
110110
<p>The following list summarizes the features currently available on Docling:</p>
111111
<ul>
112+
<li style="list-style-type: '· ';">Parses common document formats (PDF, Images, MS Office formats, HTML) and exports to Markdown, JSON, and HTML.</li>
113+
<li style="list-style-type: '· ';">Applies advanced AI for document understanding, including detailed page layout, OCR, reading order, figure extraction, and table structure recognition.</li>
114+
<li style="list-style-type: '· ';">Establishes a unified DoclingDocument data model for rich document representation and operations.</li>
112115
<li style="list-style-type: '· ';">Provides fully local execution capabilities making it suitable for sensitive data and air-gapped environments.</li>
113116
<li style="list-style-type: '· ';">Has an ecosystem of plug-and-play integrations with prominent generative AI development frameworks, including LangChain and LlamaIndex.</li>
114117
<li style="list-style-type: '· ';">Can leverage hardware accelerators such as GPUs.</li>
@@ -343,12 +346,9 @@ <h2>1. Introduction</h2>
343346
</li>
344347
<li style="list-style-type: '■ ';">list item 4</li>
345348
</ul>
346-
<p>This is the caption of table 1.</p>
347-
<table><caption><div class="caption">Parses common document formats (PDF, Images, MS Office formats, HTML) and exports to Markdown, JSON, and HTML.</div></caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
348-
<p>This is the caption of figure 1.</p>
349-
<figure><figcaption><div class="caption">Applies advanced AI for document understanding, including detailed page layout, OCR, reading order, figure extraction, and table structure recognition.</div></figcaption></figure>
350-
<p>This is the caption of figure 2.</p>
351-
<figure><figcaption><div class="caption">Establishes a unified DoclingDocument data model for rich document representation and operations.</div></figcaption><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIklEQVR4nO3BAQ0AAADCoPdPbQ8HFAAAAAAAAAAAAAAA8G4wQAABiwCo9wAAAABJRU5ErkJggg=="></figure>
349+
<table><caption><div class="caption">This is the caption of table 1.</div></caption><tbody><tr><td rowspan="2">Product</td><td colspan="2">Years</td></tr><tr><td>2016</td><td>2017</td></tr><tr><td>Apple</td><td>49823</td><td>695944</td></tr></tbody></table>
350+
<figure><figcaption><div class="caption">This is the caption of figure 1.</div></figcaption></figure>
351+
<figure><figcaption><div class="caption">This is the caption of figure 2.</div></figcaption><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIklEQVR4nO3BAQ0AAADCoPdPbQ8HFAAAAAAAAAAAAAAA8G4wQAABiwCo9wAAAABJRU5ErkJggg=="></figure>
352352
<ul>
353353
<li style="list-style-type: '■ ';">item 1 of list</li>
354354
</ul>

test/data/doc/concatenated.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12760,7 +12760,7 @@
1276012760
"prov": [],
1276112761
"captions": [
1276212762
{
12763-
"$ref": "#/texts/15"
12763+
"$ref": "#/texts/303"
1276412764
}
1276512765
],
1276612766
"references": [],
@@ -12778,7 +12778,7 @@
1277812778
"prov": [],
1277912779
"captions": [
1278012780
{
12781-
"$ref": "#/texts/16"
12781+
"$ref": "#/texts/304"
1278212782
}
1278312783
],
1278412784
"references": [],
@@ -13930,7 +13930,7 @@
1393013930
"prov": [],
1393113931
"captions": [
1393213932
{
13933-
"$ref": "#/texts/14"
13933+
"$ref": "#/texts/302"
1393413934
}
1393513935
],
1393613936
"references": [],

0 commit comments

Comments
 (0)