Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 56 additions & 11 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6224,6 +6224,13 @@ def index(
self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None
) -> None:

if page_nrs is not None and (
unavailable_page_nrs := page_nrs - set(doc.pages.keys())
):
raise ValueError(
f"The following page numbers are not present in the document: {unavailable_page_nrs}"
)

orig_ref_to_new_ref: dict[str, str] = {}
page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0

Expand Down Expand Up @@ -6265,7 +6272,29 @@ def index(

if item.parent:
# set item's parent
new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
new_parent_cref = orig_ref_to_new_ref.get(item.parent.cref)
if new_parent_cref is None:

parent_ref = item.parent
while new_parent_cref is None and parent_ref is not None:
parent_ref = RefItem(
cref=parent_ref.resolve(doc).parent.cref
)
new_parent_cref = orig_ref_to_new_ref.get(
parent_ref.cref
)

if new_parent_cref is not None:
warnings.warn(
f"Parent {item.parent.cref} not found in indexed nodes, "
f"using ancestor {new_parent_cref} instead"
)
else:
warnings.warn(
"No ancestor found in indexed nodes, using body as parent"
)
new_parent_cref = "#/body"

new_item.parent = RefItem(cref=new_parent_cref)

# add item to parent's children
Expand Down Expand Up @@ -6355,38 +6384,54 @@ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
res_doc._update_from_index(doc_index)
return res_doc

def _validate_rules(self):
def _validate_rules(self, raise_on_error: bool = True):

def _handle(error: Exception):
if raise_on_error:
raise error
else:
warnings.warn(str(error))

def validate_furniture(doc: DoclingDocument):
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=DeprecationWarning)
has_furniture_children = len(doc.furniture.children) > 0
if has_furniture_children:
raise ValueError(
f"Deprecated furniture node {doc.furniture.self_ref} has children"
_handle(
ValueError(
f"Deprecated furniture node {doc.furniture.self_ref} has children"
),
)

def validate_list_group(doc: DoclingDocument, item: ListGroup):
for ref in item.children:
child = ref.resolve(doc)
if not isinstance(child, ListItem):
raise ValueError(
f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
_handle(
ValueError(
f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
),
)

def validate_list_item(doc: DoclingDocument, item: ListItem):
if item.parent is None:
raise ValueError(f"ListItem {item.self_ref} has no parent")
if not isinstance(item.parent.resolve(doc), ListGroup):
raise ValueError(
f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
_handle(
ValueError(f"ListItem {item.self_ref} has no parent"),
)
elif not isinstance(item.parent.resolve(doc), ListGroup):
_handle(
ValueError(
f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
),
)

def validate_group(doc: DoclingDocument, item: GroupItem):
if (
item.parent and not item.children
): # tolerate empty body, but not other groups
raise ValueError(f"Group {item.self_ref} has no children")
_handle(
ValueError(f"Group {item.self_ref} has no children"),
)

validate_furniture(self)

Expand Down
64 changes: 63 additions & 1 deletion test/test_docling_doc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
from collections import deque
from copy import deepcopy
from pathlib import Path
Expand Down Expand Up @@ -46,7 +47,7 @@
TextItem,
TitleItem,
)
from docling_core.types.doc.document import CURRENT_VERSION
from docling_core.types.doc.document import CURRENT_VERSION, PageItem

from .test_data_gen_flag import GEN_TEST_DATA

Expand Down Expand Up @@ -1904,3 +1905,64 @@ def test_filter_pages():
with open(exp_html_file, "r", encoding="utf-8") as f:
exp_html_data = f.read()
assert html_data == exp_html_data


def _create_doc_for_filtering():
    """Build a small two-page document used by the filtering/validation tests.

    Layout of the returned document:

    * pages 1 and 2, each 100x100 with no image;
    * "Text 1" under the body, with provenance on page 1;
    * a group attached to "Text 1" that is given no children;
    * "Text 2" under the body, with provenance on page 2.
    """

    def _prov_on(page_no):
        # Provenance on the given page with a (0, 0)-(100, 100) bounding box.
        whole_page = BoundingBox(l=0, t=0, r=100, b=100)
        return ProvenanceItem(page_no=page_no, bbox=whole_page, charspan=(0, 1))

    pages = {}
    for page_no in (1, 2):
        page_size = Size(width=100, height=100)
        pages[page_no] = PageItem(page_no=page_no, size=page_size, image=None)

    doc = DoclingDocument(name="", pages=pages)

    first_text = doc.add_text(
        text="Text 1",
        parent=doc.body,
        label=DocItemLabel.TEXT,
        prov=_prov_on(1),
    )
    # The group's parent is the page-1 text item; it is left without children.
    doc.add_group(parent=first_text)

    doc.add_text(
        text="Text 2",
        parent=doc.body,
        label=DocItemLabel.TEXT,
        prov=_prov_on(2),
    )
    return doc


def test_filter_pages_filtered_out_parent():
    """Filtering to page 2 must warn that a filtered-out parent was replaced.

    The fixture's "Text 1" item lives on page 1, so keeping only page 2 is
    expected to emit a warning that ``#/texts/0`` is missing and that the
    ``#/body`` ancestor is used instead.
    """
    expected_warning = "Parent #/texts/0 not found in indexed nodes, using ancestor #/body instead"
    doc = _create_doc_for_filtering()

    with pytest.warns(UserWarning, match=expected_warning):
        doc.filter(page_nrs={2})


def test_filter_invalid_pages():
    """Filtering by a page number the document does not have raises ValueError."""
    # The message contains "{3}", which is a regex quantifier unless escaped.
    expected_error = re.escape(
        "The following page numbers are not present in the document: {3}"
    )
    doc = _create_doc_for_filtering()

    # The fixture only defines pages 1 and 2.
    with pytest.raises(ValueError, match=expected_error):
        doc.filter(page_nrs={3})


def test_validate_rules():
    """``_validate_rules`` raises by default and only warns when asked to.

    The fixture contains a group without children, which is the rule
    violation both modes are expected to report.
    """
    doc = _create_doc_for_filtering()
    empty_group_msg = "Group #/groups/0 has no children"

    # Default mode: the violation surfaces as a ValueError.
    with pytest.raises(ValueError, match=empty_group_msg):
        doc._validate_rules()

    # Lenient mode: the same violation is reported as a UserWarning instead.
    with pytest.warns(UserWarning, match=empty_group_msg):
        doc._validate_rules(raise_on_error=False)
Loading