Skip to content

Commit b531c27

Browse files
committed
fix: generate page_break for skipped pages in export functions
When pages fail to parse and are missing from the document, the serializer now generates page_break markers for each skipped page number instead of only for the next available page. This ensures users can detect when pages were skipped during PDF parsing by checking for non-consecutive page breaks. Affects: export_to_doctags(), export_to_html(), export_to_markdown()
1 parent a0ded21 commit b531c27

1 file changed

Lines changed: 38 additions & 11 deletions

File tree

docling_core/transforms/serializer/common.py

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,36 @@ class _PageBreakSerResult(SerializationResult):
7878
node: _PageBreakNode
7979

8080

81+
def _yield_page_breaks(
    prev_page: int,
    next_page: int,
    lvl: int,
    start_index: int,
) -> Iterable[Tuple[_PageBreakNode, int, int]]:
    """Emit one page-break node per page boundary between two page numbers.

    Every page in the half-open range (prev_page, next_page] gets its own
    break, so a jump such as 78 -> 82 produces four breaks
    (78->79, 79->80, 80->81, 81->82) rather than a single 78->82 break.
    Nothing is yielded when next_page is not greater than prev_page.

    Args:
        prev_page: Page number of the last page already seen.
        next_page: Page number of the page being entered.
        lvl: Nesting level, passed through unchanged with every node.
        start_index: Index used in the ``#/pb/<index>`` reference of the
            first break; subsequent breaks count up from it.

    Yields:
        ``(page_break_node, lvl, next_index)`` tuples, where ``next_index``
        is the index the following page break should use (the index of the
        node just yielded, plus one).
    """
    upcoming_pages = range(prev_page + 1, next_page + 1)
    for node_index, page_no in enumerate(upcoming_pages, start=start_index):
        page_break = _PageBreakNode(
            self_ref=f"#/pb/{node_index}",
            prev_page=page_no - 1,
            next_page=page_no,
        )
        # Third element is the running counter, so the caller can resume
        # numbering after the last break it consumed.
        yield page_break, lvl, node_index + 1
109+
110+
81111
def _iterate_items(
82112
doc: DoclingDocument,
83113
layers: Optional[set[ContentLayer]],
@@ -113,22 +143,19 @@ def _iterate_items(
113143
if isinstance(it, DocItem) and it.prov:
114144
page_no = it.prov[0].page_no
115145
if prev_page_nr is not None and page_no > prev_page_nr:
116-
yield _PageBreakNode(
117-
self_ref=f"#/pb/{page_break_i}",
118-
prev_page=prev_page_nr,
119-
next_page=page_no,
120-
), lvl
146+
for pb_node, pb_lvl, page_break_i in _yield_page_breaks(
147+
prev_page_nr, page_no, lvl, page_break_i
148+
):
149+
yield pb_node, pb_lvl
121150
break
122151
elif isinstance(item, DocItem) and item.prov:
123152
page_no = item.prov[0].page_no
124153
if prev_page_nr is None or page_no > prev_page_nr:
125154
if prev_page_nr is not None: # close previous range
126-
yield _PageBreakNode(
127-
self_ref=f"#/pb/{page_break_i}",
128-
prev_page=prev_page_nr,
129-
next_page=page_no,
130-
), lvl
131-
page_break_i += 1
155+
for pb_node, pb_lvl, page_break_i in _yield_page_breaks(
156+
prev_page_nr, page_no, lvl, page_break_i
157+
):
158+
yield pb_node, pb_lvl
132159
prev_page_nr = page_no
133160
yield item, lvl
134161

0 commit comments

Comments
 (0)