Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 58 additions & 17 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,51 @@ class _PageBreakSerResult(SerializationResult):
node: _PageBreakNode


def _yield_page_breaks(
    prev_page: int,
    next_page: int,
    lvl: int,
    start_index: int,
    page_numbers: Optional[set[int]] = None,
) -> Iterable[tuple[_PageBreakNode, int, int]]:
    """Yield one page-break node per page transition in (prev_page, next_page].

    For example, with prev_page=1 and next_page=4 and page_numbers covering
    pages 1-4, three breaks are produced, landing on pages 2, 3, and 4.

    When page_numbers is given, transitions onto pages absent from that set
    are suppressed, so documents filtered down to a subset of pages (via
    filter()) do not emit spurious breaks for the excluded pages.

    Args:
        prev_page: Last page number already seen (1-based physical index).
        next_page: Current page number (1-based physical index).
        lvl: Nesting level attached to every yielded node.
        start_index: First index used for the page-break node IDs.
        page_numbers: Optional whitelist of existing page numbers; when
            provided, only pages contained in it produce a break.

    Yields:
        Tuples of (PageBreakNode, level, next_index), one per transition.
    """
    candidates = range(prev_page + 1, next_page + 1)
    if page_numbers is None:
        pages = list(candidates)
    else:
        # Drop transitions onto pages missing from the document's pages dict.
        pages = [p for p in candidates if p in page_numbers]
    for offset, page in enumerate(pages):
        node_id = start_index + offset
        node = _PageBreakNode(
            self_ref=f"#/pb/{node_id}",
            prev_page=page - 1,
            next_page=page,
        )
        yield node, lvl, node_id + 1


def _iterate_items(
doc: DoclingDocument,
layers: Optional[set[ContentLayer]],
Expand All @@ -90,6 +135,9 @@ def _iterate_items(
my_visited: set[str] = visited if visited is not None else set()
prev_page_nr: Optional[int] = None
page_break_i = 0
# Get the set of valid page numbers from the document's pages dict
# This ensures filtered documents don't generate spurious page breaks
page_numbers: set[int] = set(doc.pages.keys())
for item, lvl in doc.iterate_items(
root=node,
with_groups=True,
Expand All @@ -111,28 +159,21 @@ def _iterate_items(
if isinstance(it, DocItem) and it.prov:
page_no = it.prov[0].page_no
if prev_page_nr is not None and page_no > prev_page_nr:
yield (
_PageBreakNode(
self_ref=f"#/pb/{page_break_i}",
prev_page=prev_page_nr,
next_page=page_no,
),
lvl,
)
for pb_node, pb_lvl, page_break_i in _yield_page_breaks(
prev_page_nr, page_no, lvl, page_break_i, page_numbers
):
yield pb_node, pb_lvl
# update previous page number to avoid duplicate page breaks
prev_page_nr = page_no
break
elif isinstance(item, DocItem) and item.prov:
page_no = item.prov[0].page_no
if prev_page_nr is None or page_no > prev_page_nr:
if prev_page_nr is not None: # close previous range
yield (
_PageBreakNode(
self_ref=f"#/pb/{page_break_i}",
prev_page=prev_page_nr,
next_page=page_no,
),
lvl,
)
page_break_i += 1
for pb_node, pb_lvl, page_break_i in _yield_page_breaks(
prev_page_nr, page_no, lvl, page_break_i, page_numbers
):
yield pb_node, pb_lvl
prev_page_nr = page_no
yield item, lvl

Expand Down
75 changes: 40 additions & 35 deletions docling_core/transforms/serializer/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -963,7 +963,8 @@ def _serialize_page_img(page_img: Image):
html_content = "\n".join([p.text for p in parts if p.text])
next_page: Optional[int] = None
prev_full_match_end = 0
pages = {}
pages: dict[int, str] = {}

for full_match, prev_page, next_page in self._get_page_breaks(html_content):
this_match_start = html_content.find(full_match)
pages[prev_page] = html_content[prev_full_match_end:this_match_start]
Expand All @@ -975,50 +976,54 @@ def _serialize_page_img(page_img: Image):
elif applicable_pages is not None and len(applicable_pages) == 1:
pages[applicable_pages[0]] = html_content

# Determine pages to render from doc.pages (includes failed pages
# added by docling's _add_failed_pages_to_document)
if self.params.pages is not None:
pages_to_render = sorted(self.params.pages)
elif self.doc.pages:
pages_to_render = sorted(self.doc.pages.keys())
elif applicable_pages:
pages_to_render = sorted(applicable_pages)
else:
pages_to_render = []

html_parts.append("<table>")
html_parts.append("<tbody>")

vized_pages_dict: dict[Optional[int], Image] = {}
if visualizer:
vized_pages_dict = visualizer.get_visualization(doc=self.doc)

for page_no, page in pages.items():
if isinstance(page_no, int):
if applicable_pages is not None and page_no not in applicable_pages:
continue
page_img = self.doc.pages[page_no].image
vized_page = vized_pages_dict.get(page_no)

html_parts.append("<tr>")

html_parts.append("<td>")

if vized_page:
html_parts.append(_serialize_page_img(page_img=vized_page))
# short-cut: we already have the image in base64
elif (
(page_img is not None)
and isinstance(page_img, ImageRef)
and isinstance(page_img.uri, AnyUrl)
and page_img.uri.scheme == "data"
):
img_text = f'<img src="{page_img.uri}">'
html_parts.append(f"<figure>{img_text}</figure>")

elif (page_img is not None) and (page_img._pil is not None):
html_parts.append(_serialize_page_img(page_img=page_img._pil))
else:
html_parts.append("<figure>no page-image found</figure>")
for page_no in pages_to_render:
page_content = pages.get(page_no, "")
page_img = self.doc.pages[page_no].image
vized_page = vized_pages_dict.get(page_no)

html_parts.append("<tr>")
html_parts.append("<td>")

if vized_page:
html_parts.append(_serialize_page_img(page_img=vized_page))
elif (
(page_img is not None)
and isinstance(page_img, ImageRef)
and isinstance(page_img.uri, AnyUrl)
and page_img.uri.scheme == "data"
):
img_text = f'<img src="{page_img.uri}">'
html_parts.append(f"<figure>{img_text}</figure>")
elif (page_img is not None) and (page_img._pil is not None):
html_parts.append(_serialize_page_img(page_img=page_img._pil))
else:
html_parts.append("<figure>no page-image found</figure>")

html_parts.append("</td>")
html_parts.append("</td>")

html_parts.append("<td>")
html_parts.append(f"<div class='page'>\n{page}\n</div>")
html_parts.append("</td>")
html_parts.append("<td>")
html_parts.append(f"<div class='page'>\n{page_content}\n</div>")
html_parts.append("</td>")

html_parts.append("</tr>")
else:
raise ValueError("We need page-indices to leverage `split_page_view`")
html_parts.append("</tr>")

html_parts.append("</tbody>")
html_parts.append("</table>")
Expand Down
13 changes: 12 additions & 1 deletion test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html

Large diffs are not rendered by default.

Loading