From 0b93e60a707632c85f7b592ee4a01974024dc5fb Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 9 Feb 2026 15:37:36 +0100 Subject: [PATCH 01/19] Implementation of HTML backend that uses headless browser (via playwright) to materialize HTML pages into images, and add provenances with bboxes to all elements in the converted docling document Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 399 ++++++++++++++++++++++++++- docling/datamodel/backend_options.py | 36 +++ pyproject.toml | 1 + uv.lock | 106 ++++++- 4 files changed, 537 insertions(+), 5 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 023d94679d..15da8e568c 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,10 +1,12 @@ import base64 import logging +import math import os import re import warnings from contextlib import contextmanager from copy import deepcopy +from dataclasses import dataclass from io import BytesIO from pathlib import Path from typing import Final, Optional, Union, cast @@ -14,6 +16,8 @@ from bs4 import BeautifulSoup, NavigableString, PageElement, Tag from bs4.element import PreformattedString from docling_core.types.doc import ( + BoundingBox, + CoordOrigin, DocItem, DocItemLabel, DoclingDocument, @@ -21,8 +25,10 @@ GroupItem, GroupLabel, PictureItem, + ProvenanceItem, RefItem, RichTableCell, + Size, TableCell, TableData, TableItem, @@ -123,6 +129,14 @@ **{k: {} for k in _CODE_TAG_SET}, } +_DATA_DOCLING_ID_ATTR: Final = "data-docling-id" + + +@dataclass(frozen=True) +class _RenderedBBox: + page_no: int + bbox: BoundingBox + class _Context(BaseModel): list_ordered_flag_by_ref: dict[str, bool] = {} @@ -134,6 +148,7 @@ class AnnotatedText(BaseModel): hyperlink: Union[AnyUrl, Path, None] = None formatting: Union[Formatting, None] = None code: bool = False + source_tag_id: Optional[str] = None class AnnotatedTextList(list): @@ -142,11 +157,13 @@ def to_single_text_element(self) -> AnnotatedText: current_text = "" current_f = None current_code = False + current_source_tag_id = None for at in self: t = at.text h = at.hyperlink f = at.formatting c = at.code + s = at.source_tag_id current_text += t.strip() + " " if f is not None and current_f is None: current_f = f @@ -160,6 +177,18 @@ def to_single_text_element(self) -> AnnotatedText: _log.warning( f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'" ) + if s is not None and current_source_tag_id is None: + current_source_tag_id = s + elif ( + s is not None + and current_source_tag_id is not None + and s != current_source_tag_id + ): + _log.warning( + "Clashing provenance tags: " + f"'{s}' and '{current_source_tag_id}'! " + f"Chose '{current_source_tag_id}'" + ) current_code = c if c else current_code return AnnotatedText( @@ -167,6 +196,7 @@ def to_single_text_element(self) -> AnnotatedText: hyperlink=current_h, formatting=current_f, code=current_code, + source_tag_id=current_source_tag_id, ) def simplify_text_elements(self) -> "AnnotatedTextList": @@ -177,12 +207,14 @@ def simplify_text_elements(self) -> "AnnotatedTextList": hyperlink = self[0].hyperlink formatting = self[0].formatting code = self[0].code + source_tag_id = self[0].source_tag_id last_elm = text for i in range(1, len(self)): if ( hyperlink == self[i].hyperlink and formatting == self[i].formatting and code == self[i].code + and source_tag_id == self[i].source_tag_id ): sep = " " if not self[i].text.strip() or not last_elm.strip(): @@ -192,7 +224,11 @@ def simplify_text_elements(self) -> "AnnotatedTextList": else: simplified.append( AnnotatedText( - text=text, hyperlink=hyperlink, formatting=formatting, code=code + text=text, + hyperlink=hyperlink, + formatting=formatting, + code=code, + source_tag_id=source_tag_id, ) ) text = self[i].text @@ -200,10 +236,15 @@ def simplify_text_elements(self) -> "AnnotatedTextList": hyperlink = self[i].hyperlink formatting = self[i].formatting code = self[i].code + source_tag_id = self[i].source_tag_id if text: simplified.append( AnnotatedText( - text=text, hyperlink=hyperlink, formatting=formatting, code=code + text=text, + hyperlink=hyperlink, + formatting=formatting, + code=code, + source_tag_id=source_tag_id, ) ) return simplified @@ -239,7 +280,9 @@ def __init__( self.options: HTMLBackendOptions self.soup: Optional[BeautifulSoup] = None self.path_or_stream: Union[BytesIO, Path] = path_or_stream - self.base_path: Optional[str] = str(options.source_uri) + self.base_path: Optional[str] = ( + str(options.source_uri) if options.source_uri is not None else None + ) # Initialize the parents for the hierarchy self.max_levels = 10 @@ -250,6 +293,11 @@ def __init__( self.parents[i] = None self.hyperlink: Union[AnyUrl, Path, None] = None self.format_tags: list[str] = [] + self._raw_html_bytes: Optional[bytes] = None + self._rendered_html: Optional[str] = None + self._rendered_bbox_by_id: dict[str, _RenderedBBox] = {} + self._rendered_page_images: list[Image.Image] = [] + self._rendered_page_size: Optional[Size] = None try: raw = ( @@ -257,6 +305,7 @@ def __init__( if isinstance(path_or_stream, BytesIO) else Path(path_or_stream).read_bytes() ) + self._raw_html_bytes = raw self.soup = BeautifulSoup(raw, "html.parser") except Exception as e: raise RuntimeError( @@ -297,6 +346,20 @@ def convert(self) -> DoclingDocument: ) doc = DoclingDocument(name=self.file.stem or "file", origin=origin) + if cast(HTMLBackendOptions, self.options).render_page: + self._render_with_browser() + if self._rendered_html: + self.soup = BeautifulSoup(self._rendered_html, "html.parser") + + if self._rendered_page_images and self._rendered_page_size: + render_dpi = cast(HTMLBackendOptions, self.options).render_dpi + for page_no, page_image in enumerate(self._rendered_page_images, start=1): + doc.add_page( + page_no=page_no, + size=self._rendered_page_size, + image=ImageRef.from_pil(image=page_image, dpi=render_dpi), + ) + assert self.soup is not None # set the title as furniture, since it is part of the document metadata title = self.soup.title @@ -343,6 +406,264 @@ def convert(self) -> DoclingDocument: self._walk(content, doc) return doc + def _get_render_page_size(self) -> tuple[int, int]: + options = cast(HTMLBackendOptions, self.options) + width = options.render_page_width + height = options.render_page_height + if options.render_page_orientation == "landscape": + width, height = height, width + return width, height + + def _coerce_base_url(self, value: str) -> str: + if HTMLDocumentBackend._is_remote_url(value) or value.startswith("file://"): + return value + return Path(value).resolve().as_uri() + + def _get_render_html_text(self) -> str: + if self._raw_html_bytes is None: + return "" + return self._raw_html_bytes.decode("utf-8", errors="replace") + + def _inject_base_tag(self, html_text: str, base_url: Optional[str]) -> str: + if not base_url: + return html_text + soup = BeautifulSoup(html_text, "html.parser") + if soup.head is None: + return html_text + if soup.head.find("base") is not None: + return html_text + base_tag = soup.new_tag("base", href=base_url) + soup.head.insert(0, base_tag) + return str(soup) + + def _pad_image(self, image: Image.Image, width: int, height: int) -> Image.Image: + if image.width == width and image.height == height: + return image + canvas = Image.new("RGB", (width, height), color=(255, 255, 255)) + canvas.paste(image, (0, 0)) + return canvas + + def _render_with_browser(self) -> None: + options = cast(HTMLBackendOptions, self.options) + if not options.render_page: + return + + try: + from playwright.sync_api import sync_playwright + except ImportError as exc: + raise RuntimeError( + "Playwright is required for HTML rendering. " + "Install it with 'pip install playwright' and run " + "'playwright install'." + ) from exc + + width, height = self._get_render_page_size() + self._rendered_page_size = Size(width=width, height=height) + + render_url: Optional[str] = None + render_html = self._get_render_html_text() + + if isinstance(self.path_or_stream, Path): + render_url = self.path_or_stream.resolve().as_uri() + elif self.base_path: + render_html = self._inject_base_tag( + render_html, self._coerce_base_url(self.base_path) + ) + + with sync_playwright() as playwright: + browser = playwright.chromium.launch(headless=True) + context = browser.new_context( + viewport={"width": width, "height": height}, + device_scale_factor=options.render_device_scale, + ) + page = context.new_page() + if options.render_print_media: + page.emulate_media(media="print") + + if render_url: + page.goto(render_url, wait_until=options.render_wait_until) + else: + page.set_content(render_html, wait_until=options.render_wait_until) + + if options.render_wait_ms: + page.wait_for_timeout(options.render_wait_ms) + + render_data = page.evaluate( + """ + () => { + const nodes = Array.from(document.querySelectorAll('*')); + const boxes = {}; + let idx = 0; + for (const node of nodes) { + idx += 1; + const id = String(idx); + node.setAttribute('data-docling-id', id); + const rect = node.getBoundingClientRect(); + if (!rect) { + continue; + } + const width = rect.width || 0; + const height = rect.height || 0; + if (width <= 0 && height <= 0) { + continue; + } + const x = rect.left + window.scrollX; + const y = rect.top + window.scrollY; + boxes[id] = { x, y, width, height }; + } + const doc = document.documentElement; + const body = document.body; + const scrollWidth = Math.max( + doc ? doc.scrollWidth : 0, + body ? body.scrollWidth : 0 + ); + const scrollHeight = Math.max( + doc ? doc.scrollHeight : 0, + body ? body.scrollHeight : 0 + ); + return { boxes, scrollWidth, scrollHeight }; + } + """ + ) + + self._rendered_html = page.content() + scroll_height = int(render_data.get("scrollHeight", height)) + self._rendered_page_images = self._capture_page_images( + page=page, + render_data=render_data, + page_width=width, + page_height=height, + full_page=options.render_full_page, + ) + if self._rendered_page_images and self._rendered_page_size: + if options.render_full_page: + self._rendered_page_size = Size(width=width, height=scroll_height) + + self._rendered_bbox_by_id = self._build_bbox_mapping( + render_data=render_data, + page_height=int(self._rendered_page_size.height) + if self._rendered_page_size + else height, + full_page=options.render_full_page, + ) + + context.close() + browser.close() + + def _capture_page_images( + self, + page, + render_data: dict, + page_width: int, + page_height: int, + full_page: bool, + ) -> list[Image.Image]: + scroll_height = int(render_data.get("scrollHeight", page_height)) + if scroll_height <= 0: + return [] + + screenshot_bytes = page.screenshot(full_page=True) + full_image = Image.open(BytesIO(screenshot_bytes)).convert("RGB") + + if full_page: + return [full_image] + + page_images: list[Image.Image] = [] + page_count = max(1, math.ceil(scroll_height / page_height)) + scale_y = full_image.height / float(scroll_height) + target_height = round(page_height * scale_y) + + for page_idx in range(page_count): + top_css = page_idx * page_height + bottom_css = min(top_css + page_height, scroll_height) + top_px = round(top_css * scale_y) + bottom_px = round(bottom_css * scale_y) + if bottom_px <= top_px: + continue + cropped = full_image.crop((0, top_px, full_image.width, bottom_px)) + cropped = self._pad_image( + image=cropped, width=full_image.width, height=target_height + ) + page_images.append(cropped) + + return page_images + + def _build_bbox_mapping( + self, render_data: dict, page_height: int, full_page: bool + ) -> dict[str, _RenderedBBox]: + boxes = render_data.get("boxes", {}) or {} + scroll_height = float(render_data.get("scrollHeight", page_height)) + + if full_page: + page_count = 1 + else: + page_count = max(1, math.ceil(scroll_height / page_height)) + + mapping: dict[str, _RenderedBBox] = {} + for tag_id, rect in boxes.items(): + left = float(rect.get("x", 0.0)) + top = float(rect.get("y", 0.0)) + width = float(rect.get("width", 0.0)) + height = float(rect.get("height", 0.0)) + if width <= 0 and height <= 0: + continue + right = left + width + bottom = top + height + if full_page: + page_no = 1 + offset = 0.0 + else: + page_no = int(top // page_height) + 1 + page_no = min(max(page_no, 1), page_count) + offset = (page_no - 1) * page_height + bbox = BoundingBox( + l=left, + t=top - offset, + r=right, + b=bottom - offset, + coord_origin=CoordOrigin.TOPLEFT, + ) + mapping[str(tag_id)] = _RenderedBBox(page_no=page_no, bbox=bbox) + + return mapping + + def _get_tag_id(self, tag: Optional[Tag]) -> Optional[str]: + if tag is None: + return None + tag_id = tag.get(_DATA_DOCLING_ID_ATTR) + if not tag_id: + return None + return str(tag_id) + + def _get_rendered_bbox_for_tag(self, tag: Optional[Tag]) -> Optional[_RenderedBBox]: + tag_id = self._get_tag_id(tag) + if tag_id is None: + return None + return self._rendered_bbox_by_id.get(tag_id) + + def _make_prov( + self, + text: str, + tag: Optional[Tag] = None, + source_tag_id: Optional[str] = None, + ) -> Optional[ProvenanceItem]: + if not self._rendered_bbox_by_id: + return None + + render_box: Optional[_RenderedBBox] = None + if source_tag_id: + render_box = self._rendered_bbox_by_id.get(source_tag_id) + if render_box is None: + render_box = self._get_rendered_bbox_for_tag(tag) + if render_box is None: + return None + + return ProvenanceItem( + page_no=render_box.page_no, + bbox=render_box.bbox, + charspan=(0, len(text)), + ) + @staticmethod def _fix_invalid_paragraph_structure(soup: BeautifulSoup) -> None: """Rewrite

elements that contain block-level breakers. @@ -361,6 +682,8 @@ def _start_para(): nonlocal current_p if current_p is None: current_p = soup.new_tag("p") + if p.get(_DATA_DOCLING_ID_ATTR): + current_p[_DATA_DOCLING_ID_ATTR] = p.get(_DATA_DOCLING_ID_ATTR) new_nodes.append(current_p) def _flush_para_if_empty(): @@ -591,6 +914,10 @@ def parse_table_data( self.get_text(html_cell).strip() ) col_span, row_span = self._get_cell_spans(html_cell) + cell_bbox = None + rendered_cell = self._get_rendered_bbox_for_tag(html_cell) + if rendered_cell is not None: + cell_bbox = rendered_cell.bbox if row_header: row_span -= 1 while ( @@ -606,6 +933,7 @@ def parse_table_data( if rich_table_cell: rich_cell = RichTableCell( text=text, + bbox=cell_bbox, row_span=row_span, col_span=col_span, start_row_offset_idx=start_row_span + row_idx, @@ -620,6 +948,7 @@ def parse_table_data( else: simple_cell = TableCell( text=text, + bbox=cell_bbox, row_span=row_span, col_span=col_span, start_row_offset_idx=start_row_span + row_idx, @@ -662,6 +991,11 @@ def _flush_buffer() -> None: seg_clean = HTMLDocumentBackend._clean_unicode( annotated_text.text.strip() ) + prov = self._make_prov( + text=seg_clean, + tag=element, + source_tag_id=annotated_text.source_tag_id, + ) if annotated_text.code: docling_code2 = doc.add_code( parent=self.parents[self.level], @@ -669,6 +1003,7 @@ def _flush_buffer() -> None: content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) added_refs.append(docling_code2.get_ref()) else: @@ -679,6 +1014,7 @@ def _flush_buffer() -> None: content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) added_refs.append(docling_text2.get_ref()) @@ -779,6 +1115,9 @@ def _extract_text_and_hyperlink_recursively( if isinstance(item, NavigableString): text = item.strip() code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET) + source_tag_id = ( + self._get_tag_id(item.parent) if isinstance(item.parent, Tag) else None + ) if text: return AnnotatedTextList( [ @@ -787,6 +1126,7 @@ def _extract_text_and_hyperlink_recursively( hyperlink=self.hyperlink, formatting=self._formatting, code=code, + source_tag_id=source_tag_id, ) ] ) @@ -798,6 +1138,7 @@ def _extract_text_and_hyperlink_recursively( hyperlink=self.hyperlink, formatting=self._formatting, code=code, + source_tag_id=source_tag_id, ) ] ) @@ -970,6 +1311,11 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: ) annotated_text = annotated_text_list.to_single_text_element() text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text) + prov = self._make_prov( + text=text_clean, + tag=tag, + source_tag_id=annotated_text.source_tag_id, + ) # the first level is for the title item if level == 1: for key in self.parents.keys(): @@ -980,6 +1326,7 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) p1 = self.parents[self.level + 1] if p1 is not None: @@ -1013,6 +1360,7 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) p2 = self.parents[self.level + 1] if p2 is not None: @@ -1078,6 +1426,7 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: # 3) add the list item if li_text: if len(min_parts) > 1: + li_prov = self._make_prov(text=li_text, tag=li) # create an empty list element in order to hook the inline group onto that one self.parents[self.level + 1] = doc.add_list_item( text="", @@ -1085,6 +1434,7 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: marker=marker, parent=list_group, content_layer=self.content_layer, + prov=li_prov, ) self.level += 1 with self._use_inline_group(min_parts, doc): @@ -1093,6 +1443,11 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: r"\s+|\n+", " ", annotated_text.text ).strip() li_clean = HTMLDocumentBackend._clean_unicode(li_text) + prov = self._make_prov( + text=li_clean, + tag=li, + source_tag_id=annotated_text.source_tag_id, + ) if annotated_text.code: doc.add_code( parent=self.parents[self.level], @@ -1100,6 +1455,7 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) else: doc.add_text( @@ -1109,6 +1465,7 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) # 4) recurse into any nested lists, attaching them to this

  • item @@ -1123,6 +1480,11 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: annotated_text = min_parts[0] li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip() li_clean = HTMLDocumentBackend._clean_unicode(li_text) + prov = self._make_prov( + text=li_clean, + tag=li, + source_tag_id=annotated_text.source_tag_id, + ) self.parents[self.level + 1] = doc.add_list_item( text=li_clean, enumerated=is_ordered, @@ -1132,6 +1494,7 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) # 4) recurse into any nested lists, attaching them to this
  • item @@ -1209,6 +1572,11 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: for annotated_text in part: if seg := annotated_text.text.strip(): seg_clean = HTMLDocumentBackend._clean_unicode(seg) + prov = self._make_prov( + text=seg_clean, + tag=tag, + source_tag_id=annotated_text.source_tag_id, + ) if annotated_text.code: docling_code = doc.add_code( parent=self.parents[self.level], @@ -1216,6 +1584,7 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) added_refs.append(docling_code.get_ref()) else: @@ -1226,6 +1595,7 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) added_refs.append(docling_text.get_ref()) @@ -1236,9 +1606,11 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: elif tag_name == "table": num_rows, num_cols = self.get_html_table_row_col(tag) data_e = TableData(num_rows=num_rows, num_cols=num_cols) + table_prov = self._make_prov(text="", tag=tag) docling_table = doc.add_table( data=data_e, parent=self.parents[self.level], + prov=table_prov, content_layer=self.content_layer, ) added_refs.append(docling_table.get_ref()) @@ -1246,7 +1618,7 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: for img_tag in tag("img"): if isinstance(img_tag, Tag): - im_ref2 = self._emit_image(tag, doc) + im_ref2 = self._emit_image(img_tag, doc) if im_ref2 is not None: added_refs.append(im_ref2) @@ -1261,12 +1633,18 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: text_clean = HTMLDocumentBackend._clean_unicode( annotated_text.text.strip() ) + prov = self._make_prov( + text=text_clean, + tag=tag, + source_tag_id=annotated_text.source_tag_id, + ) docling_code2 = doc.add_code( parent=self.parents[self.level], text=text_clean, content_layer=self.content_layer, formatting=annotated_text.formatting, hyperlink=annotated_text.hyperlink, + prov=prov, ) added_refs.append(docling_code2.get_ref()) @@ -1282,6 +1660,7 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]: figure = img_tag.find_parent("figure") caption: AnnotatedTextList = AnnotatedTextList() + caption_prov_tag: Optional[Tag] = None parent = self.parents[self.level] @@ -1297,6 +1676,7 @@ def get_img_hyperlink(img_tag): if img_hyperlink := get_img_hyperlink(img_tag): img_text = img_tag.get("alt") or "" caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink)) + caption_prov_tag = img_tag if isinstance(figure, Tag): caption_tag = figure.find("figcaption", recursive=False) @@ -1304,8 +1684,10 @@ def get_img_hyperlink(img_tag): caption = self._extract_text_and_hyperlink_recursively( caption_tag, find_parent_annotation=True ) + caption_prov_tag = caption_tag if not caption and img_tag.get("alt"): caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))]) + caption_prov_tag = img_tag caption_anno_text = caption.to_single_text_element() @@ -1314,6 +1696,11 @@ def get_img_hyperlink(img_tag): text_clean = HTMLDocumentBackend._clean_unicode( caption_anno_text.text.strip() ) + prov = self._make_prov( + text=text_clean, + tag=caption_prov_tag or img_tag, + source_tag_id=caption_anno_text.source_tag_id, + ) caption_item = doc.add_text( label=DocItemLabel.CAPTION, text=text_clean, @@ -1321,15 +1708,18 @@ def get_img_hyperlink(img_tag): content_layer=self.content_layer, formatting=caption_anno_text.formatting, hyperlink=caption_anno_text.hyperlink, + prov=prov, ) src_loc: str = self._get_attr_as_string(img_tag, "src") + pic_prov = self._make_prov(text="", tag=img_tag) if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc: # Do not fetch the image, just add a placeholder placeholder: PictureItem = doc.add_picture( caption=caption_item, parent=parent, content_layer=self.content_layer, + prov=pic_prov, ) return placeholder.get_ref() @@ -1341,6 +1731,7 @@ def get_img_hyperlink(img_tag): caption=caption_item, parent=parent, content_layer=self.content_layer, + prov=pic_prov, ) return docling_pic.get_ref() diff --git a/docling/datamodel/backend_options.py b/docling/datamodel/backend_options.py index 5b054f6add..94af677724 100644 --- a/docling/datamodel/backend_options.py +++ b/docling/datamodel/backend_options.py @@ -28,6 +28,42 @@ class HTMLBackendOptions(BaseBackendOptions): """ kind: Literal["html"] = Field("html", exclude=True, repr=False) + render_page: bool = Field( + False, + description=( + "Render HTML in a headless browser to capture page images and " + "element bounding boxes." + ), + ) + render_page_width: int = Field( + 794, description="Render page width in CSS pixels (A4 @ 96 DPI)." + ) + render_page_height: int = Field( + 1123, description="Render page height in CSS pixels (A4 @ 96 DPI)." + ) + render_page_orientation: Literal["portrait", "landscape"] = Field( + "portrait", description="Render page orientation." + ) + render_print_media: bool = Field( + True, description="Use print media emulation when rendering." + ) + render_wait_until: Literal["load", "domcontentloaded", "networkidle"] = Field( + "networkidle", + description="Playwright wait condition before extracting the DOM.", + ) + render_wait_ms: int = Field( + 0, description="Extra delay in milliseconds after load." + ) + render_device_scale: float = Field( + 1.0, description="Device scale factor for rendering." + ) + render_full_page: bool = Field( + False, + description=("Capture a single full-height page image instead of paginating."), + ) + render_dpi: int = Field( + 96, description="DPI used for page images created from rendering." + ) fetch_images: bool = Field( False, description=( diff --git a/pyproject.toml b/pyproject.toml index 071d60192f..7a7f23b9f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ dependencies = [ 'scipy (>=1.6.0,<2.0.0)', "accelerate>=1.0.0,<2", "polyfactory>=2.22.2", + "playwright>=1.58.0", ] [project.urls] diff --git a/uv.lock b/uv.lock index 6102d54ddc..66bde450d4 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10, <4.0" resolution-markers = [ "python_full_version >= '3.12'", @@ -943,6 +943,7 @@ dependencies = [ { name = "openpyxl" }, { name = "pandas" }, { name = "pillow" }, + { name = "playwright" }, { name = "pluggy" }, { name = "polyfactory" }, { name = "pydantic" }, @@ -1052,6 +1053,7 @@ requires-dist = [ { name = "openpyxl", specifier = ">=3.1.5,<4.0.0" }, { name = "pandas", specifier = ">=2.1.4,<3.0.0" }, { name = "pillow", specifier = ">=10.0.0,<12.0.0" }, + { name = "playwright", specifier = ">=1.58.0" }, { name = "pluggy", specifier = ">=1.0.0,<2.0.0" }, { name = "polyfactory", specifier = ">=2.22.2" }, { name = "pydantic", specifier = ">=2.0.0,<3.0.0" }, @@ -1542,6 +1544,66 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/50/6aae23929bd019300ef13fb79d60609215c53d4541963eaffc438e62f77e/gliner-0.2.24-py3-none-any.whl", hash = "sha256:efe614e05b31d06d848373aef8270f567e34fe1b4e96f816a8c70cef24908a6c", size = 151880, upload-time = "2025-11-26T18:20:28.801Z" }, ] +[[package]] +name = "greenlet" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8a/99/1cd3411c56a410994669062bd73dd58270c00cc074cac15f385a1fd91f8a/greenlet-3.3.1.tar.gz", hash = "sha256:41848f3230b58c08bb43dee542e74a2a2e34d3c59dc3076cec9151aeeedcae98", size = 184690, upload-time = "2026-01-23T15:31:02.076Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/65/5b235b40581ad75ab97dcd8b4218022ae8e3ab77c13c919f1a1dfe9171fd/greenlet-3.3.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:04bee4775f40ecefcdaa9d115ab44736cd4b9c5fba733575bfe9379419582e13", size = 273723, upload-time = "2026-01-23T15:30:37.521Z" }, + { url = "https://files.pythonhosted.org/packages/ce/ad/eb4729b85cba2d29499e0a04ca6fbdd8f540afd7be142fd571eea43d712f/greenlet-3.3.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:50e1457f4fed12a50e427988a07f0f9df53cf0ee8da23fab16e6732c2ec909d4", size = 574874, upload-time = "2026-01-23T16:00:54.551Z" }, + { url = "https://files.pythonhosted.org/packages/87/32/57cad7fe4c8b82fdaa098c89498ef85ad92dfbb09d5eb713adedfc2ae1f5/greenlet-3.3.1-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:070472cd156f0656f86f92e954591644e158fd65aa415ffbe2d44ca77656a8f5", size = 586309, upload-time = "2026-01-23T16:05:25.18Z" }, + { url = "https://files.pythonhosted.org/packages/66/66/f041005cb87055e62b0d68680e88ec1a57f4688523d5e2fb305841bc8307/greenlet-3.3.1-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1108b61b06b5224656121c3c8ee8876161c491cbe74e5c519e0634c837cf93d5", size = 597461, upload-time = "2026-01-23T16:15:51.943Z" }, + { url = "https://files.pythonhosted.org/packages/87/eb/8a1ec2da4d55824f160594a75a9d8354a5fe0a300fb1c48e7944265217e1/greenlet-3.3.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3a300354f27dd86bae5fbf7002e6dd2b3255cd372e9242c933faf5e859b703fe", size = 586985, upload-time = "2026-01-23T15:32:47.968Z" }, + { url = "https://files.pythonhosted.org/packages/15/1c/0621dd4321dd8c351372ee8f9308136acb628600658a49be1b7504208738/greenlet-3.3.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e84b51cbebf9ae573b5fbd15df88887815e3253fc000a7d0ff95170e8f7e9729", size = 1547271, upload-time = "2026-01-23T16:04:18.977Z" }, + { url = "https://files.pythonhosted.org/packages/9d/53/24047f8924c83bea7a59c8678d9571209c6bfe5f4c17c94a78c06024e9f2/greenlet-3.3.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e0093bd1a06d899892427217f0ff2a3c8f306182b8c754336d32e2d587c131b4", size = 1613427, upload-time = "2026-01-23T15:33:44.428Z" }, + { url = "https://files.pythonhosted.org/packages/ff/07/ac9bf1ec008916d1a3373cae212884c1dcff4a4ba0d41127ce81a8deb4e9/greenlet-3.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:7932f5f57609b6a3b82cc11877709aa7a98e3308983ed93552a1c377069b20c8", size = 226100, upload-time = "2026-01-23T15:30:56.957Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e8/2e1462c8fdbe0f210feb5ac7ad2d9029af8be3bf45bd9fa39765f821642f/greenlet-3.3.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:5fd23b9bc6d37b563211c6abbb1b3cab27db385a4449af5c32e932f93017080c", size = 274974, upload-time = "2026-01-23T15:31:02.891Z" }, + { url = "https://files.pythonhosted.org/packages/7e/a8/530a401419a6b302af59f67aaf0b9ba1015855ea7e56c036b5928793c5bd/greenlet-3.3.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09f51496a0bfbaa9d74d36a52d2580d1ef5ed4fdfcff0a73730abfbbbe1403dd", size = 577175, upload-time = "2026-01-23T16:00:56.213Z" }, + { url = "https://files.pythonhosted.org/packages/8e/89/7e812bb9c05e1aaef9b597ac1d0962b9021d2c6269354966451e885c4e6b/greenlet-3.3.1-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb0feb07fe6e6a74615ee62a880007d976cf739b6669cce95daa7373d4fc69c5", size = 590401, upload-time = "2026-01-23T16:05:26.365Z" }, + { url = "https://files.pythonhosted.org/packages/70/ae/e2d5f0e59b94a2269b68a629173263fa40b63da32f5c231307c349315871/greenlet-3.3.1-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:67ea3fc73c8cd92f42467a72b75e8f05ed51a0e9b1d15398c913416f2dafd49f", size = 601161, upload-time = "2026-01-23T16:15:53.456Z" }, + { url = "https://files.pythonhosted.org/packages/5c/ae/8d472e1f5ac5efe55c563f3eabb38c98a44b832602e12910750a7c025802/greenlet-3.3.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:39eda9ba259cc9801da05351eaa8576e9aa83eb9411e8f0c299e05d712a210f2", size = 590272, upload-time = "2026-01-23T15:32:49.411Z" }, + { url = "https://files.pythonhosted.org/packages/a8/51/0fde34bebfcadc833550717eade64e35ec8738e6b097d5d248274a01258b/greenlet-3.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e2e7e882f83149f0a71ac822ebf156d902e7a5d22c9045e3e0d1daf59cee2cc9", size = 1550729, upload-time = "2026-01-23T16:04:20.867Z" }, + { url = "https://files.pythonhosted.org/packages/16/c9/2fb47bee83b25b119d5a35d580807bb8b92480a54b68fef009a02945629f/greenlet-3.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:80aa4d79eb5564f2e0a6144fcc744b5a37c56c4a92d60920720e99210d88db0f", size = 1615552, upload-time = "2026-01-23T15:33:45.743Z" }, + { url = "https://files.pythonhosted.org/packages/1f/54/dcf9f737b96606f82f8dd05becfb8d238db0633dd7397d542a296fe9cad3/greenlet-3.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:32e4ca9777c5addcbf42ff3915d99030d8e00173a56f80001fb3875998fe410b", size = 226462, upload-time = "2026-01-23T15:36:50.422Z" }, + { url = "https://files.pythonhosted.org/packages/91/37/61e1015cf944ddd2337447d8e97fb423ac9bc21f9963fb5f206b53d65649/greenlet-3.3.1-cp311-cp311-win_arm64.whl", hash = "sha256:da19609432f353fed186cc1b85e9440db93d489f198b4bdf42ae19cc9d9ac9b4", size = 225715, upload-time = "2026-01-23T15:33:17.298Z" }, + { url = "https://files.pythonhosted.org/packages/f9/c8/9d76a66421d1ae24340dfae7e79c313957f6e3195c144d2c73333b5bfe34/greenlet-3.3.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:7e806ca53acf6d15a888405880766ec84721aa4181261cd11a457dfe9a7a4975", size = 276443, upload-time = "2026-01-23T15:30:10.066Z" }, + { url = "https://files.pythonhosted.org/packages/81/99/401ff34bb3c032d1f10477d199724f5e5f6fbfb59816ad1455c79c1eb8e7/greenlet-3.3.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d842c94b9155f1c9b3058036c24ffb8ff78b428414a19792b2380be9cecf4f36", size = 597359, upload-time = "2026-01-23T16:00:57.394Z" }, + { url = "https://files.pythonhosted.org/packages/2b/bc/4dcc0871ed557792d304f50be0f7487a14e017952ec689effe2180a6ff35/greenlet-3.3.1-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:20fedaadd422fa02695f82093f9a98bad3dab5fcda793c658b945fcde2ab27ba", size = 607805, upload-time = "2026-01-23T16:05:28.068Z" }, + { url = "https://files.pythonhosted.org/packages/3b/cd/7a7ca57588dac3389e97f7c9521cb6641fd8b6602faf1eaa4188384757df/greenlet-3.3.1-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c620051669fd04ac6b60ebc70478210119c56e2d5d5df848baec4312e260e4ca", size = 622363, upload-time = "2026-01-23T16:15:54.754Z" }, + { url = "https://files.pythonhosted.org/packages/cf/05/821587cf19e2ce1f2b24945d890b164401e5085f9d09cbd969b0c193cd20/greenlet-3.3.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14194f5f4305800ff329cbf02c5fcc88f01886cadd29941b807668a45f0d2336", size = 609947, upload-time = "2026-01-23T15:32:51.004Z" }, + { url = "https://files.pythonhosted.org/packages/a4/52/ee8c46ed9f8babaa93a19e577f26e3d28a519feac6350ed6f25f1afee7e9/greenlet-3.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7b2fe4150a0cf59f847a67db8c155ac36aed89080a6a639e9f16df5d6c6096f1", size = 1567487, upload-time = "2026-01-23T16:04:22.125Z" }, + { url = "https://files.pythonhosted.org/packages/8f/7c/456a74f07029597626f3a6db71b273a3632aecb9afafeeca452cfa633197/greenlet-3.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:49f4ad195d45f4a66a0eb9c1ba4832bb380570d361912fa3554746830d332149", size = 1636087, upload-time = "2026-01-23T15:33:47.486Z" }, + { url = "https://files.pythonhosted.org/packages/34/2f/5e0e41f33c69655300a5e54aeb637cf8ff57f1786a3aba374eacc0228c1d/greenlet-3.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:cc98b9c4e4870fa983436afa999d4eb16b12872fab7071423d5262fa7120d57a", size = 227156, upload-time = "2026-01-23T15:34:34.808Z" }, + { url = "https://files.pythonhosted.org/packages/c8/ab/717c58343cf02c5265b531384b248787e04d8160b8afe53d9eec053d7b44/greenlet-3.3.1-cp312-cp312-win_arm64.whl", hash = "sha256:bfb2d1763d777de5ee495c85309460f6fd8146e50ec9d0ae0183dbf6f0a829d1", size = 226403, upload-time = "2026-01-23T15:31:39.372Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ab/d26750f2b7242c2b90ea2ad71de70cfcd73a948a49513188a0fc0d6fc15a/greenlet-3.3.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:7ab327905cabb0622adca5971e488064e35115430cec2c35a50fd36e72a315b3", size = 275205, upload-time = "2026-01-23T15:30:24.556Z" }, + { url = "https://files.pythonhosted.org/packages/10/d3/be7d19e8fad7c5a78eeefb2d896a08cd4643e1e90c605c4be3b46264998f/greenlet-3.3.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:65be2f026ca6a176f88fb935ee23c18333ccea97048076aef4db1ef5bc0713ac", size = 599284, upload-time = "2026-01-23T16:00:58.584Z" }, + { url = "https://files.pythonhosted.org/packages/ae/21/fe703aaa056fdb0f17e5afd4b5c80195bbdab701208918938bd15b00d39b/greenlet-3.3.1-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7a3ae05b3d225b4155bda56b072ceb09d05e974bc74be6c3fc15463cf69f33fd", size = 610274, upload-time = "2026-01-23T16:05:29.312Z" }, + { url = "https://files.pythonhosted.org/packages/06/00/95df0b6a935103c0452dad2203f5be8377e551b8466a29650c4c5a5af6cc/greenlet-3.3.1-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:12184c61e5d64268a160226fb4818af4df02cfead8379d7f8b99a56c3a54ff3e", size = 624375, upload-time = "2026-01-23T16:15:55.915Z" }, + { url = "https://files.pythonhosted.org/packages/cb/86/5c6ab23bb3c28c21ed6bebad006515cfe08b04613eb105ca0041fecca852/greenlet-3.3.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6423481193bbbe871313de5fd06a082f2649e7ce6e08015d2a76c1e9186ca5b3", size = 612904, upload-time = "2026-01-23T15:32:52.317Z" }, + { url = "https://files.pythonhosted.org/packages/c2/f3/7949994264e22639e40718c2daf6f6df5169bf48fb038c008a489ec53a50/greenlet-3.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:33a956fe78bbbda82bfc95e128d61129b32d66bcf0a20a1f0c08aa4839ffa951", size = 1567316, upload-time = "2026-01-23T16:04:23.316Z" }, + { url = "https://files.pythonhosted.org/packages/8d/6e/d73c94d13b6465e9f7cd6231c68abde838bb22408596c05d9059830b7872/greenlet-3.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b065d3284be43728dd280f6f9a13990b56470b81be20375a207cdc814a983f2", size = 1636549, upload-time = "2026-01-23T15:33:48.643Z" }, + { url = "https://files.pythonhosted.org/packages/5e/b3/c9c23a6478b3bcc91f979ce4ca50879e4d0b2bd7b9a53d8ecded719b92e2/greenlet-3.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:27289986f4e5b0edec7b5a91063c109f0276abb09a7e9bdab08437525977c946", size = 227042, upload-time = "2026-01-23T15:33:58.216Z" }, + { url = "https://files.pythonhosted.org/packages/90/e7/824beda656097edee36ab15809fd063447b200cc03a7f6a24c34d520bc88/greenlet-3.3.1-cp313-cp313-win_arm64.whl", hash = "sha256:2f080e028001c5273e0b42690eaf359aeef9cb1389da0f171ea51a5dc3c7608d", size = 226294, upload-time = "2026-01-23T15:30:52.73Z" }, + { url = "https://files.pythonhosted.org/packages/ae/fb/011c7c717213182caf78084a9bea51c8590b0afda98001f69d9f853a495b/greenlet-3.3.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:bd59acd8529b372775cd0fcbc5f420ae20681c5b045ce25bd453ed8455ab99b5", size = 275737, upload-time = "2026-01-23T15:32:16.889Z" }, + { url = "https://files.pythonhosted.org/packages/41/2e/a3a417d620363fdbb08a48b1dd582956a46a61bf8fd27ee8164f9dfe87c2/greenlet-3.3.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b31c05dd84ef6871dd47120386aed35323c944d86c3d91a17c4b8d23df62f15b", size = 646422, upload-time = "2026-01-23T16:01:00.354Z" }, + { url = "https://files.pythonhosted.org/packages/b4/09/c6c4a0db47defafd2d6bab8ddfe47ad19963b4e30f5bed84d75328059f8c/greenlet-3.3.1-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:02925a0bfffc41e542c70aa14c7eda3593e4d7e274bfcccca1827e6c0875902e", size = 658219, upload-time = "2026-01-23T16:05:30.956Z" }, + { url = "https://files.pythonhosted.org/packages/e2/89/b95f2ddcc5f3c2bc09c8ee8d77be312df7f9e7175703ab780f2014a0e781/greenlet-3.3.1-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3e0f3878ca3a3ff63ab4ea478585942b53df66ddde327b59ecb191b19dbbd62d", size = 671455, upload-time = "2026-01-23T16:15:57.232Z" }, + { url = "https://files.pythonhosted.org/packages/80/38/9d42d60dffb04b45f03dbab9430898352dba277758640751dc5cc316c521/greenlet-3.3.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34a729e2e4e4ffe9ae2408d5ecaf12f944853f40ad724929b7585bca808a9d6f", size = 660237, upload-time = "2026-01-23T15:32:53.967Z" }, + { url = "https://files.pythonhosted.org/packages/96/61/373c30b7197f9e756e4c81ae90a8d55dc3598c17673f91f4d31c3c689c3f/greenlet-3.3.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aec9ab04e82918e623415947921dea15851b152b822661cce3f8e4393c3df683", size = 1615261, upload-time = "2026-01-23T16:04:25.066Z" }, + { url = "https://files.pythonhosted.org/packages/fd/d3/ca534310343f5945316f9451e953dcd89b36fe7a19de652a1dc5a0eeef3f/greenlet-3.3.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:71c767cf281a80d02b6c1bdc41c9468e1f5a494fb11bc8688c360524e273d7b1", size = 1683719, upload-time = "2026-01-23T15:33:50.61Z" }, + { url = "https://files.pythonhosted.org/packages/52/cb/c21a3fd5d2c9c8b622e7bede6d6d00e00551a5ee474ea6d831b5f567a8b4/greenlet-3.3.1-cp314-cp314-win_amd64.whl", hash = "sha256:96aff77af063b607f2489473484e39a0bbae730f2ea90c9e5606c9b73c44174a", size = 228125, upload-time = "2026-01-23T15:32:45.265Z" }, + { url = "https://files.pythonhosted.org/packages/6a/8e/8a2db6d11491837af1de64b8aff23707c6e85241be13c60ed399a72e2ef8/greenlet-3.3.1-cp314-cp314-win_arm64.whl", hash = "sha256:b066e8b50e28b503f604fa538adc764a638b38cf8e81e025011d26e8a627fa79", size = 227519, upload-time = "2026-01-23T15:31:47.284Z" }, + { url = "https://files.pythonhosted.org/packages/28/24/cbbec49bacdcc9ec652a81d3efef7b59f326697e7edf6ed775a5e08e54c2/greenlet-3.3.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:3e63252943c921b90abb035ebe9de832c436401d9c45f262d80e2d06cc659242", size = 282706, upload-time = "2026-01-23T15:33:05.525Z" }, + { url = "https://files.pythonhosted.org/packages/86/2e/4f2b9323c144c4fe8842a4e0d92121465485c3c2c5b9e9b30a52e80f523f/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76e39058e68eb125de10c92524573924e827927df5d3891fbc97bd55764a8774", size = 651209, upload-time = "2026-01-23T16:01:01.517Z" }, + { url = "https://files.pythonhosted.org/packages/d9/87/50ca60e515f5bb55a2fbc5f0c9b5b156de7d2fc51a0a69abc9d23914a237/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c9f9d5e7a9310b7a2f416dd13d2e3fd8b42d803968ea580b7c0f322ccb389b97", size = 654300, upload-time = "2026-01-23T16:05:32.199Z" }, + { url = "https://files.pythonhosted.org/packages/7c/25/c51a63f3f463171e09cb586eb64db0861eb06667ab01a7968371a24c4f3b/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b9721549a95db96689458a1e0ae32412ca18776ed004463df3a9299c1b257ab", size = 662574, upload-time = "2026-01-23T16:15:58.364Z" }, + { url = "https://files.pythonhosted.org/packages/1d/94/74310866dfa2b73dd08659a3d18762f83985ad3281901ba0ee9a815194fb/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92497c78adf3ac703b57f1e3813c2d874f27f71a178f9ea5887855da413cd6d2", size = 653842, upload-time = "2026-01-23T15:32:55.671Z" }, + { url = "https://files.pythonhosted.org/packages/97/43/8bf0ffa3d498eeee4c58c212a3905dd6146c01c8dc0b0a046481ca29b18c/greenlet-3.3.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ed6b402bc74d6557a705e197d47f9063733091ed6357b3de33619d8a8d93ac53", size = 1614917, upload-time = "2026-01-23T16:04:26.276Z" }, + { url = "https://files.pythonhosted.org/packages/89/90/a3be7a5f378fc6e84abe4dcfb2ba32b07786861172e502388b4c90000d1b/greenlet-3.3.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:59913f1e5ada20fde795ba906916aea25d442abcc0593fba7e26c92b7ad76249", size = 1676092, upload-time = "2026-01-23T15:33:52.176Z" }, + { url = "https://files.pythonhosted.org/packages/e1/2b/98c7f93e6db9977aaee07eb1e51ca63bd5f779b900d362791d3252e60558/greenlet-3.3.1-cp314-cp314t-win_amd64.whl", hash = "sha256:301860987846c24cb8964bdec0e31a96ad4a2a801b41b4ef40963c1b44f33451", size = 233181, upload-time = "2026-01-23T15:33:00.29Z" }, +] + [[package]] name = "griffe" version = "1.15.0" @@ -4083,6 +4145,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl", hash = "sha256:d03afa3963c806a9bed9d5125c8f4cb2fdaf74a55ab60e5d59b3fde758104d31", size = 18731, upload-time = "2025-12-05T13:52:56.823Z" }, ] +[[package]] +name = "playwright" +version = "1.58.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "greenlet" }, + { name = "pyee" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/c9/9c6061d5703267f1baae6a4647bfd1862e386fbfdb97d889f6f6ae9e3f64/playwright-1.58.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:96e3204aac292ee639edbfdef6298b4be2ea0a55a16b7068df91adac077cc606", size = 42251098, upload-time = "2026-01-30T15:09:24.028Z" }, + { url = "https://files.pythonhosted.org/packages/e0/40/59d34a756e02f8c670f0fee987d46f7ee53d05447d43cd114ca015cb168c/playwright-1.58.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:70c763694739d28df71ed578b9c8202bb83e8fe8fb9268c04dd13afe36301f71", size = 41039625, upload-time = "2026-01-30T15:09:27.558Z" }, + { url = "https://files.pythonhosted.org/packages/e1/ee/3ce6209c9c74a650aac9028c621f357a34ea5cd4d950700f8e2c4b7fe2c4/playwright-1.58.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:185e0132578733d02802dfddfbbc35f42be23a45ff49ccae5081f25952238117", size = 42251098, upload-time = "2026-01-30T15:09:30.461Z" }, + { url = "https://files.pythonhosted.org/packages/f1/af/009958cbf23fac551a940d34e3206e6c7eed2b8c940d0c3afd1feb0b0589/playwright-1.58.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:c95568ba1eda83812598c1dc9be60b4406dffd60b149bc1536180ad108723d6b", size = 46235268, upload-time = "2026-01-30T15:09:33.787Z" }, + { url = "https://files.pythonhosted.org/packages/d9/a6/0e66ad04b6d3440dae73efb39540c5685c5fc95b17c8b29340b62abbd952/playwright-1.58.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f9999948f1ab541d98812de25e3a8c410776aa516d948807140aff797b4bffa", size = 45964214, upload-time = "2026-01-30T15:09:36.751Z" }, + { url = "https://files.pythonhosted.org/packages/0e/4b/236e60ab9f6d62ed0fd32150d61f1f494cefbf02304c0061e78ed80c1c32/playwright-1.58.0-py3-none-win32.whl", hash = "sha256:1e03be090e75a0fabbdaeab65ce17c308c425d879fa48bb1d7986f96bfad0b99", size = 36815998, upload-time = "2026-01-30T15:09:39.627Z" }, + { url = "https://files.pythonhosted.org/packages/41/f8/5ec599c5e59d2f2f336a05b4f318e733077cd5044f24adb6f86900c3e6a7/playwright-1.58.0-py3-none-win_amd64.whl", hash = "sha256:a2bf639d0ce33b3ba38de777e08697b0d8f3dc07ab6802e4ac53fb65e3907af8", size = 36816005, upload-time = "2026-01-30T15:09:42.449Z" }, + { url = "https://files.pythonhosted.org/packages/c8/c4/cc0229fea55c87d6c9c67fe44a21e2cd28d1d558a5478ed4d617e9fb0c93/playwright-1.58.0-py3-none-win_arm64.whl", hash = "sha256:32ffe5c303901a13a0ecab91d1c3f74baf73b84f4bedbb6b935f5bc11cc98e1b", size = 33085919, upload-time = "2026-01-30T15:09:45.71Z" }, +] + [[package]] name = "pluggy" version = "1.6.0" @@ -4573,6 +4654,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = "sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" }, ] +[[package]] +name = "pyee" +version = "13.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/95/03/1fd98d5841cd7964a27d729ccf2199602fe05eb7a405c1462eb7277945ed/pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37", size = 31250, upload-time = "2025-03-17T18:53:15.955Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/4d/b9add7c84060d4c1906abe9a7e5359f2a60f7a9a4f67268b2766673427d8/pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498", size = 15730, upload-time = "2025-03-17T18:53:14.532Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -6386,6 +6479,10 @@ dependencies = [ { name = "typing-extensions" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/ea/304cf7afb744aa626fa9855245526484ee55aba610d9973a0521c552a843/torch-2.10.0-1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:c37fc46eedd9175f9c81814cc47308f1b42cfe4987e532d4b423d23852f2bf63", size = 79411450, upload-time = "2026-02-06T17:37:35.75Z" }, + { url = "https://files.pythonhosted.org/packages/25/d8/9e6b8e7df981a1e3ea3907fd5a74673e791da483e8c307f0b6ff012626d0/torch-2.10.0-1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:f699f31a236a677b3118bc0a3ef3d89c0c29b5ec0b20f4c4bf0b110378487464", size = 79423460, upload-time = "2026-02-06T17:37:39.657Z" }, + { url = "https://files.pythonhosted.org/packages/c9/2f/0b295dd8d199ef71e6f176f576473d645d41357b7b8aa978cc6b042575df/torch-2.10.0-1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6abb224c2b6e9e27b592a1c0015c33a504b00a0e0938f1499f7f514e9b7bfb5c", size = 79498197, upload-time = "2026-02-06T17:37:27.627Z" }, + { url = "https://files.pythonhosted.org/packages/a4/1b/af5fccb50c341bd69dc016769503cb0857c1423fbe9343410dfeb65240f2/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7350f6652dfd761f11f9ecb590bfe95b573e2961f7a242eccb3c8e78348d26fe", size = 79498248, upload-time = "2026-02-06T17:37:31.982Z" }, { url = "https://files.pythonhosted.org/packages/0c/1a/c61f36cfd446170ec27b3a4984f072fd06dab6b5d7ce27e11adb35d6c838/torch-2.10.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5276fa790a666ee8becaffff8acb711922252521b28fbce5db7db5cf9cb2026d", size = 145992962, upload-time = "2026-01-21T16:24:14.04Z" }, { url = "https://files.pythonhosted.org/packages/b5/60/6662535354191e2d1555296045b63e4279e5a9dbad49acf55a5d38655a39/torch-2.10.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aaf663927bcd490ae971469a624c322202a2a1e68936eb952535ca4cd3b90444", size = 915599237, upload-time = "2026-01-21T16:23:25.497Z" }, { url = "https://files.pythonhosted.org/packages/40/b8/66bbe96f0d79be2b5c697b2e0b187ed792a15c6c4b8904613454651db848/torch-2.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:a4be6a2a190b32ff5c8002a0977a25ea60e64f7ba46b1be37093c141d9c49aeb", size = 113720931, upload-time = "2026-01-21T16:24:23.743Z" }, @@ -6627,12 +6724,19 @@ name = "triton" version = "3.6.0" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/44/ba/b1b04f4b291a3205d95ebd24465de0e5bf010a2df27a4e58a9b5f039d8f2/triton-3.6.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c723cfb12f6842a0ae94ac307dba7e7a44741d720a40cf0e270ed4a4e3be781", size = 175972180, upload-time = "2026-01-20T16:15:53.664Z" }, { url = "https://files.pythonhosted.org/packages/8c/f7/f1c9d3424ab199ac53c2da567b859bcddbb9c9e7154805119f8bd95ec36f/triton-3.6.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6550fae429e0667e397e5de64b332d1e5695b73650ee75a6146e2e902770bea", size = 188105201, upload-time = "2026-01-20T16:00:29.272Z" }, + { url = "https://files.pythonhosted.org/packages/0f/2c/96f92f3c60387e14cc45aed49487f3486f89ea27106c1b1376913c62abe4/triton-3.6.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49df5ef37379c0c2b5c0012286f80174fcf0e073e5ade1ca9a86c36814553651", size = 176081190, upload-time = "2026-01-20T16:16:00.523Z" }, { url = "https://files.pythonhosted.org/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3", size = 188214640, upload-time = "2026-01-20T16:00:35.869Z" }, + { url = "https://files.pythonhosted.org/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4", size = 176128243, upload-time = "2026-01-20T16:16:07.857Z" }, { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" }, + { url = "https://files.pythonhosted.org/packages/3c/12/34d71b350e89a204c2c7777a9bba0dcf2f19a5bfdd70b57c4dbc5ffd7154/triton-3.6.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448e02fe6dc898e9e5aa89cf0ee5c371e99df5aa5e8ad976a80b93334f3494fd", size = 176133521, upload-time = "2026-01-20T16:16:13.321Z" }, { url = "https://files.pythonhosted.org/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9", size = 188289450, upload-time = "2026-01-20T16:00:49.136Z" }, + { url = "https://files.pythonhosted.org/packages/ce/4e/41b0c8033b503fd3cfcd12392cdd256945026a91ff02452bef40ec34bee7/triton-3.6.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1722e172d34e32abc3eb7711d0025bb69d7959ebea84e3b7f7a341cd7ed694d6", size = 176276087, upload-time = "2026-01-20T16:16:18.989Z" }, { url = "https://files.pythonhosted.org/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f", size = 188400296, upload-time = "2026-01-20T16:00:56.042Z" }, + { url = "https://files.pythonhosted.org/packages/49/55/5ecf0dcaa0f2fbbd4420f7ef227ee3cb172e91e5fede9d0ecaddc43363b4/triton-3.6.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5523241e7d1abca00f1d240949eebdd7c673b005edbbce0aca95b8191f1d43", size = 176138577, upload-time = "2026-01-20T16:16:25.426Z" }, { url = "https://files.pythonhosted.org/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803", size = 188273063, upload-time = "2026-01-20T16:01:07.278Z" }, + { url = "https://files.pythonhosted.org/packages/48/db/56ee649cab5eaff4757541325aca81f52d02d4a7cd3506776cad2451e060/triton-3.6.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b3a97e8ed304dfa9bd23bb41ca04cdf6b2e617d5e782a8653d616037a5d537d", size = 176274804, upload-time = "2026-01-20T16:16:31.528Z" }, { url = "https://files.pythonhosted.org/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7", size = 188397994, upload-time = "2026-01-20T16:01:14.236Z" }, ] From 3dadb6c3025d7990d94250719353892515b1581a Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 9 Feb 2026 17:57:31 +0100 Subject: [PATCH 02/19] Added support for input HTML tag (text and checkboxes) into Docling HTML backend Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 95 +++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 15da8e568c..86a3ba89ea 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -820,6 +820,7 @@ def _is_rich_table_cell(self, table_cell: Tag) -> bool: is_rich: bool = True children = table_cell.find_all(recursive=True) # all descendants of type Tag + has_input = any(child.name == "input" for child in children) if not children: content = [ item @@ -832,10 +833,17 @@ def _is_rich_table_cell(self, table_cell: Tag) -> bool: table_cell, find_parent_annotation=True ) if not annotations: - is_rich = bool(item for item in children if item.name == "img") + is_rich = bool( + item for item in children if item.name in {"img", "input"} + ) elif len(annotations) == 1: anno: AnnotatedText = annotations[0] - is_rich = bool(anno.formatting) or bool(anno.hyperlink) or anno.code + is_rich = ( + bool(anno.formatting) + or bool(anno.hyperlink) + or anno.code + or has_input + ) return is_rich @@ -1026,6 +1034,11 @@ def _flush_buffer() -> None: im_ref3 = self._emit_image(node, doc) if im_ref3: added_refs.append(im_ref3) + elif name == "input": + _flush_buffer() + input_ref = self._emit_input(node, doc) + if input_ref: + added_refs.append(input_ref) elif name in _FORMAT_TAG_MAP: _flush_buffer() with self._use_format([name]): @@ -1039,7 +1052,7 @@ def _flush_buffer() -> None: _flush_buffer() blk = self._handle_block(node, doc) added_refs.extend(blk) - elif node.find(_BLOCK_TAGS): + elif node.find(_BLOCK_TAGS) or node.find("input"): _flush_buffer() wk3 = self._walk(node, doc) added_refs.extend(wk3) @@ -1422,9 +1435,14 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: li_text = re.sub( r"\s+|\n+", " ", "".join([el.text for el in min_parts]) ).strip() + inputs_in_li = [ + input_tag + for input_tag in li.find_all("input") + if input_tag.find_parent("li") is li + ] # 3) add the list item - if li_text: + if li_text or inputs_in_li: if len(min_parts) > 1: li_prov = self._make_prov(text=li_text, tag=li) # create an empty list element in order to hook the inline group onto that one @@ -1468,6 +1486,10 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: prov=prov, ) + for input_tag in inputs_in_li: + if isinstance(input_tag, Tag): + self._emit_input(input_tag, doc) + # 4) recurse into any nested lists, attaching them to this
  • item for sublist in li({"ul", "ol"}, recursive=False): if isinstance(sublist, Tag): @@ -1476,7 +1498,7 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: # now the list element with inline group is not a parent anymore self.parents[self.level] = None self.level -= 1 - else: + elif li_text: annotated_text = min_parts[0] li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip() li_clean = HTMLDocumentBackend._clean_unicode(li_text) @@ -1497,6 +1519,13 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: prov=prov, ) + if inputs_in_li: + self.level += 1 + for input_tag in inputs_in_li: + if isinstance(input_tag, Tag): + self._emit_input(input_tag, doc) + self.level -= 1 + # 4) recurse into any nested lists, attaching them to this
  • item for sublist in li({"ul", "ol"}, recursive=False): if isinstance(sublist, Tag): @@ -1504,6 +1533,25 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: self._handle_block(sublist, doc) self.parents[self.level + 1] = None self.level -= 1 + else: + li_prov = self._make_prov(text="", tag=li) + self.parents[self.level + 1] = doc.add_list_item( + text="", + enumerated=is_ordered, + marker=marker, + parent=list_group, + content_layer=self.content_layer, + prov=li_prov, + ) + self.level += 1 + for input_tag in inputs_in_li: + if isinstance(input_tag, Tag): + self._emit_input(input_tag, doc) + for sublist in li({"ul", "ol"}, recursive=False): + if isinstance(sublist, Tag): + self._handle_block(sublist, doc) + self.parents[self.level] = None + self.level -= 1 else: for sublist in li({"ul", "ol"}, recursive=False): if isinstance(sublist, Tag): @@ -1602,6 +1650,11 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: for img_tag in tag("img"): if isinstance(img_tag, Tag): self._emit_image(img_tag, doc) + for input_tag in tag("input"): + if isinstance(input_tag, Tag): + input_ref = self._emit_input(input_tag, doc) + if input_ref is not None: + added_refs.append(input_ref) elif tag_name == "table": num_rows, num_cols = self.get_html_table_row_col(tag) @@ -1735,6 +1788,38 @@ def get_img_hyperlink(img_tag): ) return docling_pic.get_ref() + def _emit_input(self, input_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]: + input_type = self._get_attr_as_string(input_tag, "type").lower() + if input_type == "hidden": + return None + + label = DocItemLabel.TEXT + if input_type == "checkbox": + label = ( + DocItemLabel.CHECKBOX_SELECTED + if input_tag.has_attr("checked") + else DocItemLabel.CHECKBOX_UNSELECTED + ) + + text = self._get_attr_as_string(input_tag, "value").strip() + if not text: + text = self._get_attr_as_string(input_tag, "placeholder").strip() + if not text: + text = self._get_attr_as_string(input_tag, "name").strip() + + text_clean = HTMLDocumentBackend._clean_unicode(text) if text else "" + prov = self._make_prov(text=text_clean, tag=input_tag) + input_item = doc.add_text( + parent=self.parents[self.level], + label=label, + text=text_clean, + content_layer=self.content_layer, + formatting=self._formatting, + hyperlink=self.hyperlink, + prov=prov, + ) + return input_item.get_ref() + def _create_image_ref(self, src_url: str) -> Optional[ImageRef]: try: img_data = self._load_image_data(src_url) From 3fa43842a59c4362564777048195e72cf9bc34d9 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 17 Feb 2026 15:12:29 +0100 Subject: [PATCH 03/19] adding key-value extraction for modified HTML backend Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 190 ++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 86a3ba89ea..8c7a7f13d2 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -22,6 +22,11 @@ DocItemLabel, DoclingDocument, DocumentOrigin, + GraphCell, + GraphCellLabel, + GraphData, + GraphLink, + GraphLinkLabel, GroupItem, GroupLabel, PictureItem, @@ -130,6 +135,11 @@ } _DATA_DOCLING_ID_ATTR: Final = "data-docling-id" +_FORM_CONTAINER_CLASS: Final = "form_region" +_FORM_KEY_ID_RE: Final = re.compile(r"^key(?P[A-Za-z0-9]+)$") +_FORM_VALUE_ID_RE: Final = re.compile( + r"^key(?P[A-Za-z0-9]+)_value(?P[A-Za-z0-9]+)$" +) @dataclass(frozen=True) @@ -1029,6 +1039,11 @@ def _flush_buffer() -> None: for node in element.contents: if isinstance(node, Tag): name = node.name.lower() + if self._is_form_container(node): + _flush_buffer() + form_refs = self._handle_form_container(node, doc) + added_refs.extend(form_refs) + continue if name == "img": _flush_buffer() im_ref3 = self._emit_image(node, doc) @@ -1271,6 +1286,17 @@ def _use_details(self, tag: Tag, doc: DoclingDocument): self.parents[self.level + 1] = None self.level -= 1 + @contextmanager + def _use_form_container(self, form_item: DocItem): + """Create a form container group and set it as the current parent.""" + self.parents[self.level + 1] = form_item + self.level += 1 + try: + yield None + finally: + self.parents[self.level + 1] = None + self.level -= 1 + @contextmanager def _use_footer(self, tag: Tag, doc: DoclingDocument): """Create a group with a footer. @@ -1710,6 +1736,170 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: self._walk(tag, doc) return added_refs + @staticmethod + def _is_form_container(tag: Tag) -> bool: + classes = tag.get("class") + if not classes: + return False + if isinstance(classes, str): + classes = [classes] + return _FORM_CONTAINER_CLASS in classes + + @staticmethod + def _is_value_in_key_scope(key_tag: Tag, value_tag: Tag) -> bool: + if key_tag is value_tag: + return True + if any(parent is key_tag for parent in value_tag.parents): + return True + key_parent = key_tag.parent + value_parent = value_tag.parent + if key_parent is not None and key_parent is value_parent: + return True + return False + + @staticmethod + def _extract_text_excluding_ids(tag: Tag, excluded_ids: set[str]) -> str: + def _extract(node: PageElement) -> list[str]: + if isinstance(node, NavigableString): + return [str(node)] + if isinstance(node, Tag): + node_id = node.get("id") + if node_id and node_id in excluded_ids: + return [] + parts: list[str] = [] + for child in node: + parts.extend(_extract(child)) + if node.name in {"p", "li"}: + parts.append(" ") + return parts + return [] + + return "".join(_extract(tag)) + + @staticmethod + def _normalize_form_text(text: str) -> tuple[str, str]: + raw = re.sub(r"\s+", " ", text).strip() + return raw, HTMLDocumentBackend._clean_unicode(raw) + + def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: + key_tags: dict[str, Tag] = {} + key_order: list[str] = [] + values_by_key: dict[str, list[tuple[Optional[int], int, Tag]]] = {} + value_order = 0 + + for tag in form_tag.find_all(id=True): + tag_id = tag.get("id") + if not isinstance(tag_id, str): + continue + + value_match = _FORM_VALUE_ID_RE.match(tag_id) + if value_match: + key_id = value_match.group("key_id") + value_id = value_match.group("value_id") + value_index = int(value_id) if value_id.isdigit() else None + value_order += 1 + values_by_key.setdefault(key_id, []).append( + (value_index, value_order, tag) + ) + continue + + key_match = _FORM_KEY_ID_RE.match(tag_id) + if key_match: + key_id = key_match.group("key_id") + if key_id not in key_tags: + key_tags[key_id] = tag + key_order.append(key_id) + + cells: list[GraphCell] = [] + links: list[GraphLink] = [] + cell_id_seq = 0 + + for key_id in key_order: + key_tag = key_tags[key_id] + value_entries = values_by_key.get(key_id, []) + # value_entries = [ + # entry + # for entry in value_entries + # if self._is_value_in_key_scope(key_tag, entry[2]) + # ] + value_entries.sort( + key=lambda entry: ( + entry[0] is None, + entry[0] if entry[0] is not None else entry[1], + entry[1], + ) + ) + value_tags = [entry[2] for entry in value_entries] + excluded_ids = { + tag_id + for tag_id in (tag.get("id") for tag in value_tags) + if isinstance(tag_id, str) + } + key_text_raw = self._extract_text_excluding_ids(key_tag, excluded_ids) + key_orig, key_text = self._normalize_form_text(key_text_raw) + if not key_text and not value_tags: + continue + + key_cell = GraphCell( + cell_id=cell_id_seq, + label=GraphCellLabel.KEY, + text=key_text, + orig=key_orig, + prov=self._make_prov(text=key_text, tag=key_tag), + ) + cells.append(key_cell) + cell_id_seq += 1 + + for value_tag in value_tags: + value_text_raw = HTMLDocumentBackend.get_text(value_tag) + value_orig, value_text = self._normalize_form_text(value_text_raw) + value_cell = GraphCell( + cell_id=cell_id_seq, + label=GraphCellLabel.VALUE, + text=value_text, + orig=value_orig, + prov=self._make_prov(text=value_text, tag=value_tag), + ) + cells.append(value_cell) + links.append( + GraphLink( + label=GraphLinkLabel.TO_VALUE, + source_cell_id=key_cell.cell_id, + target_cell_id=value_cell.cell_id, + ) + ) + cell_id_seq += 1 + + if not cells: + return None + return GraphData(cells=cells, links=links) + + def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: + added_refs: list[RefItem] = [] + form_graph = self._extract_form_graph(tag) + form_data = form_graph if form_graph is not None else GraphData() + form_prov = self._make_prov(text="", tag=tag) + form_item = doc.add_form( + graph=deepcopy(form_data), + prov=form_prov, + parent=self.parents[self.level], + ) + form_item.content_layer = self.content_layer + added_refs.append(form_item.get_ref()) + + if form_graph is not None: + kv_item = doc.add_key_values( + graph=form_graph, + prov=None, + parent=form_item, + ) + kv_item.content_layer = self.content_layer + added_refs.append(kv_item.get_ref()) + + with self._use_form_container(form_item): + added_refs.extend(self._walk(tag, doc)) + return added_refs + def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]: figure = img_tag.find_parent("figure") caption: AnnotatedTextList = AnnotatedTextList() From 6f208b2a4b73df322c1d397c03a74ee5998dc69d Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 17 Feb 2026 15:36:31 +0100 Subject: [PATCH 04/19] Allowing out of DOM scope values for the keys that don't have any other values, and restricting key-value only for the ones that satisfy scope if there are such. Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 8c7a7f13d2..f47bc1aef4 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1817,11 +1817,13 @@ def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: for key_id in key_order: key_tag = key_tags[key_id] value_entries = values_by_key.get(key_id, []) - # value_entries = [ - # entry - # for entry in value_entries - # if self._is_value_in_key_scope(key_tag, entry[2]) - # ] + in_scope_entries = [ + entry + for entry in value_entries + if self._is_value_in_key_scope(key_tag, entry[2]) + ] + if in_scope_entries: + value_entries = in_scope_entries value_entries.sort( key=lambda entry: ( entry[0] is None, From 6235814779d52ccf5713cdfdbde59f10e375ce3b Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 17 Feb 2026 17:02:05 +0100 Subject: [PATCH 05/19] improved key-value filtering, added padding to the pages as a parameter Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 65 ++++++++++++++++++++++++++++ docling/datamodel/backend_options.py | 6 +++ 2 files changed, 71 insertions(+) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index f47bc1aef4..5d27aebc88 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -495,6 +495,20 @@ def _render_with_browser(self) -> None: else: page.set_content(render_html, wait_until=options.render_wait_until) + if options.page_padding > 0: + page.evaluate( + """ + (padding) => { + if (!document || !document.body) { + return; + } + document.body.style.padding = `${padding}px`; + document.body.style.boxSizing = "border-box"; + } + """, + options.page_padding, + ) + if options.render_wait_ms: page.wait_for_timeout(options.render_wait_ms) @@ -1757,6 +1771,42 @@ def _is_value_in_key_scope(key_tag: Tag, value_tag: Tag) -> bool: return True return False + @staticmethod + def _get_table_cell(tag: Tag) -> Optional[Tag]: + parent_cell = tag.find_parent(["td", "th"]) + return parent_cell if isinstance(parent_cell, Tag) else None + + @staticmethod + def _is_bbox_within_any_table( + value_bbox: BoundingBox, table_bboxes: list[BoundingBox], threshold: float = 0.9 + ) -> bool: + for table_bbox in table_bboxes: + if value_bbox.intersection_over_self(table_bbox) >= threshold: + return True + return False + + def _should_ignore_table_kv_link( + self, key_tag: Tag, value_tag: Tag, table_bboxes: list[BoundingBox] + ) -> bool: + key_table = key_tag.find_parent("table") + value_table = value_tag.find_parent("table") + if key_table is None and value_table is not None: + return True + + key_cell = self._get_table_cell(key_tag) + value_cell = self._get_table_cell(value_tag) + if key_cell is not None and value_cell is not None and key_cell is not value_cell: + return True + + if key_table is None and value_table is None and table_bboxes: + value_rendered = self._get_rendered_bbox_for_tag(value_tag) + if value_rendered and self._is_bbox_within_any_table( + value_rendered.bbox, table_bboxes + ): + return True + + return False + @staticmethod def _extract_text_excluding_ids(tag: Tag, excluded_ids: set[str]) -> str: def _extract(node: PageElement) -> list[str]: @@ -1814,9 +1864,24 @@ def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: links: list[GraphLink] = [] cell_id_seq = 0 + table_bboxes: list[BoundingBox] = [] + if self._rendered_bbox_by_id: + for table_tag in form_tag.find_all("table"): + if isinstance(table_tag, Tag): + rendered = self._get_rendered_bbox_for_tag(table_tag) + if rendered is not None: + table_bboxes.append(rendered.bbox) + for key_id in key_order: key_tag = key_tags[key_id] value_entries = values_by_key.get(key_id, []) + value_entries = [ + entry + for entry in value_entries + if not self._should_ignore_table_kv_link( + key_tag, entry[2], table_bboxes + ) + ] in_scope_entries = [ entry for entry in value_entries diff --git a/docling/datamodel/backend_options.py b/docling/datamodel/backend_options.py index 94af677724..c6a2c76cda 100644 --- a/docling/datamodel/backend_options.py +++ b/docling/datamodel/backend_options.py @@ -57,6 +57,12 @@ class HTMLBackendOptions(BaseBackendOptions): render_device_scale: float = Field( 1.0, description="Device scale factor for rendering." ) + page_padding: int = Field( + 0, + description=( + "Padding in CSS pixels applied to the HTML body before rendering." + ), + ) render_full_page: bool = Field( False, description=("Capture a single full-height page image instead of paginating."), From 94703e459d03768ee1037e895fbd72250b1df873 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 17 Feb 2026 18:04:30 +0100 Subject: [PATCH 06/19] Tight bboxes around text when extracting key-values, correct page size and render scale compute, and an example on how to run html_backend with rendering Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 112 ++++++++++++++++-- .../run_with_formats_html_rendered.py | 92 ++++++++++++++ 2 files changed, 196 insertions(+), 8 deletions(-) create mode 100644 docs/examples/run_with_formats_html_rendered.py diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 5d27aebc88..96f52446d3 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -306,6 +306,7 @@ def __init__( self._raw_html_bytes: Optional[bytes] = None self._rendered_html: Optional[str] = None self._rendered_bbox_by_id: dict[str, _RenderedBBox] = {} + self._rendered_text_bbox_by_id: dict[str, _RenderedBBox] = {} self._rendered_page_images: list[Image.Image] = [] self._rendered_page_size: Optional[Size] = None @@ -517,6 +518,7 @@ def _render_with_browser(self) -> None: () => { const nodes = Array.from(document.querySelectorAll('*')); const boxes = {}; + const textBoxes = {}; let idx = 0; for (const node of nodes) { idx += 1; @@ -534,6 +536,59 @@ def _render_with_browser(self) -> None: const x = rect.left + window.scrollX; const y = rect.top + window.scrollY; boxes[id] = { x, y, width, height }; + + const walker = document.createTreeWalker( + node, + NodeFilter.SHOW_TEXT, + { + acceptNode: (textNode) => { + if (!textNode || !textNode.textContent) { + return NodeFilter.FILTER_REJECT; + } + return textNode.textContent.trim() + ? NodeFilter.FILTER_ACCEPT + : NodeFilter.FILTER_REJECT; + } + } + ); + let textLeft = null; + let textTop = null; + let textRight = null; + let textBottom = null; + while (walker.nextNode()) { + const range = document.createRange(); + range.selectNodeContents(walker.currentNode); + const rects = Array.from(range.getClientRects()); + for (const tRect of rects) { + const tWidth = tRect.width || 0; + const tHeight = tRect.height || 0; + if (tWidth <= 0 && tHeight <= 0) { + continue; + } + const tX = tRect.left + window.scrollX; + const tY = tRect.top + window.scrollY; + const tR = tX + tWidth; + const tB = tY + tHeight; + textLeft = textLeft === null ? tX : Math.min(textLeft, tX); + textTop = textTop === null ? tY : Math.min(textTop, tY); + textRight = textRight === null ? tR : Math.max(textRight, tR); + textBottom = textBottom === null ? tB : Math.max(textBottom, tB); + } + range.detach(); + } + if ( + textLeft !== null && + textTop !== null && + textRight !== null && + textBottom !== null + ) { + textBoxes[id] = { + x: textLeft, + y: textTop, + width: textRight - textLeft, + height: textBottom - textTop + }; + } } const doc = document.documentElement; const body = document.body; @@ -545,7 +600,7 @@ def _render_with_browser(self) -> None: doc ? doc.scrollHeight : 0, body ? body.scrollHeight : 0 ); - return { boxes, scrollWidth, scrollHeight }; + return { boxes, textBoxes, scrollWidth, scrollHeight }; } """ ) @@ -570,6 +625,16 @@ def _render_with_browser(self) -> None: else height, full_page=options.render_full_page, ) + self._rendered_text_bbox_by_id = self._build_bbox_mapping( + render_data={ + "boxes": render_data.get("textBoxes", {}), + "scrollHeight": render_data.get("scrollHeight"), + }, + page_height=int(self._rendered_page_size.height) + if self._rendered_page_size + else height, + full_page=options.render_full_page, + ) context.close() browser.close() @@ -665,6 +730,14 @@ def _get_rendered_bbox_for_tag(self, tag: Optional[Tag]) -> Optional[_RenderedBB return None return self._rendered_bbox_by_id.get(tag_id) + def _get_rendered_text_bbox_for_tag( + self, tag: Optional[Tag] + ) -> Optional[_RenderedBBox]: + tag_id = self._get_tag_id(tag) + if tag_id is None: + return None + return self._rendered_text_bbox_by_id.get(tag_id) + def _make_prov( self, text: str, @@ -688,6 +761,29 @@ def _make_prov( charspan=(0, len(text)), ) + def _make_text_prov( + self, + text: str, + tag: Optional[Tag] = None, + source_tag_id: Optional[str] = None, + ) -> Optional[ProvenanceItem]: + if not self._rendered_text_bbox_by_id: + return self._make_prov(text=text, tag=tag, source_tag_id=source_tag_id) + + render_box: Optional[_RenderedBBox] = None + if source_tag_id: + render_box = self._rendered_text_bbox_by_id.get(source_tag_id) + if render_box is None: + render_box = self._get_rendered_text_bbox_for_tag(tag) + if render_box is None: + return self._make_prov(text=text, tag=tag, source_tag_id=source_tag_id) + + return ProvenanceItem( + page_no=render_box.page_no, + bbox=render_box.bbox, + charspan=(0, len(text)), + ) + @staticmethod def _fix_invalid_paragraph_structure(soup: BeautifulSoup) -> None: """Rewrite

    elements that contain block-level breakers. @@ -1790,13 +1886,13 @@ def _should_ignore_table_kv_link( ) -> bool: key_table = key_tag.find_parent("table") value_table = value_tag.find_parent("table") - if key_table is None and value_table is not None: - return True - key_cell = self._get_table_cell(key_tag) value_cell = self._get_table_cell(value_tag) - if key_cell is not None and value_cell is not None and key_cell is not value_cell: - return True + if key_table is not None or value_table is not None: + if key_cell is None or value_cell is None: + return True + if key_cell is not value_cell: + return True if key_table is None and value_table is None and table_bboxes: value_rendered = self._get_rendered_bbox_for_tag(value_tag) @@ -1912,7 +2008,7 @@ def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: label=GraphCellLabel.KEY, text=key_text, orig=key_orig, - prov=self._make_prov(text=key_text, tag=key_tag), + prov=self._make_text_prov(text=key_text, tag=key_tag), ) cells.append(key_cell) cell_id_seq += 1 @@ -1925,7 +2021,7 @@ def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: label=GraphCellLabel.VALUE, text=value_text, orig=value_orig, - prov=self._make_prov(text=value_text, tag=value_tag), + prov=self._make_text_prov(text=value_text, tag=value_tag), ) cells.append(value_cell) links.append( diff --git a/docs/examples/run_with_formats_html_rendered.py b/docs/examples/run_with_formats_html_rendered.py new file mode 100644 index 0000000000..6823bc922b --- /dev/null +++ b/docs/examples/run_with_formats_html_rendered.py @@ -0,0 +1,92 @@ +import json +import logging +import time +from pathlib import Path +from docling_core.types.doc import ImageRefMode +from docling.datamodel.backend_options import HTMLBackendOptions +from docling.datamodel.base_models import InputFormat +from docling.document_converter import DocumentConverter, HTMLFormatOption +from docling.utils.visualization import draw_clusters + +_log = logging.getLogger(__name__) + +# Requires Playwright to be installed locally. + +def main() -> None: + directory_path = Path("example_html_forms_500/") + out_dir = Path("scratch/html_rendered/json") + out_dir_png = Path("scratch/html_rendered/png") + out_dir_viz = Path("scratch/html_rendered/viz") + + input_paths = sorted( + [file for file in directory_path.iterdir() if file.is_file()] + ) + + html_options = HTMLBackendOptions( + render_page=True, + # render_page_width=1588, + # ender_page_height=2246, + render_page_width=794, + render_page_height=100, + render_device_scale=2.0, + # render_page_height=1123, + render_page_orientation="portrait", + render_print_media=True, + render_wait_until="networkidle", + render_wait_ms=500, + render_full_page=True, + render_dpi=144, + page_padding=16, + enable_local_fetch=True, + fetch_images=True, + source_uri=input_paths[0].resolve(), + ) + + converter = DocumentConverter( + format_options={ + InputFormat.HTML: HTMLFormatOption(backend_options=html_options) + } + ) + timings: list[float] = [] + + out_dir.mkdir(parents=True, exist_ok=True) + out_dir_png.mkdir(parents=True, exist_ok=True) + out_dir_viz.mkdir(parents=True, exist_ok=True) + + for input_path in input_paths: + start = time.perf_counter() + res = converter.convert(input_path) + elapsed = time.perf_counter() - start + timings.append(elapsed) + print(f"Converted in {elapsed:.3f}s") + + doc = res.document + viz_pages = doc.get_visualization() + viz_pages2 = doc.get_visualization(viz_mode='key_value') + print(len(viz_pages)) + with (out_dir / f"{res.input.file.stem}.json").open("w") as fp: + fp.write(json.dumps(doc.export_to_dict())) + + page = doc.pages[1] + if page.image and page.image.pil_image: + page.image.pil_image.save( + out_dir_png / f"{res.input.file.stem}_page_{1}.png" + ) + + page_viz = viz_pages[1] + page_viz.save( + out_dir_viz / f"{res.input.file.stem}_page_{1}_viz.png" + ) + + page_viz = viz_pages2[1] + page_viz.save( + out_dir_viz / f"{res.input.file.stem}_page_{1}_viz_kvp.png" + ) + + if timings: + avg_time = sum(timings) / len(timings) + print(f"Average conversion time: {avg_time:.3f}s across {len(timings)} samples") + + +if __name__ == "__main__": + main() From f8eb3306105464222c3e8e5e9b607095314a96c8 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 20 Feb 2026 17:01:15 +0100 Subject: [PATCH 07/19] Updated logic for kvp extraction in html_backend, updated example Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 58 ++++++++++++------- .../run_with_formats_html_rendered.py | 31 +++++----- 2 files changed, 50 insertions(+), 39 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 96f52446d3..6ec599b313 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1119,12 +1119,12 @@ def _flush_buffer() -> None: seg_clean = HTMLDocumentBackend._clean_unicode( annotated_text.text.strip() ) - prov = self._make_prov( - text=seg_clean, - tag=element, - source_tag_id=annotated_text.source_tag_id, - ) if annotated_text.code: + prov = self._make_prov( + text=seg_clean, + tag=element, + source_tag_id=annotated_text.source_tag_id, + ) docling_code2 = doc.add_code( parent=self.parents[self.level], text=seg_clean, @@ -1135,6 +1135,11 @@ def _flush_buffer() -> None: ) added_refs.append(docling_code2.get_ref()) else: + prov = self._make_text_prov( + text=seg_clean, + tag=element, + source_tag_id=annotated_text.source_tag_id, + ) docling_text2 = doc.add_text( parent=self.parents[self.level], label=DocItemLabel.TEXT, @@ -1460,7 +1465,7 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: ) annotated_text = annotated_text_list.to_single_text_element() text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text) - prov = self._make_prov( + prov = self._make_text_prov( text=text_clean, tag=tag, source_tag_id=annotated_text.source_tag_id, @@ -1580,7 +1585,7 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: # 3) add the list item if li_text or inputs_in_li: if len(min_parts) > 1: - li_prov = self._make_prov(text=li_text, tag=li) + li_prov = self._make_text_prov(text=li_text, tag=li) # create an empty list element in order to hook the inline group onto that one self.parents[self.level + 1] = doc.add_list_item( text="", @@ -1597,12 +1602,12 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: r"\s+|\n+", " ", annotated_text.text ).strip() li_clean = HTMLDocumentBackend._clean_unicode(li_text) - prov = self._make_prov( - text=li_clean, - tag=li, - source_tag_id=annotated_text.source_tag_id, - ) if annotated_text.code: + prov = self._make_prov( + text=li_clean, + tag=li, + source_tag_id=annotated_text.source_tag_id, + ) doc.add_code( parent=self.parents[self.level], text=li_clean, @@ -1612,6 +1617,11 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: prov=prov, ) else: + prov = self._make_text_prov( + text=li_clean, + tag=li, + source_tag_id=annotated_text.source_tag_id, + ) doc.add_text( parent=self.parents[self.level], label=DocItemLabel.TEXT, @@ -1638,7 +1648,7 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: annotated_text = min_parts[0] li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip() li_clean = HTMLDocumentBackend._clean_unicode(li_text) - prov = self._make_prov( + prov = self._make_text_prov( text=li_clean, tag=li, source_tag_id=annotated_text.source_tag_id, @@ -1670,7 +1680,7 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: self.parents[self.level + 1] = None self.level -= 1 else: - li_prov = self._make_prov(text="", tag=li) + li_prov = self._make_text_prov(text="", tag=li) self.parents[self.level + 1] = doc.add_list_item( text="", enumerated=is_ordered, @@ -1756,12 +1766,12 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: for annotated_text in part: if seg := annotated_text.text.strip(): seg_clean = HTMLDocumentBackend._clean_unicode(seg) - prov = self._make_prov( - text=seg_clean, - tag=tag, - source_tag_id=annotated_text.source_tag_id, - ) if annotated_text.code: + prov = self._make_prov( + text=seg_clean, + tag=tag, + source_tag_id=annotated_text.source_tag_id, + ) docling_code = doc.add_code( parent=self.parents[self.level], text=seg_clean, @@ -1772,6 +1782,11 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: ) added_refs.append(docling_code.get_ref()) else: + prov = self._make_text_prov( + text=seg_clean, + tag=tag, + source_tag_id=annotated_text.source_tag_id, + ) docling_text = doc.add_text( parent=self.parents[self.level], label=DocItemLabel.TEXT, @@ -1851,9 +1866,8 @@ def _is_form_container(tag: Tag) -> bool: classes = tag.get("class") if not classes: return False - if isinstance(classes, str): - classes = [classes] - return _FORM_CONTAINER_CLASS in classes + class_values = [classes] if isinstance(classes, str) else classes + return _FORM_CONTAINER_CLASS in class_values @staticmethod def _is_value_in_key_scope(key_tag: Tag, value_tag: Tag) -> bool: diff --git a/docs/examples/run_with_formats_html_rendered.py b/docs/examples/run_with_formats_html_rendered.py index 6823bc922b..2a514a58c4 100644 --- a/docs/examples/run_with_formats_html_rendered.py +++ b/docs/examples/run_with_formats_html_rendered.py @@ -2,7 +2,9 @@ import logging import time from pathlib import Path + from docling_core.types.doc import ImageRefMode + from docling.datamodel.backend_options import HTMLBackendOptions from docling.datamodel.base_models import InputFormat from docling.document_converter import DocumentConverter, HTMLFormatOption @@ -12,15 +14,14 @@ # Requires Playwright to be installed locally. + def main() -> None: - directory_path = Path("example_html_forms_500/") - out_dir = Path("scratch/html_rendered/json") - out_dir_png = Path("scratch/html_rendered/png") - out_dir_viz = Path("scratch/html_rendered/viz") + input_html_path = Path("input_dir_to_html/") + out_dir = Path("ouput_dir/json") + out_dir_png = Path("ouput_dir/png") + out_dir_viz = Path("ouput_dir/viz") - input_paths = sorted( - [file for file in directory_path.iterdir() if file.is_file()] - ) + input_paths = sorted([file for file in input_html_path.iterdir() if file.is_file()]) html_options = HTMLBackendOptions( render_page=True, @@ -62,26 +63,22 @@ def main() -> None: doc = res.document viz_pages = doc.get_visualization() - viz_pages2 = doc.get_visualization(viz_mode='key_value') + viz_pages2 = doc.get_visualization(viz_mode="key_value") print(len(viz_pages)) with (out_dir / f"{res.input.file.stem}.json").open("w") as fp: fp.write(json.dumps(doc.export_to_dict())) page = doc.pages[1] if page.image and page.image.pil_image: - page.image.pil_image.save( - out_dir_png / f"{res.input.file.stem}_page_{1}.png" - ) + page.image.pil_image.save( + out_dir_png / f"{res.input.file.stem}_page_{1}.png" + ) page_viz = viz_pages[1] - page_viz.save( - out_dir_viz / f"{res.input.file.stem}_page_{1}_viz.png" - ) + page_viz.save(out_dir_viz / f"{res.input.file.stem}_page_{1}_viz.png") page_viz = viz_pages2[1] - page_viz.save( - out_dir_viz / f"{res.input.file.stem}_page_{1}_viz_kvp.png" - ) + page_viz.save(out_dir_viz / f"{res.input.file.stem}_page_{1}_viz_kvp.png") if timings: avg_time = sum(timings) / len(timings) From f378e5bbd4789a34aa3abd65d4e0f6bf15705a92 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 20 Feb 2026 17:07:40 +0100 Subject: [PATCH 08/19] Moved playwright into optional dependency Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 2 +- docs/getting_started/installation.md | 1 + pyproject.toml | 2 +- uv.lock | 16 +++++++++------- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 6ec599b313..46eedcc897 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -464,7 +464,7 @@ def _render_with_browser(self) -> None: except ImportError as exc: raise RuntimeError( "Playwright is required for HTML rendering. " - "Install it with 'pip install playwright' and run " + "Install it with 'pip install \"docling[htmlrender]\"' and run " "'playwright install'." ) from exc diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md index cb77184ce1..e7a6fc9e92 100644 --- a/docs/getting_started/installation.md +++ b/docs/getting_started/installation.md @@ -55,6 +55,7 @@ The following table summarizes the extras available in the `docling` package. Th | `easyocr` | Installs the [EasyOCR](https://github.com/JaidedAI/EasyOCR) OCR engine. | | `tesserocr` | Installs the Tesseract binding for using it as OCR engine. | | `ocrmac` | Installs the OcrMac OCR engine. | +| `htmlrender` | Installs dependencies for HTML page rendering in the HTML backend. | | `rapidocr` | Installs the [RapidOCR](https://github.com/RapidAI/RapidOCR) OCR engine with [onnxruntime](https://github.com/microsoft/onnxruntime/) backend. | diff --git a/pyproject.toml b/pyproject.toml index 7a7f23b9f0..d7b050f349 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,6 @@ dependencies = [ 'scipy (>=1.6.0,<2.0.0)', "accelerate>=1.0.0,<2", "polyfactory>=2.22.2", - "playwright>=1.58.0", ] [project.urls] @@ -91,6 +90,7 @@ docling-tools = "docling.cli.tools:app" easyocr = ['easyocr (>=1.7,<2.0)'] tesserocr = ['tesserocr (>=2.7.1,<3.0.0)'] ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"'] +htmlrender = ["playwright>=1.58.0"] vlm = [ 'transformers (>=4.46.0,<5.0.0)', 'accelerate (>=1.2.1,<2.0.0)', diff --git a/uv.lock b/uv.lock index 66bde450d4..9cfa162c03 100644 --- a/uv.lock +++ b/uv.lock @@ -943,7 +943,6 @@ dependencies = [ { name = "openpyxl" }, { name = "pandas" }, { name = "pillow" }, - { name = "playwright" }, { name = "pluggy" }, { name = "polyfactory" }, { name = "pydantic" }, @@ -969,6 +968,9 @@ asr = [ easyocr = [ { name = "easyocr" }, ] +htmlrender = [ + { name = "playwright" }, +] ocrmac = [ { name = "ocrmac", marker = "sys_platform == 'darwin'" }, ] @@ -1053,7 +1055,7 @@ requires-dist = [ { name = "openpyxl", specifier = ">=3.1.5,<4.0.0" }, { name = "pandas", specifier = ">=2.1.4,<3.0.0" }, { name = "pillow", specifier = ">=10.0.0,<12.0.0" }, - { name = "playwright", specifier = ">=1.58.0" }, + { name = "playwright", marker = "extra == 'htmlrender'", specifier = ">=1.58.0" }, { name = "pluggy", specifier = ">=1.0.0,<2.0.0" }, { name = "polyfactory", specifier = ">=2.22.2" }, { name = "pydantic", specifier = ">=2.0.0,<3.0.0" }, @@ -1073,7 +1075,7 @@ requires-dist = [ { name = "transformers", marker = "extra == 'vlm'", specifier = ">=4.46.0,<5.0.0" }, { name = "typer", specifier = ">=0.12.5,<0.22.0" }, ] -provides-extras = ["easyocr", "tesserocr", "ocrmac", "vlm", "rapidocr", "asr"] +provides-extras = ["easyocr", "tesserocr", "ocrmac", "htmlrender", "vlm", "rapidocr", "asr"] [package.metadata.requires-dev] constraints = [ @@ -6479,10 +6481,10 @@ dependencies = [ { name = "typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/e3/ea/304cf7afb744aa626fa9855245526484ee55aba610d9973a0521c552a843/torch-2.10.0-1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:c37fc46eedd9175f9c81814cc47308f1b42cfe4987e532d4b423d23852f2bf63", size = 79411450, upload-time = "2026-02-06T17:37:35.75Z" }, - { url = "https://files.pythonhosted.org/packages/25/d8/9e6b8e7df981a1e3ea3907fd5a74673e791da483e8c307f0b6ff012626d0/torch-2.10.0-1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:f699f31a236a677b3118bc0a3ef3d89c0c29b5ec0b20f4c4bf0b110378487464", size = 79423460, upload-time = "2026-02-06T17:37:39.657Z" }, - { url = "https://files.pythonhosted.org/packages/c9/2f/0b295dd8d199ef71e6f176f576473d645d41357b7b8aa978cc6b042575df/torch-2.10.0-1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6abb224c2b6e9e27b592a1c0015c33a504b00a0e0938f1499f7f514e9b7bfb5c", size = 79498197, upload-time = "2026-02-06T17:37:27.627Z" }, - { url = "https://files.pythonhosted.org/packages/a4/1b/af5fccb50c341bd69dc016769503cb0857c1423fbe9343410dfeb65240f2/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7350f6652dfd761f11f9ecb590bfe95b573e2961f7a242eccb3c8e78348d26fe", size = 79498248, upload-time = "2026-02-06T17:37:31.982Z" }, + { url = "https://files.pythonhosted.org/packages/5b/30/bfebdd8ec77db9a79775121789992d6b3b75ee5494971294d7b4b7c999bc/torch-2.10.0-2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2b980edd8d7c0a68c4e951ee1856334a43193f98730d97408fbd148c1a933313", size = 79411457, upload-time = "2026-02-10T21:44:59.189Z" }, + { url = "https://files.pythonhosted.org/packages/0f/8b/4b61d6e13f7108f36910df9ab4b58fd389cc2520d54d81b88660804aad99/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:418997cb02d0a0f1497cf6a09f63166f9f5df9f3e16c8a716ab76a72127c714f", size = 79423467, upload-time = "2026-02-10T21:44:48.711Z" }, + { url = "https://files.pythonhosted.org/packages/d3/54/a2ba279afcca44bbd320d4e73675b282fcee3d81400ea1b53934efca6462/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:13ec4add8c3faaed8d13e0574f5cd4a323c11655546f91fbe6afa77b57423574", size = 79498202, upload-time = "2026-02-10T21:44:52.603Z" }, + { url = "https://files.pythonhosted.org/packages/ec/23/2c9fe0c9c27f7f6cb865abcea8a4568f29f00acaeadfc6a37f6801f84cb4/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:e521c9f030a3774ed770a9c011751fb47c4d12029a3d6522116e48431f2ff89e", size = 79498254, upload-time = "2026-02-10T21:44:44.095Z" }, { url = "https://files.pythonhosted.org/packages/0c/1a/c61f36cfd446170ec27b3a4984f072fd06dab6b5d7ce27e11adb35d6c838/torch-2.10.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5276fa790a666ee8becaffff8acb711922252521b28fbce5db7db5cf9cb2026d", size = 145992962, upload-time = "2026-01-21T16:24:14.04Z" }, { url = "https://files.pythonhosted.org/packages/b5/60/6662535354191e2d1555296045b63e4279e5a9dbad49acf55a5d38655a39/torch-2.10.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aaf663927bcd490ae971469a624c322202a2a1e68936eb952535ca4cd3b90444", size = 915599237, upload-time = "2026-01-21T16:23:25.497Z" }, { url = "https://files.pythonhosted.org/packages/40/b8/66bbe96f0d79be2b5c697b2e0b187ed792a15c6c4b8904613454651db848/torch-2.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:a4be6a2a190b32ff5c8002a0977a25ea60e64f7ba46b1be37093c141d9c49aeb", size = 113720931, upload-time = "2026-01-21T16:24:23.743Z" }, From 8215a84cdd00bb4d1db7712bb252152f7eb8cd8b Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 23 Feb 2026 17:00:49 +0100 Subject: [PATCH 09/19] 1. Updated example on how to use HTML backend with page rendering; 2. Example that uses multi-processing for conversion; Signed-off-by: Maksym Lysak --- .../run_with_formats_html_rendered.py | 102 +++++--- .../run_with_formats_html_rendered_mp.py | 243 ++++++++++++++++++ 2 files changed, 314 insertions(+), 31 deletions(-) create mode 100644 docs/examples/run_with_formats_html_rendered_mp.py diff --git a/docs/examples/run_with_formats_html_rendered.py b/docs/examples/run_with_formats_html_rendered.py index 2a514a58c4..68b596a590 100644 --- a/docs/examples/run_with_formats_html_rendered.py +++ b/docs/examples/run_with_formats_html_rendered.py @@ -4,6 +4,7 @@ from pathlib import Path from docling_core.types.doc import ImageRefMode +from tqdm import tqdm from docling.datamodel.backend_options import HTMLBackendOptions from docling.datamodel.base_models import InputFormat @@ -23,6 +24,30 @@ def main() -> None: input_paths = sorted([file for file in input_html_path.iterdir() if file.is_file()]) + out_dir.mkdir(parents=True, exist_ok=True) + out_dir_png.mkdir(parents=True, exist_ok=True) + out_dir_viz.mkdir(parents=True, exist_ok=True) + + if not input_paths: + print(f"No input files found in {input_html_path}") + return + + pending_input_paths = [ + input_path + for input_path in input_paths + if not (out_dir / f"{input_path.stem}.json").exists() + ] + skipped_count = len(input_paths) - len(pending_input_paths) + + print( + f"Found {len(input_paths)} files. " + f"Skipping {skipped_count} already converted. " + f"Remaining: {len(pending_input_paths)}." + ) + + if not pending_input_paths: + return + html_options = HTMLBackendOptions( render_page=True, # render_page_width=1588, @@ -40,7 +65,7 @@ def main() -> None: page_padding=16, enable_local_fetch=True, fetch_images=True, - source_uri=input_paths[0].resolve(), + source_uri=pending_input_paths[0].resolve(), ) converter = DocumentConverter( @@ -49,40 +74,55 @@ def main() -> None: } ) timings: list[float] = [] - - out_dir.mkdir(parents=True, exist_ok=True) - out_dir_png.mkdir(parents=True, exist_ok=True) - out_dir_viz.mkdir(parents=True, exist_ok=True) - - for input_path in input_paths: - start = time.perf_counter() - res = converter.convert(input_path) - elapsed = time.perf_counter() - start - timings.append(elapsed) - print(f"Converted in {elapsed:.3f}s") - - doc = res.document - viz_pages = doc.get_visualization() - viz_pages2 = doc.get_visualization(viz_mode="key_value") - print(len(viz_pages)) - with (out_dir / f"{res.input.file.stem}.json").open("w") as fp: - fp.write(json.dumps(doc.export_to_dict())) - - page = doc.pages[1] - if page.image and page.image.pil_image: - page.image.pil_image.save( - out_dir_png / f"{res.input.file.stem}_page_{1}.png" - ) - - page_viz = viz_pages[1] - page_viz.save(out_dir_viz / f"{res.input.file.stem}_page_{1}_viz.png") - - page_viz = viz_pages2[1] - page_viz.save(out_dir_viz / f"{res.input.file.stem}_page_{1}_viz_kvp.png") + failed_files: list[Path] = [] + + with tqdm( + pending_input_paths, + total=len(pending_input_paths), + desc="HTML conversions", + unit="file", + ) as pbar: + for input_path in pbar: + pbar.set_postfix_str(input_path.name) + try: + start = time.perf_counter() + res = converter.convert(input_path) + elapsed = time.perf_counter() - start + timings.append(elapsed) + tqdm.write(f"{input_path.name}: converted in {elapsed:.3f}s") + + doc = res.document + viz_pages = doc.get_visualization() + viz_pages2 = doc.get_visualization(viz_mode="key_value") + tqdm.write(f"{input_path.name}: {len(viz_pages)} viz pages") + with (out_dir / f"{res.input.file.stem}.json").open("w") as fp: + fp.write(json.dumps(doc.export_to_dict())) + + page = doc.pages[1] + if page.image and page.image.pil_image: + page.image.pil_image.save( + out_dir_png / f"{res.input.file.stem}_page_{1}.png" + ) + + page_viz = viz_pages[1] + page_viz.save(out_dir_viz / f"{res.input.file.stem}_page_{1}_viz.png") + + page_viz = viz_pages2[1] + page_viz.save( + out_dir_viz / f"{res.input.file.stem}_page_{1}_viz_kvp.png" + ) + except Exception as exc: + failed_files.append(input_path) + _log.exception("Failed to convert %s: %s", input_path, exc) + tqdm.write(f"{input_path.name}: FAILED ({exc})") if timings: avg_time = sum(timings) / len(timings) print(f"Average conversion time: {avg_time:.3f}s across {len(timings)} samples") + if failed_files: + print(f"Failed files: {len(failed_files)}") + for failed_path in failed_files: + print(f" - {failed_path}") if __name__ == "__main__": diff --git a/docs/examples/run_with_formats_html_rendered_mp.py b/docs/examples/run_with_formats_html_rendered_mp.py new file mode 100644 index 0000000000..47878a8f1d --- /dev/null +++ b/docs/examples/run_with_formats_html_rendered_mp.py @@ -0,0 +1,243 @@ +import multiprocessing as mp +import os +import json +import logging +import time +import traceback +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path +from typing import Any + +from docling_core.types.doc import ImageRefMode +from tqdm import tqdm + +from docling.datamodel.backend_options import HTMLBackendOptions +from docling.datamodel.base_models import InputFormat +from docling.document_converter import DocumentConverter, HTMLFormatOption +from docling.utils.visualization import draw_clusters + +_log = logging.getLogger(__name__) +_WORKER_CONVERTER: DocumentConverter | None = None +_WORKER_OUT_DIR: Path | None = None +_WORKER_OUT_DIR_PNG: Path | None = None +_WORKER_OUT_DIR_VIZ: Path | None = None + +# Requires Playwright to be installed locally. + + +def _build_html_options(sample_source_uri: Path) -> HTMLBackendOptions: + return HTMLBackendOptions( + render_page=True, + # render_page_width=1588, + # ender_page_height=2246, + render_page_width=794, + render_page_height=100, + render_device_scale=2.0, + # render_page_height=1123, + render_page_orientation="portrait", + render_print_media=True, + render_wait_until="networkidle", + render_wait_ms=500, + render_full_page=True, + render_dpi=144, + page_padding=16, + enable_local_fetch=True, + fetch_images=True, + source_uri=sample_source_uri.resolve(), + ) + + +def _done_marker_path(input_path: Path, out_dir: Path) -> Path: + return out_dir / f"{input_path.stem}.done" + + +def _is_already_converted(input_path: Path, out_dir: Path) -> bool: + # Keep legacy JSON-only skip behavior and add a dedicated completion marker for MT runs. + return _done_marker_path(input_path, out_dir).exists() or ( + out_dir / f"{input_path.stem}.json" + ).exists() + + +def _init_worker( + sample_source_uri: str, out_dir: str, out_dir_png: str, out_dir_viz: str +) -> None: + global _WORKER_CONVERTER, _WORKER_OUT_DIR, _WORKER_OUT_DIR_PNG, _WORKER_OUT_DIR_VIZ + + _WORKER_OUT_DIR = Path(out_dir) + _WORKER_OUT_DIR_PNG = Path(out_dir_png) + _WORKER_OUT_DIR_VIZ = Path(out_dir_viz) + html_options = _build_html_options(Path(sample_source_uri)) + _WORKER_CONVERTER = DocumentConverter( + format_options={ + InputFormat.HTML: HTMLFormatOption(backend_options=html_options) + } + ) + + +def _write_text_atomic(path: Path, text: str) -> None: + tmp_path = path.parent / f".{path.name}.tmp.{os.getpid()}" + tmp_path.write_text(text) + tmp_path.replace(path) + + +def _convert_one(input_path_str: str) -> dict[str, Any]: + input_path = Path(input_path_str) + if ( + _WORKER_CONVERTER is None + or _WORKER_OUT_DIR is None + or _WORKER_OUT_DIR_PNG is None + or _WORKER_OUT_DIR_VIZ is None + ): + raise RuntimeError("Worker not initialized") + + try: + start = time.perf_counter() + res = _WORKER_CONVERTER.convert(input_path) + elapsed = time.perf_counter() - start + + doc = res.document + viz_pages = doc.get_visualization() + viz_pages2 = doc.get_visualization(viz_mode="key_value") + + stem = res.input.file.stem + json_path = _WORKER_OUT_DIR / f"{stem}.json" + _write_text_atomic(json_path, json.dumps(doc.export_to_dict())) + + page = doc.pages[1] + if page.image and page.image.pil_image: + page.image.pil_image.save(_WORKER_OUT_DIR_PNG / f"{stem}_page_{1}.png") + + page_viz = viz_pages[1] + page_viz.save(_WORKER_OUT_DIR_VIZ / f"{stem}_page_{1}_viz.png") + + page_viz = viz_pages2[1] + page_viz.save(_WORKER_OUT_DIR_VIZ / f"{stem}_page_{1}_viz_kvp.png") + + _write_text_atomic(_done_marker_path(input_path, _WORKER_OUT_DIR), "ok\n") + return { + "ok": True, + "file": input_path.name, + "elapsed": elapsed, + "viz_pages": len(viz_pages), + } + except Exception as exc: + return { + "ok": False, + "file": input_path.name, + "error": str(exc), + "traceback": traceback.format_exc(), + } + + +def main() -> None: + input_html_path = Path("input_dir_to_html/") + out_dir = Path("ouput_dir/json") + out_dir_png = Path("ouput_dir/png") + out_dir_viz = Path("ouput_dir/viz") + + input_paths = sorted([file for file in input_html_path.iterdir() if file.is_file()]) + + out_dir.mkdir(parents=True, exist_ok=True) + out_dir_png.mkdir(parents=True, exist_ok=True) + out_dir_viz.mkdir(parents=True, exist_ok=True) + + if not input_paths: + print(f"No input files found in {input_html_path}") + return + + pending_input_paths = [ + input_path + for input_path in input_paths + if not _is_already_converted(input_path, out_dir) + ] + skipped_count = len(input_paths) - len(pending_input_paths) + + print( + f"Found {len(input_paths)} files. " + f"Skipping {skipped_count} already converted. " + f"Remaining: {len(pending_input_paths)}." + ) + + if not pending_input_paths: + return + + timings: list[float] = [] + failed_files: list[Path] = [] + max_workers = min(4, max(1, int(os.environ.get("DOCLING_HTML_WORKERS", os.cpu_count() or 1)))) + print(f"Using {max_workers} worker process(es)") + + mp_ctx = mp.get_context("spawn") + with ProcessPoolExecutor( + max_workers=max_workers, + mp_context=mp_ctx, + initializer=_init_worker, + initargs=( + str(pending_input_paths[0]), + str(out_dir), + str(out_dir_png), + str(out_dir_viz), + ), + ) as executor: + futures = { + executor.submit(_convert_one, str(input_path)): input_path + for input_path in pending_input_paths + } + + success_count = 0 + with tqdm( + total=len(pending_input_paths), + desc="HTML conversions", + unit="file", + ) as pbar: + for future in as_completed(futures): + input_path = futures[future] + pbar.update(1) + try: + result = future.result() + except Exception as exc: + failed_files.append(input_path) + _log.exception("Worker crashed for %s: %s", input_path, exc) + tqdm.write(f"{input_path.name}: FAILED (worker crash: {exc})") + pbar.set_postfix( + ok=success_count, + failed=len(failed_files), + left=len(pending_input_paths) - pbar.n, + ) + continue + + if result.get("ok"): + success_count += 1 + elapsed = float(result["elapsed"]) + timings.append(elapsed) + tqdm.write( + f"{result['file']}: converted in {elapsed:.3f}s " + f"({result['viz_pages']} viz pages)" + ) + else: + failed_files.append(input_path) + _log.error( + "Failed to convert %s\n%s", + input_path, + result.get("traceback", result.get("error", "unknown error")), + ) + tqdm.write( + f"{result['file']}: FAILED ({result.get('error', 'unknown error')})" + ) + + pbar.set_postfix( + ok=success_count, + failed=len(failed_files), + left=len(pending_input_paths) - pbar.n, + ) + + if timings: + avg_time = sum(timings) / len(timings) + print(f"Average conversion time: {avg_time:.3f}s across {len(timings)} samples") + if failed_files: + print(f"Failed files: {len(failed_files)}") + for failed_path in failed_files: + print(f" - {failed_path}") + + +if __name__ == "__main__": + main() From 793ac636549dad5b82321bd88c6a94f0a1c82eb7 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 23 Feb 2026 17:18:02 +0100 Subject: [PATCH 10/19] fixes in mp HTML backend example Signed-off-by: Maksym Lysak --- .../examples/run_with_formats_html_rendered_mp.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/examples/run_with_formats_html_rendered_mp.py b/docs/examples/run_with_formats_html_rendered_mp.py index 47878a8f1d..e57aec8508 100644 --- a/docs/examples/run_with_formats_html_rendered_mp.py +++ b/docs/examples/run_with_formats_html_rendered_mp.py @@ -1,7 +1,7 @@ -import multiprocessing as mp -import os import json import logging +import multiprocessing as mp +import os import time import traceback from concurrent.futures import ProcessPoolExecutor, as_completed @@ -53,9 +53,10 @@ def _done_marker_path(input_path: Path, out_dir: Path) -> Path: def _is_already_converted(input_path: Path, out_dir: Path) -> bool: # Keep legacy JSON-only skip behavior and add a dedicated completion marker for MT runs. - return _done_marker_path(input_path, out_dir).exists() or ( - out_dir / f"{input_path.stem}.json" - ).exists() + return ( + _done_marker_path(input_path, out_dir).exists() + or (out_dir / f"{input_path.stem}.json").exists() + ) def _init_worker( @@ -163,7 +164,9 @@ def main() -> None: timings: list[float] = [] failed_files: list[Path] = [] - max_workers = min(4, max(1, int(os.environ.get("DOCLING_HTML_WORKERS", os.cpu_count() or 1)))) + max_workers = min( + 4, max(1, int(os.environ.get("DOCLING_HTML_WORKERS", os.cpu_count() or 1))) + ) print(f"Using {max_workers} worker process(es)") mp_ctx = mp.get_context("spawn") From 4c60e4ff7cb9769aea449aa490787d3444fa5640 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 27 Feb 2026 12:03:30 +0100 Subject: [PATCH 11/19] First implementation of new KV standard from docling-core, WIP Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 312 +++++++++++++++++++++++++++++--- pyproject.toml | 2 +- uv.lock | 11 +- 3 files changed, 296 insertions(+), 29 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 46eedcc897..63681585cf 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from io import BytesIO from pathlib import Path -from typing import Final, Optional, Union, cast +from typing import Any, Final, Literal, Optional, Union, cast from urllib.parse import urljoin, urlparse import requests @@ -148,6 +148,30 @@ class _RenderedBBox: bbox: BoundingBox +@dataclass +class _ExtractedFormValue: + tag: Tag + orig: str + text: str + prov: Optional[ProvenanceItem] + kind: Literal["read_only", "fillable"] = "read_only" + + +@dataclass +class _ExtractedFormField: + key_tag: Tag + key_orig: str + key_text: str + key_prov: Optional[ProvenanceItem] + values: list[_ExtractedFormValue] + + +@dataclass +class _ExtractedFormRegion: + fields: list[_ExtractedFormField] + consumed_tag_ids: set[str] + + class _Context(BaseModel): list_ordered_flag_by_ref: dict[str, bool] = {} list_start_by_ref: dict[str, int] = {} @@ -309,6 +333,8 @@ def __init__( self._rendered_text_bbox_by_id: dict[str, _RenderedBBox] = {} self._rendered_page_images: list[Image.Image] = [] self._rendered_page_size: Optional[Size] = None + self._suppressed_tag_ids_stack: list[set[str]] = [] + self._form_fields_by_key_id_stack: list[dict[str, _ExtractedFormField]] = [] try: raw = ( @@ -724,6 +750,15 @@ def _get_tag_id(self, tag: Optional[Tag]) -> Optional[str]: return None return str(tag_id) + @staticmethod + def _get_html_id(tag: Optional[Tag]) -> Optional[str]: + if tag is None: + return None + tag_id = tag.get("id") + if not isinstance(tag_id, str) or not tag_id: + return None + return tag_id + def _get_rendered_bbox_for_tag(self, tag: Optional[Tag]) -> Optional[_RenderedBBox]: tag_id = self._get_tag_id(tag) if tag_id is None: @@ -1153,7 +1188,31 @@ def _flush_buffer() -> None: for node in element.contents: if isinstance(node, Tag): + if form_field := self._consume_form_field_for_tag(node): + _flush_buffer() + added_refs.extend( + self._add_field_item_from_extracted( + field=form_field, + doc=doc, + parent=self.parents[self.level], + ) + ) + continue + if self._is_suppressed_tag(node): + continue name = node.name.lower() + has_block_descendants = bool(node.find(_BLOCK_TAGS) or node.find("input")) + is_atomic_node = name in _BLOCK_TAGS or not has_block_descendants + if is_atomic_node: + for field in self._consume_form_fields_in_subtree(node): + _flush_buffer() + added_refs.extend( + self._add_field_item_from_extracted( + field=field, + doc=doc, + parent=self.parents[self.level], + ) + ) if self._is_form_container(node): _flush_buffer() form_refs = self._handle_form_container(node, doc) @@ -1182,7 +1241,7 @@ def _flush_buffer() -> None: _flush_buffer() blk = self._handle_block(node, doc) added_refs.extend(blk) - elif node.find(_BLOCK_TAGS) or node.find("input"): + elif has_block_descendants: _flush_buffer() wk3 = self._walk(node, doc) added_refs.extend(wk3) @@ -1256,6 +1315,8 @@ def _extract_text_and_hyperlink_recursively( return AnnotatedTextList() if isinstance(item, NavigableString): + if isinstance(item.parent, Tag) and self._is_suppressed_tag(item.parent): + return AnnotatedTextList() text = item.strip() code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET) source_tag_id = ( @@ -1288,6 +1349,8 @@ def _extract_text_and_hyperlink_recursively( return AnnotatedTextList() tag = cast(Tag, item) + if self._is_suppressed_tag(tag): + return AnnotatedTextList() if not ignore_list or (tag.name not in ["ul", "ol"]): for child in tag: if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP: @@ -1941,7 +2004,131 @@ def _normalize_form_text(text: str) -> tuple[str, str]: raw = re.sub(r"\s+", " ", text).strip() return raw, HTMLDocumentBackend._clean_unicode(raw) - def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: + @staticmethod + def _infer_form_value_kind(value_tag: Tag) -> Literal["read_only", "fillable"]: + if value_tag.name in {"input", "select", "textarea"}: + return "fillable" + if value_tag.find(["input", "select", "textarea"]) is not None: + return "fillable" + return "read_only" + + @contextmanager + def _suppress_tag_ids(self, tag_ids: set[str]): + if not tag_ids: + yield None + return + self._suppressed_tag_ids_stack.append(tag_ids) + try: + yield None + finally: + self._suppressed_tag_ids_stack.pop() + + def _is_suppressed_tag(self, tag: Tag) -> bool: + tag_ids = set() + if html_id := self._get_html_id(tag): + tag_ids.add(html_id) + if docling_id := self._get_tag_id(tag): + tag_ids.add(docling_id) + if not tag_ids: + return False + return any(bool(ids & tag_ids) for ids in self._suppressed_tag_ids_stack) + + @contextmanager + def _use_form_fields_by_key_id(self, fields_by_key_id: dict[str, _ExtractedFormField]): + self._form_fields_by_key_id_stack.append(dict(fields_by_key_id)) + try: + yield None + finally: + self._form_fields_by_key_id_stack.pop() + + def _consume_form_field_for_tag(self, tag: Tag) -> Optional[_ExtractedFormField]: + tag_id = self._get_html_id(tag) + if tag_id is None: + return None + for field_map in reversed(self._form_fields_by_key_id_stack): + field = field_map.pop(tag_id, None) + if field is not None: + return field + return None + + def _consume_form_fields_in_subtree(self, tag: Tag) -> list[_ExtractedFormField]: + if not self._form_fields_by_key_id_stack: + return [] + field_map = self._form_fields_by_key_id_stack[-1] + extracted_fields: list[_ExtractedFormField] = [] + for key_id, field in list(field_map.items()): + key_tag = field.key_tag + if key_tag is tag or any(parent is tag for parent in key_tag.parents): + extracted_fields.append(field) + field_map.pop(key_id, None) + return extracted_fields + + def _is_lonely_key_covered_by_table(self, key_tag: Tag) -> bool: + key_tag_id = self._get_html_id(key_tag) + if key_tag_id is None: + return False + + table_cell = self._get_table_cell(key_tag) + if table_cell is None: + return False + + remaining_raw = self._extract_text_excluding_ids(table_cell, {key_tag_id}) + _, remaining_clean = self._normalize_form_text(remaining_raw) + if remaining_clean: + return False + + for descendant in table_cell.descendants: + if descendant is key_tag: + continue + if isinstance(descendant, Tag): + if any(parent is key_tag for parent in descendant.parents): + continue + return False + if isinstance(descendant, NavigableString): + if any(parent is key_tag for parent in descendant.parents): + continue + if str(descendant).strip(): + return False + + return True + + def _add_field_item_from_extracted( + self, + field: _ExtractedFormField, + doc: DoclingDocument, + parent: Optional[Union[DocItem, GroupItem]], + ) -> list[RefItem]: + refs: list[RefItem] = [] + doc_with_fields = cast(Any, doc) + field_item = doc_with_fields.add_field_item( + parent=parent, + content_layer=self.content_layer, + ) + refs.append(field_item.get_ref()) + + field_key = doc_with_fields.add_field_key( + text=field.key_text, + orig=field.key_orig, + prov=field.key_prov, + parent=field_item, + content_layer=self.content_layer, + ) + refs.append(field_key.get_ref()) + + for value in field.values: + field_value = doc_with_fields.add_field_value( + text=value.text, + orig=value.orig, + prov=value.prov, + parent=field_item, + content_layer=self.content_layer, + kind=value.kind, + ) + refs.append(field_value.get_ref()) + + return refs + + def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: key_tags: dict[str, Tag] = {} key_order: list[str] = [] values_by_key: dict[str, list[tuple[Optional[int], int, Tag]]] = {} @@ -1958,9 +2145,7 @@ def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: value_id = value_match.group("value_id") value_index = int(value_id) if value_id.isdigit() else None value_order += 1 - values_by_key.setdefault(key_id, []).append( - (value_index, value_order, tag) - ) + values_by_key.setdefault(key_id, []).append((value_index, value_order, tag)) continue key_match = _FORM_KEY_ID_RE.match(tag_id) @@ -1970,9 +2155,8 @@ def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: key_tags[key_id] = tag key_order.append(key_id) - cells: list[GraphCell] = [] - links: list[GraphLink] = [] - cell_id_seq = 0 + fields: list[_ExtractedFormField] = [] + consumed_tag_ids: set[str] = set() table_bboxes: list[BoundingBox] = [] if self._rendered_bbox_by_id: @@ -1988,9 +2172,7 @@ def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: value_entries = [ entry for entry in value_entries - if not self._should_ignore_table_kv_link( - key_tag, entry[2], table_bboxes - ) + if not self._should_ignore_table_kv_link(key_tag, entry[2], table_bboxes) ] in_scope_entries = [ entry @@ -2017,25 +2199,66 @@ def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: if not key_text and not value_tags: continue + values: list[_ExtractedFormValue] = [] + for value_tag in value_tags: + value_text_raw = HTMLDocumentBackend.get_text(value_tag) + value_orig, value_text = self._normalize_form_text(value_text_raw) + values.append( + _ExtractedFormValue( + tag=value_tag, + orig=value_orig, + text=value_text, + prov=self._make_text_prov(text=value_text, tag=value_tag), + kind=self._infer_form_value_kind(value_tag), + ) + ) + value_tag_id = self._get_html_id(value_tag) + if value_tag_id is not None: + consumed_tag_ids.add(value_tag_id) + + fields.append( + _ExtractedFormField( + key_tag=key_tag, + key_orig=key_orig, + key_text=key_text, + key_prov=self._make_text_prov(text=key_text, tag=key_tag), + values=values, + ) + ) + key_tag_id = self._get_html_id(key_tag) + if key_tag_id is not None: + consumed_tag_ids.add(key_tag_id) + + if not fields: + return None + return _ExtractedFormRegion(fields=fields, consumed_tag_ids=consumed_tag_ids) + + def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: + extracted = self._extract_form_region(form_tag) + if extracted is None: + return None + + cells: list[GraphCell] = [] + links: list[GraphLink] = [] + cell_id_seq = 0 + for field in extracted.fields: key_cell = GraphCell( cell_id=cell_id_seq, label=GraphCellLabel.KEY, - text=key_text, - orig=key_orig, - prov=self._make_text_prov(text=key_text, tag=key_tag), + text=field.key_text, + orig=field.key_orig, + prov=field.key_prov, ) cells.append(key_cell) cell_id_seq += 1 - for value_tag in value_tags: - value_text_raw = HTMLDocumentBackend.get_text(value_tag) - value_orig, value_text = self._normalize_form_text(value_text_raw) + for value in field.values: value_cell = GraphCell( cell_id=cell_id_seq, label=GraphCellLabel.VALUE, - text=value_text, - orig=value_orig, - prov=self._make_text_prov(text=value_text, tag=value_tag), + text=value.text, + orig=value.orig, + prov=value.prov, ) cells.append(value_cell) links.append( @@ -2053,6 +2276,51 @@ def _extract_form_graph(self, form_tag: Tag) -> Optional[GraphData]: def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: added_refs: list[RefItem] = [] + supports_field_kv = all( + hasattr(doc, method_name) + for method_name in ( + "add_field_region", + "add_field_item", + "add_field_key", + "add_field_value", + ) + ) + + if supports_field_kv: + doc_with_fields = cast(Any, doc) + form_region = self._extract_form_region(tag) + region_prov = self._make_prov(text="", tag=tag) + field_region = doc_with_fields.add_field_region( + prov=region_prov, + parent=self.parents[self.level], + ) + field_region.content_layer = self.content_layer + added_refs.append(field_region.get_ref()) + + consumed_tag_ids: set[str] = set() + fields_by_key_id: dict[str, _ExtractedFormField] = {} + if form_region is not None: + for field in form_region.fields: + key_tag_id = self._get_html_id(field.key_tag) + if key_tag_id is None: + continue + if not field.values: + if self._is_lonely_key_covered_by_table(field.key_tag): + consumed_tag_ids.add(key_tag_id) + continue + fields_by_key_id[key_tag_id] = field + consumed_tag_ids.add(key_tag_id) + for value in field.values: + value_tag_id = self._get_html_id(value.tag) + if value_tag_id is not None: + consumed_tag_ids.add(value_tag_id) + + with self._use_form_container(field_region): + with self._use_form_fields_by_key_id(fields_by_key_id): + with self._suppress_tag_ids(consumed_tag_ids): + added_refs.extend(self._walk(tag, doc)) + return added_refs + form_graph = self._extract_form_graph(tag) form_data = form_graph if form_graph is not None else GraphData() form_prov = self._make_prov(text="", tag=tag) @@ -2156,6 +2424,8 @@ def get_img_hyperlink(img_tag): return docling_pic.get_ref() def _emit_input(self, input_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]: + if self._is_suppressed_tag(input_tag): + return None input_type = self._get_attr_as_string(input_tag, "type").lower() if input_type == "hidden": return None diff --git a/pyproject.toml b/pyproject.toml index d7b050f349..68d52ba8f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ authors = [ requires-python = '>=3.10,<4.0' dependencies = [ 'pydantic (>=2.0.0,<3.0.0)', - 'docling-core[chunking] (>=2.62.0,<3.0.0)', + 'docling-core[chunking] @ git+https://github.com/docling-project/docling-core.git@new-kv', 'docling-parse (>=4.7.0,<5.0.0)', "docling-ibm-models>=3.9.1,<4", 'filetype (>=1.2.0,<2.0.0)', diff --git a/uv.lock b/uv.lock index 9cfa162c03..7356b68247 100644 --- a/uv.lock +++ b/uv.lock @@ -1038,7 +1038,7 @@ requires-dist = [ { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" }, { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" }, { name = "certifi", specifier = ">=2024.7.4" }, - { name = "docling-core", extras = ["chunking"], specifier = ">=2.62.0,<3.0.0" }, + { name = "docling-core", extras = ["chunking"], git = "https://github.com/docling-project/docling-core.git?rev=new-kv" }, { name = "docling-ibm-models", specifier = ">=3.9.1,<4" }, { name = "docling-parse", specifier = ">=4.7.0,<5.0.0" }, { name = "easyocr", marker = "extra == 'easyocr'", specifier = ">=1.7,<2.0" }, @@ -1123,9 +1123,10 @@ examples = [ [[package]] name = "docling-core" -version = "2.62.0" -source = { registry = "https://pypi.org/simple" } +version = "2.65.1" +source = { git = "https://github.com/docling-project/docling-core.git?rev=new-kv#3bdb8e9341c4cfff008cca8e18d3c43cb907e8af" } dependencies = [ + { name = "defusedxml" }, { name = "jsonref" }, { name = "jsonschema" }, { name = "latex2mathml" }, @@ -1137,10 +1138,6 @@ dependencies = [ { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e0/21/20d58a48f4baa9e16d49aaccf3048346a8e7833b65b09144315bf1d956db/docling_core-2.62.0.tar.gz", hash = "sha256:147c958fe3b552db5e78b5a301dba19349820066ec5ef189b67eb5ed00306a07", size = 250107, upload-time = "2026-01-30T14:01:44.448Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/89/e5204af5669e6b73bfdf304fc3e4c6b4b98b10d06b8bd7dc186b5190c9f3/docling_core-2.62.0-py3-none-any.whl", hash = "sha256:0073ccbd0c9cf514b38be7d53ccd78ee7b92723294a623a3f36eb7a7aea67bf0", size = 238084, upload-time = "2026-01-30T14:01:43.059Z" }, -] [package.optional-dependencies] chunking = [ From 876a68664826f2aa4a529de202f9834056460713 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 27 Feb 2026 12:20:37 +0100 Subject: [PATCH 12/19] Fixed tables, and added value extraction for the HTML input elements when inside key-value pair Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 63681585cf..b06c4845f6 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -2012,6 +2012,18 @@ def _infer_form_value_kind(value_tag: Tag) -> Literal["read_only", "fillable"]: return "fillable" return "read_only" + @staticmethod + def _extract_form_value_text(value_tag: Tag) -> str: + # Input elements carry their user-visible content in attributes, not inner text. + if value_tag.name == "input": + for attr in ("value", "placeholder", "name"): + val = value_tag.get(attr) + if isinstance(val, str) and val.strip(): + return val.strip() + return "" + + return HTMLDocumentBackend.get_text(value_tag) + @contextmanager def _suppress_tag_ids(self, tag_ids: set[str]): if not tag_ids: @@ -2201,7 +2213,7 @@ def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: values: list[_ExtractedFormValue] = [] for value_tag in value_tags: - value_text_raw = HTMLDocumentBackend.get_text(value_tag) + value_text_raw = self._extract_form_value_text(value_tag) value_orig, value_text = self._normalize_form_text(value_text_raw) values.append( _ExtractedFormValue( @@ -2318,7 +2330,10 @@ def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem with self._use_form_container(field_region): with self._use_form_fields_by_key_id(fields_by_key_id): with self._suppress_tag_ids(consumed_tag_ids): - added_refs.extend(self._walk(tag, doc)) + if tag.name.lower() == "table": + added_refs.extend(self._handle_block(tag, doc)) + else: + added_refs.extend(self._walk(tag, doc)) return added_refs form_graph = self._extract_form_graph(tag) @@ -2342,7 +2357,10 @@ def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem added_refs.append(kv_item.get_ref()) with self._use_form_container(form_item): - added_refs.extend(self._walk(tag, doc)) + if tag.name.lower() == "table": + added_refs.extend(self._handle_block(tag, doc)) + else: + added_refs.extend(self._walk(tag, doc)) return added_refs def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]: From 35ade0d8f571bc52c29f41cc842f5bbb139ba5f7 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 2 Mar 2026 18:09:39 +0100 Subject: [PATCH 13/19] HTML_backend: Added support of markers for field_items, respected reading order inside the field_item Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 170 ++++++++++++++++++++++++++------ uv.lock | 2 +- 2 files changed, 141 insertions(+), 31 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index b06c4845f6..273d5a97e7 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -137,6 +137,7 @@ _DATA_DOCLING_ID_ATTR: Final = "data-docling-id" _FORM_CONTAINER_CLASS: Final = "form_region" _FORM_KEY_ID_RE: Final = re.compile(r"^key(?P[A-Za-z0-9]+)$") +_FORM_MARKER_ID_RE: Final = re.compile(r"^key(?P[A-Za-z0-9]+)_marker$") _FORM_VALUE_ID_RE: Final = re.compile( r"^key(?P[A-Za-z0-9]+)_value(?P[A-Za-z0-9]+)$" ) @@ -151,18 +152,30 @@ class _RenderedBBox: @dataclass class _ExtractedFormValue: tag: Tag + order: int orig: str text: str prov: Optional[ProvenanceItem] kind: Literal["read_only", "fillable"] = "read_only" +@dataclass +class _ExtractedFormMarker: + tag: Tag + order: int + orig: str + text: str + prov: Optional[ProvenanceItem] + + @dataclass class _ExtractedFormField: key_tag: Tag + key_order: int key_orig: str key_text: str key_prov: Optional[ProvenanceItem] + marker: Optional[_ExtractedFormMarker] values: list[_ExtractedFormValue] @@ -1201,7 +1214,9 @@ def _flush_buffer() -> None: if self._is_suppressed_tag(node): continue name = node.name.lower() - has_block_descendants = bool(node.find(_BLOCK_TAGS) or node.find("input")) + has_block_descendants = bool( + node.find(_BLOCK_TAGS) or node.find("input") + ) is_atomic_node = name in _BLOCK_TAGS or not has_block_descendants if is_atomic_node: for field in self._consume_form_fields_in_subtree(node): @@ -2024,6 +2039,10 @@ def _extract_form_value_text(value_tag: Tag) -> str: return HTMLDocumentBackend.get_text(value_tag) + @staticmethod + def _extract_form_marker_text(marker_tag: Tag) -> str: + return HTMLDocumentBackend.get_text(marker_tag) + @contextmanager def _suppress_tag_ids(self, tag_ids: set[str]): if not tag_ids: @@ -2046,7 +2065,9 @@ def _is_suppressed_tag(self, tag: Tag) -> bool: return any(bool(ids & tag_ids) for ids in self._suppressed_tag_ids_stack) @contextmanager - def _use_form_fields_by_key_id(self, fields_by_key_id: dict[str, _ExtractedFormField]): + def _use_form_fields_by_key_id( + self, fields_by_key_id: dict[str, _ExtractedFormField] + ): self._form_fields_by_key_id_stack.append(dict(fields_by_key_id)) try: yield None @@ -2060,6 +2081,9 @@ def _consume_form_field_for_tag(self, tag: Tag) -> Optional[_ExtractedFormField] for field_map in reversed(self._form_fields_by_key_id_stack): field = field_map.pop(tag_id, None) if field is not None: + for mapped_tag_id, mapped_field in list(field_map.items()): + if mapped_field is field: + field_map.pop(mapped_tag_id, None) return field return None @@ -2068,11 +2092,24 @@ def _consume_form_fields_in_subtree(self, tag: Tag) -> list[_ExtractedFormField] return [] field_map = self._form_fields_by_key_id_stack[-1] extracted_fields: list[_ExtractedFormField] = [] - for key_id, field in list(field_map.items()): - key_tag = field.key_tag - if key_tag is tag or any(parent is tag for parent in key_tag.parents): + consumed_field_ids: set[int] = set() + for _, field in list(field_map.items()): + field_obj_id = id(field) + if field_obj_id in consumed_field_ids: + continue + field_tags = [field.key_tag] + if field.marker is not None: + field_tags.append(field.marker.tag) + field_tags.extend(value.tag for value in field.values) + if any( + field_tag is tag or any(parent is tag for parent in field_tag.parents) + for field_tag in field_tags + ): extracted_fields.append(field) - field_map.pop(key_id, None) + consumed_field_ids.add(field_obj_id) + for pop_tag_id, pop_field in list(field_map.items()): + if pop_field is field: + field_map.pop(pop_tag_id, None) return extracted_fields def _is_lonely_key_covered_by_table(self, key_tag: Tag) -> bool: @@ -2118,46 +2155,77 @@ def _add_field_item_from_extracted( ) refs.append(field_item.get_ref()) - field_key = doc_with_fields.add_field_key( - text=field.key_text, - orig=field.key_orig, - prov=field.key_prov, - parent=field_item, - content_layer=self.content_layer, - ) - refs.append(field_key.get_ref()) - + parts: list[tuple[int, Literal["key", "marker", "value"], Any]] = [ + (field.key_order, "key", field) + ] + if field.marker is not None: + parts.append((field.marker.order, "marker", field.marker)) for value in field.values: - field_value = doc_with_fields.add_field_value( - text=value.text, - orig=value.orig, - prov=value.prov, - parent=field_item, - content_layer=self.content_layer, - kind=value.kind, - ) - refs.append(field_value.get_ref()) + parts.append((value.order, "value", value)) + + for _, part_type, payload in sorted(parts, key=lambda part: part[0]): + if part_type == "key": + field_key = doc_with_fields.add_field_key( + text=field.key_text, + orig=field.key_orig, + prov=field.key_prov, + parent=field_item, + content_layer=self.content_layer, + ) + refs.append(field_key.get_ref()) + elif part_type == "marker": + marker = cast(_ExtractedFormMarker, payload) + marker_item = doc.add_text( + label=DocItemLabel.MARKER, + text=marker.text, + orig=marker.orig, + prov=marker.prov, + parent=field_item, + content_layer=self.content_layer, + ) + refs.append(marker_item.get_ref()) + else: + value = cast(_ExtractedFormValue, payload) + field_value = doc_with_fields.add_field_value( + text=value.text, + orig=value.orig, + prov=value.prov, + parent=field_item, + content_layer=self.content_layer, + kind=value.kind, + ) + refs.append(field_value.get_ref()) return refs def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: key_tags: dict[str, Tag] = {} + key_orders: dict[str, int] = {} key_order: list[str] = [] + markers_by_key: dict[str, list[tuple[int, Tag]]] = {} values_by_key: dict[str, list[tuple[Optional[int], int, Tag]]] = {} - value_order = 0 + dom_order = 0 for tag in form_tag.find_all(id=True): tag_id = tag.get("id") if not isinstance(tag_id, str): continue + dom_order += 1 value_match = _FORM_VALUE_ID_RE.match(tag_id) if value_match: key_id = value_match.group("key_id") value_id = value_match.group("value_id") value_index = int(value_id) if value_id.isdigit() else None - value_order += 1 - values_by_key.setdefault(key_id, []).append((value_index, value_order, tag)) + values_by_key.setdefault(key_id, []).append( + (value_index, dom_order, tag) + ) + continue + + marker_match = _FORM_MARKER_ID_RE.match(tag_id) + if marker_match: + key_id = marker_match.group("key_id") + markers_by_key.setdefault(key_id, []).append((dom_order, tag)) continue key_match = _FORM_KEY_ID_RE.match(tag_id) @@ -2165,6 +2233,7 @@ def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: key_id = key_match.group("key_id") if key_id not in key_tags: key_tags[key_id] = tag + key_orders[key_id] = dom_order key_order.append(key_id) fields: list[_ExtractedFormField] = [] @@ -2184,7 +2253,9 @@ def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: value_entries = [ entry for entry in value_entries - if not self._should_ignore_table_kv_link(key_tag, entry[2], table_bboxes) + if not self._should_ignore_table_kv_link( + key_tag, entry[2], table_bboxes + ) ] in_scope_entries = [ entry @@ -2200,24 +2271,55 @@ def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: entry[1], ) ) + + marker: Optional[_ExtractedFormMarker] = None + marker_entries = markers_by_key.get(key_id, []) + in_scope_markers = [ + entry + for entry in marker_entries + if self._is_value_in_key_scope(key_tag, entry[1]) + ] + if in_scope_markers: + marker_entries = in_scope_markers + marker_entries.sort(key=lambda entry: entry[0]) + if marker_entries: + marker_tag = marker_entries[0][1] + marker_text_raw = self._extract_form_marker_text(marker_tag) + marker_orig, marker_text = self._normalize_form_text(marker_text_raw) + marker = _ExtractedFormMarker( + tag=marker_tag, + order=marker_entries[0][0], + orig=marker_orig, + text=marker_text, + prov=self._make_text_prov(text=marker_text, tag=marker_tag), + ) + marker_tag_id = self._get_html_id(marker_tag) + if marker_tag_id is not None: + consumed_tag_ids.add(marker_tag_id) + value_tags = [entry[2] for entry in value_entries] excluded_ids = { tag_id for tag_id in (tag.get("id") for tag in value_tags) if isinstance(tag_id, str) } + if marker is not None: + marker_id = self._get_html_id(marker.tag) + if marker_id is not None: + excluded_ids.add(marker_id) key_text_raw = self._extract_text_excluding_ids(key_tag, excluded_ids) key_orig, key_text = self._normalize_form_text(key_text_raw) if not key_text and not value_tags: continue values: list[_ExtractedFormValue] = [] - for value_tag in value_tags: + for _, value_order, value_tag in value_entries: value_text_raw = self._extract_form_value_text(value_tag) value_orig, value_text = self._normalize_form_text(value_text_raw) values.append( _ExtractedFormValue( tag=value_tag, + order=value_order, orig=value_orig, text=value_text, prov=self._make_text_prov(text=value_text, tag=value_tag), @@ -2231,9 +2333,11 @@ def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: fields.append( _ExtractedFormField( key_tag=key_tag, + key_order=key_orders.get(key_id, 0), key_orig=key_orig, key_text=key_text, key_prov=self._make_text_prov(text=key_text, tag=key_tag), + marker=marker, values=values, ) ) @@ -2320,12 +2424,18 @@ def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem if self._is_lonely_key_covered_by_table(field.key_tag): consumed_tag_ids.add(key_tag_id) continue - fields_by_key_id[key_tag_id] = field consumed_tag_ids.add(key_tag_id) + fields_by_key_id[key_tag_id] = field + if field.marker is not None: + marker_tag_id = self._get_html_id(field.marker.tag) + if marker_tag_id is not None: + consumed_tag_ids.add(marker_tag_id) + fields_by_key_id[marker_tag_id] = field for value in field.values: value_tag_id = self._get_html_id(value.tag) if value_tag_id is not None: consumed_tag_ids.add(value_tag_id) + fields_by_key_id[value_tag_id] = field with self._use_form_container(field_region): with self._use_form_fields_by_key_id(fields_by_key_id): diff --git a/uv.lock b/uv.lock index 7356b68247..fdaeddbe37 100644 --- a/uv.lock +++ b/uv.lock @@ -1124,7 +1124,7 @@ examples = [ [[package]] name = "docling-core" version = "2.65.1" -source = { git = "https://github.com/docling-project/docling-core.git?rev=new-kv#3bdb8e9341c4cfff008cca8e18d3c43cb907e8af" } +source = { git = "https://github.com/docling-project/docling-core.git?rev=new-kv#c5f3266235a663473ab9ff04afe55e9acc9a510b" } dependencies = [ { name = "defusedxml" }, { name = "jsonref" }, From c18bda1aac325910bbfe517d6caccc5cab877e33 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 3 Mar 2026 13:31:42 +0100 Subject: [PATCH 14/19] multiple fixes of kvp handling in html_backend, corrected MP example Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 33 ++++++++++------- .../run_with_formats_html_rendered_mp.py | 35 ++++++++++++------- 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 273d5a97e7..b76bb08269 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -2405,14 +2405,6 @@ def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem if supports_field_kv: doc_with_fields = cast(Any, doc) form_region = self._extract_form_region(tag) - region_prov = self._make_prov(text="", tag=tag) - field_region = doc_with_fields.add_field_region( - prov=region_prov, - parent=self.parents[self.level], - ) - field_region.content_layer = self.content_layer - added_refs.append(field_region.get_ref()) - consumed_tag_ids: set[str] = set() fields_by_key_id: dict[str, _ExtractedFormField] = {} if form_region is not None: @@ -2437,12 +2429,29 @@ def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem consumed_tag_ids.add(value_tag_id) fields_by_key_id[value_tag_id] = field + if not fields_by_key_id: + if tag.name.lower() == "table": + added_refs.extend(self._handle_block(tag, doc)) + else: + with self._suppress_tag_ids(consumed_tag_ids): + added_refs.extend(self._walk(tag, doc)) + return added_refs + + region_prov = self._make_prov(text="", tag=tag) + field_region = doc_with_fields.add_field_region( + prov=region_prov, + parent=self.parents[self.level], + ) + field_region.content_layer = self.content_layer + added_refs.append(field_region.get_ref()) + with self._use_form_container(field_region): with self._use_form_fields_by_key_id(fields_by_key_id): - with self._suppress_tag_ids(consumed_tag_ids): - if tag.name.lower() == "table": - added_refs.extend(self._handle_block(tag, doc)) - else: + if tag.name.lower() == "table": + # For table-form containers, keep cell content visible to rich-cell parsing. + added_refs.extend(self._handle_block(tag, doc)) + else: + with self._suppress_tag_ids(consumed_tag_ids): added_refs.extend(self._walk(tag, doc)) return added_refs diff --git a/docs/examples/run_with_formats_html_rendered_mp.py b/docs/examples/run_with_formats_html_rendered_mp.py index e57aec8508..39f5e08853 100644 --- a/docs/examples/run_with_formats_html_rendered_mp.py +++ b/docs/examples/run_with_formats_html_rendered_mp.py @@ -19,6 +19,7 @@ _log = logging.getLogger(__name__) _WORKER_CONVERTER: DocumentConverter | None = None _WORKER_OUT_DIR: Path | None = None +_WORKER_OUT_DIR_HTML: Path | None = None _WORKER_OUT_DIR_PNG: Path | None = None _WORKER_OUT_DIR_VIZ: Path | None = None @@ -47,24 +48,26 @@ def _build_html_options(sample_source_uri: Path) -> HTMLBackendOptions: ) -def _done_marker_path(input_path: Path, out_dir: Path) -> Path: - return out_dir / f"{input_path.stem}.done" - - def _is_already_converted(input_path: Path, out_dir: Path) -> bool: - # Keep legacy JSON-only skip behavior and add a dedicated completion marker for MT runs. - return ( - _done_marker_path(input_path, out_dir).exists() - or (out_dir / f"{input_path.stem}.json").exists() - ) + return (out_dir / f"{input_path.stem}.json").exists() def _init_worker( - sample_source_uri: str, out_dir: str, out_dir_png: str, out_dir_viz: str + sample_source_uri: str, + out_dir: str, + out_dir_html: str, + out_dir_png: str, + out_dir_viz: str, ) -> None: - global _WORKER_CONVERTER, _WORKER_OUT_DIR, _WORKER_OUT_DIR_PNG, _WORKER_OUT_DIR_VIZ + global \ + _WORKER_CONVERTER, \ + _WORKER_OUT_DIR, \ + _WORKER_OUT_DIR_HTML, \ + _WORKER_OUT_DIR_PNG, \ + _WORKER_OUT_DIR_VIZ _WORKER_OUT_DIR = Path(out_dir) + _WORKER_OUT_DIR_HTML = Path(out_dir_html) _WORKER_OUT_DIR_PNG = Path(out_dir_png) _WORKER_OUT_DIR_VIZ = Path(out_dir_viz) html_options = _build_html_options(Path(sample_source_uri)) @@ -86,6 +89,7 @@ def _convert_one(input_path_str: str) -> dict[str, Any]: if ( _WORKER_CONVERTER is None or _WORKER_OUT_DIR is None + or _WORKER_OUT_DIR_HTML is None or _WORKER_OUT_DIR_PNG is None or _WORKER_OUT_DIR_VIZ is None ): @@ -104,6 +108,9 @@ def _convert_one(input_path_str: str) -> dict[str, Any]: json_path = _WORKER_OUT_DIR / f"{stem}.json" _write_text_atomic(json_path, json.dumps(doc.export_to_dict())) + html_path = _WORKER_OUT_DIR_HTML / f"{stem}.html" + doc.save_as_html(html_path) + page = doc.pages[1] if page.image and page.image.pil_image: page.image.pil_image.save(_WORKER_OUT_DIR_PNG / f"{stem}_page_{1}.png") @@ -114,7 +121,6 @@ def _convert_one(input_path_str: str) -> dict[str, Any]: page_viz = viz_pages2[1] page_viz.save(_WORKER_OUT_DIR_VIZ / f"{stem}_page_{1}_viz_kvp.png") - _write_text_atomic(_done_marker_path(input_path, _WORKER_OUT_DIR), "ok\n") return { "ok": True, "file": input_path.name, @@ -133,12 +139,14 @@ def _convert_one(input_path_str: str) -> dict[str, Any]: def main() -> None: input_html_path = Path("input_dir_to_html/") out_dir = Path("ouput_dir/json") + out_dir_html = Path("ouput_dir/html") out_dir_png = Path("ouput_dir/png") out_dir_viz = Path("ouput_dir/viz") input_paths = sorted([file for file in input_html_path.iterdir() if file.is_file()]) out_dir.mkdir(parents=True, exist_ok=True) + out_dir_html.mkdir(parents=True, exist_ok=True) out_dir_png.mkdir(parents=True, exist_ok=True) out_dir_viz.mkdir(parents=True, exist_ok=True) @@ -165,7 +173,7 @@ def main() -> None: timings: list[float] = [] failed_files: list[Path] = [] max_workers = min( - 4, max(1, int(os.environ.get("DOCLING_HTML_WORKERS", os.cpu_count() or 1))) + 8, max(1, int(os.environ.get("DOCLING_HTML_WORKERS", os.cpu_count() or 1))) ) print(f"Using {max_workers} worker process(es)") @@ -177,6 +185,7 @@ def main() -> None: initargs=( str(pending_input_paths[0]), str(out_dir), + str(out_dir_html), str(out_dir_png), str(out_dir_viz), ), From 0080896f2d83e2beca336620d2b57277b14f532c Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Wed, 4 Mar 2026 08:43:06 +0100 Subject: [PATCH 15/19] Fixing reading order within field_region by including also other elements such as text Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 105 +++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index b76bb08269..f3011b3eb3 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -6,7 +6,7 @@ import warnings from contextlib import contextmanager from copy import deepcopy -from dataclasses import dataclass +from dataclasses import dataclass, field as dataclass_field from io import BytesIO from pathlib import Path from typing import Any, Final, Literal, Optional, Union, cast @@ -168,6 +168,15 @@ class _ExtractedFormMarker: prov: Optional[ProvenanceItem] +@dataclass +class _ExtractedFormText: + tag: Tag + order: int + orig: str + text: str + prov: Optional[ProvenanceItem] + + @dataclass class _ExtractedFormField: key_tag: Tag @@ -177,6 +186,7 @@ class _ExtractedFormField: key_prov: Optional[ProvenanceItem] marker: Optional[_ExtractedFormMarker] values: list[_ExtractedFormValue] + extra_texts: list[_ExtractedFormText] = dataclass_field(default_factory=list) @dataclass @@ -347,6 +357,7 @@ def __init__( self._rendered_page_images: list[Image.Image] = [] self._rendered_page_size: Optional[Size] = None self._suppressed_tag_ids_stack: list[set[str]] = [] + self._suppressed_tag_obj_ids_stack: list[set[int]] = [] self._form_fields_by_key_id_stack: list[dict[str, _ExtractedFormField]] = [] try: @@ -2054,7 +2065,21 @@ def _suppress_tag_ids(self, tag_ids: set[str]): finally: self._suppressed_tag_ids_stack.pop() + @contextmanager + def _suppress_tag_obj_ids(self, tag_obj_ids: set[int]): + if not tag_obj_ids: + yield None + return + self._suppressed_tag_obj_ids_stack.append(tag_obj_ids) + try: + yield None + finally: + self._suppressed_tag_obj_ids_stack.pop() + def _is_suppressed_tag(self, tag: Tag) -> bool: + tag_obj_id = id(tag) + if any(tag_obj_id in obj_ids for obj_ids in self._suppressed_tag_obj_ids_stack): + return True tag_ids = set() if html_id := self._get_html_id(tag): tag_ids.add(html_id) @@ -2155,13 +2180,15 @@ def _add_field_item_from_extracted( ) refs.append(field_item.get_ref()) - parts: list[tuple[int, Literal["key", "marker", "value"], Any]] = [ + parts: list[tuple[int, Literal["key", "marker", "value", "text"], Any]] = [ (field.key_order, "key", field) ] if field.marker is not None: parts.append((field.marker.order, "marker", field.marker)) for value in field.values: parts.append((value.order, "value", value)) + for extra_text in field.extra_texts: + parts.append((extra_text.order, "text", extra_text)) for _, part_type, payload in sorted(parts, key=lambda part: part[0]): if part_type == "key": @@ -2184,6 +2211,17 @@ def _add_field_item_from_extracted( content_layer=self.content_layer, ) refs.append(marker_item.get_ref()) + elif part_type == "text": + extra_text = cast(_ExtractedFormText, payload) + text_item = doc.add_text( + label=DocItemLabel.TEXT, + text=extra_text.text, + orig=extra_text.orig, + prov=extra_text.prov, + parent=field_item, + content_layer=self.content_layer, + ) + refs.append(text_item.get_ref()) else: value = cast(_ExtractedFormValue, payload) field_value = doc_with_fields.add_field_value( @@ -2198,19 +2236,22 @@ def _add_field_item_from_extracted( return refs - def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: + def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: # noqa: C901 key_tags: dict[str, Tag] = {} key_orders: dict[str, int] = {} key_order: list[str] = [] markers_by_key: dict[str, list[tuple[int, Tag]]] = {} values_by_key: dict[str, list[tuple[Optional[int], int, Tag]]] = {} - dom_order = 0 + all_tags = cast(list[Tag], form_tag.find_all(True)) + tag_order_by_obj_id: dict[int, int] = { + id(tag): idx for idx, tag in enumerate(all_tags, start=1) + } - for tag in form_tag.find_all(id=True): + for tag in all_tags: tag_id = tag.get("id") if not isinstance(tag_id, str): continue - dom_order += 1 + dom_order = tag_order_by_obj_id[id(tag)] value_match = _FORM_VALUE_ID_RE.match(tag_id) if value_match: @@ -2330,6 +2371,48 @@ def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: if value_tag_id is not None: consumed_tag_ids.add(value_tag_id) + component_tag_obj_ids: set[int] = { + id(key_tag), + *(id(value.tag) for value in values), + } + if marker is not None: + component_tag_obj_ids.add(id(marker.tag)) + seen_extra_tag_obj_ids: set[int] = set() + extra_texts: list[_ExtractedFormText] = [] + for value in values: + value_parent = value.tag.parent + if not isinstance(value_parent, Tag): + continue + for sibling_tag in value_parent.find_all(recursive=False): + sibling_obj_id = id(sibling_tag) + if sibling_obj_id in component_tag_obj_ids: + continue + if sibling_obj_id in seen_extra_tag_obj_ids: + continue + if self._get_html_id(sibling_tag) is not None: + continue + sibling_text_raw = self.get_text(sibling_tag) + sibling_orig, sibling_text = self._normalize_form_text( + sibling_text_raw + ) + if not sibling_text: + continue + sibling_order = tag_order_by_obj_id.get(sibling_obj_id) + if sibling_order is None: + continue + extra_texts.append( + _ExtractedFormText( + tag=sibling_tag, + order=sibling_order, + orig=sibling_orig, + text=sibling_text, + prov=self._make_text_prov( + text=sibling_text, tag=sibling_tag + ), + ) + ) + seen_extra_tag_obj_ids.add(sibling_obj_id) + fields.append( _ExtractedFormField( key_tag=key_tag, @@ -2339,6 +2422,7 @@ def _extract_form_region(self, form_tag: Tag) -> Optional[_ExtractedFormRegion]: key_prov=self._make_text_prov(text=key_text, tag=key_tag), marker=marker, values=values, + extra_texts=extra_texts, ) ) key_tag_id = self._get_html_id(key_tag) @@ -2406,6 +2490,7 @@ def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem doc_with_fields = cast(Any, doc) form_region = self._extract_form_region(tag) consumed_tag_ids: set[str] = set() + consumed_tag_obj_ids: set[int] = set() fields_by_key_id: dict[str, _ExtractedFormField] = {} if form_region is not None: for field in form_region.fields: @@ -2428,13 +2513,16 @@ def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem if value_tag_id is not None: consumed_tag_ids.add(value_tag_id) fields_by_key_id[value_tag_id] = field + for extra_text in field.extra_texts: + consumed_tag_obj_ids.add(id(extra_text.tag)) if not fields_by_key_id: if tag.name.lower() == "table": added_refs.extend(self._handle_block(tag, doc)) else: with self._suppress_tag_ids(consumed_tag_ids): - added_refs.extend(self._walk(tag, doc)) + with self._suppress_tag_obj_ids(consumed_tag_obj_ids): + added_refs.extend(self._walk(tag, doc)) return added_refs region_prov = self._make_prov(text="", tag=tag) @@ -2452,7 +2540,8 @@ def _handle_form_container(self, tag: Tag, doc: DoclingDocument) -> list[RefItem added_refs.extend(self._handle_block(tag, doc)) else: with self._suppress_tag_ids(consumed_tag_ids): - added_refs.extend(self._walk(tag, doc)) + with self._suppress_tag_obj_ids(consumed_tag_obj_ids): + added_refs.extend(self._walk(tag, doc)) return added_refs form_graph = self._extract_form_graph(tag) From af2a549629b88addbe3ca0d36892ccdbcba3d4e1 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Thu, 5 Mar 2026 13:05:11 +0100 Subject: [PATCH 16/19] Added heuristic to identify checkbox labels Signed-off-by: Maksym Lysak --- docling/backend/html_backend.py | 357 +++++++++++++++++++++++++++++--- 1 file changed, 325 insertions(+), 32 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index f3011b3eb3..c2bd36b61e 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -141,6 +141,14 @@ _FORM_VALUE_ID_RE: Final = re.compile( r"^key(?P[A-Za-z0-9]+)_value(?P[A-Za-z0-9]+)$" ) +_CUSTOM_CHECKBOX_CLASSES: Final = {"checkbox", "checkbox-box", "checkbox-input"} +_CHECKBOX_MARK_TEXTS: Final = {"x", "✓", "✔", "☑"} +_CHECKBOX_CONTAINER_CLASSES: Final = { + "checkbox-container", + "checkbox-item", + "checkbox-option", + "option", +} @dataclass(frozen=True) @@ -157,6 +165,8 @@ class _ExtractedFormValue: text: str prov: Optional[ProvenanceItem] kind: Literal["read_only", "fillable"] = "read_only" + checkbox_label: Optional[DocItemLabel] = None + consumed_label_tag_obj_ids: set[int] = dataclass_field(default_factory=set) @dataclass @@ -175,6 +185,7 @@ class _ExtractedFormText: orig: str text: str prov: Optional[ProvenanceItem] + label: DocItemLabel = DocItemLabel.TEXT @dataclass @@ -1000,6 +1011,9 @@ def _is_rich_table_cell(self, table_cell: Tag) -> bool: children = table_cell.find_all(recursive=True) # all descendants of type Tag has_input = any(child.name == "input" for child in children) + has_custom_checkbox = any( + self._is_custom_checkbox_tag(child) for child in children + ) if not children: content = [ item @@ -1022,6 +1036,7 @@ def _is_rich_table_cell(self, table_cell: Tag) -> bool: or bool(anno.hyperlink) or anno.code or has_input + or has_custom_checkbox ) return is_rich @@ -1226,7 +1241,12 @@ def _flush_buffer() -> None: continue name = node.name.lower() has_block_descendants = bool( - node.find(_BLOCK_TAGS) or node.find("input") + node.find(_BLOCK_TAGS) + or node.find("input") + or node.find( + lambda item: isinstance(item, Tag) + and self._is_custom_checkbox_tag(item) + ) ) is_atomic_node = name in _BLOCK_TAGS or not has_block_descendants if is_atomic_node: @@ -1244,6 +1264,12 @@ def _flush_buffer() -> None: form_refs = self._handle_form_container(node, doc) added_refs.extend(form_refs) continue + if self._is_custom_checkbox_tag(node): + _flush_buffer() + checkbox_ref = self._emit_custom_checkbox(node, doc) + if checkbox_ref is not None: + added_refs.append(checkbox_ref) + continue if name == "img": _flush_buffer() im_ref3 = self._emit_image(node, doc) @@ -1341,8 +1367,11 @@ def _extract_text_and_hyperlink_recursively( return AnnotatedTextList() if isinstance(item, NavigableString): - if isinstance(item.parent, Tag) and self._is_suppressed_tag(item.parent): - return AnnotatedTextList() + if isinstance(item.parent, Tag): + if self._is_suppressed_tag(item.parent): + return AnnotatedTextList() + if self._is_checkbox_label_container(item.parent): + return AnnotatedTextList() text = item.strip() code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET) source_tag_id = ( @@ -1377,6 +1406,10 @@ def _extract_text_and_hyperlink_recursively( tag = cast(Tag, item) if self._is_suppressed_tag(tag): return AnnotatedTextList() + if self._is_checkbox_like_tag(tag): + return AnnotatedTextList() + if self._is_checkbox_label_tag(tag): + return AnnotatedTextList() if not ignore_list or (tag.name not in ["ul", "ol"]): for child in tag: if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP: @@ -1616,7 +1649,7 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: added_ref.append(im_ref) return added_ref - def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: + def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: # noqa: C901 tag_name = tag.name.lower() start: Optional[int] = None name: str = "" @@ -1670,9 +1703,17 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: for input_tag in li.find_all("input") if input_tag.find_parent("li") is li ] + custom_checkboxes_in_li = [ + checkbox_tag + for checkbox_tag in li.find_all( + lambda item: isinstance(item, Tag) + and self._is_custom_checkbox_tag(item) + ) + if checkbox_tag.find_parent("li") is li + ] # 3) add the list item - if li_text or inputs_in_li: + if li_text or inputs_in_li or custom_checkboxes_in_li: if len(min_parts) > 1: li_prov = self._make_text_prov(text=li_text, tag=li) # create an empty list element in order to hook the inline group onto that one @@ -1724,6 +1765,9 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: for input_tag in inputs_in_li: if isinstance(input_tag, Tag): self._emit_input(input_tag, doc) + for checkbox_tag in custom_checkboxes_in_li: + if isinstance(checkbox_tag, Tag): + self._emit_custom_checkbox(checkbox_tag, doc) # 4) recurse into any nested lists, attaching them to this

  • item for sublist in li({"ul", "ol"}, recursive=False): @@ -1754,11 +1798,14 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: prov=prov, ) - if inputs_in_li: + if inputs_in_li or custom_checkboxes_in_li: self.level += 1 for input_tag in inputs_in_li: if isinstance(input_tag, Tag): self._emit_input(input_tag, doc) + for checkbox_tag in custom_checkboxes_in_li: + if isinstance(checkbox_tag, Tag): + self._emit_custom_checkbox(checkbox_tag, doc) self.level -= 1 # 4) recurse into any nested lists, attaching them to this
  • item @@ -1782,6 +1829,9 @@ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem: for input_tag in inputs_in_li: if isinstance(input_tag, Tag): self._emit_input(input_tag, doc) + for checkbox_tag in custom_checkboxes_in_li: + if isinstance(checkbox_tag, Tag): + self._emit_custom_checkbox(checkbox_tag, doc) for sublist in li({"ul", "ol"}, recursive=False): if isinstance(sublist, Tag): self._handle_block(sublist, doc) @@ -1895,6 +1945,14 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: input_ref = self._emit_input(input_tag, doc) if input_ref is not None: added_refs.append(input_ref) + for checkbox_tag in tag.find_all( + lambda item: isinstance(item, Tag) + and self._is_custom_checkbox_tag(item) + ): + if isinstance(checkbox_tag, Tag): + checkbox_ref = self._emit_custom_checkbox(checkbox_tag, doc) + if checkbox_ref is not None: + added_refs.append(checkbox_ref) elif tag_name == "table": num_rows, num_cols = self.get_html_table_row_col(tag) @@ -2038,6 +2096,183 @@ def _infer_form_value_kind(value_tag: Tag) -> Literal["read_only", "fillable"]: return "fillable" return "read_only" + @staticmethod + def _get_tag_classes(tag: Tag) -> set[str]: + classes = tag.get("class") + if not classes: + return set() + if isinstance(classes, str): + return {classes} + return {str(value) for value in classes if isinstance(value, str)} + + @staticmethod + def _is_input_checkbox_or_radio_tag(tag: Tag) -> bool: + if tag.name != "input": + return False + input_type = str(tag.get("type", "")).strip().lower() + return input_type in {"checkbox", "radio"} + + @staticmethod + def _is_custom_checkbox_tag(tag: Tag) -> bool: + return bool( + HTMLDocumentBackend._get_tag_classes(tag) & _CUSTOM_CHECKBOX_CLASSES + ) + + @staticmethod + def _is_checkbox_like_tag(tag: Tag) -> bool: + return HTMLDocumentBackend._is_input_checkbox_or_radio_tag( + tag + ) or HTMLDocumentBackend._is_custom_checkbox_tag(tag) + + @staticmethod + def _extract_text_excluding_tag_obj_ids( + tag: Tag, excluded_obj_ids: set[int] + ) -> str: + def _extract(node: PageElement) -> list[str]: + if isinstance(node, NavigableString): + return [str(node)] + if isinstance(node, Tag): + if id(node) in excluded_obj_ids: + return [] + parts: list[str] = [] + for child in node.contents: + parts.extend(_extract(child)) + if node.name in {"p", "li", "div", "label", "span", "td", "th"}: + parts.append(" ") + return parts + return [] + + return "".join(_extract(tag)) + + @staticmethod + def _has_direct_checkbox_like_child(tag: Tag) -> bool: + for child in tag.find_all(recursive=False): + if isinstance(child, Tag) and HTMLDocumentBackend._is_checkbox_like_tag( + child + ): + return True + return False + + def _is_checkbox_label_container(self, tag: Tag) -> bool: + classes = self._get_tag_classes(tag) + if not (classes & _CHECKBOX_CONTAINER_CLASSES): + return False + return self._has_direct_checkbox_like_child(tag) + + def _is_checkbox_label_tag(self, tag: Tag) -> bool: + if self._is_checkbox_like_tag(tag): + return False + if "checkbox-label" in self._get_tag_classes(tag): + return True + parent = tag.parent + if isinstance(parent, Tag) and self._is_checkbox_label_container(parent): + return True + return False + + @staticmethod + def _normalize_checkbox_text(text: str) -> str: + compact = re.sub(r"\s+", " ", text).strip() + if not compact: + return "" + if compact.lower() in _CHECKBOX_MARK_TEXTS: + return "" + return HTMLDocumentBackend._clean_unicode(compact) + + @staticmethod + def _is_checkbox_checked(tag: Tag) -> bool: + if HTMLDocumentBackend._is_input_checkbox_or_radio_tag(tag): + if tag.has_attr("checked"): + return True + aria_checked = str(tag.get("aria-checked", "")).strip().lower() + return aria_checked in {"true", "1", "yes", "on"} + + classes = HTMLDocumentBackend._get_tag_classes(tag) + if "checked" in classes: + return True + + aria_checked = str(tag.get("aria-checked", "")).strip().lower() + if aria_checked in {"true", "1", "yes", "on"}: + return True + + data_checked = str(tag.get("data-checked", "")).strip().lower() + if data_checked in {"true", "1", "yes", "on"}: + return True + + text = re.sub(r"\s+", "", HTMLDocumentBackend.get_text(tag)) + return text.lower() in _CHECKBOX_MARK_TEXTS + + @staticmethod + def _get_checkbox_label_for_tag(tag: Tag) -> Optional[DocItemLabel]: + if not HTMLDocumentBackend._is_checkbox_like_tag(tag): + return None + return ( + DocItemLabel.CHECKBOX_SELECTED + if HTMLDocumentBackend._is_checkbox_checked(tag) + else DocItemLabel.CHECKBOX_UNSELECTED + ) + + def _extract_checkbox_text_and_consumed_label_obj_ids( + self, checkbox_tag: Tag + ) -> tuple[str, set[int]]: + consumed_tag_obj_ids: set[int] = set() + parent = checkbox_tag.parent if isinstance(checkbox_tag.parent, Tag) else None + + # Native checkbox/radio with explicit