From dfba4525b7cfd7c65349cc6f1111f6d4109d3f0a Mon Sep 17 00:00:00 2001 From: Ultizan Date: Wed, 18 Feb 2026 15:09:13 -0800 Subject: [PATCH] fix: accept relative URIs in PdfHyperlink without validation failure PDF hyperlinks may contain relative paths, internal bookmarks, or fragment-only references that are not valid absolute URLs. The strict AnyUrl validation on PdfHyperlink.uri caused the entire page preprocess stage to fail when such URIs were encountered, resulting in empty documents and lost content. Change uri type to Union[AnyUrl, str] with a field_validator that attempts AnyUrl parsing first (preserving structured metadata like scheme/host/path) and falls back to str for non-absolute URIs. Signed-off-by: Ultizan --- docling_core/types/doc/page.py | 19 ++++++++++- test/test_page.py | 59 +++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/docling_core/types/doc/page.py b/docling_core/types/doc/page.py index c80a3480..e51e3ccb 100644 --- a/docling_core/types/doc/page.py +++ b/docling_core/types/doc/page.py @@ -29,6 +29,7 @@ Field, FieldSerializationInfo, field_serializer, + field_validator, model_validator, ) @@ -361,11 +362,27 @@ class PdfWidget(OrderedElement): class PdfHyperlink(OrderedElement): rect: BoundingRectangle - uri: Optional[AnyUrl] = None + uri: Optional[Union[AnyUrl, str]] = None widget_text: Optional[str] = None widget_description: Optional[str] = None + @field_validator("uri", mode="before") + @classmethod + def parse_uri(cls, v: Any) -> Union[AnyUrl, str, None]: + """Parse URI with AnyUrl for structured metadata, falling back to str. + + PDF hyperlinks may contain relative paths, internal bookmarks, or other + URI forms that are not valid absolute URLs. These should not cause + validation failures during document parsing. + """ + if v is None: + return v + try: + return AnyUrl(v) + except Exception: + return str(v) + class BitmapResource(OrderedElement): """Model representing a bitmap resource with positioning and URI information.""" diff --git a/test/test_page.py b/test/test_page.py index 72e50e00..76ba0df1 100644 --- a/test/test_page.py +++ b/test/test_page.py @@ -2,9 +2,10 @@ import numpy as np import pytest +from pydantic import AnyUrl from docling_core.types.doc import CoordOrigin -from docling_core.types.doc.page import BoundingRectangle +from docling_core.types.doc.page import BoundingRectangle, PdfHyperlink SQRT_2 = math.sqrt(2) @@ -210,3 +211,59 @@ def test_bounding_rectangle_angle(rectangle: BoundingRectangle, expected_angle: float, expected_angle_360: int): assert pytest.approx(rectangle.angle, abs=1e-6) == expected_angle assert pytest.approx(rectangle.angle_360, abs=1e-6) == expected_angle_360 + + +# -- PdfHyperlink URI validation tests -- + +RECT = BoundingRectangle( + r_x0=0, + r_y0=0, + r_x1=1, + r_y1=0, + r_x2=1, + r_y2=1, + r_x3=0, + r_y3=1, + coord_origin=CoordOrigin.TOPLEFT, +) + + +class TestPdfHyperlinkUri: + """PdfHyperlink.uri should accept any URI form found in real PDFs.""" + + def test_absolute_url_parsed_as_anyurl(self): + h = PdfHyperlink(rect=RECT, uri="https://example.com/page") + assert isinstance(h.uri, AnyUrl) + assert h.uri.scheme == "https" + assert h.uri.host == "example.com" + + def test_mailto_parsed_as_anyurl(self): + h = PdfHyperlink(rect=RECT, uri="mailto:user@example.com") + assert isinstance(h.uri, AnyUrl) + assert h.uri.scheme == "mailto" + + def test_relative_path_falls_back_to_str(self): + h = PdfHyperlink( + rect=RECT, + uri="/wiki/pages/internal-document-link", + ) + assert isinstance(h.uri, str) + assert h.uri == "/wiki/pages/internal-document-link" + + def test_fragment_only_falls_back_to_str(self): + h = PdfHyperlink(rect=RECT, uri="#internal-bookmark") + assert isinstance(h.uri, str) + assert h.uri == "#internal-bookmark" + + def test_relative_path_falls_back_to_str_dotdot(self): + h = PdfHyperlink(rect=RECT, uri="../relative/path.html") + assert isinstance(h.uri, str) + assert h.uri == "../relative/path.html" + + def test_none_uri(self): + h = PdfHyperlink(rect=RECT, uri=None) + assert h.uri is None + + def test_omitted_uri(self): + h = PdfHyperlink(rect=RECT) + assert h.uri is None