diff --git a/docling_core/types/doc/page.py b/docling_core/types/doc/page.py index c80a3480..e51e3ccb 100644 --- a/docling_core/types/doc/page.py +++ b/docling_core/types/doc/page.py @@ -29,6 +29,7 @@ Field, FieldSerializationInfo, field_serializer, + field_validator, model_validator, ) @@ -361,11 +362,27 @@ class PdfWidget(OrderedElement): class PdfHyperlink(OrderedElement): rect: BoundingRectangle - uri: Optional[AnyUrl] = None + uri: Optional[Union[AnyUrl, str]] = None widget_text: Optional[str] = None widget_description: Optional[str] = None + @field_validator("uri", mode="before") + @classmethod + def parse_uri(cls, v: Any) -> Union[AnyUrl, str, None]: + """Parse URI with AnyUrl for structured metadata, falling back to str. + + PDF hyperlinks may contain relative paths, internal bookmarks, or other + URI forms that are not valid absolute URLs. These should not cause + validation failures during document parsing. + """ + if v is None: + return v + try: + return AnyUrl(v) + except Exception: + return str(v) + class BitmapResource(OrderedElement): """Model representing a bitmap resource with positioning and URI information.""" diff --git a/test/test_page.py b/test/test_page.py index 72e50e00..76ba0df1 100644 --- a/test/test_page.py +++ b/test/test_page.py @@ -2,9 +2,10 @@ import numpy as np import pytest +from pydantic import AnyUrl from docling_core.types.doc import CoordOrigin -from docling_core.types.doc.page import BoundingRectangle +from docling_core.types.doc.page import BoundingRectangle, PdfHyperlink SQRT_2 = math.sqrt(2) @@ -210,3 +211,59 @@ def test_bounding_rectangle_angle(rectangle: BoundingRectangle, expected_angle: float, expected_angle_360: int): assert pytest.approx(rectangle.angle, abs=1e-6) == expected_angle assert pytest.approx(rectangle.angle_360, abs=1e-6) == expected_angle_360 + + +# -- PdfHyperlink URI validation tests -- + +RECT = BoundingRectangle( + r_x0=0, + r_y0=0, + r_x1=1, + r_y1=0, + r_x2=1, + r_y2=1, + r_x3=0, + r_y3=1, + coord_origin=CoordOrigin.TOPLEFT, +) + + +class TestPdfHyperlinkUri: + """PdfHyperlink.uri should accept any URI form found in real PDFs.""" + + def test_absolute_url_parsed_as_anyurl(self): + h = PdfHyperlink(rect=RECT, uri="https://example.com/page") + assert isinstance(h.uri, AnyUrl) + assert h.uri.scheme == "https" + assert h.uri.host == "example.com" + + def test_mailto_parsed_as_anyurl(self): + h = PdfHyperlink(rect=RECT, uri="mailto:user@example.com") + assert isinstance(h.uri, AnyUrl) + assert h.uri.scheme == "mailto" + + def test_relative_path_falls_back_to_str(self): + h = PdfHyperlink( + rect=RECT, + uri="/wiki/pages/internal-document-link", + ) + assert isinstance(h.uri, str) + assert h.uri == "/wiki/pages/internal-document-link" + + def test_fragment_only_falls_back_to_str(self): + h = PdfHyperlink(rect=RECT, uri="#internal-bookmark") + assert isinstance(h.uri, str) + assert h.uri == "#internal-bookmark" + + def test_relative_path_falls_back_to_str_dotdot(self): + h = PdfHyperlink(rect=RECT, uri="../relative/path.html") + assert isinstance(h.uri, str) + assert h.uri == "../relative/path.html" + + def test_none_uri(self): + h = PdfHyperlink(rect=RECT, uri=None) + assert h.uri is None + + def test_omitted_uri(self): + h = PdfHyperlink(rect=RECT) + assert h.uri is None