pypdf/generic/_base.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -660,36 +660,39 @@ class TextStringObject(str, PdfObject):  # noqa: SLOT000
  
        _original_bytes: Optional[bytes] = None

        def __new__(cls, value: Any) -> "TextStringObject":
            """
            Create the text string and record how it should be encoded again.

            Args:
                value: The content of the string. ``bytes`` input is kept
                    verbatim in ``_original_bytes`` and decoded for the
                    string value; any other input is passed to ``str`` as is.

            Returns:
                The new instance with ``_original_bytes``,
                ``autodetect_utf16``, ``autodetect_pdfdocencoding`` and
                ``utf16_bom`` populated.
            """
            original_bytes = None
            if isinstance(value, bytes):
                original_bytes = value
                # "charmap" maps every byte to exactly one character, so this
                # provisional decoding cannot fail.
                value = value.decode("charmap")

            text_string_object = str.__new__(cls, value)
            text_string_object._original_bytes = original_bytes
            text_string_object.autodetect_utf16 = False
            text_string_object.autodetect_pdfdocencoding = False
            text_string_object.utf16_bom = b""

            if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
                # The value of `original_bytes` is only set for inputs being `bytes`.
                # If this is UTF-16 data according to the BOM (first two characters),
                # perform special handling. All other cases should not need any special conversion
                # due to already being a string.
                # Checking the raw bytes instead of the decoded string avoids treating
                # the characters U+00FE U+00FF of an already decoded string as a BOM.
                try:
                    text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
                except UnicodeDecodeError as exception:
                    logger_warning(
                        f"{exception!s}\ninitial string:{exception.object!r}",
                        __name__,
                    )
                    # Keep the part in front of the first undecodable position.
                    text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
                # A new instance has been created above, thus set the attributes again.
                text_string_object._original_bytes = original_bytes
                text_string_object.autodetect_utf16 = True
                text_string_object.utf16_bom = original_bytes[:2]
            else:
                try:
                    # Only the success or failure of the encoding is relevant here.
                    encode_pdfdocencoding(text_string_object)
                    text_string_object.autodetect_pdfdocencoding = True
                except UnicodeEncodeError:
                    text_string_object.autodetect_utf16 = True
                    text_string_object.utf16_bom = codecs.BOM_UTF16_BE
            return text_string_object

        def clone(

            self,

tests/generic/test_base.py

-Original file line number
+Diff line change
@@ -0,0 +1,45 @@
+    """Test the pypdf.generic._base module."""
+    from io import BytesIO
+    import pytest
+    from pypdf import PdfReader, PdfWriter
+    from pypdf.generic import read_hex_string_from_stream
+    from tests import get_data_from_url
+    @pytest.mark.parametrize(
+        ("source", "expected"),
+        [
+            (b"<00FE00FF>", "\xfe\xff"),
+            (b"<00FE00FF00D6>", "\xfe\xff\xd6"),
+        ]
+    )
+    def test_text_string_object__looks_like_bom(source: bytes, expected: str) -> None:
+        stream = BytesIO(source)
+        result = read_hex_string_from_stream(stream)
+        assert result == expected
+    @pytest.mark.enable_socket
+    def test_text_string_object__wrongly_detected_bom():
+        url = "https://github.com/user-attachments/files/24401507/minimal.pdf"
+        name = "issue3587.pdf"
+        reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+        reader_page = reader.pages[0]
+        writer = PdfWriter()
+        for page in reader.pages:
+            writer_page = writer.add_blank_page(reader_page.mediabox.width, reader_page.mediabox.height)
+            writer_page.merge_page(page)
+            assert writer_page.extract_text() == (
+                "无译形带 r的参 z慧队手行 c要枪互工先调 uC一在你 k该方导最 xT况 M味政没出 v大同团\n"
+                "想急压游这体构主 m基重张预另做内已织程术并 U种规被中应 s过小立就公测和 F更为 BS\n"
+                "把强型 w利 qfJ现能您关文）己个言 VW是 Z亲社 y。说准密令 K络通自力 i诸旦明量放及 I\n"
+                "成战康养 d都蜂多开 pE次提朋动比台有培愿 A确 l充计标去人如么 b灵 N它 g弃语看 X；j\n"
+                "轮 HG采共由地友入（器 Y果感建切理情从集德翻 a单第识任 Q模 eh目经相哪受起时着 DR\n"
+                "用好 o备划付信、度解效作协读 O讨高具击始者意群治扩到 P才兰网认 t马倒来本整 L们 n\n"
+                "系可论，步各之但\n"
+                "12"
+            )

BUG: Do not consider multi-byte BOM-like sequences as BOMs #3589

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

stefan6419846 merged 3 commits into py-pdf:main from stefan6419846:issue3587

Jan 7, 2026

+69 −21

-Original file line number
+Diff line change
@@ -0,0 +1,45 @@
+    """Test the pypdf.generic._base module."""
+    from io import BytesIO
+    import pytest
+    from pypdf import PdfReader, PdfWriter
+    from pypdf.generic import read_hex_string_from_stream
+    from tests import get_data_from_url
+    @pytest.mark.parametrize(
+        ("source", "expected"),
+        [
+            (b"<00FE00FF>", "\xfe\xff"),
+            (b"<00FE00FF00D6>", "\xfe\xff\xd6"),
+        ]
+    )
+    def test_text_string_object__looks_like_bom(source: bytes, expected: str) -> None:
+        stream = BytesIO(source)
+        result = read_hex_string_from_stream(stream)
+        assert result == expected
+    @pytest.mark.enable_socket
+    def test_text_string_object__wrongly_detected_bom():
+        url = "https://github.com/user-attachments/files/24401507/minimal.pdf"
+        name = "issue3587.pdf"
+        reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+        reader_page = reader.pages[0]
+        writer = PdfWriter()
+        for page in reader.pages:
+            writer_page = writer.add_blank_page(reader_page.mediabox.width, reader_page.mediabox.height)
+            writer_page.merge_page(page)
+            assert writer_page.extract_text() == (
+                "无译形带 r的参 z慧队手行 c要枪互工先调 uC一在你 k该方导最 xT况 M味政没出 v大同团\n"
+                "想急压游这体构主 m基重张预另做内已织程术并 U种规被中应 s过小立就公测和 F更为 BS\n"
+                "把强型 w利 qfJ现能您关文）己个言 VW是 Z亲社 y。说准密令 K络通自力 i诸旦明量放及 I\n"
+                "成战康养 d都蜂多开 pE次提朋动比台有培愿 A确 l充计标去人如么 b灵 N它 g弃语看 X；j\n"
+                "轮 HG采共由地友入（器 Y果感建切理情从集德翻 a单第识任 Q模 eh目经相哪受起时着 DR\n"
+                "用好 o备划付信、度解效作协读 O讨高具击始者意群治扩到 P才兰网认 t马倒来本整 L们 n\n"
+                "系可论，步各之但\n"
+                "12"
+            )

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

BUG: Do not consider multi-byte BOM-like sequences as BOMs #3589

Diff view

Diff view

There are no files selected for viewing

BUG: Do not consider multi-byte BOM-like sequences as BOMs #3589

BUG: Do not consider multi-byte BOM-like sequences as BOMs #3589

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing