Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 24 additions & 21 deletions pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,36 +660,39 @@ class TextStringObject(str, PdfObject): # noqa: SLOT000
_original_bytes: Optional[bytes] = None

def __new__(cls, value: Any) -> "TextStringObject":
    """
    Create the text string and record how it was (probably) encoded.

    Args:
        value: The text to wrap. ``bytes`` input is kept verbatim in
            ``_original_bytes`` and decoded with the ``charmap`` codec
            to obtain the initial string value; any other input is
            passed to ``str.__new__`` unchanged.

    Returns:
        The new instance with ``_original_bytes``, ``autodetect_utf16``,
        ``autodetect_pdfdocencoding`` and ``utf16_bom`` populated.

    """
    original_bytes = None
    if isinstance(value, bytes):
        original_bytes = value
        value = value.decode("charmap")

    text_string_object = str.__new__(cls, value)
    text_string_object._original_bytes = original_bytes
    text_string_object.autodetect_utf16 = False
    text_string_object.autodetect_pdfdocencoding = False
    text_string_object.utf16_bom = b""

    if original_bytes is not None and original_bytes[:2] in {codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE}:
        # The value of `original_bytes` is only set for inputs being `bytes`.
        # If this is UTF-16 data according to the BOM (first two characters),
        # perform special handling. All other cases should not need any special conversion
        # due to already being a string.
        try:
            text_string_object = str.__new__(cls, original_bytes.decode("utf-16"))
        except UnicodeDecodeError as exception:
            logger_warning(
                f"{exception!s}\ninitial string:{exception.object!r}",
                __name__,
            )
            # Keep only the part in front of the first undecodable position.
            text_string_object = str.__new__(cls, exception.object[: exception.start].decode("utf-16"))
        # The instance has been re-created above, so the attributes have to be set again.
        text_string_object._original_bytes = original_bytes
        text_string_object.autodetect_utf16 = True
        text_string_object.utf16_bom = original_bytes[:2]
    else:
        try:
            encode_pdfdocencoding(text_string_object)
            text_string_object.autodetect_pdfdocencoding = True
        except UnicodeEncodeError:
            # Not representable in PDFDocEncoding: mark as UTF-16 with a big-endian BOM.
            text_string_object.autodetect_utf16 = True
            text_string_object.utf16_bom = codecs.BOM_UTF16_BE
    return text_string_object

def clone(
self,
Expand Down
45 changes: 45 additions & 0 deletions tests/generic/test_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Test the pypdf.generic._base module."""
from io import BytesIO

import pytest

from pypdf import PdfReader, PdfWriter
from pypdf.generic import read_hex_string_from_stream
from tests import get_data_from_url


@pytest.mark.parametrize(
    ("source", "expected"),
    [
        (b"<00FE00FF>", "\xfe\xff"),
        (b"<00FE00FF00D6>", "\xfe\xff\xd6"),
    ],
)
def test_text_string_object__looks_like_bom(source: bytes, expected: str) -> None:
    """
    Hex strings whose text starts with U+00FE U+00FF are read as that text.

    The leading characters only look like a byte order mark; the expected
    values require them to be preserved in the result.
    """
    stream = BytesIO(source)
    result = read_hex_string_from_stream(stream)
    assert result == expected


@pytest.mark.enable_socket
def test_text_string_object__wrongly_detected_bom() -> None:
    """
    Merge the pages of the sample PDF onto blank pages and check the text.

    The sample file is downloaded (cached as ``issue3587.pdf``); every page
    is merged onto a blank page sized like the first page of the source,
    and the text extracted from the last merged page is compared against
    the expected content.
    """
    url = "https://github.com/user-attachments/files/24401507/minimal.pdf"
    name = "issue3587.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    reader_page = reader.pages[0]

    writer = PdfWriter()
    for page in reader.pages:
        writer_page = writer.add_blank_page(reader_page.mediabox.width, reader_page.mediabox.height)
        writer_page.merge_page(page)

    assert writer_page.extract_text() == (
        "无译形带 r的参 z慧队手行 c要枪互工先调 uC一在你 k该方导最 xT况 M味政没出 v大同团\n"
        "想急压游这体构主 m基重张预另做内已织程术并 U种规被中应 s过小立就公测和 F更为 BS\n"
        "把强型 w利 qfJ现能您关文)己个言 VW是 Z亲社 y。说准密令 K络通自力 i诸旦明量放及 I\n"
        "成战康养 d都蜂多开 pE次提朋动比台有培愿 A确 l充计标去人如么 b灵 N它 g弃语看 X;j\n"
        "轮 HG采共由地友入(器 Y果感建切理情从集德翻 a单第识任 Q模 eh目经相哪受起时着 DR\n"
        "用好 o备划付信、度解效作协读 O讨高具击始者意群治扩到 P才兰网认 t马倒来本整 L们 n\n"
        "系可论,步各之但\n"
        "12"
    )