diff --git a/pypdf/generic/_files.py b/pypdf/generic/_files.py
index b02d1e963..f29fa770f 100644
--- a/pypdf/generic/_files.py
+++ b/pypdf/generic/_files.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import bisect
 from functools import cached_property
 from typing import TYPE_CHECKING, cast
 
@@ -82,17 +83,22 @@ def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> Embe
         from pypdf.generic import create_string_object  # noqa: PLC0415
         filespec = DictionaryObject()
         filespec_reference = writer._add_object(filespec)
+        name_object = cast(TextStringObject, create_string_object(name))
         filespec.update(
             {
                 NameObject(PG.TYPE): NameObject("/Filespec"),
-                NameObject(FileSpecificationDictionaryEntries.F): create_string_object(name),
+                NameObject(FileSpecificationDictionaryEntries.F): name_object,
                 NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
             }
         )
 
-        # Add the name and filespec to the names array
+        # Add the name and filespec to the names array.
+        # We use the inverse order for insertion, as this allows us to re-use the
+        # same index.
         names_array = cls._get_names_array(writer)
-        names_array.extend([create_string_object(name), filespec_reference])
+        insertion_index = cls._get_insertion_index(names_array, name_object)
+        names_array.insert(insertion_index, filespec_reference)
+        names_array.insert(insertion_index, name_object)
 
         # Return an EmbeddedFile instance
         return cls(name=name, pdf_object=filespec, parent=names_array)
@@ -141,6 +147,33 @@ def _get_names_array(cls, writer: PdfWriter) -> ArrayObject:
                 names.append(name)
         return names
 
+    @classmethod
+    def _get_insertion_index(cls, names_array: ArrayObject, name: str) -> int:
+        """
+        Get the index at which a new name has to be inserted into the names array.
+
+        The names array holds alternating name and file specification entries
+        and is expected to be sorted by name. Names are compared by their UTF-8
+        bytes. A new entry is placed behind existing entries with the same name.
+
+        Args:
+            names_array: The flat names array to search.
+            name: The name of the entry to insert.
+
+        Returns:
+            The index in ``names_array`` at which to insert the new name.
+        """
+        # Only every second entry is a name. Names stored as byte strings have
+        # no ``encode`` method and are compared as-is.
+        keys = [
+            key if isinstance(key, bytes) else key.encode("utf-8")
+            for key in names_array[::2]
+        ]
+
+        # ``bisect_right`` handles the empty array, both ends and duplicates
+        # alike. Each key occupies two slots of the array.
+        return bisect.bisect_right(keys, name.encode("utf-8")) * 2
+
     @property
     def alternative_name(self) -> str | None:
         """Retrieve the alternative name (file specification)."""
diff --git a/tests/generic/test_files.py b/tests/generic/test_files.py
index ea8207705..010d09a98 100644
--- a/tests/generic/test_files.py
+++ b/tests/generic/test_files.py
@@ -10,6 +10,7 @@
 from pypdf import PdfReader, PdfWriter
 from pypdf.errors import PdfReadError, PyPdfError
 from pypdf.generic import (
+    ArrayObject,
     ByteStringObject,
     DictionaryObject,
     EmbeddedFile,
@@ -489,3 +490,93 @@ def test_embedded_file__create__neither_kids_nor_names():
 
     with pytest.raises(expected_exception=PdfReadError, match=r"^Got neither Names nor Kids in embedded files tree\.$"):
         writer.add_attachment("test2.txt", b"content2")
+
+
+def test_embedded_file__get_insertion_index():
+    # Empty list.
+    assert EmbeddedFile._get_insertion_index(ArrayObject(), "test.txt") == 0
+
+    # One mismatching entry.
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("dummy.txt"), NullObject()]),
+        "test.txt"
+    ) == 2
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("xxx.txt"), NullObject()]),
+        "test.txt"
+    ) == 0
+
+    # Multiple entries.
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("dummy.txt"), NullObject(), TextStringObject("xxx.txt"), NullObject()]),
+        "test.txt"
+    ) == 2
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("xxx.txt"), NullObject(), TextStringObject("yyy.txt"), NullObject()]),
+        "test.txt"
+    ) == 0
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("aaa.txt"), NullObject(), TextStringObject("bbb.txt"), NullObject()]),
+        "test.txt"
+    ) == 4
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([
+            TextStringObject("aaa.txt"), NullObject(),
+            TextStringObject("test.txt"), NullObject(),
+            TextStringObject("zzz.txt"), NullObject()
+        ]),
+        "test.txt"
+    ) == 4
+
+    # Length.
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("a"), NullObject()]),
+        "aa"
+    ) == 2
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("a"), NullObject()]),
+        "a"
+    ) == 2
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("aaa"), NullObject()]),
+        "aa"
+    ) == 0
+
+    # Special characters.
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("café"), NullObject()]),
+        "cafe"
+    ) == 0
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([TextStringObject("Tun"), NullObject()]),
+        "Tür"
+    ) == 2
+
+    # Existing name stored as a byte string.
+    assert EmbeddedFile._get_insertion_index(
+        ArrayObject([ByteStringObject(b"dummy.txt"), NullObject()]),
+        "test.txt"
+    ) == 2
+
+
+def test_embedded_file__order():
+    writer = PdfWriter()
+    writer.add_blank_page(100, 100)
+
+    attachment1 = writer.add_attachment("test.txt", "content")
+    attachment2 = writer.add_attachment("abc.txt", "content")
+    attachment3 = writer.add_attachment("xyz.txt", "content")
+    attachment4 = writer.add_attachment("test.txt", "content2")
+
+    assert dict(writer.attachments) == {
+        "abc.txt": [b"content"],
+        "test.txt": [b"content", b"content2"],
+        "xyz.txt": [b"content"]
+    }
+
+    assert writer.root_object["/Names"]["/EmbeddedFiles"]["/Names"] == [
+        "abc.txt", attachment2.pdf_object.indirect_reference,
+        "test.txt", attachment1.pdf_object.indirect_reference,
+        "test.txt", attachment4.pdf_object.indirect_reference,
+        "xyz.txt", attachment3.pdf_object.indirect_reference,
+    ]