From 7e26f6d6ba83b2fe0090a32a87e3a28bf61d379c Mon Sep 17 00:00:00 2001 From: stefan6419846 <96178532+stefan6419846@users.noreply.github.com> Date: Thu, 11 Sep 2025 16:36:07 +0200 Subject: [PATCH 1/2] ENH: Allow deleting embedded files --- pypdf/generic/_files.py | 21 +++++++++++++++++---- tests/generic/test_files.py | 31 ++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/pypdf/generic/_files.py b/pypdf/generic/_files.py index ecdb61351..4ac6fdcfb 100644 --- a/pypdf/generic/_files.py +++ b/pypdf/generic/_files.py @@ -7,7 +7,7 @@ from pypdf.constants import CatalogAttributes as CA from pypdf.constants import FileSpecificationDictionaryEntries from pypdf.constants import PageAttributes as PG -from pypdf.errors import PdfReadError +from pypdf.errors import PdfReadError, PyPdfError from pypdf.generic import ( ArrayObject, ByteStringObject, @@ -36,14 +36,16 @@ class EmbeddedFile: Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification. """ - def __init__(self, name: str, pdf_object: DictionaryObject) -> None: + def __init__(self, name: str, pdf_object: DictionaryObject, parent: ArrayObject | None = None) -> None: """ Args: name: The (primary) name as provided in the name tree. pdf_object: The corresponding PDF object to allow retrieving further data. + parent: The parent list. """ self._name = name self.pdf_object = pdf_object + self._parent = parent @property def name(self) -> str: @@ -105,7 +107,7 @@ def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> Embe names_array.extend([create_string_object(name), filespec]) # Return an EmbeddedFile instance - return cls(name=name, pdf_object=filespec) + return cls(name=name, pdf_object=filespec, parent=names_array) @property def alternative_name(self) -> str | None: @@ -276,6 +278,17 @@ def checksum(self, value: ByteStringObject | None) -> None: else: params[NameObject("/CheckSum")] = value + def delete(self) -> None: + """Delete the file from the document.""" + if not self._parent: + raise PyPdfError("Parent required to delete file from document.") + if self.pdf_object not in self._parent: + raise PyPdfError("File not found in parent object.") + index = self._parent.index(self.pdf_object) + self._parent.pop(index) # Reference. + self._parent.pop(index - 1) # Name. + self.pdf_object = DictionaryObject() # Invalidate. + def __repr__(self) -> str: return f"<{self.__class__.__name__} name={self.name!r}>" @@ -296,7 +309,7 @@ def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]: # Skip plain strings and retrieve them as `direct_name` by index. file_dictionary = name.get_object() direct_name = names[i - 1].get_object() - yield EmbeddedFile(name=direct_name, pdf_object=file_dictionary) + yield EmbeddedFile(name=direct_name, pdf_object=file_dictionary, parent=names) @classmethod def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]: diff --git a/tests/generic/test_files.py b/tests/generic/test_files.py index 8f104bc56..c05307c01 100644 --- a/tests/generic/test_files.py +++ b/tests/generic/test_files.py @@ -8,7 +8,7 @@ import pytest from pypdf import PdfReader, PdfWriter -from pypdf.errors import PdfReadError +from pypdf.errors import PdfReadError, PyPdfError from pypdf.generic import ( ByteStringObject, DictionaryObject, @@ -394,3 +394,32 @@ def test_embedded_file_null_object_handling(): assert embedded_file.subtype is None assert embedded_file.size is None assert embedded_file.checksum is None + + +def test_embedded_file__delete_without_parent(): + attachment = EmbeddedFile(name="test.txt", pdf_object=DictionaryObject()) + with pytest.raises(PyPdfError, match=r"^Parent required to delete file from document\.$"): + attachment.delete() + + +def test_embedded_file__delete_known(): + writer = PdfWriter() + writer.add_blank_page(100, 100) + writer.add_attachment("test.txt", b"content") + writer.add_attachment("test2.txt", b"content2") + + attachments = list(writer.attachment_list) + assert len(attachments) == 2 + attachment = attachments[0] + assert attachment.name == "test.txt" + attachment.delete() + with pytest.raises(PdfReadError, match=r"^/EF entry not found: {}$"): + _ = attachment.content + + attachments = list(writer.attachment_list) + assert len(attachments) == 1 + assert attachments[0].name == "test2.txt" + + # Delete second time. + with pytest.raises(PyPdfError, match=r"^File not found in parent object\.$"): + attachment.delete() From d857f84ea118c6a48acf26739a7b526872deb4b1 Mon Sep 17 00:00:00 2001 From: stefan6419846 <96178532+stefan6419846@users.noreply.github.com> Date: Thu, 11 Sep 2025 16:38:44 +0200 Subject: [PATCH 2/2] add docs and simplify test --- docs/user/handle-attachments.md | 13 +++++++++++++ tests/generic/test_files.py | 4 +--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/docs/user/handle-attachments.md b/docs/user/handle-attachments.md index 6b4aec63e..fd2186a5e 100644 --- a/docs/user/handle-attachments.md +++ b/docs/user/handle-attachments.md @@ -71,3 +71,16 @@ embedded_file.write("output.pdf") The same functionality is available if you iterate over the attachments of a writer using `writer.attachment_list`. + +## Delete Attachments + +To delete an existing attachment, use the following code: + +```python +from pypdf import PdfWriter + +writer = PdfWriter(clone_from="example.pdf") +attachment = writer.add_attachment(filename="test.txt", data=b"Hello World!") +attachment.delete() +assert list(writer.attachment_list) == [] +``` diff --git a/tests/generic/test_files.py b/tests/generic/test_files.py index c05307c01..92001e2da 100644 --- a/tests/generic/test_files.py +++ b/tests/generic/test_files.py @@ -405,13 +405,11 @@ def test_embedded_file__delete_without_parent(): def test_embedded_file__delete_known(): writer = PdfWriter() writer.add_blank_page(100, 100) - writer.add_attachment("test.txt", b"content") + attachment = writer.add_attachment("test.txt", b"content") writer.add_attachment("test2.txt", b"content2") attachments = list(writer.attachment_list) assert len(attachments) == 2 - attachment = attachments[0] - assert attachment.name == "test.txt" attachment.delete() with pytest.raises(PdfReadError, match=r"^/EF entry not found: {}$"): _ = attachment.content