From 2946d9626d92314b246b9d06919aa60adc88b82a Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 5 Sep 2022 22:07:32 +0200 Subject: [PATCH 1/2] ROB : fix image extraction Fix reading of some inline images when other operations are inserted between EI and Q. The end of an inline image is now detected as [whitespace]EI[whitespace] (4 characters should be sufficient). --- PyPDF2/generic/_data_structures.py | 8 ++++---- tests/test_workflows.py | 13 +++++-------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/PyPDF2/generic/_data_structures.py b/PyPDF2/generic/_data_structures.py index 283b33b225..f6630066c6 100644 --- a/PyPDF2/generic/_data_structures.py +++ b/PyPDF2/generic/_data_structures.py @@ -759,17 +759,17 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: tok = stream.read(1) # Check for End Image tok2 = stream.read(1) - if tok2 == b"I": - # Data can contain EI, so check for the Q operator. + if tok2 == b"I" and buf[loc - 1 : loc] in WHITESPACES: + # Data can contain [\s]EI, so check for the separator \s; 4 chars suffisent Q operator not required. tok3 = stream.read(1) info = tok + tok2 - # We need to find whitespace between EI and Q. + # We need to find at least one whitespace after. 
has_q_whitespace = False while tok3 in WHITESPACES: has_q_whitespace = True info += tok3 tok3 = stream.read(1) - if tok3 == b"Q" and has_q_whitespace: + if has_q_whitespace: stream.seek(-1, 1) break else: diff --git a/tests/test_workflows.py b/tests/test_workflows.py index cc194f435b..200e030c71 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -425,7 +425,7 @@ def test_get_metadata(url, name): "https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf", "tika-938702.pdf", False, - (PdfReadError, "Unexpected end of stream"), + None, # iss #1090 is now fixed ), ( "https://corpora.tika.apache.org/base/docs/govdocs1/942/942358.pdf", @@ -512,19 +512,16 @@ def test_extract_text(url, name, strict, exception): ), ( "https://corpora.tika.apache.org/base/docs/govdocs1/957/957304.pdf", - "tika-938702.pdf", + "tika-957304.pdf", ), ], ) def test_compress_raised(url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) - # TODO: which page exactly? - # TODO: Is it reasonable to have an exception here? 
- with pytest.raises(PdfReadError) as exc: - for page in reader.pages: - page.compress_content_streams() - assert exc.value.args[0] == "Unexpected end of stream" + # no more error since iss #1090 fix + for page in reader.pages: + page.compress_content_streams() @pytest.mark.parametrize( From 38c4e68350d46e0b8024517934cfcce9c8c9174a Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 5 Sep 2022 22:19:06 +0200 Subject: [PATCH 2/2] flake8 --- tests/test_workflows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 200e030c71..57cab70180 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -17,7 +17,7 @@ from PyPDF2.constants import ImageAttributes as IA from PyPDF2.constants import PageAttributes as PG from PyPDF2.constants import Ressources as RES -from PyPDF2.errors import PdfReadError, PdfReadWarning +from PyPDF2.errors import PdfReadWarning from PyPDF2.filters import _xobj_to_image from . import get_pdf_from_url, normalize_warnings