Skip to content

Commit 028d7cc

Browse files
rupertj and k00ni authored
When assembling the text array for an object, skip Forms that don't contain any text, instead of all Forms. (#789)
* When assembling the text array for an object, skip Forms that don't contain any text, instead of all Forms.
* Apply suggestion from @k00ni to remove unnecessary parentheses.

  Co-authored-by: Konrad Abicht <hi@inspirito.de>
* Add test coverage for change to how Form objects that contain text are handled.

---------

Co-authored-by: Konrad Abicht <hi@inspirito.de>
1 parent 61c9bca commit 028d7cc

2 files changed

Lines changed: 40 additions & 11 deletions

File tree

src/Smalot/PdfParser/PDFObject.php

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -788,16 +788,26 @@ public function getTextArray(?Page $page = null): array
788788
break;
789789
}
790790

791-
// If the PDFObject is an Image or a Form, do nothing as
792-
// neither of these XObject types are text.
793-
if ($xobject instanceof Image || $xobject instanceof Form) {
791+
// If the PDFObject is an Image, do nothing as images
792+
// aren't text.
793+
if ($xobject instanceof Image) {
794794
break;
795795
}
796796

797797
// Check this is not a circular reference.
798-
if (!\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
799-
$text[] = $xobject->getText($page);
798+
if (\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
799+
break;
800+
}
801+
802+
$objectText = $xobject->getText($page);
803+
804+
// If the PDFObject is a Form and doesn't have any text,
805+
// skip it.
806+
if ($xobject instanceof Form && $objectText === ' ') {
807+
break;
800808
}
809+
810+
$text[] = $objectText;
801811
break;
802812

803813
// Marked content point with (DP) & without (MP) property list

tests/PHPUnit/Unit/PDFObjectTest.php

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
namespace PHPUnitTests\Unit;
66

77
use PHPUnitTests\TestCase;
8+
use Smalot\PdfParser\Config;
89
use Smalot\PdfParser\Document;
910
use Smalot\PdfParser\Element;
1011
use Smalot\PdfParser\Element\ElementArray;
@@ -33,8 +34,10 @@ public function testTextArrayObjects(): void
3334
$document = new Document();
3435
$document->init();
3536

37+
$config = new Config();
3638
$image = new Image($document);
37-
$form = new Form($document);
39+
$formNoText = new Form($document);
40+
$formWithText = new Form($document, null, 'BT /F1 12 Tf 10 10 Td (Form text) Tj ET', $config);
3841
$xObject = new PDFObject($document);
3942

4043
$header1 = new Header([
@@ -50,30 +53,46 @@ public function testTextArrayObjects(): void
5053
$header2 = new Header([
5154
'Resources' => new Header([
5255
'XObject' => new Header([
53-
'Fr0' => $form,
56+
'Fr0' => $formNoText,
5457
])
5558
]),
5659
'Contents' => new ElementArray([new Element('/Fr0 Do', $document)], $document),
5760
]);
5861
$page2 = new Page($document, $header2);
5962

6063
$header3 = new Header([
64+
'Resources' => new Header([
65+
'XObject' => new Header([
66+
'Fr0' => $formWithText,
67+
])
68+
]),
69+
'Contents' => new ElementArray([new Element('/Fr0 Do', $document)], $document),
70+
]);
71+
$page3 = new Page($document, $header3);
72+
73+
$header4 = new Header([
6174
'Resources' => new Header([
6275
'XObject' => new Header([
6376
'Ps0' => $xObject,
6477
])
6578
]),
6679
'Contents' => new ElementArray([new Element('/Ps0 Do', $document)], $document),
6780
]);
68-
$page3 = new Page($document, $header3);
81+
$page4 = new Page($document, $header4);
6982

7083
// Page 1 contains an image, which should not appear in the text array.
7184
self::assertSame([], $page1->getTextArray());
7285

73-
// Page 2 contains a form, which should not appear in the text array.
86+
// Page 2 contains a form that contains no text, which should not appear
87+
// in the text array.
7488
self::assertSame([], $page2->getTextArray());
7589

76-
// Page 3 contains a non-image object, which should appear in the text array.
77-
self::assertSame([' '], $page3->getTextArray());
90+
// Page 3 contains a form that contains text, which should appear in the
91+
// text array.
92+
self::assertSame(['Form text '], $page3->getTextArray());
93+
94+
// Page 4 contains a non-image object, which should appear in the text
95+
// array.
96+
self::assertSame([' '], $page4->getTextArray());
7897
}
7998
}

0 commit comments

Comments
 (0)