Skip to content

Commit 4551cd0

Browse files
likemusick00nij0k3r
authored
Fix encoding for encoding dictionary without Type item. (#500)
* Font::decodeContent() fixes to support Encoding dictionaries without Type header. * Pass $unicode by reference in Font::decodeContentByEncoding(). * Add encoding initialization. * Font's initialized encoding in private property. * Add "ext-iconv" to "require" in composer.json * Delete unnecessary unicode string test for #95 test. * Fix misprint. * Add comments + small refactoring. * Run dev-tools/vendor/bin/php-cs-fixer fix Fixed all files in 0.018 seconds, 12.000 MB memory used * Fixes by @k00ni suggestions. * Add test (with pdf file) for issue in PR #500. * Add pdf-file for test. * Add comments to Font class methods. * Apply suggestions from code review Co-authored-by: Konrad Abicht <hi@inspirito.de> * Delete throws in phpDoc for test. * Fixes according to @k00ni suggestions. - Add return types to tests methods. - Fix todos in phpDocs. * Apply suggestions from code review Co-authored-by: Konrad Abicht <hi@inspirito.de> * Update test file (which opens without error in Adobe Acrobat Reader). * CS-fixer fix. * Avoid throwing error when encoding isn't found (previous behavior) Co-authored-by: Konrad Abicht <hi@inspirito.de> Co-authored-by: Jeremy Benoist <jeremy.benoist@gmail.com>
1 parent 43ca68f commit 4551cd0

6 files changed

Lines changed: 241 additions & 76 deletions

File tree

composer.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
"require": {
1818
"php": ">=7.1",
1919
"symfony/polyfill-mbstring": "^1.18",
20-
"ext-zlib": "*"
20+
"ext-zlib": "*",
21+
"ext-iconv": "*"
2122
},
2223
"autoload": {
2324
"psr-0": {

samples/bugs/PullRequest500.pdf

8.06 KB
Binary file not shown.

src/Smalot/PdfParser/Font.php

Lines changed: 197 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,19 @@ class Font extends PDFObject
5757
*/
5858
private static $uchrCache = [];
5959

60+
/**
61+
* In some PDF-files encoding could be referenced by object id but object itself does not contain
62+
* `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
63+
* \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
64+
*
65+
* Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
66+
*
67+
* @var Encoding
68+
*
69+
* @see https://github.com/smalot/pdfparser/pull/500
70+
*/
71+
private $initializedEncodingByPdfObject;
72+
6073
public function init()
6174
{
6275
// Load translate table.
@@ -408,91 +421,210 @@ public function decodeText(array $commands): string
408421
}
409422

410423
/**
424+
* Decode given $text to "utf-8" encoded string.
425+
*
411426
* @param bool $unicode This parameter is deprecated and might be removed in a future release
412427
*/
413428
public function decodeContent(string $text, ?bool &$unicode = null): string
414429
{
415430
if ($this->has('ToUnicode')) {
416-
$bytes = $this->tableSizes['from'];
431+
return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
432+
}
417433

418-
if ($bytes) {
419-
$result = '';
420-
$length = \strlen($text);
434+
if ($this->has('Encoding')) {
435+
$result = $this->decodeContentByEncoding($text);
421436

422-
for ($i = 0; $i < $length; $i += $bytes) {
423-
$char = substr($text, $i, $bytes);
437+
if (null !== $result) {
438+
return $result;
439+
}
440+
}
424441

425-
if (false !== ($decoded = $this->translateChar($char, false))) {
426-
$char = $decoded;
427-
} elseif ($this->has('DescendantFonts')) {
428-
if ($this->get('DescendantFonts') instanceof PDFObject) {
429-
$fonts = $this->get('DescendantFonts')->getHeader()->getElements();
430-
} else {
431-
$fonts = $this->get('DescendantFonts')->getContent();
432-
}
433-
$decoded = false;
434-
435-
foreach ($fonts as $font) {
436-
if ($font instanceof self) {
437-
if (false !== ($decoded = $font->translateChar($char, false))) {
438-
$decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
439-
break;
440-
}
442+
return $this->decodeContentByAutodetectIfNecessary($text);
443+
}
444+
445+
/**
446+
* First try to decode $text by ToUnicode CMap.
447+
* If char translation not found in ToUnicode CMap tries:
448+
* - If DescendantFonts exists tries to decode char by one of that fonts.
449+
* - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding.
450+
* - If DescendantFonts does not exist just return "?" as decoded char.
451+
*
452+
* @todo This seems to be an invalid algorithm that does not follow the PDF format specification. Must be rewritten.
453+
*/
454+
private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
455+
{
456+
$bytes = $this->tableSizes['from'];
457+
458+
if ($bytes) {
459+
$result = '';
460+
$length = \strlen($text);
461+
462+
for ($i = 0; $i < $length; $i += $bytes) {
463+
$char = substr($text, $i, $bytes);
464+
465+
if (false !== ($decoded = $this->translateChar($char, false))) {
466+
$char = $decoded;
467+
} elseif ($this->has('DescendantFonts')) {
468+
if ($this->get('DescendantFonts') instanceof PDFObject) {
469+
$fonts = $this->get('DescendantFonts')->getHeader()->getElements();
470+
} else {
471+
$fonts = $this->get('DescendantFonts')->getContent();
472+
}
473+
$decoded = false;
474+
475+
foreach ($fonts as $font) {
476+
if ($font instanceof self) {
477+
if (false !== ($decoded = $font->translateChar($char, false))) {
478+
$decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
479+
break;
441480
}
442481
}
482+
}
443483

444-
if (false !== $decoded) {
445-
$char = $decoded;
446-
} else {
447-
$char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
448-
}
484+
if (false !== $decoded) {
485+
$char = $decoded;
449486
} else {
450-
$char = self::MISSING;
487+
$char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
451488
}
452-
453-
$result .= $char;
489+
} else {
490+
$char = self::MISSING;
454491
}
455492

456-
$text = $result;
493+
$result .= $char;
457494
}
458-
} elseif ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
459-
/** @var Encoding $encoding */
460-
$encoding = $this->get('Encoding');
461-
$unicode = mb_check_encoding($text, 'UTF-8');
462-
$result = '';
463-
if ($unicode) {
464-
$chars = preg_split(
465-
'//s'.($unicode ? 'u' : ''),
466-
$text,
467-
-1,
468-
\PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
469-
);
470-
471-
foreach ($chars as $char) {
472-
$dec_av = hexdec(bin2hex($char));
473-
$dec_ap = $encoding->translateChar($dec_av);
474-
$result .= self::uchr($dec_ap ?? $dec_av);
475-
}
476-
} else {
477-
$length = \strlen($text);
478495

479-
for ($i = 0; $i < $length; ++$i) {
480-
$dec_av = hexdec(bin2hex($text[$i]));
481-
$dec_ap = $encoding->translateChar($dec_av);
482-
$result .= self::uchr($dec_ap ?? $dec_av);
483-
}
484-
}
485496
$text = $result;
486-
} elseif ($this->get('Encoding') instanceof Element &&
487-
$this->get('Encoding')->equals('MacRomanEncoding')) {
488-
// mb_convert_encoding does not support MacRoman/macintosh,
489-
// so we use iconv() here
490-
$text = iconv('macintosh', 'UTF-8', $text);
491-
} elseif (!mb_check_encoding($text, 'UTF-8')) {
492-
// don't double-encode strings already in UTF-8
493-
$text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
494497
}
495498

496499
return $text;
497500
}
501+
502+
/**
503+
* Decode content by any type of Encoding (dictionary's item) instance.
504+
*/
505+
private function decodeContentByEncoding(string $text): ?string
506+
{
507+
$encoding = $this->get('Encoding');
508+
509+
// When Encoding is referenced by object id (/Encoding 520 0 R) but the object itself does not contain `/Type /Encoding` in its dictionary.
510+
if ($encoding instanceof PDFObject) {
511+
$encoding = $this->getInitializedEncodingByPdfObject($encoding);
512+
}
513+
514+
// When Encoding is referenced by object id (/Encoding 520 0 R) and the object itself contains `/Type /Encoding` in its dictionary.
515+
if ($encoding instanceof Encoding) {
516+
return $this->decodeContentByEncodingEncoding($text, $encoding);
517+
}
518+
519+
// When Encoding is just string (/Encoding /WinAnsiEncoding)
520+
if ($encoding instanceof Element) { //todo: ElementString class must by used?
521+
return $this->decodeContentByEncodingElement($text, $encoding);
522+
}
523+
524+
// don't double-encode strings already in UTF-8
525+
if (!mb_check_encoding($text, 'UTF-8')) {
526+
return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
527+
}
528+
529+
return $text;
530+
}
531+
532+
/**
533+
* Returns the cached Encoding instance if one was already created; otherwise creates it from the given PDFObject instance and caches it.
534+
*/
535+
private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
536+
{
537+
if (!$this->initializedEncodingByPdfObject) {
538+
$this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);
539+
}
540+
541+
return $this->initializedEncodingByPdfObject;
542+
}
543+
544+
/**
545+
* Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
546+
*/
547+
private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
548+
{
549+
$result = '';
550+
$length = \strlen($text);
551+
552+
for ($i = 0; $i < $length; ++$i) {
553+
$dec_av = hexdec(bin2hex($text[$i]));
554+
$dec_ap = $encoding->translateChar($dec_av);
555+
$result .= self::uchr($dec_ap ?? $dec_av);
556+
}
557+
558+
return $result;
559+
}
560+
561+
/**
562+
* Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
563+
*/
564+
private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
565+
{
566+
$pdfEncodingName = $encoding->getContent();
567+
568+
// mb_convert_encoding does not support MacRoman/macintosh,
569+
// so we use iconv() here
570+
$iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);
571+
572+
return $iconvEncodingName ? iconv($iconvEncodingName, 'UTF-8', $text) : null;
573+
}
574+
575+
/**
576+
* Convert PDF encoding name to iconv-known encoding name.
577+
*/
578+
private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
579+
{
580+
$pdfToIconvEncodingNameMap = [
581+
'StandardEncoding' => 'ISO-8859-1',
582+
'MacRomanEncoding' => 'MACINTOSH',
583+
'WinAnsiEncoding' => 'CP1252',
584+
];
585+
586+
return \array_key_exists($pdfEncodingName, $pdfToIconvEncodingNameMap)
587+
? $pdfToIconvEncodingNameMap[$pdfEncodingName]
588+
: null;
589+
}
590+
591+
/**
592+
* If string seems like "utf-8" encoded string do nothing and just return given string as is.
593+
* Otherwise, interpret string as "Windows-1252" encoded string.
594+
*
595+
* @return string|false
596+
*/
597+
private function decodeContentByAutodetectIfNecessary(string $text)
598+
{
599+
if (mb_check_encoding($text, 'UTF-8')) {
600+
return $text;
601+
}
602+
603+
return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
604+
//todo: Why exactly `Windows-1252` used?
605+
}
606+
607+
/**
608+
* Create Encoding instance by PDFObject instance and init it.
609+
*/
610+
private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
611+
{
612+
$encoding = $this->createEncodingByPdfObject($PDFObject);
613+
$encoding->init();
614+
615+
return $encoding;
616+
}
617+
618+
/**
619+
* Create Encoding instance by PDFObject instance (without init).
620+
*/
621+
private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
622+
{
623+
$document = $PDFObject->getDocument();
624+
$header = $PDFObject->getHeader();
625+
$content = $PDFObject->getContent();
626+
$config = $PDFObject->getConfig();
627+
628+
return new Encoding($document, $header, $content, $config);
629+
}
498630
}

src/Smalot/PdfParser/PDFObject.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,21 @@ public function init()
8787
{
8888
}
8989

90+
public function getDocument(): Document
91+
{
92+
return $this->document;
93+
}
94+
9095
public function getHeader(): ?Header
9196
{
9297
return $this->header;
9398
}
9499

100+
public function getConfig(): ?Config
101+
{
102+
return $this->config;
103+
}
104+
95105
/**
96106
* @return Element|PDFObject|Header
97107
*/

tests/Integration/FontTest.php

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -340,14 +340,32 @@ public function testDecodeText(): void
340340
],
341341
];
342342
$this->assertEquals('æöü', $font->decodeText($commands));
343+
}
343344

344-
$commands = [
345-
[
346-
't' => '<',
347-
'c' => 'C3A6C3B6C3BC', //Unicode encoded string
348-
],
349-
];
350-
$this->assertEquals('æöü', $font->decodeText($commands));
345+
/**
346+
* Font could have indirect encoding without `/Type /Encoding`
347+
* which would be instance of PDFObject class (but not Encoding or ElementString).
348+
*
349+
* @see https://github.com/smalot/pdfparser/pull/500
350+
*/
351+
public function testDecodeTextForFontWithIndirectEncodingWithoutTypeEncoding(): void
352+
{
353+
$filename = $this->rootDir.'/samples/bugs/PullRequest500.pdf';
354+
$parser = $this->getParserInstance();
355+
$document = $parser->parseFile($filename);
356+
$pages = $document->getPages();
357+
$page1 = reset($pages);
358+
$page1Text = $page1->getText();
359+
$expectedText = <<<TEXT
360+
Export\u{a0}transakční\u{a0}historie
361+
Typ\u{a0}produktu:\u{a0}Podnikatelský\u{a0}účet\u{a0}Maxi
362+
Číslo\u{a0}účtu:\u{a0}0000000000/0000
363+
Počáteční\u{a0}zůstatek: 000\u{a0}000,00\u{a0}
364+
Konečný\u{a0}zůstatek: 000\u{a0}000,00\u{a0}
365+
Cena\u{a0}za\u{a0}služby
366+
TEXT;
367+
368+
$this->assertEquals($expectedText, trim($page1Text));
351369
}
352370

353371
/**

tests/Performance/runPerformanceTests.php

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,15 @@
22

33
require __DIR__.'/../../vendor/autoload.php';
44

5+
use Tests\Smalot\PdfParser\Performance\Exception\PerformanceFailException;
6+
use Tests\Smalot\PdfParser\Performance\Test\AbstractPerformanceTest;
7+
use Tests\Smalot\PdfParser\Performance\Test\DocumentDictionaryCacheTest;
8+
59
$tests = [
6-
new \Tests\Smalot\PdfParser\Performance\Test\DocumentDictionaryCacheTest(),
10+
new DocumentDictionaryCacheTest(),
711
];
812

9-
foreach ($tests as $test) { /* @var $test \Tests\Smalot\PdfParser\Performance\Test\AbstractPerformanceTest */
13+
foreach ($tests as $test) { /* @var $test AbstractPerformanceTest */
1014
$test->init();
1115

1216
$startTime = microtime(true);
@@ -16,6 +20,6 @@
1620
$time = $endTime - $startTime;
1721

1822
if ($test->getMaxEstimatedTime() <= $time) {
19-
throw new \Tests\Smalot\PdfParser\Performance\Exception\PerformanceFailException(sprintf('Performance failed on test "%s". Time taken was %.2f seconds, expected less than %d seconds.', get_class($test), $time, $test->getMaxEstimatedTime()));
23+
throw new PerformanceFailException(sprintf('Performance failed on test "%s". Time taken was %.2f seconds, expected less than %d seconds.', get_class($test), $time, $test->getMaxEstimatedTime()));
2024
}
2125
}

0 commit comments

Comments
 (0)