Fix encoding for encoding dictionary without Type item. (#500)

likemusic · k00ni · j0k3r · web-flow · commit 4551cd0f0744 · 2022-02-03T14:28:46.000+01:00
* Font::decodeContent() fixes to support Encoding dictionaries without Type header. * Pass $unicode by reference in Font::decodeContentByEncoding(). * Add encoding initialization. * Font's initialized encoding in private property. * Add "ext-iconv" to "require" in composer.json * Delete unnecessary unicode string test for #95 test. * Fix misprint. * Add comments + small refactoring. * Run dev-tools/vendor/bin/php-cs-fixer fix Fixed all files in 0.018 seconds, 12.000 MB memory used * Fixes by @k00ni suggestions. * Add test (with pdf file) for issue in PR #500. * Add pdf-file for test. * Add comments to Font class methods. * Apply suggestions from code review Co-authored-by: Konrad Abicht <hi@inspirito.de> * Delete throws in phpDoc for test. * Fixes according to @k00ni suggestions. - Add return types to tests methods. - Fix todos in phpDocs. * Apply suggestions from code review Co-authored-by: Konrad Abicht <hi@inspirito.de> * Update test file (which opens without error in Adobe Acrobat Reader). * CS-fixer fix. * Avoid throwing error when encoding isn't found (previous behavior) Co-authored-by: Konrad Abicht <hi@inspirito.de> Co-authored-by: Jeremy Benoist <jeremy.benoist@gmail.com>
diff --git a/composer.json b/composer.json
@@ -17,7 +17,8 @@
     "require": {
         "php": ">=7.1",
         "symfony/polyfill-mbstring": "^1.18",
-        "ext-zlib": "*"
+        "ext-zlib": "*",
+        "ext-iconv": "*"
     },
     "autoload": {
         "psr-0": {
diff --git a/samples/bugs/PullRequest500.pdf b/samples/bugs/PullRequest500.pdf
diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php
@@ -57,6 +57,19 @@ class Font extends PDFObject
      */
     private static $uchrCache = [];
 
+    /**
+     * In some PDF-files encoding could be referenced by object id but object itself does not contain
+     * `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
+     * \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
+     *
+     * Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
+     *
+     * @var Encoding
+     *
+     * @see https://github.com/smalot/pdfparser/pull/500
+     */
+    private $initializedEncodingByPdfObject;
+
     public function init()
     {
         // Load translate table.
@@ -408,91 +421,210 @@ public function decodeText(array $commands): string
     }
 
     /**
+     * Decode given $text to "utf-8" encoded string.
+     *
      * @param bool $unicode This parameter is deprecated and might be removed in a future release
      */
     public function decodeContent(string $text, ?bool &$unicode = null): string
     {
         if ($this->has('ToUnicode')) {
-            $bytes = $this->tableSizes['from'];
+            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
+        }
 
-            if ($bytes) {
-                $result = '';
-                $length = \strlen($text);
+        if ($this->has('Encoding')) {
+            $result = $this->decodeContentByEncoding($text);
 
-                for ($i = 0; $i < $length; $i += $bytes) {
-                    $char = substr($text, $i, $bytes);
+            if (null !== $result) {
+                return $result;
+            }
+        }
 
-                    if (false !== ($decoded = $this->translateChar($char, false))) {
-                        $char = $decoded;
-                    } elseif ($this->has('DescendantFonts')) {
-                        if ($this->get('DescendantFonts') instanceof PDFObject) {
-                            $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
-                        } else {
-                            $fonts = $this->get('DescendantFonts')->getContent();
-                        }
-                        $decoded = false;
-
-                        foreach ($fonts as $font) {
-                            if ($font instanceof self) {
-                                if (false !== ($decoded = $font->translateChar($char, false))) {
-                                    $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
-                                    break;
-                                }
+        return $this->decodeContentByAutodetectIfNecessary($text);
+    }
+
+    /**
+     * First try to decode $text by ToUnicode CMap.
+     * If a char translation is not found in the ToUnicode CMap:
+     *  - If DescendantFonts exists, try to decode the char by one of those fonts.
+     *      - If decoding by DescendantFonts fails, interpret the char as a "Windows-1252" encoded string.
+     *  - If DescendantFonts does not exist, just return "?" as the decoded char.
+     *
+     * @todo This seems to be an invalid algorithm that does not follow the PDF format specification. Must be rewritten.
+     */
+    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
+    {
+        $bytes = $this->tableSizes['from'];
+
+        if ($bytes) {
+            $result = '';
+            $length = \strlen($text);
+
+            for ($i = 0; $i < $length; $i += $bytes) {
+                $char = substr($text, $i, $bytes);
+
+                if (false !== ($decoded = $this->translateChar($char, false))) {
+                    $char = $decoded;
+                } elseif ($this->has('DescendantFonts')) {
+                    if ($this->get('DescendantFonts') instanceof PDFObject) {
+                        $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
+                    } else {
+                        $fonts = $this->get('DescendantFonts')->getContent();
+                    }
+                    $decoded = false;
+
+                    foreach ($fonts as $font) {
+                        if ($font instanceof self) {
+                            if (false !== ($decoded = $font->translateChar($char, false))) {
+                                $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
+                                break;
                             }
                         }
+                    }
 
-                        if (false !== $decoded) {
-                            $char = $decoded;
-                        } else {
-                            $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
-                        }
+                    if (false !== $decoded) {
+                        $char = $decoded;
                     } else {
-                        $char = self::MISSING;
+                        $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
                     }
-
-                    $result .= $char;
+                } else {
+                    $char = self::MISSING;
                 }
 
-                $text = $result;
+                $result .= $char;
             }
-        } elseif ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
-            /** @var Encoding $encoding */
-            $encoding = $this->get('Encoding');
-            $unicode = mb_check_encoding($text, 'UTF-8');
-            $result = '';
-            if ($unicode) {
-                $chars = preg_split(
-                        '//s'.($unicode ? 'u' : ''),
-                        $text,
-                        -1,
-                        \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
-                );
-
-                foreach ($chars as $char) {
-                    $dec_av = hexdec(bin2hex($char));
-                    $dec_ap = $encoding->translateChar($dec_av);
-                    $result .= self::uchr($dec_ap ?? $dec_av);
-                }
-            } else {
-                $length = \strlen($text);
 
-                for ($i = 0; $i < $length; ++$i) {
-                    $dec_av = hexdec(bin2hex($text[$i]));
-                    $dec_ap = $encoding->translateChar($dec_av);
-                    $result .= self::uchr($dec_ap ?? $dec_av);
-                }
-            }
             $text = $result;
-        } elseif ($this->get('Encoding') instanceof Element &&
-                  $this->get('Encoding')->equals('MacRomanEncoding')) {
-            // mb_convert_encoding does not support MacRoman/macintosh,
-            // so we use iconv() here
-            $text = iconv('macintosh', 'UTF-8', $text);
-        } elseif (!mb_check_encoding($text, 'UTF-8')) {
-            // don't double-encode strings already in UTF-8
-            $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
         }
 
         return $text;
     }
+
+    /**
+     * Decode content by any type of Encoding (dictionary's item) instance.
+     */
+    private function decodeContentByEncoding(string $text): ?string
+    {
+        $encoding = $this->get('Encoding');
+
+        // When Encoding is referenced by object id (/Encoding 520 0 R) but the object itself does not contain `/Type /Encoding` in its dictionary.
+        if ($encoding instanceof PDFObject) {
+            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
+        }
+
+        // When Encoding is referenced by object id (/Encoding 520 0 R) and the object itself contains `/Type /Encoding` in its dictionary.
+        if ($encoding instanceof Encoding) {
+            return $this->decodeContentByEncodingEncoding($text, $encoding);
+        }
+
+        // When Encoding is just string (/Encoding /WinAnsiEncoding)
+        if ($encoding instanceof Element) { //todo: ElementString class must by used?
+            return $this->decodeContentByEncodingElement($text, $encoding);
+        }
+
+        // don't double-encode strings already in UTF-8
+        if (!mb_check_encoding($text, 'UTF-8')) {
+            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
+        }
+
+        return $text;
+    }
+
+    /**
+     * Returns the Encoding instance built from a PDFObject instance, creating and caching it on the first call.
+     */
+    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
+    {
+        if (!$this->initializedEncodingByPdfObject) {
+            $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);
+        }
+
+        return $this->initializedEncodingByPdfObject;
+    }
+
+    /**
+     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
+     */
+    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
+    {
+        $result = '';
+        $length = \strlen($text);
+
+        for ($i = 0; $i < $length; ++$i) {
+            $dec_av = hexdec(bin2hex($text[$i]));
+            $dec_ap = $encoding->translateChar($dec_av);
+            $result .= self::uchr($dec_ap ?? $dec_av);
+        }
+
+        return $result;
+    }
+
+    /**
+     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
+     */
+    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
+    {
+        $pdfEncodingName = $encoding->getContent();
+
+        // mb_convert_encoding does not support MacRoman/macintosh,
+        // so we use iconv() here
+        $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);
+
+        return $iconvEncodingName ? iconv($iconvEncodingName, 'UTF-8', $text) : null;
+    }
+
+    /**
+     * Convert PDF encoding name to iconv-known encoding name.
+     */
+    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
+    {
+        $pdfToIconvEncodingNameMap = [
+            'StandardEncoding' => 'ISO-8859-1',
+            'MacRomanEncoding' => 'MACINTOSH',
+            'WinAnsiEncoding' => 'CP1252',
+        ];
+
+        return \array_key_exists($pdfEncodingName, $pdfToIconvEncodingNameMap)
+            ? $pdfToIconvEncodingNameMap[$pdfEncodingName]
+            : null;
+    }
+
+    /**
+     * If the string looks like a "utf-8" encoded string, do nothing and return it as is.
+     * Otherwise, interpret the string as a "Windows-1252" encoded string.
+     *
+     * @return string|false
+     */
+    private function decodeContentByAutodetectIfNecessary(string $text)
+    {
+        if (mb_check_encoding($text, 'UTF-8')) {
+            return $text;
+        }
+
+        return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
+        //todo: Why exactly is `Windows-1252` used?
+    }
+
+    /**
+     * Create Encoding instance by PDFObject instance and init it.
+     */
+    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
+    {
+        $encoding = $this->createEncodingByPdfObject($PDFObject);
+        $encoding->init();
+
+        return $encoding;
+    }
+
+    /**
+     * Create Encoding instance by PDFObject instance (without init).
+     */
+    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
+    {
+        $document = $PDFObject->getDocument();
+        $header = $PDFObject->getHeader();
+        $content = $PDFObject->getContent();
+        $config = $PDFObject->getConfig();
+
+        return new Encoding($document, $header, $content, $config);
+    }
 }
diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php
@@ -87,11 +87,21 @@ public function init()
     {
     }
 
+    public function getDocument(): Document
+    {
+        return $this->document;
+    }
+
     public function getHeader(): ?Header
     {
         return $this->header;
     }
 
+    public function getConfig(): ?Config
+    {
+        return $this->config;
+    }
+
     /**
      * @return Element|PDFObject|Header
      */
diff --git a/tests/Integration/FontTest.php b/tests/Integration/FontTest.php
@@ -340,14 +340,32 @@ public function testDecodeText(): void
             ],
         ];
         $this->assertEquals('æöü', $font->decodeText($commands));
+    }
 
-        $commands = [
-            [
-                't' => '<',
-                'c' => 'C3A6C3B6C3BC', //Unicode encoded string
-            ],
-        ];
-        $this->assertEquals('æöü', $font->decodeText($commands));
+    /**
+     * Font could have indirect encoding without `/Type /Encoding`
+     * which would be instance of PDFObject class (but not Encoding or ElementString).
+     *
+     * @see https://github.com/smalot/pdfparser/pull/500
+     */
+    public function testDecodeTextForFontWithIndirectEncodingWithoutTypeEncoding(): void
+    {
+        $filename = $this->rootDir.'/samples/bugs/PullRequest500.pdf';
+        $parser = $this->getParserInstance();
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page1 = reset($pages);
+        $page1Text = $page1->getText();
+        $expectedText = <<<TEXT
+Export\u{a0}transakční\u{a0}historie
+Typ\u{a0}produktu:\u{a0}Podnikatelský\u{a0}účet\u{a0}Maxi
+Číslo\u{a0}účtu:\u{a0}0000000000/0000
+Počáteční\u{a0}zůstatek: 000\u{a0}000,00\u{a0}Kč
+Konečný\u{a0}zůstatek: 000\u{a0}000,00\u{a0}Kč
+Cena\u{a0}za\u{a0}služby
+TEXT;
+
+        $this->assertEquals($expectedText, trim($page1Text));
     }
 
     /**
diff --git a/tests/Performance/runPerformanceTests.php b/tests/Performance/runPerformanceTests.php
@@ -2,11 +2,15 @@
 
 require __DIR__.'/../../vendor/autoload.php';
 
+use Tests\Smalot\PdfParser\Performance\Exception\PerformanceFailException;
+use Tests\Smalot\PdfParser\Performance\Test\AbstractPerformanceTest;
+use Tests\Smalot\PdfParser\Performance\Test\DocumentDictionaryCacheTest;
+
 $tests = [
-    new \Tests\Smalot\PdfParser\Performance\Test\DocumentDictionaryCacheTest(),
+    new DocumentDictionaryCacheTest(),
 ];
 
-foreach ($tests as $test) { /* @var $test \Tests\Smalot\PdfParser\Performance\Test\AbstractPerformanceTest */
+foreach ($tests as $test) { /* @var $test AbstractPerformanceTest */
     $test->init();
 
     $startTime = microtime(true);
@@ -16,6 +20,6 @@
     $time = $endTime - $startTime;
 
     if ($test->getMaxEstimatedTime() <= $time) {
-        throw new \Tests\Smalot\PdfParser\Performance\Exception\PerformanceFailException(sprintf('Performance failed on test "%s". Time taken was %.2f seconds, expected less than %d seconds.', get_class($test), $time, $test->getMaxEstimatedTime()));
+        throw new PerformanceFailException(sprintf('Performance failed on test "%s". Time taken was %.2f seconds, expected less than %d seconds.', get_class($test), $time, $test->getMaxEstimatedTime()));
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -87,11 +87,21 @@ public function init()`
`87`	`87`	`{`
`88`	`88`	`}`
`89`	`89`
	`90`	`+ public function getDocument(): Document`
	`91`	`+ {`
	`92`	`+ return $this->document;`
	`93`	`+ }`
	`94`	`+`
`90`	`95`	`public function getHeader(): ?Header`
`91`	`96`	`{`
`92`	`97`	`return $this->header;`
`93`	`98`	`}`
`94`	`99`
	`100`	`+ public function getConfig(): ?Config`
	`101`	`+ {`
	`102`	`+ return $this->config;`
	`103`	`+ }`
	`104`	`+`
`95`	`105`	`/**`
`96`	`106`	`* @return Element\|PDFObject\|Header`
`97`	`107`	`*/`