@@ -57,6 +57,19 @@ class Font extends PDFObject
5757 */
5858 private static $ uchrCache = [];
5959
/**
 * In some PDF files the encoding is referenced by object id, but the referenced object does not
 * contain `/Type /Encoding` in its dictionary. Such objects are not initialized as Encoding by
 * \Smalot\PdfParser\PDFObject::factory() during file parsing (they are plain PDFObject instances).
 *
 * Therefore, an Encoding instance is created from them during decoding and cached in this property.
 * The property stays null until such an instance has been created.
 *
 * @var Encoding|null
 *
 * @see https://github.com/smalot/pdfparser/pull/500
 */
71+ private $ initializedEncodingByPdfObject ;
72+
6073 public function init ()
6174 {
6275 // Load translate table.
@@ -408,91 +421,210 @@ public function decodeText(array $commands): string
408421 }
409422
/**
 * Decode the given $text to a UTF-8 encoded string.
 *
 * Strategies are tried in this order:
 *  1. the font has a `ToUnicode` entry: decode through it (with the DescendantFonts fallback);
 *  2. the font has an `Encoding` entry: decode through it, unless that yields null;
 *  3. otherwise: keep valid UTF-8 as is, or treat the bytes as Windows-1252.
 *
 * @param string    $text    raw text bytes to decode
 * @param bool|null $unicode This parameter is deprecated and might be removed in a future release;
 *                           this method neither reads nor writes it
 */
public function decodeContent(string $text, ?bool &$unicode = null): string
{
    if ($this->has('ToUnicode')) {
        return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
    }

    // Null means "no usable Encoding entry": either it is absent or it could not be applied.
    $decodedByEncoding = $this->has('Encoding')
        ? $this->decodeContentByEncoding($text)
        : null;

    return $decodedByEncoding ?? $this->decodeContentByAutodetectIfNecessary($text);
}
444+
/**
 * Decode $text through the ToUnicode CMap, one source code at a time.
 *
 * The text is cut into chunks of `$this->tableSizes['from']` bytes. For every chunk:
 *  - if this font can translate it, the translation is used;
 *  - else, if `DescendantFonts` exists, each descendant that is a Font is asked in turn; the first
 *    translation found is converted from "Windows-1252" to "UTF-8" and used;
 *  - if no descendant can translate it, the chunk itself is interpreted as "Windows-1252";
 *  - if `DescendantFonts` does not exist, self::MISSING is used as the decoded char.
 *
 * When the chunk size is falsy the text is returned unchanged.
 *
 * @todo Seems this is an invalid algorithm that does not follow the PDF format specification. Must be rewritten.
 */
private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
{
    $chunkSize = $this->tableSizes['from'];

    // Without a chunk size the text cannot be split into source codes.
    if (!$chunkSize) {
        return $text;
    }

    $decodedText = '';
    $byteCount = \strlen($text);

    for ($offset = 0; $offset < $byteCount; $offset += $chunkSize) {
        $chunk = substr($text, $offset, $chunkSize);
        $translated = $this->translateChar($chunk, false);

        if (false !== $translated) {
            $decodedText .= $translated;

            continue;
        }

        if (!$this->has('DescendantFonts')) {
            $decodedText .= self::MISSING;

            continue;
        }

        $descendants = $this->get('DescendantFonts');
        $fonts = $descendants instanceof PDFObject
            ? $descendants->getHeader()->getElements()
            : $descendants->getContent();

        $fromDescendant = false;

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $fromDescendant = $font->translateChar($chunk, false);

            if (false !== $fromDescendant) {
                $fromDescendant = mb_convert_encoding($fromDescendant, 'UTF-8', 'Windows-1252');

                break;
            }
        }

        // Last resort: read the raw chunk as Windows-1252.
        $decodedText .= false !== $fromDescendant
            ? $fromDescendant
            : mb_convert_encoding($chunk, 'UTF-8', 'Windows-1252');
    }

    return $decodedText;
}
501+
/**
 * Decode content through the font's `Encoding` dictionary entry, whatever form that entry takes.
 *
 * @param string $text raw text bytes to decode
 *
 * @return string|null the decoded text, or null when the `Encoding` entry cannot be used for
 *                     decoding; the caller is expected to apply its own fallback in that case
 */
private function decodeContentByEncoding(string $text): ?string
{
    $encoding = $this->get('Encoding');

    // When Encoding is referenced by object id (/Encoding 520 0 R) but the object itself does not
    // contain `/Type /Encoding` in its dictionary, it is a generic PDFObject and an Encoding has to
    // be built from it. An object that already is an Encoding is excluded here so that it is used
    // as is instead of being re-created (relevant if Encoding is itself a PDFObject subclass).
    if ($encoding instanceof PDFObject && !$encoding instanceof Encoding) {
        $encoding = $this->getInitializedEncodingByPdfObject($encoding);
    }

    // Encoding referenced by object id whose dictionary contains `/Type /Encoding`,
    // or the instance built just above.
    if ($encoding instanceof Encoding) {
        return $this->decodeContentByEncodingEncoding($text, $encoding);
    }

    // When Encoding is just a name (/Encoding /WinAnsiEncoding).
    if ($encoding instanceof Element) { // todo: ElementString class must be used?
        return $this->decodeContentByEncodingElement($text, $encoding);
    }

    // Unknown kind of Encoding entry: report "not decoded" rather than duplicating the
    // caller's UTF-8 / Windows-1252 fallback here.
    return null;
}
531+
/**
 * Return the cached Encoding instance, creating it from the given PDFObject on first use.
 *
 * The cache is a single slot on this font and is not keyed by the argument: once an instance
 * has been created, later calls return it regardless of the PDFObject passed in.
 */
private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
{
    if ($this->initializedEncodingByPdfObject) {
        return $this->initializedEncodingByPdfObject;
    }

    return $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);
}
543+
/**
 * Decode content when the `Encoding` entry (given by $this->get('Encoding')) is an Encoding instance.
 *
 * The text is processed byte by byte: each byte value is passed to the encoding's translateChar();
 * when that yields null the byte value itself is kept. The resulting code is handed to self::uchr()
 * and the returned pieces are concatenated.
 */
private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
{
    $decoded = '';
    $byteCount = \strlen($text);

    for ($offset = 0; $offset < $byteCount; ++$offset) {
        $sourceCode = \ord($text[$offset]);
        $mappedCode = $encoding->translateChar($sourceCode);

        $decoded .= self::uchr(null !== $mappedCode ? $mappedCode : $sourceCode);
    }

    return $decoded;
}
560+
/**
 * Decode content when the `Encoding` entry (given by $this->get('Encoding')) is an Element,
 * i.e. a predefined encoding name such as /WinAnsiEncoding.
 *
 * @param string  $text     raw text bytes to decode
 * @param Element $encoding element whose content is the PDF encoding name
 *
 * @return string|null the text converted to UTF-8, or null when the encoding name is not
 *                     supported or the conversion itself fails
 */
private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
{
    $pdfEncodingName = $encoding->getContent();

    // mb_convert_encoding does not support MacRoman/macintosh,
    // so we use iconv() here
    $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);

    if (null === $iconvEncodingName) {
        return null;
    }

    $decoded = iconv($iconvEncodingName, 'UTF-8', $text);

    // iconv() returns false on failure. Returning that through the ?string return type would
    // either blank the whole text or raise a TypeError, so report "not decoded" instead.
    return false === $decoded ? null : $decoded;
}
574+
/**
 * Translate a PDF encoding name into the matching iconv encoding name.
 *
 * @return string|null the iconv name, or null when the PDF name is not in the supported list
 */
private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
{
    $iconvNamesByPdfName = [
        'StandardEncoding' => 'ISO-8859-1',
        'MacRomanEncoding' => 'MACINTOSH',
        'WinAnsiEncoding' => 'CP1252',
    ];

    return $iconvNamesByPdfName[$pdfEncodingName] ?? null;
}
590+
/**
 * If the string is valid "UTF-8", return it as is.
 * Otherwise interpret it as a "Windows-1252" encoded string and convert it to "UTF-8".
 *
 * @return string|false the result of mb_convert_encoding() when a conversion is performed
 */
private function decodeContentByAutodetectIfNecessary(string $text)
{
    // todo: Why exactly `Windows-1252` used?
    return mb_check_encoding($text, 'UTF-8')
        ? $text
        : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
}
606+
/**
 * Build an Encoding instance from the given PDFObject and call init() on it before returning it.
 */
private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
{
    $initializedEncoding = $this->createEncodingByPdfObject($PDFObject);

    // The instance is only handed out after init() has run.
    $initializedEncoding->init();

    return $initializedEncoding;
}
617+
/**
 * Build an Encoding instance from the document, header, content and config of the given
 * PDFObject. init() is not called on the new instance.
 */
private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
{
    return new Encoding(
        $PDFObject->getDocument(),
        $PDFObject->getHeader(),
        $PDFObject->getContent(),
        $PDFObject->getConfig()
    );
}
498630}
0 commit comments