diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php index f2660156..d1b8846f 100644 --- a/src/Smalot/PdfParser/PDFObject.php +++ b/src/Smalot/PdfParser/PDFObject.php @@ -224,6 +224,66 @@ private function formatContent(?string $content): string return ''; } + // Find all inline image content and replace it with placeholders so it isn't + // affected by the next steps + $pdfInlineImages = []; + $offsetBI = 0; + while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) { + // Attempt to determine if this instance of the 'BI' command + // actually occurred within a (string) using the following + // steps: + + // Step 1: Remove any escaped parentheses from the alleged + // image characteristics data + $para = str_replace(['\\(', '\\)'], '', $text[1][0]); + + // Step 2: Remove all correctly ordered and balanced + // parentheses from (strings), repeating until the text stops changing + do { + $paraTest = $para; + $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest); + } while ($para != $paraTest); + + $paraOpen = strpos($para, '('); + $paraClose = strpos($para, ')'); + + // Check: If the remaining text contains a close parenthesis + // ')' AND it occurs before any open parenthesis, then we + // are almost certain to be inside a (string) + if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) { + // Bump the search offset forward, past this 'BI', and match again + $offsetBI = (int) $text[1][1]; + continue; + } + + // Step 3: Double check that this is actually inline image + // data by parsing the alleged image characteristics as a + // dictionary + $dict = $this->parseDictionary('<<'.$text[1][0].'>>'); + + // Check if an image Width (W) and Height (H) are set in the dict + if ((isset($dict['W']) || isset($dict['Width'])) + && (isset($dict['H']) || isset($dict['Height']))) { + $id = uniqid('IMAGE_', true); + $pdfInlineImages[$id] = [ + preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]), + preg_replace(['/\r\n/', '/\r/', '/\n/'], '', 
$text[2][0]), + ]; + $content = preg_replace( + '/'.preg_quote($text[0][0], '/').'/', + '^^^'.$id.'^^^', + $content, + 1 + ); + } else { + // If there was no valid dictionary, or a height and width + // weren't specified, then we don't know what this is, so + // just leave it alone; bump the search offset forward and + // match again + $offsetBI = (int) $text[1][1]; + } + } + // Find all strings () and replace them so they aren't affected // by the next steps $pdfstrings = []; @@ -260,7 +320,7 @@ private function formatContent(?string $content): string // Find all dictionary << >> commands and replace them so they // aren't affected by the next steps $dictstore = []; - while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dicttext)) { + while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) { $dictid = uniqid('DICT_', true); $dictstore[$dictid] = $dicttext[1]; $content = preg_replace( @@ -319,6 +379,16 @@ private function formatContent(?string $content): string $content = str_replace('@@@'.$id.'@@@', $text, $content); } + // Restore the content of any inline images in place of their placeholders + $pdfInlineImages = array_reverse($pdfInlineImages, true); + foreach ($pdfInlineImages as $id => $image) { + $content = str_replace( + '^^^'.$id.'^^^', + "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." 
EI\r\n", + $content + ); + } + $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content)); return $content; diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php index 025c11b1..947a6944 100644 --- a/tests/PHPUnit/Integration/PDFObjectTest.php +++ b/tests/PHPUnit/Integration/PDFObjectTest.php @@ -286,6 +286,43 @@ public function testFormatContent(): void $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText()); } + /** + * Check that inline image data does not corrupt the stream + * + * @see: https://github.com/smalot/pdfparser/issues/691 + */ + public function testFormatContentInlineImages(): void + { + $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent'); + $formatContent->setAccessible(true); + + $cleaned = $formatContent->invoke( + $this->getPdfObjectInstance(new Document()), + 'BT (This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD ET q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150 +/BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g' + ); + + // PdfParser should not be fooled by Q's in inline image data; + // only one 'Q' command should be found + $commandQ = preg_match_all('/Q\r\n/', $cleaned); + $this->assertEquals(1, $commandQ); + + // The 'BI' inside a string should not be interpreted as the + // beginning of an inline image command + $this->assertStringContainsString('(This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD', $cleaned); + + $cleaned = $formatContent->invoke( + $this->getPdfObjectInstance(new Document()), + 'BT (This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD (There is no ID inline image in this data) TD (Nothing but text EI should be found) TD ET' + ); + + $this->assertEquals('BT'."\r\n". +'(This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD'."\r\n". 
+'(There is no ID inline image in this data) TD'."\r\n". +'(Nothing but text EI should be found) TD'."\r\n". +'ET', $cleaned); + } + public function testGetSectionsText(): void { $content = '/Shape <>BDC