TIKA-2342: support PDFBox IgnoreContentStreamSpaceGlyphs; add test; …

…remove dead code line
apache · Dec 17, 2024 · fb1f238 · fb1f238
1 parent 72c6232
commit fb1f238
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 1 deletion.
diff --git a/...rd-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/...rd-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -759,6 +759,24 @@ public boolean isSuppressDuplicateOverlappingText() {
         return defaultConfig.isSuppressDuplicateOverlappingText();
     }
 
+    /**
+     * If true, the parser should ignore spaces in the content stream and rely purely on the
+     * algorithm to determine where word breaks are (PDFBOX-3774). This can improve text extraction
+     * results where the content stream is sorted by position and has text overlapping spaces, but
+     * could cause some word breaks to not be added to the output. By default this is disabled.
+     */
+    @Field
+    public void setIgnoreContentStreamSpaceGlyphs(boolean v) {
+        defaultConfig.setIgnoreContentStreamSpaceGlyphs(v);
+    }
+
+    /**
+     * @see #setIgnoreContentStreamSpaceGlyphs(boolean)
+     */
+    public boolean isIgnoreContentStreamSpaceGlyphs() {
+        return defaultConfig.isIgnoreContentStreamSpaceGlyphs();
+    }
+
     /**
      * If true, the parser should try to remove duplicated
      * text over the same region.  This is needed for some

diff --git a/...ules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/...ules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -65,6 +65,9 @@ public ImageType getImageType() {
     // True if we let PDFBox remove duplicate overlapping text:
     private boolean suppressDuplicateOverlappingText = false;
 
+    // True if we let PDFBox ignore spaces in the content stream and rely purely on the algorithm:
+    private boolean ignoreContentStreamSpaceGlyphs = false;
+
     // True if we extract annotation text ourselves
     // (workaround for PDFBOX-1143):
     private boolean extractAnnotationText = true;
@@ -223,6 +226,8 @@ public void configure(PDF2XHTML pdf2XHTML) {
             pdf2XHTML.setDropThreshold(dropThreshold);
         }
         pdf2XHTML.setSuppressDuplicateOverlappingText(isSuppressDuplicateOverlappingText());
+        // TODO TIKA-2342 activate after PDFBox release
+        //pdf2XHTML.setIgnoreContentStreamSpaceGlyphs(isIgnoreContentStreamSpaceGlyphs());
     }
 
     /**
@@ -404,6 +409,24 @@ public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlap
         userConfigured.add("suppressDuplicateOverlappingText");
     }
 
+    /**
+     * @see #setIgnoreContentStreamSpaceGlyphs(boolean)
+     */
+    public boolean isIgnoreContentStreamSpaceGlyphs() {
+        return ignoreContentStreamSpaceGlyphs;
+    }
+
+    /**
+     * If true, the parser should ignore spaces in the content stream and rely purely on the
+     * algorithm to determine where word breaks are (PDFBOX-3774). This can improve text extraction
+     * results where the content stream is sorted by position and has text overlapping spaces, but
+     * could cause some word breaks to not be added to the output. By default this is disabled.
+     */
+    public void setIgnoreContentStreamSpaceGlyphs(boolean ignoreContentStreamSpaceGlyphs) {
+        this.ignoreContentStreamSpaceGlyphs = ignoreContentStreamSpaceGlyphs;
+        userConfigured.add("ignoreContentStreamSpaceGlyphs");
+    }
+
     /**
      * @see #setExtractAnnotationText(boolean)
      */

diff --git a/...odules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/...odules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -465,6 +465,37 @@ public void testDuplicateOverlappingText() throws Exception {
 
     }
 
+    // TODO TIKA-2342 activate after PDFBox release
+    // @Test
+    public void testIgnoreContentStreamSpaceGlyphs() throws Exception {
+        PDFParser parser = new PDFParser();
+        // Default is false (keep spaces, don't sort):
+        XMLResult r = getXML("testContentStreamSpaceGlyphs.pdf", parser);
+        assertContains("(                                      )overlap", r.xml);
+
+        parser.getPDFParserConfig().setIgnoreContentStreamSpaceGlyphs(true);
+        r = getXML("testContentStreamSpaceGlyphs.pdf", parser);
+        assertContains("( )overlap", r.xml);
+        parser.getPDFParserConfig().setSortByPosition(true);
+        r = getXML("testContentStreamSpaceGlyphs.pdf", parser);
+        assertContains("( overlap )", r.xml);
+
+        //now try with autodetect
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        r = getXML("testContentStreamSpaceGlyphs.pdf", context);
+        // Default is false (keep spaces, don't sort):
+        assertContains("(                                      )overlap", r.xml);
+
+        config.setIgnoreContentStreamSpaceGlyphs(true);
+        r = getXML("testContentStreamSpaceGlyphs.pdf", context);
+        assertContains("( )overlap", r.xml);
+        config.setSortByPosition(true);
+        r = getXML("testContentStreamSpaceGlyphs.pdf", context);
+        assertContains("( overlap )", r.xml);
+    }
+
     @Test
     public void testSortByPosition() throws Exception {
         PDFParser parser = new PDFParser();
@@ -499,7 +530,7 @@ public void testSortByPosition() throws Exception {
 
         config.setSortByPosition(true);
         context.set(PDFParserConfig.class, config);
-        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        //stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
         content = getText("testPDFTwoTextBoxes.pdf", new Metadata(), context);
         content = content.replaceAll("\\s+", " ");
         // Column text is now interleaved:

diff --git a/...tika-parser-pdf-module/src/test/resources/test-documents/testContentStreamSpaceGlyphs.pdf b/...tika-parser-pdf-module/src/test/resources/test-documents/testContentStreamSpaceGlyphs.pdf