Skip to content

Commit

Permalink
TIKA-2342: support PDFBox IgnoreContentStreamSpaceGlyphs; add test; …
Browse files Browse the repository at this point in the history
…remove dead code line
  • Loading branch information
THausherr committed Dec 17, 2024
1 parent 72c6232 commit fb1f238
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,24 @@ public boolean isSuppressDuplicateOverlappingText() {
return defaultConfig.isSuppressDuplicateOverlappingText();
}

/**
 * Controls whether space glyphs found in the PDF content stream are ignored, so that
 * word breaks are determined purely by the text-positioning algorithm (PDFBOX-3774).
 * <p>
 * Enabling this can improve extraction for documents whose content stream is sorted by
 * position and contains text overlapping spaces; the trade-off is that some word breaks
 * may be missing from the output. Disabled by default.
 * <p>
 * The value is forwarded to this parser's default configuration.
 *
 * @param v {@code true} to ignore content-stream space glyphs, {@code false} to keep them
 */
@Field
public void setIgnoreContentStreamSpaceGlyphs(boolean v) {
    this.defaultConfig.setIgnoreContentStreamSpaceGlyphs(v);
}

/**
 * Reports whether the default configuration is set to ignore space glyphs in the
 * content stream.
 *
 * @return the value currently held by the default configuration
 * @see #setIgnoreContentStreamSpaceGlyphs(boolean)
 */
public boolean isIgnoreContentStreamSpaceGlyphs() {
    final boolean ignoreSpaceGlyphs = defaultConfig.isIgnoreContentStreamSpaceGlyphs();
    return ignoreSpaceGlyphs;
}

/**
* If true, the parser should try to remove duplicated
* text over the same region. This is needed for some
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ public ImageType getImageType() {
// True if we let PDFBox remove duplicate overlapping text.
// Off by default.
private boolean suppressDuplicateOverlappingText = false;

// True if we let PDFBox ignore spaces in the content stream and rely purely on the
// algorithm to decide where word breaks go (PDFBOX-3774).
// Off by default.
private boolean ignoreContentStreamSpaceGlyphs = false;

// True if we extract annotation text ourselves
// (workaround for PDFBOX-1143).
// On by default.
private boolean extractAnnotationText = true;
Expand Down Expand Up @@ -223,6 +226,8 @@ public void configure(PDF2XHTML pdf2XHTML) {
pdf2XHTML.setDropThreshold(dropThreshold);
}
pdf2XHTML.setSuppressDuplicateOverlappingText(isSuppressDuplicateOverlappingText());
// TODO TIKA-2342 activate after PDFBox release
//pdf2XHTML.setIgnoreContentStreamSpaceGlyphs(isIgnoreContentStreamSpaceGlyphs());
}

/**
Expand Down Expand Up @@ -404,6 +409,24 @@ public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlap
userConfigured.add("suppressDuplicateOverlappingText");
}

/**
 * Returns the current value of the "ignore content stream space glyphs" option.
 *
 * @return {@code true} if space glyphs in the content stream should be ignored;
 *         {@code false} (the initial value) otherwise
 * @see #setIgnoreContentStreamSpaceGlyphs(boolean)
 */
public boolean isIgnoreContentStreamSpaceGlyphs() {
    return this.ignoreContentStreamSpaceGlyphs;
}

/**
 * Sets whether space glyphs in the content stream are ignored, leaving word-break
 * detection entirely to the text-positioning algorithm (PDFBOX-3774).
 * <p>
 * This can improve text extraction where the content stream is sorted by position and
 * has text overlapping spaces, at the risk of some word breaks not being added to the
 * output. Disabled by default.
 * <p>
 * Calling this setter also records the property name in {@code userConfigured}
 * (presumably so explicitly set options can be distinguished from defaults; confirm
 * against the code that reads that collection).
 *
 * @param ignoreContentStreamSpaceGlyphs {@code true} to ignore content-stream spaces
 */
public void setIgnoreContentStreamSpaceGlyphs(boolean ignoreContentStreamSpaceGlyphs) {
    // Store the new value first, then flag the option as explicitly configured.
    this.ignoreContentStreamSpaceGlyphs = ignoreContentStreamSpaceGlyphs;
    this.userConfigured.add("ignoreContentStreamSpaceGlyphs");
}

/**
* @see #setExtractAnnotationText(boolean)
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,37 @@ public void testDuplicateOverlappingText() throws Exception {

}

// TODO TIKA-2342 activate after PDFBox release
// @Test
/**
 * Checks the ignoreContentStreamSpaceGlyphs option against
 * testContentStreamSpaceGlyphs.pdf, first with the configuration held by the
 * parser instance and then with a configuration supplied through the ParseContext.
 * In both passes the expectations are the same: the output contains "( )overlap"
 * with defaults and with only the ignore flag enabled, and "( overlap )" once
 * sort-by-position is enabled as well.
 */
public void testIgnoreContentStreamSpaceGlyphs() throws Exception {
    final String pdfName = "testContentStreamSpaceGlyphs.pdf";

    // ---- Pass 1: settings applied on the parser's own config ----
    PDFParser pdfParser = new PDFParser();

    // Default is false (keep spaces, don't sort):
    XMLResult parserDefaults = getXML(pdfName, pdfParser);
    assertContains("( )overlap", parserDefaults.xml);

    // Ignore flag alone, still unsorted.
    pdfParser.getPDFParserConfig().setIgnoreContentStreamSpaceGlyphs(true);
    XMLResult parserIgnoring = getXML(pdfName, pdfParser);
    assertContains("( )overlap", parserIgnoring.xml);

    // Ignore flag combined with sorting by position.
    pdfParser.getPDFParserConfig().setSortByPosition(true);
    XMLResult parserIgnoringSorted = getXML(pdfName, pdfParser);
    assertContains("( overlap )", parserIgnoringSorted.xml);

    // ---- Pass 2: now try with autodetect, config passed via the ParseContext ----
    ParseContext parseContext = new ParseContext();
    PDFParserConfig contextConfig = new PDFParserConfig();
    parseContext.set(PDFParserConfig.class, contextConfig);

    // Default is false (keep spaces, don't sort):
    XMLResult contextDefaults = getXML(pdfName, parseContext);
    assertContains("( )overlap", contextDefaults.xml);

    contextConfig.setIgnoreContentStreamSpaceGlyphs(true);
    XMLResult contextIgnoring = getXML(pdfName, parseContext);
    assertContains("( )overlap", contextIgnoring.xml);

    contextConfig.setSortByPosition(true);
    XMLResult contextIgnoringSorted = getXML(pdfName, parseContext);
    assertContains("( overlap )", contextIgnoringSorted.xml);
}

@Test
public void testSortByPosition() throws Exception {
PDFParser parser = new PDFParser();
Expand Down Expand Up @@ -499,7 +530,7 @@ public void testSortByPosition() throws Exception {

config.setSortByPosition(true);
context.set(PDFParserConfig.class, config);
stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
//stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
content = getText("testPDFTwoTextBoxes.pdf", new Metadata(), context);
content = content.replaceAll("\\s+", " ");
// Column text is now interleaved:
Expand Down
Binary file not shown.

0 comments on commit fb1f238

Please sign in to comment.