From 6c1e1e68e8c817e334ed6e482a39032698c916f6 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 21 Nov 2024 21:41:25 +0000 Subject: [PATCH 01/21] fix LINEBLOCKSTARTS for new generated files #712 --- .../grobid/core/engines/FullTextParser.java | 4 +- .../core/engines/FullTextParserTest.java | 49 +++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index dfc623a7c2..2f10c762a9 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -725,6 +725,7 @@ else if (nbAuthorType > (bibDataSets.size() / 2)) } } + boolean isFirstBlockToken = true; while (n < lastPos) { if (blockIndex == dp2.getBlockPtr()) { //if (n > block.getEndToken()) { @@ -842,7 +843,7 @@ else if (lineStartX - previousLineStartX > characterWidth) features.alignmentStatus = "ALIGNEDLEFT"; } - if (n == 0) { + if (isFirstBlockToken) { features.lineStatus = "LINESTART"; // be sure that previous token is closing a line, except if it's a starting line if (previousFeatures != null) { @@ -1019,6 +1020,7 @@ else if (features.blockStatus == null) { mm += text.length(); nn += text.length(); previousFeatures = features; + isFirstBlockToken = false; } // lowest position of the block lowestPos = block.getY() + block.getHeight(); diff --git a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java index a4fb60a4e9..f04d2f0bff 100644 --- a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java +++ b/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java @@ -3,8 +3,12 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.document.Document; +import org.grobid.core.document.DocumentPiece; +import org.grobid.core.document.DocumentPointer; import org.grobid.core.factory.GrobidFactory; import org.grobid.core.layout.LayoutToken; +import org.grobid.core.layout.LayoutTokenization; import org.grobid.core.main.LibraryLoader; import org.grobid.core.utilities.GrobidProperties; import org.junit.AfterClass; @@ -12,13 +16,12 @@ import org.junit.BeforeClass; import org.junit.Test; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; import java.util.stream.Collectors; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasItem; import static org.hamcrest.collection.IsCollectionWithSize.hasSize; public class FullTextParserTest { @@ -41,6 +44,46 @@ public static void tearDown() { GrobidFactory.reset(); } + public DocumentPiece getWholeDocumentPiece(Document doc) { + return new DocumentPiece( + new DocumentPointer(0, 0, 0), + new DocumentPointer(0, doc.getTokenizations().size() - 1, doc.getTokenizations().size() - 1) + ); + } + + public SortedSet getWholeDocumentParts(Document doc) { + return new TreeSet<>(Collections.singleton( + getWholeDocumentPiece(doc) + )); + } + + @Test + public void testShouldOutputBlockStartForRegularBlock() throws Exception { + String blockText = "This is a block"; + Document doc = Document.createFromText(blockText); + SortedSet documentParts = getWholeDocumentParts(doc); + Pair dataAndTokens = FullTextParser.getBodyTextFeatured(doc, documentParts); +// LOGGER.debug("data debug: {}", dataAndTokens.getLeft()); + String[] lines = dataAndTokens.getLeft().split("\n"); + assertThat("lines[0] fields", Arrays.asList(lines[0].split("\\s")), is(hasItem("BLOCKSTART"))); + } + + @Test + public void testShouldOutputBlockStartForBlockStartingWithLineFeed() throws Exception { + String blockText = "\nThis is a block"; + Document doc = Document.createFromText(blockText); + assertThat( + "doc.block[0].tokens[0].text", + doc.getBlocks().get(0).getTokens().get(0).getText(), + is("\n") + ); + SortedSet documentParts = getWholeDocumentParts(doc); + Pair dataAndTokens = FullTextParser.getBodyTextFeatured(doc, documentParts); +// LOGGER.debug("data debug: {}", dataAndTokens.getLeft()); + String[] lines = dataAndTokens.getLeft().split("\n"); + assertThat("lines[0] fields", Arrays.asList(lines[0].split("\\s")), is(hasItem("BLOCKSTART"))); + } + @Test public void testProcessTrainingDataFigures_single_figure() throws Exception { String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; From d95a1ac53cde69ed00b46a68776a39008fea5797 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 4 Dec 2024 18:56:20 +0000 Subject: [PATCH 02/21] first attempt to fix tables on the fly --- .../main/java/org/grobid/core/data/Table.java | 11 + .../org/grobid/core/document/Document.java | 3 +- .../grobid/core/engines/FullTextParser.java | 242 +++++++++++++++--- 3 files changed, 216 insertions(+), 40 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 14d468418c..0764796b38 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -43,6 +43,9 @@ public class Table extends Figure { private List contentTokens = new ArrayList<>(); private List fullDescriptionTokens = new ArrayList<>(); + + // Contains the raw layoutTokens from the fulltext model + private List rawLayoutTokens = new ArrayList<>(); private boolean goodTable = true; private StringBuilder note = null; @@ -423,4 +426,12 @@ public boolean isGoodTable() { public String getTeiId() { return "tab_" + this.id; } + + public List getRawLayoutTokens() { + return rawLayoutTokens; + } + + public void setRawLayoutTokens(List rawLayoutTokens) { + this.rawLayoutTokens = rawLayoutTokens; + } } \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java index d7dc90b08b..3925470a45 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/Document.java +++ b/grobid-core/src/main/java/org/grobid/core/document/Document.java @@ -874,6 +874,7 @@ public static List getConnectedGraphics(Block block, Document doc public void postProcessTables() { for (Table table : tables) { if (!table.firstCheck()) { + table.setGoodTable(false); continue; } @@ -919,7 +920,7 @@ public void postProcessTables() { table.getContentTokens().clear(); table.getContentTokens().addAll(contentResult); - table.secondCheck(); + table.setGoodTable(table.secondCheck()); } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index dfc623a7c2..36e1065313 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -66,6 +66,8 @@ import java.util.StringTokenizer; import java.util.TreeSet; import java.util.regex.Matcher; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import nu.xom.Element; @@ -266,6 +268,8 @@ else if (config.getConsolidateCitations() == 2) //layoutTokensBody = featSeg.getB().getLayoutTokens(); resultBody = label(bodytext); + //Correct subsequent I-
or I- + resultBody = adjustInvalidSequenceOfStartLabels(resultBody); // we apply now the figure and table models based on the fulltext labeled output figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc); @@ -277,8 +281,128 @@ else if (config.getConsolidateCitations() == 2) figure.setCaptionLayoutTokens(captionProcess.getRight()); } } - + tables = processTables(resultBody, layoutTokenization.getTokenization(), doc); + + //We deal with tables considered bad by reverting them as , to reduce the risk them to be + // dropped later on. + + //TODO: double check the way the tables are validated + + List
badTables = tables.stream() + .filter(t -> !t.isGoodTable()) + .collect(Collectors.toList()); + + //LF: we update the resultBody sequence by reverting these tables as elements + if (CollectionUtils.isNotEmpty(badTables)) { + List> splitResult = Arrays.stream(resultBody.split("\n")) + .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) + .collect(Collectors.toList()); + + for (Table badTable : badTables) { + // Find the index of the first layoutToken of the table in the tokenization + List rawLayoutTokenTable = badTable.getRawLayoutTokens(); + LayoutToken firstLayoutTokenTable = rawLayoutTokenTable.get(0); + + final List documentTokenization = layoutTokenization.getTokenization(); + + int tokenIndex = IntStream.range(0, documentTokenization.size()) + .filter(i -> { + LayoutToken l = documentTokenization.get(i); + return l.getText().equals(firstLayoutTokenTable.getText()) + && l.getPage() == firstLayoutTokenTable.getPage() + && l.getOffset() == firstLayoutTokenTable.getOffset(); + }) + .findFirst() + .orElse(-1); + + System.out.println(tokenIndex); + + List candidateIndexes = IntStream.range(0, splitResult.size()) + .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) + && Iterables.getLast(splitResult.get(i)).equals("I-
")) + .boxed() + .collect(Collectors.toList()); + + if (candidateIndexes.isEmpty()) { + candidateIndexes = IntStream.range(0, splitResult.size()) + .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) + && Iterables.getLast(splitResult.get(i)).equals("
")) + .boxed() + .collect(Collectors.toList()); + if (candidateIndexes.isEmpty()) { + LOGGER.info("Cannot find the candidate index for fixing the tables."); + continue; + } + } + + // Need to match with the rest + List tokensNoSpace = rawLayoutTokenTable.stream() + .map(LayoutToken::getText) + .map(StringUtils::strip) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toList()); + + int resultIndexCandidate = -1; + if (tokensNoSpace.size() == 1){ + resultIndexCandidate = candidateIndexes.get(0); + } else { + for (int candidateIndex: candidateIndexes) { + List candidateTable = splitResult.subList(candidateIndex, candidateIndex + tokensNoSpace.size()) + .stream() + .map(i -> i.get(0)) + .collect(Collectors.toList()); + + String candidateTableText = String.join("", candidateTable); + String tokensText = String.join("", tokensNoSpace); + + if (candidateTableText.equals(tokensText)) { + resultIndexCandidate = candidateIndex; + break; + } + } + } + + if (resultIndexCandidate > -1) { + boolean first = true; + for (int i = resultIndexCandidate;i < resultIndexCandidate + tokensNoSpace.size(); i++) { + List line = splitResult.get(i); + String label = Iterables.getLast(line); + if (first) { + first = false; + } else { + if (label.startsWith("I-")) { + break; + } + } + line.set(line.size() - 1, label.replace("
", "")); + } + } else { + System.out.println("Cannot find the result index candiate."); + } + + +// List> badTableResult = Arrays.stream(badTable.getRawLayoutTokens().stream() +// .map(LayoutToken::getText) +// .toArray(String[]::new)) +// .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) +// .collect(Collectors.toList()); +// + + } + + String resultBody2 = splitResult.stream() + .map(l -> String.join("\t", l)) + .collect(Collectors.joining("\n")); + + resultBody = resultBody2; + + } + + tables = tables.stream() + .filter(Table::isGoodTable) + .collect(Collectors.toList()); + // further parse the caption for(Table table : tables) { if ( CollectionUtils.isNotEmpty(table.getCaptionLayoutTokens()) ) { @@ -316,7 +440,7 @@ else if (config.getConsolidateCitations() == 2) // callout in superscript is by error labeled as a numerical reference callout) List markerTypes = null; - if (resultBody != null) + if (resultBody != null) markerTypes = postProcessCallout(resultBody, layoutTokenization); // final combination @@ -556,6 +680,46 @@ protected static String postProcessFullTextLabeledText(String fulltextLabeledTex return result.toString(); } + protected static String adjustInvalidSequenceOfStartLabels(String fulltextLabeledText) { + if (fulltextLabeledText == null) + return null; + StringBuilder result = new StringBuilder(); + + String[] lines = fulltextLabeledText.split("\n"); + String previousLabel = null; + for(int i=0; i getBodyTextFeatured(Document doc, SortedSet documentBodyParts) { if ((documentBodyParts == null) || (documentBodyParts.size() == 0)) { @@ -1979,9 +2143,9 @@ private static boolean testClosingTag(StringBuilder buffer, buffer.append("\n\n"); } else if (lastTag0.equals("")) { buffer.append("\n\n"); - } else if (lastTag0.equals("") || - lastTag0.equals("") || - lastTag0.equals("") || + } else if (lastTag0.equals("") || + lastTag0.equals("") || + lastTag0.equals("") || lastTag0.equals("")) { buffer.append(""); @@ -2196,9 +2360,9 @@ protected List
processTables(String rese, for (Table result : localResults) { List localTokenizationTable = result.getLayoutTokens(); - //result.setLayoutTokens(tokenizationTable); + result.setRawLayoutTokens(tokenizationTable); - // block setting: we restrict to the tokenization of this particulart table + // block setting: we restrict to the tokenization of this particular table SortedSet blockPtrs = new TreeSet<>(); for (LayoutToken lt : localTokenizationTable) { if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) { @@ -2422,7 +2586,7 @@ protected List processEquations(String rese, } /** - * Ensure consistent use of callouts in the entire document body + * Ensure consistent use of callouts in the entire document body */ private List postProcessCallout(String result, LayoutTokenization layoutTokenization) { if (layoutTokenization == null) @@ -2482,7 +2646,7 @@ private List postProcessCallout(String result, LayoutTokenization la if (figureMarkerSeen.contains(refText)) { // already seen reference marker sequence, we skip it continue; - } + } MarkerType localMarkerType = CalloutAnalyzer.getCalloutType(refTokens); if (figureMarkerTypeCounts.get(localMarkerType) == null) figureMarkerTypeCounts.put(localMarkerType, 1); @@ -2495,7 +2659,7 @@ private List postProcessCallout(String result, LayoutTokenization la if (tableMarkerSeen.contains(refText)) { // already seen reference marker sequence, we skip it continue; - } + } MarkerType localMarkerType = CalloutAnalyzer.getCalloutType(refTokens); if (tableMarkerTypeCounts.get(localMarkerType) == null) tableMarkerTypeCounts.put(localMarkerType, 1); @@ -2508,16 +2672,16 @@ private List postProcessCallout(String result, LayoutTokenization la if (equationMarkerSeen.contains(refText)) { // already seen reference marker sequence, we skip it continue; - } + } MarkerType localMarkerType = CalloutAnalyzer.getCalloutType(refTokens); if (equationMarkerTypeCounts.get(localMarkerType) == null) equationMarkerTypeCounts.put(localMarkerType, 1); else - equationMarkerTypeCounts.put(localMarkerType, equationMarkerTypeCounts.get(localMarkerType)+1); + equationMarkerTypeCounts.put(localMarkerType, equationMarkerTypeCounts.get(localMarkerType)+1); if (!equationMarkerSeen.contains(refText)) - equationMarkerSeen.add(refText); - } + equationMarkerSeen.add(refText); + } } } @@ -2578,7 +2742,7 @@ private void toTEI(Document doc, teiFormatter, resCitations, config); if (acknowledgmentStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(acknowledgmentStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { @@ -2622,7 +2786,7 @@ private void toTEI(Document doc, config); } if (fundingStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { @@ -2660,7 +2824,7 @@ private void toTEI(Document doc, resCitations, config); if (fundingStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null){ @@ -2710,15 +2874,15 @@ private void toTEI(Document doc, } if (affiliations != null && affiliations.size() >0) { - + // check if we have at least one acknowledged research infrastructure here List filteredInfrastructures = new ArrayList<>(); for(Affiliation affiliation : affiliations) { - if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0 && affiliation.isInfrastructure()) + if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0 && affiliation.isInfrastructure()) filteredInfrastructures.add(affiliation); else if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0) { // check if this organization is a known infrastructure - List localOrganizationNamings = + List localOrganizationNamings = Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString()); if (localOrganizationNamings != null && localOrganizationNamings.size()>0) { filteredInfrastructures.add(affiliation); @@ -2730,7 +2894,7 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio if (filteredInfrastructures.size() > 0) { tei.append("\n\t\t\t\n"); for(Affiliation affiliation : filteredInfrastructures) { - List localOrganizationNamings = + List localOrganizationNamings = Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString()); tei.append("\t\t\t\t"); tei.append("\t\t\t\t\t"); @@ -2750,7 +2914,7 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio } tei.append("\t\t\t\t\n"); } - + tei.append("\t\t\t\n"); } } @@ -2762,10 +2926,10 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio Pair> headerAvailabilityProcessed = processShort(headerAvailabilityStatementTokens, doc); if (headerAvailabilityProcessed != null) { availabilityStmt = teiFormatter.processTEIDivSection("availability", - "\t\t\t", - headerAvailabilityProcessed.getLeft(), - headerAvailabilityProcessed.getRight(), - resCitations, + "\t\t\t", + headerAvailabilityProcessed.getLeft(), + headerAvailabilityProcessed.getRight(), + resCitations, config); } if (availabilityStmt.length() > 0) { @@ -2775,11 +2939,11 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio // availability statements in non-header part availabilityStmt = getSectionAsTEI("availability", - "\t\t\t", - doc, - SegmentationLabels.AVAILABILITY, - teiFormatter, - resCitations, + "\t\t\t", + doc, + SegmentationLabels.AVAILABILITY, + teiFormatter, + resCitations, config); if (availabilityStmt.length() > 0) { tei.append(availabilityStmt.toString()); @@ -2830,7 +2994,7 @@ private void toTEIHeaderFunding(Document doc, teiFormatter, null, config); if (acknowledgmentStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(acknowledgmentStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { @@ -2874,7 +3038,7 @@ private void toTEIHeaderFunding(Document doc, config); } if (fundingStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { @@ -2912,7 +3076,7 @@ private void toTEIHeaderFunding(Document doc, null, config); if (fundingStmt.length() > 0) { - MutablePair,List,List>> localResult = + MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null){ @@ -2958,15 +3122,15 @@ private void toTEIHeaderFunding(Document doc, } if (affiliations != null && affiliations.size() >0) { - + // check if we have at least one acknowledged research infrastructure here List filteredInfrastructures = new ArrayList<>(); for(Affiliation affiliation : affiliations) { - if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0 && affiliation.isInfrastructure()) + if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0 && affiliation.isInfrastructure()) filteredInfrastructures.add(affiliation); else if (affiliation.getAffiliationString() != null && affiliation.getAffiliationString().length()>0) { // check if this organization is a known infrastructure - List localOrganizationNamings = + List localOrganizationNamings = Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString()); if (localOrganizationNamings != null && localOrganizationNamings.size()>0) { filteredInfrastructures.add(affiliation); @@ -2978,7 +3142,7 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio if (filteredInfrastructures.size() > 0) { tei.append("\n\t\t\t\n"); for(Affiliation affiliation : filteredInfrastructures) { - List localOrganizationNamings = + List localOrganizationNamings = Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString()); tei.append("\t\t\t\t"); tei.append("\t\t\t\t\t"); @@ -2998,7 +3162,7 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio } tei.append("\t\t\t\t\n"); } - + tei.append("\t\t\t\n"); } } From b012665e9a3ff77a01b823a5925377351d81a64a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 6 Dec 2024 12:23:59 +0000 Subject: [PATCH 03/21] revise table validation, apply check to figures, move code outside the fulltext parser --- .../java/org/grobid/core/data/Figure.java | 17 +- .../main/java/org/grobid/core/data/Table.java | 34 +- .../grobid/core/engines/FullTextParser.java | 315 +++++++----------- .../org/grobid/core/utilities/LabelUtils.java | 84 +++++ 4 files changed, 239 insertions(+), 211 deletions(-) create mode 100644 grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index ef4117e93c..e237c09e83 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -88,6 +88,9 @@ public boolean apply(GraphicObject graphicObject) { private List textArea; private List layoutTokens; + // Contains the raw layoutTokens from the fulltext model + private List rawLayoutTokens = new ArrayList<>(); + // coordinates private int page = -1; private double y = 0.0; @@ -323,8 +326,12 @@ public String getTeiId() { return "fig_" + this.id; } + public boolean isCompleteForTEI() { + return (StringUtils.isAllBlank(header) || StringUtils.isNotEmpty(caption) || CollectionUtils.isNotEmpty(graphicObjects)); + } + public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List markerTypes) { - if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption) && CollectionUtils.isEmpty(graphicObjects)) { + if (isCompleteForTEI()) { return null; } Element figureElement = XmlBuilderUtils.teiElement("figure"); @@ -568,4 +575,12 @@ public void setLabel(StringBuilder label) { public void setUri(URI uri) { this.uri = uri; } + + public List getRawLayoutTokens() { + return rawLayoutTokens; + } + + public void setRawLayoutTokens(List rawLayoutTokens) { + this.rawLayoutTokens = rawLayoutTokens; + } } diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 0764796b38..1016760284 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -30,7 +30,6 @@ import nu.xom.Attribute; import nu.xom.Element; import nu.xom.Node; -import nu.xom.Text; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; @@ -44,8 +43,6 @@ public class Table extends Figure { private List contentTokens = new ArrayList<>(); private List fullDescriptionTokens = new ArrayList<>(); - // Contains the raw layoutTokens from the fulltext model - private List rawLayoutTokens = new ArrayList<>(); private boolean goodTable = true; private StringBuilder note = null; @@ -65,9 +62,13 @@ public Table() { note = new StringBuilder(); } + public boolean isCompleteForTEI() { + return (StringUtils.isNotEmpty(header) && StringUtils.isNotEmpty(caption)); + } + @Override public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List markerTypes) { - if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption)) { + if (!isCompleteForTEI()) { return null; } @@ -107,7 +108,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form addXmlId(desc, "_" + divID); } - if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) { + if (StringUtils.isNotBlank(labeledCaption)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens); List clusters = clusteror.cluster(); for (TaggingTokenCluster cluster : clusters) { @@ -172,7 +173,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } Element noteNode = null; - if (note != null && note.toString().trim().length()>0) { + if (StringUtils.isNotBlank(note)) { noteNode = XmlBuilderUtils.teiElement("note"); if (config.isGenerateTeiIds()) { @@ -180,7 +181,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form addXmlId(noteNode, "_" + divID); } - if ( (labeledNote != null) && (labeledNote.length() > 0) ) { + if (StringUtils.isNotBlank(labeledNote)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledNote, noteLayoutTokens); List clusters = clusteror.cluster(); for (TaggingTokenCluster cluster : clusters) { @@ -349,9 +350,14 @@ public String getLabeledNote() { return this.labeledNote; } - private boolean validateTable() { + /** Check if the table: + * - has label, header and content + * - header starts with "tab" + * - label can be parsed + */ + public boolean validateTable() { CntManager cnt = Engine.getCntManager(); - if (StringUtils.isEmpty(label) || StringUtils.isEmpty(header) || StringUtils.isEmpty(content)) { + if (StringUtils.isAnyBlank(label, header, content)) { cnt.i(TableRejectionCounters.EMPTY_LABEL_OR_HEADER_OR_CONTENT); return false; } @@ -362,7 +368,8 @@ private boolean validateTable() { cnt.i(TableRejectionCounters.CANNOT_PARSE_LABEL_TO_INT); return false; } - if (!getHeader().toLowerCase().startsWith("table")) { + // tab covers: table, tabelle, tableu, tabella, etc. + if (!StringUtils.startsWithIgnoreCase(getHeader(), "tab")) { cnt.i(TableRejectionCounters.HEADER_NOT_STARTS_WITH_TABLE_WORD); return false; } @@ -427,11 +434,4 @@ public String getTeiId() { return "tab_" + this.id; } - public List getRawLayoutTokens() { - return rawLayoutTokens; - } - - public void setRawLayoutTokens(List rawLayoutTokens) { - this.rawLayoutTokens = rawLayoutTokens; - } } \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 36e1065313..23b39aee82 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -37,12 +37,7 @@ import org.grobid.core.layout.*; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; -import org.grobid.core.utilities.LanguageUtilities; -import org.grobid.core.utilities.TextUtilities; -import org.grobid.core.utilities.KeyGen; -import org.grobid.core.utilities.LayoutTokensUtil; -import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.Consolidation; +import org.grobid.core.utilities.*; import org.grobid.core.utilities.matching.ReferenceMarkerMatcher; import org.grobid.core.utilities.matching.EntityMatcherException; import org.grobid.core.engines.citations.CalloutAnalyzer; @@ -72,6 +67,7 @@ import nu.xom.Element; import static org.apache.commons.lang3.StringUtils.*; +import static org.grobid.core.utilities.LabelUtils.postProcessFullTextLabeledText; public class FullTextParser extends AbstractParser { private static final Logger LOGGER = LoggerFactory.getLogger(FullTextParser.class); @@ -260,7 +256,7 @@ else if (config.getConsolidateCitations() == 2) List
tables = null; List equations = null; if (featSeg != null && isNotBlank(featSeg.getLeft())) { - // if featSeg is null, it usually means that no body segment is found in the + // if featSeg is null, it usually means that the fulltext body is not found in the // document segmentation String bodytext = featSeg.getLeft(); layoutTokenization = featSeg.getRight(); @@ -269,7 +265,7 @@ else if (config.getConsolidateCitations() == 2) resultBody = label(bodytext); //Correct subsequent I-
or I-
- resultBody = adjustInvalidSequenceOfStartLabels(resultBody); + resultBody = LabelUtils.adjustInvalidSequenceOfStartLabels(resultBody); // we apply now the figure and table models based on the fulltext labeled output figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc); @@ -282,6 +278,17 @@ else if (config.getConsolidateCitations() == 2) } } + List
badFigures = figures.stream() + .filter(f -> !f.isCompleteForTEI()) + .collect(Collectors.toList()); + + LOGGER.warn("Identified bad figures: " + badFigures.size()); + resultBody = revertResultsForBadItems(badFigures, resultBody, TaggingLabels.FIGURE_LABEL); + + figures = figures.stream() + .filter(f -> !badFigures.contains(f)) + .collect(Collectors.toList()); + tables = processTables(resultBody, layoutTokenization.getTokenization(), doc); //We deal with tables considered bad by reverting them as , to reduce the risk them to be @@ -290,117 +297,14 @@ else if (config.getConsolidateCitations() == 2) //TODO: double check the way the tables are validated List
badTables = tables.stream() - .filter(t -> !t.isGoodTable()) + .filter(t -> !(t.isCompleteForTEI() && t.validateTable())) .collect(Collectors.toList()); - //LF: we update the resultBody sequence by reverting these tables as elements - if (CollectionUtils.isNotEmpty(badTables)) { - List> splitResult = Arrays.stream(resultBody.split("\n")) - .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) - .collect(Collectors.toList()); - - for (Table badTable : badTables) { - // Find the index of the first layoutToken of the table in the tokenization - List rawLayoutTokenTable = badTable.getRawLayoutTokens(); - LayoutToken firstLayoutTokenTable = rawLayoutTokenTable.get(0); - - final List documentTokenization = layoutTokenization.getTokenization(); - - int tokenIndex = IntStream.range(0, documentTokenization.size()) - .filter(i -> { - LayoutToken l = documentTokenization.get(i); - return l.getText().equals(firstLayoutTokenTable.getText()) - && l.getPage() == firstLayoutTokenTable.getPage() - && l.getOffset() == firstLayoutTokenTable.getOffset(); - }) - .findFirst() - .orElse(-1); - - System.out.println(tokenIndex); - - List candidateIndexes = IntStream.range(0, splitResult.size()) - .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) - && Iterables.getLast(splitResult.get(i)).equals("I-
")) - .boxed() - .collect(Collectors.toList()); - - if (candidateIndexes.isEmpty()) { - candidateIndexes = IntStream.range(0, splitResult.size()) - .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) - && Iterables.getLast(splitResult.get(i)).equals("
")) - .boxed() - .collect(Collectors.toList()); - if (candidateIndexes.isEmpty()) { - LOGGER.info("Cannot find the candidate index for fixing the tables."); - continue; - } - } - - // Need to match with the rest - List tokensNoSpace = rawLayoutTokenTable.stream() - .map(LayoutToken::getText) - .map(StringUtils::strip) - .filter(StringUtils::isNotBlank) - .collect(Collectors.toList()); - - int resultIndexCandidate = -1; - if (tokensNoSpace.size() == 1){ - resultIndexCandidate = candidateIndexes.get(0); - } else { - for (int candidateIndex: candidateIndexes) { - List candidateTable = splitResult.subList(candidateIndex, candidateIndex + tokensNoSpace.size()) - .stream() - .map(i -> i.get(0)) - .collect(Collectors.toList()); - - String candidateTableText = String.join("", candidateTable); - String tokensText = String.join("", tokensNoSpace); - - if (candidateTableText.equals(tokensText)) { - resultIndexCandidate = candidateIndex; - break; - } - } - } - - if (resultIndexCandidate > -1) { - boolean first = true; - for (int i = resultIndexCandidate;i < resultIndexCandidate + tokensNoSpace.size(); i++) { - List line = splitResult.get(i); - String label = Iterables.getLast(line); - if (first) { - first = false; - } else { - if (label.startsWith("I-")) { - break; - } - } - line.set(line.size() - 1, label.replace("
", "")); - } - } else { - System.out.println("Cannot find the result index candiate."); - } - - -// List> badTableResult = Arrays.stream(badTable.getRawLayoutTokens().stream() -// .map(LayoutToken::getText) -// .toArray(String[]::new)) -// .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) -// .collect(Collectors.toList()); -// - - } - - String resultBody2 = splitResult.stream() - .map(l -> String.join("\t", l)) - .collect(Collectors.joining("\n")); - - resultBody = resultBody2; - - } + LOGGER.warn("Identified bad tables: " + badTables.size()); + resultBody = revertResultsForBadItems(badTables, resultBody, TaggingLabels.TABLE_LABEL); tables = tables.stream() - .filter(Table::isGoodTable) + .filter(t-> !badTables.contains(t)) .collect(Collectors.toList()); // further parse the caption @@ -458,6 +362,109 @@ else if (config.getConsolidateCitations() == 2) } } + private static String revertResultsForBadItems(List badItems, String resultBody, String itemLabel) { + //LF: we update the resultBody sequence by reverting these tables as elements + if (CollectionUtils.isNotEmpty(badItems)) { + List> splitResult = Arrays.stream(resultBody.split("\n")) + .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) + .collect(Collectors.toList()); + + for (Figure badTable : badItems) { + // Find the index of the first layoutToken of the table in the tokenization + List rawLayoutTokenTable = badTable.getRawLayoutTokens(); + LayoutToken firstLayoutTokenTable = rawLayoutTokenTable.get(0); + +// final List documentTokenization = layoutTokenization.getTokenization(); + +// int tokenIndex = IntStream.range(0, documentTokenization.size()) +// .filter(i -> { +// LayoutToken l = documentTokenization.get(i); +// return l.getText().equals(firstLayoutTokenTable.getText()) +// && l.getPage() == firstLayoutTokenTable.getPage() +// && l.getOffset() == firstLayoutTokenTable.getOffset(); +// }) +// .findFirst() +// .orElse(-1); + + List candidateIndexes = IntStream.range(0, splitResult.size()) + .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) + && Iterables.getLast(splitResult.get(i)).equals("I-"+itemLabel)) + .boxed() + .collect(Collectors.toList()); + + if (candidateIndexes.isEmpty()) { + candidateIndexes = IntStream.range(0, splitResult.size()) + .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) + && Iterables.getLast(splitResult.get(i)).equals(itemLabel)) + .boxed() + .collect(Collectors.toList()); + if (candidateIndexes.isEmpty()) { + LOGGER.info("Cannot find the candidate index for fixing the tables."); + continue; + } + } + + // Need to match with the rest + List tokensNoSpace = rawLayoutTokenTable.stream() + .map(LayoutToken::getText) + .map(StringUtils::strip) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toList()); + + int resultIndexCandidate = -1; + if (tokensNoSpace.size() == 1){ + resultIndexCandidate = candidateIndexes.get(0); + } else { + for (int candidateIndex: candidateIndexes) { + List candidateTable = splitResult.subList(candidateIndex, candidateIndex + tokensNoSpace.size()) + .stream() + .map(i -> i.get(0)) + .collect(Collectors.toList()); + + String candidateTableText = String.join("", candidateTable); + String tokensText = String.join("", tokensNoSpace); + + if (candidateTableText.equals(tokensText)) { + resultIndexCandidate = candidateIndex; + break; + } + } + } + + if (resultIndexCandidate > -1) { + boolean first = true; + for (int i = resultIndexCandidate;i < resultIndexCandidate + tokensNoSpace.size(); i++) { + List line = splitResult.get(i); + String label = Iterables.getLast(line); + if (first) { + first = false; + } else { + if (label.startsWith("I-")) { + break; + } + } + line.set(line.size() - 1, label.replace(TaggingLabels.TABLE_LABEL, TaggingLabels.PARAGRAPH_LABEL)); + } + } else { + LOGGER.warn("Cannot find the result index candidate."); + } +// List> badTableResult = Arrays.stream(badTable.getRawLayoutTokens().stream() +// .map(LayoutToken::getText) +// .toArray(String[]::new)) +// .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) +// .collect(Collectors.toList()); +// + } + + String resultBody2 = splitResult.stream() + .map(l -> String.join("\t", l)) + .collect(Collectors.joining("\n")); + + resultBody = resultBody2; + } + return resultBody; + } + /** * Machine-learning recognition of full text structures limted to header and funding information. @@ -642,84 +649,6 @@ public Pair> processShort(List tokens, Do return Pair.of(res, layoutTokenization); } - /** - * Post-process text labeled by the fulltext model on chunks that are known to be text (no table, or figure) - * It converts table and figure labels to paragraph labels. - */ - protected static String postProcessFullTextLabeledText(String fulltextLabeledText) { - if (fulltextLabeledText == null) - return null; - StringBuilder result = new StringBuilder(); - - String[] lines = fulltextLabeledText.split("\n"); - String previousLabel = null; - for(int i=0; i getBodyTextFeatured(Document doc, SortedSet documentBodyParts) { if ((documentBodyParts == null) || (documentBodyParts.size() == 0)) { diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java b/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java new file mode 100644 index 0000000000..0cb0b8211a --- /dev/null +++ b/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java @@ -0,0 +1,84 @@ +package org.grobid.core.utilities; + +import org.apache.commons.lang3.StringUtils; +import org.grobid.core.engines.label.TaggingLabels; + +public class LabelUtils { + /** + * Post-process text labeled by the fulltext model on chunks that are known to be text (no table, or figure) + * It converts table and figure labels to paragraph labels. + */ + public static String postProcessFullTextLabeledText(String fulltextLabeledText) { + if (fulltextLabeledText == null) + return null; + StringBuilder result = new StringBuilder(); + + String[] lines = fulltextLabeledText.split("\n"); + String previousLabel = null; + for(int i=0; i Date: Fri, 6 Dec 2024 12:49:37 +0000 Subject: [PATCH 04/21] fix figure validation --- grobid-core/src/main/java/org/grobid/core/data/Figure.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index e237c09e83..c62ff797f4 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -327,11 +327,11 @@ public String getTeiId() { } public boolean isCompleteForTEI() { - return (StringUtils.isAllBlank(header) || StringUtils.isNotEmpty(caption) || CollectionUtils.isNotEmpty(graphicObjects)); + return (StringUtils.isNotBlank(header) || StringUtils.isNotBlank(caption) || CollectionUtils.isNotEmpty(graphicObjects)); } public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List markerTypes) { - if (isCompleteForTEI()) { + if (!isCompleteForTEI()) { return null; } Element figureElement = XmlBuilderUtils.teiElement("figure"); From 575c2fe4ddcf49a608494ce1ae1371e354d5db2a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 6 Dec 2024 13:10:57 +0000 Subject: [PATCH 05/21] move test related to LabelUtils outside --- .../core/engines/FullTextParserTest.java | 67 ------------- .../grobid/core/utilities/LabelUtilsTest.kt | 98 +++++++++++++++++++ 2 files changed, 98 insertions(+), 67 deletions(-) create mode 100644 grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt diff --git a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java index a4fb60a4e9..ac568290cb 100644 --- a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java +++ b/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java @@ -193,71 +193,4 @@ public void testProcessTrainingDataTable_multiple_tables() throws Exception { } - @Test - public void testPostProcessLabeledAbstract_shouldTransformTableLabelInParagraphLabel() { - String resultWithTables = "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\tI-
\n" + - "study\tstudy\ts\tst\tstu\tstud\ty\tdy\tudy\ttudy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "was\twas\tw\twa\twas\twas\ts\tas\twas\twas\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "supported\tsupported\ts\tsu\tsup\tsupp\td\ted\tted\trted\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "South\tsouth\tS\tSo\tSou\tSout\th\tth\tuth\touth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Asian\tasian\tA\tAs\tAsi\tAsia\tn\tan\tian\tsian\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Clinical\tclinical\tC\tCl\tCli\tClin\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Toxicology\ttoxicology\tT\tTo\tTox\tToxi\ty\tgy\togy\tlogy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + - "Collaboration\tcollaboration\tC\tCo\tCol\tColl\tn\ton\tion\ttion\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + - ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "which\twhich\tw\twh\twhi\twhic\th\tch\tich\thich\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "is\tis\ti\tis\tis\tis\ts\tis\tis\tis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "funded\tfunded\tf\tfu\tfun\tfund\td\ted\tded\tnded\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "Wellcome\twellcome\tW\tWe\tWel\tWell\te\tme\tome\tcome\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "Trust\ttrust\tT\tTr\tTru\tTrus\tt\tst\tust\trust\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "National\tnational\tN\tNa\tNat\tNati\tl\tal\tnal\tonal\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "Health\thealth\tH\tHe\tHea\tHeal\th\tth\tlth\talth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "Medical\tmedical\tM\tMe\tMed\tMedi\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "Council\tcouncil\tC\tCo\tCou\tCoun\tl\til\tcil\tncil\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + - "International\tinternational\tI\tIn\tInt\tInte\tl\tal\tnal\tonal\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + - "Collaborative\tcollaborative\tC\tCo\tCol\tColl\te\tve\tive\ttive\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + - "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + - "Grant\tgrant\tG\tGr\tGra\tGran\tt\tnt\tant\trant\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + - "GR071669MA\tgr071669ma\tG\tGR\tGR0\tGR07\tA\tMA\t9MA\t69MA\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tCONTAINSDIGITS\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "funding\tfunding\tf\tfu\tfun\tfund\tg\tng\ting\tding\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "bodies\tbodies\tb\tbo\tbod\tbodi\ts\tes\ties\tdies\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "had\thad\th\tha\thad\thad\td\tad\thad\thad\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "no\tno\tn\tno\tno\tno\to\tno\tno\tno\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "role\trole\tr\tro\trol\trole\te\tle\tole\trole\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "analyzing\tanalyzing\ta\tan\tana\tanal\tg\tng\ting\tzing\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "interpreting\tinterpreting\ti\tin\tint\tinte\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "data\tdata\td\tda\tdat\tdata\ta\tta\tata\tdata\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - "writing\twriting\tw\twr\twri\twrit\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - "article\tarticle\ta\tar\tart\tarti\te\tle\tcle\ticle\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t11\t10\t0\tNUMBER\t0\t0\t
"; - String postprocessed = FullTextParser.postProcessFullTextLabeledText(resultWithTables); - - assertThat(Arrays.stream(StringUtils.split(postprocessed, "\n")) - .filter(l -> l.endsWith("
")) - .count(), is(0L)); - - assertThat(Arrays.stream(StringUtils.split(postprocessed, "\n")) - .filter(l -> l.endsWith("")) - .count(), is (Arrays.stream(StringUtils.split(resultWithTables, "\n")) - .filter(l -> l.endsWith("
")) - .count())); - - } - - } \ No newline at end of file diff --git a/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt new file mode 100644 index 0000000000..10ee442a01 --- /dev/null +++ b/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt @@ -0,0 +1,98 @@ +package org.grobid.core.utilities + +import org.apache.commons.lang3.StringUtils +import org.grobid.core.utilities.GrobidConfig.ModelParameters +import org.hamcrest.CoreMatchers.`is` +import org.hamcrest.MatcherAssert.assertThat +import org.junit.jupiter.api.BeforeAll +import java.util.* +import kotlin.test.Test + + +class LabelUtilsTest { + + + @Test + fun testPostProcessLabeledAbstract_shouldTransformTableLabelInParagraphLabel() { + val resultWithTables = + "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\tI-
\n" + + "study\tstudy\ts\tst\tstu\tstud\ty\tdy\tudy\ttudy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + + "was\twas\tw\twa\twas\twas\ts\tas\twas\twas\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + + "supported\tsupported\ts\tsu\tsup\tsupp\td\ted\tted\trted\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + + "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "South\tsouth\tS\tSo\tSou\tSout\th\tth\tuth\touth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "Asian\tasian\tA\tAs\tAsi\tAsia\tn\tan\tian\tsian\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "Clinical\tclinical\tC\tCl\tCli\tClin\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "Toxicology\ttoxicology\tT\tTo\tTox\tToxi\ty\tgy\togy\tlogy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + + "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + + "Collaboration\tcollaboration\tC\tCo\tCol\tColl\tn\ton\tion\ttion\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "which\twhich\tw\twh\twhi\twhic\th\tch\tich\thich\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "is\tis\ti\tis\tis\tis\ts\tis\tis\tis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "funded\tfunded\tf\tfu\tfun\tfund\td\ted\tded\tnded\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + + "Wellcome\twellcome\tW\tWe\tWel\tWell\te\tme\tome\tcome\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + + "Trust\ttrust\tT\tTr\tTru\tTrus\tt\tst\tust\trust\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + + "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + + "National\tnational\tN\tNa\tNat\tNati\tl\tal\tnal\tonal\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + + "Health\thealth\tH\tHe\tHea\tHeal\th\tth\tlth\talth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + + "Medical\tmedical\tM\tMe\tMed\tMedi\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + + "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + + "Council\tcouncil\tC\tCo\tCou\tCoun\tl\til\tcil\tncil\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + + "International\tinternational\tI\tIn\tInt\tInte\tl\tal\tnal\tonal\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + + "Collaborative\tcollaborative\tC\tCo\tCol\tColl\te\tve\tive\ttive\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + + "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + + "Grant\tgrant\tG\tGr\tGra\tGran\tt\tnt\tant\trant\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + + "GR071669MA\tgr071669ma\tG\tGR\tGR0\tGR07\tA\tMA\t9MA\t69MA\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tCONTAINSDIGITS\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + "funding\tfunding\tf\tfu\tfun\tfund\tg\tng\ting\tding\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + "bodies\tbodies\tb\tbo\tbod\tbodi\ts\tes\ties\tdies\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + + "had\thad\th\tha\thad\thad\td\tad\thad\thad\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "no\tno\tn\tno\tno\tno\to\tno\tno\tno\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "role\trole\tr\tro\trol\trole\te\tle\tole\trole\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "analyzing\tanalyzing\ta\tan\tana\tanal\tg\tng\ting\tzing\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + + "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + + "interpreting\tinterpreting\ti\tin\tint\tinte\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + + "data\tdata\td\tda\tdat\tdata\ta\tta\tata\tdata\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + + "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + + "writing\twriting\tw\twr\twri\twrit\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + + "article\tarticle\ta\tar\tart\tarti\te\tle\tcle\ticle\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t11\t10\t0\tNUMBER\t0\t0\t
" + + val postprocessed = LabelUtils.postProcessFullTextLabeledText(resultWithTables) + + assertThat( + Arrays.stream(StringUtils.split(postprocessed, "\n")) + .filter { l -> l.endsWith("
") } + .count(), `is`(0L) + ) + + assertThat( + Arrays.stream(StringUtils.split(postprocessed, "\n")) + .filter { l -> l.endsWith("") } + .count(), `is`( + Arrays.stream(StringUtils.split(resultWithTables, "\n")) + .filter { l -> l.endsWith("
") } + .count()) + ) + } + + companion object { + @JvmStatic + @BeforeAll + @Throws(Exception::class) + fun before() { + val modelParameters = ModelParameters() + modelParameters.name = "bao" + GrobidProperties.addModel(modelParameters) + } + } + +} \ No newline at end of file From 778080491fee93736619130c255b3f67c75b9de4 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 6 Dec 2024 13:46:59 +0000 Subject: [PATCH 06/21] fix scope --- grobid-core/src/main/java/org/grobid/core/data/Figure.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index c62ff797f4..f186280959 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -89,7 +89,7 @@ public boolean apply(GraphicObject graphicObject) { private List layoutTokens; // Contains the raw layoutTokens from the fulltext model - private List rawLayoutTokens = new ArrayList<>(); + protected List rawLayoutTokens = new ArrayList<>(); // coordinates private int page = -1; From e0e217d94600848668c78404f4ecbbaf46076395 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 6 Dec 2024 14:03:34 +0000 Subject: [PATCH 07/21] remove unnecessary layout tokens list, renaming stuff --- .../src/main/java/org/grobid/core/data/Figure.java | 10 ---------- .../java/org/grobid/core/engines/FullTextParser.java | 12 ++++++------ 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index f186280959..0d646ed93d 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -88,9 +88,6 @@ public boolean apply(GraphicObject graphicObject) { private List textArea; private List layoutTokens; - // Contains the raw layoutTokens from the fulltext model - protected List rawLayoutTokens = new ArrayList<>(); - // coordinates private int page = -1; private double y = 0.0; @@ -576,11 +573,4 @@ public void setUri(URI uri) { this.uri = uri; } - public List getRawLayoutTokens() { - return rawLayoutTokens; - } - - public void setRawLayoutTokens(List rawLayoutTokens) { - this.rawLayoutTokens = rawLayoutTokens; - } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 23b39aee82..8f3025f587 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -369,10 +369,10 @@ private static String revertResultsForBadItems(List badItems, .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) .collect(Collectors.toList()); - for (Figure badTable : badItems) { + for (Figure badItem : badItems) { // Find the index of the first layoutToken of the table in the tokenization - List rawLayoutTokenTable = badTable.getRawLayoutTokens(); - LayoutToken firstLayoutTokenTable = rawLayoutTokenTable.get(0); + List rawLayoutTokenTable = badItem.getLayoutTokens(); + LayoutToken firstLayoutTokenItem = rawLayoutTokenTable.get(0); // final List documentTokenization = layoutTokenization.getTokenization(); @@ -387,14 +387,14 @@ private static String revertResultsForBadItems(List badItems, // .orElse(-1); List candidateIndexes = IntStream.range(0, splitResult.size()) - .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) + .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenItem.getText()) && Iterables.getLast(splitResult.get(i)).equals("I-"+itemLabel)) .boxed() .collect(Collectors.toList()); if (candidateIndexes.isEmpty()) { candidateIndexes = IntStream.range(0, splitResult.size()) - .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) + .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenItem.getText()) && Iterables.getLast(splitResult.get(i)).equals(itemLabel)) .boxed() .collect(Collectors.toList()); @@ -2289,7 +2289,7 @@ protected List
processTables(String rese, for (Table result : localResults) { List localTokenizationTable = result.getLayoutTokens(); - result.setRawLayoutTokens(tokenizationTable); +// result.setRawLayoutTokens(tokenizationTable); // block setting: we restrict to the tokenization of this particular table SortedSet blockPtrs = new TreeSet<>(); From facc35ed3f5a27819e828cd30d6e3c576c668333 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 6 Dec 2024 20:05:23 +0000 Subject: [PATCH 08/21] fix index mismatch --- .../main/java/org/grobid/core/engines/FullTextParser.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 8f3025f587..8a1f342712 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -412,11 +412,13 @@ private static String revertResultsForBadItems(List badItems, .collect(Collectors.toList()); int resultIndexCandidate = -1; - if (tokensNoSpace.size() == 1){ + if (candidateIndexes.isEmpty()){ + LOGGER.warn("Cannot find the candidate index for fixing the tables."); + } else if (candidateIndexes.size() == 1){ resultIndexCandidate = candidateIndexes.get(0); } else { for (int candidateIndex: candidateIndexes) { - List candidateTable = splitResult.subList(candidateIndex, candidateIndex + tokensNoSpace.size()) + List candidateTable = splitResult.subList(candidateIndex, Math.min(candidateIndex + tokensNoSpace.size(), splitResult.size())) .stream() .map(i -> i.get(0)) .collect(Collectors.toList()); From 17ad42548eaef25250bae9dd4754e5424d536a23 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 6 Dec 2024 20:59:46 +0000 Subject: [PATCH 09/21] fix wrong label adjustment --- .../main/java/org/grobid/core/utilities/LabelUtils.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java b/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java index 0cb0b8211a..d0f7fcc0a6 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java @@ -56,15 +56,11 @@ public static String adjustInvalidSequenceOfStartLabels(String fulltextLabeledTe String[] pieces = line.split("\t"); String label = pieces[pieces.length-1]; if (label.equals("I-"+TaggingLabels.FIGURE.getLabel())) { - if (previousLabel == null) { - continue; - } else if (previousLabel.equals("I-"+TaggingLabels.FIGURE.getLabel())) { + if (StringUtils.equals(previousLabel, "I-"+TaggingLabels.FIGURE.getLabel())) { pieces[pieces.length-1] = TaggingLabels.FIGURE.getLabel(); } } else if (label.equals("I-"+TaggingLabels.TABLE.getLabel())) { - if (previousLabel == null) { - continue; - } else if (previousLabel.equals("I-"+TaggingLabels.TABLE.getLabel())) { + if (StringUtils.equals(previousLabel, "I-"+TaggingLabels.TABLE.getLabel())) { pieces[pieces.length-1] = TaggingLabels.TABLE.getLabel(); } } From c8302316c9007a03edaf77b9e20d2f6c623877b2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 7 Dec 2024 07:11:00 +0000 Subject: [PATCH 10/21] fix wrong upper indexes, cleanup --- .../grobid/core/engines/FullTextParser.java | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 8a1f342712..0523d98c23 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -374,18 +374,6 @@ private static String revertResultsForBadItems(List badItems, List rawLayoutTokenTable = badItem.getLayoutTokens(); LayoutToken firstLayoutTokenItem = rawLayoutTokenTable.get(0); -// final List documentTokenization = layoutTokenization.getTokenization(); - -// int tokenIndex = IntStream.range(0, documentTokenization.size()) -// .filter(i -> { -// LayoutToken l = documentTokenization.get(i); -// return l.getText().equals(firstLayoutTokenTable.getText()) -// && l.getPage() == firstLayoutTokenTable.getPage() -// && l.getOffset() == firstLayoutTokenTable.getOffset(); -// }) -// .findFirst() -// .orElse(-1); - List candidateIndexes = IntStream.range(0, splitResult.size()) .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenItem.getText()) && Iterables.getLast(splitResult.get(i)).equals("I-"+itemLabel)) @@ -435,7 +423,7 @@ private static String revertResultsForBadItems(List badItems, if (resultIndexCandidate > -1) { boolean first = true; - for (int i = resultIndexCandidate;i < resultIndexCandidate + tokensNoSpace.size(); i++) { + for (int i = resultIndexCandidate;i < Math.min(resultIndexCandidate + tokensNoSpace.size(), splitResult.size()); i++) { List line = splitResult.get(i); String label = Iterables.getLast(line); if (first) { @@ -450,12 +438,6 @@ private static String revertResultsForBadItems(List badItems, } else { LOGGER.warn("Cannot find the result index candidate."); } -// List> badTableResult = Arrays.stream(badTable.getRawLayoutTokens().stream() -// .map(LayoutToken::getText) -// .toArray(String[]::new)) -// .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) -// .collect(Collectors.toList()); -// } String resultBody2 = splitResult.stream() From 2f0661bd38ccbd8625e87e85f6ca6d9b6b5b2f69 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 16 Dec 2024 16:57:30 +0100 Subject: [PATCH 11/21] cosmetics --- .../main/java/org/grobid/core/engines/FullTextParser.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 1762e83617..209b844a64 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -362,14 +362,14 @@ else if (config.getConsolidateCitations() == 2) } } - private static String revertResultsForBadItems(List badItems, String resultBody, String itemLabel) { + private static String revertResultsForBadItems(List badFiguresOrTables, String resultBody, String itemLabel) { //LF: we update the resultBody sequence by reverting these tables as elements - if (CollectionUtils.isNotEmpty(badItems)) { + if (CollectionUtils.isNotEmpty(badFiguresOrTables)) { List> splitResult = Arrays.stream(resultBody.split("\n")) .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) .collect(Collectors.toList()); - for (Figure badItem : badItems) { + for (Figure badItem : badFiguresOrTables) { // Find the index of the first layoutToken of the table in the tokenization List rawLayoutTokenTable = badItem.getLayoutTokens(); LayoutToken firstLayoutTokenItem = rawLayoutTokenTable.get(0); From 2f04ccd0d2b6c7066ce9fed748e9b62bc538fe3d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 16 Dec 2024 16:57:47 +0100 Subject: [PATCH 12/21] add tests for labelling adjustment --- .../grobid/core/utilities/LabelUtilsTest.kt | 200 ++++++++++++++++++ 1 file changed, 200 insertions(+) diff --git a/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt index 10ee442a01..4c856f65fd 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt @@ -3,9 +3,11 @@ package org.grobid.core.utilities import org.apache.commons.lang3.StringUtils import org.grobid.core.utilities.GrobidConfig.ModelParameters import org.hamcrest.CoreMatchers.`is` +import org.hamcrest.CoreMatchers.not import org.hamcrest.MatcherAssert.assertThat import org.junit.jupiter.api.BeforeAll import java.util.* +import java.util.stream.Collectors import kotlin.test.Test @@ -84,6 +86,204 @@ class LabelUtilsTest { ) } +// fun testAdjustInvalidSequenceOfStartLabels() { +// val inputStream = javaClass.getResourceAsStream("bodyResults-sample.1.txt") +// val bodyResult = inputStream?.bufferedReader().use { it.readText() } +// +// val postProcessed = LabelUtils.postProcessFullTextLabeledText(bodyResult) +// } + + @Test + fun testAdjustInvalidSequenceOfStartLabels_noChangeNeeded_shouldReturnSameSequence() { + val bodyResult = + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t11\t0\tNUMBER\t0\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKEND\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t11\t0\tNUMBER\t0\t0\t\n" + + "014306\t014306\t0\t01\t014\t0143\t6\t06\t306\t4306\tBLOCKSTART\tLINESTART\tLINEINDENT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t11\t0\tNUMBER\t0\t0\tI-\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tHYPHEN\t8\t11\t0\tNUMBER\t0\t0\t\n" + + "4\t4\t4\t4\t4\t4\t4\t4\t4\t4\tBLOCKEND\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t11\t0\tNUMBER\t1\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\tI-\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\tI-\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "average\taverage\ta\tav\tave\taver\te\tge\tage\trage\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "distances\tdistances\td\tdi\tdis\tdist\ts\tes\tces\tnces\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "between\tbetween\tb\tbe\tbet\tbetw\tn\ten\teen\tween\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "nucleons\tnucleons\tn\tnu\tnuc\tnucl\ts\tns\tons\teons\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t1\t\n" + + "Be\tbe\tB\tBe\tBe\tBe\te\tBe\tBe\tBe\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t1\t\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t1\t\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "3\t3\t3\t3\t3\t3\t3\t3\t3\t3\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\t\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t1\t\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t1\tI-\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\tI-\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\t\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t1\t\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t1\t\n" + + "C\tc\tC\tC\tC\tC\tC\tC\tC\tC\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t1\t\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t1\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "states\tstates\ts\tst\tsta\tstat\ts\tes\ttes\tates\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "solid\tsolid\ts\tso\tsol\tsoli\td\tid\tlid\tolid\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "lines\tlines\tl\tli\tlin\tline\ts\tes\tnes\tines\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "denote\tdenote\td\tde\tden\tdeno\te\tte\tote\tnote\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "average\taverage\ta\tav\tave\taver\te\tge\tage\trage\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "distances\tdistances\td\tdi\tdis\tdist\ts\tes\tces\tnces\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "r\tr\tr\tr\tr\tr\tr\tr\tr\tr\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tSAMEFONTSIZE\t0\t1\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "N\tn\tN\tN\tN\tN\tN\tN\tN\tN\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t1\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t1\tALLCAP\tNODIGIT\t1\tCOMMA\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "N\tn\tN\tN\tN\tN\tN\tN\tN\tN\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t1\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "between\tbetween\tb\tbe\tbet\tbetw\tn\ten\teen\tween\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "two\ttwo\tt\ttw\ttwo\ttwo\to\two\ttwo\ttwo\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "valence\tvalence\tv\tva\tval\tvale\te\tce\tnce\tence\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "nucleons\tnucleons\tn\tnu\tnuc\tnucl\ts\tns\tons\teons\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" + + val postProcessed = LabelUtils.adjustInvalidSequenceOfStartLabels(bodyResult) + + assertThat(postProcessed, `is`(bodyResult)) + } + + @Test + fun testAdjustInvalidSequenceOfStartLabels_singleChangeNeeded_shouldCorrectTheSequence() { + val bodyResult = + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\tI-\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t0\t
\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t1\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t0\t
\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "state\tstate\ts\tst\tsta\tstat\te\tte\tate\ttate\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\t
\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "panels\tpanels\tp\tpa\tpan\tpane\ts\tls\tels\tnels\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "a\ta\ta\ta\ta\ta\ta\ta\ta\ta\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "are\tare\ta\tar\tare\tare\te\tre\tare\tare\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + + val postProcessed = LabelUtils.adjustInvalidSequenceOfStartLabels(bodyResult) + + assertThat(postProcessed, not(bodyResult)) + + val splitResult = + Arrays.stream(postProcessed.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map> { l: String -> + Arrays.stream( + l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + ) + .collect(Collectors.toList()) + } + .collect(Collectors.toList()) + + val countStartingFigure = splitResult.stream() + .map { l: List -> l.last() } + .filter { l: String -> l.equals("I-
") } + .count() + + assertThat(countStartingFigure, `is`(1)) + } + + @Test + fun testAdjustInvalidSequenceOfStartLabels_MultipleChangeNeeded_shouldCorrectTheSequence() { + val bodyResult = + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\tI-\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t0\t
\n" + + "+\t+\t+\t+\t+\t+\t+\t+\t+\t+\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tNEWFONT\tLOWERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t1\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t0\t
\n" + + "0\t0\t0\t0\t0\t0\t0\t0\t0\t0\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "state\tstate\ts\tst\tsta\tstat\te\tte\tate\ttate\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\t
\n" + + "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "panels\tpanels\tp\tpa\tpan\tpane\ts\tls\tels\tnels\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "a\ta\ta\ta\ta\ta\ta\ta\ta\ta\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "are\tare\ta\tar\tare\tare\te\tre\tare\tare\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\tI-
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" + + + val postProcessed = LabelUtils.adjustInvalidSequenceOfStartLabels(bodyResult) + + assertThat(postProcessed, not(bodyResult)) + + val splitResult = + Arrays.stream(postProcessed.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map> { l: String -> + Arrays.stream( + l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + ) + .collect(Collectors.toList()) + } + .collect(Collectors.toList>()) + + val countStartingFigure = splitResult.stream() + .map { l: List -> l.last() } + .filter { l: String -> l.equals("I-
") } + .count() + + assertThat(countStartingFigure, `is`(2)) + + val countStartingTables = splitResult.stream() + .map { l: List -> l.last() } + .filter { l: String -> l.equals("I-
") } + .count() + + assertThat(countStartingTables, `is`(1)) + + } + + companion object { @JvmStatic @BeforeAll From f5eb7584fca4e955b11940b565b7f0e6e5704f44 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 17 Dec 2024 11:13:43 +0100 Subject: [PATCH 13/21] tests and refactoring in smaller pieces --- .../grobid/core/engines/FullTextParser.java | 117 +++---- .../core/engines/FullTextParserTest.java | 196 ------------ .../core/utilities/GrobidTestUtils.java | 8 +- .../grobid/core/engines/FullTextParserTest.kt | 295 ++++++++++++++++++ 4 files changed, 365 insertions(+), 251 deletions(-) delete mode 100644 grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java create mode 100644 grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 209b844a64..30815e5e44 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -11,7 +11,6 @@ import java.nio.charset.StandardCharsets; -import org.apache.lucene.util.CollectionUtil; import org.grobid.core.GrobidModels; import org.grobid.core.data.*; import org.grobid.core.document.Document; @@ -33,7 +32,6 @@ import org.grobid.core.features.FeaturesVectorFulltext; import org.grobid.core.lang.Language; import org.grobid.core.lexicon.Lexicon; -import org.grobid.core.lexicon.Lexicon.OrganizationRecord; import org.grobid.core.layout.*; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; @@ -43,6 +41,7 @@ import org.grobid.core.engines.citations.CalloutAnalyzer; import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -362,69 +361,38 @@ else if (config.getConsolidateCitations() == 2) } } - private static String revertResultsForBadItems(List badFiguresOrTables, String resultBody, String itemLabel) { + static String revertResultsForBadItems(List badFiguresOrTables, String resultBody, String itemLabel) { //LF: we update the resultBody sequence by reverting these tables as elements if (CollectionUtils.isNotEmpty(badFiguresOrTables)) { - List> splitResult = Arrays.stream(resultBody.split("\n")) + List> labelledResultsAsList = Arrays.stream(resultBody.split("\n")) .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) .collect(Collectors.toList()); for (Figure badItem : badFiguresOrTables) { // Find the index of the first layoutToken of the table in the tokenization - List rawLayoutTokenTable = badItem.getLayoutTokens(); - LayoutToken firstLayoutTokenItem = rawLayoutTokenTable.get(0); - - List candidateIndexes = IntStream.range(0, splitResult.size()) - .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenItem.getText()) - && Iterables.getLast(splitResult.get(i)).equals("I-"+itemLabel)) - .boxed() - .collect(Collectors.toList()); - + List layoutTokenItem = badItem.getLayoutTokens(); + List candidateIndexes = findCandiateIndex(layoutTokenItem, labelledResultsAsList, itemLabel); if (candidateIndexes.isEmpty()) { - candidateIndexes = IntStream.range(0, splitResult.size()) - .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenItem.getText()) - && Iterables.getLast(splitResult.get(i)).equals(itemLabel)) - .boxed() - .collect(Collectors.toList()); - if (candidateIndexes.isEmpty()) { - LOGGER.info("Cannot find the candidate index for fixing the tables."); - continue; - } + LOGGER.info("Cannot find the candidate index for fixing the tables."); + continue; } - // Need to match with the rest - List tokensNoSpace = rawLayoutTokenTable.stream() + //A this point i have more than one candidate, which can be matched if the same first + // token is repeated in the sequence. The next step is to find the matching figure/table + // using a large sequence + + List sequenceTokenWithoutSpaces = layoutTokenItem.stream() .map(LayoutToken::getText) .map(StringUtils::strip) .filter(StringUtils::isNotBlank) .collect(Collectors.toList()); - int resultIndexCandidate = -1; - if (candidateIndexes.isEmpty()){ - LOGGER.warn("Cannot find the candidate index for fixing the tables."); - } else if (candidateIndexes.size() == 1){ - resultIndexCandidate = candidateIndexes.get(0); - } else { - for (int candidateIndex: candidateIndexes) { - List candidateTable = splitResult.subList(candidateIndex, Math.min(candidateIndex + tokensNoSpace.size(), splitResult.size())) - .stream() - .map(i -> i.get(0)) - .collect(Collectors.toList()); - - String candidateTableText = String.join("", candidateTable); - String tokensText = String.join("", tokensNoSpace); - - if (candidateTableText.equals(tokensText)) { - resultIndexCandidate = candidateIndex; - break; - } - } - } + int resultIndexCandidate = consolidateResultCandidateThroughSequence(candidateIndexes, labelledResultsAsList, sequenceTokenWithoutSpaces); if (resultIndexCandidate > -1) { boolean first = true; - for (int i = resultIndexCandidate;i < Math.min(resultIndexCandidate + tokensNoSpace.size(), splitResult.size()); i++) { - List line = splitResult.get(i); + for (int i = resultIndexCandidate;i < Math.min(resultIndexCandidate + sequenceTokenWithoutSpaces.size(), labelledResultsAsList.size()); i++) { + List line = labelledResultsAsList.get(i); String label = Iterables.getLast(line); if (first) { first = false; @@ -440,15 +408,58 @@ private static String revertResultsForBadItems(List badFigures } } - String resultBody2 = splitResult.stream() + String updatedResultBody = labelledResultsAsList.stream() .map(l -> String.join("\t", l)) .collect(Collectors.joining("\n")); - resultBody = resultBody2; + resultBody = updatedResultBody; } return resultBody; } + static int consolidateResultCandidateThroughSequence(List candidateIndexes, List> splitResult, List tokensNoSpace) { + int resultIndexCandidate = -1; + if (candidateIndexes.size() == 1){ + resultIndexCandidate = candidateIndexes.get(0); + } else { + for (int candidateIndex: candidateIndexes) { + List candidateTable = splitResult.subList(candidateIndex, Math.min(candidateIndex + tokensNoSpace.size(), splitResult.size())) + .stream() + .map(i -> i.get(0)) + .collect(Collectors.toList()); + + String candidateTableText = String.join("", candidateTable); + String tokensText = String.join("", tokensNoSpace); + + if (candidateTableText.equals(tokensText)) { + resultIndexCandidate = candidateIndex; + break; + } + } + } + return resultIndexCandidate; + } + + @NotNull + static List findCandiateIndex(List layoutTokenItem, List> labelledResultsAsList, String itemLabel) { + LayoutToken firstLayoutTokenItem = layoutTokenItem.get(0); + + List candidateIndexes = IntStream.range(0, labelledResultsAsList.size()) + .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstLayoutTokenItem.getText()) + && Iterables.getLast(labelledResultsAsList.get(i)).equals("I-"+ itemLabel)) + .boxed() + .collect(Collectors.toList()); + + if (candidateIndexes.isEmpty()) { + candidateIndexes = IntStream.range(0, labelledResultsAsList.size()) + .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstLayoutTokenItem.getText()) + && Iterables.getLast(labelledResultsAsList.get(i)).equals(itemLabel)) + .boxed() + .collect(Collectors.toList()); + } + return candidateIndexes; + } + /** * Machine-learning recognition of full text structures limted to header and funding information. @@ -2062,10 +2073,10 @@ private static boolean testClosingTag(StringBuilder buffer, buffer.append(""); // Make sure that paragraph is closed when markers are at the end of it - if (!currentTag0.equals("") && - (!currentTag0.equals("") || - !currentTag0.equals("") || - !currentTag0.equals("") || + if (!currentTag0.equals("") && + (!currentTag0.equals("") || + !currentTag0.equals("") || + !currentTag0.equals("") || !currentTag0.equals("") ) ) { diff --git a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java deleted file mode 100644 index ac568290cb..0000000000 --- a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java +++ /dev/null @@ -1,196 +0,0 @@ -package org.grobid.core.engines; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.tuple.Pair; -import org.grobid.core.analyzers.GrobidAnalyzer; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.layout.LayoutToken; -import org.grobid.core.main.LibraryLoader; -import org.grobid.core.utilities.GrobidProperties; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - -import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.collection.IsCollectionWithSize.hasSize; - -public class FullTextParserTest { - - private FullTextParser target; - - @Before - public void setUp() throws Exception { - target = new FullTextParser(new EngineParsers()); - } - - @BeforeClass - public static void init() { - LibraryLoader.load(); - GrobidProperties.getInstance(); - } - - @AfterClass - public static void tearDown() { - GrobidFactory.reset(); - } - - @Test - public void testProcessTrainingDataFigures_single_figure() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataFigures(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - - assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(13)); - - } - - @Test - public void testProcessTrainingDataFigures_multiple_figures() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataFigures(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - List output = new ArrayList<>(); - for (String block : tokenisation.split("\n\n\n")) { - String collect = Arrays.stream(block.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - if (StringUtils.isNotBlank(collect)) { - output.add(collect); - } - } - - assertThat(output, hasSize(2)); - assertThat(output.get(0), is("FIG . 1 . λ ( T )")); - assertThat(output.get(1), is("vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(15)); - - } - - @Test - public void testProcessTrainingDataTables_single_table() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataTables(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - - assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(13)); - - } - - @Test - public void testProcessTrainingDataTable_multiple_tables() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataTables(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - List output = new ArrayList<>(); - for (String block : tokenisation.split("\n\n\n")) { - String collect = Arrays.stream(block.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - if (StringUtils.isNotBlank(collect)) { - output.add(collect); - } - } - - assertThat(output, hasSize(2)); - assertThat(output.get(0), is("FIG . 1 . λ ( T )")); - assertThat(output.get(1), is("vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(15)); - - } - -} \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/GrobidTestUtils.java b/grobid-core/src/test/java/org/grobid/core/utilities/GrobidTestUtils.java index f25b263049..9b7db0c685 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/GrobidTestUtils.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/GrobidTestUtils.java @@ -10,6 +10,10 @@ public class GrobidTestUtils { + public static String getWapitiResult(List features, List> labels) { + return getWapitiResult(features, labels, " "); + } + /** * Utility method to generate a hypotetical result from wapiti. * Useful for testing the extraction of the sequence labeling. @@ -17,7 +21,7 @@ public class GrobidTestUtils { * @param labels label maps. A list of Triples, containing label (left), start_index (middle) and end_index exclusive (right) * @return a string containing the resulting features + labels returned by wapiti */ - public static String getWapitiResult(List features, List> labels) { + public static String getWapitiResult(List features, List> labels, String separator) { List labeled = new ArrayList<>(); int idx = 0; @@ -52,7 +56,7 @@ public static String getWapitiResult(List features, List\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n" + + + val stringStringPair = target!!.processTrainingDataFigures(rese, tokens, "123") + + val tei = stringStringPair.left + val tokenisation = stringStringPair.right + val reconstructedText = + Arrays.stream(tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } + .collect(Collectors.joining(" ")) + + MatcherAssert.assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO")) + MatcherAssert.assertThat( + tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, + CoreMatchers.`is`(13) + ) + } + + @Test + @Throws(Exception::class) + fun testProcessTrainingDataFigures_multiple_figures() { + val text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO" + val tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text) + val rese = + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n" + + + val stringStringPair = target!!.processTrainingDataFigures(rese, tokens, "123") + + val tei = stringStringPair.left + val tokenisation = stringStringPair.right + val output: MutableList = ArrayList() + for (block in tokenisation.split("\n\n\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) { + val collect = Arrays.stream(block.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } + .collect(Collectors.joining(" ")) + if (StringUtils.isNotBlank(collect)) { + output.add(collect) + } + } + + MatcherAssert.assertThat>(output, IsCollectionWithSize.hasSize(2)) + MatcherAssert.assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )")) + MatcherAssert.assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO")) + MatcherAssert.assertThat( + tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, + CoreMatchers.`is`(15) + ) + } + + @Test + @Throws(Exception::class) + fun testProcessTrainingDataTables_single_table() { + val text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO" + val tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text) + val rese = + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n" + + + val stringStringPair = target!!.processTrainingDataTables(rese, tokens, "123") + + val tei = stringStringPair.left + val tokenisation = stringStringPair.right + val reconstructedText = + Arrays.stream(tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } + .collect(Collectors.joining(" ")) + + MatcherAssert.assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO")) + MatcherAssert.assertThat( + tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, + CoreMatchers.`is`(13) + ) + } + + @Test + @Throws(Exception::class) + fun testProcessTrainingDataTable_multiple_tables() { + val text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO" + val tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text) + val rese = + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + + "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + + "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + + "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n" + + + val stringStringPair = target!!.processTrainingDataTables(rese, tokens, "123") + + val tei = stringStringPair.left + val tokenisation = stringStringPair.right + val output: MutableList = ArrayList() + for (block in tokenisation.split("\n\n\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) { + val collect = Arrays.stream(block.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } + .collect(Collectors.joining(" ")) + if (StringUtils.isNotBlank(collect)) { + output.add(collect) + } + } + + MatcherAssert.assertThat>(output, IsCollectionWithSize.hasSize(2)) + MatcherAssert.assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )")) + MatcherAssert.assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO")) + MatcherAssert.assertThat( + tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, + CoreMatchers.`is`(15) + ) + } + + @Test + fun testFindCandidates() { + var mockDocumentSource = createMock(DocumentSource::class.java) + var document = Document.createFromText("") + + // i need to prepare a sequence where there might be multiple matches, + // and then verify that the sequence is correctly used for discrimination + var sequence = "This article solves the problem where some of our interaction are fauly. " + + "a 8 9 j 92j 3 3j 9 j 9j Table 1: The reconstruction of the national anthem " + + "We are interested in the relation between certain information and " + + "a b b d 1 2 3 4 s 3 3 d9 Table 2: The relation between information and noise " + + "the related affectionality. " + + "a b b d 1 2 3 4 5 6 7 Table 3: The relation between homicides and donuts eating " + + "The relation between homicides and donuts eating is a very important one. " + + var tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(sequence) + + // These triples made in following way: label, starting index (included), ending index (excluded) + val labels = listOf( + Triple.of("I-", 0, 1), + Triple.of("", 1, 24), + Triple.of("I-
", 25, 26), + Triple.of("
", 26, 61), + Triple.of("I-", 62, 63), + Triple.of("", 63, 81), + Triple.of("I-
", 82, 83), + Triple.of("
", 82, 118), + Triple.of("I-", 119, 120), + Triple.of("", 120, 129), + Triple.of("I-
", 130, 131), + Triple.of("
", 131, 171), + Triple.of("I-", 171, 172), + Triple.of("", 172, 195), + ) + + val features = tokens.stream().map { it.text }.collect(Collectors.toList()) + + val wapitiResult = GrobidTestUtils.getWapitiResult(features, labels, "\t") + val labelledResultsAsList = + Arrays.stream(wapitiResult.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map> { l: String -> + Arrays.stream( + l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + ) + .collect(Collectors.toList()) + } + .collect(Collectors.toList()) + + println(wapitiResult) + + val table1Tokens = tokens.subList(25, 61) + val foundCandidateIndex = FullTextParser.findCandiateIndex(table1Tokens, labelledResultsAsList, TABLE_LABEL) + + assertThat(foundCandidateIndex, hasSize(3)) + assertThat(foundCandidateIndex.get(0), `is`(13)) + assertThat(foundCandidateIndex.get(1), `is`(42)) + assertThat(foundCandidateIndex.get(2), `is`(67)) + } + +} \ No newline at end of file From 81a1691008a0c232bbbf878c0defb7a6c65ef346 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 17 Dec 2024 11:16:40 +0100 Subject: [PATCH 14/21] fix typo --- .../java/org/grobid/core/engines/FullTextParser.java | 4 ++-- .../org/grobid/core/engines/FullTextParserTest.kt | 10 ++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 30815e5e44..494cecf872 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -371,7 +371,7 @@ static String revertResultsForBadItems(List badFiguresOrTables for (Figure badItem : badFiguresOrTables) { // Find the index of the first layoutToken of the table in the tokenization List layoutTokenItem = badItem.getLayoutTokens(); - List candidateIndexes = findCandiateIndex(layoutTokenItem, labelledResultsAsList, itemLabel); + List candidateIndexes = findCandidateIndex(layoutTokenItem, labelledResultsAsList, itemLabel); if (candidateIndexes.isEmpty()) { LOGGER.info("Cannot find the candidate index for fixing the tables."); continue; @@ -441,7 +441,7 @@ static int consolidateResultCandidateThroughSequence(List candidateInde } @NotNull - static List findCandiateIndex(List layoutTokenItem, List> labelledResultsAsList, String itemLabel) { + static List findCandidateIndex(List layoutTokenItem, List> labelledResultsAsList, String itemLabel) { LayoutToken firstLayoutTokenItem = layoutTokenItem.get(0); List candidateIndexes = IntStream.range(0, labelledResultsAsList.size()) diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt index f76abe84fd..e816bb4b00 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt @@ -2,10 +2,7 @@ package org.grobid.core.engines import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.tuple.Triple -import org.easymock.EasyMock.createMock import org.grobid.core.analyzers.GrobidAnalyzer -import org.grobid.core.document.Document -import org.grobid.core.document.DocumentSource import org.grobid.core.engines.label.TaggingLabels.TABLE_LABEL import org.grobid.core.factory.GrobidFactory import org.grobid.core.main.LibraryLoader @@ -234,10 +231,7 @@ class FullTextParserTest { } @Test - fun testFindCandidates() { - var mockDocumentSource = createMock(DocumentSource::class.java) - var document = Document.createFromText("") - + fun testFindCandidates_shouldFindMultipleResults() { // i need to prepare a sequence where there might be multiple matches, // and then verify that the sequence is correctly used for discrimination var sequence = "This article solves the problem where some of our interaction are fauly. " + @@ -284,7 +278,7 @@ class FullTextParserTest { println(wapitiResult) val table1Tokens = tokens.subList(25, 61) - val foundCandidateIndex = FullTextParser.findCandiateIndex(table1Tokens, labelledResultsAsList, TABLE_LABEL) + val foundCandidateIndex = FullTextParser.findCandidateIndex(table1Tokens, labelledResultsAsList, TABLE_LABEL) assertThat(foundCandidateIndex, hasSize(3)) assertThat(foundCandidateIndex.get(0), `is`(13)) From 3778a6e1280392b39b1218ad6811dd7a3867c917 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 17 Dec 2024 11:38:43 +0100 Subject: [PATCH 15/21] add test for table token consolidation --- .../grobid/core/engines/FullTextParser.java | 15 +-- .../grobid/core/engines/FullTextParserTest.kt | 110 ++++++++++++++++-- 2 files changed, 111 insertions(+), 14 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 494cecf872..95c8036040 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -377,21 +377,22 @@ static String revertResultsForBadItems(List badFiguresOrTables continue; } - //A this point i have more than one candidate, which can be matched if the same first + // At this point i have more than one candidate, which can be matched if the same first // token is repeated in the sequence. The next step is to find the matching figure/table // using a large sequence - List sequenceTokenWithoutSpaces = layoutTokenItem.stream() + List sequenceTokenItemWithoutSpaces = layoutTokenItem.stream() .map(LayoutToken::getText) .map(StringUtils::strip) .filter(StringUtils::isNotBlank) .collect(Collectors.toList()); - int resultIndexCandidate = consolidateResultCandidateThroughSequence(candidateIndexes, labelledResultsAsList, sequenceTokenWithoutSpaces); + //TODO: reduce candidate indexes after matching one sequence + int resultIndexCandidate = consolidateResultCandidateThroughSequence(candidateIndexes, labelledResultsAsList, sequenceTokenItemWithoutSpaces); if (resultIndexCandidate > -1) { boolean first = true; - for (int i = resultIndexCandidate;i < Math.min(resultIndexCandidate + sequenceTokenWithoutSpaces.size(), labelledResultsAsList.size()); i++) { + for (int i = resultIndexCandidate;i < Math.min(resultIndexCandidate + sequenceTokenItemWithoutSpaces.size(), labelledResultsAsList.size()); i++) { List line = labelledResultsAsList.get(i); String label = Iterables.getLast(line); if (first) { @@ -417,19 +418,19 @@ static String revertResultsForBadItems(List badFiguresOrTables return resultBody; } - static int consolidateResultCandidateThroughSequence(List candidateIndexes, List> splitResult, List tokensNoSpace) { + static int consolidateResultCandidateThroughSequence(List candidateIndexes, List> splitResult, List tokensNoSpaceItem) { int resultIndexCandidate = -1; if (candidateIndexes.size() == 1){ resultIndexCandidate = candidateIndexes.get(0); } else { for (int candidateIndex: candidateIndexes) { - List candidateTable = splitResult.subList(candidateIndex, Math.min(candidateIndex + tokensNoSpace.size(), splitResult.size())) + List candidateTable = splitResult.subList(candidateIndex, Math.min(candidateIndex + tokensNoSpaceItem.size(), splitResult.size())) .stream() .map(i -> i.get(0)) .collect(Collectors.toList()); String candidateTableText = String.join("", candidateTable); - String tokensText = String.join("", tokensNoSpace); + String tokensText = String.join("", tokensNoSpaceItem); if (candidateTableText.equals(tokensText)) { resultIndexCandidate = candidateIndex; diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt index e816bb4b00..bc7a131fba 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt @@ -1,10 +1,12 @@ package org.grobid.core.engines +import jnr.posix.BaseIovec.Layout import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.tuple.Triple import org.grobid.core.analyzers.GrobidAnalyzer import org.grobid.core.engines.label.TaggingLabels.TABLE_LABEL import org.grobid.core.factory.GrobidFactory +import org.grobid.core.layout.LayoutToken import org.grobid.core.main.LibraryLoader import org.grobid.core.utilities.GrobidConfig import org.grobid.core.utilities.GrobidProperties @@ -232,7 +234,7 @@ class FullTextParserTest { @Test fun testFindCandidates_shouldFindMultipleResults() { - // i need to prepare a sequence where there might be multiple matches, + // I need to prepare a sequence where there might be multiple matches, // and then verify that the sequence is correctly used for discrimination var sequence = "This article solves the problem where some of our interaction are fauly. " + "a 8 9 j 92j 3 3j 9 j 9j Table 1: The reconstruction of the national anthem " + @@ -263,10 +265,10 @@ class FullTextParserTest { ) val features = tokens.stream().map { it.text }.collect(Collectors.toList()) + val wapitiResults = GrobidTestUtils.getWapitiResult(features, labels, "\t") - val wapitiResult = GrobidTestUtils.getWapitiResult(features, labels, "\t") - val labelledResultsAsList = - Arrays.stream(wapitiResult.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + val wapitiResultsAsList = + Arrays.stream(wapitiResults.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) .map> { l: String -> Arrays.stream( l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() @@ -275,10 +277,8 @@ class FullTextParserTest { } .collect(Collectors.toList()) - println(wapitiResult) - val table1Tokens = tokens.subList(25, 61) - val foundCandidateIndex = FullTextParser.findCandidateIndex(table1Tokens, labelledResultsAsList, TABLE_LABEL) + val foundCandidateIndex = FullTextParser.findCandidateIndex(table1Tokens, wapitiResultsAsList, TABLE_LABEL) assertThat(foundCandidateIndex, hasSize(3)) assertThat(foundCandidateIndex.get(0), `is`(13)) @@ -286,4 +286,100 @@ class FullTextParserTest { assertThat(foundCandidateIndex.get(2), `is`(67)) } + @Test + fun testConsolidateResultCandidateThroughSequence() { + // var mockDocumentSource = createMock(DocumentSource::class.java) + // var document = Document.createFromText("") + val sequence = "This article solves the problem where some of our interaction are fauly. " + + "a 8 9 j 92j 3 3j 9 j 9j Table 1: The reconstruction of the national anthem " + + "We are interested in the relation between certain information and " + + "a b b d 1 2 3 4 s 3 3 d9 Table 2: The relation between information and noise " + + "the related affectionality. " + + "a b b d 1 2 3 4 5 6 7 Table 3: The relation between homicides and donuts eating " + + "The relation between homicides and donuts eating is a very important one. " + + val tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(sequence) + + // These triples made in following way: label, starting index (included), ending index (excluded) + val labels = listOf( + Triple.of("I-", 0, 1), + Triple.of("", 1, 24), + Triple.of("I-
", 25, 26), + Triple.of("
", 26, 61), + Triple.of("I-", 62, 63), + Triple.of("", 63, 81), + Triple.of("I-
", 82, 83), + Triple.of("
", 82, 118), + Triple.of("I-", 119, 120), + Triple.of("", 120, 129), + Triple.of("I-
", 130, 131), + Triple.of("
", 131, 171), + Triple.of("I-", 171, 172), + Triple.of("", 172, 195), + ) + + val features = tokens.stream().map { it.text }.collect(Collectors.toList()) + + val wapitiResults = GrobidTestUtils.getWapitiResult(features, labels, "\t") + val wapitiResultsAsList = + Arrays.stream(wapitiResults.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) + .map> { l: String -> + Arrays.stream( + l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + ) + .collect(Collectors.toList()) + } + .collect(Collectors.toList()) + + val table1Tokens = tokens.subList(25, 61) + + val sequenceTokenWithoutSpacesTable1: List = table1Tokens.stream() + .map { obj: LayoutToken -> obj.text } + .map { str: String? -> StringUtils.strip(str) } + .filter { cs: String? -> StringUtils.isNotBlank(cs) } + .collect(Collectors.toList()) + + val candidatesIndexes = Arrays.asList( + 13, 42, 67 + ) + val consolidatedTable1ResultCandidateThroughSequence = FullTextParser.consolidateResultCandidateThroughSequence( + candidatesIndexes, + wapitiResultsAsList, + sequenceTokenWithoutSpacesTable1 + ) + + assertThat(consolidatedTable1ResultCandidateThroughSequence, `is`(13)) + + val table2Tokens = tokens.subList(82, 118) + + var sequenceTokenWithoutSpacesTable2: MutableList? = table2Tokens.stream() + .map { obj: LayoutToken -> obj.text } + .map { str: String? -> StringUtils.strip(str) } + .filter { cs: String? -> StringUtils.isNotBlank(cs) } + .collect(Collectors.toList()) + + val consolidatedTable2ResultCandidateThroughSequence = FullTextParser.consolidateResultCandidateThroughSequence( + candidatesIndexes, + wapitiResultsAsList, + sequenceTokenWithoutSpacesTable2 + ) + + assertThat(consolidatedTable2ResultCandidateThroughSequence, `is`(42)) + + val table3Tokens = tokens.subList(130, 171) + + var sequenceTokenWithoutSpacesTable3: MutableList? = table3Tokens.stream() + .map { obj: LayoutToken -> obj.text } + .map { str: String? -> StringUtils.strip(str) } + .filter { cs: String? -> StringUtils.isNotBlank(cs) } + .collect(Collectors.toList()) + + val consolidatedTable3ResultCandidateThroughSequence = FullTextParser.consolidateResultCandidateThroughSequence( + candidatesIndexes, + wapitiResultsAsList, + sequenceTokenWithoutSpacesTable3 + ) + + assertThat(consolidatedTable3ResultCandidateThroughSequence, `is`(67)) + } } \ No newline at end of file From 21f85c93ff372b849412d512edd42c2eb0b2133b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 18 Dec 2024 12:00:38 +0100 Subject: [PATCH 16/21] rename methods for better clarity, move utility methods in Kotlin --- .../grobid/core/engines/FullTextParser.java | 2 +- .../org/grobid/core/engines/HeaderParser.java | 6 +- .../org/grobid/core/utilities/LabelUtils.java | 80 ----------------- .../org/grobid/core/utilities/LabelUtils.kt | 87 +++++++++++++++++++ .../grobid/core/utilities/LabelUtilsTest.kt | 12 +-- 5 files changed, 98 insertions(+), 89 deletions(-) delete mode 100644 grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java create mode 100644 grobid-core/src/main/kotlin/org/grobid/core/utilities/LabelUtils.kt diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 95c8036040..2ce42cea0a 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -264,7 +264,7 @@ else if (config.getConsolidateCitations() == 2) resultBody = label(bodytext); //Correct subsequent I-
or I-
- resultBody = LabelUtils.adjustInvalidSequenceOfStartLabels(resultBody); + resultBody = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(resultBody); // we apply now the figure and table models based on the fulltext labeled output figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 185f3714d5..88b910624e 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -1,5 +1,6 @@ package org.grobid.core.engines; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -261,7 +262,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, //resHeader.setKeyword(keywords.replace("\n", " ").replace(" ", " ")); resHeader.setKeyword(keywords); List keywordsSegmented = BiblioItem.segmentKeywords(keywords); - if ((keywordsSegmented != null) && (keywordsSegmented.size() > 0)) + if (CollectionUtils.isNotEmpty(keywordsSegmented)) resHeader.setKeywords(keywordsSegmented); } @@ -311,7 +312,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, } // copyrights/license identification - if (resHeader.getCopyright() != null && resHeader.getCopyright().length()>0) { + if (StringUtils.isNotBlank(resHeader.getCopyright())) { if (GrobidProperties.getGrobidEngineName("copyright").equals("delft")) { CopyrightsLicense copyrightsLicense = LicenseClassifier.getInstance().classify(resHeader.getCopyright()); if (copyrightsLicense != null) @@ -928,6 +929,7 @@ else if (biblio.getPublicationDate() == null) // this will need to be reviewed with more training data, for the moment // avoid concatenation for abstracts as it brings more noise than correct pieces //biblio.setAbstract(biblio.getAbstract() + " " + clusterContent); + //TODO: avoid dumping text on the floor } else { biblio.setAbstract(clusterContent); List tokens = cluster.concatTokens(); diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java b/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java deleted file mode 100644 index d0f7fcc0a6..0000000000 --- a/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java +++ /dev/null @@ -1,80 +0,0 @@ -package org.grobid.core.utilities; - -import org.apache.commons.lang3.StringUtils; -import org.grobid.core.engines.label.TaggingLabels; - -public class LabelUtils { - /** - * Post-process text labeled by the fulltext model on chunks that are known to be text (no table, or figure) - * It converts table and figure labels to paragraph labels. - */ - public static String postProcessFullTextLabeledText(String fulltextLabeledText) { - if (fulltextLabeledText == null) - return null; - StringBuilder result = new StringBuilder(); - - String[] lines = fulltextLabeledText.split("\n"); - String previousLabel = null; - for(int i=0; i followed by another I-
(or table)
+ **/ + @JvmStatic + fun postProcessFulltextFixInvalidTableOrFigure(fulltextLabeledText: String): String { + val result = StringBuilder() + + val lines = fulltextLabeledText + .split("\n".toRegex()) + .dropLastWhile { it.isEmpty() } + .toTypedArray() + + var previousLabel: String? = null + for (i in lines.indices) { + val line = lines[i] + if (StringUtils.isBlank(line)) continue + + val pieces = line + .split("\t".toRegex()) + .dropLastWhile { it.isEmpty() } + .toTypedArray() + + val label = pieces[pieces.size - 1] + if (label == "I-" + TaggingLabels.FIGURE.label) { + if (StringUtils.equals(previousLabel, "I-" + TaggingLabels.FIGURE.label)) { + pieces[pieces.size - 1] = TaggingLabels.FIGURE.label + } + } else if (label == "I-" + TaggingLabels.TABLE.label) { + if (StringUtils.equals(previousLabel, "I-" + TaggingLabels.TABLE.label)) { + pieces[pieces.size - 1] = TaggingLabels.TABLE.label + } + } + + result.append(pieces.joinToString("\t")) + previousLabel = label + result.append("\n") + } + + return result.toString() + } + +} diff --git a/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt index 4c856f65fd..c0eded3f66 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/utilities/LabelUtilsTest.kt @@ -94,7 +94,7 @@ class LabelUtilsTest { // } @Test - fun testAdjustInvalidSequenceOfStartLabels_noChangeNeeded_shouldReturnSameSequence() { + fun testPostProcessFulltextFixInvalidTableOrFigure_noChangeNeeded_shouldReturnSameTableOrFigureSequence() { val bodyResult = "B\tb\tB\tB\tB\tB\tB\tB\tB\tB\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t8\t11\t0\tNUMBER\t0\t0\t\n" + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKEND\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t11\t0\tNUMBER\t0\t0\t\n" + @@ -163,13 +163,13 @@ class LabelUtilsTest { ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t3\t0\tNUMBER\t0\t0\t\n" + "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t3\t0\tNUMBER\t0\t0\t\n" - val postProcessed = LabelUtils.adjustInvalidSequenceOfStartLabels(bodyResult) + val postProcessed = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResult) assertThat(postProcessed, `is`(bodyResult)) } @Test - fun testAdjustInvalidSequenceOfStartLabels_singleChangeNeeded_shouldCorrectTheSequence() { + fun testPostProcessFulltextFixInvalidTableOrFigure_singleChangeNeeded_shouldCorrectTheTableOrFigureSequence() { val bodyResult = "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t\n" + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\tI-\n" + @@ -194,7 +194,7 @@ class LabelUtilsTest { "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" - val postProcessed = LabelUtils.adjustInvalidSequenceOfStartLabels(bodyResult) + val postProcessed = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResult) assertThat(postProcessed, not(bodyResult)) @@ -217,7 +217,7 @@ class LabelUtilsTest { } @Test - fun testAdjustInvalidSequenceOfStartLabels_MultipleChangeNeeded_shouldCorrectTheSequence() { + fun testPostProcessFulltextFixInvalidTableOrFigure_MultipleChangeNeeded_shouldCorrectTheTableOrFigureSequence() { val bodyResult = "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t\n" + "10\t10\t1\t10\t10\t10\t0\t10\t10\t10\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tLOWERFONT\t0\t0\tNOCAPS\tALLDIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t1\t1\tI-\n" + @@ -253,7 +253,7 @@ class LabelUtilsTest { "calculated\tcalculated\tc\tca\tcal\tcalc\td\ted\tted\tated\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t5\t0\tNUMBER\t0\t0\t
\n" - val postProcessed = LabelUtils.adjustInvalidSequenceOfStartLabels(bodyResult) + val postProcessed = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResult) assertThat(postProcessed, not(bodyResult)) From cbbe460941b91fbcd3fc6c210a67bea75e328bbd Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 19 Dec 2024 13:48:19 +0100 Subject: [PATCH 17/21] renaming --- .../grobid/core/engines/FullTextParser.java | 96 ++++++++++--------- 1 file changed, 51 insertions(+), 45 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 2ce42cea0a..3316a9f769 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -249,25 +249,25 @@ else if (config.getConsolidateCitations() == 2) // full text processing featSeg = getBodyTextFeatured(doc, documentBodyParts); - String resultBody = null; - LayoutTokenization layoutTokenization = null; + String bodyResults = null; + LayoutTokenization bodyLayoutTokens = null; List
figures = null; List
tables = null; List equations = null; if (featSeg != null && isNotBlank(featSeg.getLeft())) { // if featSeg is null, it usually means that the fulltext body is not found in the // document segmentation - String bodytext = featSeg.getLeft(); - layoutTokenization = featSeg.getRight(); + String bodyText = featSeg.getLeft(); + bodyLayoutTokens = featSeg.getRight(); //tokenizationsBody = featSeg.getB().getTokenization(); //layoutTokensBody = featSeg.getB().getLayoutTokens(); - resultBody = label(bodytext); + bodyResults = label(bodyText); //Correct subsequent I-
or I-
- resultBody = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(resultBody); + bodyResults = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResults); // we apply now the figure and table models based on the fulltext labeled output - figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc); + figures = processFigures(bodyResults, bodyLayoutTokens.getTokenization(), doc); // further parse the caption for(Figure figure : figures) { if (CollectionUtils.isNotEmpty(figure.getCaptionLayoutTokens()) ) { @@ -281,14 +281,14 @@ else if (config.getConsolidateCitations() == 2) .filter(f -> !f.isCompleteForTEI()) .collect(Collectors.toList()); - LOGGER.warn("Identified bad figures: " + badFigures.size()); - resultBody = revertResultsForBadItems(badFigures, resultBody, TaggingLabels.FIGURE_LABEL); + LOGGER.info("Identified bad figures: " + badFigures.size()); + bodyResults = revertResultsForBadItems(badFigures, bodyResults, TaggingLabels.FIGURE_LABEL); figures = figures.stream() .filter(f -> !badFigures.contains(f)) .collect(Collectors.toList()); - tables = processTables(resultBody, layoutTokenization.getTokenization(), doc); + tables = processTables(bodyResults, bodyLayoutTokens.getTokenization(), doc); //We deal with tables considered bad by reverting them as , to reduce the risk them to be // dropped later on. @@ -299,8 +299,8 @@ else if (config.getConsolidateCitations() == 2) .filter(t -> !(t.isCompleteForTEI() && t.validateTable())) .collect(Collectors.toList()); - LOGGER.warn("Identified bad tables: " + badTables.size()); - resultBody = revertResultsForBadItems(badTables, resultBody, TaggingLabels.TABLE_LABEL); + LOGGER.info("Identified bad tables: " + badTables.size()); + bodyResults = revertResultsForBadItems(badTables, bodyResults, TaggingLabels.TABLE_LABEL); tables = tables.stream() .filter(t-> !badTables.contains(t)) @@ -320,7 +320,7 @@ else if (config.getConsolidateCitations() == 2) } } - equations = processEquations(resultBody, layoutTokenization.getTokenization(), doc); + equations = processEquations(bodyResults, bodyLayoutTokens.getTokenization(), doc); } else { LOGGER.debug("Fulltext model: The featured body is empty"); } @@ -328,30 +328,36 @@ else if (config.getConsolidateCitations() == 2) // possible annexes (view as a piece of full text similar to the body) documentBodyParts = doc.getDocumentPart(SegmentationLabels.ANNEX); featSeg = getBodyTextFeatured(doc, documentBodyParts); - String resultAnnex = null; - List tokenizationsBody2 = null; + String annexResults = null; + List annexTokens = null; if (featSeg != null && isNotEmpty(trim(featSeg.getLeft()))) { - // if featSeg is null, it usually means that no body segment is found in the + // if featSeg is null, it usually means that no annex segment is found in the // document segmentation - String bodytext = featSeg.getLeft(); - tokenizationsBody2 = featSeg.getRight().getTokenization(); - resultAnnex = label(bodytext); - //System.out.println(rese); + String annexFeatures = featSeg.getLeft(); + annexTokens = featSeg.getRight().getTokenization(); + annexResults = label(annexFeatures); +// System.out.println(annexResults); + System.out.println("bao"); } // post-process reference and footnote callout to keep them consistent (e.g. for example avoid that a footnote // callout in superscript is by error labeled as a numerical reference callout) List markerTypes = null; - if (resultBody != null) - markerTypes = postProcessCallout(resultBody, layoutTokenization); + if (bodyResults != null) + markerTypes = postProcessCallout(bodyResults, bodyLayoutTokens); // final combination toTEI(doc, // document - resultBody, resultAnnex, // labeled data for body and annex - layoutTokenization, tokenizationsBody2, // tokenization for body and annex + bodyResults, + annexResults, // labeled data for body and annex + bodyLayoutTokens, + annexTokens, // tokenization for body and annex resHeader, // header - figures, tables, equations, markerTypes, + figures, + tables, + equations, + markerTypes, config); return doc; } catch (GrobidException e) { @@ -2639,8 +2645,8 @@ private static MarkerType getBestType(Map markerTypeCount) { * and body sections. */ private void toTEI(Document doc, - String reseBody, - String reseAnnex, + String bodyLabellingResult, + String annexLabellingResult, LayoutTokenization layoutTokenization, List tokenizationsAnnex, BiblioItem resHeader, @@ -2670,9 +2676,9 @@ private void toTEI(Document doc, parsers.getFundingAcknowledgementParser().processingXmlFragment(acknowledgmentStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { - String local_tei = localResult.getLeft().toXML(); - local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); - annexStatements.add(local_tei); + String localTei = localResult.getLeft().toXML(); + localTei = localTei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); + annexStatements.add(localTei); } else { annexStatements.add(acknowledgmentStmt.toString()); @@ -2681,14 +2687,14 @@ private void toTEI(Document doc, if (localResult != null && localResult.getRight() != null) { if (localResult.getRight().getLeft() != null) { List localFundings = localResult.getRight().getLeft(); - if (localFundings.size()>0) { + if (CollectionUtils.isNotEmpty(localFundings)) { fundings.addAll(localFundings); } } if (localResult.getRight().getRight() != null) { List localAffiliations = localResult.getRight().getRight(); - if (localAffiliations.size()>0) { + if (CollectionUtils.isNotEmpty(localAffiliations)) { affiliations.addAll(localAffiliations); } } @@ -2709,14 +2715,14 @@ private void toTEI(Document doc, resCitations, config); } - if (fundingStmt.length() > 0) { + if (StringUtils.isNotBlank(fundingStmt)) { MutablePair,List,List>> localResult = parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null) { - String local_tei = localResult.getLeft().toXML(); - local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); - annexStatements.add(local_tei); + String localTEI = localResult.getLeft().toXML(); + localTEI = localTEI.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); + annexStatements.add(localTEI); } else { annexStatements.add(fundingStmt.toString()); } @@ -2724,14 +2730,14 @@ private void toTEI(Document doc, if (localResult != null && localResult.getRight() != null) { if (localResult.getRight().getLeft() != null) { List localFundings = localResult.getRight().getLeft(); - if (localFundings.size()>0) { + if (CollectionUtils.isNotEmpty(localFundings)) { fundings.addAll(localFundings); } } if (localResult.getRight().getRight() != null) { List localAffiliations = localResult.getRight().getRight(); - if (localAffiliations.size()>0) { + if (CollectionUtils.isNotEmpty(localAffiliations)) { affiliations.addAll(localAffiliations); } } @@ -2752,9 +2758,9 @@ private void toTEI(Document doc, parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); if (localResult != null && localResult.getLeft() != null){ - String local_tei = localResult.getLeft().toXML(); - local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); - annexStatements.add(local_tei); + String localTEI = localResult.getLeft().toXML(); + localTEI = localTEI.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); + annexStatements.add(localTEI); } else { annexStatements.add(fundingStmt.toString()); } @@ -2762,14 +2768,14 @@ private void toTEI(Document doc, if (localResult != null && localResult.getRight() != null) { if (localResult.getRight().getLeft() != null) { List localFundings = localResult.getRight().getLeft(); - if (localFundings.size()>0) { + if (CollectionUtils.isNotEmpty(localFundings)) { fundings.addAll(localFundings); } } if (localResult.getRight().getRight() != null) { List localAffiliations = localResult.getRight().getRight(); - if (localAffiliations.size()>0) { + if (CollectionUtils.isNotEmpty(localAffiliations)) { affiliations.addAll(localAffiliations); } } @@ -2778,7 +2784,7 @@ private void toTEI(Document doc, tei.append(teiFormatter.toTEIHeader(resHeader, null, resCitations, markerTypes, fundings, config)); - tei = teiFormatter.toTEIBody(tei, reseBody, resHeader, resCitations, + tei = teiFormatter.toTEIBody(tei, bodyLabellingResult, resHeader, resCitations, layoutTokenization, figures, tables, equations, markerTypes, doc, config); tei.append("\t\t\n"); @@ -2873,7 +2879,7 @@ else if (affiliation.getAffiliationString() != null && affiliation.getAffiliatio tei.append(availabilityStmt.toString()); } - tei = teiFormatter.toTEIAnnex(tei, reseAnnex, resHeader, resCitations, + tei = teiFormatter.toTEIAnnex(tei, annexLabellingResult, resHeader, resCitations, tokenizationsAnnex, markerTypes, doc, config); tei = teiFormatter.toTEIReferences(tei, resCitations, config); From f036e0af91f54fec14fba38f39241ddbe652785d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 19 Dec 2024 16:12:34 +0100 Subject: [PATCH 18/21] allow a loose approach to gather table/figure starting token when there is likely more figures/tables obtained by the specific models as in contrast with the initial fulltext sequence --- .../grobid/core/engines/FullTextParser.java | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 3316a9f769..494665a05b 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -337,7 +337,6 @@ else if (config.getConsolidateCitations() == 2) annexTokens = featSeg.getRight().getTokenization(); annexResults = label(annexFeatures); // System.out.println(annexResults); - System.out.println("bao"); } // post-process reference and footnote callout to keep them consistent (e.g. for example avoid that a footnote @@ -374,10 +373,15 @@ static String revertResultsForBadItems(List badFiguresOrTables .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) .collect(Collectors.toList()); + long numberItems = labelledResultsAsList.stream() + .filter(r -> Iterables.getLast(r).startsWith("I-" + itemLabel)) + .count(); + for (Figure badItem : badFiguresOrTables) { // Find the index of the first layoutToken of the table in the tokenization List layoutTokenItem = badItem.getLayoutTokens(); - List candidateIndexes = findCandidateIndex(layoutTokenItem, labelledResultsAsList, itemLabel); + List candidateIndexes = findCandidateIndex(layoutTokenItem, labelledResultsAsList, + itemLabel, !(badFiguresOrTables.size() > numberItems)); if (candidateIndexes.isEmpty()) { LOGGER.info("Cannot find the candidate index for fixing the tables."); continue; @@ -447,20 +451,37 @@ static int consolidateResultCandidateThroughSequence(List candidateInde return resultIndexCandidate; } + /** + * Find a set of candidates representing the indexes from the labelledResults which could correspond + * to the first token of the figure/table + * + * strict = True check the I-
or I-
first and then the
or
only if there are not candidates + * strict = False is usually necessary if there are more tables than I- token, this because a figure/table could be + * identified within the sequence initially provided by the fulltext model + * + */ @NotNull static List findCandidateIndex(List layoutTokenItem, List> labelledResultsAsList, String itemLabel) { + return findCandidateIndex(layoutTokenItem, labelledResultsAsList, itemLabel, true); + } + + @NotNull + static List findCandidateIndex(List layoutTokenItem, List> labelledResultsAsList, String itemLabel, boolean strict) { LayoutToken firstLayoutTokenItem = layoutTokenItem.get(0); List candidateIndexes = IntStream.range(0, labelledResultsAsList.size()) .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstLayoutTokenItem.getText()) - && Iterables.getLast(labelledResultsAsList.get(i)).equals("I-"+ itemLabel)) + && Iterables.getLast(labelledResultsAsList.get(i)).equals("I-" + itemLabel)) .boxed() .collect(Collectors.toList()); - if (candidateIndexes.isEmpty()) { + if (candidateIndexes.isEmpty() || !strict) { candidateIndexes = IntStream.range(0, labelledResultsAsList.size()) .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstLayoutTokenItem.getText()) - && Iterables.getLast(labelledResultsAsList.get(i)).equals(itemLabel)) + && ( + Iterables.getLast(labelledResultsAsList.get(i)).equals(itemLabel) + || Iterables.getLast(labelledResultsAsList.get(i)).equals("I-" + itemLabel)) + ) .boxed() .collect(Collectors.toList()); } @@ -2247,10 +2268,12 @@ protected Pair processTrainingDataFigures(String rese, // If there still an open figure if (openFigure) { - while((tokenizationsFigure.size() > 0) && + while(CollectionUtils.isNotEmpty(tokenizationsFigure) && (tokenizationsFigure.get(0).getText().equals("\n") || - tokenizationsFigure.get(0).getText().equals(" ")) ) + tokenizationsFigure.get(0).getText().equals(" ")) + ) { tokenizationsFigure.remove(0); + } // process the "accumulated" figure Pair trainingData = parsers.getFigureParser() From 09c824d00e526175f01597ba675caabc380dd6f8 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 19 Dec 2024 17:00:01 +0100 Subject: [PATCH 19/21] improve the way the candidate identification for bad figures/tables is loosen up --- .../grobid/core/engines/FullTextParser.java | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 494665a05b..c94aca004c 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -277,12 +277,17 @@ else if (config.getConsolidateCitations() == 2) } } + long numberFiguresFulltextModel = Arrays.stream(bodyResults.split("\n")) + .filter(r -> r.endsWith("I-" + TaggingLabels.FIGURE_LABEL)) + .count(); + List
badFigures = figures.stream() .filter(f -> !f.isCompleteForTEI()) .collect(Collectors.toList()); - LOGGER.info("Identified bad figures: " + badFigures.size()); - bodyResults = revertResultsForBadItems(badFigures, bodyResults, TaggingLabels.FIGURE_LABEL); + LOGGER.info("Number of figures badly formatted or incomplete we identified: " + badFigures.size()); + bodyResults = revertResultsForBadItems(badFigures, bodyResults, TaggingLabels.FIGURE_LABEL, + !(figures.size() > numberFiguresFulltextModel)); figures = figures.stream() .filter(f -> !badFigures.contains(f)) @@ -290,6 +295,10 @@ else if (config.getConsolidateCitations() == 2) tables = processTables(bodyResults, bodyLayoutTokens.getTokenization(), doc); + long numberTablesFulltextModel = Arrays.stream(bodyResults.split("\n")) + .filter(r -> r.endsWith("I-" + TaggingLabels.TABLE_LABEL)) + .count(); + //We deal with tables considered bad by reverting them as , to reduce the risk them to be // dropped later on. @@ -299,8 +308,9 @@ else if (config.getConsolidateCitations() == 2) .filter(t -> !(t.isCompleteForTEI() && t.validateTable())) .collect(Collectors.toList()); - LOGGER.info("Identified bad tables: " + badTables.size()); - bodyResults = revertResultsForBadItems(badTables, bodyResults, TaggingLabels.TABLE_LABEL); + LOGGER.info("Number of tables badly formatted or incomplete we identified: " + badTables.size()); + bodyResults = revertResultsForBadItems(badTables, bodyResults, TaggingLabels.TABLE_LABEL, + !(tables.size() > numberTablesFulltextModel)); tables = tables.stream() .filter(t-> !badTables.contains(t)) @@ -367,21 +377,21 @@ else if (config.getConsolidateCitations() == 2) } static String revertResultsForBadItems(List badFiguresOrTables, String resultBody, String itemLabel) { + return revertResultsForBadItems(badFiguresOrTables, resultBody, itemLabel, true); + } + + static String revertResultsForBadItems(List badFiguresOrTables, String resultBody, String itemLabel, boolean strict) { //LF: we update the resultBody sequence by reverting these tables as elements if (CollectionUtils.isNotEmpty(badFiguresOrTables)) { List> labelledResultsAsList = Arrays.stream(resultBody.split("\n")) .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) .collect(Collectors.toList()); - long numberItems = labelledResultsAsList.stream() - .filter(r -> Iterables.getLast(r).startsWith("I-" + itemLabel)) - .count(); - for (Figure badItem : badFiguresOrTables) { // Find the index of the first layoutToken of the table in the tokenization List layoutTokenItem = badItem.getLayoutTokens(); List candidateIndexes = findCandidateIndex(layoutTokenItem, labelledResultsAsList, - itemLabel, !(badFiguresOrTables.size() > numberItems)); + itemLabel, strict); if (candidateIndexes.isEmpty()) { LOGGER.info("Cannot find the candidate index for fixing the tables."); continue; @@ -455,7 +465,8 @@ static int consolidateResultCandidateThroughSequence(List candidateInde * Find a set of candidates representing the indexes from the labelledResults which could correspond * to the first token of the figure/table * - * strict = True check the I-
or I-
first and then the
or
only if there are not candidates + * strict = True then it will check the items related to I-
or I-
first + * and then the
or
only if there are not candidates * strict = False is usually necessary if there are more tables than I- token, this because a figure/table could be * identified within the sequence initially provided by the fulltext model * From 671feb01cdeaf07d0947a3064c5c769b59718f69 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 26 Dec 2024 14:30:55 +0100 Subject: [PATCH 20/21] Fix merge --- .../core/engines/FullTextParserTest.java | 306 ------------------ .../grobid/core/engines/FullTextParserTest.kt | 84 ++++- 2 files changed, 70 insertions(+), 320 deletions(-) delete mode 100644 grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java diff --git a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java deleted file mode 100644 index f04d2f0bff..0000000000 --- a/grobid-core/src/test/java/org/grobid/core/engines/FullTextParserTest.java +++ /dev/null @@ -1,306 +0,0 @@ -package org.grobid.core.engines; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.tuple.Pair; -import org.grobid.core.analyzers.GrobidAnalyzer; -import org.grobid.core.document.Document; -import org.grobid.core.document.DocumentPiece; -import org.grobid.core.document.DocumentPointer; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.layout.LayoutToken; -import org.grobid.core.layout.LayoutTokenization; -import org.grobid.core.main.LibraryLoader; -import org.grobid.core.utilities.GrobidProperties; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.util.*; -import java.util.stream.Collectors; - -import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.hasItem; -import static org.hamcrest.collection.IsCollectionWithSize.hasSize; - -public class FullTextParserTest { - - private FullTextParser target; - - @Before - public void setUp() throws Exception { - target = new FullTextParser(new EngineParsers()); - } - - @BeforeClass - public static void init() { - LibraryLoader.load(); - GrobidProperties.getInstance(); - } - - @AfterClass - public static void tearDown() { - GrobidFactory.reset(); - } - - public DocumentPiece getWholeDocumentPiece(Document doc) { - return new DocumentPiece( - new DocumentPointer(0, 0, 0), - new DocumentPointer(0, doc.getTokenizations().size() - 1, doc.getTokenizations().size() - 1) - ); - } - - public SortedSet getWholeDocumentParts(Document doc) { - return new TreeSet<>(Collections.singleton( - getWholeDocumentPiece(doc) - )); - } - - @Test - public void testShouldOutputBlockStartForRegularBlock() throws Exception { - String blockText = "This is a block"; - Document doc = Document.createFromText(blockText); - SortedSet documentParts = getWholeDocumentParts(doc); - Pair dataAndTokens = FullTextParser.getBodyTextFeatured(doc, documentParts); -// LOGGER.debug("data debug: {}", dataAndTokens.getLeft()); - String[] lines = dataAndTokens.getLeft().split("\n"); - assertThat("lines[0] fields", Arrays.asList(lines[0].split("\\s")), is(hasItem("BLOCKSTART"))); - } - - @Test - public void testShouldOutputBlockStartForBlockStartingWithLineFeed() throws Exception { - String blockText = "\nThis is a block"; - Document doc = Document.createFromText(blockText); - assertThat( - "doc.block[0].tokens[0].text", - doc.getBlocks().get(0).getTokens().get(0).getText(), - is("\n") - ); - SortedSet documentParts = getWholeDocumentParts(doc); - Pair dataAndTokens = FullTextParser.getBodyTextFeatured(doc, documentParts); -// LOGGER.debug("data debug: {}", dataAndTokens.getLeft()); - String[] lines = dataAndTokens.getLeft().split("\n"); - assertThat("lines[0] fields", Arrays.asList(lines[0].split("\\s")), is(hasItem("BLOCKSTART"))); - } - - @Test - public void testProcessTrainingDataFigures_single_figure() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataFigures(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - - assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(13)); - - } - - @Test - public void testProcessTrainingDataFigures_multiple_figures() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataFigures(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - List output = new ArrayList<>(); - for (String block : tokenisation.split("\n\n\n")) { - String collect = Arrays.stream(block.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - if (StringUtils.isNotBlank(collect)) { - output.add(collect); - } - } - - assertThat(output, hasSize(2)); - assertThat(output.get(0), is("FIG . 1 . λ ( T )")); - assertThat(output.get(1), is("vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(15)); - - } - - @Test - public void testProcessTrainingDataTables_single_table() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataTables(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - String reconstructedText = Arrays.stream(tokenisation.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - - assertThat(reconstructedText, is("FIG . 1 . λ ( T ) vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(13)); - - } - - @Test - public void testProcessTrainingDataTable_multiple_tables() throws Exception { - String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO"; - List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - String rese = "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKSTART\tLINESTART\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\tI-\n" + - "mechanism\tmechanism\tm\tme\tmec\tmech\tm\tsm\tism\tnism\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "superconductivity\tsuperconductivity\ts\tsu\tsup\tsupe\ty\tty\tity\tvity\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t4\t0\tNUMBER\t0\t0\t\n" + - "FIG\tfig\tF\tFI\tFIG\tFIG\tG\tIG\tFIG\tFIG\tBLOCKSTART\tLINESTART\tLINEINDENT\tNEWFONT\tHIGHERFONT\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "1\t1\t1\t1\t1\t1\t1\t1\t1\t1\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tALLDIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t1\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "λ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tλ\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tOPENBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tENDBRACKET\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "vs\tvs\tv\tvs\tvs\tvs\ts\tvs\tvs\tvs\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\tI-
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEEND\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "T\tt\tT\tT\tT\tT\tT\tT\tT\tT\tBLOCKIN\tLINESTART\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tBLOCKIN\tLINEIN\tLINEINDENT\tNEWFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n" + - "YBCO\tybco\tY\tYB\tYBC\tYBCO\tO\tCO\tBCO\tYBCO\tBLOCKIN\tLINEIN\tLINEINDENT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t0\tNOPUNCT\t10\t3\t0\tNUMBER\t0\t0\t
\n\n"; - - - Pair stringStringPair = target.processTrainingDataTables(rese, tokens, "123"); - - String tei = stringStringPair.getLeft(); - String tokenisation = stringStringPair.getRight(); - List output = new ArrayList<>(); - for (String block : tokenisation.split("\n\n\n")) { - String collect = Arrays.stream(block.split("\n")).map(l -> l.split("\t")[0]).collect(Collectors.joining(" ")); - if (StringUtils.isNotBlank(collect)) { - output.add(collect); - } - } - - assertThat(output, hasSize(2)); - assertThat(output.get(0), is("FIG . 1 . λ ( T )")); - assertThat(output.get(1), is("vs . T for YBCO")); - assertThat(tokenisation.split("\n").length, is(15)); - - } - - @Test - public void testPostProcessLabeledAbstract_shouldTransformTableLabelInParagraphLabel() { - String resultWithTables = "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tNEWFONT\tHIGHERFONT\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\tI-
\n" + - "study\tstudy\ts\tst\tstu\tstud\ty\tdy\tudy\ttudy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "was\twas\tw\twa\twas\twas\ts\tas\twas\twas\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "supported\tsupported\ts\tsu\tsup\tsupp\td\ted\tted\trted\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t0\t10\t0\tNUMBER\t0\t0\t
\n" + - "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "South\tsouth\tS\tSo\tSou\tSout\th\tth\tuth\touth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Asian\tasian\tA\tAs\tAsi\tAsia\tn\tan\tian\tsian\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Clinical\tclinical\tC\tCl\tCli\tClin\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Toxicology\ttoxicology\tT\tTo\tTox\tToxi\ty\tgy\togy\tlogy\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t1\t10\t0\tNUMBER\t0\t0\t
\n" + - "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + - "Collaboration\tcollaboration\tC\tCo\tCol\tColl\tn\ton\tion\ttion\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t2\t10\t0\tNUMBER\t0\t0\t
\n" + - ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tCOMMA\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "which\twhich\tw\twh\twhi\twhic\th\tch\tich\thich\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "is\tis\ti\tis\tis\tis\ts\tis\tis\tis\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "funded\tfunded\tf\tfu\tfun\tfund\td\ted\tded\tnded\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t3\t10\t0\tNUMBER\t0\t0\t
\n" + - "Wellcome\twellcome\tW\tWe\tWel\tWell\te\tme\tome\tcome\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "Trust\ttrust\tT\tTr\tTru\tTrus\tt\tst\tust\trust\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "/\t/\t/\t/\t/\t/\t/\t/\t/\t/\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "National\tnational\tN\tNa\tNat\tNati\tl\tal\tnal\tonal\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t4\t10\t0\tNUMBER\t0\t0\t
\n" + - "Health\thealth\tH\tHe\tHea\tHeal\th\tth\tlth\talth\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "Medical\tmedical\tM\tMe\tMed\tMedi\tl\tal\tcal\tical\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t5\t10\t0\tNUMBER\t0\t0\t
\n" + - "Council\tcouncil\tC\tCo\tCou\tCoun\tl\til\tcil\tncil\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + - "International\tinternational\tI\tIn\tInt\tInte\tl\tal\tnal\tonal\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + - "Collaborative\tcollaborative\tC\tCo\tCol\tColl\te\tve\tive\ttive\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t6\t10\t0\tNUMBER\t0\t0\t
\n" + - "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + - "Grant\tgrant\tG\tGr\tGra\tGran\tt\tnt\tant\trant\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t7\t10\t0\tNUMBER\t0\t0\t
\n" + - "GR071669MA\tgr071669ma\tG\tGR\tGR0\tGR07\tA\tMA\t9MA\t69MA\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tCONTAINSDIGITS\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "The\tthe\tT\tTh\tThe\tThe\te\the\tThe\tThe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tINITCAP\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "funding\tfunding\tf\tfu\tfun\tfund\tg\tng\ting\tding\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "bodies\tbodies\tb\tbo\tbod\tbodi\ts\tes\ties\tdies\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t8\t10\t0\tNUMBER\t0\t0\t
\n" + - "had\thad\th\tha\thad\thad\td\tad\thad\thad\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "no\tno\tn\tno\tno\tno\to\tno\tno\tno\tBLOCKIN\tLINEEND\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "role\trole\tr\tro\trol\trole\te\tle\tole\trole\tBLOCKIN\tLINESTART\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "in\tin\ti\tin\tin\tin\tn\tin\tin\tin\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "analyzing\tanalyzing\ta\tan\tana\tanal\tg\tng\ting\tzing\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t9\t10\t0\tNUMBER\t0\t0\t
\n" + - "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "interpreting\tinterpreting\ti\tin\tint\tinte\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "data\tdata\td\tda\tdat\tdata\ta\tta\tata\tdata\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t10\t10\t0\tNUMBER\t0\t0\t
\n" + - "or\tor\to\tor\tor\tor\tr\tor\tor\tor\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - "writing\twriting\tw\twr\twri\twrit\tg\tng\ting\tting\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - "article\tarticle\ta\tar\tart\tarti\te\tle\tcle\ticle\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tNOCAPS\tNODIGIT\t0\tNOPUNCT\t11\t10\t0\tNUMBER\t0\t0\t
\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tBLOCKIN\tLINEIN\tALIGNEDLEFT\tSAMEFONT\tSAMEFONTSIZE\t0\t0\tALLCAP\tNODIGIT\t1\tDOT\t11\t10\t0\tNUMBER\t0\t0\t
"; - String postprocessed = FullTextParser.postProcessFullTextLabeledText(resultWithTables); - - assertThat(Arrays.stream(StringUtils.split(postprocessed, "\n")) - .filter(l -> l.endsWith("
")) - .count(), is(0L)); - - assertThat(Arrays.stream(StringUtils.split(postprocessed, "\n")) - .filter(l -> l.endsWith("")) - .count(), is (Arrays.stream(StringUtils.split(resultWithTables, "\n")) - .filter(l -> l.endsWith("
")) - .count())); - - } - - -} \ No newline at end of file diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt index bc7a131fba..21f35e7d0d 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FullTextParserTest.kt @@ -1,9 +1,11 @@ package org.grobid.core.engines -import jnr.posix.BaseIovec.Layout import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.tuple.Triple import org.grobid.core.analyzers.GrobidAnalyzer +import org.grobid.core.document.Document +import org.grobid.core.document.DocumentPiece +import org.grobid.core.document.DocumentPointer import org.grobid.core.engines.label.TaggingLabels.TABLE_LABEL import org.grobid.core.factory.GrobidFactory import org.grobid.core.layout.LayoutToken @@ -13,8 +15,8 @@ import org.grobid.core.utilities.GrobidProperties import org.grobid.core.utilities.GrobidTestUtils import org.hamcrest.CoreMatchers import org.hamcrest.CoreMatchers.`is` -import org.hamcrest.MatcherAssert import org.hamcrest.MatcherAssert.assertThat +import org.hamcrest.Matchers import org.hamcrest.Matchers.hasSize import org.hamcrest.collection.IsCollectionWithSize import org.junit.AfterClass @@ -88,8 +90,8 @@ class FullTextParserTest { .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } .collect(Collectors.joining(" ")) - MatcherAssert.assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO")) - MatcherAssert.assertThat( + assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO")) + assertThat( tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, CoreMatchers.`is`(13) ) @@ -134,10 +136,10 @@ class FullTextParserTest { } } - MatcherAssert.assertThat>(output, IsCollectionWithSize.hasSize(2)) - MatcherAssert.assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )")) - MatcherAssert.assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO")) - MatcherAssert.assertThat( + assertThat>(output, IsCollectionWithSize.hasSize(2)) + assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )")) + assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO")) + assertThat( tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, CoreMatchers.`is`(15) ) @@ -177,8 +179,8 @@ class FullTextParserTest { .map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] } .collect(Collectors.joining(" ")) - MatcherAssert.assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO")) - MatcherAssert.assertThat( + assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO")) + assertThat( tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, CoreMatchers.`is`(13) ) @@ -223,10 +225,10 @@ class FullTextParserTest { } } - MatcherAssert.assertThat>(output, IsCollectionWithSize.hasSize(2)) - MatcherAssert.assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )")) - MatcherAssert.assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO")) - MatcherAssert.assertThat( + assertThat>(output, IsCollectionWithSize.hasSize(2)) + assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )")) + assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO")) + assertThat( tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size, CoreMatchers.`is`(15) ) @@ -382,4 +384,58 @@ class FullTextParserTest { assertThat(consolidatedTable3ResultCandidateThroughSequence, `is`(67)) } + + @Test + @Throws(Exception::class) + fun testShouldOutputBlockStartForRegularBlock() { + val blockText = "This is a block" + val doc = Document.createFromText(blockText) + val documentParts = getWholeDocumentParts(doc) + val dataAndTokens = FullTextParser.getBodyTextFeatured(doc, documentParts) + // LOGGER.debug("data debug: {}", dataAndTokens.getLeft()); + val lines = dataAndTokens.left.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + assertThat( + "lines[0] fields", + Arrays.asList( + *lines[0].split("\\s".toRegex()) + .dropLastWhile { it.isEmpty() } + .toTypedArray()), `is`(Matchers.hasItem("BLOCKSTART")) + ) + } + + @Test + @Throws(Exception::class) + fun testShouldOutputBlockStartForBlockStartingWithLineFeed() { + val blockText = "\nThis is a block" + val doc = Document.createFromText(blockText) + assertThat( + "doc.block[0].tokens[0].text", + doc.blocks[0].getTokens()[0].text, + CoreMatchers.`is`("\n") + ) + val documentParts = getWholeDocumentParts(doc) + val dataAndTokens = FullTextParser.getBodyTextFeatured(doc, documentParts) + // LOGGER.debug("data debug: {}", dataAndTokens.getLeft()); + val lines = dataAndTokens.left.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() + assertThat( + "lines[0] fields", + Arrays.asList(*lines[0].split("\\s".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()), + `is`(Matchers.hasItem("BLOCKSTART")) + ) + } + + private fun getWholeDocumentPiece(doc: Document): DocumentPiece { + return DocumentPiece( + DocumentPointer(0, 0, 0), + DocumentPointer(0, doc.tokenizations.size - 1, doc.tokenizations.size - 1) + ) + } + + private fun getWholeDocumentParts(doc: Document): SortedSet { + return TreeSet( + setOf( + getWholeDocumentPiece(doc) + ) + ) + } } \ No newline at end of file From 5b0dd4f5cb1b47005407024f212cef349c744685 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 30 Dec 2024 20:28:26 +0100 Subject: [PATCH 21/21] cosmetics --- .../src/main/java/org/grobid/core/document/TEIFormatter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index d9f2c46006..8f9c12aad1 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -923,8 +923,8 @@ else if (biblio.getE_Year().length() == 4) tei.append("\t\t\t\n"); } - if ((abstractText != null) && (abstractText.length() != 0)) { - if ( (biblio.getLabeledAbstract() != null) && (biblio.getLabeledAbstract().length() > 0) ) { + if (StringUtils.isNotBlank(abstractText)) { + if (StringUtils.isNotBlank(biblio.getLabeledAbstract())) { // we have available structured abstract, which can be serialized as a full text "piece" StringBuilder buffer = new StringBuilder(); try {