From a94a7cc72fa31ba9431f1f35235acf047d99e67e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 19 Jan 2025 19:26:04 +0900 Subject: [PATCH] Adding some warning if tables/figures are dropped. Cleanup. --- .../java/org/grobid/core/data/Figure.java | 25 ++++++++++--------- .../main/java/org/grobid/core/data/Table.java | 15 ++++++----- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index 0d646ed93d..ef43f2072a 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -329,6 +329,7 @@ public boolean isCompleteForTEI() { public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List markerTypes) { if (!isCompleteForTEI()) { + LOGGER.warn("Found a figure that is badly formatted but it should have been spotted before. We ignore it now."); return null; } Element figureElement = XmlBuilderUtils.teiElement("figure"); @@ -339,7 +340,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form if (config.isGenerateTeiCoordinates("figure")) { List theBoxes = null; // non graphic elements - if (getLayoutTokens() != null && getLayoutTokens().size() > 0) { + if (CollectionUtils.isNotEmpty(getLayoutTokens())) { theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens()); } @@ -347,10 +348,10 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form // -> note: this was restricted to the bitmap objects only... the bounding box calculation // with vector graphics might need some double check - // here we bound all figure graphics in one single box (given that we can have hundred graphics + // here we bound all figure graphics in one single box (given that we can have a hundred graphics // in a single figure) BoundingBox theGraphicsBox = null; - if ((graphicObjects != null) && (graphicObjects.size() > 0)) { + if (CollectionUtils.isNotEmpty(graphicObjects)) { for (GraphicObject graphicObject : graphicObjects) { if (theGraphicsBox == null) { theGraphicsBox = graphicObject.getBoundingBox(); @@ -366,24 +367,24 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form theBoxes.add(theGraphicsBox); } - if (theBoxes != null && theBoxes.size() > 0) { + if (CollectionUtils.isNotEmpty(theBoxes)) { String coords = Joiner.on(";").join(theBoxes); XmlBuilderUtils.addCoords(figureElement, coords); } } - if (header != null) { + + if (StringUtils.isNotBlank(header)) { Element head = XmlBuilderUtils.teiElement("head", LayoutTokensUtil.normalizeText(header.toString())); figureElement.appendChild(head); - } - if (label != null) { + + if (StringUtils.isNotBlank(label)) { Element labelEl = XmlBuilderUtils.teiElement("label", LayoutTokensUtil.normalizeText(label.toString())); figureElement.appendChild(labelEl); } - if (caption != null) { - + if (StringUtils.isNotBlank(caption)) { Element desc = XmlBuilderUtils.teiElement("figDesc"); if (config.isGenerateTeiIds()) { String divID = KeyGen.getKey().substring(0, 7); @@ -392,12 +393,12 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form // if the segment has been parsed with the full text model we further extract the clusters // to get the bibliographical references - if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) { + if (StringUtils.isNotBlank(labeledCaption)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens); List clusters = clusteror.cluster(); MarkerType citationMarkerType = null; - if (markerTypes != null && markerTypes.size()>0) { + if (CollectionUtils.isNotEmpty(markerTypes)) { citationMarkerType = markerTypes.get(0); } @@ -453,7 +454,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form figureElement.appendChild(desc); } - if ((graphicObjects != null) && (graphicObjects.size() > 0)) { + if (CollectionUtils.isNotEmpty(graphicObjects)) { for (GraphicObject graphicObject : graphicObjects) { Element go = XmlBuilderUtils.teiElement("graphic"); String uri = graphicObject.getURI(); diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 1016760284..2806fa212d 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -1,5 +1,6 @@ package org.grobid.core.data; +import org.apache.commons.collections4.CollectionUtils; import org.grobid.core.GrobidModels; import org.apache.commons.lang3.StringUtils; import org.grobid.core.data.table.Cell; @@ -69,6 +70,7 @@ public boolean isCompleteForTEI() { @Override public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List markerTypes) { if (!isCompleteForTEI()) { + LOGGER.warn("Found a table that is badly formatted but it should have been spotted before. We ignore it now."); return null; } @@ -98,7 +100,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form }*/ Element desc = null; - if (caption != null) { + if (StringUtils.isNotBlank(caption)) { // if the segment has been parsed with the full text model we further extract the clusters // to get the bibliographical references @@ -111,16 +113,17 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form if (StringUtils.isNotBlank(labeledCaption)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens); List clusters = clusteror.cluster(); + + MarkerType citationMarkerType = null; + if (CollectionUtils.isNotEmpty(markerTypes)) { + citationMarkerType = markerTypes.get(0); + } + for (TaggingTokenCluster cluster : clusters) { if (cluster == null) { continue; } - MarkerType citationMarkerType = null; - if (markerTypes != null && markerTypes.size()>0) { - citationMarkerType = markerTypes.get(0); - } - TaggingLabel clusterLabel = cluster.getTaggingLabel(); //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens()); String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());