Skip to content

Commit

Permalink
Adding some warning if tables/figures are dropped. Cleanup.
Browse files: browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jan 19, 2025
1 parent ab11f2d commit a94a7cc
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 18 deletions.
25 changes: 13 additions & 12 deletions grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ public boolean isCompleteForTEI() {

public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (!isCompleteForTEI()) {
LOGGER.warn("Found a figure that is badly formatted but it should have been spotted before. We ignore it now.");
return null;
}
Element figureElement = XmlBuilderUtils.teiElement("figure");
Expand All @@ -339,18 +340,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (config.isGenerateTeiCoordinates("figure")) {
List<BoundingBox> theBoxes = null;
// non graphic elements
if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
}

// if (getBitmapGraphicObjects() != null && !getBitmapGraphicObjects().isEmpty()) {
// -> note: this was restricted to the bitmap objects only... the bounding box calculation
// with vector graphics might need some double check

// here we bound all figure graphics in one single box (given that we can have hundred graphics
// here we bound all figure graphics in one single box (given that we can have a hundred graphics
// in a single figure)
BoundingBox theGraphicsBox = null;
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
if (theGraphicsBox == null) {
theGraphicsBox = graphicObject.getBoundingBox();
Expand All @@ -366,24 +367,24 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
theBoxes.add(theGraphicsBox);
}

if (theBoxes != null && theBoxes.size() > 0) {
if (CollectionUtils.isNotEmpty(theBoxes)) {
String coords = Joiner.on(";").join(theBoxes);
XmlBuilderUtils.addCoords(figureElement, coords);
}
}
if (header != null) {

if (StringUtils.isNotBlank(header)) {
Element head = XmlBuilderUtils.teiElement("head",
LayoutTokensUtil.normalizeText(header.toString()));
figureElement.appendChild(head);

}
if (label != null) {

if (StringUtils.isNotBlank(label)) {
Element labelEl = XmlBuilderUtils.teiElement("label",
LayoutTokensUtil.normalizeText(label.toString()));
figureElement.appendChild(labelEl);
}
if (caption != null) {

if (StringUtils.isNotBlank(caption)) {
Element desc = XmlBuilderUtils.teiElement("figDesc");
if (config.isGenerateTeiIds()) {
String divID = KeyGen.getKey().substring(0, 7);
Expand All @@ -392,12 +393,12 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

// if the segment has been parsed with the full text model we further extract the clusters
// to get the bibliographical references
if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

Expand Down Expand Up @@ -453,7 +454,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

figureElement.appendChild(desc);
}
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
Element go = XmlBuilderUtils.teiElement("graphic");
String uri = graphicObject.getURI();
Expand Down
15 changes: 9 additions & 6 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.grobid.core.GrobidModels;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.table.Cell;
Expand Down Expand Up @@ -69,6 +70,7 @@ public boolean isCompleteForTEI() {
@Override
public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (!isCompleteForTEI()) {
LOGGER.warn("Found a table that is badly formatted but it should have been spotted before. We ignore it now.");
return null;
}

Expand Down Expand Up @@ -98,7 +100,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}*/

Element desc = null;
if (caption != null) {
if (StringUtils.isNotBlank(caption)) {
// if the segment has been parsed with the full text model we further extract the clusters
// to get the bibliographical references

Expand All @@ -111,16 +113,17 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();

MarkerType citationMarkerType = null;
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

for (TaggingTokenCluster cluster : clusters) {
if (cluster == null) {
continue;
}

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
citationMarkerType = markerTypes.get(0);
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
Expand Down

0 comments on commit a94a7cc

Please sign in to comment.