Skip to content

Commit

Permalink
Merge branch 'master' into feature/collect-discarded-data
Browse files Browse the repository at this point in the history
# Conflicts:
#	grobid-core/src/main/java/org/grobid/core/data/Figure.java
#	grobid-core/src/main/java/org/grobid/core/data/Table.java
#	grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
#	grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
  • Loading branch information
lfoppiano committed Jan 4, 2025
2 parents 69d7616 + 5b0dd4f commit f6642f2
Show file tree
Hide file tree
Showing 10 changed files with 1,119 additions and 411 deletions.
6 changes: 5 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -325,8 +325,12 @@ public String getTeiId() {
return "fig_" + this.id;
}

/**
 * Tells whether this figure carries enough content to be serialized to TEI.
 *
 * @return true as soon as the header is non-blank, the caption is non-blank,
 *         or at least one graphic object is attached; false otherwise
 */
public boolean isCompleteForTEI() {
    // Any single piece of content is enough; checks run in the same order as a chained "||".
    if (StringUtils.isNotBlank(header)) {
        return true;
    }
    if (StringUtils.isNotBlank(caption)) {
        return true;
    }
    return CollectionUtils.isNotEmpty(graphicObjects);
}

public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption) && CollectionUtils.isEmpty(graphicObjects)) {
if (!isCompleteForTEI()) {
return null;
}
Element figureElement = XmlBuilderUtils.teiElement("figure");
Expand Down
26 changes: 18 additions & 8 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import nu.xom.Attribute;
import nu.xom.Element;
import nu.xom.Node;
import nu.xom.Text;

import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
Expand All @@ -43,6 +42,7 @@
public class Table extends Figure {
private List<LayoutToken> contentTokens = new ArrayList<>();
private List<LayoutToken> fullDescriptionTokens = new ArrayList<>();

private boolean goodTable = true;

private StringBuilder note = null;
Expand All @@ -64,9 +64,13 @@ public Table() {
note = new StringBuilder();
}

/**
 * Tells whether this table carries enough content to be serialized to TEI.
 *
 * A table is complete as soon as it has a non-empty header or a non-empty
 * caption; only a table missing both is considered incomplete.
 *
 * @return true if the header or the caption is non-empty, false when both are empty
 */
public boolean isCompleteForTEI() {
    // "||", not "&&": the table is skipped only when BOTH header and caption are empty.
    // Requiring both to be present would also drop tables that have just one of them.
    return StringUtils.isNotEmpty(header) || StringUtils.isNotEmpty(caption);
}

@Override
public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption)) {
if (!isCompleteForTEI()) {
return null;
}

Expand Down Expand Up @@ -106,7 +110,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
addXmlId(desc, "_" + divID);
}

if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();
for (TaggingTokenCluster cluster : clusters) {
Expand Down Expand Up @@ -171,15 +175,15 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

Element noteNode = null;
if (note != null && note.toString().trim().length()>0) {
if (StringUtils.isNotBlank(note)) {

noteNode = XmlBuilderUtils.teiElement("note");
if (config.isGenerateTeiIds()) {
String divID = KeyGen.getKey().substring(0, 7);
addXmlId(noteNode, "_" + divID);
}

if ( (labeledNote != null) && (labeledNote.length() > 0) ) {
if (StringUtils.isNotBlank(labeledNote)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledNote, noteLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();
for (TaggingTokenCluster cluster : clusters) {
Expand Down Expand Up @@ -348,9 +352,14 @@ public String getLabeledNote() {
return this.labeledNote;
}

private boolean validateTable() {
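/**
 * Checks that the table passes the following validations, in order:
 * <ul>
 *   <li>label, header and content are all non-blank;</li>
 *   <li>the label can be parsed to an integer;</li>
 *   <li>the header starts with "tab" (case-insensitive).</li>
 * </ul>
 */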
public boolean validateTable() {
CntManager cnt = Engine.getCntManager();
if (StringUtils.isEmpty(label) || StringUtils.isEmpty(header) || StringUtils.isEmpty(content)) {
if (StringUtils.isAnyBlank(label, header, content)) {
cnt.i(TableRejectionCounters.EMPTY_LABEL_OR_HEADER_OR_CONTENT);
return false;
}
Expand All @@ -361,7 +370,8 @@ private boolean validateTable() {
cnt.i(TableRejectionCounters.CANNOT_PARSE_LABEL_TO_INT);
return false;
}
if (!getHeader().toLowerCase().startsWith("table")) {
// "tab" covers: table, tabelle, tableau, tabella, etc.
if (!StringUtils.startsWithIgnoreCase(getHeader(), "tab")) {
cnt.i(TableRejectionCounters.HEADER_NOT_STARTS_WITH_TABLE_WORD);
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,7 @@ public static List<GraphicObject> getConnectedGraphics(Block block, Document doc
public void postProcessTables() {
for (Table table : tables) {
if (!table.firstCheck()) {
table.setGoodTable(false);
continue;
}

Expand Down Expand Up @@ -919,7 +920,7 @@ public void postProcessTables() {
table.getContentTokens().clear();
table.getContentTokens().addAll(contentResult);

table.secondCheck();
table.setGoodTable(table.secondCheck());
}
}

Expand Down
Loading

0 comments on commit f6642f2

Please sign in to comment.