Skip to content

Commit

Permalink
fix the way table notes are serialized in the XML
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jan 27, 2025
1 parent 19283ef commit 3774737
Showing 1 changed file with 22 additions and 16 deletions.
38 changes: 22 additions & 16 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.grobid.core.GrobidModels;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.table.Cell;
Expand Down Expand Up @@ -32,6 +33,7 @@
import nu.xom.Node;
import nu.xom.Text;

import static org.grobid.core.document.TEIFormatter.isNewParagraph;
import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
import static org.grobid.core.document.xml.XmlBuilderUtils.textNode;
Expand Down Expand Up @@ -169,15 +171,16 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

Element noteNode = null;
if (note != null && note.toString().trim().length()>0) {
if (StringUtils.isNotBlank(note)) {

noteNode = XmlBuilderUtils.teiElement("note");
if (config.isGenerateTeiIds()) {
String divID = KeyGen.getKey().substring(0, 7);
addXmlId(noteNode, "_" + divID);
}

if ( (labeledNote != null) && (labeledNote.length() > 0) ) {
if (StringUtils.isNotEmpty(labeledNote) ) {
Element p = null;
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledNote, noteLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();
for (TaggingTokenCluster cluster : clusters) {
Expand All @@ -186,14 +189,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
if (p == null) {
LOGGER.warn("Problem when serializing TEI fragment for table note, there is a reference at the beginning of the sentence. ");
p = teiElement("p");
}
try {
List<Node> refNodes = formatter.markReferencesTEILuceneBased(
cluster.concatTokens(),
Expand All @@ -203,30 +210,29 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
citationMarkerType);
if (refNodes != null) {
for (Node n : refNodes) {
noteNode.appendChild(n);
p.appendChild(n);
}
}
} catch(Exception e) {
LOGGER.warn("Problem when serializing TEI fragment for table note", e);
}
} else {
noteNode.appendChild(textNode(clusterContent));
if (p == null) {
p = teiElement("p");
} else if (isNewParagraph(clusterLabel, p)) {
noteNode.appendChild(p);
p = teiElement("p");
}
p.appendChild(textNode(clusterContent));
}

if (noteNode != null && config.isWithSentenceSegmentation()) {
if (config.isWithSentenceSegmentation()) {
// we need a sentence segmentation of the table note
formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}

// enclose note content in a <p> element
if (noteNode != null) {
noteNode.setLocalName("p");

Element tabNote = XmlBuilderUtils.teiElement("note");
tabNote.appendChild(noteNode);

noteNode = tabNote;
}
}
if (p != null && p.getChildCount() > 0) {
noteNode.appendChild(p);
}
} else {
noteNode = XmlBuilderUtils.teiElement("note", LayoutTokensUtil.normalizeText(note.toString()).trim());
Expand Down

0 comments on commit 3774737

Please sign in to comment.