Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Research infrastructure recognition #1085

Merged
merged 15 commits into from
Feb 11, 2024
Merged
2 changes: 1 addition & 1 deletion doc/training/header.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ Specific keyword scheme names like "PACS" or "Mathematics Subject Classification


```xml
<keywords>Mathematics Subject Classification: 83C15, 81U15, 81V80, 17B80, 81R12<lb/></keywords>
<keyword>Mathematics Subject Classification: 83C15, 81U15, 81V80, 17B80, 81R12<lb/></keyword>

<keywords type="pacs">PACS numbers: 02.30.Ik, 03.65.Fd Fd<lb/></keywords>
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public class Affiliation {

private String addressString = null; // unspecified address field
private String affiliationString = null; // unspecified affiliation field
private String rawAffiliationString = null; // raw affiliation text (excluding marker)
private String rawAffiliationString = null; // raw affiliation+address text (excluding marker)

private boolean failAffiliation = true; // tag for unresolved affiliation attachment

Expand Down Expand Up @@ -330,15 +330,17 @@ public boolean isNotNull() {
(region == null) &&
(settlement == null) &&
(addrLine == null) &&
(affiliationString == null) &&
(affiliationString == null) &&
(rawAffiliationString == null) &&
(addressString == null));
}

public boolean isNotEmptyAffiliation() {
return !((departments == null) &&
(institutions == null) &&
(laboratories == null) &&
(affiliationString == null));
(affiliationString == null) &&
(rawAffiliationString == null));
}

public boolean hasAddress() {
Expand Down
2 changes: 2 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Funding.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ public String getGrantNumber() {
}

/**
 * Sets the grant number, dropping a leading "n˚" numbering marker when present.
 * <p>
 * Only the marker at the very start of the value is removed; a later occurrence
 * of the same character sequence is kept because it belongs to the identifier
 * text. Whitespace surrounding what remains after the marker is removed is
 * trimmed. A value that does not start with the marker (including
 * {@code null}) is stored unchanged.
 *
 * @param grantNumber the grant number to store, possibly {@code null}
 */
public void setGrantNumber(String grantNumber) {
    final String marker = "n˚";
    if (grantNumber != null && grantNumber.startsWith(marker)) {
        // Strip the prefix only: String.replace() would also delete the marker
        // anywhere else in the value, and would leave a leading space behind
        // for inputs such as "n˚ 12345".
        grantNumber = grantNumber.substring(marker.length()).trim();
    }
    this.grantNumber = grantNumber;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
import org.grobid.core.features.FeatureFactory;
import org.grobid.core.features.FeaturesVectorFulltext;
import org.grobid.core.lang.Language;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.lexicon.Lexicon.OrganizationRecord;
import org.grobid.core.layout.*;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
Expand Down Expand Up @@ -2703,12 +2705,49 @@ private void toTEI(Document doc,
}

if (affiliations != null && affiliations.size() >0) {
tei.append("\n\t\t\t<listOrg type=\"infrastructure\">\n");

// check if we have at least one acknowledged research infrastructure here
List<Affiliation> filteredInfrastructures = new ArrayList<>();
for(Affiliation affiliation : affiliations) {
if (affiliation.isNotEmptyAffiliation() && affiliation.isInfrastructure())
tei.append(Affiliation.toTEI(affiliation, 4, config));
if (affiliation.isNotEmptyAffiliation() && affiliation.isInfrastructure())
filteredInfrastructures.add(affiliation);
else if (affiliation.isNotEmptyAffiliation()) {
// check if this organization is a known infrastructure
List<Lexicon.OrganizationRecord> localOrganizationNamings =
Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString());
if (localOrganizationNamings != null && localOrganizationNamings.size()>0) {
filteredInfrastructures.add(affiliation);
}
}
}

// serialize acknowledged research infrastructure, if any
if (filteredInfrastructures.size() > 0) {
tei.append("\n\t\t\t<listOrg type=\"infrastructure\">\n");
for(Affiliation affiliation : filteredInfrastructures) {
List<Lexicon.OrganizationRecord> localOrganizationNamings =
Lexicon.getInstance().getOrganizationNamingInfo(affiliation.getAffiliationString());
tei.append("\t\t\t\t<org type=\"infrastructure\">");
tei.append("\t\t\t\t\t<orgName type=\"extracted\">");
tei.append(TextUtilities.HTMLEncode(affiliation.getAffiliationString()));
tei.append("</orgName>\n");
if (localOrganizationNamings != null && localOrganizationNamings.size()>0) {
for(Lexicon.OrganizationRecord orgRecord : localOrganizationNamings) {
if (isNotBlank(orgRecord.fullName)) {
tei.append("\t\t\t\t\t<orgName type=\"full\"");
if (isNotBlank(orgRecord.lang))
tei.append(" lang=\"" + orgRecord.lang + "\"");
tei.append(">");
tei.append(TextUtilities.HTMLEncode(orgRecord.fullName));
tei.append("</orgName>\n");
}
}
}
tei.append("\t\t\t\t</org>\n");
}

tei.append("\t\t\t</listOrg>\n");
}
tei.append("\t\t\t</listOrg>\n");
}

// availability statements in header
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ protected FundingAcknowledgementParser() {
try {
String featureVector = FeaturesVectorFunding.addFeatures(tokenizationFunding, null);
res = label(featureVector);
//System.out.println(res);
//System.out.println(res);
} catch (Exception e) {
throw new GrobidException("CRF labeling with table model fails.", e);
}
Expand Down Expand Up @@ -323,14 +323,14 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili

} else if (clusterLabel.equals(FUNDING_INSTITUTION)) {
if (StringUtils.isNotBlank(institution.getAffiliationString())) {
if (institution.isNotNull()) {
//if (institution.isNotNull()) {
institutions.add(institution);
// next funding object
institution = new Affiliation();
}
//}
}

institution.setRawAffiliationString(clusterContent);
institution.setAffiliationString(clusterContent);
institution.appendLayoutTokens(tokens);

Element entity = teiElement("rs");
Expand All @@ -343,14 +343,13 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili

} else if (clusterLabel.equals(FUNDING_INFRASTRUCTURE)) {
if (StringUtils.isNotBlank(institution.getAffiliationString())) {
if (institution.isNotNull()) {
//if (institution.isNotNull()) {
institutions.add(institution);
// next funding object
institution = new Affiliation();
}
//}
}

institution.setRawAffiliationString(clusterContent);
institution.setAffiliationString(clusterContent);
institution.appendLayoutTokens(tokens);
institution.setInfrastructure(true);

Expand Down Expand Up @@ -455,6 +454,12 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili
if (funding.isValid())
fundings.add(funding);

if (institution.isNotNull())
institutions.add(institution);

if (affiliation.isNotNull())
affiliations.add(affiliation);

if (institutions != null && institutions.size() > 0)
affiliations.addAll(institutions);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ public class FeaturesVectorFunding {
public boolean singleChar = false;
public boolean containDash = false;
public boolean knownFunder = false;
public boolean knownInfrastructure = false;
public String punctType = null;
// one of NOPUNCT, OPENBRACKET, ENDBRACKET, DOT, COMMA, HYPHEN, QUOTE, PUNCT (default)
public boolean containPunct = false;
Expand Down Expand Up @@ -73,6 +74,12 @@ public String printVector() {
else
res.append(" 0");

// lexical information (2)
if (knownInfrastructure)
res.append(" 1");
else
res.append(" 0");

// punctuation information (2)
res.append(" " + punctType); // in case the token is a punctuation (NO otherwise)

Expand All @@ -92,6 +99,7 @@ public static String addFeatures(List<LayoutToken> tokens, List<String> tags) th
FeatureFactory featureFactory = FeatureFactory.getInstance();

List<OffsetPosition> funderPositions = Lexicon.getInstance().tokenPositionsFunderNames(tokens);
List<OffsetPosition> infrastructurePositions = Lexicon.getInstance().tokenPositionsResearchInfrastructureNames(tokens);

String line;
StringBuilder stringBuilder = new StringBuilder();
Expand All @@ -100,6 +108,8 @@ public static String addFeatures(List<LayoutToken> tokens, List<String> tags) th

int currentFunderPositions = 0;
boolean isKnownFunderToken = false;
int currentInfrastructurePositions = 0;
boolean isKnownInfrastructureToken = false;
boolean skipTest;

for (int n = 0; n < tokens.size(); n++) {
Expand Down Expand Up @@ -178,6 +188,29 @@ public static String addFeatures(List<LayoutToken> tokens, List<String> tags) th
}
}

// check the position of matches for known infrastructures
if ((infrastructurePositions != null) && (infrastructurePositions.size() > 0)) {
if (currentInfrastructurePositions == infrastructurePositions.size() - 1) {
if (infrastructurePositions.get(currentInfrastructurePositions).end < n) {
skipTest = true;
}
}
if (!skipTest) {
for (int i = currentInfrastructurePositions; i < infrastructurePositions.size(); i++) {
if ((infrastructurePositions.get(i).start <= n) &&
(infrastructurePositions.get(i).end >= n)) {
isKnownInfrastructureToken = true;
currentInfrastructurePositions = i;
break;
} else if (infrastructurePositions.get(i).start > n) {
isKnownInfrastructureToken = false;
currentInfrastructurePositions = i;
break;
}
}
}
}

if (newline) {
features.lineStatus = "LINESTART";
outputLineStatus = true;
Expand Down Expand Up @@ -286,6 +319,9 @@ public static String addFeatures(List<LayoutToken> tokens, List<String> tags) th
if (isKnownFunderToken)
features.knownFunder = true;

if (isKnownInfrastructureToken)
features.knownInfrastructure = true;

if (tag != null)
features.label = tag;

Expand Down
Loading
Loading