diff --git a/.gitignore b/.gitignore index ea482e9f6d..f19abe34f9 100644 --- a/.gitignore +++ b/.gitignore @@ -79,6 +79,7 @@ grobid-home/models/values grobid-home/models/dataseer* grobid-home/models/datasets* grobid-home/models/*-bert*/ +grobid-home/models/*_bert*/ grobid-home/models/*scibert*/ grobid-home/models/context_* diff --git a/Readme.md b/Readme.md index b6b74e46cf..0547371813 100644 --- a/Readme.md +++ b/Readme.md @@ -33,8 +33,9 @@ The following functionalities are available: - __Consolidation/resolution of the extracted bibliographical references__ using the [biblio-glutton](https://github.com/kermitt2/biblio-glutton) service or the [CrossRef REST API](https://github.com/CrossRef/rest-api-doc). In both cases, DOI/PMID resolution performance is higher than 0.95 F1-score from PDF extraction. - __Extraction and parsing of patent and non-patent references in patent__ publications. - __Extraction of Funders and funding information__ with optional matching of extracted funders with the CrossRef Funder Registry. +- __Identification of copyrights' owner and license associated to the document__, e.g. publisher or authors copyrights, CC-BY/CC-BY-NC/etc. license. -In a complete PDF processing, GROBID manages 55 final labels used to build relatively fine-grained structures, from traditional publication metadata (title, author first/last/middle names, affiliation types, detailed address, journal, volume, issue, pages, DOI, PMID, etc.) to full text structures (section title, paragraph, reference markers, head/foot notes, figure captions, etc.). +In a complete PDF processing, GROBID manages 68 final labels used to build relatively fine-grained structures, from traditional publication metadata (title, author first/last/middle names, affiliation types, detailed address, journal, volume, issue, pages, DOI, PMID, etc.) to full text structures (section title, paragraph, reference markers, head/foot notes, figure captions, etc.). 
GROBID includes a comprehensive [web service API](https://grobid.readthedocs.io/en/latest/Grobid-service/), [Docker images](https://grobid.readthedocs.io/en/latest/Grobid-docker/), [batch processing](https://grobid.readthedocs.io/en/latest/Grobid-batch/), a JAVA API, a generic [training and evaluation framework](https://grobid.readthedocs.io/en/latest/Training-the-models-of-Grobid/) (precision, recall, etc., n-fold cross-evaluation), systematic [end-to-end benchmarking](https://grobid.readthedocs.io/en/latest/Benchmarking/) on thousand documents and the semi-automatic generation of training data. @@ -108,7 +109,7 @@ A series of additional modules have been developed for performing __structure aw - [grobid-quantities](https://github.com/kermitt2/grobid-quantities): recognition and normalization of physical quantities/measurements - [grobid-superconductors](https://github.com/lfoppiano/grobid-superconductors): recognition of superconductor material and properties in scientific literature - [entity-fishing](https://github.com/kermitt2/entity-fishing), a tool for extracting Wikidata entities from text and document, which can also use Grobid to pre-process scientific articles in PDF, leading to more precise and relevant entity extraction and the capacity to annotate the PDF with interactive layout -- [dataseer-ml](https://github.com/dataseer/dataseer-ml): identification of sections and sentences introducing datasets in a scientific article, and classification of the type of these datasets +- [datastet](https://github.com/kermitt2/datastet): identification of sections and sentences introducing datasets in a scientific article, identification of dataset names (implicit and named datasets) and classification of the type of these datasets - [grobid-ner](https://github.com/kermitt2/grobid-ner): named entity recognition - [grobid-astro](https://github.com/kermitt2/grobid-astro): recognition of astronomical entities in scientific papers - 
[grobid-bio](https://github.com/kermitt2/grobid-bio): a toy bio-entity tagger using BioNLP/NLPBA 2004 dataset @@ -143,7 +144,7 @@ If you want to cite this work, please refer to the present GitHub project, toget title = {GROBID}, howpublished = {\url{https://github.com/kermitt2/grobid}}, publisher = {GitHub}, - year = {2008--2023}, + year = {2008--2024}, archivePrefix = {swh}, eprint = {1:dir:dab86b296e3c3216e2241968f0d63b68e8209d3c} } diff --git a/doc/Grobid-service.md b/doc/Grobid-service.md index ccd333a571..631191d3b4 100644 --- a/doc/Grobid-service.md +++ b/doc/Grobid-service.md @@ -138,6 +138,7 @@ Extract the header of the input PDF document, normalize it and convert it into a | POST, PUT | `multipart/form-data` | `application/xml` | `input` | required | PDF file to be processed | | | | | `consolidateHeader` | optional | consolidateHeader is a string of value `0` (no consolidation), `1` (consolidate and inject all extra metadata, default value), `2` (consolidate the header and inject DOI only), or `3` (consolidate using only extracted DOI - if extracted) . | | | | | `includeRawAffiliations` | optional | `includeRawAffiliations` is a boolean value, `0` (default, do not include raw affiliation string in the result) or `1` (include raw affiliation string in the result). | +| | | | `includeRawCopyrights` | optional | `includeRawCopyrights` is a boolean value, `0` (default, do not include raw copyrights/license string in the result) or `1` (include raw copyrights/license string in the result). | Use `Accept: application/x-bibtex` to retrieve BibTeX format instead of TEI (note: the TEI XML format is much richer, it should be preferred if there is no particular reason to use BibTeX). 
@@ -177,6 +178,7 @@ Convert the complete input document into TEI XML format (header, body and biblio | | | | `consolidatFunders` | optional | `consolidateFunders` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the funder and inject DOI only). | | | | | `includeRawCitations` | optional | `includeRawCitations` is a boolean value, `0` (default, do not include raw reference string in the result) or `1` (include raw reference string in the result). | | | | | `includeRawAffiliations` | optional | `includeRawAffiliations` is a boolean value, `0` (default, do not include raw affiliation string in the result) or `1` (include raw affiliation string in the result). | +| | | | `includeRawCopyrights` | optional | `includeRawCopyrights` is a boolean value, `0` (default, do not include raw copyrights/license string in the result) or `1` (include raw copyrights/license string in the result). | | | | | `teiCoordinates` | optional | list of element names for which coordinates in the PDF document have to be added, see [Coordinates of structures in the original PDF](Coordinates-in-PDF.md) for more details | | | | | `segmentSentences` | optional | Paragraphs structures in the resulting TEI will be further segmented into sentence elements | | | | | `start` | optional | Start page number of the PDF to be considered, previous pages will be skipped/ignored, integer with first page starting at `1`, (default `-1`, start from the first page of the PDF) | @@ -220,6 +222,8 @@ Regarding the bibliographical references, it is possible to include the original curl -v --form input=@./thefile.pdf --form includeRawCitations=1 localhost:8070/api/processFulltextDocument ``` +Similar raw strings can be added in the result for affiliation and copyrights/license sections. 
+ Example with requested additional sentence segmentation of the paragraph with bounding box coordinates of the sentence structures: ```console diff --git a/grobid-core/src/main/java/org/grobid/core/GrobidModels.java b/grobid-core/src/main/java/org/grobid/core/GrobidModels.java index 32e90ccd9d..374552e55a 100755 --- a/grobid-core/src/main/java/org/grobid/core/GrobidModels.java +++ b/grobid-core/src/main/java/org/grobid/core/GrobidModels.java @@ -51,7 +51,9 @@ public enum GrobidModels implements GrobidModel { //ACKNOWLEDGEMENT("acknowledgement"), FUNDING_ACKNOWLEDGEMENT("funding-acknowledgement"), INFRASTRUCTURE("infrastructure"), - DUMMY("none"); + DUMMY("none"), + LICENSE("license"), + COPYRIGHT("copyright"); //I cannot declare it before public static final String DUMMY_FOLDER_LABEL = "none"; diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index bf065d1f4a..7bd030f923 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -6,6 +6,7 @@ import org.grobid.core.data.util.AuthorEmailAssigner; import org.grobid.core.data.util.ClassicAuthorEmailAssigner; import org.grobid.core.data.util.EmailSanitizer; +import org.grobid.core.data.CopyrightsLicense; import org.grobid.core.document.*; import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.exceptions.GrobidException; @@ -376,6 +377,9 @@ public String toString() { // Availability statement private String availabilityStmt = null; + // Copyrights/license information object + CopyrightsLicense copyrightsLicense = null; + public static final List confPrefixes = Arrays.asList("Proceedings of", "proceedings of", "In Proceedings of the", "In: Proceeding of", "In Proceedings, ", "In Proceedings of", "In Proceeding of", "in Proceeding of", "in Proceeding", "In Proceeding", "Proceedings", @@ -4477,4 +4481,12 @@ public void 
setAvailabilityStmt(String availabilityStmt) { public List> getAffiliationAddresslabeledTokens() { return affiliationAddresslabeledTokens; } + + public void setCopyrightsLicense(CopyrightsLicense copyrightsLicense) { + this.copyrightsLicense = copyrightsLicense; + } + + public CopyrightsLicense getCopyrightsLicense() { + return this.copyrightsLicense; + } }
diff --git a/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java b/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java
new file mode 100644
index 0000000000..9ea5f5331f
--- /dev/null
+++ b/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java
@@ -0,0 +1,106 @@
+package org.grobid.core.data;
+
+import org.grobid.core.utilities.TextUtilities;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Arrays;
+import java.util.Collections;
+
+/**
+ * Class for representing information related to copyrights owner and file license.
+ */
+public class CopyrightsLicense {
+
+    // copyrights owner
+    public enum CopyrightsOwner {
+        PUBLISHER ("publisher"),
+        AUTHORS ("authors"),
+        UNDECIDED ("undecided");
+
+        private final String name;
+
+        private CopyrightsOwner(String name) {
+            this.name = name;
+        }
+
+        public String getName() {
+            return name;
+        }
+    }
+
+    // owner labels, LicenseClassifier looks up one score per label and maps the selected
+    // label to a CopyrightsOwner constant by upper-casing it
+    public static final List<String> copyrightOwners =
+        Collections.unmodifiableList(Arrays.asList("publisher", "authors", "undecided"));
+
+    // File-level licenses
+    public enum License {
+        CC0 ("CC-0"),
+        CCBY ("CC-BY"),
+        CCBYNC ("CC-BY-NC"),
+        CCBYNCND ("CC-BY-NC-ND"),
+        CCBYSA ("CC-BY-SA"),
+        CCBYNCSA ("CC-BY-NC-SA"),
+        CCBYND ("CC-BY-ND"),
+        COPYRIGHT ("strict-copyrights"),
+        OTHER ("other"),
+        UNDECIDED ("undecided");
+
+        private final String name;
+
+        private License(String name) {
+            this.name = name;
+        }
+
+        public String getName() {
+            return name;
+        }
+    }
+
+    // license labels, LicenseClassifier maps the selected label to a License constant by
+    // removing the hyphens and upper-casing it; "copyright" therefore refers to
+    // License.COPYRIGHT, whose getName() is "strict-copyrights"
+    public static final List<String> licenses =
+        Collections.unmodifiableList(Arrays.asList("CC-0", "CC-BY", "CC-BY-NC", "CC-BY-NC-ND", "CC-BY-SA",
+            "CC-BY-NC-SA", "CC-BY-ND", "copyright", "other", "undecided"));
+
+    private CopyrightsOwner copyrightsOwner;
+    // score associated to the copyrights owner
+    private double copyrightsOwnerProb;
+    private License license;
+    // score associated to the license
+    private double licenseProb;
+
+    public CopyrightsOwner getCopyrightsOwner() {
+        return this.copyrightsOwner;
+    }
+
+    public void setCopyrightsOwner(CopyrightsOwner owner) {
+        this.copyrightsOwner = owner;
+    }
+
+    public double getCopyrightsOwnerProb() {
+        return this.copyrightsOwnerProb;
+    }
+
+    public void setCopyrightsOwnerProb(double prob) {
+        this.copyrightsOwnerProb = prob;
+    }
+
+    public License getLicense() {
+        return this.license;
+    }
+
+    public void setLicense(License license) {
+        this.license = license;
+    }
+
+    public double getLicenseProb() {
+        return this.licenseProb;
+    }
+
+    public void setLicenseProb(double prob) {
+        this.licenseProb = prob;
+    }
+}
\ No newline at end of file
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 22390fe1f9..f66baaa0c0 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -14,6 +14,8 @@ import nu.xom.Text; import org.grobid.core.GrobidModels; +import org.grobid.core.data.CopyrightsLicense.License; +import org.grobid.core.data.CopyrightsLicense.CopyrightsOwner; import org.grobid.core.data.Date; import org.grobid.core.data.*; import org.grobid.core.document.xml.XmlBuilderUtils; @@ -248,28 +250,75 @@ public StringBuilder toTEIHeader(BiblioItem biblio, if ((biblio.getPublisher() != null) || (biblio.getPublicationDate() != null) || - (biblio.getNormalizedPublicationDate() != null)) { + (biblio.getNormalizedPublicationDate() != null) || + biblio.getCopyrightsLicense() != null) { tei.append("\t\t\t\n"); + + CopyrightsLicense copyrightsLicense = biblio.getCopyrightsLicense(); + if (biblio.getPublisher() != null) { // publisher and date under for better TEI conformance tei.append("\t\t\t\t" + TextUtilities.HTMLEncode(biblio.getPublisher()) + "\n"); - - 
tei.append("\t\t\t\t"); - tei.append("

Copyright "); - //if (biblio.getPublicationDate() != null) - tei.append(TextUtilities.HTMLEncode(biblio.getPublisher()) + "

\n"); - tei.append("\t\t\t\t
\n"); } else { // a dummy publicationStmt is still necessary according to TEI tei.append("\t\t\t\t\n"); - if (defaultPublicationStatement == null) { - tei.append("\t\t\t\t"); + } + + // We introduce something more meaningful with TEI customization to encode copyrights information: + // - @resp with value "publisher", "authors", "unknown", we add a comment to clarify that @resp + // should be interpreted as the copyrights owner + // - license related to copyrights exception is encoded via + // (note: I have no clue what can mean "free" as status for a document - there are always some sort of + // restrictions like moral rights even for public domain documents) + if (copyrightsLicense != null) { + tei.append("\t\t\t\t\n"); + if (addCopyrightsComment) { + tei.append("\t\t\t\t\t\n"); + } + tei.append("\t\t\t\t\t"+copyrightsLicense.getLicense().getName()+"\n"); } else { - tei.append("\t\t\t\t

" + - TextUtilities.HTMLEncode(defaultPublicationStatement) + "

"); + tei.append(" status=\"unknown\">\n"); + if (addCopyrightsComment) { + tei.append("\t\t\t\t\t\n"); + } + tei.append("\t\t\t\t\t\n"); } - tei.append("\n"); + + if (config.getIncludeRawCopyrights() && biblio.getCopyright() != null && biblio.getCopyright().length()>0) { + tei.append("\t\t\t\t\t

"); + tei.append(TextUtilities.HTMLEncode(biblio.getCopyright())); + tei.append("\n"); + } + + tei.append("\t\t\t\t\n"); + } else { + tei.append("\t\t\t\t\n"); + tei.append("\t\t\t\t\t\n"); + + if (defaultPublicationStatement != null) { + tei.append("\t\t\t\t\t

" + + TextUtilities.HTMLEncode(defaultPublicationStatement) + "

\n"); + } + + if (config.getIncludeRawCopyrights() && biblio.getCopyright() != null && biblio.getCopyright().length()>0) { + tei.append("\t\t\t\t\t

"); + tei.append(TextUtilities.HTMLEncode(biblio.getCopyright())); + tei.append("\n"); + } + + tei.append("\t\t\t\t\n"); } if (biblio.getNormalizedPublicationDate() != null) { diff --git a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java index eb667e878a..949e2d63a5 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java @@ -350,6 +350,7 @@ public String processHeader( String inputFile, int consolidate, boolean includeRawAffiliations, + boolean includeRawCopyrights, BiblioItem result ) { GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() @@ -357,6 +358,7 @@ public String processHeader( .endPage(2) .consolidateHeader(consolidate) .includeRawAffiliations(includeRawAffiliations) + .includeRawCopyrights(includeRawCopyrights) .build(); return processHeader(inputFile, null, config, result); } @@ -380,12 +382,14 @@ public String processHeaderFunding( File inputFile, int consolidateHeader, int consolidateFunders, - boolean includeRawAffiliations + boolean includeRawAffiliations, + boolean includeRawCopyrights ) throws Exception { GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() .consolidateHeader(consolidateHeader) .consolidateFunders(consolidateFunders) .includeRawAffiliations(includeRawAffiliations) + .includeRawCopyrights(includeRawCopyrights) .build(); return processHeaderFunding(inputFile, null, config); } @@ -408,6 +412,7 @@ public String processHeader( String md5Str, int consolidate, boolean includeRawAffiliations, + boolean includeRawCopyrights, BiblioItem result ) { GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() @@ -415,6 +420,7 @@ public String processHeader( .endPage(2) .consolidateHeader(consolidate) .includeRawAffiliations(includeRawAffiliations) + .includeRawCopyrights(includeRawCopyrights) 
.build(); return processHeader(inputFile, md5Str, config, result); } @@ -440,12 +446,14 @@ public String processHeaderFunding( String md5Str, int consolidateHeader, int consolidateFunders, - boolean includeRawAffiliations + boolean includeRawAffiliations, + boolean includeRawCopyrights ) throws Exception { GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() .consolidateHeader(consolidateHeader) .consolidateFunders(consolidateFunders) .includeRawAffiliations(includeRawAffiliations) + .includeRawCopyrights(includeRawCopyrights) .build(); return processHeaderFunding(inputFile, md5Str, config); } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 2db68597cc..185f3714d5 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -8,6 +8,7 @@ import org.grobid.core.data.Date; import org.grobid.core.data.Keyword; import org.grobid.core.data.Person; +import org.grobid.core.data.CopyrightsLicense; import org.grobid.core.document.*; import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.engines.label.SegmentationLabels; @@ -309,6 +310,15 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, } } + // copyrights/license identification + if (resHeader.getCopyright() != null && resHeader.getCopyright().length()>0) { + if (GrobidProperties.getGrobidEngineName("copyright").equals("delft")) { + CopyrightsLicense copyrightsLicense = LicenseClassifier.getInstance().classify(resHeader.getCopyright()); + if (copyrightsLicense != null) + resHeader.setCopyrightsLicense(copyrightsLicense); + } + } + resHeader = consolidateHeader(resHeader, config.getConsolidateHeader()); // we don't need to serialize if we process the full text (it would be done 2 times) diff --git 
a/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java b/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java
new file mode 100644
index 0000000000..0672d994e8
--- /dev/null
+++ b/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java
@@ -0,0 +1,206 @@
+package org.grobid.core.engines;
+
+import java.util.*;
+
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import org.grobid.core.data.CopyrightsLicense;
+import org.grobid.core.data.CopyrightsLicense.CopyrightsOwner;
+import org.grobid.core.data.CopyrightsLicense.License;
+import org.grobid.core.utilities.GrobidProperties;
+import org.grobid.core.jni.DeLFTClassifierModel;
+
+import com.fasterxml.jackson.core.*;
+import com.fasterxml.jackson.databind.*;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Predicts the copyrights owner and the license expressed by a copyright/license statement.
+ * Two DeLFT classifiers are applied (models "copyright" and "license") and, for each of them,
+ * the label with the highest score strictly above the decision threshold is kept; when no
+ * label qualifies, the result is "undecided" with the score the classifier gave to that label.
+ */
+public class LicenseClassifier {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(LicenseClassifier.class);
+
+    // a label is selected only when its score is strictly above this value
+    private static final double DECISION_THRESHOLD = 0.5;
+
+    // fallback label when no score passes the threshold
+    private static final String UNDECIDED_LABEL = "undecided";
+
+    // shared JSON reader, avoids building a new ObjectMapper for every call
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    // multi-class/multi-label classifiers
+    private final DeLFTClassifierModel classifierCopyrightsOwner;
+    private final DeLFTClassifierModel classifierLicense;
+
+    private static volatile LicenseClassifier instance;
+
+    /**
+     * @return the shared instance, created on the first call
+     */
+    public static LicenseClassifier getInstance() {
+        if (instance == null) {
+            synchronized (LicenseClassifier.class) {
+                if (instance == null) {
+                    getNewInstance();
+                }
+            }
+        }
+        return instance;
+    }
+
+    /**
+     * Create a new instance.
+     */
+    private static synchronized void getNewInstance() {
+        instance = new LicenseClassifier();
+    }
+
+    private LicenseClassifier() {
+        this.classifierCopyrightsOwner = new DeLFTClassifierModel("copyright", GrobidProperties.getDelftArchitecture("copyright"));
+        this.classifierLicense = new DeLFTClassifierModel("license", GrobidProperties.getDelftArchitecture("license"));
+    }
+
+    /**
+     * Classify a simple piece of text.
+     *
+     * @param text the copyright/license statement
+     * @return the predicted owner and license with their scores, or null when the text is
+     *         empty or when no prediction could be read from the classifier output
+     */
+    public CopyrightsLicense classify(String text) throws Exception {
+        if (StringUtils.isEmpty(text))
+            return null;
+        List<String> texts = new ArrayList<>();
+        texts.add(text);
+        List<CopyrightsLicense> results = classify(texts);
+        // extractResults() yields an empty list when the classifier output is not usable,
+        // get(0) would then fail with IndexOutOfBoundsException
+        if (CollectionUtils.isEmpty(results))
+            return null;
+        return results.get(0);
+    }
+
+    /**
+     * Classify a list of texts.
+     *
+     * @param texts the copyright/license statements
+     * @return one prediction per classified text, or null when the input list is null or empty
+     */
+    public List<CopyrightsLicense> classify(List<String> texts) throws Exception {
+        if (CollectionUtils.isEmpty(texts))
+            return null;
+
+        LOGGER.info("classify: " + texts.size());
+
+        String copyrightOwnerAsJson = this.classifierCopyrightsOwner.classify(texts);
+        String licencesAsJson = this.classifierLicense.classify(texts);
+
+        return extractResults(copyrightOwnerAsJson, licencesAsJson);
+    }
+
+    /**
+     * Read the JSON outputs of the two classifiers and build one result per entry of their
+     * "classifications" arrays, entries of the two outputs being paired by position.
+     *
+     * @param copyrightOwnerAsJson JSON output of the copyrights owner classifier
+     * @param licencesAsJson JSON output of the license classifier
+     * @return the predictions, empty if an output is null, has no "classifications" or is not valid JSON
+     */
+    protected static List<CopyrightsLicense> extractResults(String copyrightOwnerAsJson, String licencesAsJson) {
+        List<CopyrightsLicense> results = new ArrayList<>();
+        if (copyrightOwnerAsJson == null || licencesAsJson == null)
+            return results;
+
+        try {
+            JsonNode rootCopyrights = MAPPER.readTree(copyrightOwnerAsJson);
+            JsonNode rootLicenses = MAPPER.readTree(licencesAsJson);
+            if (rootCopyrights == null || rootLicenses == null)
+                return results;
+
+            JsonNode classificationsNodeCopyrights = rootCopyrights.findPath("classifications");
+            JsonNode classificationsNodeLicenses = rootLicenses.findPath("classifications");
+            if (classificationsNodeCopyrights.isMissingNode() || classificationsNodeLicenses.isMissingNode())
+                return results;
+
+            Iterator<JsonNode> ite1 = classificationsNodeCopyrights.elements();
+            Iterator<JsonNode> ite2 = classificationsNodeLicenses.elements();
+            // stop at the shortest output instead of failing with NoSuchElementException
+            while (ite1.hasNext() && ite2.hasNext()) {
+                CopyrightsLicense result = new CopyrightsLicense();
+
+                // set best copyright owner with prob
+                ScoredLabel bestOwner = selectBestLabel(ite1.next(), CopyrightsLicense.copyrightOwners);
+                result.setCopyrightsOwner(CopyrightsOwner.valueOf(bestOwner.label.toUpperCase(Locale.ROOT)));
+                result.setCopyrightsOwnerProb(bestOwner.score);
+
+                // set best license with prob, enum constants carry no hyphen ("CC-BY-NC" is CCBYNC)
+                ScoredLabel bestLicense = selectBestLabel(ite2.next(), CopyrightsLicense.licenses);
+                String licenseConstant = bestLicense.label.replace("-", "").toUpperCase(Locale.ROOT);
+                result.setLicense(License.valueOf(licenseConstant));
+                result.setLicenseProb(bestLicense.score);
+
+                results.add(result);
+            }
+        } catch(JsonProcessingException e) {
+            LOGGER.error("failed to parse JSON copyrights/licenses classification result", e);
+        }
+
+        return results;
+    }
+
+    /**
+     * Pick, among the given labels, the one with the highest score strictly above the decision
+     * threshold (the first one in list order in case of equal scores). Each score is looked up
+     * by label name, so that a label absent from the JSON counts as 0.0 without shifting the
+     * scores of the other labels.
+     *
+     * @param classification JSON object holding one score per label
+     * @param labels candidate labels
+     * @return the selected label and its score, or the "undecided" label and its score
+     */
+    private static ScoredLabel selectBestLabel(JsonNode classification, List<String> labels) {
+        String bestLabel = null;
+        double bestProb = 0.0;
+        for (String label : labels) {
+            double score = scoreOf(classification, label);
+            if (score > DECISION_THRESHOLD && score > bestProb) {
+                bestLabel = label;
+                bestProb = score;
+            }
+        }
+
+        if (bestLabel == null)
+            return new ScoredLabel(UNDECIDED_LABEL, scoreOf(classification, UNDECIDED_LABEL));
+        return new ScoredLabel(bestLabel, bestProb);
+    }
+
+    /**
+     * @return the score found under the given label, 0.0 if the label is not present
+     */
+    private static double scoreOf(JsonNode classification, String label) {
+        JsonNode fieldNode = classification.findPath(label);
+        if ((fieldNode == null) || fieldNode.isMissingNode())
+            return 0.0;
+        return fieldNode.doubleValue();
+    }
+
+    // a label together with the score given by the classifier
+    private static final class ScoredLabel {
+        private final String label;
+        private final double score;
+
+        private ScoredLabel(String label, double score) {
+            this.label = label;
+            this.score = score;
+        }
+    }
+
+}
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java b/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java index 147c5cf2bb..4323618f82 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java @@ -91,7 +91,7 @@ public List extract(Document doc, SortedSet texts, Map> articlesBySegment = new HashMap<>(); // sub-segment texts if a DL model will be applied. Use the max sequence length for size limit - if (GrobidProperties.getGrobidCRFEngineName("patent-citation").equals("delft")) { + if (GrobidProperties.getGrobidEngineName("patent-citation").equals("delft")) { List newTexts = new ArrayList<>(); int maxSequence = GrobidProperties.getDelftTrainingMaxSequenceLength("patent-citation"); for(String text : texts) { diff --git a/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java b/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java index 2cfb398ddb..639b1011fc 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java @@ -25,7 +25,7 @@ public class TaggerFactory { private TaggerFactory() {} public static synchronized GenericTagger getTagger(GrobidModel model) { - return getTagger(model, GrobidProperties.getGrobidCRFEngine(model), GrobidProperties.getDelftArchitecture(model)); + return getTagger(model, GrobidProperties.getGrobidEngine(model), GrobidProperties.getDelftArchitecture(model)); } public static synchronized GenericTagger getTagger(GrobidModel model, 
GrobidCRFEngine engine) { diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java index 05048224f1..c32739a43f 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java @@ -588,7 +588,8 @@ public static File getPdfaltoPath() { return pathToPdfalto; } - public static String getGrobidCRFEngineName(final String modelName) { + + public static String getGrobidEngineName(final String modelName) { ModelParameters param = modelMap.get(modelName); if (param == null) { LOGGER.debug("No configuration parameter defined for model " + modelName); @@ -597,16 +598,16 @@ public static String getGrobidCRFEngineName(final String modelName) { return param.engine; } - public static GrobidCRFEngine getGrobidCRFEngine(final String modelName) { - String engineName = getGrobidCRFEngineName(modelName); + public static GrobidCRFEngine getGrobidEngine(final String modelName) { + String engineName = getGrobidEngineName(modelName); if (engineName == null) return null; else return GrobidCRFEngine.get(engineName); } - public static GrobidCRFEngine getGrobidCRFEngine(final GrobidModel model) { - return getGrobidCRFEngine(model.getModelName()); + public static GrobidCRFEngine getGrobidEngine(final GrobidModel model) { + return getGrobidEngine(model.getModelName()); } public static File getModelPath(final GrobidModel model) { @@ -614,7 +615,7 @@ public static File getModelPath(final GrobidModel model) { // model is not specified in the config, ignoring return null; } - String extension = getGrobidCRFEngine(model).getExt(); + String extension = getGrobidEngine(model).getExt(); return new File(getGrobidHome(), FOLDER_NAME_MODELS + File.separator + model.getFolderName() + File.separator + FILE_NAME_MODEL + "." 
+ extension); diff --git a/grobid-core/src/test/java/org/grobid/core/engines/LicenseClassifierTest.java b/grobid-core/src/test/java/org/grobid/core/engines/LicenseClassifierTest.java new file mode 100644 index 0000000000..3aa0fe83c3 --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/engines/LicenseClassifierTest.java @@ -0,0 +1,154 @@ +package org.grobid.core.engines; + +import org.grobid.core.data.CopyrightsLicense; +import org.junit.Test; + +import java.util.List; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.Matchers.hasSize; +import static org.junit.Assert.assertThat; + +public class LicenseClassifierTest { + + + @Test + public void testExtractResults_shouldExtractPublishers_undecided() { + String copyrightOwnerAsJson = "{\n" + + " \"classifications\": [\n" + + " {\n" + + " \"authors\": 0.0022506017703562975,\n" + + " \"publisher\": 0.9961154460906982,\n" + + " \"text\": \"© 2015 IOP Publishing Ltd\",\n" + + " \"undecided\": 0.009332857094705105\n" + + " }\n" + + " ],\n" + + " \"date\": \"2024-02-01T15:22:17.353931\",\n" + + " \"model\": \"copyright_gru\",\n" + + " \"software\": \"DeLFT\"\n" + + "}"; + + String licencesAsJson = "{\n" + + " \"classifications\": [\n" + + " {\n" + + " \"CC-0\": 5.753641289629741e-06,\n" + + " \"CC-BY\": 0.002589514711871743,\n" + + " \"CC-BY-NC\": 0.0008843864197842777,\n" + + " \"CC-BY-NC-ND\": 0.00015740084927529097,\n" + + " \"CC-BY-NC-SA\": 0.002522438997402787,\n" + + " \"CC-BY-ND\": 0.00047874293522909284,\n" + + " \"CC-BY-SA\": 0.0004411475674714893,\n" + + " \"copyright\": 0.004834720399230719,\n" + + " \"other\": 3.192744770785794e-05,\n" + + " \"text\": \"© 2015 IOP Publishing Ltd\",\n" + + " \"undecided\": 0.9963551759719849\n" + + " }\n" + + " ],\n" + + " \"date\": \"2024-02-01T15:22:31.070649\",\n" + + " \"model\": \"license_gru\",\n" + + " \"software\": \"DeLFT\"\n" + + "}"; + + List copyrightsLicenses = LicenseClassifier.extractResults(copyrightOwnerAsJson, licencesAsJson); + 
+ assertThat(copyrightsLicenses, hasSize(1)); + assertThat(copyrightsLicenses.get(0).getLicense(), is(CopyrightsLicense.License.UNDECIDED)); + assertThat(copyrightsLicenses.get(0).getLicenseProb(), is(0.9963551759719849)); + assertThat(copyrightsLicenses.get(0).getCopyrightsOwner(), is(CopyrightsLicense.CopyrightsOwner.PUBLISHER)); + assertThat(copyrightsLicenses.get(0).getCopyrightsOwnerProb(), is(0.9961154460906982)); + + } + + @Test + public void testExtractResults_shouldReturnAuthors_ccby() { + String copyrightOwnerAsJson = "{\n" + + " \"classifications\": [\n" + + " {\n" + + " \"authors\": 0.9663094878196716,\n" + + " \"publisher\": 0.033012233674526215,\n" + + " \"text\": \"© 2020 The Authors. Published by Elsevier Ltd. This is an open access article under the CC BY license (http://creativecommons.org/licenses/BY/4.0/). T\",\n" + + " \"undecided\": 0.005560279358178377\n" + + " }\n" + + " ],\n" + + " \"date\": \"2024-02-01T09:45:49.755983\",\n" + + " \"model\": \"copyright_gru\",\n" + + " \"software\": \"DeLFT\"\n" + + "}"; + + String licencesAsJson = "{\n" + + " \"classifications\": [\n" + + " {\n" + + " \"CC-0\": 2.471400932790857e-07,\n" + + " \"CC-BY\": 0.9981574416160583,\n" + + " \"CC-BY-NC\": 0.0009365379810333252,\n" + + " \"CC-BY-NC-ND\": 0.0003149482945445925,\n" + + " \"CC-BY-NC-SA\": 1.9512295693857595e-05,\n" + + " \"CC-BY-ND\": 0.00010157905489904806,\n" + + " \"CC-BY-SA\": 0.0007704910240136087,\n" + + " \"copyright\": 0.0026725931093096733,\n" + + " \"other\": 0.003816531505435705,\n" + + " \"text\": \"© 2020 The Authors. Published by Elsevier Ltd. This is an open access article under the CC BY license (http://creativecommons.org/licenses/BY/4.0/). 
T\",\n" + + " \"undecided\": 0.0006339686224237084\n" + + " }\n" + + " ],\n" + + " \"date\": \"2024-02-01T09:46:03.644485\",\n" + + " \"model\": \"license_gru\",\n" + + " \"software\": \"DeLFT\"\n" + + "}"; + List copyrightsLicenses = LicenseClassifier.extractResults(copyrightOwnerAsJson, licencesAsJson); + + assertThat(copyrightsLicenses, hasSize(1)); + assertThat(copyrightsLicenses.get(0).getLicense(), is(CopyrightsLicense.License.CCBY)); + assertThat(copyrightsLicenses.get(0).getLicenseProb(), is(0.9981574416160583)); + assertThat(copyrightsLicenses.get(0).getCopyrightsOwner(), is(CopyrightsLicense.CopyrightsOwner.AUTHORS)); + assertThat(copyrightsLicenses.get(0).getCopyrightsOwnerProb(), is(0.9663094878196716)); + + } + + @Test + public void testExtractResults_shouldReturnUndecided_copyright() { + String copyrightOwnerAsJson = "{\n" + + " \"classifications\": [\n" + + " {\n" + + " \"authors\": 0.5,\n" + + " \"publisher\": 0.5,\n" + + " \"text\": \"© 2020 The Authors. Published by Elsevier Ltd. This is an open access article under the CC BY license (http://creativecommons.org/licenses/BY/4.0/). T\",\n" + + " \"undecided\": 0.005560279358178377\n" + + " }\n" + + " ],\n" + + " \"date\": \"2024-02-01T09:45:49.755983\",\n" + + " \"model\": \"copyright_gru\",\n" + + " \"software\": \"DeLFT\"\n" + + "}"; + + String licencesAsJson = "{\n" + + " \"classifications\": [\n" + + " {\n" + + " \"CC-0\": 2.471400932790857e-07,\n" + + " \"CC-BY\": 0.5,\n" + + " \"CC-BY-NC\": 0.5,\n" + + " \"CC-BY-NC-ND\": 0.0003149482945445925,\n" + + " \"CC-BY-NC-SA\": 1.9512295693857595e-05,\n" + + " \"CC-BY-ND\": 0.00010157905489904806,\n" + + " \"CC-BY-SA\": 0.0007704910240136087,\n" + + " \"copyright\": 0.0026725931093096733,\n" + + " \"other\": 0.003816531505435705,\n" + + " \"text\": \"© 2020 The Authors. Published by Elsevier Ltd. This is an open access article under the CC BY license (http://creativecommons.org/licenses/BY/4.0/). 
T\",\n" + + " \"undecided\": 0.0006339686224237084\n" + + " }\n" + + " ],\n" + + " \"date\": \"2024-02-01T09:46:03.644485\",\n" + + " \"model\": \"license_gru\",\n" + + " \"software\": \"DeLFT\"\n" + + "}"; + List copyrightsLicenses = LicenseClassifier.extractResults(copyrightOwnerAsJson, licencesAsJson); + + assertThat(copyrightsLicenses, hasSize(1)); + assertThat(copyrightsLicenses.get(0).getLicense(), is(CopyrightsLicense.License.UNDECIDED)); + assertThat(copyrightsLicenses.get(0).getLicenseProb(), is(0.0006339686224237084)); + assertThat(copyrightsLicenses.get(0).getCopyrightsOwner(), is(CopyrightsLicense.CopyrightsOwner.UNDECIDED)); + assertThat(copyrightsLicenses.get(0).getCopyrightsOwnerProb(), is(0.005560279358178377)); + } + + +} \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/GrobidPropertiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/GrobidPropertiesTest.java index 8d06535baa..65c70a14a3 100755 --- a/grobid-core/src/test/java/org/grobid/core/utilities/GrobidPropertiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/GrobidPropertiesTest.java @@ -97,7 +97,7 @@ public void testgetNBThreadsShouldReturnAvailableProcessorsIfZero() { @Test public void testShouldReturnModelPathWithExtension() { GrobidModels model = GrobidModels.DATE; - String extension = GrobidProperties.getGrobidCRFEngine(model).getExt(); + String extension = GrobidProperties.getGrobidEngine(model).getExt(); assertEquals( "model path for " + model.name(), new File(GrobidProperties.getGrobidHome(), diff --git a/grobid-home/config/grobid.yaml b/grobid-home/config/grobid.yaml index 7e8de0b656..86cb10a2cb 100644 --- a/grobid-home/config/grobid.yaml +++ b/grobid-home/config/grobid.yaml @@ -273,6 +273,28 @@ grobid: max_sequence_length: 500 batch_size: 40 + - name: "copyright" + # at this time, we only have a DeLFT implementation, + # use "wapiti" if the deep learning library JNI is not available and model will then be 
ignored + #engine: "delft" + engine: "wapiti" + delft: + # deep learning parameters + architecture: "gru" + #architecture: "bert" + #transformer: "allenai/scibert_scivocab_cased" + + - name: "license" + # at this time, for being active, it must be DeLFT, no other implementation is available + # use "wapiti" if the deep learning library JNI is not available and model will then be ignored + #engine: "delft" + engine: "wapiti" + delft: + # deep learning parameters + architecture: "gru" + #architecture: "bert" + #transformer: "allenai/scibert_scivocab_cased" + # for **service only**: how to load the models, # false -> models are loaded when needed, avoiding putting in memory useless models (only in case of CRF) but slow down # significantly the service at first call diff --git a/grobid-home/models/copyright_gru/config.json b/grobid-home/models/copyright_gru/config.json new file mode 100644 index 0000000000..96b5119994 --- /dev/null +++ b/grobid-home/models/copyright_gru/config.json @@ -0,0 +1,20 @@ +{ + "model_name": "copyright_gru", + "architecture": "gru", + "embeddings_name": "glove-840B", + "char_embedding_size": 25, + "word_embedding_size": 300, + "dropout": 0.5, + "recurrent_dropout": 0.25, + "maxlen": 300, + "dense_size": 32, + "use_char_feature": false, + "list_classes": [ + "publisher", + "authors", + "undecided" + ], + "fold_number": 1, + "batch_size": 256, + "transformer_name": null +} \ No newline at end of file diff --git a/grobid-home/models/copyright_gru/model_weights.hdf5 b/grobid-home/models/copyright_gru/model_weights.hdf5 new file mode 100644 index 0000000000..f80b2ddda8 Binary files /dev/null and b/grobid-home/models/copyright_gru/model_weights.hdf5 differ diff --git a/grobid-home/models/license_gru/config.json b/grobid-home/models/license_gru/config.json new file mode 100644 index 0000000000..4c3470268f --- /dev/null +++ b/grobid-home/models/license_gru/config.json @@ -0,0 +1,27 @@ +{ + "model_name": "license_gru", + "architecture": "gru", + 
"embeddings_name": "glove-840B", + "char_embedding_size": 25, + "word_embedding_size": 300, + "dropout": 0.5, + "recurrent_dropout": 0.25, + "maxlen": 300, + "dense_size": 32, + "use_char_feature": false, + "list_classes": [ + "CC-0", + "CC-BY", + "CC-BY-NC", + "CC-BY-NC-ND", + "CC-BY-SA", + "CC-BY-NC-SA", + "CC-BY-ND", + "copyright", + "other", + "undecided" + ], + "fold_number": 1, + "batch_size": 256, + "transformer_name": null +} \ No newline at end of file diff --git a/grobid-home/models/license_gru/model_weights.hdf5 b/grobid-home/models/license_gru/model_weights.hdf5 new file mode 100644 index 0000000000..7133dfc846 Binary files /dev/null and b/grobid-home/models/license_gru/model_weights.hdf5 differ diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java index 54f6b3e502..e13f7877a6 100755 --- a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java @@ -54,6 +54,7 @@ public class GrobidRestService implements GrobidPaths { public static final String CONSOLIDATE_FUNDERS = "consolidateFunders"; public static final String INCLUDE_RAW_AFFILIATIONS = "includeRawAffiliations"; public static final String INCLUDE_RAW_CITATIONS = "includeRawCitations"; + public static final String INCLUDE_RAW_COPYRIGHTS = "includeRawCopyrights"; public static final String INCLUDE_FIGURES_TABLES = "includeFiguresTables"; @Inject @@ -156,11 +157,13 @@ public Response getAdmin_htmlGet(@QueryParam(SHA1) String sha1) { public Response processHeaderDocumentReturnXml_post( @FormDataParam(INPUT) InputStream inputStream, @DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate, - @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations) { + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, + @DefaultValue("0") 
@FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { int consol = validateConsolidationParam(consolidate); return restProcessFiles.processStatelessHeaderDocument( inputStream, consol, validateIncludeRawParam(includeRawAffiliations), + validateIncludeRawParam(includeRawCopyrights), ExpectedResponseType.XML ); } @@ -173,12 +176,13 @@ public Response processHeaderFundingDocumentReturnXml_post( @FormDataParam(INPUT) InputStream inputStream, @DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidateHeader, @DefaultValue("0") @FormDataParam(CONSOLIDATE_FUNDERS) String consolidateFunders, - @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations) { + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { int consolHeader = validateConsolidationParam(consolidateHeader); int consolFunders = validateConsolidationParam(consolidateFunders); return restProcessFiles.processStatelessHeaderFundingDocument( inputStream, consolHeader, consolFunders, - validateIncludeRawParam(includeRawAffiliations) + validateIncludeRawParam(includeRawAffiliations), validateIncludeRawParam(includeRawCopyrights) ); } @@ -190,8 +194,9 @@ public Response processHeaderFundingDocumentReturnXml_post( public Response processStatelessHeaderDocumentReturnXml( @FormDataParam(INPUT) InputStream inputStream, @DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate, - @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations) { - return processHeaderDocumentReturnXml_post(inputStream, consolidate, includeRawAffiliations); + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { + return processHeaderDocumentReturnXml_post(inputStream, consolidate, 
includeRawAffiliations, includeRawCopyrights); } @Path(PATH_HEADER) @@ -201,11 +206,13 @@ public Response processStatelessHeaderDocumentReturnXml( public Response processHeaderDocumentReturnBibTeX_post( @FormDataParam(INPUT) InputStream inputStream, @DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate, - @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations) { + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { int consol = validateConsolidationParam(consolidate); return restProcessFiles.processStatelessHeaderDocument( inputStream, consol, validateIncludeRawParam(includeRawAffiliations), + validateIncludeRawParam(includeRawCopyrights), ExpectedResponseType.BIBTEX ); } @@ -217,8 +224,9 @@ public Response processHeaderDocumentReturnBibTeX_post( public Response processStatelessHeaderDocumentReturnBibTeX( @FormDataParam(INPUT) InputStream inputStream, @DefaultValue("0") @FormDataParam(CONSOLIDATE_HEADER) String consolidate, - @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations) { - return processHeaderDocumentReturnBibTeX_post(inputStream, consolidate, includeRawAffiliations); + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights) { + return processHeaderDocumentReturnBibTeX_post(inputStream, consolidate, includeRawAffiliations, includeRawCopyrights); } @Path(PATH_FULL_TEXT) @@ -231,6 +239,7 @@ public Response processFulltextDocument_post( @DefaultValue("0") @FormDataParam(CONSOLIDATE_CITATIONS) String consolidateCitations, @DefaultValue("0") @FormDataParam(CONSOLIDATE_FUNDERS) String consolidateFunders, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, + @DefaultValue("0") 
@FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_CITATIONS) String includeRawCitations, @DefaultValue("-1") @FormDataParam("start") int startPage, @DefaultValue("-1") @FormDataParam("end") int endPage, @@ -239,7 +248,7 @@ public Response processFulltextDocument_post( @FormDataParam("teiCoordinates") List coordinates) throws Exception { return processFulltext( inputStream, consolidateHeader, consolidateCitations, consolidateFunders, - includeRawAffiliations, includeRawCitations, + includeRawAffiliations, includeRawCitations, includeRawCopyrights, startPage, endPage, generateIDs, segmentSentences, coordinates ); } @@ -254,6 +263,7 @@ public Response processFulltextDocument( @DefaultValue("0") @FormDataParam(CONSOLIDATE_CITATIONS) String consolidateCitations, @DefaultValue("0") @FormDataParam(CONSOLIDATE_FUNDERS) String consolidateFunders, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_CITATIONS) String includeRawCitations, @DefaultValue("-1") @FormDataParam("start") int startPage, @DefaultValue("-1") @FormDataParam("end") int endPage, @@ -262,7 +272,7 @@ public Response processFulltextDocument( @FormDataParam("teiCoordinates") List coordinates) throws Exception { return processFulltext( inputStream, consolidateHeader, consolidateCitations, consolidateFunders, - includeRawAffiliations, includeRawCitations, + includeRawAffiliations, includeRawCitations, includeRawCopyrights, startPage, endPage, generateIDs, segmentSentences, coordinates ); } @@ -273,6 +283,7 @@ private Response processFulltext(InputStream inputStream, String consolidateFunders, String includeRawAffiliations, String includeRawCitations, + String includeRawCopyrights, int startPage, int endPage, String generateIDs, @@ -291,7 +302,7 @@ private Response 
processFulltext(InputStream inputStream, return restProcessFiles.processFulltextDocument( inputStream, consolHeader, consolCitations, consolFunders, validateIncludeRawParam(includeRawAffiliations), - includeRaw, + includeRaw, validateIncludeRawParam(includeRawCopyrights), startPage, endPage, generate, segment, teiCoordinates ); } @@ -341,6 +352,7 @@ public Response processFulltextAssetDocument_post( @DefaultValue("0") @FormDataParam(CONSOLIDATE_CITATIONS) String consolidateCitations, @DefaultValue("0") @FormDataParam(CONSOLIDATE_FUNDERS) String consolidateFunders, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_CITATIONS) String includeRawCitations, @DefaultValue("-1") @FormDataParam("start") int startPage, @DefaultValue("-1") @FormDataParam("end") int endPage, @@ -349,7 +361,7 @@ public Response processFulltextAssetDocument_post( @FormDataParam("teiCoordinates") List coordinates) throws Exception { return processStatelessFulltextAssetHelper( inputStream, consolidateHeader, consolidateCitations, consolidateFunders, - includeRawAffiliations, includeRawCitations, + includeRawAffiliations, includeRawCitations, includeRawCopyrights, startPage, endPage, generateIDs, segmentSentences, coordinates ); } @@ -364,6 +376,7 @@ public Response processStatelessFulltextAssetDocument( @DefaultValue("0") @FormDataParam(CONSOLIDATE_CITATIONS) String consolidateCitations, @DefaultValue("0") @FormDataParam(CONSOLIDATE_FUNDERS) String consolidateFunders, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_AFFILIATIONS) String includeRawAffiliations, + @DefaultValue("0") @FormDataParam(INCLUDE_RAW_COPYRIGHTS) String includeRawCopyrights, @DefaultValue("0") @FormDataParam(INCLUDE_RAW_CITATIONS) String includeRawCitations, @DefaultValue("-1") @FormDataParam("start") int startPage, @DefaultValue("-1") 
@FormDataParam("end") int endPage, @@ -372,7 +385,7 @@ public Response processStatelessFulltextAssetDocument( @FormDataParam("teiCoordinates") List coordinates) throws Exception { return processStatelessFulltextAssetHelper( inputStream, consolidateHeader, consolidateCitations, consolidateFunders, - includeRawAffiliations, includeRawCitations, + includeRawAffiliations, includeRawCitations, includeRawCopyrights, startPage, endPage, generateIDs, segmentSentences, coordinates ); } @@ -382,6 +395,7 @@ private Response processStatelessFulltextAssetHelper(InputStream inputStream, String consolidateCitations, String consolidateFunders, String includeRawAffiliations, String includeRawCitations, + String includeRawCopyrights, int startPage, int endPage, @@ -400,7 +414,7 @@ private Response processStatelessFulltextAssetHelper(InputStream inputStream, return restProcessFiles.processStatelessFulltextAssetDocument( inputStream, consolHeader, consolCitations, consolFunders, validateIncludeRawParam(includeRawAffiliations), - includeRaw, + includeRaw, validateIncludeRawParam(includeRawCopyrights), startPage, endPage, generate, segment, teiCoordinates ); } diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java index 14caf8b218..4692fe3de9 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java @@ -64,6 +64,7 @@ public Response processStatelessHeaderDocument( final InputStream inputStream, final int consolidate, final boolean includeRawAffiliations, + final boolean includeRawCopyrights, ExpectedResponseType expectedResponseType ) { LOGGER.debug(methodLogIn()); @@ -101,6 +102,7 @@ public Response processStatelessHeaderDocument( md5Str, consolidate, includeRawAffiliations, + includeRawCopyrights, result ); @@ -150,7 +152,8 @@
public Response processStatelessHeaderFundingDocument( final InputStream inputStream, final int consolidateHeader, final int consolidateFunders, - final boolean includeRawAffiliations + final boolean includeRawAffiliations, + final boolean includeRawCopyrights ) { LOGGER.debug(methodLogIn()); String retVal = null; @@ -185,7 +188,8 @@ public Response processStatelessHeaderFundingDocument( md5Str, consolidateHeader, consolidateFunders, - includeRawAffiliations + includeRawAffiliations, + includeRawCopyrights ); if (GrobidRestUtils.isResultNullOrEmpty(retVal)) { @@ -240,6 +244,7 @@ public Response processFulltextDocument(final InputStream inputStream, final int consolidateFunders, final boolean includeRawAffiliations, final boolean includeRawCitations, + final boolean includeRawCopyrights, final int startPage, final int endPage, final boolean generateIDs, @@ -280,6 +285,7 @@ public Response processFulltextDocument(final InputStream inputStream, .consolidateFunders(consolidateFunders) .includeRawAffiliations(includeRawAffiliations) .includeRawCitations(includeRawCitations) + .includeRawCopyrights(includeRawCopyrights) .startPage(startPage) .endPage(endPage) .generateTeiIds(generateIDs) @@ -341,6 +347,7 @@ public Response processStatelessFulltextAssetDocument(final InputStream inputStr final int consolidateFunders, final boolean includeRawAffiliations, final boolean includeRawCitations, + final boolean includeRawCopyrights, final int startPage, final int endPage, final boolean generateIDs, @@ -384,6 +391,7 @@ public Response processStatelessFulltextAssetDocument(final InputStream inputStr .consolidateFunders(consolidateFunders) .includeRawAffiliations(includeRawAffiliations) .includeRawCitations(includeRawCitations) + .includeRawCopyrights(includeRawCopyrights) .startPage(startPage) .endPage(endPage) .generateTeiIds(generateIDs) @@ -498,7 +506,8 @@ public Response processCitationPatentPDF(final InputStream inputStream, List patents = new ArrayList<>(); List articles = 
new ArrayList<>(); retVal = engine.processAllCitationsInPDFPatent(originFile.getAbsolutePath(), - articles, patents, consolidate, includeRawCitations); + articles, patents, consolidate, + includeRawCitations); if (GrobidRestUtils.isResultNullOrEmpty(retVal)) { response = Response.status(Status.NO_CONTENT).build(); diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java b/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java index d1399b212f..f26b1eea0d 100755 --- a/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java @@ -104,7 +104,7 @@ public void train(boolean incremental) { trainer.train(getTemplatePath(), dataPath, tempModelPath, GrobidProperties.getWapitiNbThreads(), model, incremental); // if we are here, that means that training succeeded // rename model for CRF sequence labellers (not with DeLFT deep learning models) - if (GrobidProperties.getGrobidCRFEngine(this.model) != GrobidCRFEngine.DELFT) + if (GrobidProperties.getGrobidEngine(this.model) != GrobidCRFEngine.DELFT) renameModels(oldModelPath, tempModelPath); } diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/PatentParserTrainer.java b/grobid-trainer/src/main/java/org/grobid/trainer/PatentParserTrainer.java index b71569dc52..6ad44cd46d 100755 --- a/grobid-trainer/src/main/java/org/grobid/trainer/PatentParserTrainer.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/PatentParserTrainer.java @@ -183,7 +183,7 @@ public boolean accept(File dir, String name) { List> segmentedAccumulatedLabels = new ArrayList<>(); int maxSequence = 1000; - if (GrobidProperties.getGrobidCRFEngineName("patent-citation").equals("delft")) { + if (GrobidProperties.getGrobidEngineName("patent-citation").equals("delft")) { List newTexts = new ArrayList<>(); maxSequence = GrobidProperties.getDelftTrainingMaxSequenceLength("patent-citation"); } @@ -362,7 +362,7 @@ public void 
createDataSet(String setName, String corpusPath, String outputPath, List> segmentedAccumulatedLabels = new ArrayList<>(); int maxSequence = 1000; - if (GrobidProperties.getGrobidCRFEngineName("patent-citation").equals("delft")) { + if (GrobidProperties.getGrobidEngineName("patent-citation").equals("delft")) { List newTexts = new ArrayList<>(); maxSequence = GrobidProperties.getDelftTrainingMaxSequenceLength("patent-citation"); } diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/TrainerFactory.java b/grobid-trainer/src/main/java/org/grobid/trainer/TrainerFactory.java index 3b6059c317..12717ede62 100644 --- a/grobid-trainer/src/main/java/org/grobid/trainer/TrainerFactory.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/TrainerFactory.java @@ -6,7 +6,7 @@ public class TrainerFactory { public static GenericTrainer getTrainer(GrobidModel model) { - switch (GrobidProperties.getGrobidCRFEngine(model)) { + switch (GrobidProperties.getGrobidEngine(model)) { case CRFPP: return new CRFPPGenericTrainer(); case WAPITI: @@ -16,7 +16,7 @@ public static GenericTrainer getTrainer(GrobidModel model) { case DUMMY: return new DummyTrainer(); default: - throw new IllegalStateException("Unsupported GROBID sequence labelling engine: " + GrobidProperties.getGrobidCRFEngine(model)); + throw new IllegalStateException("Unsupported GROBID sequence labelling engine: " + GrobidProperties.getGrobidEngine(model)); } } }