diff --git a/.gitignore b/.gitignore
index 5676083f4..e1e426526 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,6 @@ gerbil_data
 *.log
 google*.html
 export
-datadump.nt
\ No newline at end of file
+datadump.nt
+indexes
+dependency-reduced-pom.xml
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index d3f168a11..d734b7006 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,8 +3,9 @@ language: java
 before_install:
   - cp src/main/properties/log4j.properties src/test/resources/log4j.properties
   - mkdir -p "gerbil_data"
-  - curl --retry 4 -L -o "gerbil_data/gerbil_data.zip" "https://github.com/AKSW/gerbil/releases/download/v1.2.4/gerbil_data.zip"
+  - curl --retry 4 -L -o "gerbil_data/gerbil_data.zip" "https://github.com/AKSW/gerbil/releases/download/v1.2.5/gerbil_data.zip"
   - unzip "gerbil_data/gerbil_data.zip"
+  - touch src/main/properties/gerbil_keys.properties
 install:
   - mvn clean compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V
 script:
diff --git a/index.sh b/index.sh
new file mode 100644
index 000000000..5b9e579ec
--- /dev/null
+++ b/index.sh
@@ -0,0 +1,29 @@
+mkdir dbpedia_dump
+cd dbpedia_dump
+
+wget -r --no-parent -R "*.txt,*.html,*.json" -A "*.nt,*.ttl,*.nt.bz2,*.ttl.bz2" http://downloads.dbpedia.org/2016-04/core-i18n/en/
+cd downloads.dbpedia.org/2016-04/core-i18n/en/
+
+wget http://www.l3s.de/~minack/rdf2rdf/downloads/rdf2rdf-1.0.1-2.3.1.jar
+
+
+rm *.json
+rm *.txt
+rm index.html
+
+for i in *.bz2; do
+    bzip2 -vd "$i"
+done
+
+for i in *.ttl; do
+    java -jar rdf2rdf-1.0.1-2.3.1.jar "$i" .nt
+done
+
+rm *.ttl
+rm rdf2rdf-1.0.1-2.3.1.jar
+
+cd ../../../../../../
+
+mvn exec:java -Dexec.mainClass="org.aksw.gerbil.tools.InitialIndexTool" -Dexec.args="dbpedia_dump/downloads.dbpedia.org/2016-04/core-i18n/en/"
+
+rm -rf dbpedia_dump/
diff --git a/pom.xml b/pom.xml
index cb31ad3e2..b6b1e9589 100644
--- a/pom.xml
+++ b/pom.xml
@@ -12,8 +12,8 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 	<modelVersion>4.0.0</modelVersion>
 	<groupId>org.aksw</groupId>
-	<artifactId>gerbil</artifactId>
-	<version>1.2.4</version>
+	<artifactId>gerbil</artifactId>
+	<version>1.2.5</version>
 	<name>General Entity Annotator Benchmark</name>
 	<description>This project is a benchmark for entity annotation and disambiguation tools.</description>
 	<inceptionYear>2014</inceptionYear>
@@ -55,8 +55,8 @@
 		<dependency>
 			<groupId>org.aksw</groupId>
-			<artifactId>gerbil.nif.transfer</artifactId>
-			<version>1.2.2</version>
+			<artifactId>gerbil.nif.transfer</artifactId>
+			<version>1.2.3</version>
 		</dependency>
@@ -142,7 +142,7 @@
 		<dependency>
 			<groupId>org.apache.lucene</groupId>
 			<artifactId>lucene-core</artifactId>
-			<version>2.9.1</version>
+			<version>6.2.0</version>
 		</dependency>
 		<dependency>
 			<groupId>commons-configuration</groupId>
@@ -311,6 +311,11 @@
 			<artifactId>json</artifactId>
 			<version>20140107</version>
 		</dependency>
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-analyzers-common</artifactId>
+			<version>6.2.0</version>
+		</dependency>
diff --git a/src/main/java/org/aksw/gerbil/annotator/OKETask1Annotator.java b/src/main/java/org/aksw/gerbil/annotator/OKETask1Annotator.java
index ccb35734c..f8a332bb6 100644
--- a/src/main/java/org/aksw/gerbil/annotator/OKETask1Annotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/OKETask1Annotator.java
@@ -22,7 +22,7 @@
 import org.aksw.gerbil.transfer.nif.Document;
 import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
 
-public interface OKETask1Annotator extends A2KBAnnotator, EntityTyper {
+public interface OKETask1Annotator extends A2KBAnnotator, RT2KBAnnotator {
 
     public List<TypedNamedEntity> performTask1(Document document) throws GerbilException;
 }
diff --git a/src/main/java/org/aksw/gerbil/annotator/RT2KBAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/RT2KBAnnotator.java
new file mode 100644
index 000000000..a9284d8e1
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/annotator/RT2KBAnnotator.java
@@ -0,0 +1,28 @@
+/**
+ * This file is part of General Entity Annotator Benchmark.
+ *
+ * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with General Entity Annotator Benchmark.  If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.aksw.gerbil.annotator;
+
+import java.util.List;
+
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.TypedSpan;
+
+public interface RT2KBAnnotator extends EntityRecognizer, EntityTyper {
+
+    public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException;
+}
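For context, the new RT2KB experiment type combines entity recognition and typing. A minimal sketch of what an implementation has to provide (the class name and the empty results are illustrative only; AbstractAnnotator is the same base class the annotator implementations further down in this patch extend):

    // Illustrative sketch only: a trivial RT2KB annotator returning empty results.
    public class NoopRT2KBAnnotator extends AbstractAnnotator implements RT2KBAnnotator {

        public NoopRT2KBAnnotator() {
            super("NoopRT2KB");
        }

        @Override
        public List<Span> performRecognition(Document document) throws GerbilException {
            return new ArrayList<Span>(0); // inherited from EntityRecognizer
        }

        @Override
        public List<TypedSpan> performTyping(Document document) throws GerbilException {
            return new ArrayList<TypedSpan>(0); // inherited from EntityTyper
        }

        @Override
        public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
            return new ArrayList<TypedSpan>(0); // recognition and typing in a single call
        }
    }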
diff --git a/src/main/java/org/aksw/gerbil/annotator/decorator/ErrorCountingAnnotatorDecorator.java b/src/main/java/org/aksw/gerbil/annotator/decorator/ErrorCountingAnnotatorDecorator.java
index 9eac748fc..a99acf2a9 100644
--- a/src/main/java/org/aksw/gerbil/annotator/decorator/ErrorCountingAnnotatorDecorator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/decorator/ErrorCountingAnnotatorDecorator.java
@@ -27,6 +27,7 @@
 import org.aksw.gerbil.annotator.EntityTyper;
 import org.aksw.gerbil.annotator.OKETask1Annotator;
 import org.aksw.gerbil.annotator.OKETask2Annotator;
+import org.aksw.gerbil.annotator.RT2KBAnnotator;
 import org.aksw.gerbil.datatypes.ErrorTypes;
 import org.aksw.gerbil.datatypes.ExperimentType;
 import org.aksw.gerbil.evaluate.EvaluationResultContainer;
@@ -52,8 +53,8 @@
  * @author Michael Röder (roeder@informatik.uni-leipzig.de)
  *
  */
-public abstract class ErrorCountingAnnotatorDecorator extends AbstractAnnotatorDecorator
-        implements Evaluator<Marking>, ErrorCounter {
+public abstract class ErrorCountingAnnotatorDecorator extends AbstractAnnotatorDecorator implements Evaluator<Marking>,
+        ErrorCounter {
 
     private static final Logger LOGGER = LoggerFactory.getLogger(ErrorCountingAnnotatorDecorator.class);
 
@@ -82,6 +83,8 @@ public static ErrorCountingAnnotatorDecorator createDecorator(ExperimentType typ
             return new ErrorCountingOKETask1Annotator((OKETask1Annotator) annotator, maxErrors);
         case OKE_Task2:
             return new ErrorCountingOKETask2Annotator((OKETask2Annotator) annotator, maxErrors);
+        case RT2KB:
+            return new ErrorCountingRT2KBAnnotator((RT2KBAnnotator) annotator, maxErrors);
         case Rc2KB:
             break;
         case Sa2KB:
@@ -125,8 +128,8 @@ public List<MeaningSpan> performD2KBTask(Document document) throws GerbilExcepti
         }
     }
 
-    private static class ErrorCountingEntityRecognizer extends ErrorCountingAnnotatorDecorator
-            implements EntityRecognizer {
+    private static class ErrorCountingEntityRecognizer extends ErrorCountingAnnotatorDecorator implements
+            EntityRecognizer {
 
         public ErrorCountingEntityRecognizer(EntityRecognizer decoratedAnnotator, int maxErrors) {
             super(decoratedAnnotator, maxErrors);
@@ -173,8 +176,24 @@ public List<TypedSpan> performTyping(Document document) throws GerbilException {
         }
     }
 
-    private static class ErrorCountingOKETask1Annotator extends ErrorCountingA2KBAnnotator
-            implements OKETask1Annotator {
+    private static class ErrorCountingRT2KBAnnotator extends ErrorCountingEntityRecognizer implements RT2KBAnnotator {
+
+        protected ErrorCountingRT2KBAnnotator(RT2KBAnnotator decoratedAnnotator, int maxErrors) {
+            super(decoratedAnnotator, maxErrors);
+        }
+
+        @Override
+        public List<TypedSpan> performTyping(Document document) throws GerbilException {
+            return ErrorCountingAnnotatorDecorator.performTyping(this, document);
+        }
+
+        @Override
+        public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+            return ErrorCountingAnnotatorDecorator.performRT2KBTask(this, document);
+        }
+    }
+
+    private static class ErrorCountingOKETask1Annotator extends ErrorCountingA2KBAnnotator implements OKETask1Annotator {
 
         protected ErrorCountingOKETask1Annotator(OKETask1Annotator decoratedAnnotator, int maxErrors) {
             super(decoratedAnnotator, maxErrors);
@@ -185,14 +204,19 @@ public List<TypedSpan> performTyping(Document document) throws GerbilException {
             return ErrorCountingAnnotatorDecorator.performTyping(this, document);
         }
 
+        @Override
+        public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+            return ErrorCountingAnnotatorDecorator.performRT2KBTask(this, document);
+        }
+
         @Override
         public List<TypedNamedEntity> performTask1(Document document) throws GerbilException {
             return ErrorCountingAnnotatorDecorator.performOKETask1(this, document);
         }
     }
 
-    private static class ErrorCountingOKETask2Annotator extends ErrorCountingAnnotatorDecorator
-            implements OKETask2Annotator {
+    private static class ErrorCountingOKETask2Annotator extends ErrorCountingAnnotatorDecorator implements
+            OKETask2Annotator {
 
         protected ErrorCountingOKETask2Annotator(OKETask2Annotator decoratedAnnotator, int maxErrors) {
             super(decoratedAnnotator, maxErrors);
@@ -269,8 +293,8 @@ protected static List<MeaningSpan> performD2KBTask(ErrorCountingAnnotatorDecorat
         return result;
     }
 
-    protected static List<MeaningSpan> performExtraction(ErrorCountingAnnotatorDecorator errorCounter,
-            Document document) throws GerbilException {
+    protected static List<MeaningSpan> performExtraction(ErrorCountingAnnotatorDecorator errorCounter, Document document)
+            throws GerbilException {
         List<MeaningSpan> result = null;
         try {
             result = ((A2KBAnnotator) errorCounter.getDecoratedAnnotator()).performA2KBTask(document);
@@ -384,6 +408,29 @@ protected static List<TypedNamedEntity> performOKETask2(ErrorCountingAnnotatorDe
         return result;
     }
 
+    public static List<TypedSpan> performRT2KBTask(ErrorCountingAnnotatorDecorator errorCounter, Document document)
+            throws GerbilException {
+        List<TypedSpan> result = null;
+        try {
+            result = ((RT2KBAnnotator) errorCounter.getDecoratedAnnotator()).performRT2KBTask(document);
+        } catch (Exception e) {
+            if (errorCounter.getErrorCount() == 0) {
+                // Log only the first exception completely
+                LOGGER.error("Got an Exception from the annotator (" + errorCounter.getName() + ")", e);
+            } else {
+                // Log only the Exception message without the stack trace
+                LOGGER.error("Got an Exception from the annotator (" + errorCounter.getName() + "): "
+                        + e.getLocalizedMessage());
+            }
+            errorCounter.increaseErrorCount();
+            return new ArrayList<TypedSpan>(0);
+        }
+        if (printDebugMsg && LOGGER.isDebugEnabled()) {
+            logResult(result, errorCounter.getName(), "TypedNamedEntity");
+        }
+        return result;
+    }
+
     protected int errorCount = 0;
 
     protected int maxErrors;
diff --git a/src/main/java/org/aksw/gerbil/annotator/decorator/SingleInstanceSecuringAnnotatorDecorator.java b/src/main/java/org/aksw/gerbil/annotator/decorator/SingleInstanceSecuringAnnotatorDecorator.java
index 5c8f26bf7..84c93ef71 100644
--- a/src/main/java/org/aksw/gerbil/annotator/decorator/SingleInstanceSecuringAnnotatorDecorator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/decorator/SingleInstanceSecuringAnnotatorDecorator.java
@@ -29,6 +29,7 @@
 import org.aksw.gerbil.annotator.EntityTyper;
 import org.aksw.gerbil.annotator.OKETask1Annotator;
 import org.aksw.gerbil.annotator.OKETask2Annotator;
+import org.aksw.gerbil.annotator.RT2KBAnnotator;
 import org.aksw.gerbil.datatypes.ErrorTypes;
 import org.aksw.gerbil.datatypes.ExperimentType;
 import org.aksw.gerbil.exceptions.GerbilException;
@@ -73,6 +74,8 @@ public static SingleInstanceSecuringAnnotatorDecorator createDecorator(Experimen
             return new SingleInstanceSecuringOKETask1Annotator((OKETask1Annotator) annotator);
         case OKE_Task2:
             return new SingleInstanceSecuringOKETask2Annotator((OKETask2Annotator) annotator);
+        case RT2KB:
+            return new SingleInstanceSecuringRT2KBAnnotator((RT2KBAnnotator) annotator);
         case Rc2KB:
             break;
         case Sa2KB:
@@ -86,8 +89,8 @@ public static SingleInstanceSecuringAnnotatorDecorator createDecorator(Experimen
         return null;
     }
 
-    private static class SingleInstanceSecuringC2KBAnnotator extends SingleInstanceSecuringAnnotatorDecorator
-            implements C2KBAnnotator {
+    private static class SingleInstanceSecuringC2KBAnnotator extends SingleInstanceSecuringAnnotatorDecorator implements
+            C2KBAnnotator {
 
         public SingleInstanceSecuringC2KBAnnotator(C2KBAnnotator decoratedAnnotator) {
             super(decoratedAnnotator);
@@ -99,8 +102,8 @@ public List<Meaning> performC2KB(Document document) throws GerbilException {
         }
     }
 
-    private static class SingleInstanceSecuringD2KBAnnotator extends SingleInstanceSecuringAnnotatorDecorator
-            implements D2KBAnnotator {
+    private static class SingleInstanceSecuringD2KBAnnotator extends SingleInstanceSecuringAnnotatorDecorator implements
+            D2KBAnnotator {
 
         public SingleInstanceSecuringD2KBAnnotator(D2KBAnnotator decoratedAnnotator) {
             super(decoratedAnnotator);
@@ -125,8 +128,8 @@ public List<Span> performRecognition(Document document) throws GerbilException {
         }
     }
 
-    private static class SingleInstanceSecuringA2KBAnnotator extends SingleInstanceSecuringD2KBAnnotator
-            implements A2KBAnnotator {
+    private static class SingleInstanceSecuringA2KBAnnotator extends SingleInstanceSecuringD2KBAnnotator implements
+            A2KBAnnotator {
 
         public SingleInstanceSecuringA2KBAnnotator(A2KBAnnotator decoratedAnnotator) {
             super(decoratedAnnotator);
@@ -149,8 +152,8 @@ public List<MeaningSpan> performA2KBTask(Document document) throws GerbilExcepti
 
     }
 
-    private static class SingleInstanceSecuringEntityTyper extends SingleInstanceSecuringAnnotatorDecorator
-            implements EntityTyper {
+    private static class SingleInstanceSecuringEntityTyper extends SingleInstanceSecuringAnnotatorDecorator implements
+            EntityTyper {
 
         protected SingleInstanceSecuringEntityTyper(EntityTyper decoratedAnnotator) {
             super(decoratedAnnotator);
@@ -162,8 +165,26 @@ public List<TypedSpan> performTyping(Document document) throws GerbilException {
         }
     }
 
-    private static class SingleInstanceSecuringOKETask1Annotator extends SingleInstanceSecuringA2KBAnnotator
-            implements OKETask1Annotator {
+    private static class SingleInstanceSecuringRT2KBAnnotator extends SingleInstanceSecuringEntityRecognizer implements
+            RT2KBAnnotator {
+
+        protected SingleInstanceSecuringRT2KBAnnotator(RT2KBAnnotator decoratedAnnotator) {
+            super(decoratedAnnotator);
+        }
+
+        @Override
+        public List<TypedSpan> performTyping(Document document) throws GerbilException {
+            return SingleInstanceSecuringAnnotatorDecorator.performTyping(this, document);
+        }
+
+        @Override
+        public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+            return SingleInstanceSecuringAnnotatorDecorator.performRT2KBTask(this, document);
+        }
+    }
+
+    private static class SingleInstanceSecuringOKETask1Annotator extends SingleInstanceSecuringA2KBAnnotator implements
+            OKETask1Annotator {
 
         protected SingleInstanceSecuringOKETask1Annotator(OKETask1Annotator decoratedAnnotator) {
             super(decoratedAnnotator);
@@ -174,6 +195,11 @@ public List<TypedSpan> performTyping(Document document) throws GerbilException {
             return SingleInstanceSecuringAnnotatorDecorator.performTyping(this, document);
         }
 
+        @Override
+        public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+            return SingleInstanceSecuringAnnotatorDecorator.performRT2KBTask(this, document);
+        }
+
         @Override
         public List<TypedNamedEntity> performTask1(Document document) throws GerbilException {
             return SingleInstanceSecuringAnnotatorDecorator.performOKETask1(this, document);
@@ -247,8 +273,8 @@ protected static List<MeaningSpan> performExtraction(SingleInstanceSecuringAnnot
         return result;
     }
 
-    protected static List<TypedSpan> performTyping(SingleInstanceSecuringAnnotatorDecorator decorator,
-            Document document) throws GerbilException {
+    protected static List<TypedSpan> performTyping(SingleInstanceSecuringAnnotatorDecorator decorator, Document document)
+            throws GerbilException {
         List<TypedSpan> result = null;
         try {
             decorator.semaphore.acquire();
@@ -265,8 +291,8 @@ protected static List<TypedSpan> performTyping(SingleInstanceSecuringAnnotatorDe
         return result;
     }
 
-    protected static List<Span> performRecognition(SingleInstanceSecuringAnnotatorDecorator decorator,
-            Document document) throws GerbilException {
+    protected static List<Span> performRecognition(SingleInstanceSecuringAnnotatorDecorator decorator, Document document)
+            throws GerbilException {
         List<Span> result = null;
         try {
             decorator.semaphore.acquire();
@@ -319,6 +345,24 @@ protected static List<TypedNamedEntity> performOKETask2(SingleInstanceSecuringAn
         return result;
     }
 
+    protected static List<TypedSpan> performRT2KBTask(SingleInstanceSecuringAnnotatorDecorator decorator,
+            Document document) throws GerbilException {
+        List<TypedSpan> result = null;
+        try {
+            decorator.semaphore.acquire();
+        } catch (InterruptedException e) {
+            LOGGER.error("Interrupted while waiting for the Annotator's semaphore.", e);
+            throw new GerbilException("Interrupted while waiting for the Annotator's semaphore.", e,
+                    ErrorTypes.UNEXPECTED_EXCEPTION);
+        }
+        try {
+            result = ((RT2KBAnnotator) decorator.getDecoratedAnnotator()).performRT2KBTask(document);
+        } finally {
+            decorator.semaphore.release();
+        }
+        return result;
+    }
+
     /**
      * Registers the given {@link Annotator} (if it is not already present in
      * the registration) and returns its semaphore.
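The performRT2KBTask helper added above follows the same pattern as the other perform* helpers: one semaphore per decorated annotator, acquired before the call and released in a finally block. The registration mentioned in the javadoc can be pictured like this (a sketch; the field and method names are illustrative, not the exact members of the class):

    // Sketch: one Semaphore with a single permit per registered annotator, so at
    // most one experiment task calls an annotator instance at a time.
    // (Uses java.util.concurrent.Semaphore and a plain HashMap.)
    private static final Map<Annotator, Semaphore> REGISTRY = new HashMap<Annotator, Semaphore>();

    protected static synchronized Semaphore registerAnnotator(Annotator annotator) {
        Semaphore semaphore = REGISTRY.get(annotator);
        if (semaphore == null) {
            semaphore = new Semaphore(1); // single permit = single-instance use
            REGISTRY.put(annotator, semaphore);
        }
        return semaphore;
    }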
diff --git a/src/main/java/org/aksw/gerbil/annotator/decorator/TimeMeasuringAnnotatorDecorator.java b/src/main/java/org/aksw/gerbil/annotator/decorator/TimeMeasuringAnnotatorDecorator.java
index 63f11a7e1..33c49e829 100644
--- a/src/main/java/org/aksw/gerbil/annotator/decorator/TimeMeasuringAnnotatorDecorator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/decorator/TimeMeasuringAnnotatorDecorator.java
@@ -18,14 +18,15 @@
 
 import java.util.List;
 
+import org.aksw.gerbil.annotator.A2KBAnnotator;
 import org.aksw.gerbil.annotator.Annotator;
 import org.aksw.gerbil.annotator.C2KBAnnotator;
-import org.aksw.gerbil.annotator.A2KBAnnotator;
 import org.aksw.gerbil.annotator.D2KBAnnotator;
 import org.aksw.gerbil.annotator.EntityRecognizer;
 import org.aksw.gerbil.annotator.EntityTyper;
 import org.aksw.gerbil.annotator.OKETask1Annotator;
 import org.aksw.gerbil.annotator.OKETask2Annotator;
+import org.aksw.gerbil.annotator.RT2KBAnnotator;
 import org.aksw.gerbil.datatypes.ExperimentType;
 import org.aksw.gerbil.evaluate.DoubleEvaluationResult;
 import org.aksw.gerbil.evaluate.EvaluationResultContainer;
@@ -48,8 +49,8 @@
  * @author Michael Röder (roeder@informatik.uni-leipzig.de)
  *
  */
-public abstract class TimeMeasuringAnnotatorDecorator extends AbstractAnnotatorDecorator
-        implements Evaluator<Marking>, TimeMeasurer {
+public abstract class TimeMeasuringAnnotatorDecorator extends AbstractAnnotatorDecorator implements Evaluator<Marking>,
+        TimeMeasurer {
 
     public static final String AVG_TIME_RESULT_NAME = "avg millis/doc";
 
@@ -70,6 +71,8 @@ public static TimeMeasuringAnnotatorDecorator createDecorator(ExperimentType typ
             return new TimeMeasuringOKETask1Annotator((OKETask1Annotator) annotator);
         case OKE_Task2:
             return new TimeMeasuringOKETask2Annotator((OKETask2Annotator) annotator);
+        case RT2KB:
+            return new TimeMeasuringRT2KBAnnotator((RT2KBAnnotator) annotator);
         case Rc2KB:
             break;
         case Sa2KB:
@@ -107,8 +110,8 @@ public List<MeaningSpan> performD2KBTask(Document document) throws GerbilExcepti
         }
     }
 
-    private static class TimeMeasuringEntityRecognizer extends TimeMeasuringAnnotatorDecorator
-            implements EntityRecognizer {
+    private static class TimeMeasuringEntityRecognizer extends TimeMeasuringAnnotatorDecorator implements
+            EntityRecognizer {
 
         public TimeMeasuringEntityRecognizer(EntityRecognizer decoratedAnnotator) {
             super(decoratedAnnotator);
@@ -155,8 +158,24 @@ public List<TypedSpan> performTyping(Document document) throws GerbilException {
         }
     }
 
-    private static class TimeMeasuringOKETask1Annotator extends TimeMeasuringA2KBAnnotator
-            implements OKETask1Annotator {
+    private static class TimeMeasuringRT2KBAnnotator extends TimeMeasuringEntityRecognizer implements RT2KBAnnotator {
+
+        protected TimeMeasuringRT2KBAnnotator(RT2KBAnnotator decoratedAnnotator) {
+            super(decoratedAnnotator);
+        }
+
+        @Override
+        public List<TypedSpan> performTyping(Document document) throws GerbilException {
+            return TimeMeasuringAnnotatorDecorator.performTyping(this, document);
+        }
+
+        @Override
+        public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+            return TimeMeasuringAnnotatorDecorator.performRT2KBTask(this, document);
+        }
+    }
+
+    private static class TimeMeasuringOKETask1Annotator extends TimeMeasuringA2KBAnnotator implements OKETask1Annotator {
 
         protected TimeMeasuringOKETask1Annotator(OKETask1Annotator decoratedAnnotator) {
             super(decoratedAnnotator);
@@ -167,14 +186,19 @@ public List<TypedSpan> performTyping(Document document) throws GerbilException {
             return TimeMeasuringAnnotatorDecorator.performTyping(this, document);
         }
 
+        @Override
+        public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+            return TimeMeasuringAnnotatorDecorator.performRT2KBTask(this, document);
+        }
+
         @Override
         public List<TypedNamedEntity> performTask1(Document document) throws GerbilException {
             return TimeMeasuringAnnotatorDecorator.performOKETask1(this, document);
         }
     }
 
-    private static class TimeMeasuringOKETask2Annotator extends TimeMeasuringAnnotatorDecorator
-            implements OKETask2Annotator {
+    private static class TimeMeasuringOKETask2Annotator extends TimeMeasuringAnnotatorDecorator implements
+            OKETask2Annotator {
 
         protected TimeMeasuringOKETask2Annotator(OKETask2Annotator decoratedAnnotator) {
             super(decoratedAnnotator);
@@ -204,8 +228,8 @@ protected static List<MeaningSpan> performD2KBTask(TimeMeasuringAnnotatorDecorat
         return result;
     }
 
-    protected static List<MeaningSpan> performExtraction(TimeMeasuringAnnotatorDecorator timeMeasurer,
-            Document document) throws GerbilException {
+    protected static List<MeaningSpan> performExtraction(TimeMeasuringAnnotatorDecorator timeMeasurer, Document document)
+            throws GerbilException {
         long startTime = System.currentTimeMillis();
         List<MeaningSpan> result = null;
         result = ((A2KBAnnotator) timeMeasurer.getDecoratedAnnotator()).performA2KBTask(document);
@@ -249,6 +273,15 @@ protected static List<TypedNamedEntity> performOKETask2(TimeMeasuringAnnotatorDe
         return result;
     }
 
+    protected static List<TypedSpan> performRT2KBTask(TimeMeasuringAnnotatorDecorator timeMeasurer, Document document)
+            throws GerbilException {
+        long startTime = System.currentTimeMillis();
+        List<TypedSpan> result = null;
+        result = ((RT2KBAnnotator) timeMeasurer.getDecoratedAnnotator()).performRT2KBTask(document);
+        timeMeasurer.addCallRuntime(System.currentTimeMillis() - startTime);
+        return result;
+    }
+
     protected long timeSum = 0;
 
     protected int callCount = 0;
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/fox/FOXAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/fox/FOXAnnotator.java
index fdfb5b2bd..faaca97c1 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/fox/FOXAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/fox/FOXAnnotator.java
@@ -107,6 +107,11 @@ public List<TypedNamedEntity> performTask1(Document document) throws GerbilExcep
         return requestAnnotations(document).getMarkings(TypedNamedEntity.class);
     }
 
+    @Override
+    public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+        return requestAnnotations(document).getMarkings(TypedSpan.class);
+    }
+
     protected Document requestAnnotations(Document document) throws GerbilException {
         Document resultDoc = new DocumentImpl(document.getText(), document.getDocumentURI());
         HttpEntity entity = new StringEntity(new JSONObject().put("input", document.getText()).put("type", "text")
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/fred/FredAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/fred/FredAnnotator.java
index d2f007540..7e111af43 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/fred/FredAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/fred/FredAnnotator.java
@@ -165,6 +165,11 @@ public List<TypedNamedEntity> performTask1(Document document) throws GerbilExcep
         return requestAnnotations(document).getMarkings(TypedNamedEntity.class);
     }
 
+    @Override
+    public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+        return requestAnnotations(document).getMarkings(TypedSpan.class);
+    }
+
     @SuppressWarnings("unchecked")
     protected static List transformToClass(List markings, Class clazz) {
         List markingsWithClass = new ArrayList();
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/instance/InstanceListBasedAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/instance/InstanceListBasedAnnotator.java
index 9313f3b85..da109723a 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/instance/InstanceListBasedAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/instance/InstanceListBasedAnnotator.java
@@ -25,26 +25,45 @@ public class InstanceListBasedAnnotator extends AbstractAnnotator implements A2KBAnnotator, C2KBAnnotator, D2KBAnnotator, EntityRecognizer, EntityTyper, OKETask1Annotator, OKETask2Annotator {
 
+    /*
+     * The mapping has been changed to contain the length since we encountered
+     * problems with some datasets containing a document URI more than once.
+     * Inside the NIF file this is not a problem because the length is added to
+     * the document URI. However, since we remove the positions from the URIs,
+     * we have to add the length in this class.
+     */
+    /**
+     * Mapping of URI + text.length() to the documents.
+     */
     protected Map<String, Document> uriInstanceMapping;
 
     public InstanceListBasedAnnotator(String annotatorName, List<Document> instances) {
         super(annotatorName);
         this.uriInstanceMapping = new HashMap<String, Document>(instances.size());
         for (Document document : instances) {
-            uriInstanceMapping.put(document.getDocumentURI(), document);
+            uriInstanceMapping.put(generateDocUri(document.getDocumentURI(), document.getText().length()), document);
         }
     }
 
-    protected Document getDocument(String uri) {
-        if (uriInstanceMapping.containsKey(uri)) {
-            return uriInstanceMapping.get(uri);
+    protected Document getDocument(String uri, int textLength) {
+        String mappingUri = generateDocUri(uri, textLength);
+        if (uriInstanceMapping.containsKey(mappingUri)) {
+            return uriInstanceMapping.get(mappingUri);
         } else {
             return null;
         }
     }
 
-    protected <T extends Marking> List<T> getDocumentMarkings(String uri, Class<T> clazz) {
-        Document result = this.getDocument(uri);
+    protected static String generateDocUri(String uri, int textLength) {
+        StringBuilder builder = new StringBuilder(uri.length() + 10);
+        builder.append(uri);
+        builder.append('_');
+        builder.append(textLength);
+        return builder.toString();
+    }
+
+    protected <T extends Marking> List<T> getDocumentMarkings(String uri, int textLength, Class<T> clazz) {
+        Document result = this.getDocument(uri, textLength);
         if (result == null) {
             return new ArrayList<T>(0);
         } else {
@@ -54,36 +73,41 @@ protected <T extends Marking> List<T> getDocumentMarkings(String uri, Class<T> c
 
     @Override
     public List<TypedNamedEntity> performTask2(Document document) throws GerbilException {
-        return getDocumentMarkings(document.getDocumentURI(), TypedNamedEntity.class);
+        return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), TypedNamedEntity.class);
     }
 
     @Override
     public List<TypedNamedEntity> performTask1(Document document) throws GerbilException {
-        return getDocumentMarkings(document.getDocumentURI(), TypedNamedEntity.class);
+        return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), TypedNamedEntity.class);
     }
 
     @Override
     public List<TypedSpan> performTyping(Document document) throws GerbilException {
-        return getDocumentMarkings(document.getDocumentURI(), TypedSpan.class);
+        return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), TypedSpan.class);
     }
 
     @Override
     public List<Span> performRecognition(Document document) throws GerbilException {
-        return getDocumentMarkings(document.getDocumentURI(), Span.class);
+        return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), Span.class);
     }
 
     @Override
     public List<MeaningSpan> performD2KBTask(Document document) throws GerbilException {
-        return getDocumentMarkings(document.getDocumentURI(), MeaningSpan.class);
+        return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), MeaningSpan.class);
     }
 
     @Override
     public List<Meaning> performC2KB(Document document) throws GerbilException {
-        return getDocumentMarkings(document.getDocumentURI(), Meaning.class);
+        return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), Meaning.class);
     }
 
     @Override
     public List<MeaningSpan> performA2KBTask(Document document) throws GerbilException {
-        return getDocumentMarkings(document.getDocumentURI(), MeaningSpan.class);
+        return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), MeaningSpan.class);
+    }
+
+    @Override
+    public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+        return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), TypedSpan.class);
     }
 }
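The effect of the new mapping key is easiest to see with two dataset documents that share a URI but have texts of different length (values made up for the illustration):

    // generateDocUri simply appends '_' and the text length to the document URI:
    String key1 = generateDocUri("http://dataset/doc1", 150); // "http://dataset/doc1_150"
    String key2 = generateDocUri("http://dataset/doc1", 97);  // "http://dataset/doc1_97"
    // The two documents no longer collide in uriInstanceMapping, mirroring NIF,
    // where the character positions are part of the document URI.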
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/nif/NIFBasedAnnotatorWebservice.java b/src/main/java/org/aksw/gerbil/annotator/impl/nif/NIFBasedAnnotatorWebservice.java
index cda71447a..b30be7c04 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/nif/NIFBasedAnnotatorWebservice.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/nif/NIFBasedAnnotatorWebservice.java
@@ -47,8 +47,8 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public class NIFBasedAnnotatorWebservice extends AbstractHttpBasedAnnotator
-        implements OKETask2Annotator, OKETask1Annotator, A2KBAnnotator, EntityTyper {
+public class NIFBasedAnnotatorWebservice extends AbstractHttpBasedAnnotator implements OKETask2Annotator,
+        OKETask1Annotator, A2KBAnnotator, EntityTyper {
 
     private static final Logger LOGGER = LoggerFactory.getLogger(NIFBasedAnnotatorWebservice.class);
 
@@ -104,6 +104,11 @@ public List<TypedNamedEntity> performTask2(Document document) throws GerbilExcep
         return performAnnotation(document, TypedNamedEntity.class);
     }
 
+    @Override
+    public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+        return performAnnotation(document, TypedSpan.class);
+    }
+
     protected <T extends Marking> List<T> performAnnotation(Document document, Class<T> resultClass)
             throws GerbilException {
         document = request(document);
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/spotlight/SpotlightAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/spotlight/SpotlightAnnotator.java
index 1c99a5d14..3e9cc5621 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/spotlight/SpotlightAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/spotlight/SpotlightAnnotator.java
@@ -33,12 +33,13 @@
 import org.aksw.gerbil.transfer.nif.Span;
 import org.aksw.gerbil.transfer.nif.TypedSpan;
 import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
+import org.apache.commons.collections.ListUtils;
 import org.apache.http.client.methods.HttpPost;
 import org.apache.http.client.methods.HttpUriRequest;
 import org.apache.http.impl.client.CloseableHttpClient;
 
-public class SpotlightAnnotator extends AbstractHttpBasedAnnotator
-        implements OKETask1Annotator, EntityRecognizer, D2KBAnnotator, A2KBAnnotator, EntityTyper {
+public class SpotlightAnnotator extends AbstractHttpBasedAnnotator implements OKETask1Annotator, EntityRecognizer,
+        D2KBAnnotator, A2KBAnnotator, EntityTyper {
 
     private static final String SERVICE_URL_PARAM_KEY = "org.aksw.gerbil.annotator.impl.spotlight.SpotlightAnnotator.ServieURL";
 
@@ -90,6 +91,17 @@ public List<TypedNamedEntity> performTask1(Document document) throws GerbilExcep
         return client.annotate(document);
     }
 
+    @SuppressWarnings("unchecked")
+    @Override
+    public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+        List list = client.annotate(document);
+        if (list != null) {
+            return (List<TypedSpan>) ListUtils.typedList(list, TypedSpan.class);
+        } else {
+            return null;
+        }
+    }
+
     protected HttpPost createPostRequest(String url) {
         return super.createPostRequest(url);
     }
@@ -98,7 +110,7 @@ protected HttpPost createPostRequest(String url) {
     protected void closeRequest(HttpUriRequest request) {
         super.closeRequest(request);
     }
-    
+
     @Override
     public CloseableHttpClient getClient() {
         return super.getClient();
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/xlisa/XLisaAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/xlisa/XLisaAnnotator.java
index 9c2413432..0b50c25f3 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/xlisa/XLisaAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/xlisa/XLisaAnnotator.java
@@ -9,7 +9,6 @@
 
 import org.aksw.gerbil.annotator.A2KBAnnotator;
 import org.aksw.gerbil.annotator.impl.AbstractAnnotator;
-import org.aksw.gerbil.config.GerbilConfiguration;
 import org.aksw.gerbil.datatypes.ErrorTypes;
 import org.aksw.gerbil.exceptions.GerbilException;
 import org.aksw.gerbil.transfer.nif.Document;
@@ -29,18 +28,14 @@ public class XLisaAnnotator extends AbstractAnnotator implements A2KBAnnotator {
 
     private static final String BASE_URI = "http://km.aifb.kit.edu/services/text-annotation/?";
 
-    private static final String XLISA_LANG_1 = "org.aksw.gerbil.annotators.definition.XLisa.lang1";
-    private static final String XLISA_LANG_2 = "org.aksw.gerbil.annotators.definition.XLisa.lang2";
-    private static final String XLISA_KB = "org.aksw.gerbil.annotators.definition.XLisa.kb";
-    private static final String XLISA_MODEL = "org.aksw.gerbil.annotators.definition.XLisa.model";
-
     private String lang1, lang2, kb, model;
 
-    public XLisaAnnotator() {
-        this.lang1 = GerbilConfiguration.getInstance().getString(XLISA_LANG_1);
-        this.lang2 = GerbilConfiguration.getInstance().getString(XLISA_LANG_2);
-        this.kb = GerbilConfiguration.getInstance().getString(XLISA_KB);
-        this.model = GerbilConfiguration.getInstance().getString(XLISA_MODEL);
+    public XLisaAnnotator(String lang1, String lang2, String kb, String model) {
+        this.lang1 = lang1;
+        this.lang2 = lang2;
+        this.kb = kb;
+        this.model = model;
     }
 
     @Override
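Since the configuration lookup was removed, the languages, knowledge base and model now have to be passed in by the caller. A usage sketch (the argument values are purely illustrative; the concrete values accepted by the xLisa service are not defined in this patch):

    XLisaAnnotator annotator = new XLisaAnnotator("en", "en", "dbpedia", "NGRAM");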
diff --git a/src/main/java/org/aksw/gerbil/dataset/check/impl/FileBasedCachingEntityCheckerManager.java b/src/main/java/org/aksw/gerbil/dataset/check/impl/FileBasedCachingEntityCheckerManager.java
index 2024a72bc..c3011d2ad 100644
--- a/src/main/java/org/aksw/gerbil/dataset/check/impl/FileBasedCachingEntityCheckerManager.java
+++ b/src/main/java/org/aksw/gerbil/dataset/check/impl/FileBasedCachingEntityCheckerManager.java
@@ -16,6 +16,7 @@
  */
 package org.aksw.gerbil.dataset.check.impl;
 
+import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
@@ -89,25 +90,22 @@ public static ObjectLongOpenHashMap<String> readCacheFile(File cacheFile) {
         if (!cacheFile.exists() || cacheFile.isDirectory()) {
             return null;
         }
-        FileInputStream fin = null;
-        ObjectInputStream oin = null;
+        ObjectInputStream ois = null;
         try {
-            fin = new FileInputStream(cacheFile);
-            oin = new ObjectInputStream(fin);
+            ois = new ObjectInputStream(new BufferedInputStream(new FileInputStream(cacheFile)));
             // first, read the number of URIs
-            int count = oin.readInt();
+            int count = ois.readInt();
             String uri;
             ObjectLongOpenHashMap<String> cache = new ObjectLongOpenHashMap<String>(2 * count);
             for (int i = 0; i < count; ++i) {
-                uri = (String) oin.readObject();
-                cache.put(uri, oin.readLong());
+                uri = (String) ois.readObject();
+                cache.put(uri, ois.readLong());
             }
             return cache;
         } catch (Exception e) {
             LOGGER.error("Exception while reading cache file.", e);
         } finally {
-            IOUtils.closeQuietly(oin);
-            IOUtils.closeQuietly(fin);
+            IOUtils.closeQuietly(ois);
         }
         return null;
     }
@@ -121,8 +119,8 @@ public static ObjectLongOpenHashMap<String> readCacheFile(File cacheFile) {
     protected File cacheFile;
     protected File tempCacheFile;
 
-    protected FileBasedCachingEntityCheckerManager(ObjectLongOpenHashMap<String> cache,
-            long cacheEntryLifetime, File cacheFile, File tempCacheFile) {
+    protected FileBasedCachingEntityCheckerManager(ObjectLongOpenHashMap<String> cache, long cacheEntryLifetime,
+            File cacheFile, File tempCacheFile) {
         this.cache = cache;
         this.cacheEntryLifetime = cacheEntryLifetime;
         this.cacheFile = cacheFile;
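readCacheFile above implies a simple serialization format: an int with the number of entries, followed by one (String URI, long timestamp) pair per entry. A writer-side sketch under that assumption (the class's real storing method is not part of this hunk; iteration uses the HPPC cursor API of ObjectLongOpenHashMap, and tempCacheFile is the field declared above):

    ObjectOutputStream oout = new ObjectOutputStream(
            new BufferedOutputStream(new FileOutputStream(tempCacheFile)));
    try {
        oout.writeInt(cache.size()); // first, the number of URIs
        for (ObjectLongCursor<String> entry : cache) {
            oout.writeObject(entry.key); // the URI ...
            oout.writeLong(entry.value); // ... and its cached timestamp
        }
    } finally {
        IOUtils.closeQuietly(oout);
    }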
+ // Exception: " + e.getLocalizedMessage()); } return false; } finally { diff --git a/src/main/java/org/aksw/gerbil/dataset/check/index/IndexBasedEntityChecker.java b/src/main/java/org/aksw/gerbil/dataset/check/index/IndexBasedEntityChecker.java new file mode 100644 index 000000000..91eaf878e --- /dev/null +++ b/src/main/java/org/aksw/gerbil/dataset/check/index/IndexBasedEntityChecker.java @@ -0,0 +1,67 @@ +package org.aksw.gerbil.dataset.check.index; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; + +import org.aksw.gerbil.dataset.check.EntityChecker; +import org.apache.commons.io.IOUtils; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class IndexBasedEntityChecker implements EntityChecker, Closeable { + + private static final Logger LOGGER = LoggerFactory.getLogger(IndexBasedEntityChecker.class); + + public static final String URI_FIELD_NAME = "URI"; + + public static IndexBasedEntityChecker create(String indexDirPath) { + Directory indexDirectory = null; + try { + indexDirectory = FSDirectory.open(new File(indexDirPath).toPath()); + IndexReader indexReader = DirectoryReader.open(indexDirectory); + IndexSearcher indexSearcher = new IndexSearcher(indexReader); + return new IndexBasedEntityChecker(indexSearcher, indexDirectory, indexReader); + } catch (IOException e) { + LOGGER.error("Exception while trying to open index for entity checking. Returning null.", e); + IOUtils.closeQuietly(indexDirectory); + return null; + } + } + + private IndexSearcher indexSearcher; + private Directory indexDirectory; + private IndexReader indexReader; + + protected IndexBasedEntityChecker(IndexSearcher indexSearcher, Directory indexDirectory, IndexReader indexReader) { + this.indexSearcher = indexSearcher; + this.indexDirectory = indexDirectory; + this.indexReader = indexReader; + } + + @Override + public boolean entityExists(String uri) { + TopDocs docs = null; + try { + TermQuery query = new TermQuery(new Term(URI_FIELD_NAME, uri)); + docs = indexSearcher.search(query, 1); + } catch (IOException e) { + LOGGER.error("Got an exception while searching for \"" + uri + "\" in the index. 
Returning false.", e); + } + return (docs != null) && (docs.totalHits > 0); + } + + public void close() throws IOException { + IOUtils.closeQuietly(indexReader); + IOUtils.closeQuietly(indexDirectory); + } + +} diff --git a/src/main/java/org/aksw/gerbil/dataset/check/index/Indexer.java b/src/main/java/org/aksw/gerbil/dataset/check/index/Indexer.java new file mode 100644 index 000000000..770f00d25 --- /dev/null +++ b/src/main/java/org/aksw/gerbil/dataset/check/index/Indexer.java @@ -0,0 +1,65 @@ +package org.aksw.gerbil.dataset.check.index; + +import java.io.File; +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class Indexer { + + private static final Logger LOGGER = LoggerFactory.getLogger(Indexer.class); + + public static Indexer create(String indexDirPath) { + Directory indexDirectory = null; + try { + indexDirectory = FSDirectory.open(new File(indexDirPath).toPath()); + IndexWriterConfig config = new IndexWriterConfig(); + config.setOpenMode(OpenMode.CREATE); + IndexWriter indexWriter = new IndexWriter(indexDirectory, config); + return new Indexer(indexDirectory, indexWriter); + } catch (IOException e) { + LOGGER.error("Exception while trying to create index writer for entity checking. Returning null.", e); + IOUtils.closeQuietly(indexDirectory); + return null; + } + } + + private IndexWriter indexWriter; + private Directory indexDirectory; + + protected Indexer(Directory dir, IndexWriter writer) { + this.indexWriter = writer; + this.indexDirectory = dir; + } + + public void close() { + try { + indexWriter.commit(); + } catch (IOException e) { + LOGGER.error("Error occured during final commit of Index Writer.", e); + } + IOUtils.closeQuietly(indexWriter); + IOUtils.closeQuietly(indexDirectory); + } + + public void index(String uri) { + Document document = new Document(); + document.add(new StringField(IndexBasedEntityChecker.URI_FIELD_NAME, uri, Field.Store.NO)); + try { + indexWriter.addDocument(document); + } catch (IOException e) { + LOGGER.error("Couldn't write uri to index.", e); + e.printStackTrace(); + } + } +} diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java index 8f4aaeb92..77eb74dfb 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java @@ -7,7 +7,9 @@ import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.aksw.gerbil.dataset.InitializableDataset; import org.aksw.gerbil.dataset.impl.AbstractDataset; @@ -16,48 +18,44 @@ import org.aksw.gerbil.transfer.nif.Document; import org.aksw.gerbil.transfer.nif.Marking; import org.aksw.gerbil.transfer.nif.data.DocumentImpl; -import org.aksw.gerbil.transfer.nif.data.NamedEntity; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; import org.apache.commons.io.IOUtils; -import au.com.bytecode.opencsv.CSVReader; 
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java
index 8f4aaeb92..77eb74dfb 100644
--- a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java
@@ -7,7 +7,9 @@
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.aksw.gerbil.dataset.InitializableDataset;
 import org.aksw.gerbil.dataset.impl.AbstractDataset;
@@ -16,48 +18,44 @@
 import org.aksw.gerbil.transfer.nif.Document;
 import org.aksw.gerbil.transfer.nif.Marking;
 import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
-import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
 import org.apache.commons.io.IOUtils;
 
-import au.com.bytecode.opencsv.CSVReader;
+public class DerczynskiDataset extends AbstractDataset implements
+        InitializableDataset {
 
-public class DerczynskiDataset extends AbstractDataset implements InitializableDataset {
-
-    private static final char SEPARATION_CHAR = '\t';
     private static StringBuilder realTweet;
     private String file;
-    private List<Document> documents;
-    private int firstDocId;
-    private int lastDocId;
-
-    public DerczynskiDataset(String file) {
-        this.file = file;
-    }
-
-
-
-    @Override
-    public int size() {
-        return documents.size();
-    }
-
-    @Override
-    public List<Document> getInstances() {
-        return documents;
-    }
-
-    @Override
-    public void init() throws GerbilException {
-        this.documents = loadDocuments(new File(file));
-        if ((firstDocId > 0) && (lastDocId > 0)) {
-            this.documents = this.documents.subList(firstDocId - 1, lastDocId);
-        }
-    }
+    private List<Document> documents;
+    private int firstDocId;
+    private int lastDocId;
+
+    public DerczynskiDataset(String file) {
+        this.file = file;
+    }
+
+    @Override
+    public int size() {
+        return documents.size();
+    }
+
+    @Override
+    public List<Document> getInstances() {
+        return documents;
+    }
+
+    @Override
+    public void init() throws GerbilException {
+        this.documents = loadDocuments(new File(file));
+        if ((firstDocId > 0) && (lastDocId > 0)) {
+            this.documents = this.documents.subList(firstDocId - 1, lastDocId);
+        }
+    }
 
     protected List<Document> loadDocuments(File tweetsFile) throws GerbilException {
         BufferedReader reader = null;
-//        CSVReader reader = null;
+        // CSVReader reader = null;
         List<Document> documents = new ArrayList<Document>();
         String documentUriPrefix = "http://" + getName() + "/";
         try {
@@ -65,23 +63,23 @@ protected List<Document> loadDocuments(File tweetsFile)
                     new FileInputStream(tweetsFile), Charset.forName("UTF-8")));
             String line = reader.readLine();
-            int tweetIndex=0;
+            int tweetIndex = 0;
             List<Marking> markings = new ArrayList<Marking>();
             StringBuilder tweet = new StringBuilder("");
             while (line != null) {
-                if(line.trim().isEmpty()){
-                    //Get Markings
+                if (line.trim().isEmpty()) {
+                    // Get Markings
                     markings = findMarkings(tweet.toString());
-                    //Save old tweet
-                    documents.add(new DocumentImpl(realTweet.toString(), documentUriPrefix
-                            + tweetIndex, markings));
-                    //New Tweet
+                    // Save old tweet
+                    documents.add(new DocumentImpl(realTweet.toString(),
+                            documentUriPrefix + tweetIndex, markings));
+                    // New Tweet
                     tweet.delete(0, tweet.length());
                     line = reader.readLine();
                     tweetIndex++;
                     continue;
                 }
-                tweet.append(line+"\n");
+                tweet.append(line + "\n");
                 line = reader.readLine();
             }
         } catch (IOException e) {
@@ -89,29 +87,79 @@ protected List<Document> loadDocuments(File tweetsFile)
                     ErrorTypes.DATASET_LOADING_ERROR);
         } finally {
             IOUtils.closeQuietly(reader);
-//            IOUtils.closeQuietly(bReader);
+            // IOUtils.closeQuietly(bReader);
         }
         return documents;
     }
-
-    public static List<Marking> findMarkings(String tweet){
-        int start=0;
+
+    public static List<Marking> findMarkings(String tweet) {
+        int start = 0;
         List<Marking> markings = new ArrayList<Marking>();
         realTweet = new StringBuilder();
         String[] line = tweet.split("\n");
-        for(String tokenFull : line){
+        int i = 0;
+        for (String tokenFull : line) {
             String[] token = tokenFull.split("\t+");
-            realTweet.append(token[0]+" ");
-            token[1]=token[1].trim();
-            if(!token[1].trim().equals("O") && !token[1].trim().equals("NIL")){
-                //TOken has URI
-                markings.add(new NamedEntity(start, token[0].length(), token[1]));
+            realTweet.append(token[0] + " ");
+            token[1] = token[1].trim();
+            if (token.length > 2 && token[2].startsWith("B-")) {
+                String[] marking = getWholeMarking(line, i);
+                Set<String> types = new HashSet<String>();
+                types.add(marking[2]);
+                markings.add(new TypedNamedEntity(start, marking[0].length(),
+                        marking[1], types));
             }
-            start+=token[0].length()+1;
+            start += token[0].length() + 1;
+            i++;
         }
-
+
         return markings;
     }
 
+    private static String[] getWholeMarking(String[] line, int pos) {
+        String[] ret = new String[3];
+        String[] token = line[pos].split("\t+");
+        StringBuilder name = new StringBuilder().append(token[0]);
+        if (!token[1].equals("O") && !token[1].equals("") && !token[1].equals("NIL"))
+            ret[1] = token[1];
+        else
+            ret[1] = "";
+        ret[2] = getType(token[2].substring(2));
+        for (int i = pos + 1; i < line.length; i++) {
+            token = line[i].split("\t+");
+
+            if (token.length > 2 && token[2].startsWith("I-")) {
+                name.append(" ").append(token[0]);
+            } else {
+                break;
+            }
+        }
+        ret[0] = name.toString();
+        return ret;
+    }
+
+    private static String getType(String type) {
+        switch (type) {
+        case "sportsteam":
+            return "http://dbpedia.org/ontology/SportsTeam";
+        case "person":
+            return "http://dbpedia.org/ontology/Person";
+        case "geo-loc":
+            return "http://dbpedia.org/ontology/Place";
+        case "facility":
+            return "http://dbpedia.org/ontology/Place";
+        case "movie":
+            return "http://dbpedia.org/ontology/Film";
+        case "tv-show":
+            return "http://dbpedia.org/ontology/TelevisionShow";
+        case "company":
+            return "http://dbpedia.org/ontology/company";
+        case "product":
+            return "http://dbpedia.org/ontology/product";
+        default:
+            return "";
+        }
+    }
+}
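The loader expects one token per line in the form token<TAB>URI-or-O/NIL<TAB>BIO-type, with a B- tag opening an entity and I- tags continuing it. A made-up two-line tweet illustrates what findMarkings produces:

    // Two tokens forming one "person" entity (example data only):
    List<Marking> markings = DerczynskiDataset.findMarkings(
            "Ed\thttp://dbpedia.org/resource/Ed_Miliband\tB-person\n"
            + "Miliband\thttp://dbpedia.org/resource/Ed_Miliband\tI-person\n");
    // -> one TypedNamedEntity covering "Ed Miliband" (start 0, length 11),
    //    typed as http://dbpedia.org/ontology/Person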
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset.java
new file mode 100644
index 000000000..081de0595
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset.java
@@ -0,0 +1,161 @@
+/**
+ * This file is part of General Entity Annotator Benchmark.
+ *
+ * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with General Entity Annotator Benchmark.  If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.aksw.gerbil.dataset.impl.erd;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.RandomAccessFile;
+
+import java.nio.charset.Charset;
+import java.nio.file.Paths;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.aksw.gerbil.dataset.InitializableDataset;
+import org.aksw.gerbil.dataset.impl.AbstractDataset;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+
+import org.apache.commons.io.IOUtils;
+
+@Deprecated
+public class ERDDataset extends AbstractDataset implements InitializableDataset {
+
+    private static final String FREEBASE_URI = "https://www.googleapis.com/freebase";
+
+    private String file_text;
+    private String file_annotation;
+    private List<Document> documents;
+
+    public ERDDataset(String filetext, String fileannotation) {
+        this.file_text = filetext;
+        this.file_annotation = fileannotation;
+    }
+
+    @Override
+    public int size() {
+        return documents.size();
+    }
+
+    @Override
+    public List<Document> getInstances() {
+        return documents;
+    }
+
+    @Override
+    public void init() throws GerbilException {
+        this.documents = loadDocuments(new File(file_text), new File(file_annotation));
+    }
+
+    private String generateDocumentUri(String fileName) {
+
+        StringBuilder builder = new StringBuilder();
+        builder.append("http://");
+        builder.append(name);
+        builder.append('/');
+        builder.append(Paths.get(fileName).getFileName().toString());
+
+        return builder.toString();
+
+    }
+
+    protected List<Document> loadDocuments(File textfile, File annotationfile) throws GerbilException {
+
+        if (!textfile.exists()) {
+            throw new GerbilException("The given text file (" + textfile.getAbsolutePath() + ") does not exist.", ErrorTypes.DATASET_LOADING_ERROR);
+        }
+        if (!annotationfile.exists()) {
+            throw new GerbilException("The given annotation file (" + annotationfile.getAbsolutePath() + ") does not exist.", ErrorTypes.DATASET_LOADING_ERROR);
+        }
+
+        List<Document> docs = new ArrayList<>();
+        String documentUri = generateDocumentUri(textfile.getAbsolutePath());
+
+        Map<String, ERDTrec> textMap = new HashMap<>();
+        String text_data = "";
+        byte[] filedata = new byte[(int) textfile.length()];
+        ERDTrec datatrec = null;
+        RandomAccessFile raf;
+
+        try {
+            raf = new RandomAccessFile(textfile, "r");
+            raf.seek(0);
+            raf.readFully(filedata);
+            text_data = new String(filedata);
+            raf.close();
+        } catch (IOException e) {
+            throw new GerbilException("Exception while reading text file of dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
+        }
+
+        int error = 0;
+        String[] text_split = text_data.split("\n");
+        for (String line : text_split) {
+            String[] line_part = line.split("\t");
+            String key;
+
+            if (line_part.length != 2) {
+                error++;
+                key = "ERROR " + error;
+            } else {
+                key = line_part[0];
+            }
+
+            datatrec = new ERDTrec(line, datatrec);
+            textMap.put(key, datatrec);
+        }
+
+        BufferedReader reader = null;
+        List<Marking> markings = new ArrayList<>();
+        String line;
+        try {
+            reader = new BufferedReader(new InputStreamReader(new FileInputStream(annotationfile), Charset.forName("UTF-8")));
+
+            while ((line = reader.readLine()) != null) {
+
+                String[] line_split = line.split("\t");
+                if (line_split.length != 5) continue;
+
+                datatrec = textMap.get(line_split[0]);
+                if (datatrec != null) {
+                    int position = datatrec.getTextPosition(line_split[3]);
+                    int length = line_split[3].length();
+                    markings.add(new NamedEntity(position, length, FREEBASE_URI + line_split[2]));
+                }
+            }
+
+        } catch (IOException e) {
+            throw new GerbilException("Exception while reading annotation file of dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
+        } finally {
+            IOUtils.closeQuietly(reader);
+        }
+
+        docs.add(new DocumentImpl(text_data, documentUri, markings));
+
+        return docs;
+    }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset2.java b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset2.java
new file mode 100644
index 000000000..42d7f1222
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset2.java
@@ -0,0 +1,126 @@
+package org.aksw.gerbil.dataset.impl.erd;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.aksw.gerbil.dataset.InitializableDataset;
+import org.aksw.gerbil.dataset.impl.AbstractDataset;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+
+import com.hp.hpl.jena.query.Query;
+import com.hp.hpl.jena.query.QueryExecution;
+import com.hp.hpl.jena.query.QueryExecutionFactory;
+import com.hp.hpl.jena.query.QueryFactory;
+
+public class ERDDataset2 extends AbstractDataset implements
+        InitializableDataset {
+
+    private List<Document> documents;
+    private String annotateFile;
+    private String textFile;
+
+    private String queryTemp = "PREFIX owl: <http://www.w3.org/2002/07/owl#> PREFIX freebase: <http://rdf.freebase.com/ns/> SELECT ?s WHERE {?s owl:sameAs freebase:%%v%%}";
+    private static final String DBPEDIA_SERVICE = "http://dbpedia.org/sparql";
+
+
+    public ERDDataset2(String textFile, String annotateFile) {
+        this.annotateFile = annotateFile;
+        this.textFile = textFile;
+    }
+
+    @Override
+    public int size() {
+        return documents.size();
+    }
+
+    @Override
+    public List<Document> getInstances() {
+        return documents;
+    }
+
+    @Override
+    public void init() throws GerbilException {
+        this.documents = loadDocuments(new File(annotateFile), new File(
+                textFile));
+    }
+
+    private List<Document> loadDocuments(File annFile, File textFile) throws GerbilException {
+        List<Document> documents = new ArrayList<Document>();
+        String documentUriPrefix = "http://" + getName() + "/";
+        try (BufferedReader breader = new BufferedReader(new InputStreamReader(
+                new FileInputStream(textFile), Charset.forName("UTF-8")))) {
+            String line;
+            List<Marking> markings = null;
+            while ((line = breader.readLine()) != null) {
+                if (line.isEmpty()) {
+                    continue;
+                }
+                String[] text = line.split("\t");
+
+                markings = findMarkings(text, annFile);
+                documents.add(new DocumentImpl(text[1], documentUriPrefix
+                        + text[0], markings));
+            }
+        } catch (IOException e) {
+            throw new GerbilException("Exception while reading dataset.", e,
+                    ErrorTypes.DATASET_LOADING_ERROR);
+        }
+
+        return documents;
+    }
+
+    private List<Marking> findMarkings(String[] text, File annFile) throws GerbilException {
+        List<Marking> markings = new ArrayList<Marking>();
+        try (BufferedReader breader = new BufferedReader(new InputStreamReader(
+                new FileInputStream(annFile), Charset.forName("UTF-8")))) {
+            String line;
+
+            while ((line = breader.readLine()) != null) {
+                if (line.isEmpty()) {
+                    continue;
+                }
+                String[] annotation = line.split("\t");
+                int searchID = getTrecID(text[0]);
+                int annoID = getTrecID(annotation[0]);
+                if (searchID == annoID) {
+                    int start = text[1].indexOf(annotation[3]);
+                    int length = annotation[3].length();
+
+                    // FIXME time consuming!
+                    String freebaseID = annotation[2].substring(1, annotation[2].length()).replace("/", ".");
+                    Query query = QueryFactory.create(queryTemp.replace("%%v%%", freebaseID));
+                    QueryExecution qexec = QueryExecutionFactory.createServiceRequest(DBPEDIA_SERVICE, query);
+                    String uri = qexec.execSelect().next().getResource("s").getURI();
+
+                    markings.add(new NamedEntity(start, length, uri));
+                } else if (annoID > searchID) {
+                    // There is no annotation for the given text
+                    break;
+                }
+            }
+        } catch (IOException e) {
+            throw new GerbilException("Exception while reading dataset.", e,
+                    ErrorTypes.DATASET_LOADING_ERROR);
+        }
+
+        return markings;
+    }
+
+    private int getTrecID(String trec) {
+        return Integer.valueOf(trec.replace("TREC-", ""));
+    }
+
+}
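The annotation files store Freebase MIDs such as /m/02mjmr; the substring/replace above turns them into the local-name form used by DBpedia's owl:sameAs links before the SPARQL lookup. A worked example (MID chosen for illustration):

    String mid = "/m/02mjmr";                               // column 2 of the annotation file
    String freebaseID = mid.substring(1).replace("/", "."); // -> "m.02mjmr"
    // queryTemp then becomes:
    //   SELECT ?s WHERE { ?s owl:sameAs freebase:m.02mjmr }
    // and is sent to http://dbpedia.org/sparql to obtain the DBpedia URI.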
breader.readLine()) != null) { + if(line.isEmpty()){ + continue; + } + String[] annotation = line.split("\t"); + int searchID = getTrecID(text[0]); + int annoID = getTrecID(annotation[0]); + if(searchID == annoID){ + int start = text[1].indexOf(annotation[3]); + int length = annotation[3].length(); + + //FIXME time consuming! + String freebaseID = annotation[2].substring(1, annotation[2].length()).replace("/","."); + Query query = QueryFactory.create(queryTemp.replace("%%v%%", freebaseID)); + QueryExecution qexec = QueryExecutionFactory.createServiceRequest(DBPEDIA_SERVICE, query); + String uri = qexec.execSelect().next().getResource("s").getURI(); + + + markings.add(new NamedEntity(start, length, uri)); + } + else if(annoID > searchID){ + //There is no annotation for the given text + break; + } + } + } catch (IOException e) { + throw new GerbilException("Exception while reading dataset.", e, + ErrorTypes.DATASET_LOADING_ERROR); + } + + return markings; + } + + private int getTrecID(String trec){ + return Integer.valueOf(trec.replace("TREC-", "")); + } + +} diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDTrec.java b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDTrec.java new file mode 100644 index 000000000..394b87550 --- /dev/null +++ b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDTrec.java @@ -0,0 +1,56 @@ +/** + * This file is part of General Entity Annotator Benchmark. + * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with General Entity Annotator Benchmark. If not, see . + */ +package org.aksw.gerbil.dataset.impl.erd; + +public class ERDTrec { + + private String line; + private ERDTrec befor; + private int line_number; + private int count_column; + + public ERDTrec(String line, ERDTrec befor) { + this.line = line; + this.befor = befor; + + if (befor == null) { + this.line_number = 0; + this.count_column = 0; + } else { + line_number = this.befor.getLineNumber() + 1; + count_column = this.befor.getColumnCount() + 1; + } + } + + public int getTextPosition(String text) { + int pos = line.indexOf(text); + if (pos > 0) pos = count_column + pos; + return pos; + } + + protected String getLine(){ + return this.line; + } + + protected int getLineNumber(){ + return this.line_number; + } + + protected int getColumnCount(){ + return this.count_column + line.length(); + } +} diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDataset.java new file mode 100644 index 000000000..3f658dcf0 --- /dev/null +++ b/src/main/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDataset.java @@ -0,0 +1,179 @@ +/** + * This file is part of General Entity Annotator Benchmark. 
+ * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>. + */ +package org.aksw.gerbil.dataset.impl.gerdaq; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.aksw.gerbil.dataset.InitializableDataset; +import org.aksw.gerbil.dataset.impl.AbstractDataset; +import org.aksw.gerbil.datatypes.ErrorTypes; +import org.aksw.gerbil.exceptions.GerbilException; +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.DocumentImpl; +import org.aksw.gerbil.transfer.nif.data.NamedEntity; +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public class GERDAQDataset extends AbstractDataset implements InitializableDataset { + + private static final Logger LOGGER = LoggerFactory.getLogger(GERDAQDataset.class); + + private static final String WIKIPEDIA_URI = "http://en.wikipedia.org/wiki/"; + private static final String DBPEDIA_URI = "http://dbpedia.org/resource/"; + private static final String ANNOTATION_TAG = "annotation"; + private static final String DOCUMENT_TAG = "instance"; + + private String file; + private List<Document> documents; + + public GERDAQDataset(String file) { + this.file = file; + } + + @Override + public int size() { + return documents.size(); + } + + @Override + public List<Document> getInstances() { + return documents; + } + + @Override + public void init() throws GerbilException { + this.documents = loadDocuments(new File(file)); + } + + protected static String generateDocumentUri(String datasetName, String fileName) { + StringBuilder builder = new StringBuilder(); + builder.append("http://"); + builder.append(datasetName.replace(' ', '_')); + builder.append('/'); + builder.append(fileName); + builder.append('_'); + return builder.toString(); + } + + private List<Document> loadDocuments(File filePath) throws GerbilException { + List<Document> docs = new ArrayList<>(); + if (!filePath.exists()) { + throw new GerbilException("The given file (" + filePath.getAbsolutePath() + ") does not exist.", + ErrorTypes.DATASET_LOADING_ERROR); + } + + if (filePath.isDirectory()) { + + String directoryPath = filePath.getAbsolutePath(); + if (!directoryPath.endsWith(File.separator)) { + directoryPath = directoryPath + File.separator; + } + + for (File tmpFile : new File(directoryPath).listFiles()) { + docs.addAll(createDocument(tmpFile)); + } + + } else { + docs.addAll(createDocument(filePath)); + } + + return docs; + + } + + private List<Document>
createDocument(File file) throws GerbilException { + List<Document> documents = new ArrayList<>(); + String documentUriStart = generateDocumentUri(name, file.getName()); + InputStream inputStream = null; + InputSource is = null; + try { + inputStream = new BufferedInputStream(new FileInputStream(file)); + is = new InputSource(inputStream); + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser saxParser = factory.newSAXParser(); + + saxParser.parse(is, new DefaultHandler() { + + private StringBuilder text = new StringBuilder(); + private int markingStart; + private String markingTitle; + private List<Marking> markings; + + @Override + public void startDocument() throws SAXException { + super.startDocument(); + } + + @Override + public void startElement(String namespaceURI, String localName, String qName, Attributes atts) + throws SAXException { + + if (qName.equals(ANNOTATION_TAG)) { + markingTitle = atts.getValue("rank_0_title"); + if (markingTitle != null) { + markingStart = text.length(); + markingTitle = markingTitle.replace(' ', '_'); + } else { + // keep markingTitle null so that endElement skips this marking instead of throwing a NullPointerException + LOGGER.error("Found a marking without the necessary \"rank_0_title\" attribute."); + } + } else if (qName.equals(DOCUMENT_TAG)) { + this.markings = new ArrayList<>(); + } + } + + @Override + public void characters(char[] ch, int start, int length) { + text.append(ch, start, length); + } + + @Override + public void endElement(String namespaceURI, String localName, String qName) throws SAXException { + if (qName.equals(DOCUMENT_TAG)) { + documents.add(new DocumentImpl(text.toString(), documentUriStart + documents.size(), markings)); + text.delete(0, text.length()); + } else if (qName.equals(ANNOTATION_TAG) && (markingTitle != null)) { + markings.add(new NamedEntity(markingStart, text.length() - markingStart, new HashSet<String>( + Arrays.asList(DBPEDIA_URI + markingTitle, WIKIPEDIA_URI + markingTitle)))); + } + } + }); + } catch (Exception e) { + throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR); + } finally { + IOUtils.closeQuietly(inputStream); + } + + return documents; + } + +} diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/iitb/IITB_XMLParser.java b/src/main/java/org/aksw/gerbil/dataset/impl/iitb/IITB_XMLParser.java index a7e12a660..b770b3f13 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/iitb/IITB_XMLParser.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/iitb/IITB_XMLParser.java @@ -16,6 +16,7 @@ */ package org.aksw.gerbil.dataset.impl.iitb; +import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -30,6 +31,7 @@ import org.aksw.gerbil.exceptions.GerbilException; import org.apache.commons.io.IOUtils; import org.apache.xerces.jaxp.SAXParserFactoryImpl; +import org.xml.sax.InputSource; import org.xml.sax.SAXException; public class IITB_XMLParser { @@ -46,18 +48,20 @@ public IITB_XMLParser() throws GerbilException { } public Map<String, Set<NamedEntity>> parseAnnotationsFile(File file) throws IOException, SAXException { - FileInputStream fin = null; + InputStream is = null; try { - fin = new FileInputStream(file); - return parseAnnotationsStream(fin); + is = new BufferedInputStream(new FileInputStream(file)); + return parseAnnotationsStream(is); } finally { - IOUtils.closeQuietly(fin); + IOUtils.closeQuietly(is); } } public Map<String, Set<NamedEntity>> parseAnnotationsStream(InputStream is) throws IOException, SAXException { IITB_XMLHandler handler = new IITB_XMLHandler(); - parser.parse(is, handler); + InputSource is2 = new InputSource(is); +
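// NOTE: fixing the encoding on the InputSource keeps the SAX parser from falling back to the platform default charset, which can corrupt non-ASCII characters in the annotation files. +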
is2.setEncoding("UTF-8"); + parser.parse(is2, handler); return handler.getDocumentAnnotationsMap(); } } diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2013Dataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2013Dataset.java index 0e4aede9e..7a2d456a4 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2013Dataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2013Dataset.java @@ -92,7 +92,7 @@ protected List<Document> loadDocuments(File tweetsFile) BufferedReader bReader = null; CSVReader reader = null; List<Document> documents = new ArrayList<Document>(); - String documentUriPrefix = "http//:" + getName() + "/"; + String documentUriPrefix = "http://" + getName() + "/"; try { bReader = new BufferedReader(new InputStreamReader( new FileInputStream(tweetsFile), Charset.forName("UTF-8"))); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2014Dataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2014Dataset.java index c0f13e3a8..6e92e5d5b 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2014Dataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2014Dataset.java @@ -81,7 +81,7 @@ protected List<Document> loadDocuments(File tweetsFile) throws GerbilException { BufferedReader bReader = null; CSVReader reader = null; List<Document> documents = new ArrayList<Document>(); - String documentUriPrefix = "http//:" + getName() + "/"; + String documentUriPrefix = "http://" + getName() + "/"; try { bReader = new BufferedReader( new InputStreamReader(new FileInputStream(tweetsFile), Charset.forName("UTF-8"))); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2015Dataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2015Dataset.java new file mode 100644 index 000000000..16dcf0d21 --- /dev/null +++ b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2015Dataset.java @@ -0,0 +1,152 @@ +package org.aksw.gerbil.dataset.impl.micro; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.aksw.gerbil.dataset.InitializableDataset; +import org.aksw.gerbil.dataset.impl.AbstractDataset; +import org.aksw.gerbil.datatypes.ErrorTypes; +import org.aksw.gerbil.exceptions.GerbilException; +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.DocumentImpl; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class Microposts2015Dataset extends AbstractDataset implements InitializableDataset { + + private static final Logger LOGGER = LoggerFactory.getLogger(Microposts2015Dataset.class); + + protected List<Document> documents; + private String annotatedFile; + private String tweetsFile; + + protected static int typeIndex = 4; + + public Microposts2015Dataset(String annotatedFile, String tweetsFile) { + this.annotatedFile = annotatedFile; + this.tweetsFile = tweetsFile; + } + + @Override + public int size() { + return documents.size(); + } + + @Override + public List<Document> getInstances() { + return documents; + } + + @Override + public void init() throws GerbilException { + this.documents = loadDocuments(new File(annotatedFile), new File(tweetsFile)); + } + + protected
List<Document> loadDocuments(File annotations, File tweetsFile) + throws GerbilException { + + List<Document> documents = new ArrayList<>(); + String documentUriPrefix = "http://" + getName() + "/"; + + try (BufferedReader bReader = new BufferedReader(new InputStreamReader( + new FileInputStream(tweetsFile), Charset.forName("UTF-8")))) { + String line; + List<Marking> markings; + while ((line = bReader.readLine()) != null) { + String[] tweet = line.split("\t"); + if (tweet.length < 2) { + continue; + } + String id = tweet[0]; + String text = tweet[1]; + markings = findMarkings(getMarkingLines(annotations, id), text); + documents.add(new DocumentImpl(text, documentUriPrefix + id, + markings)); + } + } catch (IOException e) { + throw new GerbilException("Exception while reading dataset.", e, + ErrorTypes.DATASET_LOADING_ERROR); + } + + return documents; + } + + protected static List<Marking> findMarkings(Set<String> lines, String text) { + List<Marking> markings = new ArrayList<>(); + + for (String line : lines) { + String[] annotation = line.split("\t"); + + int start = Integer.parseInt(annotation[1]); + int end = Integer.parseInt(annotation[2]); + int length = end - start; + String uri = annotation[3]; + // NIL entries mark emerging entities without a KB URI + if (uri.startsWith("NIL")) { + uri = ""; + } + Set<String> types = new HashSet<>(); + types.add(getTypeURI(annotation[typeIndex])); + + markings.add(new TypedNamedEntity(start, length, uri, types)); + + } + + return markings; + } + + private static Set<String> getMarkingLines(File annotations, String id) { + Set<String> lines = new HashSet<>(); + + try (BufferedReader bReader = new BufferedReader(new InputStreamReader( + new FileInputStream(annotations), Charset.forName("UTF-8")))) { + String line; + boolean annotationSeen = false; + while ((line = bReader.readLine()) != null) { + String[] annotation = line.split("\t"); + if (id.equals(annotation[0])) { + annotationSeen = true; + lines.add(line); + } else if (annotationSeen) { + // the annotations are ordered by id, so all lines for this id have been collected + return lines; + } + } + + } catch (IOException e) { + LOGGER.error("Could not find markings.", e); + } + return lines; + } + + protected static String getTypeURI(String type) { + switch (type.toLowerCase()) { + case "thing": + return "http://dbpedia.org/ontology/Thing"; + case "person": + return "http://dbpedia.org/ontology/Person"; + case "organization": + return "http://dbpedia.org/ontology/Organisation"; + case "location": + return "http://dbpedia.org/ontology/Place"; + case "event": + return "http://dbpedia.org/ontology/Event"; + case "product": + return "http://dbpedia.org/ontology/Product"; + } + return ""; + } +} diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2016Dataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2016Dataset.java new file mode 100644 index 000000000..adf7434eb --- /dev/null +++ b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2016Dataset.java @@ -0,0 +1,172 @@ +/** + * This file is part of General Entity Annotator Benchmark. + * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>. + */ +package org.aksw.gerbil.dataset.impl.micro; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.aksw.gerbil.dataset.InitializableDataset; +import org.aksw.gerbil.dataset.impl.AbstractDataset; +import org.aksw.gerbil.datatypes.ErrorTypes; +import org.aksw.gerbil.exceptions.GerbilException; +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.DocumentImpl; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author Giuseppe Rizzo (giuse.rizzo@gmail.com) + * @author Michael Röder (roeder@informatik.uni-leipzig.de) + */ +public class Microposts2016Dataset extends AbstractDataset implements InitializableDataset { + + private static final Logger LOGGER = LoggerFactory.getLogger(Microposts2016Dataset.class); + + protected List<Document> documents; + private String annotatedFile; + private String tweetsFile; + + protected static int typeIndex = 5; + + public Microposts2016Dataset(String annotatedFile, String tweetsFile) { + this.annotatedFile = annotatedFile; + this.tweetsFile = tweetsFile; + } + + @Override + public int size() { + return documents.size(); + } + + @Override + public List<Document> getInstances() { + return documents; + } + + @Override + public void init() throws GerbilException { + this.documents = loadDocuments(new File(annotatedFile), new File(tweetsFile)); + } + + protected List<Document> loadDocuments(File annotations, File tweetsFile) + throws GerbilException { + + List<Document> documents = new ArrayList<>(); + String documentUriPrefix = "http://" + getName() + "/"; + + try (BufferedReader bReader = new BufferedReader(new InputStreamReader( + new FileInputStream(tweetsFile), Charset.forName("UTF-8")))) { + String line; + List<Marking> markings; + while ((line =
bReader.readLine()) != null) { + String[] tweet = line.split("\t"); + if (tweet.length < 2) { + continue; + } + String id = tweet[0]; + String text = tweet[1]; + markings = findMarkings(getMarkingLines(annotations, id), text); + documents.add(new DocumentImpl(text, documentUriPrefix + id, + markings)); + } + } catch (IOException e) { + throw new GerbilException("Exception while reading dataset.", e, + ErrorTypes.DATASET_LOADING_ERROR); + } + + return documents; + } + + protected static List<Marking> findMarkings(Set<String> lines, String text) { + List<Marking> markings = new ArrayList<>(); + + for (String line : lines) { + String[] annotation = line.split("\t"); + + int start = Integer.parseInt(annotation[1]); + int end = Integer.parseInt(annotation[2]); + int length = end - start; + String uri = annotation[3]; + // NIL entries mark emerging entities without a KB URI + if (uri.startsWith("NIL")) { + uri = ""; + } + Set<String> types = new HashSet<>(); + types.add(getTypeURI(annotation[typeIndex])); + + markings.add(new TypedNamedEntity(start, length, uri, types)); + + } + + return markings; + } + + private static Set<String> getMarkingLines(File annotations, String id) { + Set<String> lines = new HashSet<>(); + + try (BufferedReader bReader = new BufferedReader( + new InputStreamReader(new FileInputStream(annotations), Charset.forName("UTF-8")))) { + String line; + boolean annotationSeen = false; + while ((line = bReader.readLine()) != null) { + String[] annotation = line.split("\t"); + if (id.equals(annotation[0])) { + annotationSeen = true; + lines.add(line); + } else if (annotationSeen) { + // the annotations are ordered by id, so all lines for this id have been collected + return lines; + } + } + + } catch (IOException e) { + LOGGER.error("Could not find markings.", e); + } + return lines; + } + + protected static String getTypeURI(String type) { + switch (type.toLowerCase()) { + case "thing": + return "http://dbpedia.org/ontology/Thing"; + case "person": + return "http://dbpedia.org/ontology/Person"; + case "organization": + return "http://dbpedia.org/ontology/Organisation"; + case "location": + return "http://dbpedia.org/ontology/Place"; + case "event": + return "http://dbpedia.org/ontology/Event"; + case "product": + return "http://dbpedia.org/ontology/Product"; + } + return ""; + } +} diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/msnbc/MSNBC_XMLParser.java b/src/main/java/org/aksw/gerbil/dataset/impl/msnbc/MSNBC_XMLParser.java index aa0714afc..e3ead035c 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/msnbc/MSNBC_XMLParser.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/msnbc/MSNBC_XMLParser.java @@ -16,6 +16,7 @@ */ package org.aksw.gerbil.dataset.impl.msnbc; +import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -28,6 +29,7 @@ import org.aksw.gerbil.exceptions.GerbilException; import org.apache.commons.io.IOUtils; import org.apache.xerces.jaxp.SAXParserFactoryImpl; +import org.xml.sax.InputSource; import org.xml.sax.SAXException; public class MSNBC_XMLParser { @@ -44,18 +46,20 @@ public MSNBC_XMLParser() throws GerbilException { } public MSNBC_Result parseAnnotationsFile(File file) throws IOException, SAXException { - FileInputStream fin = null; + InputStream is = null; try { - fin = new FileInputStream(file); - return parseAnnotationsStream(fin); + is = new BufferedInputStream(new FileInputStream(file)); + return parseAnnotationsStream(is); } finally { - IOUtils.closeQuietly(fin); + IOUtils.closeQuietly(is); } } public MSNBC_Result parseAnnotationsStream(InputStream is) throws IOException, SAXException { MSNBC_XMLHandler handler = new MSNBC_XMLHandler(); - parser.parse(is, handler); + InputSource is2 = new InputSource(is); + is2.setEncoding("UTF-8"); + parser.parse(is2, handler); return handler; } } diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/nif/FileBasedNIFDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/nif/FileBasedNIFDataset.java index 842b7820c..7d2e4b05f 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/nif/FileBasedNIFDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/nif/FileBasedNIFDataset.java @@ -16,6 +16,7 @@ */ package org.aksw.gerbil.dataset.impl.nif; +import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -77,14 +78,14 @@ public FileBasedNIFDataset(String filePath) { @Override protected InputStream getDataAsInputStream() { - FileInputStream fin = null; + InputStream is = null; try { LOGGER.debug("Loading NIF dataset from {}", filePath); - fin = new FileInputStream(filePath); + is = new BufferedInputStream(new FileInputStream(filePath)); } catch (FileNotFoundException e) { LOGGER.error("Couldn't load NIF dataset from file.", e); } - return fin; + return is; } @Override diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java
b/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java new file mode 100644 index 000000000..e32ef3013 --- /dev/null +++ b/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java @@ -0,0 +1,175 @@ +package org.aksw.gerbil.dataset.impl.ritter; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.aksw.gerbil.dataset.InitializableDataset; +import org.aksw.gerbil.dataset.impl.AbstractDataset; +import org.aksw.gerbil.datatypes.ErrorTypes; +import org.aksw.gerbil.exceptions.GerbilException; +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.DocumentImpl; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; +import org.apache.commons.io.IOUtils; + +public class RitterDataset extends AbstractDataset implements InitializableDataset { + + // rebuilt by findMarkings(); holds the detokenized text of the tweet that is currently processed + private static StringBuilder realTweet; + private String file; + private List<Document> documents; + private int firstDocId; + private int lastDocId; + + public RitterDataset(String file) { + this.file = file; + } + + @Override + public int size() { + return documents.size(); + } + + @Override + public List<Document> getInstances() { + return documents; + } + + @Override + public void init() throws GerbilException { + this.documents = loadDocuments(new File(file)); + if ((firstDocId > 0) && (lastDocId > 0)) { + this.documents = this.documents.subList(firstDocId - 1, lastDocId); + } + } + + protected List<Document> loadDocuments(File tweetsFile) + throws GerbilException { + BufferedReader reader = null; + List<Document> documents = new ArrayList<>(); + String documentUriPrefix = "http://" + getName() + "/"; + try { + reader = new BufferedReader(new InputStreamReader( + new FileInputStream(tweetsFile), Charset.forName("UTF-8"))); + + String line = reader.readLine(); + int tweetIndex = 0; + List<Marking> markings = new ArrayList<>(); + StringBuilder tweet = new StringBuilder(); + while (line != null) { + if (line.trim().isEmpty()) { + if (tweet.length() > 0) { + // Get markings + markings = findMarkings(tweet.toString()); + // Save the finished tweet + documents.add(new DocumentImpl(realTweet.toString(), documentUriPrefix + tweetIndex, markings)); + // Start a new tweet + tweet = new StringBuilder(); + tweetIndex++; + } + line = reader.readLine(); + continue; + } + tweet.append(line + "\n"); + line = reader.readLine(); + } + // flush the last tweet if the file does not end with an empty line + if (tweet.length() > 0) { + markings = findMarkings(tweet.toString()); + documents.add(new DocumentImpl(realTweet.toString(), documentUriPrefix + tweetIndex, markings)); + } + } catch (IOException e) { + throw new GerbilException("Exception while reading dataset.", e, + ErrorTypes.DATASET_LOADING_ERROR); + } finally { + IOUtils.closeQuietly(reader); + } + return documents; + } + + public static List<Marking> findMarkings(String tweet) { + int start = 0; + List<Marking> markings = new ArrayList<>(); + realTweet = new StringBuilder(); + String[] line = tweet.split("\n"); + int i = 0; + for (String tokenFull : line) { + String[] token = tokenFull.split("\t+"); + realTweet.append(token[0] + " "); + if (token.length > 1) { + token[1] = token[1].trim(); + if (token[1].startsWith("B-")) { + String[] marking = getWholeMarking(line, i); + Set<String> types = new HashSet<>(); + types.add(marking[1]); + markings.add(new TypedNamedEntity(start, marking[0].length(), "", types)); + } + } + start += token[0].length() + 1; + i++; + } + + return markings; + } + + private static String[] getWholeMarking(String line[], int pos) { + String[] ret = new
String[2]; + String[] token = line[pos].split("\t+"); + StringBuilder name= new StringBuilder().append(token[0]); + if(!token[1].equals("O")){ + ret[1] = token[1]; + switch (token[1].trim().substring(2)) { + case "facility": + ret[1] = "http://dbpedia.org/ontology/Place"; + break; + case "company": + ret[1] = "http://dbpedia.org/ontology/Company"; + break; + case "geo-loc": + ret[1] = "http://dbpedia.org/ontology/Place"; + break; + case "movie": + ret[1] = "http://dbpedia.org/ontology/Film"; + break; + case "musicartist": + ret[1] = "http://dbpedia.org/ontology/MusicalArtist"; + break; + case "other": + ret[1] = "http://dbpedia.org/ontology/Unknown"; + break; + case "person": + ret[1] = "http://dbpedia.org/ontology/Person"; + break; + case "product": + ret[1] = "http://dbpedia.org/ontology/product"; + break; + case "sportsteam": + ret[1] = "http://dbpedia.org/ontology/SportsTeam"; + break; + case "tvshow": + ret[1] = "http://dbpedia.org/ontology/TelevisionShow"; + break; + } + } + for(int i=pos+1;i documents; + private String wordsFile; + private Boolean senseval3; + + public SensevalDataset(String wordsFile){ + this(wordsFile, "false"); + } + + public SensevalDataset(String wordsFile, String senseval3){ + this.wordsFile = wordsFile; + this.senseval3 = Boolean.valueOf(senseval3); + documents = new ArrayList(); + } + + @Override + public int size() { + return documents.size(); + } + + @Override + public List getInstances() { + return documents; + } + + @Override + public void init() throws GerbilException { + this.documents = loadDocuments(new File(this.wordsFile)); + } + + private List loadDocuments(File file) throws GerbilException { + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser saxParser=null; + + try{ + InputSource is; + if(senseval3){ + //FIXME: Better solution, its just one line where & is as content + String content = org.apache.commons.io.FileUtils.readFileToString(new File(this.wordsFile), "UTF-8"); + content = content.replace("&", "&").trim(); + is = new InputSource(new ByteArrayInputStream(content.getBytes())); + is.setEncoding("UTF-8"); + } + else{ + is = new InputSource(new FileInputStream(file)); + is.setEncoding("UTF-8"); + } + saxParser = factory.newSAXParser(); + saxParser.parse(is, new SensevalSAXHandler(documents)); + } catch (Exception e) { + throw new GerbilException("Exception while reading dataset.", e, + ErrorTypes.DATASET_LOADING_ERROR); + } + + + return documents; + } + + +} diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/senseval/SensevalSAXHandler.java b/src/main/java/org/aksw/gerbil/dataset/impl/senseval/SensevalSAXHandler.java new file mode 100644 index 000000000..e448dfdbc --- /dev/null +++ b/src/main/java/org/aksw/gerbil/dataset/impl/senseval/SensevalSAXHandler.java @@ -0,0 +1,108 @@ +package org.aksw.gerbil.dataset.impl.senseval; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.DocumentImpl; +import org.aksw.gerbil.transfer.nif.data.NamedEntity; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public class SensevalSAXHandler extends DefaultHandler { + + public static final String SENTENCE_ELEMENT = "sentence"; + public static final String INSTANCE_ELEMENT = "instance"; + private static final String WF_ELEMENT = "wf"; + + private StringBuilder sentence = new StringBuilder(); + private List 
markings = new ArrayList(); + private List documents; + private int start = 0; + private int length; + private int i = 0; + private String instanceUri; + + private byte field = -1; + + public SensevalSAXHandler(List documents) { + this.documents = documents; + } + + @Override + public void startElement(String uri, String localName, String qName, + Attributes attributes) throws SAXException { + if (qName.equalsIgnoreCase(SENTENCE_ELEMENT)) { + field = 0; + markings = new ArrayList(); + } else if (qName.equalsIgnoreCase(INSTANCE_ELEMENT)) { + field = 1; + length = 0; + instanceUri = ""; + } else if (qName.equalsIgnoreCase(WF_ELEMENT)) { + field = 2; + length = 0; + } + } + + @Override + public void endElement(String uri, String localName, String qName) + throws SAXException { + if (qName.equalsIgnoreCase(SENTENCE_ELEMENT)) { + i++; + documents.add(new DocumentImpl(sentence.toString(), + "http://senseval" + i, markings)); + sentence = new StringBuilder(); + } else if (qName.equalsIgnoreCase(INSTANCE_ELEMENT)) { + markings.add(new NamedEntity(start, length, instanceUri)); + start = sentence.length(); + } else if (qName.equalsIgnoreCase(WF_ELEMENT)) { + start = sentence.length(); + + } + this.field = 0; + } + + @Override + public void characters(char ch[], int start, int length) + throws SAXException { + switch (field) { + case 0: + break; + case 1: + case 2: + this.length = length; + String word = new String(Arrays.copyOfRange(ch, start, start + + length)); + if(word.equals("&")){ + word = word.replace("&", "&"); + } + this.start+= addWordToSentence(word); + } + this.field = 0; + + } + + public List getDocuments() { + return documents; + } + + private int addWordToSentence(String word) { + if (sentence.length() == 0) { + sentence.append(word); + return 0; + } + + if (word.matches("(,|\\.|;|:|!|\\?)")) { + sentence.append(word); + return 0; + } + else { + sentence.append(" ").append(word); + return 1; + } + } +} diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java new file mode 100644 index 000000000..cb524bf73 --- /dev/null +++ b/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java @@ -0,0 +1,154 @@ +package org.aksw.gerbil.dataset.impl.umbc; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.aksw.gerbil.dataset.InitializableDataset; +import org.aksw.gerbil.dataset.impl.AbstractDataset; +import org.aksw.gerbil.datatypes.ErrorTypes; +import org.aksw.gerbil.exceptions.GerbilException; +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.DocumentImpl; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; +import org.apache.commons.io.IOUtils; + + +public class UMBCDataset extends AbstractDataset implements InitializableDataset { + + private static StringBuilder realTweet; + private String file; + private List documents; + private int firstDocId; + private int lastDocId; + + public UMBCDataset(String file) { + this.file = file; + } + + + + @Override + public int size() { + return documents.size(); + } + + @Override + public List getInstances() { + return documents; + } + + @Override + public void init() throws GerbilException { + this.documents = 
loadDocuments(new File(file)); + if ((firstDocId > 0) && (lastDocId > 0)) { + this.documents = this.documents.subList(firstDocId - 1, lastDocId); + } + } + + protected List<Document> loadDocuments(File tweetsFile) + throws GerbilException { + BufferedReader reader = null; + List<Document> documents = new ArrayList<>(); + String documentUriPrefix = "http://" + getName() + "/"; + try { + reader = new BufferedReader(new InputStreamReader( + new FileInputStream(tweetsFile), Charset.forName("UTF-8"))); + + String line = reader.readLine(); + int tweetIndex = 0; + List<Marking> markings = new ArrayList<>(); + StringBuilder tweet = new StringBuilder(); + while (line != null) { + if (line.trim().isEmpty()) { + if (tweet.length() > 0) { + // Get markings + markings = findMarkings(tweet.toString()); + // Save the finished tweet + documents.add(new DocumentImpl(realTweet.toString(), documentUriPrefix + tweetIndex, markings)); + // Start a new tweet + tweet = new StringBuilder(); + tweetIndex++; + } + line = reader.readLine(); + continue; + } + tweet.append(line + "\n"); + line = reader.readLine(); + } + // flush the last tweet if the file does not end with an empty line + if (tweet.length() > 0) { + markings = findMarkings(tweet.toString()); + documents.add(new DocumentImpl(realTweet.toString(), documentUriPrefix + tweetIndex, markings)); + } + } catch (IOException e) { + throw new GerbilException("Exception while reading dataset.", e, + ErrorTypes.DATASET_LOADING_ERROR); + } finally { + IOUtils.closeQuietly(reader); + } + return documents; + } + + public static List<Marking> findMarkings(String tweet) { + int start = 0; + List<Marking> markings = new ArrayList<>(); + realTweet = new StringBuilder(); + String[] line = tweet.split("\n"); + int i = 0; + for (String tokenFull : line) { + String[] token = tokenFull.split("\t+"); + realTweet.append(token[0] + " "); + if (token.length > 1) { + token[1] = token[1].trim(); + if (token[1].startsWith("B-")) { + String[] marking = getWholeMarking(line, i); + Set<String> types = new HashSet<>(); + types.add(marking[1]); + markings.add(new TypedNamedEntity(start, marking[0].length(), "", types)); + } + } + start += token[0].length() + 1; + i++; + } + + return markings; + } + + private static String[] getWholeMarking(String line[], int pos) { + String[] ret = new String[2]; + String[] token = line[pos].split("\t+"); + StringBuilder name = new StringBuilder().append(token[0]); + if (!token[1].equals("O")) { + ret[1] = token[1]; + switch (token[1].trim().substring(2)) { + case "PER": + ret[1] = "http://dbpedia.org/ontology/Person"; + break; + case "ORG": + ret[1] = "http://dbpedia.org/ontology/Organisation"; + break; + case "LOC": + ret[1] = "http://dbpedia.org/ontology/Place"; + break; + } + } + for(int i=pos+1;i documents; + private String annotatedFile; + private String tweetsFile; + + public WSDMDataset(String annotatedFile, String tweetsFile) { + this.annotatedFile = annotatedFile; + this.tweetsFile = tweetsFile; + } + + @Override + public int size() { + return documents.size(); + } + + @Override + public List<Document> getInstances() { + return documents; + } + + @Override + public void init() throws GerbilException { + this.documents = loadDocuments(new File(annotatedFile), new File(tweetsFile)); + } + + private List<Document> loadDocuments(File annotations, File tweets) + throws GerbilException { + List<Document> documents = new ArrayList<>(); + String documentUriPrefix = "http://" + getName() + "/"; + // the tweets file contains one JSON object per line + try (BufferedReader bReader = new BufferedReader(new InputStreamReader( + new FileInputStream(tweets), Charset.forName("UTF-8")))) { + String line; + List<Marking> markings; + while ((line = bReader.readLine()) != null) { + JSONObject json = new JSONObject(line); + + String id = json.getString("id_str"); + String text = json.getString("text"); + markings =
findMarkings(getMarkingLines(annotations, id), text); + documents.add(new DocumentImpl(text, documentUriPrefix + id, + markings)); + } + } catch (IOException e) { + throw new GerbilException("Exception while reading dataset.", e, + ErrorTypes.DATASET_LOADING_ERROR); + } + + return documents; + } + + protected static List findMarkings(Set lines, String text) { + List markings = new ArrayList(); + + for (String line : lines) { + String[] annotation = line.split("\t"); + + String uri = WikipediaHelper.getWikipediaUri(WIKIPEDIA_DOMAIN , annotation[2]); + markings.add(new Annotation(uri)); + } + + return markings; + } + + private static Set getMarkingLines(File annotations, String id) { + Set lines = new HashSet(); + + try (BufferedReader bReader = new BufferedReader(new InputStreamReader( + new FileInputStream(annotations), Charset.forName("UTF-8")))) { + String line; + Boolean annotationSeen = false; + while ((line = bReader.readLine()) != null) { + String[] annotation = line.split("\t"); + if (id.equals(annotation[0])) { + annotationSeen = true; + lines.add(line); + } else if (annotationSeen) { + // as the annotations are ordered by id, the last annotation + // was added + return lines; + } + } + + } catch (IOException e) { + LOGGER.error("Could not find Markings due to ", e); + } + return lines; + } + +} diff --git a/src/main/java/org/aksw/gerbil/datatypes/ExperimentType.java b/src/main/java/org/aksw/gerbil/datatypes/ExperimentType.java index 3297fa5a1..d69d1090c 100644 --- a/src/main/java/org/aksw/gerbil/datatypes/ExperimentType.java +++ b/src/main/java/org/aksw/gerbil/datatypes/ExperimentType.java @@ -60,7 +60,8 @@ public enum ExperimentType implements Describable { * Input: text with marked entities
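* Example (illustrative): given the sentence "Berlin is the capital of Germany." with the mention "Berlin" already marked, a D2KB annotator should return a link to a knowledge base resource such as http://dbpedia.org/resource/Berlin.<br>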
* Output: mentions for every entity */ - D2KB("D2KB", + D2KB( + "D2KB", "The input for the annotator is a text with entities that already have been marked inside. The annotator should link all these mentioned entities to a knowledge base."), /** @@ -73,7 +74,9 @@ public enum ExperimentType implements Describable { * Input: text
* Output: marked entities and scored mentions for their meaning */ - @Deprecated Sa2KB("Sa2KB", + @Deprecated + Sa2KB( + "Sa2KB", "The annotator gets a text and shall recognize entities inside and link them to a knowledge base. Additionally, each annotation is assigned a score representing the likelihood that the annotation is correct."), /** @@ -86,7 +89,9 @@ public enum ExperimentType implements Describable { * Input: text
* Output: scored markings of entities */ - @Deprecated Sc2KB("Sc2KB", + @Deprecated + Sc2KB( + "Sc2KB", "The annotator gets a text and shall return relevant entities that are mentioned inside the text. Additionally, each tag is assigned a score representing the likelihood that the annotation is correct."), /** @@ -98,7 +103,9 @@ public enum ExperimentType implements Describable { * Input: text
* Output: ranked markings of entities */ - @Deprecated Rc2KB("Sc2KB", + @Deprecated + Rc2KB( + "Rc2KB", "The annotator gets a text and shall return relevant entities that are mentioned inside the text and rank them in terms of their relevance for the topics dealt with in the input text"), /** @@ -126,8 +133,15 @@ * a given text and the extraction of the part of the text, describing the * type. */ - OKE_Task2("OKE Challenge 2015 - Task 2", - "This task comprises the determining of the type of a given entity inside a given text and the extraction of the part of the text, describing the type."),; + OKE_Task2( + "OKE Challenge 2015 - Task 2", + "This task comprises the determining of the type of a given entity inside a given text and the extraction of the part of the text, describing the type."), + + /** + * The annotator gets a text and shall recognize entities inside and their + * types. + */ + RT2KB("RT2KB", "The annotator gets a text and shall recognize entities inside and their types."); private String label; private String description; @@ -166,6 +180,7 @@ public boolean equalsOrContainsType(ExperimentType type) { return true; } case ETyping: // falls through + case RT2KB: case OKE_Task1: case OKE_Task2: { return false; } @@ -179,6 +194,7 @@ public boolean equalsOrContainsType(ExperimentType type) { case A2KB: case D2KB: case ETyping: + case RT2KB: case OKE_Task1: case OKE_Task2: { return false; } @@ -197,6 +213,7 @@ public boolean equalsOrContainsType(ExperimentType type) { case Sc2KB: case A2KB: case D2KB: + case RT2KB: case ETyping: case OKE_Task1: case OKE_Task2: { @@ -208,6 +225,25 @@ } } } + case RT2KB: { + switch (type) { + case ERec: // falls through + case ETyping: + case RT2KB: { + return true; + } + case C2KB: // falls through + case A2KB: + case D2KB: + case Sa2KB: + case Sc2KB: + case Rc2KB: + case OKE_Task1: + case OKE_Task2: { + return false; + } + } + } case C2KB: { return type == C2KB; } diff --git a/src/main/java/org/aksw/gerbil/evaluate/EvaluatorFactory.java b/src/main/java/org/aksw/gerbil/evaluate/EvaluatorFactory.java index a3faea431..6e73b2eec 100644 --- a/src/main/java/org/aksw/gerbil/evaluate/EvaluatorFactory.java +++ b/src/main/java/org/aksw/gerbil/evaluate/EvaluatorFactory.java @@ -20,7 +20,6 @@ import java.util.Arrays; import java.util.List; -import org.aksw.gerbil.config.GerbilConfiguration; import org.aksw.gerbil.dataset.Dataset; import org.aksw.gerbil.datatypes.ExperimentTaskConfiguration; import org.aksw.gerbil.datatypes.ExperimentType; @@ -29,11 +28,11 @@ import org.aksw.gerbil.datatypes.marking.MarkingClasses; import org.aksw.gerbil.evaluate.impl.ClassConsideringFMeasureCalculator; import org.aksw.gerbil.evaluate.impl.ClassifyingEvaluatorDecorator; +import org.aksw.gerbil.evaluate.impl.ConfidenceBasedFMeasureCalculator; import org.aksw.gerbil.evaluate.impl.ConfidenceScoreEvaluatorDecorator; import org.aksw.gerbil.evaluate.impl.DoubleResultComparator; import org.aksw.gerbil.evaluate.impl.FMeasureCalculator; import org.aksw.gerbil.evaluate.impl.GSInKBClassifyingEvaluatorDecorator; -import org.aksw.gerbil.evaluate.impl.ConfidenceBasedFMeasureCalculator; import org.aksw.gerbil.evaluate.impl.HierarchicalFMeasureCalculator; import org.aksw.gerbil.evaluate.impl.SpanMergingEvaluatorDecorator; import org.aksw.gerbil.evaluate.impl.SubTaskAverageCalculator; @@ -60,8 +59,7 @@ import org.aksw.gerbil.transfer.nif.TypedSpan; import
org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; import org.aksw.gerbil.utils.filter.TypeBasedMarkingFilter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.aksw.gerbil.web.config.RootConfig; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.vocabulary.OWL; @@ -70,10 +68,8 @@ @SuppressWarnings("deprecation") public class EvaluatorFactory { - private static final Logger LOGGER = LoggerFactory.getLogger(EvaluatorFactory.class); - - private static final String DEFAULT_WELL_KNOWN_KBS_PARAMETER_KEY = "org.aksw.gerbil.evaluate.DefaultWellKnownKB"; - private static final String DEFAULT_WELL_KNOWN_KBS[] = loadDefaultKBs(); + // private static final Logger LOGGER = + // LoggerFactory.getLogger(EvaluatorFactory.class); protected UriKBClassifier globalClassifier = null; protected SubClassInferencer inferencer = null; @@ -82,14 +78,6 @@ public EvaluatorFactory() { this(null, null); } - private static String[] loadDefaultKBs() { - String kbs[] = GerbilConfiguration.getInstance().getStringArray(DEFAULT_WELL_KNOWN_KBS_PARAMETER_KEY); - if (kbs == null) { - LOGGER.error("Couldn't load the list of well known KBs. This GERBIL instance might not work as expected!"); - } - return kbs; - } - public EvaluatorFactory(UriKBClassifier globalClassifier) { this(globalClassifier, null); } @@ -102,7 +90,7 @@ public EvaluatorFactory(UriKBClassifier globalClassifier, SubClassInferencer inf if (globalClassifier != null) { this.globalClassifier = globalClassifier; } else { - this.globalClassifier = new SimpleWhiteListBasedUriKBClassifier(DEFAULT_WELL_KNOWN_KBS); + this.globalClassifier = RootConfig.createDefaultUriKBClassifier(); } if (inferencer != null) { this.inferencer = inferencer; @@ -112,8 +100,7 @@ public EvaluatorFactory(UriKBClassifier globalClassifier, SubClassInferencer inf } @SuppressWarnings("rawtypes") - public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguration configuration, - Dataset dataset) { + public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguration configuration, Dataset dataset) { return createEvaluator(type, configuration, dataset, globalClassifier, inferencer); } @@ -125,9 +112,8 @@ public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguratio return new ClassifyingEvaluatorDecorator( new ClassConsideringFMeasureCalculator( new MatchingsCounterImpl(new ClassifiedMeaningMatchingsSearcher()), - MarkingClasses.IN_KB, MarkingClasses.EE), - new UriBasedMeaningClassifier(classifier, MarkingClasses.IN_KB), - new EmergingEntityMeaningClassifier()); + MarkingClasses.IN_KB, MarkingClasses.EE), new UriBasedMeaningClassifier( + classifier, MarkingClasses.IN_KB), new EmergingEntityMeaningClassifier()); } case Sa2KB: case A2KB: { @@ -143,8 +129,8 @@ public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguratio new EmergingEntityMeaningClassifier()); } case ERec: { - return new ConfidenceBasedFMeasureCalculator( - new MatchingsCounterImpl((MatchingsSearcher) MatchingsSearcherFactory + return new ConfidenceBasedFMeasureCalculator(new MatchingsCounterImpl( + (MatchingsSearcher) MatchingsSearcherFactory .createSpanMatchingsSearcher(configuration.matching))); } case D2KB: { @@ -156,48 +142,51 @@ public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguratio new MatchingsCounterImpl( new CompoundMatchingsSearcher( (MatchingsSearcher) MatchingsSearcherFactory - .createSpanMatchingsSearcher( - configuration.matching), + 
.createSpanMatchingsSearcher(configuration.matching), new ClassifiedMeaningMatchingsSearcher())), MarkingClasses.IN_KB, MarkingClasses.EE, MarkingClasses.GS_IN_KB), new StrongSpanMatchingsSearcher()), new UriBasedMeaningClassifier(classifier, MarkingClasses.IN_KB), - new EmergingEntityMeaningClassifier()), - true); + new EmergingEntityMeaningClassifier()), true); } case ETyping: { - return new SearcherBasedNotMatchingMarkingFilter(new StrongSpanMatchingsSearcher(), - new ConfidenceScoreEvaluatorDecorator( - new HierarchicalFMeasureCalculator(new HierarchicalMatchingsCounter( + return new SearcherBasedNotMatchingMarkingFilter( + new StrongSpanMatchingsSearcher(), + new ConfidenceScoreEvaluatorDecorator(new HierarchicalFMeasureCalculator( + new HierarchicalMatchingsCounter( (MatchingsSearcher) MatchingsSearcherFactory - .createSpanMatchingsSearcher(configuration.matching), - classifier, inferencer)), - FMeasureCalculator.MICRO_F1_SCORE_NAME, new DoubleResultComparator()), + .createSpanMatchingsSearcher(configuration.matching), classifier, + inferencer)), FMeasureCalculator.MICRO_F1_SCORE_NAME, new DoubleResultComparator()), true); } + case RT2KB: { + return new ConfidenceScoreEvaluatorDecorator(new HierarchicalFMeasureCalculator( + new HierarchicalMatchingsCounter((MatchingsSearcher) MatchingsSearcherFactory + .createSpanMatchingsSearcher(configuration.matching), classifier, inferencer)), + FMeasureCalculator.MICRO_F1_SCORE_NAME, new DoubleResultComparator()); + } case OKE_Task1: { ExperimentTaskConfiguration subTaskConfig; List> evaluators = new ArrayList>(); - UriKBClassifier okeClassifierTask1 = new ExactWhiteListBasedUriKBClassifier( - Arrays.asList("http://www.ontologydesignpatterns.org/ont/d0.owl#Location", - "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Organization", - "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Person", - "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Role")); + UriKBClassifier okeClassifierTask1 = new ExactWhiteListBasedUriKBClassifier(Arrays.asList( + "http://www.ontologydesignpatterns.org/ont/d0.owl#Location", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Organization", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Person", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Role")); subTaskConfig = new ExperimentTaskConfiguration(configuration.annotatorConfig, configuration.datasetConfig, ExperimentType.ERec, configuration.matching); - evaluators.add(new SubTaskEvaluator<>(subTaskConfig, - (Evaluator) createEvaluator(ExperimentType.ERec, subTaskConfig, dataset))); + evaluators.add(new SubTaskEvaluator<>(subTaskConfig, (Evaluator) createEvaluator( + ExperimentType.ERec, subTaskConfig, dataset))); subTaskConfig = new ExperimentTaskConfiguration(configuration.annotatorConfig, configuration.datasetConfig, ExperimentType.D2KB, Matching.STRONG_ENTITY_MATCH); - evaluators.add(new SubTaskEvaluator<>(subTaskConfig, - (Evaluator) createEvaluator(ExperimentType.D2KB, subTaskConfig, dataset))); + evaluators.add(new SubTaskEvaluator<>(subTaskConfig, (Evaluator) createEvaluator( + ExperimentType.D2KB, subTaskConfig, dataset))); subTaskConfig = new ExperimentTaskConfiguration(configuration.annotatorConfig, configuration.datasetConfig, ExperimentType.ETyping, Matching.STRONG_ENTITY_MATCH); - evaluators.add(new SubTaskEvaluator<>(subTaskConfig, - (Evaluator) createEvaluator(ExperimentType.ETyping, subTaskConfig, dataset, - okeClassifierTask1, inferencer))); + evaluators.add(new SubTaskEvaluator<>(subTaskConfig, (Evaluator) 
createEvaluator( + ExperimentType.ETyping, subTaskConfig, dataset, okeClassifierTask1, inferencer))); return new ConfidenceScoreEvaluatorDecorator( new SubTaskAverageCalculator(evaluators), FMeasureCalculator.MICRO_F1_SCORE_NAME, new DoubleResultComparator()); @@ -214,20 +203,18 @@ FMeasureCalculator.MICRO_F1_SCORE_NAME, new DoubleResultComparator()), // entities, without a class type!) subTaskConfig = new ExperimentTaskConfiguration(configuration.annotatorConfig, configuration.datasetConfig, ExperimentType.ETyping, Matching.STRONG_ENTITY_MATCH); - evaluators.add(new SubTaskEvaluator<>(subTaskConfig, - new MarkingFilteringEvaluatorDecorator<>( - new TypeBasedMarkingFilter(false, classTypes), - (Evaluator) createEvaluator(ExperimentType.ETyping, subTaskConfig, - dataset, okeClassifierTask2, inferencer)))); + evaluators.add(new SubTaskEvaluator<>(subTaskConfig, new MarkingFilteringEvaluatorDecorator<>( + new TypeBasedMarkingFilter(false, classTypes), + (Evaluator) createEvaluator(ExperimentType.ETyping, subTaskConfig, dataset, + okeClassifierTask2, inferencer)))); // sub task 2, find the correct position of the type in the text // (use only entities with a class type!) subTaskConfig = new ExperimentTaskConfiguration(configuration.annotatorConfig, configuration.datasetConfig, ExperimentType.ERec, configuration.matching); - evaluators.add(new SubTaskEvaluator<>(subTaskConfig, - new MarkingFilteringEvaluatorDecorator<>( - new TypeBasedMarkingFilter(true, classTypes), - new SpanMergingEvaluatorDecorator<>((Evaluator) createEvaluator( - ExperimentType.ERec, subTaskConfig, dataset))))); + evaluators.add(new SubTaskEvaluator<>(subTaskConfig, new MarkingFilteringEvaluatorDecorator<>( + new TypeBasedMarkingFilter(true, classTypes), + new SpanMergingEvaluatorDecorator<>((Evaluator) createEvaluator( + ExperimentType.ERec, subTaskConfig, dataset))))); return new ConfidenceScoreEvaluatorDecorator( new SubTaskAverageCalculator(evaluators), FMeasureCalculator.MICRO_F1_SCORE_NAME, @@ -259,14 +246,27 @@ protected void addSubTaskEvaluators(List> evaluators, ExperimentTas case A2KB: { subTaskConfig = new ExperimentTaskConfiguration(configuration.annotatorConfig, configuration.datasetConfig, ExperimentType.ERec, configuration.matching); - evaluators.add(new SubTaskEvaluator<>(subTaskConfig, - createEvaluator(ExperimentType.ERec, subTaskConfig, dataset))); + evaluators.add(new SubTaskEvaluator<>(subTaskConfig, createEvaluator(ExperimentType.ERec, subTaskConfig, + dataset))); subTaskConfig = new ExperimentTaskConfiguration(configuration.annotatorConfig, configuration.datasetConfig, ExperimentType.D2KB, Matching.STRONG_ENTITY_MATCH); // evaluators.add(createEvaluator(ExperimentType.ELink, // configuration, dataset)); - evaluators.add(new SubTaskEvaluator<>(subTaskConfig, - createEvaluator(ExperimentType.D2KB, subTaskConfig, dataset))); + evaluators.add(new SubTaskEvaluator<>(subTaskConfig, createEvaluator(ExperimentType.D2KB, subTaskConfig, + dataset))); + return; + } + case RT2KB: { + subTaskConfig = new ExperimentTaskConfiguration(configuration.annotatorConfig, configuration.datasetConfig, + ExperimentType.ERec, configuration.matching); + evaluators.add(new SubTaskEvaluator<>(subTaskConfig, createEvaluator(ExperimentType.ERec, subTaskConfig, + dataset))); + subTaskConfig = new ExperimentTaskConfiguration(configuration.annotatorConfig, configuration.datasetConfig, + ExperimentType.ETyping, Matching.STRONG_ENTITY_MATCH); + // evaluators.add(createEvaluator(ExperimentType.ELink, + // configuration, dataset)); + 
evaluators.add(new SubTaskEvaluator<>(subTaskConfig, createEvaluator(ExperimentType.ETyping, subTaskConfig, + dataset))); return; } default: { @@ -275,8 +275,7 @@ protected void addSubTaskEvaluators(List<Evaluator<?>> evaluators, ExperimentTas } } - public void addEvaluators(List<Evaluator<?>> evaluators, ExperimentTaskConfiguration configuration, - Dataset dataset) { + public void addEvaluators(List<Evaluator<?>> evaluators, ExperimentTaskConfiguration configuration, Dataset dataset) { evaluators.add(createEvaluator(configuration.type, configuration, dataset)); addSubTaskEvaluators(evaluators, configuration, dataset); } diff --git a/src/main/java/org/aksw/gerbil/execute/ExperimentTask.java b/src/main/java/org/aksw/gerbil/execute/ExperimentTask.java index d1e8eda55..9fa43fe3c 100644 --- a/src/main/java/org/aksw/gerbil/execute/ExperimentTask.java +++ b/src/main/java/org/aksw/gerbil/execute/ExperimentTask.java @@ -27,6 +27,7 @@ import org.aksw.gerbil.annotator.EntityTyper; import org.aksw.gerbil.annotator.OKETask1Annotator; import org.aksw.gerbil.annotator.OKETask2Annotator; +import org.aksw.gerbil.annotator.RT2KBAnnotator; import org.aksw.gerbil.annotator.decorator.ErrorCountingAnnotatorDecorator; import org.aksw.gerbil.annotator.decorator.SingleInstanceSecuringAnnotatorDecorator; import org.aksw.gerbil.annotator.decorator.TimeMeasuringAnnotatorDecorator; @@ -113,11 +114,11 @@ public void run() { } Annotator decoratedAnnotator = annotator; // Add decorating evaluators - TimeMeasuringAnnotatorDecorator timeMeasurer = TimeMeasuringAnnotatorDecorator - .createDecorator(configuration.type, decoratedAnnotator); + TimeMeasuringAnnotatorDecorator timeMeasurer = TimeMeasuringAnnotatorDecorator.createDecorator( + configuration.type, decoratedAnnotator); decoratedAnnotator = timeMeasurer; - ErrorCountingAnnotatorDecorator errorCounter = ErrorCountingAnnotatorDecorator - .createDecorator(configuration.type, decoratedAnnotator, dataset.size()); + ErrorCountingAnnotatorDecorator errorCounter = ErrorCountingAnnotatorDecorator.createDecorator( + configuration.type, decoratedAnnotator, dataset.size()); decoratedAnnotator = errorCounter; decoratedAnnotator = SingleInstanceSecuringAnnotatorDecorator.createDecorator(configuration.type, decoratedAnnotator); @@ -427,6 +428,28 @@ protected EvaluationResult runExperiment(Dataset dataset, Annotator annotator, } break; } + case RT2KB: { + try { + List<List<TypedSpan>> results = new ArrayList<List<TypedSpan>>(dataset.size()); + List<List<TypedSpan>> goldStandard = new ArrayList<List<TypedSpan>>(dataset.size()); + RT2KBAnnotator extractor = (RT2KBAnnotator) annotator; + for (Document document : dataset.getInstances()) { + // reduce the document to a single text + results.add(extractor.performRT2KBTask(DocumentInformationReducer.reduceToPlainText(document))); + goldStandard.add(document.getMarkings(TypedSpan.class)); + taskState.increaseExperimentStepCount(); + } + if (annotatorOutputWriter != null) { + annotatorOutputWriter.storeAnnotatorOutput(configuration, results, dataset.getInstances()); + } + evalResult = evaluate(evaluators, results, goldStandard); + } catch (GerbilException e) { + throw e; + } catch (Exception e) { + throw new GerbilException(e, ErrorTypes.UNEXPECTED_EXCEPTION); + } + break; + } case OKE_Task1: { try { List<List<TypedNamedEntity>> results = new ArrayList<List<TypedNamedEntity>>(dataset.size()); List<List<TypedNamedEntity>> goldStandard = new ArrayList<List<TypedNamedEntity>>(dataset.size()); OKETask1Annotator okeTask1Annotator = (OKETask1Annotator) annotator; for (Document document : dataset.getInstances()) { // reduce the document to a text and a list of Spans - results.add(
okeTask1Annotator.performTask1(DocumentInformationReducer.reduceToTextAndSpans(document))); + results.add(okeTask1Annotator.performTask1(DocumentInformationReducer + .reduceToTextAndSpans(document))); goldStandard.add(document.getMarkings(TypedNamedEntity.class)); taskState.increaseExperimentStepCount(); } @@ -460,8 +483,8 @@ protected EvaluationResult runExperiment(Dataset dataset, Annotator annotator, for (Document document : dataset.getInstances()) { // reduce the document to a text and a list of Spans - results.add(okeTask2Annotator - .performTask2(DocumentInformationReducer.reduceToTextAndEntities(document))); + results.add(okeTask2Annotator.performTask2(DocumentInformationReducer + .reduceToTextAndEntities(document))); goldStandard.add(document.getMarkings(TypedNamedEntity.class)); taskState.increaseExperimentStepCount(); } diff --git a/src/main/java/org/aksw/gerbil/http/HttpManagement.java b/src/main/java/org/aksw/gerbil/http/HttpManagement.java index 57a7cb25e..b8a598757 100644 --- a/src/main/java/org/aksw/gerbil/http/HttpManagement.java +++ b/src/main/java/org/aksw/gerbil/http/HttpManagement.java @@ -22,7 +22,6 @@ import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/main/java/org/aksw/gerbil/matching/impl/HierarchicalMatchingsCounter.java b/src/main/java/org/aksw/gerbil/matching/impl/HierarchicalMatchingsCounter.java index 30591e502..f3fc06fb9 100644 --- a/src/main/java/org/aksw/gerbil/matching/impl/HierarchicalMatchingsCounter.java +++ b/src/main/java/org/aksw/gerbil/matching/impl/HierarchicalMatchingsCounter.java @@ -102,8 +102,7 @@ public List countMatchings(List annotatorResult, List go if ((documentCounts.truePositives == 0) && (documentCounts.falseNegatives == 0) && (documentCounts.falsePositives == 0)) { documentCounts.truePositives = 1; - LOGGER.info( - "Got an entity with a type that is not inside a known KB in the annotator and in the dataset."); + LOGGER.info("Got an entity with a type that is not inside a known KB in the annotator and in the dataset."); } } else { documentCounts = new EvaluationCounts(); @@ -112,6 +111,14 @@ public List countMatchings(List annotatorResult, List go } localCounts.add(documentCounts); } + for (int i = 0; i < annotatorResult.size(); ++i) { + if(!alreadyUsedResults.get(i)) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("found a false positive. 
{}", annotatorResult.get(i)); + } + localCounts.add(new EvaluationCounts(0, 1, 0)); + } + } return localCounts; } diff --git a/src/main/java/org/aksw/gerbil/semantic/kb/WhiteListBasedUriKBClassifier.java b/src/main/java/org/aksw/gerbil/semantic/kb/WhiteListBasedUriKBClassifier.java index 590781b47..ae7e87e37 100644 --- a/src/main/java/org/aksw/gerbil/semantic/kb/WhiteListBasedUriKBClassifier.java +++ b/src/main/java/org/aksw/gerbil/semantic/kb/WhiteListBasedUriKBClassifier.java @@ -16,6 +16,7 @@ */ package org.aksw.gerbil.semantic.kb; +import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -31,15 +32,15 @@ public class WhiteListBasedUriKBClassifier extends AbstractWhiteListBasedUriKBCl private static final Logger LOGGER = LoggerFactory.getLogger(WhiteListBasedUriKBClassifier.class); public static WhiteListBasedUriKBClassifier create(File file) { - FileInputStream fin = null; + InputStream is = null; try { - fin = new FileInputStream(file); - return create(fin); + is = new BufferedInputStream(new FileInputStream(file)); + return create(is); } catch (IOException e) { LOGGER.error("Exception while trying to read knowledge base namespaces.", e); return null; } finally { - IOUtils.closeQuietly(fin); + IOUtils.closeQuietly(is); } } diff --git a/src/main/java/org/aksw/gerbil/semantic/sameas/impl/cache/FileBasedCachingSameAsRetriever.java b/src/main/java/org/aksw/gerbil/semantic/sameas/impl/cache/FileBasedCachingSameAsRetriever.java index d2be18d1a..5e09ccbe9 100644 --- a/src/main/java/org/aksw/gerbil/semantic/sameas/impl/cache/FileBasedCachingSameAsRetriever.java +++ b/src/main/java/org/aksw/gerbil/semantic/sameas/impl/cache/FileBasedCachingSameAsRetriever.java @@ -16,6 +16,7 @@ */ package org.aksw.gerbil.semantic.sameas.impl.cache; +import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -293,11 +294,9 @@ public static Object[] readCacheFile(File cacheFile) { if (!cacheFile.exists() || cacheFile.isDirectory()) { return null; } - FileInputStream fin = null; ObjectInputStream oin = null; try { - fin = new FileInputStream(cacheFile); - oin = new ObjectInputStream(fin); + oin = new ObjectInputStream(new BufferedInputStream(new FileInputStream(cacheFile))); // first, read the number of URIs int count = oin.readInt(); String uri; @@ -323,7 +322,6 @@ public static Object[] readCacheFile(File cacheFile) { LOGGER.error("Exception while reading cache file.", e); } finally { IOUtils.closeQuietly(oin); - IOUtils.closeQuietly(fin); } return null; } diff --git a/src/main/java/org/aksw/gerbil/semantic/sameas/impl/index/IndexBasedSameAsRetriever.java b/src/main/java/org/aksw/gerbil/semantic/sameas/impl/index/IndexBasedSameAsRetriever.java new file mode 100644 index 000000000..6c0f1a812 --- /dev/null +++ b/src/main/java/org/aksw/gerbil/semantic/sameas/impl/index/IndexBasedSameAsRetriever.java @@ -0,0 +1,73 @@ +/** + * This file is part of General Entity Annotator Benchmark. + * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.aksw.gerbil.semantic.sameas.impl.index;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.semantic.sameas.SameAsRetriever;
+import org.aksw.gerbil.semantic.sameas.index.Searcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class IndexBasedSameAsRetriever implements SameAsRetriever {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(IndexBasedSameAsRetriever.class);
+    private Searcher searcher;
+
+    public IndexBasedSameAsRetriever(String indexPath) throws GerbilException {
+        searcher = new Searcher(indexPath);
+    }
+
+    @Override
+    public Set<String> retrieveSameURIs(String uri) {
+        if ((uri == null) || (uri.isEmpty())) {
+            return null;
+        }
+        try {
+            Set<String> ret = (Set<String>) searcher.search(uri);
+            if (ret.isEmpty()) {
+                return null;
+            }
+            return ret;
+        } catch (GerbilException e) {
+            LOGGER.warn("Could not retrieve sameAs URIs.", e);
+            return null;
+        }
+    }
+
+    @Override
+    public void addSameURIs(Set<String> uris) {
+        Set<String> temp = new HashSet<String>();
+        Set<String> result;
+        for (String uri : uris) {
+            result = retrieveSameURIs(uri);
+            if (result != null) {
+                // reuse the result instead of querying the index a second time
+                temp.addAll(result);
+            }
+        }
+        uris.addAll(temp);
+    }
+
+    @Override
+    public Set<String> retrieveSameURIs(String domain, String uri) {
+        return retrieveSameURIs(uri);
+    }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/semantic/sameas/index/Indexer.java b/src/main/java/org/aksw/gerbil/semantic/sameas/index/Indexer.java
new file mode 100644
index 000000000..8d1b86510
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/semantic/sameas/index/Indexer.java
@@ -0,0 +1,88 @@
+package org.aksw.gerbil.semantic.sameas.index;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Indexer extends LuceneConstants {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(Indexer.class);
+
+    private IndexWriter writer;
+    private Directory dir;
+
+    public Indexer(String path) throws GerbilException {
+        try {
+            dir = FSDirectory.open(new File(path).toPath());
+            Analyzer analyzer = new StandardAnalyzer();
+            IndexWriterConfig config = new IndexWriterConfig(analyzer);
+            config.setOpenMode(OpenMode.CREATE);
+            writer = new IndexWriter(dir, config);
+        } catch (IOException e) {
+            LOGGER.error("Error occurred while accessing file " + path, e);
+            throw new GerbilException(ErrorTypes.UNEXPECTED_EXCEPTION);
+        }
+    }
+
+    public void close() {
+        try {
+            writer.commit();
+            writer.close();
+            dir.close();
+        } catch (IOException e) {
+            LOGGER.error("Error occurred while closing the IndexWriter.", e);
+        }
+    }
+
+    public void index(String uri, Collection<String> uris) {
+        indexSameAs(uri, uris);
+    }
+
+    private String listToStr(Collection<String> uris) {
+        StringBuilder entity = new StringBuilder();
+        for (String uri : uris) {
+            entity.append(uri).append(' ');
+        }
+        return entity.toString();
+    }
+
+    public void indexSameAs(String uri, Collection<String> uris) {
+        Document doc = convertTerm(uri, listToStr(uris));
+        try {
+            writer.addDocument(doc);
+        } catch (IOException e) {
+            LOGGER.error("Error occurred while adding a document to the index.", e);
+        }
+    }
+
+    private Document convertTerm(String uri, String sameAs) {
+        Document document = new Document();
+        Field contentField = new StringField(CONTENTS, uri, Field.Store.YES);
+        Field sameAsField = new StringField(SAMEAS, sameAs, Field.Store.YES);
+        document.add(contentField);
+        document.add(sameAsField);
+        return document;
+    }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/semantic/sameas/index/LuceneConstants.java b/src/main/java/org/aksw/gerbil/semantic/sameas/index/LuceneConstants.java
new file mode 100644
index 000000000..c58952b61
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/semantic/sameas/index/LuceneConstants.java
@@ -0,0 +1,10 @@
+package org.aksw.gerbil.semantic.sameas.index;
+
+public abstract class LuceneConstants {
+
+    protected static final String CONTENTS = "contents";
+
+    protected static final String SAMEAS = "sameAs";
+
+    protected static final int MAX_SEARCH = 100;
+}
diff --git a/src/main/java/org/aksw/gerbil/semantic/sameas/index/Searcher.java b/src/main/java/org/aksw/gerbil/semantic/sameas/index/Searcher.java
new file mode 100644
index 000000000..e9164327d
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/semantic/sameas/index/Searcher.java
@@ -0,0 +1,92 @@
+package org.aksw.gerbil.semantic.sameas.index;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+public class Searcher extends LuceneConstants {
+
+    private IndexSearcher indexSearcher;
+    private Directory indexDirectory;
+    private IndexReader indexReader;
+
+    public Searcher(String indexDirectoryPath) throws GerbilException {
+        try {
+            indexDirectory = FSDirectory.open(new File(indexDirectoryPath).toPath());
+            indexReader = DirectoryReader.open(indexDirectory);
+            indexSearcher = new IndexSearcher(indexReader);
+        } catch (IOException e) {
+            throw new GerbilException("Could not initialize Searcher", ErrorTypes.UNEXPECTED_EXCEPTION);
+        }
+    }
+
+    public TopDocs searchTops(String searchQuery) throws IOException {
+        return searchTerm(searchQuery);
+    }
+
+    private TopDocs searchTerm(String searchQuery) throws IOException {
+        TermQuery query = new TermQuery(new Term(CONTENTS, searchQuery));
+        return indexSearcher.search(query, MAX_SEARCH);
+    }
+
+    public Document getDocument(ScoreDoc scoreDoc) throws CorruptIndexException, IOException {
+        return indexSearcher.doc(scoreDoc.doc);
+    }
+
+    public void close() throws
IOException {
+        IOUtils.closeQuietly(indexReader);
+        IOUtils.closeQuietly(indexDirectory);
+    }
+
+    public Collection<String> search(String uri) throws GerbilException {
+        return searchSameAsTerm(uri);
+    }
+
+    public Collection<String> searchSameAsTerm(String uri) throws GerbilException {
+        TopDocs docs;
+        try {
+            docs = searchTops(uri);
+        } catch (IOException e1) {
+            throw new GerbilException("Could not parse index files", ErrorTypes.UNEXPECTED_EXCEPTION);
+        }
+        Collection<String> uris = new HashSet<String>();
+        for (ScoreDoc scoreDoc : docs.scoreDocs) {
+            Document doc;
+            try {
+                doc = getDocument(scoreDoc);
+            } catch (IOException e) {
+                throw new GerbilException("Could not load Hits", ErrorTypes.UNEXPECTED_EXCEPTION);
+            }
+            String content = doc.get(CONTENTS);
+            uris.add(content);
+            String sameAs = doc.get(SAMEAS);
+            for (String uriStr : sameAs.split(" ")) {
+                uris.add(uriStr);
+            }
+        }
+        return uris;
+    }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/semantic/sameas/index/document/DocumentBuilder.java b/src/main/java/org/aksw/gerbil/semantic/sameas/index/document/DocumentBuilder.java
new file mode 100644
index 000000000..3b5ce6477
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/semantic/sameas/index/document/DocumentBuilder.java
@@ -0,0 +1,40 @@
+package org.aksw.gerbil.semantic.sameas.index.document;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+public class DocumentBuilder {
+
+    private Set<String> sameAs = new HashSet<String>();
+    private String dir;
+
+    public DocumentBuilder(String dir) {
+        // the output directory has to be set; without it createFiles() would fail
+        this.dir = dir;
+    }
+
+    public void addSameAs(List<String> sameAs) {
+        StringBuilder builder = new StringBuilder();
+        for (String uri : sameAs) {
+            builder.append(uri).append('\n');
+        }
+        this.sameAs.add(builder.toString());
+    }
+
+    public void createFiles() {
+        new File(dir).mkdirs();
+        for (String same : sameAs) {
+            File f = new File(dir + File.separator + same.hashCode());
+            try {
+                f.createNewFile();
+                PrintWriter pw = new PrintWriter(f);
+                pw.print(same);
+                pw.close();
+            } catch (IOException e) {
+                // report the error and go on with the next block
+                e.printStackTrace();
+            }
+        }
+    }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/semantic/subclass/ClassHierarchyLoader.java b/src/main/java/org/aksw/gerbil/semantic/subclass/ClassHierarchyLoader.java
index 7112b448a..7ce61a87e 100644
--- a/src/main/java/org/aksw/gerbil/semantic/subclass/ClassHierarchyLoader.java
+++ b/src/main/java/org/aksw/gerbil/semantic/subclass/ClassHierarchyLoader.java
@@ -16,10 +16,12 @@
 */
 package org.aksw.gerbil.semantic.subclass;
 
+import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.PrintStream;
 import java.util.Arrays;
 import java.util.HashSet;
@@ -42,8 +44,8 @@
 public class ClassHierarchyLoader {
 
-    private static final Set<Property> ALLOWED_PROPERTIES = new HashSet<Property>(Arrays.asList(RDF.type,
-            RDFS.subClassOf));
+    private static final Set<Property> ALLOWED_PROPERTIES = new HashSet<Property>(
+            Arrays.asList(RDF.type, RDFS.subClassOf));
 
     private RDFReaderFactoryRIOT factory = new RDFReaderFactoryRIOT();
 
@@ -55,13 +57,13 @@ public void loadClassHierarchy(File file, String rdfLang, String baseUri, Model
     }
 
     protected void readClassHierarchy(File file, String rdfLang, String baseUri, Model model) throws IOException {
-        FileInputStream fin = null;
+        InputStream is = null;
         RDFReader rdfReader = factory.getReader(rdfLang);
         try {
-            fin = new FileInputStream(file);
-            rdfReader.read(model, fin, baseUri);
+            is = new BufferedInputStream(new
FileInputStream(file));
+            rdfReader.read(model, is, baseUri);
         } finally {
-            IOUtils.closeQuietly(fin);
+            IOUtils.closeQuietly(is);
         }
     }
diff --git a/src/main/java/org/aksw/gerbil/semantic/vocabs/GERBIL.java b/src/main/java/org/aksw/gerbil/semantic/vocabs/GERBIL.java
index 5be00ce4e..e06be2c9a 100644
--- a/src/main/java/org/aksw/gerbil/semantic/vocabs/GERBIL.java
+++ b/src/main/java/org/aksw/gerbil/semantic/vocabs/GERBIL.java
@@ -55,6 +55,7 @@ protected static final Property property(String local) {
     public static final Resource C2KB = resource("C2KB");
     public static final Resource D2KB = resource("D2KB");
     public static final Resource Rc2KB = resource("Rc2KB");
+    public static final Resource RT2KB = resource("RT2KB");
     public static final Resource Sa2KB = resource("Sa2KB");
     public static final Resource Sc2KB = resource("Sc2KB");
     public static final Resource OKE2015_Task1 = resource("OKE2015_Task1");
@@ -122,6 +123,8 @@ public static Resource getExperimentTypeResource(ExperimentType type) {
             return ERec;
         case ETyping:
             return ETyping;
+        case RT2KB:
+            return RT2KB;
         }
         LOGGER.error("Got an unknown experiment type: " + type.name());
         return null;
diff --git a/src/main/java/org/aksw/gerbil/tools/AnnotatorAnalyzer.java b/src/main/java/org/aksw/gerbil/tools/AnnotatorAnalyzer.java
new file mode 100644
index 000000000..1efb75cae
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/tools/AnnotatorAnalyzer.java
@@ -0,0 +1,74 @@
+package org.aksw.gerbil.tools;
+
+import java.io.PrintStream;
+import java.util.List;
+
+import org.aksw.gerbil.annotator.Annotator;
+import org.aksw.gerbil.annotator.AnnotatorConfiguration;
+import org.aksw.gerbil.datatypes.ExperimentType;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.web.config.AnnotatorsConfig;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class AnnotatorAnalyzer {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(AnnotatorAnalyzer.class);
+
+    private PrintStream out;
+
+    public AnnotatorAnalyzer(PrintStream output) {
+        out = output;
+    }
+
+    public static void main(String[] args) {
+        PrintStream output = null;
+        try {
+            output = new PrintStream("annotatorAnalyzer.log");
+            output.println("Name,ExperimentType");
+            List<AnnotatorConfiguration> annotatorConfigs = AnnotatorsConfig.annotators().getConfigurations();
+            AnnotatorAnalyzer analyzer = new AnnotatorAnalyzer(output);
+            for (AnnotatorConfiguration annotatorConfig : annotatorConfigs) {
+                try {
+                    analyzer.analyzeAnnotator(annotatorConfig);
+                } catch (GerbilException e) {
+                    LOGGER.error("Error while analyzing annotator.", e);
+                }
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        } finally {
+            IOUtils.closeQuietly(output);
+        }
+    }
+
+    public void analyzeAnnotator(AnnotatorConfiguration config) throws GerbilException {
+        if (config.isApplicableForExperiment(ExperimentType.D2KB)) {
+            analyze(config, ExperimentType.D2KB);
+        } else if (config.isApplicableForExperiment(ExperimentType.OKE_Task2)) {
+            analyze(config, ExperimentType.OKE_Task2);
+        } else if (config.isApplicableForExperiment(ExperimentType.C2KB)) {
+            analyze(config, ExperimentType.C2KB);
+        } else if (config.isApplicableForExperiment(ExperimentType.ETyping)) {
+            analyze(config, ExperimentType.ETyping);
+        } else if (config.isApplicableForExperiment(ExperimentType.ERec)) {
+            analyze(config, ExperimentType.ERec);
+        } else {
+            LOGGER.error("Cannot analyze the annotator with the following config: " + config.toString());
+        }
+    }
+
+    private void analyze(AnnotatorConfiguration config, ExperimentType type) throws GerbilException {
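+        // Editor's note (illustrative): this method produces one CSV line per
+        // annotator of the form "<name>,<experiment type>", matching the
+        // "Name,ExperimentType" header written in main(). The type printed below
+        // is the one declared by the configuration, which may be more general
+        // than the type the annotator was loaded for.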
+ Annotator annotator = config.getAnnotator(type); + if (annotator == null) { + return; + } + out.print(config.getName()); + out.print(','); + out.print(config.getExperimentType().getName()); + + out.println(); + } + +} diff --git a/src/main/java/org/aksw/gerbil/tools/DBpediaEntityCheckIndexTool.java b/src/main/java/org/aksw/gerbil/tools/DBpediaEntityCheckIndexTool.java new file mode 100644 index 000000000..ce2023091 --- /dev/null +++ b/src/main/java/org/aksw/gerbil/tools/DBpediaEntityCheckIndexTool.java @@ -0,0 +1,108 @@ +package org.aksw.gerbil.tools; + +import java.io.File; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; +import java.util.Set; + +import org.aksw.gerbil.dataset.check.index.Indexer; +import org.aksw.gerbil.exceptions.GerbilException; +import org.aksw.gerbil.semantic.sameas.impl.UriEncodingHandlingSameAsRetriever; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.LineIterator; +import org.apache.commons.lang.time.DurationFormatUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This tool can be used to create the Lucene index that can be used for entity + * checking. A file can be used as source for the data, e.g., the mapping from + * DBpedia resource to Wikipedia ID. + * + * @author Michael Röder (roeder@informatik.uni-leipzig.de) + * + */ +public class DBpediaEntityCheckIndexTool { + + private static final Logger LOGGER = LoggerFactory.getLogger(DBpediaEntityCheckIndexTool.class); + + private static final String INPUT_FOLDER = "C:/Daten/DBpedia"; + private static final String OUTPUT_FOLDER = "indexes/dbpedia_check"; + + public static void main(String[] args) throws GerbilException, IOException { + Indexer index = Indexer.create(OUTPUT_FOLDER); + SimpleDateFormat format = new SimpleDateFormat(); + Date start = Calendar.getInstance().getTime(); + LOGGER.info("Start indexing at {}", format.format(start)); + indexFolder(index, INPUT_FOLDER); + index.close(); + Date end = Calendar.getInstance().getTime(); + LOGGER.info("Indexing finished at {}", format.format(end)); + LOGGER.info("Indexing took: " + DurationFormatUtils.formatDurationHMS(end.getTime() - start.getTime())); + } + + public static void indexFolder(Indexer index, String folder) { + File dir = new File(folder); + for (File f : dir.listFiles()) { + if (f.getName().endsWith(".ttl")) { + index(index, f.getAbsolutePath()); + } + } + } + + public static void index(Indexer indexer, String file) { + UriEncodingHandlingSameAsRetriever retriever = new UriEncodingHandlingSameAsRetriever(); + LineIterator iterator = null; + long size = 0, rounds = 0; + try { + iterator = FileUtils.lineIterator(new File(file), "UTF-8"); + String uri = null; + Set uris; + String old = null; + Date start = Calendar.getInstance().getTime(); + // iterate over the lines + while (iterator.hasNext()) { + String[] split = iterator.next().split("\\s+"); + if (split.length > 2) { + // get the subject of the triple + uri = split[0]; + if (uri.startsWith("<")) { + uri = uri.substring(1); + } + if (uri.endsWith(">")) { + uri = uri.substring(0, uri.length() - 1); + } + + // if this subject is new + if (!uri.equals(old)) { + // retrieve other writings of this URI + uris = retriever.retrieveSameURIs(uri); + if (uris != null) { + for (String u : uris) { + indexer.index(u); + } + } else { + indexer.index(uri); + } + } + size++; + if (size % 100000 == 0) { + Date end = Calendar.getInstance().getTime(); + rounds++; + String avgTime = 
DurationFormatUtils + .formatDurationHMS((end.getTime() - start.getTime()) / rounds); + LOGGER.info("Got 100000 entities...(Sum: {}, AvgTime: {})", size, avgTime); + } + } + } + } catch (IOException e) { + LOGGER.error("Exception while reading file. It will be ignored.", e); + } finally { + LineIterator.closeQuietly(iterator); + } + LOGGER.info("Successfully indexed {} triples", size); + } + +} diff --git a/src/main/java/org/aksw/gerbil/tools/DatasetAnalyzer.java b/src/main/java/org/aksw/gerbil/tools/DatasetAnalyzer.java index 5a7067618..5adfe9eb4 100644 --- a/src/main/java/org/aksw/gerbil/tools/DatasetAnalyzer.java +++ b/src/main/java/org/aksw/gerbil/tools/DatasetAnalyzer.java @@ -16,7 +16,6 @@ */ package org.aksw.gerbil.tools; -import java.io.IOException; import java.io.PrintStream; import java.io.StringReader; import java.util.List; @@ -25,10 +24,13 @@ import org.aksw.gerbil.dataset.DatasetConfiguration; import org.aksw.gerbil.datatypes.ExperimentType; import org.aksw.gerbil.exceptions.GerbilException; +import org.aksw.gerbil.semantic.kb.UriKBClassifier; import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Meaning; import org.aksw.gerbil.web.config.DatasetsConfig; +import org.aksw.gerbil.web.config.RootConfig; import org.apache.commons.io.IOUtils; -import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,7 +48,7 @@ public static void main(String[] args) { try { output = new PrintStream("datasetAnalyzation.log"); output.println( - "name,entitiesPerDoc, entitiesPerToken, avgDocumentLength,numberOfDocuments,numberOfEntities, amountOfPersons, amountOfOrganizations, amountOfLocations, amountOfOthers"); + "name,entitiesPerDoc, entitiesPerToken, avgDocumentLength,numberOfDocuments,numberOfEntities, numberOfEEs, amountOfPersons, amountOfOrganizations, amountOfLocations, amountOfOthers"); DatasetAnalyzer analyzer = new DatasetAnalyzer(output); for (DatasetConfiguration config : datasetConfigs) { try { @@ -63,6 +65,7 @@ public static void main(String[] args) { } private PrintStream output; + private UriKBClassifier classifier = RootConfig.createDefaultUriKBClassifier(); public DatasetAnalyzer(PrintStream output) { this.output = output; @@ -71,24 +74,32 @@ public DatasetAnalyzer(PrintStream output) { public void analyzeDataset(DatasetConfiguration config) throws GerbilException { if (config.isApplicableForExperiment(ExperimentType.D2KB)) { analyze(config, ExperimentType.D2KB); + } else if (config.isApplicableForExperiment(ExperimentType.ETyping)) { + analyze(config, ExperimentType.ETyping); } else if (config.isApplicableForExperiment(ExperimentType.OKE_Task2)) { analyze(config, ExperimentType.OKE_Task2); } else if (config.isApplicableForExperiment(ExperimentType.C2KB)) { analyze(config, ExperimentType.C2KB); + } else if (config.isApplicableForExperiment(ExperimentType.ERec)) { + analyze(config, ExperimentType.ERec); } else { LOGGER.error("Can not analyze the dataset with the following config: " + config.toString()); } } private int countTokensInText(String text) { - WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader(text)); + WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(text)); int tokens = 0; try { + tokenizer.reset(); while (tokenizer.incrementToken()) { ++tokens; } - } catch (IOException e) { + } catch (Exception e) { LOGGER.error("Error while tokenizing text. 
Returning the number of tokens counted so far.", e);
+        } finally {
+            IOUtils.closeQuietly(tokenizer);
         }
         return tokens;
     }
@@ -103,9 +114,15 @@ private void analyze(DatasetConfiguration config, ExperimentType type) throws Ge
         List<Document> documents = dataset.getInstances();
         int annotationsSum = 0;
         int tokensSum = 0;
+        int eeCount = 0;
         for (Document document : documents) {
             annotationsSum += document.getMarkings().size();
             tokensSum += countTokensInText(document.getText());
+            for (Meaning meaning : document.getMarkings(Meaning.class)) {
+                if (!classifier.containsKBUri(meaning.getUris())) {
+                    ++eeCount;
+                }
+            }
         }
         // average entities per document
         output.print((double) annotationsSum / (double) documents.size());
@@ -122,7 +139,9 @@
         // number of entities
         output.print(annotationsSum);
         output.print(',');
-        // output.print(" tokens=" + tokensSum);
+        // number of EEs
+        output.print(eeCount);
+        output.print(',');
         output.println();
     }
diff --git a/src/main/java/org/aksw/gerbil/tools/InitialIndexTool.java b/src/main/java/org/aksw/gerbil/tools/InitialIndexTool.java
new file mode 100644
index 000000000..cd3388df3
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/tools/InitialIndexTool.java
@@ -0,0 +1,199 @@
+package org.aksw.gerbil.tools;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.semantic.sameas.index.Indexer;
+import org.apache.commons.lang.time.DurationFormatUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.hp.hpl.jena.query.Query;
+import com.hp.hpl.jena.query.QueryExecution;
+import com.hp.hpl.jena.query.QueryExecutionFactory;
+import com.hp.hpl.jena.query.QueryFactory;
+import com.hp.hpl.jena.query.QuerySolution;
+import com.hp.hpl.jena.query.ResultSet;
+import com.hp.hpl.jena.rdf.model.RDFNode;
+
+public class InitialIndexTool {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(InitialIndexTool.class);
+
+    private static final String OUTPUT_FOLDER = "lucene_index";
+    private static final String SPARQL_GET = "select distinct ?s ?o where {?s <http://www.w3.org/2002/07/owl#sameAs> ?o}";
+
+    private static String service = "http://de.dbpedia.org/sparql";
+
+    private static String owlSameAs = "<http://www.w3.org/2002/07/owl#sameAs>";
+
+    public static void main(String[] args) throws GerbilException, IOException {
+        Indexer index = new Indexer(OUTPUT_FOLDER);
+        SimpleDateFormat format = new SimpleDateFormat();
+        Date start = Calendar.getInstance().getTime();
+        LOGGER.info("Start indexing at {}", format.format(start));
+        indexFolder(index, args[0]);
+        index.close();
+        Date end = Calendar.getInstance().getTime();
+        LOGGER.info("Indexing finished at {}", format.format(end));
+        LOGGER.info("Indexing took: " + DurationFormatUtils.formatDurationHMS(end.getTime() - start.getTime()));
+    }
+
+    public static void index(Indexer index) throws GerbilException {
+        int offset = 0, limit = 10000;
+        boolean test = true;
+
+        Query q = QueryFactory.create(SPARQL_GET);
+        q.setLimit(limit);
+
+        // collect the sameAs URIs of the current entity here
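+        // Editor's note: the grouping below assumes that all ?o values of one
+        // subject ?s arrive in consecutive result rows. The query has no ORDER BY,
+        // so this only holds if the endpoint returns a stable, subject-grouped
+        // order across the OFFSET/LIMIT pages.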
+        Set<String> sameAsBlock = new HashSet<String>();
+        RDFNode old = null;
+        int rounds = 0, size = 0;
+        long total = 0;
+        Date start = Calendar.getInstance().getTime();
+        do {
+            q.setOffset(offset);
+            Date startQ = Calendar.getInstance().getTime();
+            QueryExecution qexec = QueryExecutionFactory.sparqlService(service, q);
+            ResultSet res = qexec.execSelect();
+            Date endQ = Calendar.getInstance().getTime();
+            // get results
+            size = 0;
+            long sumI = 0;
+            rounds++;
+            // Go through all elements
+            while (res.hasNext()) {
+                size++;
+                QuerySolution solution = res.next();
+                RDFNode node1 = solution.get("s");
+                RDFNode node2 = solution.get("o");
+                if (node1.equals(old)) {
+                    sameAsBlock.add(node2.toString());
+                } else if (old != null) {
+                    // Entity is finished
+                    Date startI = Calendar.getInstance().getTime();
+                    index.index(old.toString(), sameAsBlock);
+                    Date endI = Calendar.getInstance().getTime();
+                    sumI += endI.getTime() - startI.getTime();
+                    total += sameAsBlock.size();
+
+                    sameAsBlock.clear();
+                    // Add URI
+                    sameAsBlock.add(node2.toString());
+                    old = node1;
+                } else {
+                    // First run
+                    sameAsBlock.add(node2.toString());
+                    old = node1;
+                }
+            }
+            if (size < limit) {
+                // No more results
+                test = false;
+            }
+            // Set offset so it starts immediately after the last results
+            offset += limit;
+
+            Date end = Calendar.getInstance().getTime();
+            String avg = DurationFormatUtils.formatDurationHMS((end.getTime() - start.getTime()) / rounds);
+            String avgQ = DurationFormatUtils.formatDurationHMS((endQ.getTime() - startQ.getTime()));
+            String avgI = DurationFormatUtils.formatDurationHMS(sumI);
+            sumI = 0;
+            LOGGER.info("Got {} triples...(Sum: {}, AvgTime: {}, QueryTime: {}, IndexTime: {})",
+                    size, limit * (rounds - 1) + size, avg, avgQ, avgI);
+        } while (test);
+        // done
+        if (!sameAsBlock.isEmpty()) {
+            index.index(old.toString(), sameAsBlock);
+            sameAsBlock.clear();
+        }
+        LOGGER.info("Successfully indexed {} triples", total);
+    }
+
+    public static void indexFolder(Indexer index, String folder) throws GerbilException, IOException {
+        File dir = new File(folder);
+
+        for (File f : dir.listFiles()) {
+            if (f.getName().endsWith(".nt")) {
+                index(index, f.getAbsolutePath());
+            }
+        }
+    }
+
+    public static void index(Indexer index, String file) throws GerbilException, IOException {
+        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file),
+                Charset.forName("UTF-8")));
+
+        // collect the sameAs URIs of the current entity here
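+        // Editor's note: like the SPARQL variant above, this grouping only works
+        // if the .nt file lists all owl:sameAs triples of a subject on
+        // consecutive lines, which subject-sorted dump files satisfy.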
+        Set<String> sameAsBlock = new HashSet<String>();
+
+        long total = 0, size = 0, rounds = 0;
+        String line = "";
+        String old = null;
+        Date start = Calendar.getInstance().getTime();
+        while ((line = reader.readLine()) != null) {
+            String[] split = line.split("\\s+");
+            if (!split[1].equals(owlSameAs)) {
+                continue;
+            }
+            String node1 = split[0].replace("<", "").replace(">", "");
+            String node2 = split[2];
+            node2 = node2.substring(node2.indexOf("<") + 1, node2.lastIndexOf(">")).trim();
+
+            if (node1.equals(old)) {
+                sameAsBlock.add(node2);
+            } else if (old != null) {
+                // Entity is finished
+                index.index(old, sameAsBlock);
+                total += sameAsBlock.size();
+
+                sameAsBlock.clear();
+                // Add URI
+                sameAsBlock.add(node2);
+                old = node1;
+            } else {
+                // First run
+                sameAsBlock.add(node2);
+                old = node1;
+            }
+            size++;
+            if (size % 100000 == 0) {
+                Date end = Calendar.getInstance().getTime();
+                rounds++;
+                String avgTime = DurationFormatUtils.formatDurationHMS((end.getTime() - start.getTime()) / rounds);
+                LOGGER.info("Got 100000 triples...(Sum: {}, AvgTime: {})", size, avgTime);
+            }
+        }
+
+        // done
+        if (!sameAsBlock.isEmpty()) {
+            index.index(old, sameAsBlock);
+            sameAsBlock.clear();
+        }
+        reader.close();
+        LOGGER.info("Successfully indexed {} triples", total);
+    }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/tools/NIFDatasetLoadingTest.java b/src/main/java/org/aksw/gerbil/tools/NIFDatasetLoadingTest.java
index 88a28108e..ad9641bf0 100644
--- a/src/main/java/org/aksw/gerbil/tools/NIFDatasetLoadingTest.java
+++ b/src/main/java/org/aksw/gerbil/tools/NIFDatasetLoadingTest.java
@@ -16,13 +16,19 @@
 */
 package org.aksw.gerbil.tools;
 
+import java.util.List;
+
 import org.aksw.gerbil.dataset.impl.nif.FileBasedNIFDataset;
 import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
 import org.apache.commons.io.IOUtils;
 import org.apache.jena.riot.Lang;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.carrotsearch.hppc.ObjectIntOpenHashMap;
+
 public class NIFDatasetLoadingTest {
 
     private static final Logger LOGGER = LoggerFactory.getLogger(NIFDatasetLoadingTest.class);
@@ -40,7 +46,24 @@ public static void main(String[] args) {
         } catch (GerbilException e) {
             LOGGER.error("Got an exception while trying to load the dataset.", e);
         }
+        List<Document> documents = dataset.getInstances();
+        LOGGER.info("Dataset size: {} documents", documents.size());
+        ObjectIntOpenHashMap<String> annotationTypes;
+        for (Document document : documents) {
+            annotationTypes = listAnnotationTypes(document);
+            LOGGER.info("Document {} annotation types: {}", document.getDocumentURI(), annotationTypes.toString());
+        }
         IOUtils.closeQuietly(dataset);
         LOGGER.info("Finished loading of given test dataset.");
     }
+
+    private static ObjectIntOpenHashMap<String> listAnnotationTypes(Document document) {
+        ObjectIntOpenHashMap<String> annotationTypes = new ObjectIntOpenHashMap<String>();
+        String className;
+        for (Marking marking : document.getMarkings()) {
+            className = marking.getClass().getSimpleName();
+            annotationTypes.putOrAdd(className, 1, 1);
+        }
+        return annotationTypes;
+    }
+}
diff --git a/src/main/java/org/aksw/gerbil/tools/UriExport.java b/src/main/java/org/aksw/gerbil/tools/UriExport.java
new file mode 100644
index 000000000..992ec3d9a
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/tools/UriExport.java
@@ -0,0 +1,71 @@
+package org.aksw.gerbil.tools;
+
+import java.io.BufferedOutputStream;
+import java.io.FileOutputStream;
+import
java.io.IOException; +import java.io.PrintStream; +import java.util.List; + +import org.aksw.gerbil.dataset.Dataset; +import org.aksw.gerbil.dataset.DatasetConfiguration; +import org.aksw.gerbil.dataset.check.EntityCheckerManager; +import org.aksw.gerbil.dataset.check.impl.EntityCheckerManagerImpl; +import org.aksw.gerbil.datatypes.ExperimentType; +import org.aksw.gerbil.exceptions.GerbilException; +import org.aksw.gerbil.semantic.sameas.SameAsRetriever; +import org.aksw.gerbil.semantic.sameas.impl.ErrorFixingSameAsRetriever; +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.MeaningSpan; +import org.aksw.gerbil.web.config.AdapterList; +import org.aksw.gerbil.web.config.DatasetsConfig; +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class UriExport { + + private static final Logger LOGGER = LoggerFactory.getLogger(UriExport.class); + + private static final SameAsRetriever SAME_AS_RETRIEVER = new ErrorFixingSameAsRetriever(); + private static final EntityCheckerManager ENTITY_CHECKER_MANAGER = new EntityCheckerManagerImpl(); + + public static void main(String[] args) { + PrintStream pout = null; + try { + pout = new PrintStream(new BufferedOutputStream(new FileOutputStream("exportedURIs.txt"))); + + AdapterList adapterList = DatasetsConfig.datasets(ENTITY_CHECKER_MANAGER, + SAME_AS_RETRIEVER); + List datasetConfigs = null; + datasetConfigs = adapterList.getAdaptersForExperiment(ExperimentType.D2KB); + for (DatasetConfiguration datasetConfig : datasetConfigs) { + try { + Dataset dataset = datasetConfig.getDataset(ExperimentType.D2KB); + printDatasetUris(dataset, pout); + LOGGER.info("Finished {}", dataset.getName()); + } catch (GerbilException e) { + LOGGER.error("Couldn't load dataset. It will be ignored.", e); + } + } + } catch (IOException e) { + LOGGER.error("Error while writing file. 
Aborting.", e); + } finally { + IOUtils.closeQuietly(pout); + } + } + + private static void printDatasetUris(Dataset dataset, PrintStream pout) { + for (Document document : dataset.getInstances()) { + String text = document.getText(); + for (MeaningSpan meaning : document.getMarkings(MeaningSpan.class)) { + for (String uri : meaning.getUris()) { + pout.print(uri); + pout.print('\t'); + pout.print(text.substring(meaning.getStartPosition(), + meaning.getStartPosition() + meaning.getLength())); + pout.println(); + } + } + } + } +} diff --git a/src/main/java/org/aksw/gerbil/web/FileUploadController.java b/src/main/java/org/aksw/gerbil/web/FileUploadController.java index 481eeb1ae..9715194ec 100644 --- a/src/main/java/org/aksw/gerbil/web/FileUploadController.java +++ b/src/main/java/org/aksw/gerbil/web/FileUploadController.java @@ -16,6 +16,7 @@ */ package org.aksw.gerbil.web; +import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; @@ -46,8 +47,7 @@ @PropertySource("gerbil.properties") public class FileUploadController { - private static final transient Logger logger = LoggerFactory - .getLogger(FileUploadController.class); + private static final transient Logger logger = LoggerFactory.getLogger(FileUploadController.class); @Value("${org.aksw.gerbil.UploadPath}") private String path; @@ -60,9 +60,8 @@ public ModelAndView upload() { } @RequestMapping(value = "upload", method = RequestMethod.POST) - public @ResponseBody - ResponseEntity upload( - MultipartHttpServletRequest request, HttpServletResponse response) { + public @ResponseBody ResponseEntity upload(MultipartHttpServletRequest request, + HttpServletResponse response) { if (path == null) { logger.error("Path must be not null"); @@ -84,9 +83,9 @@ ResponseEntity upload( try { fileContainer.setBytes(mpf.getBytes()); createFolderIfNotExists(); - FileCopyUtils.copy(mpf.getBytes(), new FileOutputStream(path - + mpf.getOriginalFilename())); - + FileCopyUtils.copy(mpf.getBytes(), + new BufferedOutputStream(new FileOutputStream(path + mpf.getOriginalFilename()))); + // the copy method closed the output stream } catch (IOException e) { logger.error("Error during file upload", e); fileContainer.setError(e.getMessage()); @@ -95,8 +94,7 @@ ResponseEntity upload( } UploadFileContainer uploadFileContainer = new UploadFileContainer(files); - return new ResponseEntity(uploadFileContainer, - HttpStatus.OK); + return new ResponseEntity(uploadFileContainer, HttpStatus.OK); } private void createFolderIfNotExists() { diff --git a/src/main/java/org/aksw/gerbil/web/MainController.java b/src/main/java/org/aksw/gerbil/web/MainController.java index 9fb6021ab..0d1acfaa8 100644 --- a/src/main/java/org/aksw/gerbil/web/MainController.java +++ b/src/main/java/org/aksw/gerbil/web/MainController.java @@ -191,6 +191,22 @@ public ModelAndView experiment(@RequestParam(value = "id") String id, HttpServle ModelAndView model = new ModelAndView(); model.setViewName("experiment"); model.addObject("tasks", results); + int currentExperimentID=-1; + int currentState = 0; + List tasks = dao.getAllRunningExperimentTasks(); + for(ExperimentTaskResult r : results){ + if(r.state==0){ + continue; + } + if(tasks.contains(r)){ + currentState = r.state; + currentExperimentID = tasks.indexOf(r); + break; + } + } + model.addObject("currentState", currentState); + model.addObject("currentExperimentID", currentExperimentID); + model.addObject("workers", RootConfig.getNoOfWorkers()); model.addObject("dataid", 
dataIdGenerator.createDataIDModel(results, id)); int additionalResultIds[] = ResultNameToIdMapping.getInstance().listAdditionalResultIds(results); // we need Double objects to make sure that they can be null @@ -240,6 +256,7 @@ public ModelAndView experiment(@RequestParam(value = "id") String id, HttpServle case OKE_Task2: case A2KB: case ERec: + case RT2KB: case Sa2KB: return new ModelMap("Matching", Lists.newArrayList(Matching.WEAK_ANNOTATION_MATCH, Matching.STRONG_ANNOTATION_MATCH)); diff --git a/src/main/java/org/aksw/gerbil/web/NIFWSTestingController.java b/src/main/java/org/aksw/gerbil/web/NIFWSTestingController.java index ab82b70dc..3a1ec395b 100644 --- a/src/main/java/org/aksw/gerbil/web/NIFWSTestingController.java +++ b/src/main/java/org/aksw/gerbil/web/NIFWSTestingController.java @@ -90,6 +90,10 @@ public class NIFWSTestingController { annotator.performTyping(document); break; } + case RT2KB: { + annotator.performRT2KBTask(document); + break; + } default: { throw new IllegalArgumentException("Got an unknown experiment type \"" + experimentType + "\"."); } diff --git a/src/main/java/org/aksw/gerbil/web/config/AnnotatorsConfig.java b/src/main/java/org/aksw/gerbil/web/config/AnnotatorsConfig.java index cbaacbdb9..253d81e0a 100644 --- a/src/main/java/org/aksw/gerbil/web/config/AnnotatorsConfig.java +++ b/src/main/java/org/aksw/gerbil/web/config/AnnotatorsConfig.java @@ -29,6 +29,7 @@ import org.aksw.gerbil.annotator.SingletonAnnotatorConfigImpl; import org.aksw.gerbil.config.GerbilConfiguration; import org.aksw.gerbil.datatypes.ExperimentType; +import org.aksw.gerbil.web.config.check.Checker; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.context.annotation.Bean; @@ -46,6 +47,9 @@ public class AnnotatorsConfig { public static final String ANNOTATOR_EXPERIMENT_TYPE_SUFFIX = "experimentType"; public static final String ANNOTATOR_NAME_SUFFIX = "name"; public static final String ANNOTATOR_SINGLETON_FLAG_SUFFIX = "singleton"; + + public static final String ANNOTATOR_CHECK_CLASS_SUFFIX = "check.class"; + public static final String ANNOTATOR_CHECK_ARGS_SUFFIX = "check.args"; public static void main(String[] args) { annotators(); @@ -95,7 +99,7 @@ private static AnnotatorConfiguration getConfiguration(String annotatorKey) org.apache.commons.configuration.Configuration config = GerbilConfiguration.getInstance(); StringBuilder keyBuilder = new StringBuilder(); String key; - + key = buildKey(keyBuilder, annotatorKey, ANNOTATOR_NAME_SUFFIX); if (!config.containsKey(key)) { LOGGER.error("Couldn't get a name for the \"" + annotatorKey + "\" annotator."); @@ -144,6 +148,36 @@ private static AnnotatorConfiguration getConfiguration(String annotatorKey) constructorArgClasses[i] = String.class; } Constructor constructor = annotatorClass.getConstructor(constructorArgClasses); + + // If a checker class has been defined + key = buildKey(keyBuilder, annotatorKey, ANNOTATOR_CHECK_CLASS_SUFFIX); + if (config.containsKey(key)) { + String checkerClassName = config.getString(key); + // If checker arguments have been defined + key = buildKey(keyBuilder, annotatorKey, ANNOTATOR_CHECK_ARGS_SUFFIX); + String checkerArgStrings[]; + if (config.containsKey(key)) { + checkerArgStrings = config.getStringArray(key); + } else { + checkerArgStrings = new String[0]; + } + Object checkerArgs[] = new Object[checkerArgStrings.length]; + for (int i = 0; i < checkerArgs.length; ++i) { + checkerArgs[i] = checkerArgStrings[i]; + } + try { + @SuppressWarnings("unchecked") + Class checkerClass = 
(Class) AnnotatorsConfig.class.getClassLoader() + .loadClass(checkerClassName); + Checker checker = checkerClass.newInstance(); + if (!checker.check(checkerArgs)) { + LOGGER.info("Check for annotator \"{}\" failed. It won't be available.", name); + return null; + } + } catch (Exception e) { + LOGGER.error("Error while trying to run check for annotator \"" + name + "\". Returning null.", e); + } + } if (isSingleton) { return new SingletonAnnotatorConfigImpl(name, cacheable, constructor, constructorArgs, type); diff --git a/src/main/java/org/aksw/gerbil/web/config/DatasetsConfig.java b/src/main/java/org/aksw/gerbil/web/config/DatasetsConfig.java index c4cbcaf1a..889cf851e 100644 --- a/src/main/java/org/aksw/gerbil/web/config/DatasetsConfig.java +++ b/src/main/java/org/aksw/gerbil/web/config/DatasetsConfig.java @@ -33,6 +33,7 @@ import org.aksw.gerbil.dataset.datahub.DatahubNIFLoader; import org.aksw.gerbil.datatypes.ExperimentType; import org.aksw.gerbil.semantic.sameas.SameAsRetriever; +import org.aksw.gerbil.web.config.check.Checker; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.context.annotation.Bean; @@ -50,6 +51,9 @@ public class DatasetsConfig { public static final String ANNOTATOR_EXPERIMENT_TYPE_SUFFIX = "experimentType"; public static final String ANNOTATOR_NAME_SUFFIX = "name"; + public static final String ANNOTATOR_CHECK_CLASS_SUFFIX = "check.class"; + public static final String ANNOTATOR_CHECK_ARGS_SUFFIX = "check.args"; + @Bean public static AdapterList datasets(EntityCheckerManager entityCheckerManager, SameAsRetriever globalRetriever) { @@ -152,6 +156,36 @@ private static DatasetConfiguration getConfiguration(String datasetKey, EntityCh Constructor constructor = datasetClass.getConstructor(constructorArgClasses); + // If a checker class has been defined + key = buildKey(keyBuilder, datasetKey, ANNOTATOR_CHECK_CLASS_SUFFIX); + if (config.containsKey(key)) { + String checkerClassName = config.getString(key); + // If checker arguments have been defined + key = buildKey(keyBuilder, datasetKey, ANNOTATOR_CHECK_ARGS_SUFFIX); + String checkerArgStrings[]; + if (config.containsKey(key)) { + checkerArgStrings = config.getStringArray(key); + } else { + checkerArgStrings = new String[0]; + } + Object checkerArgs[] = new Object[checkerArgStrings.length]; + for (int i = 0; i < checkerArgs.length; ++i) { + checkerArgs[i] = checkerArgStrings[i]; + } + try { + @SuppressWarnings("unchecked") + Class checkerClass = (Class) DatasetsConfig.class.getClassLoader() + .loadClass(checkerClassName); + Checker checker = checkerClass.newInstance(); + if (!checker.check(checkerArgs)) { + LOGGER.info("Check for dataset \"{}\" failed. It won't be available.", name); + return null; + } + } catch (Exception e) { + LOGGER.error("Error while trying to run check for dataset \"" + name + "\". 
Returning null.", e); + } + } + // return new DatasetConfigurationImpl(name, cacheable, constructor, // constructorArgs, type, entityCheckerManager); return new SingletonDatasetConfigImpl(name, cacheable, constructor, constructorArgs, type, entityCheckerManager, diff --git a/src/main/java/org/aksw/gerbil/web/config/RootConfig.java b/src/main/java/org/aksw/gerbil/web/config/RootConfig.java index 8e0f6fa24..707bb9668 100644 --- a/src/main/java/org/aksw/gerbil/web/config/RootConfig.java +++ b/src/main/java/org/aksw/gerbil/web/config/RootConfig.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Arrays; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Set; @@ -29,9 +30,13 @@ import org.aksw.gerbil.dataset.check.impl.FileBasedCachingEntityCheckerManager; import org.aksw.gerbil.dataset.check.impl.HttpBasedEntityChecker; import org.aksw.gerbil.dataset.check.impl.InMemoryCachingEntityCheckerManager; +import org.aksw.gerbil.dataset.check.index.IndexBasedEntityChecker; import org.aksw.gerbil.datatypes.ExperimentType; import org.aksw.gerbil.evaluate.EvaluatorFactory; +import org.aksw.gerbil.exceptions.GerbilException; import org.aksw.gerbil.execute.AnnotatorOutputWriter; +import org.aksw.gerbil.semantic.kb.SimpleWhiteListBasedUriKBClassifier; +import org.aksw.gerbil.semantic.kb.UriKBClassifier; import org.aksw.gerbil.semantic.sameas.SameAsRetriever; import org.aksw.gerbil.semantic.sameas.SingleUriSameAsRetriever; import org.aksw.gerbil.semantic.sameas.impl.CrawlingSameAsRetrieverDecorator; @@ -42,6 +47,7 @@ import org.aksw.gerbil.semantic.sameas.impl.cache.FileBasedCachingSameAsRetriever; import org.aksw.gerbil.semantic.sameas.impl.cache.InMemoryCachingSameAsRetriever; import org.aksw.gerbil.semantic.sameas.impl.http.HTTPBasedSameAsRetriever; +import org.aksw.gerbil.semantic.sameas.impl.index.IndexBasedSameAsRetriever; import org.aksw.gerbil.semantic.sameas.impl.wiki.WikiDbPediaBridgingSameAsRetriever; import org.aksw.gerbil.semantic.sameas.impl.wiki.WikipediaApiBasedSingleUriSameAsRetriever; import org.aksw.gerbil.semantic.subclass.ClassHierarchyLoader; @@ -110,11 +116,16 @@ public class RootConfig { private static final String ENTITY_CHECKING_MANAGER_IN_MEM_CACHE_SIZE_KEY = "org.aksw.gerbil.dataset.check.InMemoryCachingEntityCheckerManager.cacheSize"; private static final String ENTITY_CHECKING_MANAGER_IN_MEM_CACHE_DURATION_KEY = "org.aksw.gerbil.dataset.check.InMemoryCachingEntityCheckerManager.cacheDuration"; private static final String HTTP_BASED_ENTITY_CHECKING_NAMESPACE_KEY = "org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace"; + private static final String INDEX_BASED_ENTITY_CHECKING_CONFIG_KEY_START = "org.aksw.gerbil.dataset.check.IndexBasedEntityChecker"; private static final String WIKIPEDIA_BASED_SAME_AS_RETRIEVAL_DOMAIN_KEY = "org.aksw.gerbil.semantic.sameas.impl.wiki.WikipediaApiBasedSingleUriSameAsRetriever.domain"; private static final String SAME_AS_RETRIEVAL_DOMAIN_BLACKLIST_KEY = "org.aksw.gerbil.semantic.sameas.impl.UriFilteringSameAsRetrieverDecorator.domainBlacklist"; + private static final String INDEXED_BASED_SAME_AS_RETRIEVER_FOLDER_KEY = "org.aksw.gerbil.semantic.sameas.impl.index.IndexBasedSameAsRetriever.folder"; + private static final String INDEXED_BASED_SAME_AS_RETRIEVER_DOMAIN_KEY = "org.aksw.gerbil.semantic.sameas.impl.index.IndexBasedSameAsRetriever.domain"; private static final String AVAILABLE_EXPERIMENT_TYPES_KEY = "org.aksw.gerbil.web.MainController.availableExperimentTypes"; + private 
static final String DEFAULT_WELL_KNOWN_KBS_PARAMETER_KEY = "org.aksw.gerbil.evaluate.DefaultWellKnownKB"; + static @Bean public PropertySourcesPlaceholderConfigurer myPropertySourcesPlaceholderConfigurer() { PropertySourcesPlaceholderConfigurer p = new PropertySourcesPlaceholderConfigurer(); Resource[] resourceLocations = new Resource[] { new ClassPathResource("gerbil.properties"), }; @@ -163,13 +174,34 @@ public class RootConfig { retrieverManager.addStaticRetriever(new UriEncodingHandlingSameAsRetriever()); // HTTP based same as retrieval + HTTPBasedSameAsRetriever httpRetriever = null; if (GerbilConfiguration.getInstance().containsKey(HTTP_SAME_AS_RETRIEVAL_DOMAIN_KEY)) { - HTTPBasedSameAsRetriever httpRetriever = new HTTPBasedSameAsRetriever(); + httpRetriever = new HTTPBasedSameAsRetriever(); for (String domain : GerbilConfiguration.getInstance().getStringArray(HTTP_SAME_AS_RETRIEVAL_DOMAIN_KEY)) { retrieverManager.addDomainSpecificRetriever(domain, httpRetriever); } } + // If there is an index based same as retriever available + if (GerbilConfiguration.getInstance().containsKey(INDEXED_BASED_SAME_AS_RETRIEVER_FOLDER_KEY)) { + SameAsRetriever retriever; + try { + retriever = new IndexBasedSameAsRetriever( + GerbilConfiguration.getInstance().getString(INDEXED_BASED_SAME_AS_RETRIEVER_FOLDER_KEY)); + + } catch (GerbilException e) { + LOGGER.error("Could not load Index Retriever. using HTTPBasedSameAs Retriever instead"); + if (httpRetriever == null) { + retriever = new HTTPBasedSameAsRetriever(); + } else { + retriever = httpRetriever; + } + } + for (String domain : GerbilConfiguration.getInstance() + .getStringArray(INDEXED_BASED_SAME_AS_RETRIEVER_DOMAIN_KEY)) { + retrieverManager.addDomainSpecificRetriever(domain, retriever); + } + } // Wikipedia API based same as retrieval if (GerbilConfiguration.getInstance().containsKey(WIKIPEDIA_BASED_SAME_AS_RETRIEVAL_DOMAIN_KEY)) { SingleUriSameAsRetriever singleRetriever = new WikipediaApiBasedSingleUriSameAsRetriever(); @@ -199,6 +231,7 @@ public class RootConfig { decoratedRetriever = FileBasedCachingSameAsRetriever.create(sameAsRetriever, false, new File(GerbilConfiguration.getInstance().getString(SAME_AS_CACHE_FILE_KEY))); } + if (decoratedRetriever == null) { LOGGER.warn("Couldn't create file based cache for sameAs retrieving. 
Trying to create in Memory cache."); if (GerbilConfiguration.getInstance().containsKey(SAME_AS_IN_MEMORY_CACHE_SIZE_KEY)) { @@ -241,6 +274,7 @@ public static AnnotatorOutputWriter getAnnotatorOutputWriter() { } } + @SuppressWarnings("unchecked") public static @Bean EntityCheckerManager getEntityCheckerManager() { EntityCheckerManager manager = null; Configuration config = GerbilConfiguration.getInstance(); @@ -271,13 +305,41 @@ public static AnnotatorOutputWriter getAnnotatorOutputWriter() { if (manager == null) { manager = new EntityCheckerManagerImpl(); } - @SuppressWarnings("unchecked") List namespaces = config.getList(HTTP_BASED_ENTITY_CHECKING_NAMESPACE_KEY); if (!namespaces.isEmpty()) { for (String namespace : namespaces) { manager.registerEntityChecker(namespace.toString(), new HttpBasedEntityChecker(namespace.toString())); } } + @SuppressWarnings("rawtypes") + Iterator keyIterator = config.getKeys(INDEX_BASED_ENTITY_CHECKING_CONFIG_KEY_START); + while (keyIterator.hasNext()) { + String key = keyIterator.next().toString(); + namespaces = config.getList(key); + if (!namespaces.isEmpty()) { + // the first "namespace" is the directory of the index + IndexBasedEntityChecker indexBasedChecker = IndexBasedEntityChecker.create(namespaces.get(0)); + if (indexBasedChecker != null) { + boolean first = true; + for (String namespace : namespaces) { + if (first) { + first = false; + } else { + manager.registerEntityChecker(namespace.toString(), indexBasedChecker); + } + } + } else { + LOGGER.error( + "Couldn't create index based entity checker for index \"{}\". Creating HTTP based checker.", + namespaces.get(0)); + // use HTTP based checker + for (String namespace : namespaces) { + manager.registerEntityChecker(namespace.toString(), + new HttpBasedEntityChecker(namespace.toString())); + } + } + } + } return manager; } @@ -310,4 +372,32 @@ public static ExperimentType[] getAvailableExperimentTypes() { } } + public static UriKBClassifier createDefaultUriKBClassifier() { + return new SimpleWhiteListBasedUriKBClassifier(loadDefaultKBs()); + } + + public static String[] loadDefaultKBs() { + String kbs[] = GerbilConfiguration.getInstance().getStringArray(DEFAULT_WELL_KNOWN_KBS_PARAMETER_KEY); + if (kbs == null) { + LOGGER.error("Couldn't load the list of well known KBs. This GERBIL instance might not work as expected!"); + } + return kbs; + } + + public static int getNoOfWorkers() { + int numberOfWorkers = DEFAULT_NUMBER_OF_WORKERS; + if (GerbilConfiguration.getInstance().containsKey(NUMBER_OF_WORKERS_KEY)) { + try { + numberOfWorkers = GerbilConfiguration.getInstance().getInt(NUMBER_OF_WORKERS_KEY); + } catch (Exception e) { + // LOGGER.warn("Couldn't load number of workers from config. + // Using the default number.", e); + } + } else { + // LOGGER.warn("Couldn't load number of workers from config. Using + // the default number."); + } + return numberOfWorkers; + } + } diff --git a/src/main/java/org/aksw/gerbil/web/config/check/AnnotatorChecker.java b/src/main/java/org/aksw/gerbil/web/config/check/AnnotatorChecker.java new file mode 100644 index 000000000..007f5aff2 --- /dev/null +++ b/src/main/java/org/aksw/gerbil/web/config/check/AnnotatorChecker.java @@ -0,0 +1,30 @@ +package org.aksw.gerbil.web.config.check; + +import org.aksw.gerbil.config.GerbilConfiguration; +import org.apache.commons.configuration.Configuration; + +/** + * A {@link Checker} that checks whether the given object(s) (interpreted as + * String) are defined properties. 
+ *
+ *
+ */
+public class AnnotatorChecker implements Checker {
+
+    private static Configuration config = GerbilConfiguration.getInstance();
+
+    @Override
+    public boolean check(Object... objects) {
+        for (Object it : objects) {
+            Object prop = config.getProperty(it.toString());
+            if (prop == null) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/web/config/check/Checker.java b/src/main/java/org/aksw/gerbil/web/config/check/Checker.java
new file mode 100644
index 000000000..881d8b38a
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/web/config/check/Checker.java
@@ -0,0 +1,23 @@
+package org.aksw.gerbil.web.config.check;
+
+/**
+ * A simple interface for a class that offers a {@link #check(Object...)} method
+ * that checks the given objects, e.g., files regarding their existence, and
+ * returns true or false.
+ *
+ * @author Michael Röder (roeder@informatik.uni-leipzig.de)
+ *
+ */
+public interface Checker {
+
+    /**
+     * Checks the given objects and returns true if the check was
+     * successful. Note that the semantics of the given object(s) is defined by
+     * the implementing classes.
+     *
+     * @param objects
+     *            the object(s) that should be checked
+     * @return true if the check was successful.
+     */
+    public boolean check(Object... objects);
+}
diff --git a/src/main/java/org/aksw/gerbil/web/config/check/DirectoryChecker.java b/src/main/java/org/aksw/gerbil/web/config/check/DirectoryChecker.java
new file mode 100644
index 000000000..13b4e8dff
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/web/config/check/DirectoryChecker.java
@@ -0,0 +1,36 @@
+package org.aksw.gerbil.web.config.check;
+
+import java.io.File;
+
+/**
+ * A {@link Checker} that checks whether the given object(s) (interpreted as
+ * Strings) point to existing directories.
+ *
+ * @author Michael Röder (roeder@informatik.uni-leipzig.de)
+ *
+ */
+public class DirectoryChecker implements Checker {
+
+    @Override
+    public boolean check(Object... objects) {
+        for (int i = 0; i < objects.length; ++i) {
+            if (!checkSingleObject(objects[i])) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    public boolean checkSingleObject(Object object) {
+        File file = null;
+        if (object instanceof File) {
+            file = (File) object;
+        } else if (object instanceof String) {
+            file = new File((String) object);
+        } else {
+            file = new File(object.toString());
+        }
+        return file.exists() && file.isDirectory();
+    }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/web/config/check/FileChecker.java b/src/main/java/org/aksw/gerbil/web/config/check/FileChecker.java
new file mode 100644
index 000000000..d74af0114
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/web/config/check/FileChecker.java
@@ -0,0 +1,36 @@
+package org.aksw.gerbil.web.config.check;
+
+import java.io.File;
+
+/**
+ * A {@link Checker} that checks whether the given object(s) (interpreted as
+ * Strings) point to existing files.
+ *
+ * @author Michael Röder (roeder@informatik.uni-leipzig.de)
+ *
+ */
+public class FileChecker implements Checker {
+
+    @Override
+    public boolean check(Object...
objects) { + for (int i = 0; i < objects.length; ++i) { + if (!checkSingleObject(objects[i])) { + return false; + } + } + return true; + } + + public boolean checkSingleObject(Object object) { + File file = null; + if (object instanceof File) { + file = (File) object; + } else if (object instanceof String) { + file = new File((String) object); + } else { + file = new File(object.toString()); + } + return file.exists() && file.isFile(); + } + +} diff --git a/src/main/properties/annotators.properties b/src/main/properties/annotators.properties index 34aa1e0c6..5c8569083 100644 --- a/src/main/properties/annotators.properties +++ b/src/main/properties/annotators.properties @@ -58,12 +58,12 @@ org.aksw.gerbil.annotators.definition.Dexter.class=org.aksw.gerbil.annotator.imp org.aksw.gerbil.annotators.definition.Dexter.constructorArgs=${org.aksw.gerbil.annotators.DexterAnnotator.annotationUrl} ### DoSeR -#org.aksw.gerbil.annotators.DoSeR.serviceUrl=http://zaire.dimis.fim.uni-passau.de:8999/doser-gerbilrest/doserwrapper -#org.aksw.gerbil.annotators.definition.DoSeR.name=DoSeR -#org.aksw.gerbil.annotators.definition.DoSeR.experimentType=D2KB -#org.aksw.gerbil.annotators.definition.DoSeR.cacheable=true -#org.aksw.gerbil.annotators.definition.DoSeR.class=org.aksw.gerbil.annotator.impl.nif.NIFBasedAnnotatorWebservice -#org.aksw.gerbil.annotators.definition.DoSeR.constructorArgs=${org.aksw.gerbil.annotators.DoSeR.serviceUrl} +org.aksw.gerbil.annotators.DoSeR.serviceUrl=http://zaire.dimis.fim.uni-passau.de:8999/doser-gerbilrest/doserwrapper +org.aksw.gerbil.annotators.definition.DoSeR.name=DoSeR +org.aksw.gerbil.annotators.definition.DoSeR.experimentType=D2KB +org.aksw.gerbil.annotators.definition.DoSeR.cacheable=true +org.aksw.gerbil.annotators.definition.DoSeR.class=org.aksw.gerbil.annotator.impl.nif.NIFBasedAnnotatorWebservice +org.aksw.gerbil.annotators.definition.DoSeR.constructorArgs=${org.aksw.gerbil.annotators.DoSeR.serviceUrl} ### Entityclassifier.eu NER org.aksw.gerbil.annotators.EntityclassifierEUConfig.url=http://entityclassifier.eu/thd/api/v2/extraction?provenance=thd&entity_type=ne&lang=en&spotting_method=grammars&linking_method=LuceneSearchSkipDisPage&apikey= @@ -73,6 +73,8 @@ org.aksw.gerbil.annotators.definition.EcEU.name=Entityclassifier.eu NER org.aksw.gerbil.annotators.definition.EcEU.experimentType=A2KB org.aksw.gerbil.annotators.definition.EcEU.cacheable=true org.aksw.gerbil.annotators.definition.EcEU.class=org.aksw.gerbil.annotator.impl.nif.NIFBasedAnnotatorWebservice +org.aksw.gerbil.annotators.definition.EcEU.check.class=org.aksw.gerbil.web.config.check.AnnotatorChecker +org.aksw.gerbil.annotators.definition.EcEU.check.args=org.aksw.gerbil.annotators.EntityclassifierEUConfig.apiKey org.aksw.gerbil.annotators.definition.EcEU.constructorArgs=${org.aksw.gerbil.annotators.EntityclassifierEUConfig.url}${org.aksw.gerbil.annotators.EntityclassifierEUConfig.apiKey} ### FRED @@ -130,14 +132,21 @@ org.aksw.gerbil.annotators.definition.kea.experimentType=A2KB org.aksw.gerbil.annotators.definition.kea.cacheable=true #org.aksw.gerbil.annotators.definition.kea.singleton=true org.aksw.gerbil.annotators.definition.kea.class=org.aksw.gerbil.annotator.impl.nif.NIFBasedAnnotatorWebservice +org.aksw.gerbil.annotators.definition.kea.check.class=org.aksw.gerbil.web.config.check.AnnotatorChecker +org.aksw.gerbil.annotators.definition.kea.check.args=org.aksw.gerbil.annotators.KeaAnnotatorConfig.user 
+org.aksw.gerbil.annotators.definition.kea.check.args=org.aksw.gerbil.annotators.KeaAnnotatorConfig.password org.aksw.gerbil.annotators.definition.kea.constructorArgs=http://${org.aksw.gerbil.annotators.KeaAnnotatorConfig.user}:${org.aksw.gerbil.annotators.KeaAnnotatorConfig.password}@${org.aksw.gerbil.annotators.KeaAnnotatorConfig.annotationUrl} org.aksw.gerbil.annotators.definition.kea2.name=Kea org.aksw.gerbil.annotators.definition.kea2.experimentType=D2KB org.aksw.gerbil.annotators.definition.kea2.cacheable=true #org.aksw.gerbil.annotators.definition.kea2.singleton=true org.aksw.gerbil.annotators.definition.kea2.class=org.aksw.gerbil.annotator.impl.nif.NIFBasedAnnotatorWebservice +org.aksw.gerbil.annotators.definition.kea2.check.class=org.aksw.gerbil.web.config.check.AnnotatorChecker +org.aksw.gerbil.annotators.definition.kea2.check.args=org.aksw.gerbil.annotators.KeaAnnotatorConfig.user +org.aksw.gerbil.annotators.definition.kea2.check.args=org.aksw.gerbil.annotators.KeaAnnotatorConfig.password org.aksw.gerbil.annotators.definition.kea2.constructorArgs=http://${org.aksw.gerbil.annotators.KeaAnnotatorConfig.user}:${org.aksw.gerbil.annotators.KeaAnnotatorConfig.password}@${org.aksw.gerbil.annotators.KeaAnnotatorConfig.disambiguationUrl} + ### NERD-ML #NERD endpoint org.aksw.gerbil.annotators.NERD.host=http://nerd.eurecom.fr/api/ @@ -147,13 +156,34 @@ org.aksw.gerbil.annotators.definition.NERD.name=NERD-ML org.aksw.gerbil.annotators.definition.NERD.experimentType=A2KB org.aksw.gerbil.annotators.definition.NERD.cacheable=true org.aksw.gerbil.annotators.definition.NERD.class=org.aksw.gerbil.annotator.impl.nerd.NERDAnnotator +org.aksw.gerbil.annotators.definition.NERD.check.class=org.aksw.gerbil.web.config.check.AnnotatorChecker +org.aksw.gerbil.annotators.definition.NERD.check.args=org.aksw.gerbil.annotators.nerd.Key org.aksw.gerbil.annotators.definition.NERD.constructorArgs=${org.aksw.gerbil.annotators.NERD.host} +### NERFGUN +org.aksw.gerbil.annotators.NERFGUN.serviceUrl=http://psink.techfak.uni-bielefeld.de/ned +org.aksw.gerbil.annotators.definition.NERFGUN.name=NERFGUN +org.aksw.gerbil.annotators.definition.NERFGUN.experimentType=D2KB +org.aksw.gerbil.annotators.definition.NERFGUN.cacheable=true +org.aksw.gerbil.annotators.definition.NERFGUN.class=org.aksw.gerbil.annotator.impl.nif.NIFBasedAnnotatorWebservice +org.aksw.gerbil.annotators.definition.NERFGUN.constructorArgs=${org.aksw.gerbil.annotators.NERFGUN.serviceUrl} + +### PBOH +org.aksw.gerbil.annotators.PBOH.serviceUrl=http://cake.da.inf.ethz.ch:12355/gerbil-spotWrapNifWS4Test/pboh +org.aksw.gerbil.annotators.definition.PBOH.name=PBOH +org.aksw.gerbil.annotators.definition.PBOH.experimentType=D2KB +org.aksw.gerbil.annotators.definition.PBOH.cacheable=true +org.aksw.gerbil.annotators.definition.PBOH.class=org.aksw.gerbil.annotator.impl.nif.NIFBasedAnnotatorWebservice +org.aksw.gerbil.annotators.definition.PBOH.constructorArgs=${org.aksw.gerbil.annotators.PBOH.serviceUrl} + ### Tagme org.aksw.gerbil.annotators.TagmeAnnotator.annotateUrl=https://tagme.d4science.org/tagme/tag org.aksw.gerbil.annotators.TagmeAnnotator.spotUrl=https://tagme.d4science.org/tagme/spot #Use this property to set the TagMe key (moved this to the gerbil_keys.properties file) #org.aksw.gerbil.annotators.TagMe.key +org.aksw.gerbil.annotators.definition.TagMe.check.class=org.aksw.gerbil.web.config.check.AnnotatorChecker +org.aksw.gerbil.annotators.definition.TagMe.check.args=org.aksw.gerbil.annotators.TagMe.key + 
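+# Note on the check mechanism (a sketch): check.class names a Checker
+# implementation and check.args lists the values handed to its
+# check(Object...) method. For the AnnotatorChecker every value is a property
+# key that has to be defined; repeating check.args for one annotator (see the
+# Kea definitions above) yields a list of keys that are checked together.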
org.aksw.gerbil.annotators.definition.TagMe.name=TagMe 2 org.aksw.gerbil.annotators.definition.TagMe.experimentType=A2KB org.aksw.gerbil.annotators.definition.TagMe.cacheable=true @@ -179,14 +209,22 @@ org.aksw.gerbil.annotators.definition.WAT.constructorArgs=${org.aksw.gerbil.anno ### xLisa org.aksw.gerbil.annotators.XLisa.configFile=${org.aksw.gerbil.DataPath}/configs/xLisa/xlisa.properties -org.aksw.gerbil.annotators.definition.XLisa.name=xLisa +org.aksw.gerbil.annotators.definition.XLisa.name=xLisa-NGRAM org.aksw.gerbil.annotators.definition.XLisa.experimentType=A2KB org.aksw.gerbil.annotators.definition.XLisa.cacheable=true org.aksw.gerbil.annotators.definition.XLisa.class=org.aksw.gerbil.annotator.impl.xlisa.XLisaAnnotator org.aksw.gerbil.annotators.definition.XLisa.lang1=en org.aksw.gerbil.annotators.definition.XLisa.lang2=en -#org.aksw.gerbil.annotators.definition.XLisa.kb=dbpedia -org.aksw.gerbil.annotators.definition.XLisa.kb=wikipedia -#org.aksw.gerbil.annotators.definition.XLisa.model=NER -#org.aksw.gerbil.annotators.definition.XLisa.model=POS +org.aksw.gerbil.annotators.definition.XLisa.kb=dbpedia org.aksw.gerbil.annotators.definition.XLisa.model=NGRAM +org.aksw.gerbil.annotators.definition.XLisa.constructorArgs=${org.aksw.gerbil.annotators.definition.XLisa.lang1}, ${org.aksw.gerbil.annotators.definition.XLisa.lang2}, ${org.aksw.gerbil.annotators.definition.XLisa.kb}, ${org.aksw.gerbil.annotators.definition.XLisa.model} +org.aksw.gerbil.annotators.definition.XLisa2.name=xLisa-NER +org.aksw.gerbil.annotators.definition.XLisa2.experimentType=A2KB +org.aksw.gerbil.annotators.definition.XLisa2.cacheable=true +org.aksw.gerbil.annotators.definition.XLisa2.class=org.aksw.gerbil.annotator.impl.xlisa.XLisaAnnotator +org.aksw.gerbil.annotators.definition.XLisa2.lang1=en +org.aksw.gerbil.annotators.definition.XLisa2.lang2=en +org.aksw.gerbil.annotators.definition.XLisa2.kb=dbpedia +org.aksw.gerbil.annotators.definition.XLisa2.model=NER +org.aksw.gerbil.annotators.definition.XLisa2.constructorArgs=${org.aksw.gerbil.annotators.definition.XLisa2.lang1}, ${org.aksw.gerbil.annotators.definition.XLisa2.lang2}, ${org.aksw.gerbil.annotators.definition.XLisa2.kb}, ${org.aksw.gerbil.annotators.definition.XLisa2.model} + diff --git a/src/main/properties/datasets.properties b/src/main/properties/datasets.properties index 327b3c920..419636bc2 100644 --- a/src/main/properties/datasets.properties +++ b/src/main/properties/datasets.properties @@ -18,21 +18,29 @@ org.aksw.gerbil.datasets.definition.AIDAComplete.class=org.aksw.gerbil.dataset.i org.aksw.gerbil.datasets.definition.AIDAComplete.cacheable=true org.aksw.gerbil.datasets.definition.AIDAComplete.experimentType=A2KB org.aksw.gerbil.datasets.definition.AIDAComplete.constructorArgs=${org.aksw.gerbil.datasets.AIDACoNLLDatasetConfig.DatasetFile} +org.aksw.gerbil.datasets.definition.AIDAComplete.check.class=org.aksw.gerbil.web.config.check.FileChecker +org.aksw.gerbil.datasets.definition.AIDAComplete.check.args=${org.aksw.gerbil.datasets.AIDACoNLLDatasetConfig.DatasetFile} org.aksw.gerbil.datasets.definition.AIDATrain.name=AIDA/CoNLL-Training org.aksw.gerbil.datasets.definition.AIDATrain.class=org.aksw.gerbil.dataset.impl.aida.AIDACoNLLDataset org.aksw.gerbil.datasets.definition.AIDATrain.cacheable=true org.aksw.gerbil.datasets.definition.AIDATrain.experimentType=A2KB org.aksw.gerbil.datasets.definition.AIDATrain.constructorArgs=${org.aksw.gerbil.datasets.AIDACoNLLDatasetConfig.DatasetFile},1,946 
+org.aksw.gerbil.datasets.definition.AIDATrain.check.class=org.aksw.gerbil.web.config.check.FileChecker +org.aksw.gerbil.datasets.definition.AIDATrain.check.args=${org.aksw.gerbil.datasets.AIDACoNLLDatasetConfig.DatasetFile} org.aksw.gerbil.datasets.definition.AIDATestA.name=AIDA/CoNLL-Test A org.aksw.gerbil.datasets.definition.AIDATestA.class=org.aksw.gerbil.dataset.impl.aida.AIDACoNLLDataset org.aksw.gerbil.datasets.definition.AIDATestA.cacheable=true org.aksw.gerbil.datasets.definition.AIDATestA.experimentType=A2KB org.aksw.gerbil.datasets.definition.AIDATestA.constructorArgs=${org.aksw.gerbil.datasets.AIDACoNLLDatasetConfig.DatasetFile},947,1162 +org.aksw.gerbil.datasets.definition.AIDATestA.check.class=org.aksw.gerbil.web.config.check.FileChecker +org.aksw.gerbil.datasets.definition.AIDATestA.check.args=${org.aksw.gerbil.datasets.AIDACoNLLDatasetConfig.DatasetFile} org.aksw.gerbil.datasets.definition.AIDATestB.name=AIDA/CoNLL-Test B org.aksw.gerbil.datasets.definition.AIDATestB.class=org.aksw.gerbil.dataset.impl.aida.AIDACoNLLDataset org.aksw.gerbil.datasets.definition.AIDATestB.cacheable=true org.aksw.gerbil.datasets.definition.AIDATestB.experimentType=A2KB org.aksw.gerbil.datasets.definition.AIDATestB.constructorArgs=${org.aksw.gerbil.datasets.AIDACoNLLDatasetConfig.DatasetFile},1163,1393 +org.aksw.gerbil.datasets.definition.AIDATestB.check.class=org.aksw.gerbil.web.config.check.FileChecker +org.aksw.gerbil.datasets.definition.AIDATestB.check.args=${org.aksw.gerbil.datasets.AIDACoNLLDatasetConfig.DatasetFile} ### AQUAINT org.aksw.gerbil.datasets.AQUAINTDatasetConfiguration.textsFolder=${org.aksw.gerbil.DataPath}/datasets/AQUAINT/RawTexts @@ -42,6 +50,8 @@ org.aksw.gerbil.datasets.definition.AQUAINT.class=org.aksw.gerbil.dataset.impl.m org.aksw.gerbil.datasets.definition.AQUAINT.cacheable=true org.aksw.gerbil.datasets.definition.AQUAINT.experimentType=A2KB org.aksw.gerbil.datasets.definition.AQUAINT.constructorArgs=${org.aksw.gerbil.datasets.AQUAINTDatasetConfiguration.textsFolder},${org.aksw.gerbil.datasets.AQUAINTDatasetConfiguration.annotationsFolder} +org.aksw.gerbil.datasets.definition.AQUAINT.check.class=org.aksw.gerbil.web.config.check.DirectoryChecker +org.aksw.gerbil.datasets.definition.AQUAINT.check.args=${org.aksw.gerbil.datasets.AQUAINTDatasetConfiguration.textsFolder},${org.aksw.gerbil.datasets.AQUAINTDatasetConfiguration.annotationsFolder} ### Derczynski org.aksw.gerbil.datasets.DerczynskiDatasetConfiguration.tweets=${org.aksw.gerbil.DataPath}/datasets/derczynski/ipm_nel_corpus/ipm_nel.conll @@ -51,7 +61,6 @@ org.aksw.gerbil.datasets.definition.Derczynski.cacheable=true org.aksw.gerbil.datasets.definition.Derczynski.experimentType=A2KB org.aksw.gerbil.datasets.definition.Derczynski.constructorArgs=${org.aksw.gerbil.datasets.DerczynskiDatasetConfiguration.tweets} - ### DBpediaSpotlight (the dataset) org.aksw.gerbil.datasets.KnownNIFFileDatasetConfig.DBPEDIA_SPOTLIGHT=${org.aksw.gerbil.DataPath}/datasets/spotlight/dbpedia-spotlight-nif.ttl org.aksw.gerbil.datasets.definition.DBPEDIA_SPOTLIGHT.name=DBpediaSpotlight @@ -60,6 +69,41 @@ org.aksw.gerbil.datasets.definition.DBPEDIA_SPOTLIGHT.cacheable=true org.aksw.gerbil.datasets.definition.DBPEDIA_SPOTLIGHT.experimentType=A2KB org.aksw.gerbil.datasets.definition.DBPEDIA_SPOTLIGHT.constructorArgs=${org.aksw.gerbil.datasets.KnownNIFFileDatasetConfig.DBPEDIA_SPOTLIGHT},${org.aksw.gerbil.datasets.definition.DBPEDIA_SPOTLIGHT.name} +### GERDAQ 
+org.aksw.gerbil.datasets.gerdaq.devFile=${org.aksw.gerbil.DataPath}/datasets/gerdaq/gerdaq_devel.xml +org.aksw.gerbil.datasets.gerdaq.trainingAFile=${org.aksw.gerbil.DataPath}/datasets/gerdaq/gerdaq_trainingA.xml +org.aksw.gerbil.datasets.gerdaq.trainingBFile=${org.aksw.gerbil.DataPath}/datasets/gerdaq/gerdaq_trainingB.xml +org.aksw.gerbil.datasets.gerdaq.testFile=${org.aksw.gerbil.DataPath}/datasets/gerdaq/gerdaq_test.xml +org.aksw.gerbil.datasets.definition.gerdaq_dev.name=GERDAQ-Dev +org.aksw.gerbil.datasets.definition.gerdaq_dev.class=org.aksw.gerbil.dataset.impl.gerdaq.GERDAQDataset +org.aksw.gerbil.datasets.definition.gerdaq_dev.cacheable=true +org.aksw.gerbil.datasets.definition.gerdaq_dev.experimentType=A2KB +org.aksw.gerbil.datasets.definition.gerdaq_dev.constructorArgs=${org.aksw.gerbil.datasets.gerdaq.devFile} +org.aksw.gerbil.datasets.definition.gerdaq_trainingA.name=GERDAQ-TrainingA +org.aksw.gerbil.datasets.definition.gerdaq_trainingA.class=org.aksw.gerbil.dataset.impl.gerdaq.GERDAQDataset +org.aksw.gerbil.datasets.definition.gerdaq_trainingA.cacheable=true +org.aksw.gerbil.datasets.definition.gerdaq_trainingA.experimentType=A2KB +org.aksw.gerbil.datasets.definition.gerdaq_trainingA.constructorArgs=${org.aksw.gerbil.datasets.gerdaq.trainingAFile} +org.aksw.gerbil.datasets.definition.gerdaq_trainingB.name=GERDAQ-TrainingB +org.aksw.gerbil.datasets.definition.gerdaq_trainingB.class=org.aksw.gerbil.dataset.impl.gerdaq.GERDAQDataset +org.aksw.gerbil.datasets.definition.gerdaq_trainingB.cacheable=true +org.aksw.gerbil.datasets.definition.gerdaq_trainingB.experimentType=A2KB +org.aksw.gerbil.datasets.definition.gerdaq_trainingB.constructorArgs=${org.aksw.gerbil.datasets.gerdaq.trainingBFile} +org.aksw.gerbil.datasets.definition.gerdaq_test.name=GERDAQ-Test +org.aksw.gerbil.datasets.definition.gerdaq_test.class=org.aksw.gerbil.dataset.impl.gerdaq.GERDAQDataset +org.aksw.gerbil.datasets.definition.gerdaq_test.cacheable=true +org.aksw.gerbil.datasets.definition.gerdaq_test.experimentType=A2KB +org.aksw.gerbil.datasets.definition.gerdaq_test.constructorArgs=${org.aksw.gerbil.datasets.gerdaq.testFile} + +### ERD2014 +org.aksw.gerbil.datasets.ERD2014.texts=${org.aksw.gerbil.DataPath}/datasets/erd2014/Trec_beta.query.txt +org.aksw.gerbil.datasets.ERD2014.annotations=${org.aksw.gerbil.DataPath}/datasets/erd2014/Trec_beta.annotation.txt +org.aksw.gerbil.datasets.definition.ERD2014.name=ERD2014 +org.aksw.gerbil.datasets.definition.ERD2014.class=org.aksw.gerbil.dataset.impl.erd.ERDDataset2 +org.aksw.gerbil.datasets.definition.ERD2014.cacheable=true +org.aksw.gerbil.datasets.definition.ERD2014.experimentType=A2KB +org.aksw.gerbil.datasets.definition.ERD2014.constructorArgs=${org.aksw.gerbil.datasets.ERD2014.texts},${org.aksw.gerbil.datasets.ERD2014.annotations} + ### IITB org.aksw.gerbil.datasets.IITBDatasetConfig.crawledDocs=${org.aksw.gerbil.DataPath}/datasets/iitb/crawledDocs org.aksw.gerbil.datasets.IITBDatasetConfig.annotations=${org.aksw.gerbil.DataPath}/datasets/iitb/CSAW_Annotations.xml @@ -78,9 +122,9 @@ org.aksw.gerbil.datasets.definition.KORE50.experimentType=A2KB org.aksw.gerbil.datasets.definition.KORE50.constructorArgs=${org.aksw.gerbil.datasets.KORE50.file},${org.aksw.gerbil.datasets.definition.KORE50.name} ### Meij -org.aksw.gerbil.datasets.MeijDatasetConfig.tweetsFile=${org.aksw.gerbil.DataPath}/datasets/meij/original_tweets.list -org.aksw.gerbil.datasets.MeijDatasetConfig.tagsFile=${org.aksw.gerbil.DataPath}/datasets/meij/wsdm2012_annotations.txt 
-org.aksw.gerbil.datasets.MeijDatasetConfig.rankFile=${org.aksw.gerbil.DataPath}/datasets/meij/wsdm2012_qrels.txt
+#org.aksw.gerbil.datasets.MeijDatasetConfig.tweetsFile=${org.aksw.gerbil.DataPath}/datasets/meij/original_tweets.list
+#org.aksw.gerbil.datasets.MeijDatasetConfig.tagsFile=${org.aksw.gerbil.DataPath}/datasets/meij/wsdm2012_annotations.txt
+#org.aksw.gerbil.datasets.MeijDatasetConfig.rankFile=${org.aksw.gerbil.DataPath}/datasets/meij/wsdm2012_qrels.txt

### MSNBC
org.aksw.gerbil.datasets.MSNBCDatasetConfig.textsFolder=${org.aksw.gerbil.DataPath}/datasets/MSNBC/RawTextsSimpleChars_utf8
@@ -91,6 +135,65 @@ org.aksw.gerbil.datasets.definition.MSNBC.cacheable=true
org.aksw.gerbil.datasets.definition.MSNBC.experimentType=A2KB
org.aksw.gerbil.datasets.definition.MSNBC.constructorArgs=${org.aksw.gerbil.datasets.MSNBCDatasetConfig.textsFolder},${org.aksw.gerbil.datasets.MSNBCDatasetConfig.annotationsFolder}
+
+### Microposts2016
+org.aksw.gerbil.datasets.Microposts2016DatasetConfig.dev.tweets=${org.aksw.gerbil.DataPath}/datasets/microposts2016/Dev Set/NEEL2016-dev.tsv
+org.aksw.gerbil.datasets.Microposts2016DatasetConfig.train.tweets=${org.aksw.gerbil.DataPath}/datasets/microposts2016/Training Set/NEEL2016-training.tsv
+org.aksw.gerbil.datasets.Microposts2016DatasetConfig.test.tweets=${org.aksw.gerbil.DataPath}/datasets/microposts2016/Test Set/NEEL2016-test.tsv
+org.aksw.gerbil.datasets.Microposts2016DatasetConfig.dev.annotations=${org.aksw.gerbil.DataPath}/datasets/microposts2016/Dev Set/NEEL2016-dev_neel.gs
+org.aksw.gerbil.datasets.Microposts2016DatasetConfig.train.annotations=${org.aksw.gerbil.DataPath}/datasets/microposts2016/Training Set/NEEL2016-training_neel.gs
+org.aksw.gerbil.datasets.Microposts2016DatasetConfig.test.annotations=${org.aksw.gerbil.DataPath}/datasets/microposts2016/Test Set/NEEL2016-test_neel.gs
+org.aksw.gerbil.datasets.definition.Micro2016.name=Microposts2016-Train
+org.aksw.gerbil.datasets.definition.Micro2016.class=org.aksw.gerbil.dataset.impl.micro.Microposts2016Dataset
+org.aksw.gerbil.datasets.definition.Micro2016.cacheable=true
+org.aksw.gerbil.datasets.definition.Micro2016.experimentType=A2KB
+org.aksw.gerbil.datasets.definition.Micro2016.constructorArgs=${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.train.annotations}, ${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.train.tweets}
+org.aksw.gerbil.datasets.definition.Micro2016.check.class=org.aksw.gerbil.web.config.check.FileChecker
+org.aksw.gerbil.datasets.definition.Micro2016.check.args=${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.train.annotations}, ${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.train.tweets}
+org.aksw.gerbil.datasets.definition.Micro2016_2.name=Microposts2016-Test
+org.aksw.gerbil.datasets.definition.Micro2016_2.class=org.aksw.gerbil.dataset.impl.micro.Microposts2016Dataset
+org.aksw.gerbil.datasets.definition.Micro2016_2.cacheable=true
+org.aksw.gerbil.datasets.definition.Micro2016_2.experimentType=A2KB
+org.aksw.gerbil.datasets.definition.Micro2016_2.constructorArgs=${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.test.annotations}, ${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.test.tweets}
+org.aksw.gerbil.datasets.definition.Micro2016_2.check.class=org.aksw.gerbil.web.config.check.FileChecker
+org.aksw.gerbil.datasets.definition.Micro2016_2.check.args=${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.test.annotations}, ${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.test.tweets}
+org.aksw.gerbil.datasets.definition.Micro2016_3.name=Microposts2016-Dev
+org.aksw.gerbil.datasets.definition.Micro2016_3.class=org.aksw.gerbil.dataset.impl.micro.Microposts2016Dataset
+org.aksw.gerbil.datasets.definition.Micro2016_3.cacheable=true
+org.aksw.gerbil.datasets.definition.Micro2016_3.experimentType=A2KB
+org.aksw.gerbil.datasets.definition.Micro2016_3.constructorArgs=${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.dev.annotations}, ${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.dev.tweets}
+org.aksw.gerbil.datasets.definition.Micro2016_3.check.class=org.aksw.gerbil.web.config.check.FileChecker
+org.aksw.gerbil.datasets.definition.Micro2016_3.check.args=${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.dev.annotations}, ${org.aksw.gerbil.datasets.Microposts2016DatasetConfig.dev.tweets}
+
+### Microposts2015
+org.aksw.gerbil.datasets.Microposts2015DatasetConfig.dev.tweets=${org.aksw.gerbil.DataPath}/datasets/microposts2015/dev/NEEL2015-dev-tweets.tsv
+org.aksw.gerbil.datasets.Microposts2015DatasetConfig.train.tweets=${org.aksw.gerbil.DataPath}/datasets/microposts2015/training/NEEL2015-training-tweets_v2.tsv
+org.aksw.gerbil.datasets.Microposts2015DatasetConfig.test.tweets=${org.aksw.gerbil.DataPath}/datasets/microposts2015/test/NEEL2015-test-tweets.tsv
+org.aksw.gerbil.datasets.Microposts2015DatasetConfig.dev.annotations=${org.aksw.gerbil.DataPath}/datasets/microposts2015/dev/NEEL2015-dev-gold_v3.tsv
+org.aksw.gerbil.datasets.Microposts2015DatasetConfig.train.annotations=${org.aksw.gerbil.DataPath}/datasets/microposts2015/training/NEEL2015-training-gold_v4.tsv
+org.aksw.gerbil.datasets.Microposts2015DatasetConfig.test.annotations=${org.aksw.gerbil.DataPath}/datasets/microposts2015/test/NEEL2015-test-gold_v2.tsv
+org.aksw.gerbil.datasets.definition.Micro2015.name=Microposts2015-Train
+org.aksw.gerbil.datasets.definition.Micro2015.class=org.aksw.gerbil.dataset.impl.micro.Microposts2015Dataset
+org.aksw.gerbil.datasets.definition.Micro2015.cacheable=true
+org.aksw.gerbil.datasets.definition.Micro2015.experimentType=A2KB
+org.aksw.gerbil.datasets.definition.Micro2015.constructorArgs=${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.train.annotations}, ${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.train.tweets}
+org.aksw.gerbil.datasets.definition.Micro2015.check.class=org.aksw.gerbil.web.config.check.FileChecker
+org.aksw.gerbil.datasets.definition.Micro2015.check.args=${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.train.annotations}, ${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.train.tweets}
+org.aksw.gerbil.datasets.definition.Micro2015_2.name=Microposts2015-Test
+org.aksw.gerbil.datasets.definition.Micro2015_2.class=org.aksw.gerbil.dataset.impl.micro.Microposts2015Dataset
+org.aksw.gerbil.datasets.definition.Micro2015_2.cacheable=true
+org.aksw.gerbil.datasets.definition.Micro2015_2.experimentType=A2KB
+org.aksw.gerbil.datasets.definition.Micro2015_2.constructorArgs=${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.test.annotations}, ${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.test.tweets}
+org.aksw.gerbil.datasets.definition.Micro2015_2.check.class=org.aksw.gerbil.web.config.check.FileChecker
+org.aksw.gerbil.datasets.definition.Micro2015_2.check.args=${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.test.annotations}, ${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.test.tweets}
+org.aksw.gerbil.datasets.definition.Micro2015_3.name=Microposts2015-Dev
+org.aksw.gerbil.datasets.definition.Micro2015_3.class=org.aksw.gerbil.dataset.impl.micro.Microposts2015Dataset +org.aksw.gerbil.datasets.definition.Micro2015_3.cacheable=true +org.aksw.gerbil.datasets.definition.Micro2015_3.experimentType=A2KB +org.aksw.gerbil.datasets.definition.Micro2015_3.constructorArgs=${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.dev.annotations}, ${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.dev.tweets} +org.aksw.gerbil.datasets.definition.Micro2015_3.check.class=org.aksw.gerbil.web.config.check.FileChecker +org.aksw.gerbil.datasets.definition.Micro2015_3.check.args=${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.dev.annotations}, ${org.aksw.gerbil.datasets.Microposts2015DatasetConfig.dev.tweets} + ### Microposts2014 org.aksw.gerbil.datasets.Microposts2014DatasetConfig.train=${org.aksw.gerbil.DataPath}/datasets/microposts2014/Microposts2014-NEEL_challenge_TweetsTrainingSet.csv org.aksw.gerbil.datasets.Microposts2014DatasetConfig.test=${org.aksw.gerbil.DataPath}/datasets/microposts2014/Microposts2014-NEEL_challenge_TweetsTestSet.csv @@ -99,11 +202,15 @@ org.aksw.gerbil.datasets.definition.Micro2014.class=org.aksw.gerbil.dataset.impl org.aksw.gerbil.datasets.definition.Micro2014.cacheable=true org.aksw.gerbil.datasets.definition.Micro2014.experimentType=A2KB org.aksw.gerbil.datasets.definition.Micro2014.constructorArgs=${org.aksw.gerbil.datasets.Microposts2014DatasetConfig.train} +org.aksw.gerbil.datasets.definition.Micro2014.check.class=org.aksw.gerbil.web.config.check.FileChecker +org.aksw.gerbil.datasets.definition.Micro2014.check.args=${org.aksw.gerbil.datasets.Microposts2014DatasetConfig.train} org.aksw.gerbil.datasets.definition.Micro2014_2.name=Microposts2014-Test org.aksw.gerbil.datasets.definition.Micro2014_2.class=org.aksw.gerbil.dataset.impl.micro.Microposts2014Dataset org.aksw.gerbil.datasets.definition.Micro2014_2.cacheable=true org.aksw.gerbil.datasets.definition.Micro2014_2.experimentType=A2KB org.aksw.gerbil.datasets.definition.Micro2014_2.constructorArgs=${org.aksw.gerbil.datasets.Microposts2014DatasetConfig.test} +org.aksw.gerbil.datasets.definition.Micro2014_2.check.class=org.aksw.gerbil.web.config.check.FileChecker +org.aksw.gerbil.datasets.definition.Micro2014_2.check.args=${org.aksw.gerbil.datasets.Microposts2014DatasetConfig.test} ### Micrposts2013 org.aksw.gerbil.datasets.Microposts2013DatasetConfig.train=${org.aksw.gerbil.DataPath}/datasets/microposts2013/TweetsTrainingSetCH.tsv @@ -111,16 +218,19 @@ org.aksw.gerbil.datasets.Microposts2013DatasetConfig.test=${org.aksw.gerbil.Data org.aksw.gerbil.datasets.definition.Micro2013.name=Microposts2013-Train org.aksw.gerbil.datasets.definition.Micro2013.class=org.aksw.gerbil.dataset.impl.micro.Microposts2013Dataset org.aksw.gerbil.datasets.definition.Micro2013.cacheable=true -org.aksw.gerbil.datasets.definition.Micro2013.experimentType=A2KB +org.aksw.gerbil.datasets.definition.Micro2013.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Micro2013.constructorArgs=${org.aksw.gerbil.datasets.Microposts2013DatasetConfig.train} +org.aksw.gerbil.datasets.definition.Micro2013.check.class=org.aksw.gerbil.web.config.check.FileChecker +org.aksw.gerbil.datasets.definition.Micro2013.check.args=${org.aksw.gerbil.datasets.Microposts2013DatasetConfig.train} org.aksw.gerbil.datasets.definition.Micro2013_2.name=Microposts2013-Test org.aksw.gerbil.datasets.definition.Micro2013_2.class=org.aksw.gerbil.dataset.impl.micro.Microposts2013Dataset 
org.aksw.gerbil.datasets.definition.Micro2013_2.cacheable=true -org.aksw.gerbil.datasets.definition.Micro2013_2.experimentType=A2KB +org.aksw.gerbil.datasets.definition.Micro2013_2.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Micro2013_2.constructorArgs=${org.aksw.gerbil.datasets.Microposts2013DatasetConfig.test} +org.aksw.gerbil.datasets.definition.Micro2013_2.check.class=org.aksw.gerbil.web.config.check.FileChecker +org.aksw.gerbil.datasets.definition.Micro2013_2.check.args=${org.aksw.gerbil.datasets.Microposts2013DatasetConfig.test} - -### N³ collection +### N\u00b3 collection org.aksw.gerbil.datasets.N3_NEWS_100.file=${org.aksw.gerbil.DataPath}/datasets/N3/News-100.ttl org.aksw.gerbil.datasets.N3_REUTERS_128.file=${org.aksw.gerbil.DataPath}/datasets/N3/Reuters-128.ttl org.aksw.gerbil.datasets.N3_RSS_500.file=${org.aksw.gerbil.DataPath}/datasets/N3/RSS-500.ttl @@ -185,3 +295,87 @@ org.aksw.gerbil.datasets.definition.OKE_2015_TASK2_EVALUATION.class=org.aksw.ger org.aksw.gerbil.datasets.definition.OKE_2015_TASK2_EVALUATION.cacheable=true org.aksw.gerbil.datasets.definition.OKE_2015_TASK2_EVALUATION.experimentType=OKE_Task2 org.aksw.gerbil.datasets.definition.OKE_2015_TASK2_EVALUATION.constructorArgs=${org.aksw.gerbil.datasets.OKE_2015_TASK2_EVALUATION.file},${org.aksw.gerbil.datasets.definition.OKE_2015_TASK2_EVALUATION.name} + +### OKE 2016 Tasks +org.aksw.gerbil.datasets.OKE_2016_TASK1_EXAMPLE.file=${org.aksw.gerbil.DataPath}/datasets/oke-challenge2016/example_data/task1.ttl +org.aksw.gerbil.datasets.OKE_2016_TASK1_GS_SAMPLE.file=${org.aksw.gerbil.DataPath}/datasets/oke-challenge2016/GoldStandard_sampleData/task1/dataset_task_1.ttl +org.aksw.gerbil.datasets.OKE_2016_TASK1_EVALUATION.file=${org.aksw.gerbil.DataPath}/datasets/oke-challenge2016/evaluation-data/task1/evaluation-dataset-task1.ttl +org.aksw.gerbil.datasets.OKE_2016_TASK2_EXAMPLE.file=${org.aksw.gerbil.DataPath}/datasets/oke-challenge2016/example_data/task2.ttl +org.aksw.gerbil.datasets.OKE_2016_TASK2_GS_SAMPLE.file=${org.aksw.gerbil.DataPath}/datasets/oke-challenge2016/GoldStandard_sampleData/task2/dataset_task_2.ttl +org.aksw.gerbil.datasets.OKE_2016_TASK2_EVALUATION.file=${org.aksw.gerbil.DataPath}/datasets/oke-challenge2016/evaluation-data/task2/evaluation-dataset-task2.ttl +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EXAMPLE.name=OKE 2016 Task 1 example set +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EXAMPLE.class=org.aksw.gerbil.dataset.impl.nif.FileBasedNIFDataset +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EXAMPLE.cacheable=true +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EXAMPLE.experimentType=OKE_Task1 +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EXAMPLE.constructorArgs=${org.aksw.gerbil.datasets.OKE_2016_TASK1_EXAMPLE.file},${org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EXAMPLE.name} +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_GS_SAMPLE.name=OKE 2016 Task 1 gold standard sample +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_GS_SAMPLE.class=org.aksw.gerbil.dataset.impl.nif.FileBasedNIFDataset +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_GS_SAMPLE.cacheable=true +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_GS_SAMPLE.experimentType=OKE_Task1 +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_GS_SAMPLE.constructorArgs=${org.aksw.gerbil.datasets.OKE_2016_TASK1_GS_SAMPLE.file},${org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_GS_SAMPLE.name} +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EVALUATION.name=OKE 2016 Task 1 evaluation 
dataset +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EVALUATION.class=org.aksw.gerbil.dataset.impl.nif.FileBasedNIFDataset +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EVALUATION.cacheable=true +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EVALUATION.experimentType=OKE_Task1 +org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EVALUATION.constructorArgs=${org.aksw.gerbil.datasets.OKE_2016_TASK1_EVALUATION.file},${org.aksw.gerbil.datasets.definition.OKE_2016_TASK1_EVALUATION.name} +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EXAMPLE.name=OKE 2016 Task 2 example set +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EXAMPLE.class=org.aksw.gerbil.dataset.impl.nif.FileBasedNIFDataset +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EXAMPLE.cacheable=true +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EXAMPLE.experimentType=OKE_Task2 +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EXAMPLE.constructorArgs=${org.aksw.gerbil.datasets.OKE_2016_TASK2_EXAMPLE.file},${org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EXAMPLE.name} +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_GS_SAMPLE.name=OKE 2016 Task 2 gold standard sample +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_GS_SAMPLE.class=org.aksw.gerbil.dataset.impl.nif.FileBasedNIFDataset +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_GS_SAMPLE.cacheable=true +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_GS_SAMPLE.experimentType=OKE_Task2 +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_GS_SAMPLE.constructorArgs=${org.aksw.gerbil.datasets.OKE_2016_TASK2_GS_SAMPLE.file},${org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_GS_SAMPLE.name} +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EVALUATION.name=OKE 2016 Task 2 evaluation dataset +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EVALUATION.class=org.aksw.gerbil.dataset.impl.nif.FileBasedNIFDataset +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EVALUATION.cacheable=true +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EVALUATION.experimentType=OKE_Task2 +org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EVALUATION.constructorArgs=${org.aksw.gerbil.datasets.OKE_2016_TASK2_EVALUATION.file},${org.aksw.gerbil.datasets.definition.OKE_2016_TASK2_EVALUATION.name} + +### Ritter +org.aksw.gerbil.datasets.RitterDatasetConfiguration.test=${org.aksw.gerbil.DataPath}/datasets/Ritter/ner.txt +org.aksw.gerbil.datasets.definition.Ritter.name=Ritter +org.aksw.gerbil.datasets.definition.Ritter.class=org.aksw.gerbil.dataset.impl.ritter.RitterDataset +org.aksw.gerbil.datasets.definition.Ritter.cacheable=true +org.aksw.gerbil.datasets.definition.Ritter.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Ritter.constructorArgs=${org.aksw.gerbil.datasets.RitterDatasetConfiguration.test} + + +### Senseval 2 & 3 +org.aksw.gerbil.datasets.SensevalDatasetConfiguration.2=${org.aksw.gerbil.DataPath}/datasets/senseval/english_all_words_senseval2.xml +org.aksw.gerbil.datasets.SensevalDatasetConfiguration.3=${org.aksw.gerbil.DataPath}/datasets/senseval/english_all_words_senseval3.xml +org.aksw.gerbil.datasets.definition.Senseval2.name=Senseval 2 +org.aksw.gerbil.datasets.definition.Senseval2.class=org.aksw.gerbil.dataset.impl.senseval.SensevalDataset +org.aksw.gerbil.datasets.definition.Senseval2.cacheable=true +org.aksw.gerbil.datasets.definition.Senseval2.experimentType=ERec +org.aksw.gerbil.datasets.definition.Senseval2.constructorArgs=${org.aksw.gerbil.datasets.SensevalDatasetConfiguration.2} 
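+# Note (a sketch of the convention used above): constructorArgs is a
+# comma-separated list of arguments passed to the dataset class constructor;
+# Senseval 3 below hands in an additional boolean flag, while Senseval 2 uses
+# the single-argument constructor.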
+org.aksw.gerbil.datasets.definition.Senseval3.name=Senseval 3 +org.aksw.gerbil.datasets.definition.Senseval3.class=org.aksw.gerbil.dataset.impl.senseval.SensevalDataset +org.aksw.gerbil.datasets.definition.Senseval3.cacheable=true +org.aksw.gerbil.datasets.definition.Senseval3.experimentType=ERec +org.aksw.gerbil.datasets.definition.Senseval3.constructorArgs=${org.aksw.gerbil.datasets.SensevalDatasetConfiguration.3}, true + +### UMBC +org.aksw.gerbil.datasets.UMBCDatasetConfiguration.train=${org.aksw.gerbil.DataPath}/datasets/umbc/finin.train +org.aksw.gerbil.datasets.UMBCDatasetConfiguration.test=${org.aksw.gerbil.DataPath}/datasets/umbc/finin.test +org.aksw.gerbil.datasets.definition.UMBC.name=UMBC-Test +org.aksw.gerbil.datasets.definition.UMBC.class=org.aksw.gerbil.dataset.impl.umbc.UMBCDataset +org.aksw.gerbil.datasets.definition.UMBC.cacheable=true +org.aksw.gerbil.datasets.definition.UMBC.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.UMBC.constructorArgs=${org.aksw.gerbil.datasets.UMBCDatasetConfiguration.test} +org.aksw.gerbil.datasets.definition.UMBC_2.name=UMBC-Train +org.aksw.gerbil.datasets.definition.UMBC_2.class=org.aksw.gerbil.dataset.impl.umbc.UMBCDataset +org.aksw.gerbil.datasets.definition.UMBC_2.cacheable=true +org.aksw.gerbil.datasets.definition.UMBC_2.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.UMBC_2.constructorArgs=${org.aksw.gerbil.datasets.UMBCDatasetConfiguration.train} + +### WSDM2012 +org.aksw.gerbil.datasets.WSDMDatasetConfiguration.annotations=${org.aksw.gerbil.DataPath}/datasets/WSDM/wsdm2012_annotations.txt +org.aksw.gerbil.datasets.WSDMDatasetConfiguration.tweets=${org.aksw.gerbil.DataPath}/datasets/WSDM/final.json +org.aksw.gerbil.datasets.definition.WSDM.name=WSDM 2012 +org.aksw.gerbil.datasets.definition.WSDM.class=org.aksw.gerbil.dataset.impl.wsdm.WSDMDataset +org.aksw.gerbil.datasets.definition.WSDM.cacheable=true +org.aksw.gerbil.datasets.definition.WSDM.experimentType=C2KB +org.aksw.gerbil.datasets.definition.WSDM.constructorArgs=${org.aksw.gerbil.datasets.WSDMDatasetConfiguration.annotations}, ${org.aksw.gerbil.datasets.WSDMDatasetConfiguration.tweets} diff --git a/src/main/properties/entity_checking.properties b/src/main/properties/entity_checking.properties index 7f6c88093..aa6903363 100644 --- a/src/main/properties/entity_checking.properties +++ b/src/main/properties/entity_checking.properties @@ -9,14 +9,15 @@ org.aksw.gerbil.dataset.check.InMemoryCachingEntityCheckerManager.cacheSize=1000 org.aksw.gerbil.dataset.check.InMemoryCachingEntityCheckerManager.cacheDuration"=2592000000 ### DBpedia -org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://dbpedia.org +org.aksw.gerbil.dataset.check.IndexBasedEntityChecker.dbpedia=indexes/dbpedia_check,http://dbpedia.org,http://de.dbpedia.org,http://fr.dbpedia.org +#org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://dbpedia.org #org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://bg.dbpedia.org #org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://ca.dbpedia.org #org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://cs.dbpedia.org -org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://de.dbpedia.org +#org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://de.dbpedia.org #org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://es.dbpedia.org #org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://eu.dbpedia.org 
-org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://fr.dbpedia.org +#org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://fr.dbpedia.org #org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://hu.dbpedia.org #org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://id.dbpedia.org #org.aksw.gerbil.dataset.check.HttpBasedEntityChecker.namespace=http://it.dbpedia.org diff --git a/src/main/properties/gerbil.properties b/src/main/properties/gerbil.properties index b852ba5a5..51ad6b598 100644 --- a/src/main/properties/gerbil.properties +++ b/src/main/properties/gerbil.properties @@ -43,7 +43,7 @@ org.aksw.gerbil.web.config.overseerWorkers=20 ### Available Experiment Types # the experiments that are available in the FrontEnd -org.aksw.gerbil.web.MainController.availableExperimentTypes=A2KB,C2KB,D2KB,ERec,ETyping,OKE_Task1,OKE_Task2 +org.aksw.gerbil.web.MainController.availableExperimentTypes=A2KB,C2KB,D2KB,ERec,ETyping,OKE_Task1,OKE_Task2,RT2KB ### Dataset upload tmp folder org.aksw.gerbil.UploadPath=${org.aksw.gerbil.DataPath}/upload/ @@ -52,11 +52,12 @@ org.aksw.gerbil.UploadPath=${org.aksw.gerbil.DataPath}/upload/ org.aksw.gerbil.semantic.subclass.SubClassInferencer.ruleResource=subClassInferencerRules.txt # The list of files defining the class hierarchies needed to evaluate typing tasks. # The contains value triples: RDF-file,RDF-lang,base-URI -org.aksw.gerbil.semantic.subclass.SubClassInferencer.classHierarchyFiles=${org.aksw.gerbil.DataPath}/resources/hierarchies/d0.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/d0.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/DUL.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/DUL.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/CollectionsLite.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/CollectionsLite.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/Conceptualization.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/Conceptualization.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/ontopic.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/ontopic.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/Roles.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/Roles.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/Supplements.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/Supplements.owl +org.aksw.gerbil.semantic.subclass.SubClassInferencer.classHierarchyFiles=${org.aksw.gerbil.DataPath}/resources/hierarchies/d0.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/d0.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/DUL.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/DUL.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/CollectionsLite.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/CollectionsLite.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/Conceptualization.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/Conceptualization.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/ontopic.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/ontopic.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/Roles.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/Roles.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/Supplements.owl.xml,RDFXML,http://www.ontologydesignpatterns.org/ont/dul/Supplements.owl,${org.aksw.gerbil.DataPath}/resources/hierarchies/DBpedia_DUL.ttl,TTL,http://dbpedia.org/ontology/ ### Well 
known Knowledge Bases org.aksw.gerbil.evaluate.DefaultWellKnownKB=http://dbpedia.org/resource/ org.aksw.gerbil.evaluate.DefaultWellKnownKB=http://dbpedia.org/ontology/ +org.aksw.gerbil.evaluate.DefaultWellKnownKB=http://www.ontologydesignpatterns.org/ont/dul/ org.aksw.gerbil.evaluate.DefaultWellKnownKB=http://ontologydesignpatterns.org/ont/dul/ org.aksw.gerbil.evaluate.DefaultWellKnownKB=http://www.ontologydesignpatterns.org/ont/d0.owl @@ -73,7 +74,10 @@ org.aksw.gerbil.execute.AnnotatorOutputWriter.outputDirectory=${org.aksw.gerbil. ### sameAs link retrieval # domains for which HTTP retrieval should be used -org.aksw.gerbil.semantic.sameas.impl.http.HTTPBasedSameAsRetriever.domain=dbpedia.org +#org.aksw.gerbil.semantic.sameas.impl.http.HTTPBasedSameAsRetriever.domain=dbpedia.org +org.aksw.gerbil.semantic.sameas.impl.index.IndexBasedSameAsRetriever.domain=dbpedia.org +org.aksw.gerbil.semantic.sameas.impl.index.IndexBasedSameAsRetriever.folder=indexes/dbpedia + #org.aksw.gerbil.semantic.sameas.impl.http.HTTPBasedSameAsRetriever.domain=de.dbpedia.org #org.aksw.gerbil.semantic.sameas.impl.http.HTTPBasedSameAsRetriever.domain=fr.dbpedia.org # domains which URIs should be removed from the sameAs retrieval diff --git a/src/main/webapp/WEB-INF/views/experiment.jsp b/src/main/webapp/WEB-INF/views/experiment.jsp index 96e0077b4..b2c78280a 100644 --- a/src/main/webapp/WEB-INF/views/experiment.jsp +++ b/src/main/webapp/WEB-INF/views/experiment.jsp @@ -1,6 +1,7 @@ <%@page import="org.aksw.gerbil.web.ExperimentTaskStateHelper"%> <%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core"%> <%@ taglib prefix="fmt" uri="http://java.sun.com/jsp/jstl/fmt"%> +<%@ taglib prefix="fn" uri="http://java.sun.com/jsp/jstl/functions" %> <%@ taglib prefix="form" uri="http://www.springframework.org/tags/form"%> <% request.setAttribute("additionalResultsCount", @@ -53,10 +54,27 @@ + + + + + Experiments could take a while +
+ [JSP markup garbled in extraction: the added block renders a notice while
+ experiments are still running and a "Your Experiments finished" message once
+ they are done]
Type: diff --git a/src/test/java/org/aksw/gerbil/SimpleSingleD2KBRun.java b/src/test/java/org/aksw/gerbil/SimpleSingleD2KBRun.java index 5e53d7702..77ea4de98 100644 --- a/src/test/java/org/aksw/gerbil/SimpleSingleD2KBRun.java +++ b/src/test/java/org/aksw/gerbil/SimpleSingleD2KBRun.java @@ -73,8 +73,8 @@ public class SimpleSingleD2KBRun extends EvaluatorFactory implements TaskObserve private static final Logger LOGGER = LoggerFactory.getLogger(SimpleSingleD2KBRun.class); - private static final String ANNOTATOR_NAME = "WAT"; - private static final String DATASET_NAME = "ACE2004"; + private static final String ANNOTATOR_NAME = "xLisa-NER"; + private static final String DATASET_NAME = "N3-RSS-500"; private static final ExperimentType EXPERIMENT_TYPE = ExperimentType.D2KB; private static final Matching MATCHING = Matching.STRONG_ENTITY_MATCH; @@ -86,6 +86,8 @@ public static void setMatchingsCounterDebugFlag() { } public static void main(String[] args) throws Exception { + System.out.println("waiting 10 secs..."); + Thread.sleep(10000); setMatchingsCounterDebugFlag(); SimpleSingleD2KBRun test = new SimpleSingleD2KBRun(); test.run(); diff --git a/src/test/java/org/aksw/gerbil/SingleRunTest.java b/src/test/java/org/aksw/gerbil/SingleRunTest.java index 4ec74af68..84d9ae51f 100644 --- a/src/test/java/org/aksw/gerbil/SingleRunTest.java +++ b/src/test/java/org/aksw/gerbil/SingleRunTest.java @@ -51,12 +51,12 @@ public class SingleRunTest implements TaskObserver { private static final Logger LOGGER = LoggerFactory.getLogger(SingleRunTest.class); - private static final String ANNOTATOR_NAME = "TagMe 2"; + private static final String ANNOTATOR_NAME = "DBpedia Spotlight"; private static final String DATASET_NAME = "MSNBC"; private static final ExperimentType EXPERIMENT_TYPE = ExperimentType.A2KB; - private static final Matching MATCHING = Matching.STRONG_ENTITY_MATCH; + private static final Matching MATCHING = Matching.WEAK_ANNOTATION_MATCH; - private static final boolean USE_SAME_AS_RETRIEVAL = true; + private static final boolean USE_SAME_AS_RETRIEVAL = false; private static final boolean USE_ENTITY_CHECKING = false; private static final SameAsRetriever SAME_AS_RETRIEVER = USE_SAME_AS_RETRIEVAL diff --git a/src/test/java/org/aksw/gerbil/dataset/check/index/IndexBasedEntityCheckerTest.java b/src/test/java/org/aksw/gerbil/dataset/check/index/IndexBasedEntityCheckerTest.java new file mode 100644 index 000000000..3ddb7b969 --- /dev/null +++ b/src/test/java/org/aksw/gerbil/dataset/check/index/IndexBasedEntityCheckerTest.java @@ -0,0 +1,74 @@ +package org.aksw.gerbil.dataset.check.index; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +/** + * A simple test that writes a temporary index using the {@link #CORRECT_URIS} + * array and uses this to test the {@link IndexBasedEntityChecker} class. 
+ * + * @author Michael Röder (roeder@informatik.uni-leipzig.de) + * + */ +@RunWith(Parameterized.class) +public class IndexBasedEntityCheckerTest { + + public static final String[] CORRECT_URIS = new String[] { "http://dbpedia.org/resource/Berlin", + "http://dbpedia.org/resource/Michael_Müller_(politician)", + "http://dbpedia.org/resource/Michael_M%C3%BCller_%28politician%29" }; + + private static String indexDir; + + @Parameters + public static Collection data() { + List testConfigs = new ArrayList(); + // DBpedia examples + testConfigs.add(new Object[] { "http://dbpedia.org/resource/Berlin", true }); + testConfigs.add(new Object[] { "http://dbpedia.org/resource/Paris", false }); + testConfigs.add(new Object[] { "http://dbpedia.org/resource/Michael_Müller_(politician)", true }); + testConfigs.add(new Object[] { "http://dbpedia.org/resource/Michael_M%C3%BCller_%28politician%29", true }); + return testConfigs; + } + + @BeforeClass + public static void createIndex() { + // Generate a temporary folder + indexDir = FileUtils.getTempDirectoryPath() + File.separator + System.currentTimeMillis(); + (new File(indexDir)).mkdir(); + Indexer indexer = Indexer.create(indexDir); + for (int i = 0; i < CORRECT_URIS.length; ++i) { + indexer.index(CORRECT_URIS[i]); + } + indexer.close(); + } + + private String uri; + private boolean expectedDecision; + + public IndexBasedEntityCheckerTest(String uri, boolean expectedDecision) { + this.uri = uri; + this.expectedDecision = expectedDecision; + } + + @Test + public void test() { + IndexBasedEntityChecker checker = null; + try { + checker = IndexBasedEntityChecker.create(indexDir); + Assert.assertEquals(expectedDecision, checker.entityExists(uri)); + } finally { + IOUtils.closeQuietly(checker); + } + } +} diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/derczynski/DerczynskiDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/derczynski/DerczynskiDatasetTest.java index c3c2d5553..363353fc8 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/derczynski/DerczynskiDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/derczynski/DerczynskiDatasetTest.java @@ -19,16 +19,20 @@ public class DerczynskiDatasetTest { @Parameters public static Collection data() { List testConfigs = new ArrayList(); - testConfigs.add(new Object[] { "#Astros http://dbpedia.org/resource/Houston_Astros B-sportsteam HT\nlineup O NN\nfor O IN\ntonight O NN\n. O 0\nKeppinger http://dbpedia.org/resource/Jeff_Keppinger B-person NNP\nsits O VBZ\n, O ,\nDowns http://dbpedia.org/resource/Brodie_Downs B-person NNP\nplays O VBZ\n2B O NN\n, O ,\nCJ NIL B-person NNP\nbats O VBZ\n5th O JJ\n. O 0\n@alysonfooter O USR\nhttp://bit.ly/bHvgCS O URL", "#Astros" }); - testConfigs.add(new Object[] { "#Astros O B-sportsteam HT\nlineup O NN\nfor O IN\ntonight O NN\n. O 0\nKeppinger http://dbpedia.org/resource/Jeff_Keppinger B-person NNP\nsits O VBZ\n, O ,\nDowns http://dbpedia.org/resource/Brodie_Downs B-person NNP\nplays O VBZ\n2B O NN\n, O ,\nCJ NIL B-person NNP\nbats O VBZ\n5th O JJ\n. O 0\n@alysonfooter O USR\nhttp://bit.ly/bHvgCS O URL", "Keppinger" }); + testConfigs.add(new Object[] { "#Astros http://dbpedia.org/resource/Houston_Astros B-sportsteam HT\nlineup O NN\nfor O IN\ntonight O NN\n. O 0\nKeppinger http://dbpedia.org/resource/Jeff_Keppinger B-person NNP\nsits O VBZ\n, O ,\nDowns http://dbpedia.org/resource/Brodie_Downs B-person NNP\nplays O VBZ\n2B O NN\n, O ,\nCJ NIL B-person NNP\nbats O VBZ\n5th O JJ\n. 
O 0\n@alysonfooter O USR\nhttp://bit.ly/bHvgCS O URL", "#Astros lineup for tonight . Jeff Keppinger sits , Downs plays 2B , CJ bats 5th . @alysonfooter http://bit.ly/bHvgCS ", "#Astros" }); + testConfigs.add(new Object[] { "#Astros http://dbpedia.org/resource/Houston_Astros B-sportsteam HT\nlineup O NN\nfor http://bla.com B-Person IN\ntonight O NN\n. O 0\nKeppinger http://dbpedia.org/resource/Jeff_Keppinger B-person NNP\nsits O VBZ\n, O ,\nDowns http://dbpedia.org/resource/Brodie_Downs B-person NNP\nplays O VBZ\n2B O NN\n, O ,\nCJ NIL B-person NNP\nbats O VBZ\n5th O JJ\n. O 0\n@alysonfooter O USR\nhttp://bit.ly/bHvgCS O URL", "#Astros lineup for tonight . Keppinger sits , Downs plays 2B , CJ bats 5th . @alysonfooter http://bit.ly/bHvgCS ", "#Astros" }); + testConfigs.add(new Object[] { "#Astros O B-sportsteam HT\nlineup O I-sportsteam NN\nfor O IN\ntonight O NN\n. O 0\nJeff http://dbpedia.org/resource/Jeff_Keppinger B-person NNP\nKeppinger http://dbpedia.org/resource/Jeff_Keppinger I-person NNP\nsits O VBZ\n, O ,\nDowns http://dbpedia.org/resource/Brodie_Downs B-person NNP\nplays O VBZ\n2B O NN\n, O ,\nCJ NIL B-person NNP\nbats O VBZ\n5th O JJ\n. O 0\n@alysonfooter O USR\nhttp://bit.ly/bHvgCS O URL", "#Astros lineup for tonight . Jeff Keppinger sits , Downs plays 2B , CJ bats 5th . @alysonfooter http://bit.ly/bHvgCS ","#Astros lineup" }); + testConfigs.add(new Object[] { "#Astros O O HT\nlineup O NN\nfor O IN\ntonight O NN\n. O 0\nJeff http://dbpedia.org/resource/Jeff_Keppinger B-person NNP\nKeppinger http://dbpedia.org/resource/Jeff_Keppinger I-person NNP\nsits O VBZ\n, O ,\nDowns http://dbpedia.org/resource/Brodie_Downs B-person NNP\nplays O VBZ\n2B O NN\n, O ,\nCJ NIL B-person NNP\nbats O VBZ\n5th O JJ\n. O 0\n@alysonfooter O USR\nhttp://bit.ly/bHvgCS O URL", "#Astros lineup for tonight . Jeff Keppinger sits , Downs plays 2B , CJ bats 5th . @alysonfooter http://bit.ly/bHvgCS ","Jeff Keppinger" }); return testConfigs; } private String text; private String expectedToken; + private String tweet; - public DerczynskiDatasetTest(String text, String expectedToken) { + public DerczynskiDatasetTest(String text, String tweet, String expectedToken) { this.text = text; + this.tweet=tweet; this.expectedToken = expectedToken; } @@ -39,7 +43,7 @@ public void test() { Assert.assertTrue(markings.size() > 0); Assert.assertTrue(markings.get(0) instanceof NamedEntity); NamedEntity ne = (NamedEntity) markings.get(0); - String mention = text.substring(ne.getStartPosition(), ne.getStartPosition() + ne.getLength()); + String mention = tweet.substring(ne.getStartPosition(), ne.getStartPosition() + ne.getLength()); Assert.assertEquals(expectedToken, mention); } diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/erd/ERDDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/erd/ERDDatasetTest.java new file mode 100644 index 000000000..e1a81f625 --- /dev/null +++ b/src/test/java/org/aksw/gerbil/dataset/impl/erd/ERDDatasetTest.java @@ -0,0 +1,336 @@ +/** + * This file is part of General Entity Annotator Benchmark. + * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with General Entity Annotator Benchmark. If not, see . + */ +package org.aksw.gerbil.dataset.impl.erd; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.aksw.gerbil.datatypes.ErrorTypes; +import org.aksw.gerbil.exceptions.GerbilException; +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.DocumentImpl; +import org.aksw.gerbil.transfer.nif.data.NamedEntity; + +import org.apache.commons.lang.RandomStringUtils; +import org.apache.commons.lang.math.RandomUtils; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.CoreMatchers.nullValue; +import static org.hamcrest.CoreMatchers.notNullValue; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThat; + +import org.junit.BeforeClass; +import org.junit.Test; + +@Deprecated +public class ERDDatasetTest { + + private static final String FREEBASE_URI = "https://www.googleapis.com/freebase"; + private static final String ERD_DATASET_PATH = "gerbil_data/datasets/erd2014/"; + private static final String TEXT_FILE = "Trec_beta.query.txt"; + private static final String ANNOTATION_FILE = "Trec_beta.annotation.txt"; + + private static List EXPECTED_DOCUMENTS; + private static List LOADED_DOCUMENTS; + + private static List DOCUMENT_URI; + + @BeforeClass + public static void prepareResourcesToTest() throws GerbilException { + + DOCUMENT_URI = new ArrayList(); + + DOCUMENT_URI.add("http://ERD-Test/Trec_beta.query.txt"); + + loadExpectedSet(); + + loadDatasets(); + +// generateTerminalOutputForLoadedErdDatasets(); + + } + + @Test + public void checkTrecData() { + + int min = 5; + int max = 10; + List treclist = new ArrayList<>(); + List linelist = new ArrayList<>(); + List textlist = new ArrayList<>(); + List second_phrase_text = new ArrayList<>(); + + assertThat(treclist.size(), is(0)); + + int lineColumnCount = 0; + ERDTrec dtrec = null; + int randomtrecs = (int)(Math.random() * max) + min; + for (int i = 0; i < randomtrecs; i++) { + + String id = "TREC-" + i; + int randomTextpart = (int)(Math.random() * 5) + 2; + String text = ""; + for (int j = 0; j < randomTextpart; j++){ + String randomText = RandomStringUtils.randomAlphanumeric(RandomUtils.nextInt(max) + min); + text = text + randomText + " "; + if (j==1) second_phrase_text.add(randomText); + } + text = id + "\t" + text.substring(0,text.length()-1); + textlist.add(text); + + dtrec = new ERDTrec(text, dtrec); + treclist.add(dtrec); + + lineColumnCount = lineColumnCount + text.length() + 1; + linelist.add(lineColumnCount - 1); + + } + + for (ERDTrec trec : treclist) assertThat(trec, is(notNullValue())); + + for (int i = 0; i < treclist.size(); i++) { + + assertThat(treclist.get(i).getColumnCount(), is(linelist.get(i))); + assertThat(treclist.get(i).getLineNumber(), is(i)); + assertThat(treclist.get(i).getLine().equals(textlist.get(i)), is(true)); + assertThat((treclist.get(i).getTextPosition(second_phrase_text.get(i)) > 0), is(true)); + + } + + } + + @Test + public void 
checkLoadDatasets() throws GerbilException { + + assertThat(LOADED_DOCUMENTS.size(), is(1)); + + int countmarkings = 0; + for (Document tmp : LOADED_DOCUMENTS){ + countmarkings += tmp.getMarkings().size(); + } + + assertThat(countmarkings, is(59)); + + } + + @Test + public void checkExpectedDataset() { + + assertThat(EXPECTED_DOCUMENTS.size(), is(1)); + + int countmarkings = 0; + for (Document tmp : EXPECTED_DOCUMENTS){ + countmarkings += tmp.getMarkings().size(); + } + + assertThat(countmarkings, is(16)); + + } + + @Test + public void checkExpectedDatasetIsSubsetOfLoadedDataset() throws GerbilException { + + for (int i = 0; i < EXPECTED_DOCUMENTS.size(); i++){ + for (int j = 0; j < EXPECTED_DOCUMENTS.get(i).getMarkings().size(); j++){ + + String ld_mark = LOADED_DOCUMENTS.get(i).getMarkings().get(j).toString(); + ld_mark = ld_mark.substring(1, ld_mark.length()-1); + String[] ld_parts = ld_mark.split(" "); + + assertThat(ld_parts.length, is(3)); + + String ld_start = ld_parts[0].substring(0, ld_parts[0].length()-1); + String ld_length = ld_parts[1].substring(0, ld_parts[1].length()-1); + String ld_uri = ld_parts[2].substring(1 + FREEBASE_URI.length(), ld_parts[2].length()-1); + ld_uri = ld_uri.replaceAll("_", " "); + + String ex_mark = EXPECTED_DOCUMENTS.get(i).getMarkings().get(j).toString(); + ex_mark = ex_mark.substring(1, ex_mark.length()-1); + String[] ex_parts = ex_mark.split(" "); + + assertThat(ex_parts.length, is(3)); + + String ex_start = ex_parts[0].substring(0, ex_parts[0].length()-1); + String ex_length = ex_parts[1].substring(0, ex_parts[1].length()-1); + String ex_uri = ex_parts[2].substring(1 + FREEBASE_URI.length(), ex_parts[2].length()-1); + ex_uri = ex_uri.replaceAll("_", " "); + + assertEquals(ld_start, ex_start); + assertEquals(ld_length, ex_length); + assertEquals(ld_uri, ex_uri); + + } + } + + } + + @Test + public void checkLoadedDatasetFindInDatasetFiles() throws GerbilException { + + String text = getString(ERD_DATASET_PATH + ANNOTATION_FILE); + + for (int i = 0; i < LOADED_DOCUMENTS.size(); i++){ + for (int j = 0; j < LOADED_DOCUMENTS.get(i).getMarkings().size(); j++){ + + String mark = LOADED_DOCUMENTS.get(i).getMarkings().get(j).toString(); + mark = mark.substring(1, mark.length()-1); + String[] parts = mark.split(" "); + + assertThat(parts.length, is(3)); + + String start = parts[0].substring(0, parts[0].length()-1); + String length = parts[1].substring(0, parts[1].length()-1); + String uri = parts[2].substring(1 + FREEBASE_URI.length(), parts[2].length()-1); + + List searchString = new ArrayList<>(); + int pos = -1; + while ((pos = text.indexOf(uri, pos + 1)) != -1) { + int point = pos + uri.length() + 1; + searchString.add(text.substring(point, point+Integer.valueOf(length))); + } + + String match = returnStringPositionInFile(ERD_DATASET_PATH + TEXT_FILE, Integer.valueOf(start), Integer.valueOf(length)); + + assertThat((searchString.contains(match)), is(true)); + } + } + + } + + @SuppressWarnings("resource") + private static void loadDatasets() throws GerbilException { + + assertThat(LOADED_DOCUMENTS, is(nullValue())); + + LOADED_DOCUMENTS = new ArrayList<>(); + + assertThat(LOADED_DOCUMENTS, is(notNullValue())); + assertThat(LOADED_DOCUMENTS.size(), is(0)); + + ERDDataset dataset = new ERDDataset(ERD_DATASET_PATH + TEXT_FILE, ERD_DATASET_PATH + ANNOTATION_FILE); + dataset.setName("Erd-Test"); + dataset.init(); + LOADED_DOCUMENTS.addAll(dataset.getInstances()); + + } + + private static void loadExpectedSet() { + + assertThat(EXPECTED_DOCUMENTS, 
is(nullValue())); + + EXPECTED_DOCUMENTS = new ArrayList<>(); + + assertThat(EXPECTED_DOCUMENTS, is(notNullValue())); + assertThat(EXPECTED_DOCUMENTS.size(), is(0)); + + List text = new ArrayList<>(); + List> markings = new ArrayList<>(); + + text.add("..TREC-1.adobe indian houses..TREC-2.atypical squamous cells..TREC-3.battles in the civil war..TREC-4.becoming a paralegal..TREC-5.best long term care insurance..TREC-6.blue throated hummingbird..TREC-7.bowflex power pro..TREC-8.brooks brothers clearance..TREC-9.butter and margarine..TREC-10.california franchise tax board..TREC-11.cass county missouri..TREC-12.civil right movement..TREC-13.condos in florida..TREC-14.culpeper national cemetery..TREC-15.dangers of asbestos..TREC-16.designer dog breeds..TREC-17.discovery channel store..TREC-18.dog clean up bags..TREC-19.dogs for adoption..TREC-20.dutchess county tourism..TREC-21.earn money at home..TREC-22.east ridge high school..TREC-23.electronic skeet shoot..TREC-24.equal opportunity employer..TREC-25.er tv show..TREC-26.fact on uranus..TREC-27.fickle creek farm..TREC-28.french lick resort and casino..TREC-29.furniture for small spaces..TREC-30.gmat prep classes..TREC-31.gs pay rate..TREC-32.how to build a fence..TREC-33.hp mini 2140..TREC-34.illinois state tax..TREC-35.income tax return online..TREC-36.indiana child support.."); + + markings.add(Arrays.asList( + (Marking) new NamedEntity(203, 7, "https://www.googleapis.com/freebase/m/04cnvy"), + (Marking) new NamedEntity(229, 15, "https://www.googleapis.com/freebase/m/03d452"), + (Marking) new NamedEntity(333, 20, "https://www.googleapis.com/freebase/m/0nfgq"), + (Marking) new NamedEntity(393, 5, "https://www.googleapis.com/freebase/m/020ys5"), + (Marking) new NamedEntity(403, 7, "https://www.googleapis.com/freebase/m/02xry"), + (Marking) new NamedEntity(420, 26, "https://www.googleapis.com/freebase/m/0c4tkd"), + (Marking) new NamedEntity(601, 15, "https://www.googleapis.com/freebase/m/0dc3_"), + (Marking) new NamedEntity(662, 22, "https://www.googleapis.com/freebase/m/03ck4lv"), + (Marking) new NamedEntity(662, 22, "https://www.googleapis.com/freebase/m/027311j"), + (Marking) new NamedEntity(662, 22, "https://www.googleapis.com/freebase/m/0bs8gsb"), + (Marking) new NamedEntity(762, 2, "https://www.googleapis.com/freebase/m/0180mw"), + (Marking) new NamedEntity(833, 29, "https://www.googleapis.com/freebase/m/02761b3"), + (Marking) new NamedEntity(872, 9, "https://www.googleapis.com/freebase/m/0c_jw"), + (Marking) new NamedEntity(913, 4, "https://www.googleapis.com/freebase/m/065y10k"), + (Marking) new NamedEntity(1008, 14, "https://www.googleapis.com/freebase/m/03v0t"), + (Marking) new NamedEntity(1070, 7, "https://www.googleapis.com/freebase/m/03v1s") + )); + + EXPECTED_DOCUMENTS = new ArrayList<>(); + + for (int i = 0; i < 1; i++){ + EXPECTED_DOCUMENTS.add(new DocumentImpl(text.get(i), DOCUMENT_URI.get(i), markings.get(i))); + } + + } + + private String getString(String filePath) throws GerbilException { + + RandomAccessFile raf; + String out = ""; + try { + File file = new File(filePath); + byte[] filedata = new byte[(int) file.length()]; + raf = new RandomAccessFile(file, "r"); + raf.readFully(filedata); + out = new String(filedata); + raf.close(); + } catch (IOException e) { + throw new GerbilException("Exception while reading annotation file of dataset.", e, ErrorTypes.ANNOTATOR_LOADING_ERROR); + } + + return out; + + } + + private String returnStringPositionInFile(String filePath, int position, int length) throws GerbilException { + 
+ RandomAccessFile raf; + String out = ""; + try { + File file = new File(filePath); + byte[] search = new byte[length]; + raf = new RandomAccessFile(file, "r"); + raf.seek(position); + raf.readFully(search); + raf.close(); + out = new String(search); + } catch (IOException e) { + throw new GerbilException("Exception while reading text file of dataset.", e, ErrorTypes.ANNOTATOR_LOADING_ERROR); + } + + return out; + + } + +// private static void generateTerminalOutputForLoadedErdDatasets() throws GerbilException { +// +// System.out.println("========================================================="); +// System.out.println("===================== Documents [" + LOADED_DOCUMENTS.size() + "] ====================="); +// for (int i = 0; i < LOADED_DOCUMENTS.size(); i++){ +// Document doc = LOADED_DOCUMENTS.get(i); +// System.out.println("========================================================="); +// System.out.println("Document-URI: " + doc.getDocumentURI()); +// System.out.println("==================== Markings [" + doc.getMarkings().size() + "] ===================="); +// for (Marking mark : doc.getMarkings()){ +// System.out.println(mark.toString()); +// } +// } +// System.out.println("========================================================="); +// +// } + +} diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDatasetTest.java new file mode 100644 index 000000000..1de4c4aa7 --- /dev/null +++ b/src/test/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDatasetTest.java @@ -0,0 +1,66 @@ +/** + * This file is part of General Entity Annotator Benchmark. + * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with General Entity Annotator Benchmark. If not, see . 
+ */ +package org.aksw.gerbil.dataset.impl.gerdaq; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.DocumentImpl; +import org.aksw.gerbil.transfer.nif.data.NamedEntity; +import org.apache.commons.io.Charsets; +import org.apache.commons.io.FileUtils; +import org.junit.Assert; +import org.junit.Test; + +public class GERDAQDatasetTest { + + private static final String DATASET_NAME = "test"; + + @Test + public void checkLoadDatasets() throws Exception { + File file = File.createTempFile("GERDAQ", ".xml"); + FileUtils.write(file, + "" + String.format("%n") + + "loris candle samplerbuying land and arizonahip gry pl", + Charsets.UTF_8); + String docUriStart = GERDAQDataset.generateDocumentUri(DATASET_NAME, file.getName()); + + List expectedDocuments = Arrays.asList( + new DocumentImpl("loris candle sampler", docUriStart + 0, + Arrays.asList(new NamedEntity(6, 6, "http://dbpedia.org/resource/Candle"))), + new DocumentImpl("buying land and arizona", docUriStart + 1, + Arrays.asList(new NamedEntity(0, 11, "http://dbpedia.org/resource/Conveyancing"), + new NamedEntity(16, 7, "http://dbpedia.org/resource/Arizona"))), + new DocumentImpl("hip gry pl", docUriStart + 2, new ArrayList(0))); + + GERDAQDataset dataset = new GERDAQDataset(file.getAbsolutePath()); + try { + dataset.setName(DATASET_NAME); + dataset.init(); + + Assert.assertArrayEquals(expectedDocuments.toArray(new Document[3]), + dataset.getInstances().toArray(new Document[3])); + } finally { + dataset.close(); + } + } + +} diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/micro/Microposts2015DatasetMentionSearchTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/micro/Microposts2015DatasetMentionSearchTest.java new file mode 100644 index 000000000..ab3edb34a --- /dev/null +++ b/src/test/java/org/aksw/gerbil/dataset/impl/micro/Microposts2015DatasetMentionSearchTest.java @@ -0,0 +1,91 @@ +/** + * This file is part of General Entity Annotator Benchmark. + * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with General Entity Annotator Benchmark. If not, see . 
+ */ +package org.aksw.gerbil.dataset.impl.micro; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(Parameterized.class) +public class Microposts2015DatasetMentionSearchTest { + + @Parameters + public static Collection data() { + List testConfigs = new ArrayList(); + testConfigs.add(new Object[] { + new String[] { "375737278276321000 27 33 http://dbpedia.org/resource/Africa Location", "375737278276321000 4 6 http://dbpedia.org/resource/United_States Location"}, + "The US Military’s Pivot to Africa http://t.co/wJn6YRE5hR", + new String[] {"Africa", "US"}, + new String[] {"http://dbpedia.org/ontology/Place", "http://dbpedia.org/ontology/Place" }}); + + testConfigs.add(new Object[] { + new String[] { "375733981582729000 38 46 http://dbpedia.org/resource/Brooklyn Location", + "375733981582729000 4 15 NIL840 Product"}, + "RT @BestProNews: A 19-year-old man in Brooklyn died today after he lost control of his remote control helicopter and sliced off the top of …", + new String[] {"BestProNews", "Brooklyn"}, + new String[] {"http://dbpedia.org/ontology/Product", "http://dbpedia.org/ontology/Place" }}); + + + return testConfigs; + } + + private String[] mentions; + private String tweet; + private String[] expectedMentions; + private String[] expectedTypes; + + + public Microposts2015DatasetMentionSearchTest(String[] mentions, String tweet, String[] expectedMentions, String[] expectedTypes) { + this.mentions = mentions; + this.tweet = tweet; + this.expectedMentions = expectedMentions; + this.expectedTypes = expectedTypes; + } + + @Test + public void test() { + Set lines = new HashSet(); + for(String m : mentions){ + lines.add(m); + } + List markings = Microposts2015Dataset.findMarkings(lines, tweet); + Assert.assertNotNull(markings); + Assert.assertTrue(markings.size() > 0); + int i =0; + for(Marking marking : markings){ + Assert.assertTrue(marking instanceof TypedNamedEntity); + TypedNamedEntity ne = (TypedNamedEntity) marking; + + String mention = tweet.substring(ne.getStartPosition(), ne.getStartPosition() + ne.getLength()); + Assert.assertEquals(expectedMentions[i], mention); + + String type = ne.getTypes().iterator().next(); + Assert.assertEquals(expectedTypes[i], type); + i++; + } + } +} diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/micro/Microposts2016DatasetMentionSearchTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/micro/Microposts2016DatasetMentionSearchTest.java new file mode 100644 index 000000000..abd3f16fd --- /dev/null +++ b/src/test/java/org/aksw/gerbil/dataset/impl/micro/Microposts2016DatasetMentionSearchTest.java @@ -0,0 +1,91 @@ +/** + * This file is part of General Entity Annotator Benchmark. + * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with General Entity Annotator Benchmark. If not, see . + */ +package org.aksw.gerbil.dataset.impl.micro; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(Parameterized.class) +public class Microposts2016DatasetMentionSearchTest { + + @Parameters + public static Collection data() { + List testConfigs = new ArrayList(); + testConfigs.add(new Object[] { + new String[] { "375737278276321000 27 33 http://dbpedia.org/resource/Africa 1 Location", "375737278276321000 4 6 http://dbpedia.org/resource/United_States 1 Location"}, + "The US Military’s Pivot to Africa http://t.co/wJn6YRE5hR", + new String[] {"US", "Africa" }, + new String[] {"http://dbpedia.org/ontology/Place", "http://dbpedia.org/ontology/Place" }}); + + testConfigs.add(new Object[] { + new String[] { "375733981582729000 38 46 http://dbpedia.org/resource/Brooklyn 1 Location", + "375733981582729000 4 15 NIL840 1 Product"}, + "RT @BestProNews: A 19-year-old man in Brooklyn died today after he lost control of his remote control helicopter and sliced off the top of …", + new String[] {"BestProNews", "Brooklyn"}, + new String[] {"http://dbpedia.org/ontology/Product", "http://dbpedia.org/ontology/Place" }}); + + + return testConfigs; + } + + private String[] mentions; + private String tweet; + private String[] expectedMentions; + private String[] expectedTypes; + + + public Microposts2016DatasetMentionSearchTest(String[] mentions, String tweet, String[] expectedMentions, String[] expectedTypes) { + this.mentions = mentions; + this.tweet = tweet; + this.expectedMentions = expectedMentions; + this.expectedTypes = expectedTypes; + } + + @Test + public void test() { + Set lines = new HashSet(); + for(String m : mentions){ + lines.add(m); + } + List markings = Microposts2016Dataset.findMarkings(lines, tweet); + Assert.assertNotNull(markings); + Assert.assertTrue(markings.size() > 0); + int i =0; + for(Marking marking : markings){ + Assert.assertTrue(marking instanceof TypedNamedEntity); + TypedNamedEntity ne = (TypedNamedEntity) marking; + + String mention = tweet.substring(ne.getStartPosition(), ne.getStartPosition() + ne.getLength()); + Assert.assertEquals(expectedMentions[i], mention); + + String type = ne.getTypes().iterator().next(); + Assert.assertEquals(expectedTypes[i], type); + i++; + } + } +} diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java new file mode 100644 index 000000000..3a538eee1 --- /dev/null +++ b/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java @@ -0,0 +1,59 @@ +package org.aksw.gerbil.dataset.impl.ritter; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.aksw.gerbil.transfer.nif.Marking; +import 
org.aksw.gerbil.transfer.nif.data.NamedEntity; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +//TODO set @RunWith, @Parameters, @Test again, as soon dataset is in gerbil_data.zip + +//@RunWith(Parameterized.class) +public class RitterDatasetTest { + +// @Parameters + public static Collection data() { + List testConfigs = new ArrayList(); + testConfigs.add(new Object[] { "Texans O\nurged O\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texas coast", "http://dbpedia.org/ontology/Place"} }); + testConfigs.add(new Object[] { "Texans B-movie\nurged O\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texans", "http://dbpedia.org/ontology/Film"} }); + testConfigs.add(new Object[] { "Texans B-company\nurged B-PER\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texans", "http://dbpedia.org/ontology/Company"} }); + testConfigs.add(new Object[] { "Texans B-facility\nurged B-PER\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texans", "http://dbpedia.org/ontology/Place"} }); + testConfigs.add(new Object[] { "Texans B-musicartist\nurged B-PER\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texans", "http://dbpedia.org/ontology/MusicalArtist"} }); + testConfigs.add(new Object[] { "Texans B-other\nurged B-PER\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texans", "http://dbpedia.org/ontology/Unknown"} }); + testConfigs.add(new Object[] { "Texans B-person\nurged B-PER\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texans", "http://dbpedia.org/ontology/Person"} }); + testConfigs.add(new Object[] { "Texans B-product\nurged B-PER\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave 
O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texans", "http://dbpedia.org/ontology/product"} });
+        testConfigs.add(new Object[] { "Texans B-sportsteam\nurged B-PER\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texans", "http://dbpedia.org/ontology/SportsTeam"} });
+        testConfigs.add(new Object[] { "Texans B-tvshow\nurged B-PER\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-geo-loc\ncoast I-geo-loc\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", new String[]{"Texans", "http://dbpedia.org/ontology/TelevisionShow"} });
+        return testConfigs;
+    }
+
+    private String text;
+    private String[] expectedToken;
+    private String tweet;
+
+    public RitterDatasetTest(String text, String tweet, String[] expectedToken) {
+        this.text = text;
+        this.tweet = tweet;
+        this.expectedToken = expectedToken;
+    }
+
+// @Test
+    public void test() {
+        List<Marking> markings = RitterDataset.findMarkings(text);
+        Assert.assertNotNull(markings);
+        Assert.assertTrue(markings.size() > 0);
+        Assert.assertTrue(markings.get(0) instanceof NamedEntity);
+        TypedNamedEntity ne = (TypedNamedEntity) markings.get(0);
+        // Check that the type of the first marking matches the expected type URI.
+        Assert.assertTrue(ne.getTypes().iterator().next().equals(expectedToken[1]));
+        String mention = tweet.substring(ne.getStartPosition(), ne.getStartPosition() + ne.getLength());
+        Assert.assertEquals(expectedToken[0], mention);
+    }
+
+}
diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/senseval/SensevalDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/senseval/SensevalDatasetTest.java
new file mode 100644
index 000000000..6c2d05807
--- /dev/null
+++ b/src/test/java/org/aksw/gerbil/dataset/impl/senseval/SensevalDatasetTest.java
@@ -0,0 +1,72 @@
+package org.aksw.gerbil.dataset.impl.senseval;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(Parameterized.class)
+public class SensevalDatasetTest {
+
+    @Parameters
+    public static Collection<Object[]> data() {
+        List<Object[]> testConfigs = new ArrayList<Object[]>();
+        testConfigs.add(new Object[] { 0, "src/test/resources/datasets/senseval/test.xml",
+                "The art of change-ringing is peculiar to the English, and, like most English peculiarities, unintelligible to the rest of the world.",
+                new String[] { "art", "change-ringing", "is", "peculiar", "English", "most", "English", "peculiarities", "unintelligible", "rest", "world" } });
+        testConfigs.add(new Object[] { 1, "src/test/resources/datasets/senseval/test.xml", "-- Dorothy L. 
Sayers, `` The Nine Tailors ``" + , new String[]{"Tailors"}}); + + testConfigs + .add(new Object[] {2, "src/test/resources/datasets/senseval/test.xml", "ASLACTON, England" + , new String[]{"England"}}); + + return testConfigs; + } + + private String file; + private int docIndex; + private String expectedSentence; + private String[] expectedMarkings; + + public SensevalDatasetTest(int docIndex, String file, + String expectedSentence, String[] expectedMarkings) { + this.file = file; + this.docIndex= docIndex; + this.expectedSentence=expectedSentence; + this.expectedMarkings=expectedMarkings; + } + + @Test + public void test() throws GerbilException, IOException { + SensevalDataset data = new SensevalDataset(this.file); + data.init(); + List documents = data.getInstances(); + Document doc = documents.get(docIndex); + assertEquals(expectedSentence, doc.getText()); + List markings = doc.getMarkings(); + String[] marks = new String[markings.size()]; + for(int i=0; i data() { + List testConfigs = new ArrayList(); + testConfigs.add(new Object[] { "Texans O\nurged O\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-LOC\ncoast I-LOC\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", "Texas coast" }); + testConfigs.add(new Object[] { "Texans B-LOC\nurged O\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-LOC\ncoast I-LOC\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", "Texans" }); + testConfigs.add(new Object[] { "Texans B-LOC\nurged B-PER\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-LOC\ncoast I-LOC\n, O\na O\nURL O", "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", "Texans" }); + + return testConfigs; + } + + private String text; + private String expectedToken; + private String tweet; + + public UMBCDatasetTest(String text, String tweet, String expectedToken) { + this.text = text; + this.tweet = tweet; + this.expectedToken = expectedToken; + } + + @Test + public void test() { + List markings = UMBCDataset.findMarkings(text); + Assert.assertNotNull(markings); + Assert.assertTrue(markings.size() > 0); + Assert.assertTrue(markings.get(0) instanceof NamedEntity); + NamedEntity ne = (NamedEntity) markings.get(0); + String mention = tweet.substring(ne.getStartPosition(), ne.getStartPosition() + ne.getLength()); + Assert.assertEquals(expectedToken, mention); + } + +} diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/wsdm/WSDM2012DatasetMentionSearchTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/wsdm/WSDM2012DatasetMentionSearchTest.java new file mode 100644 index 000000000..29d2e1cf7 --- /dev/null +++ b/src/test/java/org/aksw/gerbil/dataset/impl/wsdm/WSDM2012DatasetMentionSearchTest.java @@ -0,0 +1,85 @@ +/** + * This file is part of General Entity Annotator Benchmark. + * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with General Entity Annotator Benchmark. If not, see . + */ +package org.aksw.gerbil.dataset.impl.wsdm; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.Annotation; +import org.aksw.gerbil.transfer.nif.data.NamedEntity; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(Parameterized.class) +public class WSDM2012DatasetMentionSearchTest { + + @Parameters + public static Collection data() { + List testConfigs = new ArrayList(); + testConfigs.add(new Object[] { + new String[] { "375737278276321000 27 Africa", "375737278276321000 4 United_States"}, + "The US Military’s Pivot to Africa http://t.co/wJn6YRE5hR", + new String[] {"http://en.wikipedia.org/wiki/United_States", "http://en.wikipedia.org/wiki/Africa" }}); + + testConfigs.add(new Object[] { + new String[] { "375733981582729000 38 Brooklyn"}, + "RT @BestProNews: A 19-year-old man in Brooklyn died today after he lost control of his remote control helicopter and sliced off the top of …", + new String[] {"http://en.wikipedia.org/wiki/Brooklyn"}}); + + + return testConfigs; + } + + private String[] mentions; + private String tweet; + private String[] expectedMentions; + + + public WSDM2012DatasetMentionSearchTest(String[] mentions, String tweet, String[] expectedMentions) { + this.mentions = mentions; + this.tweet = tweet; + this.expectedMentions = expectedMentions; + } + + @Test + public void test() { + Set lines = new HashSet(); + for(String m : mentions){ + lines.add(m); + } + List markings = WSDMDataset.findMarkings(lines, tweet); + Assert.assertNotNull(markings); + Assert.assertTrue(markings.size() > 0); + int i =0; + for(Marking marking : markings){ + Assert.assertTrue(marking instanceof Annotation); + Annotation ne = (Annotation) marking; + + Assert.assertEquals(expectedMentions[i], ne.getUris().iterator().next()); + + i++; + } + } +} diff --git a/src/test/java/org/aksw/gerbil/execute/AbstractExperimentTaskTest.java b/src/test/java/org/aksw/gerbil/execute/AbstractExperimentTaskTest.java index 9bc1bb796..6af981424 100644 --- a/src/test/java/org/aksw/gerbil/execute/AbstractExperimentTaskTest.java +++ b/src/test/java/org/aksw/gerbil/execute/AbstractExperimentTaskTest.java @@ -61,7 +61,7 @@ public void runTest(int experimentTaskId, ExperimentDAO experimentDAO, SameAsRet } catch (InterruptedException e) { e.printStackTrace(); } - Assert.assertNull("Got an exception: " + testError, testError); + Assert.assertNull("Got an exception: " + testError + " " + configuration.toString(), testError); SameAsRetrieverSingleton4Tests.storeCache(); } @@ -123,13 +123,15 @@ public F1MeasureTestingObserver(AbstractExperimentTaskTest testInstance, int exp protected void testTaskResults(Task task) { Assert.assertEquals(ExperimentDAO.TASK_FINISHED, experimentDAO.getExperimentState(experimentTaskId)); ExperimentTaskResult result 
= experimentDAO.getTaskResult(experimentTaskId); - Assert.assertEquals(expectedResults[MACRO_PREC_INDEX], result.getMacroPrecision(), DELTA); - Assert.assertEquals(expectedResults[MACRO_REC_INDEX], result.getMacroRecall(), DELTA); - Assert.assertEquals(expectedResults[MACRO_F1_INDEX], result.getMacroF1Measure(), DELTA); - Assert.assertEquals(expectedResults[MICRO_PREC_INDEX], result.getMicroPrecision(), DELTA); - Assert.assertEquals(expectedResults[MICRO_REC_INDEX], result.getMicroRecall(), DELTA); - Assert.assertEquals(expectedResults[MICRO_F1_INDEX], result.getMicroF1Measure(), DELTA); - Assert.assertEquals(expectedResults[ERROR_COUNT_INDEX], result.getErrorCount(), DELTA); + String errorMsg = "Error for system " + result.annotator + " on dataset " + result.dataset + + " in Experiment " + result.type.getName(); + Assert.assertEquals(errorMsg, expectedResults[MACRO_PREC_INDEX], result.getMacroPrecision(), DELTA); + Assert.assertEquals(errorMsg, expectedResults[MACRO_REC_INDEX], result.getMacroRecall(), DELTA); + Assert.assertEquals(errorMsg, expectedResults[MACRO_F1_INDEX], result.getMacroF1Measure(), DELTA); + Assert.assertEquals(errorMsg, expectedResults[MICRO_PREC_INDEX], result.getMicroPrecision(), DELTA); + Assert.assertEquals(errorMsg, expectedResults[MICRO_REC_INDEX], result.getMicroRecall(), DELTA); + Assert.assertEquals(errorMsg, expectedResults[MICRO_F1_INDEX], result.getMicroF1Measure(), DELTA); + Assert.assertEquals(errorMsg, expectedResults[ERROR_COUNT_INDEX], result.getErrorCount(), DELTA); } } } diff --git a/src/test/java/org/aksw/gerbil/execute/OKEChallengeTask1RT2KBTest.java b/src/test/java/org/aksw/gerbil/execute/OKEChallengeTask1RT2KBTest.java new file mode 100644 index 000000000..db2c55a90 --- /dev/null +++ b/src/test/java/org/aksw/gerbil/execute/OKEChallengeTask1RT2KBTest.java @@ -0,0 +1,150 @@ +/** + * This file is part of General Entity Annotator Benchmark. + * + * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * General Entity Annotator Benchmark is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with General Entity Annotator Benchmark. If not, see . 
+ */ +package org.aksw.gerbil.execute; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; + +import org.aksw.gerbil.annotator.TestAnnotatorConfiguration; +import org.aksw.gerbil.annotator.decorator.ErrorCountingAnnotatorDecorator; +import org.aksw.gerbil.database.SimpleLoggingResultStoringDAO4Debugging; +import org.aksw.gerbil.dataset.DatasetConfiguration; +import org.aksw.gerbil.dataset.impl.nif.NIFFileDatasetConfig; +import org.aksw.gerbil.datatypes.ExperimentTaskConfiguration; +import org.aksw.gerbil.datatypes.ExperimentType; +import org.aksw.gerbil.evaluate.EvaluatorFactory; +import org.aksw.gerbil.evaluate.impl.ConfidenceBasedFMeasureCalculator; +import org.aksw.gerbil.matching.Matching; +import org.aksw.gerbil.matching.impl.MatchingsCounterImpl; +import org.aksw.gerbil.semantic.kb.SimpleWhiteListBasedUriKBClassifier; +import org.aksw.gerbil.semantic.kb.UriKBClassifier; +import org.aksw.gerbil.transfer.nif.Document; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.DocumentImpl; +import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(Parameterized.class) +public class OKEChallengeTask1RT2KBTest extends AbstractExperimentTaskTest { + + @BeforeClass + public static void setMatchingsCounterDebugFlag() { + MatchingsCounterImpl.setPrintDebugMsg(true); + ConfidenceBasedFMeasureCalculator.setPrintDebugMsg(true); + ErrorCountingAnnotatorDecorator.setPrintDebugMsg(true); + } + + private static final String TEXTS[] = new String[] { + "Florence May Harding studied at a school in Sydney, and with Douglas Robert Dundas , but in effect had no formal training in either botany or art.", + "Such notables include James Carville, who was the senior political adviser to Bill Clinton, and Donna Brazile, the campaign manager of the 2000 presidential campaign of Vice-President Al Gore.", + "The senator received a Bachelor of Laws from the Columbia University." 
}; + private static final DatasetConfiguration GOLD_STD = new NIFFileDatasetConfig("OKE_Task1", + "src/test/resources/OKE_Challenge/example_data/task1.ttl", false, ExperimentType.RT2KB, null, null); + private static final UriKBClassifier URI_KB_CLASSIFIER = new SimpleWhiteListBasedUriKBClassifier( + "http://dbpedia.org/resource/"); + + @Parameters + public static Collection data() { + List testConfigs = new ArrayList(); + // The extractor returns nothing + testConfigs.add(new Object[] { new Document[] {}, GOLD_STD, Matching.WEAK_ANNOTATION_MATCH, + new double[] { 0, 0, 0, 0, 0, 0, 0 } }); + // The extractor found everything and marked all entities using dbpedia + // URIs (if they were available) + testConfigs.add(new Object[] { + new Document[] { + new DocumentImpl(TEXTS[0], + "http://www.ontologydesignpatterns.org/data/oke-challenge/task-1/sentence-1", + Arrays.asList( + (Marking) new TypedNamedEntity(0, 20, + "http://dbpedia.org/resource/Florence_May_Harding", + new HashSet(Arrays.asList( + "http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Person"))), + (Marking) new TypedNamedEntity(44, 6, "http://dbpedia.org/resource/Sydney", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://ontologydesignpatterns.org/ont/wikipedia/d0.owl#Location"))), + (Marking) new TypedNamedEntity(61, 21, + "http://www.ontologydesignpatterns.org/data/oke-challenge/task-1/Douglas_Robert_Dundas", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Person"))))), + new DocumentImpl(TEXTS[1], "http://www.ontologydesignpatterns.org/data/oke-challenge/task-1/sentence-2", + Arrays.asList( + (Marking) new TypedNamedEntity(22, 14, "http://dbpedia.org/resource/James_Carville", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Person"))), + (Marking) new TypedNamedEntity(57, 17, + "http://dbpedia.org/resource/Political_consulting", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Role"))), + (Marking) new TypedNamedEntity(78, 12, "http://dbpedia.org/resource/Bill_Clinton", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Person"))), + (Marking) new TypedNamedEntity(96, 13, "http://dbpedia.org/resource/Donna_Brazile", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Person"))), + (Marking) new TypedNamedEntity(115, 16, "http://dbpedia.org/resource/Campaign_manager", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Role"))), + (Marking) new TypedNamedEntity(184, 7, "http://dbpedia.org/resource/Al_Gore", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Person"))))), + new DocumentImpl(TEXTS[2], "http://www.ontologydesignpatterns.org/data/oke-challenge/task-1/sentence-3", + Arrays.asList( + (Marking) new TypedNamedEntity(4, 7, + "http://www.ontologydesignpatterns.org/data/oke-challenge/task-1/Senator_1", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Person"))), + (Marking) new TypedNamedEntity(49, 19, + 
"http://dbpedia.org/resource/Columbia_University", + new HashSet(Arrays.asList("http://www.w3.org/2002/07/owl#Individual", + "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Organization"))))) }, + GOLD_STD, Matching.WEAK_ANNOTATION_MATCH, new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0 } }); + return testConfigs; + } + + private Document annotatorResults[]; + private DatasetConfiguration dataset; + private double expectedResults[]; + private Matching matching; + + public OKEChallengeTask1RT2KBTest(Document[] annotatorResults, DatasetConfiguration dataset, Matching matching, + double[] expectedResults) { + this.annotatorResults = annotatorResults; + this.dataset = dataset; + this.expectedResults = expectedResults; + this.matching = matching; + } + + @Test + public void test() { + int experimentTaskId = 1; + SimpleLoggingResultStoringDAO4Debugging experimentDAO = new SimpleLoggingResultStoringDAO4Debugging(); + ExperimentTaskConfiguration configuration = new ExperimentTaskConfiguration( + new TestAnnotatorConfiguration(Arrays.asList(annotatorResults), ExperimentType.RT2KB), dataset, + ExperimentType.RT2KB, matching); + runTest(experimentTaskId, experimentDAO, new EvaluatorFactory(URI_KB_CLASSIFIER), configuration, + new F1MeasureTestingObserver(this, experimentTaskId, experimentDAO, expectedResults)); + } +} diff --git a/src/test/java/org/aksw/gerbil/semantic/sameas/index/IndexerTest.java b/src/test/java/org/aksw/gerbil/semantic/sameas/index/IndexerTest.java new file mode 100644 index 000000000..c1e799b27 --- /dev/null +++ b/src/test/java/org/aksw/gerbil/semantic/sameas/index/IndexerTest.java @@ -0,0 +1,64 @@ +package org.aksw.gerbil.semantic.sameas.index; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +import org.aksw.gerbil.exceptions.GerbilException; +import org.junit.Test; + +public class IndexerTest { + + + @Test + public void testTerm() throws GerbilException, IOException{ + //Test if indexing and searching works + //1. make some same as retrievals + //2. index them + File indexFolder = createTempDirectory(); + //Test if folder could be created + assertTrue(indexFolder!=null); + Indexer index = new Indexer(indexFolder.getAbsolutePath()); + index.index("http://dbpedia.org/resource/Scar", getList("http://dbpedia.org")); + index.index("http://wikipedia.org/a", getList("http://wikipedia.org")); + index.index("http://de.dbpedia.org/a", getList("http://de.dbpedia.org")); + index.close(); + //3. search for one that exists + Searcher search = new Searcher(indexFolder.getAbsolutePath()); + assertFalse(search.search("http://wikipedia.org/a").isEmpty()); + assertTrue(search.search("http://wikipedia.org/d").isEmpty()); + assertFalse(search.search("http://dbpedia.org/resource/Scar").isEmpty()); + //4. 
search for one that doesn't exist
+		assertTrue(search.search("http://wikipedia.org/ab").isEmpty());
+		search.close();
+	}
+
+	public List<String> getList(String prefix) {
+		// Builds a small, artificial sameAs list for the given URI prefix.
+		List<String> sameAs = new LinkedList<String>();
+		sameAs.add(prefix + "/a");
+		sameAs.add(prefix + "/b");
+		sameAs.add(prefix + "/c");
+		sameAs.add(prefix + "/d");
+		return sameAs;
+	}
+
+	public static File createTempDirectory() throws IOException {
+		File temp = File.createTempFile("temp_index", Long.toString(System.nanoTime()));
+		// Replace the temporary file by a directory of the same name.
+		if (temp.exists() && !temp.delete()) {
+			return null;
+		}
+		if (!temp.mkdir()) {
+			return null;
+		}
+		return temp;
+	}
+
+}
diff --git a/src/test/java/org/aksw/gerbil/web/config/check/CheckerTest.java b/src/test/java/org/aksw/gerbil/web/config/check/CheckerTest.java
new file mode 100644
index 000000000..19ac70463
--- /dev/null
+++ b/src/test/java/org/aksw/gerbil/web/config/check/CheckerTest.java
@@ -0,0 +1,83 @@
+package org.aksw.gerbil.web.config.check;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(Parameterized.class)
+public class CheckerTest {
+
+    @Parameters
+    public static Collection<Object[]> data() throws IOException {
+        File existingFile = File.createTempFile("checker", "test");
+        File deletedFile = File.createTempFile("checker", "test");
+        Assert.assertTrue(deletedFile.delete());
+
+        File existingDir = File.createTempFile("checker", "test");
+        Assert.assertTrue(existingDir.delete());
+        Assert.assertTrue(existingDir.mkdir());
+
+        List<Object[]> testConfigs = new ArrayList<Object[]>();
+
+        FileChecker fileChecker = new FileChecker();
+        // Check existing file
+        testConfigs.add(new Object[] { fileChecker, new Object[] { existingFile }, true });
+        testConfigs.add(new Object[] { fileChecker, new Object[] { existingFile.toString() }, true });
+        testConfigs.add(new Object[] { fileChecker, new Object[] { existingFile.getAbsolutePath() }, true });
+        // Check deleted file
+        testConfigs.add(new Object[] { fileChecker, new Object[] { deletedFile }, false });
+        testConfigs.add(new Object[] { fileChecker, new Object[] { deletedFile.toString() }, false });
+        testConfigs.add(new Object[] { fileChecker, new Object[] { deletedFile.getAbsolutePath() }, false });
+        // Check more than one file
+        testConfigs.add(new Object[] { fileChecker, new Object[] { existingFile, existingFile }, true });
+        testConfigs.add(new Object[] { fileChecker, new Object[] { existingFile, deletedFile }, false });
+        testConfigs.add(new Object[] { fileChecker, new Object[] { deletedFile, deletedFile }, false });
+        // Check existing directory
+        testConfigs.add(new Object[] { fileChecker, new Object[] { existingDir }, false });
+        testConfigs.add(new Object[] { fileChecker, new Object[] { existingDir.toString() }, false });
+        testConfigs.add(new Object[] { fileChecker, new Object[] { existingDir.getAbsolutePath() }, false });
+
+        DirectoryChecker dirChecker = new DirectoryChecker();
+        // Check existing directory
+        testConfigs.add(new Object[] { dirChecker, new Object[] { existingDir }, true });
+        testConfigs.add(new Object[] { dirChecker, new Object[] { existingDir.toString() }, true });
+        testConfigs.add(new Object[] { dirChecker, new Object[] { existingDir.getAbsolutePath() }, true });
+        // Check deleted file
+        testConfigs.add(new Object[] { dirChecker, new Object[] { deletedFile }, false });
+
testConfigs.add(new Object[] { dirChecker, new Object[] { deletedFile.toString() }, false }); + testConfigs.add(new Object[] { dirChecker, new Object[] { deletedFile.getAbsolutePath() }, false }); + // Check more than one directory + testConfigs.add(new Object[] { dirChecker, new Object[] { existingDir, existingDir }, true }); + testConfigs.add(new Object[] { dirChecker, new Object[] { existingDir, deletedFile }, false }); + testConfigs.add(new Object[] { dirChecker, new Object[] { deletedFile, deletedFile }, false }); + // Check existing file + testConfigs.add(new Object[] { dirChecker, new Object[] { existingFile }, false }); + testConfigs.add(new Object[] { dirChecker, new Object[] { existingFile.toString() }, false }); + testConfigs.add(new Object[] { dirChecker, new Object[] { existingFile.getAbsolutePath() }, false }); + + return testConfigs; + } + + private Checker checker; + private Object arguments[]; + private boolean expectedResult; + + public CheckerTest(Checker checker, Object[] arguments, boolean expectedResult) { + this.checker = checker; + this.arguments = arguments; + this.expectedResult = expectedResult; + } + + @Test + public void test() { + Assert.assertEquals(expectedResult, checker.check(arguments)); + } +} diff --git a/src/test/resources/datasets/senseval/test.xml b/src/test/resources/datasets/senseval/test.xml new file mode 100644 index 000000000..ed216cbaf --- /dev/null +++ b/src/test/resources/datasets/senseval/test.xml @@ -0,0 +1,59 @@ + + + + The + art + of + change-ringing + + is + peculiar + + to + the + English + + , + and + , + like + most + + English + + peculiarities + + , + unintelligible + + to + the + rest + + of + the + world + + . + + + -- + Dorothy + L. + Sayers + , + `` + The + Nine + Tailors + + `` + + + ASLACTON + , + England + + + + \ No newline at end of file diff --git a/start.sh b/start.sh index f167dc2ea..baade1d36 100755 --- a/start.sh +++ b/start.sh @@ -1,17 +1,128 @@ #!/bin/bash +# This script is part of the GERBIL project. +# It reuses functions from Mitch Frazier (http://www.linuxjournal.com/content/asking-yesno-question-bash-script) + +##################################################################### +# Print warning message. + +function warning() +{ + echo "$*" >&2 +} + +##################################################################### +# Print error message and exit. + +function error() +{ + echo "$*" >&2 + exit 1 +} + + +##################################################################### +# Ask yesno question. +# +# Usage: yesno OPTIONS QUESTION +# +# Options: +# --timeout N Timeout if no input seen in N seconds. +# --default ANS Use ANS as the default answer on timeout or +# if an empty answer is provided. +# +# Exit status is the answer. + +function yesno() +{ + local ans + local ok=0 + local timeout=0 + local default + local t + + while [[ "$1" ]] + do + case "$1" in + --default) + shift + default=$1 + if [[ ! "$default" ]]; then error "Missing default value"; fi + t=$(tr '[:upper:]' '[:lower:]' <<<$default) + + if [[ "$t" != 'y' && "$t" != 'yes' && "$t" != 'n' && "$t" != 'no' ]]; then + error "Illegal default answer: $default" + fi + default=$t + shift + ;; + + --timeout) + shift + timeout=$1 + if [[ ! "$timeout" ]]; then error "Missing timeout value"; fi + if [[ ! "$timeout" =~ ^[0-9][0-9]*$ ]]; then error "Illegal timeout value: $timeout"; fi + shift + ;; + + -*) + error "Unrecognized option: $1" + ;; + + *) + break + ;; + esac + done + + if [[ $timeout -ne 0 && ! 
"$default" ]]; then + error "Non-zero timeout requires a default answer" + fi + + if [[ ! "$*" ]]; then error "Missing question"; fi + + while [[ $ok -eq 0 ]] + do + if [[ $timeout -ne 0 ]]; then + if ! read -t $timeout -p "$*" ans; then + ans=$default + else + # Turn off timeout if answer entered. + timeout=0 + if [[ ! "$ans" ]]; then ans=$default; fi + fi + else + read -p "$*" ans + if [[ ! "$ans" ]]; then + ans=$default + else + ans=$(tr '[:upper:]' '[:lower:]' <<<$ans) + fi + fi + + if [[ "$ans" == 'y' || "$ans" == 'yes' || "$ans" == 'n' || "$ans" == 'no' ]]; then + ok=1 + fi + + if [[ $ok -eq 0 ]]; then warning "Valid answers are: yes y no n"; fi + done + [[ "$ans" = "y" || "$ans" == "yes" ]] +} + +##################################################################### +# Check for dependencies echo "Checking dependencies..." file="gerbil_data/gerbil_data.zip" -url="https://github.com/AKSW/gerbil/releases/download/v1.2.4/gerbil_data.zip" +url="https://github.com/AKSW/gerbil/releases/download/v1.2.5/gerbil_data.zip" if [ ! -d "gerbil_data" ]; then - mkdir -p "gerbil_data" || exit 1 + mkdir -p "gerbil_data" || error "Could not create gerbil_data directory" if [ ! -f "$file" ]; then echo "Downloading dependencies ... ($url)" curl --retry 4 -L -o "$file" "$url" if [ ! -f "$file" ]; then - echo "Couldn't downloading dependency data: $file" + error "Couldn't downloading dependency data: $file" else echo "Extracting dependencies ... " unzip "$file" @@ -19,13 +130,15 @@ if [ ! -d "gerbil_data" ]; then fi fi +##################################################################### +# Check for property file echo "Checking properties files..." dir="src/main/properties" file="$dir/gerbil_keys.properties" if [ ! -f "$file" ]; then echo "Creating empty $file file" - mkdir -p "$dir"; + mkdir -p "$dir" || error "Could not create $dir directory" echo "##############################################################################" > $file echo "# This is the properties file contains our keys for several annotator web #" >> $file echo "# services. #" >> $file @@ -33,4 +146,47 @@ if [ ! -f "$file" ]; then echo "##############################################################################" >> $file fi +##################################################################### +# Check for dbpedia sameAs index +echo "Checking dbpedia sameAs index..." +if [ ! -d "indexes/dbpedia" ]; then + echo "Couldn't find a dbpedia sameAs index" + if yesno "Should the index be downloaded (~1GB zipped, ~2GB extracted)? (yes/no): "; then + mkdir -p "indexes/dbpedia" || error "Could not create indexes/dbpedia directory" + file="indexes/dbpedia/dbpedia_index.zip" + url="http://139.18.2.164/mroeder/gerbil/dbpedia_index.zip" + echo "Downloading index ... ($url)" + curl --retry 4 -L -o "$file" "$url" + + if [ ! -f "$file" ]; then + echo "Couldn't downloading index file: $file" + else + echo "Extracting index ... " + unzip "$file" -d "indexes/dbpedia" + fi + fi +fi + +##################################################################### +# Check for dbpedia entity check index +echo "Checking dbpedia entity check index..." +if [ ! -d "indexes/dbpedia_check" ]; then + echo "Couldn't find a dbpedia entity check index" + if yesno "Should the index be downloaded (~0.3GB zipped, ~0.7GB extracted)? 
(yes/no): "; then + mkdir -p "indexes/dbpedia_check" || error "Could not create indexes/dbpedia_check directory" + file="indexes/dbpedia_check/dbpedia_check_index.zip" + url="http://139.18.2.164/mroeder/gerbil/dbpedia_check_index.zip" + echo "Downloading index ... ($url)" + curl --retry 4 -L -o "$file" "$url" + + if [ ! -f "$file" ]; then + echo "Couldn't downloading index file: $file" + else + echo "Extracting index ... " + unzip "$file" -d "indexes/dbpedia_check" + fi + fi +fi + +echo "Building and starting GERBIL..." mvn clean org.apache.tomcat.maven:tomcat7-maven-plugin:2.2:run -Dmaven.tomcat.port=1234