diff --git a/.gitignore b/.gitignore
index 5676083f4..e1e426526 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,6 @@ gerbil_data
*.log
google*.html
export
-datadump.nt
\ No newline at end of file
+datadump.nt
+indexes
+dependency-reduced-pom.xml
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index d3f168a11..d734b7006 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,8 +3,9 @@ language: java
before_install:
- cp src/main/properties/log4j.properties src/test/resources/log4j.properties
- mkdir -p "gerbil_data"
- - curl --retry 4 -L -o "gerbil_data/gerbil_data.zip" "https://github.com/AKSW/gerbil/releases/download/v1.2.4/gerbil_data.zip"
+ - curl --retry 4 -L -o "gerbil_data/gerbil_data.zip" "https://github.com/AKSW/gerbil/releases/download/v1.2.5/gerbil_data.zip"
- unzip "gerbil_data/gerbil_data.zip"
+ - touch src/main/properties/gerbil_keys.properties
install:
- mvn clean compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V
script:
diff --git a/index.sh b/index.sh
new file mode 100644
index 000000000..5b9e579ec
--- /dev/null
+++ b/index.sh
@@ -0,0 +1,29 @@
+mkdir dbpedia_dump
+cd dbpedia_dump
+
+wget -r --no-parent -R "*.txt,*.html,*.json" -A "*.nt,*.ttl,*.nt.bz2,*.ttl.bz2" http://downloads.dbpedia.org/2016-04/core-i18n/en/
+cd downloads.dbpedia.org/2016-04/core-i18n/en/
+
+wget http://www.l3s.de/~minack/rdf2rdf/downloads/rdf2rdf-1.0.1-2.3.1.jar
+
+
+rm *.json
+rm *.txt
+rm index.html
+
+for i in *.bz2; do
+ bzip2 -vd "$i"
+done
+
+for i in *.ttl; do
+ java -jar rdf2rdf-1.0.1-2.3.1.jar "$i" .nt
+done
+
+rm *.ttl
+rm rdf2rdf-1.0.1-2.3.1.jar
+
+cd ../../../../../
+
+mvn exec:java -Dexec.mainClass="org.aksw.gerbil.tools.InitialIndexTool" -Dexec.args="dbpedia_dump/downloads.dbpedia.org/2016-04/core-i18n/en/"
+
+rm -rf dbpedia_dump/
diff --git a/pom.xml b/pom.xml
index cb31ad3e2..b6b1e9589 100644
--- a/pom.xml
+++ b/pom.xml
@@ -12,8 +12,8 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.aksw</groupId>
- <artifactId>gerbil</artifactId>
- <version>1.2.4</version>
+ <artifactId>gerbil</artifactId>
+ <version>1.2.5</version>
<name>General Entity Annotator Benchmark</name>
<description>This project is a benchmark for entity annotation and disambiguation tools.</description>
<inceptionYear>2014</inceptionYear>
@@ -55,8 +55,8 @@
<groupId>org.aksw</groupId>
- <artifactId>gerbil.nif.transfer</artifactId>
- <version>1.2.2</version>
+ <artifactId>gerbil.nif.transfer</artifactId>
+ <version>1.2.3</version>
@@ -142,7 +142,7 @@
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
- <version>2.9.1</version>
+ <version>6.2.0</version>
</dependency>
<dependency>
<groupId>commons-configuration</groupId>
@@ -311,6 +311,11 @@
<artifactId>json</artifactId>
<version>20140107</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <version>6.2.0</version>
+ </dependency>
diff --git a/src/main/java/org/aksw/gerbil/annotator/OKETask1Annotator.java b/src/main/java/org/aksw/gerbil/annotator/OKETask1Annotator.java
index ccb35734c..f8a332bb6 100644
--- a/src/main/java/org/aksw/gerbil/annotator/OKETask1Annotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/OKETask1Annotator.java
@@ -22,7 +22,7 @@
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
-public interface OKETask1Annotator extends A2KBAnnotator, EntityTyper {
+public interface OKETask1Annotator extends A2KBAnnotator, RT2KBAnnotator {
public List<TypedNamedEntity> performTask1(Document document) throws GerbilException;
}
diff --git a/src/main/java/org/aksw/gerbil/annotator/RT2KBAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/RT2KBAnnotator.java
new file mode 100644
index 000000000..a9284d8e1
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/annotator/RT2KBAnnotator.java
@@ -0,0 +1,28 @@
+/**
+ * This file is part of General Entity Annotator Benchmark.
+ *
+ * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.aksw.gerbil.annotator;
+
+import java.util.List;
+
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.TypedSpan;
+
+public interface RT2KBAnnotator extends EntityRecognizer, EntityTyper {
+
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException;
+}
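
For orientation, a minimal sketch of an annotator implementing the new interface. The class name, the AbstractAnnotator base with a name-taking constructor, and the stub bodies are illustrative assumptions, not part of this patch:

import java.util.List;

import org.aksw.gerbil.annotator.RT2KBAnnotator;
import org.aksw.gerbil.annotator.impl.AbstractAnnotator;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Span;
import org.aksw.gerbil.transfer.nif.TypedSpan;

// Hypothetical annotator: RT2KB combines entity recognition and typing.
public class ExampleRT2KBAnnotator extends AbstractAnnotator implements RT2KBAnnotator {

    public ExampleRT2KBAnnotator() {
        super("ExampleRT2KB"); // assumed AbstractAnnotator(String name) constructor
    }

    @Override
    public List<Span> performRecognition(Document document) throws GerbilException {
        // stub: a real annotator would detect entity mentions here
        return document.getMarkings(Span.class);
    }

    @Override
    public List<TypedSpan> performTyping(Document document) throws GerbilException {
        // stub: a real annotator would assign types to the recognized spans here
        return document.getMarkings(TypedSpan.class);
    }

    @Override
    public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
        // recognition and typing in a single call, as the interface defines
        return performTyping(document);
    }
}
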
diff --git a/src/main/java/org/aksw/gerbil/annotator/decorator/ErrorCountingAnnotatorDecorator.java b/src/main/java/org/aksw/gerbil/annotator/decorator/ErrorCountingAnnotatorDecorator.java
index 9eac748fc..a99acf2a9 100644
--- a/src/main/java/org/aksw/gerbil/annotator/decorator/ErrorCountingAnnotatorDecorator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/decorator/ErrorCountingAnnotatorDecorator.java
@@ -27,6 +27,7 @@
import org.aksw.gerbil.annotator.EntityTyper;
import org.aksw.gerbil.annotator.OKETask1Annotator;
import org.aksw.gerbil.annotator.OKETask2Annotator;
+import org.aksw.gerbil.annotator.RT2KBAnnotator;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.datatypes.ExperimentType;
import org.aksw.gerbil.evaluate.EvaluationResultContainer;
@@ -52,8 +53,8 @@
* @author Michael Röder (roeder@informatik.uni-leipzig.de)
*
*/
-public abstract class ErrorCountingAnnotatorDecorator extends AbstractAnnotatorDecorator
- implements Evaluator<Marking>, ErrorCounter {
+public abstract class ErrorCountingAnnotatorDecorator extends AbstractAnnotatorDecorator implements Evaluator<Marking>,
+ ErrorCounter {
private static final Logger LOGGER = LoggerFactory.getLogger(ErrorCountingAnnotatorDecorator.class);
@@ -82,6 +83,8 @@ public static ErrorCountingAnnotatorDecorator createDecorator(ExperimentType typ
return new ErrorCountingOKETask1Annotator((OKETask1Annotator) annotator, maxErrors);
case OKE_Task2:
return new ErrorCountingOKETask2Annotator((OKETask2Annotator) annotator, maxErrors);
+ case RT2KB:
+ return new ErrorCountingRT2KBAnnotator((RT2KBAnnotator) annotator, maxErrors);
case Rc2KB:
break;
case Sa2KB:
@@ -125,8 +128,8 @@ public List performD2KBTask(Document document) throws GerbilExcepti
}
}
- private static class ErrorCountingEntityRecognizer extends ErrorCountingAnnotatorDecorator
- implements EntityRecognizer {
+ private static class ErrorCountingEntityRecognizer extends ErrorCountingAnnotatorDecorator implements
+ EntityRecognizer {
public ErrorCountingEntityRecognizer(EntityRecognizer decoratedAnnotator, int maxErrors) {
super(decoratedAnnotator, maxErrors);
@@ -173,8 +176,24 @@ public List performTyping(Document document) throws GerbilException {
}
}
- private static class ErrorCountingOKETask1Annotator extends ErrorCountingA2KBAnnotator
- implements OKETask1Annotator {
+ private static class ErrorCountingRT2KBAnnotator extends ErrorCountingEntityRecognizer implements RT2KBAnnotator {
+
+ protected ErrorCountingRT2KBAnnotator(RT2KBAnnotator decoratedAnnotator, int maxErrors) {
+ super(decoratedAnnotator, maxErrors);
+ }
+
+ @Override
+ public List<TypedSpan> performTyping(Document document) throws GerbilException {
+ return ErrorCountingAnnotatorDecorator.performTyping(this, document);
+ }
+
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return ErrorCountingAnnotatorDecorator.performRT2KBTask(this, document);
+ }
+ }
+
+ private static class ErrorCountingOKETask1Annotator extends ErrorCountingA2KBAnnotator implements OKETask1Annotator {
protected ErrorCountingOKETask1Annotator(OKETask1Annotator decoratedAnnotator, int maxErrors) {
super(decoratedAnnotator, maxErrors);
@@ -185,14 +204,19 @@ public List performTyping(Document document) throws GerbilException {
return ErrorCountingAnnotatorDecorator.performTyping(this, document);
}
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return ErrorCountingAnnotatorDecorator.performRT2KBTask(this, document);
+ }
+
@Override
public List<TypedNamedEntity> performTask1(Document document) throws GerbilException {
return ErrorCountingAnnotatorDecorator.performOKETask1(this, document);
}
}
- private static class ErrorCountingOKETask2Annotator extends ErrorCountingAnnotatorDecorator
- implements OKETask2Annotator {
+ private static class ErrorCountingOKETask2Annotator extends ErrorCountingAnnotatorDecorator implements
+ OKETask2Annotator {
protected ErrorCountingOKETask2Annotator(OKETask2Annotator decoratedAnnotator, int maxErrors) {
super(decoratedAnnotator, maxErrors);
@@ -269,8 +293,8 @@ protected static List performD2KBTask(ErrorCountingAnnotatorDecorat
return result;
}
- protected static List<MeaningSpan> performExtraction(ErrorCountingAnnotatorDecorator errorCounter,
- Document document) throws GerbilException {
+ protected static List<MeaningSpan> performExtraction(ErrorCountingAnnotatorDecorator errorCounter, Document document)
+ throws GerbilException {
List<MeaningSpan> result = null;
try {
result = ((A2KBAnnotator) errorCounter.getDecoratedAnnotator()).performA2KBTask(document);
@@ -384,6 +408,29 @@ protected static List performOKETask2(ErrorCountingAnnotatorDe
return result;
}
+ public static List<TypedSpan> performRT2KBTask(ErrorCountingAnnotatorDecorator errorCounter, Document document)
+ throws GerbilException {
+ List<TypedSpan> result = null;
+ try {
+ result = ((RT2KBAnnotator) errorCounter.getDecoratedAnnotator()).performRT2KBTask(document);
+ } catch (Exception e) {
+ if (errorCounter.getErrorCount() == 0) {
+ // Log only the first exception completely
+ LOGGER.error("Got an Exception from the annotator (" + errorCounter.getName() + ")", e);
+ } else {
+ // Log only the Exception message without the stack trace
+ LOGGER.error("Got an Exception from the annotator (" + errorCounter.getName() + "): "
+ + e.getLocalizedMessage());
+ }
+ errorCounter.increaseErrorCount();
+ return new ArrayList<TypedSpan>(0);
+ }
+ if (printDebugMsg && LOGGER.isDebugEnabled()) {
+ logResult(result, errorCounter.getName(), "TypedNamedEntity");
+ }
+ return result;
+ }
+
protected int errorCount = 0;
protected int maxErrors;
diff --git a/src/main/java/org/aksw/gerbil/annotator/decorator/SingleInstanceSecuringAnnotatorDecorator.java b/src/main/java/org/aksw/gerbil/annotator/decorator/SingleInstanceSecuringAnnotatorDecorator.java
index 5c8f26bf7..84c93ef71 100644
--- a/src/main/java/org/aksw/gerbil/annotator/decorator/SingleInstanceSecuringAnnotatorDecorator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/decorator/SingleInstanceSecuringAnnotatorDecorator.java
@@ -29,6 +29,7 @@
import org.aksw.gerbil.annotator.EntityTyper;
import org.aksw.gerbil.annotator.OKETask1Annotator;
import org.aksw.gerbil.annotator.OKETask2Annotator;
+import org.aksw.gerbil.annotator.RT2KBAnnotator;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.datatypes.ExperimentType;
import org.aksw.gerbil.exceptions.GerbilException;
@@ -73,6 +74,8 @@ public static SingleInstanceSecuringAnnotatorDecorator createDecorator(Experimen
return new SingleInstanceSecuringOKETask1Annotator((OKETask1Annotator) annotator);
case OKE_Task2:
return new SingleInstanceSecuringOKETask2Annotator((OKETask2Annotator) annotator);
+ case RT2KB:
+ return new SingleInstanceSecuringRT2KBAnnotator((RT2KBAnnotator) annotator);
case Rc2KB:
break;
case Sa2KB:
@@ -86,8 +89,8 @@ public static SingleInstanceSecuringAnnotatorDecorator createDecorator(Experimen
return null;
}
- private static class SingleInstanceSecuringC2KBAnnotator extends SingleInstanceSecuringAnnotatorDecorator
- implements C2KBAnnotator {
+ private static class SingleInstanceSecuringC2KBAnnotator extends SingleInstanceSecuringAnnotatorDecorator implements
+ C2KBAnnotator {
public SingleInstanceSecuringC2KBAnnotator(C2KBAnnotator decoratedAnnotator) {
super(decoratedAnnotator);
@@ -99,8 +102,8 @@ public List performC2KB(Document document) throws GerbilException {
}
}
- private static class SingleInstanceSecuringD2KBAnnotator extends SingleInstanceSecuringAnnotatorDecorator
- implements D2KBAnnotator {
+ private static class SingleInstanceSecuringD2KBAnnotator extends SingleInstanceSecuringAnnotatorDecorator implements
+ D2KBAnnotator {
public SingleInstanceSecuringD2KBAnnotator(D2KBAnnotator decoratedAnnotator) {
super(decoratedAnnotator);
@@ -125,8 +128,8 @@ public List performRecognition(Document document) throws GerbilException {
}
}
- private static class SingleInstanceSecuringA2KBAnnotator extends SingleInstanceSecuringD2KBAnnotator
- implements A2KBAnnotator {
+ private static class SingleInstanceSecuringA2KBAnnotator extends SingleInstanceSecuringD2KBAnnotator implements
+ A2KBAnnotator {
public SingleInstanceSecuringA2KBAnnotator(A2KBAnnotator decoratedAnnotator) {
super(decoratedAnnotator);
@@ -149,8 +152,8 @@ public List performA2KBTask(Document document) throws GerbilExcepti
}
- private static class SingleInstanceSecuringEntityTyper extends SingleInstanceSecuringAnnotatorDecorator
- implements EntityTyper {
+ private static class SingleInstanceSecuringEntityTyper extends SingleInstanceSecuringAnnotatorDecorator implements
+ EntityTyper {
protected SingleInstanceSecuringEntityTyper(EntityTyper decoratedAnnotator) {
super(decoratedAnnotator);
@@ -162,8 +165,26 @@ public List performTyping(Document document) throws GerbilException {
}
}
- private static class SingleInstanceSecuringOKETask1Annotator extends SingleInstanceSecuringA2KBAnnotator
- implements OKETask1Annotator {
+ private static class SingleInstanceSecuringRT2KBAnnotator extends SingleInstanceSecuringEntityRecognizer implements
+ RT2KBAnnotator {
+
+ protected SingleInstanceSecuringRT2KBAnnotator(RT2KBAnnotator decoratedAnnotator) {
+ super(decoratedAnnotator);
+ }
+
+ @Override
+ public List<TypedSpan> performTyping(Document document) throws GerbilException {
+ return SingleInstanceSecuringAnnotatorDecorator.performTyping(this, document);
+ }
+
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return SingleInstanceSecuringAnnotatorDecorator.performRT2KBTask(this, document);
+ }
+ }
+
+ private static class SingleInstanceSecuringOKETask1Annotator extends SingleInstanceSecuringA2KBAnnotator implements
+ OKETask1Annotator {
protected SingleInstanceSecuringOKETask1Annotator(OKETask1Annotator decoratedAnnotator) {
super(decoratedAnnotator);
@@ -174,6 +195,11 @@ public List performTyping(Document document) throws GerbilException {
return SingleInstanceSecuringAnnotatorDecorator.performTyping(this, document);
}
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return SingleInstanceSecuringAnnotatorDecorator.performRT2KBTask(this, document);
+ }
+
@Override
public List<TypedNamedEntity> performTask1(Document document) throws GerbilException {
return SingleInstanceSecuringAnnotatorDecorator.performOKETask1(this, document);
@@ -247,8 +273,8 @@ protected static List performExtraction(SingleInstanceSecuringAnnot
return result;
}
- protected static List<TypedSpan> performTyping(SingleInstanceSecuringAnnotatorDecorator decorator,
- Document document) throws GerbilException {
+ protected static List<TypedSpan> performTyping(SingleInstanceSecuringAnnotatorDecorator decorator, Document document)
+ throws GerbilException {
List<TypedSpan> result = null;
try {
decorator.semaphore.acquire();
@@ -265,8 +291,8 @@ protected static List performTyping(SingleInstanceSecuringAnnotatorDe
return result;
}
- protected static List<Span> performRecognition(SingleInstanceSecuringAnnotatorDecorator decorator,
- Document document) throws GerbilException {
+ protected static List<Span> performRecognition(SingleInstanceSecuringAnnotatorDecorator decorator, Document document)
+ throws GerbilException {
List<Span> result = null;
try {
decorator.semaphore.acquire();
@@ -319,6 +345,24 @@ protected static List performOKETask2(SingleInstanceSecuringAn
return result;
}
+ protected static List<TypedSpan> performRT2KBTask(SingleInstanceSecuringAnnotatorDecorator decorator,
+ Document document) throws GerbilException {
+ List<TypedSpan> result = null;
+ try {
+ decorator.semaphore.acquire();
+ } catch (InterruptedException e) {
+ LOGGER.error("Interrupted while waiting for the Annotator's semaphore.", e);
+ throw new GerbilException("Interrupted while waiting for the Annotator's semaphore.", e,
+ ErrorTypes.UNEXPECTED_EXCEPTION);
+ }
+ try {
+ result = ((RT2KBAnnotator) decorator.getDecoratedAnnotator()).performRT2KBTask(document);
+ } finally {
+ decorator.semaphore.release();
+ }
+ return result;
+ }
+
/**
* Registers the given {@link Annotator} (if it is not already present in
* the registration) and returns its semaphore.
diff --git a/src/main/java/org/aksw/gerbil/annotator/decorator/TimeMeasuringAnnotatorDecorator.java b/src/main/java/org/aksw/gerbil/annotator/decorator/TimeMeasuringAnnotatorDecorator.java
index 63f11a7e1..33c49e829 100644
--- a/src/main/java/org/aksw/gerbil/annotator/decorator/TimeMeasuringAnnotatorDecorator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/decorator/TimeMeasuringAnnotatorDecorator.java
@@ -18,14 +18,15 @@
import java.util.List;
+import org.aksw.gerbil.annotator.A2KBAnnotator;
import org.aksw.gerbil.annotator.Annotator;
import org.aksw.gerbil.annotator.C2KBAnnotator;
-import org.aksw.gerbil.annotator.A2KBAnnotator;
import org.aksw.gerbil.annotator.D2KBAnnotator;
import org.aksw.gerbil.annotator.EntityRecognizer;
import org.aksw.gerbil.annotator.EntityTyper;
import org.aksw.gerbil.annotator.OKETask1Annotator;
import org.aksw.gerbil.annotator.OKETask2Annotator;
+import org.aksw.gerbil.annotator.RT2KBAnnotator;
import org.aksw.gerbil.datatypes.ExperimentType;
import org.aksw.gerbil.evaluate.DoubleEvaluationResult;
import org.aksw.gerbil.evaluate.EvaluationResultContainer;
@@ -48,8 +49,8 @@
* @author Michael Röder (roeder@informatik.uni-leipzig.de)
*
*/
-public abstract class TimeMeasuringAnnotatorDecorator extends AbstractAnnotatorDecorator
- implements Evaluator<Marking>, TimeMeasurer {
+public abstract class TimeMeasuringAnnotatorDecorator extends AbstractAnnotatorDecorator implements Evaluator<Marking>,
+ TimeMeasurer {
public static final String AVG_TIME_RESULT_NAME = "avg millis/doc";
@@ -70,6 +71,8 @@ public static TimeMeasuringAnnotatorDecorator createDecorator(ExperimentType typ
return new TimeMeasuringOKETask1Annotator((OKETask1Annotator) annotator);
case OKE_Task2:
return new TimeMeasuringOKETask2Annotator((OKETask2Annotator) annotator);
+ case RT2KB:
+ return new TimeMeasuringRT2KBAnnotator((RT2KBAnnotator) annotator);
case Rc2KB:
break;
case Sa2KB:
@@ -107,8 +110,8 @@ public List performD2KBTask(Document document) throws GerbilExcepti
}
}
- private static class TimeMeasuringEntityRecognizer extends TimeMeasuringAnnotatorDecorator
- implements EntityRecognizer {
+ private static class TimeMeasuringEntityRecognizer extends TimeMeasuringAnnotatorDecorator implements
+ EntityRecognizer {
public TimeMeasuringEntityRecognizer(EntityRecognizer decoratedAnnotator) {
super(decoratedAnnotator);
@@ -155,8 +158,24 @@ public List performTyping(Document document) throws GerbilException {
}
}
- private static class TimeMeasuringOKETask1Annotator extends TimeMeasuringA2KBAnnotator
- implements OKETask1Annotator {
+ private static class TimeMeasuringRT2KBAnnotator extends TimeMeasuringEntityRecognizer implements RT2KBAnnotator {
+
+ protected TimeMeasuringRT2KBAnnotator(RT2KBAnnotator decoratedAnnotator) {
+ super(decoratedAnnotator);
+ }
+
+ @Override
+ public List<TypedSpan> performTyping(Document document) throws GerbilException {
+ return TimeMeasuringAnnotatorDecorator.performTyping(this, document);
+ }
+
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return TimeMeasuringAnnotatorDecorator.performRT2KBTask(this, document);
+ }
+ }
+
+ private static class TimeMeasuringOKETask1Annotator extends TimeMeasuringA2KBAnnotator implements OKETask1Annotator {
protected TimeMeasuringOKETask1Annotator(OKETask1Annotator decoratedAnnotator) {
super(decoratedAnnotator);
@@ -167,14 +186,19 @@ public List performTyping(Document document) throws GerbilException {
return TimeMeasuringAnnotatorDecorator.performTyping(this, document);
}
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return TimeMeasuringAnnotatorDecorator.performRT2KBTask(this, document);
+ }
+
@Override
public List<TypedNamedEntity> performTask1(Document document) throws GerbilException {
return TimeMeasuringAnnotatorDecorator.performOKETask1(this, document);
}
}
- private static class TimeMeasuringOKETask2Annotator extends TimeMeasuringAnnotatorDecorator
- implements OKETask2Annotator {
+ private static class TimeMeasuringOKETask2Annotator extends TimeMeasuringAnnotatorDecorator implements
+ OKETask2Annotator {
protected TimeMeasuringOKETask2Annotator(OKETask2Annotator decoratedAnnotator) {
super(decoratedAnnotator);
@@ -204,8 +228,8 @@ protected static List performD2KBTask(TimeMeasuringAnnotatorDecorat
return result;
}
- protected static List<MeaningSpan> performExtraction(TimeMeasuringAnnotatorDecorator timeMeasurer,
- Document document) throws GerbilException {
+ protected static List<MeaningSpan> performExtraction(TimeMeasuringAnnotatorDecorator timeMeasurer, Document document)
+ throws GerbilException {
long startTime = System.currentTimeMillis();
List<MeaningSpan> result = null;
result = ((A2KBAnnotator) timeMeasurer.getDecoratedAnnotator()).performA2KBTask(document);
@@ -249,6 +273,15 @@ protected static List performOKETask2(TimeMeasuringAnnotatorDe
return result;
}
+ protected static List<TypedSpan> performRT2KBTask(TimeMeasuringAnnotatorDecorator timeMeasurer, Document document)
+ throws GerbilException {
+ long startTime = System.currentTimeMillis();
+ List<TypedSpan> result = null;
+ result = ((RT2KBAnnotator) timeMeasurer.getDecoratedAnnotator()).performRT2KBTask(document);
+ timeMeasurer.addCallRuntime(System.currentTimeMillis() - startTime);
+ return result;
+ }
+
protected long timeSum = 0;
protected int callCount = 0;
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/fox/FOXAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/fox/FOXAnnotator.java
index fdfb5b2bd..faaca97c1 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/fox/FOXAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/fox/FOXAnnotator.java
@@ -107,6 +107,11 @@ public List performTask1(Document document) throws GerbilExcep
return requestAnnotations(document).getMarkings(TypedNamedEntity.class);
}
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return requestAnnotations(document).getMarkings(TypedSpan.class);
+ }
+
protected Document requestAnnotations(Document document) throws GerbilException {
Document resultDoc = new DocumentImpl(document.getText(), document.getDocumentURI());
HttpEntity entity = new StringEntity(new JSONObject().put("input", document.getText()).put("type", "text")
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/fred/FredAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/fred/FredAnnotator.java
index d2f007540..7e111af43 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/fred/FredAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/fred/FredAnnotator.java
@@ -165,6 +165,11 @@ public List performTask1(Document document) throws GerbilExcep
return requestAnnotations(document).getMarkings(TypedNamedEntity.class);
}
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return requestAnnotations(document).getMarkings(TypedSpan.class);
+ }
+
@SuppressWarnings("unchecked")
protected static <T extends Marking> List<T> transformToClass(List<Marking> markings, Class<T> clazz) {
List<T> markingsWithClass = new ArrayList<T>();
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/instance/InstanceListBasedAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/instance/InstanceListBasedAnnotator.java
index 9313f3b85..da109723a 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/instance/InstanceListBasedAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/instance/InstanceListBasedAnnotator.java
@@ -25,26 +25,45 @@
public class InstanceListBasedAnnotator extends AbstractAnnotator implements A2KBAnnotator, C2KBAnnotator,
D2KBAnnotator, EntityRecognizer, EntityTyper, OKETask1Annotator, OKETask2Annotator {
+ /*
+ * The mapping has been changed to contain the length since we encountered
+ * problems with some datasets containing a document URI more than once.
+ * Inside the NIF file this is not a problem because the length is added to
+ * the document URI. However, since we remove the positions from the URIs,
+ * we have to add the length in this class.
+ */
+ /**
+ * Mapping of URI + text.length() to the documents.
+ */
protected Map<String, Document> uriInstanceMapping;
public InstanceListBasedAnnotator(String annotatorName, List instances) {
super(annotatorName);
this.uriInstanceMapping = new HashMap<String, Document>(instances.size());
for (Document document : instances) {
- uriInstanceMapping.put(document.getDocumentURI(), document);
+ uriInstanceMapping.put(generateDocUri(document.getDocumentURI(), document.getText().length()), document);
}
}
- protected Document getDocument(String uri) {
- if (uriInstanceMapping.containsKey(uri)) {
- return uriInstanceMapping.get(uri);
+ protected Document getDocument(String uri, int textLength) {
+ String mappingUri = generateDocUri(uri, textLength);
+ if (uriInstanceMapping.containsKey(mappingUri)) {
+ return uriInstanceMapping.get(mappingUri);
} else {
return null;
}
}
- protected <T extends Marking> List<T> getDocumentMarkings(String uri, Class<T> clazz) {
- Document result = this.getDocument(uri);
+ protected static String generateDocUri(String uri, int textLength) {
+ StringBuilder builder = new StringBuilder(uri.length() + 10);
+ builder.append(uri);
+ builder.append('_');
+ builder.append(textLength);
+ return builder.toString();
+ }
+
+ protected <T extends Marking> List<T> getDocumentMarkings(String uri, int textLength, Class<T> clazz) {
+ Document result = this.getDocument(uri, textLength);
if (result == null) {
return new ArrayList<T>(0);
} else {
@@ -54,36 +73,41 @@ protected List getDocumentMarkings(String uri, Class c
@Override
public List<TypedNamedEntity> performTask2(Document document) throws GerbilException {
- return getDocumentMarkings(document.getDocumentURI(), TypedNamedEntity.class);
+ return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), TypedNamedEntity.class);
}
@Override
public List<TypedNamedEntity> performTask1(Document document) throws GerbilException {
- return getDocumentMarkings(document.getDocumentURI(), TypedNamedEntity.class);
+ return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), TypedNamedEntity.class);
}
@Override
public List<TypedSpan> performTyping(Document document) throws GerbilException {
- return getDocumentMarkings(document.getDocumentURI(), TypedSpan.class);
+ return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), TypedSpan.class);
}
@Override
public List<Span> performRecognition(Document document) throws GerbilException {
- return getDocumentMarkings(document.getDocumentURI(), Span.class);
+ return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), Span.class);
}
@Override
public List<MeaningSpan> performD2KBTask(Document document) throws GerbilException {
- return getDocumentMarkings(document.getDocumentURI(), MeaningSpan.class);
+ return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), MeaningSpan.class);
}
@Override
public List<Meaning> performC2KB(Document document) throws GerbilException {
- return getDocumentMarkings(document.getDocumentURI(), Meaning.class);
+ return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), Meaning.class);
}
@Override
public List<MeaningSpan> performA2KBTask(Document document) throws GerbilException {
- return getDocumentMarkings(document.getDocumentURI(), MeaningSpan.class);
+ return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), MeaningSpan.class);
+ }
+
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return getDocumentMarkings(document.getDocumentURI(), document.getText().length(), TypedSpan.class);
}
}
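
The effect of the new keying, illustrated with made-up URIs and text lengths (a sketch, not part of the patch):

// generateDocUri appends '_' + text length to the document URI:
//   generateDocUri("http://example.org/doc1", 14) -> "http://example.org/doc1_14"
//   generateDocUri("http://example.org/doc1", 27) -> "http://example.org/doc1_27"
// Two dataset documents that share a URI but differ in their text no longer
// overwrite each other in uriInstanceMapping, and getDocument(uri, textLength)
// retrieves the instance whose text length matches the queried document.
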
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/nif/NIFBasedAnnotatorWebservice.java b/src/main/java/org/aksw/gerbil/annotator/impl/nif/NIFBasedAnnotatorWebservice.java
index cda71447a..b30be7c04 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/nif/NIFBasedAnnotatorWebservice.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/nif/NIFBasedAnnotatorWebservice.java
@@ -47,8 +47,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public class NIFBasedAnnotatorWebservice extends AbstractHttpBasedAnnotator
- implements OKETask2Annotator, OKETask1Annotator, A2KBAnnotator, EntityTyper {
+public class NIFBasedAnnotatorWebservice extends AbstractHttpBasedAnnotator implements OKETask2Annotator,
+ OKETask1Annotator, A2KBAnnotator, EntityTyper {
private static final Logger LOGGER = LoggerFactory.getLogger(NIFBasedAnnotatorWebservice.class);
@@ -104,6 +104,11 @@ public List performTask2(Document document) throws GerbilExcep
return performAnnotation(document, TypedNamedEntity.class);
}
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ return performAnnotation(document, TypedSpan.class);
+ }
+
protected <T extends Marking> List<T> performAnnotation(Document document, Class<T> resultClass)
throws GerbilException {
document = request(document);
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/spotlight/SpotlightAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/spotlight/SpotlightAnnotator.java
index 1c99a5d14..3e9cc5621 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/spotlight/SpotlightAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/spotlight/SpotlightAnnotator.java
@@ -33,12 +33,13 @@
import org.aksw.gerbil.transfer.nif.Span;
import org.aksw.gerbil.transfer.nif.TypedSpan;
import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
+import org.apache.commons.collections.ListUtils;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
-public class SpotlightAnnotator extends AbstractHttpBasedAnnotator
- implements OKETask1Annotator, EntityRecognizer, D2KBAnnotator, A2KBAnnotator, EntityTyper {
+public class SpotlightAnnotator extends AbstractHttpBasedAnnotator implements OKETask1Annotator, EntityRecognizer,
+ D2KBAnnotator, A2KBAnnotator, EntityTyper {
private static final String SERVICE_URL_PARAM_KEY = "org.aksw.gerbil.annotator.impl.spotlight.SpotlightAnnotator.ServieURL";
@@ -90,6 +91,17 @@ public List performTask1(Document document) throws GerbilExcep
return client.annotate(document);
}
+ @SuppressWarnings("unchecked")
+ @Override
+ public List<TypedSpan> performRT2KBTask(Document document) throws GerbilException {
+ List<TypedNamedEntity> list = client.annotate(document);
+ if (list != null) {
+ return (List<TypedSpan>) ListUtils.typedList(list, TypedSpan.class);
+ } else {
+ return null;
+ }
+ }
+
protected HttpPost createPostRequest(String url) {
return super.createPostRequest(url);
}
@@ -98,7 +110,7 @@ protected HttpPost createPostRequest(String url) {
protected void closeRequest(HttpUriRequest request) {
super.closeRequest(request);
}
-
+
@Override
public CloseableHttpClient getClient() {
return super.getClient();
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/xlisa/XLisaAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/xlisa/XLisaAnnotator.java
index 9c2413432..0b50c25f3 100644
--- a/src/main/java/org/aksw/gerbil/annotator/impl/xlisa/XLisaAnnotator.java
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/xlisa/XLisaAnnotator.java
@@ -9,7 +9,6 @@
import org.aksw.gerbil.annotator.A2KBAnnotator;
import org.aksw.gerbil.annotator.impl.AbstractAnnotator;
-import org.aksw.gerbil.config.GerbilConfiguration;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
@@ -29,18 +28,14 @@
public class XLisaAnnotator extends AbstractAnnotator implements A2KBAnnotator {
private static final String BASE_URI = "http://km.aifb.kit.edu/services/text-annotation/?";
- private static final String XLISA_LANG_1 = "org.aksw.gerbil.annotators.definition.XLisa.lang1";
- private static final String XLISA_LANG_2 = "org.aksw.gerbil.annotators.definition.XLisa.lang2";
- private static final String XLISA_KB = "org.aksw.gerbil.annotators.definition.XLisa.kb";
- private static final String XLISA_MODEL = "org.aksw.gerbil.annotators.definition.XLisa.model";
-
+
private String lang1,lang2,kb,model;
- public XLisaAnnotator(){
- this.lang1 = GerbilConfiguration.getInstance().getString(XLISA_LANG_1);
- this.lang2 = GerbilConfiguration.getInstance().getString(XLISA_LANG_2);
- this.kb = GerbilConfiguration.getInstance().getString(XLISA_KB);
- this.model = GerbilConfiguration.getInstance().getString(XLISA_MODEL);
+ public XLisaAnnotator(String lang1, String lang2, String kb, String model){
+ this.lang1 = lang1;
+ this.lang2 = lang2;
+ this.kb = kb;
+ this.model = model;
}
@Override
diff --git a/src/main/java/org/aksw/gerbil/dataset/check/impl/FileBasedCachingEntityCheckerManager.java b/src/main/java/org/aksw/gerbil/dataset/check/impl/FileBasedCachingEntityCheckerManager.java
index 2024a72bc..c3011d2ad 100644
--- a/src/main/java/org/aksw/gerbil/dataset/check/impl/FileBasedCachingEntityCheckerManager.java
+++ b/src/main/java/org/aksw/gerbil/dataset/check/impl/FileBasedCachingEntityCheckerManager.java
@@ -16,6 +16,7 @@
*/
package org.aksw.gerbil.dataset.check.impl;
+import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
@@ -89,25 +90,22 @@ public static ObjectLongOpenHashMap readCacheFile(File cacheFile) {
if (!cacheFile.exists() || cacheFile.isDirectory()) {
return null;
}
- FileInputStream fin = null;
- ObjectInputStream oin = null;
+ ObjectInputStream ois = null;
try {
- fin = new FileInputStream(cacheFile);
- oin = new ObjectInputStream(fin);
+ ois = new ObjectInputStream(new BufferedInputStream(new FileInputStream(cacheFile)));
// first, read the number of URIs
- int count = oin.readInt();
+ int count = ois.readInt();
String uri;
ObjectLongOpenHashMap<String> cache = new ObjectLongOpenHashMap<String>(2 * count);
for (int i = 0; i < count; ++i) {
- uri = (String) oin.readObject();
- cache.put(uri, oin.readLong());
+ uri = (String) ois.readObject();
+ cache.put(uri, ois.readLong());
}
return cache;
} catch (Exception e) {
LOGGER.error("Exception while reading cache file.", e);
} finally {
- IOUtils.closeQuietly(oin);
- IOUtils.closeQuietly(fin);
+ IOUtils.closeQuietly(ois);
}
return null;
}
@@ -121,8 +119,8 @@ public static ObjectLongOpenHashMap readCacheFile(File cacheFile) {
protected File cacheFile;
protected File tempCacheFile;
- protected FileBasedCachingEntityCheckerManager(ObjectLongOpenHashMap<String> cache,
- long cacheEntryLifetime, File cacheFile, File tempCacheFile) {
+ protected FileBasedCachingEntityCheckerManager(ObjectLongOpenHashMap<String> cache, long cacheEntryLifetime,
+ File cacheFile, File tempCacheFile) {
this.cache = cache;
this.cacheEntryLifetime = cacheEntryLifetime;
this.cacheFile = cacheFile;
diff --git a/src/main/java/org/aksw/gerbil/dataset/check/impl/HttpBasedEntityChecker.java b/src/main/java/org/aksw/gerbil/dataset/check/impl/HttpBasedEntityChecker.java
index 8ebab0afc..c7372b267 100644
--- a/src/main/java/org/aksw/gerbil/dataset/check/impl/HttpBasedEntityChecker.java
+++ b/src/main/java/org/aksw/gerbil/dataset/check/impl/HttpBasedEntityChecker.java
@@ -52,9 +52,10 @@ public boolean entityExists(String uri) {
} catch (IllegalArgumentException e) {
if (LOGGER.isDebugEnabled()) {
LOGGER.error("Exception while creating HTTP request. Returning false.", e);
- } else {
- LOGGER.error("Exception while creating HTTP request. Returning false. Exception: "
- + e.getLocalizedMessage());
+ // } else {
+ // LOGGER.error("Exception while creating HTTP request.
+ // Returning false. Exception: "
+ // + e.getLocalizedMessage());
}
return false;
}
@@ -66,9 +67,10 @@ public boolean entityExists(String uri) {
} catch (Exception e) {
if (LOGGER.isDebugEnabled()) {
LOGGER.error("Exception while sending HTTP request. Returning false.", e);
- } else {
- LOGGER.error(
- "Exception while sending HTTP request. Returning false. Exception: " + e.getLocalizedMessage());
+ // } else {
+ // LOGGER.error(
+ // "Exception while sending HTTP request. Returning false.
+ // Exception: " + e.getLocalizedMessage());
}
return false;
} finally {
diff --git a/src/main/java/org/aksw/gerbil/dataset/check/index/IndexBasedEntityChecker.java b/src/main/java/org/aksw/gerbil/dataset/check/index/IndexBasedEntityChecker.java
new file mode 100644
index 000000000..91eaf878e
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/check/index/IndexBasedEntityChecker.java
@@ -0,0 +1,67 @@
+package org.aksw.gerbil.dataset.check.index;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+
+import org.aksw.gerbil.dataset.check.EntityChecker;
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class IndexBasedEntityChecker implements EntityChecker, Closeable {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(IndexBasedEntityChecker.class);
+
+ public static final String URI_FIELD_NAME = "URI";
+
+ public static IndexBasedEntityChecker create(String indexDirPath) {
+ Directory indexDirectory = null;
+ try {
+ indexDirectory = FSDirectory.open(new File(indexDirPath).toPath());
+ IndexReader indexReader = DirectoryReader.open(indexDirectory);
+ IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+ return new IndexBasedEntityChecker(indexSearcher, indexDirectory, indexReader);
+ } catch (IOException e) {
+ LOGGER.error("Exception while trying to open index for entity checking. Returning null.", e);
+ IOUtils.closeQuietly(indexDirectory);
+ return null;
+ }
+ }
+
+ private IndexSearcher indexSearcher;
+ private Directory indexDirectory;
+ private IndexReader indexReader;
+
+ protected IndexBasedEntityChecker(IndexSearcher indexSearcher, Directory indexDirectory, IndexReader indexReader) {
+ this.indexSearcher = indexSearcher;
+ this.indexDirectory = indexDirectory;
+ this.indexReader = indexReader;
+ }
+
+ @Override
+ public boolean entityExists(String uri) {
+ TopDocs docs = null;
+ try {
+ TermQuery query = new TermQuery(new Term(URI_FIELD_NAME, uri));
+ docs = indexSearcher.search(query, 1);
+ } catch (IOException e) {
+ LOGGER.error("Got an exception while searching for \"" + uri + "\" in the index. Returning false.", e);
+ }
+ return (docs != null) && (docs.totalHits > 0);
+ }
+
+ public void close() throws IOException {
+ IOUtils.closeQuietly(indexReader);
+ IOUtils.closeQuietly(indexDirectory);
+ }
+
+}
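
A minimal usage sketch of the checker; the index path is an assumption, not taken from this patch:

// Opens a Lucene index and probes it with an exact-match TermQuery on the "URI" field.
IndexBasedEntityChecker checker = IndexBasedEntityChecker.create("indexes/dbpedia");
if (checker != null) {
    boolean exists = checker.entityExists("http://dbpedia.org/resource/Berlin");
    checker.close(); // declared to throw IOException; releases reader and directory
}
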
diff --git a/src/main/java/org/aksw/gerbil/dataset/check/index/Indexer.java b/src/main/java/org/aksw/gerbil/dataset/check/index/Indexer.java
new file mode 100644
index 000000000..770f00d25
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/check/index/Indexer.java
@@ -0,0 +1,65 @@
+package org.aksw.gerbil.dataset.check.index;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Indexer {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(Indexer.class);
+
+ public static Indexer create(String indexDirPath) {
+ Directory indexDirectory = null;
+ try {
+ indexDirectory = FSDirectory.open(new File(indexDirPath).toPath());
+ IndexWriterConfig config = new IndexWriterConfig();
+ config.setOpenMode(OpenMode.CREATE);
+ IndexWriter indexWriter = new IndexWriter(indexDirectory, config);
+ return new Indexer(indexDirectory, indexWriter);
+ } catch (IOException e) {
+ LOGGER.error("Exception while trying to create index writer for entity checking. Returning null.", e);
+ IOUtils.closeQuietly(indexDirectory);
+ return null;
+ }
+ }
+
+ private IndexWriter indexWriter;
+ private Directory indexDirectory;
+
+ protected Indexer(Directory dir, IndexWriter writer) {
+ this.indexWriter = writer;
+ this.indexDirectory = dir;
+ }
+
+ public void close() {
+ try {
+ indexWriter.commit();
+ } catch (IOException e) {
+ LOGGER.error("Error occured during final commit of Index Writer.", e);
+ }
+ IOUtils.closeQuietly(indexWriter);
+ IOUtils.closeQuietly(indexDirectory);
+ }
+
+ public void index(String uri) {
+ Document document = new Document();
+ document.add(new StringField(IndexBasedEntityChecker.URI_FIELD_NAME, uri, Field.Store.NO));
+ try {
+ indexWriter.addDocument(document);
+ } catch (IOException e) {
+ LOGGER.error("Couldn't write uri to index.", e);
+ e.printStackTrace();
+ }
+ }
+}
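
And the matching sketch for building such an index (same assumed path); the InitialIndexTool invoked in index.sh is presumably the driver that feeds the dump's URIs into this class:

Indexer indexer = Indexer.create("indexes/dbpedia");
if (indexer != null) {
    indexer.index("http://dbpedia.org/resource/Berlin"); // one un-stored StringField per URI
    indexer.close(); // commits, then closes the writer and the directory
}
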
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java
index 8f4aaeb92..77eb74dfb 100644
--- a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java
@@ -7,7 +7,9 @@
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
@@ -16,48 +18,44 @@
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
-import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
import org.apache.commons.io.IOUtils;
-import au.com.bytecode.opencsv.CSVReader;
+public class DerczynskiDataset extends AbstractDataset implements
+ InitializableDataset {
-public class DerczynskiDataset extends AbstractDataset implements InitializableDataset {
-
- private static final char SEPARATION_CHAR = '\t';
private static StringBuilder realTweet;
private String file;
- private List<Document> documents;
- private int firstDocId;
- private int lastDocId;
-
- public DerczynskiDataset(String file) {
- this.file = file;
- }
-
-
-
- @Override
- public int size() {
- return documents.size();
- }
-
- @Override
- public List<Document> getInstances() {
- return documents;
- }
-
- @Override
- public void init() throws GerbilException {
- this.documents = loadDocuments(new File(file));
- if ((firstDocId > 0) && (lastDocId > 0)) {
- this.documents = this.documents.subList(firstDocId - 1, lastDocId);
- }
- }
+ private List<Document> documents;
+ private int firstDocId;
+ private int lastDocId;
+
+ public DerczynskiDataset(String file) {
+ this.file = file;
+ }
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(file));
+ if ((firstDocId > 0) && (lastDocId > 0)) {
+ this.documents = this.documents.subList(firstDocId - 1, lastDocId);
+ }
+ }
protected List<Document> loadDocuments(File tweetsFile)
throws GerbilException {
BufferedReader reader = null;
-// CSVReader reader = null;
+ // CSVReader reader = null;
List<Document> documents = new ArrayList<Document>();
String documentUriPrefix = "http://" + getName() + "/";
try {
@@ -65,23 +63,23 @@ protected List loadDocuments(File tweetsFile)
new FileInputStream(tweetsFile), Charset.forName("UTF-8")));
String line = reader.readLine();
- int tweetIndex=0;
+ int tweetIndex = 0;
List<Marking> markings = new ArrayList<Marking>();
StringBuilder tweet = new StringBuilder("");
while (line != null) {
- if(line.trim().isEmpty()){
- //Get Markings
+ if (line.trim().isEmpty()) {
+ // Get Markings
markings = findMarkings(tweet.toString());
- //Save old tweet
- documents.add(new DocumentImpl(realTweet.toString(), documentUriPrefix
- + tweetIndex, markings));
- //New Tweet
+ // Save old tweet
+ documents.add(new DocumentImpl(realTweet.toString(),
+ documentUriPrefix + tweetIndex, markings));
+ // New Tweet
tweet.delete(0, tweet.length());
line = reader.readLine();
tweetIndex++;
continue;
}
- tweet.append(line+"\n");
+ tweet.append(line + "\n");
line = reader.readLine();
}
} catch (IOException e) {
@@ -89,29 +87,79 @@ protected List loadDocuments(File tweetsFile)
ErrorTypes.DATASET_LOADING_ERROR);
} finally {
IOUtils.closeQuietly(reader);
-// IOUtils.closeQuietly(bReader);
+ // IOUtils.closeQuietly(bReader);
}
return documents;
}
-
- public static List<Marking> findMarkings(String tweet){
- int start=0;
+
+ public static List<Marking> findMarkings(String tweet) {
+ int start = 0;
List<Marking> markings = new ArrayList<Marking>();
realTweet = new StringBuilder();
String[] line = tweet.split("\n");
- for(String tokenFull : line){
+ int i = 0;
+ for (String tokenFull : line) {
String[] token = tokenFull.split("\t+");
- realTweet.append(token[0]+" ");
- token[1]=token[1].trim();
- if(!token[1].trim().equals("O") && !token[1].trim().equals("NIL")){
- //TOken has URI
- markings.add(new NamedEntity(start, token[0].length(), token[1]));
+ realTweet.append(token[0] + " ");
+ token[1] = token[1].trim();
+ if (token.length > 2 && token[2].startsWith("B-")) {
+ String[] marking = getWholeMarking(line, i);
+ Set<String> types = new HashSet<String>();
+ types.add(marking[2]);
+ markings.add(new TypedNamedEntity(start, marking[0].length(),
+ marking[1], types));
+
}
- start+=token[0].length()+1;
+ start += token[0].length() + 1;
+ i++;
}
-
+
return markings;
}
+ private static String[] getWholeMarking(String line[], int pos) {
+ String[] ret = new String[3];
+ String[] token = line[pos].split("\t+");
+ StringBuilder name = new StringBuilder().append(token[0]);
+ if (!token[1].equals("O") & !token[1].equals("") && !token[1].equals("NIL"))
+ ret[1] = token[1];
+ else
+ ret[1] = "";
+ ret[2] = getType(token[2].substring(2));
+ for (int i = pos + 1; i < line.length; i++) {
+ token = line[i].split("\t+");
+
+ if (token.length > 2 && token[2].startsWith("I-")) {
+ name.append(" ").append(token[0]);
+ } else {
+ break;
+ }
+ }
+ ret[0] = name.toString();
+ return ret;
+ }
+
+ private static String getType(String type) {
+ switch (type) {
+ case "sportsteam":
+ return "http://dbpedia.org/ontology/SportsTeam";
+ case "person":
+ return "http://dbpedia.org/ontology/Person";
+ case "geo-loc":
+ return "http://dbpedia.org/ontology/Place";
+ case "facility":
+ return "http://dbpedia.org/ontology/Place";
+ case "movie":
+ return "http://dbpedia.org/ontology/Film";
+ case "tv-show":
+ return "http://dbpedia.org/ontology/TelevisionShow";
+ case "company":
+ return "http://dbpedia.org/ontology/company";
+ case "product":
+ return "http://dbpedia.org/ontology/product";
+ default:
+ return "";
+ }
+ }
}
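
To make the marking extraction concrete, a made-up tweet block in the dataset's token<TAB>URI<TAB>chunk layout and what findMarkings would derive from it:

// Input lines (tabs written as \t):
//   New\thttp://dbpedia.org/resource/New_York\tB-geo-loc
//   York\thttp://dbpedia.org/resource/New_York\tI-geo-loc
//   rocks\tO\tO
// getWholeMarking joins the B-/I- tokens into "New York" (length 8) and keeps
// the URI of the B- token; getType maps "geo-loc" to dbo:Place. findMarkings
// therefore returns one TypedNamedEntity(0, 8,
//   "http://dbpedia.org/resource/New_York",
//   {"http://dbpedia.org/ontology/Place"}).
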
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset.java
new file mode 100644
index 000000000..081de0595
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset.java
@@ -0,0 +1,161 @@
+/**
+ * This file is part of General Entity Annotator Benchmark.
+ *
+ * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.aksw.gerbil.dataset.impl.erd;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.RandomAccessFile;
+
+import java.nio.charset.Charset;
+import java.nio.file.Paths;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.aksw.gerbil.dataset.InitializableDataset;
+import org.aksw.gerbil.dataset.impl.AbstractDataset;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+
+import org.apache.commons.io.IOUtils;
+
+@Deprecated
+public class ERDDataset extends AbstractDataset implements InitializableDataset {
+
+ private static final String FREEBASE_URI = "https://www.googleapis.com/freebase";
+
+ private String file_text;
+ private String file_annotation;
+ private List<Document> documents;
+
+ public ERDDataset(String filetext, String fileannotation) {
+ this.file_text = filetext;
+ this.file_annotation = fileannotation;
+ }
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(file_text), new File(file_annotation));
+ }
+
+ private String generateDocumentUri(String fileName) {
+
+ StringBuilder builder = new StringBuilder();
+ builder.append("http://");
+ builder.append(name);
+ builder.append('/');
+ builder.append(Paths.get(fileName).getFileName().toString());
+
+ return builder.toString();
+
+ }
+
+ protected List<Document> loadDocuments(File textfile, File annotationfile) throws GerbilException {
+
+ if (!textfile.exists()) {
+ throw new GerbilException("The given text file (" + textfile.getAbsolutePath() + ") does not exist.", ErrorTypes.DATASET_LOADING_ERROR);
+ }
+ if (!annotationfile.exists()) {
+ throw new GerbilException("The given annotation file (" + annotationfile.getAbsolutePath() + ") does not exist.", ErrorTypes.DATASET_LOADING_ERROR);
+ }
+
+ List<Document> docs = new ArrayList<>();
+ String documentUri = generateDocumentUri(textfile.getAbsolutePath());
+
+ Map<String, ERDTrec> textMap = new HashMap<>();
+ String text_data = "";
+ byte[] filedata = new byte[(int) textfile.length()];
+ ERDTrec datatrec = null;
+ RandomAccessFile raf;
+
+ try {
+ raf = new RandomAccessFile(textfile, "r");
+ raf.seek(0);
+ raf.readFully(filedata);
+ text_data = new String(filedata);
+ raf.close();
+ } catch (IOException e) {
+ throw new GerbilException("Exception while reading text file of dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
+ }
+
+ int error = 0;
+ String[] text_split = text_data.split("\n");
+ for (String line : text_split) {
+ String[] line_part = line.split("\t");
+ String key;
+
+ if (line_part.length != 2) {
+ error++;
+ key = "ERROR " + error;
+ } else {
+ key = line_part[0];
+ }
+
+ datatrec = new ERDTrec(line, datatrec);
+ textMap.put(key, datatrec);
+ }
+
+ BufferedReader reader = null;
+ List<Marking> markings = new ArrayList<>();
+ String line;
+ try {
+ reader = new BufferedReader(new InputStreamReader(new FileInputStream(annotationfile), Charset.forName("UTF-8")));
+
+ while ((line = reader.readLine()) != null) {
+
+ String[] line_split = line.split("\t");
+ if (line_split.length != 5) continue;
+
+ datatrec = textMap.get(line_split[0]);
+ if (datatrec != null) {
+ int position = datatrec.getTextPosition(line_split[3]);
+ int length = line_split[3].length();
+ markings.add(new NamedEntity(position, length, FREEBASE_URI + line_split[2]));
+ }
+ }
+
+ } catch (IOException e) {
+ throw new GerbilException("Exception while reading annotation file of dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
+ } finally {
+ IOUtils.closeQuietly(reader);
+ }
+
+ docs.add(new DocumentImpl(text_data, documentUri, markings));
+
+ return docs;
+ }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset2.java b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset2.java
new file mode 100644
index 000000000..42d7f1222
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDDataset2.java
@@ -0,0 +1,126 @@
+package org.aksw.gerbil.dataset.impl.erd;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.aksw.gerbil.dataset.InitializableDataset;
+import org.aksw.gerbil.dataset.impl.AbstractDataset;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+
+import com.hp.hpl.jena.query.Query;
+import com.hp.hpl.jena.query.QueryExecution;
+import com.hp.hpl.jena.query.QueryExecutionFactory;
+import com.hp.hpl.jena.query.QueryFactory;
+
+public class ERDDataset2 extends AbstractDataset implements
+ InitializableDataset {
+
+ private List<Document> documents;
+ private String annotateFile;
+ private String textFile;
+
+ private String queryTemp = "PREFIX owl: <http://www.w3.org/2002/07/owl#> PREFIX freebase: <http://rdf.freebase.com/ns/> SELECT ?s WHERE {?s owl:sameAs freebase:%%v%%}";
+ private static final String DBPEDIA_SERVICE = "http://dbpedia.org/sparql";
+
+
+ public ERDDataset2(String textFile, String annotateFile) {
+ this.annotateFile = annotateFile;
+ this.textFile = textFile;
+ }
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(annotateFile), new File(
+ textFile));
+ }
+
+ private List<Document> loadDocuments(File annFile, File textFile) throws GerbilException {
+ List<Document> documents = new ArrayList<Document>();
+ String documentUriPrefix = "http://" + getName() + "/";
+ try (BufferedReader breader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(textFile), Charset.forName("UTF-8")))) {
+ String line;
+ List<Marking> markings = null;
+ while ((line = breader.readLine()) != null) {
+ if(line.isEmpty()){
+ continue;
+ }
+ String[] text = line.split("\t");
+
+ markings = findMarkings(text, annFile);
+ documents.add(new DocumentImpl(text[1], documentUriPrefix
+ + text[0], markings));
+ }
+ } catch (IOException e) {
+ throw new GerbilException("Exception while reading dataset.", e,
+ ErrorTypes.DATASET_LOADING_ERROR);
+ }
+
+ return documents;
+ }
+
+ private List<Marking> findMarkings(String[] text, File annFile) throws GerbilException {
+ List<Marking> markings = new ArrayList<Marking>();
+ try (BufferedReader breader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(annFile), Charset.forName("UTF-8")))) {
+ String line;
+
+ while ((line = breader.readLine()) != null) {
+ if(line.isEmpty()){
+ continue;
+ }
+ String[] annotation = line.split("\t");
+ int searchID = getTrecID(text[0]);
+ int annoID = getTrecID(annotation[0]);
+ if(searchID == annoID){
+ int start = text[1].indexOf(annotation[3]);
+ int length = annotation[3].length();
+
+ //FIXME time consuming!
+ String freebaseID = annotation[2].substring(1, annotation[2].length()).replace("/",".");
+ Query query = QueryFactory.create(queryTemp.replace("%%v%%", freebaseID));
+ QueryExecution qexec = QueryExecutionFactory.createServiceRequest(DBPEDIA_SERVICE, query);
+ String uri = qexec.execSelect().next().getResource("s").getURI();
+
+
+ markings.add(new NamedEntity(start, length, uri));
+ }
+ else if(annoID > searchID){
+ //There is no annotation for the given text
+ break;
+ }
+ }
+ } catch (IOException e) {
+ throw new GerbilException("Exception while reading dataset.", e,
+ ErrorTypes.DATASET_LOADING_ERROR);
+ }
+
+ return markings;
+ }
+
+ private int getTrecID(String trec){
+ return Integer.valueOf(trec.replace("TREC-", ""));
+ }
+
+}
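
For illustration, the SPARQL lookup issued per annotation line, using the owl/freebase prefixes declared in queryTemp and a made-up annotation id "/m/01abc" (the substring and replace calls turn it into "m.01abc"):

// Query string after queryTemp.replace("%%v%%", freebaseID):
//   PREFIX owl: <http://www.w3.org/2002/07/owl#>
//   PREFIX freebase: <http://rdf.freebase.com/ns/>
//   SELECT ?s WHERE {?s owl:sameAs freebase:m.01abc}
// The first binding of ?s is taken as the DBpedia URI for the entity; the
// FIXME above flags this remote call per annotation as time consuming.
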
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDTrec.java b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDTrec.java
new file mode 100644
index 000000000..394b87550
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/erd/ERDTrec.java
@@ -0,0 +1,56 @@
+/**
+ * This file is part of General Entity Annotator Benchmark.
+ *
+ * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.aksw.gerbil.dataset.impl.erd;
+
+public class ERDTrec {
+
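+ // Represents one line of an ERD TREC text file together with its character
+ // offset in the whole file (lines are assumed to be joined by one separator character).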
+ private String line;
+ private ERDTrec before;
+ private int lineNumber;
+ private int columnCount;
+
+ public ERDTrec(String line, ERDTrec before) {
+ this.line = line;
+ this.before = before;
+
+ if (before == null) {
+ this.lineNumber = 0;
+ this.columnCount = 0;
+ } else {
+ lineNumber = this.before.getLineNumber() + 1;
+ columnCount = this.before.getColumnCount() + 1;
+ }
+ }
+
+ public int getTextPosition(String text) {
+ int pos = line.indexOf(text);
+ // indexOf returns 0 for a match at the line start, so test >= 0;
+ // -1 (not found) is passed through unchanged.
+ if (pos >= 0) pos = columnCount + pos;
+ return pos;
+ }
+
+ protected String getLine(){
+ return this.line;
+ }
+
+ protected int getLineNumber(){
+ return this.lineNumber;
+ }
+
+ protected int getColumnCount(){
+ return this.columnCount + line.length();
+ }
+}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDataset.java
new file mode 100644
index 000000000..3f658dcf0
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDataset.java
@@ -0,0 +1,179 @@
+/**
+ * This file is part of General Entity Annotator Benchmark.
+ *
+ * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.aksw.gerbil.dataset.impl.gerdaq;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.aksw.gerbil.dataset.InitializableDataset;
+import org.aksw.gerbil.dataset.impl.AbstractDataset;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class GERDAQDataset extends AbstractDataset implements InitializableDataset {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(GERDAQDataset.class);
+
+ private static final String WIKIPEDIA_URI = "http://en.wikipedia.org/wiki/";
+ private static final String DBPEDIA_URI = "http://dbpedia.org/resource/";
+ private static final String ANNOTATION_TAG = "annotation";
+ private static final String DOCUMENT_TAG = "instance";
+
+ private String file;
+ private List<Document> documents;
+
+ public GERDAQDataset(String file) {
+ this.file = file;
+ }
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(file));
+ }
+
+ protected static String generateDocumentUri(String datasetName, String fileName) {
+ StringBuilder builder = new StringBuilder();
+ builder.append("http://");
+ builder.append(datasetName.replace(' ', '_'));
+ builder.append('/');
+ builder.append(fileName);
+ builder.append('_');
+ return builder.toString();
+ }
+
+ private List<Document> loadDocuments(File filePath) throws GerbilException {
+ List<Document> docs = new ArrayList<>();
+ if (!filePath.exists()) {
+ throw new GerbilException("The given file (" + filePath.getAbsolutePath() + ") is not existing.",
+ ErrorTypes.DATASET_LOADING_ERROR);
+ }
+
+ if (filePath.isDirectory()) {
+
+ String directoryPath = filePath.getAbsolutePath();
+ if (!directoryPath.endsWith(File.separator)) {
+ directoryPath = directoryPath + File.separator;
+ }
+
+ for (File tmpFile : new File(directoryPath).listFiles()) {
+ docs.addAll(createDocument(tmpFile));
+ }
+
+ } else {
+ docs.addAll(createDocument(filePath));
+ }
+
+ return docs;
+
+ }
+
+ private List<Document> createDocument(File file) throws GerbilException {
+ List<Document> documents = new ArrayList<Document>();
+ String documentUriStart = generateDocumentUri(name, file.getName());
+ InputStream inputStream = null;
+ InputSource is = null;
+ try {
+ inputStream = new BufferedInputStream(new FileInputStream(file));
+ is = new InputSource(inputStream);
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ SAXParser saxParser = factory.newSAXParser();
+
+ saxParser.parse(is, new DefaultHandler() {
+
+ private StringBuilder text = new StringBuilder();
+ private int markingStart;
+ private String markingTitle;
+ private List<Marking> markings;
+
+ @Override
+ public void startDocument() throws SAXException {
+ super.startDocument();
+ }
+
+ @Override
+ public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
+ throws SAXException {
+
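+ // "rank_0_title" presumably carries the top-ranked entity title for this annotation.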
+ if (qName.equals(ANNOTATION_TAG)) {
+ markingTitle = atts.getValue("rank_0_title");
+ if (markingTitle != null) {
+ markingStart = text.length();
+ markingTitle = markingTitle.replace(' ', '_');
+ } else {
+ // Avoids the NullPointerException an unconditional replace would cause here.
+ LOGGER.error("Found a marking without the necessary \"rank_0_title\" attribute.");
+ }
+ } else if (qName.equals(DOCUMENT_TAG)) {
+ this.markings = new ArrayList<>();
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) {
+ text.append(ch, start, length);
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
+ if (qName.equals(DOCUMENT_TAG)) {
+ documents.add(new DocumentImpl(text.toString(), documentUriStart + documents.size(), markings));
+ text.delete(0, text.length());
+ } else if (qName.equals(ANNOTATION_TAG) && (markingTitle != null)) {
+ markings.add(new NamedEntity(markingStart, text.length() - markingStart, new HashSet<String>(
+ Arrays.asList(DBPEDIA_URI + markingTitle, WIKIPEDIA_URI + markingTitle))));
+ }
+ }
+ });
+ } catch (Exception e) {
+ throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
+ } finally {
+ IOUtils.closeQuietly(inputStream);
+ }
+
+ return documents;
+ }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/iitb/IITB_XMLParser.java b/src/main/java/org/aksw/gerbil/dataset/impl/iitb/IITB_XMLParser.java
index a7e12a660..b770b3f13 100644
--- a/src/main/java/org/aksw/gerbil/dataset/impl/iitb/IITB_XMLParser.java
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/iitb/IITB_XMLParser.java
@@ -16,6 +16,7 @@
*/
package org.aksw.gerbil.dataset.impl.iitb;
+import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
@@ -30,6 +31,7 @@
import org.aksw.gerbil.exceptions.GerbilException;
import org.apache.commons.io.IOUtils;
import org.apache.xerces.jaxp.SAXParserFactoryImpl;
+import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class IITB_XMLParser {
@@ -46,18 +48,20 @@ public IITB_XMLParser() throws GerbilException {
}
public Map> parseAnnotationsFile(File file) throws IOException, SAXException {
- FileInputStream fin = null;
+ InputStream is = null;
try {
- fin = new FileInputStream(file);
- return parseAnnotationsStream(fin);
+ is = new BufferedInputStream(new FileInputStream(file));
+ return parseAnnotationsStream(is);
} finally {
- IOUtils.closeQuietly(fin);
+ IOUtils.closeQuietly(is);
}
}
public Map> parseAnnotationsStream(InputStream is) throws IOException, SAXException {
IITB_XMLHandler handler = new IITB_XMLHandler();
- parser.parse(is, handler);
+ InputSource is2 = new InputSource(is);
+ is2.setEncoding("UTF-8");
+ parser.parse(is2, handler);
return handler.getDocumentAnnotationsMap();
}
}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2013Dataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2013Dataset.java
index 0e4aede9e..7a2d456a4 100644
--- a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2013Dataset.java
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2013Dataset.java
@@ -92,7 +92,7 @@ protected List loadDocuments(File tweetsFile)
BufferedReader bReader = null;
CSVReader reader = null;
List documents = new ArrayList();
- String documentUriPrefix = "http//:" + getName() + "/";
+ String documentUriPrefix = "http://" + getName() + "/";
try {
bReader = new BufferedReader(new InputStreamReader(
new FileInputStream(tweetsFile), Charset.forName("UTF-8")));
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2014Dataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2014Dataset.java
index c0f13e3a8..6e92e5d5b 100644
--- a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2014Dataset.java
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2014Dataset.java
@@ -81,7 +81,7 @@ protected List loadDocuments(File tweetsFile) throws GerbilException {
BufferedReader bReader = null;
CSVReader reader = null;
List documents = new ArrayList();
- String documentUriPrefix = "http//:" + getName() + "/";
+ String documentUriPrefix = "http://" + getName() + "/";
try {
bReader = new BufferedReader(
new InputStreamReader(new FileInputStream(tweetsFile), Charset.forName("UTF-8")));
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2015Dataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2015Dataset.java
new file mode 100644
index 000000000..16dcf0d21
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2015Dataset.java
@@ -0,0 +1,152 @@
+package org.aksw.gerbil.dataset.impl.micro;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.aksw.gerbil.dataset.InitializableDataset;
+import org.aksw.gerbil.dataset.impl.AbstractDataset;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Microposts2015Dataset extends AbstractDataset implements
+ InitializableDataset {
+
+ private static final Logger LOGGER = LoggerFactory
+ .getLogger(Microposts2015Dataset.class);
+
+ protected List<Document> documents;
+ private String annotatedFile;
+ private String tweetsFile;
+
+ protected static int typeIndex = 4;
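+ // The 2015 TSV carries the entity type in column 4.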
+
+ public Microposts2015Dataset(String annotatedFile, String tweetsFile) {
+ this.annotatedFile = annotatedFile;
+ this.tweetsFile = tweetsFile;
+ }
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(annotatedFile), new File(
+ tweetsFile));
+ }
+
+ protected List<Document> loadDocuments(File annotations, File tweetsFile)
+ throws GerbilException {
+
+ List<Document> documents = new ArrayList<Document>();
+ String documentUriPrefix = "http://" + getName() + "/";
+
+ try (BufferedReader bReader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(tweetsFile), Charset.forName("UTF-8")))) {
+ String line;
+ List<Marking> markings;
+ while ((line = bReader.readLine()) != null) {
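+ // Each line holds a tab-separated pair: tweet id and tweet text.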
+ String[] tweet = line.split("\t");
+ if (tweet.length < 2) {
+ continue;
+ }
+ String id = tweet[0];
+ String text = tweet[1];
+ markings = findMarkings(getMarkingLines(annotations, id), text);
+ documents.add(new DocumentImpl(text, documentUriPrefix + id,
+ markings));
+ }
+ } catch (IOException e) {
+ throw new GerbilException("Exception while reading dataset.", e,
+ ErrorTypes.DATASET_LOADING_ERROR);
+ }
+
+ return documents;
+ }
+
+ protected static List<Marking> findMarkings(Set<String> lines, String text) {
+ List<Marking> markings = new ArrayList<Marking>();
+
+ for (String line : lines) {
+ String[] annotation = line.split("\t");
+
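+ // Annotation columns (tab-separated): [0] tweet id, [1] start offset,
+ // [2] end offset, [3] entity URI (or a NIL identifier), [typeIndex] entity type.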
+ int start = Integer.parseInt(annotation[1]);
+ int end = Integer.parseInt(annotation[2]);
+ int length = end - start;
+ String uri = annotation[3];
+ if (uri.startsWith("NIL")) {
+ uri = "";
+ }
+ Set<String> types = new HashSet<String>();
+ types.add(getTypeURI(annotation[typeIndex]));
+
+ markings.add(new TypedNamedEntity(start, length, uri, types));
+
+ }
+
+ return markings;
+ }
+
+ private static Set<String> getMarkingLines(File annotations, String id) {
+ Set<String> lines = new HashSet<String>();
+
+ try (BufferedReader bReader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(annotations), Charset.forName("UTF-8")))) {
+ String line;
+ boolean annotationSeen = false;
+ while ((line = bReader.readLine()) != null) {
+ String[] annotation = line.split("\t");
+ if (id.equals(annotation[0])) {
+ annotationSeen = true;
+ lines.add(line);
+ } else if (annotationSeen) {
+ // as the annotations are ordered by id, the last annotation
+ // was added
+ return lines;
+ }
+ }
+
+ } catch (IOException e) {
+ LOGGER.error("Could not find Markings due to ", e);
+ }
+ return lines;
+ }
+
+ protected static String getTypeURI(String type) {
+ switch (type.toLowerCase()) {
+ case "thing":
+ return "http://dbpedia.org/ontology/Thing";
+ case "person":
+ return "http://dbpedia.org/ontology/Person";
+ case "organization":
+ return "http://dbpedia.org/ontology/Organisation";
+ case "location":
+ return "http://dbpedia.org/ontology/Place";
+ case "event":
+ return "http://dbpedia.org/ontology/Event";
+ case "product":
+ return "http://dbpedia.org/ontology/Product";
+ }
+ return "";
+ }
+}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2016Dataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2016Dataset.java
new file mode 100644
index 000000000..adf7434eb
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/micro/Microposts2016Dataset.java
@@ -0,0 +1,172 @@
+/**
+ * This file is part of General Entity Annotator Benchmark.
+ *
+ * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.aksw.gerbil.dataset.impl.micro;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.aksw.gerbil.dataset.InitializableDataset;
+import org.aksw.gerbil.dataset.impl.AbstractDataset;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author Giuseppe Rizzo (giuse.rizzo@gmail.com)
+ * @author Michael Röder (roeder@informatik.uni-leipzig.de)
+ */
+public class Microposts2016Dataset extends AbstractDataset implements
+ InitializableDataset {
+
+ private static final Logger LOGGER = LoggerFactory
+ .getLogger(Microposts2016Dataset.class);
+
+ protected List<Document> documents;
+ private String annotatedFile;
+ private String tweetsFile;
+
+ protected static int typeIndex = 5;
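+ // The 2016 TSV carries the entity type in column 5 (the 2015 format uses column 4).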
+
+ public Microposts2016Dataset(String annotatedFile, String tweetsFile) {
+ this.annotatedFile = annotatedFile;
+ this.tweetsFile = tweetsFile;
+ }
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(annotatedFile), new File(
+ tweetsFile));
+ }
+
+ protected List<Document> loadDocuments(File annotations, File tweetsFile)
+ throws GerbilException {
+
+ List<Document> documents = new ArrayList<Document>();
+ String documentUriPrefix = "http://" + getName() + "/";
+
+ try (BufferedReader bReader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(tweetsFile), Charset.forName("UTF-8")))) {
+ String line;
+ List<Marking> markings;
+ while ((line = bReader.readLine()) != null) {
+ String[] tweet = line.split("\t");
+ if (tweet.length < 2) {
+ continue;
+ }
+ String id = tweet[0];
+ String text = tweet[1];
+ markings = findMarkings(getMarkingLines(annotations, id), text);
+ documents.add(new DocumentImpl(text, documentUriPrefix + id,
+ markings));
+ }
+ } catch (IOException e) {
+ throw new GerbilException("Exception while reading dataset.", e,
+ ErrorTypes.DATASET_LOADING_ERROR);
+ }
+
+ return documents;
+ }
+
+ protected static List<Marking> findMarkings(Set<String> lines, String text) {
+ List<Marking> markings = new ArrayList<Marking>();
+
+ for (String line : lines) {
+ String[] annotation = line.split("\t");
+
+ int start = Integer.parseInt(annotation[1]);
+ int end = Integer.parseInt(annotation[2]);
+ int length = end - start;
+ String uri = annotation[3];
+ if (uri.startsWith("NIL")) {
+ uri = "";
+ }
+ Set<String> types = new HashSet<String>();
+ types.add(getTypeURI(annotation[typeIndex]));
+
+ markings.add(new TypedNamedEntity(start, length, uri, types));
+
+ }
+
+ return markings;
+ }
+
+ private static Set<String> getMarkingLines(File annotations, String id) {
+ Set<String> lines = new HashSet<String>();
+
+ try (BufferedReader bReader = new BufferedReader(
+ new InputStreamReader(new FileInputStream(annotations), Charset.forName("UTF-8")))) {
+ String line;
+ boolean annotationSeen = false;
+ while ((line = bReader.readLine()) != null) {
+ String[] annotation = line.split("\t");
+ if (id.equals(annotation[0])) {
+ annotationSeen = true;
+ lines.add(line);
+ } else if (annotationSeen) {
+ // as the annotations are ordered by id, the last annotation
+ // was added
+ return lines;
+ }
+ }
+
+ } catch (IOException e) {
+ LOGGER.error("Could not find Markings due to ", e);
+ }
+ return lines;
+ }
+
+ protected static String getTypeURI(String type) {
+ switch (type.toLowerCase()) {
+ case "thing":
+ return "http://dbpedia.org/ontology/Thing";
+ case "person":
+ return "http://dbpedia.org/ontology/Person";
+ case "organization":
+ return "http://dbpedia.org/ontology/Organisation";
+ case "location":
+ return "http://dbpedia.org/ontology/Place";
+ case "event":
+ return "http://dbpedia.org/ontology/Event";
+ case "product":
+ return "http://dbpedia.org/ontology/Product";
+ }
+ return "";
+ }
+}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/msnbc/MSNBC_XMLParser.java b/src/main/java/org/aksw/gerbil/dataset/impl/msnbc/MSNBC_XMLParser.java
index aa0714afc..e3ead035c 100644
--- a/src/main/java/org/aksw/gerbil/dataset/impl/msnbc/MSNBC_XMLParser.java
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/msnbc/MSNBC_XMLParser.java
@@ -16,6 +16,7 @@
*/
package org.aksw.gerbil.dataset.impl.msnbc;
+import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
@@ -28,6 +29,7 @@
import org.aksw.gerbil.exceptions.GerbilException;
import org.apache.commons.io.IOUtils;
import org.apache.xerces.jaxp.SAXParserFactoryImpl;
+import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class MSNBC_XMLParser {
@@ -44,18 +46,20 @@ public MSNBC_XMLParser() throws GerbilException {
}
public MSNBC_Result parseAnnotationsFile(File file) throws IOException, SAXException {
- FileInputStream fin = null;
+ InputStream is = null;
try {
- fin = new FileInputStream(file);
- return parseAnnotationsStream(fin);
+ is = new BufferedInputStream(new FileInputStream(file));
+ return parseAnnotationsStream(is);
} finally {
- IOUtils.closeQuietly(fin);
+ IOUtils.closeQuietly(is);
}
}
public MSNBC_Result parseAnnotationsStream(InputStream is) throws IOException, SAXException {
MSNBC_XMLHandler handler = new MSNBC_XMLHandler();
- parser.parse(is, handler);
+ InputSource is2 = new InputSource(is);
+ is2.setEncoding("UTF-8");
+ parser.parse(is2, handler);
return handler;
}
}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/nif/FileBasedNIFDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/nif/FileBasedNIFDataset.java
index 842b7820c..7d2e4b05f 100644
--- a/src/main/java/org/aksw/gerbil/dataset/impl/nif/FileBasedNIFDataset.java
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/nif/FileBasedNIFDataset.java
@@ -16,6 +16,7 @@
*/
package org.aksw.gerbil.dataset.impl.nif;
+import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
@@ -77,14 +78,14 @@ public FileBasedNIFDataset(String filePath) {
@Override
protected InputStream getDataAsInputStream() {
- FileInputStream fin = null;
+ InputStream is = null;
try {
LOGGER.debug("Loading NIF dataset from {}", filePath);
- fin = new FileInputStream(filePath);
+ is = new BufferedInputStream(new FileInputStream(filePath));
} catch (FileNotFoundException e) {
LOGGER.error("Couldn't load NIF dataset from file.", e);
}
- return fin;
+ return is;
}
@Override
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java
new file mode 100644
index 000000000..e32ef3013
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java
@@ -0,0 +1,175 @@
+package org.aksw.gerbil.dataset.impl.ritter;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.aksw.gerbil.dataset.InitializableDataset;
+import org.aksw.gerbil.dataset.impl.AbstractDataset;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
+import org.apache.commons.io.IOUtils;
+
+
+public class RitterDataset extends AbstractDataset implements InitializableDataset {
+
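+ // Rebuilt by findMarkings(); holds the tweet text with tokens joined by single spaces.
+ // Note: static shared state, so findMarkings is not thread-safe.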
+ private static StringBuilder realTweet;
+ private String file;
+ private List<Document> documents;
+ private int firstDocId;
+ private int lastDocId;
+
+ public RitterDataset(String file) {
+ this.file = file;
+ }
+
+
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(file));
+ if ((firstDocId > 0) && (lastDocId > 0)) {
+ this.documents = this.documents.subList(firstDocId - 1, lastDocId);
+ }
+ }
+
+ protected List<Document> loadDocuments(File tweetsFile)
+ throws GerbilException {
+ BufferedReader reader = null;
+ List<Document> documents = new ArrayList<Document>();
+ String documentUriPrefix = "http://" + getName() + "/";
+ try {
+ reader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(tweetsFile), Charset.forName("UTF-8")));
+
+ String line = reader.readLine();
+ int tweetIndex = 0;
+ List<Marking> markings = new ArrayList<Marking>();
+ // Start with an empty builder; the loop appends every line itself, so
+ // seeding it with the first line would duplicate that line in the tweet.
+ StringBuilder tweet = new StringBuilder();
+ while (line != null) {
+ if(line.trim().isEmpty()){
+ //Get Markings
+ markings = findMarkings(tweet.toString());
+ //Save old tweet
+ String tw = realTweet.toString();
+ documents.add(new DocumentImpl(tw, documentUriPrefix
+ + tweetIndex, markings));
+ //New Tweet
+ tweet = new StringBuilder();
+ line = reader.readLine();
+ tweetIndex++;
+ continue;
+ }
+ tweet.append(line+"\n");
+ line = reader.readLine();
+ }
+ } catch (IOException e) {
+ throw new GerbilException("Exception while reading dataset.", e,
+ ErrorTypes.DATASET_LOADING_ERROR);
+ } finally {
+ IOUtils.closeQuietly(reader);
+ }
+ return documents;
+ }
+
+
+ public static List<Marking> findMarkings(String tweet){
+ int start = 0;
+ List<Marking> markings = new ArrayList<Marking>();
+ realTweet = new StringBuilder();
+ String[] line = tweet.split("\n");
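+ // Token-per-line input: each line holds a token and its BIO tag
+ // (e.g. "B-person"), separated by tabs.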
+ int i=0;
+ for(String tokenFull : line){
+ String[] token = tokenFull.split("\t+");
+ realTweet.append(token[0]+" ");
+ token[1]=token[1].trim();
+ if(token[1].startsWith("B-")){
+ String[] marking = getWholeMarking(line, i);
+ Set<String> types = new HashSet<String>();
+ types.add(marking[1]);
+ markings.add(new TypedNamedEntity(start, marking[0].length(), "", types));
+
+ }
+ start+=token[0].length()+1;
+ i++;
+ }
+
+ return markings;
+ }
+
+ private static String[] getWholeMarking(String line[], int pos){
+ String[] ret = new String[2];
+ String[] token = line[pos].split("\t+");
+ StringBuilder name= new StringBuilder().append(token[0]);
+ if(!token[1].equals("O")){
+ ret[1] = token[1];
+ switch (token[1].trim().substring(2)) {
+ case "facility":
+ ret[1] = "http://dbpedia.org/ontology/Place";
+ break;
+ case "company":
+ ret[1] = "http://dbpedia.org/ontology/Company";
+ break;
+ case "geo-loc":
+ ret[1] = "http://dbpedia.org/ontology/Place";
+ break;
+ case "movie":
+ ret[1] = "http://dbpedia.org/ontology/Film";
+ break;
+ case "musicartist":
+ ret[1] = "http://dbpedia.org/ontology/MusicalArtist";
+ break;
+ case "other":
+ ret[1] = "http://dbpedia.org/ontology/Unknown";
+ break;
+ case "person":
+ ret[1] = "http://dbpedia.org/ontology/Person";
+ break;
+ case "product":
+ ret[1] = "http://dbpedia.org/ontology/product";
+ break;
+ case "sportsteam":
+ ret[1] = "http://dbpedia.org/ontology/SportsTeam";
+ break;
+ case "tvshow":
+ ret[1] = "http://dbpedia.org/ontology/TelevisionShow";
+ break;
+ }
+ }
+ for(int i=pos+1;i documents;
+ private String wordsFile;
+ private boolean senseval3;
+
+ public SensevalDataset(String wordsFile){
+ this(wordsFile, "false");
+ }
+
+ public SensevalDataset(String wordsFile, String senseval3){
+ this.wordsFile = wordsFile;
+ this.senseval3 = Boolean.valueOf(senseval3);
+ documents = new ArrayList();
+ }
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(this.wordsFile));
+ }
+
+ private List<Document> loadDocuments(File file) throws GerbilException {
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ SAXParser saxParser = null;
+
+ try{
+ InputSource is;
+ if(senseval3){
+ // FIXME: Find a better solution; the file contains a single line in which a
+ // raw "&" occurs as content and has to be escaped before XML parsing.
+ String content = org.apache.commons.io.FileUtils.readFileToString(new File(this.wordsFile), "UTF-8");
+ content = content.replace("&", "&amp;").trim();
+ is = new InputSource(new ByteArrayInputStream(content.getBytes("UTF-8")));
+ is.setEncoding("UTF-8");
+ }
+ else{
+ is = new InputSource(new FileInputStream(file));
+ is.setEncoding("UTF-8");
+ }
+ saxParser = factory.newSAXParser();
+ saxParser.parse(is, new SensevalSAXHandler(documents));
+ } catch (Exception e) {
+ throw new GerbilException("Exception while reading dataset.", e,
+ ErrorTypes.DATASET_LOADING_ERROR);
+ }
+
+
+ return documents;
+ }
+
+
+}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/senseval/SensevalSAXHandler.java b/src/main/java/org/aksw/gerbil/dataset/impl/senseval/SensevalSAXHandler.java
new file mode 100644
index 000000000..e448dfdbc
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/senseval/SensevalSAXHandler.java
@@ -0,0 +1,108 @@
+package org.aksw.gerbil.dataset.impl.senseval;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class SensevalSAXHandler extends DefaultHandler {
+
+ public static final String SENTENCE_ELEMENT = "sentence";
+ public static final String INSTANCE_ELEMENT = "instance";
+ private static final String WF_ELEMENT = "wf";
+
+ private StringBuilder sentence = new StringBuilder();
+ private List<Marking> markings = new ArrayList<Marking>();
+ private List<Document> documents;
+ private int start = 0;
+ private int length;
+ private int i = 0;
+ private String instanceUri;
+
+ private byte field = -1;
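+ // Tracks which element's text is currently being read: 0 = plain sentence
+ // text, 1 = an instance element, 2 = a wf (word form) element; -1 initially.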
+
+ public SensevalSAXHandler(List<Document> documents) {
+ this.documents = documents;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName,
+ Attributes attributes) throws SAXException {
+ if (qName.equalsIgnoreCase(SENTENCE_ELEMENT)) {
+ field = 0;
+ markings = new ArrayList<Marking>();
+ } else if (qName.equalsIgnoreCase(INSTANCE_ELEMENT)) {
+ field = 1;
+ length = 0;
+ instanceUri = "";
+ } else if (qName.equalsIgnoreCase(WF_ELEMENT)) {
+ field = 2;
+ length = 0;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ if (qName.equalsIgnoreCase(SENTENCE_ELEMENT)) {
+ i++;
+ documents.add(new DocumentImpl(sentence.toString(),
+ "http://senseval" + i, markings));
+ sentence = new StringBuilder();
+ } else if (qName.equalsIgnoreCase(INSTANCE_ELEMENT)) {
+ markings.add(new NamedEntity(start, length, instanceUri));
+ start = sentence.length();
+ } else if (qName.equalsIgnoreCase(WF_ELEMENT)) {
+ start = sentence.length();
+
+ }
+ this.field = 0;
+ }
+
+ @Override
+ public void characters(char ch[], int start, int length)
+ throws SAXException {
+ switch (field) {
+ case 0:
+ break;
+ case 1:
+ case 2:
+ this.length = length;
+ String word = new String(Arrays.copyOfRange(ch, start, start
+ + length));
+ if(word.equals("&")){
+ word = word.replace("&", "&");
+ }
+ this.start+= addWordToSentence(word);
+ }
+ this.field = 0;
+
+ }
+
+ public List<Document> getDocuments() {
+ return documents;
+ }
+
+ private int addWordToSentence(String word) {
+ if (sentence.length() == 0) {
+ sentence.append(word);
+ return 0;
+ }
+
+ if (word.matches("(,|\\.|;|:|!|\\?)")) {
+ sentence.append(word);
+ return 0;
+ }
+ else {
+ sentence.append(" ").append(word);
+ return 1;
+ }
+ }
+}
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java
new file mode 100644
index 000000000..cb524bf73
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java
@@ -0,0 +1,154 @@
+package org.aksw.gerbil.dataset.impl.umbc;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.aksw.gerbil.dataset.InitializableDataset;
+import org.aksw.gerbil.dataset.impl.AbstractDataset;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Marking;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
+import org.apache.commons.io.IOUtils;
+
+
+public class UMBCDataset extends AbstractDataset implements InitializableDataset {
+
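+ // Rebuilt by findMarkings(); holds the tweet text with tokens joined by single spaces.
+ // Note: static shared state, so findMarkings is not thread-safe.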
+ private static StringBuilder realTweet;
+ private String file;
+ private List<Document> documents;
+ private int firstDocId;
+ private int lastDocId;
+
+ public UMBCDataset(String file) {
+ this.file = file;
+ }
+
+
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(file));
+ if ((firstDocId > 0) && (lastDocId > 0)) {
+ this.documents = this.documents.subList(firstDocId - 1, lastDocId);
+ }
+ }
+
+ protected List<Document> loadDocuments(File tweetsFile)
+ throws GerbilException {
+ BufferedReader reader = null;
+ List<Document> documents = new ArrayList<Document>();
+ String documentUriPrefix = "http://" + getName() + "/";
+ try {
+ reader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(tweetsFile), Charset.forName("UTF-8")));
+
+ String line = reader.readLine();
+ int tweetIndex = 0;
+ List<Marking> markings = new ArrayList<Marking>();
+ // Start with an empty builder; the loop appends every line itself, so
+ // seeding it with the first line would duplicate that line in the tweet.
+ StringBuilder tweet = new StringBuilder();
+ while (line != null) {
+ if(line.trim().isEmpty()){
+ //Get Markings
+ markings = findMarkings(tweet.toString());
+ //Save old tweet
+ String tw = realTweet.toString();
+ documents.add(new DocumentImpl(tw, documentUriPrefix
+ + tweetIndex, markings));
+ //New Tweet
+ tweet = new StringBuilder();
+ line = reader.readLine();
+ tweetIndex++;
+ continue;
+ }
+ tweet.append(line+"\n");
+ line = reader.readLine();
+ }
+ } catch (IOException e) {
+ throw new GerbilException("Exception while reading dataset.", e,
+ ErrorTypes.DATASET_LOADING_ERROR);
+ } finally {
+ IOUtils.closeQuietly(reader);
+ }
+ return documents;
+ }
+
+
+ public static List<Marking> findMarkings(String tweet){
+ int start = 0;
+ List<Marking> markings = new ArrayList<Marking>();
+ realTweet = new StringBuilder();
+ String[] line = tweet.split("\n");
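+ // Token-per-line input: each line holds a token and its CoNLL-style BIO tag
+ // (e.g. "B-PER"), separated by tabs.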
+ int i=0;
+ for(String tokenFull : line){
+ String[] token = tokenFull.split("\t+");
+ realTweet.append(token[0]+" ");
+ token[1]=token[1].trim();
+ if(token[1].startsWith("B-")){
+ String[] marking = getWholeMarking(line, i);
+ Set<String> types = new HashSet<String>();
+ types.add(marking[1]);
+ markings.add(new TypedNamedEntity(start, marking[0].length(), "", types));
+
+ }
+ start+=token[0].length()+1;
+ i++;
+ }
+
+ return markings;
+ }
+
+ private static String[] getWholeMarking(String line[], int pos){
+ String[] ret = new String[2];
+ String[] token = line[pos].split("\t+");
+ StringBuilder name= new StringBuilder().append(token[0]);
+ if(!token[1].equals("O")){
+ ret[1] = token[1];
+ switch (token[1].trim().substring(2)) {
+ case "PER":
+ ret[1] = "http://dbpedia.org/ontology/Person";
+ break;
+ case "ORG":
+ ret[1] = "http://dbpedia.org/ontology/Organisation";
+ break;
+ case "LOC":
+ ret[1] = "http://dbpedia.org/ontology/Place";
+ break;
+ }
+ }
+ for(int i=pos+1;i documents;
+ private String annotatedFile;
+ private String tweetsFile;
+
+ public WSDMDataset(String annotatedFile, String tweetsFile){
+ this.annotatedFile = annotatedFile;
+ this.tweetsFile = tweetsFile;
+ }
+
+
+ @Override
+ public int size() {
+ return documents.size();
+ }
+
+ @Override
+ public List<Document> getInstances() {
+ return documents;
+ }
+
+ @Override
+ public void init() throws GerbilException {
+ this.documents = loadDocuments(new File(annotatedFile), new File(tweetsFile));
+ }
+
+ private List<Document> loadDocuments(File annotations, File tweets)
+ throws GerbilException {
+ List<Document> documents = new ArrayList<Document>();
+ String documentUriPrefix = "http://" + getName() + "/";
+ // The tweets file contains one JSON object per line.
+ try (BufferedReader bReader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(tweets), Charset.forName("UTF-8")))) {
+ String line;
+ List<Marking> markings;
+ while ((line = bReader.readLine()) != null) {
+ JSONObject json = new JSONObject(line);
+
+ String id = json.getString("id_str");
+ String text = json.getString("text");
+ markings = findMarkings(getMarkingLines(annotations, id), text);
+ documents.add(new DocumentImpl(text, documentUriPrefix + id,
+ markings));
+ }
+ } catch (IOException e) {
+ throw new GerbilException("Exception while reading dataset.", e,
+ ErrorTypes.DATASET_LOADING_ERROR);
+ }
+
+ return documents;
+ }
+
+ protected static List<Marking> findMarkings(Set<String> lines, String text) {
+ List<Marking> markings = new ArrayList<Marking>();
+
+ for (String line : lines) {
+ String[] annotation = line.split("\t");
+
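+ // Annotation columns (tab-separated): [0] tweet id, ..., [2] Wikipedia article title.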
+ String uri = WikipediaHelper.getWikipediaUri(WIKIPEDIA_DOMAIN, annotation[2]);
+ markings.add(new Annotation(uri));
+ }
+
+ return markings;
+ }
+
+ private static Set<String> getMarkingLines(File annotations, String id) {
+ Set<String> lines = new HashSet<String>();
+
+ try (BufferedReader bReader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(annotations), Charset.forName("UTF-8")))) {
+ String line;
+ boolean annotationSeen = false;
+ while ((line = bReader.readLine()) != null) {
+ String[] annotation = line.split("\t");
+ if (id.equals(annotation[0])) {
+ annotationSeen = true;
+ lines.add(line);
+ } else if (annotationSeen) {
+ // as the annotations are ordered by id, the last annotation
+ // was added
+ return lines;
+ }
+ }
+
+ } catch (IOException e) {
+ LOGGER.error("Could not find Markings due to ", e);
+ }
+ return lines;
+ }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/datatypes/ExperimentType.java b/src/main/java/org/aksw/gerbil/datatypes/ExperimentType.java
index 3297fa5a1..d69d1090c 100644
--- a/src/main/java/org/aksw/gerbil/datatypes/ExperimentType.java
+++ b/src/main/java/org/aksw/gerbil/datatypes/ExperimentType.java
@@ -60,7 +60,8 @@ public enum ExperimentType implements Describable {
* Input: text with marked entities
* Output: mentions for every entity
*/
- D2KB("D2KB",
+ D2KB(
+ "D2KB",
"The input for the annotator is a text with entities that already have been marked inside. The annotator should link all these mentioned entities to a knowledge base."),
/**
@@ -73,7 +74,9 @@ public enum ExperimentType implements Describable {
* Input: text
* Output: marked entities and scored mentions for their meaning
*/
- @Deprecated Sa2KB("Sa2KB",
+ @Deprecated
+ Sa2KB(
+ "Sa2KB",
"The annotator gets a text and shall recognize entities inside and link them to a knowledge base. Additionally, each annotation is assigned a score representing the likelihood that the annotation is correct."),
/**
@@ -86,7 +89,9 @@ public enum ExperimentType implements Describable {
* Input: text
* Output: scored markings of entities
*/
- @Deprecated Sc2KB("Sc2KB",
+ @Deprecated
+ Sc2KB(
+ "Sc2KB",
"The annotator gets a text and shall return relevant entities that are mentioned inside the text. Additionally, each tag is assigned a score representing the likelihood that the annotation is correct."),
/**
@@ -98,7 +103,9 @@ public enum ExperimentType implements Describable {
* Input: text
* Output: ranked markings of entities
*/
- @Deprecated Rc2KB("Sc2KB",
+ @Deprecated
+ Rc2KB(
+ "Sc2KB",
"The annotator gets a text and shall return relevant entities that are mentioned inside the textand rank them in terms of their relevance for the topics dealt with in the input text"),
/**
@@ -126,8 +133,15 @@ public enum ExperimentType implements Describable {
* a given text and the extraction of the part of the text, describing the
* type.
*/
- OKE_Task2("OKE Challenge 2015 - Task 2",
- "This task comprises the determining of the type of a given entity inside a given text and the extraction of the part of the text, describing the type."),;
+ OKE_Task2(
+ "OKE Challenge 2015 - Task 2",
+ "This task comprises the determining of the type of a given entity inside a given text and the extraction of the part of the text, describing the type."),
+
+ /**
+ * The annotator gets a text and shall recognize entities inside and their
+ * types.
+ */
+ RT2KB("RT2KB", "The annotator gets a text and shall recognize entities inside and their types.");
private String label;
private String description;
@@ -166,6 +180,7 @@ public boolean equalsOrContainsType(ExperimentType type) {
return true;
}
case ETyping: // falls through
+ case RT2KB:
case OKE_Task1:
case OKE_Task2: {
return false;
@@ -179,6 +194,7 @@ public boolean equalsOrContainsType(ExperimentType type) {
case A2KB:
case D2KB:
case ETyping:
+ case RT2KB:
case OKE_Task1:
case OKE_Task2: {
return false;
@@ -197,6 +213,7 @@ public boolean equalsOrContainsType(ExperimentType type) {
case Sc2KB:
case A2KB:
case D2KB:
+ case RT2KB:
case ETyping:
case OKE_Task1:
case OKE_Task2: {
@@ -208,6 +225,25 @@ public boolean equalsOrContainsType(ExperimentType type) {
}
}
}
+ case RT2KB: {
+ switch (type) {
+ case ERec: // falls through
+ case ETyping:
+ case RT2KB: {
+ return true;
+ }
+ case C2KB: // falls through
+ case A2KB:
+ case D2KB:
+ case Sa2KB:
+ case Sc2KB:
+ case Rc2KB:
+ case OKE_Task1:
+ case OKE_Task2: {
+ return false;
+ }
+ }
+ }
case C2KB: {
return type == C2KB;
}
diff --git a/src/main/java/org/aksw/gerbil/evaluate/EvaluatorFactory.java b/src/main/java/org/aksw/gerbil/evaluate/EvaluatorFactory.java
index a3faea431..6e73b2eec 100644
--- a/src/main/java/org/aksw/gerbil/evaluate/EvaluatorFactory.java
+++ b/src/main/java/org/aksw/gerbil/evaluate/EvaluatorFactory.java
@@ -20,7 +20,6 @@
import java.util.Arrays;
import java.util.List;
-import org.aksw.gerbil.config.GerbilConfiguration;
import org.aksw.gerbil.dataset.Dataset;
import org.aksw.gerbil.datatypes.ExperimentTaskConfiguration;
import org.aksw.gerbil.datatypes.ExperimentType;
@@ -29,11 +28,11 @@
import org.aksw.gerbil.datatypes.marking.MarkingClasses;
import org.aksw.gerbil.evaluate.impl.ClassConsideringFMeasureCalculator;
import org.aksw.gerbil.evaluate.impl.ClassifyingEvaluatorDecorator;
+import org.aksw.gerbil.evaluate.impl.ConfidenceBasedFMeasureCalculator;
import org.aksw.gerbil.evaluate.impl.ConfidenceScoreEvaluatorDecorator;
import org.aksw.gerbil.evaluate.impl.DoubleResultComparator;
import org.aksw.gerbil.evaluate.impl.FMeasureCalculator;
import org.aksw.gerbil.evaluate.impl.GSInKBClassifyingEvaluatorDecorator;
-import org.aksw.gerbil.evaluate.impl.ConfidenceBasedFMeasureCalculator;
import org.aksw.gerbil.evaluate.impl.HierarchicalFMeasureCalculator;
import org.aksw.gerbil.evaluate.impl.SpanMergingEvaluatorDecorator;
import org.aksw.gerbil.evaluate.impl.SubTaskAverageCalculator;
@@ -60,8 +59,7 @@
import org.aksw.gerbil.transfer.nif.TypedSpan;
import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
import org.aksw.gerbil.utils.filter.TypeBasedMarkingFilter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.aksw.gerbil.web.config.RootConfig;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.vocabulary.OWL;
@@ -70,10 +68,8 @@
@SuppressWarnings("deprecation")
public class EvaluatorFactory {
- private static final Logger LOGGER = LoggerFactory.getLogger(EvaluatorFactory.class);
-
- private static final String DEFAULT_WELL_KNOWN_KBS_PARAMETER_KEY = "org.aksw.gerbil.evaluate.DefaultWellKnownKB";
- private static final String DEFAULT_WELL_KNOWN_KBS[] = loadDefaultKBs();
+ // private static final Logger LOGGER =
+ // LoggerFactory.getLogger(EvaluatorFactory.class);
protected UriKBClassifier globalClassifier = null;
protected SubClassInferencer inferencer = null;
@@ -82,14 +78,6 @@ public EvaluatorFactory() {
this(null, null);
}
- private static String[] loadDefaultKBs() {
- String kbs[] = GerbilConfiguration.getInstance().getStringArray(DEFAULT_WELL_KNOWN_KBS_PARAMETER_KEY);
- if (kbs == null) {
- LOGGER.error("Couldn't load the list of well known KBs. This GERBIL instance might not work as expected!");
- }
- return kbs;
- }
-
public EvaluatorFactory(UriKBClassifier globalClassifier) {
this(globalClassifier, null);
}
@@ -102,7 +90,7 @@ public EvaluatorFactory(UriKBClassifier globalClassifier, SubClassInferencer inf
if (globalClassifier != null) {
this.globalClassifier = globalClassifier;
} else {
- this.globalClassifier = new SimpleWhiteListBasedUriKBClassifier(DEFAULT_WELL_KNOWN_KBS);
+ this.globalClassifier = RootConfig.createDefaultUriKBClassifier();
}
if (inferencer != null) {
this.inferencer = inferencer;
@@ -112,8 +100,7 @@ public EvaluatorFactory(UriKBClassifier globalClassifier, SubClassInferencer inf
}
@SuppressWarnings("rawtypes")
- public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguration configuration,
- Dataset dataset) {
+ public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguration configuration, Dataset dataset) {
return createEvaluator(type, configuration, dataset, globalClassifier, inferencer);
}
@@ -125,9 +112,8 @@ public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguratio
return new ClassifyingEvaluatorDecorator(
new ClassConsideringFMeasureCalculator(
new MatchingsCounterImpl(new ClassifiedMeaningMatchingsSearcher()),
- MarkingClasses.IN_KB, MarkingClasses.EE),
- new UriBasedMeaningClassifier(classifier, MarkingClasses.IN_KB),
- new EmergingEntityMeaningClassifier());
+ MarkingClasses.IN_KB, MarkingClasses.EE), new UriBasedMeaningClassifier(
+ classifier, MarkingClasses.IN_KB), new EmergingEntityMeaningClassifier());
}
case Sa2KB:
case A2KB: {
@@ -143,8 +129,8 @@ public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguratio
new EmergingEntityMeaningClassifier());
}
case ERec: {
- return new ConfidenceBasedFMeasureCalculator(
- new MatchingsCounterImpl((MatchingsSearcher) MatchingsSearcherFactory
+ return new ConfidenceBasedFMeasureCalculator(new MatchingsCounterImpl(
+ (MatchingsSearcher) MatchingsSearcherFactory
.createSpanMatchingsSearcher(configuration.matching)));
}
case D2KB: {
@@ -156,48 +142,51 @@ public Evaluator createEvaluator(ExperimentType type, ExperimentTaskConfiguratio
new MatchingsCounterImpl(
new CompoundMatchingsSearcher(
(MatchingsSearcher) MatchingsSearcherFactory
- .createSpanMatchingsSearcher(
- configuration.matching),
+ .createSpanMatchingsSearcher(configuration.matching),
new ClassifiedMeaningMatchingsSearcher())),
MarkingClasses.IN_KB, MarkingClasses.EE, MarkingClasses.GS_IN_KB),
new StrongSpanMatchingsSearcher()),
new UriBasedMeaningClassifier(classifier, MarkingClasses.IN_KB),
- new EmergingEntityMeaningClassifier()),
- true);
+ new EmergingEntityMeaningClassifier()), true);
}
case ETyping: {
- return new SearcherBasedNotMatchingMarkingFilter(new StrongSpanMatchingsSearcher(),
- new ConfidenceScoreEvaluatorDecorator