diff --git a/.travis.yml b/.travis.yml
index 06e2c1a06..57545bea3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,8 +2,20 @@ sudo: required
language: java
+jdk: oraclejdk8
+
+dist: trusty
+
+addons:
+ apt:
+ packages:
+ - oracle-java8-installer
+
+
+
services:
- docker
before_install:
- docker pull rethinkdb:2.3.5
+
diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml
index a577a3bb2..02d0e7797 100644
--- a/docker-compose-sparql.yml
+++ b/docker-compose-sparql.yml
@@ -31,6 +31,7 @@ services:
- ./data/frontier:/var/squirrel/data
- ./seed/seeds.csv:/var/squirrel/seeds.csv:ro
- ./whitelist/whitelist.txt:/var/squirrel/whitelist.txt:ro
+ - ./spring-config:/var/squirrel/spring-config
command: java -cp squirrel.jar org.dice_research.squirrel.components.FrontierComponentStarter
virtuosohost:
@@ -136,18 +137,18 @@ services:
- ./spring-config:/var/squirrel/spring-config
command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
- deduplicator:
- image: squirrel
- container_name: deduplicator
- environment:
- DEDUPLICATION_ACTIVE: "true"
- HOBBIT_RABBIT_HOST: rabbit
- OUTPUT_FOLDER: /var/squirrel/data
- MDB_HOST_NAME: mongodb
- MDB_PORT: 27017
- SPARQL_HOST_NAME: sparqlhost
- SPARQL_HOST_PORT: 3030
- SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
- volumes:
- - ./data/deduplicator:/var/squirrel/data
- command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent
+# deduplicator:
+# image: squirrel
+# container_name: deduplicator
+# environment:
+# DEDUPLICATION_ACTIVE: "true"
+# HOBBIT_RABBIT_HOST: rabbit
+# OUTPUT_FOLDER: /var/squirrel/data
+# MDB_HOST_NAME: mongodb
+# MDB_PORT: 27017
+# SPARQL_HOST_NAME: sparqlhost
+# SPARQL_HOST_PORT: 3030
+# SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
+# volumes:
+# - ./data/deduplicator:/var/squirrel/data
+# command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent
diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml
index 9c84264a1..01271c394 100644
--- a/spring-config/frontier-context.xml
+++ b/spring-config/frontier-context.xml
@@ -54,4 +54,5 @@
+
diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java b/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java
index ec6121959..39452556d 100644
--- a/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java
+++ b/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java
@@ -46,6 +46,32 @@ public class Constants {
*/
public static final String URI_PREFERRED_RECRAWL_ON = "recrawl-on";
+ /*
+ * Keys for the data related to the URI type predictor
+ */
+ /**
+ * This key stores the label predicted by the predictor for a URI, denoting
+ * the class it belongs to (positive class or negative class)
+ */
+ public static final String URI_PREDICTED_LABEL = "predicted-label";
+ /**
+ * This key stores the value denoting the true class label of a URI
+ */
+ public static final String URI_TRUE_LABEL = "true-label";
+ /**
+ * This key stores an integer denoting the true class of a URI
+ */
+ public static final String URI_TRUE_CLASS = "true_class";
+ /**
+ * This key stores the feature vector generated for a URI for prediction purposes
+ */
+ public static final String FEATURE_VECTOR = "feature-vector";
+ /**
+ * This key stores the URI from which this URI was discovered (its referrer)
+ */
+ public static final String REFERRING_URI = "referring-uri";
+
+
//////////////////////////////////////////////////
// URIs
//////////////////////////////////////////////////
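These keys are read and written through the generic data map of CrawleableUri. A minimal sketch of the intended round trip, assuming the addData/getData accessors used elsewhere in this patch (the URI and label values are illustrative):

    CrawleableUri uri = new CrawleableUri(java.net.URI.create("http://example.org/data.rdf"));
    uri.addData(Constants.URI_PREDICTED_LABEL, "SPARQL");  // written by the predictor
    String label = (String) uri.getData(Constants.URI_PREDICTED_LABEL);
    double[] features = (double[]) uri.getData(Constants.FEATURE_VECTOR); // null until feature hashing ran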
diff --git a/squirrel.frontier/pom.xml b/squirrel.frontier/pom.xml
index 6d06d8441..491f88181 100644
--- a/squirrel.frontier/pom.xml
+++ b/squirrel.frontier/pom.xml
@@ -21,6 +21,12 @@
 			<groupId>org.dice-research</groupId>
 			<artifactId>squirrel.web-api</artifactId>
 		</dependency>
+
+		<dependency>
+			<groupId>de.jungblut.ml</groupId>
+			<artifactId>tjungblut-online-ml</artifactId>
+			<version>0.5</version>
+		</dependency>
@@ -63,4 +69,4 @@
-</project>
\ No newline at end of file
+</project>
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
index 9ba0f0401..564804f69 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
@@ -35,7 +35,9 @@
import org.dice_research.squirrel.frontier.impl.QueueBasedTerminationCheck;
import org.dice_research.squirrel.frontier.impl.TerminationCheck;
import org.dice_research.squirrel.frontier.impl.WorkerGuard;
+import org.dice_research.squirrel.predictor.*;
import org.dice_research.squirrel.queue.InMemoryQueue;
+import org.dice_research.squirrel.queue.IpAddressBasedQueue;
import org.dice_research.squirrel.queue.UriQueue;
import org.dice_research.squirrel.rabbit.RPCServer;
import org.dice_research.squirrel.rabbit.RespondingDataHandler;
@@ -54,6 +56,7 @@
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.stereotype.Component;
+
@Component
@Qualifier("frontierComponent")
public class FrontierComponent extends AbstractComponent implements RespondingDataHandler {
@@ -77,10 +80,16 @@ public class FrontierComponent extends AbstractComponent implements RespondingDataHandler {
private final WorkerGuard workerGuard = new WorkerGuard(this);
private final boolean doRecrawling = true;
private long recrawlingTime = 1000L * 60L * 60L * 24L * 30;
+
+
private Timer timerTerminator;
+
public static final boolean RECRAWLING_ACTIVE = true;
+
+ protected Predictor predictor;
+
@Override
public void init() throws Exception {
super.init();
@@ -108,9 +117,15 @@ public void init() throws Exception {
queue = new InMemoryQueue();
knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime);
}
+ // Train the URI type predictor model with a training data set
+ try {
+ predictor = new MultinomialPredictor.MultinomialPredictorBuilder().withFile("multiNomialTrainData.txt").build();
+ } catch (Exception e) {
+ LOGGER.error("Exception while training the URI type predictor model.", e);
+ }
// Build frontier
- frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling);
+ frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, (IpAddressBasedQueue) queue, doRecrawling, predictor);
rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME);
receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this)
@@ -139,11 +154,13 @@ public void init() throws Exception {
+ webConfiguration.isVisualizationOfCrawledGraphEnabled()
+ ". No WebServiceSenderThread will be started!");
}
+
+
}
@Override
public void run() throws Exception {
-
+
terminationMutex.acquire();
}
@@ -177,7 +194,7 @@ public void handleData(byte[] data) {
@Override
public void handleData(byte[] data, ResponseHandler handler, String responseQueueName, String correlId) {
-
+
Object deserializedData;
try {
deserializedData = serializer.deserialize(data);
@@ -200,7 +217,7 @@ public void handleData(byte[] data, ResponseHandler handler, String responseQueueName, String correlId) {
if (deserializedData instanceof UriSetRequest) {
responseToUriSetRequest(handler, responseQueueName, correlId, (UriSetRequest) deserializedData);
} else if (deserializedData instanceof UriSet) {
-
+
if(timerTerminator == null) {
LOGGER.info("Initializing Terminator task...");
TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex, this.workerGuard);
@@ -212,6 +229,7 @@ public void handleData(byte[] data, ResponseHandler handler, String responseQueu
} else if (deserializedData instanceof CrawlingResult) {
CrawlingResult crawlingResult = (CrawlingResult) deserializedData;
LOGGER.warn("Received the message that the crawling for {} URIs is done.", crawlingResult.uris.size());
+
frontier.crawlingDone(crawlingResult.uris);
workerGuard.removeUrisForWorker(crawlingResult.idOfWorker, crawlingResult.uris);
} else if (deserializedData instanceof AliveMessage) {
@@ -298,11 +316,11 @@ public void run() {
break;
}
}
-
+
if(!stillHasUris && terminationCheck.shouldFrontierTerminate(queue)) {
terminationMutex.release();
- }
+ }
}
}
-}
\ No newline at end of file
+}
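Note on the builder call in init(): MultinomialPredictorBuilder falls back to a learning rate of 0.7 and beta = l1 = l2 = 1 when no hyperparameters are supplied (see MultinomialPredictor.build() below), so the one-liner above is roughly equivalent to this more explicit sketch:

    predictor = new MultinomialPredictor.MultinomialPredictorBuilder()
            .withFile("multiNomialTrainData.txt")
            .withLearningRate(0.7)
            .withBeta(1.0)
            .withL1(1.0)
            .withL2(1.0)
            .build();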
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
index b3f8b4858..26859f4ab 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
@@ -12,6 +12,7 @@
import org.dice_research.squirrel.data.uri.norm.UriNormalizer;
import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian;
import org.dice_research.squirrel.frontier.ExtendedFrontier;
+import org.dice_research.squirrel.predictor.Predictor;
import org.dice_research.squirrel.queue.IpAddressBasedQueue;
import org.dice_research.squirrel.queue.UriQueue;
@@ -29,11 +30,12 @@ public class ExtendedFrontierImpl extends FrontierImpl implements ExtendedFronti
* @param generalRecrawlTime used to select the general Time after URIs should be recrawled. If Value is null the default Time is used.
* @param timerPeriod used to select if URIs should be recrawled.
* @param uriHashCustodian used to access and write hash values for uris.
+ * @param predictor {@link Predictor} used to predict the type of the URI
*/
@SuppressWarnings("unused")
public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling,
- long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) {
- super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian);
+ long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, Predictor predictor) {
+ super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, predictor);
}
/**
@@ -45,9 +47,10 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil
* @param queue {@link UriQueue} used to manage the URIs that should be
* crawled.
* @param doesRecrawling used to select if URIs should be recrawled.
+ * @param predictor {@link Predictor} used to predict the type of the URI
*/
- public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling) {
- super(normalizer, knownUriFilter, queue, doesRecrawling);
+ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, Predictor predictor) {
+ super(normalizer, knownUriFilter, queue, doesRecrawling, predictor);
}
/**
@@ -60,9 +63,12 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil
* @param queue {@link UriQueue} used to manage the URIs that should be
* crawled.
* @param doesRecrawling used to select if URIs should be recrawled.
+ * @param predictor {@link Predictor} used to predict the type of the URI
*/
- public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling) {
- super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling);
+
+ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, IpAddressBasedQueue queue, boolean doesRecrawling, Predictor predictor) {
+ super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling, predictor);
+
}
@Override
@@ -78,4 +84,4 @@ public void informAboutDeadWorker(String idOfWorker, List<CrawleableUri> lstUris
setIps.forEach(ip -> ipQueue.markIpAddressAsAccessible(ip));
}
}
-}
\ No newline at end of file
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
index a41f8b85c..d48e38eb0 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
@@ -1,10 +1,14 @@
package org.dice_research.squirrel.frontier.impl;
+
+import de.jungblut.math.DoubleVector;
+
import java.net.UnknownHostException;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;
+
import org.dice_research.squirrel.Constants;
import org.dice_research.squirrel.data.uri.CrawleableUri;
import org.dice_research.squirrel.data.uri.filter.KnownUriFilter;
@@ -18,8 +22,10 @@
import org.dice_research.squirrel.queue.BlockingQueue;
import org.dice_research.squirrel.queue.UriQueue;
import org.dice_research.squirrel.uri.processing.UriProcessor;
+import org.dice_research.squirrel.components.FrontierComponent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.dice_research.squirrel.predictor.*;
/**
* Standard implementation of the {@link Frontier} interface containing a
@@ -97,6 +103,13 @@ public class FrontierImpl implements Frontier {
*/
private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60;
+
+ /**
+ * {@link Predictor} used to predict the type of the URI
+ */
+ protected Predictor predictor;
+
+
/**
* Constructor.
*
@@ -117,10 +130,12 @@ public class FrontierImpl implements Frontier {
* Value is null the default Time is used.
* @param timerPeriod
* used to select if URIs should be recrawled.
+ * @param predictor
+ * {@link Predictor} used to predict the type of the URI
*/
public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,
- GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) {
- this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod);
+ GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, Predictor predictor) {
+ this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod, predictor);
}
/**
@@ -141,32 +156,29 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri
* Value is null the default Time is used.
* @param timerPeriod
* used to select if URIs should be recrawled.
+ * @param predictor
+ * {@link Predictor} used to predict the type of the URI
*/
public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling,
- long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) {
- this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, timerPeriod);
+ long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, Predictor predictor) {
+ this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, timerPeriod, predictor);
}
/**
* Constructor.
*
- * @param normalizer
- * {@link UriNormalizer} used to transform given URIs into a normal
- * form
- * @param knownUriFilter
- * {@link UriFilter} used to identify URIs that already have been
- * crawled.
- * @param uriReferences
- * {@link URIReferences} used to manage URI references
- * @param queue
- * {@link UriQueue} used to manage the URIs that should be crawled.
- * @param doesRecrawling
- * Value for {@link #doesRecrawling}.
+ *
+ * @param normalizer {@link UriNormalizer} used to transform given URIs into a normal form
+ * @param knownUriFilter {@link UriFilter} used to identify URIs that already have been
+ * crawled.
+ * @param uriReferences {@link URIReferences} used to manage URI references
+ * @param queue {@link UriQueue} used to manage the URIs that should be
+ * crawled.
+ * @param doesRecrawling Value for {@link #doesRecrawling}.
+ * @param predictor Used to predict the type of the URI.
*/
- public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences,
- UriQueue queue, boolean doesRecrawling) {
- this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME,
- DEFAULT_TIMER_PERIOD);
+ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, Predictor predictor) {
+ this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, predictor);
}
/**
@@ -182,11 +194,14 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URI
* {@link UriQueue} used to manage the URIs that should be crawled.
* @param doesRecrawling
* Value for {@link #doesRecrawling}.
+ * @param predictor
+ * {@link Predictor} used to predict the type of the URI
+ *
*/
public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,
- boolean doesRecrawling) {
+ boolean doesRecrawling, Predictor predictor) {
this(normalizer, knownUriFilter, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME,
- DEFAULT_TIMER_PERIOD);
+ DEFAULT_TIMER_PERIOD, predictor);
}
/**
@@ -200,9 +215,12 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri
* crawled.
* @param queue
* {@link UriQueue} used to manage the URIs that should be crawled.
+ * @param predictor
+ * {@link Predictor} used to predict the type of the URI
+ *
*/
- public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue) {
- this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD);
+ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, Predictor predictor) {
+ this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, predictor);
}
/**
@@ -227,10 +245,12 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri
* Value is null the default Time is used.
* @param timerPeriod
* used to select if URIs should be recrawled.
+ * @param predictor
+ * {@link Predictor} used to predict the type of the URI
*/
public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences,
UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime,
- long timerPeriod) {
+ long timerPeriod, Predictor predictor) {
this.normalizer = normalizer;
this.knownUriFilter = knownUriFilter;
this.uriReferences = uriReferences;
@@ -241,8 +261,8 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URI
this.queue.open();
this.doesRecrawling = doesRecrawling;
this.timerPeriod = timerPeriod;
+ this.predictor = predictor;
FrontierImpl.generalRecrawlTime = generalRecrawlTime;
-
if (this.doesRecrawling) {
timerRecrawling = new Timer();
timerRecrawling.schedule(new TimerTask() {
@@ -276,9 +296,18 @@ public void addNewUris(List<CrawleableUri> uris) {
public void addNewUri(CrawleableUri uri) {
// Normalize the URI
uri = normalizer.normalize(uri);
+ // Predict the type of URIs that have not been classified yet
+ if (predictor != null && uri.getType().equals("UNKNOWN")) {
+ try {
+ // predict and store the predicted class in the URI's data map
+ String predictedLabel = predictor.predict(uri);
+ uri.addData(Constants.URI_PREDICTED_LABEL, predictedLabel);
+ } catch (Exception e) {
+ LOGGER.warn("Exception happened while predicting the URI type", e);
+ }
+ }
// After knownUriFilter uri should be classified according to
// UriProcessor
-
if (knownUriFilter.isUriGood(uri)) {
LOGGER.debug("addNewUri(" + uri + "): URI is good [" + knownUriFilter + "]");
if (schemeUriFilter.isUriGood(uri)) {
@@ -311,20 +340,22 @@ public void addNewUri(CrawleableUri uri) {
public void crawlingDone(List<CrawleableUri> uris) {
LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs.");
- // List<CrawleableUri> newUris = new ArrayList<>(uriMap.size());
- // for (CrawleableUri uri : uriMap.keySet()) {
- // newUris.addAll(uriMap.get(uri));
- // knownUriFilter.add(uri, System.currentTimeMillis(),
- // uri.getTimestampNextCrawl());
- // if (uriReferences != null) {
- // uriReferences.add(uri, uriMap.get(uri));
- // }
- // }
- // // If there is a graph logger, log the data
- // if (graphLogger != null) {
- // graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris);
- // }
+
+// List<CrawleableUri> newUris = new ArrayList<>(uriMap.size());
+// for (CrawleableUri uri : uriMap.keySet()) {
+// newUris.addAll(uriMap.get(uri));
+// knownUriFilter.add(uri, System.currentTimeMillis(), uri.getTimestampNextCrawl());
+// if (uriReferences != null) {
+// uriReferences.add(uri, uriMap.get(uri));
+// }
+// }
+
+// // If there is a graph logger, log the data
+// if (graphLogger != null) {
+// graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris);
+// }
+
// If we should give the crawled IPs to the queue
if (queue instanceof BlockingQueue) {
((BlockingQueue<?>) queue).markUrisAsAccessible(uris);
@@ -343,6 +374,16 @@ public void crawlingDone(List<CrawleableUri> uris) {
knownUriFilter.add(uri, System.currentTimeMillis());
}
}
+
+ // Update the URI type prediction model with the outcomes of the crawled URIs
+ try {
+ if (predictor != null) {
+ for (CrawleableUri uri : uris) {
+ predictor.weightUpdate(uri);
+ }
+ }
+ } catch (Exception e) {
+ LOGGER.warn("Exception happened while updating the weights for the URI type predictor model", e);
+ }
+
}
@Override
@@ -379,4 +420,6 @@ public UriQueue getQueue() {
return queue;
}
-}
\ No newline at end of file
+
+}
+
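Taken together, the frontier now runs an online-learning loop: addNewUri attaches a predicted label before the URI is filtered and queued, and crawlingDone feeds the crawl outcome back into the model. A usage sketch, assuming already constructed normalizer, filter, queue, and predictor instances:

    FrontierImpl frontier = new FrontierImpl(normalizer, knownUriFilter, queue, predictor);
    CrawleableUri uri = new CrawleableUri(java.net.URI.create("http://example.org/void.ttl"));
    frontier.addNewUri(uri);  // predicts a label for URIs of unknown type, then enqueues good URIs
    // ... a worker crawls the URI and reports back ...
    frontier.crawlingDone(java.util.Collections.singletonList(uri));  // folds the outcome into the model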
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialPredictor.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialPredictor.java
new file mode 100644
index 000000000..9e7907ee9
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialPredictor.java
@@ -0,0 +1,491 @@
+package org.dice_research.squirrel.predictor;
+
+
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.activation.SigmoidActivationFunction;
+import de.jungblut.math.dense.SingleEntryDoubleVector;
+import de.jungblut.math.loss.LogLoss;
+import de.jungblut.math.minimize.CostGradientTuple;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.online.minimizer.StochasticGradientDescent;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import de.jungblut.online.regression.RegressionClassifier;
+import de.jungblut.online.regression.RegressionModel;
+import de.jungblut.online.regularization.AdaptiveFTRLRegularizer;
+import de.jungblut.online.regularization.CostWeightTuple;
+import de.jungblut.online.regularization.WeightUpdater;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.ArrayList;
+
+/**
+ * A predictor that predicts the RDF-relevance of a URI by performing binary classification.
+ */
+public final class BinomialPredictor implements Predictor {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(BinomialPredictor.class);
+ /**
+ * {@link WeightUpdater} Used to update the weights of the predictor model used.
+ */
+ private WeightUpdater updater;
+ /**
+ * {@link RegressionLearn} Used to train the model with training data
+ */
+ private RegressionLearn learner;
+ /**
+ * {@link RegressionModel} Represents the regression model used for the prediction of the RDF-relevance of the URI
+ */
+ private RegressionModel model;
+ /**
+ * {@link RegressionClassifier} Classifier for regression model. Takes a model or the atomic parts of it and predicts the outcome for a given feature.
+ *
+ */
+ private RegressionClassifier classifier;
+ /**
+ * Used to store the location of the training data file.
+ */
+ private String filepath;
+ /**
+ * The rate at which the model learns.
+ */
+ private Double learningRate;
+ /**
+ * Regularizing parameter L2
+ */
+ private Double l2;
+ /**
+ * Regularizing parameter L1
+ */
+ private Double l1;
+ /**
+ * Hyper parameter Beta
+ */
+ private Double beta;
+ /**
+ * Validation percentage which is between 0 and 1
+ */
+ private Double holdoutValidationPercentage;
+ /**
+ * The threshold above which a URI is classified into positive class
+ */
+ private Double threshold;
+ /**
+ * The positive class for the classification
+ */
+ private String positiveClass;
+ /**
+ * {@link FeatureVectorGenerator} Used to generate the feature vector of the URI
+ */
+ private FeatureVectorGenerator featureGenerator = new FeatureVectorGenerator();
+
+ /**
+ * Predicts the class of the given URI.
+ * @param uri the URI for which the prediction is to be made
+ * @return the predicted class of the URI
+ */
+ public String predict(CrawleableUri uri) {
+
+ String predictedClass = null;
+ try {
+ featureGenerator.featureHashing(uri);
+ Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+ double[] doubleFeatureArray = (double[]) featureArray;
+ DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+ //initialize the regression classifier with updated model and predict
+ this.setClassifier(new RegressionClassifier(this.getModel()));
+ DoubleVector prediction = this.classifier.predict(features);
+
+ if(prediction.get(0) >= this.getThreshold())
+ predictedClass = this.getPositiveClass();
+ else
+ predictedClass = "NEGATIVE_CLASS";
+ } catch (Exception e) {
+ LOGGER.warn("Prediction for URI " + uri.getUri().toString() + " failed", e);
+ }
+ return predictedClass;
+ }
+
+ /**
+ * Updates the predictor model based on the crawl outcome of this URI.
+ * @param curi the URI whose data is used to update the model weights
+ */
+ public void weightUpdate(CrawleableUri curi) {
+ try {
+ if (curi.getData(Constants.FEATURE_VECTOR) != null && curi.getData(Constants.URI_TRUE_LABEL) != null) {
+ Object featureArray = curi.getData(Constants.FEATURE_VECTOR);
+ double[] doubleFeatureArray = (double[]) featureArray;
+ DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+ Object real_value = curi.getData(Constants.URI_TRUE_LABEL);
+ int rv = (int) real_value;
+ DoubleVector rv_DoubleVector = new SingleEntryDoubleVector(rv);
+ DoubleVector nextExample = features;
+ FeatureOutcomePair realResult = new FeatureOutcomePair(nextExample, rv_DoubleVector); // real outcome
+ //update weights using the updated parameters
+ DoubleVector newWeights = this.updater.prePredictionWeightUpdate(realResult, this.model.getWeights(), learningRate, 0);
+ CostGradientTuple observed = this.learner.observeExample(realResult, newWeights);
+ // calculate new weights (note that the iteration count is not used)
+ CostWeightTuple update = this.updater.computeNewWeights(newWeights, observed.getGradient(), learningRate, 0, observed.getCost());
+ // update model and classifier
+ this.model = new RegressionModel(update.getWeight(), this.model.getActivationFunction());
+ } else {
+ LOGGER.warn("Feature vector or true label of this " + curi.getUri().toString() + " is null");
+ }
+ } catch (Exception e) {
+ LOGGER.warn("Exception happened while updating the weights for the URI type predictor model", e);
+ }
+ }
+
+ protected void setUpdater(WeightUpdater updater) {
+ this.updater = updater;
+ }
+
+ public RegressionLearn getLearner() {
+ return learner;
+ }
+
+ protected void setLearner(RegressionLearn learner) {
+ this.learner = learner;
+ }
+
+ public RegressionModel getModel() {
+ return model;
+ }
+
+ protected void setModel(RegressionModel model) {
+ this.model = model;
+ }
+
+ public RegressionClassifier getClassifier() {
+ return classifier;
+ }
+
+ protected void setClassifier(RegressionClassifier classifier) {
+ this.classifier = classifier;
+ }
+
+ protected void setFilepath(String filepath) {
+ this.filepath = filepath;
+ }
+
+ public void setLearningRate(double learningRate) {
+ this.learningRate = learningRate;
+ }
+
+ public double getL2() {
+ return l2;
+ }
+
+ public void setL2(double l2) {
+ this.l2 = l2;
+ }
+
+ public double getL1() {
+ return l1;
+ }
+
+ public void setL1(double l1) {
+ this.l1 = l1;
+ }
+
+ public double getBeta() {
+ return beta;
+ }
+
+ public void setBeta(double beta) {
+ this.beta = beta;
+ }
+
+ public String getFilepath() {
+ return filepath;
+ }
+
+ public double getLearningRate() {
+ return learningRate;
+ }
+
+ private Double getHoldoutValidationPercentage() {
+ return holdoutValidationPercentage;
+ }
+
+ private void setHoldoutValidationPercentage(Double holdoutValidationPercentage) {
+ this.holdoutValidationPercentage = holdoutValidationPercentage;
+ }
+ public Double getThreshold(){ return this.threshold; }
+
+ public void setThreshold(Double threshold) { this.threshold = threshold; }
+
+ public String getPositiveClass() { return this.positiveClass; }
+
+ public void setPositiveClass(String positiveClass) { this.positiveClass = positiveClass; }
+
+
+ /**
+ * A builder for the {@link BinomialPredictor} that wires together the regression
+ * model, learner, classifier, and weight updater, falling back to default training
+ * data and hyperparameters when they are not supplied.
+ */
+ public static class BinomialPredictorBuilder {
+
+ private TrainingDataProvider trainingDataProvider = new BinomialTrainDataProviderImpl();
+
+ protected StochasticGradientDescent sgd; //Minimizer
+
+ private RegressionLearn learner; //Learner
+
+ private RegressionModel model; //Model
+
+ private WeightUpdater updater; //Updater
+
+ private Double learningRate;//Learning rate
+
+ private Double beta; //Beta
+
+ private Double l1; //L1
+
+ private Double l2; //L2
+
+ private Double holdoutValidationPercentage; //Validation percentage which is between 0 and 1
+
+ private RegressionClassifier classifier; //Classifier
+
+ private String filePath; // file path for the training data file
+
+ private Double threshold; // threshold above which a URI is classified into the positive class; boxed so build() can detect when it is unset
+
+ public String positiveClass; // the positive class of the binary classification
+
+ public BinomialPredictorBuilder(RegressionLearn learner, RegressionModel model, RegressionClassifier classifier, WeightUpdater updater) {
+ this.learner = learner;
+ this.model = model;
+ this.classifier = classifier;
+ this.updater = updater;
+ }
+
+ public BinomialPredictorBuilder() {
+ }
+
+ public BinomialPredictorBuilder withUpdater(WeightUpdater updater) {
+ this.setUpdater(updater);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withLearner(RegressionLearn learner) {
+ this.setLearner(learner);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withModel(RegressionModel model) {
+ this.setModel(model);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withClassifier(RegressionClassifier regressionClassifier) {
+ this.setClassifier(regressionClassifier);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withFile(String filepath) {
+ this.setFilePath(filepath);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withLearningRate(Double learningRate) {
+ this.setLearningRate(learningRate);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withL1(Double L1) {
+ this.setL1(L1);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withL2(Double L2) {
+ this.setL2(L2);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withBeta(Double Beta) {
+ this.setBeta(Beta);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withThreshold(Double threshold) {
+ this.setThreshold(threshold);
+ return this;
+ }
+
+ public BinomialPredictorBuilder withPositiveClass(String positiveClass){
+ this.setPositiveClass(positiveClass);
+ return this;
+ }
+
+ public BinomialPredictor build() {
+ BinomialPredictor predictor = new BinomialPredictor();
+
+ if (this.getLearningRate() == null)
+ this.setLearningRate(0.7);
+ predictor.setLearningRate(this.learningRate);
+
+ if (this.getBeta() == null)
+ this.setBeta(1);
+ predictor.setBeta(this.beta);
+
+ if (this.getL1() == null) {
+ this.setL1(1);
+ }
+ predictor.setL1(this.l1);
+
+ if (this.getL2() == null)
+ this.setL2(1);
+
+ predictor.setL2(this.getL2());
+
+ if(this.getThreshold() == null) {
+ this.setThreshold(0.5);
+ }
+
+ predictor.setThreshold(this.getThreshold());
+
+ predictor.setPositiveClass(this.getPositiveClass());
+
+ //updater
+ if (this.getUpdater() == null) {
+ this.setUpdater(new AdaptiveFTRLRegularizer(this.getBeta(), this.getL1(), this.getL2()));
+ }
+ predictor.setUpdater(this.getUpdater());
+
+ //holdout validation percentage
+ if (this.getHoldoutValidationPercentage() == null) {
+ this.setHoldoutValidationPercentage(0.05d);
+ }
+ predictor.setHoldoutValidationPercentage(this.getHoldoutValidationPercentage());
+
+ sgd = StochasticGradientDescent.StochasticGradientDescentBuilder
+ .create(this.getLearningRate()) // learning rate
+ .holdoutValidationPercentage(this.getHoldoutValidationPercentage()) // 5% as validation set
+ .historySize(10_000) // keep 10k samples to compute relative improvement
+ .weightUpdater(updater) // FTRL updater
+ .progressReportInterval(1_000) // report every n iterations
+ .build();
+
+
+ //learner
+ if (this.getLearner() == null)
+ this.setLearner(new RegressionLearn(sgd, new SigmoidActivationFunction(), new LogLoss()));
+ predictor.setLearner(this.getLearner());
+
+ //model
+ ArrayList<String> classList = new ArrayList<>();
+ classList.add(this.positiveClass);
+ if (this.getModel() == null)
+ this.setModel(this.learner.train(() -> trainingDataProvider.setUpStream(this.filePath, classList)));
+ predictor.setModel(this.getModel());
+
+
+ //classifier
+ if (this.getClassifier() == null)
+ if (this.getModel() != null)
+ this.setClassifier(new RegressionClassifier(this.getModel()));
+ predictor.setClassifier(this.getClassifier());
+
+ return predictor;
+ }
+
+ private RegressionLearn getLearner() {
+ return this.learner;
+ }
+
+ private void setLearner(RegressionLearn learner) {
+ this.learner = learner;
+ }
+
+ private RegressionModel getModel() {
+ return this.model;
+ }
+
+ private void setModel(RegressionModel model) {
+ this.model = model;
+ }
+
+ private RegressionClassifier getClassifier() {
+ return this.classifier;
+ }
+
+ private void setClassifier(RegressionClassifier classifier) {
+ this.classifier = classifier;
+ }
+
+ private WeightUpdater getUpdater() {
+ return this.updater;
+ }
+
+ private void setUpdater(WeightUpdater updater) {
+ this.updater = updater;
+ }
+
+ private Double getLearningRate() {
+ return this.learningRate;
+ }
+
+ private void setLearningRate(double learningRate) {
+ this.learningRate = learningRate;
+ }
+
+ private Double getBeta() {
+ return this.beta;
+ }
+
+ private void setBeta(double beta) {
+ this.beta = beta;
+ }
+
+ private Double getL1() {
+ return this.l1;
+ }
+
+ private void setL1(double l1) {
+ this.l1 = l1;
+ }
+
+ private Double getL2() {
+ return this.l2;
+ }
+
+ private void setL2(double l2) {
+ this.l2 = l2;
+ }
+
+ private String getFilePath() {
+ return this.filePath;
+ }
+
+ private void setFilePath(String filePath) {
+ this.filePath = filePath;
+ }
+
+ private Double getHoldoutValidationPercentage() {
+ return this.holdoutValidationPercentage;
+ }
+
+ private void setHoldoutValidationPercentage(Double holdoutValidationPercentage) {
+ this.holdoutValidationPercentage = holdoutValidationPercentage;
+ }
+
+ private Double getThreshold(){ return this.threshold; }
+
+ private void setThreshold(Double threshold) { this.threshold = threshold; }
+
+ private String getPositiveClass() { return this.positiveClass; }
+
+ private void setPositiveClass(String positiveClass) { this.positiveClass = positiveClass; }
+
+ }
+
+}
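A usage sketch for the binomial variant; the training-file name and the positive-class label below are illustrative placeholders, not resources shipped with this patch:

    BinomialPredictor predictor = new BinomialPredictor.BinomialPredictorBuilder()
            .withFile("binomialTrainData.txt")  // CSV resource on the classpath
            .withPositiveClass("RDF")           // label treated as the positive class
            .withThreshold(0.5)                 // default when omitted
            .build();
    String predicted = predictor.predict(uri);  // the positive class or "NEGATIVE_CLASS"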
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialTrainDataProviderImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialTrainDataProviderImpl.java
new file mode 100644
index 000000000..470f88adc
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialTrainDataProviderImpl.java
@@ -0,0 +1,91 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.dense.SingleEntryDoubleVector;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.stream.Stream;
+
+public class BinomialTrainDataProviderImpl implements TrainingDataProvider {
+
+ private static final SingleEntryDoubleVector POSITIVE_CLASS = new SingleEntryDoubleVector(1d);
+ private static final SingleEntryDoubleVector NEGATIVE_CLASS = new SingleEntryDoubleVector(0d);
+ private FeatureVectorGenerator featureGenerator = new FeatureVectorGenerator();
+ private static final Logger LOGGER = LoggerFactory.getLogger(BinomialTrainDataProviderImpl.class);
+
+ /**
+ * Converts the data in the training file into a stream that can be fed to the learner.
+ * @param filePath path of the file containing the training data
+ * @param classList list containing the class names of the URIs
+ * @return a stream of {@link FeatureOutcomePair}s built from the training file
+ */
+ @Override
+ public Stream<FeatureOutcomePair> setUpStream(String filePath, ArrayList<String> classList) {
+ String positiveClass = classList.get(0);
+ BufferedReader br = null;
+ try {
+ //br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath)));
+ br = new BufferedReader(new InputStreamReader(getClass().getClassLoader().getResourceAsStream(filePath)
+ , Charset.defaultCharset()));
+ } catch (Exception e) {
+ LOGGER.warn("Exception happened while setting up the train data stream", e);
+ }
+ if (br == null) {
+ // reading the resource failed; return an empty stream instead of risking an NPE below
+ return Stream.empty();
+ }
+ return br.lines().map((s) -> parseFeature(s, positiveClass));
+ }
+
+ public FeatureOutcomePair parseFeature(String line, String positiveClass) {
+ String[] split = line.split(",");
+ URI furi = null;
+ try {
+ furi = new URI(split[0].replace("\"", ""));
+ } catch (URISyntaxException e) {
+ // fall back to a known well-formed URI so a malformed line still yields a training example
+ try {
+ furi = new URI("http://scoreboard.lod2.eu/data/scoreboardDataCube1.rdf");
+ } catch (URISyntaxException ex) {
+ LOGGER.warn("Exception happened while parsing the fallback URI", ex);
+ }
+ }
+ CrawleableUri uri = new CrawleableUri(furi);
+ featureGenerator.featureHashing(uri);
+ Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+ double[] doubleFeatureArray = (double[]) featureArray;
+ DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+ split[1] = split[1].replace("\"", "");
+ return new FeatureOutcomePair(features, split[1].equals(positiveClass) ? POSITIVE_CLASS : NEGATIVE_CLASS);
+ }
+
+
+ /**
+ * Used to create a file using the data from an online source
+ * @param dataUri The location of the online source
+ * @param trainFilePath The location of the local file to which the data should be written
+ */
+ public void createTrainDataFile(String dataUri, String trainFilePath) {
+ String line;
+ try (PrintWriter writer = new PrintWriter(trainFilePath, "UTF-8");
+ BufferedReader br = new BufferedReader(new InputStreamReader(new URL(dataUri).openStream()))) {
+ br.readLine(); // skip the header line of the source
+ while ((line = br.readLine()) != null) {
+ writer.println(line);
+ }
+ } catch (IOException e) {
+ LOGGER.warn("Exception happened while writing the training data file", e);
+ }
+ }
+}
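parseFeature implies the expected layout of the training file: one example per line, consisting of a double-quoted URI, a comma, and a double-quoted class label. Illustrative lines (URIs and labels invented):

    "http://dbpedia.org/data/Berlin.rdf","RDF"
    "http://example.org/index.html","HTML"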
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/FeatureVectorGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/FeatureVectorGenerator.java
new file mode 100644
index 000000000..33c58b4c0
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/FeatureVectorGenerator.java
@@ -0,0 +1,68 @@
+package org.dice_research.squirrel.predictor;
+
+import com.google.common.hash.Hashing;
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.nlp.VectorizerUtils;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+/**
+ * Creates the feature vector that is used in the prediction. It considers
+ * the intrinsic features of the URI and the intrinsic features of the
+ * referring URI.
+ * Feature hashing uses the MurmurHash3 hash function to map the token features
+ * into a fixed-length sparse vector.
+ */
+
+public class FeatureVectorGenerator {
+ public static final Logger LOGGER = LoggerFactory.getLogger(FeatureVectorGenerator.class);
+
+ /**
+ * Method to perform feature hashing to reduce the dimension of the features of a URI
+ * @param uri the URI whose feature vector is to be calculated
+ */
+ public void featureHashing(CrawleableUri uri) {
+ ArrayList<String> tokens1 = new ArrayList<>();
+
+ // Creating tokens of the current URI
+ tokenCreation(uri, tokens1);
+ CrawleableUri referUri;
+
+ // Creating tokens of the referring URI
+ if (uri.getData(Constants.REFERRING_URI) != null) {
+ referUri = new CrawleableUri((URI) uri.getData(Constants.REFERRING_URI));
+ tokenCreation(referUri, tokens1);
+ }
+ String[] tokens = tokens1.toArray(new String[0]);
+ try {
+ DoubleVector feature = VectorizerUtils.sparseHashVectorize(tokens, Hashing.murmur3_128(), () -> new SequentialSparseDoubleVector(
+ 2 << 14)); // 2 << 14 = 32768-dimensional hashed feature space
+ double[] d;
+ d = feature.toArray();
+ uri.addData(Constants.FEATURE_VECTOR, d);
+
+ } catch (Exception e) {
+ LOGGER.warn("Exception caused while adding the feature vector to the URI map", e);
+ }
+
+ }
+
+ /**
+ * Method to split the URI into tokens
+ * @param uri the URI whose tokens are to be obtained
+ * @param tokens the list in which the tokens are to be stored
+ */
+ public void tokenCreation(CrawleableUri uri, ArrayList<String> tokens) {
+ String[] uriToken;
+ uriToken = uri.getUri().toString().split("/|\\.");
+ tokens.addAll(Arrays.asList(uriToken));
+ }
+}
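To make the tokenization concrete: the split on "/|\\." in tokenCreation behaves as in this sketch (example URI invented), and the resulting tokens are murmur3-hashed into the 2 << 14 = 32768-dimensional sparse vector above:

    String[] uriToken = "http://dbpedia.org/data/Berlin.rdf".split("/|\\.");
    // -> ["http:", "", "dbpedia", "org", "data", "Berlin", "rdf"]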
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialPredictor.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialPredictor.java
new file mode 100644
index 000000000..41784add2
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialPredictor.java
@@ -0,0 +1,544 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.activation.SigmoidActivationFunction;
+import de.jungblut.math.dense.SingleEntryDoubleVector;
+import de.jungblut.math.loss.LogLoss;
+import de.jungblut.math.minimize.CostGradientTuple;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.online.minimizer.StochasticGradientDescent;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import de.jungblut.online.regression.RegressionLearner;
+import de.jungblut.online.regression.RegressionModel;
+import de.jungblut.online.regression.multinomial.MultinomialRegressionClassifier;
+import de.jungblut.online.regression.multinomial.MultinomialRegressionLearner;
+import de.jungblut.online.regression.multinomial.MultinomialRegressionModel;
+import de.jungblut.online.regularization.AdaptiveFTRLRegularizer;
+import de.jungblut.online.regularization.CostWeightTuple;
+import de.jungblut.online.regularization.L2Regularizer;
+import de.jungblut.online.regularization.WeightUpdater;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.function.IntFunction;
+
+/**
+ * A predictor that predicts the type of the URI by performing multi-class classification
+ */
+public final class MultinomialPredictor implements Predictor {
+
+ public static final Logger LOGGER = LoggerFactory.getLogger(MultinomialPredictor.class);
+ /**
+ * {@link MultinomialRegressionModel} Represents the multinomial regression model used for the prediction of the type of the URI
+ */
+ private MultinomialRegressionModel multinomialModel;
+ /**
+ * {@link MultinomialRegressionLearner} Used to train the model with training data
+ */
+ private MultinomialRegressionLearner multinomialLearner;
+ /**
+ * {@link MultinomialRegressionClassifier} Classifier for multinomial regression model.
+ * Takes a model or the atomic parts of it and predicts the outcome for a given feature.
+ *
+ */
+ private MultinomialRegressionClassifier multinomialClassifier;
+ /**
+ * {@link WeightUpdater} Used to update the weights of the predictor model used.
+ */
+ private WeightUpdater updater;
+ /**
+ * {@link RegressionLearn} Used to train the model with training data
+ */
+ private RegressionLearn learner;
+ /**
+ * Location of the file containing the training data
+ */
+ private String filepath;
+ /**
+ * The rate at which the model learns.
+ */
+ private Double learningRate;
+ /**
+ * Regularizing parameter L2
+ */
+ private Double l2;
+ /**
+ * Regularizing parameter L1
+ */
+ private Double l1;
+ /**
+ * Hyper parameter Beta
+ */
+ private Double beta;
+ /**
+ * Validation percentage which is between 0 and 1
+ */
+ private Double holdoutValidationPercentage;
+ /**
+ * A list storing the different classes of URIs obtained from the training data
+ */
+ private ArrayList<String> classList = new ArrayList<>();
+ /**
+ * Used to generate the feature vector of a URI
+ */
+ private FeatureVectorGenerator featureGenerator = new FeatureVectorGenerator();
+
+ /**
+ * Predicts the class of the given URI.
+ * @param uri the URI for which the prediction is to be made
+ * @return the predicted class of the URI
+ */
+ public String predict(CrawleableUri uri) {
+ int pred = 0;
+ String predictedClass = null;
+ try {
+ featureGenerator.featureHashing(uri);
+ Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+ double[] doubleFeatureArray = (double[]) featureArray;
+ DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+ //initialize the regression classifier with updated model and predict
+ multinomialClassifier = new MultinomialRegressionClassifier(multinomialModel);
+ DoubleVector prediction = multinomialClassifier.predict(features);
+ pred = prediction.maxIndex();
+ } catch (Exception e) {
+ LOGGER.warn("Prediction for this " + uri.getUri().toString() + " failed " , e);
+ }
+ predictedClass = this.classList.get(pred);
+ return predictedClass;
+ }
+
+ /**
+ * Updates the predictor model based on the crawl outcome of this URI.
+ * @param uri the URI whose data is used to update the model weights
+ */
+ public void weightUpdate(CrawleableUri uri) {
+ RegressionModel[] newModels = new RegressionModel[this.getMultinomialModel().getModels().length];
+ int i=0;
+ if (uri.getData(Constants.FEATURE_VECTOR) != null && uri.getData(Constants.URI_TRUE_CLASS) != null) {
+ for (RegressionModel s : this.getMultinomialModel().getModels()) {
+ Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+ double[] doubleFeatureArray = (double[]) featureArray;
+ DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+ Object real_value = uri.getData(Constants.URI_TRUE_CLASS);
+ int rv = (int) real_value;
+ DoubleVector rv_DoubleVector = new SingleEntryDoubleVector(rv);
+ DoubleVector nextExample = features;
+ FeatureOutcomePair realResult = new FeatureOutcomePair(nextExample, rv_DoubleVector); // real outcome
+ //update weights using the updated parameters
+ DoubleVector newWeights = this.updater.prePredictionWeightUpdate(realResult, s.getWeights(),learningRate,0);
+ CostGradientTuple observed = this.learner.observeExample(realResult, newWeights);
+ // calculate new weights (note that the iteration count is not used)
+ CostWeightTuple update = this.updater.computeNewWeights(newWeights, observed.getGradient(), learningRate, 0, observed.getCost());
+ // update model and classifier
+ newModels[i] = new RegressionModel(update.getWeight(), s.getActivationFunction());
+ i++;
+ }
+ //create a new multinomial model with the update weights
+ this.multinomialModel = new MultinomialRegressionModel(newModels);
+ } else
+ LOGGER.warn("Feature vector or true class of this " + uri.getUri().toString() + " is null");
+ }
+
+ public RegressionModel getModel() {
+ // this predictor keeps one regression model per class; see getMultinomialModel()
+ return null;
+ }
+
+ //Learning rate
+ public double getLearningRate() {
+ return learningRate;
+ }
+
+ public void setLearningRate(double learningRate) {
+ this.learningRate = learningRate;
+ }
+
+ //L2
+ public double getL2() {
+ return l2;
+ }
+
+ public void setL2(double l2) {
+ this.l2 = l2;
+ }
+
+ //L1
+ public double getL1() {
+ return l1;
+ }
+
+ public void setL1(double l1) {
+ this.l1 = l1;
+ }
+
+ //Beta
+ public double getBeta() {
+ return beta;
+ }
+
+ public void setBeta(double beta) {
+ this.beta = beta;
+ }
+
+ //Learner
+ public RegressionLearn getLearner() {
+ return learner;
+ }
+
+ protected void setLearner(RegressionLearn learner) {
+ this.learner = learner;
+ }
+
+ //Filepath
+ public String getFilepath() {
+ return filepath;
+ }
+
+ protected void setFilepath(String filepath) {
+ this.filepath = filepath;
+ }
+
+ //Updater
+ public WeightUpdater getUpdater() {
+ return updater;
+ }
+
+ protected void setUpdater(WeightUpdater updater) {
+ this.updater = updater;
+ }
+
+ //Multinomial Model
+ public MultinomialRegressionModel getMultinomialModel() {
+ return multinomialModel;
+ }
+
+ protected void setMultinomialModel(MultinomialRegressionModel multinomialModel) {
+ this.multinomialModel = multinomialModel;
+ }
+
+
+ //Multinomial Learner
+ public MultinomialRegressionLearner getMultinomialLearner() {
+ return multinomialLearner;
+ }
+
+ protected void setMultinomialLearner(MultinomialRegressionLearner multinomialLearner) {
+ this.multinomialLearner = multinomialLearner;
+ }
+
+ //Multinomial Classifier
+ public MultinomialRegressionClassifier getMultinomialClassifier() {
+ return multinomialClassifier;
+ }
+
+ protected void setMultinomialClassifier(MultinomialRegressionClassifier multinomialClassifier) {
+ this.multinomialClassifier = multinomialClassifier;
+ }
+
+
+ public Double getHoldoutValidationPercentage() {
+ return holdoutValidationPercentage;
+ }
+
+ private void setHoldoutValidationPercentage(Double holdoutValidationPercentage) {
+ this.holdoutValidationPercentage = holdoutValidationPercentage;
+ }
+
+ public ArrayList<String> getClassList() {
+ return this.classList;
+ }
+
+ /**
+ * A builder for the {@link MultinomialPredictor} that wires together the regression
+ * model, learner, classifier, and weight updater, falling back to default training
+ * data and hyperparameters when they are not supplied.
+ */
+ public static class MultinomialPredictorBuilder {
+
+ private TrainingDataProvider trainingDataProvider = new MultinomialTrainDataProviderImpl(); //Training Data Provider
+
+ protected StochasticGradientDescent sgd; //Minimizer
+
+ private RegressionLearn learner; //Learner
+
+ private WeightUpdater updater; //Updater
+
+ private MultinomialRegressionLearner multinomialLearner; //Multinomial learner
+
+ private MultinomialRegressionModel multinomialModel; //Multinomial model
+
+ private MultinomialRegressionClassifier multinomialClassifier; //Multinomial Classifier
+
+ private Double learningRate; //Learning rate
+
+ private Double beta; //Beta
+
+ private Double l1; //L1
+
+ private Double l2; //L2
+
+ private Double holdoutValidationPercentage; //Validation percentage which is between 0 and 1
+
+ private String filePath; //filepath to train
+
+ private ArrayList<String> classList = new ArrayList<>(); // list containing the names of the different classes of URI
+
+ public MultinomialPredictorBuilder(MultinomialRegressionLearner learner, MultinomialRegressionModel model, MultinomialRegressionClassifier classifier, WeightUpdater updater) {
+ this.multinomialLearner = learner;
+ this.multinomialModel = model;
+ this.multinomialClassifier = classifier;
+ this.updater = updater;
+ }
+
+ public MultinomialPredictorBuilder() {
+ }
+
+ public MultinomialPredictorBuilder withUpdater(WeightUpdater updater) {
+ this.setUpdater(updater);
+ return this;
+ }
+
+ public MultinomialPredictorBuilder withLearner(MultinomialRegressionLearner multinomialLearner) {
+ this.setMultinomialLearner(multinomialLearner);
+ return this;
+ }
+
+ public MultinomialPredictorBuilder withModel(MultinomialRegressionModel multinomialModel) {
+ this.setMultinomialModel(multinomialModel);
+ return this;
+ }
+
+ public MultinomialPredictorBuilder withClassifier(MultinomialRegressionClassifier multinomialClassifier) {
+ this.setMultinomialRegressionClassifier(multinomialClassifier);
+ return this;
+ }
+
+ public MultinomialPredictorBuilder withFile(String filepath) {
+ this.setFilePath(filepath);
+ return this;
+
+ }
+
+ public MultinomialPredictorBuilder withLearningRate(Double learningRate) {
+ this.setLearningRate(learningRate);
+ return this;
+ }
+
+ public MultinomialPredictorBuilder withL1(Double L1) {
+ this.setL1(L1);
+ return this;
+ }
+
+ public MultinomialPredictorBuilder withL2(Double L2) {
+ this.setL2(L2);
+ return this;
+ }
+
+ public MultinomialPredictorBuilder withBeta(Double Beta) {
+ this.setBeta(Beta);
+ return this;
+ }
+
+ IntFunction<RegressionLearner> factory = (i) -> {
+ // take care of not sharing any state from the outside, since classes are trained in parallel
+ StochasticGradientDescent minimizer = StochasticGradientDescent.StochasticGradientDescentBuilder
+ .create(0.01)
+ .holdoutValidationPercentage(0.1d)
+ .weightUpdater(new L2Regularizer(0.1))
+ .progressReportInterval(1_000)
+ .build();
+ RegressionLearner learner = new RegressionLearner(minimizer,
+ new SigmoidActivationFunction(), new LogLoss());
+ learner.setNumPasses(5);
+ return learner;
+ };
+
+ public MultinomialPredictor build() {
+ MultinomialPredictor predictor = new MultinomialPredictor();
+
+ //Learning Rate
+ if (this.getLearningRate() == null)
+ this.setLearningRate(0.7);
+ predictor.setLearningRate(this.getLearningRate());
+
+ //Beta
+ if (this.getBeta() == null)
+ this.setBeta(1);
+ predictor.setBeta(this.getBeta());
+
+ //L1
+ if (this.getL1() == null)
+ this.setL1(1);
+ predictor.setL1(this.getL1());
+
+ //L2
+ if (this.getL2() == null)
+ this.setL2(1);
+ predictor.setL2(this.getL2());
+
+ //updater
+ if (this.getUpdater() == null) {
+ this.setUpdater(new AdaptiveFTRLRegularizer(this.getBeta(), this.getL1(), this.getL2()));
+ }
+ predictor.setUpdater(this.getUpdater());
+
+ //holdout validation percentage
+ if (this.getHoldoutValidationPercentage() == null) {
+ this.setHoldoutValidationPercentage(0.05d);
+ }
+ predictor.setHoldoutValidationPercentage(this.getHoldoutValidationPercentage());
+
+ sgd = StochasticGradientDescent.StochasticGradientDescentBuilder
+ .create(this.getLearningRate()) // learning rate
+ .holdoutValidationPercentage(this.getHoldoutValidationPercentage())// 5% as validation set
+ .historySize(10_000) // keep 10k samples to compute relative improvement
+ .weightUpdater(this.getUpdater()) // FTRL updater
+ .progressReportInterval(1_000) // report every n iterations
+ .build();
+
+ //regression learner
+ if (this.getLearner() == null)
+ this.setLearner(new RegressionLearn(sgd, new SigmoidActivationFunction(), new LogLoss()));
+ predictor.setLearner(this.getLearner());
+
+ //multinomial learner
+ if (this.getMultinomialLearner() == null)
+ this.setMultinomialLearner(new MultinomialRegressionLearner(factory));
+ predictor.setMultinomialLearner(this.getMultinomialLearner());
+
+
+ //model
+ if (this.getMultinomialModel() == null) {
+ // Collect the distinct class names of the URIs from the training data
+ String line;
+ try (BufferedReader br = new BufferedReader(new InputStreamReader(getClass().getClassLoader().getResourceAsStream(filePath)
+ , Charset.defaultCharset()))) {
+ while ((line = br.readLine()) != null) {
+ String[] split = line.split(",");
+ split[1] = split[1].replace("\"", "");
+ if (!this.classList.contains(split[1])) {
+ this.classList.add(split[1]);
+ }
+ }
+ predictor.classList = this.classList;
+ } catch (Exception e) {
+ LOGGER.warn("Exception happened while finding the classes of the URIs in the training data file", e);
+ }
+ this.setMultinomialModel(multinomialLearner.train(() -> trainingDataProvider.setUpStream(this.getFilePath(), this.classList)));
+ }
+ predictor.setMultinomialModel(this.getMultinomialModel());
+
+ //classifier
+ if (this.getMultinomialClassifier() == null)
+ if (this.getMultinomialModel() != null)
+ this.setMultinomialRegressionClassifier(new MultinomialRegressionClassifier(this.getMultinomialModel()));
+ predictor.setMultinomialClassifier(this.getMultinomialClassifier());
+
+ return predictor;
+ }
+
+ //Learner
+ private RegressionLearn getLearner() {
+ return learner;
+ }
+
+ private void setLearner(RegressionLearn regressionLearn) {
+ this.learner = regressionLearn;
+ }
+
+ //Updater
+ private WeightUpdater getUpdater() {
+ return updater;
+ }
+
+ private void setUpdater(WeightUpdater updater) {
+ this.updater = updater;
+ }
+
+ //Multinomial Model
+ private MultinomialRegressionModel getMultinomialModel() {
+ return multinomialModel;
+ }
+
+ private void setMultinomialModel(MultinomialRegressionModel multinomialRegressionModel) {
+ this.multinomialModel = multinomialRegressionModel;
+ }
+
+ //Multinomial Classifier
+ private MultinomialRegressionClassifier getMultinomialClassifier() {
+ return multinomialClassifier;
+ }
+
+ private void setMultinomialRegressionClassifier(MultinomialRegressionClassifier multinomialRegressionClassifier) {
+ this.multinomialClassifier = multinomialRegressionClassifier;
+ }
+
+ //Multinomial Regression Learner
+ private MultinomialRegressionLearner getMultinomialLearner() {
+ return multinomialLearner;
+ }
+
+ private void setMultinomialLearner(MultinomialRegressionLearner multinomialRegressionLearner) {
+ this.multinomialLearner = multinomialRegressionLearner;
+ }
+
+ public Double getLearningRate() {
+ return learningRate;
+ }
+
+ private void setLearningRate(double learningRate) {
+ this.learningRate = learningRate;
+ }
+
+ private Double getBeta() {
+ return beta;
+ }
+
+ private void setBeta(double beta) {
+ this.beta = beta;
+ }
+
+ private Double getL1() {
+ return l1;
+ }
+
+ private void setL1(double l1) {
+ this.l1 = l1;
+ }
+
+ private Double getL2() {
+ return l2;
+ }
+
+ private void setL2(double l2) {
+ this.l2 = l2;
+ }
+
+ private String getFilePath() {
+ return filePath;
+ }
+
+ private void setFilePath(String filePath) {
+ this.filePath = filePath;
+ }
+
+ private Double getHoldoutValidationPercentage() {
+ return holdoutValidationPercentage;
+ }
+
+ private void setHoldoutValidationPercentage(Double holdoutValidationPercentage) {
+ this.holdoutValidationPercentage = holdoutValidationPercentage;
+ }
+
+ private ArrayList<String> getClassList(){
+ return this.classList;
+ }
+
+ }
+
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialTrainDataProviderImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialTrainDataProviderImpl.java
new file mode 100644
index 000000000..d8056dc08
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialTrainDataProviderImpl.java
@@ -0,0 +1,98 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.dense.DenseDoubleVector;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.stream.Stream;
+
+public class MultinomialTrainDataProviderImpl implements TrainingDataProvider {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(MultinomialTrainDataProviderImpl.class);
+ private FeatureVectorGenerator featureGenerator = new FeatureVectorGenerator();
+
+ /**
+ * Converts the data in the training file into a stream that can be fed into the learner.
+ * Each line is expected to hold a quoted URI and its class label, e.g.
+ * {@code "http://linkedscotland.org/sparql","SPARQL"}.
+ * @param filePath path of the file containing the training data
+ * @param classList list containing the class names of the URIs
+ * @return a stream of feature/outcome pairs built from the training data
+ */
+ @Override
+ public Stream<FeatureOutcomePair> setUpStream(String filePath, ArrayList<String> classList) {
+ BufferedReader br;
+ try {
+ br = new BufferedReader(new InputStreamReader(getClass().getClassLoader().getResourceAsStream(filePath)
+ , Charset.defaultCharset()));
+ }catch (Exception e){
+ LOGGER.error("Could not open the training data file " + filePath, e);
+ return Stream.empty();
+ }
+ return br.lines().map((s) -> parseFeature(s, classList));
+ }
+
+ public FeatureOutcomePair parseFeature(String line, ArrayList<String> classList) {
+ DoubleVector[] classes = new DoubleVector[classList.size()];
+
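+ // Build one one-hot outcome vector per class: vector i is all zeros except a 1 at index i.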
+ for (int i = 0; i < classes.length; i++) {
+ classes[i] = new DenseDoubleVector(classes.length);
+ classes[i].set(i, 1d);
+ }
+ String[] split = line.split(",");
+ URI furi = null;
+ try {
+ furi = new URI(split[0].replace("\"", ""));
+ } catch (URISyntaxException e) {
+ try {
+ furi = new URI("http://scoreboard.lod2.eu/data/scoreboardDataCube.rdf");
+ } catch (URISyntaxException ex) {
+ LOGGER.warn("Exception happened while parsing train data file", ex);
+ }
+ }
+ CrawleableUri uri = new CrawleableUri(furi);
+ featureGenerator.featureHashing(uri);
+ Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+ double[] doubleFeatureArray = (double[]) featureArray;
+ DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+ split[1] = split[1].replace("\"", "");
+ DoubleVector predVector;
+ if(classList.indexOf(split[1]) != -1)
+ predVector = classes[classList.indexOf(split[1])];
+ else
+ predVector = classes[0];
+
+ return new FeatureOutcomePair(features, predVector);
+ }
+
+ /**
+ * Used to create a file using the data from an online source
+ * @param dataUri The location of the online source
+ * @param trainFilePath The location of the local file to which the data should be written
+ */
+ public void createTrainDataFile(String dataUri, String trainFilePath) {
+ String line;
+ try (PrintWriter writer = new PrintWriter(trainFilePath, "UTF-8");
+ BufferedReader br = new BufferedReader(new InputStreamReader(new URL(dataUri).openStream()))) {
+ br.readLine(); // skip the header line of the online source
+ while((line = br.readLine()) != null){
+ writer.println(line);
+ }
+ } catch (IOException e) {
+ LOGGER.warn("Exception while creating the training data file", e);
+ }
+ }
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/Predictor.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/Predictor.java
new file mode 100644
index 000000000..5ec08e863
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/Predictor.java
@@ -0,0 +1,33 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.online.regression.RegressionModel;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+
+/**
+ * Interface of an online learner predicting the URI type.
+ * It covers building a learner, training it, and predicting the type of a URI.
+ *
+ */
+
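+// A minimal usage sketch (assuming a concrete implementation such as MultinomialPredictor):
+//   Predictor predictor = new MultinomialPredictor();
+//   String predictedClass = predictor.predict(curi);  // e.g. "SPARQL", "DUMP" or "CKAN"
+//   predictor.weightUpdate(curi);                     // online update once the true class is known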
+public interface Predictor {
+ /**
+ * Returns the predicted type of the given URI. The underlying prediction value lies
+ * between 0 and 1; e.g., for RDF type prediction, the closer the value is to 1,
+ * the more likely the URI is of RDF type.
+ *
+ * @param uri
+ * {@link CrawleableUri} URI whose class is to be predicted.
+ *
+ * @return the predicted class.
+ */
+ String predict(CrawleableUri uri);
+ /**
+ * Updates the weights of the model. It combines the predicted value and the true label
+ * with the feature vector stored in the URI's data map to compute the new weights.
+ *
+ * @param uri
+ * {@link CrawleableUri} URI whose feature vector is used to update weights
+ */
+ void weightUpdate(CrawleableUri uri);
+
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/RegressionLearn.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/RegressionLearn.java
new file mode 100644
index 000000000..1d80e452b
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/RegressionLearn.java
@@ -0,0 +1,40 @@
+package org.dice_research.squirrel.predictor;
+
+import com.google.common.base.Preconditions;
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.activation.ActivationFunction;
+import de.jungblut.math.dense.SingleEntryDoubleVector;
+import de.jungblut.math.loss.LossFunction;
+import de.jungblut.math.minimize.CostGradientTuple;
+import de.jungblut.online.minimizer.StochasticMinimizer;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import de.jungblut.online.regression.RegressionLearner;
+
+
+public class RegressionLearn extends RegressionLearner {
+
+ private final ActivationFunction activationFunction;
+ private final LossFunction lossFunction;
+
+ public RegressionLearn(StochasticMinimizer minimizer,
+ ActivationFunction activationFunction, LossFunction lossFunction) {
+ super(minimizer, activationFunction, lossFunction);
+ this.activationFunction = Preconditions.checkNotNull(activationFunction,
+ "activation function");
+ this.lossFunction = Preconditions.checkNotNull(lossFunction,
+ "loss function");
+ }
+
+ public CostGradientTuple observeExample(FeatureOutcomePair next, DoubleVector weights) {
+
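+ // Forward pass: hypothesis = activation(w . x); the loss function then supplies both
+ // the scalar cost and the gradient consumed by the stochastic minimizer.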
+ DoubleVector hypothesis = new SingleEntryDoubleVector(this.activationFunction.apply(next.getFeature().dot(weights)));
+ double cost = this.lossFunction.calculateLoss(next.getOutcome(), hypothesis);
+ DoubleVector gradient = this.lossFunction.calculateGradient(next.getFeature(), next.getOutcome(), hypothesis);
+ return new CostGradientTuple(cost, gradient);
+ }
+}
+
+
+
+
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/TrainingDataProvider.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/TrainingDataProvider.java
new file mode 100644
index 000000000..57c658755
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/TrainingDataProvider.java
@@ -0,0 +1,21 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.online.ml.FeatureOutcomePair;
+
+import java.util.ArrayList;
+import java.util.stream.Stream;
+
+/**
+ * Interface to provide training data to the predictor
+ */
+
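+// Implementations (e.g. MultinomialTrainDataProviderImpl) map one "URI","label" line of the
+// training file to a FeatureOutcomePair, so a learner can consume the stream directly:
+//   learner.train(() -> provider.setUpStream("multiNomialTrainData.txt", classList));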
+public interface TrainingDataProvider{
+
+ /**
+ * Takes the file containing the training data and converts it into a stream that a predictor can train on.
+ * @param filePath path of the file containing the training data
+ * @param classList list containing the class names of the URIs
+ * @return a stream of training data
+ */
+ Stream<FeatureOutcomePair> setUpStream(String filePath, ArrayList<String> classList);
+}
diff --git a/squirrel.frontier/src/main/resources/binomialTrainData.txt b/squirrel.frontier/src/main/resources/binomialTrainData.txt
new file mode 100644
index 000000000..8e8983574
--- /dev/null
+++ b/squirrel.frontier/src/main/resources/binomialTrainData.txt
@@ -0,0 +1,228 @@
+"http://api.kasabi.com/dataset/jisc-cetis-project-directory/apis/sparql","sparql"
+"http://ipi.bio2rdf.org/sparql","sparql"
+"http://skos.um.es/unesco6/unesco6.ttl","dereferenceable"
+"http://data.kasabi.com/dataset/prelinger-archives/film//1935_comedy_mallinckrodt.rdf","dereferenceable"
+"http://red.gnoss.com/en/community/agrega/Agrega-explorer/tag/tema?rdf","dereferenceable"
+"http://api.kasabi.com/dataset/prelinger-archives/apis/sparql","sparql"
+"http://quebec.bio2rdf.org/download/data/hgnc/hgnc.n3.gz","dereferenceable"
+"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/chebi/chebi.n3.gz","dereferenceable"
+"http://bio2rdf.org/interpro:ipr000700","dereferenceable"
+"http://api.talis.com/stores/climb/services/sparql","sparql"
+"http://bio2rdf.org/rdfxml/interpro:ipr013315","dereferenceable"
+"http://api.kasabi.com/dataset/prelinger-archives/apis/sparql","sparql"
+"http://quebec.bio2rdf.org/download/data/ncbi/ncbi.omim.n3.gz","dereferenceable"
+"http://api.kasabi.com/dataset/renewable-energy-generators/apis/sparql","sparql"
+"http://quebec.bio2rdf.org/download/data/ncbi/ncbi.homologene.n3.gz","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/hgnc/hgnc.n3.gz","dereferenceable"
+"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://nasa.dataincubator.org/person/eugeneandrewcernan","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/kegg/kegg.cpd.n3.gz","dereferenceable"
+"http://api.talis.com/stores/theviewfrom/services/sparql","sparql"
+"http://linkedmanchester.org/sparql","sparql"
+"http://data.kasabi.com/dataset/discogs/release/2129148/track/2.ttl","dereferenceable"
+"http://datendienst.d-nb.de/cgi-bin/mabit.pl?cmd=fetch&userID=opendata&pass=opendata&mabheft=Title.ttl.gz","dereferenceable"
+"http://fanhu.bz/schema#","dereferenceable"
+"http://data.kasabi.com/dataset/foodista/recipe/FSZG4354.rdf","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/affymetrix/affymetrix.n3.gz","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/kegg/kegg.rn.n3.gz","dereferenceable"
+"http://ec.bio2rdf.org/sparql","sparql"
+"http://linkedscotland-downloads.s3.amazonaws.com/pupils.ttl.gz","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/obo/obo.n3.gz","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/kegg/kegg.gl.n3.gz","dereferenceable"
+"http://www.openmobilenetwork.org/page/cell55961_12294_262_1.rdf","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/kegg/kegg.path.n3.gz","dereferenceable"
+"http://linkedscotland-downloads.s3.amazonaws.com/sns-geography.ttl.gz","dereferenceable"
+"https://commondatastorage.googleapis.com/ckannet-storage/2012-03-14T021756/metadata.dc.rdf","dereferenceable"
+"http://api.kasabi.com/dataset/discogs/apis/sparql","sparql"
+"http://ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/taxonomy.rdf.gz","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/biocyc/biocyc.n3.gz","dereferenceable"
+"http://statistics.data.gov.uk/def/administrative-geography/MetropolitanCounty","dereferenceable"
+"http://purl.org/weso/datasets/nomenclator/asturias/2010/nomenclator-asturias-2010.ttl","dereferenceable"
+"http://lod2.eu/model/export/?m=http%3A%2F%2Flod2.eu%2F&f=rdfxml","dereferenceable"
+"http://linkedmanchester.org/resources/linkedmanchester.org/id/buses/route/125.rdf","dereferenceable"
+"https://commondatastorage.googleapis.com/ckannet-storage/2012-05-09T135942/iris2-linkedData-v0.2.rdf","dereferenceable"
+"http://www.languagelibrary.eu/owl/simple/simple_ontology.owl","dereferenceable"
+"http://kaiko.getalp.org/dbnary/static/lemon/latest/de_dbnary_lemon.ttl","dereferenceable"
+"http://mlode.nlp2rdf.org/downloads/ids.nt.gz","dereferenceable"
+"http://www.languagelibrary.eu/owl/simple/lemonsource/simple_lemonlistindividuals.owl","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/ncbi/ncbi.geneid.n3.gz","dereferenceable"
+"http://s4.semanticscience.org/bio2rdf_download/rdf/current/ctd/","dereferenceable"
+"http://ckan.net/storage/f/file/text-turtle-2304325680559-1346754246694","dereferenceable"
+"http://rdf.geospecies.org/ont/geospecies.owl","dereferenceable"
+"http://agalpha.mathbiol.org:10035/repositories/tcga","dereferenceable"
+"http://dbpedia.bio2rdf.org/sparql","sparql"
+"http://www.languagelibrary.eu/owl/simple/inds/simplelistindividuals.owl","dereferenceable"
+"http://glottolog.org/downloadarea/references.rdf.zip","dereferenceable"
+"http://zbw.eu/stw/versions/latest/download/download.php?filename=stw.rdf.zip","dereferenceable"
+"http://data.colinda.org/conference.php?id=1","dereferenceable"
+"http://api.talis.com/stores/jgoodwin-genealogy/services/sparql","sparql"
+"http://code.google.com/p/linkedrecipes/downloads/detail?name=linked-recipes-schema-0.1.ttl&can=2&q=","dereferenceable"
+"http://pokedex.dataincubator.org/pokemon/garchomp","dereferenceable"
+"http://moseley.dataincubator.org/artist/beth-orton","dereferenceable"
+"http://data.bib.uni-mannheim.de/sparql","sparql"
+"http://kasabi.com/api/sparql-endpoint-foodista","sparql"
+"http://api.kasabi.com/dataset/foodista/apis/sparql","sparql"
+"http://api.talis.com/stores/wordnet/services/sparql","sparql"
+"http://data.uni-muenster.de/php/sparql","sparql"
+"http://biocarta.bio2rdf.org/sparql","sparql"
+"http://www.languagelibrary.eu/owl/simple/inds/simpleallindividuals.owl","dereferenceable"
+"http://cas.bio2rdf.org/sparql","sparql"
+"http://www.languagelibrary.eu/owl/simple/lemonsource/simple_lemonallindividuals.owl","dereferenceable"
+"http://quebec.bio2rdf.org/download/data/go/go.n3.gz","dereferenceable"
+"http://api.kasabi.com/dataset/yahoo-geoplanet/apis/sparql","sparql"
+"http://quebec.bio2rdf.org/download/data/mgi/mgi.n3.gz","dereferenceable"
+"http://opendatacommunities-downloads.s3.amazonaws.com/imd-2010-imd-rank.ttl.zip","dereferenceable"
+"http://data.kasabi.com/dataset/renewable-energy-generators/generator/R00001NANI.rdf","dereferenceable"
+"https://ckannet-storage.commondatastorage.googleapis.com/2012-11-21T021132/ontology.rdf","dereferenceable"
+"http://linkedscotland.org/sparql","sparql"
+"http://api.kasabi.com/dataset/renewable-energy-generators/apis/sparql","sparql"
+"http://linkedscotland.org/sparql","sparql"
+"http://dimitros.net/query.sparql","sparql"
+"http://linkedscotland.org/sparql","sparql"
+"http://api.kasabi.com/dataset/nasa/apis/sparql","sparql"
+"http://api.talis.com/stores/airports/services/sparql","sparql"
+"http://api.talis.com/stores/fanhubz/services/sparql","sparql"
+"http://api.talis.com/stores/theviewfrom/services/sparql","sparql"
+"http://linkedmanchester.org/sparql","sparql"
+"http://webenemasuno.linkeddata.es/source/rdf/data.zip","dereferenceable"
+"http://chembl.bio2rdf.org/sparql","sparql"
+"http://api.kasabi.com/dataset/jisc-cetis-project-directory/apis/sparql","sparql"
+"http://labs.systemone.at/wikipedia3/enwiki/20060326/enwiki-20060326.rdf.gz","dereferenceable"
+"http://givingsense.eu/frscol/FrSchoolSystem/","dereferenceable"
+"https://data.seattle.gov/api/views/w3y2-x633/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://spcdata.digitpa.gov.it/data/aoo.ttl","dereferenceable"
+"http://gadm.geovocab.org/id/0_10","dereferenceable"
+"http://www.ebi.ac.uk/rdf/services/atlas/sparql","sparql"
+"https://data.cityofnewyork.us/api/views/tar7-vww3/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.wa.gov/api/views/tmay-2i9v/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://www.nosdonnees.fr/storage/f/2013-03-25T151047/annu_lirmm_0.1.rdf","dereferenceable"
+"http://spcdata.digitpa.gov.it/data/uo.ttl","dereferenceable"
+"https://data.wa.gov/api/views/kbv8-aawq/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://dati.camera.it/ocd/files/persona.turtle.gz","dereferenceable"
+"https://data.maryland.gov/api/views/bwyv-uyh2/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://github.com/HsH-Bibliothek/geodata","dereferenceable"
+"https://data.lacity.org/api/views/4ee5-wmby/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.ok.gov/api/views/a2a7-88yx/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.ok.gov/api/views/iz5g-gb92/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/drh3-e2fd/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://status.scoffoni.net/index.php/pscoffoni/foaf","dereferenceable"
+"http://srcmf.org/public/alexis-rdf.zip","dereferenceable"
+"https://data.mo.gov/api/views/x8fg-yyye/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://statusnet.sebseb01.net/sebseb01/foaf","dereferenceable"
+"https://data.seattle.gov/api/views/uyyd-8gak/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/s8jv-f44n/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.kingcounty.gov/api/views/er52-nehu/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://data.dm2e.eu","dereferenceable"
+"http://dati.camera.it/sparql","sparql"
+"http://data.cnr.it/sparql/","sparql"
+"http://oracle.skilledtests.com/chomskybot/foaf","dereferenceable"
+"http://somsants.net/maulet1714/foaf","dereferenceable"
+"http://status.soucy.cc/hs0ucy/foaf","dereferenceable"
+"http://spip.org/spipmedias/foaf","dereferenceable"
+"http://spraci.org/michaelmd/foaf","dereferenceable"
+"https://data.illinois.gov/api/views/f7nd-jj28/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://dati.opendataground.it:80/comunealbanolaziale/906.rdf","dereferenceable"
+"https://data.maryland.gov/api/views/kicx-k4rc/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/65z6-rsii/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.medicare.gov/api/views/nrth-mfg3/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.ok.gov/api/views/fwkc-astr/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.baltimorecity.gov/api/views/nxbm-dfav/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.wa.gov/api/views/ak95-mjh9/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.maryland.gov/api/views/pdvh-tf2u/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.hawaii.gov/api/views/hc7x-8745/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://health.data.ny.gov/api/views/hbu9-xsrx/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.ok.gov/api/views/pyir-desi/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.medicare.gov/api/views/dgck-syfz/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/kpav-sd4t/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://datacatalog.cookcountyil.gov/api/views/5px6-amgc/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://health.data.ny.gov/api/views/ivw2-k53g/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://dati.opendataground.it:80/comunealbanolaziale/907.rdf","dereferenceable"
+"https://data.lacity.org/api/views/3gwn-arjr/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.maryland.gov/api/views/xyrh-5e77/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/uvy2-xbnp/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.baltimorecity.gov/api/views/2j28-xzd7/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.sfgov.org/api/views/hbza-6v77/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.seattle.gov/api/views/egc4-d24i/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/u553-m549/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.illinois.gov/api/views/uxhq-ykba/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/kjxa-7ccf/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.baltimorecity.gov/api/views/jcci-nzfy/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/p94q-8hxh/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/p424-amsu/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.illinois.gov/api/views/t224-vrp2/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.montgomerycountymd.gov/api/views/5pue-gfbe/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.maryland.gov/api/views/xedu-p97g/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://blogs.bootsnall.com/luka/index.rdf","dereferenceable"
+"https://data.medicare.gov/api/views/k2ze-bqvw/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.medicare.gov/api/views/ytf2-4ept/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.ok.gov/api/views/jguv-se39/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/arq3-7z49/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.sfgov.org/api/views/88g8-5mnd/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.seattle.gov/api/views/7ais-f98f/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.maryland.gov/api/views/mk5a-nf44/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/hh8v-7m7u/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://cb.semsol.org/sparql?query=dump","dereferenceable"
+"https://data.maryland.gov/api/views/cmwa-gxtm/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.baltimorecity.gov/api/views/xmpa-487w/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.baltimorecity.gov/api/views/rb22-mgti/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://linguistic.linkeddata.es/id/apertium/lexiconEN","dereferenceable"
+"https://data.sfgov.org/api/views/s593-yv8k/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.illinois.gov/api/views/92jh-73bc/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.illinois.gov/api/views/mszz-27vx/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://dati.opendataground.it:80/comunealbanolaziale/979.rdf","dereferenceable"
+"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://datahub.io/dataset/hellenic-fire-brigade","dereferenceable"
+"https://data.cityofchicago.org/api/views/v7ui-k59z/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.sfgov.org/api/views/v94x-pf9r/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.seattle.gov/api/views/fy3x-rf3i/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"https://data.seattle.gov/api/views/7svg-ds5z/rows.rdf?accessType=DOWNLOAD","dereferenceable"
+"http://www.data.gov/semantic/data/alpha/1564/dataset-1564.rdf.gz","dereferenceable"
diff --git a/squirrel.frontier/src/main/resources/multiNomialTrainData.txt b/squirrel.frontier/src/main/resources/multiNomialTrainData.txt
new file mode 100644
index 000000000..1393a14b3
--- /dev/null
+++ b/squirrel.frontier/src/main/resources/multiNomialTrainData.txt
@@ -0,0 +1,292 @@
+"http://api.kasabi.com/dataset/jisc-cetis-project-directory/apis/sparql","SPARQL"
+"http://ipi.bio2rdf.org/sparql","SPARQL"
+"http://api.kasabi.com/dataset/prelinger-archives/apis/sparql","SPARQL"
+"http://api.talis.com/stores/schemapedia/services/sparql","SPARQL"
+"http://zbw.eu/beta/sparql/stw","SPARQL"
+"http://api.kasabi.com/api/sparql-endpoint-near","SPARQL"
+"http://api.talis.com/stores/pokedex/services/sparql","SPARQL"
+"http://api.talis.com/stores/climb/services/sparql","SPARQL"
+"http://pdb.bio2rdf.org/sparql","SPARQL"
+"http://api.talis.com/stores/datagovuk/services/sparql","SPARQL"
+"http://api.kasabi.com/dataset/bricklink/apis/sparql","SPARQL"
+"http://api.talis.com/stores/moseley/services/sparql","SPARQL"
+"http://api.talis.com/stores/theviewfrom/services/sparql","SPARQL"
+"http://linkedmanchester.org/sparql","SPARQL"
+"http://data.bib.uni-mannheim.de/sparql","SPARQL"
+"http://kasabi.com/api/sparql-endpoint-foodista","SPARQL"
+"http://api.kasabi.com/dataset/foodista/apis/sparql","SPARQL"
+"http://api.talis.com/stores/wordnet/services/sparql","SPARQL"
+"http://data.uni-muenster.de/php/sparql","SPARQL"
+"http://biocarta.bio2rdf.org/sparql","SPARQL"
+"http://linkedscotland.org/sparql","SPARQL"
+"http://api.kasabi.com/dataset/nasa/apis/sparql","SPARQL"
+"http://api.talis.com/stores/airports/services/sparql","SPARQL"
+"http://api.talis.com/stores/fanhubz/services/sparql","SPARQL"
+"http://api.talis.com/stores/theviewfrom/services/sparql","SPARQL"
+"http://linkedmanchester.org/sparql","SPARQL"
+"http://lab3.libris.kb.se/sparql","SPARQL"
+"http://lod.ac/bdls/sparql","SPARQL"
+"http://eris.okfn.org/sparql","SPARQL"
+"http://corpora.nlp2rdf.org/sparql","SPARQL"
+"http://data.cnr.it/sparql-proxy/","SPARQL"
+"http://data.ox.ac.uk/sparql/","SPARQL"
+"http://wiki.rkbexplorer.com/sparql/","SPARQL"
+"http://lobid.org/sparql/","SPARQL"
+"http://fao.270a.info/sparql","SPARQL"
+"http://dbtune.org/myspace/sparql/","SPARQL"
+"http://gutenberg.dcs.fi.uva.es/~bhscmcyt/census/sparql_en.php","SPARQL"
+"http://api.kasabi.com/dataset/pali-english-lexicon/apis/sparql","SPARQL"
+"http://ibm.rkbexplorer.com/sparql/","SPARQL"
+"http://api.kasabi.com/dataset/pali-english-lexicon/apis/sparql","SPARQL"
+"http://drugbank.bio2rdf.org/sparql","SPARQL"
+"http://sparql.wikipathways.org/","SPARQL"
+"http://ndb.publink.lod2.eu/sparql","SPARQL"
+"http://eurecom.rkbexplorer.com/sparql/","SPARQL"
+"http://eur-lex.publicdata.eu/sparql","SPARQL"
+"http://data.archiveshub.ac.uk/dump/linkedarchiveshub.zip","DUMP"
+"http://pisa.rkbexplorer.com/sparql/","SPARQL"
+"http://resex.rkbexplorer.com/sparql/","SPARQL"
+"http://www4.wiwiss.fu-berlin.de/euraxess/sparql","SPARQL"
+"http://datos.bne.es/sparql","SPARQL"
+"http://spending.lichfielddc.gov.uk/sparql","SPARQL"
+"http://ccny-cuny.eagle-i.net/sparqler/sparql","SPARQL"
+"http://upr.eagle-i.net/sparqler/sparql","SPARQL"
+"http://services.data.gov.uk/research/sparql","SPARQL"
+"http://el.dbpedia.org/sparql","SPARQL"
+"http://n-lex.publicdata.eu/sparql","SPARQL"
+"http://govwild.org/sparql","SPARQL"
+"http://setaria.oszk.hu/sparql","SPARQL"
+"http://cb.semsol.org/sparql","SPARQL"
+"http://howard.eagle-i.net/sparqler/sparql","SPARQL"
+"http://foreign.rkbexplorer.com/sparql","SPARQL"
+"http://revyu.com/sparql","SPARQL"
+"http://transparency.270a.info/sparql","SPARQL"
+"http://photos.rkbexplorer.com/sparql","SPARQL"
+"http://os.rkbexplorer.com/sparql/","SPARQL"
+"http://libver.math.auth.gr/sparql","SPARQL"
+"http://sparql.linkedopendata.it/los","SPARQL"
+"http://lisbon.rkbexplorer.com/sparql","SPARQL"
+"http://www4.wiwiss.fu-berlin.de/gutendata/sparql","SPARQL"
+"http://unodc.publicdata.eu/sparql","SPARQL"
+"http://oecd.270a.info/sparql","SPARQL"
+"http://resource.geolba.ac.at/PoolParty/sparql/GeologicUnit","SPARQL"
+"http://resource.geolba.ac.at/PoolParty/sparql/lithology","SPARQL"
+"http://www4.wiwiss.fu-berlin.de/diseasome/sparql","SPARQL"
+"http://eurostat.linked-statistics.org/sparql","SPARQL"
+"http://data.ox.ac.uk/sparql/","SPARQL"
+"http://services.data.gov.uk/education/sparql","SPARQL"
+"http://www4.wiwiss.fu-berlin.de/cordis/sparql","SPARQL"
+"http://xula.eagle-i.net/sparqler/sparql","SPARQL"
+"https://eagle-i.ea.vanderbilt.edu/sparqler/sparql","SPARQL"
+"http://www4.wiwiss.fu-berlin.de/medicare/sparql","SPARQL"
+"http://sparql.linkedopendata.it/grrt","SPARQL"
+"http://lod.euscreen.eu/sparql","SPARQL"
+"http://resrev.ilrt.bris.ac.uk/data-server-workshop/sparql","SPARQL"
+"http://api.talis.com/stores/pbac/services/sparql","SPARQL"
+"http://soa4all.isoco.net/luf/sparql","SPARQL"
+"http://dewey.info/sparql.php","SPARQL"
+"http://api.talis.com/stores/mesh-norwegian/services/sparql","SPARQL"
+"http://newcastle.rkbexplorer.com/sparql/","SPARQL"
+"http://rdf.muninn-project.org/sparql","SPARQL"
+"http://cultura.linkeddata.es/sparql","SPARQL"
+"http://platform.uberblic.org/api/v1/sparql","SPARQL"
+"http://cr.eionet.europa.eu/sparql","SPARQL"
+"http://tkm.kiom.re.kr/ontology/sparql","SPARQL"
+"http://wordnet.rkbexplorer.com/sparql/","SPARQL"
+"http://miuras.inf.um.es/sparql","SPARQL"
+"http://epsrc.rkbexplorer.com/sparql","SPARQL"
+"http://api.kasabi.com/dataset/ecco-tcp-eighteenth-century-collections-online-texts/apis/sparql","SPARQL"
+"http://sparql.data.southampton.ac.uk/","SPARQL"
+"http://cdrewu.eagle-i.net/sparqler/sparql","SPARQL"
+"https://data.lacity.org/api/views/4ee5-wmby/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.ok.gov/api/views/a2a7-88yx/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.ok.gov/api/views/iz5g-gb92/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/drh3-e2fd/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.seattle.gov/api/views/uyyd-8gak/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/s8jv-f44n/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.kingcounty.gov/api/views/er52-nehu/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.illinois.gov/api/views/f7nd-jj28/rows.rdf?accessType=DOWNLOAD","DUMP"
+"http://dati.opendataground.it:80/comunealbanolaziale/906.rdf","DUMP"
+"https://data.maryland.gov/api/views/kicx-k4rc/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/65z6-rsii/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.medicare.gov/api/views/nrth-mfg3/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.ok.gov/api/views/fwkc-astr/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.baltimorecity.gov/api/views/nxbm-dfav/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.wa.gov/api/views/ak95-mjh9/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.maryland.gov/api/views/pdvh-tf2u/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.hawaii.gov/api/views/hc7x-8745/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://health.data.ny.gov/api/views/hbu9-xsrx/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.ok.gov/api/views/pyir-desi/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.medicare.gov/api/views/dgck-syfz/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/kpav-sd4t/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://datacatalog.cookcountyil.gov/api/views/5px6-amgc/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://health.data.ny.gov/api/views/ivw2-k53g/rows.rdf?accessType=DOWNLOAD","DUMP"
+"http://dati.opendataground.it:80/comunealbanolaziale/907.rdf","DUMP"
+"https://data.lacity.org/api/views/3gwn-arjr/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.maryland.gov/api/views/xyrh-5e77/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofchicago.org/api/views/uvy2-xbnp/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.baltimorecity.gov/api/views/2j28-xzd7/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.sfgov.org/api/views/hbza-6v77/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.seattle.gov/api/views/egc4-d24i/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/u553-m549/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.illinois.gov/api/views/uxhq-ykba/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/kjxa-7ccf/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.baltimorecity.gov/api/views/jcci-nzfy/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/p94q-8hxh/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/p424-amsu/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.illinois.gov/api/views/t224-vrp2/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.montgomerycountymd.gov/api/views/5pue-gfbe/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.maryland.gov/api/views/xedu-p97g/rows.rdf?accessType=DOWNLOAD","DUMP"
+"http://blogs.bootsnall.com/luka/index.rdf","DUMP"
+"https://data.medicare.gov/api/views/k2ze-bqvw/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.medicare.gov/api/views/ytf2-4ept/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.ok.gov/api/views/jguv-se39/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/arq3-7z49/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.sfgov.org/api/views/88g8-5mnd/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.seattle.gov/api/views/7ais-f98f/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.maryland.gov/api/views/mk5a-nf44/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/hh8v-7m7u/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.sfgov.org/api/views/s593-yv8k/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.illinois.gov/api/views/92jh-73bc/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.illinois.gov/api/views/mszz-27vx/rows.rdf?accessType=DOWNLOAD","DUMP"
+"http://dati.opendataground.it:80/comunealbanolaziale/979.rdf","DUMP"
+"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.seattle.gov/api/views/jedg-8zvw/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.seattle.gov/api/views/kdjv-k5qf/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/mnz3-dyi8/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/mdbu-nrqn/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.medicare.gov/api/views/qqc4-6tc7/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/dvzp-h4k9/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.medicare.gov/api/views/zqjn-m8m8/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.medicare.gov/api/views/9n3s-kdb3/rows.rdf?accessType=DOWNLOAD","DUMP"
+"http://dati.opendataground.it:80/comunealbanolaziale/908.rdf","DUMP"
+"https://data.kingcounty.gov/api/views/dkxx-z4fb/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/qhen-5rve/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/jfju-ynrr/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.medicare.gov/api/views/rs6n-9qwg/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofchicago.org/api/views/5yjb-v3mj/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/x2hp-8ukt/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.ny.gov/api/views/fjce-ze3t/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/cxzk-qz9w/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/ftxv-d5ix/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.hawaii.gov/api/views/jzyk-q3tp/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.medicare.gov/api/views/qd2y-qcgs/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.hawaii.gov/api/views/wwsw-d6qv/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.oregon.gov/api/views/edj7-vxdr/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofchicago.org/api/views/3qdj-cqb8/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.oregon.gov/api/views/8s3k-ygh2/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.ok.gov/api/views/js93-d7pp/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.baltimorecity.gov/api/views/782b-zpd7/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/upwt-zvh3/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.cityofnewyork.us/api/views/tbf6-u8ea/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.maryland.gov/api/views/3bkz-cttp/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.sfgov.org/api/views/yitu-d5am/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.baltimorecity.gov/api/views/fswi-8fjy/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.illinois.gov/api/views/sp57-w96j/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://datacatalog.cookcountyil.gov/api/views/excn-ffg4/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.kingcounty.gov/api/views/yaai-7frk/rows.rdf?accessType=DOWNLOAD","DUMP"
+"https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.rdf?accessType=DOWNLOAD","DUMP"
+"http://suche.transparenz.hamburg.de/","CKAN"
+"http://www.opendata.provincia.roma.it/","CKAN"
+"http://opendata.cmt.es","CKAN"
+"http://data.gov.uk/","CKAN"
+"http://www.dati.gov.it/catalog/","CKAN"
+"http://www.daten.rlp.de/","CKAN"
+"http://opendata.aragon.es/catalogo/","CKAN"
+"http://rotterdamopendata.nl/","CKAN"
+"http://data.gov.ie/","CKAN"
+"http://www.opendata.admin.ch/en/","CKAN"
+"http://cz.ckan.net/en/","CKAN"
+"http://data.amsterdamopendata.nl/","CKAN"
+"http://data.opendataforum.info/","CKAN"
+"http://oppnadata.se/","CKAN"
+"https://www.data.gv.at/katalog/","CKAN"
+"https://www.govdata.de/ckan/","CKAN"
+"http://opengov.es/","CKAN"
+"http://opendata.hu/","CKAN"
+"https://offenedaten.de/","CKAN"
+"http://ckan.data.graz.gv.at/","CKAN"
+"http://ckan.data.linz.gv.at/","CKAN"
+"http://www.nosdonnees.fr/","CKAN"
+"http://datosabiertos.malaga.eu/","CKAN"
+"http://dati.trentino.it/","CKAN"
+"http://dati.veneto.it/","CKAN"
+"http://data.kk.dk/","CKAN"
+"http://portal.openbelgium.be/","CKAN"
+"http://www.odaa.dk/","CKAN"
+"http://ckan.opendatacanarias.es/","CKAN"
+"http://it.ckan.net/","CKAN"
+"http://rs.ckan.net/","CKAN"
+"http://www.hri.fi/en/","CKAN"
+"http://data.bris.ac.uk/data/","CKAN"
+"http://data.london.gov.uk","CKAN"
+"https://open-data.europa.eu/en/data/","CKAN"
+"http://data.upf.edu/en/","CKAN"
+"http://opendata.ayto-caceres.es/","CKAN"
+"http://opendata.opennorth.se/","CKAN"
+"http://datos.alcobendas.org/","CKAN"
+"http://data.opendataportal.at/","CKAN"
+"http://opendata.comune.bari.it/","CKAN"
+"http://www.opendatamalta.org/ckan/","CKAN"
+"http://dati.toscana.it/","CKAN"
+"http://data.glasgow.gov.uk/","CKAN"
+"http://data.gov.ro/","CKAN"
+"http://publicdata.eu/","CKAN"
+"http://datahub.io/","CKAN"
+"http://data.gov.md/ckan/","CKAN"
+"https://www.opengov-muenchen.de/","CKAN"
+"http://data.noe.gv.at/","CKAN"
+"http://datos.santander.es/catalogo/","CKAN"
+"http://ckan.gobex.es/","CKAN"
+"http://catalogo.upo.gob.es/en/","CKAN"
+"http://data.gov.hr/","CKAN"
+"http://annuario.comune.fi.it/","CKAN"
+"http://www.datagm.org.uk/","CKAN"
+"http://data.wu.ac.at/","CKAN"
+"http://opendata.cnmc.es/","CKAN"
+"http://opendata.awt.be/","CKAN"
+"http://pl.ckan.net/","CKAN"
+"https://opendata.government.bg","CKAN"
+"http://www.leedsdatamill.org","CKAN"
+"https://data.overheid.nl/data/","CKAN"
+"http://apigobiernoabiertocatalog.valencia.es/","CKAN"
+"http://www.opendatahub.it/","CKAN"
+"http://www.dataset.puglia.it/","CKAN"
+"http://data.zagreb.hr/","CKAN"
+"https://danepubliczne.gov.pl","CKAN"
+"https://www.data.gouv.fr","CKAN"
+"http://data.gov.gr/","CKAN"
+"http://www.edinburghopendata.info/","CKAN"
+"https://opendata.riik.ee/","CKAN"
+"http://data.salzburgerland.com/","CKAN"
+"http://opendatagortynia.gr/","CKAN"
+"http://www.opendata-hro.de/","CKAN"
+"http://opingogn.is/","CKAN"
+"http://data.gov.sk/","CKAN"
+"https://data.stadt-zuerich.ch/","CKAN"
+"https://www.avoindata.fi/data/fi/","CKAN"
+"http://en.openei.org/datasets/","CKAN"
+"http://opendata.cambridgeshireinsight.org.uk/","CKAN"
+"http://marinedata.scotland.gov.uk/~marine/","CKAN"
+"http://opendata.bonn.de/","CKAN"
+"http://data.gov.ua/","CKAN"
+"http://data.mfcr.cz/","CKAN"
+"http://opendatakosovo.org/data","CKAN"
+"http://api.talis.com/stores/eupmedia/services/sparql","SPARQL"
+"http://cr3.eionet.europa.eu/sparql","SPARQL"
+"http://data.gov.ua/","CKAN"
+"http://data.mfcr.cz/","CKAN"
diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java
index 48e801043..f907b9718 100644
--- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java
+++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java
@@ -16,6 +16,8 @@
import org.dice_research.squirrel.data.uri.UriType;
import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter;
import org.dice_research.squirrel.data.uri.norm.NormalizerImpl;
+import org.dice_research.squirrel.predictor.MultinomialPredictor;
+import org.dice_research.squirrel.predictor.Predictor;
import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue;
import org.junit.After;
import org.junit.Assert;
@@ -32,6 +34,7 @@ public class FrontierImplTest {
private static MongoDBKnowUriFilter filter;
private static List<CrawleableUri> uris = new ArrayList<>();
private static CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests();
+ private static Predictor predictor;
@Before
public void setUp() throws Exception {
@@ -43,7 +46,8 @@ public void setUp() throws Exception {
queue = new MongoDBIpBasedQueue("localhost", 58027);
filter.open();
queue.open();
- frontier = new FrontierImpl(new NormalizerImpl(), filter, queue,true);
+ predictor = new MultinomialPredictor();
+ frontier = new FrontierImpl(new NormalizerImpl(), filter, queue,true, predictor);
uris.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), InetAddress.getByName("127.0.0.1"),
UriType.DEREFERENCEABLE));
@@ -172,4 +176,4 @@ public void tearDown() throws Exception {
p = Runtime.getRuntime().exec(rethinkDockerRmCommand);
p.waitFor();
}
-}
\ No newline at end of file
+}
diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/predictor/impl/BinomialPredictorEvaluation.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/predictor/impl/BinomialPredictorEvaluation.java
new file mode 100644
index 000000000..04f37e066
--- /dev/null
+++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/predictor/impl/BinomialPredictorEvaluation.java
@@ -0,0 +1,196 @@
+package org.dice_research.squirrel.predictor.impl;
+
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.dice_research.squirrel.predictor.BinomialPredictor;
+
+import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Random;
+
+public class BinomialPredictorEvaluation {
+
+ /**
+ * Used to initialize the object for the binomial predictor
+ */
+ protected BinomialPredictor predictor;
+
+ /**
+ * Indicates the path to the file containing the training data.
+ */
+ protected String trainFilePath;
+
+ /**
+ * Indicates the name of the type which should be used as positive class while training.
+ */
+ protected String positiveClass;
+
+ /**
+ * Indicates the path to the file containing the test data.
+ */
+ String testFilePath;
+
+ /**
+ * Constructor.
+ *
+ * @param trainFilePath Indicates the path to the file containing the training data.
+ * @param positiveClass Indicates the name of the type which should be used as positive class while training.
+ * @param testFilePath Indicates the path to the file containing the test data.
+ */
+ public BinomialPredictorEvaluation(String trainFilePath, String positiveClass, String testFilePath) {
+ this.trainFilePath = trainFilePath;
+ this.positiveClass = positiveClass;
+ this.testFilePath = testFilePath;
+
+ }
+
+ /**
+ * Function to evaluate the performance of the URI predictor on a test set
+ */
+ public void evaluation() {
+ Integer uriCount = 0;
+ Integer correctCount = 0;
+ double accuracy;
+ Integer truePos = 0;
+ Integer falsePos = 0;
+ Integer falseNeg = 0;
+ Integer trueNeg = 0;
+ try (BufferedReader br = new BufferedReader(new FileReader(testFilePath))) {
+ String line;
+ while ((line = br.readLine()) != null){
+ uriCount++;
+ String[] split = line.split("," );
+ URI furi = null;
+ try{
+ furi = new URI(split[0].replace("\"", ""));
+ }catch (URISyntaxException e) {
+ try {
+ furi = new URI("http://scoreboard.lod2.eu/data/scoreboardDataCube.rdf");
+ } catch (URISyntaxException ex) {
+ ex.printStackTrace();
+ }
+ }
+ CrawleableUri uri = new CrawleableUri(furi);
+ String pred = this.predictor.predict(uri);
+ split[1] = split[1].replace("\"", "");
+ if(split[1].equals(positiveClass)){
+ //System.out.println("the class is: " + split[1]);
+ if(pred.equals("dereferencing")){
+ correctCount ++;
+ truePos ++;
+ }
+ else{
+ falseNeg ++;
+ }
+ }
+ else{
+ if(!pred.equals("dereferencing")){
+ correctCount ++;
+ trueNeg ++;
+ }
+ else{
+ falsePos ++;
+ }
+ }
+ }
+
+ }catch (IOException e){
+ e.printStackTrace();
+ }
+ accuracy = correctCount.floatValue() / uriCount.floatValue();
+
+ System.out.println(" The total number of URIs is: " + uriCount);
+ System.out.println(" The total number of correct predictions is: " + correctCount);
+ System.out.println(" The accuracy of the predictor is: " + accuracy);
+ System.out.println("True Positive is: " + truePos);
+ System.out.println("False Positive is: " + falsePos);
+ System.out.println("False Negative is: " + falseNeg);
+ System.out.println("True Negative is: " + trueNeg);
+
+ }
+
+ /**
+ * Function to perform K-fold cross validation.
+ */
+ public void crossValidation(){
+ URL url = null;
+ BufferedReader br = null;
+ ArrayList<String> lineList = new ArrayList<>();
+ int[][] train;
+ int[][] test;
+ int[] index;
+ String line;
+ int folds = 10;
+ int chunk;
+ try {
+ br = new BufferedReader(new InputStreamReader(getClass().getClassLoader().getResourceAsStream("binomialTrainData.txt")
+ , Charset.defaultCharset()));
+ while ((line = br.readLine()) != null) {
+ lineList.add(line);
+ }
+ System.out.println(lineList.size());
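+ // Shuffle with a fixed seed so the fold assignment stays reproducible across runs.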
+ Collections.shuffle(lineList, new Random(113));
+ chunk = lineList.size()/folds;
+ train = new int[folds][];
+ test = new int[folds][];
+ index = new int[lineList.size()];
+ for (int i = 0; i < lineList.size(); i++) {
+ index[i] = i;
+ }
+ for (int i = 0; i < folds; i++) {
+ int start = chunk * i;
+ int end = chunk * (i + 1);
+ test[i] = new int[end - start];
+ train[i] = new int[lineList.size() - (end - start)];
+ int trainCount = 0;
+ int testCount = 0;
+ for (int j = 0; j < index.length; j++) {
+ if (j >= start && j < end) {
+ test[i][testCount++] = index[j];
+ } else if (trainCount < train[i].length) {
+ train[i][trainCount++] = index[j];
+ }
+ }
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+}
diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/predictor/impl/MultinomialPredictorEvaluation.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/predictor/impl/MultinomialPredictorEvaluation.java
new file mode 100644
--- /dev/null
+++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/predictor/impl/MultinomialPredictorEvaluation.java
+package org.dice_research.squirrel.predictor.impl;
+
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.dice_research.squirrel.predictor.MultinomialPredictor;
+
+import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+public class MultinomialPredictorEvaluation {
+
+ /**
+ * Used to initialize the object for the multinomial predictor
+ */
+ protected MultinomialPredictor predictor;
+
+ /**
+ * Indicates the path to the file containing the training data.
+ */
+ protected String trainFilePath;
+
+ /**
+ * Indicates the path to the file containing the test data.
+ */
+ protected String testFilePath;
+
+ private static List<String> classList = new ArrayList<>();
+ static{
+ classList.add("SPARQL");
+ classList.add("DUMP");
+ classList.add("CKAN");
+ }
+
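+ // confusionMatrix[i][j] counts test URIs whose true class is classList.get(i) and
+ // whose predicted class is classList.get(j).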
+ Integer[][] confusionMatrix = new Integer[3][3];
+
+ /**
+ * Constructor.
+ *
+ * @param trainFilePath
+ * Indicates the path to the file containing train data.
+ * @param testFilePath
+ * Indicates the path to the file containing the test data.
+ */
+ public MultinomialPredictorEvaluation(String trainFilePath, String testFilePath){
+ this.trainFilePath = trainFilePath;
+ this.testFilePath = testFilePath;
+ for(int i=0; i<3; i++){
+ for(int j=0; j<3; j++){
+ confusionMatrix[i][j] = 0;
+ }
+ }
+ }
+
+ /**
+ * Function to evaluate the performance of the URI predictor on a test set
+ */
+ public double evaluation() {
+
+ Integer uriCount = 0;
+ Integer correctCount = 0;
+ double accuracy;
+ BufferedReader br = null;
+ try (FileReader in = new FileReader(testFilePath)){
+ br = new BufferedReader(in);
+ String line;
+ while ((line = br.readLine()) != null) {
+ uriCount ++;
+ String[] split = line.split("," );
+ URI furi = null;
+ try {
+ furi = new URI(split[0].replace("\"", ""));
+ } catch (URISyntaxException e) {
+ try {
+ furi = new URI("http://scoreboard.lod2.eu/data/scoreboardDataCube.rdf");
+ } catch (URISyntaxException ex) {
+ ex.printStackTrace();
+ }
+ }
+ CrawleableUri uri = new CrawleableUri(furi);
+ String pred = predictor.predict(uri);
+ //System.out.println("predicted values: "+ pred);
+ split[1] = split[1].replace("\"", "");
+ //System.out.println("the classList index: "+classList.indexOf(split[1]));
+ if(classList.indexOf(split[1]) != -1)
+ confusionMatrix[classList.indexOf(split[1])][classList.indexOf(pred)]++;
+ if(pred.equals(split[1])){
+ correctCount++;
+ }
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ accuracy = correctCount.floatValue() / uriCount.floatValue();
+ for(int i=0; i<3; i++){
+ for(int j=0; j<3; j++){
+ System.out.print(" " +confusionMatrix[i][j]);
+ }
+ System.out.println();
+ }
+ System.out.println(" The total number of URIs is: " + uriCount);
+ System.out.println(" The total number of correct predictions is: " + correctCount);
+ System.out.println(" The accuracy of the predictor is: " + accuracy);
+ return accuracy;
+ }
+
+ /**
+ * Function to perform K-fold cross validation.
+ */
+ public void crossValidation(){
+ URL url = null;
+ BufferedReader br = null;
+ ArrayList<String> lineList = new ArrayList<>();
+ int[][] train;
+ int[][] test;
+ int[] index;
+ String line;
+ int folds = 10;
+ int chunk;
+ try {
+ br = new BufferedReader(new InputStreamReader(new FileInputStream(trainFilePath)));
+ while( ( line = br.readLine()) != null){
+ lineList.add(line);
+ }
+ Collections.shuffle(lineList, new Random(113));
+ chunk = lineList.size()/folds;
+ train = new int[folds][];
+ test = new int[folds][];
+ index = new int[lineList.size()];
+ for (int i = 0; i < lineList.size(); i++) {
+ index[i] = i;
+ }
+ for (int i = 0; i < folds; i++) {
+ int start = chunk * i;
+ int end = chunk * (i + 1);
+ test[i] = new int[end - start];
+ train[i] = new int[lineList.size() - (end - start)];
+ int trainCount = 0;
+ int testCount = 0;
+ for (int j = 0; j < index.length; j++) {
+ if (j >= start && j < end) {
+ test[i][testCount++] = index[j];
+ } else if (trainCount < train[i].length) {
+ train[i][trainCount++] = index[j];
+ }
+ }
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+}
diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/HDTAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/HDTAnalyzer.java
--- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/HDTAnalyzer.java
+++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/HDTAnalyzer.java
@@ ... @@ public Iterator<CrawleableUri> analyze(CrawleableUri curi, File data, Sink sink) {
sink.addTriple(curi, t);
}
ActivityUtil.addStep(curi, getClass());
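+ // An HDT file is a compressed RDF dump, so "DUMP" is recorded as the URI's true class.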
+ curi.addData(Constants.URI_TRUE_CLASS, "DUMP");
return collector.getUris(curi);
} catch (IOException | org.rdfhdt.hdt.exceptions.NotFoundException e) {
LOGGER.error("An error occured when processing the HDT file", e);
diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java
index 27ffa59ad..1abb6f704 100644
--- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java
+++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java
@@ -98,6 +98,9 @@ public Iterator<byte[]> analyze(CrawleableUri curi, File data, Sink sink) {
}
}
ActivityUtil.addStep(curi, getClass());
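+ // Only set the fallback class if no other analyzer has already classified this URI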
+ if(curi.getData(Constants.URI_TRUE_CLASS) == null){
+ curi.addData(Constants.URI_TRUE_CLASS, "DUMP");
+ }
return collector.getUris(curi);
} catch (Exception e) {
LOGGER.error("Exception while analyzing. Aborting. ", e);
diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java
index 19b1b4edd..5de521107 100644
--- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java
+++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java
@@ -48,7 +48,8 @@ public CkanJsonAnalyzer(UriCollector collector) {
public Iterator<byte[]> analyze(CrawleableUri curi, File data, Sink sink) {
// Make sure that the file contains the CKAN JSON objects we are expecting
if (Constants.URI_TYPE_VALUE_CKAN.equals(curi.getData(Constants.URI_HTTP_MIME_TYPE_KEY))) {
+ curi.addData(Constants.URI_TRUE_CLASS, "CKAN");
LOGGER.info("Starting the Ckan Json Analyzer for URI: " + curi.getUri().toString());
Stream<String> lines = null;
try {
lines = Files.lines(data.toPath(), StandardCharsets.UTF_8);
diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/sparql/SparqlBasedFetcher.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/sparql/SparqlBasedFetcher.java
index 7c0474f23..ed33be4c8 100644
--- a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/sparql/SparqlBasedFetcher.java
+++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/sparql/SparqlBasedFetcher.java
@@ -55,6 +55,9 @@ public class SparqlBasedFetcher implements Fetcher {
public File fetch(CrawleableUri uri, Delayer delayer) {
// Check whether we can be sure that it is a SPARQL endpoint
boolean shouldBeSparql = Constants.URI_TYPE_VALUE_SPARQL.equals(uri.getData(Constants.URI_TYPE_KEY));
+ if (shouldBeSparql) {
+ uri.addData(Constants.URI_TRUE_CLASS, "SPARQL");
+ }
QueryExecutionFactory qef = null;
QueryExecution execution = null;
File dataFile = null;
diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/worker/impl/WorkerImpl.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/worker/impl/WorkerImpl.java
index a2655b062..ec0af25b9 100644
--- a/squirrel.worker/src/main/java/org/dice_research/squirrel/worker/impl/WorkerImpl.java
+++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/worker/impl/WorkerImpl.java
@@ -254,10 +254,10 @@ public void performCrawling(CrawleableUri uri) {
for (File file : fileList) {
LOGGER.info("Analyzing file " + cont + " of: " + fileList.size());
Iterator<byte[]> resultUris = analyzer.analyze(uri, file, sink);
- sendNewUris(resultUris);
+ sendNewUris(resultUris, uri);
cont++;
}
}
}
} catch (Exception e) {
@@ -282,11 +282,20 @@ public void performCrawling(CrawleableUri uri) {
LOGGER.info("Crawling {} is not allowed by the RobotsManager.", uri);
activity.addStep(manager.getClass(), "Decided to reject this URI.");
}
+
+ // Record the true label for the predictor: a URI whose crawl produced
+ // triples is dereferenceable, everything else falls into the negative class
+ if (activity.getNumberOfTriples() > 0) {
+ uri.addData(Constants.URI_TRUE_LABEL, "dereferenceable");
+ } else {
+ uri.addData(Constants.URI_TRUE_LABEL, "NEGATIVE_CLASS");
+ }
+
if(storeMetadata)
activity.finishActivity(sink);
// LOGGER.debug("Fetched {} triples", count);
setSpecificRecrawlTime(uri);
} finally {
// Remove the activity since we don't want to send it back to the Frontier
uri.getData().remove(Constants.URI_CRAWLING_ACTIVITY);
@@ -320,13 +329,16 @@ public boolean sendsAliveMessages() {
*
* @param uriIterator an iterator used to iterate over all new URIs
+ * @param uri the URI that was just crawled; it is recorded as the
+ * referring URI of every newly discovered URI
*/
- public void sendNewUris(Iterator<byte[]> uriIterator) {
+ public void sendNewUris(Iterator<byte[]> uriIterator, CrawleableUri uri) {
List<CrawleableUri> newUris = new ArrayList<>(MAX_URIS_PER_MESSAGE);
CrawleableUri newUri;
int packageCount = 0;
while (uriIterator != null && uriIterator.hasNext()) {
try {
newUri = serializer.deserialize(uriIterator.next());
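+ // Remember which URI this newly found URI was discovered from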
+ newUri.addData(Constants.REFERRING_URI, uri.getUri());
uriProcessor.recognizeUriType(newUri);
newUris.add(newUri);
if ((newUris.size() >= (packageCount + 1) * MAX_URIS_PER_MESSAGE) && uriIterator.hasNext()) {
@@ -357,4 +369,4 @@ public int getId() {
return this.id;
}
-}
\ No newline at end of file
+}