diff --git a/.travis.yml b/.travis.yml
index 06e2c1a06..57545bea3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,8 +2,20 @@ sudo: required
 
 language: java
 
+jdk: oraclejdk8
+
+dist: trusty
+
+addons:
+  apt:
+    packages:
+    - oracle-java8-installer
+
+
+
 services:
   - docker
 
 before_install:
   - docker pull rethinkdb:2.3.5
+
diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml
index a577a3bb2..02d0e7797 100644
--- a/docker-compose-sparql.yml
+++ b/docker-compose-sparql.yml
@@ -31,6 +31,7 @@ services:
       - ./data/frontier:/var/squirrel/data
       - ./seed/seeds.csv:/var/squirrel/seeds.csv:ro
       - ./whitelist/whitelist.txt:/var/squirrel/whitelist.txt:ro
+      - ./spring-config:/var/squirrel/spring-config
     command: java -cp squirrel.jar org.dice_research.squirrel.components.FrontierComponentStarter
 
   virtuosohost:
@@ -136,18 +137,18 @@ services:
       - ./spring-config:/var/squirrel/spring-config
     command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
 
-  deduplicator:
-    image: squirrel
-    container_name: deduplicator
-    environment:
-      DEDUPLICATION_ACTIVE: "true"
-      HOBBIT_RABBIT_HOST: rabbit
-      OUTPUT_FOLDER: /var/squirrel/data
-      MDB_HOST_NAME: mongodb
-      MDB_PORT: 27017
-      SPARQL_HOST_NAME: sparqlhost
-      SPARQL_HOST_PORT: 3030
-      SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
-    volumes:
-      - ./data/deduplicator:/var/squirrel/data
-    command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent
+#  deduplicator:
+#    image: squirrel
+#    container_name: deduplicator
+#    environment:
+#      DEDUPLICATION_ACTIVE: "true"
+#      HOBBIT_RABBIT_HOST: rabbit
+#      OUTPUT_FOLDER: /var/squirrel/data
+#      MDB_HOST_NAME: mongodb
+#      MDB_PORT: 27017
+#      SPARQL_HOST_NAME: sparqlhost
+#      SPARQL_HOST_PORT: 3030
+#      SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
+#    volumes:
+#      - ./data/deduplicator:/var/squirrel/data
+#    command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.DeduplicatorComponent
diff --git a/spring-config/frontier-context.xml b/spring-config/frontier-context.xml
index 9c84264a1..01271c394 100644
--- a/spring-config/frontier-context.xml
+++ b/spring-config/frontier-context.xml
@@ -54,4 +54,5 @@
+
diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java b/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java
index ec6121959..39452556d 100644
--- a/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java
+++ b/squirrel.api/src/main/java/org/dice_research/squirrel/Constants.java
@@ -46,6 +46,32 @@ public class Constants {
      */
     public static final String URI_PREFERRED_RECRAWL_ON = "recrawl-on";
 
+    /*
+     * The data related to the predictor
+     */
+    /**
+     * This key stores the value predicted by the predictor for each URI, denoting
+     * the class it belongs to (positive class or negative class).
+     */
+    public static final String URI_PREDICTED_LABEL = "predicted-label";
+    /**
+     * This key stores the value denoting the true class of each URI.
+     */
+    public static final String URI_TRUE_LABEL = "true-label";
+    /**
+     * This key stores an integer denoting the true class of the URI.
+     */
+    public static final String URI_TRUE_CLASS = "true_class";
+    /**
+     * This key stores the feature vector generated for each URI for prediction purposes.
+     */
+    public static final String FEATURE_VECTOR = "feature-vector";
+    /**
+     * This key stores the parent URI of each crawled URI.
+     */
+    public static final String REFERRING_URI = "referring-uri";
+
+
     //////////////////////////////////////////////////
     // URIs
     //////////////////////////////////////////////////
diff --git a/squirrel.frontier/pom.xml b/squirrel.frontier/pom.xml
index 6d06d8441..491f88181 100644
--- a/squirrel.frontier/pom.xml
+++ b/squirrel.frontier/pom.xml
@@ -21,6 +21,12 @@
         <dependency>
             <groupId>org.dice-research</groupId>
             <artifactId>squirrel.web-api</artifactId>
         </dependency>
+
+        <dependency>
+            <groupId>de.jungblut.ml</groupId>
+            <artifactId>tjungblut-online-ml</artifactId>
+            <version>0.5</version>
+        </dependency>
@@ -63,4 +69,4 @@
-</project>
\ No newline at end of file
+</project>
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
index 9ba0f0401..564804f69 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
@@ -35,7 +35,9 @@
 import org.dice_research.squirrel.frontier.impl.QueueBasedTerminationCheck;
 import org.dice_research.squirrel.frontier.impl.TerminationCheck;
 import org.dice_research.squirrel.frontier.impl.WorkerGuard;
+import org.dice_research.squirrel.predictor.*;
 import org.dice_research.squirrel.queue.InMemoryQueue;
+import org.dice_research.squirrel.queue.IpAddressBasedQueue;
 import org.dice_research.squirrel.queue.UriQueue;
 import org.dice_research.squirrel.rabbit.RPCServer;
 import org.dice_research.squirrel.rabbit.RespondingDataHandler;
@@ -54,6 +56,7 @@
 import org.springframework.beans.factory.annotation.Qualifier;
 import org.springframework.stereotype.Component;
 
+
 @Component
 @Qualifier("frontierComponent")
 public class FrontierComponent extends AbstractComponent implements RespondingDataHandler {
@@ -77,10 +80,16 @@ public class FrontierComponent extends AbstractComponent implements RespondingDa
     private final WorkerGuard workerGuard = new WorkerGuard(this);
     private final boolean doRecrawling = true;
     private long recrawlingTime = 1000L * 60L * 60L * 24L * 30;
+
+    private Timer timerTerminator;
+
     public static final boolean RECRAWLING_ACTIVE = true;
+
+    protected Predictor predictor;
+
     @Override
     public void init() throws Exception {
         super.init();
@@ -108,9 +117,15 @@ public void init() throws Exception {
             queue = new InMemoryQueue();
             knownUriFilter = new InMemoryKnownUriFilter(doRecrawling, recrawlingTime);
         }
+        // Training the URI predictor model with a training dataset
+        try {
+            predictor = new MultinomialPredictor.MultinomialPredictorBuilder().withFile("multiNomialTrainData.txt").build();
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
 
         // Build frontier
-        frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, queue, doRecrawling);
+        frontier = new ExtendedFrontierImpl(new NormalizerImpl(), knownUriFilter, uriReferences, (IpAddressBasedQueue) queue, doRecrawling, predictor);
 
         rabbitQueue = this.incomingDataQueueFactory.createDefaultRabbitQueue(Constants.FRONTIER_QUEUE_NAME);
         receiver = (new RPCServer.Builder()).responseQueueFactory(outgoingDataQueuefactory).dataHandler(this)
@@ -139,11 +154,13 @@ public void init() throws Exception {
                     + webConfiguration.isVisualizationOfCrawledGraphEnabled() + ". No WebServiceSenderThread will be started!");
         }
+
+
     }
 
     @Override
     public void run() throws Exception {
-
+        terminationMutex.acquire();
     }
 
@@ -177,7 +194,7 @@ public void handleData(byte[] data) {
 
     @Override
     public void handleData(byte[] data, ResponseHandler handler, String responseQueueName, String correlId) {
-
+
         Object deserializedData;
         try {
             deserializedData = serializer.deserialize(data);
@@ -200,7 +217,7 @@ public void handleData(byte[] data, ResponseHandler handler, String responseQueu
         if (deserializedData instanceof UriSetRequest) {
             responseToUriSetRequest(handler, responseQueueName, correlId, (UriSetRequest) deserializedData);
         } else if (deserializedData instanceof UriSet) {
-
+
             if(timerTerminator == null) {
                 LOGGER.info("Initializing Terminator task...");
                 TimerTask terminatorTask = new TerminatorTask(queue, terminationMutex, this.workerGuard);
@@ -212,6 +229,7 @@
         } else if (deserializedData instanceof CrawlingResult) {
             CrawlingResult crawlingResult = (CrawlingResult) deserializedData;
             LOGGER.warn("Received the message that the crawling for {} URIs is done.", crawlingResult.uris.size());
+            frontier.crawlingDone(crawlingResult.uris);
             workerGuard.removeUrisForWorker(crawlingResult.idOfWorker, crawlingResult.uris);
         } else if (deserializedData instanceof AliveMessage) {
@@ -298,11 +316,11 @@ public void run() {
                     break;
                 }
             }
-
+
             if(!stillHasUris && terminationCheck.shouldFrontierTerminate(queue)) {
                 terminationMutex.release();
-            }
+            }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
index b3f8b4858..26859f4ab 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
@@ -12,6 +12,7 @@
 import org.dice_research.squirrel.data.uri.norm.UriNormalizer;
 import org.dice_research.squirrel.deduplication.hashing.UriHashCustodian;
 import org.dice_research.squirrel.frontier.ExtendedFrontier;
+import org.dice_research.squirrel.predictor.Predictor;
 import org.dice_research.squirrel.queue.IpAddressBasedQueue;
 import org.dice_research.squirrel.queue.UriQueue;
 
@@ -29,11 +30,12 @@ public class ExtendedFrontierImpl extends FrontierImpl implements ExtendedFronti
      * @param generalRecrawlTime used to select the general Time after URIs should be recrawled. If Value is null the default Time is used.
      * @param timerPeriod used to select if URIs should be recrawled.
      * @param uriHashCustodian used to access and write hash values for uris.
+     * @param predictor {@link Predictor} used to predict the type of the URI
      */
     @SuppressWarnings("unused")
     public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling,
-        long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) {
-        super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian);
+        long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, Predictor predictor) {
+        super(normalizer, knownUriFilter, queue, doesRecrawling, generalRecrawlTime, timerPeriod, uriHashCustodian, predictor);
     }
 
     /**
@@ -45,9 +47,10 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil
      * @param queue {@link UriQueue} used to manage the URIs that should be
      *            crawled.
      * @param doesRecrawling used to select if URIs should be recrawled.
+     * @param predictor {@link Predictor} used to predict the type of the URI
      */
-    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling) {
-        super(normalizer, knownUriFilter, queue, doesRecrawling);
+    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, IpAddressBasedQueue queue, boolean doesRecrawling, Predictor predictor) {
+        super(normalizer, knownUriFilter, queue, doesRecrawling, predictor);
     }
 
     /**
@@ -60,9 +63,12 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil
      * @param queue {@link UriQueue} used to manage the URIs that should be
      *            crawled.
      * @param doesRecrawling used to select if URIs should be recrawled.
+     * @param predictor {@link Predictor} used to predict the type of the URI
      */
-    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling) {
-        super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling);
+
+    public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, IpAddressBasedQueue queue, boolean doesRecrawling, Predictor predictor) {
+        super(normalizer, knownUriFilter, uriReferences, queue, doesRecrawling, predictor);
+
     }
 
     @Override
@@ -78,4 +84,4 @@ public void informAboutDeadWorker(String idOfWorker, List<CrawleableUri> lstUris
             setIps.forEach(ip -> ipQueue.markIpAddressAsAccessible(ip));
         }
     }
-}
\ No newline at end of file
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
index a41f8b85c..d48e38eb0 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
@@ -1,10 +1,14 @@
 package org.dice_research.squirrel.frontier.impl;
 
+
+import de.jungblut.math.DoubleVector;
+
 import java.net.UnknownHostException;
 import java.util.List;
 import java.util.Timer;
 import java.util.TimerTask;
 
+
 import org.dice_research.squirrel.Constants;
 import org.dice_research.squirrel.data.uri.CrawleableUri;
 import org.dice_research.squirrel.data.uri.filter.KnownUriFilter;
@@ -18,8 +22,10 @@
 import org.dice_research.squirrel.queue.BlockingQueue;
 import org.dice_research.squirrel.queue.UriQueue;
 import org.dice_research.squirrel.uri.processing.UriProcessor;
+import org.dice_research.squirrel.components.FrontierComponent;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.dice_research.squirrel.predictor.*;
 
 /**
  * Standard implementation of the {@link Frontier} interface containing a
@@ -97,6 +103,13 @@ public class FrontierImpl implements Frontier {
      */
    private static final long DEFAULT_TIMER_PERIOD = 1000 * 60 * 60;
 
+
+    /**
+     * {@link Predictor} used to predict the type of the URI.
+     */
+    protected Predictor predictor;
+
+
     /**
      * Constructor.
      *
@@ -117,10 +130,12 @@ public class FrontierImpl implements Frontier {
      *            Value is null the default Time is used.
      * @param timerPeriod
      *            used to select if URIs should be recrawled.
+     * @param predictor
+     *            {@link Predictor} used to predict the type of the URI
      */
     public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,
-        GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod) {
-        this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod);
+        GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime, long timerPeriod, Predictor predictor) {
+        this(normalizer, knownUriFilter, null, queue, graphLogger, doesRecrawling, generalRecrawlTime, timerPeriod, predictor);
     }
 
     /**
@@ -141,32 +156,29 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri
      *            Value is null the default Time is used.
      * @param timerPeriod
      *            used to select if URIs should be recrawled.
+     * @param predictor
+     *            {@link Predictor} used to predict the type of the URI
      */
     public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, boolean doesRecrawling,
-        long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian) {
-        this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, timerPeriod);
+        long generalRecrawlTime, long timerPeriod, UriHashCustodian uriHashCustodian, Predictor predictor) {
+        this(normalizer, knownUriFilter, queue, null, doesRecrawling, generalRecrawlTime, timerPeriod, predictor);
     }
 
     /**
      * Constructor.
      *
-     * @param normalizer
-     *            {@link UriNormalizer} used to transform given URIs into a normal
-     *            form
-     * @param knownUriFilter
-     *            {@link UriFilter} used to identify URIs that already have been
-     *            crawled.
-     * @param uriReferences
-     *            {@link URIReferences} used to manage URI references
-     * @param queue
-     *            {@link UriQueue} used to manage the URIs that should be crawled.
-     * @param doesRecrawling
-     *            Value for {@link #doesRecrawling}.
+
+     * @param normalizer {@link UriNormalizer} used to transform given URIs into a normal form
+     * @param knownUriFilter {@link UriFilter} used to identify URIs that already have been
+     *            crawled.
+     * @param uriReferences {@link URIReferences} used to manage URI references
+     * @param queue {@link UriQueue} used to manage the URIs that should be
+     *            crawled.
+     * @param doesRecrawling Value for {@link #doesRecrawling}.
+     * @param predictor used to predict the type of the URI.
      */
-    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences,
-        UriQueue queue, boolean doesRecrawling) {
-        this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME,
-            DEFAULT_TIMER_PERIOD);
+    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences, UriQueue queue, boolean doesRecrawling, Predictor predictor) {
+        this(normalizer, knownUriFilter, uriReferences, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, predictor);
     }
 
     /**
@@ -182,11 +194,14 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URI
      *            {@link UriQueue} used to manage the URIs that should be crawled.
      * @param doesRecrawling
      *            Value for {@link #doesRecrawling}.
+     * @param predictor
+     *            {@link Predictor} used to predict the type of the URI
+     *
      */
     public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue,
-        boolean doesRecrawling) {
+        boolean doesRecrawling, Predictor predictor) {
         this(normalizer, knownUriFilter, queue, null, doesRecrawling, DEFAULT_GENERAL_RECRAWL_TIME,
-            DEFAULT_TIMER_PERIOD);
+            DEFAULT_TIMER_PERIOD, predictor);
     }
 
     /**
@@ -200,9 +215,12 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri
      *            crawled.
      * @param queue
      *            {@link UriQueue} used to manage the URIs that should be crawled.
+     * @param predictor
+     *            {@link Predictor} used to predict the type of the URI
+     *
      */
-    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue) {
-        this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD);
+    public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, UriQueue queue, Predictor predictor) {
+        this(normalizer, knownUriFilter, queue, null, false, DEFAULT_GENERAL_RECRAWL_TIME, DEFAULT_TIMER_PERIOD, predictor);
     }
 
     /**
@@ -227,10 +245,12 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, Uri
      *            Value is null the default Time is used.
      * @param timerPeriod
      *            used to select if URIs should be recrawled.
+     * @param predictor
+     *            {@link Predictor} used to predict the type of the URI
      */
     public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URIReferences uriReferences,
         UriQueue queue, GraphLogger graphLogger, boolean doesRecrawling, long generalRecrawlTime,
-        long timerPeriod) {
+        long timerPeriod, Predictor predictor) {
         this.normalizer = normalizer;
         this.knownUriFilter = knownUriFilter;
         this.uriReferences = uriReferences;
@@ -241,8 +261,8 @@ public FrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFilter, URI
         this.queue.open();
         this.doesRecrawling = doesRecrawling;
         this.timerPeriod = timerPeriod;
+        this.predictor = predictor;
         FrontierImpl.generalRecrawlTime = generalRecrawlTime;
-
         if (this.doesRecrawling) {
             timerRecrawling = new Timer();
             timerRecrawling.schedule(new TimerTask() {
@@ -276,9 +296,18 @@ public void addNewUris(List<CrawleableUri> uris) {
     public void addNewUri(CrawleableUri uri) {
         // Normalize the URI
         uri = normalizer.normalize(uri);
+        // Predict the URI type
+        if (predictor != null && uri.getType().equals("UNKNOWN")) {
+            try {
+                // predict and update the URI's data with the predicted class
+                String p = predictor.predict(uri);
+                uri.addData(Constants.URI_PREDICTED_LABEL, p);
+            } catch (Exception e) {
+                LOGGER.warn("Exception happened while predicting", e);
+            }
+        }
         // After knownUriFilter uri should be classified according to
         // UriProcessor
-
         if (knownUriFilter.isUriGood(uri)) {
             LOGGER.debug("addNewUri(" + uri + "): URI is good [" + knownUriFilter + "]");
             if (schemeUriFilter.isUriGood(uri)) {
@@ -311,20 +340,22 @@ public void addNewUri(CrawleableUri uri) {
     public void crawlingDone(List<CrawleableUri> uris) {
         LOGGER.info("One worker finished his work and crawled " + uris.size() + " URIs.");
 
-        // List newUris = new ArrayList<>(uriMap.size());
-        // for (CrawleableUri uri : uriMap.keySet()) {
-        // newUris.addAll(uriMap.get(uri));
-        // knownUriFilter.add(uri, System.currentTimeMillis(),
-        // uri.getTimestampNextCrawl());
-        // if (uriReferences != null) {
-        // uriReferences.add(uri, uriMap.get(uri));
-        // }
-        // }
-        // // If there is a graph logger, log the data
-        // if (graphLogger != null) {
-        // graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris);
-        // }
+
+//        List<CrawleableUri> newUris = new ArrayList<>(uriMap.size());
+//        for (CrawleableUri uri : uriMap.keySet()) {
+//            newUris.addAll(uriMap.get(uri));
+//            knownUriFilter.add(uri, System.currentTimeMillis(), uri.getTimestampNextCrawl());
+//            if (uriReferences != null) {
+//                uriReferences.add(uri, uriMap.get(uri));
+//            }
+//        }
+
+//        // If there is a graph logger, log the data
+//        if (graphLogger != null) {
+//            graphLogger.log(new ArrayList<>(uriMap.keySet()), newUris);
+//        }
+
         // If we should give the crawled IPs to the queue
         if (queue instanceof BlockingQueue) {
             ((BlockingQueue) queue).markUrisAsAccessible(uris);
@@ -343,6 +374,16 @@ public void crawlingDone(List<CrawleableUri> uris) {
                 knownUriFilter.add(uri, System.currentTimeMillis());
             }
         }
+
+        // Update the URI type prediction model
+        try {
+            for (CrawleableUri uri : uris) {
+                predictor.weightUpdate(uri);
+            }
+        } catch (Exception e) {
+            LOGGER.warn("Exception happened while updating the weights for the URI type predictor model", e);
+        }
+
     }
 
     @Override
@@ -379,4 +420,6 @@ public UriQueue getQueue() {
         return queue;
     }
 
-}
\ No newline at end of file
+
+}
+
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialPredictor.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialPredictor.java
new file mode 100644
index 000000000..9e7907ee9
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialPredictor.java
@@ -0,0 +1,491 @@
+package org.dice_research.squirrel.predictor;
+
+
+import com.google.common.hash.Hashing;
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.activation.SigmoidActivationFunction;
+import de.jungblut.math.dense.SingleEntryDoubleVector;
+import de.jungblut.math.loss.LogLoss;
+import de.jungblut.math.minimize.CostGradientTuple;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.nlp.VectorizerUtils;
+import de.jungblut.online.minimizer.StochasticGradientDescent;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import de.jungblut.online.regression.RegressionClassifier;
+import de.jungblut.online.regression.RegressionModel;
+import de.jungblut.online.regularization.AdaptiveFTRLRegularizer;
+import de.jungblut.online.regularization.CostWeightTuple;
+import de.jungblut.online.regularization.WeightUpdater;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+/**
+ * A predictor that predicts the RDF-relevance of a URI by performing binary classification.
+ */
+public final class BinomialPredictor implements Predictor {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(BinomialPredictor.class);
+    /**
+     * {@link WeightUpdater} used to update the weights of the predictor model.
+     */
+    private WeightUpdater updater;
+    /**
+     * {@link RegressionLearn} used to train the model with training data.
+     */
+    private RegressionLearn learner;
+    /**
+     * {@link RegressionModel} represents the regression model used for the prediction of the RDF-relevance of the URI.
+     */
+    private RegressionModel model;
+    /**
+     * {@link RegressionClassifier} classifier for the regression model. Takes a model or the atomic parts of it and predicts the outcome for a given feature.
+     */
+    private RegressionClassifier classifier;
+    /**
+     * Used to store the location of the training data file.
+     */
+    private String filepath;
+    /**
+     * The rate at which the model learns.
+     */
+    private Double learningRate;
+    /**
+     * Regularization parameter L2
+     */
+    private Double l2;
+    /**
+     * Regularization parameter L1
+     */
+    private Double l1;
+    /**
+     * Hyperparameter beta
+     */
+    private Double beta;
+    /**
+     * Holdout validation percentage, between 0 and 1
+     */
+    private Double holdoutValidationPercentage;
+    /**
+     * The threshold above which a URI is classified into the positive class
+     */
+    private Double threshold;
+    /**
+     * The positive class for the classification
+     */
+    private String positiveClass;
+    /**
+     * {@link FeatureVectorGenerator} used to generate the feature vector of the URI
+     */
+    private FeatureVectorGenerator featureGenerator = new FeatureVectorGenerator();
+
+    /**
+     * Predicts the type of the URI.
+     * @param uri the URI for which the prediction has to be made
+     * @return the type of the URI
+     */
+    public String predict(CrawleableUri uri) {
+
+        String predictedClass = null;
+        try {
+            featureGenerator.featureHashing(uri);
+            Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+            double[] doubleFeatureArray = (double[]) featureArray;
+            DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+            // initialize the regression classifier with the updated model and predict
+            this.setClassifier(new RegressionClassifier(this.getModel()));
+            DoubleVector prediction = this.classifier.predict(features);
+
+            if (prediction.get(0) >= this.getThreshold())
+                predictedClass = this.getPositiveClass();
+            else
+                predictedClass = "NEGATIVE_CLASS";
+        } catch (Exception e) {
+            LOGGER.warn("Prediction for this " + uri.getUri().toString() + " failed", e);
+        }
+        return predictedClass;
+    }
+
+    /**
+     * Updates the predictor model based on this URI.
+     * @param curi the URI based on which the model weights have to be updated
+     */
+    public void weightUpdate(CrawleableUri curi) {
+        try {
+            if (curi.getData(Constants.FEATURE_VECTOR) != null && curi.getData(Constants.URI_TRUE_LABEL) != null) {
+                Object featureArray = curi.getData(Constants.FEATURE_VECTOR);
+                double[] doubleFeatureArray = (double[]) featureArray;
+                DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+                Object real_value = curi.getData(Constants.URI_TRUE_LABEL);
+                int rv = (int) real_value;
+                DoubleVector rv_DoubleVector = new SingleEntryDoubleVector(rv);
+                DoubleVector nextExample = features;
+                FeatureOutcomePair realResult = new FeatureOutcomePair(nextExample, rv_DoubleVector); // real outcome
+                // update weights using the updated parameters
+                DoubleVector newWeights = this.updater.prePredictionWeightUpdate(realResult, this.model.getWeights(), learningRate, 0);
+                CostGradientTuple observed = this.learner.observeExample(realResult, newWeights);
+                // calculate new weights (note that the iteration count is not used)
+                CostWeightTuple update = this.updater.computeNewWeights(newWeights, observed.getGradient(), learningRate, 0, observed.getCost());
+                // update model and classifier
+                this.model = new RegressionModel(update.getWeight(), this.model.getActivationFunction());
+            } else {
+                LOGGER.warn("Feature vector or true label of this " + curi.getUri().toString() + " is null");
+            }
+        } catch (Exception e) {
+            LOGGER.warn("Exception happened while updating the weights for the URI type predictor model", e);
+        }
+    }
+
+    protected void setUpdater(WeightUpdater updater) {
+        this.updater = updater;
+    }
+
+    public RegressionLearn getLearner() {
+        return learner;
+    }
+
+    protected void setLearner(RegressionLearn learner) {
+        this.learner = learner;
+    }
+
+    public RegressionModel getModel() {
+        return model;
+    }
+
+    protected void setModel(RegressionModel model) {
+        this.model = model;
+    }
+
+    public RegressionClassifier getClassifier() {
+        return classifier;
+    }
+
+    protected void setClassifier(RegressionClassifier classifier) {
+        this.classifier = classifier;
+    }
+
+    protected void setFilepath(String filepath) {
+        this.filepath = filepath;
+    }
+
+    public void setLearningRate(double learningRate) {
+        this.learningRate = learningRate;
+    }
+
+    public double getL2() {
+        return l2;
+    }
+
+    public void setL2(double l2) {
+        this.l2 = l2;
+    }
+
+    public double getL1() {
+        return l1;
+    }
+
+    public void setL1(double l1) {
+        this.l1 = l1;
+    }
+
+    public double getBeta() {
+        return beta;
+    }
+
+    public void setBeta(double beta) {
+        this.beta = beta;
+    }
+
+    public String getFilepath() {
+        return filepath;
+    }
+
+    public double getLearningRate() {
+        return learningRate;
+    }
+
+    private Double getHoldoutValidationPercentage() {
+        return holdoutValidationPercentage;
+    }
+
+    private void setHoldoutValidationPercentage(Double holdoutValidationPercentage) {
+        this.holdoutValidationPercentage = holdoutValidationPercentage;
+    }
+
+    public Double getThreshold() { return this.threshold; }
+
+    public void setThreshold(Double threshold) { this.threshold = threshold; }
+
+    public String getPositiveClass() { return this.positiveClass; }
+
+    public void setPositiveClass(String positiveClass) { this.positiveClass = positiveClass; }
+
+
+    /**
+     * A builder for the BinomialPredictor that uses a regression model and a regression learner
+     * along with default training data and other default hyperparameters.
+     */
+    public static class BinomialPredictorBuilder {
+
+        private TrainingDataProvider trainingDataProvider = new BinomialTrainDataProviderImpl();
+
+        protected StochasticGradientDescent sgd; // minimizer
+
+        private RegressionLearn learner; // learner
+
+        private RegressionModel model; // model
+
+        private WeightUpdater updater; // updater
+
+        private Double learningRate; // learning rate
+
+        private Double beta; // beta
+
+        private Double l1; // L1
+
+        private Double l2; // L2
+
+        private Double holdoutValidationPercentage; // validation percentage, between 0 and 1
+
+        private RegressionClassifier classifier; // classifier
+
+        private String filePath; // file path of the training data file
+
+        private Double threshold; // threshold above which a URI is classified into the positive class
+
+        public String positiveClass; // the positive class of the binary classification
+
+        public BinomialPredictorBuilder(RegressionLearn learner, RegressionModel model, RegressionClassifier classifier, WeightUpdater updater) {
+            this.learner = learner;
+            this.model = model;
+            this.classifier = classifier;
+            this.updater = updater;
+        }
+
+        public BinomialPredictorBuilder() {
+        }
+
+        public BinomialPredictorBuilder withUpdater(WeightUpdater updater) {
+            this.setUpdater(updater);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withLearner(RegressionLearn learner) {
+            this.setLearner(learner);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withModel(RegressionModel model) {
+            this.setModel(model);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withClassifier(RegressionClassifier regressionClassifier) {
+            this.setClassifier(regressionClassifier);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withFile(String filepath) {
+            this.setFilePath(filepath);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withLearningRate(Double learningRate) {
+            this.setLearningRate(learningRate);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withL1(Double l1) {
+            this.setL1(l1);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withL2(Double l2) {
+            this.setL2(l2);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withBeta(Double beta) {
+            this.setBeta(beta);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withThreshold(Double threshold) {
+            this.setThreshold(threshold);
+            return this;
+        }
+
+        public BinomialPredictorBuilder withPositiveClass(String positiveClass) {
+            this.setPositiveClass(positiveClass);
+            return this;
+        }
+
+        public BinomialPredictor build() {
+            BinomialPredictor predictor = new BinomialPredictor();
+
+            if (this.getLearningRate() == null)
+                this.setLearningRate(0.7);
+            predictor.setLearningRate(this.learningRate);
+
+            if (this.getBeta() == null)
+                this.setBeta(1);
+            predictor.setBeta(this.beta);
+
+            if (this.getL1() == null) {
+                this.setL1(1);
+            }
+            predictor.setL1(this.l1);
+
+            if (this.getL2() == null)
+                this.setL2(1);
+
+            predictor.setL2(this.getL2());
+
+            if (this.getThreshold() == null) {
+                this.setThreshold(0.5);
+            }
+
+            predictor.setThreshold(this.getThreshold());
+
+            predictor.setPositiveClass(this.getPositiveClass());
+
+            // updater
+            if (this.getUpdater() == null) {
+                this.setUpdater(new AdaptiveFTRLRegularizer(this.getBeta(), this.getL1(), this.getL2()));
+            }
+            predictor.setUpdater(this.getUpdater());
+
+            // holdout validation percentage
+            if (this.getHoldoutValidationPercentage() == null) {
+                this.setHoldoutValidationPercentage(0.05d);
+            }
+            predictor.setHoldoutValidationPercentage(this.getHoldoutValidationPercentage());
+
+            sgd = StochasticGradientDescent.StochasticGradientDescentBuilder
+                .create(this.getLearningRate()) // learning rate
+                .holdoutValidationPercentage(this.getHoldoutValidationPercentage()) // 5% as validation set
+                .historySize(10_000) // keep 10k samples to compute relative improvement
+                .weightUpdater(updater) // FTRL updater
+                .progressReportInterval(1_000) // report every n iterations
+                .build();
+
+
+            // learner
+            if (this.getLearner() == null)
+                this.setLearner(new RegressionLearn(sgd, new SigmoidActivationFunction(), new LogLoss()));
+            predictor.setLearner(this.getLearner());
+
+            // model
+            ArrayList<String> classList = new ArrayList<>();
+            classList.add(this.positiveClass);
+            if (this.getModel() == null)
+                this.setModel(this.learner.train(() -> trainingDataProvider.setUpStream(this.filePath, classList)));
+            predictor.setModel(this.getModel());
+
+            //this.train(filePath);
+
+            // classifier
+            if (this.getClassifier() == null)
+                if (this.getModel() != null)
+                    this.setClassifier(new RegressionClassifier(this.getModel()));
+            predictor.setClassifier(this.getClassifier());
+
+            return predictor;
+        }
+
+        private RegressionLearn getLearner() {
+            return this.learner;
+        }
+
+        private void setLearner(RegressionLearn learner) {
+            this.learner = learner;
+        }
+
+        private RegressionModel getModel() {
+            return this.model;
+        }
+
+        private void setModel(RegressionModel model) {
+            this.model = model;
+        }
+
+        private RegressionClassifier getClassifier() {
+            return this.classifier;
+        }
+
+        private void setClassifier(RegressionClassifier classifier) {
+            this.classifier = classifier;
+        }
+
+        private WeightUpdater getUpdater() {
+            return this.updater;
+        }
+
+        private void setUpdater(WeightUpdater updater) {
+            this.updater = updater;
+        }
+
+        private Double getLearningRate() {
+            return this.learningRate;
+        }
+
+        private void setLearningRate(double learningRate) {
+            this.learningRate = learningRate;
+        }
+
+        private Double getBeta() {
+            return this.beta;
+        }
+
+        private void setBeta(double beta) {
+            this.beta = beta;
+        }
+
+        private Double getL1() {
+            return this.l1;
+        }
+
+        private void setL1(double l1) {
+            this.l1 = l1;
+        }
+
+        private Double getL2() {
+            return this.l2;
+        }
+
+        private void setL2(double l2) {
+            this.l2 = l2;
+        }
+
+        private String getFilePath() {
+            return this.filePath;
+        }
+
+        private void setFilePath(String filePath) {
+            this.filePath = filePath;
+        }
+
+        private Double getHoldoutValidationPercentage() {
+            return this.holdoutValidationPercentage;
+        }
+
+        private void setHoldoutValidationPercentage(Double holdoutValidationPercentage) {
+            this.holdoutValidationPercentage = holdoutValidationPercentage;
+        }
+
+        private Double getThreshold() { return this.threshold; }
+
+        private void setThreshold(Double threshold) { this.threshold = threshold; }
+
+        private String getPositiveClass() { return this.positiveClass; }
+
+        private void setPositiveClass(String positiveClass) { this.positiveClass = positiveClass; }
+
+    }
+
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialTrainDataProviderImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialTrainDataProviderImpl.java
new file mode 100644
index 000000000..470f88adc
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/BinomialTrainDataProviderImpl.java
@@ -0,0 +1,91 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.dense.SingleEntryDoubleVector;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.stream.Stream;
+
+public class BinomialTrainDataProviderImpl implements TrainingDataProvider {
+
+    private static final SingleEntryDoubleVector POSITIVE_CLASS = new SingleEntryDoubleVector(1d);
+    private static final SingleEntryDoubleVector NEGATIVE_CLASS = new SingleEntryDoubleVector(0d);
+    private FeatureVectorGenerator featureGenerator = new FeatureVectorGenerator();
+    Logger LOGGER = LoggerFactory.getLogger(BinomialTrainDataProviderImpl.class);
+
+    /**
+     * Converts the data in the training file into a stream that can be fed into the learner.
+     * @param filePath path of the file containing the training data
+     * @param classList list containing the class names of the URIs
+     * @return a stream of {@link FeatureOutcomePair}s built from the training file
+     */
+    @Override
+    public Stream<FeatureOutcomePair> setUpStream(String filePath, ArrayList<String> classList) {
+        String positiveClass = classList.get(0);
+        BufferedReader br = null;
+        try {
+            //br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath)));
+            br = new BufferedReader(new InputStreamReader(getClass().getClassLoader().getResourceAsStream(filePath),
+                Charset.defaultCharset()));
+        } catch (Exception e) {
+            LOGGER.warn("Exception happened while setting up the train data stream", e);
+        }
+        return br.lines().map((s) -> parseFeature(s, positiveClass));
+    }
+
+    public FeatureOutcomePair parseFeature(String line, String positiveClass) {
+        String[] split = line.split(",");
+        URI furi = null;
+        try {
+            furi = new URI(split[0].replace("\"", ""));
+        } catch (URISyntaxException e) {
+            try {
+                furi = new URI("http://scoreboard.lod2.eu/data/scoreboardDataCube1.rdf");
+            } catch (URISyntaxException ex) {
+                ex.printStackTrace();
+            }
+        }
+        CrawleableUri uri = new CrawleableUri(furi);
+        featureGenerator.featureHashing(uri);
+        Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+        double[] doubleFeatureArray = (double[]) featureArray;
+        DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+        split[1] = split[1].replace("\"", "");
+        return new FeatureOutcomePair(features, split[1].equals(positiveClass) ? POSITIVE_CLASS : NEGATIVE_CLASS);
+    }
+
+
+    /**
+     * Creates a local training data file from an online source.
+     * @param dataUri the location of the online source
+     * @param trainFilePath the location of the local file to which the data should be written
+     */
+    public void createTrainDataFile(String dataUri, String trainFilePath) {
+        URL url = null;
+        BufferedReader br = null;
+        String line;
+        try {
+            PrintWriter writer = new PrintWriter(trainFilePath, "UTF-8");
+            url = new URL(dataUri);
+            br = new BufferedReader((new InputStreamReader(url.openStream())));
+            br.readLine();
+            while ((line = br.readLine()) != null) {
+                writer.println(line);
+            }
+            writer.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/FeatureVectorGenerator.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/FeatureVectorGenerator.java
new file mode 100644
index 000000000..33c58b4c0
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/FeatureVectorGenerator.java
@@ -0,0 +1,68 @@
+package org.dice_research.squirrel.predictor;
+
+import com.google.common.hash.Hashing;
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.nlp.VectorizerUtils;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+/**
+ * Creates the feature vector that can be used in the prediction. It considers
+ * the intrinsic URI features and the intrinsic features of the
+ * referring URI.
+ * Feature hashing uses the hash function MurmurHash3 to map the feature vectors into binary vectors.
+ *
+ */
+
+public class FeatureVectorGenerator {
+    public static final Logger LOGGER = LoggerFactory.getLogger(FeatureVectorGenerator.class);
+
+    /**
+     * Performs feature hashing to reduce the dimension of the features of the URIs.
+     * @param uri the URI whose feature vector is to be calculated
+     */
+    public void featureHashing(CrawleableUri uri) {
+        ArrayList<String> tokens1 = new ArrayList<>();
+
+        // Creating tokens of the current URI
+        tokenCreation(uri, tokens1);
+        CrawleableUri referUri;
+
+        // Creating tokens of the referring URI
+        if (uri.getData(Constants.REFERRING_URI) != null) {
+            referUri = new CrawleableUri((URI) uri.getData(Constants.REFERRING_URI));
+            if (referUri != null)
+                tokenCreation(referUri, tokens1);
+        }
+        String[] tokens = tokens1.toArray(new String[0]);
+        try {
+            DoubleVector feature = VectorizerUtils.sparseHashVectorize(tokens, Hashing.murmur3_128(), () -> new SequentialSparseDoubleVector(2 << 14));
+            double[] d;
+            d = feature.toArray();
+            uri.addData(Constants.FEATURE_VECTOR, d);
+
+        } catch (Exception e) {
+            LOGGER.warn("Exception caused while adding the feature vector to the URI map", e);
+        }
+
+    }
+
+    /**
+     * Splits the URI into small tokens.
+     * @param uri the URI whose tokens are to be obtained
+     * @param tokens the list in which the tokens are to be stored
+     */
+    public void tokenCreation(CrawleableUri uri, ArrayList<String> tokens) {
+        String[] uriToken;
+        uriToken = uri.getUri().toString().split("/|\\.");
+        tokens.addAll(Arrays.asList(uriToken));
+    }
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialPredictor.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialPredictor.java
new file mode 100644
index 000000000..41784add2
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialPredictor.java
@@ -0,0 +1,544 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.activation.SigmoidActivationFunction;
+import de.jungblut.math.dense.SingleEntryDoubleVector;
+import de.jungblut.math.loss.LogLoss;
+import de.jungblut.math.minimize.CostGradientTuple;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.online.minimizer.StochasticGradientDescent;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import de.jungblut.online.regression.RegressionLearner;
+import de.jungblut.online.regression.RegressionModel;
+import de.jungblut.online.regression.multinomial.MultinomialRegressionClassifier;
+import de.jungblut.online.regression.multinomial.MultinomialRegressionLearner;
+import de.jungblut.online.regression.multinomial.MultinomialRegressionModel;
+import de.jungblut.online.regularization.AdaptiveFTRLRegularizer;
+import de.jungblut.online.regularization.CostWeightTuple;
+import de.jungblut.online.regularization.L2Regularizer;
+import de.jungblut.online.regularization.WeightUpdater;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.function.IntFunction;
+
+/**
+ * A predictor that predicts the type of a URI by performing multi-class classification.
+ */
+public final class MultinomialPredictor implements Predictor {
+
+    public static final Logger LOGGER = LoggerFactory.getLogger(MultinomialPredictor.class);
+    /**
+     * {@link MultinomialRegressionModel} represents the multinomial regression model used for the prediction of the type of the URI.
+     */
+    private MultinomialRegressionModel multinomialModel;
+    /**
+     * {@link MultinomialRegressionLearner} used to train the model with training data.
+     */
+    private MultinomialRegressionLearner multinomialLearner;
+    /**
+     * {@link MultinomialRegressionClassifier} classifier for the multinomial regression model.
+     * Takes a model or the atomic parts of it and predicts the outcome for a given feature.
+     */
+    private MultinomialRegressionClassifier multinomialClassifier;
+    /**
+     * {@link WeightUpdater} used to update the weights of the predictor model.
+     */
+    private WeightUpdater updater;
+    /**
+     * {@link RegressionLearn} used to train the model with training data.
+     */
+    private RegressionLearn learner;
+    /**
+     * Location of the file containing the training data.
+     */
+    private String filepath;
+    /**
+     * The rate at which the model learns.
+     */
+    private Double learningRate;
+    /**
+     * Regularization parameter L2
+     */
+    private Double l2;
+    /**
+     * Regularization parameter L1
+     */
+    private Double l1;
+    /**
+     * Hyperparameter beta
+     */
+    private Double beta;
+    /**
+     * Holdout validation percentage, between 0 and 1
+     */
+    private Double holdoutValidationPercentage;
+    /**
+     * A list storing the different classes of URIs obtained from the training data.
+     */
+    private ArrayList<String> classList = new ArrayList<>();
+    /**
+     * Used to generate the feature vector of a URI.
+     */
+    private FeatureVectorGenerator featureGenerator = new FeatureVectorGenerator();
+
+    /**
+     * Predicts the type of the URI.
+     * @param uri the URI for which the prediction has to be made
+     * @return the type of the URI
+     */
+    public String predict(CrawleableUri uri) {
+        int pred = 0;
+        String predictedClass = null;
+        try {
+            featureGenerator.featureHashing(uri);
+            Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+            double[] doubleFeatureArray = (double[]) featureArray;
+            DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+            // initialize the regression classifier with the updated model and predict
+            multinomialClassifier = new MultinomialRegressionClassifier(multinomialModel);
+            DoubleVector prediction = multinomialClassifier.predict(features);
+            pred = prediction.maxIndex();
+        } catch (Exception e) {
+            LOGGER.warn("Prediction for this " + uri.getUri().toString() + " failed", e);
+        }
+        predictedClass = this.classList.get(pred);
+        return predictedClass;
+    }
+
+    /**
+     * Updates the predictor model based on this URI.
+     * @param uri the URI based on which the model weights are updated
+     */
+    public void weightUpdate(CrawleableUri uri) {
+        RegressionModel[] newModels = new RegressionModel[this.getMultinomialModel().getModels().length];
+        int i = 0;
+        if (uri.getData(Constants.FEATURE_VECTOR) != null && uri.getData(Constants.URI_TRUE_CLASS) != null) {
+            for (RegressionModel s : this.getMultinomialModel().getModels()) {
+                Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+                double[] doubleFeatureArray = (double[]) featureArray;
+                DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+                Object real_value = uri.getData(Constants.URI_TRUE_CLASS);
+                int rv = (int) real_value;
+                DoubleVector rv_DoubleVector = new SingleEntryDoubleVector(rv);
+                DoubleVector nextExample = features;
+                FeatureOutcomePair realResult = new FeatureOutcomePair(nextExample, rv_DoubleVector); // real outcome
+                // update weights using the updated parameters
+                DoubleVector newWeights = this.updater.prePredictionWeightUpdate(realResult, s.getWeights(), learningRate, 0);
+                CostGradientTuple observed = this.learner.observeExample(realResult, newWeights);
+                // calculate new weights (note that the iteration count is not used)
+                CostWeightTuple update = this.updater.computeNewWeights(newWeights, observed.getGradient(), learningRate, 0, observed.getCost());
+                // update model and classifier
+                newModels[i] = new RegressionModel(update.getWeight(), s.getActivationFunction());
+                i++;
+            }
+            // create a new multinomial model with the updated weights
+            this.multinomialModel = new MultinomialRegressionModel(newModels);
+        } else
+            LOGGER.warn("Feature vector or true class of the URI is null");
+    }
+
+    public RegressionModel getModel() {
+        return null;
+    }
+
+    // learning rate
+    public double getLearningRate() {
+        return learningRate;
+    }
+
+    public void setLearningRate(double learningRate) {
+        this.learningRate = learningRate;
+    }
+
+    // L2
+    public double getL2() {
+        return l2;
+    }
+
+    public void setL2(double l2) {
+        this.l2 = l2;
+    }
+
+    // L1
+    public double getL1() {
+        return l1;
+    }
+
+    public void setL1(double l1) {
+        this.l1 = l1;
+    }
+
+    // beta
+    public double getBeta() {
+        return beta;
+    }
+
+    public void setBeta(double beta) {
+        this.beta = beta;
+    }
+
+    // learner
+    public RegressionLearn getLearner() {
+        return learner;
+    }
+
+    protected void setLearner(RegressionLearn learner) {
+        this.learner = learner;
+    }
+
+    // file path
+    public String getFilepath() {
+        return filepath;
+    }
+
+    protected void setFilepath(String filepath) {
+        this.filepath = filepath;
+    }
+
+    // updater
+    public WeightUpdater getUpdater() {
+        return updater;
+    }
+
+    protected void setUpdater(WeightUpdater updater) {
+        this.updater = updater;
+    }
+
+    // multinomial model
+    public MultinomialRegressionModel getMultinomialModel() {
+        return multinomialModel;
+    }
+
+    protected void setMultinomialModel(MultinomialRegressionModel multinomialModel) {
+        this.multinomialModel = multinomialModel;
+    }
+
+
+    // multinomial learner
+    public MultinomialRegressionLearner getMultinomialLearner() {
+        return multinomialLearner;
+    }
+
+    protected void setMultinomialLearner(MultinomialRegressionLearner multinomialLearner) {
+        this.multinomialLearner = multinomialLearner;
+    }
+
+    // multinomial classifier
+    public MultinomialRegressionClassifier getMultinomialClassifier() {
+        return multinomialClassifier;
+    }
+
+    protected void setMultinomialClassifier(MultinomialRegressionClassifier multinomialClassifier) {
+        this.multinomialClassifier = multinomialClassifier;
+    }
+
+
+    public Double getHoldoutValidationPercentage() {
+        return holdoutValidationPercentage;
+    }
+
+    private void setHoldoutValidationPercentage(Double holdoutValidationPercentage) {
+        this.holdoutValidationPercentage = holdoutValidationPercentage;
+    }
+
+    public ArrayList<String> getClassList() {
+        return this.classList;
+    }
+
+    /**
+     * A builder for the MultinomialPredictor that uses a regression model and a regression learner
+     * along with default training data and other default hyperparameters.
+     */
+    public static class MultinomialPredictorBuilder {
+
+        private TrainingDataProvider trainingDataProvider = new MultinomialTrainDataProviderImpl(); // training data provider
+
+        protected StochasticGradientDescent sgd; // minimizer
+
+        private RegressionLearn learner; // learner
+
+        private WeightUpdater updater; // updater
+
+        private MultinomialRegressionLearner multinomialLearner; // multinomial learner
+
+        private MultinomialRegressionModel multinomialModel; // multinomial model
+
+        private MultinomialRegressionClassifier multinomialClassifier; // multinomial classifier
+
+        private Double learningRate; // learning rate
+
+        private Double beta; // beta
+
+        private Double l1; // L1
+
+        private Double l2; // L2
+
+        private Double holdoutValidationPercentage; // validation percentage, between 0 and 1
+
+        private String filePath; // file path of the training data
+
+        private ArrayList<String> classList = new ArrayList<>(); // list containing the names of the different classes of URI
+
+        public MultinomialPredictorBuilder(MultinomialRegressionLearner learner, MultinomialRegressionModel model, MultinomialRegressionClassifier classifier, WeightUpdater updater) {
+            this.multinomialLearner = learner;
+            this.multinomialModel = model;
+            this.multinomialClassifier = classifier;
+            this.updater = updater;
+        }
+
+        public MultinomialPredictorBuilder() {
+        }
+
+        public MultinomialPredictorBuilder withUpdater(WeightUpdater updater) {
+            this.setUpdater(updater);
+            return this;
+        }
+
+        public MultinomialPredictorBuilder withLearner(MultinomialRegressionLearner multinomialLearner) {
+            this.setMultinomialLearner(multinomialLearner);
+            return this;
+        }
+
+        public MultinomialPredictorBuilder withModel(MultinomialRegressionModel multinomialModel) {
+            this.setMultinomialModel(multinomialModel);
+            return this;
+        }
+
+        public MultinomialPredictorBuilder withClassifier(MultinomialRegressionClassifier multinomialClassifier) {
+            this.setMultinomialRegressionClassifier(multinomialClassifier);
+            return this;
+        }
+
+        public MultinomialPredictorBuilder withFile(String filepath) {
+            this.setFilePath(filepath);
+            return this;
+        }
+
+        public MultinomialPredictorBuilder withLearningRate(Double learningRate) {
+            this.setLearningRate(learningRate);
+            return this;
+        }
+
+        public MultinomialPredictorBuilder withL1(Double l1) {
+            this.setL1(l1);
+            return this;
+        }
+
+        public MultinomialPredictorBuilder withL2(Double l2) {
+            this.setL2(l2);
+            return this;
+        }
+
+        public MultinomialPredictorBuilder withBeta(Double beta) {
+            this.setBeta(beta);
+            return this;
+        }
+
+        IntFunction<RegressionLearner> factory = (i) -> {
+            // take care of not sharing any state from the outside, since classes are trained in parallel
+            StochasticGradientDescent minimizer = StochasticGradientDescent.StochasticGradientDescentBuilder
+                .create(0.01)
+                .holdoutValidationPercentage(0.1d)
+                .weightUpdater(new L2Regularizer(0.1))
+                .progressReportInterval(1_000)
+                .build();
+            RegressionLearner learner = new RegressionLearner(minimizer,
+                new SigmoidActivationFunction(), new LogLoss());
+            learner.setNumPasses(5);
+            return learner;
+        };
+
+        public MultinomialPredictor build() {
+            MultinomialPredictor predictor = new MultinomialPredictor();
+
+            // learning rate
+            if (this.getLearningRate() == null)
+                this.setLearningRate(0.7);
+            predictor.setLearningRate(this.getLearningRate());
+
+            // beta
+            if (this.getBeta() == null)
+                this.setBeta(1);
+            predictor.setBeta(this.getBeta());
+
+            // L1
+            if (this.getL1() == null)
+                this.setL1(1);
+            predictor.setL1(this.getL1());
+
+            // L2
+            if (this.getL2() == null)
+                this.setL2(1);
+            predictor.setL2(this.getL2());
+
+            // updater
+            if (this.getUpdater() == null) {
+                this.setUpdater(new AdaptiveFTRLRegularizer(this.getBeta(), this.getL1(), this.getL2()));
+            }
+            predictor.setUpdater(this.getUpdater());
+
+            // holdout validation percentage
+            if (this.getHoldoutValidationPercentage() == null) {
+                this.setHoldoutValidationPercentage(0.05d);
+            }
+            predictor.setHoldoutValidationPercentage(this.getHoldoutValidationPercentage());
+
+            sgd = StochasticGradientDescent.StochasticGradientDescentBuilder
+                .create(this.getLearningRate()) // learning rate
+                .holdoutValidationPercentage(this.getHoldoutValidationPercentage()) // 5% as validation set
+                .historySize(10_000) // keep 10k samples to compute relative improvement
+                .weightUpdater(this.getUpdater()) // FTRL updater
+                .progressReportInterval(1_000) // report every n iterations
+                .build();
+
+            // regression learner
+            if (this.getLearner() == null)
+                this.setLearner(new RegressionLearn(sgd, new SigmoidActivationFunction(), new LogLoss()));
+            predictor.setLearner(this.getLearner());
+
+            // multinomial learner
+            if (this.getMultinomialLearner() == null)
+                this.setMultinomialLearner(new MultinomialRegressionLearner(factory));
+            predictor.setMultinomialLearner(this.getMultinomialLearner());
+
+
+            // model
+            if (this.getMultinomialModel() == null) {
+                // Storing the names of the classes of the URI obtained from the training data
+                BufferedReader br = null;
+                String line;
+                try {
+                    br = new BufferedReader(new InputStreamReader(getClass().getClassLoader().getResourceAsStream(filePath),
+                        Charset.defaultCharset()));
+                    while ((line = br.readLine()) != null) {
+                        String[] split = line.split(",");
+                        split[1] = split[1].replace("\"", "");
+                        if (!this.classList.contains(split[1])) {
+                            this.classList.add(split[1]);
+                        }
+                    }
+                    predictor.classList = this.classList;
+
+                } catch (Exception e) {
+                    LOGGER.warn("Exception happened while finding the classes of the URI from training data file", e);
+                }
+                this.setMultinomialModel(multinomialLearner.train(() -> trainingDataProvider.setUpStream(this.getFilePath(), this.classList)));
+            }
+            predictor.setMultinomialModel(this.getMultinomialModel());
+
+            // classifier
+            if (this.getMultinomialClassifier() == null)
+                if (this.getMultinomialModel() != null)
+                    this.setMultinomialRegressionClassifier(new MultinomialRegressionClassifier(this.getMultinomialModel()));
+            predictor.setMultinomialClassifier(this.getMultinomialClassifier());
+
+            return predictor;
+        }
+
+        // learner
+        private RegressionLearn getLearner() {
+            return learner;
+        }
+
+        private void setLearner(RegressionLearn regressionLearn) {
+            this.learner = regressionLearn;
+        }
+
+        // updater
+        private WeightUpdater getUpdater() {
+            return updater;
+        }
+
+        private void setUpdater(WeightUpdater updater) {
+            this.updater = updater;
+        }
+
+        // multinomial model
+        private MultinomialRegressionModel getMultinomialModel() {
+            return multinomialModel;
+        }
+
+        private void setMultinomialModel(MultinomialRegressionModel multinomialRegressionModel) {
+            this.multinomialModel = multinomialRegressionModel;
+        }
+
+        // multinomial classifier
+        private MultinomialRegressionClassifier getMultinomialClassifier() {
+            return multinomialClassifier;
+        }
+
+        private void setMultinomialRegressionClassifier(MultinomialRegressionClassifier multinomialRegressionClassifier) {
+            this.multinomialClassifier = multinomialRegressionClassifier;
+        }
+
+        // multinomial regression learner
+        private MultinomialRegressionLearner getMultinomialLearner() {
+            return multinomialLearner;
+        }
+
+        private void setMultinomialLearner(MultinomialRegressionLearner multinomialRegressionLearner) {
+            this.multinomialLearner = multinomialRegressionLearner;
+        }
+
+        public Double getLearningRate() {
+            return learningRate;
+        }
+
+        private void setLearningRate(double learningRate) {
+            this.learningRate = learningRate;
+        }
+
+        private Double getBeta() {
+            return beta;
+        }
+
+        private void setBeta(double beta) {
+            this.beta = beta;
+        }
+
+        private Double getL1() {
+            return l1;
+        }
+
+        private void setL1(double l1) {
+            this.l1 = l1;
+        }
+
+    private Double getL2() {
+        return l2;
+    }
+
+    private void setL2(double l2) {
+        this.l2 = l2;
+    }
+
+    private String getFilePath() {
+        return filePath;
+    }
+
+    private void setFilePath(String filePath) {
+        this.filePath = filePath;
+    }
+
+    private Double getHoldoutValidationPercentage() {
+        return holdoutValidationPercentage;
+    }
+
+    private void setHoldoutValidationPercentage(Double holdoutValidationPercentage) {
+        this.holdoutValidationPercentage = holdoutValidationPercentage;
+    }
+
+    private ArrayList getClassList() {
+        return this.classList;
+    }
+
+    }
+
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialTrainDataProviderImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialTrainDataProviderImpl.java
new file mode 100644
index 000000000..d8056dc08
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/MultinomialTrainDataProviderImpl.java
@@ -0,0 +1,98 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.dense.DenseDoubleVector;
+import de.jungblut.math.sparse.SequentialSparseDoubleVector;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.stream.Stream;
+
+public class MultinomialTrainDataProviderImpl implements TrainingDataProvider {
+
+    Logger LOGGER = LoggerFactory.getLogger(MultinomialTrainDataProviderImpl.class);
+    private FeatureVectorGenerator featureGenerator = new FeatureVectorGenerator();
+
+    /**
+     * Converts the data in the training file into a stream that can be fed into the learner.
+     * @param filePath path of the file containing the training data
+     * @param classList list containing the class names of the URIs
+     * @return a stream of feature/outcome pairs parsed from the file
+     */
+    @Override
+    public Stream setUpStream(String filePath, ArrayList classList) {
+        BufferedReader br = null;
+        try {
+            br = new BufferedReader(new InputStreamReader(getClass().getClassLoader().getResourceAsStream(filePath)
+                , Charset.defaultCharset()));
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+        return br.lines().map((s) -> parseFeature(s, classList));
+    }
+
+    public FeatureOutcomePair parseFeature(String line, ArrayList classList) {
+        // Build one one-hot outcome vector per class
+        DoubleVector[] classes = new DoubleVector[classList.size()];
+        for (int i = 0; i < classes.length; i++) {
+            classes[i] = new DenseDoubleVector(classes.length);
+            classes[i].set(i, 1d);
+        }
+        String[] split = line.split(",");
+        URI furi = null;
+        try {
+            furi = new URI(split[0].replace("\"", ""));
+        } catch (URISyntaxException e) {
+            // Fall back to a fixed, parseable URI if the line cannot be parsed
+            try {
+                furi = new URI("http://scoreboard.lod2.eu/data/scoreboardDataCube.rdf");
+            } catch (URISyntaxException ex) {
+                LOGGER.warn("Exception happened while parsing train data file", ex);
+            }
+        }
+        // Generate the hashed feature vector and read it back from the URI's data map
+        CrawleableUri uri = new CrawleableUri(furi);
+        featureGenerator.featureHashing(uri);
+        Object featureArray = uri.getData(Constants.FEATURE_VECTOR);
+        double[] doubleFeatureArray = (double[]) featureArray;
+        DoubleVector features = new SequentialSparseDoubleVector(doubleFeatureArray);
+        split[1] = split[1].replace("\"", "");
+        DoubleVector predVector;
+        if (classList.indexOf(split[1]) != -1)
+            predVector = classes[classList.indexOf(split[1])];
+        else
+            predVector = classes[0];
+
+        return new FeatureOutcomePair(features, predVector);
+    }
+
+    /**
+     * Used to create a file using the data from an online source.
+     * @param dataUri The location of the online source
+     * @param trainFilePath The location of the local file to which the data should be written
+     */
+    public void createTrainDataFile(String dataUri, String trainFilePath) {
+        BufferedReader br = null;
+        URL url = null;
+        String line;
+        try {
+            PrintWriter writer = new PrintWriter(trainFilePath, "UTF-8");
+            url = new URL(dataUri);
+            br = new BufferedReader((new InputStreamReader(url.openStream())));
+            // read and discard the first line (assumed header)
+            br.readLine();
+            while ((line = br.readLine()) != null) {
+                writer.println(line);
+            }
+            writer.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/Predictor.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/Predictor.java
new file mode 100644
index 000000000..5ec08e863
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/Predictor.java
@@ -0,0 +1,33 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.online.regression.RegressionModel;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+
+/**
+ * Interface of an online learner predicting the URI type.
+ * This interface is used for building a learner, training it, and predicting the type of a URI.
+ */
+public interface Predictor {
+    /**
+     * Returns the predicted type of the given URI. The underlying prediction score is
+     * between 0 and 1; e.g., in the case of RDF type prediction, the closer the score
+     * is to 1, the more likely the URI is of type RDF.
+     *
+     * @param uri
+     *            {@link CrawleableUri} URI whose class is to be predicted.
+     *
+     * @return the predicted class.
+     */
+    String predict(CrawleableUri uri);
+
+    /**
+     * Updates the weights of the model. It uses the predicted value and the true label,
+     * together with the feature vector stored in the URI's data map, to calculate the
+     * new weights.
+     *
+     * @param uri
+     *            {@link CrawleableUri} URI whose feature vector is used to update the weights
+     */
+    void weightUpdate(CrawleableUri uri);
+
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/RegressionLearn.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/RegressionLearn.java
new file mode 100644
index 000000000..1d80e452b
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/RegressionLearn.java
@@ -0,0 +1,40 @@
+package org.dice_research.squirrel.predictor;
+
+import com.google.common.base.Preconditions;
+import de.jungblut.math.DoubleVector;
+import de.jungblut.math.activation.ActivationFunction;
+import de.jungblut.math.dense.SingleEntryDoubleVector;
+import de.jungblut.math.loss.LossFunction;
+import de.jungblut.math.minimize.CostGradientTuple;
+import de.jungblut.online.minimizer.StochasticMinimizer;
+import de.jungblut.online.ml.FeatureOutcomePair;
+import de.jungblut.online.regression.RegressionLearner;
+
+/**
+ * A {@link RegressionLearner} that exposes the cost and gradient computed for a
+ * single example.
+ */
+public class RegressionLearn extends RegressionLearner {
+
+    private StochasticMinimizer minimizer;
+
+    private final ActivationFunction activationFunction;
+    private final LossFunction lossFunction;
+
+    public RegressionLearn(StochasticMinimizer minimizer,
+            ActivationFunction activationFunction, LossFunction lossFunction) {
+        super(minimizer, activationFunction, lossFunction);
+        this.activationFunction = Preconditions.checkNotNull(activationFunction,
+            "activation function");
+        this.lossFunction = Preconditions.checkNotNull(lossFunction,
+            "loss function");
+    }
+
+    public CostGradientTuple observeExample(FeatureOutcomePair next, DoubleVector weights) {
+        // Hypothesis: activation function applied to the dot product of features and weights
+        DoubleVector hypothesis = new SingleEntryDoubleVector(this.activationFunction.apply(next.getFeature().dot(weights)));
+        // Loss of this hypothesis and its gradient with respect to the weights
+        double cost = this.lossFunction.calculateLoss(next.getOutcome(), hypothesis);
+        DoubleVector gradient = this.lossFunction.calculateGradient(next.getFeature(), next.getOutcome(), hypothesis);
+        return new CostGradientTuple(cost, gradient);
+    }
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/TrainingDataProvider.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/TrainingDataProvider.java
new file mode 100644
index 000000000..57c658755
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/predictor/TrainingDataProvider.java
@@ -0,0 +1,21 @@
+package org.dice_research.squirrel.predictor;
+
+import de.jungblut.online.ml.FeatureOutcomePair;
+
+import java.util.ArrayList;
+import java.util.stream.Stream;
+
+/**
+ * Interface to provide training data to the predictor.
+ */
+public interface TrainingDataProvider {
+
+    /**
+     * Takes the file containing the training data and converts it into a stream
+     * that a predictor can use for training.
+ * @param filePath path of the file containing the training data + * @param classList list containing the class names of the URI + * @return a stream of train data + */ + Stream setUpStream(String filePath, ArrayList classList); +} diff --git a/squirrel.frontier/src/main/resources/binomialTrainData.txt b/squirrel.frontier/src/main/resources/binomialTrainData.txt new file mode 100644 index 000000000..8e8983574 --- /dev/null +++ b/squirrel.frontier/src/main/resources/binomialTrainData.txt @@ -0,0 +1,228 @@ +"http://api.kasabi.com/dataset/jisc-cetis-project-directory/apis/sparql","sparql" +"http://ipi.bio2rdf.org/sparql","sparql" +"http://skos.um.es/unesco6/unesco6.ttl","dereferenceable" +"http://data.kasabi.com/dataset/prelinger-archives/film//1935_comedy_mallinckrodt.rdf","dereferenceable" +"http://red.gnoss.com/en/community/agrega/Agrega-explorer/tag/tema?rdf","dereferenceable" +"http://api.kasabi.com/dataset/prelinger-archives/apis/sparql","sparql" +"http://quebec.bio2rdf.org/download/data/hgnc/hgnc.n3.gz","dereferenceable" +"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://quebec.bio2rdf.org/download/data/chebi/chebi.n3.gz","dereferenceable" +"http://bio2rdf.org/interpro:ipr000700","dereferenceable" 
+"http://api.talis.com/stores/climb/services/sparql","sparql" +"http://bio2rdf.org/rdfxml/interpro:ipr013315","dereferenceable" +"http://api.kasabi.com/dataset/prelinger-archives/apis/sparql","sparql" +"http://quebec.bio2rdf.org/download/data/ncbi/ncbi.omim.n3.gz","dereferenceable" +"http://api.kasabi.com/dataset/renewable-energy-generators/apis/sparql","sparql" +"http://quebec.bio2rdf.org/download/data/ncbi/ncbi.homologene.n3.gz","dereferenceable" +"http://quebec.bio2rdf.org/download/data/hgnc/hgnc.n3.gz","dereferenceable" +"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://nasa.dataincubator.org/person/eugeneandrewcernan","dereferenceable" +"http://quebec.bio2rdf.org/download/data/kegg/kegg.cpd.n3.gz","dereferenceable" +"http://api.talis.com/stores/theviewfrom/services/sparql","sparql" +"http://linkedmanchester.org/sparql","sparql" +"http://data.kasabi.com/dataset/discogs/release/2129148/track/2.ttl","dereferenceable" +"http://datendienst.d-nb.de/cgi-bin/mabit.pl?cmd=fetch&userID=opendata&pass=opendata&mabheft=Title.ttl.gz","dereferenceable" +"http://fanhu.bz/schema#","dereferenceable" +"http://data.kasabi.com/dataset/foodista/recipe/FSZG4354.rdf","dereferenceable" +"http://quebec.bio2rdf.org/download/data/affymetrix/affymetrix.n3.gz","dereferenceable" +"http://quebec.bio2rdf.org/download/data/kegg/kegg.rn.n3.gz","dereferenceable" +"http://ec.bio2rdf.org/sparql","sparql" +"http://linkedscotland-downloads.s3.amazonaws.com/pupils.ttl.gz","dereferenceable" +"http://quebec.bio2rdf.org/download/data/obo/obo.n3.gz","dereferenceable" +"http://quebec.bio2rdf.org/download/data/kegg/kegg.gl.n3.gz","dereferenceable" +"http://www.openmobilenetwork.org/page/cell55961_12294_262_1.rdf","dereferenceable" +"http://quebec.bio2rdf.org/download/data/kegg/kegg.path.n3.gz","dereferenceable" +"http://linkedscotland-downloads.s3.amazonaws.com/sns-geography.ttl.gz","dereferenceable" +"https://commondatastorage.googleapis.com/ckannet-storage/2012-03-14T021756/metadata.dc.rdf","dereferenceable" +"http://api.kasabi.com/dataset/discogs/apis/sparql","sparql" +"http://ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/taxonomy.rdf.gz","dereferenceable" +"http://quebec.bio2rdf.org/download/data/biocyc/biocyc.n3.gz","dereferenceable" +"http://statistics.data.gov.uk/def/administrative-geography/MetropolitanCounty","dereferenceable" 
+"http://purl.org/weso/datasets/nomenclator/asturias/2010/nomenclator-asturias-2010.ttl","dereferenceable" +"http://lod2.eu/model/export/?m=http%3A%2F%2Flod2.eu%2F&f=rdfxml","dereferenceable" +"http://linkedmanchester.org/resources/linkedmanchester.org/id/buses/route/125.rdf","dereferenceable" +"https://commondatastorage.googleapis.com/ckannet-storage/2012-05-09T135942/iris2-linkedData-v0.2.rdf","dereferenceable" +"http://www.languagelibrary.eu/owl/simple/simple_ontology.owl","dereferenceable" +"http://kaiko.getalp.org/dbnary/static/lemon/latest/de_dbnary_lemon.ttl","dereferenceable" +"http://mlode.nlp2rdf.org/downloads/ids.nt.gz","dereferenceable" +"http://www.languagelibrary.eu/owl/simple/lemonsource/simple_lemonlistindividuals.owl","dereferenceable" +"http://quebec.bio2rdf.org/download/data/ncbi/ncbi.geneid.n3.gz","dereferenceable" +"http://s4.semanticscience.org/bio2rdf_download/rdf/current/ctd/","dereferenceable" +"http://ckan.net/storage/f/file/text-turtle-2304325680559-1346754246694","dereferenceable" +"http://rdf.geospecies.org/ont/geospecies.owl","dereferenceable" +"http://agalpha.mathbiol.org:10035/repositories/tcga","dereferenceable" +"http://dbpedia.bio2rdf.org/sparql","sparql" +"http://www.languagelibrary.eu/owl/simple/inds/simplelistindividuals.owl","dereferenceable" +"http://glottolog.org/downloadarea/references.rdf.zip","dereferenceable" +"http://zbw.eu/stw/versions/latest/download/download.php?filename=stw.rdf.zip","dereferenceable" +"http://data.colinda.org/conference.php?id=1","dereferenceable" +"http://api.talis.com/stores/jgoodwin-genealogy/services/sparql","sparql" +"http://code.google.com/p/linkedrecipes/downloads/detail?name=linked-recipes-schema-0.1.ttl&can=2&q=","dereferenceable" +"http://pokedex.dataincubator.org/pokemon/garchomp","dereferenceable" +"http://moseley.dataincubator.org/artist/beth-orton","dereferenceable" +"http://data.bib.uni-mannheim.de/sparql","sparql" +"http://kasabi.com/api/sparql-endpoint-foodista","sparql" +"http://api.kasabi.com/dataset/foodista/apis/sparql","sparql" +"http://api.talis.com/stores/wordnet/services/sparql","sparql" +"http://data.uni-muenster.de/php/sparql","sparql" +"http://biocarta.bio2rdf.org/sparql","sparql" +"http://www.languagelibrary.eu/owl/simple/inds/simpleallindividuals.owl","dereferenceable" +"http://cas.bio2rdf.org/sparql","sparql" +"http://www.languagelibrary.eu/owl/simple/lemonsource/simple_lemonallindividuals.owl","dereferenceable" +"http://quebec.bio2rdf.org/download/data/go/go.n3.gz","dereferenceable" +"http://api.kasabi.com/dataset/yahoo-geoplanet/apis/sparql","sparql" +"http://quebec.bio2rdf.org/download/data/mgi/mgi.n3.gz","dereferenceable" +"http://opendatacommunities-downloads.s3.amazonaws.com/imd-2010-imd-rank.ttl.zip","dereferenceable" +"http://data.kasabi.com/dataset/renewable-energy-generators/generator/R00001NANI.rdf","dereferenceable" +"https://ckannet-storage.commondatastorage.googleapis.com/2012-11-21T021132/ontology.rdf","dereferenceable" +"http://linkedscotland.org/sparql","sparql" +"http://api.kasabi.com/dataset/renewable-energy-generators/apis/sparql","sparql" +"http://linkedscotland.org/sparql","sparql" +"http://dimitros.net/query.sparql","sparql" +"http://linkedscotland.org/sparql","sparql" +"http://api.kasabi.com/dataset/nasa/apis/sparql","sparql" +"http://api.talis.com/stores/airports/services/sparql","sparql" +"http://api.talis.com/stores/fanhubz/services/sparql","sparql" +"http://api.talis.com/stores/theviewfrom/services/sparql","sparql" +"http://linkedmanchester.org/sparql","sparql" 
+"http://webenemasuno.linkeddata.es/source/rdf/data.zip","dereferenceable" +"http://chembl.bio2rdf.org/sparql","sparql" +"http://api.kasabi.com/dataset/jisc-cetis-project-directory/apis/sparql","sparql" +"http://labs.systemone.at/wikipedia3/enwiki/20060326/enwiki-20060326.rdf.gz","dereferenceable" +"http://givingsense.eu/frscol/FrSchoolSystem/","dereferenceable" +"https://data.seattle.gov/api/views/w3y2-x633/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://spcdata.digitpa.gov.it/data/aoo.ttl","dereferenceable" +"http://gadm.geovocab.org/id/0_10","dereferenceable" +"http://www.ebi.ac.uk/rdf/services/atlas/sparql","sparql" +"https://data.cityofnewyork.us/api/views/tar7-vww3/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.wa.gov/api/views/tmay-2i9v/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://www.nosdonnees.fr/storage/f/2013-03-25T151047/annu_lirmm_0.1.rdf","dereferenceable" +"http://spcdata.digitpa.gov.it/data/uo.ttl","dereferenceable" +"https://data.wa.gov/api/views/kbv8-aawq/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://dati.camera.it/ocd/files/persona.turtle.gz","dereferenceable" +"https://data.maryland.gov/api/views/bwyv-uyh2/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://github.com/HsH-Bibliothek/geodata","dereferenceable" +"https://data.lacity.org/api/views/4ee5-wmby/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.ok.gov/api/views/a2a7-88yx/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.ok.gov/api/views/iz5g-gb92/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/drh3-e2fd/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://status.scoffoni.net/index.php/pscoffoni/foaf","dereferenceable" +"http://srcmf.org/public/alexis-rdf.zip","dereferenceable" +"https://data.mo.gov/api/views/x8fg-yyye/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://statusnet.sebseb01.net/sebseb01/foaf","dereferenceable" +"https://data.seattle.gov/api/views/uyyd-8gak/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/s8jv-f44n/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.kingcounty.gov/api/views/er52-nehu/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://data.dm2e.eu","dereferenceable" +"http://dati.camera.it/sparql","sparql" +"http://data.cnr.it/sparql/","sparql" +"http://oracle.skilledtests.com/chomskybot/foaf","dereferenceable" +"http://somsants.net/maulet1714/foaf","dereferenceable" +"http://status.soucy.cc/hs0ucy/foaf","dereferenceable" +"http://spip.org/spipmedias/foaf","dereferenceable" +"http://spraci.org/michaelmd/foaf","dereferenceable" +"https://data.illinois.gov/api/views/f7nd-jj28/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://dati.opendataground.it:80/comunealbanolaziale/906.rdf","dereferenceable" +"https://data.maryland.gov/api/views/kicx-k4rc/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/65z6-rsii/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.medicare.gov/api/views/nrth-mfg3/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.ok.gov/api/views/fwkc-astr/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.baltimorecity.gov/api/views/nxbm-dfav/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.wa.gov/api/views/ak95-mjh9/rows.rdf?accessType=DOWNLOAD","dereferenceable" 
+"https://data.maryland.gov/api/views/pdvh-tf2u/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.hawaii.gov/api/views/hc7x-8745/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://health.data.ny.gov/api/views/hbu9-xsrx/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.ok.gov/api/views/pyir-desi/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.medicare.gov/api/views/dgck-syfz/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/kpav-sd4t/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://datacatalog.cookcountyil.gov/api/views/5px6-amgc/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://health.data.ny.gov/api/views/ivw2-k53g/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://dati.opendataground.it:80/comunealbanolaziale/907.rdf","dereferenceable" +"https://data.lacity.org/api/views/3gwn-arjr/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.maryland.gov/api/views/xyrh-5e77/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/uvy2-xbnp/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.baltimorecity.gov/api/views/2j28-xzd7/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.sfgov.org/api/views/hbza-6v77/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.seattle.gov/api/views/egc4-d24i/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/u553-m549/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.illinois.gov/api/views/uxhq-ykba/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/kjxa-7ccf/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.baltimorecity.gov/api/views/jcci-nzfy/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/p94q-8hxh/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/p424-amsu/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.illinois.gov/api/views/t224-vrp2/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.montgomerycountymd.gov/api/views/5pue-gfbe/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.maryland.gov/api/views/xedu-p97g/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://blogs.bootsnall.com/luka/index.rdf","dereferenceable" +"https://data.medicare.gov/api/views/k2ze-bqvw/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.medicare.gov/api/views/ytf2-4ept/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.ok.gov/api/views/jguv-se39/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/arq3-7z49/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.sfgov.org/api/views/88g8-5mnd/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.seattle.gov/api/views/7ais-f98f/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.maryland.gov/api/views/mk5a-nf44/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/hh8v-7m7u/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://cb.semsol.org/sparql?query=dump","dereferenceable" +"https://data.maryland.gov/api/views/cmwa-gxtm/rows.rdf?accessType=DOWNLOAD","dereferenceable" 
+"https://data.baltimorecity.gov/api/views/xmpa-487w/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.baltimorecity.gov/api/views/rb22-mgti/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://linguistic.linkeddata.es/id/apertium/lexiconEN","dereferenceable" +"https://data.sfgov.org/api/views/s593-yv8k/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.illinois.gov/api/views/92jh-73bc/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.illinois.gov/api/views/mszz-27vx/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://dati.opendataground.it:80/comunealbanolaziale/979.rdf","dereferenceable" +"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://datahub.io/dataset/hellenic-fire-brigade","dereferenceable" +"https://data.cityofchicago.org/api/views/v7ui-k59z/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.sfgov.org/api/views/v94x-pf9r/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.seattle.gov/api/views/fy3x-rf3i/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"https://data.seattle.gov/api/views/7svg-ds5z/rows.rdf?accessType=DOWNLOAD","dereferenceable" +"http://www.data.gov/semantic/data/alpha/1564/dataset-1564.rdf.gz","dereferenceable" diff --git a/squirrel.frontier/src/main/resources/multiNomialTrainData.txt b/squirrel.frontier/src/main/resources/multiNomialTrainData.txt new file mode 100644 index 000000000..1393a14b3 --- /dev/null +++ b/squirrel.frontier/src/main/resources/multiNomialTrainData.txt @@ -0,0 +1,292 @@ +"http://api.kasabi.com/dataset/jisc-cetis-project-directory/apis/sparql","SPARQL" +"http://ipi.bio2rdf.org/sparql","SPARQL" +"http://api.kasabi.com/dataset/prelinger-archives/apis/sparql","SPARQL" +"http://api.talis.com/stores/schemapedia/services/sparql","SPARQL" +"http://zbw.eu/beta/sparql/stw","SPARQL" +"http://api.kasabi.com/api/sparql-endpoint-near","SPARQL" +"http://api.talis.com/stores/pokedex/services/sparql","SPARQL" +"http://api.talis.com/stores/climb/services/sparql","SPARQL" +"http://pdb.bio2rdf.org/sparql","SPARQL" +"http://api.talis.com/stores/datagovuk/services/sparql","SPARQL" +"http://api.kasabi.com/dataset/bricklink/apis/sparql","SPARQL" +"http://api.talis.com/stores/moseley/services/sparql","SPARQL" +"http://api.talis.com/stores/theviewfrom/services/sparql","SPARQL" +"http://linkedmanchester.org/sparql","SPARQL" +"http://data.bib.uni-mannheim.de/sparql","SPARQL" 
+"http://kasabi.com/api/sparql-endpoint-foodista","SPARQL" +"http://api.kasabi.com/dataset/foodista/apis/sparql","SPARQL" +"http://api.talis.com/stores/wordnet/services/sparql","SPARQL" +"http://data.uni-muenster.de/php/sparql","SPARQL" +"http://biocarta.bio2rdf.org/sparql","SPARQL" +"http://linkedscotland.org/sparql","SPARQL" +"http://api.kasabi.com/dataset/nasa/apis/sparql","SPARQL" +"http://api.talis.com/stores/airports/services/sparql","SPARQL" +"http://api.talis.com/stores/fanhubz/services/sparql","SPARQL" +"http://api.talis.com/stores/theviewfrom/services/sparql","SPARQL" +"http://linkedmanchester.org/sparql","SPARQL" +"http://lab3.libris.kb.se/sparql","SPARQL" +"http://lod.ac/bdls/sparql","SPARQL" +"http://eris.okfn.org/sparql","SPARQL" +"http://corpora.nlp2rdf.org/sparql","SPARQL" +"http://data.cnr.it/sparql-proxy/","SPARQL" +"http://data.ox.ac.uk/sparql/","SPARQL" +"http://wiki.rkbexplorer.com/sparql/","SPARQL" +"http://lobid.org/sparql/","SPARQL" +"http://fao.270a.info/sparql","SPARQL" +"http://dbtune.org/myspace/sparql/","SPARQL" +"http://gutenberg.dcs.fi.uva.es/~bhscmcyt/census/sparql_en.php","SPARQL" +"http://api.kasabi.com/dataset/pali-english-lexicon/apis/sparql","SPARQL" +"http://ibm.rkbexplorer.com/sparql/","SPARQL" +"http://api.kasabi.com/dataset/pali-english-lexicon/apis/sparql","SPARQL" +"http://drugbank.bio2rdf.org/sparql","SPARQL" +"http://sparql.wikipathways.org/","SPARQL" +"http://ndb.publink.lod2.eu/sparql","SPARQL" +"http://eurecom.rkbexplorer.com/sparql/","SPARQL" +"http://eur-lex.publicdata.eu/sparql","SPARQL" +"http://data.archiveshub.ac.uk/dump/linkedarchiveshub.zip","DUMP" +"http://pisa.rkbexplorer.com/sparql/","SPARQL" +"http://resex.rkbexplorer.com/sparql/","SPARQL" +"http://www4.wiwiss.fu-berlin.de/euraxess/sparql","SPARQL" +"http://datos.bne.es/sparql","SPARQL" +"http://spending.lichfielddc.gov.uk/sparql","SPARQL" +"http://ccny-cuny.eagle-i.net/sparqler/sparql","SPARQL" +"http://upr.eagle-i.net/sparqler/sparql","SPARQL" +"http://services.data.gov.uk/research/sparql","SPARQL" +"http://el.dbpedia.org/sparql","SPARQL" +"http://n-lex.publicdata.eu/sparql","SPARQL" +"http://govwild.org/sparql","SPARQL" +"http://setaria.oszk.hu/sparql","SPARQL" +"http://cb.semsol.org/sparql","SPARQL" +"http://howard.eagle-i.net/sparqler/sparql","SPARQL" +"http://foreign.rkbexplorer.com/sparql","SPARQL" +"http://revyu.com/sparql","SPARQL" +"http://transparency.270a.info/sparql","SPARQL" +"http://photos.rkbexplorer.com/sparql","SPARQL" +"http://os.rkbexplorer.com/sparql/","SPARQL" +"http://libver.math.auth.gr/sparql","SPARQL" +"http://sparql.linkedopendata.it/los","SPARQL" +"http://lisbon.rkbexplorer.com/sparql","SPARQL" +"http://www4.wiwiss.fu-berlin.de/gutendata/sparql","SPARQL" +"http://unodc.publicdata.eu/sparql","SPARQL" +"http://oecd.270a.info/sparql","SPARQL" +"http://resource.geolba.ac.at/PoolParty/sparql/GeologicUnit","SPARQL" +"http://resource.geolba.ac.at/PoolParty/sparql/lithology","SPARQL" +"http://www4.wiwiss.fu-berlin.de/diseasome/sparql","SPARQL" +"http://eurostat.linked-statistics.org/sparql","SPARQL" +"http://data.ox.ac.uk/sparql/","SPARQL" +"http://services.data.gov.uk/education/sparql","SPARQL" +"http://www4.wiwiss.fu-berlin.de/cordis/sparql","SPARQL" +"http://xula.eagle-i.net/sparqler/sparql","SPARQL" +"https://eagle-i.ea.vanderbilt.edu/sparqler/sparql","SPARQL" +"http://www4.wiwiss.fu-berlin.de/medicare/sparql","SPARQL" +"http://sparql.linkedopendata.it/grrt","SPARQL" +"http://lod.euscreen.eu/sparql","SPARQL" 
+"http://resrev.ilrt.bris.ac.uk/data-server-workshop/sparql","SPARQL" +"http://api.talis.com/stores/pbac/services/sparql","SPARQL" +"http://soa4all.isoco.net/luf/sparql","SPARQL" +"http://dewey.info/sparql.php","SPARQL" +"http://api.talis.com/stores/mesh-norwegian/services/sparql","SPARQL" +"http://newcastle.rkbexplorer.com/sparql/","SPARQL" +"http://rdf.muninn-project.org/sparql","SPARQL" +"http://cultura.linkeddata.es/sparql","SPARQL" +"http://platform.uberblic.org/api/v1/sparql","SPARQL" +"http://cr.eionet.europa.eu/sparql","SPARQL" +"http://tkm.kiom.re.kr/ontology/sparql","SPARQL" +"http://wordnet.rkbexplorer.com/sparql/","SPARQL" +"http://miuras.inf.um.es/sparql","SPARQL" +"http://epsrc.rkbexplorer.com/sparql","SPARQL" +"http://api.kasabi.com/dataset/ecco-tcp-eighteenth-century-collections-online-texts/apis/sparql","SPARQL" +"http://sparql.data.southampton.ac.uk/","SPARQL" +"http://cdrewu.eagle-i.net/sparqler/sparql","SPARQL" +"https://data.lacity.org/api/views/4ee5-wmby/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.ok.gov/api/views/a2a7-88yx/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.ok.gov/api/views/iz5g-gb92/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/drh3-e2fd/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.seattle.gov/api/views/uyyd-8gak/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/s8jv-f44n/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.kingcounty.gov/api/views/er52-nehu/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.illinois.gov/api/views/f7nd-jj28/rows.rdf?accessType=DOWNLOAD","DUMP" +"http://dati.opendataground.it:80/comunealbanolaziale/906.rdf","DUMP" +"https://data.maryland.gov/api/views/kicx-k4rc/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/65z6-rsii/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.medicare.gov/api/views/nrth-mfg3/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.ok.gov/api/views/fwkc-astr/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.baltimorecity.gov/api/views/nxbm-dfav/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.wa.gov/api/views/ak95-mjh9/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.maryland.gov/api/views/pdvh-tf2u/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.hawaii.gov/api/views/hc7x-8745/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://health.data.ny.gov/api/views/hbu9-xsrx/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.ok.gov/api/views/pyir-desi/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.medicare.gov/api/views/dgck-syfz/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/kpav-sd4t/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://datacatalog.cookcountyil.gov/api/views/5px6-amgc/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://health.data.ny.gov/api/views/ivw2-k53g/rows.rdf?accessType=DOWNLOAD","DUMP" +"http://dati.opendataground.it:80/comunealbanolaziale/907.rdf","DUMP" +"https://data.lacity.org/api/views/3gwn-arjr/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.maryland.gov/api/views/xyrh-5e77/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofchicago.org/api/views/uvy2-xbnp/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.baltimorecity.gov/api/views/2j28-xzd7/rows.rdf?accessType=DOWNLOAD","DUMP" 
+"https://data.sfgov.org/api/views/hbza-6v77/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.seattle.gov/api/views/egc4-d24i/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/u553-m549/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.illinois.gov/api/views/uxhq-ykba/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/kjxa-7ccf/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.baltimorecity.gov/api/views/jcci-nzfy/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/p94q-8hxh/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/p424-amsu/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.illinois.gov/api/views/t224-vrp2/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.montgomerycountymd.gov/api/views/5pue-gfbe/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.maryland.gov/api/views/xedu-p97g/rows.rdf?accessType=DOWNLOAD","DUMP" +"http://blogs.bootsnall.com/luka/index.rdf","DUMP" +"https://data.medicare.gov/api/views/k2ze-bqvw/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.medicare.gov/api/views/ytf2-4ept/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.ok.gov/api/views/jguv-se39/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/arq3-7z49/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.sfgov.org/api/views/88g8-5mnd/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.seattle.gov/api/views/7ais-f98f/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.maryland.gov/api/views/mk5a-nf44/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/hh8v-7m7u/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.sfgov.org/api/views/s593-yv8k/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.illinois.gov/api/views/92jh-73bc/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.illinois.gov/api/views/mszz-27vx/rows.rdf?accessType=DOWNLOAD","DUMP" +"http://dati.opendataground.it:80/comunealbanolaziale/979.rdf","DUMP" +"https://data.sfgov.org/api/views/ea9w-4zvc/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.exim.gov/api/views/sa52-sypr/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.lacity.org/api/views/4ca8-mxuh/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.ok.gov/api/views/52ak-m2d5/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/qb3k-n8mm/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofchicago.org/api/views/s6ha-ppgi/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.hawaii.gov/api/views/usfi-mive/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofchicago.org/api/views/cauq-8yn6/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/y74e-vkxy/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/qqsi-vm9f/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://health.data.ny.gov/api/views/2hcc-shji/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/kwk4-6u9e/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.seattle.gov/api/views/jedg-8zvw/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.seattle.gov/api/views/kdjv-k5qf/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/mnz3-dyi8/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/mdbu-nrqn/rows.rdf?accessType=DOWNLOAD","DUMP" 
+"https://data.medicare.gov/api/views/qqc4-6tc7/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/dvzp-h4k9/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.medicare.gov/api/views/zqjn-m8m8/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.medicare.gov/api/views/9n3s-kdb3/rows.rdf?accessType=DOWNLOAD","DUMP" +"http://dati.opendataground.it:80/comunealbanolaziale/908.rdf","DUMP" +"https://data.kingcounty.gov/api/views/dkxx-z4fb/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/qhen-5rve/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/jfju-ynrr/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.medicare.gov/api/views/rs6n-9qwg/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofchicago.org/api/views/5yjb-v3mj/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/x2hp-8ukt/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.ny.gov/api/views/fjce-ze3t/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/cxzk-qz9w/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/ftxv-d5ix/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.hawaii.gov/api/views/jzyk-q3tp/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.medicare.gov/api/views/qd2y-qcgs/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.hawaii.gov/api/views/wwsw-d6qv/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.oregon.gov/api/views/edj7-vxdr/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofchicago.org/api/views/3qdj-cqb8/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.oregon.gov/api/views/8s3k-ygh2/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.ok.gov/api/views/js93-d7pp/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.baltimorecity.gov/api/views/782b-zpd7/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/upwt-zvh3/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.cityofnewyork.us/api/views/tbf6-u8ea/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.maryland.gov/api/views/3bkz-cttp/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.sfgov.org/api/views/yitu-d5am/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.baltimorecity.gov/api/views/fswi-8fjy/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.illinois.gov/api/views/sp57-w96j/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://datacatalog.cookcountyil.gov/api/views/excn-ffg4/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.kingcounty.gov/api/views/yaai-7frk/rows.rdf?accessType=DOWNLOAD","DUMP" +"https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.rdf?accessType=DOWNLOAD","DUMP" +"http://suche.transparenz.hamburg.de/","CKAN" +"http://www.opendata.provincia.roma.it/","CKAN" +"http://opendata.cmt.es","CKAN" +"http://data.gov.uk/","CKAN" +"http://www.dati.gov.it/catalog/","CKAN" +"http://www.daten.rlp.de/","CKAN" +"http://opendata.aragon.es/catalogo/","CKAN" +"http://rotterdamopendata.nl/","CKAN" +"http://data.gov.ie/","CKAN" +"http://www.opendata.admin.ch/en/","CKAN" +"http://cz.ckan.net/en/","CKAN" +"http://data.amsterdamopendata.nl/","CKAN" +"http://data.opendataforum.info/","CKAN" +"http://oppnadata.se/","CKAN" +"https://www.data.gv.at/katalog/","CKAN" +"https://www.govdata.de/ckan/","CKAN" +"http://opengov.es/","CKAN" +"http://opendata.hu/","CKAN" +"https://offenedaten.de/","CKAN" +"http://ckan.data.graz.gv.at/","CKAN" +"http://ckan.data.linz.gv.at/","CKAN" +"http://www.nosdonnees.fr/","CKAN" +"http://datosabiertos.malaga.eu/","CKAN" 
+"http://dati.trentino.it/","CKAN" +"http://dati.veneto.it/","CKAN" +"http://data.kk.dk/","CKAN" +"http://portal.openbelgium.be/","CKAN" +"http://www.odaa.dk/","CKAN" +"http://ckan.opendatacanarias.es/","CKAN" +"http://it.ckan.net/","CKAN" +"http://rs.ckan.net/","CKAN" +"http://www.hri.fi/en/","CKAN" +"http://data.bris.ac.uk/data/","CKAN" +"http://data.london.gov.uk","CKAN" +"https://open-data.europa.eu/en/data/","CKAN" +"http://data.upf.edu/en/","CKAN" +"http://opendata.ayto-caceres.es/","CKAN" +"http://opendata.opennorth.se/","CKAN" +"http://datos.alcobendas.org/","CKAN" +"http://data.opendataportal.at/","CKAN" +"http://opendata.comune.bari.it/","CKAN" +"http://www.opendatamalta.org/ckan/","CKAN" +"http://dati.toscana.it/","CKAN" +"http://data.glasgow.gov.uk/","CKAN" +"http://data.gov.ro/","CKAN" +"http://publicdata.eu/","CKAN" +"http://datahub.io/","CKAN" +"http://data.gov.md/ckan/","CKAN" +"https://www.opengov-muenchen.de/","CKAN" +"http://data.noe.gv.at/","CKAN" +"http://datos.santander.es/catalogo/","CKAN" +"http://ckan.gobex.es/","CKAN" +"http://catalogo.upo.gob.es/en/","CKAN" +"http://data.gov.hr/","CKAN" +"http://annuario.comune.fi.it/","CKAN" +"http://www.datagm.org.uk/","CKAN" +"http://data.wu.ac.at/","CKAN" +"http://opendata.cnmc.es/","CKAN" +"http://opendata.awt.be/","CKAN" +"http://pl.ckan.net/","CKAN" +"https://opendata.government.bg","CKAN" +"http://www.leedsdatamill.org","CKAN" +"https://data.overheid.nl/data/","CKAN" +"http://apigobiernoabiertocatalog.valencia.es/","CKAN" +"http://www.opendatahub.it/","CKAN" +"http://www.dataset.puglia.it/","CKAN" +"http://data.zagreb.hr/","CKAN" +"https://danepubliczne.gov.pl","CKAN" +"https://www.data.gouv.fr","CKAN" +"http://data.gov.gr/","CKAN" +"http://www.edinburghopendata.info/","CKAN" +"https://opendata.riik.ee/","CKAN" +"http://data.salzburgerland.com/","CKAN" +"http://opendatagortynia.gr/","CKAN" +"http://www.opendata-hro.de/","CKAN" +"http://opingogn.is/","CKAN" +"http://data.gov.sk/","CKAN" +"https://data.stadt-zuerich.ch/","CKAN" +"https://www.avoindata.fi/data/fi/","CKAN" +"http://en.openei.org/datasets/","CKAN" +"http://opendata.cambridgeshireinsight.org.uk/","CKAN" +"http://marinedata.scotland.gov.uk/~marine/","CKAN" +"http://opendata.bonn.de/","CKAN" +"http://data.gov.ua/","CKAN" +"http://data.mfcr.cz/","CKAN" +"http://opendatakosovo.org/data","CKAN" +"http://api.talis.com/stores/eupmedia/services/sparql","SPARQL" +"http://cr3.eionet.europa.eu/sparql","SPARQL" +"http://data.gov.ua/","CKAN" +"http://data.mfcr.cz/","CKAN" diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java index 48e801043..f907b9718 100644 --- a/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/frontier/impl/FrontierImplTest.java @@ -16,6 +16,8 @@ import org.dice_research.squirrel.data.uri.UriType; import org.dice_research.squirrel.data.uri.filter.MongoDBKnowUriFilter; import org.dice_research.squirrel.data.uri.norm.NormalizerImpl; +import org.dice_research.squirrel.predictor.MultinomialPredictor; +import org.dice_research.squirrel.predictor.Predictor; import org.dice_research.squirrel.queue.ipbased.MongoDBIpBasedQueue; import org.junit.After; import org.junit.Assert; @@ -32,6 +34,7 @@ public class FrontierImplTest { private static MongoDBKnowUriFilter filter; private static List uris = new 
ArrayList(); private static CrawleableUriFactory4Tests cuf = new CrawleableUriFactory4Tests(); + private static Predictor predictor; @Before public void setUp() throws Exception { @@ -43,7 +46,8 @@ public void setUp() throws Exception { queue = new MongoDBIpBasedQueue("localhost", 58027); filter.open(); queue.open(); - frontier = new FrontierImpl(new NormalizerImpl(), filter, queue,true); + predictor = new MultinomialPredictor(); + frontier = new FrontierImpl(new NormalizerImpl(), filter, queue,true, predictor); uris.add(cuf.create(new URI("http://dbpedia.org/resource/New_York"), InetAddress.getByName("127.0.0.1"), UriType.DEREFERENCEABLE)); @@ -172,4 +176,4 @@ public void tearDown() throws Exception { p = Runtime.getRuntime().exec(rethinkDockerRmCommand); p.waitFor(); } -} \ No newline at end of file +} diff --git a/squirrel.frontier/src/test/java/org/dice_research/squirrel/predictor/impl/BinomialPredictorEvaluation.java b/squirrel.frontier/src/test/java/org/dice_research/squirrel/predictor/impl/BinomialPredictorEvaluation.java new file mode 100644 index 000000000..04f37e066 --- /dev/null +++ b/squirrel.frontier/src/test/java/org/dice_research/squirrel/predictor/impl/BinomialPredictorEvaluation.java @@ -0,0 +1,196 @@ +package org.dice_research.squirrel.predictor.impl; + +import org.dice_research.squirrel.data.uri.CrawleableUri; +import org.dice_research.squirrel.predictor.BinomialPredictor; + +import java.io.*; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Random; + +public class BinomialPredictorEvaluation { + + /** + * Used to initialize the object for the binomial predictor + */ + protected BinomialPredictor predictor; + + /** + * Indicates the path to the file containing train data. + */ + protected String trainFilePath; + + /** + * Indicates the name of the type which should be used as positive class while training. + */ + protected String positiveClass; + + /** + * Indicates the path to the file containing the test data. + */ + String testFilePath; + /** + * Used to generate the + */ + + /** + * Constructor. + * + * @param trainFilePath Indicates the path to the file containing train data. + * @param positiveClass Indicates the name of the type which should be used as positive class while training. + * @param testFilePath Indicates the path to the file containing the test data. 
+ */ + public BinomialPredictorEvaluation(String trainFilePath, String positiveClass, String testFilePath) { + this.trainFilePath = trainFilePath; + this.positiveClass = positiveClass; + this.testFilePath = testFilePath; + + } + + /** + * Function to evaluate the performance of the URI predictor on a test set + */ + public void evaluation() { + Integer uriCount = 0; + Integer correctCount = 0; + double accuracy; + Integer truePos = 0; + Integer falsePos = 0; + Integer falseNeg = 0; + Integer trueNeg = 0; + BufferedReader br = null; + try{ + FileReader in = new FileReader(testFilePath); + br = new BufferedReader(in); + String line; + while ((line = br.readLine()) != null){ + uriCount++; + String[] split = line.split("," ); + URI furi = null; + try{ + furi = new URI(split[0].replace("\"", "")); + }catch (URISyntaxException e) { + try { + furi = new URI("http://scoreboard.lod2.eu/data/scoreboardDataCube.rdf"); + } catch (URISyntaxException ex) { + ex.printStackTrace(); + } + } + CrawleableUri uri = new CrawleableUri(furi); + String pred = this.predictor.predict(uri); + split[1] = split[1].replace("\"", ""); + if(split[1].equals(positiveClass)){ + //System.out.println("the class is: " + split[1]); + if(pred.equals("dereferencing")){ + correctCount ++; + truePos ++; + } + else{ + falseNeg ++; + } + } + else{ + if(!pred.equals("dereferencing")){ + correctCount ++; + trueNeg ++; + } + else{ + falsePos ++; + } + } + } + + }catch (IOException e){ + e.printStackTrace(); + } + accuracy = correctCount.floatValue() / uriCount.floatValue(); + + System.out.println(" The total number of URIs is: " + uriCount); + System.out.println(" The total number of correct predictions is: " + correctCount); + System.out.println(" The accuracy of the predictor is: " + accuracy); + System.out.println("True Positive is: " + truePos); + System.out.println("False Positive is: " + falsePos); + System.out.println("False Negative is: " + falseNeg); + System.out.println("True Negative is: " + trueNeg); + + } + + /** + * Function to perform K-fold cross validation. + */ + public void crossValidation(){ + URL url = null; + BufferedReader br = null; + ArrayList lineList = new ArrayList(); + int[][] train; + int[][] test; + int[] index; + String line; + int folds = 10; + int chunk; + try { + + + br = new BufferedReader(new InputStreamReader(getClass().getClassLoader().getResourceAsStream("binomialTrainData.txt") + , Charset.defaultCharset())); + line = br.readLine(); + while( ( line = br.readLine()) != null){ + lineList.add(line); + } + System.out.println(lineList.size()); + Collections.shuffle(lineList, new Random(113)); + chunk = lineList.size()/folds; + train = new int[folds][]; + test = new int[folds][]; + index = new int[lineList.size()]; + for (int i = 0; i < lineList.size(); i++) { + index[i] = i; + } + for(int i=0; i=start && j classList = new ArrayList<>(); + static{ + classList.add("SPARQL"); + classList.add("DUMP"); + classList.add("CKAN"); + } + + Integer[][] confusionMatrix = new Integer[3][3]; + + /** + * Constructor. + * + * @param trainFilePath + * Indicates the path to the file containing train data. + * @param testFilePath + * Indicates the path to the file containing the test data. 
+ */ + public MultinomialPredictorEvaluation(String trainFilePath, String testFilePath){ + this.trainFilePath = trainFilePath; + this.testFilePath = testFilePath; + for(int i=0; i<3; i++){ + for(int j=0; j<3; j++){ + confusionMatrix[i][j] = 0; + } + } + } + + /** + * Function to evaluate the performance of the URI predictor on a test set + */ + public double evaluation() { + + Integer uriCount = 0; + Integer correctCount = 0; + double accuracy; + BufferedReader br = null; + try (FileReader in = new FileReader(testFilePath)){ + br = new BufferedReader(in); + String line; + while ((line = br.readLine()) != null) { + uriCount ++; + String[] split = line.split("," ); + URI furi = null; + try { + furi = new URI(split[0].replace("\"", "")); + } catch (URISyntaxException e) { + try { + furi = new URI("http://scoreboard.lod2.eu/data/scoreboardDataCube.rdf"); + } catch (URISyntaxException ex) { + ex.printStackTrace(); + } + } + CrawleableUri uri = new CrawleableUri(furi); + String pred = predictor.predict(uri); + //System.out.println("predicted values: "+ pred); + split[1] = split[1].replace("\"", ""); + //System.out.println("the classList index: "+classList.indexOf(split[1])); + if(classList.indexOf(split[1]) != -1) + confusionMatrix[classList.indexOf(split[1])][classList.indexOf(pred)]++; + if(pred.equals(split[1])){ + correctCount++; + } + } + } catch (IOException e) { + e.printStackTrace(); + } + accuracy = correctCount.floatValue() / uriCount.floatValue(); + for(int i=0; i<3; i++){ + for(int j=0; j<3; j++){ + System.out.print(" " +confusionMatrix[i][j]); + } + System.out.println(); + } + System.out.println(" The total number of URIs is: " + uriCount); + System.out.println(" The total number of correct predictions is: " + correctCount); + System.out.println(" The accuracy of the predictor is: " + accuracy); + return accuracy; + } + + /** + * Function to perform K-fold cross validation. 
+ */ + public void crossValidation(){ + URL url = null; + BufferedReader br = null; + ArrayList lineList = new ArrayList(); + int[][] train; + int[][] test; + int[] index; + String line; + int folds = 10; + int chunk; + try { + br = new BufferedReader(new InputStreamReader(new FileInputStream(trainFilePath))); + while( ( line = br.readLine()) != null){ + lineList.add(line); + } + Collections.shuffle(lineList, new Random(113)); + chunk = lineList.size()/folds; + train = new int[folds][]; + test = new int[folds][]; + index = new int[lineList.size()]; + for (int i = 0; i < lineList.size(); i++) { + index[i] = i; + } + for(int i=0; i=start && j analyze(CrawleableUri curi, File data, Sink sink) { sink.addTriple(curi, t); } ActivityUtil.addStep(curi, getClass()); + curi.addData(Constants.URI_TRUE_CLASS, "DUMP"); return collector.getUris(curi); } catch (IOException | org.rdfhdt.hdt.exceptions.NotFoundException e) { LOGGER.error("An error occured when processing the HDT file", e); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java index 27ffa59ad..1abb6f704 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/RDFAnalyzer.java @@ -98,6 +98,9 @@ public Iterator analyze(CrawleableUri curi, File data, Sink sink) { } } ActivityUtil.addStep(curi, getClass()); + if(curi.getData(Constants.URI_TRUE_CLASS) == null){ + curi.addData(Constants.URI_TRUE_CLASS, "DUMP"); + } return collector.getUris(curi); } catch (Exception e) { LOGGER.error("Exception while analyzing. Aborting. ", e); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java index 19b1b4edd..5de521107 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/analyzer/impl/ckan/CkanJsonAnalyzer.java @@ -48,7 +48,11 @@ public CkanJsonAnalyzer(UriCollector collector) { public Iterator analyze(CrawleableUri curi, File data, Sink sink) { // Make sure that the file contains the CKAN JSON objects we are expecting if (Constants.URI_TYPE_VALUE_CKAN.equals(curi.getData(Constants.URI_HTTP_MIME_TYPE_KEY))) { + + curi.addData(Constants.URI_TRUE_CLASS, "CKAN"); + LOGGER.info("Starting the Ckan Json Analyzer for URI: " + curi.getUri().toString()); + Stream lines = null; try { lines = Files.lines(data.toPath(), StandardCharsets.UTF_8); diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/sparql/SparqlBasedFetcher.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/sparql/SparqlBasedFetcher.java index 7c0474f23..ed33be4c8 100644 --- a/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/sparql/SparqlBasedFetcher.java +++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/fetcher/sparql/SparqlBasedFetcher.java @@ -1,144 +1,147 @@ -package org.dice_research.squirrel.fetcher.sparql; - -import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.sql.SQLException; -import java.util.Iterator; - -import org.aksw.jena_sparql_api.core.QueryExecutionFactory; -import 
-import org.aksw.jena_sparql_api.delay.core.QueryExecutionFactoryDelay;
-import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp;
-import org.aksw.jena_sparql_api.pagination.core.QueryExecutionFactoryPaginated;
-import org.apache.commons.io.FileUtils;
-import org.apache.jena.graph.Triple;
-import org.apache.jena.query.QueryExecution;
-import org.apache.jena.query.QuerySolution;
-import org.apache.jena.query.ResultSet;
-import org.apache.jena.riot.RDFDataMgr;
-import org.apache.tika.io.IOUtils;
-import org.dice_research.squirrel.Constants;
-import org.dice_research.squirrel.data.uri.CrawleableUri;
-import org.dice_research.squirrel.fetcher.Fetcher;
-import org.dice_research.squirrel.fetcher.delay.Delayer;
-import org.dice_research.squirrel.metadata.ActivityUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.stereotype.Component;
-
-/**
- * A simple {@link Fetcher} for SPARQL that tries to get triples from a SPARQL
- * endpoint using the query {@value #SELECT_ALL_TRIPLES_QUERY}.
- *
- * @author Michael Röder (michael.roeder@uni-paderborn.de)
- *
- */
-@Component
-public class SparqlBasedFetcher implements Fetcher {
-
-    private static final Logger LOGGER = LoggerFactory.getLogger(SparqlBasedFetcher.class);
-
-    /**
-     * The default minimum delay that the system will have between sending two queries.
-     */
-    private static final int MINIMUM_DELAY = 1000;
-
-    private static final String SELECT_ALL_TRIPLES_QUERY = "SELECT ?s ?p ?o\r\n" + "WHERE {\r\n" + "GRAPH ?g {\r\n"
-            + "?s ?p ?o\r\n" + "}} ";
-
-    protected int minimumDelay = MINIMUM_DELAY;
-    protected File dataDirectory = FileUtils.getTempDirectory();
-
-    @Override
-    public File fetch(CrawleableUri uri, Delayer delayer) {
-        // Check whether we can be sure that it is a SPARQL endpoint
-        boolean shouldBeSparql = Constants.URI_TYPE_VALUE_SPARQL.equals(uri.getData(Constants.URI_TYPE_KEY));
-        QueryExecutionFactory qef = null;
-        QueryExecution execution = null;
-        File dataFile = null;
-        OutputStream out = null;
-        try {
-            // Get the permission for the first request
-            delayer.getRequestPermission();
-            // Create query execution instance
-            qef = initQueryExecution(uri.getUri().toString(), delayer);
-            // create temporary file
-            try {
-                dataFile = File.createTempFile("fetched_", "", dataDirectory);
-                out = new BufferedOutputStream(new FileOutputStream(dataFile));
-            } catch (IOException e) {
-                LOGGER.error("Couldn't create temporary file for storing fetched data. Returning null.", e);
-                return null;
-            }
-            execution = qef.createQueryExecution(SELECT_ALL_TRIPLES_QUERY);
-            ResultSet resultSet = execution.execSelect();
-            RDFDataMgr.writeTriples(out, new SelectedTriplesIterator(resultSet));
-            uri.addData(Constants.URI_HTTP_MIME_TYPE_KEY, "application/n-triples");
-            LOGGER.info("Added: " + uri.getData(Constants.URI_HTTP_MIME_TYPE_KEY));
-        } catch (Throwable e) {
-            // If this should have worked, print a message, otherwise silently return null
-            if (shouldBeSparql) {
-                LOGGER.error("Couldn't create QueryExecutionFactory for \"" + uri.getUri() + "\". Returning -1.");
-                ActivityUtil.addStep(uri, getClass(), e.getMessage());
-            }
-            return null;
-        } finally {
-            IOUtils.closeQuietly(out);
-            if (execution != null) {
-                execution.close();
-            }
-            if (qef != null) {
-                qef.close();
-            }
-            delayer.requestFinished();
-        }
-        ActivityUtil.addStep(uri, getClass());
-        return dataFile;
-    }
-
-    protected QueryExecutionFactory initQueryExecution(String uri, Delayer delayer)
-            throws ClassNotFoundException, SQLException {
-        QueryExecutionFactory qef;
-        qef = new QueryExecutionFactoryHttp(uri);
-        qef = new QueryExecutionFactoryDelay(qef, Math.max(minimumDelay, delayer.getDelay()));
-        try {
-            LOGGER.info("Starting to Query uri:" + uri);
-            return new QueryExecutionFactoryPaginated(qef, 1000);
-        } catch (Exception e) {
-            LOGGER.info("Couldn't create Factory with pagination. Returning Factory without pagination. Exception: {}",
-                    e.getLocalizedMessage());
-            return qef;
-        }
-    }
-
-
-
-    @Override
-    public void close() throws IOException {
-        // nothing to do
-    }
-
-    protected static class SelectedTriplesIterator implements Iterator<Triple> {
-        private ResultSet resultSet;
-
-        public SelectedTriplesIterator(ResultSet resultSet) {
-            this.resultSet = resultSet;
-        }
-
-        @Override
-        public boolean hasNext() {
-            return resultSet.hasNext();
-        }
-
-        @Override
-        public Triple next() {
-            QuerySolution solution = resultSet.next();
-            Triple t = new Triple(solution.get("s").asNode(), solution.get("p").asNode(), solution.get("o").asNode());
-            return t;
-        }
-
-    }
-
-}
+package org.dice_research.squirrel.fetcher.sparql;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.sql.SQLException;
+import java.util.Iterator;
+
+import org.aksw.jena_sparql_api.core.QueryExecutionFactory;
+import org.aksw.jena_sparql_api.delay.core.QueryExecutionFactoryDelay;
+import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp;
+import org.aksw.jena_sparql_api.pagination.core.QueryExecutionFactoryPaginated;
+import org.apache.commons.io.FileUtils;
+import org.apache.jena.graph.Triple;
+import org.apache.jena.query.QueryExecution;
+import org.apache.jena.query.QuerySolution;
+import org.apache.jena.query.ResultSet;
+import org.apache.jena.riot.RDFDataMgr;
+import org.apache.tika.io.IOUtils;
+import org.dice_research.squirrel.Constants;
+import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.dice_research.squirrel.fetcher.Fetcher;
+import org.dice_research.squirrel.fetcher.delay.Delayer;
+import org.dice_research.squirrel.metadata.ActivityUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Component;
+
+/**
+ * A simple {@link Fetcher} for SPARQL that tries to get triples from a SPARQL
+ * endpoint using the query {@value #SELECT_ALL_TRIPLES_QUERY}.
+ *
+ * @author Michael Röder (michael.roeder@uni-paderborn.de)
+ *
+ */
+@Component
+public class SparqlBasedFetcher implements Fetcher {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(SparqlBasedFetcher.class);
+
+    /**
+     * The default minimum delay that the system will have between sending two queries.
+     */
+    private static final int MINIMUM_DELAY = 1000;
+
+    private static final String SELECT_ALL_TRIPLES_QUERY = "SELECT ?s ?p ?o\r\n" + "WHERE {\r\n" + "GRAPH ?g {\r\n"
+            + "?s ?p ?o\r\n" + "}} ";
+
+    protected int minimumDelay = MINIMUM_DELAY;
+    protected File dataDirectory = FileUtils.getTempDirectory();
+
+    @Override
+    public File fetch(CrawleableUri uri, Delayer delayer) {
+        // Check whether we can be sure that it is a SPARQL endpoint
+        boolean shouldBeSparql = Constants.URI_TYPE_VALUE_SPARQL.equals(uri.getData(Constants.URI_TYPE_KEY));
+        if (shouldBeSparql) {
+            // Record the true class of this URI for the predictor
+            // (see the annotation sketch at the end of this patch)
+            uri.addData(Constants.URI_TRUE_CLASS, "SPARQL");
+        }
+        QueryExecutionFactory qef = null;
+        QueryExecution execution = null;
+        File dataFile = null;
+        OutputStream out = null;
+        try {
+            // Get the permission for the first request
+            delayer.getRequestPermission();
+            // Create query execution instance
+            qef = initQueryExecution(uri.getUri().toString(), delayer);
+            // create temporary file
+            try {
+                dataFile = File.createTempFile("fetched_", "", dataDirectory);
+                out = new BufferedOutputStream(new FileOutputStream(dataFile));
+            } catch (IOException e) {
+                LOGGER.error("Couldn't create temporary file for storing fetched data. Returning null.", e);
+                return null;
+            }
+            execution = qef.createQueryExecution(SELECT_ALL_TRIPLES_QUERY);
+            ResultSet resultSet = execution.execSelect();
+            RDFDataMgr.writeTriples(out, new SelectedTriplesIterator(resultSet));
+            uri.addData(Constants.URI_HTTP_MIME_TYPE_KEY, "application/n-triples");
+            LOGGER.info("Added: " + uri.getData(Constants.URI_HTTP_MIME_TYPE_KEY));
+        } catch (Throwable e) {
+            // If this should have worked, print a message, otherwise silently return null
+            if (shouldBeSparql) {
+                LOGGER.error("Couldn't create QueryExecutionFactory for \"" + uri.getUri() + "\". Returning null.");
+                ActivityUtil.addStep(uri, getClass(), e.getMessage());
+            }
+            return null;
+        } finally {
+            IOUtils.closeQuietly(out);
+            if (execution != null) {
+                execution.close();
+            }
+            if (qef != null) {
+                qef.close();
+            }
+            delayer.requestFinished();
+        }
+        ActivityUtil.addStep(uri, getClass());
+        return dataFile;
+    }
+
+    protected QueryExecutionFactory initQueryExecution(String uri, Delayer delayer)
+            throws ClassNotFoundException, SQLException {
+        QueryExecutionFactory qef;
+        qef = new QueryExecutionFactoryHttp(uri);
+        qef = new QueryExecutionFactoryDelay(qef, Math.max(minimumDelay, delayer.getDelay()));
+        try {
+            LOGGER.info("Starting to Query uri:" + uri);
+            return new QueryExecutionFactoryPaginated(qef, 1000);
+        } catch (Exception e) {
+            LOGGER.info("Couldn't create Factory with pagination. Returning Factory without pagination. Exception: {}",
+                    e.getLocalizedMessage());
+            return qef;
+        }
+    }
+
+    @Override
+    public void close() throws IOException {
+        // nothing to do
+    }
+
+    protected static class SelectedTriplesIterator implements Iterator<Triple> {
+        private ResultSet resultSet;
+
+        public SelectedTriplesIterator(ResultSet resultSet) {
+            this.resultSet = resultSet;
+        }
+
+        @Override
+        public boolean hasNext() {
+            return resultSet.hasNext();
+        }
+
+        @Override
+        public Triple next() {
+            QuerySolution solution = resultSet.next();
+            Triple t = new Triple(solution.get("s").asNode(), solution.get("p").asNode(), solution.get("o").asNode());
+            return t;
+        }
+
+    }
+
+}
diff --git a/squirrel.worker/src/main/java/org/dice_research/squirrel/worker/impl/WorkerImpl.java b/squirrel.worker/src/main/java/org/dice_research/squirrel/worker/impl/WorkerImpl.java
index a2655b062..ec0af25b9 100644
--- a/squirrel.worker/src/main/java/org/dice_research/squirrel/worker/impl/WorkerImpl.java
+++ b/squirrel.worker/src/main/java/org/dice_research/squirrel/worker/impl/WorkerImpl.java
@@ -224,6 +224,9 @@ public void performCrawling(CrawleableUri uri) {
         LOGGER.debug("I start crawling {} now...", uri);
         File fetched = null;
         try {
+
+
+
             fetched = fetcher.fetch(uri, delayer);
         } catch (Exception e) {
             LOGGER.error("Exception while Fetching Data. Skipping...", e);
@@ -254,10 +257,11 @@ public void performCrawling(CrawleableUri uri) {
                 for (File file : fileList) {
                     LOGGER.info("Analyzing file " + cont + " of: " + fileList.size());
                     Iterator<byte[]> resultUris = analyzer.analyze(uri, file, sink);
-                    sendNewUris(resultUris);
+                    sendNewUris(resultUris, uri);
                     cont++;
                 }
+
             }
         }
     } catch (Exception e) {
@@ -282,11 +286,24 @@ public void performCrawling(CrawleableUri uri) {
             LOGGER.info("Crawling {} is not allowed by the RobotsManager.", uri);
             activity.addStep(manager.getClass(), "Decided to reject this URI.");
         }
+
+        // Record the true label of this URI (dereferenceable or not) so the
+        // predictor can later be evaluated against it
+        if (activity.getNumberOfTriples() > 0) {
+            uri.addData(Constants.URI_TRUE_LABEL, "dereferenceable");
+        } else {
+            uri.addData(Constants.URI_TRUE_LABEL, "NEGATIVE_CLASS");
+        }
+
         if (storeMetadata)
             activity.finishActivity(sink);
         // LOGGER.debug("Fetched {} triples", count);
         setSpecificRecrawlTime(uri);
+
     } finally {
         // Remove the activity since we don't want to send it back to the Frontier
         uri.getData().remove(Constants.URI_CRAWLING_ACTIVITY);
@@ -320,13 +337,14 @@ public boolean sendsAliveMessages() {
      *
      * @param uriIterator an iterator used to iterate over all new URIs
+     * @param uri         the URI that was crawled to produce the new URIs; it is
+     *                    attached to every new URI as the referring URI
      */
-    public void sendNewUris(Iterator<byte[]> uriIterator) {
+    public void sendNewUris(Iterator<byte[]> uriIterator, CrawleableUri uri) {
         List<CrawleableUri> newUris = new ArrayList<>(MAX_URIS_PER_MESSAGE);
         CrawleableUri newUri;
         int packageCount = 0;
         while (uriIterator != null && uriIterator.hasNext()) {
             try {
                 newUri = serializer.deserialize(uriIterator.next());
+                newUri.addData(Constants.REFERRING_URI, uri.getUri());
                 uriProcessor.recognizeUriType(newUri);
                 newUris.add(newUri);
                 if ((newUris.size() >= (packageCount + 1) * MAX_URIS_PER_MESSAGE) && uriIterator.hasNext()) {
@@ -357,4 +375,4 @@
     public int getId() {
         return this.id;
     }
-}
\ No newline at end of file
+}
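
Reviewer note: a minimal sketch of how the evaluation utility added above can be driven. The constructor, evaluation() and crossValidation() come from the diff; the runner class, both file names, and the assumption that MultinomialPredictorEvaluation lives in org.dice_research.squirrel.predictor are placeholders. Both files are expected to contain lines of the form "<uri>","<label>" as parsed by evaluation().

import org.dice_research.squirrel.predictor.MultinomialPredictorEvaluation;

public class PredictorEvaluationRunner {

    public static void main(String[] args) {
        // Placeholder paths to the labelled URI lists
        MultinomialPredictorEvaluation eval =
                new MultinomialPredictorEvaluation("trainData.txt", "testData.txt");

        // Prints the 3x3 confusion matrix and returns the hold-out accuracy
        double accuracy = eval.evaluation();
        System.out.println("Hold-out accuracy: " + accuracy);

        // 10-fold cross validation on the training file
        eval.crossValidation();
    }
}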
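The worker-side changes all funnel into the data map of CrawleableUri. The sketch below summarizes the annotations this patch introduces; the example URIs are hypothetical, while the Constants keys, the label values, and the addData(...) calls are taken from the patch.

import java.net.URI;
import org.dice_research.squirrel.Constants;
import org.dice_research.squirrel.data.uri.CrawleableUri;

public class UriAnnotationSketch {

    public static void main(String[] args) throws Exception {
        URI parent = new URI("http://example.org/dataset");
        CrawleableUri curi = new CrawleableUri(parent);

        // Set by the fetchers/analyzers, depending on what the URI turned out to be:
        // "SPARQL" (SparqlBasedFetcher), "CKAN" (CkanJsonAnalyzer), "DUMP" (HDT/RDF analyzers)
        curi.addData(Constants.URI_TRUE_CLASS, "SPARQL");

        // Set by WorkerImpl.performCrawling() after crawling, based on the triple count
        curi.addData(Constants.URI_TRUE_LABEL, "dereferenceable");

        // Set by WorkerImpl.sendNewUris() on every URI extracted from this one,
        // presumably so the Frontier's predictor can use the referring URI as a feature
        CrawleableUri child = new CrawleableUri(new URI("http://example.org/resource/1"));
        child.addData(Constants.REFERRING_URI, parent);
    }
}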
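evaluation() prints only the raw confusion matrix and the overall accuracy. If per-class numbers are wanted during review, they can be derived from the matrix as sketched below. This helper is not part of the patch; it assumes classList holds the three classes tagged above, with rows as true classes and columns as predicted classes, matching how evaluation() fills the matrix. The numbers in main are illustrative only.

public class ConfusionMatrixMetrics {

    /**
     * Prints precision and recall per class for a square confusion matrix
     * whose rows are true classes and whose columns are predicted classes.
     */
    public static void printMetrics(int[][] m, String[] classes) {
        for (int c = 0; c < classes.length; c++) {
            int truePos = m[c][c];
            int predicted = 0; // column sum: everything predicted as class c
            int actual = 0;    // row sum: everything that truly is class c
            for (int i = 0; i < classes.length; i++) {
                predicted += m[i][c];
                actual += m[c][i];
            }
            double precision = (predicted == 0) ? 0.0 : (double) truePos / predicted;
            double recall = (actual == 0) ? 0.0 : (double) truePos / actual;
            System.out.printf("%s: precision=%.3f recall=%.3f%n", classes[c], precision, recall);
        }
    }

    public static void main(String[] args) {
        // Illustrative numbers only
        int[][] m = { { 40, 2, 1 }, { 3, 25, 2 }, { 0, 4, 23 } };
        printMetrics(m, new String[] { "SPARQL", "CKAN", "DUMP" });
    }
}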