diff --git a/Dockerfile.frontier b/Dockerfile.frontier
index e0b09d4b8..8bafba5a0 100644
--- a/Dockerfile.frontier
+++ b/Dockerfile.frontier
@@ -3,7 +3,7 @@ FROM openjdk:8u151-jdk
RUN apt-get update && apt-get install -y netcat
COPY ./squirrel.frontier/target/squirrel.frontier.jar /data/squirrel/squirrel.jar
-COPY ./spring-config/default-config.xml /data/squirrel/default-config.xml
+COPY ./spring-config/ /data/squirrel/spring-config
WORKDIR /data/squirrel
#ADD entrypoint.sh /entrypoint.sh
@@ -11,4 +11,4 @@ WORKDIR /data/squirrel
VOLUME ["/var/squirrel/data"]
-CMD java -cp squirrel.jar:. org.hobbit.core.run.ComponentStarter org.aksw.simba.squirrel.components.FrontierComponent
+CMD java -cp squirrel.jar:. org.hobbit.core.run.ComponentStarter org.dice_research.squirrel.components.FrontierComponent
diff --git a/Dockerfile.web b/Dockerfile.web
new file mode 100644
index 000000000..3ceddfed8
--- /dev/null
+++ b/Dockerfile.web
@@ -0,0 +1,16 @@
+FROM openjdk:8u151-jdk
+
+RUN apt-get update && apt-get install -y netcat
+
+COPY ./squirrel.web/target/squirrel.web.jar /data/squirrel/squirrel.web.jar
+COPY ./squirrel.web/target/squirrel.web.jar.original /data/squirrel/squirrel.web.jar.original
+COPY ./squirrel.web/WEB-INF /data/squirrel/WEB-INF
+WORKDIR /data/squirrel
+
+#ADD entrypoint.sh /entrypoint.sh
+#RUN chmod +x /entrypoint.sh
+
+VOLUME ["/var/squirrel/data"]
+
+CMD java -cp squirrel.web.jar:. com.squirrel.Application
+
diff --git a/Dockerfile.worker b/Dockerfile.worker
index e7c536e6b..2d405901b 100644
--- a/Dockerfile.worker
+++ b/Dockerfile.worker
@@ -3,7 +3,7 @@ FROM openjdk:8u151-jdk
RUN apt-get update && apt-get install -y netcat
COPY ./squirrel.worker/target/squirrel.worker.jar /data/squirrel/squirrel.jar
-COPY ./spring-config/default-config.xml /data/squirrel/default-config.xml
+COPY ./spring-config /data/squirrel/spring-config
WORKDIR /data/squirrel
#ADD entrypoint.sh /entrypoint.sh
diff --git a/Makefile b/Makefile
index 4fa1ea933..cd800ee53 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,13 @@
default: build
build:
- docker-compose -f docker-compose-sparql.yml down
+ docker-compose -f docker-compose.yml down
mvn clean install -U -DskipTests -Dmaven.javadoc.skip=true
dockerize:
docker build -f Dockerfile.frontier -t squirrel.frontier .
docker build -f Dockerfile.worker -t squirrel.worker .
+ docker build -f Dockerfile.web -t squirrel.web .
start: dockerize
docker-compose -f docker-compose-sparql.yml up
diff --git a/build-squirrel b/build-squirrel
new file mode 100755
index 000000000..ace055933
--- /dev/null
+++ b/build-squirrel
@@ -0,0 +1,15 @@
+#!/bin/bash
+echo "Building Squirrel..."
+cd squirrel.web-api
+mvn clean install
+cd ..
+mvn clean install -DskipTests
+clear
+echo "Creating Frontier image..."
+docker build -f Dockerfile.frontier -t squirrel.frontier .
+echo "Creating Worker image..."
+docker build -f Dockerfile.worker -t squirrel.worker .
+echo "Creating Web image..."
+docker build -f Dockerfile.web -t squirrel.web .
+clear
+echo "Finished"
diff --git a/docker-compose-sparql.yml b/docker-compose-sparql.yml
index 2bd39f1c4..3dde7d9a1 100644
--- a/docker-compose-sparql.yml
+++ b/docker-compose-sparql.yml
@@ -130,8 +130,8 @@ services:
DEDUPLICATION_ACTIVE: "true"
HOBBIT_RABBIT_HOST: rabbit
OUTPUT_FOLDER: /var/squirrel/data
- RDB_HOST_NAME: rethinkdb
- RDB_PORT: 28015
+ MDB_HOST_NAME: mongodb
+ MDB_PORT: 27017
SPARQL_HOST_NAME: sparqlhost
SPARQL_HOST_PORT: 3030
SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
diff --git a/docker-compose-sparql-web.yml b/docker-compose-web.yml
similarity index 85%
rename from docker-compose-sparql-web.yml
rename to docker-compose-web.yml
index 8426312c2..0136bed32 100644
--- a/docker-compose-sparql-web.yml
+++ b/docker-compose-web.yml
@@ -19,14 +19,14 @@ services:
# ports:
frontier:
- image: squirrel:latest
+ image: squirrel.frontier:latest
container_name: frontier
environment:
- HOBBIT_RABBIT_HOST=rabbit
- SEED_FILE=/var/squirrel/seeds.txt
- URI_WHITELIST_FILE=/var/squirrel/whitelist.txt
- - RDB_HOST_NAME=rethinkdb
- - RDB_PORT=28015
+ - MDB_HOST_NAME=mongodb
+ - MDB_PORT=27017
- COMMUNICATION_WITH_WEBSERVICE=true
- VISUALIZATION_OF_CRAWLED_GRAPH=true
volumes:
@@ -36,10 +36,13 @@ services:
command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.dice_research.squirrel.components.FrontierComponent
web:
- image: squirrel/webimage:latest
+ image: squirrel.web:latest
container_name: web
+ environment:
+ - HOST=rabbit
ports:
- "8080:8080"
+ command: java -jar squirrel.web.jar
sparqlhost:
image: stain/jena-fuseki
@@ -70,19 +73,19 @@ services:
- "5672:5672"
worker1:
- image: squirrel:latest
+ image: squirrel.worker:latest
container_name: worker1
environment:
- HOBBIT_RABBIT_HOST=rabbit
- OUTPUT_FOLDER=/var/squirrel/data
- HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml
- - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparqlStoreBased.xml
+ - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context.xml
- SPARQL_HOST_NAME=sparqlhost
#- CKAN_WHITELIST_FILE=/var/squirrel/ckanwhitelist.txt
- SPARQL_HOST_PORT=3030
- DEDUPLICATION_ACTIVE=true
- - RDB_HOST_NAME=rethinkdb
- - RDB_PORT=28015
+ - MDB_HOST_NAME=mongodb
+ - MDB_PORT=27017
#-CKAN_PORT=
volumes:
- ./data/worker1:/var/squirrel/data
@@ -92,19 +95,19 @@ services:
command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
worker2:
- image: squirrel:latest
+ image: squirrel.worker:latest
container_name: worker2
environment:
- HOBBIT_RABBIT_HOST=rabbit
- OUTPUT_FOLDER=/var/squirrel/data
- HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml
- - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparqlStoreBased.xml
+ - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context.xml
- SPARQL_HOST_NAME=sparqlhost
#- CKAN_WHITELIST_FILE=/var/squirrel/ckanwhitelist.txt
- SPARQL_HOST_PORT=3030
- DEDUPLICATION_ACTIVE=true
- - RDB_HOST_NAME=rethinkdb
- - RDB_PORT=28015
+ - MDB_HOST_NAME=mongodb
+ - MDB_PORT=27017
#-CKAN_PORT=
volumes:
- ./data/worker2:/var/squirrel/data
@@ -114,19 +117,19 @@ services:
command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
worker3:
- image: squirrel:latest
+ image: squirrel.worker:latest
container_name: worker3
environment:
- HOBBIT_RABBIT_HOST=rabbit
- OUTPUT_FOLDER=/var/squirrel/data
- HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml
- - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-sparqlStoreBased.xml
+ - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context.xml
- SPARQL_HOST_NAME=sparqlhost
#- CKAN_WHITELIST_FILE=/var/squirrel/ckanwhitelist.txt
- SPARQL_HOST_PORT=3030
- DEDUPLICATION_ACTIVE=true
- - RDB_HOST_NAME=rethinkdb
- - RDB_PORT=28015
+ - MDB_HOST_NAME=mongodb
+ - MDB_PORT=27017
#-CKAN_PORT=
volumes:
- ./data/worker3:/var/squirrel/data
@@ -142,8 +145,8 @@ services:
DEDUPLICATION_ACTIVE: "true"
HOBBIT_RABBIT_HOST: rabbit
OUTPUT_FOLDER: /var/squirrel/data
- RDB_HOST_NAME: rethinkdb
- RDB_PORT: 28015
+ MDB_HOST_NAME: mongodb
+ MDB_PORT: 27017
SPARQL_HOST_NAME: sparqlhost
SPARQL_HOST_PORT: 3030
SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
diff --git a/docker-compose.yml b/docker-compose.yml
index b64e8a21e..dcd855232 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -14,19 +14,30 @@ services:
container_name: frontier
environment:
- HOBBIT_RABBIT_HOST=rabbit
- - SEED_FILE=/var/squirrel/seeds.txt
- URI_WHITELIST_FILE=/var/squirrel/whitelist.txt
- - RDB_HOST_NAME=rethinkdb
- - RDB_PORT=28015
+ - SEED_FILE=/var/squirrel/seeds.txt
- MDB_HOST_NAME=mongodb
- MDB_PORT=27017
- COMMUNICATION_WITH_WEBSERVICE=false
- VISUALIZATION_OF_CRAWLED_GRAPH=false
+ - JVM_ARGS=-Xmx8g
volumes:
- ./data/frontier:/var/squirrel/data
- ./seed/seeds.txt:/var/squirrel/seeds.txt:ro
- - ./whitelist/ckanwhitelist.txt:/var/squirrel/whitelist.txt:ro
-# command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.dice_research.squirrel.components.FrontierComponent
+ - ./whitelist/whitelist.txt:/var/squirrel/whitelist.txt:ro
+ command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.dice_research.squirrel.components.FrontierComponent
+
+ sparqlhost:
+ image: stain/jena-fuseki
+ container_name: sparqlhost
+ ports:
+ - "3030:3030"
+ volumes:
+ - ./data/sparqlhost/sparqlhost_data:/fuseki
+ environment:
+ - ADMIN_PASSWORD=pw123
+ - JVM_ARGS=-Xmx2g
+
mongodb:
image: mongo:4.0.0
volumes:
@@ -39,7 +50,8 @@ services:
volumes:
- ./data/rethinkdb:/data
ports:
- - "28015:28015"
+ - "8080:8080"
+ command: rethinkdb --bind all
# message bus
rabbit:
@@ -58,15 +70,18 @@ services:
- HOBBIT_RABBIT_HOST=rabbit
- OUTPUT_FOLDER=/var/squirrel/data
- HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml
- - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-fileBased.xml
- - DEDUPLICATION_ACTIVE=true
- - RDB_HOST_NAME=rethinkdb
- - RDB_PORT=28015
+ - CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context.xml
+ - SPARQL_HOST_NAME=sparqlhost
+ - SPARQL_HOST_PORT=3030
+ - DEDUPLICATION_ACTIVE=false
+ - MDB_HOST_NAME=mongodb
+ - MDB_PORT=27017
+ - JVM_ARGS=-Xmx8g
volumes:
- ./data/worker1:/var/squirrel/data
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config
-# command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
+ command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
worker2:
image: squirrel.worker:latest
@@ -76,14 +91,17 @@ services:
- OUTPUT_FOLDER=/var/squirrel/data
- HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml
- CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-fileBased.xml
- - DEDUPLICATION_ACTIVE=true
- - RDB_HOST_NAME=rethinkdb
- - RDB_PORT=28015
+ - SPARQL_HOST_NAME=sparqlhost
+ - SPARQL_HOST_PORT=3030
+ - DEDUPLICATION_ACTIVE=false
+ - MDB_HOST_NAME=mongodb
+ - MDB_PORT=27017
+ - JVM_ARGS=-Xmx8g
volumes:
- ./data/worker2:/var/squirrel/data
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config
-# command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
+ command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
worker3:
image: squirrel.worker:latest
@@ -93,28 +111,30 @@ services:
- OUTPUT_FOLDER=/var/squirrel/data
- HTML_SCRAPER_YAML_PATH=/var/squirrel/yaml
- CONTEXT_CONFIG_FILE=/var/squirrel/spring-config/context-fileBased.xml
+ - SPARQL_HOST_NAME=sparqlhost
+ - SPARQL_HOST_PORT=3030
- DEDUPLICATION_ACTIVE=true
- - RDB_HOST_NAME=rethinkdb
- - RDB_PORT=28015
+ - MDB_HOST_NAME=mongodb
+ - MDB_PORT=27017
+ - JVM_ARGS=-Xmx8g
volumes:
- ./data/worker3:/var/squirrel/data
- ./yaml:/var/squirrel/yaml
- ./spring-config:/var/squirrel/spring-config
-# command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
-
-# deduplicator:
-# image: squirrel
-# container_name: deduplicator
-# environment:
-# DEDUPLICATION_ACTIVE: "true"
-# HOBBIT_RABBIT_HOST: rabbit
-# OUTPUT_FOLDER: /var/squirrel/data
-# RDB_HOST_NAME: rethinkdb
-# RDB_PORT: 28015
-# SPARQL_HOST_NAME: sparqlhost
-# SPARQL_HOST_PORT: 3030
-# SERVICE_PRECONDITION: "rethinkdb:28015 rabbit:5672"
-# volumes:
-# - ./data/deduplicator:/var/squirrel/data
-# command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.dice_research.squirrel.components.DeduplicatorComponent
+ command: java -cp squirrel.jar org.dice_research.squirrel.components.WorkerComponentStarter
+ deduplicator:
+ image: squirrel  # FIXME(review): no plain 'squirrel' image is built — Makefile/build-squirrel only build squirrel.frontier/.worker/.web; confirm intended image
+ container_name: deduplicator
+ environment:
+ DEDUPLICATION_ACTIVE: "true"
+ HOBBIT_RABBIT_HOST: rabbit
+ OUTPUT_FOLDER: /var/squirrel/data
+ MDB_HOST_NAME: mongodb
+ MDB_PORT: 27017
+ SPARQL_HOST_NAME: sparqlhost
+ SPARQL_HOST_PORT: 3030
+ SERVICE_PRECONDITION: "mongodb:27017 rabbit:5672"
+ volumes:
+ - ./data/deduplicator:/var/squirrel/data
+ command: java -cp squirrel.jar org.hobbit.core.run.ComponentStarter org.dice_research.squirrel.components.DeduplicatorComponent
diff --git a/pom.xml b/pom.xml
index b8fbe1dc3..21ea11409 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
4.0.0
org.dice-research
squirrel
- 0.3.0-SNAPSHOT
+ 0.3.0
pom
2017
Squirrel
@@ -33,12 +33,12 @@
+ squirrel.web-api
+ squirrel.web
squirrel.api
squirrel.deduplication
squirrel.frontier
squirrel.mockup
- squirrel.web
- squirrel.web-api
squirrel.worker
diff --git a/seed/seeds.txt b/seed/seeds.txt
index 86760d0aa..d2a69b00a 100644
--- a/seed/seeds.txt
+++ b/seed/seeds.txt
@@ -1,3 +1,3 @@
https://dbpedia.org/resource/New_York
https://dbpedia.org/resource/Moscow
-https://dbpedia.org/resource/Brazil
\ No newline at end of file
+https://dbpedia.org/resource/China
diff --git a/spring-config/context-sparql.xml b/spring-config/context-sparql.xml
index a6f209ed4..862b4702e 100644
--- a/spring-config/context-sparql.xml
+++ b/spring-config/context-sparql.xml
@@ -32,41 +32,68 @@
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
- org.dice_research.squirrel.analyzer.impl.HDTAnalyzer
org.dice_research.squirrel.analyzer.impl.RDFAnalyzer
- org.dice_research.squirrel.analyzer.impl.MicrodataParser
+ org.dice_research.squirrel.analyzer.impl.HDTAnalyzer
+ org.dice_research.squirrel.analyzer.impl.html.scraper.HTMLScraperAnalyzer
+ org.dice_research.squirrel.analyzer.impl.RDFaSemarglParser
+ org.dice_research.squirrel.analyzer.impl.MicrodataParser
org.dice_research.squirrel.analyzer.impl.MicroformatMF2JParser
- org.dice_research.squirrel.analyzer.impl.RDFaSemarglParser
- org.dice_research.squirrel.analyzer.impl.html.scraper.HTMLScraperAnalyzer
+ org.dice_research.squirrel.analyzer.impl.ckan.CkanJsonAnalyzer
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
@@ -76,22 +103,23 @@
-->
-
+
+ value="europeandataportal/update"/>
+ value="europeandataportal/query"/>
+
diff --git a/spring-config/context.xml b/spring-config/context.xml
index 1bc9bab4c..3ee740ebf 100644
--- a/spring-config/context.xml
+++ b/spring-config/context.xml
@@ -32,32 +32,59 @@
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
- org.dice_research.squirrel.analyzer.impl.HDTAnalyzer
org.dice_research.squirrel.analyzer.impl.RDFAnalyzer
- org.dice_research.squirrel.analyzer.impl.MicrodataParser
+ org.dice_research.squirrel.analyzer.impl.HDTAnalyzer
+ org.dice_research.squirrel.analyzer.impl.html.scraper.HTMLScraperAnalyzer
+ org.dice_research.squirrel.analyzer.impl.RDFaSemarglParser
+ org.dice_research.squirrel.analyzer.impl.MicrodataParser
org.dice_research.squirrel.analyzer.impl.MicroformatMF2JParser
- org.dice_research.squirrel.analyzer.impl.RDFaSemarglParser
- org.dice_research.squirrel.analyzer.impl.html.scraper.HTMLScraperAnalyzer
+ org.dice_research.squirrel.analyzer.impl.ckan.CkanJsonAnalyzer
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -66,7 +93,7 @@
-
+
@@ -83,9 +110,9 @@
+ value="europeandataportal/update"/>
+ value="europeandataportal/query"/>
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/squirrel.api/pom.xml b/squirrel.api/pom.xml
index 8e16a6ad6..2f80bd665 100644
--- a/squirrel.api/pom.xml
+++ b/squirrel.api/pom.xml
@@ -5,7 +5,7 @@
org.dice-research
squirrel
- 0.3.0-SNAPSHOT
+ 0.3.0
squirrel.api
jar
diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/frontier/ExtendedFrontier.java b/squirrel.api/src/main/java/org/dice_research/squirrel/frontier/ExtendedFrontier.java
index b3ea8bc25..27af41294 100644
--- a/squirrel.api/src/main/java/org/dice_research/squirrel/frontier/ExtendedFrontier.java
+++ b/squirrel.api/src/main/java/org/dice_research/squirrel/frontier/ExtendedFrontier.java
@@ -13,5 +13,5 @@ public interface ExtendedFrontier extends Frontier {
* @param lstUrisToReassign A list of {@link CrawleableUri} that should have been handeled by the
* dead worker, but was not due to his sudden death.
*/
- void informAboutDeadWorker(String idOfWorker, List lstUrisToReassign);
+ void informAboutDeadWorker(int idOfWorker, List lstUrisToReassign);
}
\ No newline at end of file
diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/InMemoryQueue.java b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/InMemoryQueue.java
index c5b7016b6..e808e0882 100644
--- a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/InMemoryQueue.java
+++ b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/InMemoryQueue.java
@@ -1,5 +1,7 @@
package org.dice_research.squirrel.queue;
+import java.net.InetAddress;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
@@ -12,6 +14,7 @@
public class InMemoryQueue extends AbstractIpAddressBasedQueue {
protected SortedMap> queue;
+ private static final int LIMITFORITERATOR = 50;
public InMemoryQueue() {
queue = new TreeMap>();
@@ -54,5 +57,10 @@ public void open() {
@Override
public void close() {
}
+
+ @Override
+ public Iterator>> getIPURIIterator() {
+ return queue.entrySet().stream().limit(LIMITFORITERATOR).map(e -> new AbstractMap.SimpleEntry<>(e.getKey().ip, e.getValue())).iterator();
+ }
}
diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/IpAddressBasedQueue.java b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/IpAddressBasedQueue.java
index 7f7cc842a..e3118c87a 100644
--- a/squirrel.api/src/main/java/org/dice_research/squirrel/queue/IpAddressBasedQueue.java
+++ b/squirrel.api/src/main/java/org/dice_research/squirrel/queue/IpAddressBasedQueue.java
@@ -1,6 +1,11 @@
package org.dice_research.squirrel.queue;
import java.net.InetAddress;
+import java.util.AbstractMap;
+import java.util.Iterator;
+import java.util.List;
+
+import org.dice_research.squirrel.data.uri.CrawleableUri;
/**
* This extension of the {@link UriQueue} interface defines additional methods
@@ -29,4 +34,10 @@ public interface IpAddressBasedQueue extends UriQueue {
* @return the number of IP addresses that are currently blocked.
*/
public int getNumberOfBlockedIps();
+ /**
+ * Goes through the queue und collects all IP-address with their URIs
+ *
+ * @return a IP-address-iterator with the list of uris for each IP-address
+ */
+ Iterator>> getIPURIIterator();
}
diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/rabbit/msgs/UriSetRequest.java b/squirrel.api/src/main/java/org/dice_research/squirrel/rabbit/msgs/UriSetRequest.java
index 6dfff7742..240aaa38d 100644
--- a/squirrel.api/src/main/java/org/dice_research/squirrel/rabbit/msgs/UriSetRequest.java
+++ b/squirrel.api/src/main/java/org/dice_research/squirrel/rabbit/msgs/UriSetRequest.java
@@ -9,7 +9,7 @@ public class UriSetRequest implements Serializable {
/**
* The id of the {@link org.dice_research.squirrel.worker.Worker} that sent this request.
*/
- private String idOfWorker;
+ private int idOfWorker;
/**
* Indicates whether the worker (see {@link #idOfWorker}) sends {@link org.dice_research.squirrel.worker.impl.AliveMessage}.
@@ -20,7 +20,7 @@ public class UriSetRequest implements Serializable {
* Standard constructor setting just default values.
*/
public UriSetRequest() {
- this(null, false);
+ this(0, false);
}
/**
@@ -29,12 +29,12 @@ public UriSetRequest() {
* @param idOfWorker
* @param workerSendsAliveMessages
*/
- public UriSetRequest(String idOfWorker, boolean workerSendsAliveMessages) {
+ public UriSetRequest(int idOfWorker, boolean workerSendsAliveMessages) {
this.idOfWorker = idOfWorker;
this.workerSendsAliveMessages = workerSendsAliveMessages;
}
- public String getIdOfWorker() {
+ public int getIdOfWorker() {
return idOfWorker;
}
diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/worker/AliveMessage.java b/squirrel.api/src/main/java/org/dice_research/squirrel/worker/AliveMessage.java
index 11209599d..3becf4d39 100644
--- a/squirrel.api/src/main/java/org/dice_research/squirrel/worker/AliveMessage.java
+++ b/squirrel.api/src/main/java/org/dice_research/squirrel/worker/AliveMessage.java
@@ -16,14 +16,14 @@ public class AliveMessage implements Serializable {
/**
* The id of the worker that sends the alive message.
*/
- private String idOfWorker;
+ private int idOfWorker;
/**
* Create aliveMessage by an id of a worker.
*
* @param idOfWorker The id of the worker.
*/
- public AliveMessage(String idOfWorker) {
+ public AliveMessage(int idOfWorker) {
this.idOfWorker = idOfWorker;
}
@@ -32,7 +32,7 @@ public AliveMessage(String idOfWorker) {
*
* @return the id of the worker.
*/
- public String getIdOfWorker() {
+ public int getIdOfWorker() {
return idOfWorker;
}
diff --git a/squirrel.api/src/main/java/org/dice_research/squirrel/worker/Worker.java b/squirrel.api/src/main/java/org/dice_research/squirrel/worker/Worker.java
index eb33acb1f..60d194155 100644
--- a/squirrel.api/src/main/java/org/dice_research/squirrel/worker/Worker.java
+++ b/squirrel.api/src/main/java/org/dice_research/squirrel/worker/Worker.java
@@ -38,6 +38,12 @@ public interface Worker extends Runnable {
* @return True iff the worker sends alive messages.
*/
boolean sendsAliveMessages();
+
+ /**
+ * Gives the unique id of the worker.
+ * @return The id of the worker.
+ */
+ int getId();
public void setTerminateFlag(boolean terminateFlag);
diff --git a/squirrel.deduplication/pom.xml b/squirrel.deduplication/pom.xml
index af94629ca..1e5fa9d11 100644
--- a/squirrel.deduplication/pom.xml
+++ b/squirrel.deduplication/pom.xml
@@ -6,7 +6,7 @@
org.dice-research
squirrel
- 0.3.0-SNAPSHOT
+ 0.3.0
squirrel.deduplication
jar
diff --git a/squirrel.frontier/pom.xml b/squirrel.frontier/pom.xml
index 862565d78..5bf998cc0 100644
--- a/squirrel.frontier/pom.xml
+++ b/squirrel.frontier/pom.xml
@@ -6,7 +6,7 @@
org.dice-research
squirrel
- 0.3.0-SNAPSHOT
+ 0.3.0
squirrel.frontier
jar
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
index eb6ceacb5..38cafcebe 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/components/FrontierComponent.java
@@ -11,6 +11,7 @@
import org.dice_research.squirrel.Constants;
import org.dice_research.squirrel.configurator.MongoConfiguration;
import org.dice_research.squirrel.configurator.SeedConfiguration;
+import org.dice_research.squirrel.configurator.WebConfiguration;
import org.dice_research.squirrel.configurator.WhiteListConfiguration;
import org.dice_research.squirrel.data.uri.CrawleableUri;
import org.dice_research.squirrel.data.uri.UriUtils;
@@ -26,6 +27,7 @@
import org.dice_research.squirrel.frontier.Frontier;
import org.dice_research.squirrel.frontier.impl.ExtendedFrontierImpl;
import org.dice_research.squirrel.frontier.impl.FrontierImpl;
+import org.dice_research.squirrel.frontier.impl.FrontierSenderToWebservice;
import org.dice_research.squirrel.frontier.impl.WorkerGuard;
import org.dice_research.squirrel.queue.InMemoryQueue;
import org.dice_research.squirrel.queue.IpAddressBasedQueue;
@@ -66,6 +68,7 @@ public void init() throws Exception {
super.init();
serializer = new GzipJavaUriSerializer();
MongoConfiguration mongoConfiguration = MongoConfiguration.getMDBConfiguration();
+ WebConfiguration webConfiguration = WebConfiguration.getWebConfiguration();
if(mongoConfiguration != null) {
String dbHostName = mongoConfiguration.getMDBHostName();
Integer dbPort = mongoConfiguration.getMDBPort();
@@ -106,22 +109,22 @@ public void init() throws Exception {
LOGGER.info("Frontier initialized.");
-// if (webConfiguration.isCommunicationWithWebserviceEnabled()) {
-// final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory,
-// workerGuard, queue, knownUriFilter, uriReferences);
-// LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to "
-// + webConfiguration.isVisualizationOfCrawledGraphEnabled());
-// Thread senderThread = new Thread(sender);
-// senderThread.setName("Sender to the Webservice via RabbitMQ (current information from the Frontier)");
-// senderThread.start();
-// LOGGER.info("Started thread [" + senderThread.getName() + "] ");
-// } else {
-// LOGGER.info("webConfiguration.isCommunicationWithWebserviceEnabled is set to "
-// + webConfiguration.isCommunicationWithWebserviceEnabled() + "/"
-// + webConfiguration.isVisualizationOfCrawledGraphEnabled()
-// + ". No WebServiceSenderThread will be started!");
-// }
+ if (webConfiguration.isCommunicationWithWebserviceEnabled()) {
+ final FrontierSenderToWebservice sender = new FrontierSenderToWebservice(outgoingDataQueuefactory,
+ workerGuard, queue, knownUriFilter, uriReferences);
+ LOGGER.trace("FrontierSenderToWebservice -> sendCrawledGraph is set to "
+ + webConfiguration.isVisualizationOfCrawledGraphEnabled());
+ Thread senderThread = new Thread(sender);
+ senderThread.setName("Sender to the Webservice via RabbitMQ (current information from the Frontier)");
+ senderThread.start();
+ LOGGER.info("Started thread [" + senderThread.getName() + "] ");
+ } else {
+ LOGGER.info("webConfiguration.isCommunicationWithWebserviceEnabled is set to "
+ + webConfiguration.isCommunicationWithWebserviceEnabled() + "/"
+ + webConfiguration.isVisualizationOfCrawledGraphEnabled()
+ + ". No WebServiceSenderThread will be started!");
+ }
}
@Override
@@ -189,7 +192,7 @@ public void handleData(byte[] data, ResponseHandler handler, String responseQueu
crawlingResult.uris);
} else if (deserializedData instanceof AliveMessage) {
AliveMessage message = (AliveMessage) deserializedData;
- String idReceived = message.getIdOfWorker();
+ int idReceived = message.getIdOfWorker();
LOGGER.trace("Received alive message from worker with id " + idReceived);
workerGuard.putNewTimestamp(idReceived);
} else {
@@ -228,7 +231,7 @@ protected void processSeedFile(String seedFile) {
}
}
- public void informFrontierAboutDeadWorker(String idOfWorker, List lstUrisToReassign) {
+ public void informFrontierAboutDeadWorker(int idOfWorker, List lstUrisToReassign) {
if (frontier instanceof ExtendedFrontier) {
((ExtendedFrontier) frontier).informAboutDeadWorker(idOfWorker, lstUrisToReassign);
}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/WebConfiguration.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/WebConfiguration.java
new file mode 100644
index 000000000..eecec3b9b
--- /dev/null
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/configurator/WebConfiguration.java
@@ -0,0 +1,32 @@
+package org.dice_research.squirrel.configurator;
+
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class WebConfiguration extends Configuration {
+ private static final Logger LOGGER = LoggerFactory.getLogger(WebConfiguration.class);
+
+ private boolean communicationWithWebserviceEnabled;
+ private boolean visualizationOfCrawledGraphEnabled;
+
+ private static final String COMMUNICATION_WITH_WEBSERVICE = "COMMUNICATION_WITH_WEBSERVICE";
+ private static final String VISUALIZATION_OF_CRAWLED_GRAPH = "VISUALIZATION_OF_CRAWLED_GRAPH";
+
+ private WebConfiguration(boolean communicationWithWebserviceEnabled, boolean visualizationOfCrawledGraphEnabled) {
+ this.communicationWithWebserviceEnabled = communicationWithWebserviceEnabled;
+ this.visualizationOfCrawledGraphEnabled = visualizationOfCrawledGraphEnabled;
+ }
+
+ public static WebConfiguration getWebConfiguration() {
+ return new WebConfiguration(Configuration.getEnvBoolean(COMMUNICATION_WITH_WEBSERVICE, LOGGER), Configuration.getEnvBoolean(VISUALIZATION_OF_CRAWLED_GRAPH, LOGGER));
+ }
+
+ public boolean isCommunicationWithWebserviceEnabled() {
+ return communicationWithWebserviceEnabled;
+ }
+
+ public boolean isVisualizationOfCrawledGraphEnabled() {
+ return visualizationOfCrawledGraphEnabled;
+ }
+ }
+}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java
index aff7c54a0..22fc936b9 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/filter/MongoDBKnowUriFilter.java
@@ -7,7 +7,6 @@
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
@@ -22,9 +21,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
-import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java
index 5775f0ed6..14685da39 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/data/uri/norm/NormalizerImpl.java
@@ -23,113 +23,115 @@
*/
public class NormalizerImpl implements UriNormalizer {
- /**
- * Nutch 1098 - finds URL encoded parts of the URL
- */
- private final static Pattern UNESCAPE_RULE_PATTERN = Pattern.compile("%([0-9A-Fa-f]{2})");
- /**
- * look-up table for characters which should not be escaped in URL paths
- */
- private final static BitSet UNESCAPED_CHARS = new BitSet(0x7F);
+ /**
+ * Nutch 1098 - finds URL encoded parts of the URL
+ */
+ private final static Pattern UNESCAPE_RULE_PATTERN = Pattern.compile("%([0-9A-Fa-f]{2})");
+ /**
+ * look-up table for characters which should not be escaped in URL paths
+ */
+ private final static BitSet UNESCAPED_CHARS = new BitSet(0x7F);
- static {
- /*
- * https://tools.ietf.org/html/rfc3986#section-2.2 For consistency,
- * percent-encoded octets in the ranges of ALPHA (%41-%5A and %61-%7A), DIGIT
- * (%30-%39), hyphen (%2D), period (%2E), underscore (%5F), or tilde (%7E)
- * should not be created by URI producers and, when found in a URI, should be
- * decoded to their corresponding unreserved characters by URI normalizers.
- */
- UNESCAPED_CHARS.set(0x2D, 0x2E);
- UNESCAPED_CHARS.set(0x30, 0x39);
- UNESCAPED_CHARS.set(0x41, 0x5A);
- UNESCAPED_CHARS.set(0x61, 0x7A);
- UNESCAPED_CHARS.set(0x5F);
- UNESCAPED_CHARS.set(0x7E);
- }
+ static {
+ /*
+ * https://tools.ietf.org/html/rfc3986#section-2.2 For consistency,
+ * percent-encoded octets in the ranges of ALPHA (%41-%5A and %61-%7A), DIGIT
+ * (%30-%39), hyphen (%2D), period (%2E), underscore (%5F), or tilde (%7E)
+ * should not be created by URI producers and, when found in a URI, should be
+ * decoded to their corresponding unreserved characters by URI normalizers.
+ */
+ UNESCAPED_CHARS.set(0x2D, 0x2E);
+ UNESCAPED_CHARS.set(0x30, 0x39);
+ UNESCAPED_CHARS.set(0x41, 0x5A);
+ UNESCAPED_CHARS.set(0x61, 0x7A);
+ UNESCAPED_CHARS.set(0x5F);
+ UNESCAPED_CHARS.set(0x7E);
+ }
- @Override
- public CrawleableUri normalize(CrawleableUri uri) {
- URI uriObject = uri.getUri();
- boolean changed = false;
- // normalize path
- String path = uriObject.getPath();
- String temp = normalizePath(path);
- if (temp != path) {
- path = temp;
- }
- // Copy Normalization from
- // https://github.com/crawler-commons/crawler-commons/blob/master/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
- // OR use URI.normalize()
+ @Override
+ public CrawleableUri normalize(CrawleableUri uri) {
+ URI uriObject = uri.getUri();
+ boolean changed = false;
+ // normalize path
+ String path = uriObject.getPath();
+ if (path != null) {
+ String temp = normalizePath(path);
+ if (temp != path) {
+ path = temp;
+ }
+ }
+ // Copy Normalization from
+ // https://github.com/crawler-commons/crawler-commons/blob/master/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
+ // OR use URI.normalize()
- // Check whether the query part of a URI has to be sorted
+ // Check whether the query part of a URI has to be sorted
- // Filter attributes of the URI
- //uriObject.getQuery();
+ // Filter attributes of the URI
+ // uriObject.getQuery();
- if (changed) {
- // TODO create new URI object;
- }
- return uri;
- }
+ if (changed) {
+ // TODO create new URI object;
+ }
+ return uri;
+ }
- /**
- * Path normalization adapted from the {@link URI} class (which is based upon
- * src/solaris/native/java/io/canonicalize_md.c) and the Crawler
- * Commons project.
- *
- * @param path
- * @return the normalized path or the given path object if no changes have been
- * made.
- */
- protected String normalizePath(String path) {
- // Check for encoded parts
- Matcher matcher = UNESCAPE_RULE_PATTERN.matcher(path);
- StringBuffer changedPath = null;
- if (matcher.find()) {
- changedPath = new StringBuffer(path);
- int hex, pos = 0;
- do {
- changedPath.append(path.substring(pos, matcher.start()));
- pos = matcher.start();
- hex = getHexValue(path.charAt(pos + 1), path.charAt(pos + 2));
- // If this character shouldn't be escaped
- if (UNESCAPED_CHARS.get(hex)) {
- changedPath.append((char) hex);
- } else {
- changedPath.append(path.substring(pos, pos + 3));
- }
- pos += 3;
- } while (matcher.find());
- if (pos < path.length()) {
- changedPath.append(path.substring(pos));
- }
- }
- if (changedPath == null) {
- return PathNormalization.normalize(path);
- } else {
- String newPath = changedPath.toString();
- return PathNormalization.normalize(newPath.equals(path) ? path : newPath);
- }
- }
+ /**
+ * Path normalization adapted from the {@link URI} class (which is based upon
+ * src/solaris/native/java/io/canonicalize_md.c) and the Crawler
+ * Commons project.
+ *
+ * @param path
+ * @return the normalized path or the given path object if no changes have been
+ * made.
+ */
+ protected String normalizePath(String path) {
+ // Check for encoded parts
+ Matcher matcher = UNESCAPE_RULE_PATTERN.matcher(path);
+ StringBuffer changedPath = null;
+ if (matcher.find()) {
+ changedPath = new StringBuffer(path);
+ int hex, pos = 0;
+ do {
+ changedPath.append(path.substring(pos, matcher.start()));
+ pos = matcher.start();
+ hex = getHexValue(path.charAt(pos + 1), path.charAt(pos + 2));
+ // If this character shouldn't be escaped
+ if (UNESCAPED_CHARS.get(hex)) {
+ changedPath.append((char) hex);
+ } else {
+ changedPath.append(path.substring(pos, pos + 3));
+ }
+ pos += 3;
+ } while (matcher.find());
+ if (pos < path.length()) {
+ changedPath.append(path.substring(pos));
+ }
+ }
+ if (changedPath == null) {
+ return PathNormalization.normalize(path);
+ } else {
+ String newPath = changedPath.toString();
+ return PathNormalization.normalize(newPath.equals(path) ? path : newPath);
+ }
+ }
- protected static int getHexValue(char c1, char c2) {
- int hex;
- if (c1 <= 0x39) {
- hex = c1 - 0x30;
- } else {
- // Check whether it is A-F or a-f
- hex = (c1 <= 0x46) ? (c1 - 0x37) : (c1 - 0x57);
- }
- hex <<= 4;
- if (c2 <= 0x39) {
- hex |= c2 - 0x30;
- } else {
- // Check whether it is A-F or a-f
- hex |= (c2 <= 0x46) ? (c2 - 0x37) : (c2 - 0x57);
- }
- return hex;
- }
+ protected static int getHexValue(char c1, char c2) {
+ int hex;
+ if (c1 <= 0x39) {
+ hex = c1 - 0x30;
+ } else {
+ // Check whether it is A-F or a-f
+ hex = (c1 <= 0x46) ? (c1 - 0x37) : (c1 - 0x57);
+ }
+ hex <<= 4;
+ if (c2 <= 0x39) {
+ hex |= c2 - 0x30;
+ } else {
+ // Check whether it is A-F or a-f
+ hex |= (c2 <= 0x46) ? (c2 - 0x37) : (c2 - 0x57);
+ }
+ return hex;
+ }
}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
index 83b71fd42..65018bf4a 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/ExtendedFrontierImpl.java
@@ -66,7 +66,7 @@ public ExtendedFrontierImpl(UriNormalizer normalizer, KnownUriFilter knownUriFil
}
@Override
- public void informAboutDeadWorker(String idOfWorker, List lstUrisToReassign) {
+ public void informAboutDeadWorker(int idOfWorker, List lstUrisToReassign) {
if (queue instanceof IpAddressBasedQueue) {
IpAddressBasedQueue ipQueue = (IpAddressBasedQueue) queue;
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
index 093d03966..de1ed3b8f 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierImpl.java
@@ -235,6 +235,7 @@ public void addNewUri(CrawleableUri uri) {
// Make sure that the IP is known
try {
uri = this.uriProcessor.recognizeInetAddress(uri);
+
} catch (UnknownHostException e) {
LOGGER.error("Could not recognize IP for {}, unknown host", uri.getUri());
}
@@ -243,9 +244,11 @@ public void addNewUri(CrawleableUri uri) {
} else {
LOGGER.error("Couldn't determine the Inet address of \"{}\". It will be ignored.", uri.getUri());
}
+ knownUriFilter.add(uri, System.currentTimeMillis());
} else {
LOGGER.warn("addNewUri(" + uri + "): " + uri.getUri().getScheme() + " is not supported, only " + schemeUriFilter.getSchemes() + ". Will not added!");
}
+
} else {
LOGGER.info("addNewUri(" + uri + "): URI is not good [" + knownUriFilter + "]. Will not be added!");
}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java
index 20487509e..8e3f694d0 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/FrontierSenderToWebservice.java
@@ -1,17 +1,17 @@
package org.dice_research.squirrel.frontier.impl;
+
import com.SquirrelWebObject;
import com.graph.VisualisationGraph;
import com.graph.VisualisationNode;
import com.rabbitmq.client.Channel;
-
-import org.apache.commons.io.IOUtils;
import org.dice_research.squirrel.data.uri.CrawleableUri;
import org.dice_research.squirrel.data.uri.filter.KnownUriFilter;
import org.dice_research.squirrel.data.uri.info.URIReferences;
import org.dice_research.squirrel.data.uri.serialize.Serializer;
import org.dice_research.squirrel.data.uri.serialize.java.GzipJavaUriSerializer;
import org.dice_research.squirrel.queue.IpAddressBasedQueue;
+import org.apache.commons.io.IOUtils;
import org.hobbit.core.rabbit.DataSender;
import org.hobbit.core.rabbit.DataSenderImpl;
import org.hobbit.core.rabbit.RabbitQueueFactory;
@@ -161,31 +161,29 @@ private SquirrelWebObject generateSquirrelWebObject() throws IllegalAccessExcept
LinkedHashMap> currentQueue = new LinkedHashMap<>(50);
Iterator>> i;
- // FIXME!!!
- throw new RuntimeException("This is not correctly implemented!");
-// for (i = queue.getIPURIIterator(); i.hasNext() && currentQueue.size() < 50; ) {
-// AbstractMap.SimpleEntry> entry = i.next();
-// currentQueue.put(entry.getKey(), entry.getValue());
-// }
-// if (currentQueue.isEmpty()) {
-// newObject.setIPMapPendingURis(EMPTY_MAP);
-// newObject.setPendingURIs(EMPTY_LIST);
-// newObject.setNextCrawledURIs(EMPTY_LIST);
-// } else {
-// newObject.setIPMapPendingURis(currentQueue.entrySet().stream()
-// .map(e -> new AbstractMap.SimpleEntry<>(e.getKey().getHostAddress(), e.getValue().stream().map(uri -> uri.getUri().getPath()).collect(Collectors.toList())))
-// .collect(HashMap::new, (m, entry) -> m.put(entry.getKey(), entry.getValue()), HashMap::putAll));
-// List pendingURIs = new ArrayList<>(currentQueue.size());
-// currentQueue.forEach((key, value) -> value.forEach(uri -> pendingURIs.add(uri.getUri().toString())));
-// newObject.setPendingURIs(pendingURIs);
-// newObject.setNextCrawledURIs(currentQueue.entrySet().iterator().next().getValue().stream().map(e -> e.getUri().toString()).collect(Collectors.toList()));
-// }
-//
-// //Michael remarks, that's not a good idea to pass all crawled URIs, because that takes to much time...
-// //newObject.setCrawledURIs(Collections.EMPTY_LIST);
-// newObject.setCountOfCrawledURIs((int) knownUriFilter.count());
-//
-// return newObject;
+ for (i = queue.getIPURIIterator(); i.hasNext() && currentQueue.size() < 50; ) {
+ AbstractMap.SimpleEntry> entry = i.next();
+ currentQueue.put(entry.getKey(), entry.getValue());
+ }
+ if (currentQueue.isEmpty()) {
+ newObject.setIPMapPendingURis(EMPTY_MAP);
+ newObject.setPendingURIs(EMPTY_LIST);
+ newObject.setNextCrawledURIs(EMPTY_LIST);
+ } else {
+ newObject.setIPMapPendingURis(currentQueue.entrySet().stream()
+ .map(e -> new AbstractMap.SimpleEntry<>(e.getKey().getHostAddress(), e.getValue().stream().map(uri -> uri.getUri().getPath()).collect(Collectors.toList())))
+ .collect(HashMap::new, (m, entry) -> m.put(entry.getKey(), entry.getValue()), HashMap::putAll));
+ List pendingURIs = new ArrayList<>(currentQueue.size());
+ currentQueue.forEach((key, value) -> value.forEach(uri -> pendingURIs.add(uri.getUri().toString())));
+ newObject.setPendingURIs(pendingURIs);
+ newObject.setNextCrawledURIs(currentQueue.entrySet().iterator().next().getValue().stream().map(e -> e.getUri().toString()).collect(Collectors.toList()));
+ }
+
+ //Michael remarks, that's not a good idea to pass all crawled URIs, because that takes to much time...
+ //newObject.setCrawledURIs(Collections.EMPTY_LIST);
+ newObject.setCountOfCrawledURIs((int) knownUriFilter.count());
+
+ return newObject;
}
/**
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/WorkerGuard.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/WorkerGuard.java
index 3ee7f0cc9..b69ad1cb5 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/WorkerGuard.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/frontier/impl/WorkerGuard.java
@@ -24,7 +24,7 @@ public class WorkerGuard {
* A map from {@link org.dice_research.squirrel.worker.Worker} id to {@link WorkerInfo} containing information about the
* {@link org.dice_research.squirrel.worker.Worker}.
*/
- private Map mapWorkerInfo = Collections.synchronizedMap(new HashMap<>());
+ private Map mapWorkerInfo = Collections.synchronizedMap(new HashMap<>());
/**
* After this period of time (in seconds), a worker is considered to be dead if he has not sent
@@ -53,8 +53,8 @@ public WorkerGuard(FrontierComponent frontierComponent) {
timer.schedule(new TimerTask() {
@Override
public void run() {
- List lstIdsToBeRemoved = new ArrayList<>();
- for (String idWorker : mapWorkerInfo.keySet()) {
+ List lstIdsToBeRemoved = new ArrayList<>();
+ for (int idWorker : mapWorkerInfo.keySet()) {
if (mapWorkerInfo.get(idWorker).getDateLastAlive() == null) {
continue;
@@ -88,7 +88,7 @@ public void run() {
*
* @param idOfWorker the given id.
*/
- public void putNewTimestamp(String idOfWorker) {
+ public void putNewTimestamp(int idOfWorker) {
WorkerInfo workerInfo;
if (mapWorkerInfo.containsKey(idOfWorker)) {
workerInfo = mapWorkerInfo.get(idOfWorker);
@@ -105,7 +105,7 @@ public void putNewTimestamp(String idOfWorker) {
* @param idOfWorker The id of the worker for which to put the uris.
* @param lstUris The uris to put.
*/
- public void putUrisForWorker(String idOfWorker, boolean workerSendsAliveMessages, List lstUris) {
+ public void putUrisForWorker(int idOfWorker, boolean workerSendsAliveMessages, List lstUris) {
WorkerInfo workerInfo;
if (mapWorkerInfo.containsKey(idOfWorker)) {
workerInfo = mapWorkerInfo.get(idOfWorker);
@@ -139,7 +139,7 @@ public void shutdown() {
timer.cancel();
}
- public Map getMapWorkerInfo() {
+ public Map getMapWorkerInfo() {
return mapWorkerInfo;
}
diff --git a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/MongoDBQueue.java b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/MongoDBQueue.java
index 4fb5db6d7..93f10319d 100644
--- a/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/MongoDBQueue.java
+++ b/squirrel.frontier/src/main/java/org/dice_research/squirrel/queue/MongoDBQueue.java
@@ -7,28 +7,28 @@
import java.net.URI;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
+import java.util.AbstractMap;
+import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.bson.Document;
import org.bson.types.Binary;
import org.dice_research.squirrel.data.uri.CrawleableUri;
+import org.dice_research.squirrel.data.uri.CrawleableUriFactoryImpl;
import org.dice_research.squirrel.data.uri.UriType;
import org.dice_research.squirrel.data.uri.serialize.Serializer;
import org.dice_research.squirrel.data.uri.serialize.java.SnappyJavaUriSerializer;
-import org.dice_research.squirrel.queue.AbstractIpAddressBasedQueue;
-import org.dice_research.squirrel.queue.IpUriTypePair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.MongoClient;
-import com.mongodb.MongoClientOptions;
-import com.mongodb.ServerAddress;
+import com.mongodb.MongoWriteException;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
-import com.mongodb.client.model.DropIndexOptions;
import com.mongodb.client.model.Indexes;
@SuppressWarnings("deprecation")
@@ -38,7 +38,7 @@ public class MongoDBQueue extends AbstractIpAddressBasedQueue {
private MongoDatabase mongoDB;
private Serializer serializer;
private final String DB_NAME ="squirrel";
- private final String COLLECTION_NAME = "queue";
+ private final String COLLECTION_QUEUE = "queue";
private final String COLLECTION_URIS = "uris";
private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBQueue.class);
@@ -57,12 +57,12 @@ public MongoDBQueue(String hostName, Integer port, Serializer serializer) {
}
public void purge() {
- mongoDB.getCollection(COLLECTION_NAME).drop();
+ mongoDB.getCollection(COLLECTION_QUEUE).drop();
mongoDB.getCollection(COLLECTION_URIS).drop();
}
public long length() {
- return mongoDB.getCollection(COLLECTION_NAME).count();
+ return mongoDB.getCollection(COLLECTION_QUEUE).count();
}
public static void main(String[] args) throws URISyntaxException, UnknownHostException {
@@ -82,7 +82,7 @@ public static void main(String[] args) throws URISyntaxException, UnknownHostExc
@Override
public void close() {
- mongoDB.getCollection(COLLECTION_NAME).drop();
+ mongoDB.getCollection(COLLECTION_QUEUE).drop();
mongoDB.getCollection(COLLECTION_URIS).drop();
client.close();
}
@@ -91,9 +91,9 @@ public void close() {
public void open() {
mongoDB = client.getDatabase(DB_NAME);
if(!queueTableExists()) {
- mongoDB.createCollection(COLLECTION_NAME);
+ mongoDB.createCollection(COLLECTION_QUEUE);
mongoDB.createCollection(COLLECTION_URIS);
- MongoCollection mongoCollection = mongoDB.getCollection(COLLECTION_NAME);
+ MongoCollection mongoCollection = mongoDB.getCollection(COLLECTION_QUEUE);
MongoCollection mongoCollectionUris = mongoDB.getCollection(COLLECTION_URIS);
mongoCollection.createIndex(Indexes.compoundIndex(Indexes.ascending("ipAddress"), Indexes.ascending("type")));
mongoCollectionUris.createIndex(Indexes.compoundIndex(Indexes.ascending("uri"), Indexes.ascending("ipAddress"),Indexes.ascending("type")));
@@ -103,7 +103,7 @@ public void open() {
public boolean queueTableExists() {
for(String collection: mongoDB.listCollectionNames()) {
- if(collection.toLowerCase().equals(COLLECTION_NAME.toLowerCase())) {
+ if(collection.toLowerCase().equals(COLLECTION_QUEUE.toLowerCase())) {
return true;
}
}
@@ -128,7 +128,7 @@ protected void addToQueue(CrawleableUri uri) {
protected Iterator getIterator() {
- MongoCursor cursor = mongoDB.getCollection(COLLECTION_NAME).find().iterator();
+ MongoCursor cursor = mongoDB.getCollection(COLLECTION_QUEUE).find().iterator();
Iterator ipUriTypePairIterator = new Iterator() {
@Override
@@ -176,7 +176,7 @@ protected List getUris(IpUriTypePair pair) {
LOGGER.error("Error while retrieving uri from MongoDBQueue",e);
}
- mongoDB.getCollection(COLLECTION_NAME).deleteOne(new Document("ipAddress",pair.ip.getHostAddress()).append("type", pair.type.toString()));
+ mongoDB.getCollection(COLLECTION_QUEUE).deleteOne(new Document("ipAddress",pair.ip.getHostAddress()).append("type", pair.type.toString()));
mongoDB.getCollection(COLLECTION_URIS).deleteMany(new Document("ipAddress",pair.ip.getHostAddress()).append("type", pair.type.toString()));
@@ -185,7 +185,7 @@ protected List getUris(IpUriTypePair pair) {
public boolean queueContainsIpAddressTypeKey(CrawleableUri curi ,List> ipAddressTypeKey) {
- Iterator iterator = mongoDB.getCollection(COLLECTION_NAME).find(new Document("ipAddress", ipAddressTypeKey.get(0)).
+ Iterator iterator = mongoDB.getCollection(COLLECTION_QUEUE).find(new Document("ipAddress", ipAddressTypeKey.get(0)).
append("type", ipAddressTypeKey.get(1))).iterator();
if(iterator.hasNext()) {
@@ -211,14 +211,24 @@ public void addCrawleableUri(CrawleableUri uri, List> ipAddressTypeKey) {
} catch (Exception e) {
- LOGGER.error("Error while adding uri to MongoDBQueue",e);
+ if(e instanceof MongoWriteException)
+ LOGGER.info("Uri: " + uri.getUri().toString() + " already in queue. Ignoring...");
+ else
+ LOGGER.error("Error while adding uri to MongoDBQueue",e);
}
}
public void addCrawleableUri(CrawleableUri uri) {
-
- mongoDB.getCollection(COLLECTION_NAME).insertOne(crawleableUriToMongoDocument(uri)[0]);
+
+ try {
+ mongoDB.getCollection(COLLECTION_QUEUE).insertOne(crawleableUriToMongoDocument(uri)[0]);
mongoDB.getCollection(COLLECTION_URIS).insertOne(crawleableUriToMongoDocument(uri)[1]);
+ }catch (Exception e) {
+ if(e instanceof MongoWriteException)
+ LOGGER.info("Uri: " + uri.getUri().toString() + " already in queue. Ignoring...");
+ else
+ LOGGER.error("Error while adding uri to MongoDBQueue",e);
+ }
LOGGER.debug("Inserted new UriTypePair");
}
@@ -270,5 +280,62 @@ public List packTuple(String str_1, String str_2) {
pack.add(str_2);
return pack;
}
+
+ private List createCrawleableUriList(List