From c7d86cd9d422bd84e49aaa27f2b28662f653d792 Mon Sep 17 00:00:00 2001
From: "romain.fontugne"
Date: Wed, 14 Aug 2024 17:42:21 +0900
Subject: [PATCH 1/9] add tables for data sources, node types, and relationship
 types

---
 documentation/data-sources.md       | 41 +++++++++++++++++++++++++++++
 documentation/node_types.md         | 30 +++++++++++++++++++++
 documentation/relationship_types.md | 31 ++++++++++++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100644 documentation/data-sources.md
 create mode 100644 documentation/node_types.md
 create mode 100644 documentation/relationship_types.md

diff --git a/documentation/data-sources.md b/documentation/data-sources.md
new file mode 100644
index 0000000..9f02c28
--- /dev/null
+++ b/documentation/data-sources.md
@@ -0,0 +1,41 @@
+
+# Datasets and corresponding organizations providing data to IYP
+
+
+| Organization | Dataset Name / Description | URL |
+|-----------------------------|----------------------------------------------|-----------------------------------------------------------------------|
+| Alice-LG | IXP route server looking glass snapshots | https://github.com/alice-lg/alice-lg |
+| | AMS-IX | https://lg.ams-ix.net |
+| | BCIX | https://lg.bcix.de |
+| | DE-CIX | https://lg.de-cix.net |
+| | IX.br | https://lg.ix.br |
+| | LINX | https://alice-rs.linx.net |
+| | Megaport | https://lg.megaport.com |
+| | Netnod | https://lg.netnod.se |
+| APNIC | AS population estimate | https://stats.labs.apnic.net/aspop |
+| BGPKIT | as2rel, peer-stats, pfx2as | https://data.bgpkit.com |
+| BGP.Tools | AS names, AS tags | https://bgp.tools/kb/api |
+| | Anycast prefix tags | https://github.com/bgptools/anycast-prefixes |
+| CAIDA | AS Rank | https://doi.org/10.21986/CAIDA.DATA.AS-RANK |
+| | IXPs Dataset | https://www.caida.org/catalog/datasets/ixps |
+| Cisco | Umbrella Popularity List | https://s3-us-west-1.amazonaws.com/umbrella-static/index.html |
+| Citizen Lab | URL testing lists | https://github.com/citizenlab/test-lists |
+| Cloudflare | Cloudflare Radar API endpoints | https://radar.cloudflare.com |
+| | radar/dns/top/ases, radar/dns/top/locations, | |
+| | radar/ranking/top, radar/datasets | |
+| Emile Aben | AS names | https://github.com/emileaben/asnames |
+| IHR | Country Dependency, AS Hegemony, ROV | https://ihr.iijlab.net |
+| Internet Intelligence Lab | AS to Organization Mapping | https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping |
+| NRO | Extended allocation and assignment reports | https://www.nro.net/about/rirs/statistics |
+| OpenINTEL | tranco1m, umbrella1m, ns | https://data.openintel.nl/data |
+| | DNS Dependency Graph | https://dnsgraph.dacs.utwente.nl |
+| Packet Clearing House | Daily routing snapshots | https://www.pch.net/resources/Routing_Data |
+| PeeringDB | API endpoints: fac, ix, ixlan, netfac, org | https://www.peeringdb.com |
+| RIPE NCC | AS names, RPKI | https://ftp.ripe.net/ripe |
+| | RIPE Atlas measurement information | https://atlas.ripe.net |
+| SimulaMet | rDNS data | https://rir-data.org |
+| Stanford | ASdb dataset | https://asdb.stanford.edu |
+| Tranco | Tranco list | https://tranco-list.eu |
+| Virginia Tech | RoVista | https://rovista.netsecurelab.org |
+| World Bank | Indicators API: Country Population Indicator | https://www.worldbank.org |
+
diff --git a/documentation/node_types.md b/documentation/node_types.md
new file mode 100644
index 0000000..84cd037
--- /dev/null
+++ b/documentation/node_types.md
@@ -0,0 +1,30 @@
+
+# Node types available in IYP
+
+| Node types | Description |
+|-------------------------|-----------------------------------------------------------------------------------------------------------------------------------|
+| AS | Autonomous System, uniquely identified with the **asn** property. |
+| AtlasMeasurement | RIPE Atlas Measurement, uniquely identified with the **id** property. |
+| AtlasProbe | RIPE Atlas probe, uniquely identified with the **id** property. |
+| AuthoritativeNameServer | Authoritative DNS nameserver for a set of domain names, uniquely identified with the **name** property. |
+| BGPCollector | A RIPE RIS or RouteViews BGP collector, uniquely identified with the **name** property. |
+| CaidaIXID | Unique identifier for IXPs from CAIDA's IXP dataset. |
+| Country | Represent an economy, uniquely identified by either its two- or three-character code (properties **country_code** and **alpha3**). |
+| DomainName | Any DNS domain name that is not a FQDN (see HostName), uniquely identified by the **name** property. |
+| Estimate | Represent a report that approximates a quantity, for example the World Bank population estimate. |
+| Facility | Co-location facility for IXPs and ASes, uniquely identified by the **name** property. |
+| HostName | A fully qualified domain name uniquely identified by the **name** property. |
+| IP | An IPv4 or IPv6 address uniquely identified by the **ip** property. The **af** property (address family) provides the IP version of the address.|
+| IXP | An Internet Exchange Point, loosely identified by the **name** property or using related IDs (see the EXTERNAL_ID relationship). |
+| Name | Represent a name that could be associated with a network resource (e.g. an AS), uniquely identified by the **name** property. |
+| OpaqueID | Represent the opaque-id value found in RIRs' delegated files. Resources related to the same opaque-id are registered to the same resource holder. Uniquely identified by the **id** property.|
+| Organization | Represent an organization and is loosely identified by the **name** property or using related IDs (see the EXTERNAL_ID relationship).|
+| PeeringdbFacID | Unique identifier for a Facility as assigned by PeeringDB. |
+| PeeringdbIXID | Unique identifier for an IXP as assigned by PeeringDB. |
+| PeeringdbNetID | Unique identifier for an AS as assigned by PeeringDB. |
+| PeeringdbOrgID | Unique identifier for an Organization as assigned by PeeringDB. |
+| Prefix | An IPv4 or IPv6 prefix uniquely identified by the **prefix** property. The **af** property (address family) provides the IP version of the prefix.|
+| Ranking | Represent a specific ranking of Internet resources (e.g. CAIDA's ASRank or Tranco ranking). The rank value for each resource is given by the RANK relationship.|
+| Tag | The output of a classification. A tag can be the result of a manual or automated classification. Uniquely identified by the **label** property.|
+| URL | The full URL for an Internet resource, uniquely identified by the **url** property. |
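+
+For example, a node is retrieved by matching on its identifying property:
+
+```cypher
+// The AS node for AS2497 and the Prefix node for 8.8.8.0/24
+MATCH (a:AS {asn:2497}), (p:Prefix {prefix:'8.8.8.0/24'})
+RETURN a, p
+```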
+
diff --git a/documentation/relationship_types.md b/documentation/relationship_types.md
new file mode 100644
index 0000000..e86e4a9
--- /dev/null
+++ b/documentation/relationship_types.md
@@ -0,0 +1,31 @@
+
+# Relationships available in IYP
+
+
+| Relationship | Description |
+|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| ALIAS_OF | Equivalent to the CNAME record in DNS. It relates two HostNames. |
+| ASSIGNED | Represent the allocation by a RIR of a network resource (AS, Prefix) to a resource holder (see OpaqueID), or the assigned IP address of an AtlasProbe. |
+| AVAILABLE | Relate ASes and Prefixes to RIRs (in the form of an OpaqueID), meaning that the resource is not allocated and is available at the related RIR. |
+| CATEGORIZED | Relate a network resource (AS, Prefix, URL) to a Tag, meaning that the resource has been classified according to the Tag. The **reference_name** property provides the name of the original dataset/classifier. |
+| COUNTRY | Relate any node to its corresponding country. This relation may have different meanings depending on the original dataset (e.g. geo-location or registration). |
+| DEPENDS_ON | Relate an AS or Prefix to an AS, meaning that the reachability of the AS/Prefix depends on that AS. |
+| EXTERNAL_ID | Relate a node to an identifier commonly used by an organization. For example, PeeringDB assigns unique identifiers to IXPs (see PeeringdbIXID). |
+| LOCATED_IN | Location of a resource at a specific geographical or topological location. For example, the co-location Facility for an IXP, or the AS for an AtlasProbe. |
+| MANAGED_BY | Entity in charge of a network resource. For example, an AS is managed by an Organization, and a DomainName is managed by an AuthoritativeNameServer. |
+| MEMBER_OF | Represent membership in an organization. For example, an AS is a member of an IXP. |
+| NAME | Relate an entity to its usual or registered name. For example, the name of an AS. |
+| ORIGINATE | Relate a Prefix to an AS, meaning that the prefix is seen as being originated by that AS in BGP. |
+| PARENT | Relate two DomainNames and represent a zone cut between the parent zone and the more specific zone. |
+| PART_OF | Represent that one entity is a part of another. For example, an IP address is a part of an IP Prefix, a HostName is a part of a DomainName. |
+| PEERS_WITH | Represent the connection between two ASes as seen in BGP. It also includes peerings between ASes and BGPCollectors. |
+| POPULATION | Indicate that an AS hosts a certain fraction of the population of a country. It also represents the estimated population of a country. |
+| QUERIED_FROM | Relate a DomainName to an AS or Country, meaning that the AS or Country is among the top 100 ASes or Countries querying the DomainName the most (as reported by Cloudflare Radar). |
+| RANK | Relate a resource to a Ranking, meaning that the resource appears in the Ranking. The **rank** property gives the exact rank position. |
+| RESERVED | Indicate that an AS or Prefix is reserved for a certain purpose by RIRs or IANA. |
+| RESOLVES_TO | Relate a HostName to an IP address, meaning that a DNS resolution of the HostName returned the corresponding IP. |
+| ROUTE_ORIGIN_AUTHORIZATION | Relate an AS and a Prefix, meaning that the AS is authorized to originate the Prefix according to RPKI. |
+| SIBLING_OF | Relate ASes or Organizations, meaning that they represent the same entity. |
+| TARGET | Relate an AtlasMeasurement to an IP, HostName, or AS, meaning that an Atlas measurement is set up to probe that resource. |
+| WEBSITE | Relate a URL to an Organization, Facility, IXP, or AS, representing a common website for the resource. |
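+
+Relationship properties such as **reference_org** and **reference_name** (see CATEGORIZED above) record which dataset a statement comes from. For example, the name registered for AS2497 at RIPE NCC can be retrieved with:
+
+```cypher
+MATCH (a:AS {asn:2497})-[r:NAME {reference_org:'RIPE NCC'}]-(n:Name)
+RETURN n.name
+```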
+

From adfc33f4b3232360d1c85224becfe723a9471b3e Mon Sep 17 00:00:00 2001
From: Romain
Date: Wed, 14 Aug 2024 17:45:30 +0900
Subject: [PATCH 2/9] fix line break in data source table

---
 documentation/data-sources.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/documentation/data-sources.md b/documentation/data-sources.md
index 9f02c28..ee0943e 100644
--- a/documentation/data-sources.md
+++ b/documentation/data-sources.md
@@ -20,9 +20,8 @@
 | | IXPs Dataset | https://www.caida.org/catalog/datasets/ixps |
 | Cisco | Umbrella Popularity List | https://s3-us-west-1.amazonaws.com/umbrella-static/index.html |
 | Citizen Lab | URL testing lists | https://github.com/citizenlab/test-lists |
-| Cloudflare | Cloudflare Radar API endpoints | https://radar.cloudflare.com |
-| | radar/dns/top/ases, radar/dns/top/locations, | |
-| | radar/ranking/top, radar/datasets | |
+| Cloudflare | Cloudflare Radar API endpoints radar/dns/top/ases, radar/dns/top/locations, radar/ranking/top, radar/datasets | https://radar.cloudflare.com |
+| | |
 | Emile Aben | AS names | https://github.com/emileaben/asnames |
 | IHR | Country Dependency, AS Hegemony, ROV | https://ihr.iijlab.net |
 | Internet Intelligence Lab | AS to Organization Mapping | https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping |

From 1298c5e000898f848249fcce1caf94718004d5ba Mon Sep 17 00:00:00 2001
From: "romain.fontugne"
Date: Thu, 15 Aug 2024 16:34:24 +0900
Subject: [PATCH 3/9] First draft of documentation readme

---
 documentation/README.md | 48 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 documentation/README.md

diff --git a/documentation/README.md b/documentation/README.md
new file mode 100644
index 0000000..cfb767d
--- /dev/null
+++ b/documentation/README.md
@@ -0,0 +1,48 @@
+# IYP Documentation
+
+## IYP Ontology
+
+The lists of node and relationship types defined for IYP are available at:
+- [Node types](./node_types.md)
+- [Relationship types](./relationship_types.md)
+
+## IYP Data Sources
+
+The list of all datasets imported in IYP is available [here](data-sources.md).
+The datasets licences are available the [ACKNOWLEDGMENTS file](../ACKNOWLEDGMENTS.md).
+
+## IYP Gallery
+
+The [IYP gallery](./gallery.md) provides example queries to help users browse the database.
+
+## Importing a new dataset
+### Python crawler
+To import a new dataset in IYP, you should write a crawler for that dataset.
+The main tasks of a crawler are to fetch data, parse it, model it with the IYP
+ontology, and push it to the IYP database. Most of these tasks are assisted by
+the [IYP python library](../iyp/__init__.py). See the [example crawler](../iyp/crawlers/example/crawler.py) or [existing crawlers](../iyp/crawlers/) for getting started.
+See also the [IHR contributing guidelines](../CONTRIBUTING.md) and [best practices for writing crawlers](https://github.com/InternetHealthReport/internet-yellow-pages/discussions/128).
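+
+For illustration, a crawler roughly follows the structure sketched below. The helper
+names (e.g. `batch_get_nodes_by_single_prop`, `batch_add_links`, `self.reference`) are
+indicative only and may differ from the current IYP library; the
+[example crawler](../iyp/crawlers/example/crawler.py) is the authoritative reference.
+
+```python
+from iyp import BaseCrawler
+
+# Data source organization, dataset URL, and crawler name
+# (cf. module.ORG and module.URL used by create_db.py).
+ORG = 'Example Org'
+URL = 'https://example.com/as_names.csv'
+NAME = 'example.as_names'
+
+
+class Crawler(BaseCrawler):
+    def run(self):
+        # 1. Fetch and parse the dataset (a hypothetical list of AS names).
+        data = [(2497, 'IIJ'), (15169, 'GOOGLE')]
+
+        # 2. Get or create the corresponding nodes in bulk.
+        asns = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', {asn for asn, _ in data})
+        names = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', {name for _, name in data})
+
+        # 3. Model the data with the IYP ontology and push the relationships.
+        links = [{'src_id': asns[asn], 'dst_id': names[name], 'props': [self.reference]}
+                 for asn, name in data]
+        self.iyp.batch_add_links('NAME', links)
+```
+
+`create_db.py` instantiates the crawler as `Crawler(module.ORG, module.URL, name)` and
+calls its `run()`, `unit_test()`, and `close()` methods.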
+
+### README
+Each crawler should be accompanied by a README.md file. This is the main documentation
+for the crawler; it should contain:
+- a short description of the dataset,
+- any specificities related to the way the data is imported (e.g. time span, data cleaning),
+- examples of how the data is modeled,
+- dependencies on other crawlers (e.g. if the crawler requires data from another one).
+
+### Adding a crawler to IYP main branch
+If you wish your crawler to be part of the IYP weekly dumps, you can submit a [Pull Request](https://github.com/InternetHealthReport/internet-yellow-pages/pulls)
+to include the crawler in IYP's github repository main branch.
+
+Along with the python code and README, the addition of new datasets should also
+be reflected in the following files:
+- The list of [imported datasets](./data-sources.md).
+- The [ACKNOWLEDGMENTS.md](../ACKNOWLEDGMENTS.md) file should list the licence of all imported dataset.
+
+Furthermore, **any change to the ontology should be reflected in the documentation** ([Node types](./node_types.md) and [Relationship types](./relationship_types.md)).
+Changes to the ontology should be discussed in advance so that a consensus is
+reached before the ontology is updated either on [github discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions) or by reaching [IYP maintainers](mailto:iyp@ihr.live).
+
+You can also consider adding example queries to the [IYP gallery](./gallery.md),
+and organizations providing data to the [IYP frontpage]().

From efd4ced071805683d584506e3724d4c40ae72715 Mon Sep 17 00:00:00 2001
From: "romain.fontugne"
Date: Thu, 15 Aug 2024 16:41:41 +0900
Subject: [PATCH 4/9] fix typos in readme

---
 documentation/README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/documentation/README.md b/documentation/README.md
index cfb767d..c164a65 100644
--- a/documentation/README.md
+++ b/documentation/README.md
@@ -9,7 +9,7 @@ The lists of node and relationship types defined for IYP are available at:
 ## IYP Data Sources
 
 The list of all datasets imported in IYP is available [here](data-sources.md).
-The datasets licences are available the [ACKNOWLEDGMENTS file](../ACKNOWLEDGMENTS.md).
+The dataset licences are available in the [IYP acknowledgments](../ACKNOWLEDGMENTS.md).
 
 ## IYP Gallery
 
@@ -37,12 +37,12 @@ to include the crawler in IYP's github repository main branch.
 
 Along with the python code and README, the addition of new datasets should also
 be reflected in the following files:
-- The list of [imported datasets](./data-sources.md).
-- The [ACKNOWLEDGMENTS.md](../ACKNOWLEDGMENTS.md) file should list the licence of all imported dataset.
+- the list of [imported datasets](./data-sources.md),
+- the [IYP acknowledgments](../ACKNOWLEDGMENTS.md) file should list the licences of all imported datasets.
 
-Furthermore, **any change to the ontology should be reflected in the documentation** ([Node types](./node_types.md) and [Relationship types](./relationship_types.md)).
-Changes to the ontology should be discussed in advance so that a consensus is
-reached before the ontology is updated either on [github discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions) or by reaching [IYP maintainers](mailto:iyp@ihr.live).
+Changes to the ontology should be discussed in advance, either on [github discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions) or by contacting [IYP maintainers](mailto:iyp@ihr.live),
+so that a consensus is reached before the ontology is updated.
+**Any change to the ontology should be reflected in the documentation** ([Node types](./node_types.md) and [Relationship types](./relationship_types.md)). You can also consider adding example queries to the [IYP gallery](./gallery.md), and organizations providing data to the [IYP frontpage](). From 00a3bcd46c419bf65ebd889a0969d9fe01a330ba Mon Sep 17 00:00:00 2001 From: "romain.fontugne" Date: Wed, 21 Aug 2024 11:31:21 +0900 Subject: [PATCH 5/9] update gallery --- documentation/gallery.md | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/documentation/gallery.md b/documentation/gallery.md index 802a0b0..1ed6b89 100644 --- a/documentation/gallery.md +++ b/documentation/gallery.md @@ -1,19 +1,11 @@ # IYP Gallery +Below are examples queries that you can copy/paste in [Neo4j browser](https://iyp.iijlab.net/iyp/browser/?dbms=iyp-bolt.iijlab.net:443). + Querying the IYP database requires to be familiar with: - Cypher, Neo4j's query langage https://neo4j.com/docs/getting-started/current/cypher-intro/ - Basic networking knowledge (IP, prefixes, ASes, etc..) -- IYP ontology (TODO link to ontology) - -Below are examples queries that you can copy/paste in Neo4j browser: -1. [Names for AS2497](#names-for-as2497) -2. [All nodes related to 8.8.8.0/24](#all-nodes-related-to-888024) -3. [Country code of AS2497 in delegated files](#country-code-of-as2497-in-delegated-files) -4. [Countries of IXPs where AS2497 is present](#countries-of-ixps-where-as2497-is-present) -5. [Top domain names hosted by AS2497](#top-domain-names-hosted-by-as2497) -6. [ASes hosting top domain names in Japan](#ases-hosting-top-domain-names-in-japan) -7. [Topology for AS2501's dependencies](#topology-for-as2501s-dependencies) - +- [IYP ontology](./README.md) ### Names for AS2497 Find 'Name' nodes directly connected to the node corresponding to AS2497. @@ -95,8 +87,18 @@ RETURN p ``` ![Dependencies for AS2501](/documentation/assets/gallery/as2501dependencies.svg) -### List of IPs for RIPE RIS full feeds +### List of IPs for RIPE RIS full feed peers (more than 800k prefixes) + +```cypher +MATCH (n:BGPCollector)-[p:PEERS_WITH]-(a:AS) +WHERE n.project = 'riperis' AND p.num_v4_pfxs > 800000 +RETURN n.name, COUNT(DISTINCT p.ip) AS nb_full, COLLECT(DISTINCT p.ip) AS ips_full +``` + +### Active RIPE Atlas probes for the top 5 ISPs in Japan ```cypher -MATCH (n:BGPCollector)-[p:PEERS_WITH]-(a:AS) WHERE n.project = 'riperis' AND p.num_v4_pfxs > 800000 RETURN n.name, COUNT(DISTINCT p.ip) AS nb_full, COLLECT(DISTINCT p.ip) AS ips_full +MATCH (pb:AtlasProbe)-[:LOCATED_IN]-(a:AS)-[pop:POPULATION]-(c:Country) +WHERE c.country_code = 'JP' AND pb.status_name = 'Connected' AND pop.rank <= 5 +RETURN pop.rank, a.asn, COLLECT(pb.id) AS probe_ids ORDER BY pop.rank ``` From f9481fb0afe9cf1efaeead0a7bd4e648ae921481 Mon Sep 17 00:00:00 2001 From: "romain.fontugne" Date: Wed, 21 Aug 2024 12:07:35 +0900 Subject: [PATCH 6/9] Do not allow file URLS --- public/conf_notls/neo4j.conf | 3 ++- public/conf_tls/neo4j.conf | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/public/conf_notls/neo4j.conf b/public/conf_notls/neo4j.conf index 2bde2bf..6f1cd16 100644 --- a/public/conf_notls/neo4j.conf +++ b/public/conf_notls/neo4j.conf @@ -205,7 +205,8 @@ server.http.enabled=true # Determines if Cypher will allow using file URLs when loading data using # `LOAD CSV`. Setting this value to `false` will cause Neo4j to fail `LOAD CSV` # clauses that load data from the file system. 
-#dbms.security.allow_csv_import_from_file_urls=true +dbms.security.allow_csv_import_from_file_urls=false +dbms.security.allow_file_urls=false # Value of the Access-Control-Allow-Origin header sent over any HTTP or HTTPS diff --git a/public/conf_tls/neo4j.conf b/public/conf_tls/neo4j.conf index 6c22a0f..64d2644 100644 --- a/public/conf_tls/neo4j.conf +++ b/public/conf_tls/neo4j.conf @@ -205,7 +205,8 @@ dbms.ssl.policy.https.public_certificate=neo4j.cert # Determines if Cypher will allow using file URLs when loading data using # `LOAD CSV`. Setting this value to `false` will cause Neo4j to fail `LOAD CSV` # clauses that load data from the file system. -#dbms.security.allow_csv_import_from_file_urls=true +dbms.security.allow_csv_import_from_file_urls=false +dbms.security.allow_file_urls=false # Value of the Access-Control-Allow-Origin header sent over any HTTP or HTTPS From faa14c3b19a87bad4948f4856f0fb6e62f1c0c2b Mon Sep 17 00:00:00 2001 From: Romain Fontugne Date: Wed, 21 Aug 2024 11:45:25 +0000 Subject: [PATCH 7/9] setup neo4j config to disable load on local file --- public/conf_notls/neo4j.conf | 4 +--- public/conf_tls/neo4j.conf | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/public/conf_notls/neo4j.conf b/public/conf_notls/neo4j.conf index 6f1cd16..86409a8 100644 --- a/public/conf_notls/neo4j.conf +++ b/public/conf_notls/neo4j.conf @@ -206,8 +206,6 @@ server.http.enabled=true # `LOAD CSV`. Setting this value to `false` will cause Neo4j to fail `LOAD CSV` # clauses that load data from the file system. dbms.security.allow_csv_import_from_file_urls=false -dbms.security.allow_file_urls=false - # Value of the Access-Control-Allow-Origin header sent over any HTTP or HTTPS # connector. This defaults to '*', which allows broadest compatibility. Note @@ -243,7 +241,7 @@ server.databases.default_to_read_only=true # A comma separated list of procedures to be loaded by default. # Leaving this unconfigured will load all procedures found. -#dbms.security.procedures.allowlist=apoc.coll.*,apoc.load.*,gds.* +dbms.security.procedures.allowlist= #******************************************************************** # JVM Parameters diff --git a/public/conf_tls/neo4j.conf b/public/conf_tls/neo4j.conf index 64d2644..b15425c 100644 --- a/public/conf_tls/neo4j.conf +++ b/public/conf_tls/neo4j.conf @@ -206,8 +206,6 @@ dbms.ssl.policy.https.public_certificate=neo4j.cert # `LOAD CSV`. Setting this value to `false` will cause Neo4j to fail `LOAD CSV` # clauses that load data from the file system. dbms.security.allow_csv_import_from_file_urls=false -dbms.security.allow_file_urls=false - # Value of the Access-Control-Allow-Origin header sent over any HTTP or HTTPS # connector. This defaults to '*', which allows broadest compatibility. 
Note From 2a2e8c4054b178cbc0a4c40349d1cda629831efe Mon Sep 17 00:00:00 2001 From: "romain.fontugne" Date: Wed, 28 Aug 2024 15:22:34 +0900 Subject: [PATCH 8/9] cleaning of documentation files --- documentation/cn.md | 142 ----------------------------------------- documentation/iij.md | 62 ------------------ documentation/ua_ru.md | 72 --------------------- 3 files changed, 276 deletions(-) delete mode 100644 documentation/cn.md delete mode 100644 documentation/iij.md delete mode 100644 documentation/ua_ru.md diff --git a/documentation/cn.md b/documentation/cn.md deleted file mode 100644 index a8e99d6..0000000 --- a/documentation/cn.md +++ /dev/null @@ -1,142 +0,0 @@ -# A look into the Chinese Internet - -ASNs: -- China Telecom: 4134, 23764 (CTGNet replacing 4809 overseas) -- China Telecom 5G network?: 131285 - -Notes: -- some international ASes have HK country code (e.g., telstra) - - -Interesting links: -- China Telecom map: https://www.chinatelecomasiapacific.com/jp/wp-content/uploads/2021/03/2021-CTG-Infrastructure-Map-8M.pdf -- China Telecom peering policy: https://2021v.peeringasia.com/files/NewCTPeeringPolicy.pdf - - -## General stats - -Number of ASes registered in China/HK: -```cypher -MATCH (a:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country) -WHERE cc.country_code = 'CN' OR cc.country_code = 'HK' -RETURN cc.country_code, count(DISTINCT a) -``` - -| "cc.country_code" | "count(DISTINCT a)" | -|-------------------|---------------------| -| "CN" | 6508 | -| "HK" | 1198 | - - -Number of active Chinese/HK ASes: -```cypher -MATCH (a:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country), (a)-[:ORIGINATE]-(:Prefix) -WHERE cc.country_code = 'CN' OR cc.country_code = 'HK' -RETURN cc, count(DISTINCT a) -``` -| "cc" | "count(DISTINCT a)" | -|-----------------------|---------------------| -| {"country_code":"CN"} | 5071 | -| {"country_code":"HK"} | 683 | - -Facilities for CT (4134, 23764): -```cypher -MATCH (a:AS)--(fac:Facility) -WHERE a.asn IN [4134, 23764] -RETURN a, fac -``` - -Country codes for these facilities: -```cypher -MATCH (a:AS)--(fac:Facility)--(cc:Country) -WHERE a.asn IN [4134, 23764] -RETURN DISTINCT cc.country_code -``` - -Country codes for all ASes registered for the opaque ID: -```cypher -MATCH (a:AS)-[:ASSIGNED]-(oid:OpaqueID) -WHERE a.asn IN [4134, 23764] -WITH oid -MATCH (oid)--(other:AS)--(fac:Facility)--(cc:Country) -RETURN DISTINCT cc.country_code, collect(DISTINCT other.asn) -``` - -| "cc.country_code" | "collect(DISTINCT other.asn)" | -|-------------------|-------------------------------| -| "SG" | [131285,23764] | -| "ID" | [131285] | -| "KE" | [4809] | -| "BR" | [4809,4134] | -| "AE" | [4809] | -| "HK" | [4809,23764] | -| "ZA" | [4809,23764] | -| "US" | [4134] | -| "DE" | [4134,23764] | -| "NL" | [4134] | -| "GB" | [4134,23764] | -| "JP" | [23764] | -| "FR" | [23764] | - -## Co-location facilities - -Facilities in China/HK: -```cypher -MATCH (c:Country)--(f:Facility)--(a:AS) -WHERE c.country_code = 'CN' OR c.country_code = 'HK' -RETURN f, count(DISTINCT a) AS nb_as ORDER BY nb_as DESC -``` - -Facilities where Chinese ASes are (28): -```cypher -MATCH (net_country:Country)-[:COUNTRY {reference_org:'NRO'}]-(net:AS)-[:LOCATED_IN]-(fac:Facility)--(fac_country:Country) -WHERE net_country.country_code = 'CN' -RETURN fac_country.country_code, count(DISTINCT net) AS nb_AS ORDER BY nb_AS DESC -``` - -ASes present at the largest number of facilities: -```cypher -MATCH (net_country:Country)-[:COUNTRY 
{reference_org:'NRO'}]-(net:AS)-[:LOCATED_IN]-(fac:Facility)--(fac_country:Country) -WHERE net_country.country_code = 'HK' OR net_country.country_code = 'CN' -RETURN net.asn, net_country.country_code, count(DISTINCT fac_country) AS nb_fac ORDER BY nb_fac DESC -``` - -## Prefix geolocation -Geolocation of prefixes announced by Chinese ASes (54): -```cypher -MATCH (net_country:Country)-[:COUNTRY {reference_org:'NRO'}]-(net:AS)-[:ORIGINATE]-(pfx:Prefix)-[:COUNTRY {reference_org:'Internet Health Report'}]-(pfx_country:Country) -WHERE net_country.country_code = 'CN' -RETURN pfx_country.country_code, count(DISTINCT net) AS nb_AS ORDER BY nb_AS DESC -``` - -Geolocation of prefixes announced by HK ASes (81): -```cypher -MATCH (net_country:Country)-[:COUNTRY {reference_org:'NRO'}]-(net:AS)-[:ORIGINATE]-(pfx:Prefix)-[:COUNTRY {reference_org:'Internet Health Report'}]-(pfx_country:Country) -WHERE net_country.country_code = 'HK' -RETURN pfx_country.country_code, count(DISTINCT net) AS nb_AS ORDER BY nb_AS DESC -``` - -ASes with the largest footprint: -```cypher -MATCH (net_country:Country)-[:COUNTRY {reference_org:'NRO'}]-(net:AS)-[:ORIGINATE]-(pfx:Prefix)--(pfx_country:Country) -WHERE net_country.country_code = 'HK' OR net_country.country_code = 'CN' -RETURN net.asn, net_country.country_code, count(DISTINCT pfx_country) AS nb_pfx ORDER BY nb_pfx DESC -``` - -## Tier-1 comparison - -### Domain name distribution - -Graph (top10k): -```cypher -MATCH (:Ranking)-[r:RANK]-(dn:DomainName)--(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(net:AS) -WHERE dn.name ENDS WITH '.cn' AND r.rank<10000 -RETURN dn, ip, pfx, net -``` - -Table (top1M): -```cypher -MATCH (:Ranking)-[r:RANK]-(dn:DomainName)--(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(net:AS) -WHERE dn.name ENDS WITH '.cn' AND r.rank<100000 -RETURN net.asn, count(DISTINCT dn) AS nb_domain_name ORDER BY nb_domain_name DESC -``` diff --git a/documentation/iij.md b/documentation/iij.md deleted file mode 100644 index 9ff35fc..0000000 --- a/documentation/iij.md +++ /dev/null @@ -1,62 +0,0 @@ -# Looking into one AS: AS2497, IIJ - -## IIJ's geographical footprint -#### IXP where IIJ is present -```cypher -MATCH (iij:AS {asn:2497})-[:MEMBER_OF]-(ix:IXP)--(cc:Country) RETURN iij, ix, cc -``` - -### Facilities where IIJ is present -```cypher -MATCH (iij:AS {asn:2497})--(ix:Facility)--(cc:Country) RETURN iij, ix, cc -``` - -#### Geolocation of prefixes announced by IIJ -```cypher -MATCH (iij:AS {asn:2497})-[:ORIGINATE]-(pfx:Prefix)--(cc:Country) RETURN iij, pfx, cc -``` - -#### Geolocation of prefixes announced by IIJ's customer -graph: -```cypher -MATCH (iij:AS {asn:2497})<-[:DEPENDS_ON]-(customer:AS)-[:ORIGINATE]-(pfx:Prefix)--(cc:Country) RETURN iij, customer, pfx, cc -``` -table: -```cypher -MATCH (iij:AS {asn:2497})<-[dep:DEPENDS_ON]-(customer:AS)-[:ORIGINATE]-(pfx:Prefix)--(cc:Country) -RETURN cc.country_code, count(DISTINCT customer) AS nb_dep ORDER BY nb_dep DESC -``` - -## IIJ's logical (DNS) footprint -### Top domain names that map to prefixes originated from AS2497 -```cypher -MATCH (n:AS {asn:2497})-[:ORIGINATE]-(p:Prefix)--(i:IP)--(d:DomainName)-[r:RANK]-(:Ranking) WHERE r.rank<10000 RETURN n,p,i,d -``` - -### Top domain names that are related to AS2497 (including domains that map to prefixes orginated by AS2497 and prefixes that depends on AS2497) -```cypher -MATCH (n:AS {asn:2497})--(p:Prefix)--(i:IP)--(d:DomainName)-[r:RANK]-(:Ranking) WHERE r.rank<10000 RETURN n,p,i,d -``` - -## IIJ's main competitors -```cypher -MATCH (comp:AS)-[:PEERS_WITH 
{rel:1}]->(customer:AS)<-[:PEERS_WITH {rel:1}]-(iij:AS {asn:2497}) -WITH comp, customer OPTIONAL MATCH (comp)-[:NAME {reference_org:'RIPE NCC'}]-(comp_name:Name) -RETURN comp, comp_name, count(DISTINCT customer) AS nb_customer ORDER BY nb_customer DESC -``` - -## Top Japanese domains -Graph: -```cypher -MATCH (:Ranking)-[r:RANK]-(dn:DomainName)--(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(net:AS) -WHERE dn.name ENDS WITH '.jp' AND r.rank<10000 -RETURN dn, ip, pfx, net -``` - -Table: -```cypher -MATCH (:Ranking)-[r:RANK]-(dn:DomainName)--(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(net:AS) -WHERE dn.name ENDS WITH '.jp' AND r.rank<100000 -WITH net, dn OPTIONAL MATCH (net:AS)-[:NAME {reference_org:'RIPE NCC'}]-(net_name:Name) -RETURN net.asn, net_name.name, count(DISTINCT dn) AS nb_domain_name ORDER BY nb_domain_name DESC -``` diff --git a/documentation/ua_ru.md b/documentation/ua_ru.md deleted file mode 100644 index a16ff2c..0000000 --- a/documentation/ua_ru.md +++ /dev/null @@ -1,72 +0,0 @@ -# Examples for - -## Which country an AS maps to -### Registration country code (administrative) - -### Presence at IXP (biased by peering db) - -### Country code of its peers - -### Geoloc of prefixes - - -## Find all nodes that have the word 'crimea' in their name -MATCH (x)--(n:Name) WHERE toLower(n.name) CONTAINS 'crimea' RETURN x, n; - -## Crimean neighbors network dependencies -MATCH (crimea:Name)--(:AS)-[:PEERS_WITH]-(neighbors:AS)-[r:DEPENDS_ON]-(transit_as:AS)--(transit_name:Name) WHERE toLower(crimea.name) CONTAINS 'crimea' AND toFloat(r.hegemony)>0.2 RETURN transit_as, collect(DISTINCT transit_name.name), count(DISTINCT r) AS nb_links ORDER BY nb_links DESC; - -## Crimean IXP members dependencies -MATCH (crimea:Name)--(:IXP)-[:MEMBER_OF]-(neighbors:AS)-[r:DEPENDS_ON]-(transit_as:AS)--(transit_name:Name) WHERE toLower(crimea.name) CONTAINS 'crimea' AND toFloat(r.hegemony)>0.2 RETURN transit_as, collect(DISTINCT transit_name.name), count(DISTINCT r) AS nb_links ORDER BY nb_links DESC; - -## who is at Crimea ixp but not any other ixp - - -# Rostelecom - -## Top domain names that are hosted by Rostelecom -``` -MATCH (n:AS {asn:12389})-[:ORIGINATE]-(p:Prefix)--(i:IP)--(d:DomainName)-[r:RANK]-(:Ranking) WHERE r.rank<1000 RETURN n,p,i,d -``` - -## Top domain names that depends on Rostelecom -``` -MATCH (n:AS {asn:12389})--(p:Prefix)--(i:IP)--(d:DomainName)-[r:RANK]-(:Ranking) WHERE r.rank<1000 RETURN n,p,i,d -``` - -## all ases assigned to same org - - -## which top domain names map to this ASes -``` -MATCH (oid:OpaqueID)--(net:AS)--(pfx:Prefix)--(ip:IP)--(dname:DomainName)-[r]-(:Ranking), (net)-[{reference_org:'RIPE NCC'}]-(asname:Name) WHERE (oid:OpaqueID)--(:AS {asn:12389}) AND r.rank < 10000 AND net.asn<>12389 RETURN net, pfx, ip, dname, asname -``` - -## which prefixes map to other counties -``` -MATCH (:AS {asn:12389})--(oid:OpaqueID)--(net:AS)--(pfx:Prefix)--(cc:Country), (net)-[{reference_org:'RIPE NCC'}]-(asname:Name) WHERE net.asn<>12389 AND cc.country_code <> 'RU' RETURN net, pfx, asname, cc, oid -``` - -## which IXPs they are member of: -``` -MATCH (oid:OpaqueID)--(n:AS)--(ix:IXP) WHERE (oid:OpaqueID)--(:AS {asn:12389}) RETURN n, ix -``` - -## which IXPs are they operating to? -``` -MATCH (n:AS {asn:12389})--(ix:IXP)--(cc:Country) RETURN n,ix,cc -``` - -## prefixes assigned to this org and ASes announcing it - -! 
-## Country code of ASes hosting top domain names -MATCH (:Ranking)-[r:RANK]-(domain:DomainName)--(:IP)--(:Prefix)-[:ORIGINATE]-(:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country) WHERE r.rank<10000 AND domain.name ENDS WITH '.ru' RETURN cc.country_code, count(DISTINCT domain.name) AS dm_count ORDER BY dm_count DESC -MATCH (:Ranking)-[r:RANK]-(domain:DomainName)--(:IP)--(:Prefix)-[:ORIGINATE]-(:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country) WHERE r.rank<10000 AND domain.name ENDS WITH '.jp' RETURN cc.country_code, count(DISTINCT domain.name) AS dm_count ORDER BY dm_count DESC -MATCH (:Ranking)-[r:RANK]-(domain:DomainName)--(:IP)--(:Prefix)-[:ORIGINATE]-(:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country) WHERE r.rank<10000 AND domain.name ENDS WITH '.jp' RETURN cc.country_code, count(DISTINCT domain.name) AS dm_count ORDER BY dm_count DESC - -# Interesting queries: -## Orange presence -```cypher -MATCH (oid)--(n:AS)--(ix:IXP)--(cc:Country) WHERE (oid:OpaqueID)--(:AS {asn:5511}) RETURN n,ix,cc -``` From 73f4dc04b2bc558beb5928029c204c8339aa12b8 Mon Sep 17 00:00:00 2001 From: Ryan Poon <62296881+ryanp8@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:50:18 +0800 Subject: [PATCH 9/9] Autodeploy (#139) * Added testing to check if crawlers don't receive any data * Modified create_db to run the existence check for each crawler and to upload dump to ihr archive when created * Added autodeployment script and caddy config to deploy new databases when they are available in ihr archive. --------- Co-authored-by: Malte Tashiro --- .gitignore | 2 + autodeploy-config.json | 12 + autodeploy/README.md | 54 +++ autodeploy/autodeploy.py | 285 ++++++++++++ caddy.template.json | 109 +++++ config.json.example | 6 + create_db.py | 408 +++++++++--------- docker-compose.yaml | 21 + iyp/__init__.py | 28 +- iyp/crawlers/alice_lg/__init__.py | 3 + iyp/crawlers/alice_lg/amsix.py | 2 +- iyp/crawlers/alice_lg/bcix.py | 2 +- iyp/crawlers/alice_lg/decix.py | 2 +- iyp/crawlers/alice_lg/ixbr.py | 2 +- iyp/crawlers/alice_lg/linx.py | 2 +- iyp/crawlers/alice_lg/megaport.py | 2 +- iyp/crawlers/alice_lg/netnod.py | 2 +- iyp/crawlers/apnic/eyeball.py | 5 +- iyp/crawlers/bgpkit/__init__.py | 3 + iyp/crawlers/bgpkit/as2rel_v4.py | 2 +- iyp/crawlers/bgpkit/as2rel_v6.py | 2 +- iyp/crawlers/bgpkit/peerstats.py | 5 +- iyp/crawlers/bgpkit/pfx2asn.py | 5 +- iyp/crawlers/bgptools/anycast_prefixes.py | 5 +- iyp/crawlers/bgptools/as_names.py | 5 +- iyp/crawlers/bgptools/tags.py | 5 +- iyp/crawlers/caida/asrank.py | 5 +- iyp/crawlers/caida/ix_asns.py | 5 +- iyp/crawlers/caida/ixs.py | 5 +- iyp/crawlers/cisco/umbrella_top1m.py | 5 +- iyp/crawlers/citizenlab/urldb.py | 5 +- iyp/crawlers/cloudflare/dns_top_ases.py | 6 +- iyp/crawlers/cloudflare/dns_top_locations.py | 5 +- iyp/crawlers/cloudflare/ranking_bucket.py | 5 +- iyp/crawlers/cloudflare/top100.py | 5 +- iyp/crawlers/emileaben/as_names.py | 5 +- iyp/crawlers/example/crawler.py | 2 +- iyp/crawlers/iana/root_zone.py | 5 +- iyp/crawlers/ihr/__init__.py | 3 + iyp/crawlers/ihr/country_dependency.py | 5 +- iyp/crawlers/ihr/local_hegemony_v4.py | 2 +- iyp/crawlers/ihr/local_hegemony_v6.py | 2 +- iyp/crawlers/ihr/rov.py | 5 +- iyp/crawlers/inetintel/as_org.py | 5 +- iyp/crawlers/manrs/members.py | 5 +- iyp/crawlers/nro/delegated_stats.py | 5 +- iyp/crawlers/openintel/__init__.py | 7 + iyp/crawlers/openintel/dnsgraph_jp.py | 2 +- iyp/crawlers/openintel/dnsgraph_nl.py | 2 +- iyp/crawlers/openintel/dnsgraph_rdns.py | 2 +- iyp/crawlers/openintel/infra_mx.py | 2 +- 
iyp/crawlers/openintel/infra_ns.py | 2 +- iyp/crawlers/openintel/tranco1m.py | 2 +- iyp/crawlers/openintel/umbrella1m.py | 2 +- iyp/crawlers/pch/__init__.py | 3 + .../pch/daily_routing_snapshots_v4.py | 2 +- .../pch/daily_routing_snapshots_v6.py | 2 +- iyp/crawlers/peeringdb/fac.py | 5 +- iyp/crawlers/peeringdb/ix.py | 5 +- iyp/crawlers/peeringdb/org.py | 5 +- iyp/crawlers/ripe/as_names.py | 5 +- iyp/crawlers/ripe/atlas_measurements.py | 5 +- iyp/crawlers/ripe/atlas_probes.py | 5 +- iyp/crawlers/ripe/roa.py | 5 +- iyp/crawlers/simulamet/rirdata_rdns.py | 5 +- iyp/crawlers/stanford/asdb.py | 5 +- iyp/crawlers/tranco/top1m.py | 5 +- iyp/crawlers/virginiatech/rovista.py | 5 +- iyp/crawlers/worldbank/country_pop.py | 5 +- requirements.txt | 4 +- 70 files changed, 901 insertions(+), 260 deletions(-) create mode 100644 autodeploy-config.json create mode 100644 autodeploy/README.md create mode 100644 autodeploy/autodeploy.py create mode 100644 caddy.template.json diff --git a/.gitignore b/.gitignore index 0c6ba6b..cf165c8 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,5 @@ dmypy.json .pyre/ .history/ + +**/dumps \ No newline at end of file diff --git a/autodeploy-config.json b/autodeploy-config.json new file mode 100644 index 0000000..e210b16 --- /dev/null +++ b/autodeploy-config.json @@ -0,0 +1,12 @@ +{ + "archive_base_url": "https://ihr-archive.iijlab.net/ihr/iyp/", + "caddy_config_url": "http://sandbox.ihr.live:2019/config", + "caddy_post_url": "http://localhost:2019/load", + "caddy_template": "caddy.template.json", + "urls": { + "active_bolt": "ryan-bolt.ihr.live", + "active_http": "ryan.ihr.live", + "prev_bolt": "ryan-prev-bolt.ihr.live", + "prev_http": "ryan-prev.ihr.live" + } +} diff --git a/autodeploy/README.md b/autodeploy/README.md new file mode 100644 index 0000000..78dd23f --- /dev/null +++ b/autodeploy/README.md @@ -0,0 +1,54 @@ +# Autodeployment Script + +## Usage + +### Starting caddy + +Make sure that Caddy is running. If not, run it with `docker compose up caddy`. If Caddy +was running previously, then the new Caddy instance will resume from the previous +config. See the [Caddy docs](https://caddyserver.com/docs/running#docker-compose) for +more info. + +### Running the script + +To run the script, run `python3 -m autodeploy.autodeploy `. This will first find the date +of the most recent active deployment using the caddy config. If there is no active +deployment, today's date is used. With this date, the script will then check ihr-archive +to see if a dump has been pushed in the subsequent 7 days. If so, a neo4j instance will +be deployed using that dump. For example, if the latest deployment is for 2024-06-15, +the script will check if there is a dump for 2024-06-16 to 2024-06-23. + +Alternatively, running `python3 -m autodeploy.autodeploy --date [year]-[month]-[day]` will +check if there is a dump in the archive for the specified date and deploy it directly. + +## How it works + +### Checking for a dump to deploy + +If the date is not provided when running the script, it will first make a request to +Caddy to get the current config. The config is parsed to retrieve the port of the active +database. The date is parsed from the port number as explained below. Starting from this +date, the next 7 days are then checked in ihr-archive for valid dumps. + +#### Caddy Config + +Caddy is updated by substituting the desired ports in the specified Caddy config +template. 
The ports are constructed with the following structure: 1MMDD for neo4j http +port, and 2MMDD for neo4j bolt port. The json is sent to caddy by making a POST request +to sandbox.ihr.live:2019/load. The current config is retrieved by making a GET request +to sandbox.ihr.live:2019/config. + +### Starting the database + +Once a dump has been found, its log is downloaded from the archive. If the log indicates +that there are no errors, then the dump is downloaded. A docker container is then +started that loads the dump into a neo4j database. The database is stored in a docker +volume with the name data-MM-DD. Another container is then used to start the database +using the data stored in data-MM-YY. It binds its internal neo4j 7474 and 7687 ports to +the external ones that contain the dump's date. + +If a container is already running for this date, it and its data volume are deleted, and +a new one is created from the downloaded dump data. + +If there was already an active database, it becomes the previous database. The current +previous database container is stopped, and its data volume is deleted. diff --git a/autodeploy/autodeploy.py b/autodeploy/autodeploy.py new file mode 100644 index 0000000..68d82c1 --- /dev/null +++ b/autodeploy/autodeploy.py @@ -0,0 +1,285 @@ +import argparse +import json +import logging +import os +import sys +import time +from datetime import datetime, timedelta, timezone + +import docker +import requests + +NEO4J_VERSION = '5.16.0' + +ARCHIVE_URL_SUFFIX = '%Y/%m/%d/iyp-%Y-%m-%d' +LOG_URL_SUFFIX = ARCHIVE_URL_SUFFIX + '.log' +DUMP_URL_SUFFIX = ARCHIVE_URL_SUFFIX + '.dump' + +DUMP_DOWNLOAD_DIR_SUFFIX = 'dumps/%Y/%m/%d' + +DOCKER_VOLUME_FMT = 'data-%m-%d' +DOCKER_CONTAINER_NAME_FMT = 'deploy-%m-%d' + + +def remove_deployment(client: docker.DockerClient, date: datetime): + """Checks if there is an active deployment for the given date (month-day). + + If there is, remove it and the corresponding volume storing its data. + """ + container_name = date.strftime(DOCKER_CONTAINER_NAME_FMT) + volume_name = date.strftime(DOCKER_VOLUME_FMT) + try: + container = client.containers.get(container_name) + logging.warning(f'Removing active deployment for {date.strftime("%m-%d")}') + container.stop() + # Wait a little bit after the container has been removed + # before deleting the volume + # TODO Is there a better way? + while True: + try: + client.volumes.get(volume_name).remove() + break + except BaseException: + time.sleep(1) + except docker.errors.NotFound: + logging.info(f'No existing deployment for {date.strftime("%Y-%m-%d")}. Starting deployment') + + +def get_ports_from_caddy_config(config: dict): + """Makes a request to caddy config and returns the ports currently being used (both + active and previous)""" + caddy_config_url = config['caddy_config_url'] + r = requests.get(caddy_config_url) + try: + r.raise_for_status() + except requests.HTTPError as e: + logging.error(f'Failed to retrieve Caddy config from {caddy_config_url}: {e}') + sys.exit(1) + try: + body = r.json() + except json.JSONDecodeError as e: + logging.error(f'Failed to parse Caddy config: {e}') + sys.exit(1) + + routes = body['apps']['http']['servers']['srv0']['routes'] + active = dict() + for route in routes: + # This happens with a fresh caddy build. No ports are active, so + # return an empty dict. + # TODO So there is a route dict, but without a 'match' entry? 
+ if 'match' not in route: + return dict() + host = route['match'][0]['host'][0] + dial = route['handle'][0]['routes'][0]['handle'][0]['upstreams'][0]['dial'] + active[host] = dial + + # TODO We want to have both active_bolt and active_http URLs in the config, right? + # If only one is present, we have a problem. + # Also, if we reach this point there _must_ be at least a currently active + # deployment? + ports = dict() + active_bolt = config['urls']['active_bolt'] + active_http = config['urls']['active_http'] + if active_bolt in active: + ports['active_bolt'] = active[active_bolt].split(':')[1] + if active_http in active: + ports['active_http'] = active[active_http].split(':')[1] + + # It's possible for there to be only one active deployment, and there + # are no previous ports. Only attempt to parse the previous ports + # if they have been filled in on the caddy config + prev_bolt = config['urls']['prev_bolt'] + prev_http = config['urls']['prev_http'] + if prev_bolt in active and 'PREV_BOLT_PORT' not in active[prev_bolt]: + ports['prev_bolt'] = active[prev_bolt].split(':')[1] + if prev_http in active and 'PREV_HTTP_PORT' not in active[prev_http]: + ports['prev_http'] = active[prev_http].split(':')[1] + return ports + + +def check_log(config: dict, date: datetime): + """Makes a request to archive and checks if there is a valid dump for the specified + date.""" + logging.info(f'Downloading logs for {date.strftime("%Y-%m-%d")}') + log_url_fmt = os.path.join(config['archive_base_url'], LOG_URL_SUFFIX) + log_url = date.strftime(log_url_fmt) + r = requests.get(log_url) + try: + r.raise_for_status() + except requests.HTTPError as e: + # We expect the request to fail if the log does not exist (404), but not for + # other reasons. + if r.status_code != 404: + logging.error(f'Expected HTTP code 200 or 404, but got: {e}') + sys.exit(1) + return False + body = r.content + last_line = body.decode().split('\n')[-1] + if 'Errors:' in last_line: + logging.error(f'There were errors from create_db found in logs for {log_url}') + sys.exit(1) + return True + + +def get_port_date(port): + """Extracts the month and day from a port. + + Port should have format [1|2]MMDD. + + Returns the tuple (month, day) + """ + month = int(port[-4:-2]) + day = int(port[-2:]) + return month, day + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('config') + parser.add_argument('-d', '--date', help='deploy IYP dump for this date (YYYY-MM-DD)') + args = parser.parse_args() + + FORMAT = '%(asctime)s %(processName)s %(message)s' + logging.basicConfig( + format=FORMAT, + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + logging.info(f'Started: {sys.argv}') + + with open(args.config, 'r') as f: + try: + config: dict = json.load(f) + except json.JSONDecodeError as e: + logging.error(f'Invalid configuration specified: {e}') + sys.exit(1) + + root = os.path.dirname(os.path.realpath(__file__)) + + # If no date is provided when running the script, check if any dumps + # have been made within a week since the previous deployment. Otherwise, + # use the date provided in command line arg. 
+ if args.date: + try: + date = datetime.strptime(args.date, '%Y-%m-%d') + except ValueError as e: + logging.error(f'Invalid date specified: {e}') + sys.exit(1) + else: + ports = get_ports_from_caddy_config(config) + success = False + if 'active_http' in ports: + active_http = ports['active_http'] + month, day = get_port_date(active_http) + start_date = datetime.now(tz=timezone.utc).replace(month=month, day=day) + else: + start_date = datetime.now(tz=timezone.utc) + + # Download logs from ihr archive each day in the next week since + # the previous release + for i in range(1, 8): + date = start_date + timedelta(days=i) + if check_log(config, date): + success = True + break + else: + logging.warning(f'No archive entry found for {date.strftime("%Y-%m-%d")}.') + + if not success: + logging.error('Exiting because no active dates were found in archive.') + sys.exit(1) + + # Define ports and filenames that depend on the date + volume_name = date.strftime(DOCKER_VOLUME_FMT) + container_name = date.strftime(DOCKER_CONTAINER_NAME_FMT) + http_port = date.strftime('1%m%d') + bolt_port = date.strftime('2%m%d') + + client = docker.from_env() + + # Check if there is an existing deployment for this day and remove if so + remove_deployment(client, date) + + # Download dump from ihr archive + logging.info(f'Downloading dump for {date.strftime("%Y-%m-%d")}') + dump_dir = os.path.join(root, date.strftime(DUMP_DOWNLOAD_DIR_SUFFIX)) + os.makedirs(dump_dir, exist_ok=True) + dump_url_fmt = os.path.join(config['archive_base_url'], DUMP_URL_SUFFIX) + dump_url = date.strftime(dump_url_fmt) + r = requests.get(dump_url) + try: + r.raise_for_status() + except requests.HTTPError as e: + logging.error(f'Failed to fetch dump from {dump_url}: {e}') + sys.exit(1) + # TODO Is the + necessary? 
+ with open(os.path.join(dump_dir, 'neo4j.dump'), 'wb+') as f: + f.write(r.content) + + # Load dump into volume + logging.info('Load dump into neo4j db') + client.containers.run( + 'neo4j/neo4j-admin:' + NEO4J_VERSION, + command='neo4j-admin database load neo4j --from-path=/dumps --verbose', + name='load', + tty=True, + stdin_open=True, + remove=True, + volumes={ + volume_name: {'bind': '/data', 'mode': 'rw'}, + dump_dir: {'bind': '/dumps', 'mode': 'rw'}, + } + ) + + # Run neo4j based on data in volume just created + logging.warning('Starting deployment container') + client.containers.run( + 'neo4j:' + NEO4J_VERSION, + name=container_name, + ports={ + 7474: int(http_port), + 7687: int(bolt_port) + }, + volumes={ + volume_name: {'bind': '/data', 'mode': 'rw'}, + }, + environment={ + 'NEO4J_AUTH': 'neo4j/password', + 'NEO4J_server_memory_heap_initial__size': '16G', + 'NEO4J_server_memory_heap_max__size': '16G', + }, + detach=True, + remove=True + ) + + # Get currently active config + ports = get_ports_from_caddy_config(config) + + # Only delete current prev if it exists + if 'prev_http' in ports: + prev_month, prev_day = get_port_date(ports['prev_http']) + # It's possible that you're trying to redeploy the current prev + # If this condition isn't here, then the new deployment will be deleted + # since it has the same date as prev + if prev_month != date.month or prev_day != date.day: + remove_deployment(client, date.replace(month=prev_month, day=prev_day)) + + with open(config['caddy_template'], 'r') as f: + caddy_template = f.read() + + caddy_template = caddy_template.replace('', bolt_port) + caddy_template = caddy_template.replace('', http_port) + + # If there are no active ports (for example, on the first run after a fresh + # caddy build), don't try to set prev ports + if 'active_http' in ports: + caddy_template = caddy_template.replace('', ports['active_bolt']) + caddy_template = caddy_template.replace('', ports['active_http']) + + # Update config + requests.post(config['caddy_post_url'], caddy_template, headers={'Content-Type': 'application/json'}) + + +if __name__ == '__main__': + main() + sys.exit(0) diff --git a/caddy.template.json b/caddy.template.json new file mode 100644 index 0000000..d86c1a4 --- /dev/null +++ b/caddy.template.json @@ -0,0 +1,109 @@ +{ + "apps": { + "http": { + "servers": { + "srv0": { + "listen": [":443"], + "routes": [ + { + "match": [{ "host": ["ryan-bolt.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["ryan-prev-bolt.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["sandbox.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "ryan.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["ryan.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["ryan-prev.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ 
+ { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + } + ] + } + } + } + } + } + \ No newline at end of file diff --git a/config.json.example b/config.json.example index ee12e2c..ef1e158 100644 --- a/config.json.example +++ b/config.json.example @@ -1,4 +1,10 @@ { + "archive": { + "host": "", + "user": "", + "base_path": "" + }, + "cache": { "directory": "tmp/", "duration_in_days": 6 diff --git a/create_db.py b/create_db.py index e754e78..6b62403 100644 --- a/create_db.py +++ b/create_db.py @@ -1,208 +1,228 @@ +import argparse import importlib import json import logging import os -# import shutil import sys +from datetime import datetime, timezone from time import sleep -import arrow import docker +import paramiko +from scp import SCPClient from send_email import send_email -NEO4J_VERSION = '5.21.2' - -today = arrow.utcnow() -date = f'{today.year}-{today.month:02d}-{today.day:02d}' - -# Use the current directory as root. -root = os.path.dirname(os.path.realpath(__file__)) -# Alternatively, specify your own path. -# root = '' -if not root: - sys.exit('Please configure a root path.') - -tmp_dir = os.path.join(root, 'neo4j/tmp', date, '') -dump_dir = os.path.join(root, 'dumps', f'{today.year}/{today.month:02d}/{today.day:02d}', '') - -os.makedirs(tmp_dir, exist_ok=True) -os.makedirs(dump_dir, exist_ok=True) - -# Initialize logging -scriptname = sys.argv[0].replace('/', '_')[0:-3] -FORMAT = '%(asctime)s %(processName)s %(message)s' -logging.basicConfig( - format=FORMAT, - filename=f'{dump_dir}iyp-{date}.log', - level=logging.WARNING, - datefmt='%Y-%m-%d %H:%M:%S' -) -logging.warning('Started: %s' % sys.argv) - -# Load configuration file -with open('config.json', 'r') as fp: - conf = json.load(fp) - -# Start a new neo4j container -client = docker.from_env() - -# ######### Start a new docker image ########## - -logging.warning('Starting new container...') -container = client.containers.run( - 'neo4j:' + NEO4J_VERSION, - name=f'iyp-{date}', - ports={ - 7474: 7474, - 7687: 7687 - }, - volumes={ - tmp_dir: {'bind': '/data', 'mode': 'rw'}, - }, - environment={ - 'NEO4J_AUTH': 'neo4j/password', - 'NEO4J_server_memory_heap_initial__size': '16G', - 'NEO4J_server_memory_heap_max__size': '16G', - }, - remove=True, - detach=True -) - -# Wait for the container to be ready -timeout = 120 -stop_time = 1 -elapsed_time = 0 -container_ready = False - -while elapsed_time < timeout: - sleep(stop_time) - elapsed_time += stop_time - # Not the most premium solution, but the alternative is using - # stream=True, which creates a blocking generator that we have - # to somehow interrupt in case the database does not start - # correctly. And writing a signal handler just for this seems - # overkill. - last_msg = container.logs(stderr=False, tail=1) - if last_msg.endswith(b'Started.\n'): - logging.warning('Container ready.') - container_ready = True - break - -if not container_ready: - logging.error('Timed our while waiting for container to start.') - try: - container_logs = container.logs().decode('utf-8') - except Exception as e: - logging.error(f'Can not get logs from container: {e}') +NEO4J_VERSION = '5.16.0' + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-a', '--archive', action='store_true', help='push dump to archive server') + args = parser.parse_args() + + today = datetime.now(tz=timezone.utc) + date = today.strftime('%Y-%m-%d') + + # Use the current directory as root. 
+ root = os.path.dirname(os.path.realpath(__file__)) + # Alternatively, specify your own path. + # root = '' + if not root: + sys.exit('Please configure a root path.') + + dump_dir = os.path.join(root, 'dumps', today.strftime('%Y/%m/%d')) + + os.makedirs(dump_dir, exist_ok=True) + + # Initialize logging + FORMAT = '%(asctime)s %(processName)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename=os.path.join(dump_dir, f'iyp-{date}.log'), + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + logging.info(f'Started: {sys.argv}') + + # Load configuration file + with open('config.json', 'r') as fp: + conf = json.load(fp) + + # Neo4j container settings + neo4j_volume = f'data-{date}' + + # Start a new neo4j container + client = docker.from_env() + + # ######### Start a new docker image ########## + + logging.warning('Starting new container...') + container = client.containers.run( + 'neo4j:' + NEO4J_VERSION, + name=f'iyp-{date}', + ports={ + 7474: 7474, + 7687: 7687 + }, + volumes={ + neo4j_volume: {'bind': '/data', 'mode': 'rw'}, + }, + environment={ + 'NEO4J_AUTH': 'neo4j/password', + 'NEO4J_server_memory_heap_initial__size': '16G', + 'NEO4J_server_memory_heap_max__size': '16G', + }, + remove=True, + detach=True + ) + + # Wait for the container to be ready + timeout = 120 + stop_time = 1 + elapsed_time = 0 + container_ready = False + + while elapsed_time < timeout: + sleep(stop_time) + elapsed_time += stop_time + # Not the most premium solution, but the alternative is using + # stream=True, which creates a blocking generator that we have + # to somehow interrupt in case the database does not start + # correctly. And writing a signal handler just for this seems + # overkill. + last_msg = container.logs(stderr=False, tail=1) + if last_msg.endswith(b'Started.\n'): + logging.info('Container ready.') + container_ready = True + break + + if not container_ready: + logging.error('Timed our while waiting for container to start.') + try: + container_logs = container.logs().decode('utf-8') + except Exception as e: + logging.error(f'Can not get logs from container: {e}') + sys.exit('Problem while starting the container.') + logging.error(f'Container logs:\n{container_logs}') + logging.error('Trying to stop container...') + container.stop() sys.exit('Problem while starting the container.') - logging.error(f'Container logs:\n{container_logs}') - logging.error('Trying to stop container...') - container.stop() - sys.exit('Problem while starting the container.') -# ######### Fetch data and feed to neo4j ########## - - -class RelationCountError(Exception): - def __init__(self, message): - self.message = message - super().__init__(self.message) - - -logging.warning('Fetching data...') -status = {} -no_error = True -for module_name in conf['iyp']['crawlers']: - - try: + # ########## Fetch data and feed to neo4j ########## + + class RelationCountError(Exception): + def __init__(self, message): + self.message = message + super().__init__(self.message) + + logging.info('Fetching data...') + status = {} + no_error = True + for module_name in conf['iyp']['crawlers']: + try: + module = importlib.import_module(module_name) + logging.info(f'start {module}') + name = module_name.replace('iyp.crawlers.', '') + crawler = module.Crawler(module.ORG, module.URL, name) + crawler.run() + passed = crawler.unit_test() + crawler.close() + if not passed: + error_message = f'Did not receive data from crawler {name}' + raise RelationCountError(error_message) + status[module_name] = 'OK' + logging.info(f'end {module}') + 
except RelationCountError as relation_count_error: + no_error = False + logging.error(relation_count_error) + status[module_name] = relation_count_error + send_email(relation_count_error) + except Exception as e: + no_error = False + logging.error('crawler crashed!') + status[module_name] = e + send_email(e) + + # ######### Post processing scripts ########## + + logging.info('Post-processing...') + for module_name in conf['iyp']['post']: module = importlib.import_module(module_name) - logging.warning(f'start {module}') - name = module_name.replace('iyp.crawlers.', '') - crawler = module.Crawler(module.ORG, module.URL, name) - relations_count = crawler.count_relations() - crawler.run() - relations_count_new = crawler.count_relations() - crawler.close() - if not relations_count_new > relations_count: - error_message = ( - f'Unexpected relation count change in the crawler "{name}": ' - f'Expected new relations ({relations_count_new}) ' - f'to be greater than the previous relations ({relations_count}).' - ) - raise RelationCountError(error_message) - status[module_name] = 'OK' - logging.warning(f'end {module}') - except RelationCountError as relation_count_error: - no_error = False - logging.error(relation_count_error) - status[module_name] = relation_count_error - send_email(relation_count_error) - except Exception as e: - no_error = False - logging.exception('crawler crashed!!') - status[module_name] = e - send_email(e) - - -# ######### Post processing scripts ########## - -logging.warning('Post-processing...') -for module_name in conf['iyp']['post']: - module = importlib.import_module(module_name) - - try: - logging.warning(f'start {module}') - post = module.PostProcess() - post.run() - post.close() - status[module_name] = 'OK' - logging.warning(f'end {module}') - - except Exception as e: - no_error = False - logging.error('crawler crashed!!\n') - logging.error(e) - logging.error('\n') - status[module_name] = e - - -# ######### Stop container and dump DB ########## - -logging.warning('Stopping container...') -container.stop(timeout=180) - -logging.warning('Dumping database...') -if os.path.exists(f'{dump_dir}/neo4j.dump'): - os.remove(f'{dump_dir}/neo4j.dump') - -# make sure the directory is writable for any user -os.chmod(dump_dir, 0o777) - -container = client.containers.run( - 'neo4j/neo4j-admin:' + NEO4J_VERSION, - command='neo4j-admin database dump neo4j --to-path=/dumps --verbose', - tty=True, - stdin_open=True, - remove=True, - volumes={ - tmp_dir: {'bind': '/data', 'mode': 'rw'}, - dump_dir: {'bind': '/dumps', 'mode': 'rw'}, - } -) - -# rename dump -os.rename(f'{dump_dir}/neo4j.dump', f'{dump_dir}/iyp-{date}.dump') - -final_words = '' -if not no_error: - # TODO send an email - final_words += 'There was errors!' 
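
Note: the crawler loop introduced above reduces to the following pattern. Each module path listed under `conf['iyp']['crawlers']` is imported dynamically, run, and then checked with the new `unit_test()`; a crawler that produces no data raises `RelationCountError`, and any failure is recorded in `status` and reported by e-mail. This is a condensed sketch, not the exact script; `send_email` is taken as given from the surrounding code.

```python
import importlib
import logging


class RelationCountError(Exception):
    """Raised when a crawler finishes without creating its relationships."""


def run_crawlers(conf: dict, send_email) -> tuple:
    # conf['iyp']['crawlers'] lists module paths such as 'iyp.crawlers.bgpkit.pfx2asn'.
    status = {}
    no_error = True
    for module_name in conf['iyp']['crawlers']:
        try:
            module = importlib.import_module(module_name)
            name = module_name.replace('iyp.crawlers.', '')
            crawler = module.Crawler(module.ORG, module.URL, name)
            crawler.run()
            passed = crawler.unit_test()  # existence check for this crawler's relationships
            crawler.close()
            if not passed:
                raise RelationCountError(f'Did not receive data from crawler {name}')
            status[module_name] = 'OK'
        except Exception as e:
            no_error = False
            logging.error(f'{module_name} failed: {e}')
            status[module_name] = e
            send_email(e)
    return status, no_error
```
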
- logging.error('there was errors!\n') - logging.error({k: error for k, error in status.items() if error != 'OK'}) -else: - final_words = 'No error :)' -# Delete tmp file in cron job -# shutil.rmtree(tmp_dir) - -logging.warning(f'Finished: {sys.argv} {final_words}') + + try: + logging.info(f'start {module}') + post = module.PostProcess() + post.run() + post.close() + status[module_name] = 'OK' + logging.info(f'end {module}') + + except Exception as e: + no_error = False + logging.error('crawler crashed!') + logging.error(e) + status[module_name] = e + + # ######### Stop container and dump DB ########## + + logging.info('Stopping container...') + container.stop(timeout=180) + + logging.info('Dumping database...') + dump_file = os.path.join(dump_dir, 'neo4j.dump') + if os.path.exists(dump_file): + os.remove(dump_file) + + # make sure the directory is writable for any user + os.chmod(dump_dir, 0o777) + + container = client.containers.run( + 'neo4j/neo4j-admin:' + NEO4J_VERSION, + command='neo4j-admin database dump neo4j --to-path=/dumps --verbose', + tty=True, + stdin_open=True, + remove=True, + volumes={ + neo4j_volume: {'bind': '/data', 'mode': 'rw'}, + dump_dir: {'bind': '/dumps', 'mode': 'rw'}, + } + ) + + # Delete the data volume once the dump been created + client.volumes.get(neo4j_volume).remove() + + # rename dump + + os.rename(dump_file, os.path.join(dump_dir, f'iyp-{date}.dump')) + + if not no_error: + # TODO send an email + + # Add the log line to indicate to autodeploy that there were errors + final_words = f'\nErrors: {" ".join((k for k in status))}' + logging.error('There were errors!') + else: + final_words = 'No error :)' + # Delete tmp file in cron job + # shutil.rmtree(tmp_dir) + + logging.info(f'Finished: {sys.argv} {final_words}') + + if args.archive: + # Push the dump and log to ihr archive + ssh = paramiko.SSHClient() + # Do not show info logging for paramiko. 
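
Note: the dump step above spins up the matching `neo4j/neo4j-admin` image against the data volume, writes `neo4j.dump` into the dated dump directory, removes the now-unneeded volume, and renames the file. Below is a minimal standalone sketch of that step; the version tag is an assumption and must match the image that populated the volume (5.21.2 in docker-compose.yaml).

```python
import os

import docker

NEO4J_VERSION = '5.21.2'  # assumption: must match the image that created the volume


def dump_database(neo4j_volume: str, dump_dir: str, date: str) -> str:
    """Dump the Neo4j database stored in `neo4j_volume` into `dump_dir`."""
    client = docker.from_env()
    dump_file = os.path.join(dump_dir, 'neo4j.dump')
    if os.path.exists(dump_file):
        os.remove(dump_file)
    # The admin container runs as a different user, so make the directory writable.
    os.chmod(dump_dir, 0o777)
    client.containers.run(
        'neo4j/neo4j-admin:' + NEO4J_VERSION,
        command='neo4j-admin database dump neo4j --to-path=/dumps --verbose',
        remove=True,
        volumes={
            neo4j_volume: {'bind': '/data', 'mode': 'rw'},
            dump_dir: {'bind': '/dumps', 'mode': 'rw'},
        },
    )
    # The data volume is only needed for the dump, so drop it afterwards.
    client.volumes.get(neo4j_volume).remove()
    final_name = os.path.join(dump_dir, f'iyp-{date}.dump')
    os.rename(dump_file, final_name)
    return final_name
```
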
+ logging.getLogger('paramiko').setLevel(logging.WARNING) + ssh.load_system_host_keys() + ssh.connect(conf['archive']['host'], username=conf['archive']['user']) + + dest = os.path.join(conf['archive']['base_path'], today.strftime('%Y/%m/%d')) + ssh.exec_command(f'mkdir -p {dest}') + + with SCPClient(ssh.get_transport()) as scp: + scp.put(dump_dir, recursive=True, remote_path=dest) + + +if __name__ == '__main__': + main() diff --git a/docker-compose.yaml b/docker-compose.yaml index de510b0..fd985a5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,3 +1,6 @@ +volumes: + caddy_data: + caddy_config: services: iyp_loader: image: neo4j/neo4j-admin:5.21.2 @@ -63,3 +66,21 @@ services: depends_on: iyp_loader: condition: service_completed_successfully + + caddy: + image: caddy:latest + container_name: caddy + restart: unless-stopped + ports: + - "80:80" + - "443:443" + - "443:443/udp" + - "2019:2019" + environment: + - CADDY_ADMIN=0.0.0.0:2019 + volumes: + - ./site:/srv + - caddy_data:/data + - caddy_config:/config + command: /usr/bin/caddy run --resume + \ No newline at end of file diff --git a/iyp/__init__.py b/iyp/__init__.py index 47ead72..81b2d2b 100644 --- a/iyp/__init__.py +++ b/iyp/__init__.py @@ -476,7 +476,7 @@ def batch_add_node_label(self, node_ids, label): label: label string or list of label strings """ label_str = str(label) - if type(label) is list: + if isinstance(label, list): label_str = ':'.join(label) for i in range(0, len(node_ids), BATCH_SIZE): @@ -709,15 +709,23 @@ def count_relations(self): return result['count'] - def unit_test(self, logging): - relation_count = self.count_relations() - logging.info('Relations before starting: %s' % relation_count) - self.run() - relation_count_new = self.count_relations() - logging.info('Relations after starting: %s' % relation_count_new) - self.close() - print('assertion failed') if relation_count_new <= relation_count else print('assertion passed') - assert relation_count_new > relation_count + def unit_test(self, relation_types): + """Check for existence of relationships created by this crawler. + + relation_types should be a list of types for which existence is checked. 
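
Note: the `isinstance()` change in `batch_add_node_label` only affects how the label argument is normalised: a single label string is used as-is, while a list of labels is joined into a Neo4j multi-label string. A tiny sketch of that behaviour:

```python
def to_label_string(label) -> str:
    # A list of labels becomes 'Label1:Label2:...'; a single label is used as-is.
    if isinstance(label, list):
        return ':'.join(label)
    return str(label)


assert to_label_string('AuthoritativeNameServer') == 'AuthoritativeNameServer'
assert to_label_string(['DomainName', 'AuthoritativeNameServer']) == 'DomainName:AuthoritativeNameServer'
```
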
+ """ + logging.info(f'Running existence test for {relation_types}') + passed = True + for relation_type in relation_types: + existenceQuery = f"""MATCH ()-[r:{relation_type}]-() + USING INDEX r:{relation_type}(reference_name) + WHERE r.reference_name = '{self.reference['reference_name']}' + RETURN 0 LIMIT 1""" + result = self.iyp.tx.run(existenceQuery) + if len(list(result)) == 0: + passed = False + logging.error(f'Missing data for relation {relation_type}') + return passed def close(self): # Commit changes to IYP diff --git a/iyp/crawlers/alice_lg/__init__.py b/iyp/crawlers/alice_lg/__init__.py index d7bbdfe..878bdf7 100644 --- a/iyp/crawlers/alice_lg/__init__.py +++ b/iyp/crawlers/alice_lg/__init__.py @@ -442,3 +442,6 @@ def run(self) -> None: if originate_rels: logging.info(f'Pushing {len(originate_rels)} ORIGINATE relationships.') self.iyp.batch_add_links('ORIGINATE', originate_rels) + + def unit_test(self): + return super().unit_test(['MEMBER_OF', 'ORIGINATE', 'MANAGED_BY']) diff --git a/iyp/crawlers/alice_lg/amsix.py b/iyp/crawlers/alice_lg/amsix.py index 7328645..6e74b62 100644 --- a/iyp/crawlers/alice_lg/amsix.py +++ b/iyp/crawlers/alice_lg/amsix.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/bcix.py b/iyp/crawlers/alice_lg/bcix.py index 66bc993..ed1a6f7 100644 --- a/iyp/crawlers/alice_lg/bcix.py +++ b/iyp/crawlers/alice_lg/bcix.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/decix.py b/iyp/crawlers/alice_lg/decix.py index 8a3aa25..d3c8c35 100644 --- a/iyp/crawlers/alice_lg/decix.py +++ b/iyp/crawlers/alice_lg/decix.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/ixbr.py b/iyp/crawlers/alice_lg/ixbr.py index 548701a..691e6e4 100644 --- a/iyp/crawlers/alice_lg/ixbr.py +++ b/iyp/crawlers/alice_lg/ixbr.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/linx.py b/iyp/crawlers/alice_lg/linx.py index ef67bd2..223635c 100644 --- a/iyp/crawlers/alice_lg/linx.py +++ b/iyp/crawlers/alice_lg/linx.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/megaport.py b/iyp/crawlers/alice_lg/megaport.py index 8080f45..5156a21 100644 --- a/iyp/crawlers/alice_lg/megaport.py +++ b/iyp/crawlers/alice_lg/megaport.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/netnod.py b/iyp/crawlers/alice_lg/netnod.py index 8ba89f4..5f711a4 100644 --- a/iyp/crawlers/alice_lg/netnod.py +++ b/iyp/crawlers/alice_lg/netnod.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git 
a/iyp/crawlers/apnic/eyeball.py b/iyp/crawlers/apnic/eyeball.py index bee4fd4..8054402 100644 --- a/iyp/crawlers/apnic/eyeball.py +++ b/iyp/crawlers/apnic/eyeball.py @@ -84,6 +84,9 @@ def run(self): self.iyp.batch_add_links('RANK', rank_links) self.iyp.batch_add_links('POPULATION', pop_links) + def unit_test(self): + return super().unit_test(['POPULATION', 'COUNTRY', 'RANK', 'NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -103,7 +106,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/__init__.py b/iyp/crawlers/bgpkit/__init__.py index e5d8e3b..1c54c89 100644 --- a/iyp/crawlers/bgpkit/__init__.py +++ b/iyp/crawlers/bgpkit/__init__.py @@ -47,3 +47,6 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('PEERS_WITH', links) + + def unit_test(self): + return super().unit_test(['PEERS_WITH']) diff --git a/iyp/crawlers/bgpkit/as2rel_v4.py b/iyp/crawlers/bgpkit/as2rel_v4.py index 0a1c1bc..2278dd0 100644 --- a/iyp/crawlers/bgpkit/as2rel_v4.py +++ b/iyp/crawlers/bgpkit/as2rel_v4.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/as2rel_v6.py b/iyp/crawlers/bgpkit/as2rel_v6.py index e6dafac..e7807c7 100644 --- a/iyp/crawlers/bgpkit/as2rel_v6.py +++ b/iyp/crawlers/bgpkit/as2rel_v6.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/peerstats.py b/iyp/crawlers/bgpkit/peerstats.py index 58cf8cb..7724186 100644 --- a/iyp/crawlers/bgpkit/peerstats.py +++ b/iyp/crawlers/bgpkit/peerstats.py @@ -90,6 +90,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('PEERS_WITH', links) + def unit_test(self): + return super().unit_test(['PEERS_WITH']) + def main() -> None: parser = argparse.ArgumentParser() @@ -109,7 +112,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/pfx2asn.py b/iyp/crawlers/bgpkit/pfx2asn.py index fe3cb56..d83e8db 100644 --- a/iyp/crawlers/bgpkit/pfx2asn.py +++ b/iyp/crawlers/bgpkit/pfx2asn.py @@ -62,6 +62,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('ORIGINATE', links) + def unit_test(self): + return super().unit_test(['ORIGINATE']) + def main() -> None: parser = argparse.ArgumentParser() @@ -81,7 +84,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgptools/anycast_prefixes.py b/iyp/crawlers/bgptools/anycast_prefixes.py index c46c5da..824e5c4 100644 --- a/iyp/crawlers/bgptools/anycast_prefixes.py +++ b/iyp/crawlers/bgptools/anycast_prefixes.py @@ -102,6 +102,9 @@ def update(self, res, filename: str): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -121,7 +124,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff 
--git a/iyp/crawlers/bgptools/as_names.py b/iyp/crawlers/bgptools/as_names.py index 7db4c99..bbfa957 100644 --- a/iyp/crawlers/bgptools/as_names.py +++ b/iyp/crawlers/bgptools/as_names.py @@ -85,6 +85,9 @@ def run(self): self.iyp.batch_add_links('NAME', name_links) self.iyp.batch_add_links('CATEGORIZED', tag_links) + def unit_test(self): + return super().unit_test(['NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -104,7 +107,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgptools/tags.py b/iyp/crawlers/bgptools/tags.py index 3a20fa1..155f644 100644 --- a/iyp/crawlers/bgptools/tags.py +++ b/iyp/crawlers/bgptools/tags.py @@ -83,6 +83,9 @@ def run(self): print('Error for: ', line) print(error) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -102,7 +105,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/caida/asrank.py b/iyp/crawlers/caida/asrank.py index 3cc8d9c..51e8e45 100644 --- a/iyp/crawlers/caida/asrank.py +++ b/iyp/crawlers/caida/asrank.py @@ -100,6 +100,9 @@ def run(self): self.iyp.batch_add_links('COUNTRY', country_links) self.iyp.batch_add_links('RANK', rank_links) + def unit_test(self): + return super().unit_test(['NAME', 'COUNTRY', 'RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -119,7 +122,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/caida/ix_asns.py b/iyp/crawlers/caida/ix_asns.py index 6fbb86e..8e0f7f9 100644 --- a/iyp/crawlers/caida/ix_asns.py +++ b/iyp/crawlers/caida/ix_asns.py @@ -90,6 +90,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('MEMBER_OF', member_links) + def unit_test(self): + return super().unit_test(['MEMBER_OF']) + def main() -> None: parser = argparse.ArgumentParser() @@ -109,7 +112,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/caida/ixs.py b/iyp/crawlers/caida/ixs.py index 2301c09..e9b4542 100644 --- a/iyp/crawlers/caida/ixs.py +++ b/iyp/crawlers/caida/ixs.py @@ -200,6 +200,9 @@ def run(self): self.iyp.batch_add_links('WEBSITE', website_links) self.iyp.batch_add_links('MANAGED_BY', prefix_links) + def unit_test(self): + return super().unit_test(['EXTERNAL_ID', 'NAME', 'COUNTRY', 'WEBSITE', 'MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -219,7 +222,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cisco/umbrella_top1m.py b/iyp/crawlers/cisco/umbrella_top1m.py index 23bd3e2..9ed18fb 100644 --- a/iyp/crawlers/cisco/umbrella_top1m.py +++ b/iyp/crawlers/cisco/umbrella_top1m.py @@ -123,6 +123,9 @@ def run(self): logging.info(f'Pushing {len(processed_links)} RANK relationships...') self.iyp.batch_add_links('RANK', processed_links) + def unit_test(self): + return super().unit_test(['RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -142,7 +145,7 @@ def main() -> None: crawler = 
Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/citizenlab/urldb.py b/iyp/crawlers/citizenlab/urldb.py index b56718a..6893570 100644 --- a/iyp/crawlers/citizenlab/urldb.py +++ b/iyp/crawlers/citizenlab/urldb.py @@ -70,6 +70,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -89,7 +92,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/dns_top_ases.py b/iyp/crawlers/cloudflare/dns_top_ases.py index 8c15ac0..fe6b6e1 100644 --- a/iyp/crawlers/cloudflare/dns_top_ases.py +++ b/iyp/crawlers/cloudflare/dns_top_ases.py @@ -47,6 +47,10 @@ def compute_link(self, param): 'props': [flat_prop, self.reference] }) + # already defined in imported Crawler + # def unit_test(self): + # pass + def main() -> None: parser = argparse.ArgumentParser() @@ -66,7 +70,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/dns_top_locations.py b/iyp/crawlers/cloudflare/dns_top_locations.py index 8f40586..37eb0f4 100644 --- a/iyp/crawlers/cloudflare/dns_top_locations.py +++ b/iyp/crawlers/cloudflare/dns_top_locations.py @@ -168,6 +168,9 @@ def compute_link(self, param): 'props': [flat_prop, self.reference] }) + def unit_test(self): + return super().unit_test(['QUERIED_FROM']) + def main() -> None: parser = argparse.ArgumentParser() @@ -187,7 +190,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/ranking_bucket.py b/iyp/crawlers/cloudflare/ranking_bucket.py index 6d9a02e..c6b64c7 100644 --- a/iyp/crawlers/cloudflare/ranking_bucket.py +++ b/iyp/crawlers/cloudflare/ranking_bucket.py @@ -122,6 +122,9 @@ def run(self): print(f'Adding {len(domain_links)} RANK relationships', file=sys.stderr) self.iyp.batch_add_links('RANK', domain_links) + def unit_test(self): + return super().unit_test(['RANK']) + # Main program if __name__ == '__main__': @@ -141,7 +144,7 @@ def run(self): crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/top100.py b/iyp/crawlers/cloudflare/top100.py index 2a6b63d..c44fbe2 100644 --- a/iyp/crawlers/cloudflare/top100.py +++ b/iyp/crawlers/cloudflare/top100.py @@ -71,6 +71,9 @@ def update(self, entry): domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']}) self.iyp.add_links(domain_qid, statements) + def unit_test(self): + return super().unit_test(['RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -90,7 +93,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/emileaben/as_names.py b/iyp/crawlers/emileaben/as_names.py index 66d4b10..197253e 100644 --- a/iyp/crawlers/emileaben/as_names.py +++ b/iyp/crawlers/emileaben/as_names.py @@ -66,6 +66,9 @@ def run(self): # Push all links to IYP 
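
Note: the crawler modules touched in this patch all follow the same pattern. Each one overrides `unit_test()` and passes the relationship types it creates to the base implementation in `iyp/__init__.py`, which checks that at least one relationship of each type carries this crawler's `reference_name`. A generic sketch of the pattern with placeholder ORG/URL/NAME values:

```python
from iyp import BaseCrawler

ORG = 'Example Org'                 # placeholders; each crawler defines its own
URL = 'https://example.org/data'
NAME = 'example.crawler'


class Crawler(BaseCrawler):
    def run(self):
        # Fetch the data and push nodes/relationships, e.g.:
        # self.iyp.batch_add_links('CATEGORIZED', links)
        ...

    def unit_test(self):
        # List every relationship type this crawler is expected to create.
        return super().unit_test(['CATEGORIZED'])
```
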
self.iyp.batch_add_links('NAME', links) + def unit_test(self): + return super().unit_test(['NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -85,7 +88,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/example/crawler.py b/iyp/crawlers/example/crawler.py index 63c94ed..e2b4b74 100644 --- a/iyp/crawlers/example/crawler.py +++ b/iyp/crawlers/example/crawler.py @@ -74,7 +74,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/iana/root_zone.py b/iyp/crawlers/iana/root_zone.py index 2ec8c1b..39d6224 100644 --- a/iyp/crawlers/iana/root_zone.py +++ b/iyp/crawlers/iana/root_zone.py @@ -107,6 +107,9 @@ def run(self): logging.info(f'Pushing {len(managed_by)} MANAGED_BY relationships.') self.iyp.batch_add_links('MANAGED_BY', managed_by_relationships) + def unit_test(self): + return super().unit_test(['RESOLVES_TO', 'MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -126,7 +129,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/__init__.py b/iyp/crawlers/ihr/__init__.py index 9ff8517..67f0863 100644 --- a/iyp/crawlers/ihr/__init__.py +++ b/iyp/crawlers/ihr/__init__.py @@ -101,3 +101,6 @@ def run(self): # Remove downloaded file os.remove(local_filename) + + def unit_test(self): + return super().unit_test(['DEPENDS_ON']) diff --git a/iyp/crawlers/ihr/country_dependency.py b/iyp/crawlers/ihr/country_dependency.py index 1a8d4af..81936b4 100644 --- a/iyp/crawlers/ihr/country_dependency.py +++ b/iyp/crawlers/ihr/country_dependency.py @@ -114,6 +114,9 @@ def run(self): # Push links to IYP self.iyp.batch_add_links('RANK', links) + def unit_test(self): + return super().unit_test(['RANK', 'COUNTRY']) + # Main program def main() -> None: @@ -134,7 +137,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/local_hegemony_v4.py b/iyp/crawlers/ihr/local_hegemony_v4.py index b0cc3b0..a61bd8e 100644 --- a/iyp/crawlers/ihr/local_hegemony_v4.py +++ b/iyp/crawlers/ihr/local_hegemony_v4.py @@ -36,7 +36,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/local_hegemony_v6.py b/iyp/crawlers/ihr/local_hegemony_v6.py index 6599a5d..7c24341 100644 --- a/iyp/crawlers/ihr/local_hegemony_v6.py +++ b/iyp/crawlers/ihr/local_hegemony_v6.py @@ -36,7 +36,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/rov.py b/iyp/crawlers/ihr/rov.py index 200493a..3e9dfd7 100644 --- a/iyp/crawlers/ihr/rov.py +++ b/iyp/crawlers/ihr/rov.py @@ -176,6 +176,9 @@ def run(self): # Remove downloaded file os.remove(local_filename) + def unit_test(self): + return super().unit_test(['ORIGINATE', 'CATEGORIZED', 'DEPENDS_ON', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -195,7 +198,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if 
args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/inetintel/as_org.py b/iyp/crawlers/inetintel/as_org.py index 71676d9..a5720e3 100644 --- a/iyp/crawlers/inetintel/as_org.py +++ b/iyp/crawlers/inetintel/as_org.py @@ -185,6 +185,9 @@ def close(self): os.remove(self.filename) os.rmdir(self.tmpdir) + def unit_test(self): + return super().unit_test(['SIBLING_OF']) + def main() -> None: parser = argparse.ArgumentParser() @@ -204,7 +207,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/manrs/members.py b/iyp/crawlers/manrs/members.py index 22384bf..fa67140 100644 --- a/iyp/crawlers/manrs/members.py +++ b/iyp/crawlers/manrs/members.py @@ -129,6 +129,9 @@ def run(self): self.iyp.batch_add_links('COUNTRY', country_rels) self.iyp.batch_add_links('IMPLEMENT', implement_rels) + def unit_test(self): + return super().unit_test(['MEMBER_OF', 'IMPLEMENT', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -148,7 +151,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/nro/delegated_stats.py b/iyp/crawlers/nro/delegated_stats.py index 79ec931..ce07423 100644 --- a/iyp/crawlers/nro/delegated_stats.py +++ b/iyp/crawlers/nro/delegated_stats.py @@ -197,6 +197,9 @@ def run(self): for label, links in prefix_status_links.items(): self.iyp.batch_add_links(label, links) + def unit_test(self): + return super().unit_test(['AVAILABLE', 'ASSIGNED', 'RESERVED', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -216,7 +219,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py index b3f3cd5..dc02acb 100644 --- a/iyp/crawlers/openintel/__init__.py +++ b/iyp/crawlers/openintel/__init__.py @@ -239,6 +239,10 @@ def run(self): self.iyp.batch_add_links('MANAGED_BY', mng_links) self.iyp.batch_add_links('PART_OF', partof_links) + def unit_test(self): + # use different version depending on infra_ns vs others + return super().unit_test(['RESOLVES_TO', 'MANAGED_BY', 'PART_OF']) + class DnsgraphCrawler(BaseCrawler): @@ -382,3 +386,6 @@ def run(self): ns_id = [link['dst_id'] for link in links_managed_by] logging.info(f'Adding AuthoritativeNameServer label to {len(ns_id)} nodes') self.iyp.batch_add_node_label(ns_id, 'AuthoritativeNameServer') + + def unit_test(self): + return super().unit_test(['PARENT', 'PART_OF', 'ALIAS_OF', 'MANAGED_BY', 'RESOLVES_TO']) diff --git a/iyp/crawlers/openintel/dnsgraph_jp.py b/iyp/crawlers/openintel/dnsgraph_jp.py index c8457d1..30dd210 100644 --- a/iyp/crawlers/openintel/dnsgraph_jp.py +++ b/iyp/crawlers/openintel/dnsgraph_jp.py @@ -33,7 +33,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/dnsgraph_nl.py b/iyp/crawlers/openintel/dnsgraph_nl.py index 7977ea8..7b29c3b 100644 --- a/iyp/crawlers/openintel/dnsgraph_nl.py +++ b/iyp/crawlers/openintel/dnsgraph_nl.py @@ -33,7 +33,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - 
crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/dnsgraph_rdns.py b/iyp/crawlers/openintel/dnsgraph_rdns.py index 30210e5..a9730b4 100644 --- a/iyp/crawlers/openintel/dnsgraph_rdns.py +++ b/iyp/crawlers/openintel/dnsgraph_rdns.py @@ -33,7 +33,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/infra_mx.py b/iyp/crawlers/openintel/infra_mx.py index 3841c77..4c33f94 100644 --- a/iyp/crawlers/openintel/infra_mx.py +++ b/iyp/crawlers/openintel/infra_mx.py @@ -42,7 +42,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/infra_ns.py b/iyp/crawlers/openintel/infra_ns.py index a4395c6..91dfb8d 100644 --- a/iyp/crawlers/openintel/infra_ns.py +++ b/iyp/crawlers/openintel/infra_ns.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/tranco1m.py b/iyp/crawlers/openintel/tranco1m.py index c9dd706..d459d63 100644 --- a/iyp/crawlers/openintel/tranco1m.py +++ b/iyp/crawlers/openintel/tranco1m.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/umbrella1m.py b/iyp/crawlers/openintel/umbrella1m.py index 2b7b606..9b3ccde 100644 --- a/iyp/crawlers/openintel/umbrella1m.py +++ b/iyp/crawlers/openintel/umbrella1m.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/pch/__init__.py b/iyp/crawlers/pch/__init__.py index 071fd10..db023a4 100644 --- a/iyp/crawlers/pch/__init__.py +++ b/iyp/crawlers/pch/__init__.py @@ -326,3 +326,6 @@ def run(self) -> None: # Clear cache. 
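
Note: because `unit_test()` no longer takes the logging module, every crawler's `main()` is reduced to the same dispatch: run the existence check when the unit-test flag is set, otherwise run the crawler and commit. A sketch of that entry point as it sits inside a crawler module (where `Crawler`, `ORG`, `URL`, and `NAME` are defined); the exact argparse flag name is not visible in this diff, so `--unit-test` is an assumption:

```python
import argparse


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('--unit-test', action='store_true')  # assumed flag name
    args = parser.parse_args()

    crawler = Crawler(ORG, URL, NAME)
    if args.unit_test:
        crawler.unit_test()   # existence check only; no new data is fetched
    else:
        crawler.run()
        crawler.close()       # commit changes to IYP
```
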
self.cache_handler.clear_cache() + + def unit_test(self): + return super().unit_test(['ORIGINATE']) diff --git a/iyp/crawlers/pch/daily_routing_snapshots_v4.py b/iyp/crawlers/pch/daily_routing_snapshots_v4.py index 2f5fa4d..649ee5e 100644 --- a/iyp/crawlers/pch/daily_routing_snapshots_v4.py +++ b/iyp/crawlers/pch/daily_routing_snapshots_v4.py @@ -34,7 +34,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/pch/daily_routing_snapshots_v6.py b/iyp/crawlers/pch/daily_routing_snapshots_v6.py index a2ff010..3915769 100644 --- a/iyp/crawlers/pch/daily_routing_snapshots_v6.py +++ b/iyp/crawlers/pch/daily_routing_snapshots_v6.py @@ -34,7 +34,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/peeringdb/fac.py b/iyp/crawlers/peeringdb/fac.py index d86dbd6..ebf012f 100644 --- a/iyp/crawlers/peeringdb/fac.py +++ b/iyp/crawlers/peeringdb/fac.py @@ -131,6 +131,9 @@ def run(self): self.iyp.batch_add_links('EXTERNAL_ID', facid_links) self.iyp.batch_add_links('MANAGED_BY', org_links) + def unit_test(self): + return super().unit_test(['NAME', 'WEBSITE', 'COUNTRY', 'EXTERNAL_ID', 'MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -150,7 +153,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/peeringdb/ix.py b/iyp/crawlers/peeringdb/ix.py index 8cb8c28..267c512 100644 --- a/iyp/crawlers/peeringdb/ix.py +++ b/iyp/crawlers/peeringdb/ix.py @@ -393,6 +393,9 @@ def register_ixs(self): self.iyp.batch_add_links('EXTERNAL_ID', id_links) self.iyp.batch_add_links('NAME', name_links) + def unit_test(self): + return super().unit_test(['MANAGED_BY', 'LOCATED_IN', 'COUNTRY', 'WEBSITE', 'EXTERNAL_ID', 'NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -412,7 +415,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/peeringdb/org.py b/iyp/crawlers/peeringdb/org.py index f1f1beb..aa0a2c7 100644 --- a/iyp/crawlers/peeringdb/org.py +++ b/iyp/crawlers/peeringdb/org.py @@ -119,6 +119,9 @@ def run(self): self.iyp.batch_add_links('COUNTRY', country_links) self.iyp.batch_add_links('EXTERNAL_ID', orgid_links) + def unit_test(self): + return super().unit_test(['NAME', 'WEBSITE', 'COUNTRY', 'EXTERNAL_ID']) + def main() -> None: parser = argparse.ArgumentParser() @@ -138,7 +141,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ripe/as_names.py b/iyp/crawlers/ripe/as_names.py index 609cdca..9e163ad 100644 --- a/iyp/crawlers/ripe/as_names.py +++ b/iyp/crawlers/ripe/as_names.py @@ -69,6 +69,9 @@ def run(self): self.iyp.batch_add_links('NAME', name_links) self.iyp.batch_add_links('COUNTRY', country_links) + def unit_test(self): + return super().unit_test(['NAME', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -88,7 +91,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff 
--git a/iyp/crawlers/ripe/atlas_measurements.py b/iyp/crawlers/ripe/atlas_measurements.py index 25c7b77..eb60f6c 100644 --- a/iyp/crawlers/ripe/atlas_measurements.py +++ b/iyp/crawlers/ripe/atlas_measurements.py @@ -258,6 +258,9 @@ def run(self): self.iyp.batch_add_links('PART_OF', part_of_links) logging.info('Done.') + def unit_test(self): + return super().unit_test(['PART_OF', 'TARGET']) + def main() -> None: parser = argparse.ArgumentParser() @@ -278,7 +281,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ripe/atlas_probes.py b/iyp/crawlers/ripe/atlas_probes.py index 2a94c61..4782099 100644 --- a/iyp/crawlers/ripe/atlas_probes.py +++ b/iyp/crawlers/ripe/atlas_probes.py @@ -175,6 +175,9 @@ def run(self): self.iyp.batch_add_links('LOCATED_IN', located_in_links) self.iyp.batch_add_links('COUNTRY', country_links) + def unit_test(self): + return super().unit_test(['ASSIGNED', 'LOCATED_IN', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -195,7 +198,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ripe/roa.py b/iyp/crawlers/ripe/roa.py index 7c1e162..c66d1a7 100644 --- a/iyp/crawlers/ripe/roa.py +++ b/iyp/crawlers/ripe/roa.py @@ -100,6 +100,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('ROUTE_ORIGIN_AUTHORIZATION', links) + def unit_test(self): + return super().unit_test(['ROUTE_ORIGIN_AUTHORIZATION']) + def main() -> None: parser = argparse.ArgumentParser() @@ -119,7 +122,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/simulamet/rirdata_rdns.py b/iyp/crawlers/simulamet/rirdata_rdns.py index bc0994d..c226898 100644 --- a/iyp/crawlers/simulamet/rirdata_rdns.py +++ b/iyp/crawlers/simulamet/rirdata_rdns.py @@ -123,6 +123,9 @@ def run(self): self.iyp.batch_add_links('MANAGED_BY', links_managed_by) + def unit_test(self): + return super().unit_test(['MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -142,7 +145,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/stanford/asdb.py b/iyp/crawlers/stanford/asdb.py index b32186c..133b623 100644 --- a/iyp/crawlers/stanford/asdb.py +++ b/iyp/crawlers/stanford/asdb.py @@ -126,6 +126,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -145,7 +148,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/tranco/top1m.py b/iyp/crawlers/tranco/top1m.py index 6b4eab8..c2cfb37 100644 --- a/iyp/crawlers/tranco/top1m.py +++ b/iyp/crawlers/tranco/top1m.py @@ -65,6 +65,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('RANK', links) + def unit_test(self): + return super().unit_test(['RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -84,7 +87,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if 
args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/virginiatech/rovista.py b/iyp/crawlers/virginiatech/rovista.py index b72e734..c31585c 100644 --- a/iyp/crawlers/virginiatech/rovista.py +++ b/iyp/crawlers/virginiatech/rovista.py @@ -75,6 +75,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -94,7 +97,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/worldbank/country_pop.py b/iyp/crawlers/worldbank/country_pop.py index ea19f0c..1b05c23 100644 --- a/iyp/crawlers/worldbank/country_pop.py +++ b/iyp/crawlers/worldbank/country_pop.py @@ -64,6 +64,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('POPULATION', links) + def unit_test(self): + return super().unit_test(['POPULATION']) + def main() -> None: parser = argparse.ArgumentParser() @@ -83,7 +86,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/requirements.txt b/requirements.txt index 00afa5c..54fbbb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,6 @@ flake8 pre-commit PyGithub clickhouse_driver -pyspark \ No newline at end of file +pyspark +paramiko +scp
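
Note: the two new entries in requirements.txt (paramiko and scp) back the optional archive step added to the main script: after a successful run, the dated dump directory is copied to the IHR archive host over SSH. A minimal sketch of that step, assuming the same config.json keys used above (archive.host, archive.user, archive.base_path):

```python
import logging
import os

import paramiko
from scp import SCPClient


def push_to_archive(conf: dict, dump_dir: str, date_path: str) -> None:
    """Copy `dump_dir` (e.g. dumps/2024/08/14) to the configured archive host."""
    # Silence paramiko's INFO-level chatter.
    logging.getLogger('paramiko').setLevel(logging.WARNING)

    ssh = paramiko.SSHClient()
    ssh.load_system_host_keys()
    ssh.connect(conf['archive']['host'], username=conf['archive']['user'])

    dest = os.path.join(conf['archive']['base_path'], date_path)
    ssh.exec_command(f'mkdir -p {dest}')

    with SCPClient(ssh.get_transport()) as scp:
        scp.put(dump_dir, recursive=True, remote_path=dest)

    ssh.close()
```
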