From c7d86cd9d422bd84e49aaa27f2b28662f653d792 Mon Sep 17 00:00:00 2001
From: "romain.fontugne"
Date: Wed, 14 Aug 2024 17:42:21 +0900
Subject: [PATCH 1/9] add tables for data sources, node types, and relationship
 types

---
 documentation/data-sources.md       | 41 +++++++++++++++++++++++++++++
 documentation/node_types.md         | 30 +++++++++++++++++++++
 documentation/relationship_types.md | 31 ++++++++++++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100644 documentation/data-sources.md
 create mode 100644 documentation/node_types.md
 create mode 100644 documentation/relationship_types.md

diff --git a/documentation/data-sources.md b/documentation/data-sources.md
new file mode 100644
index 0000000..9f02c28
--- /dev/null
+++ b/documentation/data-sources.md
@@ -0,0 +1,41 @@
+
+# Datasets and corresponding organizations providing data to IYP
+
+
+| Organization | Dataset Name / Description | URL |
+|-----------------------------|----------------------------------------------|-----------------------------------------------------------------------|
+| Alice-LG | IXP route server looking glass snapshots | https://github.com/alice-lg/alice-lg |
+| | AMS-IX | https://lg.ams-ix.net |
+| | BCIX | https://lg.bcix.de |
+| | DE-CIX | https://lg.de-cix.net |
+| | IX.br | https://lg.ix.br |
+| | LINX | https://alice-rs.linx.net |
+| | Megaport | https://lg.megaport.com |
+| | Netnod | https://lg.netnod.se |
+| APNIC | AS population estimate | https://stats.labs.apnic.net/aspop |
+| BGPKIT | as2rel, peer-stats, pfx2as | https://data.bgpkit.com |
+| BGP.Tools | AS names, AS tags | https://bgp.tools/kb/api |
+| | Anycast prefix tags | https://github.com/bgptools/anycast-prefixes |
+| CAIDA | AS Rank | https://doi.org/10.21986/CAIDA.DATA.AS-RANK |
+| | IXPs Dataset | https://www.caida.org/catalog/datasets/ixps |
+| Cisco | Umbrella Popularity List | https://s3-us-west-1.amazonaws.com/umbrella-static/index.html |
+| Citizen Lab | URL testing lists | https://github.com/citizenlab/test-lists |
+| Cloudflare | Cloudflare Radar API endpoints | https://radar.cloudflare.com |
+| | radar/dns/top/ases, radar/dns/top/locations, | |
+| | radar/ranking/top, radar/datasets | |
+| Emile Aben | AS names | https://github.com/emileaben/asnames |
+| IHR | Country Dependency, AS Hegemony, ROV | https://ihr.iijlab.net |
+| Internet Intelligence Lab | AS to Organization Mapping | https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping |
+| NRO | Extended allocation and assignment reports | https://www.nro.net/about/rirs/statistics |
+| OpenINTEL | tranco1m, umbrella1m, ns | https://data.openintel.nl/data |
+| | DNS Dependency Graph | https://dnsgraph.dacs.utwente.nl |
+| Packet Clearing House | Daily routing snapshots | https://www.pch.net/resources/Routing_Data |
+| PeeringDB | API endpoints: fac, ix, ixlan, netfac, org | https://www.peeringdb.com |
+| RIPE NCC | AS names, RPKI | https://ftp.ripe.net/ripe |
+| | RIPE Atlas measurement information | https://atlas.ripe.net |
+| SimulaMet | rDNS data | https://rir-data.org |
+| Stanford | ASdb dataset | https://asdb.stanford.edu |
+| Tranco | Tranco list | https://tranco-list.eu |
+| Virginia Tech | RoVista | https://rovista.netsecurelab.org |
+| World Bank | Indicators API: Country Population Indicator | https://www.worldbank.org |
+
diff --git a/documentation/node_types.md b/documentation/node_types.md
new file mode 100644
index 0000000..84cd037
--- /dev/null
+++ b/documentation/node_types.md
@@ -0,0 +1,30 @@
+
+# Node types available in IYP
+
+| Node types | Description |
+|-------------------------|-----------------------------------------------------------------------------------------------------------------------------------|
+| AS | Autonomous System, uniquely identified with the **asn** property. |
+| AtlasMeasurement | RIPE Atlas Measurement, uniquely identified with the **id** property. |
+| AtlasProbe | RIPE Atlas probe, uniquely identified with the **id** property. |
+| AuthoritativeNameServer | Authoritative DNS nameserver for a set of domain names, uniquely identified with the **name** property. |
+| BGPCollector | A RIPE RIS or RouteViews BGP collector, uniquely identified with the **name** property. |
+| CaidaIXID | Unique identifier for IXPs from CAIDA's IXP dataset. |
+| Country | Represent an economy, uniquely identified by either its two- or three-character code (properties **country_code** and **alpha3**). |
+| DomainName | Any DNS domain name that is not a FQDN (see HostName), uniquely identified by the **name** property. |
+| Estimate | Represent a report that approximates a quantity, for example the World Bank population estimate. |
+| Facility | Co-location facility for IXPs and ASes, uniquely identified by the **name** property. |
+| HostName | A fully qualified domain name uniquely identified by the **name** property. |
+| IP | An IPv4 or IPv6 address uniquely identified by the **ip** property. The **af** property (address family) provides the IP version of the address.|
+| IXP | An Internet Exchange Point, loosely identified by the **name** property or using related IDs (see the EXTERNAL_ID relationship). |
+| Name | Represent a name that could be associated with a network resource (e.g. an AS), uniquely identified by the **name** property. |
+| OpaqueID | Represent the opaque-id value found in RIRs' delegated files. Resources related to the same opaque-id are registered to the same resource holder. Uniquely identified by the **id** property.|
+| Organization | Represent an organization and is loosely identified by the **name** property or using related IDs (see the EXTERNAL_ID relationship).|
+| PeeringdbFacID | Unique identifier for a Facility as assigned by PeeringDB. |
+| PeeringdbIXID | Unique identifier for an IXP as assigned by PeeringDB. |
+| PeeringdbNetID | Unique identifier for an AS as assigned by PeeringDB. |
+| PeeringdbOrgID | Unique identifier for an Organization as assigned by PeeringDB. |
+| Prefix | An IPv4 or IPv6 prefix uniquely identified by the **prefix** property. The **af** property (address family) provides the IP version of the prefix.|
+| Ranking | Represent a specific ranking of Internet resources (e.g. CAIDA's ASRank or Tranco ranking). The rank value for each resource is given by the RANK relationship.|
+| Tag | The output of a classification. A tag can be the result of a manual or automated classification. Uniquely identified by the **label** property.|
+| URL | The full URL for an Internet resource, uniquely identified by the **url** property. |
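+
+For example, a node is retrieved by matching on its identifying property:
+
+```cypher
+// The AS node for AS2497 and the Prefix node for 8.8.8.0/24
+MATCH (a:AS {asn:2497}), (p:Prefix {prefix:'8.8.8.0/24'})
+RETURN a, p
+```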
+
diff --git a/documentation/relationship_types.md b/documentation/relationship_types.md
new file mode 100644
index 0000000..e86e4a9
--- /dev/null
+++ b/documentation/relationship_types.md
@@ -0,0 +1,31 @@
+
+# Relationships available in IYP
+
+
+| Relationship | Description |
+|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| ALIAS_OF | Equivalent to the CNAME record in DNS. It relates two HostNames. |
+| ASSIGNED | Represent the allocation by a RIR of a network resource (AS, Prefix) to a resource holder (see OpaqueID), or the assigned IP address of an AtlasProbe. |
+| AVAILABLE | Relate ASes and Prefixes to RIRs (in the form of an OpaqueID), meaning that the resource is not allocated and is available at the related RIR. |
+| CATEGORIZED | Relate a network resource (AS, Prefix, URL) to a Tag, meaning that the resource has been classified according to the Tag. The **reference_name** property provides the name of the original dataset/classifier. |
+| COUNTRY | Relate any node to its corresponding country. This relation may have different meanings depending on the original dataset (e.g. geo-location or registration). |
+| DEPENDS_ON | Relate an AS or Prefix to an AS, meaning that the reachability of the AS/Prefix depends on that AS. |
+| EXTERNAL_ID | Relate a node to an identifier commonly used by an organization. For example, PeeringDB assigns unique identifiers to IXPs (see PeeringdbIXID). |
+| LOCATED_IN | Location of a resource at a specific geographical or topological location. For example, the co-location Facility for an IXP, or the AS for an AtlasProbe. |
+| MANAGED_BY | Entity in charge of a network resource. For example, an AS is managed by an Organization, and a DomainName is managed by an AuthoritativeNameServer. |
+| MEMBER_OF | Represent membership in an organization. For example, an AS is a member of an IXP. |
+| NAME | Relate an entity to its usual or registered name. For example, the name of an AS. |
+| ORIGINATE | Relate a Prefix to an AS, meaning that the prefix is seen as being originated by that AS in BGP. |
+| PARENT | Relate two DomainNames and represent a zone cut between the parent zone and the more specific zone. |
+| PART_OF | Represent that one entity is a part of another. For example, an IP address is a part of an IP Prefix, a HostName is a part of a DomainName. |
+| PEERS_WITH | Represent the connection between two ASes as seen in BGP. It also includes peerings between ASes and BGPCollectors. |
+| POPULATION | Indicate that an AS hosts a certain fraction of the population of a country. It also represents the estimated population of a country. |
+| QUERIED_FROM | Relate a DomainName to an AS or Country, meaning that the AS or Country is among the top 100 ASes or Countries querying the DomainName the most (as reported by Cloudflare Radar). |
+| RANK | Relate a resource to a Ranking, meaning that the resource appears in the Ranking. The **rank** property gives the exact rank position. |
+| RESERVED | Indicate that an AS or Prefix is reserved for a certain purpose by RIRs or IANA. |
+| RESOLVES_TO | Relate a HostName to an IP address, meaning that a DNS resolution of the HostName returned the corresponding IP. |
+| ROUTE_ORIGIN_AUTHORIZATION | Relate an AS and a Prefix, meaning that the AS is authorized to originate the Prefix according to RPKI. |
+| SIBLING_OF | Relate ASes or Organizations, meaning that they represent the same entity. |
+| TARGET | Relate an AtlasMeasurement to an IP, HostName, or AS, meaning that an Atlas measurement is set up to probe that resource. |
+| WEBSITE | Relate a URL to an Organization, Facility, IXP, or AS, representing a common website for the resource. |
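+
+Relationship properties such as **reference_org** and **reference_name** (see CATEGORIZED above) record which dataset a statement comes from. For example, the name registered for AS2497 at RIPE NCC can be retrieved with:
+
+```cypher
+MATCH (a:AS {asn:2497})-[r:NAME {reference_org:'RIPE NCC'}]-(n:Name)
+RETURN n.name
+```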
+

From adfc33f4b3232360d1c85224becfe723a9471b3e Mon Sep 17 00:00:00 2001
From: Romain
Date: Wed, 14 Aug 2024 17:45:30 +0900
Subject: [PATCH 2/9] fix line break in data source table

---
 documentation/data-sources.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/documentation/data-sources.md b/documentation/data-sources.md
index 9f02c28..ee0943e 100644
--- a/documentation/data-sources.md
+++ b/documentation/data-sources.md
@@ -20,9 +20,8 @@
 | | IXPs Dataset | https://www.caida.org/catalog/datasets/ixps |
 | Cisco | Umbrella Popularity List | https://s3-us-west-1.amazonaws.com/umbrella-static/index.html |
 | Citizen Lab | URL testing lists | https://github.com/citizenlab/test-lists |
-| Cloudflare | Cloudflare Radar API endpoints | https://radar.cloudflare.com |
-| | radar/dns/top/ases, radar/dns/top/locations, | |
-| | radar/ranking/top, radar/datasets | |
+| Cloudflare | Cloudflare Radar API endpoints radar/dns/top/ases, radar/dns/top/locations, radar/ranking/top, radar/datasets | https://radar.cloudflare.com |
+| | |
 | Emile Aben | AS names | https://github.com/emileaben/asnames |
 | IHR | Country Dependency, AS Hegemony, ROV | https://ihr.iijlab.net |
 | Internet Intelligence Lab | AS to Organization Mapping | https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping |

From 1298c5e000898f848249fcce1caf94718004d5ba Mon Sep 17 00:00:00 2001
From: "romain.fontugne"
Date: Thu, 15 Aug 2024 16:34:24 +0900
Subject: [PATCH 3/9] First draft of documentation readme

---
 documentation/README.md | 48 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 documentation/README.md

diff --git a/documentation/README.md b/documentation/README.md
new file mode 100644
index 0000000..cfb767d
--- /dev/null
+++ b/documentation/README.md
@@ -0,0 +1,48 @@
+# IYP Documentation
+
+## IYP Ontology
+
+The lists of node and relationship types defined for IYP are available at:
+- [Node types](./node_types.md)
+- [Relationship types](./relationship_types.md)
+
+## IYP Data Sources
+
+The list of all datasets imported in IYP is available [here](data-sources.md).
+The datasets licences are available the [ACKNOWLEDGMENTS file](../ACKNOWLEDGMENTS.md).
+
+## IYP Gallery
+
+The [IYP gallery](./gallery.md) provides example queries to help users browse the database.
+
+## Importing a new dataset
+### Python crawler
+To import a new dataset in IYP, you should write a crawler for that dataset.
+The main tasks of a crawler are to fetch data, parse it, model it with the IYP
+ontology, and push it to the IYP database. Most of these tasks are assisted by
+the [IYP python library](../iyp/__init__.py). See the [example crawler](../iyp/crawlers/example/crawler.py) or [existing crawlers](../iyp/crawlers/) for getting started.
+See also the [IHR contributing guidelines](../CONTRIBUTING.md) and [best practices for writing crawlers](https://github.com/InternetHealthReport/internet-yellow-pages/discussions/128).
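+
+For illustration, a crawler roughly follows the structure sketched below. The helper
+names (e.g. `batch_get_nodes_by_single_prop`, `batch_add_links`, `self.reference`) are
+indicative only and may differ from the current IYP library; the
+[example crawler](../iyp/crawlers/example/crawler.py) is the authoritative reference.
+
+```python
+from iyp import BaseCrawler
+
+# Data source organization, dataset URL, and crawler name
+# (cf. module.ORG and module.URL used by create_db.py).
+ORG = 'Example Org'
+URL = 'https://example.com/as_names.csv'
+NAME = 'example.as_names'
+
+
+class Crawler(BaseCrawler):
+    def run(self):
+        # 1. Fetch and parse the dataset (a hypothetical list of AS names).
+        data = [(2497, 'IIJ'), (15169, 'GOOGLE')]
+
+        # 2. Get or create the corresponding nodes in bulk.
+        asns = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', {asn for asn, _ in data})
+        names = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', {name for _, name in data})
+
+        # 3. Model the data with the IYP ontology and push the relationships.
+        links = [{'src_id': asns[asn], 'dst_id': names[name], 'props': [self.reference]}
+                 for asn, name in data]
+        self.iyp.batch_add_links('NAME', links)
+```
+
+`create_db.py` instantiates the crawler as `Crawler(module.ORG, module.URL, name)` and
+calls its `run()`, `unit_test()`, and `close()` methods.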
+
+### README
+Each crawler should be accompanied by a README.md file. This is the main documentation
+for the crawler; it should contain:
+- a short description of the dataset,
+- any specificities related to the way the data is imported (e.g. time span, data cleaning),
+- examples of how the data is modeled,
+- dependencies on other crawlers (e.g. if the crawler requires data from another one).
+
+### Adding a crawler to IYP main branch
+If you wish your crawler to be part of the IYP weekly dumps, you can submit a [Pull Request](https://github.com/InternetHealthReport/internet-yellow-pages/pulls)
+to include the crawler in IYP's github repository main branch.
+
+Along with the python code and README, the addition of new datasets should also
+be reflected in the following files:
+- The list of [imported datasets](./data-sources.md).
+- The [ACKNOWLEDGMENTS.md](../ACKNOWLEDGMENTS.md) file should list the licence of all imported dataset.
+
+Furthermore, **any change to the ontology should be reflected in the documentation** ([Node types](./node_types.md) and [Relationship types](./relationship_types.md)).
+Changes to the ontology should be discussed in advance so that a consensus is
+reached before the ontology is updated either on [github discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions) or by reaching [IYP maintainers](mailto:iyp@ihr.live).
+
+You can also consider adding example queries to the [IYP gallery](./gallery.md),
+and organizations providing data to the [IYP frontpage]().

From efd4ced071805683d584506e3724d4c40ae72715 Mon Sep 17 00:00:00 2001
From: "romain.fontugne"
Date: Thu, 15 Aug 2024 16:41:41 +0900
Subject: [PATCH 4/9] fix typos in readme

---
 documentation/README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/documentation/README.md b/documentation/README.md
index cfb767d..c164a65 100644
--- a/documentation/README.md
+++ b/documentation/README.md
@@ -9,7 +9,7 @@ The lists of node and relationship types defined for IYP are available at:
 ## IYP Data Sources
 
 The list of all datasets imported in IYP is available [here](data-sources.md).
-The datasets licences are available the [ACKNOWLEDGMENTS file](../ACKNOWLEDGMENTS.md).
+The dataset licences are available in the [IYP acknowledgments](../ACKNOWLEDGMENTS.md).
 
 ## IYP Gallery
 
@@ -37,12 +37,12 @@ to include the crawler in IYP's github repository main branch.
 
 Along with the python code and README, the addition of new datasets should also
 be reflected in the following files:
-- The list of [imported datasets](./data-sources.md).
-- The [ACKNOWLEDGMENTS.md](../ACKNOWLEDGMENTS.md) file should list the licence of all imported dataset.
+- the list of [imported datasets](./data-sources.md),
+- the [IYP acknowledgments](../ACKNOWLEDGMENTS.md) file should list the licences of all imported datasets.
 
-Furthermore, **any change to the ontology should be reflected in the documentation** ([Node types](./node_types.md) and [Relationship types](./relationship_types.md)).
-Changes to the ontology should be discussed in advance so that a consensus is
-reached before the ontology is updated either on [github discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions) or by reaching [IYP maintainers](mailto:iyp@ihr.live).
+Changes to the ontology should be discussed in advance, either on [github discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions) or by contacting [IYP maintainers](mailto:iyp@ihr.live),
+so that a consensus is reached before the ontology is updated.
+**Any change to the ontology should be reflected in the documentation** ([Node types](./node_types.md) and [Relationship types](./relationship_types.md)). You can also consider adding example queries to the [IYP gallery](./gallery.md), and organizations providing data to the [IYP frontpage](). From 00a3bcd46c419bf65ebd889a0969d9fe01a330ba Mon Sep 17 00:00:00 2001 From: "romain.fontugne" Date: Wed, 21 Aug 2024 11:31:21 +0900 Subject: [PATCH 5/9] update gallery --- documentation/gallery.md | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/documentation/gallery.md b/documentation/gallery.md index 802a0b0..1ed6b89 100644 --- a/documentation/gallery.md +++ b/documentation/gallery.md @@ -1,19 +1,11 @@ # IYP Gallery +Below are examples queries that you can copy/paste in [Neo4j browser](https://iyp.iijlab.net/iyp/browser/?dbms=iyp-bolt.iijlab.net:443). + Querying the IYP database requires to be familiar with: - Cypher, Neo4j's query langage https://neo4j.com/docs/getting-started/current/cypher-intro/ - Basic networking knowledge (IP, prefixes, ASes, etc..) -- IYP ontology (TODO link to ontology) - -Below are examples queries that you can copy/paste in Neo4j browser: -1. [Names for AS2497](#names-for-as2497) -2. [All nodes related to 8.8.8.0/24](#all-nodes-related-to-888024) -3. [Country code of AS2497 in delegated files](#country-code-of-as2497-in-delegated-files) -4. [Countries of IXPs where AS2497 is present](#countries-of-ixps-where-as2497-is-present) -5. [Top domain names hosted by AS2497](#top-domain-names-hosted-by-as2497) -6. [ASes hosting top domain names in Japan](#ases-hosting-top-domain-names-in-japan) -7. [Topology for AS2501's dependencies](#topology-for-as2501s-dependencies) - +- [IYP ontology](./README.md) ### Names for AS2497 Find 'Name' nodes directly connected to the node corresponding to AS2497. @@ -95,8 +87,18 @@ RETURN p ``` ![Dependencies for AS2501](/documentation/assets/gallery/as2501dependencies.svg) -### List of IPs for RIPE RIS full feeds +### List of IPs for RIPE RIS full feed peers (more than 800k prefixes) + +```cypher +MATCH (n:BGPCollector)-[p:PEERS_WITH]-(a:AS) +WHERE n.project = 'riperis' AND p.num_v4_pfxs > 800000 +RETURN n.name, COUNT(DISTINCT p.ip) AS nb_full, COLLECT(DISTINCT p.ip) AS ips_full +``` + +### Active RIPE Atlas probes for the top 5 ISPs in Japan ```cypher -MATCH (n:BGPCollector)-[p:PEERS_WITH]-(a:AS) WHERE n.project = 'riperis' AND p.num_v4_pfxs > 800000 RETURN n.name, COUNT(DISTINCT p.ip) AS nb_full, COLLECT(DISTINCT p.ip) AS ips_full +MATCH (pb:AtlasProbe)-[:LOCATED_IN]-(a:AS)-[pop:POPULATION]-(c:Country) +WHERE c.country_code = 'JP' AND pb.status_name = 'Connected' AND pop.rank <= 5 +RETURN pop.rank, a.asn, COLLECT(pb.id) AS probe_ids ORDER BY pop.rank ``` From f9481fb0afe9cf1efaeead0a7bd4e648ae921481 Mon Sep 17 00:00:00 2001 From: "romain.fontugne" Date: Wed, 21 Aug 2024 12:07:35 +0900 Subject: [PATCH 6/9] Do not allow file URLS --- public/conf_notls/neo4j.conf | 3 ++- public/conf_tls/neo4j.conf | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/public/conf_notls/neo4j.conf b/public/conf_notls/neo4j.conf index 2bde2bf..6f1cd16 100644 --- a/public/conf_notls/neo4j.conf +++ b/public/conf_notls/neo4j.conf @@ -205,7 +205,8 @@ server.http.enabled=true # Determines if Cypher will allow using file URLs when loading data using # `LOAD CSV`. Setting this value to `false` will cause Neo4j to fail `LOAD CSV` # clauses that load data from the file system. 
-#dbms.security.allow_csv_import_from_file_urls=true +dbms.security.allow_csv_import_from_file_urls=false +dbms.security.allow_file_urls=false # Value of the Access-Control-Allow-Origin header sent over any HTTP or HTTPS diff --git a/public/conf_tls/neo4j.conf b/public/conf_tls/neo4j.conf index 6c22a0f..64d2644 100644 --- a/public/conf_tls/neo4j.conf +++ b/public/conf_tls/neo4j.conf @@ -205,7 +205,8 @@ dbms.ssl.policy.https.public_certificate=neo4j.cert # Determines if Cypher will allow using file URLs when loading data using # `LOAD CSV`. Setting this value to `false` will cause Neo4j to fail `LOAD CSV` # clauses that load data from the file system. -#dbms.security.allow_csv_import_from_file_urls=true +dbms.security.allow_csv_import_from_file_urls=false +dbms.security.allow_file_urls=false # Value of the Access-Control-Allow-Origin header sent over any HTTP or HTTPS From faa14c3b19a87bad4948f4856f0fb6e62f1c0c2b Mon Sep 17 00:00:00 2001 From: Romain Fontugne Date: Wed, 21 Aug 2024 11:45:25 +0000 Subject: [PATCH 7/9] setup neo4j config to disable load on local file --- public/conf_notls/neo4j.conf | 4 +--- public/conf_tls/neo4j.conf | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/public/conf_notls/neo4j.conf b/public/conf_notls/neo4j.conf index 6f1cd16..86409a8 100644 --- a/public/conf_notls/neo4j.conf +++ b/public/conf_notls/neo4j.conf @@ -206,8 +206,6 @@ server.http.enabled=true # `LOAD CSV`. Setting this value to `false` will cause Neo4j to fail `LOAD CSV` # clauses that load data from the file system. dbms.security.allow_csv_import_from_file_urls=false -dbms.security.allow_file_urls=false - # Value of the Access-Control-Allow-Origin header sent over any HTTP or HTTPS # connector. This defaults to '*', which allows broadest compatibility. Note @@ -243,7 +241,7 @@ server.databases.default_to_read_only=true # A comma separated list of procedures to be loaded by default. # Leaving this unconfigured will load all procedures found. -#dbms.security.procedures.allowlist=apoc.coll.*,apoc.load.*,gds.* +dbms.security.procedures.allowlist= #******************************************************************** # JVM Parameters diff --git a/public/conf_tls/neo4j.conf b/public/conf_tls/neo4j.conf index 64d2644..b15425c 100644 --- a/public/conf_tls/neo4j.conf +++ b/public/conf_tls/neo4j.conf @@ -206,8 +206,6 @@ dbms.ssl.policy.https.public_certificate=neo4j.cert # `LOAD CSV`. Setting this value to `false` will cause Neo4j to fail `LOAD CSV` # clauses that load data from the file system. dbms.security.allow_csv_import_from_file_urls=false -dbms.security.allow_file_urls=false - # Value of the Access-Control-Allow-Origin header sent over any HTTP or HTTPS # connector. This defaults to '*', which allows broadest compatibility. 
Note From 2a2e8c4054b178cbc0a4c40349d1cda629831efe Mon Sep 17 00:00:00 2001 From: "romain.fontugne" Date: Wed, 28 Aug 2024 15:22:34 +0900 Subject: [PATCH 8/9] cleaning of documentation files --- documentation/cn.md | 142 ----------------------------------------- documentation/iij.md | 62 ------------------ documentation/ua_ru.md | 72 --------------------- 3 files changed, 276 deletions(-) delete mode 100644 documentation/cn.md delete mode 100644 documentation/iij.md delete mode 100644 documentation/ua_ru.md diff --git a/documentation/cn.md b/documentation/cn.md deleted file mode 100644 index a8e99d6..0000000 --- a/documentation/cn.md +++ /dev/null @@ -1,142 +0,0 @@ -# A look into the Chinese Internet - -ASNs: -- China Telecom: 4134, 23764 (CTGNet replacing 4809 overseas) -- China Telecom 5G network?: 131285 - -Notes: -- some international ASes have HK country code (e.g., telstra) - - -Interesting links: -- China Telecom map: https://www.chinatelecomasiapacific.com/jp/wp-content/uploads/2021/03/2021-CTG-Infrastructure-Map-8M.pdf -- China Telecom peering policy: https://2021v.peeringasia.com/files/NewCTPeeringPolicy.pdf - - -## General stats - -Number of ASes registered in China/HK: -```cypher -MATCH (a:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country) -WHERE cc.country_code = 'CN' OR cc.country_code = 'HK' -RETURN cc.country_code, count(DISTINCT a) -``` - -| "cc.country_code" | "count(DISTINCT a)" | -|-------------------|---------------------| -| "CN" | 6508 | -| "HK" | 1198 | - - -Number of active Chinese/HK ASes: -```cypher -MATCH (a:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country), (a)-[:ORIGINATE]-(:Prefix) -WHERE cc.country_code = 'CN' OR cc.country_code = 'HK' -RETURN cc, count(DISTINCT a) -``` -| "cc" | "count(DISTINCT a)" | -|-----------------------|---------------------| -| {"country_code":"CN"} | 5071 | -| {"country_code":"HK"} | 683 | - -Facilities for CT (4134, 23764): -```cypher -MATCH (a:AS)--(fac:Facility) -WHERE a.asn IN [4134, 23764] -RETURN a, fac -``` - -Country codes for these facilities: -```cypher -MATCH (a:AS)--(fac:Facility)--(cc:Country) -WHERE a.asn IN [4134, 23764] -RETURN DISTINCT cc.country_code -``` - -Country codes for all ASes registered for the opaque ID: -```cypher -MATCH (a:AS)-[:ASSIGNED]-(oid:OpaqueID) -WHERE a.asn IN [4134, 23764] -WITH oid -MATCH (oid)--(other:AS)--(fac:Facility)--(cc:Country) -RETURN DISTINCT cc.country_code, collect(DISTINCT other.asn) -``` - -| "cc.country_code" | "collect(DISTINCT other.asn)" | -|-------------------|-------------------------------| -| "SG" | [131285,23764] | -| "ID" | [131285] | -| "KE" | [4809] | -| "BR" | [4809,4134] | -| "AE" | [4809] | -| "HK" | [4809,23764] | -| "ZA" | [4809,23764] | -| "US" | [4134] | -| "DE" | [4134,23764] | -| "NL" | [4134] | -| "GB" | [4134,23764] | -| "JP" | [23764] | -| "FR" | [23764] | - -## Co-location facilities - -Facilities in China/HK: -```cypher -MATCH (c:Country)--(f:Facility)--(a:AS) -WHERE c.country_code = 'CN' OR c.country_code = 'HK' -RETURN f, count(DISTINCT a) AS nb_as ORDER BY nb_as DESC -``` - -Facilities where Chinese ASes are (28): -```cypher -MATCH (net_country:Country)-[:COUNTRY {reference_org:'NRO'}]-(net:AS)-[:LOCATED_IN]-(fac:Facility)--(fac_country:Country) -WHERE net_country.country_code = 'CN' -RETURN fac_country.country_code, count(DISTINCT net) AS nb_AS ORDER BY nb_AS DESC -``` - -ASes present at the largest number of facilities: -```cypher -MATCH (net_country:Country)-[:COUNTRY 
{reference_org:'NRO'}]-(net:AS)-[:LOCATED_IN]-(fac:Facility)--(fac_country:Country) -WHERE net_country.country_code = 'HK' OR net_country.country_code = 'CN' -RETURN net.asn, net_country.country_code, count(DISTINCT fac_country) AS nb_fac ORDER BY nb_fac DESC -``` - -## Prefix geolocation -Geolocation of prefixes announced by Chinese ASes (54): -```cypher -MATCH (net_country:Country)-[:COUNTRY {reference_org:'NRO'}]-(net:AS)-[:ORIGINATE]-(pfx:Prefix)-[:COUNTRY {reference_org:'Internet Health Report'}]-(pfx_country:Country) -WHERE net_country.country_code = 'CN' -RETURN pfx_country.country_code, count(DISTINCT net) AS nb_AS ORDER BY nb_AS DESC -``` - -Geolocation of prefixes announced by HK ASes (81): -```cypher -MATCH (net_country:Country)-[:COUNTRY {reference_org:'NRO'}]-(net:AS)-[:ORIGINATE]-(pfx:Prefix)-[:COUNTRY {reference_org:'Internet Health Report'}]-(pfx_country:Country) -WHERE net_country.country_code = 'HK' -RETURN pfx_country.country_code, count(DISTINCT net) AS nb_AS ORDER BY nb_AS DESC -``` - -ASes with the largest footprint: -```cypher -MATCH (net_country:Country)-[:COUNTRY {reference_org:'NRO'}]-(net:AS)-[:ORIGINATE]-(pfx:Prefix)--(pfx_country:Country) -WHERE net_country.country_code = 'HK' OR net_country.country_code = 'CN' -RETURN net.asn, net_country.country_code, count(DISTINCT pfx_country) AS nb_pfx ORDER BY nb_pfx DESC -``` - -## Tier-1 comparison - -### Domain name distribution - -Graph (top10k): -```cypher -MATCH (:Ranking)-[r:RANK]-(dn:DomainName)--(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(net:AS) -WHERE dn.name ENDS WITH '.cn' AND r.rank<10000 -RETURN dn, ip, pfx, net -``` - -Table (top1M): -```cypher -MATCH (:Ranking)-[r:RANK]-(dn:DomainName)--(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(net:AS) -WHERE dn.name ENDS WITH '.cn' AND r.rank<100000 -RETURN net.asn, count(DISTINCT dn) AS nb_domain_name ORDER BY nb_domain_name DESC -``` diff --git a/documentation/iij.md b/documentation/iij.md deleted file mode 100644 index 9ff35fc..0000000 --- a/documentation/iij.md +++ /dev/null @@ -1,62 +0,0 @@ -# Looking into one AS: AS2497, IIJ - -## IIJ's geographical footprint -#### IXP where IIJ is present -```cypher -MATCH (iij:AS {asn:2497})-[:MEMBER_OF]-(ix:IXP)--(cc:Country) RETURN iij, ix, cc -``` - -### Facilities where IIJ is present -```cypher -MATCH (iij:AS {asn:2497})--(ix:Facility)--(cc:Country) RETURN iij, ix, cc -``` - -#### Geolocation of prefixes announced by IIJ -```cypher -MATCH (iij:AS {asn:2497})-[:ORIGINATE]-(pfx:Prefix)--(cc:Country) RETURN iij, pfx, cc -``` - -#### Geolocation of prefixes announced by IIJ's customer -graph: -```cypher -MATCH (iij:AS {asn:2497})<-[:DEPENDS_ON]-(customer:AS)-[:ORIGINATE]-(pfx:Prefix)--(cc:Country) RETURN iij, customer, pfx, cc -``` -table: -```cypher -MATCH (iij:AS {asn:2497})<-[dep:DEPENDS_ON]-(customer:AS)-[:ORIGINATE]-(pfx:Prefix)--(cc:Country) -RETURN cc.country_code, count(DISTINCT customer) AS nb_dep ORDER BY nb_dep DESC -``` - -## IIJ's logical (DNS) footprint -### Top domain names that map to prefixes originated from AS2497 -```cypher -MATCH (n:AS {asn:2497})-[:ORIGINATE]-(p:Prefix)--(i:IP)--(d:DomainName)-[r:RANK]-(:Ranking) WHERE r.rank<10000 RETURN n,p,i,d -``` - -### Top domain names that are related to AS2497 (including domains that map to prefixes orginated by AS2497 and prefixes that depends on AS2497) -```cypher -MATCH (n:AS {asn:2497})--(p:Prefix)--(i:IP)--(d:DomainName)-[r:RANK]-(:Ranking) WHERE r.rank<10000 RETURN n,p,i,d -``` - -## IIJ's main competitors -```cypher -MATCH (comp:AS)-[:PEERS_WITH 
{rel:1}]->(customer:AS)<-[:PEERS_WITH {rel:1}]-(iij:AS {asn:2497}) -WITH comp, customer OPTIONAL MATCH (comp)-[:NAME {reference_org:'RIPE NCC'}]-(comp_name:Name) -RETURN comp, comp_name, count(DISTINCT customer) AS nb_customer ORDER BY nb_customer DESC -``` - -## Top Japanese domains -Graph: -```cypher -MATCH (:Ranking)-[r:RANK]-(dn:DomainName)--(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(net:AS) -WHERE dn.name ENDS WITH '.jp' AND r.rank<10000 -RETURN dn, ip, pfx, net -``` - -Table: -```cypher -MATCH (:Ranking)-[r:RANK]-(dn:DomainName)--(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(net:AS) -WHERE dn.name ENDS WITH '.jp' AND r.rank<100000 -WITH net, dn OPTIONAL MATCH (net:AS)-[:NAME {reference_org:'RIPE NCC'}]-(net_name:Name) -RETURN net.asn, net_name.name, count(DISTINCT dn) AS nb_domain_name ORDER BY nb_domain_name DESC -``` diff --git a/documentation/ua_ru.md b/documentation/ua_ru.md deleted file mode 100644 index a16ff2c..0000000 --- a/documentation/ua_ru.md +++ /dev/null @@ -1,72 +0,0 @@ -# Examples for - -## Which country an AS maps to -### Registration country code (administrative) - -### Presence at IXP (biased by peering db) - -### Country code of its peers - -### Geoloc of prefixes - - -## Find all nodes that have the word 'crimea' in their name -MATCH (x)--(n:Name) WHERE toLower(n.name) CONTAINS 'crimea' RETURN x, n; - -## Crimean neighbors network dependencies -MATCH (crimea:Name)--(:AS)-[:PEERS_WITH]-(neighbors:AS)-[r:DEPENDS_ON]-(transit_as:AS)--(transit_name:Name) WHERE toLower(crimea.name) CONTAINS 'crimea' AND toFloat(r.hegemony)>0.2 RETURN transit_as, collect(DISTINCT transit_name.name), count(DISTINCT r) AS nb_links ORDER BY nb_links DESC; - -## Crimean IXP members dependencies -MATCH (crimea:Name)--(:IXP)-[:MEMBER_OF]-(neighbors:AS)-[r:DEPENDS_ON]-(transit_as:AS)--(transit_name:Name) WHERE toLower(crimea.name) CONTAINS 'crimea' AND toFloat(r.hegemony)>0.2 RETURN transit_as, collect(DISTINCT transit_name.name), count(DISTINCT r) AS nb_links ORDER BY nb_links DESC; - -## who is at Crimea ixp but not any other ixp - - -# Rostelecom - -## Top domain names that are hosted by Rostelecom -``` -MATCH (n:AS {asn:12389})-[:ORIGINATE]-(p:Prefix)--(i:IP)--(d:DomainName)-[r:RANK]-(:Ranking) WHERE r.rank<1000 RETURN n,p,i,d -``` - -## Top domain names that depends on Rostelecom -``` -MATCH (n:AS {asn:12389})--(p:Prefix)--(i:IP)--(d:DomainName)-[r:RANK]-(:Ranking) WHERE r.rank<1000 RETURN n,p,i,d -``` - -## all ases assigned to same org - - -## which top domain names map to this ASes -``` -MATCH (oid:OpaqueID)--(net:AS)--(pfx:Prefix)--(ip:IP)--(dname:DomainName)-[r]-(:Ranking), (net)-[{reference_org:'RIPE NCC'}]-(asname:Name) WHERE (oid:OpaqueID)--(:AS {asn:12389}) AND r.rank < 10000 AND net.asn<>12389 RETURN net, pfx, ip, dname, asname -``` - -## which prefixes map to other counties -``` -MATCH (:AS {asn:12389})--(oid:OpaqueID)--(net:AS)--(pfx:Prefix)--(cc:Country), (net)-[{reference_org:'RIPE NCC'}]-(asname:Name) WHERE net.asn<>12389 AND cc.country_code <> 'RU' RETURN net, pfx, asname, cc, oid -``` - -## which IXPs they are member of: -``` -MATCH (oid:OpaqueID)--(n:AS)--(ix:IXP) WHERE (oid:OpaqueID)--(:AS {asn:12389}) RETURN n, ix -``` - -## which IXPs are they operating to? -``` -MATCH (n:AS {asn:12389})--(ix:IXP)--(cc:Country) RETURN n,ix,cc -``` - -## prefixes assigned to this org and ASes announcing it - -! 
-## Country code of ASes hosting top domain names -MATCH (:Ranking)-[r:RANK]-(domain:DomainName)--(:IP)--(:Prefix)-[:ORIGINATE]-(:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country) WHERE r.rank<10000 AND domain.name ENDS WITH '.ru' RETURN cc.country_code, count(DISTINCT domain.name) AS dm_count ORDER BY dm_count DESC -MATCH (:Ranking)-[r:RANK]-(domain:DomainName)--(:IP)--(:Prefix)-[:ORIGINATE]-(:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country) WHERE r.rank<10000 AND domain.name ENDS WITH '.jp' RETURN cc.country_code, count(DISTINCT domain.name) AS dm_count ORDER BY dm_count DESC -MATCH (:Ranking)-[r:RANK]-(domain:DomainName)--(:IP)--(:Prefix)-[:ORIGINATE]-(:AS)-[:COUNTRY {reference_org:'NRO'}]-(cc:Country) WHERE r.rank<10000 AND domain.name ENDS WITH '.jp' RETURN cc.country_code, count(DISTINCT domain.name) AS dm_count ORDER BY dm_count DESC - -# Interesting queries: -## Orange presence -```cypher -MATCH (oid)--(n:AS)--(ix:IXP)--(cc:Country) WHERE (oid:OpaqueID)--(:AS {asn:5511}) RETURN n,ix,cc -``` From 73f4dc04b2bc558beb5928029c204c8339aa12b8 Mon Sep 17 00:00:00 2001 From: Ryan Poon <62296881+ryanp8@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:50:18 +0800 Subject: [PATCH 9/9] Autodeploy (#139) * Added testing to check if crawlers don't receive any data * Modified create_db to run the existence check for each crawler and to upload dump to ihr archive when created * Added autodeployment script and caddy config to deploy new databases when they are available in ihr archive. --------- Co-authored-by: Malte Tashiro --- .gitignore | 2 + autodeploy-config.json | 12 + autodeploy/README.md | 54 +++ autodeploy/autodeploy.py | 285 ++++++++++++ caddy.template.json | 109 +++++ config.json.example | 6 + create_db.py | 408 +++++++++--------- docker-compose.yaml | 21 + iyp/__init__.py | 28 +- iyp/crawlers/alice_lg/__init__.py | 3 + iyp/crawlers/alice_lg/amsix.py | 2 +- iyp/crawlers/alice_lg/bcix.py | 2 +- iyp/crawlers/alice_lg/decix.py | 2 +- iyp/crawlers/alice_lg/ixbr.py | 2 +- iyp/crawlers/alice_lg/linx.py | 2 +- iyp/crawlers/alice_lg/megaport.py | 2 +- iyp/crawlers/alice_lg/netnod.py | 2 +- iyp/crawlers/apnic/eyeball.py | 5 +- iyp/crawlers/bgpkit/__init__.py | 3 + iyp/crawlers/bgpkit/as2rel_v4.py | 2 +- iyp/crawlers/bgpkit/as2rel_v6.py | 2 +- iyp/crawlers/bgpkit/peerstats.py | 5 +- iyp/crawlers/bgpkit/pfx2asn.py | 5 +- iyp/crawlers/bgptools/anycast_prefixes.py | 5 +- iyp/crawlers/bgptools/as_names.py | 5 +- iyp/crawlers/bgptools/tags.py | 5 +- iyp/crawlers/caida/asrank.py | 5 +- iyp/crawlers/caida/ix_asns.py | 5 +- iyp/crawlers/caida/ixs.py | 5 +- iyp/crawlers/cisco/umbrella_top1m.py | 5 +- iyp/crawlers/citizenlab/urldb.py | 5 +- iyp/crawlers/cloudflare/dns_top_ases.py | 6 +- iyp/crawlers/cloudflare/dns_top_locations.py | 5 +- iyp/crawlers/cloudflare/ranking_bucket.py | 5 +- iyp/crawlers/cloudflare/top100.py | 5 +- iyp/crawlers/emileaben/as_names.py | 5 +- iyp/crawlers/example/crawler.py | 2 +- iyp/crawlers/iana/root_zone.py | 5 +- iyp/crawlers/ihr/__init__.py | 3 + iyp/crawlers/ihr/country_dependency.py | 5 +- iyp/crawlers/ihr/local_hegemony_v4.py | 2 +- iyp/crawlers/ihr/local_hegemony_v6.py | 2 +- iyp/crawlers/ihr/rov.py | 5 +- iyp/crawlers/inetintel/as_org.py | 5 +- iyp/crawlers/manrs/members.py | 5 +- iyp/crawlers/nro/delegated_stats.py | 5 +- iyp/crawlers/openintel/__init__.py | 7 + iyp/crawlers/openintel/dnsgraph_jp.py | 2 +- iyp/crawlers/openintel/dnsgraph_nl.py | 2 +- iyp/crawlers/openintel/dnsgraph_rdns.py | 2 +- iyp/crawlers/openintel/infra_mx.py | 2 +- 
iyp/crawlers/openintel/infra_ns.py | 2 +- iyp/crawlers/openintel/tranco1m.py | 2 +- iyp/crawlers/openintel/umbrella1m.py | 2 +- iyp/crawlers/pch/__init__.py | 3 + .../pch/daily_routing_snapshots_v4.py | 2 +- .../pch/daily_routing_snapshots_v6.py | 2 +- iyp/crawlers/peeringdb/fac.py | 5 +- iyp/crawlers/peeringdb/ix.py | 5 +- iyp/crawlers/peeringdb/org.py | 5 +- iyp/crawlers/ripe/as_names.py | 5 +- iyp/crawlers/ripe/atlas_measurements.py | 5 +- iyp/crawlers/ripe/atlas_probes.py | 5 +- iyp/crawlers/ripe/roa.py | 5 +- iyp/crawlers/simulamet/rirdata_rdns.py | 5 +- iyp/crawlers/stanford/asdb.py | 5 +- iyp/crawlers/tranco/top1m.py | 5 +- iyp/crawlers/virginiatech/rovista.py | 5 +- iyp/crawlers/worldbank/country_pop.py | 5 +- requirements.txt | 4 +- 70 files changed, 901 insertions(+), 260 deletions(-) create mode 100644 autodeploy-config.json create mode 100644 autodeploy/README.md create mode 100644 autodeploy/autodeploy.py create mode 100644 caddy.template.json diff --git a/.gitignore b/.gitignore index 0c6ba6b..cf165c8 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,5 @@ dmypy.json .pyre/ .history/ + +**/dumps \ No newline at end of file diff --git a/autodeploy-config.json b/autodeploy-config.json new file mode 100644 index 0000000..e210b16 --- /dev/null +++ b/autodeploy-config.json @@ -0,0 +1,12 @@ +{ + "archive_base_url": "https://ihr-archive.iijlab.net/ihr/iyp/", + "caddy_config_url": "http://sandbox.ihr.live:2019/config", + "caddy_post_url": "http://localhost:2019/load", + "caddy_template": "caddy.template.json", + "urls": { + "active_bolt": "ryan-bolt.ihr.live", + "active_http": "ryan.ihr.live", + "prev_bolt": "ryan-prev-bolt.ihr.live", + "prev_http": "ryan-prev.ihr.live" + } +} diff --git a/autodeploy/README.md b/autodeploy/README.md new file mode 100644 index 0000000..78dd23f --- /dev/null +++ b/autodeploy/README.md @@ -0,0 +1,54 @@ +# Autodeployment Script + +## Usage + +### Starting caddy + +Make sure that Caddy is running. If not, run it with `docker compose up caddy`. If Caddy +was running previously, then the new Caddy instance will resume from the previous +config. See the [Caddy docs](https://caddyserver.com/docs/running#docker-compose) for +more info. + +### Running the script + +To run the script, run `python3 -m autodeploy.autodeploy `. This will first find the date +of the most recent active deployment using the caddy config. If there is no active +deployment, today's date is used. With this date, the script will then check ihr-archive +to see if a dump has been pushed in the subsequent 7 days. If so, a neo4j instance will +be deployed using that dump. For example, if the latest deployment is for 2024-06-15, +the script will check if there is a dump for 2024-06-16 to 2024-06-23. + +Alternatively, running `python3 -m autodeploy.autodeploy --date [year]-[month]-[day]` will +check if there is a dump in the archive for the specified date and deploy it directly. + +## How it works + +### Checking for a dump to deploy + +If the date is not provided when running the script, it will first make a request to +Caddy to get the current config. The config is parsed to retrieve the port of the active +database. The date is parsed from the port number as explained below. Starting from this +date, the next 7 days are then checked in ihr-archive for valid dumps. + +#### Caddy Config + +Caddy is updated by substituting the desired ports in the specified Caddy config +template. 
The ports are constructed with the following structure: 1MMDD for neo4j http +port, and 2MMDD for neo4j bolt port. The json is sent to caddy by making a POST request +to sandbox.ihr.live:2019/load. The current config is retrieved by making a GET request +to sandbox.ihr.live:2019/config. + +### Starting the database + +Once a dump has been found, its log is downloaded from the archive. If the log indicates +that there are no errors, then the dump is downloaded. A docker container is then +started that loads the dump into a neo4j database. The database is stored in a docker +volume with the name data-MM-DD. Another container is then used to start the database +using the data stored in data-MM-YY. It binds its internal neo4j 7474 and 7687 ports to +the external ones that contain the dump's date. + +If a container is already running for this date, it and its data volume are deleted, and +a new one is created from the downloaded dump data. + +If there was already an active database, it becomes the previous database. The current +previous database container is stopped, and its data volume is deleted. diff --git a/autodeploy/autodeploy.py b/autodeploy/autodeploy.py new file mode 100644 index 0000000..68d82c1 --- /dev/null +++ b/autodeploy/autodeploy.py @@ -0,0 +1,285 @@ +import argparse +import json +import logging +import os +import sys +import time +from datetime import datetime, timedelta, timezone + +import docker +import requests + +NEO4J_VERSION = '5.16.0' + +ARCHIVE_URL_SUFFIX = '%Y/%m/%d/iyp-%Y-%m-%d' +LOG_URL_SUFFIX = ARCHIVE_URL_SUFFIX + '.log' +DUMP_URL_SUFFIX = ARCHIVE_URL_SUFFIX + '.dump' + +DUMP_DOWNLOAD_DIR_SUFFIX = 'dumps/%Y/%m/%d' + +DOCKER_VOLUME_FMT = 'data-%m-%d' +DOCKER_CONTAINER_NAME_FMT = 'deploy-%m-%d' + + +def remove_deployment(client: docker.DockerClient, date: datetime): + """Checks if there is an active deployment for the given date (month-day). + + If there is, remove it and the corresponding volume storing its data. + """ + container_name = date.strftime(DOCKER_CONTAINER_NAME_FMT) + volume_name = date.strftime(DOCKER_VOLUME_FMT) + try: + container = client.containers.get(container_name) + logging.warning(f'Removing active deployment for {date.strftime("%m-%d")}') + container.stop() + # Wait a little bit after the container has been removed + # before deleting the volume + # TODO Is there a better way? + while True: + try: + client.volumes.get(volume_name).remove() + break + except BaseException: + time.sleep(1) + except docker.errors.NotFound: + logging.info(f'No existing deployment for {date.strftime("%Y-%m-%d")}. Starting deployment') + + +def get_ports_from_caddy_config(config: dict): + """Makes a request to caddy config and returns the ports currently being used (both + active and previous)""" + caddy_config_url = config['caddy_config_url'] + r = requests.get(caddy_config_url) + try: + r.raise_for_status() + except requests.HTTPError as e: + logging.error(f'Failed to retrieve Caddy config from {caddy_config_url}: {e}') + sys.exit(1) + try: + body = r.json() + except json.JSONDecodeError as e: + logging.error(f'Failed to parse Caddy config: {e}') + sys.exit(1) + + routes = body['apps']['http']['servers']['srv0']['routes'] + active = dict() + for route in routes: + # This happens with a fresh caddy build. No ports are active, so + # return an empty dict. + # TODO So there is a route dict, but without a 'match' entry? 
+ if 'match' not in route: + return dict() + host = route['match'][0]['host'][0] + dial = route['handle'][0]['routes'][0]['handle'][0]['upstreams'][0]['dial'] + active[host] = dial + + # TODO We want to have both active_bolt and active_http URLs in the config, right? + # If only one is present, we have a problem. + # Also, if we reach this point there _must_ be at least a currently active + # deployment? + ports = dict() + active_bolt = config['urls']['active_bolt'] + active_http = config['urls']['active_http'] + if active_bolt in active: + ports['active_bolt'] = active[active_bolt].split(':')[1] + if active_http in active: + ports['active_http'] = active[active_http].split(':')[1] + + # It's possible for there to be only one active deployment, and there + # are no previous ports. Only attempt to parse the previous ports + # if they have been filled in on the caddy config + prev_bolt = config['urls']['prev_bolt'] + prev_http = config['urls']['prev_http'] + if prev_bolt in active and 'PREV_BOLT_PORT' not in active[prev_bolt]: + ports['prev_bolt'] = active[prev_bolt].split(':')[1] + if prev_http in active and 'PREV_HTTP_PORT' not in active[prev_http]: + ports['prev_http'] = active[prev_http].split(':')[1] + return ports + + +def check_log(config: dict, date: datetime): + """Makes a request to archive and checks if there is a valid dump for the specified + date.""" + logging.info(f'Downloading logs for {date.strftime("%Y-%m-%d")}') + log_url_fmt = os.path.join(config['archive_base_url'], LOG_URL_SUFFIX) + log_url = date.strftime(log_url_fmt) + r = requests.get(log_url) + try: + r.raise_for_status() + except requests.HTTPError as e: + # We expect the request to fail if the log does not exist (404), but not for + # other reasons. + if r.status_code != 404: + logging.error(f'Expected HTTP code 200 or 404, but got: {e}') + sys.exit(1) + return False + body = r.content + last_line = body.decode().split('\n')[-1] + if 'Errors:' in last_line: + logging.error(f'There were errors from create_db found in logs for {log_url}') + sys.exit(1) + return True + + +def get_port_date(port): + """Extracts the month and day from a port. + + Port should have format [1|2]MMDD. + + Returns the tuple (month, day) + """ + month = int(port[-4:-2]) + day = int(port[-2:]) + return month, day + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('config') + parser.add_argument('-d', '--date', help='deploy IYP dump for this date (YYYY-MM-DD)') + args = parser.parse_args() + + FORMAT = '%(asctime)s %(processName)s %(message)s' + logging.basicConfig( + format=FORMAT, + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + logging.info(f'Started: {sys.argv}') + + with open(args.config, 'r') as f: + try: + config: dict = json.load(f) + except json.JSONDecodeError as e: + logging.error(f'Invalid configuration specified: {e}') + sys.exit(1) + + root = os.path.dirname(os.path.realpath(__file__)) + + # If no date is provided when running the script, check if any dumps + # have been made within a week since the previous deployment. Otherwise, + # use the date provided in command line arg. 
+ if args.date: + try: + date = datetime.strptime(args.date, '%Y-%m-%d') + except ValueError as e: + logging.error(f'Invalid date specified: {e}') + sys.exit(1) + else: + ports = get_ports_from_caddy_config(config) + success = False + if 'active_http' in ports: + active_http = ports['active_http'] + month, day = get_port_date(active_http) + start_date = datetime.now(tz=timezone.utc).replace(month=month, day=day) + else: + start_date = datetime.now(tz=timezone.utc) + + # Download logs from ihr archive each day in the next week since + # the previous release + for i in range(1, 8): + date = start_date + timedelta(days=i) + if check_log(config, date): + success = True + break + else: + logging.warning(f'No archive entry found for {date.strftime("%Y-%m-%d")}.') + + if not success: + logging.error('Exiting because no active dates were found in archive.') + sys.exit(1) + + # Define ports and filenames that depend on the date + volume_name = date.strftime(DOCKER_VOLUME_FMT) + container_name = date.strftime(DOCKER_CONTAINER_NAME_FMT) + http_port = date.strftime('1%m%d') + bolt_port = date.strftime('2%m%d') + + client = docker.from_env() + + # Check if there is an existing deployment for this day and remove if so + remove_deployment(client, date) + + # Download dump from ihr archive + logging.info(f'Downloading dump for {date.strftime("%Y-%m-%d")}') + dump_dir = os.path.join(root, date.strftime(DUMP_DOWNLOAD_DIR_SUFFIX)) + os.makedirs(dump_dir, exist_ok=True) + dump_url_fmt = os.path.join(config['archive_base_url'], DUMP_URL_SUFFIX) + dump_url = date.strftime(dump_url_fmt) + r = requests.get(dump_url) + try: + r.raise_for_status() + except requests.HTTPError as e: + logging.error(f'Failed to fetch dump from {dump_url}: {e}') + sys.exit(1) + # TODO Is the + necessary? 
+ with open(os.path.join(dump_dir, 'neo4j.dump'), 'wb+') as f: + f.write(r.content) + + # Load dump into volume + logging.info('Load dump into neo4j db') + client.containers.run( + 'neo4j/neo4j-admin:' + NEO4J_VERSION, + command='neo4j-admin database load neo4j --from-path=/dumps --verbose', + name='load', + tty=True, + stdin_open=True, + remove=True, + volumes={ + volume_name: {'bind': '/data', 'mode': 'rw'}, + dump_dir: {'bind': '/dumps', 'mode': 'rw'}, + } + ) + + # Run neo4j based on data in volume just created + logging.warning('Starting deployment container') + client.containers.run( + 'neo4j:' + NEO4J_VERSION, + name=container_name, + ports={ + 7474: int(http_port), + 7687: int(bolt_port) + }, + volumes={ + volume_name: {'bind': '/data', 'mode': 'rw'}, + }, + environment={ + 'NEO4J_AUTH': 'neo4j/password', + 'NEO4J_server_memory_heap_initial__size': '16G', + 'NEO4J_server_memory_heap_max__size': '16G', + }, + detach=True, + remove=True + ) + + # Get currently active config + ports = get_ports_from_caddy_config(config) + + # Only delete current prev if it exists + if 'prev_http' in ports: + prev_month, prev_day = get_port_date(ports['prev_http']) + # It's possible that you're trying to redeploy the current prev + # If this condition isn't here, then the new deployment will be deleted + # since it has the same date as prev + if prev_month != date.month or prev_day != date.day: + remove_deployment(client, date.replace(month=prev_month, day=prev_day)) + + with open(config['caddy_template'], 'r') as f: + caddy_template = f.read() + + caddy_template = caddy_template.replace('', bolt_port) + caddy_template = caddy_template.replace('', http_port) + + # If there are no active ports (for example, on the first run after a fresh + # caddy build), don't try to set prev ports + if 'active_http' in ports: + caddy_template = caddy_template.replace('', ports['active_bolt']) + caddy_template = caddy_template.replace('', ports['active_http']) + + # Update config + requests.post(config['caddy_post_url'], caddy_template, headers={'Content-Type': 'application/json'}) + + +if __name__ == '__main__': + main() + sys.exit(0) diff --git a/caddy.template.json b/caddy.template.json new file mode 100644 index 0000000..d86c1a4 --- /dev/null +++ b/caddy.template.json @@ -0,0 +1,109 @@ +{ + "apps": { + "http": { + "servers": { + "srv0": { + "listen": [":443"], + "routes": [ + { + "match": [{ "host": ["ryan-bolt.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["ryan-prev-bolt.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["sandbox.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "ryan.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["ryan.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["ryan-prev.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ 
+ { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + } + ] + } + } + } + } + } + \ No newline at end of file diff --git a/config.json.example b/config.json.example index ee12e2c..ef1e158 100644 --- a/config.json.example +++ b/config.json.example @@ -1,4 +1,10 @@ { + "archive": { + "host": "", + "user": "", + "base_path": "" + }, + "cache": { "directory": "tmp/", "duration_in_days": 6 diff --git a/create_db.py b/create_db.py index e754e78..6b62403 100644 --- a/create_db.py +++ b/create_db.py @@ -1,208 +1,228 @@ +import argparse import importlib import json import logging import os -# import shutil import sys +from datetime import datetime, timezone from time import sleep -import arrow import docker +import paramiko +from scp import SCPClient from send_email import send_email -NEO4J_VERSION = '5.21.2' - -today = arrow.utcnow() -date = f'{today.year}-{today.month:02d}-{today.day:02d}' - -# Use the current directory as root. -root = os.path.dirname(os.path.realpath(__file__)) -# Alternatively, specify your own path. -# root = '' -if not root: - sys.exit('Please configure a root path.') - -tmp_dir = os.path.join(root, 'neo4j/tmp', date, '') -dump_dir = os.path.join(root, 'dumps', f'{today.year}/{today.month:02d}/{today.day:02d}', '') - -os.makedirs(tmp_dir, exist_ok=True) -os.makedirs(dump_dir, exist_ok=True) - -# Initialize logging -scriptname = sys.argv[0].replace('/', '_')[0:-3] -FORMAT = '%(asctime)s %(processName)s %(message)s' -logging.basicConfig( - format=FORMAT, - filename=f'{dump_dir}iyp-{date}.log', - level=logging.WARNING, - datefmt='%Y-%m-%d %H:%M:%S' -) -logging.warning('Started: %s' % sys.argv) - -# Load configuration file -with open('config.json', 'r') as fp: - conf = json.load(fp) - -# Start a new neo4j container -client = docker.from_env() - -# ######### Start a new docker image ########## - -logging.warning('Starting new container...') -container = client.containers.run( - 'neo4j:' + NEO4J_VERSION, - name=f'iyp-{date}', - ports={ - 7474: 7474, - 7687: 7687 - }, - volumes={ - tmp_dir: {'bind': '/data', 'mode': 'rw'}, - }, - environment={ - 'NEO4J_AUTH': 'neo4j/password', - 'NEO4J_server_memory_heap_initial__size': '16G', - 'NEO4J_server_memory_heap_max__size': '16G', - }, - remove=True, - detach=True -) - -# Wait for the container to be ready -timeout = 120 -stop_time = 1 -elapsed_time = 0 -container_ready = False - -while elapsed_time < timeout: - sleep(stop_time) - elapsed_time += stop_time - # Not the most premium solution, but the alternative is using - # stream=True, which creates a blocking generator that we have - # to somehow interrupt in case the database does not start - # correctly. And writing a signal handler just for this seems - # overkill. - last_msg = container.logs(stderr=False, tail=1) - if last_msg.endswith(b'Started.\n'): - logging.warning('Container ready.') - container_ready = True - break - -if not container_ready: - logging.error('Timed our while waiting for container to start.') - try: - container_logs = container.logs().decode('utf-8') - except Exception as e: - logging.error(f'Can not get logs from container: {e}') +NEO4J_VERSION = '5.16.0' + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-a', '--archive', action='store_true', help='push dump to archive server') + args = parser.parse_args() + + today = datetime.now(tz=timezone.utc) + date = today.strftime('%Y-%m-%d') + + # Use the current directory as root. 
+ root = os.path.dirname(os.path.realpath(__file__)) + # Alternatively, specify your own path. + # root = '' + if not root: + sys.exit('Please configure a root path.') + + dump_dir = os.path.join(root, 'dumps', today.strftime('%Y/%m/%d')) + + os.makedirs(dump_dir, exist_ok=True) + + # Initialize logging + FORMAT = '%(asctime)s %(processName)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename=os.path.join(dump_dir, f'iyp-{date}.log'), + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + logging.info(f'Started: {sys.argv}') + + # Load configuration file + with open('config.json', 'r') as fp: + conf = json.load(fp) + + # Neo4j container settings + neo4j_volume = f'data-{date}' + + # Start a new neo4j container + client = docker.from_env() + + # ######### Start a new docker image ########## + + logging.warning('Starting new container...') + container = client.containers.run( + 'neo4j:' + NEO4J_VERSION, + name=f'iyp-{date}', + ports={ + 7474: 7474, + 7687: 7687 + }, + volumes={ + neo4j_volume: {'bind': '/data', 'mode': 'rw'}, + }, + environment={ + 'NEO4J_AUTH': 'neo4j/password', + 'NEO4J_server_memory_heap_initial__size': '16G', + 'NEO4J_server_memory_heap_max__size': '16G', + }, + remove=True, + detach=True + ) + + # Wait for the container to be ready + timeout = 120 + stop_time = 1 + elapsed_time = 0 + container_ready = False + + while elapsed_time < timeout: + sleep(stop_time) + elapsed_time += stop_time + # Not the most premium solution, but the alternative is using + # stream=True, which creates a blocking generator that we have + # to somehow interrupt in case the database does not start + # correctly. And writing a signal handler just for this seems + # overkill. + last_msg = container.logs(stderr=False, tail=1) + if last_msg.endswith(b'Started.\n'): + logging.info('Container ready.') + container_ready = True + break + + if not container_ready: + logging.error('Timed our while waiting for container to start.') + try: + container_logs = container.logs().decode('utf-8') + except Exception as e: + logging.error(f'Can not get logs from container: {e}') + sys.exit('Problem while starting the container.') + logging.error(f'Container logs:\n{container_logs}') + logging.error('Trying to stop container...') + container.stop() sys.exit('Problem while starting the container.') - logging.error(f'Container logs:\n{container_logs}') - logging.error('Trying to stop container...') - container.stop() - sys.exit('Problem while starting the container.') -# ######### Fetch data and feed to neo4j ########## - - -class RelationCountError(Exception): - def __init__(self, message): - self.message = message - super().__init__(self.message) - - -logging.warning('Fetching data...') -status = {} -no_error = True -for module_name in conf['iyp']['crawlers']: - - try: + # ########## Fetch data and feed to neo4j ########## + + class RelationCountError(Exception): + def __init__(self, message): + self.message = message + super().__init__(self.message) + + logging.info('Fetching data...') + status = {} + no_error = True + for module_name in conf['iyp']['crawlers']: + try: + module = importlib.import_module(module_name) + logging.info(f'start {module}') + name = module_name.replace('iyp.crawlers.', '') + crawler = module.Crawler(module.ORG, module.URL, name) + crawler.run() + passed = crawler.unit_test() + crawler.close() + if not passed: + error_message = f'Did not receive data from crawler {name}' + raise RelationCountError(error_message) + status[module_name] = 'OK' + logging.info(f'end {module}') + 
except RelationCountError as relation_count_error: + no_error = False + logging.error(relation_count_error) + status[module_name] = relation_count_error + send_email(relation_count_error) + except Exception as e: + no_error = False + logging.error('crawler crashed!') + status[module_name] = e + send_email(e) + + # ######### Post processing scripts ########## + + logging.info('Post-processing...') + for module_name in conf['iyp']['post']: module = importlib.import_module(module_name) - logging.warning(f'start {module}') - name = module_name.replace('iyp.crawlers.', '') - crawler = module.Crawler(module.ORG, module.URL, name) - relations_count = crawler.count_relations() - crawler.run() - relations_count_new = crawler.count_relations() - crawler.close() - if not relations_count_new > relations_count: - error_message = ( - f'Unexpected relation count change in the crawler "{name}": ' - f'Expected new relations ({relations_count_new}) ' - f'to be greater than the previous relations ({relations_count}).' - ) - raise RelationCountError(error_message) - status[module_name] = 'OK' - logging.warning(f'end {module}') - except RelationCountError as relation_count_error: - no_error = False - logging.error(relation_count_error) - status[module_name] = relation_count_error - send_email(relation_count_error) - except Exception as e: - no_error = False - logging.exception('crawler crashed!!') - status[module_name] = e - send_email(e) - - -# ######### Post processing scripts ########## - -logging.warning('Post-processing...') -for module_name in conf['iyp']['post']: - module = importlib.import_module(module_name) - - try: - logging.warning(f'start {module}') - post = module.PostProcess() - post.run() - post.close() - status[module_name] = 'OK' - logging.warning(f'end {module}') - - except Exception as e: - no_error = False - logging.error('crawler crashed!!\n') - logging.error(e) - logging.error('\n') - status[module_name] = e - - -# ######### Stop container and dump DB ########## - -logging.warning('Stopping container...') -container.stop(timeout=180) - -logging.warning('Dumping database...') -if os.path.exists(f'{dump_dir}/neo4j.dump'): - os.remove(f'{dump_dir}/neo4j.dump') - -# make sure the directory is writable for any user -os.chmod(dump_dir, 0o777) - -container = client.containers.run( - 'neo4j/neo4j-admin:' + NEO4J_VERSION, - command='neo4j-admin database dump neo4j --to-path=/dumps --verbose', - tty=True, - stdin_open=True, - remove=True, - volumes={ - tmp_dir: {'bind': '/data', 'mode': 'rw'}, - dump_dir: {'bind': '/dumps', 'mode': 'rw'}, - } -) - -# rename dump -os.rename(f'{dump_dir}/neo4j.dump', f'{dump_dir}/iyp-{date}.dump') - -final_words = '' -if not no_error: - # TODO send an email - final_words += 'There was errors!' 
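
Note: the crawler loop introduced above reduces to the following pattern. Each module path listed under `conf['iyp']['crawlers']` is imported dynamically, run, and then checked with the new `unit_test()`; a crawler that produces no data raises `RelationCountError`, and any failure is recorded in `status` and reported by e-mail. This is a condensed sketch, not the exact script; `send_email` is taken as given from the surrounding code.

```python
import importlib
import logging


class RelationCountError(Exception):
    """Raised when a crawler finishes without creating its relationships."""


def run_crawlers(conf: dict, send_email) -> tuple:
    # conf['iyp']['crawlers'] lists module paths such as 'iyp.crawlers.bgpkit.pfx2asn'.
    status = {}
    no_error = True
    for module_name in conf['iyp']['crawlers']:
        try:
            module = importlib.import_module(module_name)
            name = module_name.replace('iyp.crawlers.', '')
            crawler = module.Crawler(module.ORG, module.URL, name)
            crawler.run()
            passed = crawler.unit_test()  # existence check for this crawler's relationships
            crawler.close()
            if not passed:
                raise RelationCountError(f'Did not receive data from crawler {name}')
            status[module_name] = 'OK'
        except Exception as e:
            no_error = False
            logging.error(f'{module_name} failed: {e}')
            status[module_name] = e
            send_email(e)
    return status, no_error
```
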
- logging.error('there was errors!\n') - logging.error({k: error for k, error in status.items() if error != 'OK'}) -else: - final_words = 'No error :)' -# Delete tmp file in cron job -# shutil.rmtree(tmp_dir) - -logging.warning(f'Finished: {sys.argv} {final_words}') + + try: + logging.info(f'start {module}') + post = module.PostProcess() + post.run() + post.close() + status[module_name] = 'OK' + logging.info(f'end {module}') + + except Exception as e: + no_error = False + logging.error('crawler crashed!') + logging.error(e) + status[module_name] = e + + # ######### Stop container and dump DB ########## + + logging.info('Stopping container...') + container.stop(timeout=180) + + logging.info('Dumping database...') + dump_file = os.path.join(dump_dir, 'neo4j.dump') + if os.path.exists(dump_file): + os.remove(dump_file) + + # make sure the directory is writable for any user + os.chmod(dump_dir, 0o777) + + container = client.containers.run( + 'neo4j/neo4j-admin:' + NEO4J_VERSION, + command='neo4j-admin database dump neo4j --to-path=/dumps --verbose', + tty=True, + stdin_open=True, + remove=True, + volumes={ + neo4j_volume: {'bind': '/data', 'mode': 'rw'}, + dump_dir: {'bind': '/dumps', 'mode': 'rw'}, + } + ) + + # Delete the data volume once the dump been created + client.volumes.get(neo4j_volume).remove() + + # rename dump + + os.rename(dump_file, os.path.join(dump_dir, f'iyp-{date}.dump')) + + if not no_error: + # TODO send an email + + # Add the log line to indicate to autodeploy that there were errors + final_words = f'\nErrors: {" ".join((k for k in status))}' + logging.error('There were errors!') + else: + final_words = 'No error :)' + # Delete tmp file in cron job + # shutil.rmtree(tmp_dir) + + logging.info(f'Finished: {sys.argv} {final_words}') + + if args.archive: + # Push the dump and log to ihr archive + ssh = paramiko.SSHClient() + # Do not show info logging for paramiko. 
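
Note: the dump step above spins up the matching `neo4j/neo4j-admin` image against the data volume, writes `neo4j.dump` into the dated dump directory, removes the now-unneeded volume, and renames the file. Below is a minimal standalone sketch of that step; the version tag is an assumption and must match the image that populated the volume (5.21.2 in docker-compose.yaml).

```python
import os

import docker

NEO4J_VERSION = '5.21.2'  # assumption: must match the image that created the volume


def dump_database(neo4j_volume: str, dump_dir: str, date: str) -> str:
    """Dump the Neo4j database stored in `neo4j_volume` into `dump_dir`."""
    client = docker.from_env()
    dump_file = os.path.join(dump_dir, 'neo4j.dump')
    if os.path.exists(dump_file):
        os.remove(dump_file)
    # The admin container runs as a different user, so make the directory writable.
    os.chmod(dump_dir, 0o777)
    client.containers.run(
        'neo4j/neo4j-admin:' + NEO4J_VERSION,
        command='neo4j-admin database dump neo4j --to-path=/dumps --verbose',
        remove=True,
        volumes={
            neo4j_volume: {'bind': '/data', 'mode': 'rw'},
            dump_dir: {'bind': '/dumps', 'mode': 'rw'},
        },
    )
    # The data volume is only needed for the dump, so drop it afterwards.
    client.volumes.get(neo4j_volume).remove()
    final_name = os.path.join(dump_dir, f'iyp-{date}.dump')
    os.rename(dump_file, final_name)
    return final_name
```
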
+ logging.getLogger('paramiko').setLevel(logging.WARNING) + ssh.load_system_host_keys() + ssh.connect(conf['archive']['host'], username=conf['archive']['user']) + + dest = os.path.join(conf['archive']['base_path'], today.strftime('%Y/%m/%d')) + ssh.exec_command(f'mkdir -p {dest}') + + with SCPClient(ssh.get_transport()) as scp: + scp.put(dump_dir, recursive=True, remote_path=dest) + + +if __name__ == '__main__': + main() diff --git a/docker-compose.yaml b/docker-compose.yaml index de510b0..fd985a5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,3 +1,6 @@ +volumes: + caddy_data: + caddy_config: services: iyp_loader: image: neo4j/neo4j-admin:5.21.2 @@ -63,3 +66,21 @@ services: depends_on: iyp_loader: condition: service_completed_successfully + + caddy: + image: caddy:latest + container_name: caddy + restart: unless-stopped + ports: + - "80:80" + - "443:443" + - "443:443/udp" + - "2019:2019" + environment: + - CADDY_ADMIN=0.0.0.0:2019 + volumes: + - ./site:/srv + - caddy_data:/data + - caddy_config:/config + command: /usr/bin/caddy run --resume + \ No newline at end of file diff --git a/iyp/__init__.py b/iyp/__init__.py index 47ead72..81b2d2b 100644 --- a/iyp/__init__.py +++ b/iyp/__init__.py @@ -476,7 +476,7 @@ def batch_add_node_label(self, node_ids, label): label: label string or list of label strings """ label_str = str(label) - if type(label) is list: + if isinstance(label, list): label_str = ':'.join(label) for i in range(0, len(node_ids), BATCH_SIZE): @@ -709,15 +709,23 @@ def count_relations(self): return result['count'] - def unit_test(self, logging): - relation_count = self.count_relations() - logging.info('Relations before starting: %s' % relation_count) - self.run() - relation_count_new = self.count_relations() - logging.info('Relations after starting: %s' % relation_count_new) - self.close() - print('assertion failed') if relation_count_new <= relation_count else print('assertion passed') - assert relation_count_new > relation_count + def unit_test(self, relation_types): + """Check for existence of relationships created by this crawler. + + relation_types should be a list of types for which existence is checked. 
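
Note: the `isinstance()` change in `batch_add_node_label` only affects how the label argument is normalised: a single label string is used as-is, while a list of labels is joined into a Neo4j multi-label string. A tiny sketch of that behaviour:

```python
def to_label_string(label) -> str:
    # A list of labels becomes 'Label1:Label2:...'; a single label is used as-is.
    if isinstance(label, list):
        return ':'.join(label)
    return str(label)


assert to_label_string('AuthoritativeNameServer') == 'AuthoritativeNameServer'
assert to_label_string(['DomainName', 'AuthoritativeNameServer']) == 'DomainName:AuthoritativeNameServer'
```
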
+ """ + logging.info(f'Running existence test for {relation_types}') + passed = True + for relation_type in relation_types: + existenceQuery = f"""MATCH ()-[r:{relation_type}]-() + USING INDEX r:{relation_type}(reference_name) + WHERE r.reference_name = '{self.reference['reference_name']}' + RETURN 0 LIMIT 1""" + result = self.iyp.tx.run(existenceQuery) + if len(list(result)) == 0: + passed = False + logging.error(f'Missing data for relation {relation_type}') + return passed def close(self): # Commit changes to IYP diff --git a/iyp/crawlers/alice_lg/__init__.py b/iyp/crawlers/alice_lg/__init__.py index d7bbdfe..878bdf7 100644 --- a/iyp/crawlers/alice_lg/__init__.py +++ b/iyp/crawlers/alice_lg/__init__.py @@ -442,3 +442,6 @@ def run(self) -> None: if originate_rels: logging.info(f'Pushing {len(originate_rels)} ORIGINATE relationships.') self.iyp.batch_add_links('ORIGINATE', originate_rels) + + def unit_test(self): + return super().unit_test(['MEMBER_OF', 'ORIGINATE', 'MANAGED_BY']) diff --git a/iyp/crawlers/alice_lg/amsix.py b/iyp/crawlers/alice_lg/amsix.py index 7328645..6e74b62 100644 --- a/iyp/crawlers/alice_lg/amsix.py +++ b/iyp/crawlers/alice_lg/amsix.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/bcix.py b/iyp/crawlers/alice_lg/bcix.py index 66bc993..ed1a6f7 100644 --- a/iyp/crawlers/alice_lg/bcix.py +++ b/iyp/crawlers/alice_lg/bcix.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/decix.py b/iyp/crawlers/alice_lg/decix.py index 8a3aa25..d3c8c35 100644 --- a/iyp/crawlers/alice_lg/decix.py +++ b/iyp/crawlers/alice_lg/decix.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/ixbr.py b/iyp/crawlers/alice_lg/ixbr.py index 548701a..691e6e4 100644 --- a/iyp/crawlers/alice_lg/ixbr.py +++ b/iyp/crawlers/alice_lg/ixbr.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/linx.py b/iyp/crawlers/alice_lg/linx.py index ef67bd2..223635c 100644 --- a/iyp/crawlers/alice_lg/linx.py +++ b/iyp/crawlers/alice_lg/linx.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/megaport.py b/iyp/crawlers/alice_lg/megaport.py index 8080f45..5156a21 100644 --- a/iyp/crawlers/alice_lg/megaport.py +++ b/iyp/crawlers/alice_lg/megaport.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/netnod.py b/iyp/crawlers/alice_lg/netnod.py index 8ba89f4..5f711a4 100644 --- a/iyp/crawlers/alice_lg/netnod.py +++ b/iyp/crawlers/alice_lg/netnod.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git 
a/iyp/crawlers/apnic/eyeball.py b/iyp/crawlers/apnic/eyeball.py index bee4fd4..8054402 100644 --- a/iyp/crawlers/apnic/eyeball.py +++ b/iyp/crawlers/apnic/eyeball.py @@ -84,6 +84,9 @@ def run(self): self.iyp.batch_add_links('RANK', rank_links) self.iyp.batch_add_links('POPULATION', pop_links) + def unit_test(self): + return super().unit_test(['POPULATION', 'COUNTRY', 'RANK', 'NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -103,7 +106,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/__init__.py b/iyp/crawlers/bgpkit/__init__.py index e5d8e3b..1c54c89 100644 --- a/iyp/crawlers/bgpkit/__init__.py +++ b/iyp/crawlers/bgpkit/__init__.py @@ -47,3 +47,6 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('PEERS_WITH', links) + + def unit_test(self): + return super().unit_test(['PEERS_WITH']) diff --git a/iyp/crawlers/bgpkit/as2rel_v4.py b/iyp/crawlers/bgpkit/as2rel_v4.py index 0a1c1bc..2278dd0 100644 --- a/iyp/crawlers/bgpkit/as2rel_v4.py +++ b/iyp/crawlers/bgpkit/as2rel_v4.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/as2rel_v6.py b/iyp/crawlers/bgpkit/as2rel_v6.py index e6dafac..e7807c7 100644 --- a/iyp/crawlers/bgpkit/as2rel_v6.py +++ b/iyp/crawlers/bgpkit/as2rel_v6.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/peerstats.py b/iyp/crawlers/bgpkit/peerstats.py index 58cf8cb..7724186 100644 --- a/iyp/crawlers/bgpkit/peerstats.py +++ b/iyp/crawlers/bgpkit/peerstats.py @@ -90,6 +90,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('PEERS_WITH', links) + def unit_test(self): + return super().unit_test(['PEERS_WITH']) + def main() -> None: parser = argparse.ArgumentParser() @@ -109,7 +112,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/pfx2asn.py b/iyp/crawlers/bgpkit/pfx2asn.py index fe3cb56..d83e8db 100644 --- a/iyp/crawlers/bgpkit/pfx2asn.py +++ b/iyp/crawlers/bgpkit/pfx2asn.py @@ -62,6 +62,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('ORIGINATE', links) + def unit_test(self): + return super().unit_test(['ORIGINATE']) + def main() -> None: parser = argparse.ArgumentParser() @@ -81,7 +84,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgptools/anycast_prefixes.py b/iyp/crawlers/bgptools/anycast_prefixes.py index c46c5da..824e5c4 100644 --- a/iyp/crawlers/bgptools/anycast_prefixes.py +++ b/iyp/crawlers/bgptools/anycast_prefixes.py @@ -102,6 +102,9 @@ def update(self, res, filename: str): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -121,7 +124,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff 
--git a/iyp/crawlers/bgptools/as_names.py b/iyp/crawlers/bgptools/as_names.py index 7db4c99..bbfa957 100644 --- a/iyp/crawlers/bgptools/as_names.py +++ b/iyp/crawlers/bgptools/as_names.py @@ -85,6 +85,9 @@ def run(self): self.iyp.batch_add_links('NAME', name_links) self.iyp.batch_add_links('CATEGORIZED', tag_links) + def unit_test(self): + return super().unit_test(['NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -104,7 +107,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgptools/tags.py b/iyp/crawlers/bgptools/tags.py index 3a20fa1..155f644 100644 --- a/iyp/crawlers/bgptools/tags.py +++ b/iyp/crawlers/bgptools/tags.py @@ -83,6 +83,9 @@ def run(self): print('Error for: ', line) print(error) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -102,7 +105,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/caida/asrank.py b/iyp/crawlers/caida/asrank.py index 3cc8d9c..51e8e45 100644 --- a/iyp/crawlers/caida/asrank.py +++ b/iyp/crawlers/caida/asrank.py @@ -100,6 +100,9 @@ def run(self): self.iyp.batch_add_links('COUNTRY', country_links) self.iyp.batch_add_links('RANK', rank_links) + def unit_test(self): + return super().unit_test(['NAME', 'COUNTRY', 'RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -119,7 +122,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/caida/ix_asns.py b/iyp/crawlers/caida/ix_asns.py index 6fbb86e..8e0f7f9 100644 --- a/iyp/crawlers/caida/ix_asns.py +++ b/iyp/crawlers/caida/ix_asns.py @@ -90,6 +90,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('MEMBER_OF', member_links) + def unit_test(self): + return super().unit_test(['MEMBER_OF']) + def main() -> None: parser = argparse.ArgumentParser() @@ -109,7 +112,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/caida/ixs.py b/iyp/crawlers/caida/ixs.py index 2301c09..e9b4542 100644 --- a/iyp/crawlers/caida/ixs.py +++ b/iyp/crawlers/caida/ixs.py @@ -200,6 +200,9 @@ def run(self): self.iyp.batch_add_links('WEBSITE', website_links) self.iyp.batch_add_links('MANAGED_BY', prefix_links) + def unit_test(self): + return super().unit_test(['EXTERNAL_ID', 'NAME', 'COUNTRY', 'WEBSITE', 'MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -219,7 +222,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cisco/umbrella_top1m.py b/iyp/crawlers/cisco/umbrella_top1m.py index 23bd3e2..9ed18fb 100644 --- a/iyp/crawlers/cisco/umbrella_top1m.py +++ b/iyp/crawlers/cisco/umbrella_top1m.py @@ -123,6 +123,9 @@ def run(self): logging.info(f'Pushing {len(processed_links)} RANK relationships...') self.iyp.batch_add_links('RANK', processed_links) + def unit_test(self): + return super().unit_test(['RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -142,7 +145,7 @@ def main() -> None: crawler = 
Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/citizenlab/urldb.py b/iyp/crawlers/citizenlab/urldb.py index b56718a..6893570 100644 --- a/iyp/crawlers/citizenlab/urldb.py +++ b/iyp/crawlers/citizenlab/urldb.py @@ -70,6 +70,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -89,7 +92,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/dns_top_ases.py b/iyp/crawlers/cloudflare/dns_top_ases.py index 8c15ac0..fe6b6e1 100644 --- a/iyp/crawlers/cloudflare/dns_top_ases.py +++ b/iyp/crawlers/cloudflare/dns_top_ases.py @@ -47,6 +47,10 @@ def compute_link(self, param): 'props': [flat_prop, self.reference] }) + # already defined in imported Crawler + # def unit_test(self): + # pass + def main() -> None: parser = argparse.ArgumentParser() @@ -66,7 +70,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/dns_top_locations.py b/iyp/crawlers/cloudflare/dns_top_locations.py index 8f40586..37eb0f4 100644 --- a/iyp/crawlers/cloudflare/dns_top_locations.py +++ b/iyp/crawlers/cloudflare/dns_top_locations.py @@ -168,6 +168,9 @@ def compute_link(self, param): 'props': [flat_prop, self.reference] }) + def unit_test(self): + return super().unit_test(['QUERIED_FROM']) + def main() -> None: parser = argparse.ArgumentParser() @@ -187,7 +190,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/ranking_bucket.py b/iyp/crawlers/cloudflare/ranking_bucket.py index 6d9a02e..c6b64c7 100644 --- a/iyp/crawlers/cloudflare/ranking_bucket.py +++ b/iyp/crawlers/cloudflare/ranking_bucket.py @@ -122,6 +122,9 @@ def run(self): print(f'Adding {len(domain_links)} RANK relationships', file=sys.stderr) self.iyp.batch_add_links('RANK', domain_links) + def unit_test(self): + return super().unit_test(['RANK']) + # Main program if __name__ == '__main__': @@ -141,7 +144,7 @@ def run(self): crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/top100.py b/iyp/crawlers/cloudflare/top100.py index 2a6b63d..c44fbe2 100644 --- a/iyp/crawlers/cloudflare/top100.py +++ b/iyp/crawlers/cloudflare/top100.py @@ -71,6 +71,9 @@ def update(self, entry): domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']}) self.iyp.add_links(domain_qid, statements) + def unit_test(self): + return super().unit_test(['RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -90,7 +93,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/emileaben/as_names.py b/iyp/crawlers/emileaben/as_names.py index 66d4b10..197253e 100644 --- a/iyp/crawlers/emileaben/as_names.py +++ b/iyp/crawlers/emileaben/as_names.py @@ -66,6 +66,9 @@ def run(self): # Push all links to IYP 
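
Note: the crawler modules touched in this patch all follow the same pattern. Each one overrides `unit_test()` and passes the relationship types it creates to the base implementation in `iyp/__init__.py`, which checks that at least one relationship of each type carries this crawler's `reference_name`. A generic sketch of the pattern with placeholder ORG/URL/NAME values:

```python
from iyp import BaseCrawler

ORG = 'Example Org'                 # placeholders; each crawler defines its own
URL = 'https://example.org/data'
NAME = 'example.crawler'


class Crawler(BaseCrawler):
    def run(self):
        # Fetch the data and push nodes/relationships, e.g.:
        # self.iyp.batch_add_links('CATEGORIZED', links)
        ...

    def unit_test(self):
        # List every relationship type this crawler is expected to create.
        return super().unit_test(['CATEGORIZED'])
```
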
self.iyp.batch_add_links('NAME', links) + def unit_test(self): + return super().unit_test(['NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -85,7 +88,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/example/crawler.py b/iyp/crawlers/example/crawler.py index 63c94ed..e2b4b74 100644 --- a/iyp/crawlers/example/crawler.py +++ b/iyp/crawlers/example/crawler.py @@ -74,7 +74,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/iana/root_zone.py b/iyp/crawlers/iana/root_zone.py index 2ec8c1b..39d6224 100644 --- a/iyp/crawlers/iana/root_zone.py +++ b/iyp/crawlers/iana/root_zone.py @@ -107,6 +107,9 @@ def run(self): logging.info(f'Pushing {len(managed_by)} MANAGED_BY relationships.') self.iyp.batch_add_links('MANAGED_BY', managed_by_relationships) + def unit_test(self): + return super().unit_test(['RESOLVES_TO', 'MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -126,7 +129,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/__init__.py b/iyp/crawlers/ihr/__init__.py index 9ff8517..67f0863 100644 --- a/iyp/crawlers/ihr/__init__.py +++ b/iyp/crawlers/ihr/__init__.py @@ -101,3 +101,6 @@ def run(self): # Remove downloaded file os.remove(local_filename) + + def unit_test(self): + return super().unit_test(['DEPENDS_ON']) diff --git a/iyp/crawlers/ihr/country_dependency.py b/iyp/crawlers/ihr/country_dependency.py index 1a8d4af..81936b4 100644 --- a/iyp/crawlers/ihr/country_dependency.py +++ b/iyp/crawlers/ihr/country_dependency.py @@ -114,6 +114,9 @@ def run(self): # Push links to IYP self.iyp.batch_add_links('RANK', links) + def unit_test(self): + return super().unit_test(['RANK', 'COUNTRY']) + # Main program def main() -> None: @@ -134,7 +137,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/local_hegemony_v4.py b/iyp/crawlers/ihr/local_hegemony_v4.py index b0cc3b0..a61bd8e 100644 --- a/iyp/crawlers/ihr/local_hegemony_v4.py +++ b/iyp/crawlers/ihr/local_hegemony_v4.py @@ -36,7 +36,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/local_hegemony_v6.py b/iyp/crawlers/ihr/local_hegemony_v6.py index 6599a5d..7c24341 100644 --- a/iyp/crawlers/ihr/local_hegemony_v6.py +++ b/iyp/crawlers/ihr/local_hegemony_v6.py @@ -36,7 +36,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/rov.py b/iyp/crawlers/ihr/rov.py index 200493a..3e9dfd7 100644 --- a/iyp/crawlers/ihr/rov.py +++ b/iyp/crawlers/ihr/rov.py @@ -176,6 +176,9 @@ def run(self): # Remove downloaded file os.remove(local_filename) + def unit_test(self): + return super().unit_test(['ORIGINATE', 'CATEGORIZED', 'DEPENDS_ON', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -195,7 +198,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if 
args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/inetintel/as_org.py b/iyp/crawlers/inetintel/as_org.py index 71676d9..a5720e3 100644 --- a/iyp/crawlers/inetintel/as_org.py +++ b/iyp/crawlers/inetintel/as_org.py @@ -185,6 +185,9 @@ def close(self): os.remove(self.filename) os.rmdir(self.tmpdir) + def unit_test(self): + return super().unit_test(['SIBLING_OF']) + def main() -> None: parser = argparse.ArgumentParser() @@ -204,7 +207,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/manrs/members.py b/iyp/crawlers/manrs/members.py index 22384bf..fa67140 100644 --- a/iyp/crawlers/manrs/members.py +++ b/iyp/crawlers/manrs/members.py @@ -129,6 +129,9 @@ def run(self): self.iyp.batch_add_links('COUNTRY', country_rels) self.iyp.batch_add_links('IMPLEMENT', implement_rels) + def unit_test(self): + return super().unit_test(['MEMBER_OF', 'IMPLEMENT', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -148,7 +151,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/nro/delegated_stats.py b/iyp/crawlers/nro/delegated_stats.py index 79ec931..ce07423 100644 --- a/iyp/crawlers/nro/delegated_stats.py +++ b/iyp/crawlers/nro/delegated_stats.py @@ -197,6 +197,9 @@ def run(self): for label, links in prefix_status_links.items(): self.iyp.batch_add_links(label, links) + def unit_test(self): + return super().unit_test(['AVAILABLE', 'ASSIGNED', 'RESERVED', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -216,7 +219,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py index b3f3cd5..dc02acb 100644 --- a/iyp/crawlers/openintel/__init__.py +++ b/iyp/crawlers/openintel/__init__.py @@ -239,6 +239,10 @@ def run(self): self.iyp.batch_add_links('MANAGED_BY', mng_links) self.iyp.batch_add_links('PART_OF', partof_links) + def unit_test(self): + # use different version depending on infra_ns vs others + return super().unit_test(['RESOLVES_TO', 'MANAGED_BY', 'PART_OF']) + class DnsgraphCrawler(BaseCrawler): @@ -382,3 +386,6 @@ def run(self): ns_id = [link['dst_id'] for link in links_managed_by] logging.info(f'Adding AuthoritativeNameServer label to {len(ns_id)} nodes') self.iyp.batch_add_node_label(ns_id, 'AuthoritativeNameServer') + + def unit_test(self): + return super().unit_test(['PARENT', 'PART_OF', 'ALIAS_OF', 'MANAGED_BY', 'RESOLVES_TO']) diff --git a/iyp/crawlers/openintel/dnsgraph_jp.py b/iyp/crawlers/openintel/dnsgraph_jp.py index c8457d1..30dd210 100644 --- a/iyp/crawlers/openintel/dnsgraph_jp.py +++ b/iyp/crawlers/openintel/dnsgraph_jp.py @@ -33,7 +33,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/dnsgraph_nl.py b/iyp/crawlers/openintel/dnsgraph_nl.py index 7977ea8..7b29c3b 100644 --- a/iyp/crawlers/openintel/dnsgraph_nl.py +++ b/iyp/crawlers/openintel/dnsgraph_nl.py @@ -33,7 +33,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - 
crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/dnsgraph_rdns.py b/iyp/crawlers/openintel/dnsgraph_rdns.py index 30210e5..a9730b4 100644 --- a/iyp/crawlers/openintel/dnsgraph_rdns.py +++ b/iyp/crawlers/openintel/dnsgraph_rdns.py @@ -33,7 +33,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/infra_mx.py b/iyp/crawlers/openintel/infra_mx.py index 3841c77..4c33f94 100644 --- a/iyp/crawlers/openintel/infra_mx.py +++ b/iyp/crawlers/openintel/infra_mx.py @@ -42,7 +42,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/infra_ns.py b/iyp/crawlers/openintel/infra_ns.py index a4395c6..91dfb8d 100644 --- a/iyp/crawlers/openintel/infra_ns.py +++ b/iyp/crawlers/openintel/infra_ns.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/tranco1m.py b/iyp/crawlers/openintel/tranco1m.py index c9dd706..d459d63 100644 --- a/iyp/crawlers/openintel/tranco1m.py +++ b/iyp/crawlers/openintel/tranco1m.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/umbrella1m.py b/iyp/crawlers/openintel/umbrella1m.py index 2b7b606..9b3ccde 100644 --- a/iyp/crawlers/openintel/umbrella1m.py +++ b/iyp/crawlers/openintel/umbrella1m.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/pch/__init__.py b/iyp/crawlers/pch/__init__.py index 071fd10..db023a4 100644 --- a/iyp/crawlers/pch/__init__.py +++ b/iyp/crawlers/pch/__init__.py @@ -326,3 +326,6 @@ def run(self) -> None: # Clear cache. 
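
Note: because `unit_test()` no longer takes the logging module, every crawler's `main()` is reduced to the same dispatch: run the existence check when the unit-test flag is set, otherwise run the crawler and commit. A sketch of that entry point as it sits inside a crawler module (where `Crawler`, `ORG`, `URL`, and `NAME` are defined); the exact argparse flag name is not visible in this diff, so `--unit-test` is an assumption:

```python
import argparse


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('--unit-test', action='store_true')  # assumed flag name
    args = parser.parse_args()

    crawler = Crawler(ORG, URL, NAME)
    if args.unit_test:
        crawler.unit_test()   # existence check only; no new data is fetched
    else:
        crawler.run()
        crawler.close()       # commit changes to IYP
```
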
self.cache_handler.clear_cache() + + def unit_test(self): + return super().unit_test(['ORIGINATE']) diff --git a/iyp/crawlers/pch/daily_routing_snapshots_v4.py b/iyp/crawlers/pch/daily_routing_snapshots_v4.py index 2f5fa4d..649ee5e 100644 --- a/iyp/crawlers/pch/daily_routing_snapshots_v4.py +++ b/iyp/crawlers/pch/daily_routing_snapshots_v4.py @@ -34,7 +34,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/pch/daily_routing_snapshots_v6.py b/iyp/crawlers/pch/daily_routing_snapshots_v6.py index a2ff010..3915769 100644 --- a/iyp/crawlers/pch/daily_routing_snapshots_v6.py +++ b/iyp/crawlers/pch/daily_routing_snapshots_v6.py @@ -34,7 +34,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/peeringdb/fac.py b/iyp/crawlers/peeringdb/fac.py index d86dbd6..ebf012f 100644 --- a/iyp/crawlers/peeringdb/fac.py +++ b/iyp/crawlers/peeringdb/fac.py @@ -131,6 +131,9 @@ def run(self): self.iyp.batch_add_links('EXTERNAL_ID', facid_links) self.iyp.batch_add_links('MANAGED_BY', org_links) + def unit_test(self): + return super().unit_test(['NAME', 'WEBSITE', 'COUNTRY', 'EXTERNAL_ID', 'MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -150,7 +153,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/peeringdb/ix.py b/iyp/crawlers/peeringdb/ix.py index 8cb8c28..267c512 100644 --- a/iyp/crawlers/peeringdb/ix.py +++ b/iyp/crawlers/peeringdb/ix.py @@ -393,6 +393,9 @@ def register_ixs(self): self.iyp.batch_add_links('EXTERNAL_ID', id_links) self.iyp.batch_add_links('NAME', name_links) + def unit_test(self): + return super().unit_test(['MANAGED_BY', 'LOCATED_IN', 'COUNTRY', 'WEBSITE', 'EXTERNAL_ID', 'NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -412,7 +415,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/peeringdb/org.py b/iyp/crawlers/peeringdb/org.py index f1f1beb..aa0a2c7 100644 --- a/iyp/crawlers/peeringdb/org.py +++ b/iyp/crawlers/peeringdb/org.py @@ -119,6 +119,9 @@ def run(self): self.iyp.batch_add_links('COUNTRY', country_links) self.iyp.batch_add_links('EXTERNAL_ID', orgid_links) + def unit_test(self): + return super().unit_test(['NAME', 'WEBSITE', 'COUNTRY', 'EXTERNAL_ID']) + def main() -> None: parser = argparse.ArgumentParser() @@ -138,7 +141,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ripe/as_names.py b/iyp/crawlers/ripe/as_names.py index 609cdca..9e163ad 100644 --- a/iyp/crawlers/ripe/as_names.py +++ b/iyp/crawlers/ripe/as_names.py @@ -69,6 +69,9 @@ def run(self): self.iyp.batch_add_links('NAME', name_links) self.iyp.batch_add_links('COUNTRY', country_links) + def unit_test(self): + return super().unit_test(['NAME', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -88,7 +91,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff 
--git a/iyp/crawlers/ripe/atlas_measurements.py b/iyp/crawlers/ripe/atlas_measurements.py index 25c7b77..eb60f6c 100644 --- a/iyp/crawlers/ripe/atlas_measurements.py +++ b/iyp/crawlers/ripe/atlas_measurements.py @@ -258,6 +258,9 @@ def run(self): self.iyp.batch_add_links('PART_OF', part_of_links) logging.info('Done.') + def unit_test(self): + return super().unit_test(['PART_OF', 'TARGET']) + def main() -> None: parser = argparse.ArgumentParser() @@ -278,7 +281,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ripe/atlas_probes.py b/iyp/crawlers/ripe/atlas_probes.py index 2a94c61..4782099 100644 --- a/iyp/crawlers/ripe/atlas_probes.py +++ b/iyp/crawlers/ripe/atlas_probes.py @@ -175,6 +175,9 @@ def run(self): self.iyp.batch_add_links('LOCATED_IN', located_in_links) self.iyp.batch_add_links('COUNTRY', country_links) + def unit_test(self): + return super().unit_test(['ASSIGNED', 'LOCATED_IN', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -195,7 +198,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ripe/roa.py b/iyp/crawlers/ripe/roa.py index 7c1e162..c66d1a7 100644 --- a/iyp/crawlers/ripe/roa.py +++ b/iyp/crawlers/ripe/roa.py @@ -100,6 +100,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('ROUTE_ORIGIN_AUTHORIZATION', links) + def unit_test(self): + return super().unit_test(['ROUTE_ORIGIN_AUTHORIZATION']) + def main() -> None: parser = argparse.ArgumentParser() @@ -119,7 +122,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/simulamet/rirdata_rdns.py b/iyp/crawlers/simulamet/rirdata_rdns.py index bc0994d..c226898 100644 --- a/iyp/crawlers/simulamet/rirdata_rdns.py +++ b/iyp/crawlers/simulamet/rirdata_rdns.py @@ -123,6 +123,9 @@ def run(self): self.iyp.batch_add_links('MANAGED_BY', links_managed_by) + def unit_test(self): + return super().unit_test(['MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -142,7 +145,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/stanford/asdb.py b/iyp/crawlers/stanford/asdb.py index b32186c..133b623 100644 --- a/iyp/crawlers/stanford/asdb.py +++ b/iyp/crawlers/stanford/asdb.py @@ -126,6 +126,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -145,7 +148,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/tranco/top1m.py b/iyp/crawlers/tranco/top1m.py index 6b4eab8..c2cfb37 100644 --- a/iyp/crawlers/tranco/top1m.py +++ b/iyp/crawlers/tranco/top1m.py @@ -65,6 +65,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('RANK', links) + def unit_test(self): + return super().unit_test(['RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -84,7 +87,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if 
args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/virginiatech/rovista.py b/iyp/crawlers/virginiatech/rovista.py index b72e734..c31585c 100644 --- a/iyp/crawlers/virginiatech/rovista.py +++ b/iyp/crawlers/virginiatech/rovista.py @@ -75,6 +75,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -94,7 +97,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/worldbank/country_pop.py b/iyp/crawlers/worldbank/country_pop.py index ea19f0c..1b05c23 100644 --- a/iyp/crawlers/worldbank/country_pop.py +++ b/iyp/crawlers/worldbank/country_pop.py @@ -64,6 +64,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('POPULATION', links) + def unit_test(self): + return super().unit_test(['POPULATION']) + def main() -> None: parser = argparse.ArgumentParser() @@ -83,7 +86,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/requirements.txt b/requirements.txt index 00afa5c..54fbbb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,6 @@ flake8 pre-commit PyGithub clickhouse_driver -pyspark \ No newline at end of file +pyspark +paramiko +scp
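
Note: the two new entries in requirements.txt (paramiko and scp) back the optional archive step added to the main script: after a successful run, the dated dump directory is copied to the IHR archive host over SSH. A minimal sketch of that step, assuming the same config.json keys used above (archive.host, archive.user, archive.base_path):

```python
import logging
import os

import paramiko
from scp import SCPClient


def push_to_archive(conf: dict, dump_dir: str, date_path: str) -> None:
    """Copy `dump_dir` (e.g. dumps/2024/08/14) to the configured archive host."""
    # Silence paramiko's INFO-level chatter.
    logging.getLogger('paramiko').setLevel(logging.WARNING)

    ssh = paramiko.SSHClient()
    ssh.load_system_host_keys()
    ssh.connect(conf['archive']['host'], username=conf['archive']['user'])

    dest = os.path.join(conf['archive']['base_path'], date_path)
    ssh.exec_command(f'mkdir -p {dest}')

    with SCPClient(ssh.get_transport()) as scp:
        scp.put(dump_dir, recursive=True, remote_path=dest)

    ssh.close()
```
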