DNS remodeling (#119)
* update url2domain to url2hostname

* remove iana root zone file and dns hierarchy from config file

* Atlas measurement targets are now hostnames

* update openintel crawlers to the new DNS model

* umbrella now ranks a mix of DomainName and HostName nodes and should be run after openintel.umbrella1m

* Add explanation for cloudflare DNS modeling

* lower umbrella crawler in config file

* update READMEs with the new DNS modeling (see the sketch after this list)

* add (:Service {name:'DNS'}) node and link it to authoritative name servers

* Nodes do not have reference properties

* Normalize IPv6 addresses

* Fix wrong crawler name

* Typos and formatting

* Remove infra_mx crawler since it does not do anything at the moment

* Update Cisco Umbrella crawler

- Batch create new nodes (happens more often than expected)
- Add logging output
- Do not use builtins as variable names

* Remove redundant set and parameters

* Remove Service node for now

We could not decide on a name, so we will deal with this later.
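
Overall, the remodeled graph can be sketched as follows (node and relationship names are taken from the updated READMEs and crawlers below; the concrete names and property values are illustrative only):

```Cypher
// Host names resolve to IP addresses
(:HostName {name: 'www.google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
// Host names that are also registered domain names are linked to the DomainName node
(:HostName {name: 'google.com'})-[:PART_OF]->(:DomainName {name: 'google.com'})
// Domain names are managed by authoritative name servers (which are host names)
(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})
// Rankings such as Cisco Umbrella may point to either DomainName or HostName nodes
(:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
```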

---------

Co-authored-by: Malte Tashiro <[email protected]>
romain-fontugne and m-appel authored Feb 7, 2024
1 parent f464e59 commit 838c8be
Showing 15 changed files with 183 additions and 89 deletions.
7 changes: 2 additions & 5 deletions config.json.example
@@ -63,19 +63,17 @@
"iyp.crawlers.peeringdb.ix",
"iyp.crawlers.cloudflare.top100",
"iyp.crawlers.tranco.top1M",
"iyp.crawlers.cisco.umbrella_top1M",
"iyp.crawlers.openintel.tranco1m",
"iyp.crawlers.openintel.umbrella1m",
"iyp.crawlers.openintel.infra_ns",
"iyp.crawlers.openintel.infra_mx",
"iyp.crawlers.cisco.umbrella_top1M",
"iyp.crawlers.citizenlab.urldb",
"iyp.crawlers.inetintel.as_org",
"iyp.crawlers.pch.daily_routing_snapshots_v4",
"iyp.crawlers.pch.daily_routing_snapshots_v6",
"iyp.crawlers.emileaben.as_names",
"iyp.crawlers.ripe.atlas_probes",
"iyp.crawlers.ripe.atlas_measurements",
"iyp.crawlers.iana.root_zone",
"iyp.crawlers.alice_lg.amsix",
"iyp.crawlers.alice_lg.bcix",
"iyp.crawlers.alice_lg.decix",
@@ -91,8 +89,7 @@
"iyp.post.ip2prefix",
"iyp.post.address_family",
"iyp.post.country_information",
"iyp.post.dns_hierarchy",
"iyp.post.url2domain"
"iyp.post.url2hostname"
]
}
}
8 changes: 5 additions & 3 deletions iyp/crawlers/cisco/README.md
@@ -1,17 +1,19 @@
# Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html

The popularity list contains most queried domains based on passive DNS usage across the Umbrella global network.
The popularity list contains most queried domains (ranging from TLDs to FQDNs)
based on passive DNS usage across the Umbrella global network.

IYP uses this data to create and annotate DomainName nodes.
IYP uses this data to create and annotate DomainName and HostName nodes.

## Graph representation

The rank of the domain is indicated by the `rank` property of the relationship.

```Cypher
(:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
(:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
```

## Dependence

This crawler is not depending on other crawlers.
This crawler depends on `openintel.umbrella1m`.
61 changes: 53 additions & 8 deletions iyp/crawlers/cisco/umbrella_top1M.py
@@ -6,6 +6,7 @@
from zipfile import ZipFile

import requests
import tldextract

from iyp import BaseCrawler, RequestStatusError

@@ -22,31 +23,75 @@ def run(self):

self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'})

sys.stderr.write('Downloading latest list...\n')
logging.info('Downloading latest list...')
req = requests.get(URL)
if req.status_code != 200:
raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')

links = []
domains = set()
# open zip file and read top list
with ZipFile(io.BytesIO(req.content)) as z:
with z.open('top-1m.csv') as list:
for i, row in enumerate(io.TextIOWrapper(list)):
with z.open('top-1m.csv') as top_list:
for i, row in enumerate(io.TextIOWrapper(top_list)):
row = row.rstrip()
rank, domain = row.split(',')

domains.add(domain)
links.append({'src_name': domain, 'dst_id': self.cisco_qid,
'props': [self.reference, {'rank': int(rank)}]})

name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains)
logging.info('Fetching DomainName/HostName nodes...')
domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name')
host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name')

# Umbrella mixes up domain and host names.
# By order of preferences we rank:
# 1) existing domain name
# 2) existing host name
# 3) do our best to figure out if it is a domain or host and create the
# corresponding node

new_domain_names = set()
new_host_names = set()
unprocessed_links = list()
processed_links = list()

logging.info('Building relationships...')
for link in links:
link['src_id'] = name_id[link['src_name']]
if link['src_name'] in domain_id:
link['src_id'] = domain_id[link['src_name']]
processed_links.append(link)
elif link['src_name'] in host_id:
link['src_id'] = host_id[link['src_name']]
processed_links.append(link)
else:
unprocessed_links.append(link)
ranked_thing = tldextract.extract(link['src_name'])
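# tldextract's registered_domain gives the registrable domain (e.g. 'google.com' for 'www.google.com');
# names equal to their registered domain become DomainName nodes, everything else HostName nodes.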
name = link['src_name']
if name == ranked_thing.registered_domain:
new_domain_names.add(name)
else:
new_host_names.add(name)

if new_domain_names:
logging.info(f'Pushing {len(new_domain_names)} additional DomainName nodes...')
domain_id.update(self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', new_domain_names, all=False))
if new_host_names:
logging.info(f'Pushing {len(new_host_names)} additional HostName nodes...')
host_id.update(self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', new_host_names, all=False))

for link in unprocessed_links:
if link['src_name'] in domain_id:
link['src_id'] = domain_id[link['src_name']]
elif link['src_name'] in host_id:
link['src_id'] = host_id[link['src_name']]
else:
logging.error(f'Missing DomainName/HostName node for name "{link["src_name"]}". Should not happen.')
continue
processed_links.append(link)

# Push all links to IYP
self.iyp.batch_add_links('RANK', links)
logging.info(f'Pushing {len(processed_links)} RANK relationships...')
self.iyp.batch_add_links('RANK', processed_links)


def main() -> None:
8 changes: 6 additions & 2 deletions iyp/crawlers/cloudflare/README.md
@@ -1,4 +1,4 @@
# Cloudflare Radar -- https://radar.cloudflare.com/

Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to
provide various datasets, including:
@@ -17,8 +17,12 @@ provide various datasets, including:
- [Top 100 ASes querying each of the 10,000 highest ranked domain
names](https://developers.cloudflare.com/api/operations/radar_get__top_ases): Same as above, but
fetch AS numbers instead.

All rankings are based on one week of data.
Cloudflare Radar's top locations and ASes are available for both domain names
and host names. The results likely account for all NS, A, and AAAA queries made to
Cloudflare's resolver. Since NS queries for host names make no sense, IYP links these
results to `DomainName` nodes.
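
For illustration only (a sketch; the `QUERIED_FROM` relationship name and the property names are assumptions, see the Graph representation section of this README for the authoritative form), such a link could look like:

```Cypher
(:DomainName {name: 'google.com'})-[:QUERIED_FROM {value: 4.5}]->(:Country {country_code: 'US'})
```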

## Graph representation

5 changes: 5 additions & 0 deletions iyp/crawlers/cloudflare/dns_top_ases.py
@@ -1,3 +1,8 @@
# Cloudflare Radar's top locations and ASes are available for both domain names
# and host names. The results likely account for all NS, A, and AAAA queries made to
# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
# more intuitive to link these results to DomainName nodes.

import argparse
import logging
import os
5 changes: 5 additions & 0 deletions iyp/crawlers/cloudflare/dns_top_locations.py
@@ -1,3 +1,8 @@
# Cloudflare Radar's top locations and ASes are available for both domain names
# and host names. The results likely account for all NS, A, and AAAA queries made to
# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
# more intuitive to link these results to DomainName nodes.

import argparse
import glob
import json
3 changes: 3 additions & 0 deletions iyp/crawlers/cloudflare/ranking_bucket.py
@@ -24,6 +24,9 @@
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp and setup a dictionary with the
# org/url/today's date in self.reference
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/

def run(self):
"""Fetch data and push to IYP."""
3 changes: 3 additions & 0 deletions iyp/crawlers/cloudflare/top100.py
@@ -21,6 +21,9 @@
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp
# and setup a dictionary with the org/url/today's date in self.reference
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/

def run(self):
"""Fetch data and push to IYP."""
20 changes: 14 additions & 6 deletions iyp/crawlers/openintel/README.md
@@ -4,26 +4,34 @@ The OpenINTEL measurement platform captures daily snapshots of the state of larg
global Domain Name System (DNS) by running a number of forward and reverse DNS measurements.

While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for
the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella
top 1 million list since it combines rankings.
IYP also gets the list of authoritative name servers seen by OpenINTEL.

IYP uses only `A` queries to add IP resolution for DomainName and AuthoritativeNameServer nodes.

A crawler of mail servers is also implemented but not used as it creates a very large number
of links and this dataset is currently not requested/needed by anyone.

## Graph representation

IP resolution for popular domain names:
IP resolution for popular host names:

```Cypher
(:DomainName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
(:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
```

IP resolution of authoritative name servers:

```Cypher
(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
(:IP {ip: '216.239.32.10'})-[:SERVE]->(:Service {name: 'DNS'})
```

Domain names managed by name servers:

```Cypher
(:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})
```
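
Host names that also appear as domain names are additionally connected by `PART_OF` links (created by the crawler in `__init__.py` below):

```Cypher
(:HostName {name: 'google.com'})-[:PART_OF]->(:DomainName {name: 'google.com'})
```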

## Dependence

This crawler does not depend on other crawlers.
95 changes: 56 additions & 39 deletions iyp/crawlers/openintel/__init__.py
@@ -7,6 +7,7 @@
import logging
import os
import tempfile
from ipaddress import IPv6Address

import arrow
import boto3
@@ -42,17 +43,11 @@ def valid_date(s):


class OpenIntelCrawler(BaseCrawler):
def __init__(self, organization, url, name, dataset, additional_domain_type=str()):
def __init__(self, organization, url, name, dataset):
"""Initialization of the OpenIntel crawler requires the name of the dataset
(e.g. tranco or infra:ns).
If the dataset contains special types of domain
names, an additional label can be specified (e.g., `AuthoritativeNameServer`)
that will be attached to the `DomainName` nodes.
"""
(e.g. tranco or infra:ns)."""

self.dataset = dataset
self.additional_domain_type = additional_domain_type
super().__init__(organization, url, name)

def get_parquet(self):
@@ -179,52 +174,74 @@ def run(self):

print(f'Read {len(df)} unique records from {len(self.pandas_df_list)} Parquet file(s).')

# Only domain names from the `query_name` column that will receive the
# additional_domain_type label (if present).
query_domain_names = set(df['query_name'])
# Name server domain names.
ns_domain_names = set(df[df.ns_address.notnull()]['ns_address'])
# All domain names, including the ones from the name server column.
all_domain_names = query_domain_names.union(ns_domain_names)
# Create all DomainName nodes.
domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', all_domain_names)
# Get node IDs for NS nodes and add NS label.
ns_id = {name: domain_id[name] for name in ns_domain_names}
# query_names for NS records are domain names
domain_names = set(df[df.response_type == 'NS']['query_name'])

# response values of NS records are name servers
name_servers = set(df[df.ns_address.notnull()]['ns_address'])

# query_names for A and AAAA records are host names
host_names = set(df[(df.response_type == 'A') | (df.response_type == 'AAAA')]['query_name'])

ipv6_addresses = set()
# Normalize IPv6 addresses.
for ip in df[df.ip6_address.notnull()]['ip6_address']:
try:
ip_normalized = IPv6Address(ip).compressed
except ValueError as e:
logging.error(f'Ignoring invalid IPv6 address "{ip}": {e}')
continue
ipv6_addresses.add(ip_normalized)

# Get/create all nodes:
domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domain_names)
host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', host_names)
ns_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', name_servers)
self.iyp.batch_add_node_label(list(ns_id.values()), 'AuthoritativeNameServer')
# Add additional node label if present.
additional_id = set()
if self.additional_domain_type and self.additional_domain_type != 'DomainName':
additional_id = {domain_id[name] for name in query_domain_names}
self.iyp.batch_add_node_label(list(additional_id), self.additional_domain_type)
ip4_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip4_address.notnull()]['ip4_address']))
ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip6_address.notnull()]['ip6_address']))
ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ipv6_addresses)

print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(host_id)} hosts, {len(ip4_id)} IPv4, '
f'{len(ip6_id)} IPv6')

# Compute links
res_links = []
mng_links = []
partof_links = []

print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(ip4_id)} IPv4, {len(ip6_id)} IPv6')
if self.additional_domain_type:
print(f'Added "{self.additional_domain_type}" label to {len(additional_id)} nodes.')

# RESOLVES_TO and MANAGED_BY links
for row in df.itertuples():
domain_qid = domain_id[row.query_name]

# NS Record
if row.response_type == 'NS' and row.ns_address:
domain_qid = domain_id[row.query_name]
ns_qid = ns_id[row.ns_address]
mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]})

# A Record
if row.response_type == 'A' and row.ip4_address:
elif row.response_type == 'A' and row.ip4_address:
host_qid = host_id[row.query_name]
ip_qid = ip4_id[row.ip4_address]
res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]})
res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]})

# AAAA Record
elif row.response_type == 'AAAA' and row.ip6_address:
ip_qid = ip6_id[row.ip6_address]
res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]})

# NS Record
elif row.response_type == 'NS' and row.ns_address:
ns_qid = ns_id[row.ns_address]
mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]})
try:
ip_normalized = IPv6Address(row.ip6_address).compressed
except ValueError:
# Error message was already logged above.
continue
host_qid = host_id[row.query_name]
ip_qid = ip6_id[ip_normalized]
res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]})

# PART_OF links between HostNames and DomainNames
for hd in host_names.intersection(domain_names):
partof_links.append({'src_id': host_id[hd], 'dst_id': domain_id[hd], 'props': [self.reference]})

print(f'Computed {len(res_links)} RESOLVES_TO links and {len(mng_links)} MANAGED_BY links')

# Push all links to IYP
self.iyp.batch_add_links('RESOLVES_TO', res_links)
self.iyp.batch_add_links('MANAGED_BY', mng_links)
self.iyp.batch_add_links('PART_OF', partof_links)
6 changes: 6 additions & 0 deletions iyp/crawlers/openintel/infra_mx.py
@@ -19,6 +19,12 @@ def __init__(self, organization, url, name):


def main() -> None:

############################################
# This crawler is not working; the NODE_TYPE argument has been deprecated
############################################
return

parser = argparse.ArgumentParser()
parser.add_argument('--unit-test', action='store_true')
args = parser.parse_args()