From 838c8bed6936fa473d6510bb18c9f89f48ab177b Mon Sep 17 00:00:00 2001
From: Romain
Date: Wed, 7 Feb 2024 16:55:53 +0900
Subject: [PATCH] DNS remodeling (#119)

* update url2domain to url2hostname

* remove iana root zone file and dns hierarchy from config file

* Atlas measurement targets are now hostnames

* update openintel crawlers to the new DNS model

* umbrella now ranks a mix of DomainName and HostName nodes and should be run
  after openintel.umbrella1m

* Add explanation for cloudflare DNS modeling

* lower umbrella crawler in config file

* update READMEs with the new DNS modeling

* add (:Service {name:'DNS'}) node and link it to authoritative name servers

* Nodes do not have reference properties

* Normalize IPv6 addresses

* Fix wrong crawler name

* Typos and formatting

* Remove infra_mx crawler since it does not do anything at the moment

* Update Cisco Umbrella crawler

  - Batch create new nodes (happens more often than expected)
  - Add logging output
  - Do not use builtins as variable names

* Remove redundant set and parameters

* Remove Service node for now

  We could not decide on a name, so we will deal with this later.

---------

Co-authored-by: Malte Tashiro
---
 config.json.example                          |  7 +-
 iyp/crawlers/cisco/README.md                 |  8 +-
 iyp/crawlers/cisco/umbrella_top1M.py         | 61 +++++++++++--
 iyp/crawlers/cloudflare/README.md            |  8 +-
 iyp/crawlers/cloudflare/dns_top_ases.py      |  5 ++
 iyp/crawlers/cloudflare/dns_top_locations.py |  5 ++
 iyp/crawlers/cloudflare/ranking_bucket.py    |  3 +
 iyp/crawlers/cloudflare/top100.py            |  3 +
 iyp/crawlers/openintel/README.md             | 20 +++--
 iyp/crawlers/openintel/__init__.py           | 95 ++++++++++++--------
 iyp/crawlers/openintel/infra_mx.py           |  6 ++
 iyp/crawlers/openintel/infra_ns.py           |  3 +-
 iyp/crawlers/ripe/README.md                  |  4 +-
 iyp/crawlers/ripe/atlas_measurements.py      | 26 +++---
 iyp/post/{url2domain.py => url2hostname.py}  | 18 ++--
 15 files changed, 183 insertions(+), 89 deletions(-)
 rename iyp/post/{url2domain.py => url2hostname.py} (68%)

diff --git a/config.json.example b/config.json.example
index abace94..81134ca 100644
--- a/config.json.example
+++ b/config.json.example
@@ -63,11 +63,10 @@
         "iyp.crawlers.peeringdb.ix",
         "iyp.crawlers.cloudflare.top100",
         "iyp.crawlers.tranco.top1M",
-        "iyp.crawlers.cisco.umbrella_top1M",
         "iyp.crawlers.openintel.tranco1m",
         "iyp.crawlers.openintel.umbrella1m",
         "iyp.crawlers.openintel.infra_ns",
-        "iyp.crawlers.openintel.infra_mx",
+        "iyp.crawlers.cisco.umbrella_top1M",
         "iyp.crawlers.citizenlab.urldb",
         "iyp.crawlers.inetintel.as_org",
         "iyp.crawlers.pch.daily_routing_snapshots_v4",
@@ -75,7 +74,6 @@
         "iyp.crawlers.emileaben.as_names",
         "iyp.crawlers.ripe.atlas_probes",
         "iyp.crawlers.ripe.atlas_measurements",
-        "iyp.crawlers.iana.root_zone",
         "iyp.crawlers.alice_lg.amsix",
         "iyp.crawlers.alice_lg.bcix",
         "iyp.crawlers.alice_lg.decix",
@@ -91,8 +89,7 @@
         "iyp.post.ip2prefix",
         "iyp.post.address_family",
         "iyp.post.country_information",
-        "iyp.post.dns_hierarchy",
-        "iyp.post.url2domain"
+        "iyp.post.url2hostname"
     ]
   }
 }
diff --git a/iyp/crawlers/cisco/README.md b/iyp/crawlers/cisco/README.md
index 24d1811..839402e 100644
--- a/iyp/crawlers/cisco/README.md
+++ b/iyp/crawlers/cisco/README.md
@@ -1,8 +1,9 @@
 # Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html
 
-The popularity list contains most queried domains based on passive DNS usage across the Umbrella global network.
+The popularity list contains the most queried domains (ranging from TLDs to FQDNs)
+based on passive DNS usage across the Umbrella global network.
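+For example, the list may rank both `google.com` and `www.google.com`.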
-IYP uses this data to create and annotate DomainName nodes.
+IYP uses this data to create and annotate DomainName and HostName nodes.
 
 ## Graph representation
 
 The rank of the domain is indicated by the `rank` property of the relationship.
 
 ```Cypher
 (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
+(:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
 ```
 
 ## Dependence
 
-This crawler is not depending on other crawlers.
+This crawler depends on `openintel.umbrella1m`.
diff --git a/iyp/crawlers/cisco/umbrella_top1M.py b/iyp/crawlers/cisco/umbrella_top1M.py
index 629f15b..714681b 100644
--- a/iyp/crawlers/cisco/umbrella_top1M.py
+++ b/iyp/crawlers/cisco/umbrella_top1M.py
@@ -6,6 +6,7 @@
 from zipfile import ZipFile
 
 import requests
+import tldextract
 
 from iyp import BaseCrawler, RequestStatusError
 
@@ -22,31 +23,75 @@ def run(self):
 
         self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'})
 
-        sys.stderr.write('Downloading latest list...\n')
+        logging.info('Downloading latest list...')
         req = requests.get(URL)
         if req.status_code != 200:
             raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')
 
         links = []
-        domains = set()
         # open zip file and read top list
         with ZipFile(io.BytesIO(req.content)) as z:
-            with z.open('top-1m.csv') as list:
-                for i, row in enumerate(io.TextIOWrapper(list)):
+            with z.open('top-1m.csv') as top_list:
+                for i, row in enumerate(io.TextIOWrapper(top_list)):
                     row = row.rstrip()
                     rank, domain = row.split(',')
-                    domains.add(domain)
                     links.append({'src_name': domain, 'dst_id': self.cisco_qid,
                                   'props': [self.reference, {'rank': int(rank)}]})
 
-        name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains)
+        logging.info('Fetching DomainName/HostName nodes...')
+        domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name')
+        host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name')
 
+        # Umbrella mixes up domain and host names.
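+        # E.g., the list contains both 'google.com' (a registered domain) and
+        # 'www.google.com' (a host name).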
+        # By order of preference we rank:
+        # 1) existing domain name
+        # 2) existing host name
+        # 3) do our best to figure out if it is a domain or host and create the
+        #    corresponding node
+
+        new_domain_names = set()
+        new_host_names = set()
+        unprocessed_links = list()
+        processed_links = list()
+
+        logging.info('Building relationships...')
         for link in links:
-            link['src_id'] = name_id[link['src_name']]
+            if link['src_name'] in domain_id:
+                link['src_id'] = domain_id[link['src_name']]
+                processed_links.append(link)
+            elif link['src_name'] in host_id:
+                link['src_id'] = host_id[link['src_name']]
+                processed_links.append(link)
+            else:
+                unprocessed_links.append(link)
+                ranked_thing = tldextract.extract(link['src_name'])
+                name = link['src_name']
+                if name == ranked_thing.registered_domain:
+                    new_domain_names.add(name)
+                else:
+                    new_host_names.add(name)
+
+        if new_domain_names:
+            logging.info(f'Pushing {len(new_domain_names)} additional DomainName nodes...')
+            domain_id.update(self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', new_domain_names, all=False))
+        if new_host_names:
+            logging.info(f'Pushing {len(new_host_names)} additional HostName nodes...')
+            host_id.update(self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', new_host_names, all=False))
+
+        for link in unprocessed_links:
+            if link['src_name'] in domain_id:
+                link['src_id'] = domain_id[link['src_name']]
+            elif link['src_name'] in host_id:
+                link['src_id'] = host_id[link['src_name']]
+            else:
+                logging.error(f'Missing DomainName/HostName node for name "{link["src_name"]}". Should not happen.')
+                continue
+            processed_links.append(link)
 
         # Push all links to IYP
-        self.iyp.batch_add_links('RANK', links)
+        logging.info(f'Pushing {len(processed_links)} RANK relationships...')
+        self.iyp.batch_add_links('RANK', processed_links)
 
 
 def main() -> None:
diff --git a/iyp/crawlers/cloudflare/README.md b/iyp/crawlers/cloudflare/README.md
index 4ba886f..7ce4aee 100644
--- a/iyp/crawlers/cloudflare/README.md
+++ b/iyp/crawlers/cloudflare/README.md
@@ -1,4 +1,4 @@
-# Cloudflare Radar -- https://radar.cloudflare.com/ 
+# Cloudflare Radar -- https://radar.cloudflare.com/
 
 Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to
 provide various datasets, including:
@@ -17,8 +17,12 @@ provide various datasets, including:
 - [Top 100 ASes querying each of the 10,000 highest ranked domain
   names](https://developers.cloudflare.com/api/operations/radar_get__top_ases): Same as above, but
   fetch AS numbers instead.
-  
+
 All rankings are based on one week of data.
+Cloudflare Radar's top locations and ASes are available for both domain names
+and host names. The results likely account for all NS, A, and AAAA queries made to
+Cloudflare's resolver. Since NS queries for host names make no sense, IYP links these
+results to `DomainName` nodes.
 
 ## Graph representation
 
diff --git a/iyp/crawlers/cloudflare/dns_top_ases.py b/iyp/crawlers/cloudflare/dns_top_ases.py
index f24c952..8c15ac0 100644
--- a/iyp/crawlers/cloudflare/dns_top_ases.py
+++ b/iyp/crawlers/cloudflare/dns_top_ases.py
@@ -1,3 +1,8 @@
+# Cloudflare Radar's top locations and ASes are available for both domain names
+# and host names. The results likely account for all NS, A, and AAAA queries made to
+# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
+# more intuitive to link these results to DomainName nodes.
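+# I.e., even when the queried name is a host name, the results are attached to a
+# DomainName node with that name.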
+
 import argparse
 import logging
 import os
diff --git a/iyp/crawlers/cloudflare/dns_top_locations.py b/iyp/crawlers/cloudflare/dns_top_locations.py
index 4f4118f..46e46a0 100644
--- a/iyp/crawlers/cloudflare/dns_top_locations.py
+++ b/iyp/crawlers/cloudflare/dns_top_locations.py
@@ -1,3 +1,8 @@
+# Cloudflare Radar's top locations and ASes are available for both domain names
+# and host names. The results likely account for all NS, A, and AAAA queries made to
+# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
+# more intuitive to link these results to DomainName nodes.
+
 import argparse
 import glob
 import json
diff --git a/iyp/crawlers/cloudflare/ranking_bucket.py b/iyp/crawlers/cloudflare/ranking_bucket.py
index 7193ad6..fbea3f0 100644
--- a/iyp/crawlers/cloudflare/ranking_bucket.py
+++ b/iyp/crawlers/cloudflare/ranking_bucket.py
@@ -24,6 +24,9 @@ class Crawler(BaseCrawler):
 
     # Base Crawler provides access to IYP via self.iyp and setup a dictionary with the
     # org/url/today's date in self.reference
+    #
+    # Cloudflare ranks second and third level domain names (not host names).
+    # See https://blog.cloudflare.com/radar-domain-rankings/
 
     def run(self):
         """Fetch data and push to IYP."""
diff --git a/iyp/crawlers/cloudflare/top100.py b/iyp/crawlers/cloudflare/top100.py
index 5e2f5fb..d189da8 100644
--- a/iyp/crawlers/cloudflare/top100.py
+++ b/iyp/crawlers/cloudflare/top100.py
@@ -21,6 +21,9 @@ class Crawler(BaseCrawler):
 
     # Base Crawler provides access to IYP via self.iyp
     # and setup a dictionary with the org/url/today's date in self.reference
+    #
+    # Cloudflare ranks second and third level domain names (not host names).
+    # See https://blog.cloudflare.com/radar-domain-rankings/
 
     def run(self):
         """Fetch data and push to IYP."""
diff --git a/iyp/crawlers/openintel/README.md b/iyp/crawlers/openintel/README.md
index 70cf9c5..0f3ec5f 100644
--- a/iyp/crawlers/openintel/README.md
+++ b/iyp/crawlers/openintel/README.md
@@ -4,26 +4,34 @@
 The OpenINTEL measurement platform captures daily snapshots of the state of large parts of the
 global Domain Name System (DNS) by running a number of forward and reverse DNS measurements.
 
 While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for
-the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella 
+the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella
 top 1 million list since it combines rankings.
 
 IYP also gets the list of authoritative name servers seen by OpenINTEL.
 
-IYP uses only `A` queries to add IP resolution for DomainName and AuthoritativeNameServer nodes.
-
 A crawler of mail servers is also implemented but not used as it creates a very large number of
 links and this dataset is currently not requested/needed by anyone.
 
 ## Graph representation
 
-IP resolution for popular domain names:
+IP resolution for popular host names:
+
 ```Cypher
-(:DomainName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
+(:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
 ```
 
 IP resolution of authoritative name servers:
+
+```Cypher
+(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
+```
+
+Domain names managed by name servers:
+
 ```Cypher
-(:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
+(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})
 ```
 
+
 ## Dependence
 
 This crawler is not depending on other crawlers.
diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py
index 75b3c05..d255205 100644
--- a/iyp/crawlers/openintel/__init__.py
+++ b/iyp/crawlers/openintel/__init__.py
@@ -7,6 +7,7 @@
 import logging
 import os
 import tempfile
+from ipaddress import IPv6Address
 
 import arrow
 import boto3
@@ -42,17 +43,11 @@ def valid_date(s):
 
 class OpenIntelCrawler(BaseCrawler):
 
-    def __init__(self, organization, url, name, dataset, additional_domain_type=str()):
+    def __init__(self, organization, url, name, dataset):
         """Initialization of the OpenIntel crawler requires the name of the dataset
-        (e.g. tranco or infra:ns).
-
-        If the dataset contains special types of domain
-        names, an additional label can be specified (e.g., `AuthoritativeNameServer`)
-        that will be attached to the `DomainName` nodes.
-        """
+        (e.g. tranco or infra:ns)."""
 
         self.dataset = dataset
-        self.additional_domain_type = additional_domain_type
         super().__init__(organization, url, name)
 
     def get_parquet(self):
@@ -179,52 +174,74 @@ def run(self):
 
         print(f'Read {len(df)} unique records from {len(self.pandas_df_list)} Parquet file(s).')
 
-        # Only domain names from the `query_name` column that will receive the
-        # additional_domain_type label (if present).
-        query_domain_names = set(df['query_name'])
-        # Name server domain names.
-        ns_domain_names = set(df[df.ns_address.notnull()]['ns_address'])
-        # All domain names, including the ones from the name server column.
-        all_domain_names = query_domain_names.union(ns_domain_names)
-        # Create all DomainName nodes.
-        domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', all_domain_names)
-        # Get node IDs for NS nodes and add NS label.
-        ns_id = {name: domain_id[name] for name in ns_domain_names}
+        # query_names for NS records are domain names
+        domain_names = set(df[df.response_type == 'NS']['query_name'])
+
+        # response values of NS records are name servers
+        name_servers = set(df[df.ns_address.notnull()]['ns_address'])
+
+        # query_names for A and AAAA records are host names
+        host_names = set(df[(df.response_type == 'A') | (df.response_type == 'AAAA')]['query_name'])
+
+        ipv6_addresses = set()
+        # Normalize IPv6 addresses.
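+        # E.g., '2001:0db8:0000:0000:0000:0000:0000:0001' becomes '2001:db8::1'.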
+        for ip in df[df.ip6_address.notnull()]['ip6_address']:
+            try:
+                ip_normalized = IPv6Address(ip).compressed
+            except ValueError as e:
+                logging.error(f'Ignoring invalid IPv6 address "{ip}": {e}')
+                continue
+            ipv6_addresses.add(ip_normalized)
+
+        # Get/create all nodes:
+        domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domain_names)
+        host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', host_names)
+        ns_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', name_servers)
         self.iyp.batch_add_node_label(list(ns_id.values()), 'AuthoritativeNameServer')
-        # Add additional node label if present.
-        additional_id = set()
-        if self.additional_domain_type and self.additional_domain_type != 'DomainName':
-            additional_id = {domain_id[name] for name in query_domain_names}
-            self.iyp.batch_add_node_label(list(additional_id), self.additional_domain_type)
         ip4_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip4_address.notnull()]['ip4_address']))
-        ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip6_address.notnull()]['ip6_address']))
+        ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ipv6_addresses)
+
+        print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(host_id)} hosts, {len(ip4_id)} IPv4, '
+              f'{len(ip6_id)} IPv6')
+
+        # Compute links
         res_links = []
         mng_links = []
+        partof_links = []
 
-        print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(ip4_id)} IPv4, {len(ip6_id)} IPv6')
-        if self.additional_domain_type:
-            print(f'Added "{self.additional_domain_type}" label to {len(additional_id)} nodes.')
-
+        # RESOLVES_TO and MANAGED_BY links
         for row in df.itertuples():
-            domain_qid = domain_id[row.query_name]
 
+            # NS Record
+            if row.response_type == 'NS' and row.ns_address:
+                domain_qid = domain_id[row.query_name]
+                ns_qid = ns_id[row.ns_address]
+                mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]})
 
             # A Record
-            if row.response_type == 'A' and row.ip4_address:
+            elif row.response_type == 'A' and row.ip4_address:
+                host_qid = host_id[row.query_name]
                 ip_qid = ip4_id[row.ip4_address]
-                res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]})
+                res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]})
 
             # AAAA Record
             elif row.response_type == 'AAAA' and row.ip6_address:
-                ip_qid = ip6_id[row.ip6_address]
-                res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]})
-
-            # NS Record
-            elif row.response_type == 'NS' and row.ns_address:
-                ns_qid = ns_id[row.ns_address]
-                mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]})
+                try:
+                    ip_normalized = IPv6Address(row.ip6_address).compressed
+                except ValueError:
+                    # Error message was already logged above.
+                    continue
+                host_qid = host_id[row.query_name]
+                ip_qid = ip6_id[ip_normalized]
+                res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]})
+
+        # PART_OF links between HostNames and DomainNames
+        for hd in host_names.intersection(domain_names):
+            partof_links.append({'src_id': host_id[hd], 'dst_id': domain_id[hd], 'props': [self.reference]})
 
         print(f'Computed {len(res_links)} RESOLVES_TO links and {len(mng_links)} MANAGED_BY links')
 
         # Push all links to IYP
         self.iyp.batch_add_links('RESOLVES_TO', res_links)
         self.iyp.batch_add_links('MANAGED_BY', mng_links)
+        self.iyp.batch_add_links('PART_OF', partof_links)
diff --git a/iyp/crawlers/openintel/infra_mx.py b/iyp/crawlers/openintel/infra_mx.py
index 24266d0..3841c77 100644
--- a/iyp/crawlers/openintel/infra_mx.py
+++ b/iyp/crawlers/openintel/infra_mx.py
@@ -19,6 +19,12 @@ def __init__(self, organization, url, name):
 
 
 def main() -> None:
+
+    ############################################
+    # This crawler is not working; the NODE_TYPE argument has been deprecated.
+    ############################################
+    return
+
     parser = argparse.ArgumentParser()
     parser.add_argument('--unit-test', action='store_true')
     args = parser.parse_args()
diff --git a/iyp/crawlers/openintel/infra_ns.py b/iyp/crawlers/openintel/infra_ns.py
index 776f1aa..a4395c6 100644
--- a/iyp/crawlers/openintel/infra_ns.py
+++ b/iyp/crawlers/openintel/infra_ns.py
@@ -10,12 +10,11 @@
 
 NAME = 'openintel.infra_ns'
 DATASET = 'infra:ns'
-NODE_TYPE = 'AuthoritativeNameServer'
 
 
 class Crawler(OpenIntelCrawler):
 
     def __init__(self, organization, url, name):
-        super().__init__(organization, url, name, DATASET, NODE_TYPE)
+        super().__init__(organization, url, name, DATASET)
 
 
 def main() -> None:
diff --git a/iyp/crawlers/ripe/README.md b/iyp/crawlers/ripe/README.md
index f56196b..ce54b07 100644
--- a/iyp/crawlers/ripe/README.md
+++ b/iyp/crawlers/ripe/README.md
@@ -52,7 +52,7 @@ ASN(s), and country.
 We fetch the [list of
 measurements](https://atlas.ripe.net/docs/apis/rest-api-manual/measurements/) to obtain metadata of
 *ongoing* Atlas measurements. `AtlasProbe`s are `PART_OF`
-`AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `DomainName`, or
+`AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `HostName`, or
 both. The Atlas platform also maps the measurement target to an `AS` number if possible. The
 crawler includes this relationship as well.
 
 never connected or are abandoned.
 
 ```Cypher
 (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:AS {asn: 2497})
-(:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:DomainName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'})
+(:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:HostName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'})
 (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:IP {ip: '202.214.87.158'})
 ```
 
diff --git a/iyp/crawlers/ripe/atlas_measurements.py b/iyp/crawlers/ripe/atlas_measurements.py
index d0f9aac..79770f0 100644
--- a/iyp/crawlers/ripe/atlas_measurements.py
+++ b/iyp/crawlers/ripe/atlas_measurements.py
@@ -73,7 +73,7 @@ def __transform_data(data):
             # on the '_' delimiter, this action would potentially
             # cause a TypeError from flatdict if it isn't handled properly.
             target_info = {
-                'domain': item.pop('target', None),
+                'hostname': item.pop('target', None),
                 'asn': item.pop('target_asn', None),
                 'ip': item.pop('target_ip', None),
                 'prefix': item.pop('target_prefix', None),
@@ -148,7 +148,7 @@ def run(self):
         probe_ids = set()
         ips = set()
         ases = set()
-        domains = set()
+        hostnames = set()
 
         valid_probe_measurements = list()
 
@@ -171,16 +171,16 @@ def run(self):
                     probe_af = int(probe_measurement['af'])
                     resolved_ips[i] = ipaddress.ip_address(resolved_ips[i]).compressed if probe_af == 6 else resolved_ips[i]
 
-            domain = probe_measurement['target']['domain']
-            if domain == '' or self.__is_valid_ip(domain):
-                domain = None
-                probe_measurement['target']['domain'] = None
+            hostname = probe_measurement['target']['hostname']
+            if hostname == '' or self.__is_valid_ip(hostname):
+                hostname = None
+                probe_measurement['target']['hostname'] = None
 
             asn = probe_measurement['target']['asn']
             probe_ids_participated = probe_measurement['current_probes']
 
             self.__add_if_not_none(probe_measurement_id, probe_measurement_ids)
-            self.__add_if_not_none(domain, domains)
+            self.__add_if_not_none(hostname, hostnames)
             self.__add_if_not_none(asn, ases)
             self.__add_if_not_none_values(resolved_ips, ips)
             self.__add_if_not_none_values(probe_ids_participated, probe_ids)
@@ -204,8 +204,8 @@ def run(self):
         probe_ids = self.iyp.batch_get_nodes_by_single_prop('AtlasProbe', 'id', probe_ids, all=False, create=True)
         logging.info(f'{len(ips)} IPs')
         ip_ids = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ips, all=False, create=True)
-        logging.info(f'{len(domains)} domains')
-        domain_ids = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains, all=False, create=True)
+        logging.info(f'{len(hostnames)} hostnames')
+        hostname_ids = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', hostnames, all=False, create=True)
         logging.info(f'{len(ases)} ASNs')
         asn_ids = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', ases, all=False, create=True)
 
@@ -226,10 +226,10 @@ def run(self):
                 target_links.append({'src_id': probe_measurement_qid, 'dst_id': asn_qid,
                                      'props': [probe_measurement_reference]})
 
-            probe_measurement_domain = probe_measurement['target']['domain']
-            if probe_measurement_domain:
-                domain_qid = domain_ids[probe_measurement_domain]
-                target_links.append({'src_id': probe_measurement_qid, 'dst_id': domain_qid,
+            probe_measurement_hostname = probe_measurement['target']['hostname']
+            if probe_measurement_hostname:
+                hostname_qid = hostname_ids[probe_measurement_hostname]
+                target_links.append({'src_id': probe_measurement_qid, 'dst_id': hostname_qid,
                                      'props': [probe_measurement_reference]})
 
             probe_measurement_ips = self.__get_all_resolved_ips(probe_measurement)
diff --git a/iyp/post/url2domain.py b/iyp/post/url2hostname.py
similarity index 68%
rename from iyp/post/url2domain.py
rename to iyp/post/url2hostname.py
index 6a89f94..6429a3f 100644
--- a/iyp/post/url2domain.py
+++ b/iyp/post/url2hostname.py
@@ -8,27 +8,27 @@ class PostProcess(BasePostProcess):
     def run(self):
-        """Link URLs and their corresponding DomainNames."""
+        """Link URLs and their corresponding HostNames."""
 
         # Get all URL nodes.
         url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url')
 
-        # Get all DomainName Nodes
-        domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name')
+        # Get all HostName nodes
+        hostname_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name')
 
         # Compute links
         links = []
         for url, url_qid in url_id.items():
-            # Extract domain name from URL
-            domain = tldextract.extract(url).registered_domain
+            # Extract host name from URL
+            hostname = tldextract.extract(url).fqdn
 
-            # Get DomainName node for the domain
-            domain_qid = domain_id.get(domain)
+            # Get HostName node for the fqdn of the URL
+            hostname_qid = hostname_id.get(hostname)
 
-            if domain_qid is not None:
+            if hostname_qid is not None:
                 links.append({
                     'src_id': url_qid,
-                    'dst_id': domain_qid,
+                    'dst_id': hostname_qid,
                     'props': [self.reference]
                 })