From 838c8bed6936fa473d6510bb18c9f89f48ab177b Mon Sep 17 00:00:00 2001
From: Romain
Date: Wed, 7 Feb 2024 16:55:53 +0900
Subject: [PATCH] DNS remodeling (#119)

* update url2domain to url2hostname

* remove iana root zone file and dns hierarchy from config file

* Atlas measurement targets are now hostnames

* update openintel crawlers to the new DNS model

* umbrella now ranks a mix of DomainName and HostName nodes and should be run
  after openintel.umbrella1m

* Add explanation for cloudflare DNS modeling

* lower umbrella crawler in config file

* update READMEs with the new DNS modeling

* add (:Service {name:'DNS'}) node and link it to authoritative name servers

* Nodes do not have reference properties

* Normalize IPv6 addresses

* Fix wrong crawler name

* Typos and formatting

* Remove infra_mx crawler since it does not do anything at the moment

* Update Cisco Umbrella crawler

  - Batch create new nodes (happens more often than expected)
  - Add logging output
  - Do not use builtins as variable names

* Remove redundant set and parameters

* Remove Service node for now

  We could not decide on a name, so we will deal with this later.

---------

Co-authored-by: Malte Tashiro
---
 config.json.example                          |  7 +-
 iyp/crawlers/cisco/README.md                 |  8 +-
 iyp/crawlers/cisco/umbrella_top1M.py         | 61 +++++++++++--
 iyp/crawlers/cloudflare/README.md            |  8 +-
 iyp/crawlers/cloudflare/dns_top_ases.py      |  5 ++
 iyp/crawlers/cloudflare/dns_top_locations.py |  5 ++
 iyp/crawlers/cloudflare/ranking_bucket.py    |  3 +
 iyp/crawlers/cloudflare/top100.py            |  3 +
 iyp/crawlers/openintel/README.md             | 20 +++--
 iyp/crawlers/openintel/__init__.py           | 95 ++++++++++++--------
 iyp/crawlers/openintel/infra_mx.py           |  6 ++
 iyp/crawlers/openintel/infra_ns.py           |  3 +-
 iyp/crawlers/ripe/README.md                  |  4 +-
 iyp/crawlers/ripe/atlas_measurements.py      | 26 +++---
 iyp/post/{url2domain.py => url2hostname.py}  | 18 ++--
 15 files changed, 183 insertions(+), 89 deletions(-)
 rename iyp/post/{url2domain.py => url2hostname.py} (68%)

diff --git a/config.json.example b/config.json.example
index abace94..81134ca 100644
--- a/config.json.example
+++ b/config.json.example
@@ -63,11 +63,10 @@
         "iyp.crawlers.peeringdb.ix",
         "iyp.crawlers.cloudflare.top100",
         "iyp.crawlers.tranco.top1M",
-        "iyp.crawlers.cisco.umbrella_top1M",
         "iyp.crawlers.openintel.tranco1m",
         "iyp.crawlers.openintel.umbrella1m",
         "iyp.crawlers.openintel.infra_ns",
-        "iyp.crawlers.openintel.infra_mx",
+        "iyp.crawlers.cisco.umbrella_top1M",
         "iyp.crawlers.citizenlab.urldb",
         "iyp.crawlers.inetintel.as_org",
         "iyp.crawlers.pch.daily_routing_snapshots_v4",
@@ -75,7 +74,6 @@
         "iyp.crawlers.emileaben.as_names",
         "iyp.crawlers.ripe.atlas_probes",
         "iyp.crawlers.ripe.atlas_measurements",
-        "iyp.crawlers.iana.root_zone",
         "iyp.crawlers.alice_lg.amsix",
         "iyp.crawlers.alice_lg.bcix",
         "iyp.crawlers.alice_lg.decix",
@@ -91,8 +89,7 @@
         "iyp.post.ip2prefix",
         "iyp.post.address_family",
         "iyp.post.country_information",
-        "iyp.post.dns_hierarchy",
-        "iyp.post.url2domain"
+        "iyp.post.url2hostname"
     ]
   }
 }
diff --git a/iyp/crawlers/cisco/README.md b/iyp/crawlers/cisco/README.md
index 24d1811..839402e 100644
--- a/iyp/crawlers/cisco/README.md
+++ b/iyp/crawlers/cisco/README.md
@@ -1,8 +1,9 @@
 # Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html
 
-The popularity list contains most queried domains based on passive DNS usage across the Umbrella global network.
+The popularity list contains the most queried domains (ranging from TLDs to FQDNs)
+based on passive DNS usage across the Umbrella global network.
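+For example, the list may rank both `google.com` and `www.google.com`.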
-IYP uses this data to create and annotate DomainName nodes.
+IYP uses this data to create and annotate DomainName and HostName nodes.
 
 ## Graph representation
 
 The rank of the domain is indicated by the `rank` property of the relationship.
 
 ```Cypher
 (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
+(:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
 ```
 
 ## Dependence
 
-This crawler is not depending on other crawlers.
+This crawler depends on `openintel.umbrella1m`.
diff --git a/iyp/crawlers/cisco/umbrella_top1M.py b/iyp/crawlers/cisco/umbrella_top1M.py
index 629f15b..714681b 100644
--- a/iyp/crawlers/cisco/umbrella_top1M.py
+++ b/iyp/crawlers/cisco/umbrella_top1M.py
@@ -6,6 +6,7 @@
 from zipfile import ZipFile
 
 import requests
+import tldextract
 
 from iyp import BaseCrawler, RequestStatusError
 
@@ -22,31 +23,75 @@ def run(self):
 
         self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'})
 
-        sys.stderr.write('Downloading latest list...\n')
+        logging.info('Downloading latest list...')
         req = requests.get(URL)
         if req.status_code != 200:
             raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')
 
         links = []
-        domains = set()
         # open zip file and read top list
         with ZipFile(io.BytesIO(req.content)) as z:
-            with z.open('top-1m.csv') as list:
-                for i, row in enumerate(io.TextIOWrapper(list)):
+            with z.open('top-1m.csv') as top_list:
+                for i, row in enumerate(io.TextIOWrapper(top_list)):
                     row = row.rstrip()
                     rank, domain = row.split(',')
-                    domains.add(domain)
                     links.append({'src_name': domain, 'dst_id': self.cisco_qid,
                                   'props': [self.reference, {'rank': int(rank)}]})
 
-        name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains)
+        logging.info('Fetching DomainName/HostName nodes...')
+        domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name')
+        host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name')
 
+        # Umbrella mixes up domain and host names.
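+        # E.g., the list contains both 'google.com' (a registered domain) and
+        # 'www.google.com' (a host name).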
+        # By order of preference we rank:
+        # 1) existing domain name
+        # 2) existing host name
+        # 3) do our best to figure out if it is a domain or host and create the
+        #    corresponding node
+
+        new_domain_names = set()
+        new_host_names = set()
+        unprocessed_links = list()
+        processed_links = list()
+
+        logging.info('Building relationships...')
         for link in links:
-            link['src_id'] = name_id[link['src_name']]
+            if link['src_name'] in domain_id:
+                link['src_id'] = domain_id[link['src_name']]
+                processed_links.append(link)
+            elif link['src_name'] in host_id:
+                link['src_id'] = host_id[link['src_name']]
+                processed_links.append(link)
+            else:
+                unprocessed_links.append(link)
+                ranked_thing = tldextract.extract(link['src_name'])
+                name = link['src_name']
+                if name == ranked_thing.registered_domain:
+                    new_domain_names.add(name)
+                else:
+                    new_host_names.add(name)
+
+        if new_domain_names:
+            logging.info(f'Pushing {len(new_domain_names)} additional DomainName nodes...')
+            domain_id.update(self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', new_domain_names, all=False))
+        if new_host_names:
+            logging.info(f'Pushing {len(new_host_names)} additional HostName nodes...')
+            host_id.update(self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', new_host_names, all=False))
+
+        for link in unprocessed_links:
+            if link['src_name'] in domain_id:
+                link['src_id'] = domain_id[link['src_name']]
+            elif link['src_name'] in host_id:
+                link['src_id'] = host_id[link['src_name']]
+            else:
+                logging.error(f'Missing DomainName/HostName node for name "{link["src_name"]}". Should not happen.')
+                continue
+            processed_links.append(link)
 
         # Push all links to IYP
-        self.iyp.batch_add_links('RANK', links)
+        logging.info(f'Pushing {len(processed_links)} RANK relationships...')
+        self.iyp.batch_add_links('RANK', processed_links)
 
 
 def main() -> None:
diff --git a/iyp/crawlers/cloudflare/README.md b/iyp/crawlers/cloudflare/README.md
index 4ba886f..7ce4aee 100644
--- a/iyp/crawlers/cloudflare/README.md
+++ b/iyp/crawlers/cloudflare/README.md
@@ -1,4 +1,4 @@
-# Cloudflare Radar -- https://radar.cloudflare.com/ 
+# Cloudflare Radar -- https://radar.cloudflare.com/
 
 Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to
 provide various datasets, including:
@@ -17,8 +17,12 @@ provide various datasets, including:
 - [Top 100 ASes querying each of the 10,000 highest ranked domain
   names](https://developers.cloudflare.com/api/operations/radar_get__top_ases): Same as above, but
   fetch AS numbers instead.
-  
+
 All rankings are based on one week of data.
+Cloudflare Radar's top locations and ASes are available for both domain names
+and host names. The results likely account for all NS, A, and AAAA queries made to
+Cloudflare's resolver. Since NS queries for host names make no sense, IYP links these
+results to `DomainName` nodes.
 
 ## Graph representation
 
diff --git a/iyp/crawlers/cloudflare/dns_top_ases.py b/iyp/crawlers/cloudflare/dns_top_ases.py
index f24c952..8c15ac0 100644
--- a/iyp/crawlers/cloudflare/dns_top_ases.py
+++ b/iyp/crawlers/cloudflare/dns_top_ases.py
@@ -1,3 +1,8 @@
+# Cloudflare Radar's top locations and ASes are available for both domain names
+# and host names. The results likely account for all NS, A, and AAAA queries made to
+# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
+# more intuitive to link these results to DomainName nodes.
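+# I.e., even when the queried name is a host name, the results are attached to a
+# DomainName node with that name.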
+
 import argparse
 import logging
 import os
diff --git a/iyp/crawlers/cloudflare/dns_top_locations.py b/iyp/crawlers/cloudflare/dns_top_locations.py
index 4f4118f..46e46a0 100644
--- a/iyp/crawlers/cloudflare/dns_top_locations.py
+++ b/iyp/crawlers/cloudflare/dns_top_locations.py
@@ -1,3 +1,8 @@
+# Cloudflare Radar's top locations and ASes are available for both domain names
+# and host names. The results likely account for all NS, A, and AAAA queries made to
+# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
+# more intuitive to link these results to DomainName nodes.
+
 import argparse
 import glob
 import json
diff --git a/iyp/crawlers/cloudflare/ranking_bucket.py b/iyp/crawlers/cloudflare/ranking_bucket.py
index 7193ad6..fbea3f0 100644
--- a/iyp/crawlers/cloudflare/ranking_bucket.py
+++ b/iyp/crawlers/cloudflare/ranking_bucket.py
@@ -24,6 +24,9 @@ class Crawler(BaseCrawler):
 
     # Base Crawler provides access to IYP via self.iyp and setup a dictionary with the
     # org/url/today's date in self.reference
+    #
+    # Cloudflare ranks second and third level domain names (not host names).
+    # See https://blog.cloudflare.com/radar-domain-rankings/
 
     def run(self):
         """Fetch data and push to IYP."""
diff --git a/iyp/crawlers/cloudflare/top100.py b/iyp/crawlers/cloudflare/top100.py
index 5e2f5fb..d189da8 100644
--- a/iyp/crawlers/cloudflare/top100.py
+++ b/iyp/crawlers/cloudflare/top100.py
@@ -21,6 +21,9 @@ class Crawler(BaseCrawler):
 
     # Base Crawler provides access to IYP via self.iyp
     # and setup a dictionary with the org/url/today's date in self.reference
+    #
+    # Cloudflare ranks second and third level domain names (not host names).
+    # See https://blog.cloudflare.com/radar-domain-rankings/
 
     def run(self):
         """Fetch data and push to IYP."""
diff --git a/iyp/crawlers/openintel/README.md b/iyp/crawlers/openintel/README.md
index 70cf9c5..0f3ec5f 100644
--- a/iyp/crawlers/openintel/README.md
+++ b/iyp/crawlers/openintel/README.md
@@ -4,26 +4,34 @@
 The OpenINTEL measurement platform captures daily snapshots of the state of large parts of the
 global Domain Name System (DNS) by running a number of forward and reverse DNS measurements.
 
 While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for
-the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella 
+the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella
 top 1 million list since it combines rankings.
 
 IYP also gets the list of authoritative name servers seen by OpenINTEL.
 
-IYP uses only `A` queries to add IP resolution for DomainName and AuthoritativeNameServer nodes.
-
 A crawler of mail servers is also implemented but not used as it creates a very large number of
 links and this dataset is currently not requested/needed by anyone.
 
 ## Graph representation
 
-IP resolution for popular domain names:
+IP resolution for popular host names:
+
 ```Cypher
-(:DomainName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
+(:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
 ```
 
 IP resolution of authoritative name servers:
+
+```Cypher
+(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
+```
+
+Domain names managed by name servers:
+
 ```Cypher
-(:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
+(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})
 ```
 
+
 ## Dependence
 
 This crawler is not depending on other crawlers.
diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py
index 75b3c05..d255205 100644
--- a/iyp/crawlers/openintel/__init__.py
+++ b/iyp/crawlers/openintel/__init__.py
@@ -7,6 +7,7 @@
 import logging
 import os
 import tempfile
+from ipaddress import IPv6Address
 
 import arrow
 import boto3
@@ -42,17 +43,11 @@ def valid_date(s):
 
 class OpenIntelCrawler(BaseCrawler):
 
-    def __init__(self, organization, url, name, dataset, additional_domain_type=str()):
+    def __init__(self, organization, url, name, dataset):
         """Initialization of the OpenIntel crawler requires the name of the dataset
-        (e.g. tranco or infra:ns).
-
-        If the dataset contains special types of domain
-        names, an additional label can be specified (e.g., `AuthoritativeNameServer`)
-        that will be attached to the `DomainName` nodes.
-        """
+        (e.g. tranco or infra:ns)."""
 
         self.dataset = dataset
-        self.additional_domain_type = additional_domain_type
         super().__init__(organization, url, name)
 
     def get_parquet(self):
@@ -179,52 +174,74 @@ def run(self):
 
         print(f'Read {len(df)} unique records from {len(self.pandas_df_list)} Parquet file(s).')
 
-        # Only domain names from the `query_name` column that will receive the
-        # additional_domain_type label (if present).
-        query_domain_names = set(df['query_name'])
-        # Name server domain names.
-        ns_domain_names = set(df[df.ns_address.notnull()]['ns_address'])
-        # All domain names, including the ones from the name server column.
-        all_domain_names = query_domain_names.union(ns_domain_names)
-        # Create all DomainName nodes.
-        domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', all_domain_names)
-        # Get node IDs for NS nodes and add NS label.
-        ns_id = {name: domain_id[name] for name in ns_domain_names}
+        # query_names for NS records are domain names
+        domain_names = set(df[df.response_type == 'NS']['query_name'])
+
+        # response values of NS records are name servers
+        name_servers = set(df[df.ns_address.notnull()]['ns_address'])
+
+        # query_names for A and AAAA records are host names
+        host_names = set(df[(df.response_type == 'A') | (df.response_type == 'AAAA')]['query_name'])
+
+        ipv6_addresses = set()
+        # Normalize IPv6 addresses.
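+        # E.g., '2001:0db8:0000:0000:0000:0000:0000:0001' becomes '2001:db8::1'.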
+        for ip in df[df.ip6_address.notnull()]['ip6_address']:
+            try:
+                ip_normalized = IPv6Address(ip).compressed
+            except ValueError as e:
+                logging.error(f'Ignoring invalid IPv6 address "{ip}": {e}')
+                continue
+            ipv6_addresses.add(ip_normalized)
+
+        # Get/create all nodes:
+        domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domain_names)
+        host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', host_names)
+        ns_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', name_servers)
         self.iyp.batch_add_node_label(list(ns_id.values()), 'AuthoritativeNameServer')
-        # Add additional node label if present.
-        additional_id = set()
-        if self.additional_domain_type and self.additional_domain_type != 'DomainName':
-            additional_id = {domain_id[name] for name in query_domain_names}
-            self.iyp.batch_add_node_label(list(additional_id), self.additional_domain_type)
         ip4_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip4_address.notnull()]['ip4_address']))
-        ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip6_address.notnull()]['ip6_address']))
+        ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ipv6_addresses)
+
+        print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(host_id)} hosts, {len(ip4_id)} IPv4, '
+              f'{len(ip6_id)} IPv6')
+
+        # Compute links
         res_links = []
         mng_links = []
+        partof_links = []
 
-        print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(ip4_id)} IPv4, {len(ip6_id)} IPv6')
-        if self.additional_domain_type:
-            print(f'Added "{self.additional_domain_type}" label to {len(additional_id)} nodes.')
-
+        # RESOLVES_TO and MANAGED_BY links
         for row in df.itertuples():
-            domain_qid = domain_id[row.query_name]
 
+            # NS Record
+            if row.response_type == 'NS' and row.ns_address:
+                domain_qid = domain_id[row.query_name]
+                ns_qid = ns_id[row.ns_address]
+                mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]})
 
             # A Record
-            if row.response_type == 'A' and row.ip4_address:
+            elif row.response_type == 'A' and row.ip4_address:
+                host_qid = host_id[row.query_name]
                 ip_qid = ip4_id[row.ip4_address]
-                res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]})
+                res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]})
 
             # AAAA Record
             elif row.response_type == 'AAAA' and row.ip6_address:
-                ip_qid = ip6_id[row.ip6_address]
-                res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]})
-
-            # NS Record
-            elif row.response_type == 'NS' and row.ns_address:
-                ns_qid = ns_id[row.ns_address]
-                mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]})
+                try:
+                    ip_normalized = IPv6Address(row.ip6_address).compressed
+                except ValueError:
+                    # Error message was already logged above.
+                    continue
+                host_qid = host_id[row.query_name]
+                ip_qid = ip6_id[ip_normalized]
+                res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]})
+
+        # PART_OF links between HostNames and DomainNames
+        for hd in host_names.intersection(domain_names):
+            partof_links.append({'src_id': host_id[hd], 'dst_id': domain_id[hd], 'props': [self.reference]})
 
         print(f'Computed {len(res_links)} RESOLVES_TO links and {len(mng_links)} MANAGED_BY links')
 
         # Push all links to IYP
         self.iyp.batch_add_links('RESOLVES_TO', res_links)
         self.iyp.batch_add_links('MANAGED_BY', mng_links)
+        self.iyp.batch_add_links('PART_OF', partof_links)
diff --git a/iyp/crawlers/openintel/infra_mx.py b/iyp/crawlers/openintel/infra_mx.py
index 24266d0..3841c77 100644
--- a/iyp/crawlers/openintel/infra_mx.py
+++ b/iyp/crawlers/openintel/infra_mx.py
@@ -19,6 +19,12 @@ def __init__(self, organization, url, name):
 
 
 def main() -> None:
+
+    ############################################
+    # This crawler is not working; the NODE_TYPE argument has been deprecated.
+    ############################################
+    return
+
     parser = argparse.ArgumentParser()
     parser.add_argument('--unit-test', action='store_true')
     args = parser.parse_args()
diff --git a/iyp/crawlers/openintel/infra_ns.py b/iyp/crawlers/openintel/infra_ns.py
index 776f1aa..a4395c6 100644
--- a/iyp/crawlers/openintel/infra_ns.py
+++ b/iyp/crawlers/openintel/infra_ns.py
@@ -10,12 +10,11 @@
 
 NAME = 'openintel.infra_ns'
 DATASET = 'infra:ns'
-NODE_TYPE = 'AuthoritativeNameServer'
 
 
 class Crawler(OpenIntelCrawler):
 
     def __init__(self, organization, url, name):
-        super().__init__(organization, url, name, DATASET, NODE_TYPE)
+        super().__init__(organization, url, name, DATASET)
 
 
 def main() -> None:
diff --git a/iyp/crawlers/ripe/README.md b/iyp/crawlers/ripe/README.md
index f56196b..ce54b07 100644
--- a/iyp/crawlers/ripe/README.md
+++ b/iyp/crawlers/ripe/README.md
@@ -52,7 +52,7 @@ ASN(s), and country.
 We fetch the [list of
 measurements](https://atlas.ripe.net/docs/apis/rest-api-manual/measurements/) to obtain metadata of
 *ongoing* Atlas measurements. `AtlasProbe`s are `PART_OF`
-`AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `DomainName`, or
+`AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `HostName`, or
 both. The Atlas platform also maps the measurement target to an `AS` number if possible. The
 crawler includes this relationship as well.
 
 never connected or are abandoned.
 
 ```Cypher
 (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:AS {asn: 2497})
-(:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:DomainName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'})
+(:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:HostName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'})
 (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:IP {ip: '202.214.87.158'})
 ```
 
diff --git a/iyp/crawlers/ripe/atlas_measurements.py b/iyp/crawlers/ripe/atlas_measurements.py
index d0f9aac..79770f0 100644
--- a/iyp/crawlers/ripe/atlas_measurements.py
+++ b/iyp/crawlers/ripe/atlas_measurements.py
@@ -73,7 +73,7 @@ def __transform_data(data):
             # on the '_' delimiter, this action would potentially
             # cause a TypeError from flatdict if it isn't handled properly.
             target_info = {
-                'domain': item.pop('target', None),
+                'hostname': item.pop('target', None),
                 'asn': item.pop('target_asn', None),
                 'ip': item.pop('target_ip', None),
                 'prefix': item.pop('target_prefix', None),
@@ -148,7 +148,7 @@ def run(self):
         probe_ids = set()
         ips = set()
         ases = set()
-        domains = set()
+        hostnames = set()
 
         valid_probe_measurements = list()
 
@@ -171,16 +171,16 @@ def run(self):
                     probe_af = int(probe_measurement['af'])
                     resolved_ips[i] = ipaddress.ip_address(resolved_ips[i]).compressed if probe_af == 6 else resolved_ips[i]
 
-            domain = probe_measurement['target']['domain']
-            if domain == '' or self.__is_valid_ip(domain):
-                domain = None
-                probe_measurement['target']['domain'] = None
+            hostname = probe_measurement['target']['hostname']
+            if hostname == '' or self.__is_valid_ip(hostname):
+                hostname = None
+                probe_measurement['target']['hostname'] = None
 
             asn = probe_measurement['target']['asn']
             probe_ids_participated = probe_measurement['current_probes']
 
             self.__add_if_not_none(probe_measurement_id, probe_measurement_ids)
-            self.__add_if_not_none(domain, domains)
+            self.__add_if_not_none(hostname, hostnames)
             self.__add_if_not_none(asn, ases)
             self.__add_if_not_none_values(resolved_ips, ips)
             self.__add_if_not_none_values(probe_ids_participated, probe_ids)
@@ -204,8 +204,8 @@ def run(self):
         probe_ids = self.iyp.batch_get_nodes_by_single_prop('AtlasProbe', 'id', probe_ids, all=False, create=True)
         logging.info(f'{len(ips)} IPs')
         ip_ids = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ips, all=False, create=True)
-        logging.info(f'{len(domains)} domains')
-        domain_ids = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains, all=False, create=True)
+        logging.info(f'{len(hostnames)} hostnames')
+        hostname_ids = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', hostnames, all=False, create=True)
         logging.info(f'{len(ases)} ASNs')
         asn_ids = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', ases, all=False, create=True)
 
@@ -226,10 +226,10 @@ def run(self):
                 target_links.append({'src_id': probe_measurement_qid, 'dst_id': asn_qid,
                                      'props': [probe_measurement_reference]})
 
-            probe_measurement_domain = probe_measurement['target']['domain']
-            if probe_measurement_domain:
-                domain_qid = domain_ids[probe_measurement_domain]
-                target_links.append({'src_id': probe_measurement_qid, 'dst_id': domain_qid,
+            probe_measurement_hostname = probe_measurement['target']['hostname']
+            if probe_measurement_hostname:
+                hostname_qid = hostname_ids[probe_measurement_hostname]
+                target_links.append({'src_id': probe_measurement_qid, 'dst_id': hostname_qid,
                                      'props': [probe_measurement_reference]})
 
             probe_measurement_ips = self.__get_all_resolved_ips(probe_measurement)
diff --git a/iyp/post/url2domain.py b/iyp/post/url2hostname.py
similarity index 68%
rename from iyp/post/url2domain.py
rename to iyp/post/url2hostname.py
index 6a89f94..6429a3f 100644
--- a/iyp/post/url2domain.py
+++ b/iyp/post/url2hostname.py
@@ -8,27 +8,27 @@ class PostProcess(BasePostProcess):
     def run(self):
-        """Link URLs and their corresponding DomainNames."""
+        """Link URLs and their corresponding HostNames."""
 
         # Get all URL nodes.
         url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url')
 
-        # Get all DomainName Nodes
-        domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name')
+        # Get all HostName nodes
+        hostname_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name')
 
         # Compute links
         links = []
         for url, url_qid in url_id.items():
-            # Extract domain name from URL
-            domain = tldextract.extract(url).registered_domain
+            # Extract host name from URL
+            hostname = tldextract.extract(url).fqdn
 
-            # Get DomainName node for the domain
-            domain_qid = domain_id.get(domain)
+            # Get HostName node for the fqdn of the URL
+            hostname_qid = hostname_id.get(hostname)
 
-            if domain_qid is not None:
+            if hostname_qid is not None:
                 links.append({
                     'src_id': url_qid,
-                    'dst_id': domain_qid,
+                    'dst_id': hostname_qid,
                     'props': [self.reference]
                 })