DNS remodeling (#119)
* update url2domain to url2hostname

* remove iana root zone file and dns hierarchy from config file

* Atlas measurement targets are now hostnames

* update openintel crawlers to the new DNS model

* umbrella now ranks a mix of DomainName and HostName nodes and should be run after openintel.umbrella1m

* Add explanation for cloudflare DNS modeling

* lower umbrella crawler in config file

* update READMEs with the new DNS modeling (see the sketch after this list)

* add (:Service {name:'DNS'}) node and link it to authoritative name servers

* Nodes do not have reference properties

* Normalize IPv6 addresses

* Fix wrong crawler name

* Typos and formatting

* Remove infra_mx crawler since it does not do anything at the moment

* Update Cisco Umbrella crawler

- Batch create new nodes (happens more often than expected)
- Add logging output
- Do not use builtins as variable names

* Remove redundant set and parameters

* Remove Service node for now

We could not decide on a name, so we will deal with this later.
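
Overall, the remodeled graph can be sketched as follows (node and relationship names are taken from the updated READMEs and crawlers below; the concrete names and property values are illustrative only):

```Cypher
// Host names resolve to IP addresses
(:HostName {name: 'www.google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
// Host names that are also registered domain names are linked to the DomainName node
(:HostName {name: 'google.com'})-[:PART_OF]->(:DomainName {name: 'google.com'})
// Domain names are managed by authoritative name servers (which are host names)
(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})
// Rankings such as Cisco Umbrella may point to either DomainName or HostName nodes
(:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
```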

---------

Co-authored-by: Malte Tashiro <[email protected]>
romain-fontugne and m-appel authored Feb 7, 2024
1 parent f464e59 commit 838c8be
Showing 15 changed files with 183 additions and 89 deletions.
7 changes: 2 additions & 5 deletions config.json.example
@@ -63,19 +63,17 @@
"iyp.crawlers.peeringdb.ix",
"iyp.crawlers.cloudflare.top100",
"iyp.crawlers.tranco.top1M",
"iyp.crawlers.cisco.umbrella_top1M",
"iyp.crawlers.openintel.tranco1m",
"iyp.crawlers.openintel.umbrella1m",
"iyp.crawlers.openintel.infra_ns",
"iyp.crawlers.openintel.infra_mx",
"iyp.crawlers.cisco.umbrella_top1M",
"iyp.crawlers.citizenlab.urldb",
"iyp.crawlers.inetintel.as_org",
"iyp.crawlers.pch.daily_routing_snapshots_v4",
"iyp.crawlers.pch.daily_routing_snapshots_v6",
"iyp.crawlers.emileaben.as_names",
"iyp.crawlers.ripe.atlas_probes",
"iyp.crawlers.ripe.atlas_measurements",
"iyp.crawlers.iana.root_zone",
"iyp.crawlers.alice_lg.amsix",
"iyp.crawlers.alice_lg.bcix",
"iyp.crawlers.alice_lg.decix",
@@ -91,8 +89,7 @@
"iyp.post.ip2prefix",
"iyp.post.address_family",
"iyp.post.country_information",
"iyp.post.dns_hierarchy",
"iyp.post.url2domain"
"iyp.post.url2hostname"
]
}
}
8 changes: 5 additions & 3 deletions iyp/crawlers/cisco/README.md
@@ -1,17 +1,19 @@
# Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html

The popularity list contains most queried domains based on passive DNS usage across the Umbrella global network.
The popularity list contains most queried domains (ranging from TLDs to FQDNs)
based on passive DNS usage across the Umbrella global network.

IYP uses this data to create and annotate DomainName nodes.
IYP uses this data to create and annotate DomainName and HostName nodes.

## Graph representation

The rank of the domain is indicated by the `rank` property of the relationship.

```Cypher
(:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
(:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'})
```

## Dependence

This crawler is not depending on other crawlers.
This crawler depends on `openintel.umbrella1m`.
61 changes: 53 additions & 8 deletions iyp/crawlers/cisco/umbrella_top1M.py
@@ -6,6 +6,7 @@
from zipfile import ZipFile

import requests
import tldextract

from iyp import BaseCrawler, RequestStatusError

@@ -22,31 +23,75 @@ def run(self):

self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'})

sys.stderr.write('Downloading latest list...\n')
logging.info('Downloading latest list...')
req = requests.get(URL)
if req.status_code != 200:
raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')

links = []
domains = set()
# open zip file and read top list
with ZipFile(io.BytesIO(req.content)) as z:
with z.open('top-1m.csv') as list:
for i, row in enumerate(io.TextIOWrapper(list)):
with z.open('top-1m.csv') as top_list:
for i, row in enumerate(io.TextIOWrapper(top_list)):
row = row.rstrip()
rank, domain = row.split(',')

domains.add(domain)
links.append({'src_name': domain, 'dst_id': self.cisco_qid,
'props': [self.reference, {'rank': int(rank)}]})

name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains)
logging.info('Fetching DomainName/HostName nodes...')
domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name')
host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name')

# Umbrella mixes up domain and host names.
# By order of preferences we rank:
# 1) existing domain name
# 2) existing host name
# 3) do our best to figure out if it is a domain or host and create the
# corresponding node

new_domain_names = set()
new_host_names = set()
unprocessed_links = list()
processed_links = list()

logging.info('Building relationships...')
for link in links:
link['src_id'] = name_id[link['src_name']]
if link['src_name'] in domain_id:
link['src_id'] = domain_id[link['src_name']]
processed_links.append(link)
elif link['src_name'] in host_id:
link['src_id'] = host_id[link['src_name']]
processed_links.append(link)
else:
unprocessed_links.append(link)
ranked_thing = tldextract.extract(link['src_name'])
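# tldextract's registered_domain gives the registrable domain (e.g. 'google.com' for 'www.google.com');
# names equal to their registered domain become DomainName nodes, everything else HostName nodes.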
name = link['src_name']
if name == ranked_thing.registered_domain:
new_domain_names.add(name)
else:
new_host_names.add(name)

if new_domain_names:
logging.info(f'Pushing {len(new_domain_names)} additional DomainName nodes...')
domain_id.update(self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', new_domain_names, all=False))
if new_host_names:
logging.info(f'Pushing {len(new_host_names)} additional HostName nodes...')
host_id.update(self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', new_host_names, all=False))

for link in unprocessed_links:
if link['src_name'] in domain_id:
link['src_id'] = domain_id[link['src_name']]
elif link['src_name'] in host_id:
link['src_id'] = host_id[link['src_name']]
else:
logging.error(f'Missing DomainName/HostName node for name "{link["src_name"]}". Should not happen.')
continue
processed_links.append(link)

# Push all links to IYP
self.iyp.batch_add_links('RANK', links)
logging.info(f'Pushing {len(processed_links)} RANK relationships...')
self.iyp.batch_add_links('RANK', processed_links)


def main() -> None:
8 changes: 6 additions & 2 deletions iyp/crawlers/cloudflare/README.md
@@ -1,4 +1,4 @@
# Cloudflare Radar -- https://radar.cloudflare.com/

Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to
provide various datasets, including:
@@ -17,8 +17,12 @@ provide various datasets, including:
- [Top 100 ASes querying each of the 10,000 highest ranked domain
names](https://developers.cloudflare.com/api/operations/radar_get__top_ases): Same as above, but
fetch AS numbers instead.

All rankings are based on one week of data.
Cloudflare Radar's top locations and ASes are available for both domain names
and host names. The results likely account for all NS, A, and AAAA queries made to
Cloudflare's resolver. Since NS queries for host names make no sense, IYP links these
results to `DomainName` nodes.
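
For illustration only (a sketch; the `QUERIED_FROM` relationship name and the property names are assumptions, see the Graph representation section of this README for the authoritative form), such a link could look like:

```Cypher
(:DomainName {name: 'google.com'})-[:QUERIED_FROM {value: 4.5}]->(:Country {country_code: 'US'})
```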

## Graph representation

5 changes: 5 additions & 0 deletions iyp/crawlers/cloudflare/dns_top_ases.py
@@ -1,3 +1,8 @@
# Cloudflare Radar's top locations and ASes are available for both domain names
# and host names. The results likely account for all NS, A, and AAAA queries made to
# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
# more intuitive to link these results to DomainName nodes.

import argparse
import logging
import os
5 changes: 5 additions & 0 deletions iyp/crawlers/cloudflare/dns_top_locations.py
@@ -1,3 +1,8 @@
# Cloudflare Radar's top locations and ASes are available for both domain names
# and host names. The results likely account for all NS, A, and AAAA queries made to
# Cloudflare's resolver. Since NS queries for host names make no sense, it seems
# more intuitive to link these results to DomainName nodes.

import argparse
import glob
import json
3 changes: 3 additions & 0 deletions iyp/crawlers/cloudflare/ranking_bucket.py
@@ -24,6 +24,9 @@
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp and setup a dictionary with the
# org/url/today's date in self.reference
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/

def run(self):
"""Fetch data and push to IYP."""
3 changes: 3 additions & 0 deletions iyp/crawlers/cloudflare/top100.py
@@ -21,6 +21,9 @@
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp
# and setup a dictionary with the org/url/today's date in self.reference
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/

def run(self):
"""Fetch data and push to IYP."""
20 changes: 14 additions & 6 deletions iyp/crawlers/openintel/README.md
@@ -4,26 +4,34 @@ The OpenINTEL measurement platform captures daily snapshots of the state of larg
global Domain Name System (DNS) by running a number of forward and reverse DNS measurements.

While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for
the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella
top 1 million list since it combines rankings.
IYP also gets the list of authoritative name servers seen by OpenINTEL.

IYP uses only `A` queries to add IP resolution for DomainName and AuthoritativeNameServer nodes.

A crawler of mail servers is also implemented but not used as it creates a very large number
of links and this dataset is currently not requested/needed by anyone.

## Graph representation

IP resolution for popular domain names:
IP resolution for popular host names:

```Cypher
(:DomainName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
(:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'})
```

IP resolution of authoritative name servers:

```Cypher
(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
(:IP {ip: '216.239.32.10'})-[:SERVE]->(:Service {name: 'DNS'})
```

Domain names managed by name servers:

```Cypher
(:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'})
(:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})
```
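
Host names that also appear as domain names are additionally connected by `PART_OF` links (created by the crawler in `__init__.py` below):

```Cypher
(:HostName {name: 'google.com'})-[:PART_OF]->(:DomainName {name: 'google.com'})
```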

## Dependence

This crawler does not depend on other crawlers.
95 changes: 56 additions & 39 deletions iyp/crawlers/openintel/__init__.py
@@ -7,6 +7,7 @@
import logging
import os
import tempfile
from ipaddress import IPv6Address

import arrow
import boto3
@@ -42,17 +43,11 @@ def valid_date(s):


class OpenIntelCrawler(BaseCrawler):
def __init__(self, organization, url, name, dataset, additional_domain_type=str()):
def __init__(self, organization, url, name, dataset):
"""Initialization of the OpenIntel crawler requires the name of the dataset
(e.g. tranco or infra:ns).
If the dataset contains special types of domain
names, an additional label can be specified (e.g., `AuthoritativeNameServer`)
that will be attached to the `DomainName` nodes.
"""
(e.g. tranco or infra:ns)."""

self.dataset = dataset
self.additional_domain_type = additional_domain_type
super().__init__(organization, url, name)

def get_parquet(self):
@@ -179,52 +174,74 @@ def run(self):

print(f'Read {len(df)} unique records from {len(self.pandas_df_list)} Parquet file(s).')

# Only domain names from the `query_name` column that will receive the
# additional_domain_type label (if present).
query_domain_names = set(df['query_name'])
# Name server domain names.
ns_domain_names = set(df[df.ns_address.notnull()]['ns_address'])
# All domain names, including the ones from the name server column.
all_domain_names = query_domain_names.union(ns_domain_names)
# Create all DomainName nodes.
domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', all_domain_names)
# Get node IDs for NS nodes and add NS label.
ns_id = {name: domain_id[name] for name in ns_domain_names}
# query_names for NS records are domain names
domain_names = set(df[df.response_type == 'NS']['query_name'])

# response values of NS records are name servers
name_servers = set(df[df.ns_address.notnull()]['ns_address'])

# query_names for A and AAAA records are host names
host_names = set(df[(df.response_type == 'A') | (df.response_type == 'AAAA')]['query_name'])

ipv6_addresses = set()
# Normalize IPv6 addresses.
for ip in df[df.ip6_address.notnull()]['ip6_address']:
try:
ip_normalized = IPv6Address(ip).compressed
except ValueError as e:
logging.error(f'Ignoring invalid IPv6 address "{ip}": {e}')
continue
ipv6_addresses.add(ip_normalized)

# Get/create all nodes:
domain_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domain_names)
host_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', host_names)
ns_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', name_servers)
self.iyp.batch_add_node_label(list(ns_id.values()), 'AuthoritativeNameServer')
# Add additional node label if present.
additional_id = set()
if self.additional_domain_type and self.additional_domain_type != 'DomainName':
additional_id = {domain_id[name] for name in query_domain_names}
self.iyp.batch_add_node_label(list(additional_id), self.additional_domain_type)
ip4_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip4_address.notnull()]['ip4_address']))
ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', set(df[df.ip6_address.notnull()]['ip6_address']))
ip6_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', ipv6_addresses)

print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(host_id)} hosts, {len(ip4_id)} IPv4, '
f'{len(ip6_id)} IPv6')

# Compute links
res_links = []
mng_links = []
partof_links = []

print(f'Got {len(domain_id)} domains, {len(ns_id)} nameservers, {len(ip4_id)} IPv4, {len(ip6_id)} IPv6')
if self.additional_domain_type:
print(f'Added "{self.additional_domain_type}" label to {len(additional_id)} nodes.')

# RESOLVES_TO and MANAGED_BY links
for row in df.itertuples():
domain_qid = domain_id[row.query_name]

# NS Record
if row.response_type == 'NS' and row.ns_address:
domain_qid = domain_id[row.query_name]
ns_qid = ns_id[row.ns_address]
mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]})

# A Record
if row.response_type == 'A' and row.ip4_address:
elif row.response_type == 'A' and row.ip4_address:
host_qid = host_id[row.query_name]
ip_qid = ip4_id[row.ip4_address]
res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]})
res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]})

# AAAA Record
elif row.response_type == 'AAAA' and row.ip6_address:
ip_qid = ip6_id[row.ip6_address]
res_links.append({'src_id': domain_qid, 'dst_id': ip_qid, 'props': [self.reference]})

# NS Record
elif row.response_type == 'NS' and row.ns_address:
ns_qid = ns_id[row.ns_address]
mng_links.append({'src_id': domain_qid, 'dst_id': ns_qid, 'props': [self.reference]})
try:
ip_normalized = IPv6Address(row.ip6_address).compressed
except ValueError:
# Error message was already logged above.
continue
host_qid = host_id[row.query_name]
ip_qid = ip6_id[ip_normalized]
res_links.append({'src_id': host_qid, 'dst_id': ip_qid, 'props': [self.reference]})

# PART_OF links between HostNames and DomainNames
for hd in host_names.intersection(domain_names):
partof_links.append({'src_id': host_id[hd], 'dst_id': domain_id[hd], 'props': [self.reference]})

print(f'Computed {len(res_links)} RESOLVES_TO links and {len(mng_links)} MANAGED_BY links')

# Push all links to IYP
self.iyp.batch_add_links('RESOLVES_TO', res_links)
self.iyp.batch_add_links('MANAGED_BY', mng_links)
self.iyp.batch_add_links('PART_OF', partof_links)
6 changes: 6 additions & 0 deletions iyp/crawlers/openintel/infra_mx.py
@@ -19,6 +19,12 @@ def __init__(self, organization, url, name):


def main() -> None:

############################################
# This crawler is not working; the NODE_TYPE argument has been deprecated
############################################
return

parser = argparse.ArgumentParser()
parser.add_argument('--unit-test', action='store_true')
args = parser.parse_args()