diff --git a/iyp/__init__.py b/iyp/__init__.py
index 2194c73..f9fccef 100644
--- a/iyp/__init__.py
+++ b/iyp/__init__.py
@@ -8,6 +8,7 @@
from typing import Optional
import requests
+from github import Github
from neo4j import GraphDatabase
BATCH_SIZE = 50000
@@ -79,6 +80,34 @@ def dict2str(d, eq=':', pfx=''):
return '{' + ','.join(data) + '}'
+def get_commit_datetime(repo, file_path):
+ """Get the datetime of the latest commit modifying a file in a GitHub repository.
+
+ repo: The name of the repository in org/repo format, e.g.,
+ "InternetHealthReport/internet-yellow-pages"
+ file_path: The path to the file relative to the repository root, e.g.,
+ "iyp/__init__.py"
+ """
+ return Github().get_repo(repo).get_commits(path=file_path)[0].commit.committer.date
+
+
+def set_modification_time_from_last_modified_header(reference, response):
+ """Set the reference_time_modification field of the specified reference dict to the
+ datetime parsed from the Last-Modified header of the specified response if
+ possible."""
+ try:
+ last_modified_str = response.headers['Last-Modified']
+ # All HTTP dates are in UTC:
+ # https://www.rfc-editor.org/rfc/rfc2616#section-3.3.1
+ last_modified = datetime.strptime(last_modified_str,
+ '%a, %d %b %Y %H:%M:%S %Z').replace(tzinfo=timezone.utc)
+ reference['reference_time_modification'] = last_modified
+ except KeyError:
+ logging.warning('No Last-Modified header; will not set modification time.')
+ except ValueError as e:
+ logging.error(f'Failed to parse Last-Modified header "{last_modified_str}": {e}')
+
+
class RequestStatusError(requests.HTTPError):
def __init__(self, message):
self.message = message
@@ -109,6 +138,12 @@ def __init__(self, message):
super().__init__(self.message)
+class DataNotAvailableError(Exception):
+ def __init__(self, message):
+ self.message = message
+ super().__init__(self.message)
+
+
class IYP(object):
def __init__(self):
@@ -548,9 +583,9 @@ def add_links(self, src_node, links):
for i, (type, dst_node, prop) in enumerate(links):
assert 'reference_org' in prop
- assert 'reference_url' in prop
+ assert 'reference_url_data' in prop
assert 'reference_name' in prop
- assert 'reference_time' in prop
+ assert 'reference_time_fetch' in prop
prop = format_properties(prop)
@@ -589,10 +624,12 @@ def __init__(self):
"""IYP and references initialization."""
self.reference = {
- 'reference_org': 'Internet Yellow Pages',
- 'reference_url': 'https://iyp.iijlab.net',
'reference_name': 'iyp',
- 'reference_time': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
+ 'reference_org': 'Internet Yellow Pages',
+ 'reference_url_data': 'https://iyp.iijlab.net',
+ 'reference_url_info': str(),
+ 'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc),
+ 'reference_time_modification': None
}
# connection to IYP database
@@ -617,8 +654,10 @@ def __init__(self, organization, url, name):
self.reference = {
'reference_name': name,
'reference_org': organization,
- 'reference_url': url,
- 'reference_time': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
+ 'reference_url_data': url,
+ 'reference_url_info': str(),
+ 'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc),
+ 'reference_time_modification': None
}
# connection to IYP database
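
A minimal usage sketch for the two helpers added above, assuming a public GitHub repository and an HTTP-served dataset (both URLs below are placeholders, not actual IYP data sources):

    import requests

    from iyp import (get_commit_datetime,
                     set_modification_time_from_last_modified_header)

    reference = {'reference_time_modification': None}

    # GitHub-hosted source: timestamp of the last commit that touched the file.
    reference['reference_time_modification'] = get_commit_datetime(
        'InternetHealthReport/internet-yellow-pages', 'iyp/__init__.py')

    # HTTP-served source: fall back to the Last-Modified header, if present.
    resp = requests.get('https://example.org/dataset.csv')
    set_modification_time_from_last_modified_header(reference, resp)
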
diff --git a/iyp/crawlers/alice_lg/__init__.py b/iyp/crawlers/alice_lg/__init__.py
index 1f88a24..d7bbdfe 100644
--- a/iyp/crawlers/alice_lg/__init__.py
+++ b/iyp/crawlers/alice_lg/__init__.py
@@ -1,7 +1,6 @@
import ipaddress
import logging
import os
-import sys
from collections import defaultdict
from concurrent.futures import as_completed
from datetime import datetime
@@ -84,6 +83,10 @@ def __init__(self,
# URLs to the API
url = url.rstrip('/')
+ if url.endswith('/api/v1'):
+ self.reference['reference_url_info'] = url[:-len('/api/v1')]
+ else:
+ logging.warning(f'Data URL does not end with "/api/v1", will not set info URL: {url}')
self.urls = {
'routeservers': f'{url}/routeservers',
'neighbors': url + '/routeservers/{rs}/neighbors',
@@ -97,6 +100,8 @@ def __init__(self,
# List of neighbor dicts. Each dict contains information about the route server,
# so we do not keep track of that separately.
self.neighbors = list()
+ # Dict mapping routeserver_id to the cache time of that server.
+ self.routeserver_cached_at = dict()
# Dict mapping (routeserver_id, neighbor_id) tuple to a list of route dicts.
self.routes = dict()
# If routes should be fetched or not.
@@ -123,8 +128,6 @@ def decode_json(resp: Response, *args, **kwargs) -> None:
try:
resp.data = resp.json()
except JSONDecodeError as e:
- print(f'Failed to retrieve data for {resp.url}', file=sys.stderr)
- print(f'Error while reading json data: {e}', file=sys.stderr)
logging.error(f'Error while reading json data: {e}')
logging.error(resp.status_code)
logging.error(resp.headers)
@@ -160,8 +163,6 @@ def fetch_urls(self, urls: list, additional_data=list()) -> Iterable:
except Exception as e:
logging.error(f'Failed to retrieve data for {future}')
logging.error(e)
- print(f'Failed to retrieve data for {future}', file=sys.stderr)
- print(e, file=sys.stderr)
return False, dict(), None
def fetch_url(self, url: str) -> Tuple[bool, dict]:
@@ -177,7 +178,6 @@ def __fetch_routeservers(self) -> None:
logging.info('Using cached route server information.')
self.routeservers = self.cache_handler.load_cached_object(routeserver_object_name)
else:
- print(f'Fetching route servers from {self.urls["routeservers"]}')
logging.info(f'Fetching route servers from {self.urls["routeservers"]}')
is_ok, routeservers_root = self.fetch_url(self.urls['routeservers'])
if not is_ok:
@@ -190,17 +190,49 @@ def __fetch_neighbors(self) -> None:
neighbor_object_name = 'neighbors'
if self.cache_handler.cached_object_exists(neighbor_object_name):
logging.info('Using cached neighbor information.')
- self.neighbors = self.cache_handler.load_cached_object(neighbor_object_name)
+ neighbor_object = self.cache_handler.load_cached_object(neighbor_object_name)
+ self.routeserver_cached_at = neighbor_object['routeserver_cached_at']
+ self.neighbors = neighbor_object['neighbors']
else:
- print(f'Fetching neighbor information from {len(self.routeservers)} route servers.')
logging.info(f'Fetching neighbor information from {len(self.routeservers)} route servers.')
neighbor_urls = [self.urls['neighbors'].format(rs=rs['id']) for rs in self.routeservers]
failed_routeservers = list()
- for is_ok, neighbor_list_root, routeserver_id in self.fetch_urls(neighbor_urls,
- additional_data=self.routeservers):
+ for is_ok, neighbor_list_root, routeserver in self.fetch_urls(neighbor_urls,
+ additional_data=self.routeservers):
+ routeserver_id = routeserver['id']
if not is_ok:
failed_routeservers.append(routeserver_id)
continue
+ try:
+ cached_at_str = neighbor_list_root['api']['cache_status']['cached_at']
+ except KeyError:
+ cached_at_str = str()
+ if cached_at_str:
+ cached_at = None
+ # Alice-LG uses nanosecond-granularity timestamps, which are not
+ # valid ISO format...
+ try:
+ pre, suf = cached_at_str.rsplit('.', maxsplit=1)
+ if suf.endswith('Z'):
+ # UTC
+ frac_seconds = suf[:-1]
+ tz_suffix = '+00:00'
+ elif '+' in suf:
+ # Hopefully a timezone identifier of form +HH:MM
+ frac_seconds, tz_suffix = suf.split('+')
+ tz_suffix = '+' + tz_suffix
+ else:
+ raise ValueError(f'Failed to get timezone from timestamp :{cached_at_str}')
+ if not frac_seconds.isdigit():
+ raise ValueError(f'Fractional seconds are not digits: {cached_at_str}')
+ # Reduce to six digits (ms).
+ frac_seconds = frac_seconds[:6]
+ cached_at_str = f'{pre}.{frac_seconds}{tz_suffix}'
+ cached_at = datetime.fromisoformat(cached_at_str)
+ except ValueError as e:
+ logging.warning(f'Failed to get cached_at timestamp for routeserver "{routeserver_id}": {e}')
+ if cached_at:
+ self.routeserver_cached_at[routeserver_id] = cached_at
# Spelling of neighbors/neighbours field is not consistent...
if 'neighbors' in neighbor_list_root:
neighbor_list = neighbor_list_root['neighbors']
@@ -208,10 +240,11 @@ def __fetch_neighbors(self) -> None:
neighbor_list = neighbor_list_root['neighbours']
else:
logging.error(f'Missing "neighbors"/"neighbours" field in reply: {neighbor_list_root}')
- print(f'Missing "neighbors"/"neighbours" field in reply: {neighbor_list_root}', file=sys.stderr)
continue
self.neighbors += neighbor_list
- self.cache_handler.save_cached_object(neighbor_object_name, self.neighbors)
+ neighbor_object = {'routeserver_cached_at': self.routeserver_cached_at,
+ 'neighbors': self.neighbors}
+ self.cache_handler.save_cached_object(neighbor_object_name, neighbor_object)
if failed_routeservers:
logging.warning(f'Failed to get neighbor information for {len(failed_routeservers)} routeservers: '
f'{failed_routeservers}')
@@ -343,7 +376,15 @@ def run(self) -> None:
if ('details:route_changes' in flattened_neighbor
and isinstance(flattened_neighbor['details:route_changes'], flatdict.FlatDict)):
flattened_neighbor.pop('details:route_changes')
- self.reference['reference_url'] = self.urls['neighbors'].format(rs=neighbor['routeserver_id'])
+ routeserver_id = neighbor['routeserver_id']
+ self.reference['reference_url_data'] = self.urls['neighbors'].format(rs=routeserver_id)
+ if routeserver_id in self.routeserver_cached_at:
+ self.reference['reference_time_modification'] = self.routeserver_cached_at[routeserver_id]
+ else:
+ logging.info(f'No modification time for routeserver: {routeserver_id}')
+ # Set to None to not reuse value of previous loop iteration.
+ self.reference['reference_time_modification'] = None
+
member_of_rels.append({'src_id': member_asn, # Translate to QID later.
'dst_id': n.data['ixp_qid'],
'props': [flattened_neighbor, self.reference.copy()]})
@@ -354,7 +395,8 @@ def run(self) -> None:
if self.fetch_routes:
logging.info('Iterating routes.')
for (routeserver_id, neighbor_id), routes in self.routes.items():
- self.reference['reference_url'] = self.urls['routes'].format(rs=routeserver_id, neighbor=neighbor_id)
+ self.reference['reference_url_data'] = self.urls['routes'].format(rs=routeserver_id,
+ neighbor=neighbor_id)
for route in routes:
prefix = ipaddress.ip_network(route['network']).compressed
origin_asn = route['bgp']['as_path'][-1]
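
The cached_at normalization is easier to follow outside the diff; a standalone sketch of the same idea (the sample timestamp is made up):

    from datetime import datetime

    def parse_cached_at(ts: str) -> datetime:
        """Trim nanosecond fractions to microseconds so fromisoformat() accepts them."""
        pre, suf = ts.rsplit('.', maxsplit=1)
        if suf.endswith('Z'):
            frac, tz = suf[:-1], '+00:00'
        elif '+' in suf:
            frac, tz = suf.split('+')
            tz = '+' + tz
        else:
            raise ValueError(f'Failed to get timezone from timestamp: {ts}')
        return datetime.fromisoformat(f'{pre}.{frac[:6]}{tz}')

    parse_cached_at('2024-02-14T07:08:09.123456789Z')
    # -> datetime(2024, 2, 14, 7, 8, 9, 123456, tzinfo=timezone.utc)
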
diff --git a/iyp/crawlers/bgpkit/__init__.py b/iyp/crawlers/bgpkit/__init__.py
index 1894490..e5d8e3b 100644
--- a/iyp/crawlers/bgpkit/__init__.py
+++ b/iyp/crawlers/bgpkit/__init__.py
@@ -3,15 +3,16 @@
import requests
-from iyp import BaseCrawler, RequestStatusError
+from iyp import (BaseCrawler, RequestStatusError,
+ set_modification_time_from_last_modified_header)
class AS2RelCrawler(BaseCrawler):
def __init__(self, organization, url, name, af):
"""Initialization: set the address family attribute (af)"""
-
- self.af = af
super().__init__(organization, url, name)
+ self.af = af
+ self.reference['reference_url_info'] = 'https://data.bgpkit.com/as2rel/README.txt'
def run(self):
"""Fetch the AS relationship file from BGPKIT website and process lines one by
@@ -19,7 +20,9 @@ def run(self):
req = requests.get(self.url, stream=True)
if req.status_code != 200:
- raise RequestStatusError('Error while fetching AS relationships')
+ raise RequestStatusError(f'Error while fetching AS relationships: {req.status_code}')
+
+ set_modification_time_from_last_modified_header(self.reference, req)
rels = []
asns = set()
diff --git a/iyp/crawlers/bgpkit/peerstats.py b/iyp/crawlers/bgpkit/peerstats.py
index 6f0b54f..58cf8cb 100644
--- a/iyp/crawlers/bgpkit/peerstats.py
+++ b/iyp/crawlers/bgpkit/peerstats.py
@@ -17,6 +17,9 @@
class Crawler(BaseCrawler):
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://data.bgpkit.com/peer-stats/README.md'
def run(self):
"""Fetch peer stats for each collector."""
@@ -49,6 +52,7 @@ def run(self):
prev_day -= timedelta(days=1)
logging.warning("Today's data not yet available!")
+ self.reference['reference_time_modification'] = self.now
for collector in collectors:
url = URL.format(collector=collector, year=self.now.year,
month=self.now.month, day=self.now.day,
@@ -65,7 +69,7 @@ def run(self):
'BGPCollector',
{'name': stats['collector'], 'project': stats['project']}
)
- self.reference['reference_url'] = url
+ self.reference['reference_url_data'] = url
asns = set()
diff --git a/iyp/crawlers/bgpkit/pfx2asn.py b/iyp/crawlers/bgpkit/pfx2asn.py
index 89f5a93..76d0b69 100644
--- a/iyp/crawlers/bgpkit/pfx2asn.py
+++ b/iyp/crawlers/bgpkit/pfx2asn.py
@@ -7,7 +7,8 @@
import requests
-from iyp import BaseCrawler, RequestStatusError
+from iyp import (BaseCrawler, RequestStatusError,
+ set_modification_time_from_last_modified_header)
URL = 'https://data.bgpkit.com/pfx2as/pfx2as-latest.json.bz2'
ORG = 'BGPKIT'
@@ -22,7 +23,9 @@ def run(self):
req = requests.get(URL, stream=True)
if req.status_code != 200:
- raise RequestStatusError('Error while fetching pfx2as relationships')
+ raise RequestStatusError(f'Error while fetching pfx2as relationships: {req.status_code}')
+
+ set_modification_time_from_last_modified_header(self.reference, req)
entries = []
asns = set()
@@ -35,7 +38,7 @@ def run(self):
req.close()
- logging.info('Pushing nodes to neo4j...\n')
+ logging.info('Pushing nodes to neo4j...')
# get ASNs and prefixes IDs
self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
self.prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes)
@@ -48,7 +51,7 @@ def run(self):
links.append({'src_id': asn_qid, 'dst_id': prefix_qid, 'props': [self.reference, entry]}) # Set AS name
- logging.info('Pushing links to neo4j...\n')
+ logging.info('Pushing links to neo4j...')
# Push all links to IYP
self.iyp.batch_add_links('ORIGINATE', links)
diff --git a/iyp/crawlers/bgptools/anycast_prefixes.py b/iyp/crawlers/bgptools/anycast_prefixes.py
index cbf13b2..e096262 100644
--- a/iyp/crawlers/bgptools/anycast_prefixes.py
+++ b/iyp/crawlers/bgptools/anycast_prefixes.py
@@ -6,7 +6,8 @@
import requests
-from iyp import BaseCrawler, ConnectionError, RequestStatusError
+from iyp import (BaseCrawler, ConnectionError, RequestStatusError,
+ get_commit_datetime)
# Organization name and URL to data
ORG = 'BGP.Tools'
@@ -38,6 +39,12 @@ def fetch_dataset(url: str):
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp
# and setup a dictionary with the org/url/today's date in self.reference
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.repo = 'bgptools/anycast-prefixes'
+ self.v4_file = 'anycatch-v4-prefixes.txt'
+ self.v6_file = 'anycatch-v6-prefixes.txt'
+ self.reference['reference_url_info'] = 'https://bgp.tools/kb/anycatch'
def run(self):
ipv4_prefixes_url = get_dataset_url(URL, 4)
@@ -51,13 +58,16 @@ def run(self):
ipv6_prefixes_filename = os.path.join(tmpdir, 'anycast_ipv6_prefixes.txt')
# Fetch data and push to IYP.
- self.reference['reference_url'] = ipv4_prefixes_url # Overriding the reference_url according to prefixes
+ # Overriding the reference_url_data according to prefixes
+ self.reference['reference_url_data'] = ipv4_prefixes_url
+ self.reference['reference_time_modification'] = get_commit_datetime(self.repo, self.v4_file)
ipv4_prefixes_response = fetch_dataset(ipv4_prefixes_url)
logging.info('IPv4 prefixes fetched successfully.')
self.update(ipv4_prefixes_response, ipv4_prefixes_filename)
logging.info('IPv4 prefixes pushed to IYP.')
- self.reference['reference_url'] = ipv6_prefixes_url
+ self.reference['reference_url_data'] = ipv6_prefixes_url
+ self.reference['reference_time_modification'] = get_commit_datetime(self.repo, self.v6_file)
ipv6_prefixes_response = fetch_dataset(ipv6_prefixes_url)
logging.info('IPv6 prefixes fetched successfully.')
self.update(ipv6_prefixes_response, ipv6_prefixes_filename)
diff --git a/iyp/crawlers/bgptools/as_names.py b/iyp/crawlers/bgptools/as_names.py
index e14adc3..53854a2 100644
--- a/iyp/crawlers/bgptools/as_names.py
+++ b/iyp/crawlers/bgptools/as_names.py
@@ -15,13 +15,13 @@
class Crawler(BaseCrawler):
def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://bgp.tools/kb/api'
self.headers = {
'user-agent': 'IIJ/Internet Health Report - admin@ihr.live'
}
- super().__init__(organization, url, name)
-
def run(self):
"""Fetch the AS name file from BGP.Tools website and push it to IYP."""
diff --git a/iyp/crawlers/bgptools/tags.py b/iyp/crawlers/bgptools/tags.py
index e4e5b26..3a20fa1 100644
--- a/iyp/crawlers/bgptools/tags.py
+++ b/iyp/crawlers/bgptools/tags.py
@@ -37,13 +37,13 @@
class Crawler(BaseCrawler):
def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://bgp.tools/kb/api'
self.headers = {
'user-agent': 'IIJ/Internet Health Report - admin@ihr.live'
}
- super().__init__(organization, url, name)
-
def run(self):
"""Fetch the AS name file from BGP.Tools website and process lines one by
one."""
@@ -53,9 +53,9 @@ def run(self):
# Reference information for data pushed to the wikibase
self.reference = {
'reference_org': ORG,
- 'reference_url': url,
+ 'reference_url_data': url,
'reference_name': NAME,
- 'reference_time': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
+ 'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
}
req = requests.get(url, headers=self.headers)
diff --git a/iyp/crawlers/caida/asrank.py b/iyp/crawlers/caida/asrank.py
index c565182..3cc8d9c 100644
--- a/iyp/crawlers/caida/asrank.py
+++ b/iyp/crawlers/caida/asrank.py
@@ -16,6 +16,9 @@
class Crawler(BaseCrawler):
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://asrank.caida.org/'
def run(self):
"""Fetch networks information from ASRank and push to IYP."""
diff --git a/iyp/crawlers/caida/ix_asns.py b/iyp/crawlers/caida/ix_asns.py
index a84ff02..6fbb86e 100644
--- a/iyp/crawlers/caida/ix_asns.py
+++ b/iyp/crawlers/caida/ix_asns.py
@@ -3,6 +3,7 @@
import logging
import os
import sys
+from datetime import datetime, timezone
import arrow
import flatdict
@@ -35,9 +36,22 @@ def __init__(self, organization, url, name):
else:
# for loop was not 'broken', no file available
raise Exception('No recent CAIDA ix-asns file available')
+ date = date.datetime.replace(day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc)
logging.info('going to use this URL: ' + url)
super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://publicdata.caida.org/datasets/ixps/README.txt'
+ self.reference['reference_time_modification'] = date
+
+ def __set_modification_time_from_metadata_line(self, line):
+ try:
+ date_str = json.loads(line.lstrip('#'))['date']
+ date = datetime.strptime(date_str, '%Y.%m.%d %H:%M:%S').replace(tzinfo=timezone.utc)
+ self.reference['reference_time_modification'] = date
+ except (json.JSONDecodeError, KeyError, ValueError) as e:
+ logging.warning(f'Failed to get modification date from metadata line: {line.strip()}')
+ logging.warning(e)
+ logging.warning('Using date from filename.')
def run(self):
"""Fetch the latest file and process lines one by one."""
@@ -52,6 +66,7 @@ def run(self):
# Find all possible values and create corresponding nodes
for line in req.text.splitlines():
if line.startswith('#'):
+ self.__set_modification_time_from_metadata_line(line)
continue
ix = json.loads(line)
diff --git a/iyp/crawlers/caida/ixs.py b/iyp/crawlers/caida/ixs.py
index fdc3f31..2301c09 100644
--- a/iyp/crawlers/caida/ixs.py
+++ b/iyp/crawlers/caida/ixs.py
@@ -4,6 +4,7 @@
import logging
import os
import sys
+from datetime import datetime, timezone
import arrow
import requests
@@ -36,9 +37,22 @@ def __init__(self, organization, url, name):
else:
# for loop was not 'broken', no file available
raise Exception('No recent CAIDA ix-asns file available')
+ date = date.datetime.replace(day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc)
logging.info('going to use this URL: ' + url)
super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://publicdata.caida.org/datasets/ixps/README.txt'
+ self.reference['reference_time_modification'] = date
+
+ def __set_modification_time_from_metadata_line(self, line):
+ try:
+ date_str = json.loads(line.lstrip('#'))['date']
+ date = datetime.strptime(date_str, '%Y.%m.%d %H:%M:%S').replace(tzinfo=timezone.utc)
+ self.reference['reference_time_modification'] = date
+ except (json.JSONDecodeError, KeyError, ValueError) as e:
+ logging.warning(f'Failed to get modification date from metadata line: {line.strip()}')
+ logging.warning(e)
+ logging.warning('Using date from filename.')
def run(self):
"""Fetch the latest file and process lines one by one."""
@@ -58,6 +72,7 @@ def run(self):
# Find all possible values and create corresponding nodes
for line in req.text.splitlines():
if line.startswith('#'):
+ self.__set_modification_time_from_metadata_line(line)
continue
ix = json.loads(line)
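
Both CAIDA crawlers parse the same kind of metadata comment line; a sketch of the expected input (the line content is an assumption derived from the parsing logic, not copied from a real file):

    import json
    from datetime import datetime, timezone

    line = '# {"type": "ix-asns", "date": "2024.02.01 04:05:06"}'  # hypothetical
    date_str = json.loads(line.lstrip('#'))['date']
    date = datetime.strptime(date_str, '%Y.%m.%d %H:%M:%S').replace(tzinfo=timezone.utc)
    # -> 2024-02-01 04:05:06+00:00
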
diff --git a/iyp/crawlers/cisco/umbrella_top1M.py b/iyp/crawlers/cisco/umbrella_top1M.py
index 714681b..5b607f5 100644
--- a/iyp/crawlers/cisco/umbrella_top1M.py
+++ b/iyp/crawlers/cisco/umbrella_top1M.py
@@ -3,6 +3,7 @@
import logging
import os
import sys
+from datetime import datetime, timedelta, timezone
from zipfile import ZipFile
import requests
@@ -17,6 +18,33 @@
class Crawler(BaseCrawler):
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://s3-us-west-1.amazonaws.com/umbrella-static/index.html'
+
+ def __set_modification_time(self):
+ """Set the modification time by looking for the last available historical file.
+ The current (non-historical) file is created on the next day.
+
+ For example, if a file for 2024-02-13 is available, it means the current file
+ was created on 2024-02-14.
+ """
+ hist_url = 'http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m-%Y-%m-%d.csv.zip'
+ date = datetime.now(tz=timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
+ for attempt in range(7):
+ r = requests.head(date.strftime(hist_url))
+ if r.ok:
+ break
+ date -= timedelta(days=1)
+ else:
+ logging.warning(f'Failed to find historical list within search interval (>{date}); '
+ 'Will not set modification time.')
+ return
+
+        # date now points to the last available historical file, which means the
+        # current file is the day after this date.
+ self.reference['reference_time_modification'] = date + timedelta(days=1)
+ logging.info(self.reference)
def run(self):
"""Fetch Umbrella top 1M and push to IYP."""
@@ -26,7 +54,9 @@ def run(self):
logging.info('Downloading latest list...')
req = requests.get(URL)
if req.status_code != 200:
- raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')
+ raise RequestStatusError(f'Error while fetching Cisco Umbrella Top 1M csv file: {req.status_code}')
+
+ self.__set_modification_time()
links = []
# open zip file and read top list
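
The probe loop in __set_modification_time relies on strftime() expanding the date placeholders embedded directly in the historical URL; a one-line illustration:

    from datetime import datetime, timezone

    hist_url = 'http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m-%Y-%m-%d.csv.zip'
    datetime(2024, 2, 13, tzinfo=timezone.utc).strftime(hist_url)
    # -> 'http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m-2024-02-13.csv.zip'
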
diff --git a/iyp/crawlers/citizenlab/urldb.py b/iyp/crawlers/citizenlab/urldb.py
index 109b3aa..ebd1837 100644
--- a/iyp/crawlers/citizenlab/urldb.py
+++ b/iyp/crawlers/citizenlab/urldb.py
@@ -23,6 +23,9 @@ def generate_url(suffix):
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp
# and set up a dictionary with the org/url/today's date in self.reference
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://github.com/citizenlab/test-lists'
def run(self):
# Fetch country codes to generate urls
diff --git a/iyp/crawlers/cloudflare/dns_top_locations.py b/iyp/crawlers/cloudflare/dns_top_locations.py
index 46e46a0..8f40586 100644
--- a/iyp/crawlers/cloudflare/dns_top_locations.py
+++ b/iyp/crawlers/cloudflare/dns_top_locations.py
@@ -9,6 +9,7 @@
import logging
import os
import sys
+from datetime import datetime, timezone
import flatdict
import requests
@@ -42,6 +43,12 @@ def __init__(self, organization, url, name):
# Initialize IYP connection
super().__init__(organization, url, name)
+ # Not super elegant.
+ if name == 'cloudflare.dns_top_ases':
+ self.reference['reference_url_info'] = 'https://developers.cloudflare.com/api/operations/radar-get-dns-top-ases' # noqa: E501
+ elif name == 'cloudflare.dns_top_locations':
+ self.reference['reference_url_info'] = 'https://developers.cloudflare.com/radar/investigate/dns/#top-locations' # noqa: E501
+
# Fetch domain names registered in IYP
existing_dn = self.iyp.tx.run(
f"""MATCH (dn:DomainName)-[r:RANK]-(:Ranking)
@@ -117,7 +124,17 @@ def run(self):
for i, file in enumerate(files):
with open(file, 'rb') as fp:
# Process line one after the other
- for domain_top in json.load(fp)['result'].items():
+ results = json.load(fp)['result']
+ if not self.reference['reference_time_modification']:
+ # Get the reference time from the first file.
+ try:
+ date_str = results['meta']['dateRange'][0]['endTime']
+ date = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
+ self.reference['reference_time_modification'] = date
+ except (KeyError, ValueError, TypeError) as e:
+ logging.warning(f'Failed to get modification time: {e}')
+
+ for domain_top in results.items():
self.compute_link(domain_top)
if i % 100 == 0:
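
The dateRange handling above (and the similar code in top100.py further down) assumes a Radar response shaped roughly as follows; the values are illustrative:

    from datetime import datetime, timezone

    results = {'meta': {'dateRange': [{'startTime': '2024-02-07T00:00:00Z',
                                       'endTime': '2024-02-14T00:00:00Z'}]}}
    date_str = results['meta']['dateRange'][0]['endTime']
    modification_time = datetime.strptime(
        date_str, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
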
diff --git a/iyp/crawlers/cloudflare/ranking_bucket.py b/iyp/crawlers/cloudflare/ranking_bucket.py
index fbea3f0..6d9a02e 100644
--- a/iyp/crawlers/cloudflare/ranking_bucket.py
+++ b/iyp/crawlers/cloudflare/ranking_bucket.py
@@ -3,6 +3,7 @@
import logging
import os
import sys
+from datetime import datetime, timezone
import requests
from requests.adapters import HTTPAdapter, Retry
@@ -12,7 +13,7 @@
# Organization name and URL to data
ORG = 'Cloudflare'
URL_DATASETS = 'https://api.cloudflare.com/client/v4/radar/datasets?limit=10&offset=0&datasetType=RANKING_BUCKET&format=json' # noqa: E501
-URL = ''
+URL = 'https://api.cloudflare.com/client/v4/radar/datasets'
URL_DL = 'https://api.cloudflare.com/client/v4/radar/datasets/download'
NAME = 'cloudflare.ranking_bucket'
@@ -27,6 +28,9 @@ class Crawler(BaseCrawler):
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/' # noqa: E501
def run(self):
"""Fetch data and push to IYP."""
@@ -61,6 +65,16 @@ def run(self):
datasets = list()
all_domains = set()
for dataset in datasets_json['result']['datasets']:
+ if not self.reference['reference_time_modification']:
+ # Get modification time from first dataset. Should be the same for all
+ # datasets.
+ try:
+ date_str = dataset['meta']['targetDateEnd']
+ date = datetime.strptime(date_str, '%Y-%m-%d').replace(tzinfo=timezone.utc)
+ self.reference['reference_time_modification'] = date
+ except (KeyError, ValueError, TypeError) as e:
+ logging.warning(f'Failed to get modification time: {e}')
+
# Get the dataset URL
req = req_session.post(URL_DL, json={'datasetId': dataset['id']})
if req.status_code != 200:
@@ -92,7 +106,6 @@ def run(self):
dataset_title = f'Cloudflare {dataset["title"]}'
logging.info(f'Processing dataset: {dataset_title}')
print(f'Processing dataset: {dataset_title}')
- self.reference['reference_url'] = dataset['url']
ranking_id = self.iyp.get_node('Ranking',
{
'name': dataset_title,
diff --git a/iyp/crawlers/cloudflare/top100.py b/iyp/crawlers/cloudflare/top100.py
index d189da8..2a6b63d 100644
--- a/iyp/crawlers/cloudflare/top100.py
+++ b/iyp/crawlers/cloudflare/top100.py
@@ -3,6 +3,7 @@
import logging
import os
import sys
+from datetime import datetime, timezone
import requests
@@ -24,6 +25,9 @@ class Crawler(BaseCrawler):
#
# Cloudflare ranks second and third level domain names (not host names).
# See https://blog.cloudflare.com/radar-domain-rankings/
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/' # noqa: E501
def run(self):
"""Fetch data and push to IYP."""
@@ -37,13 +41,22 @@ def run(self):
'Content-Type': 'application/json'
}
- req = requests.get(self.reference['reference_url'], headers=headers)
+ req = requests.get(self.reference['reference_url_data'], headers=headers)
if req.status_code != 200:
print(f'Cannot download data {req.status_code}: {req.text}')
- raise RequestStatusError('Error while fetching data file')
+ raise RequestStatusError(f'Error while fetching data file: {req.status_code}')
+
+ results = req.json()['result']
+
+ try:
+ date_str = results['meta']['dateRange'][0]['endTime']
+ date = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
+ self.reference['reference_time_modification'] = date
+ except (KeyError, ValueError, TypeError) as e:
+ logging.warning(f'Failed to get modification time: {e}')
# Process line one after the other
- for i, _ in enumerate(map(self.update, req.json()['result']['top'])):
+ for i, _ in enumerate(map(self.update, results['top'])):
sys.stderr.write(f'\rProcessed {i} lines')
sys.stderr.write('\n')
diff --git a/iyp/crawlers/emileaben/as_names.py b/iyp/crawlers/emileaben/as_names.py
index cebe71c..66d4b10 100644
--- a/iyp/crawlers/emileaben/as_names.py
+++ b/iyp/crawlers/emileaben/as_names.py
@@ -6,7 +6,7 @@
import requests
-from iyp import BaseCrawler, RequestStatusError
+from iyp import BaseCrawler, RequestStatusError, get_commit_datetime
# Organization name and URL to data
ORG = 'emileaben'
@@ -17,6 +17,10 @@
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp
# and setup a dictionary with the org/url/today's date in self.reference
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://github.com/emileaben/asnames'
+ self.reference['reference_time_modification'] = get_commit_datetime('emileaben/asnames', 'asnames.csv')
def run(self):
# Create a temporary directory
diff --git a/iyp/crawlers/example/crawler.py b/iyp/crawlers/example/crawler.py
index a329eec..63c94ed 100644
--- a/iyp/crawlers/example/crawler.py
+++ b/iyp/crawlers/example/crawler.py
@@ -21,7 +21,7 @@ def run(self):
"""Fetch data and push to IYP."""
# Fetch data
- req = requests.get(self.reference['reference_url'])
+ req = requests.get(self.reference['reference_url_data'])
if req.status_code != 200:
logging.error('Cannot download data {req.status_code}: {req.text}')
raise RequestStatusError('Error while fetching data file')
diff --git a/iyp/crawlers/ihr/__init__.py b/iyp/crawlers/ihr/__init__.py
index c8fa727..9ff8517 100644
--- a/iyp/crawlers/ihr/__init__.py
+++ b/iyp/crawlers/ihr/__init__.py
@@ -1,6 +1,6 @@
import csv
import os
-from datetime import datetime, time, timezone
+from datetime import timezone
import arrow
import lz4.frame
@@ -34,6 +34,7 @@ class HegemonyCrawler(BaseCrawler):
def __init__(self, organization, url, name, af):
self.af = af
super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://ihr.iijlab.net/ihr/en-us/documentation#AS_dependency'
def run(self):
"""Fetch data from file and push to IYP."""
@@ -50,12 +51,12 @@ def run(self):
url = self.url.format(year=today.year, month=today.month, day=today.day)
req = requests.head(url)
- self.reference = {
- 'reference_url': url,
- 'reference_org': self.organization,
- 'reference_name': self.name,
- 'reference_time': datetime.combine(today.date(), time.min, timezone.utc)
- }
+ self.reference['reference_url_data'] = url
+ self.reference['reference_time_modification'] = today.datetime.replace(hour=0,
+ minute=0,
+ second=0,
+ microsecond=0,
+ tzinfo=timezone.utc)
os.makedirs('tmp/', exist_ok=True)
os.system(f'wget {url} -P tmp/')
diff --git a/iyp/crawlers/ihr/country_dependency.py b/iyp/crawlers/ihr/country_dependency.py
index 3ff4125..1a8d4af 100644
--- a/iyp/crawlers/ihr/country_dependency.py
+++ b/iyp/crawlers/ihr/country_dependency.py
@@ -3,7 +3,7 @@
import logging
import os
import sys
-from datetime import datetime, time, timezone
+from datetime import datetime, timezone
import arrow
import iso3166
@@ -37,6 +37,7 @@ def __init__(self, organization, url, name):
self.http_session.mount('https://', HTTPAdapter(max_retries=retries))
super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://ihr.iijlab.net/ihr/en-us/documentation#Country_s_network_dependency' # noqa: E501
def run(self):
"""Fetch data from API and push to IYP."""
@@ -49,14 +50,8 @@ def run(self):
raise RequestStatusError('Error while fetching data for ' + cc)
data = json.loads(req.text)
ranking = data['results']
-
- # Setup references
- self.reference = {
- 'reference_org': ORG,
- 'reference_url': URL,
- 'reference_name': NAME,
- 'reference_time': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
- }
+ if not ranking:
+ continue
# Setup rankings' node
country_qid = self.iyp.get_node('Country',
@@ -65,15 +60,22 @@ def run(self):
}
)
- countryrank_statements = []
- if country_qid is not None:
- countryrank_statements = [('COUNTRY', country_qid, self.reference)]
-
# Find the latest timebin in the data
last_timebin = '1970-01-01'
for r in ranking:
if arrow.get(r['timebin']) > arrow.get(last_timebin):
last_timebin = r['timebin']
+ self.reference['reference_url_data'] = self.url + f'&timebin={last_timebin}'
+ self.reference['reference_time_modification'] = None
+ try:
+ date = datetime.strptime(last_timebin, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
+ self.reference['reference_time_modification'] = date
+ except ValueError as e:
+ logging.warning(f'Failed to get modification time: {e}')
+
+ countryrank_statements = []
+ if country_qid is not None:
+ countryrank_statements = [('COUNTRY', country_qid, self.reference.copy())]
# Make ranking and push data
links = []
@@ -106,7 +108,7 @@ def run(self):
links.append({
'src_id': self.asn_id[asn['asn']],
'dst_id': self.countryrank_qid,
- 'props': [self.reference, asn]
+ 'props': [self.reference.copy(), asn]
})
# Push links to IYP
diff --git a/iyp/crawlers/ihr/rov.py b/iyp/crawlers/ihr/rov.py
index 76c2857..4d5daf8 100644
--- a/iyp/crawlers/ihr/rov.py
+++ b/iyp/crawlers/ihr/rov.py
@@ -3,7 +3,7 @@
import logging
import os
import sys
-from datetime import datetime, time, timezone
+from datetime import timezone
import arrow
import lz4.frame
@@ -45,6 +45,9 @@ def close(self):
class Crawler(BaseCrawler):
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://ihr-archive.iijlab.net/ihr/rov/README.txt'
def run(self):
"""Fetch data from file and push to IYP."""
@@ -60,12 +63,12 @@ def run(self):
today = today.shift(days=-1)
url = URL.format(year=today.year, month=today.month, day=today.day)
- self.reference = {
- 'reference_org': ORG,
- 'reference_url': url,
- 'reference_name': NAME,
- 'reference_time': datetime.combine(today.date(), time.min, timezone.utc)
- }
+ self.reference['reference_url_data'] = url
+ self.reference['reference_time_modification'] = today.datetime.replace(hour=0,
+ minute=0,
+ second=0,
+ microsecond=0,
+ tzinfo=timezone.utc)
os.makedirs('tmp/', exist_ok=True)
os.system(f'wget {url} -P tmp/')
@@ -73,7 +76,7 @@ def run(self):
local_filename = 'tmp/' + url.rpartition('/')[2]
self.csv = lz4Csv(local_filename)
- logging.warning('Getting node IDs from neo4j...\n')
+ logging.info('Getting node IDs from neo4j...')
asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')
prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix')
tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label')
@@ -84,7 +87,7 @@ def run(self):
dep_links = []
country_links = []
- logging.warning('Computing links...\n')
+ logging.info('Computing links...')
for line in csv.reader(self.csv, quotechar='"', delimiter=',', skipinitialspace=True):
# header
# id, timebin, prefix, hege, af, visibility, rpki_status, irr_status,
@@ -158,7 +161,7 @@ def run(self):
self.csv.close()
# Push links to IYP
- logging.warning('Pushing links to neo4j...\n')
+ logging.info('Pushing links to neo4j...')
self.iyp.batch_add_links('ORIGINATE', orig_links)
self.iyp.batch_add_links('CATEGORIZED', tag_links)
self.iyp.batch_add_links('DEPENDS_ON', dep_links)
diff --git a/iyp/crawlers/inetintel/as_org.py b/iyp/crawlers/inetintel/as_org.py
index 6faa657..71676d9 100644
--- a/iyp/crawlers/inetintel/as_org.py
+++ b/iyp/crawlers/inetintel/as_org.py
@@ -4,6 +4,7 @@
import sys
import tempfile
from collections import defaultdict
+from datetime import datetime, timezone
import pandas as pd
import requests
@@ -40,6 +41,21 @@ def get_latest_dataset_url(github_repo: str, data_dir: str, file_extension: str)
class Crawler(BaseCrawler):
# Base Crawler provides access to IYP via self.iyp
# and set up a dictionary with the org/url/today's date in self.reference
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping'
+ self.__get_modification_time_from_url()
+
+ def __get_modification_time_from_url(self):
+ expected_suffix = '.json'
+ try:
+ if not URL.endswith(expected_suffix):
+ raise ValueError(f'Expected "{expected_suffix}" file for data URL')
+ _, date_str = URL[:-len(expected_suffix)].rsplit('.', maxsplit=1)
+ date = datetime.strptime(date_str, '%Y-%m').replace(tzinfo=timezone.utc)
+ self.reference['reference_time_modification'] = date
+ except ValueError as e:
+ logging.warning(f'Failed to set modification time: {e}')
def run(self):
"""Fetch data and push to IYP."""
diff --git a/iyp/crawlers/manrs/members.py b/iyp/crawlers/manrs/members.py
index b652008..22384bf 100644
--- a/iyp/crawlers/manrs/members.py
+++ b/iyp/crawlers/manrs/members.py
@@ -61,8 +61,8 @@ def __init__(self, organization, url, name):
self.reference = {
'reference_name': NAME,
'reference_org': ORG,
- 'reference_url': URL,
- 'reference_time': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
+ 'reference_url_data': URL,
+ 'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
}
def run(self):
diff --git a/iyp/crawlers/nro/delegated_stats.py b/iyp/crawlers/nro/delegated_stats.py
index ded0bbe..ea96368 100644
--- a/iyp/crawlers/nro/delegated_stats.py
+++ b/iyp/crawlers/nro/delegated_stats.py
@@ -4,6 +4,7 @@
import os
import sys
from collections import defaultdict
+from datetime import datetime, timezone
import requests
@@ -18,6 +19,9 @@
class Crawler(BaseCrawler):
+ def __init__(self, organization, url, name):
+ super().__init__(organization, url, name)
+ self.reference['reference_url_info'] = 'https://www.nro.net/wp-content/uploads/nro-extended-stats-readme5.txt'
def run(self):
"""Fetch the delegated stat file from RIPE website and process lines one by
@@ -43,8 +47,15 @@ def run(self):
if line.strip().startswith('#'):
continue
- # skip version and summary lines
fields_value = line.split('|')
+ # get modification time from version line
+ if len(fields_value) == 7 and fields_value[0].isdigit():
+ try:
+ date = datetime.strptime(fields_value[5], '%Y%m%d').replace(tzinfo=timezone.utc)
+ self.reference['reference_time_modification'] = date
+ except ValueError as e:
+ logging.warning(f'Failed to set modification time: {e}')
+ # skip summary lines
if len(fields_value) < 8:
continue
@@ -65,7 +76,7 @@ def run(self):
prefixes.add(prefix)
# Create all nodes
- logging.warning('Pushing nodes to neo4j...\n')
+ logging.warning('Pushing nodes to neo4j...')
opaqueid_id = self.iyp.batch_get_nodes_by_single_prop('OpaqueID', 'id', opaqueids)
prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes)
country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries)
@@ -120,7 +131,7 @@ def run(self):
status_links[rec['status'].upper()].append(
{'src_id': prefix_qid, 'dst_id': opaqueid_qid, 'props': [reference]})
- logging.warning('Pusing links to neo4j...\n')
+ logging.warning('Pushing links to neo4j...')
# Push all links to IYP
self.iyp.batch_add_links('COUNTRY', country_links)
for label, links in status_links.items():
diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py
index 01c2ff4..ddb5415 100644
--- a/iyp/crawlers/openintel/__init__.py
+++ b/iyp/crawlers/openintel/__init__.py
@@ -1,7 +1,6 @@
# Simple Python script to fetch domain name to IP address mappings from OpenINTEL data
# OpenIntelCrawler is based on code from Mattijs Jonker