From dfad84fa282a06032d12a3b67c631e5877886c1f Mon Sep 17 00:00:00 2001 From: Malte Tashiro Date: Tue, 13 Feb 2024 10:26:53 +0000 Subject: [PATCH] Add info URL and modification timestamp for IHR --- iyp/crawlers/ihr/__init__.py | 15 +++++++------ iyp/crawlers/ihr/country_dependency.py | 30 ++++++++++++++------------ iyp/crawlers/ihr/rov.py | 23 +++++++++++--------- 3 files changed, 37 insertions(+), 31 deletions(-) diff --git a/iyp/crawlers/ihr/__init__.py b/iyp/crawlers/ihr/__init__.py index cb5f676..9ff8517 100644 --- a/iyp/crawlers/ihr/__init__.py +++ b/iyp/crawlers/ihr/__init__.py @@ -1,6 +1,6 @@ import csv import os -from datetime import datetime, time, timezone +from datetime import timezone import arrow import lz4.frame @@ -34,6 +34,7 @@ class HegemonyCrawler(BaseCrawler): def __init__(self, organization, url, name, af): self.af = af super().__init__(organization, url, name) + self.reference['reference_url_info'] = 'https://ihr.iijlab.net/ihr/en-us/documentation#AS_dependency' def run(self): """Fetch data from file and push to IYP.""" @@ -50,12 +51,12 @@ def run(self): url = self.url.format(year=today.year, month=today.month, day=today.day) req = requests.head(url) - self.reference = { - 'reference_url_data': url, - 'reference_org': self.organization, - 'reference_name': self.name, - 'reference_time_fetch': datetime.combine(today.date(), time.min, timezone.utc) - } + self.reference['reference_url_data'] = url + self.reference['reference_time_modification'] = today.datetime.replace(hour=0, + minute=0, + second=0, + microsecond=0, + tzinfo=timezone.utc) os.makedirs('tmp/', exist_ok=True) os.system(f'wget {url} -P tmp/') diff --git a/iyp/crawlers/ihr/country_dependency.py b/iyp/crawlers/ihr/country_dependency.py index 18e2a1e..1a8d4af 100644 --- a/iyp/crawlers/ihr/country_dependency.py +++ b/iyp/crawlers/ihr/country_dependency.py @@ -3,7 +3,7 @@ import logging import os import sys -from datetime import datetime, time, timezone +from datetime import datetime, timezone import arrow import iso3166 @@ -37,6 +37,7 @@ def __init__(self, organization, url, name): self.http_session.mount('https://', HTTPAdapter(max_retries=retries)) super().__init__(organization, url, name) + self.reference['reference_url_info'] = 'https://ihr.iijlab.net/ihr/en-us/documentation#Country_s_network_dependency' # noqa: E501 def run(self): """Fetch data from API and push to IYP.""" @@ -49,14 +50,8 @@ def run(self): raise RequestStatusError('Error while fetching data for ' + cc) data = json.loads(req.text) ranking = data['results'] - - # Setup references - self.reference = { - 'reference_org': ORG, - 'reference_url_data': URL, - 'reference_name': NAME, - 'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc) - } + if not ranking: + continue # Setup rankings' node country_qid = self.iyp.get_node('Country', @@ -65,15 +60,22 @@ def run(self): } ) - countryrank_statements = [] - if country_qid is not None: - countryrank_statements = [('COUNTRY', country_qid, self.reference)] - # Find the latest timebin in the data last_timebin = '1970-01-01' for r in ranking: if arrow.get(r['timebin']) > arrow.get(last_timebin): last_timebin = r['timebin'] + self.reference['reference_url_data'] = self.url + f'&timebin={last_timebin}' + self.reference['reference_time_modification'] = None + try: + date = datetime.strptime(last_timebin, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc) + self.reference['reference_time_modification'] = date + except ValueError as e: + logging.warning(f'Failed to get modification time: {e}') + + countryrank_statements = [] + if country_qid is not None: + countryrank_statements = [('COUNTRY', country_qid, self.reference.copy())] # Make ranking and push data links = [] @@ -106,7 +108,7 @@ def run(self): links.append({ 'src_id': self.asn_id[asn['asn']], 'dst_id': self.countryrank_qid, - 'props': [self.reference, asn] + 'props': [self.reference.copy(), asn] }) # Push links to IYP diff --git a/iyp/crawlers/ihr/rov.py b/iyp/crawlers/ihr/rov.py index b8a3e29..4d5daf8 100644 --- a/iyp/crawlers/ihr/rov.py +++ b/iyp/crawlers/ihr/rov.py @@ -3,7 +3,7 @@ import logging import os import sys -from datetime import datetime, time, timezone +from datetime import timezone import arrow import lz4.frame @@ -45,6 +45,9 @@ def close(self): class Crawler(BaseCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name) + self.reference['reference_url_info'] = 'https://ihr-archive.iijlab.net/ihr/rov/README.txt' def run(self): """Fetch data from file and push to IYP.""" @@ -60,12 +63,12 @@ def run(self): today = today.shift(days=-1) url = URL.format(year=today.year, month=today.month, day=today.day) - self.reference = { - 'reference_org': ORG, - 'reference_url_data': url, - 'reference_name': NAME, - 'reference_time_fetch': datetime.combine(today.date(), time.min, timezone.utc) - } + self.reference['reference_url_data'] = url + self.reference['reference_time_modification'] = today.datetime.replace(hour=0, + minute=0, + second=0, + microsecond=0, + tzinfo=timezone.utc) os.makedirs('tmp/', exist_ok=True) os.system(f'wget {url} -P tmp/') @@ -73,7 +76,7 @@ def run(self): local_filename = 'tmp/' + url.rpartition('/')[2] self.csv = lz4Csv(local_filename) - logging.warning('Getting node IDs from neo4j...\n') + logging.info('Getting node IDs from neo4j...') asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn') prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix') tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label') @@ -84,7 +87,7 @@ def run(self): dep_links = [] country_links = [] - logging.warning('Computing links...\n') + logging.info('Computing links...') for line in csv.reader(self.csv, quotechar='"', delimiter=',', skipinitialspace=True): # header # id, timebin, prefix, hege, af, visibility, rpki_status, irr_status, @@ -158,7 +161,7 @@ def run(self): self.csv.close() # Push links to IYP - logging.warning('Pushing links to neo4j...\n') + logging.info('Pushing links to neo4j...') self.iyp.batch_add_links('ORIGINATE', orig_links) self.iyp.batch_add_links('CATEGORIZED', tag_links) self.iyp.batch_add_links('DEPENDS_ON', dep_links)