Skip to content

Commit

Permalink
Add info URL and modification timestamp for IHR
Browse files Browse the repository at this point in the history
  • Loading branch information
m-appel committed Feb 13, 2024
1 parent 0a2f924 commit dfad84f
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 31 deletions.
15 changes: 8 additions & 7 deletions iyp/crawlers/ihr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv
import os
from datetime import datetime, time, timezone
from datetime import timezone

import arrow
import lz4.frame
Expand Down Expand Up @@ -34,6 +34,7 @@ class HegemonyCrawler(BaseCrawler):
def __init__(self, organization, url, name, af):
    """Create a hegemony crawler for one address family (af).

    Stores the address family, delegates common setup to the base
    crawler, then attaches the IHR AS-dependency documentation page
    as the reference info URL.
    """
    # Keep the address family around before delegating — the base
    # initializer may rely on instance state being in place.
    self.af = af
    super().__init__(organization, url, name)
    # Human-readable documentation describing this dataset.
    info_url = 'https://ihr.iijlab.net/ihr/en-us/documentation#AS_dependency'
    self.reference['reference_url_info'] = info_url

def run(self):
"""Fetch data from file and push to IYP."""
Expand All @@ -50,12 +51,12 @@ def run(self):
url = self.url.format(year=today.year, month=today.month, day=today.day)
req = requests.head(url)

self.reference = {
'reference_url_data': url,
'reference_org': self.organization,
'reference_name': self.name,
'reference_time_fetch': datetime.combine(today.date(), time.min, timezone.utc)
}
self.reference['reference_url_data'] = url
self.reference['reference_time_modification'] = today.datetime.replace(hour=0,
minute=0,
second=0,
microsecond=0,
tzinfo=timezone.utc)

os.makedirs('tmp/', exist_ok=True)
os.system(f'wget {url} -P tmp/')
Expand Down
30 changes: 16 additions & 14 deletions iyp/crawlers/ihr/country_dependency.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import os
import sys
from datetime import datetime, time, timezone
from datetime import datetime, timezone

import arrow
import iso3166
Expand Down Expand Up @@ -37,6 +37,7 @@ def __init__(self, organization, url, name):
self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

super().__init__(organization, url, name)
self.reference['reference_url_info'] = 'https://ihr.iijlab.net/ihr/en-us/documentation#Country_s_network_dependency' # noqa: E501

def run(self):
"""Fetch data from API and push to IYP."""
Expand All @@ -49,14 +50,8 @@ def run(self):
raise RequestStatusError('Error while fetching data for ' + cc)
data = json.loads(req.text)
ranking = data['results']

# Setup references
self.reference = {
'reference_org': ORG,
'reference_url_data': URL,
'reference_name': NAME,
'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
}
if not ranking:
continue

# Setup rankings' node
country_qid = self.iyp.get_node('Country',
Expand All @@ -65,15 +60,22 @@ def run(self):
}
)

countryrank_statements = []
if country_qid is not None:
countryrank_statements = [('COUNTRY', country_qid, self.reference)]

# Find the latest timebin in the data
last_timebin = '1970-01-01'
for r in ranking:
if arrow.get(r['timebin']) > arrow.get(last_timebin):
last_timebin = r['timebin']
self.reference['reference_url_data'] = self.url + f'&timebin={last_timebin}'
self.reference['reference_time_modification'] = None
try:
date = datetime.strptime(last_timebin, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
self.reference['reference_time_modification'] = date
except ValueError as e:
logging.warning(f'Failed to get modification time: {e}')

countryrank_statements = []
if country_qid is not None:
countryrank_statements = [('COUNTRY', country_qid, self.reference.copy())]

# Make ranking and push data
links = []
Expand Down Expand Up @@ -106,7 +108,7 @@ def run(self):
links.append({
'src_id': self.asn_id[asn['asn']],
'dst_id': self.countryrank_qid,
'props': [self.reference, asn]
'props': [self.reference.copy(), asn]
})

# Push links to IYP
Expand Down
23 changes: 13 additions & 10 deletions iyp/crawlers/ihr/rov.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import os
import sys
from datetime import datetime, time, timezone
from datetime import timezone

import arrow
import lz4.frame
Expand Down Expand Up @@ -45,6 +45,9 @@ def close(self):


class Crawler(BaseCrawler):
def __init__(self, organization, url, name):
    """Set up the ROV crawler.

    Delegates all common initialization to the base crawler and then
    records the IHR ROV archive README as the reference info URL.
    """
    super().__init__(organization, url, name)
    # Dataset documentation shipped alongside the archive files.
    self.reference['reference_url_info'] = (
        'https://ihr-archive.iijlab.net/ihr/rov/README.txt'
    )

def run(self):
"""Fetch data from file and push to IYP."""
Expand All @@ -60,20 +63,20 @@ def run(self):
today = today.shift(days=-1)
url = URL.format(year=today.year, month=today.month, day=today.day)

self.reference = {
'reference_org': ORG,
'reference_url_data': url,
'reference_name': NAME,
'reference_time_fetch': datetime.combine(today.date(), time.min, timezone.utc)
}
self.reference['reference_url_data'] = url
self.reference['reference_time_modification'] = today.datetime.replace(hour=0,
minute=0,
second=0,
microsecond=0,
tzinfo=timezone.utc)

os.makedirs('tmp/', exist_ok=True)
os.system(f'wget {url} -P tmp/')

local_filename = 'tmp/' + url.rpartition('/')[2]
self.csv = lz4Csv(local_filename)

logging.warning('Getting node IDs from neo4j...\n')
logging.info('Getting node IDs from neo4j...')
asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')
prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix')
tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label')
Expand All @@ -84,7 +87,7 @@ def run(self):
dep_links = []
country_links = []

logging.warning('Computing links...\n')
logging.info('Computing links...')
for line in csv.reader(self.csv, quotechar='"', delimiter=',', skipinitialspace=True):
# header
# id, timebin, prefix, hege, af, visibility, rpki_status, irr_status,
Expand Down Expand Up @@ -158,7 +161,7 @@ def run(self):
self.csv.close()

# Push links to IYP
logging.warning('Pushing links to neo4j...\n')
logging.info('Pushing links to neo4j...')
self.iyp.batch_add_links('ORIGINATE', orig_links)
self.iyp.batch_add_links('CATEGORIZED', tag_links)
self.iyp.batch_add_links('DEPENDS_ON', dep_links)
Expand Down

0 comments on commit dfad84f

Please sign in to comment.