Skip to content

Commit

Permalink
Add info URL and modification timestamp for IHR
Browse files Browse the repository at this point in the history
  • Loading branch information
m-appel committed Feb 13, 2024
1 parent 0a2f924 commit dfad84f
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 31 deletions.
15 changes: 8 additions & 7 deletions iyp/crawlers/ihr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv
import os
from datetime import datetime, time, timezone
from datetime import timezone

import arrow
import lz4.frame
Expand Down Expand Up @@ -34,6 +34,7 @@ class HegemonyCrawler(BaseCrawler):
def __init__(self, organization, url, name, af):
    """Create a hegemony crawler for one address family (af).

    Stores the address family, delegates common setup to the base
    crawler, then attaches the IHR AS-dependency documentation page
    as the reference info URL.
    """
    # Keep the address family around before delegating — the base
    # initializer may rely on instance state being in place.
    self.af = af
    super().__init__(organization, url, name)
    # Human-readable documentation describing this dataset.
    info_url = 'https://ihr.iijlab.net/ihr/en-us/documentation#AS_dependency'
    self.reference['reference_url_info'] = info_url

def run(self):
"""Fetch data from file and push to IYP."""
Expand All @@ -50,12 +51,12 @@ def run(self):
url = self.url.format(year=today.year, month=today.month, day=today.day)
req = requests.head(url)

self.reference = {
'reference_url_data': url,
'reference_org': self.organization,
'reference_name': self.name,
'reference_time_fetch': datetime.combine(today.date(), time.min, timezone.utc)
}
self.reference['reference_url_data'] = url
self.reference['reference_time_modification'] = today.datetime.replace(hour=0,
minute=0,
second=0,
microsecond=0,
tzinfo=timezone.utc)

os.makedirs('tmp/', exist_ok=True)
os.system(f'wget {url} -P tmp/')
Expand Down
30 changes: 16 additions & 14 deletions iyp/crawlers/ihr/country_dependency.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import os
import sys
from datetime import datetime, time, timezone
from datetime import datetime, timezone

import arrow
import iso3166
Expand Down Expand Up @@ -37,6 +37,7 @@ def __init__(self, organization, url, name):
self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

super().__init__(organization, url, name)
self.reference['reference_url_info'] = 'https://ihr.iijlab.net/ihr/en-us/documentation#Country_s_network_dependency' # noqa: E501

def run(self):
"""Fetch data from API and push to IYP."""
Expand All @@ -49,14 +50,8 @@ def run(self):
raise RequestStatusError('Error while fetching data for ' + cc)
data = json.loads(req.text)
ranking = data['results']

# Setup references
self.reference = {
'reference_org': ORG,
'reference_url_data': URL,
'reference_name': NAME,
'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
}
if not ranking:
continue

# Setup rankings' node
country_qid = self.iyp.get_node('Country',
Expand All @@ -65,15 +60,22 @@ def run(self):
}
)

countryrank_statements = []
if country_qid is not None:
countryrank_statements = [('COUNTRY', country_qid, self.reference)]

# Find the latest timebin in the data
last_timebin = '1970-01-01'
for r in ranking:
if arrow.get(r['timebin']) > arrow.get(last_timebin):
last_timebin = r['timebin']
self.reference['reference_url_data'] = self.url + f'&timebin={last_timebin}'
self.reference['reference_time_modification'] = None
try:
date = datetime.strptime(last_timebin, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
self.reference['reference_time_modification'] = date
except ValueError as e:
logging.warning(f'Failed to get modification time: {e}')

countryrank_statements = []
if country_qid is not None:
countryrank_statements = [('COUNTRY', country_qid, self.reference.copy())]

# Make ranking and push data
links = []
Expand Down Expand Up @@ -106,7 +108,7 @@ def run(self):
links.append({
'src_id': self.asn_id[asn['asn']],
'dst_id': self.countryrank_qid,
'props': [self.reference, asn]
'props': [self.reference.copy(), asn]
})

# Push links to IYP
Expand Down
23 changes: 13 additions & 10 deletions iyp/crawlers/ihr/rov.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import os
import sys
from datetime import datetime, time, timezone
from datetime import timezone

import arrow
import lz4.frame
Expand Down Expand Up @@ -45,6 +45,9 @@ def close(self):


class Crawler(BaseCrawler):
def __init__(self, organization, url, name):
    """Set up the ROV crawler.

    Delegates all common initialization to the base crawler and then
    records the IHR ROV archive README as the reference info URL.
    """
    super().__init__(organization, url, name)
    # Dataset documentation shipped alongside the archive files.
    self.reference['reference_url_info'] = (
        'https://ihr-archive.iijlab.net/ihr/rov/README.txt'
    )

def run(self):
"""Fetch data from file and push to IYP."""
Expand All @@ -60,20 +63,20 @@ def run(self):
today = today.shift(days=-1)
url = URL.format(year=today.year, month=today.month, day=today.day)

self.reference = {
'reference_org': ORG,
'reference_url_data': url,
'reference_name': NAME,
'reference_time_fetch': datetime.combine(today.date(), time.min, timezone.utc)
}
self.reference['reference_url_data'] = url
self.reference['reference_time_modification'] = today.datetime.replace(hour=0,
minute=0,
second=0,
microsecond=0,
tzinfo=timezone.utc)

os.makedirs('tmp/', exist_ok=True)
os.system(f'wget {url} -P tmp/')

local_filename = 'tmp/' + url.rpartition('/')[2]
self.csv = lz4Csv(local_filename)

logging.warning('Getting node IDs from neo4j...\n')
logging.info('Getting node IDs from neo4j...')
asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')
prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix')
tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label')
Expand All @@ -84,7 +87,7 @@ def run(self):
dep_links = []
country_links = []

logging.warning('Computing links...\n')
logging.info('Computing links...')
for line in csv.reader(self.csv, quotechar='"', delimiter=',', skipinitialspace=True):
# header
# id, timebin, prefix, hege, af, visibility, rpki_status, irr_status,
Expand Down Expand Up @@ -158,7 +161,7 @@ def run(self):
self.csv.close()

# Push links to IYP
logging.warning('Pushing links to neo4j...\n')
logging.info('Pushing links to neo4j...')
self.iyp.batch_add_links('ORIGINATE', orig_links)
self.iyp.batch_add_links('CATEGORIZED', tag_links)
self.iyp.batch_add_links('DEPENDS_ON', dep_links)
Expand Down

0 comments on commit dfad84f

Please sign in to comment.