Replace sys.exit call with exception handling in crawlers (#107)
* Replace sys.exit call with exception handling in crawlers

* Pre-commit hooks modifications

---------

Co-authored-by: Malte Tashiro <[email protected]>
KiranSatyaRaj and m-appel authored Jan 10, 2024
1 parent cf73ea2 commit 96ac2c5
Showing 33 changed files with 106 additions and 92 deletions.
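The pattern repeated across the crawlers below is always the same: a failed download no longer terminates the process with sys.exit() but raises one of the new exception classes defined in iyp/__init__.py. A minimal before/after sketch (hypothetical crawler and placeholder URL, not taken from this diff):

import logging

import requests

from iyp import BaseCrawler, RequestStatusError

URL = 'https://example.org/data.json'  # placeholder URL for illustration


class Crawler(BaseCrawler):
    def run(self):
        req = requests.get(URL)
        if req.status_code != 200:
            logging.error(f'Cannot download data {req.status_code}: {req.text}')
            # Before this commit the crawler stopped the whole process:
            # sys.exit('Error while fetching data file')
            raise RequestStatusError('Error while fetching data file')
        # ... parse req.text and push nodes/links to IYP ...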
40 changes: 35 additions & 5 deletions iyp/__init__.py
@@ -3,11 +3,11 @@
import logging
import os
import pickle
import sys
from datetime import datetime, time, timezone
from shutil import rmtree
from typing import Optional

import requests
from neo4j import GraphDatabase

BATCH_SIZE = 50000
@@ -65,18 +65,48 @@ def dict2str(d, eq=':', pfx=''):
for key, value in d.items():
if isinstance(value, str) and '"' in value:
escaped = value.replace("'", r"\'")
data.append(f"{pfx+key}{eq} '{escaped}'")
data.append(f"{pfx + key}{eq} '{escaped}'")
elif isinstance(value, str) or isinstance(value, datetime):
data.append(f'{pfx+key}{eq} "{value}"')
data.append(f'{pfx + key}{eq} "{value}"')
elif value is None:
# Neo4j does not have the concept of empty properties.
pass
else:
data.append(f'{pfx+key}{eq} {value}')
data.append(f'{pfx + key}{eq} {value}')

return '{' + ','.join(data) + '}'


class RequestStatusError(requests.HTTPError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class JSONDecodeError(ValueError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class MissingKeyError(Exception):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class ConnectionError(requests.exceptions.ConnectionError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class AddressValueError(ValueError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class IYP(object):

def __init__(self):
@@ -95,7 +125,7 @@ def __init__(self):
self.db = GraphDatabase.driver(uri, auth=(self.login, self.password))

if self.db is None:
sys.exit('Could not connect to the Neo4j database!')
raise ConnectionError('Could not connect to the Neo4j database!')
# Raises an exception if there is a problem.
# "Best practice" is to just let the program
# crash: https://neo4j.com/docs/python-manual/current/connect/
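Since the crawlers now raise instead of exiting, the caller decides what a failure means. A hedged sketch of how a driver script could react (the run_safely helper and its return-code convention are illustrative assumptions, not part of this commit):

import logging

from iyp import (ConnectionError, JSONDecodeError, MissingKeyError,
                 RequestStatusError)


def run_safely(crawler):
    """Run an already-constructed crawler and report failure with a return code
    instead of letting a sys.exit() deep inside the crawler kill the process."""
    try:
        crawler.run()
    except (RequestStatusError, JSONDecodeError, MissingKeyError,
            ConnectionError) as e:
        logging.error(f'Crawler failed: {e}')
        return 1
    return 0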
4 changes: 2 additions & 2 deletions iyp/crawlers/apnic/eyeball.py
@@ -6,7 +6,7 @@
import iso3166
import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to APNIC API
URL = 'http://v6data.data.labs.apnic.net/ipv6-measurement/Economies/'
@@ -40,7 +40,7 @@ def run(self):
self.url = URL + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
req = requests.get(self.url)
if req.status_code != 200:
sys.exit('Error while fetching data for ' + cc)
raise RequestStatusError(f'Error while fetching data for {cc}')

asns = set()
names = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/bgp/rv_ris.py
@@ -57,15 +57,15 @@ def run(self):

for asn in origin_asns:
rnode.data['origin'][asn].add(elem.collector)
sys.stderr.write(f'\rProcessed {i+1} BGP messages')
sys.stderr.write(f'\rProcessed {i + 1} BGP messages')

sys.stderr.write('\nPushing data to IYP...\n')

# Push all prefixes data to IYP
for i, rnode in enumerate(rtree):
data = rnode.data['origin']
self.update_entry(rnode.prefix, data)
sys.stderr.write(f'\rProcessed {i+1} prefixes')
sys.stderr.write(f'\rProcessed {i + 1} prefixes')

def update_entry(self, prefix, originasn_collector):
"""Add the prefix to wikibase if it's not already there and update its
5 changes: 2 additions & 3 deletions iyp/crawlers/bgpkit/__init__.py
@@ -1,10 +1,9 @@
import bz2
import json
import sys

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError


class AS2RelCrawler(BaseCrawler):
@@ -20,7 +19,7 @@ def run(self):

req = requests.get(self.url, stream=True)
if req.status_code != 200:
sys.exit('Error while fetching AS relationships')
raise RequestStatusError('Error while fetching AS relationships')

rels = []
asns = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/bgpkit/peerstats.py
@@ -8,7 +8,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

MAIN_PAGE = 'https://data.bgpkit.com/peer-stats/'
URL = 'https://data.bgpkit.com/peer-stats/{collector}/{year}/{month:02d}/peer-stats_{collector}_{year}-{month:02d}-{day:02d}_{epoch}.bz2' # noqa: E501
@@ -24,7 +24,7 @@ def run(self):
req = requests.get(MAIN_PAGE)
if req.status_code != 200:
logging.error(f'Cannot fetch peer-stats page {req.status_code}: req.text')
sys.exit('Error while fetching main page')
raise RequestStatusError('Error while fetching main page')

# Find all collectors
collectors = []
4 changes: 2 additions & 2 deletions iyp/crawlers/bgpkit/pfx2asn.py
@@ -7,7 +7,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

URL = 'https://data.bgpkit.com/pfx2as/pfx2as-latest.json.bz2'
ORG = 'BGPKIT'
@@ -22,7 +22,7 @@ def run(self):

req = requests.get(URL, stream=True)
if req.status_code != 200:
sys.exit('Error while fetching pfx2as relationships')
raise RequestStatusError('Error while fetching pfx2as relationships')

entries = []
asns = set()
6 changes: 3 additions & 3 deletions iyp/crawlers/bgptools/anycast_prefixes.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, ConnectionError, RequestStatusError

# Organization name and URL to data
ORG = 'BGP.Tools'
@@ -29,10 +29,10 @@ def fetch_dataset(url: str):
return res
except requests.exceptions.ConnectionError as e:
logging.error(e)
sys.exit('Connection error while fetching data file')
raise ConnectionError('Connection error while fetching data file')
except requests.exceptions.HTTPError as e:
logging.error(e)
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')


class Crawler(BaseCrawler):
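The fetch_dataset hunk above is truncated; the full fetch-and-translate pattern looks roughly like the sketch below. Using raise ... from e is an optional refinement (not what the commit does) that keeps the original requests exception explicitly chained:

import logging

import requests

from iyp import ConnectionError, RequestStatusError


def fetch_dataset(url: str) -> requests.Response:
    # Sketch of the pattern used in anycast_prefixes.py: network errors and bad
    # HTTP status codes are translated into the repository's own exception types.
    try:
        res = requests.get(url)
        res.raise_for_status()  # assumption: the real function checks the status somehow
        return res
    except requests.exceptions.ConnectionError as e:
        logging.error(e)
        raise ConnectionError('Connection error while fetching data file') from e
    except requests.exceptions.HTTPError as e:
        logging.error(e)
        raise RequestStatusError('Error while fetching data file') from e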
4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/as_names.py
@@ -5,7 +5,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# curl -s https://bgp.tools/asns.csv | head -n 5
URL = 'https://bgp.tools/asns.csv'
@@ -27,7 +27,7 @@ def run(self):

req = requests.get(URL, headers=self.headers)
if req.status_code != 200:
sys.exit('Error while fetching AS names')
raise RequestStatusError('Error while fetching AS names')

lines = []
asns = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/tags.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# curl -s https://bgp.tools/asns.csv | head -n 5
URL = 'https://bgp.tools/tags/'
@@ -61,7 +61,7 @@ def run(self):
req = requests.get(url, headers=self.headers)
if req.status_code != 200:
print(req.text)
sys.exit('Error while fetching AS names')
raise RequestStatusError('Error while fetching AS names')

self.tag_qid = self.iyp.get_node('Tag', {'label': label})
for line in req.text.splitlines():
7 changes: 3 additions & 4 deletions iyp/crawlers/caida/asrank.py
@@ -7,7 +7,7 @@
import flatdict
import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to ASRank API
URL = 'https://api.asrank.caida.org/v2/restful/asns/?first=10000'
@@ -26,14 +26,13 @@ def run(self):
has_next = True
i = 0
while has_next:
url = URL + f'&offset={i*10000}'
url = URL + f'&offset={i * 10000}'
i += 1
logging.info(f'Fetching {url}')
req = requests.get(url)
if req.status_code != 200:
logging.error(f'Request failed with status: {req.status_code}')
# FIXME should raise an exception
sys.exit('Error while fetching data from API')
raise RequestStatusError('Error while fetching data from API')

ranking = json.loads(req.text)['data']['asns']
has_next = ranking['pageInfo']['hasNextPage']
4 changes: 2 additions & 2 deletions iyp/crawlers/cisco/umbrella_top1M.py
@@ -7,7 +7,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to Tranco top 1M
URL = 'http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip'
@@ -25,7 +25,7 @@ def run(self):
sys.stderr.write('Downloading latest list...\n')
req = requests.get(URL)
if req.status_code != 200:
sys.exit('Error while fetching Cisco Umbrella Top 1M csv file')
raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')

links = []
domains = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/citizenlab/urldb.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'Citizen Lab'
@@ -30,7 +30,7 @@ def run(self):

if req_for_country_codes.status_code != 200:
logging.error('Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

content = req_for_country_codes.content.decode('utf-8')
csv_data = csv.reader(content.splitlines(), delimiter=',')
2 changes: 1 addition & 1 deletion iyp/crawlers/cloudflare/dns_top_locations.py
@@ -116,7 +116,7 @@ def run(self):
self.compute_link(domain_top)

if i % 100 == 0:
sys.stderr.write(f'Pushing link batch #{int(i/100)}...\r')
sys.stderr.write(f'Pushing link batch #{int(i / 100)}...\r')
self.iyp.batch_add_links('QUERIED_FROM', self.statements)
self.statements = []

6 changes: 3 additions & 3 deletions iyp/crawlers/cloudflare/ranking_bucket.py
@@ -7,7 +7,7 @@
import requests
from requests.adapters import HTTPAdapter, Retry

from iyp import BaseCrawler
from iyp import BaseCrawler, JSONDecodeError, RequestStatusError

# Organization name and URL to data
ORG = 'Cloudflare'
@@ -45,12 +45,12 @@ def run(self):
req = req_session.get(URL_DATASETS)
if req.status_code != 200:
logging.error(f'Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

datasets_json = req.json()
if 'success' not in datasets_json or not datasets_json['success']:
logging.error(f'HTTP request succeeded but API returned: {req.text}')
sys.exit('Error while fetching data file')
raise JSONDecodeError('Error while fetching data file')

# Fetch all datasets first before starting to process them. This way we can
# get/create all DomainName nodes in one go and then just add the RANK
4 changes: 2 additions & 2 deletions iyp/crawlers/cloudflare/top100.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'Cloudflare'
@@ -37,7 +37,7 @@ def run(self):
req = requests.get(self.reference['reference_url'], headers=headers)
if req.status_code != 200:
print(f'Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

# Process line one after the other
for i, _ in enumerate(map(self.update, req.json()['result']['top'])):
6 changes: 3 additions & 3 deletions iyp/crawlers/emileaben/as_names.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'emileaben'
@@ -27,10 +27,10 @@ def run(self):
res = requests.get(URL)
except requests.exceptions.ConnectionError as e:
logging.error(e)
sys.exit('Connection error while fetching data file')
raise ConnectionError('Connection error while fetching data file')
except requests.exceptions.HTTPError as e:
logging.error(e)
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')
with open(filename, 'w') as file:
file.write(res.text)

4 changes: 2 additions & 2 deletions iyp/crawlers/example/crawler.py
@@ -5,7 +5,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'Example Org'
@@ -24,7 +24,7 @@ def run(self):
req = requests.get(self.reference['reference_url'])
if req.status_code != 200:
logging.error('Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

# Process line one after the other
for i, line in enumerate(req.text.splitlines()):
4 changes: 2 additions & 2 deletions iyp/crawlers/ihr/country_dependency.py
@@ -11,7 +11,7 @@
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to the API
URL = 'https://ihr.iijlab.net/ihr/api/hegemony/countries/?country={country}&af=4'
@@ -46,7 +46,7 @@ def run(self):
self.url = URL.format(country=cc)
req = self.http_session.get(self.url + '&format=json')
if req.status_code != 200:
sys.exit('Error while fetching data for ' + cc)
raise RequestStatusError('Error while fetching data for ' + cc)
data = json.loads(req.text)
ranking = data['results']

