diff --git a/iyp/__init__.py b/iyp/__init__.py
index b2bd1e7..7dcfe36 100644
--- a/iyp/__init__.py
+++ b/iyp/__init__.py
@@ -3,11 +3,11 @@
 import logging
 import os
 import pickle
-import sys
 from datetime import datetime, time, timezone
 from shutil import rmtree
 from typing import Optional
 
+import requests
 from neo4j import GraphDatabase
 
 BATCH_SIZE = 50000
@@ -65,18 +65,48 @@ def dict2str(d, eq=':', pfx=''):
     for key, value in d.items():
         if isinstance(value, str) and '"' in value:
             escaped = value.replace("'", r"\'")
-            data.append(f"{pfx+key}{eq} '{escaped}'")
+            data.append(f"{pfx + key}{eq} '{escaped}'")
         elif isinstance(value, str) or isinstance(value, datetime):
-            data.append(f'{pfx+key}{eq} "{value}"')
+            data.append(f'{pfx + key}{eq} "{value}"')
         elif value is None:
             # Neo4j does not have the concept of empty properties.
             pass
         else:
-            data.append(f'{pfx+key}{eq} {value}')
+            data.append(f'{pfx + key}{eq} {value}')
 
     return '{' + ','.join(data) + '}'
 
 
+class RequestStatusError(requests.HTTPError):
+    def __init__(self, message):
+        self.message = message
+        super().__init__(self.message)
+
+
+class JSONDecodeError(ValueError):
+    def __init__(self, message):
+        self.message = message
+        super().__init__(self.message)
+
+
+class MissingKeyError(Exception):
+    def __init__(self, message):
+        self.message = message
+        super().__init__(self.message)
+
+
+class ConnectionError(requests.exceptions.ConnectionError):
+    def __init__(self, message):
+        self.message = message
+        super().__init__(self.message)
+
+
+class AddressValueError(ValueError):
+    def __init__(self, message):
+        self.message = message
+        super().__init__(self.message)
+
+
 class IYP(object):
 
     def __init__(self):
@@ -95,7 +125,7 @@ def __init__(self):
         self.db = GraphDatabase.driver(uri, auth=(self.login, self.password))
 
         if self.db is None:
-            sys.exit('Could not connect to the Neo4j database!')
+            raise ConnectionError('Could not connect to the Neo4j database!')
         # Raises an exception if there is a problem. "Best practice" is to just let the program
         # crash: https://neo4j.com/docs/python-manual/current/connect/
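For illustration only, not part of the patch: once sys.exit() calls are replaced by these exception classes, a caller can decide per crawler how to react to a failure instead of having the whole process terminate. A minimal sketch, assuming the usual (organization, url, name) crawler constructor; the run_crawler wrapper name is made up:

    import logging

    from iyp import (ConnectionError, JSONDecodeError, MissingKeyError,
                     RequestStatusError)

    def run_crawler(crawler_class, org, url, name):
        # Hypothetical wrapper: log and continue when a crawler raises one of
        # the new exceptions instead of exiting the whole process.
        try:
            crawler_class(org, url, name).run()
        except (ConnectionError, RequestStatusError) as e:
            logging.error(f'{name}: could not fetch data: {e}')
        except (JSONDecodeError, MissingKeyError) as e:
            logging.error(f'{name}: unexpected response format: {e}')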
diff --git a/iyp/crawlers/apnic/eyeball.py b/iyp/crawlers/apnic/eyeball.py
index 4c043db..bee4fd4 100644
--- a/iyp/crawlers/apnic/eyeball.py
+++ b/iyp/crawlers/apnic/eyeball.py
@@ -6,7 +6,7 @@
 import iso3166
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # URL to APNIC API
 URL = 'http://v6data.data.labs.apnic.net/ipv6-measurement/Economies/'
@@ -40,7 +40,7 @@ def run(self):
             self.url = URL + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
             req = requests.get(self.url)
             if req.status_code != 200:
-                sys.exit('Error while fetching data for ' + cc)
+                raise RequestStatusError(f'Error while fetching data for {cc}')
 
             asns = set()
             names = set()
diff --git a/iyp/crawlers/bgp/rv_ris.py b/iyp/crawlers/bgp/rv_ris.py
index d009336..0083b0e 100644
--- a/iyp/crawlers/bgp/rv_ris.py
+++ b/iyp/crawlers/bgp/rv_ris.py
@@ -57,7 +57,7 @@ def run(self):
                 for asn in origin_asns:
                     rnode.data['origin'][asn].add(elem.collector)
 
-            sys.stderr.write(f'\rProcessed {i+1} BGP messages')
+            sys.stderr.write(f'\rProcessed {i + 1} BGP messages')
 
         sys.stderr.write('\nPushing data to IYP...\n')
 
@@ -65,7 +65,7 @@ def run(self):
         for i, rnode in enumerate(rtree):
             data = rnode.data['origin']
             self.update_entry(rnode.prefix, data)
-            sys.stderr.write(f'\rProcessed {i+1} prefixes')
+            sys.stderr.write(f'\rProcessed {i + 1} prefixes')
 
     def update_entry(self, prefix, originasn_collector):
         """Add the prefix to wikibase if it's not already there and update its
diff --git a/iyp/crawlers/bgpkit/__init__.py b/iyp/crawlers/bgpkit/__init__.py
index c096839..1894490 100644
--- a/iyp/crawlers/bgpkit/__init__.py
+++ b/iyp/crawlers/bgpkit/__init__.py
@@ -1,10 +1,9 @@
 import bz2
 import json
-import sys
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 
 class AS2RelCrawler(BaseCrawler):
@@ -20,7 +19,7 @@ def run(self):
 
         req = requests.get(self.url, stream=True)
         if req.status_code != 200:
-            sys.exit('Error while fetching AS relationships')
+            raise RequestStatusError('Error while fetching AS relationships')
 
         rels = []
         asns = set()
diff --git a/iyp/crawlers/bgpkit/peerstats.py b/iyp/crawlers/bgpkit/peerstats.py
index b3fc7d2..6f0b54f 100644
--- a/iyp/crawlers/bgpkit/peerstats.py
+++ b/iyp/crawlers/bgpkit/peerstats.py
@@ -8,7 +8,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 MAIN_PAGE = 'https://data.bgpkit.com/peer-stats/'
 URL = 'https://data.bgpkit.com/peer-stats/{collector}/{year}/{month:02d}/peer-stats_{collector}_{year}-{month:02d}-{day:02d}_{epoch}.bz2'  # noqa: E501
@@ -24,7 +24,7 @@ def run(self):
         req = requests.get(MAIN_PAGE)
         if req.status_code != 200:
             logging.error(f'Cannot fetch peer-stats page {req.status_code}: req.text')
-            sys.exit('Error while fetching main page')
+            raise RequestStatusError('Error while fetching main page')
 
         # Find all collectors
         collectors = []
diff --git a/iyp/crawlers/bgpkit/pfx2asn.py b/iyp/crawlers/bgpkit/pfx2asn.py
index bd1025c..89f5a93 100644
--- a/iyp/crawlers/bgpkit/pfx2asn.py
+++ b/iyp/crawlers/bgpkit/pfx2asn.py
@@ -7,7 +7,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 URL = 'https://data.bgpkit.com/pfx2as/pfx2as-latest.json.bz2'
 ORG = 'BGPKIT'
@@ -22,7 +22,7 @@ def run(self):
 
         req = requests.get(URL, stream=True)
         if req.status_code != 200:
-            sys.exit('Error while fetching pfx2as relationships')
+            raise RequestStatusError('Error while fetching pfx2as relationships')
 
         entries = []
         asns = set()
diff --git a/iyp/crawlers/bgptools/anycast_prefixes.py b/iyp/crawlers/bgptools/anycast_prefixes.py
index 2661bb8..cbf13b2 100644
--- a/iyp/crawlers/bgptools/anycast_prefixes.py
+++ b/iyp/crawlers/bgptools/anycast_prefixes.py
@@ -6,7 +6,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, ConnectionError, RequestStatusError
 
 # Organization name and URL to data
 ORG = 'BGP.Tools'
@@ -29,10 +29,10 @@ def fetch_dataset(url: str):
         return res
     except requests.exceptions.ConnectionError as e:
         logging.error(e)
-        sys.exit('Connection error while fetching data file')
+        raise ConnectionError('Connection error while fetching data file')
    except requests.exceptions.HTTPError as e:
         logging.error(e)
-        sys.exit('Error while fetching data file')
+        raise RequestStatusError('Error while fetching data file')
 
 
 class Crawler(BaseCrawler):
diff --git a/iyp/crawlers/bgptools/as_names.py b/iyp/crawlers/bgptools/as_names.py
index a977185..21498d6 100644
--- a/iyp/crawlers/bgptools/as_names.py
+++ b/iyp/crawlers/bgptools/as_names.py
@@ -5,7 +5,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # curl -s https://bgp.tools/asns.csv | head -n 5
 URL = 'https://bgp.tools/asns.csv'
@@ -27,7 +27,7 @@ def run(self):
 
         req = requests.get(URL, headers=self.headers)
         if req.status_code != 200:
-            sys.exit('Error while fetching AS names')
+            raise RequestStatusError('Error while fetching AS names')
 
         lines = []
         asns = set()
diff --git a/iyp/crawlers/bgptools/tags.py b/iyp/crawlers/bgptools/tags.py
index d12bd5d..e4e5b26 100644
--- a/iyp/crawlers/bgptools/tags.py
+++ b/iyp/crawlers/bgptools/tags.py
@@ -6,7 +6,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # curl -s https://bgp.tools/asns.csv | head -n 5
 URL = 'https://bgp.tools/tags/'
@@ -61,7 +61,7 @@ def run(self):
             req = requests.get(url, headers=self.headers)
             if req.status_code != 200:
                 print(req.text)
-                sys.exit('Error while fetching AS names')
+                raise RequestStatusError('Error while fetching AS names')
 
             self.tag_qid = self.iyp.get_node('Tag', {'label': label})
             for line in req.text.splitlines():
diff --git a/iyp/crawlers/caida/asrank.py b/iyp/crawlers/caida/asrank.py
index b05de64..c565182 100644
--- a/iyp/crawlers/caida/asrank.py
+++ b/iyp/crawlers/caida/asrank.py
@@ -7,7 +7,7 @@
 import flatdict
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # URL to ASRank API
 URL = 'https://api.asrank.caida.org/v2/restful/asns/?first=10000'
@@ -26,14 +26,13 @@ def run(self):
         has_next = True
         i = 0
         while has_next:
-            url = URL + f'&offset={i*10000}'
+            url = URL + f'&offset={i * 10000}'
             i += 1
             logging.info(f'Fetching {url}')
             req = requests.get(url)
             if req.status_code != 200:
                 logging.error(f'Request failed with status: {req.status_code}')
-                # FIXME should raise an exception
-                sys.exit('Error while fetching data from API')
+                raise RequestStatusError('Error while fetching data from API')
 
             ranking = json.loads(req.text)['data']['asns']
             has_next = ranking['pageInfo']['hasNextPage']
diff --git a/iyp/crawlers/cisco/umbrella_top1M.py b/iyp/crawlers/cisco/umbrella_top1M.py
index fd2c968..629f15b 100644
--- a/iyp/crawlers/cisco/umbrella_top1M.py
+++ b/iyp/crawlers/cisco/umbrella_top1M.py
@@ -7,7 +7,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # URL to Tranco top 1M
 URL = 'http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip'
@@ -25,7 +25,7 @@ def run(self):
         sys.stderr.write('Downloading latest list...\n')
         req = requests.get(URL)
         if req.status_code != 200:
-            sys.exit('Error while fetching Cisco Umbrella Top 1M csv file')
+            raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')
 
         links = []
         domains = set()
diff --git a/iyp/crawlers/citizenlab/urldb.py b/iyp/crawlers/citizenlab/urldb.py
index c4c69aa..109b3aa 100644
--- a/iyp/crawlers/citizenlab/urldb.py
+++ b/iyp/crawlers/citizenlab/urldb.py
@@ -6,7 +6,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # Organization name and URL to data
 ORG = 'Citizen Lab'
@@ -30,7 +30,7 @@ def run(self):
 
         if req_for_country_codes.status_code != 200:
             logging.error('Cannot download data {req.status_code}: {req.text}')
-            sys.exit('Error while fetching data file')
+            raise RequestStatusError('Error while fetching data file')
 
         content = req_for_country_codes.content.decode('utf-8')
         csv_data = csv.reader(content.splitlines(), delimiter=',')
diff --git a/iyp/crawlers/cloudflare/dns_top_locations.py b/iyp/crawlers/cloudflare/dns_top_locations.py
index d31629e..4f4118f 100644
--- a/iyp/crawlers/cloudflare/dns_top_locations.py
+++ b/iyp/crawlers/cloudflare/dns_top_locations.py
@@ -116,7 +116,7 @@ def run(self):
                 self.compute_link(domain_top)
 
                 if i % 100 == 0:
-                    sys.stderr.write(f'Pushing link batch #{int(i/100)}...\r')
+                    sys.stderr.write(f'Pushing link batch #{int(i / 100)}...\r')
                     self.iyp.batch_add_links('QUERIED_FROM', self.statements)
                     self.statements = []
 
diff --git a/iyp/crawlers/cloudflare/ranking_bucket.py b/iyp/crawlers/cloudflare/ranking_bucket.py
index 51ce721..7193ad6 100644
--- a/iyp/crawlers/cloudflare/ranking_bucket.py
+++ b/iyp/crawlers/cloudflare/ranking_bucket.py
@@ -7,7 +7,7 @@
 import requests
 from requests.adapters import HTTPAdapter, Retry
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, JSONDecodeError, RequestStatusError
 
 # Organization name and URL to data
 ORG = 'Cloudflare'
@@ -45,12 +45,12 @@ def run(self):
 
         req = req_session.get(URL_DATASETS)
         if req.status_code != 200:
             logging.error(f'Cannot download data {req.status_code}: {req.text}')
-            sys.exit('Error while fetching data file')
+            raise RequestStatusError('Error while fetching data file')
         datasets_json = req.json()
         if 'success' not in datasets_json or not datasets_json['success']:
             logging.error(f'HTTP request succeeded but API returned: {req.text}')
-            sys.exit('Error while fetching data file')
+            raise JSONDecodeError('Error while fetching data file')
 
         # Fetch all datasets first before starting to process them. This way we can
         # get/create all DomainName nodes in one go and then just add the RANK
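An aside, not part of the patch: the ranking_bucket change above and the ripe.atlas_probes change further down split response validation across the new exception classes (HTTP status, JSON body, expected keys). A minimal sketch of that shared pattern, with a made-up helper name:

    import json

    import requests

    from iyp import JSONDecodeError, MissingKeyError, RequestStatusError

    def validate_json_response(response: requests.Response) -> dict:
        # Hypothetical helper: map each failure mode to one of the new exceptions.
        if response.status_code != 200:
            raise RequestStatusError(f'{response.url} returned status {response.status_code}')
        try:
            data = response.json()
        except json.decoder.JSONDecodeError as e:
            raise JSONDecodeError(f'Cannot decode JSON reply from {response.url}: {e}')
        if 'success' not in data:
            raise MissingKeyError(f'"success" key missing from {response.url} reply')
        return data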
diff --git a/iyp/crawlers/cloudflare/top100.py b/iyp/crawlers/cloudflare/top100.py
index fb976d5..5e2f5fb 100644
--- a/iyp/crawlers/cloudflare/top100.py
+++ b/iyp/crawlers/cloudflare/top100.py
@@ -6,7 +6,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # Organization name and URL to data
 ORG = 'Cloudflare'
@@ -37,7 +37,7 @@ def run(self):
         req = requests.get(self.reference['reference_url'], headers=headers)
         if req.status_code != 200:
             print(f'Cannot download data {req.status_code}: {req.text}')
-            sys.exit('Error while fetching data file')
+            raise RequestStatusError('Error while fetching data file')
 
         # Process line one after the other
         for i, _ in enumerate(map(self.update, req.json()['result']['top'])):
diff --git a/iyp/crawlers/emileaben/as_names.py b/iyp/crawlers/emileaben/as_names.py
index 76114f9..cebe71c 100644
--- a/iyp/crawlers/emileaben/as_names.py
+++ b/iyp/crawlers/emileaben/as_names.py
@@ -6,7 +6,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, ConnectionError, RequestStatusError
 
 # Organization name and URL to data
 ORG = 'emileaben'
@@ -27,10 +27,10 @@ def run(self):
             res = requests.get(URL)
         except requests.exceptions.ConnectionError as e:
             logging.error(e)
-            sys.exit('Connection error while fetching data file')
+            raise ConnectionError('Connection error while fetching data file')
         except requests.exceptions.HTTPError as e:
             logging.error(e)
-            sys.exit('Error while fetching data file')
+            raise RequestStatusError('Error while fetching data file')
 
         with open(filename, 'w') as file:
             file.write(res.text)
diff --git a/iyp/crawlers/example/crawler.py b/iyp/crawlers/example/crawler.py
index bad4cb1..a329eec 100644
--- a/iyp/crawlers/example/crawler.py
+++ b/iyp/crawlers/example/crawler.py
@@ -5,7 +5,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # Organization name and URL to data
 ORG = 'Example Org'
@@ -24,7 +24,7 @@ def run(self):
         req = requests.get(self.reference['reference_url'])
         if req.status_code != 200:
             logging.error('Cannot download data {req.status_code}: {req.text}')
-            sys.exit('Error while fetching data file')
+            raise RequestStatusError('Error while fetching data file')
 
         # Process line one after the other
         for i, line in enumerate(req.text.splitlines()):
diff --git a/iyp/crawlers/ihr/country_dependency.py b/iyp/crawlers/ihr/country_dependency.py
index 07d4f23..3ff4125 100644
--- a/iyp/crawlers/ihr/country_dependency.py
+++ b/iyp/crawlers/ihr/country_dependency.py
@@ -11,7 +11,7 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # URL to the API
 URL = 'https://ihr.iijlab.net/ihr/api/hegemony/countries/?country={country}&af=4'
@@ -46,7 +46,7 @@ def run(self):
             self.url = URL.format(country=cc)
             req = self.http_session.get(self.url + '&format=json')
             if req.status_code != 200:
-                sys.exit('Error while fetching data for ' + cc)
+                raise RequestStatusError('Error while fetching data for ' + cc)
 
             data = json.loads(req.text)
             ranking = data['results']
diff --git a/iyp/crawlers/inetintel/as_org.py b/iyp/crawlers/inetintel/as_org.py
index a9f6596..6faa657 100644
--- a/iyp/crawlers/inetintel/as_org.py
+++ b/iyp/crawlers/inetintel/as_org.py
@@ -9,7 +9,7 @@
 import requests
 from github import Github
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, ConnectionError, RequestStatusError
 
 
 def get_latest_dataset_url(github_repo: str, data_dir: str, file_extension: str):
@@ -55,10 +55,10 @@ def run(self):
             req = requests.get(URL)
         except requests.exceptions.ConnectionError as e:
             logging.error(e)
-            sys.exit('Connection error while fetching data file')
+            raise ConnectionError('Connection error while fetching data file')
         except requests.exceptions.HTTPError as e:
             logging.error(e)
-            sys.exit('Error while fetching data file')
+            raise RequestStatusError('Error while fetching data file')
 
         with open(self.filename, 'w') as file:
             file.write(req.text)
diff --git a/iyp/crawlers/manrs/members.py b/iyp/crawlers/manrs/members.py
index d2eb6a8..8c37117 100644
--- a/iyp/crawlers/manrs/members.py
+++ b/iyp/crawlers/manrs/members.py
@@ -6,7 +6,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # URL to MANRS csv file
 URL = 'https://www.manrs.org/wp-json/manrs/v1/csv/4'
@@ -70,7 +70,7 @@ def run(self):
 
         req = requests.get(URL)
         if req.status_code != 200:
-            sys.exit('Error while fetching MANRS csv file')
+            raise RequestStatusError('Error while fetching MANRS csv file')
 
         for i, row in enumerate(req.text.splitlines()):
             # Skip the header
diff --git a/iyp/crawlers/nro/delegated_stats.py b/iyp/crawlers/nro/delegated_stats.py
index 2bcd3c0..ded0bbe 100644
--- a/iyp/crawlers/nro/delegated_stats.py
+++ b/iyp/crawlers/nro/delegated_stats.py
@@ -7,7 +7,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # NOTE: this script is not adding new ASNs. It only adds links for existing ASNs
 # Should be run after crawlers that push many ASNs (e.g. ripe.as_names)
@@ -25,7 +25,7 @@ def run(self):
 
         req = requests.get(URL)
         if req.status_code != 200:
-            sys.exit('Error while fetching delegated file')
+            raise RequestStatusError('Error while fetching delegated file')
 
         asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')
 
diff --git a/iyp/crawlers/pch/__init__.py b/iyp/crawlers/pch/__init__.py
index 7071c6a..7e61dfb 100644
--- a/iyp/crawlers/pch/__init__.py
+++ b/iyp/crawlers/pch/__init__.py
@@ -15,7 +15,7 @@
 from requests_futures.sessions import FuturesSession
 from urllib3.util.retry import Retry
 
-from iyp import BaseCrawler, CacheHandler
+from iyp import AddressValueError, BaseCrawler, CacheHandler
 from iyp.crawlers.pch.show_bgp_parser import ShowBGPParser
 
 PARALLEL_DOWNLOADS = 8
@@ -47,7 +47,7 @@ def __init__(self, organization: str, url: str, name: str, af: int):
         """af: Address family of the crawler. Must be 4 or 6."""
         if af not in (4, 6):
             logging.error(f'Invalid address family: {af}')
-            sys.exit(f'Invalid address family: {af}')
+            raise AddressValueError(f'Invalid address family: {af}')
         self.MAX_LOOKBACK = timedelta(days=7)
         self.af = af
         if self.af == 4:
diff --git a/iyp/crawlers/pch/show_bgp_parser.py b/iyp/crawlers/pch/show_bgp_parser.py
index 6e399c1..f1f92ab 100644
--- a/iyp/crawlers/pch/show_bgp_parser.py
+++ b/iyp/crawlers/pch/show_bgp_parser.py
@@ -25,7 +25,7 @@ def __init__(self, af: int) -> None:
         """af: Address family of the parser. Must be 4 or 6."""
         if af not in (4, 6):
             logging.error(f'Invalid address family specified: {af}')
-            sys.exit('Invalid address family specified.')
+            raise AddressValueError('Invalid address family specified.')
         self.af = af
         self.status_codes = {'s': 'suppressed',
                              'd': 'damped',
diff --git a/iyp/crawlers/rapid7/forward_dns_v4.py b/iyp/crawlers/rapid7/forward_dns_v4.py
index 6f74dfd..e0391c4 100644
--- a/iyp/crawlers/rapid7/forward_dns_v4.py
+++ b/iyp/crawlers/rapid7/forward_dns_v4.py
@@ -122,7 +122,7 @@ def run(self):
                          f'{len(self.wh._domain2qid)} domain names in wiki\n')
         # push data to wiki
         for i, (tld, pfxs) in enumerate(self.tld_pfx.items()):
-            sys.stderr.write(f'\33[2K\rUpdating iyp... {i+1}/{len(self.tld_pfx)}\t{tld} {len(pfxs)} prefixes')
+            sys.stderr.write(f'\33[2K\rUpdating iyp... {i + 1}/{len(self.tld_pfx)}\t{tld} {len(pfxs)} prefixes')
             self.update(tld, pfxs)
         sys.stderr.write('\n')
 
diff --git a/iyp/crawlers/ripe/as_names.py b/iyp/crawlers/ripe/as_names.py
index 53b8b86..bd4ea21 100644
--- a/iyp/crawlers/ripe/as_names.py
+++ b/iyp/crawlers/ripe/as_names.py
@@ -5,7 +5,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 URL = 'https://ftp.ripe.net/ripe/asnames/asn.txt'
 ORG = 'RIPE NCC'
@@ -19,7 +19,7 @@ def run(self):
 
         req = requests.get(URL)
         if req.status_code != 200:
-            sys.exit('Error while fetching AS names')
+            raise RequestStatusError('Error while fetching AS names')
 
         lines = []
         asns = set()
diff --git a/iyp/crawlers/ripe/atlas_measurements.py b/iyp/crawlers/ripe/atlas_measurements.py
index 405e55f..d0f9aac 100644
--- a/iyp/crawlers/ripe/atlas_measurements.py
+++ b/iyp/crawlers/ripe/atlas_measurements.py
@@ -11,7 +11,8 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-from iyp import BaseCrawler
+from iyp import (BaseCrawler, JSONDecodeError, MissingKeyError,
+                 RequestStatusError)
 
 ORG = 'RIPE NCC'
 
@@ -19,24 +20,6 @@
 NAME = 'ripe.atlas_measurements'
 
 
-class RequestStatusError(requests.HTTPError):
-    def __init__(self, message):
-        self.message = message
-        super().__init__(self.message)
-
-
-class JSONDecodeError(ValueError):
-    def __init__(self, message):
-        self.message = message
-        super().__init__(self.message)
-
-
-class MissingKeyError(Exception):
-    def __init__(self, message):
-        self.message = message
-        super().__init__(self.message)
-
-
 class Crawler(BaseCrawler):
     def __init__(self, organization, url, name):
         self.__initialize_session()
diff --git a/iyp/crawlers/ripe/atlas_probes.py b/iyp/crawlers/ripe/atlas_probes.py
index 8d8596a..03c1ddb 100644
--- a/iyp/crawlers/ripe/atlas_probes.py
+++ b/iyp/crawlers/ripe/atlas_probes.py
@@ -12,7 +12,7 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, MissingKeyError, RequestStatusError
 
 ORG = 'RIPE NCC'
 
@@ -39,13 +39,13 @@ def __initialize_session(self) -> None:
     @staticmethod
     def __process_response(response: requests.Response):
         if response.status_code != requests.codes.ok:
-            sys.exit(f'Request to {response.url} failed with status: {response.status_code}')
+            raise RequestStatusError(f'Request to {response.url} failed with status: {response.status_code}')
         try:
             data = response.json()
         except json.decoder.JSONDecodeError as e:
-            sys.exit(f'Decoding JSON reply from {response.url} failed with exception: {e}')
+            raise RequestStatusError(f'Decoding JSON reply from {response.url} failed with exception: {e}')
         if 'next' not in data or 'results' not in data:
-            sys.exit('"next" or "results" key missing from response data.')
+            raise MissingKeyError('"next" or "results" key missing from response data.')
 
         next_url = data['next']
         if not next_url:
diff --git a/iyp/crawlers/ripe/roa.py b/iyp/crawlers/ripe/roa.py
index 9ed5aae..55459e4 100644
--- a/iyp/crawlers/ripe/roa.py
+++ b/iyp/crawlers/ripe/roa.py
@@ -7,7 +7,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # URL to RIPE repository
 URL = 'https://ftp.ripe.net/rpki/'
@@ -44,7 +44,7 @@ def run(self):
         logging.info(f'Fetching ROA file: {self.url}')
         req = requests.get(self.url)
         if req.status_code != 200:
-            sys.exit('Error while fetching data for ' + self.url)
+            raise RequestStatusError('Error while fetching data for ' + self.url)
 
         # Aggregate data per prefix
         asns = set()
diff --git a/iyp/crawlers/spamhaus/asn_drop.py b/iyp/crawlers/spamhaus/asn_drop.py
index cf1487b..becc99b 100644
--- a/iyp/crawlers/spamhaus/asn_drop.py
+++ b/iyp/crawlers/spamhaus/asn_drop.py
@@ -3,6 +3,7 @@
 
 import requests
 
+from iyp import RequestStatusError
 from iyp.wiki.wikihandy import Wikihandy
 
 # URL to ASN Drop List
@@ -65,7 +66,7 @@ def run(self):
 
         req = requests.get(URL)
         if req.status_code != 200:
-            sys.exit('Error while fetching the blocklist')
+            raise RequestStatusError('Error while fetching the blocklist')
 
         for i, row in enumerate(req.text.splitlines()):
             # Skip the header
@@ -73,7 +74,7 @@ def run(self):
                 continue
             self.update_net(row)
 
-            sys.stderr.write(f'\rProcessed {i+1} ASes')
+            sys.stderr.write(f'\rProcessed {i + 1} ASes')
 
         sys.stderr.write('\n')
         self.iyp.close()
diff --git a/iyp/crawlers/spamhaus/prefix_drop.py b/iyp/crawlers/spamhaus/prefix_drop.py
index 5f527e2..e86694b 100644
--- a/iyp/crawlers/spamhaus/prefix_drop.py
+++ b/iyp/crawlers/spamhaus/prefix_drop.py
@@ -3,6 +3,7 @@
 
 import requests
 
+from iyp import RequestStatusError
 from iyp.wiki.wikihandy import Wikihandy
 
 # URL to spamhaus data
@@ -66,7 +67,7 @@ def run(self):
 
         req = requests.get(self.url)
         if req.status_code != 200:
-            sys.exit('Error while fetching the blocklist')
+            raise RequestStatusError('Error while fetching the blocklist')
 
         for i, row in enumerate(req.text.splitlines()):
             # Skip the header
@@ -74,7 +75,7 @@ def run(self):
                 continue
             self.update_net(row)
 
-            sys.stderr.write(f'\rProcessed {i+1} prefixes')
+            sys.stderr.write(f'\rProcessed {i + 1} prefixes')
 
         sys.stderr.write('\n')
         self.iyp.close()
diff --git a/iyp/crawlers/spamhaus/prefix_edrop.py b/iyp/crawlers/spamhaus/prefix_edrop.py
index 879a0b7..84ca931 100644
--- a/iyp/crawlers/spamhaus/prefix_edrop.py
+++ b/iyp/crawlers/spamhaus/prefix_edrop.py
@@ -3,6 +3,7 @@
 
 import requests
 
+from iyp import RequestStatusError
 from iyp.wiki.wikihandy import Wikihandy
 
 # URL to spamhaus data
@@ -63,7 +64,7 @@ def run(self):
 
         req = requests.get(URL)
         if req.status_code != 200:
-            sys.exit('Error while fetching the blocklist')
+            raise RequestStatusError('Error while fetching the blocklist')
 
         for i, row in enumerate(req.text.splitlines()):
             # Skip the header
@@ -71,7 +72,7 @@ def run(self):
                 continue
             self.update_net(row)
 
-            sys.stderr.write(f'\rProcessed {i+1} prefixes')
+            sys.stderr.write(f'\rProcessed {i + 1} prefixes')
 
         sys.stderr.write('\n')
         self.iyp.close()
diff --git a/iyp/crawlers/stanford/asdb.py b/iyp/crawlers/stanford/asdb.py
index 58a620d..3a4e569 100644
--- a/iyp/crawlers/stanford/asdb.py
+++ b/iyp/crawlers/stanford/asdb.py
@@ -10,7 +10,7 @@
 import requests
 from bs4 import BeautifulSoup
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 
 def get_latest_asdb_dataset_url(asdb_stanford_data_url: str, file_name_format: str):
@@ -37,7 +37,7 @@ def run(self):
 
         req = requests.get(URL)
         if req.status_code != 200:
-            sys.exit('Error while fetching ASdb')
+            raise RequestStatusError('Error while fetching ASdb')
 
         lines = []
         asns = set()
diff --git a/iyp/crawlers/tranco/top1M.py b/iyp/crawlers/tranco/top1M.py
index e6214c5..e4b1923 100644
--- a/iyp/crawlers/tranco/top1M.py
+++ b/iyp/crawlers/tranco/top1M.py
@@ -7,7 +7,7 @@
 
 import requests
 
-from iyp import BaseCrawler
+from iyp import BaseCrawler, RequestStatusError
 
 # URL to Tranco top 1M
 URL = 'https://tranco-list.eu/top-1m.csv.zip'
@@ -25,7 +25,7 @@ def run(self):
         sys.stderr.write('Downloading latest list...\n')
         req = requests.get(URL)
         if req.status_code != 200:
-            sys.exit('Error while fetching Tranco csv file')
+            raise RequestStatusError('Error while fetching Tranco csv file')
 
         links = []
         domains = set()
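Closing illustration, not part of the patch: most hunks above repeat the same fetch-and-raise idiom, which could also be factored into a shared helper. A minimal sketch under that assumption; the fetch_or_raise name is made up:

    import requests

    from iyp import ConnectionError, RequestStatusError

    def fetch_or_raise(url: str, **kwargs) -> requests.Response:
        # Hypothetical helper: GET a URL and raise the new IYP exceptions
        # instead of calling sys.exit() on failure.
        try:
            req = requests.get(url, **kwargs)
        except requests.exceptions.ConnectionError as e:
            raise ConnectionError(f'Connection error while fetching {url}: {e}')
        if req.status_code != 200:
            raise RequestStatusError(f'Error while fetching {url}: HTTP {req.status_code}')
        return req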