Replace sys.exit call with exception handling in crawlers (#107)
* Replace sys.exit call with exception handling in crawlers

* Pre-commit hooks modifications

---------

Co-authored-by: Malte Tashiro <[email protected]>
KiranSatyaRaj and m-appel authored Jan 10, 2024
1 parent cf73ea2 commit 96ac2c5
Showing 33 changed files with 106 additions and 92 deletions.
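The pattern repeated across the crawlers below is always the same: a failed download no longer terminates the process with sys.exit() but raises one of the new exception classes defined in iyp/__init__.py. A minimal before/after sketch (hypothetical crawler and placeholder URL, not taken from this diff):

import logging

import requests

from iyp import BaseCrawler, RequestStatusError

URL = 'https://example.org/data.json'  # placeholder URL for illustration


class Crawler(BaseCrawler):
    def run(self):
        req = requests.get(URL)
        if req.status_code != 200:
            logging.error(f'Cannot download data {req.status_code}: {req.text}')
            # Before this commit the crawler stopped the whole process:
            # sys.exit('Error while fetching data file')
            raise RequestStatusError('Error while fetching data file')
        # ... parse req.text and push nodes/links to IYP ...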
40 changes: 35 additions & 5 deletions iyp/__init__.py
@@ -3,11 +3,11 @@
import logging
import os
import pickle
import sys
from datetime import datetime, time, timezone
from shutil import rmtree
from typing import Optional

import requests
from neo4j import GraphDatabase

BATCH_SIZE = 50000
@@ -65,18 +65,48 @@ def dict2str(d, eq=':', pfx=''):
for key, value in d.items():
if isinstance(value, str) and '"' in value:
escaped = value.replace("'", r"\'")
data.append(f"{pfx+key}{eq} '{escaped}'")
data.append(f"{pfx + key}{eq} '{escaped}'")
elif isinstance(value, str) or isinstance(value, datetime):
data.append(f'{pfx+key}{eq} "{value}"')
data.append(f'{pfx + key}{eq} "{value}"')
elif value is None:
# Neo4j does not have the concept of empty properties.
pass
else:
data.append(f'{pfx+key}{eq} {value}')
data.append(f'{pfx + key}{eq} {value}')

return '{' + ','.join(data) + '}'


class RequestStatusError(requests.HTTPError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class JSONDecodeError(ValueError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class MissingKeyError(Exception):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class ConnectionError(requests.exceptions.ConnectionError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class AddressValueError(ValueError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class IYP(object):

def __init__(self):
@@ -95,7 +125,7 @@ def __init__(self):
self.db = GraphDatabase.driver(uri, auth=(self.login, self.password))

if self.db is None:
sys.exit('Could not connect to the Neo4j database!')
raise ConnectionError('Could not connect to the Neo4j database!')
# Raises an exception if there is a problem.
# "Best practice" is to just let the program
# crash: https://neo4j.com/docs/python-manual/current/connect/
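Since the crawlers now raise instead of exiting, the caller decides what a failure means. A hedged sketch of how a driver script could react (the run_safely helper and its return-code convention are illustrative assumptions, not part of this commit):

import logging

from iyp import (ConnectionError, JSONDecodeError, MissingKeyError,
                 RequestStatusError)


def run_safely(crawler):
    """Run an already-constructed crawler and report failure with a return code
    instead of letting a sys.exit() deep inside the crawler kill the process."""
    try:
        crawler.run()
    except (RequestStatusError, JSONDecodeError, MissingKeyError,
            ConnectionError) as e:
        logging.error(f'Crawler failed: {e}')
        return 1
    return 0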
4 changes: 2 additions & 2 deletions iyp/crawlers/apnic/eyeball.py
@@ -6,7 +6,7 @@
import iso3166
import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to APNIC API
URL = 'http://v6data.data.labs.apnic.net/ipv6-measurement/Economies/'
@@ -40,7 +40,7 @@ def run(self):
self.url = URL + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
req = requests.get(self.url)
if req.status_code != 200:
sys.exit('Error while fetching data for ' + cc)
raise RequestStatusError(f'Error while fetching data for {cc}')

asns = set()
names = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/bgp/rv_ris.py
@@ -57,15 +57,15 @@ def run(self):

for asn in origin_asns:
rnode.data['origin'][asn].add(elem.collector)
sys.stderr.write(f'\rProcessed {i+1} BGP messages')
sys.stderr.write(f'\rProcessed {i + 1} BGP messages')

sys.stderr.write('\nPushing data to IYP...\n')

# Push all prefixes data to IYP
for i, rnode in enumerate(rtree):
data = rnode.data['origin']
self.update_entry(rnode.prefix, data)
sys.stderr.write(f'\rProcessed {i+1} prefixes')
sys.stderr.write(f'\rProcessed {i + 1} prefixes')

def update_entry(self, prefix, originasn_collector):
"""Add the prefix to wikibase if it's not already there and update its
5 changes: 2 additions & 3 deletions iyp/crawlers/bgpkit/__init__.py
@@ -1,10 +1,9 @@
import bz2
import json
import sys

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError


class AS2RelCrawler(BaseCrawler):
@@ -20,7 +19,7 @@ def run(self):

req = requests.get(self.url, stream=True)
if req.status_code != 200:
sys.exit('Error while fetching AS relationships')
raise RequestStatusError('Error while fetching AS relationships')

rels = []
asns = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/bgpkit/peerstats.py
@@ -8,7 +8,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

MAIN_PAGE = 'https://data.bgpkit.com/peer-stats/'
URL = 'https://data.bgpkit.com/peer-stats/{collector}/{year}/{month:02d}/peer-stats_{collector}_{year}-{month:02d}-{day:02d}_{epoch}.bz2' # noqa: E501
@@ -24,7 +24,7 @@ def run(self):
req = requests.get(MAIN_PAGE)
if req.status_code != 200:
logging.error(f'Cannot fetch peer-stats page {req.status_code}: req.text')
sys.exit('Error while fetching main page')
raise RequestStatusError('Error while fetching main page')

# Find all collectors
collectors = []
4 changes: 2 additions & 2 deletions iyp/crawlers/bgpkit/pfx2asn.py
@@ -7,7 +7,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

URL = 'https://data.bgpkit.com/pfx2as/pfx2as-latest.json.bz2'
ORG = 'BGPKIT'
@@ -22,7 +22,7 @@ def run(self):

req = requests.get(URL, stream=True)
if req.status_code != 200:
sys.exit('Error while fetching pfx2as relationships')
raise RequestStatusError('Error while fetching pfx2as relationships')

entries = []
asns = set()
6 changes: 3 additions & 3 deletions iyp/crawlers/bgptools/anycast_prefixes.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, ConnectionError, RequestStatusError

# Organization name and URL to data
ORG = 'BGP.Tools'
@@ -29,10 +29,10 @@ def fetch_dataset(url: str):
return res
except requests.exceptions.ConnectionError as e:
logging.error(e)
sys.exit('Connection error while fetching data file')
raise ConnectionError('Connection error while fetching data file')
except requests.exceptions.HTTPError as e:
logging.error(e)
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')


class Crawler(BaseCrawler):
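The fetch_dataset hunk above is truncated; the full fetch-and-translate pattern looks roughly like the sketch below. Using raise ... from e is an optional refinement (not what the commit does) that keeps the original requests exception explicitly chained:

import logging

import requests

from iyp import ConnectionError, RequestStatusError


def fetch_dataset(url: str) -> requests.Response:
    # Sketch of the pattern used in anycast_prefixes.py: network errors and bad
    # HTTP status codes are translated into the repository's own exception types.
    try:
        res = requests.get(url)
        res.raise_for_status()  # assumption: the real function checks the status somehow
        return res
    except requests.exceptions.ConnectionError as e:
        logging.error(e)
        raise ConnectionError('Connection error while fetching data file') from e
    except requests.exceptions.HTTPError as e:
        logging.error(e)
        raise RequestStatusError('Error while fetching data file') from e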
4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/as_names.py
@@ -5,7 +5,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# curl -s https://bgp.tools/asns.csv | head -n 5
URL = 'https://bgp.tools/asns.csv'
@@ -27,7 +27,7 @@ def run(self):

req = requests.get(URL, headers=self.headers)
if req.status_code != 200:
sys.exit('Error while fetching AS names')
raise RequestStatusError('Error while fetching AS names')

lines = []
asns = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/tags.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# curl -s https://bgp.tools/asns.csv | head -n 5
URL = 'https://bgp.tools/tags/'
@@ -61,7 +61,7 @@ def run(self):
req = requests.get(url, headers=self.headers)
if req.status_code != 200:
print(req.text)
sys.exit('Error while fetching AS names')
raise RequestStatusError('Error while fetching AS names')

self.tag_qid = self.iyp.get_node('Tag', {'label': label})
for line in req.text.splitlines():
7 changes: 3 additions & 4 deletions iyp/crawlers/caida/asrank.py
@@ -7,7 +7,7 @@
import flatdict
import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to ASRank API
URL = 'https://api.asrank.caida.org/v2/restful/asns/?first=10000'
@@ -26,14 +26,13 @@ def run(self):
has_next = True
i = 0
while has_next:
url = URL + f'&offset={i*10000}'
url = URL + f'&offset={i * 10000}'
i += 1
logging.info(f'Fetching {url}')
req = requests.get(url)
if req.status_code != 200:
logging.error(f'Request failed with status: {req.status_code}')
# FIXME should raise an exception
sys.exit('Error while fetching data from API')
raise RequestStatusError('Error while fetching data from API')

ranking = json.loads(req.text)['data']['asns']
has_next = ranking['pageInfo']['hasNextPage']
4 changes: 2 additions & 2 deletions iyp/crawlers/cisco/umbrella_top1M.py
@@ -7,7 +7,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to Tranco top 1M
URL = 'http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip'
@@ -25,7 +25,7 @@ def run(self):
sys.stderr.write('Downloading latest list...\n')
req = requests.get(URL)
if req.status_code != 200:
sys.exit('Error while fetching Cisco Umbrella Top 1M csv file')
raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')

links = []
domains = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/citizenlab/urldb.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'Citizen Lab'
@@ -30,7 +30,7 @@ def run(self):

if req_for_country_codes.status_code != 200:
logging.error('Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

content = req_for_country_codes.content.decode('utf-8')
csv_data = csv.reader(content.splitlines(), delimiter=',')
2 changes: 1 addition & 1 deletion iyp/crawlers/cloudflare/dns_top_locations.py
@@ -116,7 +116,7 @@ def run(self):
self.compute_link(domain_top)

if i % 100 == 0:
sys.stderr.write(f'Pushing link batch #{int(i/100)}...\r')
sys.stderr.write(f'Pushing link batch #{int(i / 100)}...\r')
self.iyp.batch_add_links('QUERIED_FROM', self.statements)
self.statements = []

6 changes: 3 additions & 3 deletions iyp/crawlers/cloudflare/ranking_bucket.py
@@ -7,7 +7,7 @@
import requests
from requests.adapters import HTTPAdapter, Retry

from iyp import BaseCrawler
from iyp import BaseCrawler, JSONDecodeError, RequestStatusError

# Organization name and URL to data
ORG = 'Cloudflare'
@@ -45,12 +45,12 @@ def run(self):
req = req_session.get(URL_DATASETS)
if req.status_code != 200:
logging.error(f'Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

datasets_json = req.json()
if 'success' not in datasets_json or not datasets_json['success']:
logging.error(f'HTTP request succeeded but API returned: {req.text}')
sys.exit('Error while fetching data file')
raise JSONDecodeError('Error while fetching data file')

# Fetch all datasets first before starting to process them. This way we can
# get/create all DomainName nodes in one go and then just add the RANK
4 changes: 2 additions & 2 deletions iyp/crawlers/cloudflare/top100.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'Cloudflare'
@@ -37,7 +37,7 @@ def run(self):
req = requests.get(self.reference['reference_url'], headers=headers)
if req.status_code != 200:
print(f'Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

# Process line one after the other
for i, _ in enumerate(map(self.update, req.json()['result']['top'])):
6 changes: 3 additions & 3 deletions iyp/crawlers/emileaben/as_names.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'emileaben'
@@ -27,10 +27,10 @@ def run(self):
res = requests.get(URL)
except requests.exceptions.ConnectionError as e:
logging.error(e)
sys.exit('Connection error while fetching data file')
raise ConnectionError('Connection error while fetching data file')
except requests.exceptions.HTTPError as e:
logging.error(e)
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')
with open(filename, 'w') as file:
file.write(res.text)

4 changes: 2 additions & 2 deletions iyp/crawlers/example/crawler.py
@@ -5,7 +5,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'Example Org'
@@ -24,7 +24,7 @@ def run(self):
req = requests.get(self.reference['reference_url'])
if req.status_code != 200:
logging.error('Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

# Process line one after the other
for i, line in enumerate(req.text.splitlines()):
4 changes: 2 additions & 2 deletions iyp/crawlers/ihr/country_dependency.py
@@ -11,7 +11,7 @@
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to the API
URL = 'https://ihr.iijlab.net/ihr/api/hegemony/countries/?country={country}&af=4'
@@ -46,7 +46,7 @@ def run(self):
self.url = URL.format(country=cc)
req = self.http_session.get(self.url + '&format=json')
if req.status_code != 200:
sys.exit('Error while fetching data for ' + cc)
raise RequestStatusError('Error while fetching data for ' + cc)
data = json.loads(req.text)
ranking = data['results']

