Replace sys.exit call with exception handling in crawlers #107

Merged
merged 5 commits on Jan 10, 2024
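
This PR applies one pattern across the crawlers: instead of terminating the whole process with sys.exit() when a download or parse step fails, a crawler now raises one of the dedicated exception classes added to iyp/__init__.py (RequestStatusError, JSONDecodeError, MissingKeyError, ConnectionError, AddressValueError). A minimal sketch of the before/after shape, using an illustrative crawler class and URL that are not part of this PR:

```python
import requests

from iyp import BaseCrawler, RequestStatusError

URL = 'https://example.com/data.json'  # illustrative URL, not from this PR


class ExampleCrawler(BaseCrawler):
    def run(self):
        req = requests.get(URL)
        if req.status_code != 200:
            # Previously: sys.exit('Error while fetching data file')
            # Now: raise, so the caller can log the failure and decide
            # whether to abort or continue with the other crawlers.
            raise RequestStatusError(f'Error while fetching data file ({req.status_code})')
        # ... process req.text and push nodes/relationships to IYP ...
```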
40 changes: 35 additions & 5 deletions iyp/__init__.py
@@ -3,11 +3,11 @@
import logging
import os
import pickle
import sys
from datetime import datetime, time, timezone
from shutil import rmtree
from typing import Optional

import requests
from neo4j import GraphDatabase

BATCH_SIZE = 50000
@@ -65,18 +65,48 @@ def dict2str(d, eq=':', pfx=''):
for key, value in d.items():
if isinstance(value, str) and '"' in value:
escaped = value.replace("'", r"\'")
data.append(f"{pfx+key}{eq} '{escaped}'")
data.append(f"{pfx + key}{eq} '{escaped}'")
elif isinstance(value, str) or isinstance(value, datetime):
data.append(f'{pfx+key}{eq} "{value}"')
data.append(f'{pfx + key}{eq} "{value}"')
elif value is None:
# Neo4j does not have the concept of empty properties.
pass
else:
data.append(f'{pfx+key}{eq} {value}')
data.append(f'{pfx + key}{eq} {value}')

return '{' + ','.join(data) + '}'


class RequestStatusError(requests.HTTPError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class JSONDecodeError(ValueError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class MissingKeyError(Exception):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class ConnectionError(requests.exceptions.ConnectionError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class AddressValueError(ValueError):
def __init__(self, message):
self.message = message
super().__init__(self.message)


class IYP(object):

def __init__(self):
@@ -95,7 +125,7 @@ def __init__(self):
self.db = GraphDatabase.driver(uri, auth=(self.login, self.password))

if self.db is None:
sys.exit('Could not connect to the Neo4j database!')
raise ConnectionError('Could not connect to the Neo4j database!')
# Raises an exception if there is a problem.
# "Best practice" is to just let the program
# crash: https://neo4j.com/docs/python-manual/current/connect/
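The comment above follows the linked Neo4j driver guidance: let connection problems surface as exceptions instead of exiting. With sys.exit() removed from the crawlers, a caller can decide per crawler whether a failure is fatal; a hypothetical helper (not part of this PR, names assumed for illustration) might look like:

```python
import logging

from iyp import (ConnectionError, JSONDecodeError, MissingKeyError,
                 RequestStatusError)


def run_crawler(crawler):
    # Hypothetical helper, not part of this PR: run a single crawler and
    # report the failure instead of letting it kill the whole process.
    try:
        crawler.run()
        return True
    except (RequestStatusError, JSONDecodeError, MissingKeyError,
            ConnectionError) as e:
        logging.error(f'Crawler failed: {e}')
        return False
```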
4 changes: 2 additions & 2 deletions iyp/crawlers/apnic/eyeball.py
@@ -6,7 +6,7 @@
import iso3166
import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to APNIC API
URL = 'http://v6data.data.labs.apnic.net/ipv6-measurement/Economies/'
@@ -40,7 +40,7 @@ def run(self):
self.url = URL + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
req = requests.get(self.url)
if req.status_code != 200:
sys.exit('Error while fetching data for ' + cc)
raise RequestStatusError(f'Error while fetching data for {cc}')

asns = set()
names = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/bgp/rv_ris.py
@@ -57,15 +57,15 @@ def run(self):

for asn in origin_asns:
rnode.data['origin'][asn].add(elem.collector)
sys.stderr.write(f'\rProcessed {i+1} BGP messages')
sys.stderr.write(f'\rProcessed {i + 1} BGP messages')

sys.stderr.write('\nPushing data to IYP...\n')

# Push all prefixes data to IYP
for i, rnode in enumerate(rtree):
data = rnode.data['origin']
self.update_entry(rnode.prefix, data)
sys.stderr.write(f'\rProcessed {i+1} prefixes')
sys.stderr.write(f'\rProcessed {i + 1} prefixes')

def update_entry(self, prefix, originasn_collector):
"""Add the prefix to wikibase if it's not already there and update its
5 changes: 2 additions & 3 deletions iyp/crawlers/bgpkit/__init__.py
@@ -1,10 +1,9 @@
import bz2
import json
import sys

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError


class AS2RelCrawler(BaseCrawler):
@@ -20,7 +19,7 @@ def run(self):

req = requests.get(self.url, stream=True)
if req.status_code != 200:
sys.exit('Error while fetching AS relationships')
raise RequestStatusError('Error while fetching AS relationships')

rels = []
asns = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/bgpkit/peerstats.py
@@ -8,7 +8,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

MAIN_PAGE = 'https://data.bgpkit.com/peer-stats/'
URL = 'https://data.bgpkit.com/peer-stats/{collector}/{year}/{month:02d}/peer-stats_{collector}_{year}-{month:02d}-{day:02d}_{epoch}.bz2' # noqa: E501
@@ -24,7 +24,7 @@ def run(self):
req = requests.get(MAIN_PAGE)
if req.status_code != 200:
logging.error(f'Cannot fetch peer-stats page {req.status_code}: req.text')
sys.exit('Error while fetching main page')
raise RequestStatusError('Error while fetching main page')

# Find all collectors
collectors = []
4 changes: 2 additions & 2 deletions iyp/crawlers/bgpkit/pfx2asn.py
@@ -7,7 +7,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

URL = 'https://data.bgpkit.com/pfx2as/pfx2as-latest.json.bz2'
ORG = 'BGPKIT'
@@ -22,7 +22,7 @@ def run(self):

req = requests.get(URL, stream=True)
if req.status_code != 200:
sys.exit('Error while fetching pfx2as relationships')
raise RequestStatusError('Error while fetching pfx2as relationships')

entries = []
asns = set()
6 changes: 3 additions & 3 deletions iyp/crawlers/bgptools/anycast_prefixes.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, ConnectionError, RequestStatusError

# Organization name and URL to data
ORG = 'BGP.Tools'
@@ -29,10 +29,10 @@ def fetch_dataset(url: str):
return res
except requests.exceptions.ConnectionError as e:
logging.error(e)
sys.exit('Connection error while fetching data file')
raise ConnectionError('Connection error while fetching data file')
except requests.exceptions.HTTPError as e:
logging.error(e)
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')


class Crawler(BaseCrawler):
4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/as_names.py
@@ -5,7 +5,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# curl -s https://bgp.tools/asns.csv | head -n 5
URL = 'https://bgp.tools/asns.csv'
@@ -27,7 +27,7 @@ def run(self):

req = requests.get(URL, headers=self.headers)
if req.status_code != 200:
sys.exit('Error while fetching AS names')
raise RequestStatusError('Error while fetching AS names')

lines = []
asns = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/tags.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# curl -s https://bgp.tools/asns.csv | head -n 5
URL = 'https://bgp.tools/tags/'
@@ -61,7 +61,7 @@ def run(self):
req = requests.get(url, headers=self.headers)
if req.status_code != 200:
print(req.text)
sys.exit('Error while fetching AS names')
raise RequestStatusError('Error while fetching AS names')

self.tag_qid = self.iyp.get_node('Tag', {'label': label})
for line in req.text.splitlines():
7 changes: 3 additions & 4 deletions iyp/crawlers/caida/asrank.py
@@ -7,7 +7,7 @@
import flatdict
import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to ASRank API
URL = 'https://api.asrank.caida.org/v2/restful/asns/?first=10000'
@@ -26,14 +26,13 @@ def run(self):
has_next = True
i = 0
while has_next:
url = URL + f'&offset={i*10000}'
url = URL + f'&offset={i * 10000}'
i += 1
logging.info(f'Fetching {url}')
req = requests.get(url)
if req.status_code != 200:
logging.error(f'Request failed with status: {req.status_code}')
# FIXME should raise an exception
sys.exit('Error while fetching data from API')
raise RequestStatusError('Error while fetching data from API')

ranking = json.loads(req.text)['data']['asns']
has_next = ranking['pageInfo']['hasNextPage']
4 changes: 2 additions & 2 deletions iyp/crawlers/cisco/umbrella_top1M.py
@@ -7,7 +7,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to Tranco top 1M
URL = 'http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip'
@@ -25,7 +25,7 @@ def run(self):
sys.stderr.write('Downloading latest list...\n')
req = requests.get(URL)
if req.status_code != 200:
sys.exit('Error while fetching Cisco Umbrella Top 1M csv file')
raise RequestStatusError('Error while fetching Cisco Umbrella Top 1M csv file')

links = []
domains = set()
4 changes: 2 additions & 2 deletions iyp/crawlers/citizenlab/urldb.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'Citizen Lab'
@@ -30,7 +30,7 @@ def run(self):

if req_for_country_codes.status_code != 200:
logging.error('Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

content = req_for_country_codes.content.decode('utf-8')
csv_data = csv.reader(content.splitlines(), delimiter=',')
2 changes: 1 addition & 1 deletion iyp/crawlers/cloudflare/dns_top_locations.py
@@ -116,7 +116,7 @@ def run(self):
self.compute_link(domain_top)

if i % 100 == 0:
sys.stderr.write(f'Pushing link batch #{int(i/100)}...\r')
sys.stderr.write(f'Pushing link batch #{int(i / 100)}...\r')
self.iyp.batch_add_links('QUERIED_FROM', self.statements)
self.statements = []

6 changes: 3 additions & 3 deletions iyp/crawlers/cloudflare/ranking_bucket.py
@@ -7,7 +7,7 @@
import requests
from requests.adapters import HTTPAdapter, Retry

from iyp import BaseCrawler
from iyp import BaseCrawler, JSONDecodeError, RequestStatusError

# Organization name and URL to data
ORG = 'Cloudflare'
@@ -45,12 +45,12 @@ def run(self):
req = req_session.get(URL_DATASETS)
if req.status_code != 200:
logging.error(f'Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

datasets_json = req.json()
if 'success' not in datasets_json or not datasets_json['success']:
logging.error(f'HTTP request succeeded but API returned: {req.text}')
sys.exit('Error while fetching data file')
raise JSONDecodeError('Error while fetching data file')

# Fetch all datasets first before starting to process them. This way we can
# get/create all DomainName nodes in one go and then just add the RANK
4 changes: 2 additions & 2 deletions iyp/crawlers/cloudflare/top100.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'Cloudflare'
@@ -37,7 +37,7 @@ def run(self):
req = requests.get(self.reference['reference_url'], headers=headers)
if req.status_code != 200:
print(f'Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

# Process line one after the other
for i, _ in enumerate(map(self.update, req.json()['result']['top'])):
6 changes: 3 additions & 3 deletions iyp/crawlers/emileaben/as_names.py
@@ -6,7 +6,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'emileaben'
@@ -27,10 +27,10 @@ def run(self):
res = requests.get(URL)
except requests.exceptions.ConnectionError as e:
logging.error(e)
sys.exit('Connection error while fetching data file')
raise ConnectionError('Connection error while fetching data file')
except requests.exceptions.HTTPError as e:
logging.error(e)
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')
with open(filename, 'w') as file:
file.write(res.text)

4 changes: 2 additions & 2 deletions iyp/crawlers/example/crawler.py
@@ -5,7 +5,7 @@

import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# Organization name and URL to data
ORG = 'Example Org'
@@ -24,7 +24,7 @@ def run(self):
req = requests.get(self.reference['reference_url'])
if req.status_code != 200:
logging.error('Cannot download data {req.status_code}: {req.text}')
sys.exit('Error while fetching data file')
raise RequestStatusError('Error while fetching data file')

# Process line one after the other
for i, line in enumerate(req.text.splitlines()):
4 changes: 2 additions & 2 deletions iyp/crawlers/ihr/country_dependency.py
@@ -11,7 +11,7 @@
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

# URL to the API
URL = 'https://ihr.iijlab.net/ihr/api/hegemony/countries/?country={country}&af=4'
@@ -46,7 +46,7 @@ def run(self):
self.url = URL.format(country=cc)
req = self.http_session.get(self.url + '&format=json')
if req.status_code != 200:
sys.exit('Error while fetching data for ' + cc)
raise RequestStatusError('Error while fetching data for ' + cc)
data = json.loads(req.text)
ranking = data['results']
