diff --git a/.gitignore b/.gitignore index 0c6ba6bc..cf165c84 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,5 @@ dmypy.json .pyre/ .history/ + +**/dumps \ No newline at end of file diff --git a/autodeploy-config.json b/autodeploy-config.json new file mode 100644 index 00000000..e210b16b --- /dev/null +++ b/autodeploy-config.json @@ -0,0 +1,12 @@ +{ + "archive_base_url": "https://ihr-archive.iijlab.net/ihr/iyp/", + "caddy_config_url": "http://sandbox.ihr.live:2019/config", + "caddy_post_url": "http://localhost:2019/load", + "caddy_template": "caddy.template.json", + "urls": { + "active_bolt": "ryan-bolt.ihr.live", + "active_http": "ryan.ihr.live", + "prev_bolt": "ryan-prev-bolt.ihr.live", + "prev_http": "ryan-prev.ihr.live" + } +} diff --git a/autodeploy/README.md b/autodeploy/README.md new file mode 100644 index 00000000..78dd23f7 --- /dev/null +++ b/autodeploy/README.md @@ -0,0 +1,54 @@ +# Autodeployment Script + +## Usage + +### Starting Caddy + +Make sure that Caddy is running. If not, run it with `docker compose up caddy`. If Caddy +was running previously, then the new Caddy instance will resume from the previous +config. See the [Caddy docs](https://caddyserver.com/docs/running#docker-compose) for +more info. + +### Running the script + +To run the script, run `python3 -m autodeploy.autodeploy [config file]`. This will first find the date +of the most recent active deployment using the Caddy config. If there is no active +deployment, today's date is used. With this date, the script will then check the IHR archive +to see if a dump has been pushed in the subsequent 7 days. If so, a Neo4j instance will +be deployed using that dump. For example, if the latest deployment is for 2024-06-15, +the script will check if there is a dump for 2024-06-16 to 2024-06-22. + +Alternatively, running `python3 -m autodeploy.autodeploy [config file] --date [year]-[month]-[day]` will +check if there is a dump in the archive for the specified date and deploy it directly. + +## How it works + +### Checking for a dump to deploy + +If the date is not provided when running the script, it will first make a request to +Caddy to get the current config. The config is parsed to retrieve the port of the active +database. The date is parsed from the port number as explained below. Starting from this +date, the next 7 days are then checked in the IHR archive for valid dumps. + +#### Caddy Config + +Caddy is updated by substituting the desired ports in the specified Caddy config +template. The ports are constructed with the following structure: 1MMDD for the Neo4j HTTP +port and 2MMDD for the Neo4j Bolt port (see the sketch below). The JSON is sent to Caddy by making a POST request +to sandbox.ihr.live:2019/load. The current config is retrieved by making a GET request +to sandbox.ihr.live:2019/config. + +### Starting the database + +Once a dump has been found, its log is downloaded from the archive. If the log indicates +that there are no errors, then the dump is downloaded. A Docker container is then +started that loads the dump into a Neo4j database. The database is stored in a Docker +volume with the name data-MM-DD. Another container is then used to start the database +using the data stored in data-MM-DD. It binds the internal Neo4j ports 7474 and 7687 to +the external ports that encode the dump's date. + +If a container is already running for this date, it and its data volume are deleted, and +a new one is created from the downloaded dump data. + +If there was already an active database, it becomes the previous database.
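As a concrete illustration of the port scheme and the 7-day search window described above, here is a minimal sketch (the helper names are ours, not part of `autodeploy/autodeploy.py`, which implements the same logic with `strftime` and `get_port_date`):

```python
from datetime import datetime, timedelta

def ports_for_date(date: datetime) -> tuple:
    # Neo4j HTTP port is 1MMDD, Bolt port is 2MMDD.
    return date.strftime('1%m%d'), date.strftime('2%m%d')

def date_from_port(port: str) -> tuple:
    # Recover (month, day) from a port of the form [1|2]MMDD.
    return int(port[-4:-2]), int(port[-2:])

# A deployment dated 2024-06-15 is served on HTTP port 10615 and Bolt port 20615.
http_port, bolt_port = ports_for_date(datetime(2024, 6, 15))
assert (http_port, bolt_port) == ('10615', '20615')
assert date_from_port(http_port) == (6, 15)

# Starting from that deployment, the script probes the following 7 days,
# i.e. 2024-06-16 through 2024-06-22, for a new dump in the archive.
candidates = [datetime(2024, 6, 15) + timedelta(days=i) for i in range(1, 8)]
assert candidates[0].day == 16 and candidates[-1].day == 22
```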
The current +previous database container is stopped, and its data volume is deleted. diff --git a/autodeploy/autodeploy.py b/autodeploy/autodeploy.py new file mode 100644 index 00000000..68d82c13 --- /dev/null +++ b/autodeploy/autodeploy.py @@ -0,0 +1,285 @@ +import argparse +import json +import logging +import os +import sys +import time +from datetime import datetime, timedelta, timezone + +import docker +import requests + +NEO4J_VERSION = '5.16.0' + +ARCHIVE_URL_SUFFIX = '%Y/%m/%d/iyp-%Y-%m-%d' +LOG_URL_SUFFIX = ARCHIVE_URL_SUFFIX + '.log' +DUMP_URL_SUFFIX = ARCHIVE_URL_SUFFIX + '.dump' + +DUMP_DOWNLOAD_DIR_SUFFIX = 'dumps/%Y/%m/%d' + +DOCKER_VOLUME_FMT = 'data-%m-%d' +DOCKER_CONTAINER_NAME_FMT = 'deploy-%m-%d' + + +def remove_deployment(client: docker.DockerClient, date: datetime): + """Checks if there is an active deployment for the given date (month-day). + + If there is, remove it and the corresponding volume storing its data. + """ + container_name = date.strftime(DOCKER_CONTAINER_NAME_FMT) + volume_name = date.strftime(DOCKER_VOLUME_FMT) + try: + container = client.containers.get(container_name) + logging.warning(f'Removing active deployment for {date.strftime("%m-%d")}') + container.stop() + # Wait a little bit after the container has been removed + # before deleting the volume + # TODO Is there a better way? + while True: + try: + client.volumes.get(volume_name).remove() + break + except BaseException: + time.sleep(1) + except docker.errors.NotFound: + logging.info(f'No existing deployment for {date.strftime("%Y-%m-%d")}. Starting deployment') + + +def get_ports_from_caddy_config(config: dict): + """Makes a request to caddy config and returns the ports currently being used (both + active and previous)""" + caddy_config_url = config['caddy_config_url'] + r = requests.get(caddy_config_url) + try: + r.raise_for_status() + except requests.HTTPError as e: + logging.error(f'Failed to retrieve Caddy config from {caddy_config_url}: {e}') + sys.exit(1) + try: + body = r.json() + except json.JSONDecodeError as e: + logging.error(f'Failed to parse Caddy config: {e}') + sys.exit(1) + + routes = body['apps']['http']['servers']['srv0']['routes'] + active = dict() + for route in routes: + # This happens with a fresh caddy build. No ports are active, so + # return an empty dict. + # TODO So there is a route dict, but without a 'match' entry? + if 'match' not in route: + return dict() + host = route['match'][0]['host'][0] + dial = route['handle'][0]['routes'][0]['handle'][0]['upstreams'][0]['dial'] + active[host] = dial + + # TODO We want to have both active_bolt and active_http URLs in the config, right? + # If only one is present, we have a problem. + # Also, if we reach this point there _must_ be at least a currently active + # deployment? + ports = dict() + active_bolt = config['urls']['active_bolt'] + active_http = config['urls']['active_http'] + if active_bolt in active: + ports['active_bolt'] = active[active_bolt].split(':')[1] + if active_http in active: + ports['active_http'] = active[active_http].split(':')[1] + + # It's possible for there to be only one active deployment, and there + # are no previous ports. 
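For reference, the nested indexing in `get_ports_from_caddy_config` above assumes a Caddy config shaped like `caddy.template.json`; a minimal sketch of the host-to-upstream extraction, with illustrative values, looks like this:

```python
# Minimal sketch of the structure get_ports_from_caddy_config() walks.
# The hostname and port below are illustrative values only.
caddy_config = {
    'apps': {'http': {'servers': {'srv0': {'routes': [
        {
            'match': [{'host': ['ryan.ihr.live']}],
            'handle': [{
                'handler': 'subroute',
                'routes': [{'handle': [{
                    'handler': 'reverse_proxy',
                    'upstreams': [{'dial': 'sandbox.ihr.live:10615'}],
                }]}],
            }],
        },
    ]}}}}
}

routes = caddy_config['apps']['http']['servers']['srv0']['routes']
active = {}
for route in routes:
    host = route['match'][0]['host'][0]
    dial = route['handle'][0]['routes'][0]['handle'][0]['upstreams'][0]['dial']
    active[host] = dial

# The port is whatever follows the colon; '10615' decodes to a June 15 deployment.
assert active == {'ryan.ihr.live': 'sandbox.ihr.live:10615'}
assert active['ryan.ihr.live'].split(':')[1] == '10615'
```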
Only attempt to parse the previous ports + # if they have been filled in on the caddy config + prev_bolt = config['urls']['prev_bolt'] + prev_http = config['urls']['prev_http'] + if prev_bolt in active and 'PREV_BOLT_PORT' not in active[prev_bolt]: + ports['prev_bolt'] = active[prev_bolt].split(':')[1] + if prev_http in active and 'PREV_HTTP_PORT' not in active[prev_http]: + ports['prev_http'] = active[prev_http].split(':')[1] + return ports + + +def check_log(config: dict, date: datetime): + """Makes a request to archive and checks if there is a valid dump for the specified + date.""" + logging.info(f'Downloading logs for {date.strftime("%Y-%m-%d")}') + log_url_fmt = os.path.join(config['archive_base_url'], LOG_URL_SUFFIX) + log_url = date.strftime(log_url_fmt) + r = requests.get(log_url) + try: + r.raise_for_status() + except requests.HTTPError as e: + # We expect the request to fail if the log does not exist (404), but not for + # other reasons. + if r.status_code != 404: + logging.error(f'Expected HTTP code 200 or 404, but got: {e}') + sys.exit(1) + return False + body = r.content + last_line = body.decode().split('\n')[-1] + if 'Errors:' in last_line: + logging.error(f'There were errors from create_db found in logs for {log_url}') + sys.exit(1) + return True + + +def get_port_date(port): + """Extracts the month and day from a port. + + Port should have format [1|2]MMDD. + + Returns the tuple (month, day) + """ + month = int(port[-4:-2]) + day = int(port[-2:]) + return month, day + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('config') + parser.add_argument('-d', '--date', help='deploy IYP dump for this date (YYYY-MM-DD)') + args = parser.parse_args() + + FORMAT = '%(asctime)s %(processName)s %(message)s' + logging.basicConfig( + format=FORMAT, + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + logging.info(f'Started: {sys.argv}') + + with open(args.config, 'r') as f: + try: + config: dict = json.load(f) + except json.JSONDecodeError as e: + logging.error(f'Invalid configuration specified: {e}') + sys.exit(1) + + root = os.path.dirname(os.path.realpath(__file__)) + + # If no date is provided when running the script, check if any dumps + # have been made within a week since the previous deployment. Otherwise, + # use the date provided in command line arg. 
+ if args.date: + try: + date = datetime.strptime(args.date, '%Y-%m-%d') + except ValueError as e: + logging.error(f'Invalid date specified: {e}') + sys.exit(1) + else: + ports = get_ports_from_caddy_config(config) + success = False + if 'active_http' in ports: + active_http = ports['active_http'] + month, day = get_port_date(active_http) + start_date = datetime.now(tz=timezone.utc).replace(month=month, day=day) + else: + start_date = datetime.now(tz=timezone.utc) + + # Download logs from ihr archive each day in the next week since + # the previous release + for i in range(1, 8): + date = start_date + timedelta(days=i) + if check_log(config, date): + success = True + break + else: + logging.warning(f'No archive entry found for {date.strftime("%Y-%m-%d")}.') + + if not success: + logging.error('Exiting because no active dates were found in archive.') + sys.exit(1) + + # Define ports and filenames that depend on the date + volume_name = date.strftime(DOCKER_VOLUME_FMT) + container_name = date.strftime(DOCKER_CONTAINER_NAME_FMT) + http_port = date.strftime('1%m%d') + bolt_port = date.strftime('2%m%d') + + client = docker.from_env() + + # Check if there is an existing deployment for this day and remove if so + remove_deployment(client, date) + + # Download dump from ihr archive + logging.info(f'Downloading dump for {date.strftime("%Y-%m-%d")}') + dump_dir = os.path.join(root, date.strftime(DUMP_DOWNLOAD_DIR_SUFFIX)) + os.makedirs(dump_dir, exist_ok=True) + dump_url_fmt = os.path.join(config['archive_base_url'], DUMP_URL_SUFFIX) + dump_url = date.strftime(dump_url_fmt) + r = requests.get(dump_url) + try: + r.raise_for_status() + except requests.HTTPError as e: + logging.error(f'Failed to fetch dump from {dump_url}: {e}') + sys.exit(1) + # TODO Is the + necessary? 
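Before the Caddy update further below, it may help to see the template substitution in isolation. This is a sketch only: it assumes the template's port placeholders are literal tokens such as `<ACTIVE_HTTP_PORT>` and `<PREV_HTTP_PORT>` (the `PREV_*` token names are taken from the check in `get_ports_from_caddy_config`; the `ACTIVE_*` names are an assumption).

```python
# Sketch only: the placeholder token names below are assumptions, not taken
# verbatim from caddy.template.json.
template = (
    '{"dial": "sandbox.ihr.live:<ACTIVE_HTTP_PORT>"} '
    '{"dial": "sandbox.ihr.live:<PREV_HTTP_PORT>"}'
)

new_http_port = '10622'      # port of the dump being deployed (1MMDD)
current_http_port = '10615'  # port of the deployment that becomes "previous"

# The new deployment takes the active slot, the old active one moves to prev.
rendered = template.replace('<ACTIVE_HTTP_PORT>', new_http_port)
rendered = rendered.replace('<PREV_HTTP_PORT>', current_http_port)

assert '10622' in rendered and '10615' in rendered
# The rendered JSON is then POSTed to Caddy's admin API, e.g.
# requests.post(config['caddy_post_url'], rendered,
#               headers={'Content-Type': 'application/json'}).
```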
+ with open(os.path.join(dump_dir, 'neo4j.dump'), 'wb+') as f: + f.write(r.content) + + # Load dump into volume + logging.info('Load dump into neo4j db') + client.containers.run( + 'neo4j/neo4j-admin:' + NEO4J_VERSION, + command='neo4j-admin database load neo4j --from-path=/dumps --verbose', + name='load', + tty=True, + stdin_open=True, + remove=True, + volumes={ + volume_name: {'bind': '/data', 'mode': 'rw'}, + dump_dir: {'bind': '/dumps', 'mode': 'rw'}, + } + ) + + # Run neo4j based on data in volume just created + logging.warning('Starting deployment container') + client.containers.run( + 'neo4j:' + NEO4J_VERSION, + name=container_name, + ports={ + 7474: int(http_port), + 7687: int(bolt_port) + }, + volumes={ + volume_name: {'bind': '/data', 'mode': 'rw'}, + }, + environment={ + 'NEO4J_AUTH': 'neo4j/password', + 'NEO4J_server_memory_heap_initial__size': '16G', + 'NEO4J_server_memory_heap_max__size': '16G', + }, + detach=True, + remove=True + ) + + # Get currently active config + ports = get_ports_from_caddy_config(config) + + # Only delete current prev if it exists + if 'prev_http' in ports: + prev_month, prev_day = get_port_date(ports['prev_http']) + # It's possible that you're trying to redeploy the current prev + # If this condition isn't here, then the new deployment will be deleted + # since it has the same date as prev + if prev_month != date.month or prev_day != date.day: + remove_deployment(client, date.replace(month=prev_month, day=prev_day)) + + with open(config['caddy_template'], 'r') as f: + caddy_template = f.read() + + caddy_template = caddy_template.replace('', bolt_port) + caddy_template = caddy_template.replace('', http_port) + + # If there are no active ports (for example, on the first run after a fresh + # caddy build), don't try to set prev ports + if 'active_http' in ports: + caddy_template = caddy_template.replace('', ports['active_bolt']) + caddy_template = caddy_template.replace('', ports['active_http']) + + # Update config + requests.post(config['caddy_post_url'], caddy_template, headers={'Content-Type': 'application/json'}) + + +if __name__ == '__main__': + main() + sys.exit(0) diff --git a/caddy.template.json b/caddy.template.json new file mode 100644 index 00000000..d86c1a41 --- /dev/null +++ b/caddy.template.json @@ -0,0 +1,109 @@ +{ + "apps": { + "http": { + "servers": { + "srv0": { + "listen": [":443"], + "routes": [ + { + "match": [{ "host": ["ryan-bolt.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["ryan-prev-bolt.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["sandbox.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "ryan.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["ryan.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": [ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + }, + { + "match": [{ "host": ["ryan-prev.ihr.live"] }], + "handle": [ + { + "handler": "subroute", + "routes": [ + { + "handle": 
[ + { + "handler": "reverse_proxy", + "upstreams": [{ "dial": "sandbox.ihr.live:" }] + } + ] + } + ] + } + ], + "terminal": true + } + ] + } + } + } + } + } + \ No newline at end of file diff --git a/config.json.example b/config.json.example index ee12e2c2..ef1e158c 100644 --- a/config.json.example +++ b/config.json.example @@ -1,4 +1,10 @@ { + "archive": { + "host": "", + "user": "", + "base_path": "" + }, + "cache": { "directory": "tmp/", "duration_in_days": 6 diff --git a/create_db.py b/create_db.py index e754e788..6b624035 100644 --- a/create_db.py +++ b/create_db.py @@ -1,208 +1,228 @@ +import argparse import importlib import json import logging import os -# import shutil import sys +from datetime import datetime, timezone from time import sleep -import arrow import docker +import paramiko +from scp import SCPClient from send_email import send_email -NEO4J_VERSION = '5.21.2' - -today = arrow.utcnow() -date = f'{today.year}-{today.month:02d}-{today.day:02d}' - -# Use the current directory as root. -root = os.path.dirname(os.path.realpath(__file__)) -# Alternatively, specify your own path. -# root = '' -if not root: - sys.exit('Please configure a root path.') - -tmp_dir = os.path.join(root, 'neo4j/tmp', date, '') -dump_dir = os.path.join(root, 'dumps', f'{today.year}/{today.month:02d}/{today.day:02d}', '') - -os.makedirs(tmp_dir, exist_ok=True) -os.makedirs(dump_dir, exist_ok=True) - -# Initialize logging -scriptname = sys.argv[0].replace('/', '_')[0:-3] -FORMAT = '%(asctime)s %(processName)s %(message)s' -logging.basicConfig( - format=FORMAT, - filename=f'{dump_dir}iyp-{date}.log', - level=logging.WARNING, - datefmt='%Y-%m-%d %H:%M:%S' -) -logging.warning('Started: %s' % sys.argv) - -# Load configuration file -with open('config.json', 'r') as fp: - conf = json.load(fp) - -# Start a new neo4j container -client = docker.from_env() - -# ######### Start a new docker image ########## - -logging.warning('Starting new container...') -container = client.containers.run( - 'neo4j:' + NEO4J_VERSION, - name=f'iyp-{date}', - ports={ - 7474: 7474, - 7687: 7687 - }, - volumes={ - tmp_dir: {'bind': '/data', 'mode': 'rw'}, - }, - environment={ - 'NEO4J_AUTH': 'neo4j/password', - 'NEO4J_server_memory_heap_initial__size': '16G', - 'NEO4J_server_memory_heap_max__size': '16G', - }, - remove=True, - detach=True -) - -# Wait for the container to be ready -timeout = 120 -stop_time = 1 -elapsed_time = 0 -container_ready = False - -while elapsed_time < timeout: - sleep(stop_time) - elapsed_time += stop_time - # Not the most premium solution, but the alternative is using - # stream=True, which creates a blocking generator that we have - # to somehow interrupt in case the database does not start - # correctly. And writing a signal handler just for this seems - # overkill. - last_msg = container.logs(stderr=False, tail=1) - if last_msg.endswith(b'Started.\n'): - logging.warning('Container ready.') - container_ready = True - break - -if not container_ready: - logging.error('Timed our while waiting for container to start.') - try: - container_logs = container.logs().decode('utf-8') - except Exception as e: - logging.error(f'Can not get logs from container: {e}') +NEO4J_VERSION = '5.16.0' + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-a', '--archive', action='store_true', help='push dump to archive server') + args = parser.parse_args() + + today = datetime.now(tz=timezone.utc) + date = today.strftime('%Y-%m-%d') + + # Use the current directory as root. 
+ root = os.path.dirname(os.path.realpath(__file__)) + # Alternatively, specify your own path. + # root = '' + if not root: + sys.exit('Please configure a root path.') + + dump_dir = os.path.join(root, 'dumps', today.strftime('%Y/%m/%d')) + + os.makedirs(dump_dir, exist_ok=True) + + # Initialize logging + FORMAT = '%(asctime)s %(processName)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename=os.path.join(dump_dir, f'iyp-{date}.log'), + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + logging.info(f'Started: {sys.argv}') + + # Load configuration file + with open('config.json', 'r') as fp: + conf = json.load(fp) + + # Neo4j container settings + neo4j_volume = f'data-{date}' + + # Start a new neo4j container + client = docker.from_env() + + # ######### Start a new docker image ########## + + logging.warning('Starting new container...') + container = client.containers.run( + 'neo4j:' + NEO4J_VERSION, + name=f'iyp-{date}', + ports={ + 7474: 7474, + 7687: 7687 + }, + volumes={ + neo4j_volume: {'bind': '/data', 'mode': 'rw'}, + }, + environment={ + 'NEO4J_AUTH': 'neo4j/password', + 'NEO4J_server_memory_heap_initial__size': '16G', + 'NEO4J_server_memory_heap_max__size': '16G', + }, + remove=True, + detach=True + ) + + # Wait for the container to be ready + timeout = 120 + stop_time = 1 + elapsed_time = 0 + container_ready = False + + while elapsed_time < timeout: + sleep(stop_time) + elapsed_time += stop_time + # Not the most premium solution, but the alternative is using + # stream=True, which creates a blocking generator that we have + # to somehow interrupt in case the database does not start + # correctly. And writing a signal handler just for this seems + # overkill. + last_msg = container.logs(stderr=False, tail=1) + if last_msg.endswith(b'Started.\n'): + logging.info('Container ready.') + container_ready = True + break + + if not container_ready: + logging.error('Timed out while waiting for container to start.') + try: + container_logs = container.logs().decode('utf-8') + except Exception as e: + logging.error(f'Cannot get logs from container: {e}') + sys.exit('Problem while starting the container.') + logging.error(f'Container logs:\n{container_logs}') + logging.error('Trying to stop container...') + container.stop() sys.exit('Problem while starting the container.') -# ######### Fetch data and feed to neo4j ########## - - -class RelationCountError(Exception): - def __init__(self, message): - self.message = message - super().__init__(self.message) - - -logging.warning('Fetching data...') -status = {} -no_error = True -for module_name in conf['iyp']['crawlers']: - - try: + # ########## Fetch data and feed to neo4j ########## + + class RelationCountError(Exception): + def __init__(self, message): + self.message = message + super().__init__(self.message) + + logging.info('Fetching data...') + status = {} + no_error = True + for module_name in conf['iyp']['crawlers']: + try: + module = importlib.import_module(module_name) + logging.info(f'start {module}') + name = module_name.replace('iyp.crawlers.', '') + crawler = module.Crawler(module.ORG, module.URL, name) + crawler.run() + passed = crawler.unit_test() + crawler.close() + if not passed: + error_message = f'Did not receive data from crawler {name}' + raise RelationCountError(error_message) + status[module_name] = 'OK' + logging.info(f'end {module}') +
except RelationCountError as relation_count_error: + no_error = False + logging.error(relation_count_error) + status[module_name] = relation_count_error + send_email(relation_count_error) + except Exception as e: + no_error = False + logging.error('crawler crashed!') + status[module_name] = e + send_email(e) + + # ######### Post processing scripts ########## + + logging.info('Post-processing...') + for module_name in conf['iyp']['post']: module = importlib.import_module(module_name) - logging.warning(f'start {module}') - name = module_name.replace('iyp.crawlers.', '') - crawler = module.Crawler(module.ORG, module.URL, name) - relations_count = crawler.count_relations() - crawler.run() - relations_count_new = crawler.count_relations() - crawler.close() - if not relations_count_new > relations_count: - error_message = ( - f'Unexpected relation count change in the crawler "{name}": ' - f'Expected new relations ({relations_count_new}) ' - f'to be greater than the previous relations ({relations_count}).' - ) - raise RelationCountError(error_message) - status[module_name] = 'OK' - logging.warning(f'end {module}') - except RelationCountError as relation_count_error: - no_error = False - logging.error(relation_count_error) - status[module_name] = relation_count_error - send_email(relation_count_error) - except Exception as e: - no_error = False - logging.exception('crawler crashed!!') - status[module_name] = e - send_email(e) - - -# ######### Post processing scripts ########## - -logging.warning('Post-processing...') -for module_name in conf['iyp']['post']: - module = importlib.import_module(module_name) - - try: - logging.warning(f'start {module}') - post = module.PostProcess() - post.run() - post.close() - status[module_name] = 'OK' - logging.warning(f'end {module}') - - except Exception as e: - no_error = False - logging.error('crawler crashed!!\n') - logging.error(e) - logging.error('\n') - status[module_name] = e - - -# ######### Stop container and dump DB ########## - -logging.warning('Stopping container...') -container.stop(timeout=180) - -logging.warning('Dumping database...') -if os.path.exists(f'{dump_dir}/neo4j.dump'): - os.remove(f'{dump_dir}/neo4j.dump') - -# make sure the directory is writable for any user -os.chmod(dump_dir, 0o777) - -container = client.containers.run( - 'neo4j/neo4j-admin:' + NEO4J_VERSION, - command='neo4j-admin database dump neo4j --to-path=/dumps --verbose', - tty=True, - stdin_open=True, - remove=True, - volumes={ - tmp_dir: {'bind': '/data', 'mode': 'rw'}, - dump_dir: {'bind': '/dumps', 'mode': 'rw'}, - } -) - -# rename dump -os.rename(f'{dump_dir}/neo4j.dump', f'{dump_dir}/iyp-{date}.dump') - -final_words = '' -if not no_error: - # TODO send an email - final_words += 'There was errors!' 
- logging.error('there was errors!\n') - logging.error({k: error for k, error in status.items() if error != 'OK'}) -else: - final_words = 'No error :)' -# Delete tmp file in cron job -# shutil.rmtree(tmp_dir) - -logging.warning(f'Finished: {sys.argv} {final_words}') + + try: + logging.info(f'start {module}') + post = module.PostProcess() + post.run() + post.close() + status[module_name] = 'OK' + logging.info(f'end {module}') + + except Exception as e: + no_error = False + logging.error('crawler crashed!') + logging.error(e) + status[module_name] = e + + # ######### Stop container and dump DB ########## + + logging.info('Stopping container...') + container.stop(timeout=180) + + logging.info('Dumping database...') + dump_file = os.path.join(dump_dir, 'neo4j.dump') + if os.path.exists(dump_file): + os.remove(dump_file) + + # make sure the directory is writable for any user + os.chmod(dump_dir, 0o777) + + container = client.containers.run( + 'neo4j/neo4j-admin:' + NEO4J_VERSION, + command='neo4j-admin database dump neo4j --to-path=/dumps --verbose', + tty=True, + stdin_open=True, + remove=True, + volumes={ + neo4j_volume: {'bind': '/data', 'mode': 'rw'}, + dump_dir: {'bind': '/dumps', 'mode': 'rw'}, + } + ) + + # Delete the data volume once the dump has been created + client.volumes.get(neo4j_volume).remove() + + # rename dump + + os.rename(dump_file, os.path.join(dump_dir, f'iyp-{date}.dump')) + + if not no_error: + # TODO send an email + + # Add the log line to indicate to autodeploy that there were errors + final_words = f'\nErrors: {" ".join(k for k, v in status.items() if v != "OK")}' + logging.error('There were errors!') + else: + final_words = 'No error :)' + # Delete tmp file in cron job + # shutil.rmtree(tmp_dir) + + logging.info(f'Finished: {sys.argv} {final_words}') + + if args.archive: + # Push the dump and log to ihr archive + ssh = paramiko.SSHClient() + # Do not show info logging for paramiko.
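The `Errors:` line appended to the final log message above is what `check_log` in `autodeploy/autodeploy.py` looks for when it decides whether a dump is deployable; a minimal sketch of that contract, with illustrative log content:

```python
# Sketch of the log contract between create_db.py and autodeploy's check_log():
# when a crawler failed, the log ends with 'Finished: ...' followed by a line
# starting with 'Errors:', so the last line of the file flags the failure.
def dump_is_deployable(log_text: str) -> bool:
    last_line = log_text.split('\n')[-1]
    return 'Errors:' not in last_line

clean_log = '2024-06-16 01:00:00 MainProcess Finished: create_db.py No error :)'
failed_log = ('2024-06-16 01:00:00 MainProcess Finished: create_db.py \n'
              'Errors: iyp.crawlers.example.crawler')

assert dump_is_deployable(clean_log)
assert not dump_is_deployable(failed_log)
```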
+ logging.getLogger('paramiko').setLevel(logging.WARNING) + ssh.load_system_host_keys() + ssh.connect(conf['archive']['host'], username=conf['archive']['user']) + + dest = os.path.join(conf['archive']['base_path'], today.strftime('%Y/%m/%d')) + ssh.exec_command(f'mkdir -p {dest}') + + with SCPClient(ssh.get_transport()) as scp: + scp.put(dump_dir, recursive=True, remote_path=dest) + + +if __name__ == '__main__': + main() diff --git a/docker-compose.yaml b/docker-compose.yaml index de510b00..fd985a59 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,3 +1,6 @@ +volumes: + caddy_data: + caddy_config: services: iyp_loader: image: neo4j/neo4j-admin:5.21.2 @@ -63,3 +66,21 @@ services: depends_on: iyp_loader: condition: service_completed_successfully + + caddy: + image: caddy:latest + container_name: caddy + restart: unless-stopped + ports: + - "80:80" + - "443:443" + - "443:443/udp" + - "2019:2019" + environment: + - CADDY_ADMIN=0.0.0.0:2019 + volumes: + - ./site:/srv + - caddy_data:/data + - caddy_config:/config + command: /usr/bin/caddy run --resume + \ No newline at end of file diff --git a/iyp/__init__.py b/iyp/__init__.py index 47ead72c..81b2d2b0 100644 --- a/iyp/__init__.py +++ b/iyp/__init__.py @@ -476,7 +476,7 @@ def batch_add_node_label(self, node_ids, label): label: label string or list of label strings """ label_str = str(label) - if type(label) is list: + if isinstance(label, list): label_str = ':'.join(label) for i in range(0, len(node_ids), BATCH_SIZE): @@ -709,15 +709,23 @@ def count_relations(self): return result['count'] - def unit_test(self, logging): - relation_count = self.count_relations() - logging.info('Relations before starting: %s' % relation_count) - self.run() - relation_count_new = self.count_relations() - logging.info('Relations after starting: %s' % relation_count_new) - self.close() - print('assertion failed') if relation_count_new <= relation_count else print('assertion passed') - assert relation_count_new > relation_count + def unit_test(self, relation_types): + """Check for existence of relationships created by this crawler. + + relation_types should be a list of types for which existence is checked. 
+ """ + logging.info(f'Running existence test for {relation_types}') + passed = True + for relation_type in relation_types: + existenceQuery = f"""MATCH ()-[r:{relation_type}]-() + USING INDEX r:{relation_type}(reference_name) + WHERE r.reference_name = '{self.reference['reference_name']}' + RETURN 0 LIMIT 1""" + result = self.iyp.tx.run(existenceQuery) + if len(list(result)) == 0: + passed = False + logging.error(f'Missing data for relation {relation_type}') + return passed def close(self): # Commit changes to IYP diff --git a/iyp/crawlers/alice_lg/__init__.py b/iyp/crawlers/alice_lg/__init__.py index d7bbdfe1..878bdf76 100644 --- a/iyp/crawlers/alice_lg/__init__.py +++ b/iyp/crawlers/alice_lg/__init__.py @@ -442,3 +442,6 @@ def run(self) -> None: if originate_rels: logging.info(f'Pushing {len(originate_rels)} ORIGINATE relationships.') self.iyp.batch_add_links('ORIGINATE', originate_rels) + + def unit_test(self): + return super().unit_test(['MEMBER_OF', 'ORIGINATE', 'MANAGED_BY']) diff --git a/iyp/crawlers/alice_lg/amsix.py b/iyp/crawlers/alice_lg/amsix.py index 73286459..6e74b623 100644 --- a/iyp/crawlers/alice_lg/amsix.py +++ b/iyp/crawlers/alice_lg/amsix.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/bcix.py b/iyp/crawlers/alice_lg/bcix.py index 66bc9932..ed1a6f7d 100644 --- a/iyp/crawlers/alice_lg/bcix.py +++ b/iyp/crawlers/alice_lg/bcix.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/decix.py b/iyp/crawlers/alice_lg/decix.py index 8a3aa25f..d3c8c35b 100644 --- a/iyp/crawlers/alice_lg/decix.py +++ b/iyp/crawlers/alice_lg/decix.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/ixbr.py b/iyp/crawlers/alice_lg/ixbr.py index 548701a2..691e6e4e 100644 --- a/iyp/crawlers/alice_lg/ixbr.py +++ b/iyp/crawlers/alice_lg/ixbr.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/linx.py b/iyp/crawlers/alice_lg/linx.py index ef67bd24..223635c0 100644 --- a/iyp/crawlers/alice_lg/linx.py +++ b/iyp/crawlers/alice_lg/linx.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/megaport.py b/iyp/crawlers/alice_lg/megaport.py index 8080f456..5156a214 100644 --- a/iyp/crawlers/alice_lg/megaport.py +++ b/iyp/crawlers/alice_lg/megaport.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/alice_lg/netnod.py b/iyp/crawlers/alice_lg/netnod.py index 8ba89f4c..5f711a46 100644 --- a/iyp/crawlers/alice_lg/netnod.py +++ b/iyp/crawlers/alice_lg/netnod.py @@ -28,7 +28,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git 
a/iyp/crawlers/apnic/eyeball.py b/iyp/crawlers/apnic/eyeball.py index bee4fd46..80544023 100644 --- a/iyp/crawlers/apnic/eyeball.py +++ b/iyp/crawlers/apnic/eyeball.py @@ -84,6 +84,9 @@ def run(self): self.iyp.batch_add_links('RANK', rank_links) self.iyp.batch_add_links('POPULATION', pop_links) + def unit_test(self): + return super().unit_test(['POPULATION', 'COUNTRY', 'RANK', 'NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -103,7 +106,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/__init__.py b/iyp/crawlers/bgpkit/__init__.py index e5d8e3bf..1c54c892 100644 --- a/iyp/crawlers/bgpkit/__init__.py +++ b/iyp/crawlers/bgpkit/__init__.py @@ -47,3 +47,6 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('PEERS_WITH', links) + + def unit_test(self): + return super().unit_test(['PEERS_WITH']) diff --git a/iyp/crawlers/bgpkit/as2rel_v4.py b/iyp/crawlers/bgpkit/as2rel_v4.py index 0a1c1bc6..2278dd07 100644 --- a/iyp/crawlers/bgpkit/as2rel_v4.py +++ b/iyp/crawlers/bgpkit/as2rel_v4.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/as2rel_v6.py b/iyp/crawlers/bgpkit/as2rel_v6.py index e6dafacc..e7807c76 100644 --- a/iyp/crawlers/bgpkit/as2rel_v6.py +++ b/iyp/crawlers/bgpkit/as2rel_v6.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/peerstats.py b/iyp/crawlers/bgpkit/peerstats.py index 58cf8cbe..77241862 100644 --- a/iyp/crawlers/bgpkit/peerstats.py +++ b/iyp/crawlers/bgpkit/peerstats.py @@ -90,6 +90,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('PEERS_WITH', links) + def unit_test(self): + return super().unit_test(['PEERS_WITH']) + def main() -> None: parser = argparse.ArgumentParser() @@ -109,7 +112,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgpkit/pfx2asn.py b/iyp/crawlers/bgpkit/pfx2asn.py index fe3cb560..d83e8dbc 100644 --- a/iyp/crawlers/bgpkit/pfx2asn.py +++ b/iyp/crawlers/bgpkit/pfx2asn.py @@ -62,6 +62,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('ORIGINATE', links) + def unit_test(self): + return super().unit_test(['ORIGINATE']) + def main() -> None: parser = argparse.ArgumentParser() @@ -81,7 +84,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgptools/anycast_prefixes.py b/iyp/crawlers/bgptools/anycast_prefixes.py index c46c5da4..824e5c45 100644 --- a/iyp/crawlers/bgptools/anycast_prefixes.py +++ b/iyp/crawlers/bgptools/anycast_prefixes.py @@ -102,6 +102,9 @@ def update(self, res, filename: str): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -121,7 +124,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() 
crawler.close() diff --git a/iyp/crawlers/bgptools/as_names.py b/iyp/crawlers/bgptools/as_names.py index 7db4c99b..bbfa9573 100644 --- a/iyp/crawlers/bgptools/as_names.py +++ b/iyp/crawlers/bgptools/as_names.py @@ -85,6 +85,9 @@ def run(self): self.iyp.batch_add_links('NAME', name_links) self.iyp.batch_add_links('CATEGORIZED', tag_links) + def unit_test(self): + return super().unit_test(['NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -104,7 +107,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/bgptools/tags.py b/iyp/crawlers/bgptools/tags.py index 3a20fa1d..155f6446 100644 --- a/iyp/crawlers/bgptools/tags.py +++ b/iyp/crawlers/bgptools/tags.py @@ -83,6 +83,9 @@ def run(self): print('Error for: ', line) print(error) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -102,7 +105,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/caida/asrank.py b/iyp/crawlers/caida/asrank.py index 3cc8d9c8..51e8e45d 100644 --- a/iyp/crawlers/caida/asrank.py +++ b/iyp/crawlers/caida/asrank.py @@ -100,6 +100,9 @@ def run(self): self.iyp.batch_add_links('COUNTRY', country_links) self.iyp.batch_add_links('RANK', rank_links) + def unit_test(self): + return super().unit_test(['NAME', 'COUNTRY', 'RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -119,7 +122,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/caida/ix_asns.py b/iyp/crawlers/caida/ix_asns.py index 6fbb86e7..8e0f7f9c 100644 --- a/iyp/crawlers/caida/ix_asns.py +++ b/iyp/crawlers/caida/ix_asns.py @@ -90,6 +90,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('MEMBER_OF', member_links) + def unit_test(self): + return super().unit_test(['MEMBER_OF']) + def main() -> None: parser = argparse.ArgumentParser() @@ -109,7 +112,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/caida/ixs.py b/iyp/crawlers/caida/ixs.py index 2301c092..e9b4542b 100644 --- a/iyp/crawlers/caida/ixs.py +++ b/iyp/crawlers/caida/ixs.py @@ -200,6 +200,9 @@ def run(self): self.iyp.batch_add_links('WEBSITE', website_links) self.iyp.batch_add_links('MANAGED_BY', prefix_links) + def unit_test(self): + return super().unit_test(['EXTERNAL_ID', 'NAME', 'COUNTRY', 'WEBSITE', 'MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -219,7 +222,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cisco/umbrella_top1m.py b/iyp/crawlers/cisco/umbrella_top1m.py index 23bd3e25..9ed18fba 100644 --- a/iyp/crawlers/cisco/umbrella_top1m.py +++ b/iyp/crawlers/cisco/umbrella_top1m.py @@ -123,6 +123,9 @@ def run(self): logging.info(f'Pushing {len(processed_links)} RANK relationships...') self.iyp.batch_add_links('RANK', processed_links) + def unit_test(self): + return super().unit_test(['RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -142,7 +145,7 @@ 
def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/citizenlab/urldb.py b/iyp/crawlers/citizenlab/urldb.py index b56718a1..68935706 100644 --- a/iyp/crawlers/citizenlab/urldb.py +++ b/iyp/crawlers/citizenlab/urldb.py @@ -70,6 +70,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -89,7 +92,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/dns_top_ases.py b/iyp/crawlers/cloudflare/dns_top_ases.py index 8c15ac04..fe6b6e11 100644 --- a/iyp/crawlers/cloudflare/dns_top_ases.py +++ b/iyp/crawlers/cloudflare/dns_top_ases.py @@ -47,6 +47,10 @@ def compute_link(self, param): 'props': [flat_prop, self.reference] }) + # already defined in imported Crawler + # def unit_test(self): + # pass + def main() -> None: parser = argparse.ArgumentParser() @@ -66,7 +70,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/dns_top_locations.py b/iyp/crawlers/cloudflare/dns_top_locations.py index 8f40586c..37eb0f42 100644 --- a/iyp/crawlers/cloudflare/dns_top_locations.py +++ b/iyp/crawlers/cloudflare/dns_top_locations.py @@ -168,6 +168,9 @@ def compute_link(self, param): 'props': [flat_prop, self.reference] }) + def unit_test(self): + return super().unit_test(['QUERIED_FROM']) + def main() -> None: parser = argparse.ArgumentParser() @@ -187,7 +190,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/ranking_bucket.py b/iyp/crawlers/cloudflare/ranking_bucket.py index 6d9a02e5..c6b64c72 100644 --- a/iyp/crawlers/cloudflare/ranking_bucket.py +++ b/iyp/crawlers/cloudflare/ranking_bucket.py @@ -122,6 +122,9 @@ def run(self): print(f'Adding {len(domain_links)} RANK relationships', file=sys.stderr) self.iyp.batch_add_links('RANK', domain_links) + def unit_test(self): + return super().unit_test(['RANK']) + # Main program if __name__ == '__main__': @@ -141,7 +144,7 @@ def run(self): crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/cloudflare/top100.py b/iyp/crawlers/cloudflare/top100.py index 2a6b63de..c44fbe24 100644 --- a/iyp/crawlers/cloudflare/top100.py +++ b/iyp/crawlers/cloudflare/top100.py @@ -71,6 +71,9 @@ def update(self, entry): domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']}) self.iyp.add_links(domain_qid, statements) + def unit_test(self): + return super().unit_test(['RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -90,7 +93,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/emileaben/as_names.py b/iyp/crawlers/emileaben/as_names.py index 66d4b108..197253ec 100644 --- a/iyp/crawlers/emileaben/as_names.py +++ b/iyp/crawlers/emileaben/as_names.py @@ -66,6 +66,9 @@ def run(self): # 
Push all links to IYP self.iyp.batch_add_links('NAME', links) + def unit_test(self): + return super().unit_test(['NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -85,7 +88,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/example/crawler.py b/iyp/crawlers/example/crawler.py index 63c94ed0..e2b4b74c 100644 --- a/iyp/crawlers/example/crawler.py +++ b/iyp/crawlers/example/crawler.py @@ -74,7 +74,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/iana/root_zone.py b/iyp/crawlers/iana/root_zone.py index 2ec8c1bd..39d6224f 100644 --- a/iyp/crawlers/iana/root_zone.py +++ b/iyp/crawlers/iana/root_zone.py @@ -107,6 +107,9 @@ def run(self): logging.info(f'Pushing {len(managed_by)} MANAGED_BY relationships.') self.iyp.batch_add_links('MANAGED_BY', managed_by_relationships) + def unit_test(self): + return super().unit_test(['RESOLVES_TO', 'MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -126,7 +129,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/__init__.py b/iyp/crawlers/ihr/__init__.py index 9ff85173..67f08631 100644 --- a/iyp/crawlers/ihr/__init__.py +++ b/iyp/crawlers/ihr/__init__.py @@ -101,3 +101,6 @@ def run(self): # Remove downloaded file os.remove(local_filename) + + def unit_test(self): + return super().unit_test(['DEPENDS_ON']) diff --git a/iyp/crawlers/ihr/country_dependency.py b/iyp/crawlers/ihr/country_dependency.py index 1a8d4afb..81936b42 100644 --- a/iyp/crawlers/ihr/country_dependency.py +++ b/iyp/crawlers/ihr/country_dependency.py @@ -114,6 +114,9 @@ def run(self): # Push links to IYP self.iyp.batch_add_links('RANK', links) + def unit_test(self): + return super().unit_test(['RANK', 'COUNTRY']) + # Main program def main() -> None: @@ -134,7 +137,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/local_hegemony_v4.py b/iyp/crawlers/ihr/local_hegemony_v4.py index b0cc3b0f..a61bd8e8 100644 --- a/iyp/crawlers/ihr/local_hegemony_v4.py +++ b/iyp/crawlers/ihr/local_hegemony_v4.py @@ -36,7 +36,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/local_hegemony_v6.py b/iyp/crawlers/ihr/local_hegemony_v6.py index 6599a5de..7c243419 100644 --- a/iyp/crawlers/ihr/local_hegemony_v6.py +++ b/iyp/crawlers/ihr/local_hegemony_v6.py @@ -36,7 +36,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ihr/rov.py b/iyp/crawlers/ihr/rov.py index 200493a1..3e9dfd7c 100644 --- a/iyp/crawlers/ihr/rov.py +++ b/iyp/crawlers/ihr/rov.py @@ -176,6 +176,9 @@ def run(self): # Remove downloaded file os.remove(local_filename) + def unit_test(self): + return super().unit_test(['ORIGINATE', 'CATEGORIZED', 'DEPENDS_ON', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -195,7 +198,7 @@ def main() -> None: crawler = 
Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/inetintel/as_org.py b/iyp/crawlers/inetintel/as_org.py index 71676d9e..a5720e3c 100644 --- a/iyp/crawlers/inetintel/as_org.py +++ b/iyp/crawlers/inetintel/as_org.py @@ -185,6 +185,9 @@ def close(self): os.remove(self.filename) os.rmdir(self.tmpdir) + def unit_test(self): + return super().unit_test(['SIBLING_OF']) + def main() -> None: parser = argparse.ArgumentParser() @@ -204,7 +207,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/manrs/members.py b/iyp/crawlers/manrs/members.py index 22384bf8..fa671407 100644 --- a/iyp/crawlers/manrs/members.py +++ b/iyp/crawlers/manrs/members.py @@ -129,6 +129,9 @@ def run(self): self.iyp.batch_add_links('COUNTRY', country_rels) self.iyp.batch_add_links('IMPLEMENT', implement_rels) + def unit_test(self): + return super().unit_test(['MEMBER_OF', 'IMPLEMENT', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -148,7 +151,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/nro/delegated_stats.py b/iyp/crawlers/nro/delegated_stats.py index 79ec9318..ce074238 100644 --- a/iyp/crawlers/nro/delegated_stats.py +++ b/iyp/crawlers/nro/delegated_stats.py @@ -197,6 +197,9 @@ def run(self): for label, links in prefix_status_links.items(): self.iyp.batch_add_links(label, links) + def unit_test(self): + return super().unit_test(['AVAILABLE', 'ASSIGNED', 'RESERVED', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -216,7 +219,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/__init__.py b/iyp/crawlers/openintel/__init__.py index b3f3cd5f..dc02acb7 100644 --- a/iyp/crawlers/openintel/__init__.py +++ b/iyp/crawlers/openintel/__init__.py @@ -239,6 +239,10 @@ def run(self): self.iyp.batch_add_links('MANAGED_BY', mng_links) self.iyp.batch_add_links('PART_OF', partof_links) + def unit_test(self): + # use different version depending on infra_ns vs others + return super().unit_test(['RESOLVES_TO', 'MANAGED_BY', 'PART_OF']) + class DnsgraphCrawler(BaseCrawler): @@ -382,3 +386,6 @@ def run(self): ns_id = [link['dst_id'] for link in links_managed_by] logging.info(f'Adding AuthoritativeNameServer label to {len(ns_id)} nodes') self.iyp.batch_add_node_label(ns_id, 'AuthoritativeNameServer') + + def unit_test(self): + return super().unit_test(['PARENT', 'PART_OF', 'ALIAS_OF', 'MANAGED_BY', 'RESOLVES_TO']) diff --git a/iyp/crawlers/openintel/dnsgraph_jp.py b/iyp/crawlers/openintel/dnsgraph_jp.py index c8457d1f..30dd210e 100644 --- a/iyp/crawlers/openintel/dnsgraph_jp.py +++ b/iyp/crawlers/openintel/dnsgraph_jp.py @@ -33,7 +33,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/dnsgraph_nl.py b/iyp/crawlers/openintel/dnsgraph_nl.py index 7977ea84..7b29c3bb 100644 --- a/iyp/crawlers/openintel/dnsgraph_nl.py +++ b/iyp/crawlers/openintel/dnsgraph_nl.py @@ -33,7 +33,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) 
if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/dnsgraph_rdns.py b/iyp/crawlers/openintel/dnsgraph_rdns.py index 30210e5c..a9730b4e 100644 --- a/iyp/crawlers/openintel/dnsgraph_rdns.py +++ b/iyp/crawlers/openintel/dnsgraph_rdns.py @@ -33,7 +33,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/infra_mx.py b/iyp/crawlers/openintel/infra_mx.py index 3841c77b..4c33f94d 100644 --- a/iyp/crawlers/openintel/infra_mx.py +++ b/iyp/crawlers/openintel/infra_mx.py @@ -42,7 +42,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/infra_ns.py b/iyp/crawlers/openintel/infra_ns.py index a4395c64..91dfb8da 100644 --- a/iyp/crawlers/openintel/infra_ns.py +++ b/iyp/crawlers/openintel/infra_ns.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/tranco1m.py b/iyp/crawlers/openintel/tranco1m.py index c9dd706c..d459d63c 100644 --- a/iyp/crawlers/openintel/tranco1m.py +++ b/iyp/crawlers/openintel/tranco1m.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/openintel/umbrella1m.py b/iyp/crawlers/openintel/umbrella1m.py index 2b7b606d..9b3ccde1 100644 --- a/iyp/crawlers/openintel/umbrella1m.py +++ b/iyp/crawlers/openintel/umbrella1m.py @@ -35,7 +35,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/pch/__init__.py b/iyp/crawlers/pch/__init__.py index 071fd104..db023a43 100644 --- a/iyp/crawlers/pch/__init__.py +++ b/iyp/crawlers/pch/__init__.py @@ -326,3 +326,6 @@ def run(self) -> None: # Clear cache. 
self.cache_handler.clear_cache() + + def unit_test(self): + return super().unit_test(['ORIGINATE']) diff --git a/iyp/crawlers/pch/daily_routing_snapshots_v4.py b/iyp/crawlers/pch/daily_routing_snapshots_v4.py index 2f5fa4dc..649ee5e8 100644 --- a/iyp/crawlers/pch/daily_routing_snapshots_v4.py +++ b/iyp/crawlers/pch/daily_routing_snapshots_v4.py @@ -34,7 +34,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/pch/daily_routing_snapshots_v6.py b/iyp/crawlers/pch/daily_routing_snapshots_v6.py index a2ff0109..3915769f 100644 --- a/iyp/crawlers/pch/daily_routing_snapshots_v6.py +++ b/iyp/crawlers/pch/daily_routing_snapshots_v6.py @@ -34,7 +34,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/peeringdb/fac.py b/iyp/crawlers/peeringdb/fac.py index d86dbd68..ebf012f6 100644 --- a/iyp/crawlers/peeringdb/fac.py +++ b/iyp/crawlers/peeringdb/fac.py @@ -131,6 +131,9 @@ def run(self): self.iyp.batch_add_links('EXTERNAL_ID', facid_links) self.iyp.batch_add_links('MANAGED_BY', org_links) + def unit_test(self): + return super().unit_test(['NAME', 'WEBSITE', 'COUNTRY', 'EXTERNAL_ID', 'MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -150,7 +153,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/peeringdb/ix.py b/iyp/crawlers/peeringdb/ix.py index 8cb8c28b..267c512d 100644 --- a/iyp/crawlers/peeringdb/ix.py +++ b/iyp/crawlers/peeringdb/ix.py @@ -393,6 +393,9 @@ def register_ixs(self): self.iyp.batch_add_links('EXTERNAL_ID', id_links) self.iyp.batch_add_links('NAME', name_links) + def unit_test(self): + return super().unit_test(['MANAGED_BY', 'LOCATED_IN', 'COUNTRY', 'WEBSITE', 'EXTERNAL_ID', 'NAME']) + def main() -> None: parser = argparse.ArgumentParser() @@ -412,7 +415,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/peeringdb/org.py b/iyp/crawlers/peeringdb/org.py index f1f1bebe..aa0a2c70 100644 --- a/iyp/crawlers/peeringdb/org.py +++ b/iyp/crawlers/peeringdb/org.py @@ -119,6 +119,9 @@ def run(self): self.iyp.batch_add_links('COUNTRY', country_links) self.iyp.batch_add_links('EXTERNAL_ID', orgid_links) + def unit_test(self): + return super().unit_test(['NAME', 'WEBSITE', 'COUNTRY', 'EXTERNAL_ID']) + def main() -> None: parser = argparse.ArgumentParser() @@ -138,7 +141,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ripe/as_names.py b/iyp/crawlers/ripe/as_names.py index 609cdca3..9e163ada 100644 --- a/iyp/crawlers/ripe/as_names.py +++ b/iyp/crawlers/ripe/as_names.py @@ -69,6 +69,9 @@ def run(self): self.iyp.batch_add_links('NAME', name_links) self.iyp.batch_add_links('COUNTRY', country_links) + def unit_test(self): + return super().unit_test(['NAME', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -88,7 +91,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() 
crawler.close() diff --git a/iyp/crawlers/ripe/atlas_measurements.py b/iyp/crawlers/ripe/atlas_measurements.py index 25c7b77d..eb60f6cf 100644 --- a/iyp/crawlers/ripe/atlas_measurements.py +++ b/iyp/crawlers/ripe/atlas_measurements.py @@ -258,6 +258,9 @@ def run(self): self.iyp.batch_add_links('PART_OF', part_of_links) logging.info('Done.') + def unit_test(self): + return super().unit_test(['PART_OF', 'TARGET']) + def main() -> None: parser = argparse.ArgumentParser() @@ -278,7 +281,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ripe/atlas_probes.py b/iyp/crawlers/ripe/atlas_probes.py index 2a94c615..4782099e 100644 --- a/iyp/crawlers/ripe/atlas_probes.py +++ b/iyp/crawlers/ripe/atlas_probes.py @@ -175,6 +175,9 @@ def run(self): self.iyp.batch_add_links('LOCATED_IN', located_in_links) self.iyp.batch_add_links('COUNTRY', country_links) + def unit_test(self): + return super().unit_test(['ASSIGNED', 'LOCATED_IN', 'COUNTRY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -195,7 +198,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/ripe/roa.py b/iyp/crawlers/ripe/roa.py index 7c1e162b..c66d1a7c 100644 --- a/iyp/crawlers/ripe/roa.py +++ b/iyp/crawlers/ripe/roa.py @@ -100,6 +100,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('ROUTE_ORIGIN_AUTHORIZATION', links) + def unit_test(self): + return super().unit_test(['ROUTE_ORIGIN_AUTHORIZATION']) + def main() -> None: parser = argparse.ArgumentParser() @@ -119,7 +122,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/simulamet/rirdata_rdns.py b/iyp/crawlers/simulamet/rirdata_rdns.py index bc0994d1..c2268983 100644 --- a/iyp/crawlers/simulamet/rirdata_rdns.py +++ b/iyp/crawlers/simulamet/rirdata_rdns.py @@ -123,6 +123,9 @@ def run(self): self.iyp.batch_add_links('MANAGED_BY', links_managed_by) + def unit_test(self): + return super().unit_test(['MANAGED_BY']) + def main() -> None: parser = argparse.ArgumentParser() @@ -142,7 +145,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/stanford/asdb.py b/iyp/crawlers/stanford/asdb.py index b32186c1..133b6237 100644 --- a/iyp/crawlers/stanford/asdb.py +++ b/iyp/crawlers/stanford/asdb.py @@ -126,6 +126,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -145,7 +148,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/tranco/top1m.py b/iyp/crawlers/tranco/top1m.py index 6b4eab84..c2cfb374 100644 --- a/iyp/crawlers/tranco/top1m.py +++ b/iyp/crawlers/tranco/top1m.py @@ -65,6 +65,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('RANK', links) + def unit_test(self): + return super().unit_test(['RANK']) + def main() -> None: parser = argparse.ArgumentParser() @@ -84,7 +87,7 @@ def main() -> None: crawler = 
Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/virginiatech/rovista.py b/iyp/crawlers/virginiatech/rovista.py index b72e7343..c31585c2 100644 --- a/iyp/crawlers/virginiatech/rovista.py +++ b/iyp/crawlers/virginiatech/rovista.py @@ -75,6 +75,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('CATEGORIZED', links) + def unit_test(self): + return super().unit_test(['CATEGORIZED']) + def main() -> None: parser = argparse.ArgumentParser() @@ -94,7 +97,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/iyp/crawlers/worldbank/country_pop.py b/iyp/crawlers/worldbank/country_pop.py index ea19f0c3..1b05c23c 100644 --- a/iyp/crawlers/worldbank/country_pop.py +++ b/iyp/crawlers/worldbank/country_pop.py @@ -64,6 +64,9 @@ def run(self): # Push all links to IYP self.iyp.batch_add_links('POPULATION', links) + def unit_test(self): + return super().unit_test(['POPULATION']) + def main() -> None: parser = argparse.ArgumentParser() @@ -83,7 +86,7 @@ def main() -> None: crawler = Crawler(ORG, URL, NAME) if args.unit_test: - crawler.unit_test(logging) + crawler.unit_test() else: crawler.run() crawler.close() diff --git a/requirements.txt b/requirements.txt index 00afa5c7..54fbbb75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,6 @@ flake8 pre-commit PyGithub clickhouse_driver -pyspark \ No newline at end of file +pyspark +paramiko +scp
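To summarize the crawler-side pattern introduced throughout this diff: each crawler overrides `unit_test()` and delegates to the base class existence check, and `create_db.py` treats a `False` return as a failed run. A simplified sketch (the base class and crawler below are stubs; the real check runs a Cypher existence query per relationship type):

```python
import logging

class BaseCrawler:
    def unit_test(self, relation_types):
        # The real implementation queries Neo4j for one relationship of each
        # type created under this crawler's reference_name.
        passed = True
        for relation_type in relation_types:
            if not self.relationship_exists(relation_type):  # stub for the Cypher query
                passed = False
                logging.error(f'Missing data for relation {relation_type}')
        return passed

    def relationship_exists(self, relation_type):  # stub, always succeeds here
        return True

class Top1mCrawler(BaseCrawler):
    def unit_test(self):
        # e.g. iyp/crawlers/tranco/top1m.py checks its RANK relationships
        return super().unit_test(['RANK'])

# create_db.py raises RelationCountError when this returns False;
# a plain exception stands in for it here.
crawler = Top1mCrawler()
if not crawler.unit_test():
    raise Exception('Did not receive data from crawler')
```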