From 8dc8a81cbeca5142b6efb2c36d5517fd8a68db80 Mon Sep 17 00:00:00 2001 From: Malte Tashiro Date: Thu, 14 Mar 2024 05:16:51 +0000 Subject: [PATCH] Fix BGP.Tools AS names crawler Due to a format change, the previous version was broken. We now use pandas to parse the CSV and to be robust against future format changes. The format change also added a class field, which is similar to a tag and which we now import as well. We do not import the country code field, since we do not know where it comes from and it would probably just add redundant information. --- iyp/crawlers/bgptools/as_names.py | 80 ++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/iyp/crawlers/bgptools/as_names.py b/iyp/crawlers/bgptools/as_names.py index 53854a2f..7db4c99b 100644 --- a/iyp/crawlers/bgptools/as_names.py +++ b/iyp/crawlers/bgptools/as_names.py @@ -2,12 +2,13 @@ import logging import os import sys +from io import BytesIO +import pandas as pd import requests from iyp import BaseCrawler, RequestStatusError -# curl -s https://bgp.tools/asns.csv | head -n 5 URL = 'https://bgp.tools/asns.csv' ORG = 'BGP.Tools' NAME = 'bgptools.as_names' @@ -22,6 +23,15 @@ def __init__(self, organization, url, name): 'user-agent': 'IIJ/Internet Health Report - admin@ihr.live' } + @staticmethod + def replace_link_ids(links: list, src_id: dict = dict(), dst_id=dict()): + """Replace the src_id and dst_id values from links with their actual id.""" + for link in links: + if src_id: + link['src_id'] = src_id[link['src_id']] + if dst_id: + link['dst_id'] = dst_id[link['dst_id']] + def run(self): """Fetch the AS name file from BGP.Tools website and push it to IYP.""" @@ -29,37 +39,51 @@ def run(self): if req.status_code != 200: raise RequestStatusError('Error while fetching AS names') - lines = [] + df = pd.read_csv(BytesIO(req.content), keep_default_na=False) + asns = set() names = set() - - # Collect all ASNs and names - for line in req.text.splitlines(): - if line.startswith('asn,'): +
tags = set() + name_links = list() + tag_links = list() + + # Normally we would use itertuples, since it is way faster. But we want to be + # robust against format changes and since one column is called "class", which is + # a Python keyword, the field name would be replaced by a positional value, + # e.g., r._3 instead of r.class, which means that if the format is changed, this + # crawler breaks again. + # Since the data set is not too large, iterrows is fine performance-wise. + for r in df.iterrows(): + has_link = False + entry = r[1] + asn = entry['asn'] + if not asn.startswith('AS'): + logging.warning(f'asn field does not start with "AS": {entry}') continue - - asn, _, name = line.partition(',') - name = name.rpartition(',')[0].strip('"') asn = int(asn[2:]) - asns.add(asn) - names.add(name) - lines.append([asn, name]) - - # get ASNs and names IDs - self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) - self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names) - - # Compute links - links = [] - for (asn, name) in lines: - - asn_qid = self.asn_id[asn] - name_qid = self.name_id[name] - - links.append({'src_id': asn_qid, 'dst_id': name_qid, 'props': [self.reference]}) # Set AS name - - # Push all links to IYP - self.iyp.batch_add_links('NAME', links) + name = entry['name'] + if name != 'ERR_AS_NAME_NOT_FOUND': + names.add(name) + name_links.append({'src_id': asn, 'dst_id': name, 'props': [self.reference]}) + has_link = True + tag = entry['class'] + if tag != 'Unknown': + tags.add(tag) + tag_links.append({'src_id': asn, 'dst_id': tag, 'props': [self.reference]}) + has_link = True + if has_link: + # Only create AS nodes if we have a relationship. 
+ asns.add(asn) + + asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False) + name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False) + tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', tags, all=False) + + self.replace_link_ids(name_links, asn_id, name_id) + self.replace_link_ids(tag_links, asn_id, tag_id) + + self.iyp.batch_add_links('NAME', name_links) + self.iyp.batch_add_links('CATEGORIZED', tag_links) def main() -> None: