Merge pull request #134 from m-appel/133-bgptools-as-names
Fix BGP.Tools AS names crawler
romain-fontugne authored Mar 15, 2024
2 parents 32f973d + 8dc8a81 commit 073e3be
Showing 1 changed file with 52 additions and 28 deletions.
iyp/crawlers/bgptools/as_names.py (52 additions, 28 deletions)
@@ -2,12 +2,13 @@
 import logging
 import os
 import sys
+from io import BytesIO
 
+import pandas as pd
 import requests
 
 from iyp import BaseCrawler, RequestStatusError
 
-# curl -s https://bgp.tools/asns.csv | head -n 5
 URL = 'https://bgp.tools/asns.csv'
 ORG = 'BGP.Tools'
 NAME = 'bgptools.as_names'
@@ -22,44 +23,67 @@ def __init__(self, organization, url, name):
             'user-agent': 'IIJ/Internet Health Report - [email protected]'
         }
 
+    @staticmethod
+    def replace_link_ids(links: list, src_id: dict = dict(), dst_id=dict()):
+        """Replace the src_id and dst_id values from links with their actual id."""
+        for link in links:
+            if src_id:
+                link['src_id'] = src_id[link['src_id']]
+            if dst_id:
+                link['dst_id'] = dst_id[link['dst_id']]
+
     def run(self):
         """Fetch the AS name file from BGP.Tools website and push it to IYP."""
 
         req = requests.get(URL, headers=self.headers)
         if req.status_code != 200:
             raise RequestStatusError('Error while fetching AS names')
 
-        lines = []
+        df = pd.read_csv(BytesIO(req.content), keep_default_na=False)
+
         asns = set()
         names = set()
-
-        # Collect all ASNs and names
-        for line in req.text.splitlines():
-            if line.startswith('asn,'):
+        tags = set()
+        name_links = list()
+        tag_links = list()
+
+        # Normally we would use itertuples, since it is way faster. But we want to be
+        # robust against format changes and since one column is called "class", which is
+        # a Python keyword, the field name would be replaced by a positional value,
+        # e.g., r._3 instead of r.class, which means that if the format is changed, this
+        # crawler breaks again.
+        # Since the data set is not too large, iterrows is fine performance-wise.
+        for r in df.iterrows():
+            has_link = False
+            entry = r[1]
+            asn = entry['asn']
+            if not asn.startswith('AS'):
+                logging.warning(f'asn field does not start with "AS": {entry}')
                 continue
-
-            asn, _, name = line.partition(',')
-            name = name.rpartition(',')[0].strip('"')
             asn = int(asn[2:])
-            asns.add(asn)
-            names.add(name)
-            lines.append([asn, name])
-
-        # get ASNs and names IDs
-        self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
-        self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names)
-
-        # Compute links
-        links = []
-        for (asn, name) in lines:
-
-            asn_qid = self.asn_id[asn]
-            name_qid = self.name_id[name]
-
-            links.append({'src_id': asn_qid, 'dst_id': name_qid, 'props': [self.reference]})  # Set AS name
-
-        # Push all links to IYP
-        self.iyp.batch_add_links('NAME', links)
+            name = entry['name']
+            if name != 'ERR_AS_NAME_NOT_FOUND':
+                names.add(name)
+                name_links.append({'src_id': asn, 'dst_id': name, 'props': [self.reference]})
+                has_link = True
+            tag = entry['class']
+            if tag != 'Unknown':
+                tags.add(tag)
+                tag_links.append({'src_id': asn, 'dst_id': tag, 'props': [self.reference]})
+                has_link = True
+            if has_link:
+                # Only create AS nodes if we have a relationship.
+                asns.add(asn)
+
+        asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
+        name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)
+        tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', tags, all=False)
+
+        self.replace_link_ids(name_links, asn_id, name_id)
+        self.replace_link_ids(tag_links, asn_id, tag_id)
+
+        self.iyp.batch_add_links('NAME', name_links)
+        self.iyp.batch_add_links('CATEGORIZED', tag_links)
 
 
 def main() -> None:
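
Note on the new pd.read_csv call: keep_default_na=False stops pandas from turning values such as "NA" or empty fields into NaN, presumably so that AS names matching pandas' default NA markers survive as plain strings. A minimal sketch of the difference (illustration only, not part of the commit; the sample row is made up):

from io import StringIO

import pandas as pd

csv = 'asn,name,class\nAS1,"NA",Unknown\n'

print(pd.read_csv(StringIO(csv)).loc[0, 'name'])                         # nan (a float, not a string)
print(pd.read_csv(StringIO(csv), keep_default_na=False).loc[0, 'name'])  # 'NA'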
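
The comment block about itertuples is worth a concrete example. Because "class" is a Python keyword, itertuples() yields namedtuples with that field renamed positionally, while iterrows() yields Series that keep the original column labels. A minimal sketch (illustration only, not part of the commit; the sample row is made up):

import pandas as pd

df = pd.DataFrame({'asn': ['AS1'], 'name': ['Example Net'], 'class': ['Eyeball']})

# itertuples() renames the keyword column to a positional field name.
row = next(df.itertuples())
print(row._3)          # 'Eyeball', but silently points at another column if the format changes

# iterrows() keeps the label, at the cost of speed.
_, entry = next(df.iterrows())
print(entry['class'])  # 'Eyeball', robust to column reordering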
