From 8dc8a81cbeca5142b6efb2c36d5517fd8a68db80 Mon Sep 17 00:00:00 2001
From: Malte Tashiro <malte@iij.ad.jp>
Date: Thu, 14 Mar 2024 05:16:51 +0000
Subject: [PATCH] Fix BGP.Tools AS names crawler

Due to a format change, the previous version was broken. We now use
pandas to parse the CSV and be robust against future format changes.
The new format also includes a class field, which is similar to a tag and
which we now import as well.
We do not import the country code field, since we do not know where it
comes from and it would probably just add redundant information.
---
 iyp/crawlers/bgptools/as_names.py | 80 ++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 28 deletions(-)

diff --git a/iyp/crawlers/bgptools/as_names.py b/iyp/crawlers/bgptools/as_names.py
index 53854a2f..7db4c99b 100644
--- a/iyp/crawlers/bgptools/as_names.py
+++ b/iyp/crawlers/bgptools/as_names.py
@@ -2,12 +2,13 @@
 import logging
 import os
 import sys
+from io import BytesIO
 
+import pandas as pd
 import requests
 
 from iyp import BaseCrawler, RequestStatusError
 
-# curl -s https://bgp.tools/asns.csv | head -n 5
 URL = 'https://bgp.tools/asns.csv'
 ORG = 'BGP.Tools'
 NAME = 'bgptools.as_names'
@@ -22,6 +23,15 @@ def __init__(self, organization, url, name):
             'user-agent': 'IIJ/Internet Health Report - admin@ihr.live'
         }
 
+    @staticmethod
+    def replace_link_ids(links: list, src_id: dict = dict(), dst_id=dict()):
+        """Replace the src_id and dst_id values from links with their actual id."""
+        for link in links:
+            if src_id:
+                link['src_id'] = src_id[link['src_id']]
+            if dst_id:
+                link['dst_id'] = dst_id[link['dst_id']]
+
     def run(self):
         """Fetch the AS name file from BGP.Tools website and push it to IYP."""
 
@@ -29,37 +39,51 @@ def run(self):
         if req.status_code != 200:
             raise RequestStatusError('Error while fetching AS names')
 
-        lines = []
+        df = pd.read_csv(BytesIO(req.content), keep_default_na=False)
+
         asns = set()
         names = set()
-
-        # Collect all ASNs and names
-        for line in req.text.splitlines():
-            if line.startswith('asn,'):
+        tags = set()
+        name_links = list()
+        tag_links = list()
+
+        # Normally we would use itertuples, since it is way faster. But we want to be
+        # robust against format changes and since one column is called "class", which is
+        # a Python keyword, the field name would be replaced by a positional value,
+        # e.g., r._3 instead of r.class, which means that if the format is changed, this
+        # crawler breaks again.
+        # Since the data set is not too large, iterrows is fine performance-wise.
+        for r in df.iterrows():
+            has_link = False
+            entry = r[1]
+            asn = entry['asn']
+            if not asn.startswith('AS'):
+                logging.warning(f'asn field does not start with "AS": {entry}')
                 continue
-
-            asn, _, name = line.partition(',')
-            name = name.rpartition(',')[0].strip('"')
             asn = int(asn[2:])
-            asns.add(asn)
-            names.add(name)
-            lines.append([asn, name])
-
-        # get ASNs and names IDs
-        self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
-        self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names)
-
-        # Compute links
-        links = []
-        for (asn, name) in lines:
-
-            asn_qid = self.asn_id[asn]
-            name_qid = self.name_id[name]
-
-            links.append({'src_id': asn_qid, 'dst_id': name_qid, 'props': [self.reference]})  # Set AS name
-
-        # Push all links to IYP
-        self.iyp.batch_add_links('NAME', links)
+            name = entry['name']
+            if name != 'ERR_AS_NAME_NOT_FOUND':
+                names.add(name)
+                name_links.append({'src_id': asn, 'dst_id': name, 'props': [self.reference]})
+                has_link = True
+            tag = entry['class']
+            if tag != 'Unknown':
+                tags.add(tag)
+                tag_links.append({'src_id': asn, 'dst_id': tag, 'props': [self.reference]})
+                has_link = True
+            if has_link:
+                # Only create AS nodes if we have a relationship.
+                asns.add(asn)
+
+        asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
+        name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)
+        tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', tags, all=False)
+
+        self.replace_link_ids(name_links, asn_id, name_id)
+        self.replace_link_ids(tag_links, asn_id, tag_id)
+
+        self.iyp.batch_add_links('NAME', name_links)
+        self.iyp.batch_add_links('CATEGORIZED', tag_links)
 
 
 def main() -> None: