Merge pull request #134 from m-appel/133-bgptools-as-names
Fix BGP.Tools AS names crawler
romain-fontugne authored Mar 15, 2024
2 parents 32f973d + 8dc8a81 commit 073e3be
Showing 1 changed file with 52 additions and 28 deletions.
iyp/crawlers/bgptools/as_names.py (52 additions, 28 deletions)
@@ -2,12 +2,13 @@
 import logging
 import os
 import sys
+from io import BytesIO
 
+import pandas as pd
 import requests
 
 from iyp import BaseCrawler, RequestStatusError
 
-# curl -s https://bgp.tools/asns.csv | head -n 5
 URL = 'https://bgp.tools/asns.csv'
 ORG = 'BGP.Tools'
 NAME = 'bgptools.as_names'
@@ -22,44 +23,67 @@ def __init__(self, organization, url, name):
             'user-agent': 'IIJ/Internet Health Report - [email protected]'
         }
 
+    @staticmethod
+    def replace_link_ids(links: list, src_id: dict = dict(), dst_id=dict()):
+        """Replace the src_id and dst_id values from links with their actual id."""
+        for link in links:
+            if src_id:
+                link['src_id'] = src_id[link['src_id']]
+            if dst_id:
+                link['dst_id'] = dst_id[link['dst_id']]
+
     def run(self):
         """Fetch the AS name file from BGP.Tools website and push it to IYP."""
 
         req = requests.get(URL, headers=self.headers)
         if req.status_code != 200:
             raise RequestStatusError('Error while fetching AS names')
 
-        lines = []
+        df = pd.read_csv(BytesIO(req.content), keep_default_na=False)
+
         asns = set()
         names = set()
-
-        # Collect all ASNs and names
-        for line in req.text.splitlines():
-            if line.startswith('asn,'):
+        tags = set()
+        name_links = list()
+        tag_links = list()
+
+        # Normally we would use itertuples, since it is way faster. But we want to be
+        # robust against format changes and since one column is called "class", which is
+        # a Python keyword, the field name would be replaced by a positional value,
+        # e.g., r._3 instead of r.class, which means that if the format is changed, this
+        # crawler breaks again.
+        # Since the data set is not too large, iterrows is fine performance-wise.
+        for r in df.iterrows():
+            has_link = False
+            entry = r[1]
+            asn = entry['asn']
+            if not asn.startswith('AS'):
+                logging.warning(f'asn field does not start with "AS": {entry}')
                 continue
-
-            asn, _, name = line.partition(',')
-            name = name.rpartition(',')[0].strip('"')
             asn = int(asn[2:])
-            asns.add(asn)
-            names.add(name)
-            lines.append([asn, name])
-
-        # get ASNs and names IDs
-        self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
-        self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names)
-
-        # Compute links
-        links = []
-        for (asn, name) in lines:
-
-            asn_qid = self.asn_id[asn]
-            name_qid = self.name_id[name]
-
-            links.append({'src_id': asn_qid, 'dst_id': name_qid, 'props': [self.reference]})  # Set AS name
-
-        # Push all links to IYP
-        self.iyp.batch_add_links('NAME', links)
+            name = entry['name']
+            if name != 'ERR_AS_NAME_NOT_FOUND':
+                names.add(name)
+                name_links.append({'src_id': asn, 'dst_id': name, 'props': [self.reference]})
+                has_link = True
+            tag = entry['class']
+            if tag != 'Unknown':
+                tags.add(tag)
+                tag_links.append({'src_id': asn, 'dst_id': tag, 'props': [self.reference]})
+                has_link = True
+            if has_link:
+                # Only create AS nodes if we have a relationship.
+                asns.add(asn)
+
+        asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
+        name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)
+        tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', tags, all=False)
+
+        self.replace_link_ids(name_links, asn_id, name_id)
+        self.replace_link_ids(tag_links, asn_id, tag_id)
+
+        self.iyp.batch_add_links('NAME', name_links)
+        self.iyp.batch_add_links('CATEGORIZED', tag_links)
 
 
 def main() -> None:
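
Note on the new pd.read_csv call: keep_default_na=False stops pandas from turning values such as "NA" or empty fields into NaN, presumably so that AS names matching pandas' default NA markers survive as plain strings. A minimal sketch of the difference (illustration only, not part of the commit; the sample row is made up):

from io import StringIO

import pandas as pd

csv = 'asn,name,class\nAS1,"NA",Unknown\n'

print(pd.read_csv(StringIO(csv)).loc[0, 'name'])                         # nan (a float, not a string)
print(pd.read_csv(StringIO(csv), keep_default_na=False).loc[0, 'name'])  # 'NA'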
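
The comment block about itertuples is worth a concrete example. Because "class" is a Python keyword, itertuples() yields namedtuples with that field renamed positionally, while iterrows() yields Series that keep the original column labels. A minimal sketch (illustration only, not part of the commit; the sample row is made up):

import pandas as pd

df = pd.DataFrame({'asn': ['AS1'], 'name': ['Example Net'], 'class': ['Eyeball']})

# itertuples() renames the keyword column to a positional field name.
row = next(df.itertuples())
print(row._3)          # 'Eyeball', but silently points at another column if the format changes

# iterrows() keeps the label, at the cost of speed.
_, entry = next(df.iterrows())
print(entry['class'])  # 'Eyeball', robust to column reordering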
