-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #134 from m-appel/133-bgptools-as-names
Fix BGP.Tools AS names crawler
- Loading branch information
Showing
1 changed file
with
52 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,12 +2,13 @@ | |
import logging | ||
import os | ||
import sys | ||
from io import BytesIO | ||
|
||
import pandas as pd | ||
import requests | ||
|
||
from iyp import BaseCrawler, RequestStatusError | ||
|
||
# Location of the BGP.Tools AS data set. Preview the format with:
#   curl -s https://bgp.tools/asns.csv | head -n 5
URL = 'https://bgp.tools/asns.csv'
# Organization and crawler name for this data set.
ORG = 'BGP.Tools'
NAME = 'bgptools.as_names'
|
@@ -22,44 +23,67 @@ def __init__(self, organization, url, name): | |
'user-agent': 'IIJ/Internet Health Report - [email protected]' | ||
} | ||
|
||
@staticmethod
def replace_link_ids(links: list, src_id: dict = None, dst_id: dict = None):
    """Replace the src_id and dst_id values from links with their actual id.

    The links are modified in place; nothing is returned.

    Args:
        links: List of dicts, each with a 'src_id' and a 'dst_id' key.
        src_id: Mapping from the current 'src_id' value to its replacement.
            If omitted or empty, 'src_id' values are left untouched.
        dst_id: Mapping from the current 'dst_id' value to its replacement.
            If omitted or empty, 'dst_id' values are left untouched.

    Raises:
        KeyError: If a link references a value missing from a non-empty
            mapping.
    """
    # None instead of dict() as default: avoids a shared mutable default
    # object while keeping the "empty mapping means skip" behavior.
    for link in links:
        if src_id:
            link['src_id'] = src_id[link['src_id']]
        if dst_id:
            link['dst_id'] = dst_id[link['dst_id']]
|
||
def run(self):
    """Fetch the AS name file from BGP.Tools website and push it to IYP.

    The CSV is read with pandas and the columns 'asn', 'name' and 'class'
    are used. For every row with a usable AS number:

    - a NAME link (AS -> Name) is created unless the name is the
      placeholder 'ERR_AS_NAME_NOT_FOUND';
    - a CATEGORIZED link (AS -> Tag) is created unless the class is
      'Unknown'.

    Raises:
        RequestStatusError: If the download does not return HTTP 200.
    """
    req = requests.get(URL, headers=self.headers)
    if req.status_code != 200:
        raise RequestStatusError('Error while fetching AS names')

    # keep_default_na=False: keep values such as "NA" as literal strings
    # instead of having pandas convert them to NaN.
    df = pd.read_csv(BytesIO(req.content), keep_default_na=False)

    asns = set()
    names = set()
    tags = set()
    name_links = []
    tag_links = []

    # Normally we would use itertuples, since it is way faster. But we want to be
    # robust against format changes and since one column is called "class", which is
    # a Python keyword, the field name would be replaced by a positional value,
    # e.g., r._3 instead of r.class, which means that if the format is changed, this
    # crawler breaks again.
    # Since the data set is not too large, iterrows is fine performance-wise.
    for _, entry in df.iterrows():
        has_link = False
        asn = entry['asn']
        if not asn.startswith('AS'):
            logging.warning(f'asn field does not start with "AS": {entry}')
            continue
        try:
            # Strip the "AS" prefix; the AS set holds plain integers.
            asn = int(asn[2:])
        except ValueError:
            # Skip malformed rows instead of aborting the whole crawl.
            logging.warning(f'asn field is not a valid AS number: {entry}')
            continue

        name = entry['name']
        if name != 'ERR_AS_NAME_NOT_FOUND':
            names.add(name)
            name_links.append({'src_id': asn, 'dst_id': name, 'props': [self.reference]})
            has_link = True
        tag = entry['class']
        if tag != 'Unknown':
            tags.add(tag)
            tag_links.append({'src_id': asn, 'dst_id': tag, 'props': [self.reference]})
            has_link = True
        if has_link:
            # Only create AS nodes if we have a relationship.
            asns.add(asn)

    # Look up the node ids for the collected values.
    # NOTE(review): presumably these calls return a value -> node id mapping
    # (it is indexed that way by replace_link_ids) - confirm against the
    # IYP helper.
    asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
    name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)
    tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', tags, all=False)

    # Up to here the links carry raw values (AS number, name, tag label);
    # swap them for the looked-up ids before pushing.
    self.replace_link_ids(name_links, asn_id, name_id)
    self.replace_link_ids(tag_links, asn_id, tag_id)

    self.iyp.batch_add_links('NAME', name_links)
    self.iyp.batch_add_links('CATEGORIZED', tag_links)
|
||
|
||
def main() -> None: | ||
|