+We use three datasets from [CAIDA](https://www.caida.org/) whose use is authorized
## World Bank -We use the country population indicator `SP.POP.TOTL.` from the +We use the country population indicator `SP.POP.TOTL.` from the [Indicators API](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation) dataset provided by the [World Bank](https://www.worldbank.org/en/home). \ No newline at end of file diff --git a/config.json.example b/config.json.example index 9dbae3b2..0615681e 100644 --- a/config.json.example +++ b/config.json.example @@ -49,6 +49,8 @@ "iyp.crawlers.bgptools.as_names", "iyp.crawlers.apnic.eyeball", "iyp.crawlers.caida.asrank", + "iyp.crawlers.caida.as_relationships_v4", + "iyp.crawlers.caida.as_relationships_v6", "iyp.crawlers.ihr.country_dependency", "iyp.crawlers.bgpkit.pfx2asn", "iyp.crawlers.bgpkit.as2rel_v4", diff --git a/documentation/data-sources.md b/documentation/data-sources.md index ee0943eb..8e8c8892 100644 --- a/documentation/data-sources.md +++ b/documentation/data-sources.md @@ -4,37 +4,38 @@ | Organization | Dataset Name / Description | URL | |-----------------------------|----------------------------------------------|-----------------------------------------------------------------------| -| Alice-LG | IXP route server looking glass snapshots | https://github.com/alice-lg/alice-lg} | -| | AMS-IX | https://lg.ams-ix.net} | -| | BCIX | https://lg.bcix.de} | -| | DE-CIX | https://lg.de-cix.net} | -| | IX.br | https://lg.ix.br} | -| | LINX | https://alice-rs.linx.net} | -| | Megaport | https://lg.megaport.com} | -| | Netnod | https://lg.netnod.se} | -| APNIC | AS population estimate | https://stats.labs.apnic.net/aspop} | -| BGPKIT | as2rel, peer-stats, pfx2as | https://data.bgpkit.com} | -| BGP.Tools | AS names, AS tags | https://bgp.tools/kb/api} | -| | Anycast prefix tags | https://github.com/bgptools/anycast-prefixes} | -| CAIDA | AS Rank | https://doi.org/10.21986/CAIDA.DATA.AS-RANK} | -| | IXPs Dataset | https://www.caida.org/catalog/datasets/ixps} | -| Cisco | 
Umbrella Popularity List | https://s3-us-west-1.amazonaws.com/umbrella-static/index.html} | -| Citizen Lab | URL testing lists | https://github.com/citizenlab/test-lists} | -| Cloudflare | Cloudflare Radar API endpoints radar/dns/top/ases, radar/dns/top/locations, radar/ranking/top, radar/datasets | https://radar.cloudflare.com}} | +| Alice-LG | IXP route server looking glass snapshots | https://github.com/alice-lg/alice-lg | +| | AMS-IX | https://lg.ams-ix.net | +| | BCIX | https://lg.bcix.de | +| | DE-CIX | https://lg.de-cix.net | +| | IX.br | https://lg.ix.br | +| | LINX | https://alice-rs.linx.net | +| | Megaport | https://lg.megaport.com | +| | Netnod | https://lg.netnod.se | +| APNIC | AS population estimate | https://stats.labs.apnic.net/aspop | +| BGPKIT | as2rel, peer-stats, pfx2as | https://data.bgpkit.com | +| BGP.Tools | AS names, AS tags | https://bgp.tools/kb/api | +| | Anycast prefix tags | https://github.com/bgptools/anycast-prefixes | +| CAIDA | AS Rank | https://doi.org/10.21986/CAIDA.DATA.AS-RANK | +| | IXPs Dataset | https://doi.org/10.21986/CAIDA.DATA.IXPS | +| | AS Relationships | https://catalog.caida.org/dataset/as_relationships_serial_1 | +| Cisco | Umbrella Popularity List | https://s3-us-west-1.amazonaws.com/umbrella-static/index.html | +| Citizen Lab | URL testing lists | https://github.com/citizenlab/test-lists | +| Cloudflare | Cloudflare Radar API endpoints radar/dns/top/ases, radar/dns/top/locations, radar/ranking/top, radar/datasets | https://radar.cloudflare.com | | | | -| Emile Aben | AS names | https://github.com/emileaben/asnames} | -| IHR | Country Dependency, AS Hegemony, ROV | https://ihr.iijlab.net} | -| Internet Intelligence Lab | AS to Organization Mapping | https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping} | -| NRO | Extended allocation and assignment reports | https://www.nro.net/about/rirs/statistics} | -| OpenINTEL | tranco1m, umbrella1m, ns | https://data.openintel.nl/data} | -| | DNS Dependency Graph | 
https://dnsgraph.dacs.utwente.nl} | -| Packet Clearing House | Daily routing snapshots | https://www.pch.net/resources/Routing\_Data} | -| PeeringDB | API endpoints: fac, ix, ixlan, netfac, org | https://www.peeringdb.com} | -| RIPE NCC | AS names, RPKI | https://ftp.ripe.net/ripe} | -| | RIPE Atlas measurement information | https://atlas.ripe.net} | -| SimulaMet | rDNS data | https://rir-data.org} | -| Stanford | ASdb dataset | https://asdb.stanford.edu} | -| Tranco | Tranco list | https://tranco-list.eu} | -| Virginia Tech | RoVista | https://rovista.netsecurelab.org} | -| World Bank | Indicators API: Country Population Indicator | https://www.worldbank.org} | +| Emile Aben | AS names | https://github.com/emileaben/asnames | +| IHR | Country Dependency, AS Hegemony, ROV | https://ihr.iijlab.net | +| Internet Intelligence Lab | AS to Organization Mapping | https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping | +| NRO | Extended allocation and assignment reports | https://www.nro.net/about/rirs/statistics | +| OpenINTEL | tranco1m, umbrella1m, ns | https://data.openintel.nl/data | +| | DNS Dependency Graph | https://dnsgraph.dacs.utwente.nl | +| Packet Clearing House | Daily routing snapshots | https://www.pch.net/resources/Routing_Data | +| PeeringDB | API endpoints: fac, ix, ixlan, netfac, org | https://www.peeringdb.com | +| RIPE NCC | AS names, RPKI | https://ftp.ripe.net/ripe | +| | RIPE Atlas measurement information | https://atlas.ripe.net | +| SimulaMet | rDNS data | https://rir-data.org | +| Stanford | ASdb dataset | https://asdb.stanford.edu | +| Tranco | Tranco list | https://tranco-list.eu | +| Virginia Tech | RoVista | https://rovista.netsecurelab.org | +| World Bank | Indicators API: Country Population Indicator | https://www.worldbank.org | diff --git a/iyp/crawlers/caida/README.md b/iyp/crawlers/caida/README.md index 24767790..37c93280 100644 --- a/iyp/crawlers/caida/README.md +++ b/iyp/crawlers/caida/README.md @@ -1,6 +1,7 @@ # CAIDA -- 
https://caida.org ## ASRank (asrank.py) + AS rank in terms of customer cone size, meaning that large transit providers are higher ranked. @@ -11,7 +12,8 @@ Ranking: Connect ASes nodes to a single ranking node corresponding to ASRank. The rank is given as a link attribute. For example: -``` + +```cypher (:AS {asn:2497})-[:RANK {rank:87}]-(:Ranking {name:'CAIDA ASRank'}) ``` @@ -19,7 +21,7 @@ Country: Connect AS to country nodes, meaning that the AS is registered in that country. -``` +```cypher (:AS)-[:COUNTRY]-(:Country) ``` @@ -27,7 +29,8 @@ AS name: Connect AS to names nodes, providing the name of an AS. For example: -``` + +```cypher (:AS {asn:2497})-[:NAME]-(:Name {name:'IIJ'}) ``` @@ -35,14 +38,14 @@ For example: The asrank crawler is not depending on other crawlers. - ## IXPs (ixs.py) + List of IXPs obtained from PeeringDB, Hurricane Electric, Packet Clearing House. ### Graph representation Nodes: - + - `(:IXP {name})`: IXP node - `(:Name {name})`: Name of IXP - `(:Prefix {prefix})`: Prefix of IXP peering LAN @@ -61,21 +64,46 @@ Relationships: ``` ### Dependence -The ixs crawler depends on the peeringdb.ix crawler. +The ixs crawler depends on the peeringdb.ix crawler. ## IXP memberships (ix_asns.py) -List of ASes present at each IXP. +List of ASes present at each IXP. ### Graph representation Relationships: -```Cypher +```cypher (:AS)-[:MEMBER_OF]->(:IXP) ``` +### Dependence + +The ix_asns crawler depends on the ixs crawler. + +## AS relationships (as_relationships_v[4|6].py) + +Inferred AS relationships (peer-to-peer or customer-provider). + +### Graph representation + +```cypher +(:AS {asn: 2497})-[r:PEERS_WITH {af: 4, rel: -1}]->(:AS {asn: 7500}) +``` + +Either the `reference_name` or `af` properties can be used to distinguish between IPv4 +and IPv6. + +`rel: -1` indicates customer-provider, and the direction of the relationship is modeled +as `provider -> customer` to be consistent with `bgpkit.as2rel`. + +`rel: 0` indicates peer-to-peer relationship. 
+The as_relationships crawler does not depend on other crawlers.
bz2.open(BytesIO(req.content), 'rb') as f: + text = f.read().decode() + + ases = set() + peers_with_links = list() + for line in text.splitlines(): + if line.startswith('#'): + continue + left_asn, right_asn, kind = map(int, line.split('|')) + ases.add(left_asn) + ases.add(right_asn) + peers_with_links.append({'src_id': left_asn, 'dst_id': right_asn, + 'props': [self.reference, {'rel': kind, 'af': self.af}]}) + + as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', ases) + + for link in peers_with_links: + link['src_id'] = as_id[link['src_id']] + link['dst_id'] = as_id[link['dst_id']] + + self.iyp.batch_add_links('PEERS_WITH', peers_with_links) + + def unit_test(self): + return super().unit_test(['PEERS_WITH']) diff --git a/iyp/crawlers/caida/as_relationships_v4.py b/iyp/crawlers/caida/as_relationships_v4.py new file mode 100644 index 00000000..d9146bc6 --- /dev/null +++ b/iyp/crawlers/caida/as_relationships_v4.py @@ -0,0 +1,45 @@ +import argparse +import logging +import os +import sys + +from iyp.crawlers.caida import ASRelCrawler + +URL = 'https://publicdata.caida.org/datasets/as-relationships/serial-1/' +ORG = 'CAIDA' +NAME = 'caida.as_relationships_v4' + + +class Crawler(ASRelCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name, 4) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--unit-test', action='store_true') + args = parser.parse_args() + + scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3] + FORMAT = '%(asctime)s %(levelname)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename='log/' + scriptname + '.log', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + + logging.info(f'Started: {sys.argv}') + + crawler = Crawler(ORG, URL, NAME) + if args.unit_test: + crawler.unit_test() + else: + crawler.run() + crawler.close() + logging.info(f'Finished: {sys.argv}') + + +if __name__ == '__main__': + main() + sys.exit(0) diff --git 
a/iyp/crawlers/caida/as_relationships_v6.py b/iyp/crawlers/caida/as_relationships_v6.py new file mode 100644 index 00000000..df954bde --- /dev/null +++ b/iyp/crawlers/caida/as_relationships_v6.py @@ -0,0 +1,45 @@ +import argparse +import logging +import os +import sys + +from iyp.crawlers.caida import ASRelCrawler + +URL = 'https://publicdata.caida.org/datasets/as-relationships/serial-1/' +ORG = 'CAIDA' +NAME = 'caida.as_relationships_v6' + + +class Crawler(ASRelCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name, 6) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--unit-test', action='store_true') + args = parser.parse_args() + + scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3] + FORMAT = '%(asctime)s %(levelname)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename='log/' + scriptname + '.log', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + + logging.info(f'Started: {sys.argv}') + + crawler = Crawler(ORG, URL, NAME) + if args.unit_test: + crawler.unit_test() + else: + crawler.run() + crawler.close() + logging.info(f'Finished: {sys.argv}') + + +if __name__ == '__main__': + main() + sys.exit(0)