+We use three datasets from [CAIDA](https://www.caida.org/) whose use is authorized
## World Bank -We use the country population indicator `SP.POP.TOTL.` from the +We use the country population indicator `SP.POP.TOTL.` from the [Indicators API](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation) dataset provided by the [World Bank](https://www.worldbank.org/en/home). \ No newline at end of file diff --git a/config.json.example b/config.json.example index 9dbae3b2..0615681e 100644 --- a/config.json.example +++ b/config.json.example @@ -49,6 +49,8 @@ "iyp.crawlers.bgptools.as_names", "iyp.crawlers.apnic.eyeball", "iyp.crawlers.caida.asrank", + "iyp.crawlers.caida.as_relationships_v4", + "iyp.crawlers.caida.as_relationships_v6", "iyp.crawlers.ihr.country_dependency", "iyp.crawlers.bgpkit.pfx2asn", "iyp.crawlers.bgpkit.as2rel_v4", diff --git a/documentation/data-sources.md b/documentation/data-sources.md index ee0943eb..8e8c8892 100644 --- a/documentation/data-sources.md +++ b/documentation/data-sources.md @@ -4,37 +4,38 @@ | Organization | Dataset Name / Description | URL | |-----------------------------|----------------------------------------------|-----------------------------------------------------------------------| -| Alice-LG | IXP route server looking glass snapshots | https://github.com/alice-lg/alice-lg} | -| | AMS-IX | https://lg.ams-ix.net} | -| | BCIX | https://lg.bcix.de} | -| | DE-CIX | https://lg.de-cix.net} | -| | IX.br | https://lg.ix.br} | -| | LINX | https://alice-rs.linx.net} | -| | Megaport | https://lg.megaport.com} | -| | Netnod | https://lg.netnod.se} | -| APNIC | AS population estimate | https://stats.labs.apnic.net/aspop} | -| BGPKIT | as2rel, peer-stats, pfx2as | https://data.bgpkit.com} | -| BGP.Tools | AS names, AS tags | https://bgp.tools/kb/api} | -| | Anycast prefix tags | https://github.com/bgptools/anycast-prefixes} | -| CAIDA | AS Rank | https://doi.org/10.21986/CAIDA.DATA.AS-RANK} | -| | IXPs Dataset | https://www.caida.org/catalog/datasets/ixps} | -| Cisco | 
Umbrella Popularity List | https://s3-us-west-1.amazonaws.com/umbrella-static/index.html} | -| Citizen Lab | URL testing lists | https://github.com/citizenlab/test-lists} | -| Cloudflare | Cloudflare Radar API endpoints radar/dns/top/ases, radar/dns/top/locations, radar/ranking/top, radar/datasets | https://radar.cloudflare.com}} | +| Alice-LG | IXP route server looking glass snapshots | https://github.com/alice-lg/alice-lg | +| | AMS-IX | https://lg.ams-ix.net | +| | BCIX | https://lg.bcix.de | +| | DE-CIX | https://lg.de-cix.net | +| | IX.br | https://lg.ix.br | +| | LINX | https://alice-rs.linx.net | +| | Megaport | https://lg.megaport.com | +| | Netnod | https://lg.netnod.se | +| APNIC | AS population estimate | https://stats.labs.apnic.net/aspop | +| BGPKIT | as2rel, peer-stats, pfx2as | https://data.bgpkit.com | +| BGP.Tools | AS names, AS tags | https://bgp.tools/kb/api | +| | Anycast prefix tags | https://github.com/bgptools/anycast-prefixes | +| CAIDA | AS Rank | https://doi.org/10.21986/CAIDA.DATA.AS-RANK | +| | IXPs Dataset | https://doi.org/10.21986/CAIDA.DATA.IXPS | +| | AS Relationships | https://catalog.caida.org/dataset/as_relationships_serial_1 | +| Cisco | Umbrella Popularity List | https://s3-us-west-1.amazonaws.com/umbrella-static/index.html | +| Citizen Lab | URL testing lists | https://github.com/citizenlab/test-lists | +| Cloudflare | Cloudflare Radar API endpoints radar/dns/top/ases, radar/dns/top/locations, radar/ranking/top, radar/datasets | https://radar.cloudflare.com | | | | -| Emile Aben | AS names | https://github.com/emileaben/asnames} | -| IHR | Country Dependency, AS Hegemony, ROV | https://ihr.iijlab.net} | -| Internet Intelligence Lab | AS to Organization Mapping | https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping} | -| NRO | Extended allocation and assignment reports | https://www.nro.net/about/rirs/statistics} | -| OpenINTEL | tranco1m, umbrella1m, ns | https://data.openintel.nl/data} | -| | DNS Dependency Graph | 
https://dnsgraph.dacs.utwente.nl} | -| Packet Clearing House | Daily routing snapshots | https://www.pch.net/resources/Routing\_Data} | -| PeeringDB | API endpoints: fac, ix, ixlan, netfac, org | https://www.peeringdb.com} | -| RIPE NCC | AS names, RPKI | https://ftp.ripe.net/ripe} | -| | RIPE Atlas measurement information | https://atlas.ripe.net} | -| SimulaMet | rDNS data | https://rir-data.org} | -| Stanford | ASdb dataset | https://asdb.stanford.edu} | -| Tranco | Tranco list | https://tranco-list.eu} | -| Virginia Tech | RoVista | https://rovista.netsecurelab.org} | -| World Bank | Indicators API: Country Population Indicator | https://www.worldbank.org} | +| Emile Aben | AS names | https://github.com/emileaben/asnames | +| IHR | Country Dependency, AS Hegemony, ROV | https://ihr.iijlab.net | +| Internet Intelligence Lab | AS to Organization Mapping | https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping | +| NRO | Extended allocation and assignment reports | https://www.nro.net/about/rirs/statistics | +| OpenINTEL | tranco1m, umbrella1m, ns | https://data.openintel.nl/data | +| | DNS Dependency Graph | https://dnsgraph.dacs.utwente.nl | +| Packet Clearing House | Daily routing snapshots | https://www.pch.net/resources/Routing_Data | +| PeeringDB | API endpoints: fac, ix, ixlan, netfac, org | https://www.peeringdb.com | +| RIPE NCC | AS names, RPKI | https://ftp.ripe.net/ripe | +| | RIPE Atlas measurement information | https://atlas.ripe.net | +| SimulaMet | rDNS data | https://rir-data.org | +| Stanford | ASdb dataset | https://asdb.stanford.edu | +| Tranco | Tranco list | https://tranco-list.eu | +| Virginia Tech | RoVista | https://rovista.netsecurelab.org | +| World Bank | Indicators API: Country Population Indicator | https://www.worldbank.org | diff --git a/iyp/crawlers/caida/README.md b/iyp/crawlers/caida/README.md index 24767790..37c93280 100644 --- a/iyp/crawlers/caida/README.md +++ b/iyp/crawlers/caida/README.md @@ -1,6 +1,7 @@ # CAIDA -- 
https://caida.org ## ASRank (asrank.py) + AS rank in terms of customer cone size, meaning that large transit providers are higher ranked. @@ -11,7 +12,8 @@ Ranking: Connect ASes nodes to a single ranking node corresponding to ASRank. The rank is given as a link attribute. For example: -``` + +```cypher (:AS {asn:2497})-[:RANK {rank:87}]-(:Ranking {name:'CAIDA ASRank'}) ``` @@ -19,7 +21,7 @@ Country: Connect AS to country nodes, meaning that the AS is registered in that country. -``` +```cypher (:AS)-[:COUNTRY]-(:Country) ``` @@ -27,7 +29,8 @@ AS name: Connect AS to names nodes, providing the name of an AS. For example: -``` + +```cypher (:AS {asn:2497})-[:NAME]-(:Name {name:'IIJ'}) ``` @@ -35,14 +38,14 @@ For example: The asrank crawler is not depending on other crawlers. - ## IXPs (ixs.py) + List of IXPs obtained from PeeringDB, Hurricane Electric, Packet Clearing House. ### Graph representation Nodes: - + - `(:IXP {name})`: IXP node - `(:Name {name})`: Name of IXP - `(:Prefix {prefix})`: Prefix of IXP peering LAN @@ -61,21 +64,46 @@ Relationships: ``` ### Dependence -The ixs crawler depends on the peeringdb.ix crawler. +The ixs crawler depends on the peeringdb.ix crawler. ## IXP memberships (ix_asns.py) -List of ASes present at each IXP. +List of ASes present at each IXP. ### Graph representation Relationships: -```Cypher +```cypher (:AS)-[:MEMBER_OF]->(:IXP) ``` +### Dependence + +The ix_asns crawler depends on the ixs crawler. + +## AS relationships (as_relationships_v[4|6].py) + +Inferred AS relationships (peer-to-peer or customer-provider). + +### Graph representation + +```cypher +(:AS {asn: 2497})-[r:PEERS_WITH {af: 4, rel: -1}]->(:AS {asn: 7500}) +``` + +Either the `reference_name` or `af` properties can be used to distinguish between IPv4 +and IPv6. + +`rel: -1` indicates customer-provider, and the direction of the relationship is modeled +as `provider -> customer` to be consistent with `bgpkit.as2rel`. + +`rel: 0` indicates peer-to-peer relationship. 
+The as_relationships crawler does not depend on other crawlers.
bz2.open(BytesIO(req.content), 'rb') as f: + text = f.read().decode() + + ases = set() + peers_with_links = list() + for line in text.splitlines(): + if line.startswith('#'): + continue + left_asn, right_asn, kind = map(int, line.split('|')) + ases.add(left_asn) + ases.add(right_asn) + peers_with_links.append({'src_id': left_asn, 'dst_id': right_asn, + 'props': [self.reference, {'rel': kind, 'af': self.af}]}) + + as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', ases) + + for link in peers_with_links: + link['src_id'] = as_id[link['src_id']] + link['dst_id'] = as_id[link['dst_id']] + + self.iyp.batch_add_links('PEERS_WITH', peers_with_links) + + def unit_test(self): + return super().unit_test(['PEERS_WITH']) diff --git a/iyp/crawlers/caida/as_relationships_v4.py b/iyp/crawlers/caida/as_relationships_v4.py new file mode 100644 index 00000000..d9146bc6 --- /dev/null +++ b/iyp/crawlers/caida/as_relationships_v4.py @@ -0,0 +1,45 @@ +import argparse +import logging +import os +import sys + +from iyp.crawlers.caida import ASRelCrawler + +URL = 'https://publicdata.caida.org/datasets/as-relationships/serial-1/' +ORG = 'CAIDA' +NAME = 'caida.as_relationships_v4' + + +class Crawler(ASRelCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name, 4) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--unit-test', action='store_true') + args = parser.parse_args() + + scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3] + FORMAT = '%(asctime)s %(levelname)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename='log/' + scriptname + '.log', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + + logging.info(f'Started: {sys.argv}') + + crawler = Crawler(ORG, URL, NAME) + if args.unit_test: + crawler.unit_test() + else: + crawler.run() + crawler.close() + logging.info(f'Finished: {sys.argv}') + + +if __name__ == '__main__': + main() + sys.exit(0) diff --git 
a/iyp/crawlers/caida/as_relationships_v6.py b/iyp/crawlers/caida/as_relationships_v6.py new file mode 100644 index 00000000..df954bde --- /dev/null +++ b/iyp/crawlers/caida/as_relationships_v6.py @@ -0,0 +1,45 @@ +import argparse +import logging +import os +import sys + +from iyp.crawlers.caida import ASRelCrawler + +URL = 'https://publicdata.caida.org/datasets/as-relationships/serial-1/' +ORG = 'CAIDA' +NAME = 'caida.as_relationships_v6' + + +class Crawler(ASRelCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name, 6) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--unit-test', action='store_true') + args = parser.parse_args() + + scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3] + FORMAT = '%(asctime)s %(levelname)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename='log/' + scriptname + '.log', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + + logging.info(f'Started: {sys.argv}') + + crawler = Crawler(ORG, URL, NAME) + if args.unit_test: + crawler.unit_test() + else: + crawler.run() + crawler.close() + logging.info(f'Finished: {sys.argv}') + + +if __name__ == '__main__': + main() + sys.exit(0)