Query function rework #91

Merged: 5 commits, Dec 19, 2023
5 changes: 5 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -6,6 +6,11 @@ labels: ""
 assignees: ""
 ---

+*NOTE (Delete after reading): There is no need to open bug reports based on
+error messages in the log of the weekly database dump. We usually notice them
+and can judge whether a simple rerun of the crawler suffices (e.g., due to a
+temporary connectivity issue) or whether there is a bug in the crawler.*
+
 **Describe the bug**
 A clear and concise description of what the bug is.

403 changes: 250 additions & 153 deletions iyp/__init__.py

Large diffs are not rendered by default.
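
The heart of the rework is in this file, but the diff is not rendered.
Judging from the call sites in the rest of this PR, the new interface
plausibly looks like the following sketch. This is a reconstruction from
usage, not the merged code: parameter names other than id_properties and
all, the defaults, and the docstring semantics are assumptions.

    # Sketch of the reworked IYP query interface, inferred from the call
    # sites changed in this PR. Not the actual implementation.
    class IYP:
        def get_node(self, label, properties, id_properties=None):
            """Return the ID of the node with the given label and
            properties, creating the node if it does not exist (creation
            is now the default; the old create=True flag is gone). If
            id_properties is given, only those properties are used to
            match the node; the remaining properties are then set on it."""
            ...

        def batch_get_nodes_by_single_prop(self, label, prop, values=set(), all=True):
            """Bulk version of get_node for nodes identified by a single
            property (formerly batch_get_nodes). Returns a dict mapping
            each property value to its node ID. With all=True the map also
            covers every existing node with this label; with all=False it
            is restricted to the requested values."""
            ...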

8 changes: 4 additions & 4 deletions iyp/crawlers/apnic/eyeball.py
@@ -32,8 +32,8 @@ def run(self):
            logging.info(f'processing {country}')

            # Get the QID of the country and corresponding ranking
-           cc_qid = self.iyp.get_node('Country', {'country_code': cc}, create=True)
-           ranking_qid = self.iyp.get_node('Ranking', {'name': f'APNIC eyeball estimates ({cc})'}, create=True)
+           cc_qid = self.iyp.get_node('Country', {'country_code': cc})
+           ranking_qid = self.iyp.get_node('Ranking', {'name': f'APNIC eyeball estimates ({cc})'})
            statements = [['COUNTRY', cc_qid, self.reference]]
            self.iyp.add_links(ranking_qid, statements)

@@ -57,8 +57,8 @@ def run(self):
                names.add(asn['autnum'])

            # Get node IDs
-           self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
-           self.name_id = self.iyp.batch_get_nodes('Name', 'name', names, all=False)
+           self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
+           self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)

            # Compute links
            country_links = []

2 changes: 1 addition & 1 deletion iyp/crawlers/bgpkit/as2rel.py
@@ -34,7 +34,7 @@ def run(self):
            rels.append(rel)

        # get ASNs IDs
-       self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
+       self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)

        # Compute links
        links = []

5 changes: 2 additions & 3 deletions iyp/crawlers/bgpkit/peerstats.py
@@ -63,8 +63,7 @@ def run(self):
            stats = json.load(bz2.open(req.raw))
            collector_qid = self.iyp.get_node(
                'BGPCollector',
-               {'name': stats['collector'], 'project': stats['project']},
-               create=True
+               {'name': stats['collector'], 'project': stats['project']}
            )
            self.reference['reference_url'] = url

@@ -75,7 +74,7 @@ def run(self):
                asns.add(peer['asn'])

            # get ASNs' IDs
-           self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
+           self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)

            # Compute links
            links = []

4 changes: 2 additions & 2 deletions iyp/crawlers/bgpkit/pfx2asn.py
@@ -37,8 +37,8 @@ def run(self):

        logging.info('Pushing nodes to neo4j...\n')
        # get ASNs and prefixes IDs
-       self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
-       self.prefix_id = self.iyp.batch_get_nodes('Prefix', 'prefix', prefixes)
+       self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
+       self.prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes)

        # Compute links
        links = []

4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/anycast_prefixes.py
@@ -76,8 +76,8 @@ def update(self, res, filename: str):
            prefixes.add(line)
            lines.append(line)

-       prefix_id = self.iyp.batch_get_nodes('Prefix', 'prefix', prefixes)
-       tag_id = self.iyp.get_node('Tag', {'label': 'Anycast'}, create=True)
+       prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes)
+       tag_id = self.iyp.get_node('Tag', {'label': 'Anycast'})

        links = []
        for line in lines:

4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/as_names.py
@@ -46,8 +46,8 @@ def run(self):
            lines.append([asn, name])

        # get ASNs and names IDs
-       self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
-       self.name_id = self.iyp.batch_get_nodes('Name', 'name', names)
+       self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
+       self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names)

        # Compute links
        links = []

4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/tags.py
@@ -63,15 +63,15 @@ def run(self):
            print(req.text)
            sys.exit('Error while fetching AS names')

-       self.tag_qid = self.iyp.get_node('Tag', {'label': label}, create=True)
+       self.tag_qid = self.iyp.get_node('Tag', {'label': label})
        for line in req.text.splitlines():
            # skip header
            if line.startswith('asn'):
                continue

            # Parse given line to get ASN, name, and country code
            asn, _, _ = line.partition(',')
-           asn_qid = self.iyp.get_node('AS', {'asn': asn[2:]}, create=True)
+           asn_qid = self.iyp.get_node('AS', {'asn': asn[2:]})
            statements = [['CATEGORIZED', self.tag_qid, self.reference]]  # Set AS name

            try:

8 changes: 4 additions & 4 deletions iyp/crawlers/caida/asrank.py
@@ -59,10 +59,10 @@ def run(self):
        # Get/create ASNs, names, and country nodes
        print('Pushing nodes.', file=sys.stderr)
        logging.info('Pushing nodes.')
-       self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
-       self.country_id = self.iyp.batch_get_nodes('Country', 'country_code', countries)
-       self.name_id = self.iyp.batch_get_nodes('Name', 'name', names, all=False)
-       self.asrank_qid = self.iyp.get_node('Ranking', {'name': 'CAIDA ASRank'}, create=True)
+       self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
+       self.country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries)
+       self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)
+       self.asrank_qid = self.iyp.get_node('Ranking', {'name': 'CAIDA ASRank'})

        # Compute links
        country_links = list()

4 changes: 2 additions & 2 deletions iyp/crawlers/cisco/umbrella_top1M.py
@@ -20,7 +20,7 @@ class Crawler(BaseCrawler):
    def run(self):
        """Fetch Umbrella top 1M and push to IYP."""

-       self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'}, create=True)
+       self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'})

        sys.stderr.write('Downloading latest list...\n')
        req = requests.get(URL)
@@ -40,7 +40,7 @@ def run(self):
            links.append({'src_name': domain, 'dst_id': self.cisco_qid,
                          'props': [self.reference, {'rank': int(rank)}]})

-       name_id = self.iyp.batch_get_nodes('DomainName', 'name', domains)
+       name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains)

        for link in links:
            link['src_id'] = name_id[link['src_name']]

4 changes: 2 additions & 2 deletions iyp/crawlers/citizenlab/urldb.py
@@ -68,8 +68,8 @@ def run(self):
                continue
            lines.append([url, category])

-       url_id = self.iyp.batch_get_nodes('URL', 'url', urls)
-       category_id = self.iyp.batch_get_nodes('Tag', 'label', categories)
+       url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url', urls)
+       category_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', categories)

        links = []
        for (url, category) in lines:

2 changes: 1 addition & 1 deletion iyp/crawlers/cloudflare/dns_top_ases.py
@@ -18,7 +18,7 @@ class Crawler(Crawler):
    def run(self):
        """Push data to IYP."""

-       self.as_id = self.iyp.batch_get_nodes('AS', 'asn')
+       self.as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')

        super().run()
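
A detail worth noting in this crawler: the batch call passes no value set at
all. Presumably (an assumption; the reworked signature is not rendered in
this PR) the call then returns the ID map of the AS nodes already in the
database without creating anything, which is what a top-ASes crawler wants:

    # Assumed behavior when no values are passed: fetch the IDs of all
    # existing 'AS' nodes; nothing new is created.
    as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')
    node_id = as_id.get(2497)  # node ID for AS2497, or None if it is
                               # not in IYP yet (key type assumed)
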
2 changes: 1 addition & 1 deletion iyp/crawlers/cloudflare/dns_top_locations.py
@@ -103,7 +103,7 @@ def run(self):
        # FIXME this should be called before/separately
        self.fetch()

-       self.country_id = self.iyp.batch_get_nodes('Country', 'country_code')
+       self.country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code')
        self.statements = []

        tmp_dir = self.get_tmp_dir()

4 changes: 2 additions & 2 deletions iyp/crawlers/cloudflare/ranking_bucket.py
@@ -83,7 +83,7 @@ def run(self):
        # domain_ids, but iterate over the domains set instead.
        logging.info(f'Adding/retrieving {len(all_domains)} DomainName nodes.')
        print(f'Adding/retrieving {len(all_domains)} DomainName nodes')
-       domain_ids = self.iyp.batch_get_nodes('DomainName', 'name', all_domains)
+       domain_ids = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', all_domains)

        for dataset, domains in datasets:
            dataset_title = f'Cloudflare {dataset["title"]}'
@@ -96,7 +96,7 @@ def run(self):
                    'description': dataset['description'],
                    'top': dataset['meta']['top']
                },
-               create=True)
+               id_properties={'name'})

            # Create RANK relationships
            domain_links = [{'src_id': domain_ids[domain], 'dst_id': ranking_id, 'props': [self.reference]}
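
This file shows the second half of the rework: id_properties. The Ranking
node carries name, description, and top, but only name identifies it. The
call below is taken from this diff; the comments spell out the assumed
semantics:

    # Match (or create) the Ranking node by 'name' only, then set the
    # remaining properties on it. Without id_properties, a changed
    # 'description' or 'top' value would presumably match nothing and
    # create a duplicate node instead of updating the existing one.
    ranking_id = self.iyp.get_node(
        'Ranking',
        {
            'name': dataset_title,
            'description': dataset['description'],
            'top': dataset['meta']['top']
        },
        id_properties={'name'})
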
4 changes: 2 additions & 2 deletions iyp/crawlers/cloudflare/top100.py
@@ -26,7 +26,7 @@ def run(self):
        """Fetch data and push to IYP."""

        self.cf_qid = self.iyp.get_node(
-           'Ranking', {'name': 'Cloudflare top 100 domains'}, create=True)
+           'Ranking', {'name': 'Cloudflare top 100 domains'})

        # Fetch data
        headers = {
@@ -52,7 +52,7 @@ def update(self, entry):

        # Commit to IYP
        # Get the AS's node ID (create if it is not yet registered) and commit changes
-       domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']}, create=True)
+       domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']})
        self.iyp.add_links(domain_qid, statements)

4 changes: 2 additions & 2 deletions iyp/crawlers/emileaben/as_names.py
@@ -48,8 +48,8 @@ def run(self):
            as_names.add(as_name)
            lines.append(values)

-       asns_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
-       as_names_id = self.iyp.batch_get_nodes('Name', 'name', as_names, all=False)
+       asns_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
+       as_names_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', as_names, all=False)

        links = []

5 changes: 2 additions & 3 deletions iyp/crawlers/example/crawler.py
@@ -44,16 +44,15 @@ def update(self, one_line):
            {
                'example_property_0': value,
                'example_property_1': value,
-           },
-           create=True
+           }
        )

        # set relationship
        statements = [['EXAMPLE_RELATIONSHIP_LABEL', val_qid, self.reference]]

        # Commit to IYP
        # Get the AS's node ID (create if it is not yet registered) and commit changes
-       as_qid = self.iyp.get_node('AS', {'asn': asn}, create=True)
+       as_qid = self.iyp.get_node('AS', {'asn': asn})
        self.iyp.add_links(as_qid, statements)
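
Since this is the template crawler, it is the best place to see the overall
pattern after the rework. A minimal sketch of a crawler body under the same
assumptions as above (example data; only calls that appear elsewhere in this
PR are used):

    def run(self):
        asns = {2497, 2500}  # ASNs extracted from a hypothetical data source
        # Bulk-create/fetch AS nodes; all=False restricts the map to these ASNs.
        asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
        # Single-node lookup; the node is created if missing, no create=True needed.
        tag_qid = self.iyp.get_node('Tag', {'label': 'Example'})
        for asn in asns:
            statements = [['CATEGORIZED', tag_qid, self.reference]]
            self.iyp.add_links(asn_id[asn], statements)
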
6 changes: 3 additions & 3 deletions iyp/crawlers/ihr/__init__.py
@@ -64,7 +64,7 @@ def run(self):
        self.csv = lz4Csv(local_filename)

        self.timebin = None
-       asn_id = self.iyp.batch_get_nodes('AS', 'asn', set())
+       asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', set())

        links = []

@@ -83,11 +83,11 @@ def run(self):

            originasn = int(rec['originasn'])
            if originasn not in asn_id:
-               asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn}, create=True)
+               asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn})

            asn = int(rec['asn'])
            if asn not in asn_id:
-               asn_id[asn] = self.iyp.get_node('AS', {'asn': asn}, create=True)
+               asn_id[asn] = self.iyp.get_node('AS', {'asn': asn})

            links.append({
                'src_id': asn_id[originasn],

8 changes: 3 additions & 5 deletions iyp/crawlers/ihr/country_dependency.py
@@ -62,8 +62,7 @@ def run(self):
            country_qid = self.iyp.get_node('Country',
                                            {
                                                'country_code': cc,
-                                           },
-                                           create=True
+                                           }
                                            )

            countryrank_statements = []
@@ -81,8 +80,7 @@ def run(self):
            for metric, weight in [('Total eyeball', 'eyeball'), ('Total AS', 'as')]:

                self.countryrank_qid = self.iyp.get_node('Ranking',
-                                                        {'name': f'IHR country ranking: {metric} ({cc})'},
-                                                        create=True
+                                                        {'name': f'IHR country ranking: {metric} ({cc})'}
                                                         )
                self.iyp.add_links(self.countryrank_qid, countryrank_statements)

@@ -101,7 +99,7 @@ def run(self):
                asns.add(asn['asn'])
                asn['rank'] = i + 1

-           self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
+           self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)

            # Compute links
            for asn in selected:

20 changes: 10 additions & 10 deletions iyp/crawlers/ihr/rov.py
@@ -74,10 +74,10 @@ def run(self):
        self.csv = lz4Csv(local_filename)

        logging.warning('Getting node IDs from neo4j...\n')
-       asn_id = self.iyp.batch_get_nodes('AS', 'asn')
-       prefix_id = self.iyp.batch_get_nodes('Prefix', 'prefix')
-       tag_id = self.iyp.batch_get_nodes('Tag', 'label')
-       country_id = self.iyp.batch_get_nodes('Country', 'country_code')
+       asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')
+       prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix')
+       tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label')
+       country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code')

        orig_links = []
        tag_links = []
@@ -98,26 +98,26 @@ def run(self):

            prefix = rec['prefix']
            if prefix not in prefix_id:
-               prefix_id[prefix] = self.iyp.get_node('Prefix', {'prefix': prefix}, create=True)
+               prefix_id[prefix] = self.iyp.get_node('Prefix', {'prefix': prefix})

            # make status/country/origin links only for lines where asn=originasn
            if rec['asn_id'] == rec['originasn_id']:
                # Make sure all nodes exist
                originasn = int(rec['originasn_id'])
                if originasn not in asn_id:
-                   asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn}, create=True)
+                   asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn})

                rpki_status = 'RPKI ' + rec['rpki_status']
                if rpki_status not in tag_id:
-                   tag_id[rpki_status] = self.iyp.get_node('Tag', {'label': rpki_status}, create=True)
+                   tag_id[rpki_status] = self.iyp.get_node('Tag', {'label': rpki_status})

                irr_status = 'IRR ' + rec['irr_status']
                if irr_status not in tag_id:
-                   tag_id[irr_status] = self.iyp.get_node('Tag', {'label': irr_status}, create=True)
+                   tag_id[irr_status] = self.iyp.get_node('Tag', {'label': irr_status})

                cc = rec['country_id']
                if cc not in country_id:
-                   country_id[cc] = self.iyp.get_node('Country', {'country_code': cc}, create=True)
+                   country_id[cc] = self.iyp.get_node('Country', {'country_code': cc})

                # Compute links
                orig_links.append({
@@ -147,7 +147,7 @@ def run(self):
            # Dependency links
            asn = int(rec['asn_id'])
            if asn not in asn_id:
-               asn_id[asn] = self.iyp.get_node('AS', {'asn': asn}, create=True)
+               asn_id[asn] = self.iyp.get_node('AS', {'asn': asn})

            dep_links.append({
                'src_id': prefix_id[prefix],

4 changes: 2 additions & 2 deletions iyp/crawlers/inetintel/as_org.py
@@ -108,8 +108,8 @@ def run(self):
            batch_lines.append([asn, url, sibling_asns, pdb_orgs])
            count_rows += 1

-       asn_id = self.iyp.batch_get_nodes('AS', 'asn', batch_asns, all=False)
-       url_id = self.iyp.batch_get_nodes('URL', 'url', batch_urls, all=False)
+       asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', batch_asns, all=False)
+       url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url', batch_urls, all=False)

        asn_to_url_links = []
        asn_to_sibling_asn_links = []

9 changes: 4 additions & 5 deletions iyp/crawlers/manrs/members.py
@@ -23,8 +23,7 @@ def __init__(self, organization, url, name):

        self.manrs_qid = self.iyp.get_node(
            'Organization',
-           {'name': 'MANRS'},
-           create=True
+           {'name': 'MANRS'}
        )

        # Actions defined by MANRS
@@ -55,7 +54,7 @@ def __init__(self, organization, url, name):
                'name': action['label'],
                'description': action['description']
            },
-           create=True
+           id_properties={'name'}
        )

        # Reference information for data pushed to IYP
@@ -94,7 +93,7 @@ def update_net(self, one_line):

        # set countries
        for cc in areas.split(';'):
-           country_qid = self.iyp.get_node('Country', {'country_code': cc}, create=True)
+           country_qid = self.iyp.get_node('Country', {'country_code': cc})
            statements.append(['COUNTRY', country_qid, self.reference])

        # set actions
@@ -106,7 +105,7 @@ def update_net(self, one_line):
        for asn in asns.split(';'):
            if asn:  # ignore organizations with no ASN
                # Get the AS QID (create if AS is not yet registered) and commit changes
-               as_qid = self.iyp.get_node('AS', {'asn': str(asn)}, create=True)
+               as_qid = self.iyp.get_node('AS', {'asn': str(asn)})
                self.iyp.add_links(as_qid, statements)