Query function rework #91

Merged: 5 commits, Dec 19, 2023
5 changes: 5 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -6,6 +6,11 @@ labels: ""
 assignees: ""
 ---

+*NOTE (Delete after reading): There is no need to open bug reports based on
+error messages in the log of the weekly database dump. We usually notice them
+and can judge whether a simple rerun of the crawler suffices (e.g., due to a
+temporary connectivity issue) or whether there is a bug in the crawler.*
+
 **Describe the bug**
 A clear and concise description of what the bug is.

403 changes: 250 additions & 153 deletions iyp/__init__.py

Large diffs are not rendered by default.
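
The heart of the rework is in this file, but the diff is not rendered.
Judging from the call sites in the rest of this PR, the new interface
plausibly looks like the following sketch. This is a reconstruction from
usage, not the merged code: parameter names other than id_properties and
all, the defaults, and the docstring semantics are assumptions.

    # Sketch of the reworked IYP query interface, inferred from the call
    # sites changed in this PR. Not the actual implementation.
    class IYP:
        def get_node(self, label, properties, id_properties=None):
            """Return the ID of the node with the given label and
            properties, creating the node if it does not exist (creation
            is now the default; the old create=True flag is gone). If
            id_properties is given, only those properties are used to
            match the node; the remaining properties are then set on it."""
            ...

        def batch_get_nodes_by_single_prop(self, label, prop, values=set(), all=True):
            """Bulk version of get_node for nodes identified by a single
            property (formerly batch_get_nodes). Returns a dict mapping
            each property value to its node ID. With all=True the map also
            covers every existing node with this label; with all=False it
            is restricted to the requested values."""
            ...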

8 changes: 4 additions & 4 deletions iyp/crawlers/apnic/eyeball.py
@@ -32,8 +32,8 @@ def run(self):
            logging.info(f'processing {country}')

            # Get the QID of the country and corresponding ranking
-           cc_qid = self.iyp.get_node('Country', {'country_code': cc}, create=True)
-           ranking_qid = self.iyp.get_node('Ranking', {'name': f'APNIC eyeball estimates ({cc})'}, create=True)
+           cc_qid = self.iyp.get_node('Country', {'country_code': cc})
+           ranking_qid = self.iyp.get_node('Ranking', {'name': f'APNIC eyeball estimates ({cc})'})
            statements = [['COUNTRY', cc_qid, self.reference]]
            self.iyp.add_links(ranking_qid, statements)

@@ -57,8 +57,8 @@ def run(self):
                names.add(asn['autnum'])

            # Get node IDs
-           self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
-           self.name_id = self.iyp.batch_get_nodes('Name', 'name', names, all=False)
+           self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
+           self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)

            # Compute links
            country_links = []

2 changes: 1 addition & 1 deletion iyp/crawlers/bgpkit/as2rel.py
@@ -34,7 +34,7 @@ def run(self):
            rels.append(rel)

        # get ASNs IDs
-       self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
+       self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)

        # Compute links
        links = []

5 changes: 2 additions & 3 deletions iyp/crawlers/bgpkit/peerstats.py
@@ -63,8 +63,7 @@ def run(self):
            stats = json.load(bz2.open(req.raw))
            collector_qid = self.iyp.get_node(
                'BGPCollector',
-               {'name': stats['collector'], 'project': stats['project']},
-               create=True
+               {'name': stats['collector'], 'project': stats['project']}
            )
            self.reference['reference_url'] = url

@@ -75,7 +74,7 @@ def run(self):
                asns.add(peer['asn'])

            # get ASNs' IDs
-           self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
+           self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)

            # Compute links
            links = []

4 changes: 2 additions & 2 deletions iyp/crawlers/bgpkit/pfx2asn.py
@@ -37,8 +37,8 @@ def run(self):

        logging.info('Pushing nodes to neo4j...\n')
        # get ASNs and prefixes IDs
-       self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
-       self.prefix_id = self.iyp.batch_get_nodes('Prefix', 'prefix', prefixes)
+       self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
+       self.prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes)

        # Compute links
        links = []

4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/anycast_prefixes.py
@@ -76,8 +76,8 @@ def update(self, res, filename: str):
            prefixes.add(line)
            lines.append(line)

-       prefix_id = self.iyp.batch_get_nodes('Prefix', 'prefix', prefixes)
-       tag_id = self.iyp.get_node('Tag', {'label': 'Anycast'}, create=True)
+       prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes)
+       tag_id = self.iyp.get_node('Tag', {'label': 'Anycast'})

        links = []
        for line in lines:

4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/as_names.py
@@ -46,8 +46,8 @@ def run(self):
            lines.append([asn, name])

        # get ASNs and names IDs
-       self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
-       self.name_id = self.iyp.batch_get_nodes('Name', 'name', names)
+       self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
+       self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names)

        # Compute links
        links = []

4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/tags.py
@@ -63,15 +63,15 @@ def run(self):
            print(req.text)
            sys.exit('Error while fetching AS names')

-       self.tag_qid = self.iyp.get_node('Tag', {'label': label}, create=True)
+       self.tag_qid = self.iyp.get_node('Tag', {'label': label})
        for line in req.text.splitlines():
            # skip header
            if line.startswith('asn'):
                continue

            # Parse given line to get ASN, name, and country code
            asn, _, _ = line.partition(',')
-           asn_qid = self.iyp.get_node('AS', {'asn': asn[2:]}, create=True)
+           asn_qid = self.iyp.get_node('AS', {'asn': asn[2:]})
            statements = [['CATEGORIZED', self.tag_qid, self.reference]]  # Set AS name

            try:

8 changes: 4 additions & 4 deletions iyp/crawlers/caida/asrank.py
@@ -59,10 +59,10 @@ def run(self):
        # Get/create ASNs, names, and country nodes
        print('Pushing nodes.', file=sys.stderr)
        logging.info('Pushing nodes.')
-       self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
-       self.country_id = self.iyp.batch_get_nodes('Country', 'country_code', countries)
-       self.name_id = self.iyp.batch_get_nodes('Name', 'name', names, all=False)
-       self.asrank_qid = self.iyp.get_node('Ranking', {'name': 'CAIDA ASRank'}, create=True)
+       self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
+       self.country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries)
+       self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)
+       self.asrank_qid = self.iyp.get_node('Ranking', {'name': 'CAIDA ASRank'})

        # Compute links
        country_links = list()

4 changes: 2 additions & 2 deletions iyp/crawlers/cisco/umbrella_top1M.py
@@ -20,7 +20,7 @@ class Crawler(BaseCrawler):
    def run(self):
        """Fetch Umbrella top 1M and push to IYP."""

-       self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'}, create=True)
+       self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'})

        sys.stderr.write('Downloading latest list...\n')
        req = requests.get(URL)
@@ -40,7 +40,7 @@ def run(self):
            links.append({'src_name': domain, 'dst_id': self.cisco_qid,
                          'props': [self.reference, {'rank': int(rank)}]})

-       name_id = self.iyp.batch_get_nodes('DomainName', 'name', domains)
+       name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains)

        for link in links:
            link['src_id'] = name_id[link['src_name']]

4 changes: 2 additions & 2 deletions iyp/crawlers/citizenlab/urldb.py
@@ -68,8 +68,8 @@ def run(self):
                continue
            lines.append([url, category])

-       url_id = self.iyp.batch_get_nodes('URL', 'url', urls)
-       category_id = self.iyp.batch_get_nodes('Tag', 'label', categories)
+       url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url', urls)
+       category_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', categories)

        links = []
        for (url, category) in lines:

2 changes: 1 addition & 1 deletion iyp/crawlers/cloudflare/dns_top_ases.py
@@ -18,7 +18,7 @@ class Crawler(Crawler):
    def run(self):
        """Push data to IYP."""

-       self.as_id = self.iyp.batch_get_nodes('AS', 'asn')
+       self.as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')

        super().run()
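
A detail worth noting in this crawler: the batch call passes no value set at
all. Presumably (an assumption; the reworked signature is not rendered in
this PR) the call then returns the ID map of the AS nodes already in the
database without creating anything, which is what a top-ASes crawler wants:

    # Assumed behavior when no values are passed: fetch the IDs of all
    # existing 'AS' nodes; nothing new is created.
    as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')
    node_id = as_id.get(2497)  # node ID for AS2497, or None if it is
                               # not in IYP yet (key type assumed)
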
2 changes: 1 addition & 1 deletion iyp/crawlers/cloudflare/dns_top_locations.py
@@ -103,7 +103,7 @@ def run(self):
        # FIXME this should be called before/separately
        self.fetch()

-       self.country_id = self.iyp.batch_get_nodes('Country', 'country_code')
+       self.country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code')
        self.statements = []

        tmp_dir = self.get_tmp_dir()

4 changes: 2 additions & 2 deletions iyp/crawlers/cloudflare/ranking_bucket.py
@@ -83,7 +83,7 @@ def run(self):
        # domain_ids, but iterate over the domains set instead.
        logging.info(f'Adding/retrieving {len(all_domains)} DomainName nodes.')
        print(f'Adding/retrieving {len(all_domains)} DomainName nodes')
-       domain_ids = self.iyp.batch_get_nodes('DomainName', 'name', all_domains)
+       domain_ids = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', all_domains)

        for dataset, domains in datasets:
            dataset_title = f'Cloudflare {dataset["title"]}'
@@ -96,7 +96,7 @@ def run(self):
                    'description': dataset['description'],
                    'top': dataset['meta']['top']
                },
-               create=True)
+               id_properties={'name'})

            # Create RANK relationships
            domain_links = [{'src_id': domain_ids[domain], 'dst_id': ranking_id, 'props': [self.reference]}
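
This file shows the second half of the rework: id_properties. The Ranking
node carries name, description, and top, but only name identifies it. The
call below is taken from this diff; the comments spell out the assumed
semantics:

    # Match (or create) the Ranking node by 'name' only, then set the
    # remaining properties on it. Without id_properties, a changed
    # 'description' or 'top' value would presumably match nothing and
    # create a duplicate node instead of updating the existing one.
    ranking_id = self.iyp.get_node(
        'Ranking',
        {
            'name': dataset_title,
            'description': dataset['description'],
            'top': dataset['meta']['top']
        },
        id_properties={'name'})
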
4 changes: 2 additions & 2 deletions iyp/crawlers/cloudflare/top100.py
@@ -26,7 +26,7 @@ def run(self):
        """Fetch data and push to IYP."""

        self.cf_qid = self.iyp.get_node(
-           'Ranking', {'name': 'Cloudflare top 100 domains'}, create=True)
+           'Ranking', {'name': 'Cloudflare top 100 domains'})

        # Fetch data
        headers = {
@@ -52,7 +52,7 @@ def update(self, entry):

        # Commit to IYP
        # Get the AS's node ID (create if it is not yet registered) and commit changes
-       domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']}, create=True)
+       domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']})
        self.iyp.add_links(domain_qid, statements)

4 changes: 2 additions & 2 deletions iyp/crawlers/emileaben/as_names.py
@@ -48,8 +48,8 @@ def run(self):
            as_names.add(as_name)
            lines.append(values)

-       asns_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
-       as_names_id = self.iyp.batch_get_nodes('Name', 'name', as_names, all=False)
+       asns_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
+       as_names_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', as_names, all=False)

        links = []

5 changes: 2 additions & 3 deletions iyp/crawlers/example/crawler.py
@@ -44,16 +44,15 @@ def update(self, one_line):
            {
                'example_property_0': value,
                'example_property_1': value,
-           },
-           create=True
+           }
        )

        # set relationship
        statements = [['EXAMPLE_RELATIONSHIP_LABEL', val_qid, self.reference]]

        # Commit to IYP
        # Get the AS's node ID (create if it is not yet registered) and commit changes
-       as_qid = self.iyp.get_node('AS', {'asn': asn}, create=True)
+       as_qid = self.iyp.get_node('AS', {'asn': asn})
        self.iyp.add_links(as_qid, statements)
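
Since this is the template crawler, it is the best place to see the overall
pattern after the rework. A minimal sketch of a crawler body under the same
assumptions as above (example data; only calls that appear elsewhere in this
PR are used):

    def run(self):
        asns = {2497, 2500}  # ASNs extracted from a hypothetical data source
        # Bulk-create/fetch AS nodes; all=False restricts the map to these ASNs.
        asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
        # Single-node lookup; the node is created if missing, no create=True needed.
        tag_qid = self.iyp.get_node('Tag', {'label': 'Example'})
        for asn in asns:
            statements = [['CATEGORIZED', tag_qid, self.reference]]
            self.iyp.add_links(asn_id[asn], statements)
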
6 changes: 3 additions & 3 deletions iyp/crawlers/ihr/__init__.py
@@ -64,7 +64,7 @@ def run(self):
        self.csv = lz4Csv(local_filename)

        self.timebin = None
-       asn_id = self.iyp.batch_get_nodes('AS', 'asn', set())
+       asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', set())

        links = []

@@ -83,11 +83,11 @@ def run(self):

            originasn = int(rec['originasn'])
            if originasn not in asn_id:
-               asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn}, create=True)
+               asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn})

            asn = int(rec['asn'])
            if asn not in asn_id:
-               asn_id[asn] = self.iyp.get_node('AS', {'asn': asn}, create=True)
+               asn_id[asn] = self.iyp.get_node('AS', {'asn': asn})

            links.append({
                'src_id': asn_id[originasn],

8 changes: 3 additions & 5 deletions iyp/crawlers/ihr/country_dependency.py
@@ -62,8 +62,7 @@ def run(self):
            country_qid = self.iyp.get_node('Country',
                                            {
                                                'country_code': cc,
-                                           },
-                                           create=True
+                                           }
                                            )

            countryrank_statements = []
@@ -81,8 +80,7 @@ def run(self):
            for metric, weight in [('Total eyeball', 'eyeball'), ('Total AS', 'as')]:

                self.countryrank_qid = self.iyp.get_node('Ranking',
-                                                        {'name': f'IHR country ranking: {metric} ({cc})'},
-                                                        create=True
+                                                        {'name': f'IHR country ranking: {metric} ({cc})'}
                                                         )
                self.iyp.add_links(self.countryrank_qid, countryrank_statements)

@@ -101,7 +99,7 @@ def run(self):
                asns.add(asn['asn'])
                asn['rank'] = i + 1

-           self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
+           self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)

            # Compute links
            for asn in selected:

20 changes: 10 additions & 10 deletions iyp/crawlers/ihr/rov.py
@@ -74,10 +74,10 @@ def run(self):
        self.csv = lz4Csv(local_filename)

        logging.warning('Getting node IDs from neo4j...\n')
-       asn_id = self.iyp.batch_get_nodes('AS', 'asn')
-       prefix_id = self.iyp.batch_get_nodes('Prefix', 'prefix')
-       tag_id = self.iyp.batch_get_nodes('Tag', 'label')
-       country_id = self.iyp.batch_get_nodes('Country', 'country_code')
+       asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')
+       prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix')
+       tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label')
+       country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code')

        orig_links = []
        tag_links = []
@@ -98,26 +98,26 @@ def run(self):

            prefix = rec['prefix']
            if prefix not in prefix_id:
-               prefix_id[prefix] = self.iyp.get_node('Prefix', {'prefix': prefix}, create=True)
+               prefix_id[prefix] = self.iyp.get_node('Prefix', {'prefix': prefix})

            # make status/country/origin links only for lines where asn=originasn
            if rec['asn_id'] == rec['originasn_id']:
                # Make sure all nodes exist
                originasn = int(rec['originasn_id'])
                if originasn not in asn_id:
-                   asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn}, create=True)
+                   asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn})

                rpki_status = 'RPKI ' + rec['rpki_status']
                if rpki_status not in tag_id:
-                   tag_id[rpki_status] = self.iyp.get_node('Tag', {'label': rpki_status}, create=True)
+                   tag_id[rpki_status] = self.iyp.get_node('Tag', {'label': rpki_status})

                irr_status = 'IRR ' + rec['irr_status']
                if irr_status not in tag_id:
-                   tag_id[irr_status] = self.iyp.get_node('Tag', {'label': irr_status}, create=True)
+                   tag_id[irr_status] = self.iyp.get_node('Tag', {'label': irr_status})

                cc = rec['country_id']
                if cc not in country_id:
-                   country_id[cc] = self.iyp.get_node('Country', {'country_code': cc}, create=True)
+                   country_id[cc] = self.iyp.get_node('Country', {'country_code': cc})

                # Compute links
                orig_links.append({
@@ -147,7 +147,7 @@ def run(self):
            # Dependency links
            asn = int(rec['asn_id'])
            if asn not in asn_id:
-               asn_id[asn] = self.iyp.get_node('AS', {'asn': asn}, create=True)
+               asn_id[asn] = self.iyp.get_node('AS', {'asn': asn})

            dep_links.append({
                'src_id': prefix_id[prefix],

4 changes: 2 additions & 2 deletions iyp/crawlers/inetintel/as_org.py
@@ -108,8 +108,8 @@ def run(self):
            batch_lines.append([asn, url, sibling_asns, pdb_orgs])
            count_rows += 1

-       asn_id = self.iyp.batch_get_nodes('AS', 'asn', batch_asns, all=False)
-       url_id = self.iyp.batch_get_nodes('URL', 'url', batch_urls, all=False)
+       asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', batch_asns, all=False)
+       url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url', batch_urls, all=False)

        asn_to_url_links = []
        asn_to_sibling_asn_links = []

9 changes: 4 additions & 5 deletions iyp/crawlers/manrs/members.py
@@ -23,8 +23,7 @@ def __init__(self, organization, url, name):

        self.manrs_qid = self.iyp.get_node(
            'Organization',
-           {'name': 'MANRS'},
-           create=True
+           {'name': 'MANRS'}
        )

        # Actions defined by MANRS
@@ -55,7 +54,7 @@ def __init__(self, organization, url, name):
                'name': action['label'],
                'description': action['description']
            },
-           create=True
+           id_properties={'name'}
        )

        # Reference information for data pushed to IYP
@@ -94,7 +93,7 @@ def update_net(self, one_line):

        # set countries
        for cc in areas.split(';'):
-           country_qid = self.iyp.get_node('Country', {'country_code': cc}, create=True)
+           country_qid = self.iyp.get_node('Country', {'country_code': cc})
            statements.append(['COUNTRY', country_qid, self.reference])

        # set actions
@@ -106,7 +105,7 @@ def update_net(self, one_line):
        for asn in asns.split(';'):
            if asn:  # ignore organizations with no ASN
                # Get the AS QID (create if AS is not yet registered) and commit changes
-               as_qid = self.iyp.get_node('AS', {'asn': str(asn)}, create=True)
+               as_qid = self.iyp.get_node('AS', {'asn': str(asn)})
                self.iyp.add_links(as_qid, statements)