Skip to content

Commit

Permalink
Merge pull request #91 from m-appel/query-rework
Browse files Browse the repository at this point in the history
Query function rework
  • Loading branch information
romain-fontugne authored Dec 19, 2023
2 parents a5d856a + 63588f6 commit ed0180b
Show file tree
Hide file tree
Showing 37 changed files with 358 additions and 266 deletions.
403 changes: 250 additions & 153 deletions iyp/__init__.py

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions iyp/crawlers/apnic/eyeball.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ def run(self):
logging.info(f'processing {country}')

# Get the QID of the country and corresponding ranking
cc_qid = self.iyp.get_node('Country', {'country_code': cc}, create=True)
ranking_qid = self.iyp.get_node('Ranking', {'name': f'APNIC eyeball estimates ({cc})'}, create=True)
cc_qid = self.iyp.get_node('Country', {'country_code': cc})
ranking_qid = self.iyp.get_node('Ranking', {'name': f'APNIC eyeball estimates ({cc})'})
statements = [['COUNTRY', cc_qid, self.reference]]
self.iyp.add_links(ranking_qid, statements)

Expand All @@ -57,8 +57,8 @@ def run(self):
names.add(asn['autnum'])

# Get node IDs
self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
self.name_id = self.iyp.batch_get_nodes('Name', 'name', names, all=False)
self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)

# Compute links
country_links = []
Expand Down
2 changes: 1 addition & 1 deletion iyp/crawlers/bgpkit/as2rel.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def run(self):
rels.append(rel)

# get ASNs IDs
self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)

# Compute links
links = []
Expand Down
5 changes: 2 additions & 3 deletions iyp/crawlers/bgpkit/peerstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ def run(self):
stats = json.load(bz2.open(req.raw))
collector_qid = self.iyp.get_node(
'BGPCollector',
{'name': stats['collector'], 'project': stats['project']},
create=True
{'name': stats['collector'], 'project': stats['project']}
)
self.reference['reference_url'] = url

Expand All @@ -75,7 +74,7 @@ def run(self):
asns.add(peer['asn'])

# get ASNs' IDs
self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)

# Compute links
links = []
Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/bgpkit/pfx2asn.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ def run(self):

logging.info('Pushing nodes to neo4j...\n')
# get ASNs and prefixes IDs
self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
self.prefix_id = self.iyp.batch_get_nodes('Prefix', 'prefix', prefixes)
self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
self.prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes)

# Compute links
links = []
Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/anycast_prefixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ def update(self, res, filename: str):
prefixes.add(line)
lines.append(line)

prefix_id = self.iyp.batch_get_nodes('Prefix', 'prefix', prefixes)
tag_id = self.iyp.get_node('Tag', {'label': 'Anycast'}, create=True)
prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes)
tag_id = self.iyp.get_node('Tag', {'label': 'Anycast'})

links = []
for line in lines:
Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/as_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ def run(self):
lines.append([asn, name])

# get ASNs and names IDs
self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
self.name_id = self.iyp.batch_get_nodes('Name', 'name', names)
self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names)

# Compute links
links = []
Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/bgptools/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,15 @@ def run(self):
print(req.text)
sys.exit('Error while fetching AS names')

self.tag_qid = self.iyp.get_node('Tag', {'label': label}, create=True)
self.tag_qid = self.iyp.get_node('Tag', {'label': label})
for line in req.text.splitlines():
# skip header
if line.startswith('asn'):
continue

# Parse given line to get ASN, name, and country code
asn, _, _ = line.partition(',')
asn_qid = self.iyp.get_node('AS', {'asn': asn[2:]}, create=True)
asn_qid = self.iyp.get_node('AS', {'asn': asn[2:]})
statements = [['CATEGORIZED', self.tag_qid, self.reference]] # Set AS name

try:
Expand Down
8 changes: 4 additions & 4 deletions iyp/crawlers/caida/asrank.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ def run(self):
# Get/create ASNs, names, and country nodes
print('Pushing nodes.', file=sys.stderr)
logging.info('Pushing nodes.')
self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns)
self.country_id = self.iyp.batch_get_nodes('Country', 'country_code', countries)
self.name_id = self.iyp.batch_get_nodes('Name', 'name', names, all=False)
self.asrank_qid = self.iyp.get_node('Ranking', {'name': 'CAIDA ASRank'}, create=True)
self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
self.country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries)
self.name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False)
self.asrank_qid = self.iyp.get_node('Ranking', {'name': 'CAIDA ASRank'})

# Compute links
country_links = list()
Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/cisco/umbrella_top1M.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class Crawler(BaseCrawler):
def run(self):
"""Fetch Umbrella top 1M and push to IYP."""

self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'}, create=True)
self.cisco_qid = self.iyp.get_node('Ranking', {'name': 'Cisco Umbrella Top 1 million'})

sys.stderr.write('Downloading latest list...\n')
req = requests.get(URL)
Expand All @@ -40,7 +40,7 @@ def run(self):
links.append({'src_name': domain, 'dst_id': self.cisco_qid,
'props': [self.reference, {'rank': int(rank)}]})

name_id = self.iyp.batch_get_nodes('DomainName', 'name', domains)
name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains)

for link in links:
link['src_id'] = name_id[link['src_name']]
Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/citizenlab/urldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ def run(self):
continue
lines.append([url, category])

url_id = self.iyp.batch_get_nodes('URL', 'url', urls)
category_id = self.iyp.batch_get_nodes('Tag', 'label', categories)
url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url', urls)
category_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', categories)

links = []
for (url, category) in lines:
Expand Down
2 changes: 1 addition & 1 deletion iyp/crawlers/cloudflare/dns_top_ases.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Crawler(Crawler):
def run(self):
"""Push data to IYP."""

self.as_id = self.iyp.batch_get_nodes('AS', 'asn')
self.as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')

super().run()

Expand Down
2 changes: 1 addition & 1 deletion iyp/crawlers/cloudflare/dns_top_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def run(self):
# FIXME this should be called before/separately
self.fetch()

self.country_id = self.iyp.batch_get_nodes('Country', 'country_code')
self.country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code')
self.statements = []

tmp_dir = self.get_tmp_dir()
Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/cloudflare/ranking_bucket.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def run(self):
# domain_ids, but iterate over the domains set instead.
logging.info(f'Adding/retrieving {len(all_domains)} DomainName nodes.')
print(f'Adding/retrieving {len(all_domains)} DomainName nodes')
domain_ids = self.iyp.batch_get_nodes('DomainName', 'name', all_domains)
domain_ids = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', all_domains)

for dataset, domains in datasets:
dataset_title = f'Cloudflare {dataset["title"]}'
Expand All @@ -96,7 +96,7 @@ def run(self):
'description': dataset['description'],
'top': dataset['meta']['top']
},
create=True)
id_properties={'name'})

# Create RANK relationships
domain_links = [{'src_id': domain_ids[domain], 'dst_id': ranking_id, 'props': [self.reference]}
Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/cloudflare/top100.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def run(self):
"""Fetch data and push to IYP."""

self.cf_qid = self.iyp.get_node(
'Ranking', {'name': 'Cloudflare top 100 domains'}, create=True)
'Ranking', {'name': 'Cloudflare top 100 domains'})

# Fetch data
headers = {
Expand All @@ -52,7 +52,7 @@ def update(self, entry):

# Commit to IYP
# Get the AS's node ID (create if it is not yet registered) and commit changes
domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']}, create=True)
domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']})
self.iyp.add_links(domain_qid, statements)


Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/emileaben/as_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ def run(self):
as_names.add(as_name)
lines.append(values)

asns_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
as_names_id = self.iyp.batch_get_nodes('Name', 'name', as_names, all=False)
asns_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)
as_names_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', as_names, all=False)

links = []

Expand Down
5 changes: 2 additions & 3 deletions iyp/crawlers/example/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,15 @@ def update(self, one_line):
{
'example_property_0': value,
'example_property_1': value,
},
create=True
}
)

# set relationship
statements = [['EXAMPLE_RELATIONSHIP_LABEL', val_qid, self.reference]]

# Commit to IYP
# Get the AS's node ID (create if it is not yet registered) and commit changes
as_qid = self.iyp.get_node('AS', {'asn': asn}, create=True)
as_qid = self.iyp.get_node('AS', {'asn': asn})
self.iyp.add_links(as_qid, statements)


Expand Down
6 changes: 3 additions & 3 deletions iyp/crawlers/ihr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def run(self):
self.csv = lz4Csv(local_filename)

self.timebin = None
asn_id = self.iyp.batch_get_nodes('AS', 'asn', set())
asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', set())

links = []

Expand All @@ -83,11 +83,11 @@ def run(self):

originasn = int(rec['originasn'])
if originasn not in asn_id:
asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn}, create=True)
asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn})

asn = int(rec['asn'])
if asn not in asn_id:
asn_id[asn] = self.iyp.get_node('AS', {'asn': asn}, create=True)
asn_id[asn] = self.iyp.get_node('AS', {'asn': asn})

links.append({
'src_id': asn_id[originasn],
Expand Down
8 changes: 3 additions & 5 deletions iyp/crawlers/ihr/country_dependency.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,7 @@ def run(self):
country_qid = self.iyp.get_node('Country',
{
'country_code': cc,
},
create=True
}
)

countryrank_statements = []
Expand All @@ -81,8 +80,7 @@ def run(self):
for metric, weight in [('Total eyeball', 'eyeball'), ('Total AS', 'as')]:

self.countryrank_qid = self.iyp.get_node('Ranking',
{'name': f'IHR country ranking: {metric} ({cc})'},
create=True
{'name': f'IHR country ranking: {metric} ({cc})'}
)
self.iyp.add_links(self.countryrank_qid, countryrank_statements)

Expand All @@ -101,7 +99,7 @@ def run(self):
asns.add(asn['asn'])
asn['rank'] = i + 1

self.asn_id = self.iyp.batch_get_nodes('AS', 'asn', asns, all=False)
self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False)

# Compute links
for asn in selected:
Expand Down
20 changes: 10 additions & 10 deletions iyp/crawlers/ihr/rov.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,10 @@ def run(self):
self.csv = lz4Csv(local_filename)

logging.warning('Getting node IDs from neo4j...\n')
asn_id = self.iyp.batch_get_nodes('AS', 'asn')
prefix_id = self.iyp.batch_get_nodes('Prefix', 'prefix')
tag_id = self.iyp.batch_get_nodes('Tag', 'label')
country_id = self.iyp.batch_get_nodes('Country', 'country_code')
asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn')
prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix')
tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label')
country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code')

orig_links = []
tag_links = []
Expand All @@ -98,26 +98,26 @@ def run(self):

prefix = rec['prefix']
if prefix not in prefix_id:
prefix_id[prefix] = self.iyp.get_node('Prefix', {'prefix': prefix}, create=True)
prefix_id[prefix] = self.iyp.get_node('Prefix', {'prefix': prefix})

# make status/country/origin links only for lines where asn=originasn
if rec['asn_id'] == rec['originasn_id']:
# Make sure all nodes exist
originasn = int(rec['originasn_id'])
if originasn not in asn_id:
asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn}, create=True)
asn_id[originasn] = self.iyp.get_node('AS', {'asn': originasn})

rpki_status = 'RPKI ' + rec['rpki_status']
if rpki_status not in tag_id:
tag_id[rpki_status] = self.iyp.get_node('Tag', {'label': rpki_status}, create=True)
tag_id[rpki_status] = self.iyp.get_node('Tag', {'label': rpki_status})

irr_status = 'IRR ' + rec['irr_status']
if irr_status not in tag_id:
tag_id[irr_status] = self.iyp.get_node('Tag', {'label': irr_status}, create=True)
tag_id[irr_status] = self.iyp.get_node('Tag', {'label': irr_status})

cc = rec['country_id']
if cc not in country_id:
country_id[cc] = self.iyp.get_node('Country', {'country_code': cc}, create=True)
country_id[cc] = self.iyp.get_node('Country', {'country_code': cc})

# Compute links
orig_links.append({
Expand Down Expand Up @@ -147,7 +147,7 @@ def run(self):
# Dependency links
asn = int(rec['asn_id'])
if asn not in asn_id:
asn_id[asn] = self.iyp.get_node('AS', {'asn': asn}, create=True)
asn_id[asn] = self.iyp.get_node('AS', {'asn': asn})

dep_links.append({
'src_id': prefix_id[prefix],
Expand Down
4 changes: 2 additions & 2 deletions iyp/crawlers/inetintel/as_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ def run(self):
batch_lines.append([asn, url, sibling_asns, pdb_orgs])
count_rows += 1

asn_id = self.iyp.batch_get_nodes('AS', 'asn', batch_asns, all=False)
url_id = self.iyp.batch_get_nodes('URL', 'url', batch_urls, all=False)
asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', batch_asns, all=False)
url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url', batch_urls, all=False)

asn_to_url_links = []
asn_to_sibling_asn_links = []
Expand Down
9 changes: 4 additions & 5 deletions iyp/crawlers/manrs/members.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ def __init__(self, organization, url, name):

self.manrs_qid = self.iyp.get_node(
'Organization',
{'name': 'MANRS'},
create=True
{'name': 'MANRS'}
)

# Actions defined by MANRS
Expand Down Expand Up @@ -55,7 +54,7 @@ def __init__(self, organization, url, name):
'name': action['label'],
'description': action['description']
},
create=True
id_properties={'name'}
)

# Reference information for data pushed to IYP
Expand Down Expand Up @@ -94,7 +93,7 @@ def update_net(self, one_line):

# set countries
for cc in areas.split(';'):
country_qid = self.iyp.get_node('Country', {'country_code': cc}, create=True)
country_qid = self.iyp.get_node('Country', {'country_code': cc})
statements.append(['COUNTRY', country_qid, self.reference])

# set actions
Expand All @@ -106,7 +105,7 @@ def update_net(self, one_line):
for asn in asns.split(';'):
if asn: # ignore organizations with no ASN
# Get the AS QID (create if AS is not yet registered) and commit changes
as_qid = self.iyp.get_node('AS', {'asn': str(asn)}, create=True)
as_qid = self.iyp.get_node('AS', {'asn': str(asn)})
self.iyp.add_links(as_qid, statements)


Expand Down
Loading

0 comments on commit ed0180b

Please sign in to comment.