diff --git a/iyp/crawlers/bgp/README.md b/iyp/crawlers/bgp/README.md deleted file mode 100644 index 9ca50036..00000000 --- a/iyp/crawlers/bgp/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# BGP crawlers - -## Requirements -This script fetch RIB files for the current day at midnight. Run this after -4am and before midnight UTC to make sure that RIB files are available. diff --git a/iyp/crawlers/bgp/rv_ris.py b/iyp/crawlers/bgp/rv_ris.py deleted file mode 100644 index 0083b0ee..00000000 --- a/iyp/crawlers/bgp/rv_ris.py +++ /dev/null @@ -1,116 +0,0 @@ -import logging -import sys -from collections import defaultdict - -import arrow -import pybgpstream -import radix - -from iyp.wiki.wikihandy import Wikihandy - -# URL to original data -URL_RV = 'http://routeviews.org/{}/bgpdata/' -URL_RIS = 'http://data.ris.ripe.net/{}/' - - -class Crawler(object): - def __init__(self): - """""" - - # Helper for wiki access - self.wh = Wikihandy(preload=True) - - # Get the QID for Routeviews organization - self.org_qid = self.wh.get_qid('Route Views') - self.today = self.wh.today() - - def run(self): - """Fetch BGP data from collectors and push to wikibase.""" - - today = arrow.now().replace(hour=0, minute=0) - start = today.shift(hours=-1) - end = today.shift(hours=1) - stream = pybgpstream.BGPStream( - from_time=int(start.timestamp()), until_time=int(end.timestamp()), - record_type='ribs', - ) - - rtree = radix.Radix() - - sys.stderr.write('\nReading BGP data:\n') - for i, elem in enumerate(stream): - # Extract the prefix and origin ASN - msg = elem.fields - prefix = msg['prefix'] - origin_asn_str = msg['as-path'].split(' ')[-1] - origin_asns = [] - if '{' in origin_asn_str: - origin_asns = origin_asn_str[1:-1].split(',') - else: - origin_asns = [origin_asn_str] - - # Store origin ASN in radix tree - rnode = rtree.search_exact(prefix) - if rnode is None: - rnode = rtree.add(prefix) - rnode.data['origin'] = defaultdict(set) - - for asn in origin_asns: - rnode.data['origin'][asn].add(elem.collector) - sys.stderr.write(f'\rProcessed {i + 1} BGP messages') - - sys.stderr.write('\nPushing data to IYP...\n') - - # Push all prefixes data to IYP - for i, rnode in enumerate(rtree): - data = rnode.data['origin'] - self.update_entry(rnode.prefix, data) - sys.stderr.write(f'\rProcessed {i + 1} prefixes') - - def update_entry(self, prefix, originasn_collector): - """Add the prefix to wikibase if it's not already there and update its - properties.""" - - statements = [] - - # set origin AS - for asn, collectors in originasn_collector.items(): - for collector in collectors: - # Added properties will have this additional information - url = URL_RV - if 'rrc' in collector: - url = URL_RIS - - self.reference = [ - (self.wh.get_pid('source'), self.org_qid), - (self.wh.get_pid('reference URL'), url.format(collector)), - (self.wh.get_pid('point in time'), self.today) - ] - - as_qid = self.wh.asn2qid(asn, create=True) - statements.append([self.wh.get_pid('originated by'), as_qid, self.reference]) - - # Commit to wikibase - # Get the prefix QID (create if prefix is not yet registered) and commit changes - prefix_qid = self.wh.prefix2qid(prefix, create=True) - self.wh.upsert_statements('update from RIS/Routeviews RIBs', prefix_qid, statements) - - -# Main program -if __name__ == '__main__': - - scriptname = sys.argv[0].replace('/', '_')[0:-3] - FORMAT = '%(asctime)s %(processName)s %(message)s' - logging.basicConfig( - format=FORMAT, - filename='log/' + scriptname + '.log', - level=logging.INFO, - datefmt='%Y-%m-%d %H:%M:%S' - ) - 
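# Illustrative sketch (not part of the removed crawler): how a RIB entry's AS path
# maps to one or more origin ASNs, including AS-set origins such as "{64500,64501}",
# and how results can be grouped per prefix with py-radix, mirroring the logic of
# rv_ris.py above. The prefix, AS path, and collector name are made-up example values.
from collections import defaultdict

import radix


def origin_asns(as_path):
    """Return the origin ASN(s) at the end of a BGP AS path, handling AS sets."""
    origin = as_path.split(' ')[-1]
    if origin.startswith('{'):
        return origin[1:-1].split(',')
    return [origin]


rtree = radix.Radix()
rnode = rtree.search_exact('192.0.2.0/24') or rtree.add('192.0.2.0/24')
rnode.data.setdefault('origin', defaultdict(set))
for asn in origin_asns('64496 64497 {64500,64501}'):
    rnode.data['origin'][asn].add('route-views2')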
logging.info('Started: %s' % sys.argv) - - crawler = Crawler() - if len(sys.argv) == 1 and sys.argv[1] == 'unit_test': - crawler.unit_test(logging) - else: - crawler.run() diff --git a/iyp/crawlers/rapid7/forward_dns_v4.py b/iyp/crawlers/rapid7/forward_dns_v4.py deleted file mode 100644 index e0391c45..00000000 --- a/iyp/crawlers/rapid7/forward_dns_v4.py +++ /dev/null @@ -1,169 +0,0 @@ -import gzip -import json -import logging -import os -import pickle -import sys -from collections import defaultdict - -import requests -import tldextract - -from iyp.wiki.ip2asn import ip2asn -from iyp.wiki.wikihandy import Wikihandy - -# URL to Rapid7 open data -# TODO automatically fetch filename -# TODO remove all data with URL regex -# TODO remove downloaded file -URL = 'https://opendata.rapid7.com/sonar.fdns_v2/2021-02-26-1614298023-fdns_a.json.gz' -# URL = 'https://opendata.rapid7.com/sonar.fdns_v2/2021-02-26-1614297920-fdns_aaaa.json.gz' # noqa: W505 - - -def download_file(url, local_filename): - r = requests.get(url, stream=True) - with open(local_filename, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - f.write(chunk) - f.flush() - return local_filename - - -class Crawler(object): - def __init__(self, fdns_url=URL): - """Fetch QID for Rapid7 and initialize wikihandy.""" - - sys.stderr.write('Initialization...\n') - self.fdns_url = fdns_url - # Helper for wiki access - self.wh = Wikihandy() - - self.org_qid = self.wh.get_qid('Rapid7', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Rapid7 forward DNS data', - # Item description - 'description': 'Rapid7, a security company that provides unified ' - 'vulnerability management solutions', - 'statements': [ - [self.wh.get_pid('instance of'), self.wh.get_qid('organization')], - [self.wh.get_pid('website'), 'https://www.rapid7.com/'], - ] - }) - - # Added properties will have this additional information - today = self.wh.today() - self.reference = [ - (self.wh.get_pid('source'), self.org_qid), - (self.wh.get_pid('reference URL'), fdns_url), - (self.wh.get_pid('point in time'), today) - ] - - self.ia = ip2asn(wikihandy=self.wh) - - # keep track of all resolved prefixes so we just make one push per - # domain - self.tld_pfx = defaultdict(set) - - def match_domain_prefix(self, line): - """Parse a line from the rapid7 dataset, extract the domain and ip, and find the - corresponding IP prefix. - - return: (domain name, prefix) or None, None if the domain is not in the wiki - """ - - tld = None - prefix = None - - datapoint = json.loads(line) - if (datapoint['type'] in ['a', 'aaaa'] - and 'value' in datapoint - and 'name' in datapoint): - - ext = tldextract.extract(datapoint['name']) - tld = ext[-2] + '.' 
+ ext[-1] - - # skip domains not in the wiki - if self.wh.domain2qid(tld) is None: - return tld, None - - ip_info = self.ia.lookup(datapoint['value']) - if ip_info is None: - return tld, None - - prefix = ip_info['prefix'] - self.tld_pfx[tld].add(prefix) - - return tld, prefix - - def run(self): - """Fetch Rapid7 DNS forward data, find corresponding BGP prefixes and push - resolution for domains already in the wikibase.""" - - # download rapid7 data and find corresponding prefixes - sys.stderr.write('Downloading Rapid7 dataset...\n') - fname = self.fdns_url.split('/')[-1] - if not os.path.exists(fname): - fname = download_file(self.fdns_url, fname) - - sys.stderr.write('Processing dataset...\n') - if os.path.exists(fname + '.pickle'): - sys.stderr.write('Load data from cache!') - self.tld_pfx = pickle.load(open(fname + '.pickle', 'rb')) - else: - with gzip.open(fname, 'rt') as finput: - for line in finput: - self.match_domain_prefix(line) - - pickle.dump(self.tld_pfx, open(fname + '.pickle', 'wb')) - - sys.stderr.write(f'Found {len(self.tld_pfx)} domain names in Rapid7 dataset out of the ' - f'{len(self.wh._domain2qid)} domain names in wiki\n') - # push data to wiki - for i, (tld, pfxs) in enumerate(self.tld_pfx.items()): - sys.stderr.write(f'\33[2K\rUpdating iyp... {i + 1}/{len(self.tld_pfx)}\t{tld} {len(pfxs)} prefixes') - self.update(tld, pfxs) - - sys.stderr.write('\n') - - def update(self, tld, pfxs): - """Update statements for the given domain name.""" - - # make all statements - statements = [] - for pfx in pfxs: - pfx_qid = self.wh.prefix2qid(pfx, create=True) - statements.append( - [self.wh.get_pid('forward DNS'), pfx_qid, self.reference] - ) - - # Commit to wikibase - # Get the domain name QID and commit changes - dn_qid = self.wh.domain2qid(tld) - try: - # TODO remove old data with URL regex - self.wh.upsert_statements('update from Rapid7 forward DNS data', dn_qid, statements) - except Exception as e: - logging.error(f'Could not update domain {dn_qid}') - logging.error(str(e)) - - -# Main program -if __name__ == '__main__': - - scriptname = sys.argv[0].replace('/', '_')[0:-3] - FORMAT = '%(asctime)s %(processName)s %(message)s' - logging.basicConfig( - format=FORMAT, - filename='log/' + scriptname + '.log', - level=logging.INFO, - datefmt='%Y-%m-%d %H:%M:%S' - ) - logging.info('Started: %s' % sys.argv) - - crawler = Crawler() - if len(sys.argv) == 1 and sys.argv[1] == 'unit_test': - crawler.unit_test(logging) - else: - crawler.run() diff --git a/iyp/crawlers/rapid7/forward_dns_v6.py b/iyp/crawlers/rapid7/forward_dns_v6.py deleted file mode 100644 index d623ac50..00000000 --- a/iyp/crawlers/rapid7/forward_dns_v6.py +++ /dev/null @@ -1,7 +0,0 @@ -from iyp.crawlers.rapid7.forward_dns_v4 import Crawler - -URL = 'https://opendata.rapid7.com/sonar.fdns_v2/2021-02-26-1614297920-fdns_aaaa.json.gz' - -if __name__ == '__main__': - crawler = Crawler(fdns_url=URL) - crawler.run() diff --git a/iyp/crawlers/spamhaus/asn_drop.py b/iyp/crawlers/spamhaus/asn_drop.py deleted file mode 100644 index becc99b1..00000000 --- a/iyp/crawlers/spamhaus/asn_drop.py +++ /dev/null @@ -1,125 +0,0 @@ -import logging -import sys - -import requests - -from iyp import RequestStatusError -from iyp.wiki.wikihandy import Wikihandy - -# URL to ASN Drop List -URL = 'https://www.spamhaus.org/drop/asndrop.txt' - - -class Crawler(object): - def __init__(self): - """""" - - # Helper for wiki access - self.wh = Wikihandy(preload=True) - - # Get the QID for Spamhaus organization - self.spamhaus_qid = 
self.wh.get_qid('Spamhaus', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Spamhaus organization', - # Item description - 'description': 'The Spamhaus Project is an international organisation ' - 'to track email spammers and spam-related activity', - 'aliases': 'The Spamhaus Project|the spamhaus project', - 'statements': [[self.wh.get_pid('instance of'), - self.wh.get_qid('organization')]] - }) - - # Get the QID for Spamhaus DROP project - self.drop_qid = self.wh.get_qid('Spamhaus DROP lists', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Spamhaus block list', - # Item description - 'description': "The Spamhaus Don't Route Or Peer Lists", - 'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid]] - }) - - # Get the QID for Spamhaus ASN-DROP list - self.asn_drop_qid = self.wh.get_qid('Spamhaus ASN-DROP list', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Spamhaus block list', - # Item description - 'description': 'ASN-DROP contains a list of Autonomous System Numbers ' - 'controlled by spammers or cyber criminals, as well as ' - '"hijacked" ASNs.', - 'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid], - [self.wh.get_pid('part of'), self.drop_qid]] - }) - - # Added properties will have this additional information - today = self.wh.today() - self.reference = [ - (self.wh.get_pid('source'), self.spamhaus_qid), - (self.wh.get_pid('reference URL'), URL), - (self.wh.get_pid('point in time'), today) - ] - - def run(self): - """Fetch blocklist from Spamhaus and push to wikibase.""" - - req = requests.get(URL) - if req.status_code != 200: - raise RequestStatusError('Error while fetching the blocklist') - - for i, row in enumerate(req.text.splitlines()): - # Skip the header - if row.startswith(';'): - continue - - self.update_net(row) - sys.stderr.write(f'\rProcessed {i + 1} ASes') - sys.stderr.write('\n') - - self.iyp.close() - - def update_net(self, one_line): - """Add the network to wikibase if it's not already there and update its - properties.""" - - asn, _, cc_name = one_line.partition(';') - asn = int(asn[2:]) - cc, name = [word.strip() for word in cc_name.split('|')] - - # Properties for this AS - statements = [ - [self.wh.get_pid('reported in'), self.asn_drop_qid, self.reference], - [self.wh.get_pid('name'), name, self.reference], - ] - - # set countries - if len(cc) == 2: - cc_qid = self.wh.country2qid(cc) - if cc_qid is not None: - statements.append([self.wh.get_pid('country'), cc_qid, self.reference]) - - # Commit to wikibase - # Get the AS QID (create if AS is not yet registered) and commit changes - net_qid = self.wh.asn2qid(asn, create=True) - self.wh.upsert_statements('update from Spamhaus ASN DROP list', net_qid, statements) - - -# Main program -if __name__ == '__main__': - - scriptname = sys.argv[0].replace('/', '_')[0:-3] - FORMAT = '%(asctime)s %(processName)s %(message)s' - logging.basicConfig( - format=FORMAT, - filename='log/' + scriptname + '.log', - level=logging.INFO, - datefmt='%Y-%m-%d %H:%M:%S' - ) - logging.info('Started: %s' % sys.argv) - - crawler = Crawler() - if len(sys.argv) == 1 and sys.argv[1] == 'unit_test': - crawler.unit_test(logging) - else: - crawler.run() diff --git a/iyp/crawlers/spamhaus/prefix_drop.py b/iyp/crawlers/spamhaus/prefix_drop.py deleted file mode 100644 index e86694b1..00000000 --- a/iyp/crawlers/spamhaus/prefix_drop.py +++ /dev/null @@ -1,117 +0,0 @@ -import logging -import sys - -import requests - -from iyp import 
RequestStatusError -from iyp.wiki.wikihandy import Wikihandy - -# URL to spamhaus data -URL = 'https://www.spamhaus.org/drop/drop.txt' - - -class Crawler(object): - def __init__(self, url=URL): - """""" - - # API endpoint - self.url = url - - # Helper for wiki access - self.wh = Wikihandy(preload=True) - - # Get the QID for Spamhaus organization - self.spamhaus_qid = self.wh.get_qid('Spamhaus', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Spamhaus organization', - # Item description - 'description': 'The Spamhaus Project is an international organisation ' - 'to track email spammers and spam-related activity', - 'aliases': 'The Spamhaus Project|the spamhaus project', - 'statements': [[self.wh.get_pid('instance of'), - self.wh.get_qid('organization')]] - }) - - # Get the QID for Spamhaus DROP project - self.drop_qid = self.wh.get_qid('Spamhaus DROP lists', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Spamhaus block list', - # Item description - 'description': "The Spamhaus Don't Route Or Peer Lists", - 'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid]] - }) - - # Get the QID for Spamhaus DROP list - self.drop_qid = self.wh.get_qid('Spamhaus DROP list', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Spamhaus block list', - 'description': 'The DROP list only include netblocks allocated directly by ' - 'an established RIR or NIR.', - 'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid], - [self.wh.get_pid('part of'), self.drop_qid]] - }) - - # Added properties will have this additional information - today = self.wh.today() - self.reference = [ - (self.wh.get_pid('source'), self.spamhaus_qid), - (self.wh.get_pid('reference URL'), self.url), - (self.wh.get_pid('point in time'), today) - ] - - def run(self): - """Fetch blocklist from Spamhaus and push to wikibase.""" - - req = requests.get(self.url) - if req.status_code != 200: - raise RequestStatusError('Error while fetching the blocklist') - - for i, row in enumerate(req.text.splitlines()): - # Skip the header - if row.startswith(';'): - continue - - self.update_net(row) - sys.stderr.write(f'\rProcessed {i + 1} prefixes') - sys.stderr.write('\n') - - self.iyp.close() - - def update_net(self, one_line): - """Add the prefix to wikibase if it's not already there and update its - properties.""" - - prefix, _, _ = one_line.partition(';') - - # Properties for this prefix - statements = [ - [self.wh.get_pid('reported in'), self.drop_qid, self.reference], - ] - - # Commit to wikibase - # Get the prefix QID (create if prefix is not yet registered) and commit changes - net_qid = self.wh.prefix2qid(prefix, create=True) - self.wh.upsert_statements('update from Spamhaus DROP list', net_qid, statements) - - -# Main program -if __name__ == '__main__': - - scriptname = sys.argv[0].replace('/', '_')[0:-3] - FORMAT = '%(asctime)s %(processName)s %(message)s' - logging.basicConfig( - format=FORMAT, - filename='log/' + scriptname + '.log', - level=logging.INFO, - datefmt='%Y-%m-%d %H:%M:%S' - ) - logging.info('Started: %s' % sys.argv) - - crawler = Crawler(URL) - if len(sys.argv) == 1 and sys.argv[1] == 'unit_test': - crawler.unit_test(logging) - else: - crawler.run() diff --git a/iyp/crawlers/spamhaus/prefix_dropv6.py b/iyp/crawlers/spamhaus/prefix_dropv6.py deleted file mode 100644 index f9097066..00000000 --- a/iyp/crawlers/spamhaus/prefix_dropv6.py +++ /dev/null @@ -1,26 +0,0 @@ -import logging -import sys - -from 
iyp.crawlers.spamhaus.prefix_drop import Crawler - -URL = 'https://www.spamhaus.org/drop/dropv6.txt' - - -# Main program -if __name__ == '__main__': - - scriptname = sys.argv[0].replace('/', '_')[0:-3] - FORMAT = '%(asctime)s %(processName)s %(message)s' - logging.basicConfig( - format=FORMAT, - filename='log/' + scriptname + '.log', - level=logging.INFO, - datefmt='%Y-%m-%d %H:%M:%S' - ) - logging.info('Started: %s' % sys.argv) - - crawler = Crawler(URL) - if len(sys.argv) == 1 and sys.argv[1] == 'unit_test': - crawler.unit_test(logging) - else: - crawler.run() diff --git a/iyp/crawlers/spamhaus/prefix_edrop.py b/iyp/crawlers/spamhaus/prefix_edrop.py deleted file mode 100644 index 84ca9316..00000000 --- a/iyp/crawlers/spamhaus/prefix_edrop.py +++ /dev/null @@ -1,114 +0,0 @@ -import logging -import sys - -import requests - -from iyp import RequestStatusError -from iyp.wiki.wikihandy import Wikihandy - -# URL to spamhaus data -URL = 'https://www.spamhaus.org/drop/edrop.txt' - - -class Crawler(object): - def __init__(self): - """""" - - # Helper for wiki access - self.wh = Wikihandy(preload=True) - - # Get the QID for Spamhaus organization - self.spamhaus_qid = self.wh.get_qid('Spamhaus', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Spamhaus organization', - 'description': 'The Spamhaus Project is an international organisation ' - 'to track email spammers and spam-related activity', - 'aliases': 'The Spamhaus Project|the spamhaus project', - 'statements': [[self.wh.get_pid('instance of'), - self.wh.get_qid('organization')]] - }) - - # Get the QID for Spamhaus DROP project - self.drop_qid = self.wh.get_qid('Spamhaus DROP lists', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Spamhaus block list', - # Item description - 'description': "The Spamhaus Don't Route Or Peer Lists", - 'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid]] - }) - - # Get the QID for Spamhaus EDROP list - self.drop_qid = self.wh.get_qid('Spamhaus EDROP list', - create={ # Create it if it doesn't exist - # Commit message - 'summary': 'add Spamhaus block list', - 'description': 'EDROP is an extension of the DROP list that includes ' - 'suballocated netblocks controlled by spammers or cyber ' - 'criminals', - 'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid], - [self.wh.get_pid('part of'), self.drop_qid]] - }) - - # Added properties will have this additional information - today = self.wh.today() - self.reference = [ - (self.wh.get_pid('source'), self.spamhaus_qid), - (self.wh.get_pid('reference URL'), URL), - (self.wh.get_pid('point in time'), today) - ] - - def run(self): - """Fetch blocklist from Spamhaus and push to wikibase.""" - - req = requests.get(URL) - if req.status_code != 200: - raise RequestStatusError('Error while fetching the blocklist') - - for i, row in enumerate(req.text.splitlines()): - # Skip the header - if row.startswith(';'): - continue - - self.update_net(row) - sys.stderr.write(f'\rProcessed {i + 1} prefixes') - sys.stderr.write('\n') - - self.iyp.close() - - def update_net(self, one_line): - """Add the prefix to wikibase if it's not already there and update its - properties.""" - - prefix, _, _ = one_line.partition(';') - - # Properties for this prefix - statements = [ - [self.wh.get_pid('reported in'), self.drop_qid, self.reference], - ] - - # Commit to wikibase - # Get the prefix QID (create if prefix is not yet registered) and commit changes - net_qid = self.wh.prefix2qid(prefix, create=True) - 
self.wh.upsert_statements('update from Spamhaus EDROP list', net_qid, statements) - - -# Main program -if __name__ == '__main__': - - scriptname = sys.argv[0].replace('/', '_')[0:-3] - FORMAT = '%(asctime)s %(processName)s %(message)s' - logging.basicConfig( - format=FORMAT, - filename='log/' + scriptname + '.log', - level=logging.INFO, - datefmt='%Y-%m-%d %H:%M:%S' - ) - logging.info('Started: %s' % sys.argv) - - crawler = Crawler() - if len(sys.argv) == 1 and sys.argv[1] == 'unit_test': - crawler.unit_test(logging) - else: - crawler.run() diff --git a/iyp/crawlers/tools.py b/iyp/crawlers/tools.py deleted file mode 100644 index 43352def..00000000 --- a/iyp/crawlers/tools.py +++ /dev/null @@ -1,98 +0,0 @@ -import logging - -import pywikibot -from pywikibot.data import api -from SPARQLWrapper import JSON, SPARQLWrapper - - -class wikihandy(object): - - def __init__(self, wikidata_site, sparql='https://query.wikidata.org/sparql'): - self._label_pid = {} - self._label_qid = {} - self._asn2qid = None - self.wikidata_site = wikidata_site - - self.sparql = SPARQLWrapper(sparql) - - def get_items(self, label): - """Return the list of items for the given label.""" - params = {'action': 'wbsearchentities', 'format': 'json', - 'language': 'en', 'type': 'item', - 'search': label} - request = api.Request(site=self.wikidata_site, parameters=params) - result = request.submit() - return result['search'] if len(result['search']) > 0 else None - - def label2qid(self, label, lang='en'): - """Retrieve item id based on the given label.""" - - if label not in self._label_qid: - items = self.get_items(label) - if items is None: - return None - - self._label_qid[label] = items[0]['id'] - - return self._label_qid[label] - - def label2pid(self, label, lang='en'): - """Retrieve property id based on the given label.""" - - if label not in self._label_pid: - - params = {'action': 'wbsearchentities', 'format': 'json', - 'language': lang, 'type': 'property', - 'search': label} - request = api.Request(site=self.wikidata_site, parameters=params) - result = request.submit() - - if len(result['search']) == 0: - return None - - if len(result['search']) > 1: - logging.warning(f'Several properties have the label: {label}') - logging.warning(result) - - self._label_pid[label] = result['search'][0]['id'] - - return self._label_pid[label] - - def asn2qid(self, asn): - """Retrive QID of items assigned with the given Autonomous System Number.""" - - if self._asn2qid is None: - QUERY = """ - #Items that have a pKa value set - SELECT ?item ?asn - WHERE - { - # ?item wdt:%s wdt:%s . - ?item wdt:%s ?asn . 
- } - """ % ( - self.label2pid('instance of'), - self.label2qid('Autonomous System'), - self.label2pid('autonomous system number') - ) - - self.sparql.setQuery(QUERY) - self.sparql.setReturnFormat(JSON) - results = self.sparql.query().convert() - - self._asn2qid = {} - for res in results['results']['bindings']: - res_qid = res['item']['value'].rpartition('/')[2] - res_asn = int(res['asn']['value']) - - self._asn2qid[res_asn] = res_qid - - return self._asn2qid.get(int(asn), None) - - -if __name__ == '__main__': - wikidata_site = pywikibot.Site('wikidata', 'wikidata') - wh = wikihandy(wikidata_site) - - import IPython - IPython.embed() diff --git a/iyp/tools/__init_.py b/iyp/tools/__init_.py deleted file mode 100644 index e69de29b..00000000 diff --git a/iyp/tools/bootstrap.py b/iyp/tools/bootstrap.py deleted file mode 100644 index c873bb49..00000000 --- a/iyp/tools/bootstrap.py +++ /dev/null @@ -1,70 +0,0 @@ -import csv -from collections import defaultdict - -from iyp.wiki.wikihandy import Wikihandy - -BASIC_PROPERTY_FNAME = 'basic/properties.csv' -BASIC_ITEMS_FNAME = 'basic/items.csv' - -wh = Wikihandy(preload=False) - - -def decomment(csvfile): - """Ignore lines with comments.""" - for row in csvfile: - if '#' not in row: - yield row - - -print('Adding properties') -with open(BASIC_PROPERTY_FNAME, 'r') as fp: - csvdata = csv.reader(decomment(fp), skipinitialspace=True) - - for row in csvdata: - if not row: - continue - - label, description, aliases, data_type = [col.strip() for col in row] - pid = wh.get_pid(label) - - if pid is None: - pid = wh.add_property('bootstrap', label, description, aliases, data_type) - print(pid, label) - -print('Adding items') -statements = defaultdict(list) -# wikidata = wikihandy.Wikihandy(wikidata_project='wikidata', lang='wikidata') - -with open(BASIC_ITEMS_FNAME, 'r') as fp: - csvdata = csv.reader(decomment(fp), skipinitialspace=True) - - for row in csvdata: - if not row: - continue - - label, description, aliases, statements = [col.strip() for col in row] - print(label) - - # Retrive statements from the csv file - # Assume all properties have the 'wikidata-item' datatype - claims = [] - for statement in statements.split('|'): - try: - property, target = statement.split(':') - except ValueError: - # skip lines with no statement - continue - - claims.append([wh.get_pid(property.strip()), wh.get_qid(target), []]) - - qid = wh.get_qid(label) - if qid is None: - wh.add_item( - 'bootstrap', - label, - description, - aliases, - claims - ) - else: - wh.upsert_statements('bootstrap', qid, claims) diff --git a/iyp/tools/ip2plan.py b/iyp/tools/ip2plan.py deleted file mode 100644 index 52d0e79b..00000000 --- a/iyp/tools/ip2plan.py +++ /dev/null @@ -1,99 +0,0 @@ -import logging -import sys - -import radix -from SPARQLWrapper import JSON, SPARQLWrapper - -# TODO fetch PIDs with sparql, and make it a standalone script -# (no need for fancy pywikibot setup) -from iyp.wiki.wikihandy import DEFAULT_WIKI_SPARQL, Wikihandy - - -class ip2plan(object): - - def __init__(self, wikihandy=None, sparql=DEFAULT_WIKI_SPARQL): - """Fetch peering lans and their corresponding IXP from iyp. - - wikihandy: a Wikihandy instance to use. A new will be created if - this is set to None. 
- """ - - logging.info('ip2plan initialization...\n') - if wikihandy is None: - self.wh = Wikihandy() - else: - self.wh = wikihandy - - self.rtree = radix.Radix() - self.sparql = SPARQLWrapper(sparql) - - logging.info('Fetching prefix info...\n') - # Fetch prefixes - QUERY = """ - SELECT ?item ?prefix ?ix_qid ?org_qid - WHERE - { - ?item wdt:%s wd:%s. - ?item rdfs:label ?prefix. - ?item wdt:%s ?ix_qid. - ?ix_qid wdt:%s ?org_qid. - } - """ % ( - self.wh.get_pid('instance of'), - self.wh.get_qid('peering LAN'), - self.wh.get_pid('managed by'), - self.wh.get_pid('managed by'), - ) - # Query wiki - self.sparql.setQuery(QUERY) - self.sparql.setReturnFormat(JSON) - response = self.sparql.query().convert() - results = response['results'] - - # Parse results - for res in results['bindings']: - prefix_qid = res['item']['value'].rpartition('/')[2] - prefix = res['prefix']['value'] - ix_qid = res['ix_qid']['value'].rpartition('/')[2] - org_qid = res['org_qid']['value'].rpartition('/')[2] - - rnode = self.rtree.add(prefix) - rnode.data['prefix'] = prefix - rnode.data['ix_qid'] = ix_qid - rnode.data['prefix_qid'] = prefix_qid - rnode.data['org_qid'] = org_qid - - logging.info(QUERY) - logging.info(f'Found {len(self.rtree.nodes())} peering LANs') - - def lookup(self, ip): - """Lookup for the given ip address. - - Returns a dictionary with the corresponding prefix and ASN, as well as the - corresponding QIDs. - """ - try: - node = self.rtree.search_best(ip) - except ValueError: - print('Wrong IP address: %s' % ip) - return None - - if node is None: - return None - else: - return node.data - - -if __name__ == '__main__': - - if len(sys.argv) < 2: - print(f'usage: {sys.argv[0]} IP') - sys.exit() - - ip = sys.argv[1] - ia = ip2plan() - res = ia.lookup(ip) - if res is None: - print('Unknown') - else: - print(res) diff --git a/iyp/tools/revertChanges.py b/iyp/tools/revertChanges.py deleted file mode 100644 index 7e23f23d..00000000 --- a/iyp/tools/revertChanges.py +++ /dev/null @@ -1,78 +0,0 @@ -import datetime -import re -import sys - -import pywikibot -from pywikibot.data import api -from pywikibot.exceptions import Error - - -class UndoBot(object): - """Revert changes made in the last one hour.""" - - def __init__(self, lag=60): - self.lag = lag - self.site = pywikibot.Site() - self.user = self.site.username() - self.start = self.site.server_time() - self.end = self.site.server_time() - datetime.timedelta(minutes=self.lag) - - def undo_all(self): - - contribs = self.site.usercontribs(user=self.user, start=self.start, end=self.end) - for item in contribs: - result = self.revert_one_item(item) - if result: - print('{0}: {1}'.format(item['title'], result)) - else: - print('Skipped {0}'.format(item['title'])) - - def revert_one_item(self, item): - page = pywikibot.Page(self.site, item['title']) - - # Load revision history - history = list(page.revisions(total=2)) - if len(history) <= 1: - return False - rev = history[0] - - try: - # self.site.rollbackpage(page, user=self.user, markbot=True) - self.site.editpage(page, undo=rev['revid']) - except api.APIError as e: - if e.code == 'badtoken': - pywikibot.error( - 'There was an API token error rollbacking the edit') - return False - except Error as e: - print(e) - else: - return 'The edit(s) made in {} by {} was undoned.'.format( - page.title(), self.user) - - return False - - def callback(self, item): - """Sample callback function for 'private' revert bot. 
- - @param item: an item from user contributions - - @type item: dict - - @rtype: bool - """ - if 'top' in item: - page = pywikibot.Page(self.site, item['title']) - text = page.get(get_redirect=True) - pattern = re.compile(r'\[\[.+?:.+?\..+?\]\]', re.UNICODE) - return bool(pattern.search(text)) - return False - - -if __name__ == '__main__': - if len(sys.argv) < 2: - sys.exit('usage: {sys.argv[0]} last_x_minutes\nUndo changes made in the last X minutes') - - lag = int(sys.argv[1]) - ub = UndoBot(lag) - ub.undo_all() diff --git a/iyp/wiki/__init__.py b/iyp/wiki/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/iyp/wiki/decorators.py b/iyp/wiki/decorators.py deleted file mode 100644 index be2fc91f..00000000 --- a/iyp/wiki/decorators.py +++ /dev/null @@ -1,12 +0,0 @@ -from functools import wraps - - -# Decorator for making a method thread safe and fix concurrent pywikibot cache -# accesses -def thread_safe(method): - @wraps(method) - def _impl(self, *method_args, **method_kwargs): - with self.lock: - res = method(self, *method_args, **method_kwargs) - return res - return _impl diff --git a/iyp/wiki/ip2asn.py b/iyp/wiki/ip2asn.py deleted file mode 100644 index 460782fa..00000000 --- a/iyp/wiki/ip2asn.py +++ /dev/null @@ -1,97 +0,0 @@ -import logging -import sys - -import radix -from SPARQLWrapper import JSON, SPARQLWrapper - -# TODO fetch PIDs with sparql, and make it a standalone script -# (no need for fancy pywikibot setup) -from iyp.wiki.wikihandy import DEFAULT_WIKI_SPARQL, Wikihandy - - -class ip2asn(object): - - def __init__(self, wikihandy=None, sparql=DEFAULT_WIKI_SPARQL): - """Fetch routing prefixes and their origin AS from iyp. - - wikihandy: a Wikihandy instance to use. A new will be created if - this is set to None. - """ - - logging.info('ip2asn initialization...\n') - if wikihandy is None: - self.wh = Wikihandy() - else: - self.wh = wikihandy - - self.rtree = radix.Radix() - self.sparql = SPARQLWrapper(sparql) - - logging.info('Fetching prefix info...\n') - # Fetch prefixes - QUERY = """ - #Items that have a pKa value set - SELECT ?item ?prefix ?as_qid ?asn - WHERE - { - ?item wdt:%s wd:%s. - ?item rdfs:label ?prefix. - ?item wdt:%s ?as_qid. - ?as_qid wdt:%s ?asn. - } - """ % ( - self.wh.get_pid('instance of'), - self.wh.get_qid('IP routing prefix'), - self.wh.get_pid('originated by'), - self.wh.get_pid('autonomous system number'), - ) - # Query wiki - self.sparql.setQuery(QUERY) - self.sparql.setReturnFormat(JSON) - response = self.sparql.query().convert() - results = response['results'] - - # Parse results - for res in results['bindings']: - prefix_qid = res['item']['value'].rpartition('/')[2] - prefix = res['prefix']['value'] - asn = res['asn']['value'] - as_qid = res['as_qid']['value'].rpartition('/')[2] - - rnode = self.rtree.add(prefix) - rnode.data['prefix'] = prefix - rnode.data['asn'] = asn - rnode.data['prefix_qid'] = prefix_qid - rnode.data['as_qid'] = as_qid - - def lookup(self, ip): - """Lookup for the given ip address. - - Returns a dictionary with the corresponding prefix and ASN, as well as the - corresponding QIDs. 
- """ - try: - node = self.rtree.search_best(ip) - except ValueError: - print('Wrong IP address: %s' % ip) - return None - - if node is None: - return None - else: - return node.data - - -if __name__ == '__main__': - - if len(sys.argv) < 2: - print(f'usage: {sys.argv[0]} IP') - sys.exit() - - ip = sys.argv[1] - ia = ip2asn() - res = ia.lookup(ip) - if res is None: - print('Unknown') - else: - print(res) diff --git a/iyp/wiki/remove_duplicate_statements.py b/iyp/wiki/remove_duplicate_statements.py deleted file mode 100644 index 5e8b8f08..00000000 --- a/iyp/wiki/remove_duplicate_statements.py +++ /dev/null @@ -1,100 +0,0 @@ -"""Delete duplicate statements that have no reference.""" - -import logging -import sys - -from SPARQLWrapper import JSON, SPARQLWrapper - -from iyp.wiki.wikihandy import DEFAULT_WIKI_SPARQL, Wikihandy - - -class Cleaner(object): - - def __init__(self, wikihandy=None, sparql=DEFAULT_WIKI_SPARQL): - """Initialize SPARQL and wikihandy. - - wikihandy: a Wikihandy instance to use. A new will be created if - this is set to None. - """ - - logging.info('remove_duplicate_statements initialization...\n') - if wikihandy is None: - self.wh = Wikihandy(preload=False) - else: - self.wh = wikihandy - - self.sparql = SPARQLWrapper(sparql) - - def run(self): - """Find duplicate statements and remove them.""" - - for item_qid in self.all_qid(): - self.remove_duplicate(item_qid) - - def all_qid(self): - - sys.stderr.write('fetching all items QID...\n') - QUERY = """ - SELECT DISTINCT ?item - WHERE { - ?item wdt:P1 ?stmt_value. - } - """ - - # Query wiki - self.sparql.setQuery(QUERY) - self.sparql.setReturnFormat(JSON) - response = self.sparql.query().convert() - results = response['results'] - - # Parse resul - for i, res in enumerate(results['bindings']): - sys.stderr.write(f'\r{i}/{len(results["bindings"])} items') - yield res['item']['value'].rpartition('/')[2] - - def remove_duplicate(self, qid): - - # Get the ItemPage object - item = self.wh.get_item(qid=qid) - - # fetch item properties - item.get() - - for prop, claims in item.claims.items(): - - # Skip if there is only one claim - if len(claims) < 2: - continue - - claims_to_remove = [] - for i, claim in enumerate(claims): - # skip claims with references, it should be removed by the - # corresponding crawler - if len(claim.sources) > 0: - continue - - for claim2 in claims[i + 1:]: - if len(claim2.sources) > 0: - continue - - if claim2.target == claim.target: - print(f'delete for item {qid}: \t', claim2) - print('same as: \t', claim) - - claims_to_remove.append(claim2) - - # Avoid deleting more than 500 claims at once - if len(claims_to_remove) > 300: - item.removeClaims(claims_to_remove) - claims_to_remove = [] - - if len(claims_to_remove) > 0: - # Remove claims - item.removeClaims(claims_to_remove) - break - - -if __name__ == '__main__': - - clean = Cleaner() - clean.run() diff --git a/iyp/wiki/wikihandy.py b/iyp/wiki/wikihandy.py deleted file mode 100644 index d8861f6f..00000000 --- a/iyp/wiki/wikihandy.py +++ /dev/null @@ -1,1011 +0,0 @@ -import ipaddress -import logging -import sys -from collections import defaultdict -from threading import RLock - -import arrow -import iso3166 -import pywikibot -from SPARQLWrapper import JSON, SPARQLWrapper - -from iyp.wiki import decorators - -# DEFAULT_WIKI_SPARQL = 'http://localhost:8989/bigdata/namespace/wdq/sparql' -# 'https://exp1.iijlab.net/wdqs/bigdata/namespace/wdq/sparql' -# DEFAULT_WIKI_PROJECT = 'local' -DEFAULT_WIKI_SPARQL = 
'http://iyp-proxy.iijlab.net/bigdata/namespace/wdq/sparql' -DEFAULT_WIKI_PROJECT = 'iyp' -DEFAULT_LANG = 'en' -MAX_PENDING_REQUESTS = 250 -MAX_CLAIM_EDIT = 300 - -EXOTIC_CC = {'ZZ': 'unknown country', 'EU': 'Europe', 'AP': 'Asia-Pacific'} - -# TODO add method to efficiently get countries -# TODO make a generic function for all the XXX2qid() functions - - -class Wikihandy(object): - - def __init__(self, wikidata_project=DEFAULT_WIKI_PROJECT, lang=DEFAULT_LANG, - sparql=DEFAULT_WIKI_SPARQL, preload=False): - - logging.debug('Wikihandy: Enter initialization') - - # used to make pywikibot cache access thread-safe - self.lock = RLock() - - self._asn2qid = {} - self._prefix2qid = {} - self._ip2qid = {} - self._domain2qid = {} - self.repo = pywikibot.DataSite(lang, wikidata_project, - user=pywikibot.config.usernames[wikidata_project][lang]) - - self.sparql = SPARQLWrapper(sparql) - self.label_pid, self.label_qid = self.id4alllabels() - self.pending_requests = 0 - - if preload: - self.asn2qid(1) - self.prefix2qid('10.0.0.0/8') - self.domain2qid('example.com') - self.ip2qid('10.0.0.0') - - logging.debug('Wikihandy: Leave initialization') - - def login(self): - """Login to the wikibase.""" - - return self.repo.login() - - def today(self): - """Return a wikibase time object with current date.""" - - now = self.repo.server_time() - return pywikibot.WbTime(year=now.year, month=now.month, day=now.day, - calendarmodel='http://www.wikidata.org/entity/Q1985727') - - def to_wbtime(self, datetime): - """Convert a string, timestamp, or datetime object to a pywikibot WbTime - object.""" - - dt = arrow.get(datetime) - return pywikibot.WbTime(year=dt.year, month=dt.month, day=dt.day, - calendarmodel='http://www.wikidata.org/entity/Q1985727') - - @decorators.thread_safe - def get_item(self, label=None, qid=None): - """Return the first item with the given label.""" - - if qid is not None: - return pywikibot.ItemPage(self.repo, qid) - - if label is not None: - if label in self.label_qid: - qid = self.label_qid[label] - return pywikibot.ItemPage(self.repo, qid) - - return None - - # params = {'action': 'wbsearchentities', 'format': 'json', - # 'language': lang, 'type': 'item', - # 'search': label} - # request = api.Request(site=self.repo, parameters=params) - # result = request.submit() - # return result['search'] - - @decorators.thread_safe - def get_property(self, label=None, pid=None): - """Return the fisrt property with the given label.""" - - if pid is not None: - return pywikibot.PropertyPage(self.repo, pid) - - if label is not None: - if label in self.label_pid: - pid = self.label_pid[label] - return pywikibot.PropertyPage(self.repo, pid) - - return None - - # params = {'action': 'wbsearchentities', 'format': 'json', - # 'language': lang, 'type': 'property', - # 'search': label} - # request = api.Request(site=self.repo, parameters=params) - # result = request.submit() - # print(result) - # return result['search'] - - @decorators.thread_safe - def add_property(self, summary, label, description, aliases, data_type): - """Create new property if it doesn't already exists. - - Return the property PID. 
- """ - - pid = self.get_pid(label) - if pid is not None: - return pid - - new_prop = pywikibot.PropertyPage(self.repo, datatype=data_type) - data = { - 'labels': {'en': label}, - 'aliases': {'en': aliases}, - 'descriptions': {'en': description} - } - self.editEntity(new_prop, data, summary, asynchronous=False) - pid = new_prop.getID() - - # Keep it in the cache - self.label_pid[label] = pid - - return pid - - @decorators.thread_safe - def add_item(self, summary, label, description=None, aliases=None, - statements=None): - """Create new item if it doesn't already exists. - - - summary (string): a commit message - - label (string): the item name - - description (string): the item description in english - - aliases: item's aliases - - statements: list of statements for the created item - - Return the item QID - """ - - qid = self.label_qid.get(label, None) - if qid is not None: - self.upsert_statements(summary, qid, statements) - return qid - - data = {} - new_item = pywikibot.ItemPage(self.repo) - if label is not None: - data['labels'] = {'en': label.strip()} - if aliases is not None: - data['aliases'] = {'en': aliases} - if description is not None and label != description: - data['descriptions'] = {'en': description} - if statements is not None: - data['claims'] = self.upsert_statements(summary, new_item, statements, commit=False)['claims'] - - self.editEntity(new_item, data, summary, asynchronous=False) - qid = new_item.getID() - - # Keep it in the cache - self.label_qid[label] = qid - - return qid - - def __dict2quantity(self, target): - """Transform a dictionnary representing a quantity (value, unit: a QID to the - item representing the unit, lowerBound, upperBound) to a wikibase quantity - object.""" - - target_tmp = dict(target) - if 'unit' in target_tmp: - target_tmp['unit'] = self.get_item(qid=target['unit']) - target_tmp['site'] = self.repo - return pywikibot.WbQuantity(**target_tmp) - - def _update_statement_local(self, claims, target, ref_urls=None, sources=None): - """Update a statement locally (changed are not pushed to wikibase). - - If a reference URL is given, then it will update the statement that have the - same reference URL. Otherwise it will update the first statement that has no - reference URL. - """ - - ref_url_pid = self.label_pid['reference URL'] - source_pid = self.label_pid['source'] - selected_claim = None - - # search for a claim with the same reference url - if ref_urls is not None: - selected_claim = select_first_claim(claims, ref_url_pid, ref_urls) - - # search for a claim with the same source - elif sources is not None: - selected_claim = select_first_claim(claims, source_pid, sources) - - # search for the first claim without a reference url - else: - for claim in claims: - if claim.sources: - for source in claim.sources: - if ref_url_pid not in source: - selected_claim = claim - break - else: - selected_claim = claim - break - - if selected_claim is None: - # Couldn't find a matching claim - return None - - # Update statement target value - if selected_claim.type == 'wikibase-item': - target_value = self.get_item(qid=target) - if target_value.getID() != selected_claim.getTarget().id: - # selected_claim.changeTarget(target_value) # API access - selected_claim.setTarget(target_value) # no API access! - elif selected_claim.type == 'quantity': - target_value = self.__dict2quantity(target) - if target_value != selected_claim.getTarget(): - selected_claim.setTarget(target_value) # no API access! 
- else: - target_value = target - if target_value != selected_claim.getTarget(): - # selected_claim.changeTarget(target_value) # API access - selected_claim.setTarget(target_value) # no API access! - - return selected_claim - - def _update_qualifiers_local(self, qualifiers, new_qualifiers_list): - new_qualifiers = [] - for pid, value in new_qualifiers_list: - if pid in qualifiers: - self._update_statement_local(qualifiers[pid], value) - else: - new_qualifiers.append([pid, value]) - - return new_qualifiers - - def _update_references_local(self, sources, new_ref_list): - new_sources = [] - for pid, value in new_ref_list: - updated = False - for source in sources: - if pid in source: - source = self._update_statement_local(source[pid], value) - updated = True - if not updated: - new_sources.append([pid, value]) - - return new_sources - - def _insert_qualifiers_local(self, claim, new_qualifiers): - # Add new qualifiers - if 'qualifiers' not in claim and new_qualifiers: - claim['qualifiers'] = {} - - for pid, value in new_qualifiers: - qualifier = pywikibot.Claim(self.repo, pid) - target_value = value - if qualifier.type == 'wikibase-item': - target_value = self.get_item(qid=value) - elif qualifier.type == 'quantity': - target_value = self.__dict2quantity(value) - qualifier.setTarget(target_value) - claim['qualifiers'][pid] = [qualifier.toJSON()['mainsnak']] - - def _insert_references_local(self, claim, new_references): - # Add new sources - if 'references' not in claim and new_references: - claim['references'] = [] - - refs = defaultdict(list) - for pid, value in new_references: - reference = pywikibot.Claim(self.repo, pid) - target_value = value - if reference.type == 'wikibase-item': - target_value = self.get_item(qid=value) - elif reference.type == 'quantity': - target_value = self.__dict2quantity(value) - reference.setTarget(target_value) - reference.isReference = True - refs[pid].append(reference.toJSON()) - - if refs: - claim['references'].append({'snaks': refs}) - - @decorators.thread_safe - def upsert_statements(self, summary, item_id, statements, commit=True, - checkRefURL=True, checkSource=False, delete_ref_url=None, - asynchronous=True): - """Update statements that have the same reference URLs or create new statements. - All of this in only one or two API calls. - - This method finds statements based on the given item_id and reference - URLs, it updates statements with new values, and delete outdated - statements (i.e. statements with same references but not seen in the - given statements). - - The statements parameter is a list of statement where each statement is - a list in the form ['pid', 'target', 'references', 'qualifiers']. - References and qualifiers have the same format, both are a list of - pairs (PID, value (e.g QID, string, URL)). If checkRefURL is - True and an existing claim has the same 'reference URL' has the one - given in the qualifiers then this claim value and qualifiers will be - updated. If checkSource is True, it will update a claim from the same - source. Otherwise the first claim will be updated. - - Notices: - - If the property datatype is 'wikibase-item' then the target is expected - to be the item PID. For properties with a different datatype the value - of target is used as is. - - If the item has multiple times the given property only the first one - is modified. - - When the list delete_ref_url is not None, all statements with the given - URLs will also be removed. This is useful if the reference URLs has - changed. 
- """ - - all_updated_claims = [] - ref_url_pid = self.get_pid('reference URL') - source_pid = self.get_pid('source') - # Retrieve item and claims objects - item = None - if isinstance(item_id, pywikibot.ItemPage): - item = item_id - else: - item = self.get_item(qid=item_id) - - # Select claims objects - if item.getID() != '-1': - all_claims = dict(item.get()['claims']) - # find all reference URLs in given statements - all_given_ref_urls = set([ - val for statement in statements - for pid, val in unpack_statement(statement)[2] if pid == ref_url_pid - ]) - - # add outdated URLs - if delete_ref_url is not None: - all_given_ref_urls.update(delete_ref_url) - selected_claims = select_claims(all_claims, ref_url_pid, all_given_ref_urls) - - else: - all_claims = {} - selected_claims = {} - - for statement in statements: - - property_id, target, references, qualifiers = unpack_statement(statement) - - claims = selected_claims - given_ref_urls = None - if checkRefURL: - given_ref_urls = set([val for pid, val in references if pid == ref_url_pid]) - if not given_ref_urls: - given_ref_urls = None - claims = all_claims - - given_sources = None - if checkSource: - given_sources = set([val for pid, val in references if pid == source_pid]) - - # Find the matching claim or create a new one - selected_claim = None - if property_id in claims: - # update the main statement value - selected_claim = self._update_statement_local( - claims[property_id], target, given_ref_urls, - given_sources) - - if selected_claim is None: - # create a new claim - selected_claim = pywikibot.Claim(self.repo, property_id) - target_value = target - if selected_claim.type == 'wikibase-item': - target_value = self.get_item(qid=target) - elif selected_claim.type == 'quantity': - target_value = self.__dict2quantity(target) - selected_claim.setTarget(target_value) - else: - # Remove the claim from the list, so we won't select it anymore - claims[property_id].remove(selected_claim) - - # update current qualifiers and references - new_qualifiers = self._update_qualifiers_local(selected_claim.qualifiers, qualifiers) - new_references = self._update_references_local(selected_claim.sources, references) - - # add new qualifiers and references - updated_claim = selected_claim.toJSON() - - # update references - self._insert_qualifiers_local(updated_claim, new_qualifiers) - self._insert_references_local(updated_claim, new_references) - - # Add updated claims - all_updated_claims.append(updated_claim) - - # Commit changes - if commit and item is not None: - self.editEntity(item, all_updated_claims, summary, asynchronous=asynchronous) - claims_to_remove = [claim for claims_list in selected_claims.values() - for claim in claims_list] - if claims_to_remove: - if len(claims_to_remove) > MAX_CLAIM_EDIT: - # Remove in batches if there is too many to do - batch_size = MAX_CLAIM_EDIT - 1 - for i in range(0, len(claims_to_remove), batch_size): - batch = claims_to_remove[i:min(i + batch_size, len(claims_to_remove))] - item.removeClaims(batch) - else: - item.removeClaims(claims_to_remove) - else: - pass - - return {'claims': all_updated_claims} - - @decorators.thread_safe - def editEntity(self, entity, data, summary, asynchronous=True): - """Update entity in the database. - - data: should be either a dictionary that may give all informationi (e.g. - label, description, claims) or a list with only updated claims. - - Update is done in asynchronous manner if MAX_PENDING_REQUESTS permits, - use synchronous call otherwise. 
- """ - - if isinstance(data, list): - claims = data - data = {'claims': claims} - if len(claims) == 0: - # Nothing to do - return - - # API limits the number of claims to 500 - if len(claims) > MAX_CLAIM_EDIT: - batch_size = MAX_CLAIM_EDIT - 1 - self.editEntity(entity, claims[batch_size:], summary, asynchronous) - data['claims'] = claims[:batch_size] - - if asynchronous and self.pending_requests < MAX_PENDING_REQUESTS: - self.pending_requests += 1 - entity.editEntity(data, summary=summary, asynchronous=True, callback=self.on_delivery) - else: - logging.debug('Too many pending requests. Doing a synchronous request.') - entity.editEntity(data, summary=summary) - - def on_delivery(self, entity, error): - """Print errors if a commit didn't succeed.""" - - self.pending_requests -= 1 - if error is not None: - logging.error(f'!!! ERROR (on_delivery)!!!: {entity} {error}') - print('!!! ERROR (on_delivery)!!!') - print(entity) - print(error) - - @decorators.thread_safe - def add_statement(self, summary, item_id, property_id, target, qualifiers=[]): - """Create new claim, if the property datatype is 'wikibase-item' then the target - is expected to be the item PID. - - For properties with a different datatype the value of target is used as is. - Qualifiers is a list of pairs (PID, value (e.g QID, string, URL)) - """ - - item = self.get_item(qid=item_id) - claim = pywikibot.Claim(self.repo, property_id) - target_value = target - if claim.type == 'wikibase-item': - target_value = self.get_item(qid=target) - claim.setTarget(target_value) - item.addClaim(claim, summary=summary) - - for pid, value in qualifiers: - qualifier = pywikibot.Claim(self.repo, pid) - target_value = value - if qualifier.type == 'wikibase-item': - target_value = self.get_item(qid=value) - qualifier.setTarget(target_value) - claim.addQualifier(qualifier, summary=summary) - - def get_qid(self, label, create=None): - """Retrieve item id based on the given label. Returns None if the label is - unknown or create it with values passed in the create parameter. - - The create parameter is a dictionary with keys: - - summary (mandatory) - - label (optional, reusing the label parameter is not given) - - description (optional) - - aliases (optional) - - statements (optional) - """ - - qid = self.label_qid.get(label, None) - if qid is None and create is not None: - # Create the item - if 'label' not in create: - create['label'] = label - qid = self.add_item(**create) - elif qid is None: - # Try a query with this label - res = self.label2id(label, type='Q') - if res is not None and res.startswith('Q'): - self.label_qid[label] = res - qid = res - - return qid - - def get_pid(self, label): - """Retrieve property id based on the given label. - - Returns None if the label is unknown. - """ - - pid = self.label_pid.get(label, None) - if pid is None: - # Try a query with this label - res = self.label2id(label, type='P') - if res is not None and res.startswith('P'): - self.label_pid[label] = res - pid = res - - return pid - - @decorators.thread_safe - def extid2qid(self, label=None, qid=None): - """Find items that have an external ID for the given type of IDs. 
- return: dict where keys are the external ids and values are the QIDs - - warning: assumes there is only one item per external IDs - """ - - extid_qid = qid - if qid is None and label is not None: - extid_qid = self.get_qid(label) - - if extid_qid is None: - print('Error: could not find the item corresponding to this external ID') - return None - - QUERY = """ - #Items that have a pKa value set - SELECT ?item ?extid - WHERE - { - ?item p:%s ?extidStatement . - ?extidStatement ps:%s ?extid . - ?extidStatement pq:%s wd:%s . - } - """ % ( - self.get_pid('external ID'), - self.get_pid('external ID'), - self.get_pid('instance of'), - extid_qid - ) - - self.sparql.setQuery(QUERY) - self.sparql.setReturnFormat(JSON) - response = self.sparql.query().convert() - results = response['results'] - - extid2qid = {} - for res in results['bindings']: - res_qid = res['item']['value'].rpartition('/')[2] - res_extid = res['extid']['value'] - - extid2qid[res_extid] = res_qid - - return extid2qid - - @decorators.thread_safe - def asn2qid(self, asn, create=False): - """Retrive QID of items assigned with the given Autonomous System Number. - - param: asn (int) - """ - - if isinstance(asn, str) and asn.startswith('AS'): - asn = asn[2:] - - if int(asn) < 0: - print('Error: ASN value should be positive.') - return None - - if len(self._asn2qid) == 0: - logging.info('Wikihandy: downloading AS QIDs') - # Bootstrap : retrieve all existing ASN/QID pairs - QUERY = """ - #Items that have a pKa value set - SELECT ?item ?asn - WHERE - { - ?item wdt:%s wd:%s . - ?item wdt:%s ?asn . - } - """ % ( - self.get_pid('instance of'), - self.get_qid('autonomous system'), - self.get_pid('autonomous system number') - ) - - self.sparql.setQuery(QUERY) - self.sparql.setReturnFormat(JSON) - results = self.sparql.query().convert() - - for res in results['results']['bindings']: - res_qid = res['item']['value'].rpartition('/')[2] - res_asn = int(res['asn']['value']) - - self._asn2qid[res_asn] = res_qid - - logging.info(f'Wikihandy: downloaded QIDs for {len(self._asn2qid)} ASes') - - # Find the AS QID or add it to wikibase - qid = self._asn2qid.get(int(asn), None) - if create and qid is None: - # if this AS is unknown, create corresponding item - qid = self.add_item('new AS', f'AS{asn}', - statements=[ - [self.get_pid('instance of'), self.get_qid('autonomous system'), []], - [self.get_pid('autonomous system number'), str(asn), []] - ]) - - return qid - - @decorators.thread_safe - def prefix2qid(self, prefix, create=False): - """Retrive QID of items assigned with the given routing IP prefix. - - param: prefix (str) - """ - - prefix = prefix.strip() - - # TODO use a proper regex - if ('.' not in prefix and ':' not in prefix) or '/' not in prefix: - print('Error: wrong format: ', prefix) - return None - - # IP version - af = 4 - if ':' in prefix: - af = 6 - - if len(self._prefix2qid) == 0: - # Bootstrap : retrieve all existing prefix/QID pairs - - logging.info('Wikihandy: downloading prefix QIDs') - - QUERY = """ - #Items that have a pKa value set - SELECT ?item ?prefix - WHERE - { - ?item wdt:%s wd:%s. - ?item rdfs:label ?prefix. 
- } - """ % ( - self.get_pid('instance of'), - self.get_qid('IP routing prefix'), - ) - - self.sparql.setQuery(QUERY) - self.sparql.setReturnFormat(JSON) - results = self.sparql.query().convert() - - for res in results['results']['bindings']: - res_qid = res['item']['value'].rpartition('/')[2] - res_prefix = res['prefix']['value'] - - self._prefix2qid[res_prefix] = res_qid - - logging.info(f'Wikihandy: downloaded QIDs for {len(self._prefix2qid)} prefixes ') - - # Find the prefix QID or add it to wikibase - qid = self._prefix2qid.get(prefix, None) - if create and qid is None: - # if this prefix is unknown, create corresponding item - qid = self.add_item('new prefix', prefix, - statements=[ - [self.get_pid('instance of'), self.get_qid('IP routing prefix'), []], - [self.get_pid('IP version'), self.get_qid(f'IPv{af}'), []], - ],) - - return qid - - @decorators.thread_safe - def ip2qid(self, ip_address: str, create=False): - """Retrive QID of items assigned with the given IP address. - - param: ip_address (str) - - returns None if the given address is invalid or if the address is not in - IYP (and create=False). - """ - - try: - ip = ipaddress.ip_address(ip_address) - except ValueError: - return None - - if len(self._ip2qid) == 0: - # Bootstrap : retrieve all existing IP/QID pairs - - logging.info('Wikihandy: downloading IP QIDs') - - QUERY = """ - #Items that have a pKa value set - SELECT ?item ?ip - WHERE - { - ?item wdt:%s wd:%s. - ?item rdfs:label ?ip. - } - """ % ( - self.get_pid('instance of'), - self.get_qid('IP address'), - ) - - self.sparql.setQuery(QUERY) - self.sparql.setReturnFormat(JSON) - results = self.sparql.query().convert() - - for res in results['results']['bindings']: - res_qid = res['item']['value'].rpartition('/')[2] - res_ip = res['ip']['value'] - - self._ip2qid[res_ip] = res_qid - - logging.info(f'Wikihandy: downloaded QIDs for {len(self._ip2qid)} IP addresses ') - - # Find the IP QID or add it to wikibase - qid = self._ip2qid.get(ip.compressed, None) - if create and qid is None: - # if this IP is unknown, create corresponding item - qid = self.add_item('new IP address', ip.compressed, - statements=[ - [self.get_pid('instance of'), self.get_qid('IP address'), []], - [self.get_pid('IP version'), self.get_qid(f'IPv{ip.version}'), []], - ],) - - return qid - - @decorators.thread_safe - def domain2qid(self, domain, create=False): - """Retrive QID of items assigned to the given domain name. - - param: domain (str) - """ - - domain = domain.strip() - - if len(self._domain2qid) == 0: - # Bootstrap : retrieve all existing prefix/QID pairs - - logging.info('Wikihandy: downloading domain QIDs') - - QUERY = """ - #Items that have a pKa value set - SELECT ?item ?domain - WHERE - { - ?item wdt:%s wd:%s. - ?item rdfs:label ?domain. 
-            }
-            """ % (
-                self.get_pid('instance of'),
-                self.get_qid('domain name'),
-            )
-
-            self.sparql.setQuery(QUERY)
-            self.sparql.setReturnFormat(JSON)
-            results = self.sparql.query().convert()['results']
-
-            self._domain2qid = {}
-            for res in results['bindings']:
-                res_qid = res['item']['value'].rpartition('/')[2]
-                res_domain = res['domain']['value']
-
-                self._domain2qid[res_domain] = res_qid
-
-            logging.info(f'Wikihandy: downloaded QIDs for {len(self._domain2qid)} domains')
-
-        # Find the domain QID or add it to wikibase
-        qid = self._domain2qid.get(domain, None)
-        if create and qid is None:
-            # if this domain is unknown, create the corresponding item
-            qid = self.add_item('new domain', domain,
-                                statements=[
-                                    [self.get_pid('instance of'), self.get_qid('domain'), []],
-                                ])
-
-        return qid
-
-    def country2qid(self, cc, create=True):
-        """Find a country QID or add the country to wikibase if it doesn't exist and
-        create is set to True.
-
-        param: cc (string) Two-character country code.
-
-        Note: the label of a created country is the country name as defined
-        by iso3166.
-        """
-
-        # Check if country page exists
-        cc = cc.upper()
-        cc_label = 'unknown country'
-        if cc in EXOTIC_CC:
-            cc_label = EXOTIC_CC[cc]
-        elif cc in iso3166.countries_by_alpha2:
-            cc_label = iso3166.countries_by_alpha2[cc].name
-        else:
-            return None
-
-        # Create the country page if it doesn't exist
-        cc_qid = self.get_qid(cc_label)
-        if create and cc_qid is None:
-            cc_qid = self.add_item('add new country', cc_label, aliases=cc,
-                                   statements=[[self.get_pid('instance of'), self.get_qid('country'), []]])
-
-        return cc_qid
-
-    def print_all_items(self):
-        # Reduce throttling delay
-
-        QUERY = """SELECT ?item ?itemLabel
-                WHERE {
-                    ?item rdfs:label ?itemLabel.
-                } """
-
-        self.sparql.setQuery(QUERY)
-        self.sparql.setReturnFormat(JSON)
-        results = self.sparql.query().convert()['results']
-
-        for res in results['bindings']:
-            qid = res['item']['value'].rpartition('/')[2]
-            if qid.startswith('Q'):
-                item = self.get_item(qid=qid)
-                print(f'# {item}')
-                # item.delete(reason='delete all', prompt=False)
-
-    def label2id(self, label, type=None):
-        """Return the QID or PID corresponding to the given label using a SPARQL
-        query."""
-
-        QUERY = """SELECT ?item
-                WHERE {
-                    ?item rdfs:label "%s"@en.
-                } """ % label
-
-        # Fetch existing entities
-        self.sparql.setQuery(QUERY)
-        self.sparql.setReturnFormat(JSON)
-        results = self.sparql.query().convert()['results']
-
-        for res in results['bindings']:
-            id = res['item']['value'].rpartition('/')[2]
-            if type is None or id.startswith(type):
-                return id
-
-        return None
-
-    def id4alllabels(self):
-        """Return two dictionaries, one for properties and one for items, with labels
-        as keys and Q/P IDs as values. For entities that share the same label only the
-        first entity found is kept; the others are ignored.
-
-        Also ignores items that are instances of:
-          - autonomous system
-          - IP routing prefix
-          - IP address
-          - domain name
-        Use the dedicated methods for these items (e.g. asn2qid).
-        """
-
-        properties = {}
-        items = {}
-        results = []
-
-        # Query SPARQL for labels of items that are not AS, prefix, IP, or domain
-        QUERY = """SELECT ?item ?itemLabel
-            WHERE {
-                ?item rdfs:label ?itemLabel.
-                OPTIONAL{
-                    ?item wdt:%s ?instance.
-                }
-                FILTER( !bound(?instance) || ?instance NOT IN (wd:%s, wd:%s, wd:%s, wd:%s))
-                OPTIONAL{
-                    ?item wdt:%s ?extID.
-                }
-                FILTER( !bound(?extID) )
-
-            } """ % (
-            self.label2id('instance of'),
-            self.label2id('autonomous system'),
-            self.label2id('IP routing prefix'),
-            self.label2id('IP address'),
-            self.label2id('domain name'),
-            self.label2id('external ID'),
-        )
-
-        # Fetch existing entities
-        self.sparql.setQuery(QUERY)
-        self.sparql.setReturnFormat(JSON)
-        results = self.sparql.query().convert()
-
-        for res in results['results']['bindings']:
-            id = res['item']['value'].rpartition('/')[2]
-            label = res['itemLabel']['value']
-
-            # sort properties/items in dictionaries
-            if id.startswith('P'):
-                if label not in properties:
-                    properties[label] = id
-            elif id.startswith('Q'):
-                if label not in items:
-                    items[label] = id
-
-        logging.info(f'Wikihandy: loaded {len(properties)} PIDs and {len(items)} QIDs')
-
-        return properties, items
-
-# Handy functions
-
-
-def select_claims(claims_dict, ref_pid, ref_values):
-    """Select claims from claims_dict that have a reference statement ref_pid with one
-    of the values given in ref_values."""
-
-    selected_claims = defaultdict(list)
-
-    for pid, claims in claims_dict.items():
-        for claim in claims:
-            for source in claim.sources:
-                if ref_pid in source:
-                    for ref in source[ref_pid]:
-                        if ref.getTarget() in ref_values:
-                            selected_claims[pid].append(claim)
-
-    return selected_claims
-
-
-def select_first_claim(claims_list, ref_pid, ref_values):
-    """Select the first claim from claims_list that has a reference statement ref_pid
-    with one of the values given in ref_values.
-
-    Return None if no such claim is found.
-    """
-
-    for claim in claims_list:
-        for source in claim.sources:
-            if ref_pid in source:
-                for ref in source[ref_pid]:
-                    if ref.getTarget() in ref_values:
-                        return claim
-
-    return None
-
-
-def unpack_statement(statement):
-    """Return four objects (pid, target, references, qualifiers) regardless of the
-    length of the given statement."""
-
-    references = []
-    qualifiers = []
-    if len(statement) == 2:
-        property_id, target = statement
-    elif len(statement) == 3:
-        property_id, target, references = statement
-    else:
-        property_id, target, references, qualifiers = statement
-
-    return property_id, target, references, qualifiers
-
-
-if __name__ == '__main__':
-
-    FORMAT = '%(asctime)s %(processName)s %(message)s'
-    logging.basicConfig(
-        format=FORMAT,
-        filename='log/wikihandy.log',
-        level=logging.INFO,
-        datefmt='%Y-%m-%d %H:%M:%S'
-    )
-    logging.info('Started: %s' % sys.argv)
-
-    wh = Wikihandy()
-
-    import IPython
-    IPython.embed()
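
For reviewers who have not worked with the removed `iyp.wiki.wikihandy` module, the minimal sketch below shows how its lookup helpers were typically driven. It is illustrative only: it assumes the pre-removal module is still importable and that a wikibase with its SPARQL endpoint is reachable; the ASN, prefix, domain, and country code are arbitrary placeholder values.

```python
# Minimal usage sketch of the helpers deleted in this diff (assumes the
# pre-removal iyp.wiki.wikihandy module and a reachable wikibase/SPARQL
# endpoint; AS2497, 192.0.2.0/24, example.com and 'JP' are placeholders).
from iyp.wiki.wikihandy import Wikihandy

wh = Wikihandy()

# Map identifiers to wiki QIDs, creating the items when they are missing.
as_qid = wh.asn2qid(2497, create=True)                 # autonomous system
pfx_qid = wh.prefix2qid('192.0.2.0/24', create=True)   # IP routing prefix
dom_qid = wh.domain2qid('example.com', create=True)    # domain name
cc_qid = wh.country2qid('JP')                          # country (created if missing)

print(as_qid, pfx_qid, dom_qid, cc_qid)
```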