From dc71dae68899292e581ea3c03eda64915be693b7 Mon Sep 17 00:00:00 2001 From: JustinLoye Date: Mon, 4 Mar 2024 08:36:19 +0000 Subject: [PATCH 1/3] add country population crawler country population crawl of worldbank, issue #129 --- iyp/crawlers/worldbank/README.md | 16 +++++ iyp/crawlers/worldbank/country_pop.py | 98 +++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 iyp/crawlers/worldbank/README.md create mode 100644 iyp/crawlers/worldbank/country_pop.py diff --git a/iyp/crawlers/worldbank/README.md b/iyp/crawlers/worldbank/README.md new file mode 100644 index 00000000..d518214f --- /dev/null +++ b/iyp/crawlers/worldbank/README.md @@ -0,0 +1,16 @@ +# World Bank's country population -- https://www.worldbank.org/en/home + +> The World Bank Group works in every major area of development. We provide a wide array of financial products and technical assistance, and we help countries share and apply innovative knowledge and solutions to the challenges they face. + +> The World Bank is like a cooperative, made up of 189 member countries. These member countries, or shareholders, are represented by a Board of Governors, who are the ultimate policymakers at the World Bank. Generally, the governors are member countries' ministers of finance or ministers of development. They meet once a year at the Annual Meetings of the Boards of Governors of the World Bank Group and the International Monetary Fund. + +## Graph representation + +### Country Estimate +Connect `Country` to an `Estimate` node meaning that a country has an estimated population of `value`. +``` +(:Country)-[:POPULATION {value: 123}]->(:Estimate {name: 'World Bank Population Estimate'}) +``` + +## Dependence +This crawler depends on crawlers setting the country codes. 
\ No newline at end of file diff --git a/iyp/crawlers/worldbank/country_pop.py b/iyp/crawlers/worldbank/country_pop.py new file mode 100644 index 00000000..3b0273df --- /dev/null +++ b/iyp/crawlers/worldbank/country_pop.py @@ -0,0 +1,98 @@ +import argparse +import json +import logging +import os +import sys +from datetime import datetime, timezone + +import requests + +from iyp import BaseCrawler, RequestStatusError + +URL = 'http://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?per_page=400&mrv=1&format=json' +ORG = 'WorldBank' +NAME = 'worldbank.country_pop' + + +class Crawler(BaseCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name) + self.reference['reference_url_info'] = ( + 'https://datahelpdesk.worldbank.org/knowledgebase/articles/' + '889392-about-the-indicators-api-documentation' + ) + + def run(self): + """Get country population from Worldbank API and push it to IYP.""" + + # Get content + req = requests.get(URL) + if req.status_code != 200: + raise RequestStatusError('Error while fetching country population') + content = json.loads(req.content) + + # Set last time of modification + self.reference['reference_time_modification'] = datetime.strptime(content[0]['lastupdated'], + '%Y-%m-%d').replace(tzinfo=timezone.utc) + + # Get countries present in IYP cc to id mapping + country_ids = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', create=False, all=True) + + # Get countries and population from World Bank + lines = set() + for entry in json.loads(req.content)[1]: + + if entry['indicator']['id'] != 'SP.POP.TOTL': + continue + + country = entry['country']['id'] + if country not in country_ids: + continue + + population = int(entry['value']) + lines.add((country, population)) + + # Get `Estimate` node ID + estimate_qid = self.iyp.get_node('Estimate', properties={'name': 'World Bank Population Estimate'}) + + # Compute links + links = [] + for (country, population) in lines: + + 
country_qid = country_ids[country] + + links.append({'src_id': country_qid, 'dst_id': estimate_qid, + 'props': [self.reference, {'value': population}]}) + + # Push all links to IYP + self.iyp.batch_add_links('POPULATION', links) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--unit-test', action='store_true') + args = parser.parse_args() + + scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3] + FORMAT = '%(asctime)s %(levelname)s %(message)s' + logging.basicConfig( + format=FORMAT, + filename='log/' + scriptname + '.log', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S' + ) + + logging.info(f'Started: {sys.argv}') + + crawler = Crawler(ORG, URL, NAME) + if args.unit_test: + crawler.unit_test(logging) + else: + crawler.run() + crawler.close() + logging.info(f'Finished: {sys.argv}') + + +if __name__ == '__main__': + main() + sys.exit(0) From 5f02dbd07bbfeee19a661558446f72ae1ab437a2 Mon Sep 17 00:00:00 2001 From: JustinLoye Date: Mon, 4 Mar 2024 08:58:53 +0000 Subject: [PATCH 2/3] add World Bank to acknowledgements --- ACKNOWLEDGMENTS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ACKNOWLEDGMENTS.md b/ACKNOWLEDGMENTS.md index 5dcd6ccc..193541d3 100644 --- a/ACKNOWLEDGMENTS.md +++ b/ACKNOWLEDGMENTS.md @@ -192,3 +192,9 @@ NetSecLab group at Virginia Tech. > Weitong Li, Zhexiao Lin, Md. Ishtiaq Ashiq, Emile Aben, Romain Fontugne, > Amreesh Phokeer, and Taejoong Chung. > ACM Internet Measurement Conference (IMC), October 2023. + +## World Bank +We use the country population indicator `SP.POP.TOTL` from the +[Indicators API](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation) +dataset provided by the +[World Bank](https://www.worldbank.org/en/home). 
\ No newline at end of file From 89761634d3894deaef45ad7c63364f41f0964cfd Mon Sep 17 00:00:00 2001 From: JustinLoye Date: Mon, 11 Mar 2024 06:48:14 +0000 Subject: [PATCH 3/3] update config.json.example and nitpicks --- config.json.example | 1 + iyp/crawlers/worldbank/README.md | 2 +- iyp/crawlers/worldbank/country_pop.py | 5 +---- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/config.json.example b/config.json.example index 7ba1a0d9..ee12e2c2 100644 --- a/config.json.example +++ b/config.json.example @@ -83,6 +83,7 @@ "iyp.crawlers.alice_lg.linx", "iyp.crawlers.alice_lg.megaport", "iyp.crawlers.alice_lg.netnod", + "iyp.crawlers.worldbank.country_pop", "iyp.crawlers.simulamet.rirdata_rdns", "iyp.crawlers.openintel.dnsgraph_nl", "iyp.crawlers.openintel.dnsgraph_rdns", diff --git a/iyp/crawlers/worldbank/README.md b/iyp/crawlers/worldbank/README.md index d518214f..1ab7160c 100644 --- a/iyp/crawlers/worldbank/README.md +++ b/iyp/crawlers/worldbank/README.md @@ -13,4 +13,4 @@ Connect `Country` to an `Estimate` node meaning that a country has an estimated ``` ## Dependence -This crawler depends on crawlers setting the country codes. \ No newline at end of file +This crawler depends on crawlers creating Country nodes. \ No newline at end of file diff --git a/iyp/crawlers/worldbank/country_pop.py b/iyp/crawlers/worldbank/country_pop.py index 3b0273df..ea19f0c3 100644 --- a/iyp/crawlers/worldbank/country_pop.py +++ b/iyp/crawlers/worldbank/country_pop.py @@ -40,10 +40,7 @@ def run(self): # Get countries and population from World Bank lines = set() - for entry in json.loads(req.content)[1]: - - if entry['indicator']['id'] != 'SP.POP.TOTL': - continue + for entry in content[1]: country = entry['country']['id'] if country not in country_ids: