From 551e56f8f66c1df7de5e35e58be0a3ff798767de Mon Sep 17 00:00:00 2001 From: Malte Tashiro Date: Wed, 11 Dec 2024 04:46:14 +0000 Subject: [PATCH] Misc --- iyp/crawlers/google/crux_top1m_country.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/iyp/crawlers/google/crux_top1m_country.py b/iyp/crawlers/google/crux_top1m_country.py index 323b82e..07a45f6 100644 --- a/iyp/crawlers/google/crux_top1m_country.py +++ b/iyp/crawlers/google/crux_top1m_country.py @@ -10,7 +10,7 @@ from iyp import BaseCrawler -# Data source Google (archived on github by IHR) +# Data source Google (archived on GitHub by IHR) ORG = 'Google' URL = 'https://github.com/InternetHealthReport/crux-top-lists-country/raw/refs/heads/main/data/country' NAME = 'google.crux_top1m_country' @@ -35,11 +35,13 @@ def run(self): country_links = list() rank_links = list() + + end = arrow.utcnow().replace(day=1, hour=0, minute=0, second=0, microsecond=0) + start = end.shift(months=-3) + for country in iso3166.countries: country_code = country.alpha2 - end = arrow.utcnow().replace(day=1, hour=0, minute=0, second=0, microsecond=0) - start = end.shift(months=-3) df = None for date in reversed(list(arrow.Arrow.range('month', start, end))): @@ -66,7 +68,6 @@ def run(self): ranking_name = f'CrUX top 1M ({country_code})' - # Create/fetch corresponding nodes in IYP hostnames.update(df['hostname'].unique()) rankings.add(ranking_name) countries.add(country_code) @@ -89,10 +90,12 @@ def run(self): ] }) + # Create/fetch corresponding nodes in IYP ranking_id = self.iyp.batch_get_nodes_by_single_prop('Ranking', 'name', rankings, all=False) hostname_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', hostnames, all=False) country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries, all=False) + # Replace link ends with QIDs for link in country_links: link['src_id'] = ranking_id[link['src_id']] link['dst_id'] = country_id[link['dst_id']]