From 3f1beedb8c1e843b221c59dc73f0b7900197a72a Mon Sep 17 00:00:00 2001
From: Renne Rocha
Date: Wed, 14 Dec 2022 20:32:12 -0300
Subject: [PATCH] Use real User Agent in project requests

---
 data_collection/gazette/settings.py               | 3 +++
 data_collection/gazette/spiders/rj_nova_iguacu.py | 4 ----
 data_collection/gazette/spiders/rn_mossoro.py     | 4 ----
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/data_collection/gazette/settings.py b/data_collection/gazette/settings.py
index 5eceadd4c..87a38b5ab 100644
--- a/data_collection/gazette/settings.py
+++ b/data_collection/gazette/settings.py
@@ -12,6 +12,9 @@
     "spidermon.contrib.scrapy.pipelines.ItemValidationPipeline": 400,
     "gazette.pipelines.SQLDatabasePipeline": 500,
 }
+USER_AGENT = (
+    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0"
+)
 
 DOWNLOAD_TIMEOUT = 360
 
diff --git a/data_collection/gazette/spiders/rj_nova_iguacu.py b/data_collection/gazette/spiders/rj_nova_iguacu.py
index 79a2d2921..c852fbb9e 100755
--- a/data_collection/gazette/spiders/rj_nova_iguacu.py
+++ b/data_collection/gazette/spiders/rj_nova_iguacu.py
@@ -14,10 +14,6 @@ class RjNovaIguacu(BaseGazetteSpider):
     start_date = dt.date(2014, 1, 6)
     BASE_URL = "https://www.novaiguacu.rj.gov.br/diario-oficial/"
 
-    custom_settings = {
-        "USER_AGENT": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0",
-    }
-
     def start_requests(self):
         for date in rrule(DAILY, dtstart=self.start_date, until=self.end_date):
             yield scrapy.Request(
diff --git a/data_collection/gazette/spiders/rn_mossoro.py b/data_collection/gazette/spiders/rn_mossoro.py
index 9ec01336d..b02c98b2e 100644
--- a/data_collection/gazette/spiders/rn_mossoro.py
+++ b/data_collection/gazette/spiders/rn_mossoro.py
@@ -14,10 +14,6 @@ class RnMossoroSpider(BaseGazetteSpider):
     allowed_domains = ["jom.prefeiturademossoro.com.br"]
     start_date = dt.date(2008, 1, 1)
 
-    custom_settings = {
-        "USER_AGENT": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0",
-    }
-
     def start_requests(self):
         # avoid skipping months if day of start_date is at the end of the month
         first_day_of_start_date_month = dt.date(