From 77ecc9accfd0cd4a3a49ba8ecf3a2b30dc48cba4 Mon Sep 17 00:00:00 2001 From: trevineju Date: Fri, 10 Jan 2025 23:30:14 -0300 Subject: [PATCH] =?UTF-8?q?Adiciona=20novo=20raspador=20para=20Florian?= =?UTF-8?q?=C3=B3polis-SC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...rianopolis.py => sc_florianopolis_2009.py} | 4 +- .../spiders/sc/sc_florianopolis_2024.py | 49 +++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) rename data_collection/gazette/spiders/sc/{sc_florianopolis.py => sc_florianopolis_2009.py} (97%) create mode 100644 data_collection/gazette/spiders/sc/sc_florianopolis_2024.py diff --git a/data_collection/gazette/spiders/sc/sc_florianopolis.py b/data_collection/gazette/spiders/sc/sc_florianopolis_2009.py similarity index 97% rename from data_collection/gazette/spiders/sc/sc_florianopolis.py rename to data_collection/gazette/spiders/sc/sc_florianopolis_2009.py index 6dd188553..2241445f2 100644 --- a/data_collection/gazette/spiders/sc/sc_florianopolis.py +++ b/data_collection/gazette/spiders/sc/sc_florianopolis_2009.py @@ -10,7 +10,7 @@ class ScFlorianopolisSpider(BaseGazetteSpider): - name = "sc_florianopolis" + name = "sc_florianopolis_2009" TERRITORY_ID = "4205407" start_date = date(2009, 6, 1) @@ -24,7 +24,7 @@ def start_requests(self): for year, month in periods_of_interest: data = dict(ano=str(year), mes=str(month), passo="1", enviar="") yield FormRequest( - "https://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial", + "https://www.pmf.sc.gov.br/governo/index.php?pagina=govdiarioantigo", formdata=data, ) diff --git a/data_collection/gazette/spiders/sc/sc_florianopolis_2024.py b/data_collection/gazette/spiders/sc/sc_florianopolis_2024.py new file mode 100644 index 000000000..6a45174d6 --- /dev/null +++ b/data_collection/gazette/spiders/sc/sc_florianopolis_2024.py @@ -0,0 +1,49 @@ +from datetime import date, datetime + +from scrapy import FormRequest + +from gazette.items 
import Gazette
from gazette.spiders.base import BaseGazetteSpider


class ScFlorianopolisSpider(BaseGazetteSpider):
    """Collect Florianópolis-SC gazettes from the edicao.dom.sc.gov.br listing.

    The listing is paginated; pages are requested one at a time and each
    table row is turned into a ``Gazette`` item when its date lies inside
    the ``start_date``..``end_date`` window.

    NOTE(review): ``end_date`` is not defined in this class; it is presumably
    provided by ``BaseGazetteSpider`` -- confirm.
    """

    name = "sc_florianopolis_2024"
    TERRITORY_ID = "4205407"
    start_date = date(2024, 8, 5)
    allowed_domains = ["edicao.dom.sc.gov.br"]

    def _requests(self, page):
        """Build the request for one page of the edition listing.

        Args:
            page: 1-based page number, sent as the ``Edicao_page`` parameter.

        Returns:
            A single ``FormRequest`` whose callback is ``parse_pagination``;
            ``page`` is forwarded to the callback through ``cb_kwargs``.
        """
        formdata = {
            # Presumably the site's internal code for Florianópolis -- confirm.
            "Edicao[cod_municipio]": "146",
            "Edicao_page": str(page),
            "r": "site/edicoes",
        }
        # With method="GET", FormRequest encodes ``formdata`` into the URL
        # query string instead of the request body.
        return FormRequest(
            url="https://edicao.dom.sc.gov.br/?",
            method="GET",
            formdata=formdata,
            callback=self.parse_pagination,
            cb_kwargs={"page": page},
        )

    def start_requests(self):
        """Start the crawl at the first listing page."""
        yield self._requests(1)

    def parse_pagination(self, response, page):
        """Yield the gazettes of one listing page, then request the next page.

        For every ``tbody tr`` row, the second text cell is read as the
        edition number, the third text cell as the date (``dd/mm/YYYY``) and
        the second link as the file URL.

        Args:
            response: Response for the listing page.
            page: Number of the page that produced ``response``.

        Yields:
            ``Gazette`` items for rows dated within ``start_date`` and
            ``end_date``, followed by a request for ``page + 1`` while the
            last parsed row is still newer than ``start_date``.
        """
        # Date of the last row parsed successfully. It stays ``None`` when the
        # page has no usable rows, in which case pagination stops instead of
        # failing on an unbound variable.
        last_date = None

        for row in response.css("tbody tr"):
            cell_texts = row.css("td::text").getall()
            links = row.css("td a")

            # Rows without the expected cells (e.g. a placeholder row on an
            # empty page) are skipped rather than aborting the whole page.
            if len(cell_texts) < 3 or len(links) < 2:
                continue

            href = links[1].attrib.get("href")
            if not href:
                continue

            raw_date = cell_texts[2].strip()
            try:
                edition_date = datetime.strptime(raw_date, "%d/%m/%Y").date()
            except ValueError:
                self.logger.warning(
                    "Skipping row with unparseable date %r on page %s",
                    raw_date,
                    page,
                )
                continue

            last_date = edition_date

            if self.start_date <= edition_date <= self.end_date:
                yield Gazette(
                    date=edition_date,
                    edition_number=cell_texts[1].strip(),
                    is_extra_edition=False,
                    power="executive_legislative",
                    # urljoin leaves absolute URLs untouched and resolves
                    # relative ones against the page URL.
                    file_urls=[response.urljoin(href)],
                )

        # NOTE(review): this stop condition only works if the listing is
        # ordered newest first -- confirm against the site.
        if last_date is not None and last_date > self.start_date:
            yield self._requests(page + 1)