Skip to content

Commit

Permalink
Adiciona novo raspador para Florianópolis-SC e atualiza antigo (#1349)
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju authored Jan 11, 2025
2 parents 1d5a9a4 + 77ecc9a commit c21f7b6
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class ScFlorianopolisSpider(BaseGazetteSpider):
name = "sc_florianopolis"
name = "sc_florianopolis_2009"
TERRITORY_ID = "4205407"
start_date = date(2009, 6, 1)

Expand All @@ -24,7 +24,7 @@ def start_requests(self):
for year, month in periods_of_interest:
data = dict(ano=str(year), mes=str(month), passo="1", enviar="")
yield FormRequest(
"https://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial",
"https://www.pmf.sc.gov.br/governo/index.php?pagina=govdiarioantigo",
formdata=data,
)

Expand Down
49 changes: 49 additions & 0 deletions data_collection/gazette/spiders/sc/sc_florianopolis_2024.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from datetime import date, datetime

from scrapy import FormRequest

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class ScFlorianopolisSpider(BaseGazetteSpider):
    """Spider for the Florianópolis-SC gazette listing on edicao.dom.sc.gov.br.

    Crawls the paginated edition table one page at a time, starting at
    page 1, and yields a ``Gazette`` for every row whose date lies within
    ``start_date``..``end_date`` (both inclusive).
    """

    name = "sc_florianopolis_2024"
    TERRITORY_ID = "4205407"
    start_date = date(2024, 8, 5)
    allowed_domains = ["edicao.dom.sc.gov.br"]

    def _requests(self, page):
        """Build the request for one page of the edition listing.

        Args:
            page: Page number to fetch; sent as ``Edicao_page`` and forwarded
                to the callback through ``cb_kwargs``.

        Returns:
            A GET ``FormRequest`` whose callback is ``parse_pagination``.
        """
        formdata = {
            # NOTE(review): "146" is presumably the site's municipality code
            # for Florianópolis -- confirm against the portal.
            "Edicao[cod_municipio]": "146",
            "Edicao_page": str(page),
            "r": "site/edicoes",
        }
        return FormRequest(
            url="https://edicao.dom.sc.gov.br/?",
            method="GET",
            formdata=formdata,
            callback=self.parse_pagination,
            cb_kwargs={"page": page},
        )

    def start_requests(self):
        """Start the crawl at the first page of the listing."""
        yield self._requests(1)

    def parse_pagination(self, response, page):
        """Yield the gazettes listed on one page and follow to the next one.

        Args:
            response: Response holding the edition table (``tbody tr`` rows).
            page: Number of the page this response corresponds to.

        Yields:
            ``Gazette`` items for rows dated within ``start_date`` and
            ``end_date``, then a request for ``page + 1`` while the last row
            on this page is still newer than ``start_date``.
        """
        # Stays None when the table has no rows, so the pagination check
        # below does not hit an unbound local.
        edition_date = None

        for item in response.css("tbody tr"):
            edition_number = item.css("td::text")[1].get()
            edition_url = item.css("td a")[1].attrib["href"]
            raw_date = item.css("td::text")[2].get()
            edition_date = datetime.strptime(raw_date, "%d/%m/%Y").date()

            # NOTE(review): end_date is not defined in this class; presumably
            # it is supplied by BaseGazetteSpider -- confirm.
            if self.start_date <= edition_date <= self.end_date:
                yield Gazette(
                    date=edition_date,
                    edition_number=edition_number,
                    is_extra_edition=False,
                    power="executive_legislative",
                    file_urls=[edition_url],
                )

        # Only the date of the last row decides whether to continue.
        # NOTE(review): this presumably relies on the listing being ordered
        # newest-first -- confirm against the site.
        if edition_date is not None and edition_date > self.start_date:
            yield self._requests(page + 1)

0 comments on commit c21f7b6

Please sign in to comment.