Skip to content

Commit

Permalink
sistema replicavel administracaopublica #1247
Browse files Browse the repository at this point in the history
  • Loading branch information
almeidadm committed Oct 4, 2024
1 parent e250421 commit a40a109
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 5 deletions.
57 changes: 57 additions & 0 deletions data_collection/gazette/spiders/base/administracaopublica.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import re
from datetime import datetime
from typing import Any

from dateutil.rrule import DAILY, rrule
from scrapy import Request
from scrapy.http import Response

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseAdministracaoPublicaSpider(BaseGazetteSpider):
"""
Base spider for cities using the https://administracaopublica.com.br/diario-oficial?token= plataform.
Gazzetes are also avaiable in http://www.transparenciadministrativa.com.br/diario/diariov2.xhtml?token=.
"""

token: str
allowed_domains = ["administracaopublica.com.br"]

def start_requests(self):
dates = list(
rrule(freq=DAILY, interval=20, dtstart=self.start_date, until=self.end_date)
)
dates.append(self.end_date)

for i in range(len(dates) - 1):
de = dates[i].strftime("%Y-%m-%d")
ate = dates[i + 1].strftime("%Y-%m-%d")
yield Request(
f"https://administracaopublica.com.br/diario-oficial?token={self.token}&de={de}&ate={ate}"
)

def parse(self, response: Response, **kwargs: Any) -> Any:
gazettes = response.css(".diario_item_diario__g9Qfw")
for gazzete in gazettes:
href = gazzete.css('[class*="generics_button_baixar__"]::attr(href)').get()
if href is None:
continue
pattern = gazzete.css("::text").extract()
match pattern:
case [edition, power, date, _]:
pass
case [edition, date, _]:
power = ""
power_dict = {
"EXECUTIVO": "executive",
"LEGISLATIVO": "legislative",
}
yield Gazette(
edition_number=re.findall(r"\s*(\d+\/\d+)\s*", edition),
date=datetime.strptime(date, "%d/%m/%Y").date(),
file_urls=[f"https://administracaopublica.com.br{href}"],
is_extra_edition=power == "EXTRA",
power=power_dict.get(power, "executive_legislative"),
)
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/ma/ma_nova_iorque.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import datetime as dt

from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider


class MaNovaIorqueSpider(BaseAdministracaoPublicaSpider):
    """Spider ``ma_nova_iorque`` (territory 2107308) on the shared base spider."""

    name = "ma_nova_iorque"
    TERRITORY_ID = "2107308"
    # Value sent as the ``token`` query parameter by the base spider.
    token = "4f1cf16edf5d73feaad4fec2a03c7c9e1cf536aa"
    start_date = dt.date(2017, 2, 15)
9 changes: 4 additions & 5 deletions data_collection/gazette/spiders/ma/ma_peritoro.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import datetime as dt

from gazette.spiders.base.aplus import BaseAplusSpider
from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider


class MaPeritoroSpider(BaseAdministracaoPublicaSpider):
    """Spider ``ma_peritoro`` (territory 2108454) on the shared base spider.

    Only the post-migration configuration is kept: the duplicate class header
    and the old ``start_date``, ``allowed_domains`` and ``url_base`` values
    targeted the previous site and would conflict with the base spider.
    """

    TERRITORY_ID = "2108454"
    name = "ma_peritoro"
    start_date = dt.date(2017, 1, 2)
    # Value sent as the ``token`` query parameter by the base spider.
    token = "9de645b503b922df799865ffcb07a6ec7b9cb53e"

0 comments on commit a40a109

Please sign in to comment.