Skip to content

Commit

Permalink
Atualiza raspadores para Maranhãozinho e Centro do Guilherme-MA
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju committed Jan 11, 2025
1 parent c21f7b6 commit c612b3e
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 16 deletions.
55 changes: 55 additions & 0 deletions data_collection/gazette/spiders/base/aratext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import re
from urllib.parse import urlparse

import dateparser
from scrapy import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseAratextSpider(BaseGazetteSpider):
    """Base spider for gazette sites that list editions in an
    ``#edicoes-anteriores`` table with ``?page=N`` pagination.

    Subclasses are expected to define ``start_urls`` (only the first entry is
    used, as the listing page and as the origin for building absolute URLs)
    and ``power``. ``start_date`` and ``end_date`` are read from the instance.
    """

    def parse(self, response, page=1):
        """Yield one request per edition row whose date is inside the
        configured window, then follow the next listing page if needed.

        Args:
            response: listing page response.
            page: 1-based number of the listing page being parsed.

        Yields:
            ``Request`` objects for each edition's intermediary page and,
            when applicable, for the next listing page.
        """
        # Stays None when the table has no rows, so the pagination check below
        # can bail out instead of reading an unbound local.
        edition_date = None

        for item in response.css("#edicoes-anteriores tbody tr"):
            # Third cell holds text split on ","; the part after the first
            # comma is parsed as a Portuguese date.
            raw_edition_date = (
                item.css("td")[2].css("::text").get().split(",")[1].strip()
            )
            edition_date = dateparser.parse(raw_edition_date, languages=["pt"]).date()

            # Link text carries the edition number as the digits before a "/".
            raw_edition_number = item.css("a::text").get().strip()
            edition_number = re.search(r"(\d+)/", raw_edition_number).group(1)

            intermediary_page = self._build_url(item.css("a").attrib["href"])

            if self.start_date <= edition_date <= self.end_date:
                gazette = {
                    "date": edition_date,
                    "edition_number": edition_number,
                    "is_extra_edition": False,
                    "power": self.power,
                }

                yield Request(
                    intermediary_page,
                    callback=self.parse_intermediary_page,
                    cb_kwargs={"gazette": gazette},
                )

        if edition_date is None:
            # Empty listing page: nothing to compare against, stop paginating.
            return

        has_next_page = bool(
            response.xpath('//*[@class="pagination"]//*[@rel="next"]')
        )

        # Keep paginating only while the last row seen is still newer than
        # start_date (presumably the listing is sorted newest-first -- confirm).
        if edition_date > self.start_date and has_next_page:
            page += 1
            yield Request(
                f"{self.start_urls[0]}?page={page}",
                callback=self.parse,
                cb_kwargs={"page": page},
            )

    def parse_intermediary_page(self, response, gazette):
        """Read the file link from an edition's intermediary page and yield
        the final ``Gazette`` item.

        Args:
            response: intermediary page response.
            gazette: dict of Gazette fields collected from the listing row.
        """
        file_path = response.css("#Box-area-title a").attrib["href"]

        yield Gazette(**gazette, file_urls=[self._build_url(file_path)])

    def _build_url(self, path):
        """Return ``start_urls[0]`` with its path component replaced by *path*."""
        return urlparse(self.start_urls[0])._replace(path=path).geturl()
13 changes: 6 additions & 7 deletions data_collection/gazette/spiders/ma/ma_centro_do_guilherme.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import datetime

from gazette.spiders.base.siganet import BaseSiganetSpider
from gazette.spiders.base.aratext import BaseAratextSpider


# NOTE(review): this span is a rendered diff without +/- markers, so lines of
# the previous Siganet-based spider and of the new Aratext-based spider are
# interleaved. Presumably the BaseSiganetSpider header,
# zyte_smartproxy_enabled, the 2021 start_date, the "transparencia." domain
# and BASE_URL are the removed lines -- confirm against the repository.
class MaCentroDoGuilhermeSpider(BaseSiganetSpider):
zyte_smartproxy_enabled = True

# Spider for Centro do Guilherme-MA built on BaseAratextSpider.
class MaCentroDoGuilhermeSpider(BaseAratextSpider):
TERRITORY_ID = "2103158"
name = "ma_centro_do_guilherme"
start_date = datetime.date(2021, 3, 12)
allowed_domains = ["transparencia.centrodoguilherme.ma.gov.br"]
BASE_URL = "https://transparencia.centrodoguilherme.ma.gov.br/acessoInformacao/diario/diario"
# The assignments below override the earlier start_date / allowed_domains
# when this text is read top to bottom.
start_date = datetime.date(2024, 1, 4)
power = "executive"
allowed_domains = ["centrodoguilherme.ma.gov.br"]
# BaseAratextSpider uses start_urls[0] as the listing page and URL origin.
start_urls = ["https://centrodoguilherme.ma.gov.br/diariooficial"]
15 changes: 6 additions & 9 deletions data_collection/gazette/spiders/ma/ma_maranhaozinho.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import datetime

from gazette.spiders.base.siganet import BaseSiganetSpider
from gazette.spiders.base.aratext import BaseAratextSpider


# NOTE(review): this span is a rendered diff without +/- markers, so lines of
# the previous Siganet-based spider and of the new Aratext-based spider are
# interleaved. Presumably the BaseSiganetSpider header,
# zyte_smartproxy_enabled, the 2021 start_date, the "transparencia." domain
# and BASE_URL are the removed lines -- confirm against the repository.
class MaMaranhaozinhoSpider(BaseSiganetSpider):
zyte_smartproxy_enabled = True

# Spider for Maranhãozinho-MA built on BaseAratextSpider.
class MaMaranhaozinhoSpider(BaseAratextSpider):
TERRITORY_ID = "2106375"
name = "ma_maranhaozinho"
start_date = datetime.date(2021, 1, 26)
allowed_domains = ["transparencia.maranhaozinho.ma.gov.br"]
BASE_URL = (
"https://transparencia.maranhaozinho.ma.gov.br/acessoInformacao/diario/diario"
)
# The assignments below override the earlier start_date / allowed_domains
# when this text is read top to bottom.
start_date = datetime.date(2024, 1, 2)
power = "executive"
allowed_domains = ["maranhaozinho.ma.gov.br"]
# BaseAratextSpider uses start_urls[0] as the listing page and URL origin.
start_urls = ["https://maranhaozinho.ma.gov.br/diariooficial"]

0 comments on commit c612b3e

Please sign in to comment.