from collections.abc import Generator, Iterator
from datetime import date, datetime
from typing import Optional

from parsel import Selector
from scrapy.http import FormRequest, Request, TextResponse

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider

BASE_URL: str = "https://diariooficial.vitoria.es.gov.br/"


class EsVitoriaSpider(BaseGazetteSpider):
    """Scraper for the official gazette of Vitória-ES.

    The city portal is an ASP.NET page with year/month dropdowns and a
    paginated grid of PDF links; navigation happens through postbacks
    (``FormRequest.from_response``). Each name-year-month combination is
    given its own cookiejar so concurrent postback state (which the
    server keeps in the session) does not interfere across months.
    """

    name: str = "es_vitoria"
    TERRITORY_ID: str = "3205309"
    start_date: date = date(2014, 7, 21)

    allowed_domains: list[str] = ["diariooficial.vitoria.es.gov.br"]

    # When there are too many requests, the server may return
    # an HTTP 406 status code when trying to download a PDF file
    #
    # We set `custom_settings` to avoid triggering the 406 HTTP status code
    # by spreading the downloads for this spider over time
    custom_settings: dict = {
        "DOWNLOAD_DELAY": 0.1,  # 100 ms
        "RANDOMIZE_DOWNLOAD_DELAY": True,
        "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406],
    }

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # Accumulates, per (year, month), the gazette URLs found for each
        # date across the month's result pages. Gazette items are only
        # emitted once the last page of a month has been parsed, so that
        # editions split across multiple files become a single item.
        self.data_by_monthly_date_by_date: dict[
            tuple[int, int],  # (year, month)
            dict[date, list[str]],  # gazette_date -> ordered list of URLs
        ] = {}

    def start_requests(self) -> Generator:
        """Open the portal landing page to discover the available years."""
        url: str = BASE_URL

        today: date = date.today()
        year: int = today.year
        month: int = today.month

        yield Request(
            url=url,
            callback=self.initial_parse,
            meta={"cookiejar": f"{self.name}_{year}_{month}"},
        )

    def initial_parse(self, response: TextResponse) -> Generator:
        """Read the year dropdown and fan out one postback per year.

        The year currently selected on the page is parsed directly
        (no extra request needed); every other year within the
        start/end date range triggers an ASP.NET postback.
        """
        year_select: Selector = response.xpath("//select[contains(@id, 'ddlAno')]")
        year_formkey: str = year_select.attrib["name"]
        # NOTE: `map` is not subscriptable, so the previous `map[int]`
        # annotation was invalid; an iterator of ints is what this is.
        years_available: Iterator[int] = map(
            int, year_select.xpath("./option/@value").getall()
        )
        chosen_year: int = int(
            year_select.xpath("./option[contains(@selected, 'selected')]/@value").get()
        )

        year: int
        for year in years_available:
            # Skip years entirely outside the requested date window
            if year < self.start_date.year or self.end_date.year < year:
                continue

            if year == chosen_year:
                # Already on this year's page; parse it without a request
                yield from self.parse_year(response, year)
                continue

            yield FormRequest.from_response(
                response,
                formdata={year_formkey: str(year)},
                callback=self.parse_year,
                cb_kwargs={"year": year},
                # We are isolating cookiejar per name-year-month combination
                # to avoid interference between concurrent requests
                # Whenever we request a past year, it sets the month to December
                meta={"cookiejar": f"{self.name}_{year}_12"},
            )

    def parse_year(self, response: TextResponse, year: int) -> Generator:
        """For a given year's page, fan out one postback per month."""
        year_select: Selector = response.xpath("//select[contains(@id, 'ddlAno')]")
        year_formkey: str = year_select.attrib["name"]

        month_select: Selector = response.xpath("//select[contains(@id, 'ddlMes')]")
        month_formkey: str = month_select.attrib["name"]

        chosen_month: int = int(
            month_select.xpath("./option[contains(@selected, 'selected')]/@value").get()
        )

        # Compare at month granularity: any month that overlaps the
        # [start_date, end_date] window must be visited.
        first_day_of_start_date_month: date = date(
            self.start_date.year, self.start_date.month, 1
        )

        month: int
        for month in range(1, 13):
            first_day_of_month: date = date(year, month, 1)
            if (
                first_day_of_month < first_day_of_start_date_month
                or self.end_date < first_day_of_month
            ):
                continue

            current_year_month: tuple[int, int] = (year, month)

            if month == chosen_month:
                # Already on this month's page; parse it without a request
                yield from self.parse_editions_list(response, current_year_month)
                continue

            formdata: dict[str, str] = {
                "__EVENTTARGET": month_formkey,
                "__EVENTARGUMENT": "",
                year_formkey: str(year),
                month_formkey: str(month),
            }
            yield FormRequest.from_response(
                response,
                formdata=formdata,
                callback=self.parse_editions_list,
                cb_kwargs={
                    "current_year_month": current_year_month,
                },
                # We are isolating cookiejar per name-year-month combination
                # to avoid interference between concurrent requests
                meta={"cookiejar": f"{self.name}_{year}_{month}"},
            )

    def parse_editions_list(
        self,
        response: TextResponse,
        current_year_month: tuple[int, int],
        current_page: int = 1,
    ) -> Generator:
        """Collect PDF links from one result page of a month.

        Follows pagination postbacks until the last page, then emits one
        Gazette item per date, carrying all file URLs gathered for that
        date across the month's pages.
        """
        year_select: Selector = response.xpath("//select[contains(@id, 'ddlAno')]")
        year_formkey: str = year_select.attrib["name"]

        month_select: Selector = response.xpath("//select[contains(@id, 'ddlMes')]")
        month_formkey: str = month_select.attrib["name"]

        row: Selector
        file_urls: list[str]
        year: int
        month: int

        year, month = current_year_month

        for row in response.xpath(
            "//ancestor::a[span[contains(@id, '_grdArquivos_')]]"
        ):
            # The span text ends with the edition date (dd/mm/YYYY).
            # Guard against rows with no text at all: `.get()` may
            # return None, and splitting "" yields an empty list.
            raw_string: str = row.xpath("./span/text()").get() or ""
            words: list[str] = raw_string.split()
            gazette_date: Optional[date] = (
                self._parse_date(words[-1]) if words else None
            )

            if not gazette_date:
                self.logger.warning(
                    f"No valid date could be extracted from '{raw_string}'"
                )
                continue

            if gazette_date > self.end_date:
                continue
            elif gazette_date < self.start_date:
                # Rows are date-descending; nothing older is of interest
                return

            # timetuple()[:2] is (year, month) — ensure the row belongs
            # to the month being queried (the server sometimes leaks
            # adjacent-month rows into the grid)
            if gazette_date.timetuple()[:2] != current_year_month:
                self.logger.warning(
                    f"Found {gazette_date.isoformat()} gazette while querying"
                    f" for {current_year_month[0]}-{current_year_month[1]:02}"
                    f" period. Skipping..."
                )
                continue

            url: str = response.urljoin(row.attrib["href"])

            file_urls = self.data_by_monthly_date_by_date.setdefault(
                current_year_month, {}
            ).setdefault(gazette_date, [])

            if url not in file_urls:
                # We use this strategy to avoid duplicates while maintaining row order
                file_urls.append(url)

        number_of_pages: int = len(
            response.xpath("//ul[contains(@class, 'pagination')]/li").getall()
        )

        if current_page < number_of_pages:
            formdata = {
                "__EVENTARGUMENT": f"Page${current_page + 1}",
                "__EVENTTARGET": "ctl00$conteudo$ucPesquisarDiarioOficial$grdArquivos",
                year_formkey: str(year),
                month_formkey: str(month),
            }

            yield FormRequest.from_response(
                response,
                formdata=formdata,
                callback=self.parse_editions_list,
                cb_kwargs={
                    "current_year_month": current_year_month,
                    "current_page": current_page + 1,
                },
                # We keep using the same cookiejar for the name_year_month combination
                # because, if we don't, it can interfere with the paging data for
                # a different name_year_month combination
                meta={"cookiejar": f"{self.name}_{year}_{month}"},
            )
        else:
            # Last page reached: flush everything gathered for this month
            current_year_month_data: dict[date, list[str]] = (
                self.data_by_monthly_date_by_date.get(current_year_month, {})
            )
            for gazette_date, file_urls in current_year_month_data.items():
                yield Gazette(
                    date=gazette_date,
                    is_extra_edition=False,
                    file_urls=file_urls,
                    power="executive",
                )

    def _parse_date(self, raw_date: str) -> Optional[date]:
        """Parse a dd/mm/YYYY string into a date, or None when invalid.

        The original implementation let ValueError propagate, which made
        the "No valid date" guard in parse_editions_list unreachable and
        crashed the spider on any malformed row text.
        """
        try:
            return datetime.strptime(raw_date, "%d/%m/%Y").date()
        except ValueError:
            return None