From dcd35fa511e73727ee7dde89443d851adc1697e1 Mon Sep 17 00:00:00 2001 From: NateScarlet Date: Tue, 16 Jul 2019 22:28:44 +0800 Subject: [PATCH] fix: handle holiday change fix #14 --- fetch_holidays.py | 90 +++++++++++++++++++++++++++--------- tests/test_fetch_holidays.py | 14 +++++- 2 files changed, 81 insertions(+), 23 deletions(-) diff --git a/fetch_holidays.py b/fetch_holidays.py index 67de7ad..cf07bce 100755 --- a/fetch_holidays.py +++ b/fetch_holidays.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 """Fetch holidays from gov.cn """ +import logging import argparse import json import re @@ -11,10 +12,12 @@ import bs4 import requests -SEARCH_URL = ('http://sousuo.gov.cn/s.htm' - '?t=paper&advance=true&sort=&title={year}+%E8%8A%82%E5%81%87%E6%97%A5' - '&puborg=%E5%9B%BD%E5%8A%A1%E9%99%A2%E5%8A%9E%E5%85%AC%E5%8E%85' - '&pcodeJiguan=%E5%9B%BD%E5%8A%9E%E5%8F%91%E6%98%8E%E7%94%B5') +SEARCH_URL = 'http://sousuo.gov.cn/s.htm' +LOGGER = logging.getLogger(__name__) +PAPER_EXCLUDE = [ + 'http://www.gov.cn/zhengce/content/2014-09/29/content_9102.htm', + 'http://www.gov.cn/zhengce/content/2015-02/09/content_9466.htm', +] def get_paper_urls(year: int) -> List[str]: @@ -24,14 +27,20 @@ def get_paper_urls(year: int) -> List[str]: year (int): eg. 2018 Returns: - List[str]: Urls + List[str]: Urls, newlest first. """ - url = SEARCH_URL.format(year=year) - body = requests.get(url).text + body = requests.get(SEARCH_URL, params={ + 't': 'paper', + 'advance': 'true', + 'title': year, + 'q': '假期', + 'pcodeJiguan': '国办发明电', + 'puborg': '国务院办公厅' + }).text ret = re.findall( r'
  • ', body, flags=re.S) - + ret = [i for i in ret if i not in PAPER_EXCLUDE] return ret @@ -72,15 +81,50 @@ def get_rules(paper: str) -> Iterator[Tuple[str, str]]: """ lines: list = paper.splitlines() + lines = sorted(set(lines), key=lines.index) count = 0 - for i in sorted(set(lines), key=lines.index): + for i in chain(get_normal_rules(lines), get_patch_rules(lines)): + count += 1 + yield i + if not count: + raise NotImplementedError(lines) + + +def get_normal_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]: + """Get normal holiday rule for a year + + Args: + lines (Iterator[str]): paper content + + Returns: + Iterator[Tuple[str, str]]: (name, description) + """ + for i in lines: match = re.match(r'[一二三四五六七八九十]、(.+?):(.+)', i) if match: - count += 1 yield match.groups() - if not count: - raise NotImplementedError(lines) + +def get_patch_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]: + """Get holiday patch rule for existed holiday + + Args: + lines (Iterator[str]): paper content + + Returns: + Iterator[Tuple[str, str]]: (name, description) + """ + name = None + for i in lines: + nameMatch = re.match(r'.*\d+年(.{2,})(?:假期|放假)安排.*', i) + if nameMatch: + name = nameMatch.group(1) + if name: + match = re.match(r'^[一二三四五六七八九十]、(.+)$', i) + if match: + description = match.group(1) + if re.match(r'.*\d+月\d+日.*', description): + yield name, description def _cast_int(value): @@ -176,7 +220,7 @@ def _extract_dates_1(self, value: str) -> Iterator[date]: assert len(groups) == 3, groups yield self.parent.get_date(year=groups[0], month=groups[1], day=groups[2]) - def _extract_dates_2(self, value: str)-> Iterator[date]: + def _extract_dates_2(self, value: str) -> Iterator[date]: match = re.findall( r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:至|-|—)(?:(\d+)年)?(?:(\d+)月)?(\d+)日', value) for groups in match: @@ -189,7 +233,7 @@ def _extract_dates_2(self, value: str)-> Iterator[date]: for i in range((end - start).days + 1): yield start + timedelta(days=i) - def _extract_dates_3(self, value: str)-> Iterator[date]: + def _extract_dates_3(self, value: str) -> Iterator[date]: match = re.findall( r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?' r'(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?)+', @@ -263,20 +307,22 @@ def fetch_holiday(year: int): """Fetch holiday data. """ papers = get_paper_urls(year) + papers.reverse() - days = [] + days = dict() for i in papers: paper = get_paper(i) - rules = get_rules(paper) - for name, description in rules: - days.extend({ - 'name': name, - **j - } for j in DescriptionParser(description, year).parse()) + try: + rules = get_rules(paper) + for name, description in rules: + for j in DescriptionParser(description, year).parse(): + days[j['date']] = {'name': name, **j} + except NotImplementedError as ex: + raise RuntimeError('Can not extract rules', i) from ex return { 'year': year, 'papers': papers, - 'days': sorted(days, key=lambda x: x['date']) + 'days': sorted(days.values(), key=lambda x: x['date']) } diff --git a/tests/test_fetch_holidays.py b/tests/test_fetch_holidays.py index fd6cb47..d6967ae 100644 --- a/tests/test_fetch_holidays.py +++ b/tests/test_fetch_holidays.py @@ -2,11 +2,23 @@ import json import sys -from fetch_holidays import CustomJSONEncoder, DescriptionParser +from fetch_holidays import CustomJSONEncoder, DescriptionParser, get_paper_urls, get_rules, get_paper from .filetools import _file_path +def test_get_paper_urls(): + assert get_paper_urls(2019) == [ + 'http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm', + 'http://www.gov.cn/zhengce/content/2018-12/06/content_5346276.htm' + ] + + +def test_get_patch_rules(): + assert(list(get_rules(get_paper('http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm'))) + == [('劳动节', '2019年5月1日至4日放假调休,共4天。4月28日(星期日)、5月5日(星期日)上班。')]) + + def _normalize(iterable): return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), key=lambda x: x['date'])