def scrape_hour_for_walla_newsflash(newsflash, timeout=10):
    """Correct the hour of a walla newsflash using the hour shown on its page.

    Downloads ``newsflash.link``, reads the text of the first
    ``<div class="time">`` element and uses its leading characters as the
    hour. ``newsflash.date`` is then rebuilt with that hour, interpreted as
    Asia/Jerusalem local time and converted to UTC.

    This is best effort: any failure (network error, HTTP error status,
    missing time element, unparsable or out-of-range hour) is logged and
    ``newsflash.date`` is left exactly as it was.

    :param newsflash: object exposing a ``link`` URL and a ``date`` datetime;
        ``date`` is replaced in place on success.
    :param timeout: seconds to wait for the page before giving up, so a
        stalled connection cannot block the whole scraping run.
    """
    try:
        response = requests.get(newsflash.link, timeout=timeout)
        # Do not try to parse an error page as if it were the article.
        response.raise_for_status()

        time_element = BeautifulSoup(response.content, "html.parser").find("div", class_="time")
        if time_element is None:
            raise ValueError(f"no time element found in {newsflash.link}")

        # NOTE(review): assumes the element text starts with the hour
        # (e.g. "HH:MM") - confirm against walla's current markup.
        # strip=True removes surrounding whitespace so " 14:35" is not read
        # as hour 1; rstrip(":") tolerates a single-digit hour such as "9:05".
        displayed_time = time_element.get_text(strip=True)
        scraped_hour = int(displayed_time[:2].rstrip(":"))

        # Build the corrected value in locals and assign only at the end, so
        # a failure in any step never leaves newsflash.date half-updated.
        naive_date = newsflash.date.replace(hour=scraped_hour, tzinfo=None)
        localized_date = timezone("Asia/Jerusalem").localize(naive_date)
        newsflash.date = timezone("UTC").normalize(localized_date)
    except Exception as e:
        # Deliberately broad: a scraping problem must not abort the RSS import.
        logging.error(f"during scraping hour for newsflash {e}")
newsflash.accident: # FIX: No accident-accurate date extracted extract_geo_features(db, newsflash)