Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 2438 change newsflashes timezones to utc #2452

Draft
wants to merge 7 commits into
base: dev
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions anyway/parsers/news_flash.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import os
import sys
import requests
from bs4 import BeautifulSoup
import logging
from pytz import timezone

from anyway.parsers import twitter, rss_sites
from anyway.parsers.news_flash_db_adapter import init_db
Expand All @@ -9,6 +13,7 @@
classify_organization,
)
from anyway.parsers.location_extraction import extract_geo_features
from anyway.parsers.timezones import ISREAL_SUMMER_TIMEZONE

# Lookup table: news-flash source name -> classifier function used for that source.
# FIX: the classifier should be chosen by source (screen name), so the
# `twitter` key should become `mda`.
news_flash_classifiers = {
    "ynet": classify_rss,
    "twitter": classify_tweets,
    "walla": classify_rss,
}
Expand Down Expand Up @@ -37,6 +42,21 @@ def update_all_in_db(source=None, newsflash_id=None):
db.commit()


def scrape_hour_for_walla_newsflash(newsflash):
    """Fix a walla newsflash's hour from its article page and store the date in UTC.

    Downloads ``newsflash.link``, reads the text of the first
    ``<div class="time">`` element and takes its first two characters as the
    hour. That hour replaces the hour of ``newsflash.date``; the result is
    treated as Asia/Jerusalem wall-clock time and converted to UTC before
    being written back to ``newsflash.date``.

    Best effort: any failure is logged and ``newsflash.date`` is left as it was.

    :param newsflash: object with a ``link`` URL and a ``date`` datetime;
        ``date`` is updated in place on success.
    """
    try:
        israel_tz = timezone("Asia/Jerusalem")

        # requests has no default timeout; bound the wait so one slow page
        # cannot stall the whole scraping run.
        response = requests.get(newsflash.link, timeout=30)
        # Fail early on HTTP errors instead of parsing an error page.
        response.raise_for_status()

        time_element = BeautifulSoup(response.content, "html.parser").find("div", class_="time")
        # Strip surrounding whitespace so the slice below starts at the hour digits.
        time_text = time_element.get_text().strip()
        scraped_hour = int(time_text[:2])

        # Build the corrected value in locals and assign once at the end, so an
        # error part-way through never leaves newsflash.date half-updated.
        naive_local_date = newsflash.date.replace(hour=scraped_hour, tzinfo=None)
        localized_date = israel_tz.localize(naive_local_date)
        newsflash.date = localized_date.astimezone(timezone("UTC"))
    except Exception as e:
        logging.error(f"during scraping hour for newsflash {e}")


def scrape_extract_store_rss(site_name, db):
latest_date = db.get_latest_date_of_source(site_name)
for newsflash in rss_sites.scrape(site_name):
Expand All @@ -45,6 +65,8 @@ def scrape_extract_store_rss(site_name, db):
# TODO: pass both title and description, leaving this choice to the classifier
newsflash.accident = classify_rss(newsflash.title or newsflash.description)
newsflash.organization = classify_organization(site_name)
if site_name == "walla": # walla's rss feed currently shows wrong time zone
scrape_hour_for_walla_newsflash(newsflash)
if newsflash.accident:
# FIX: No accident-accurate date extracted
extract_geo_features(db, newsflash)
Expand Down