Skip to content
This repository has been archived by the owner on Apr 25, 2023. It is now read-only.

Commit

Permalink
Merge pull request #839 from alonisser/master
Browse files Browse the repository at this point in the history
minor cleanup and adaptations + a part of #833
  • Loading branch information
alonisser authored Jul 8, 2017
2 parents b9c01f9 + 2a0d386 commit d3cd50c
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 46 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ npm-debug.log
*.egg-info
presence/presence.txt
presence/presence_log.txt
data/datapackage*
56 changes: 28 additions & 28 deletions auxiliary/tag_suggestions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import operator


def approve(admin, request, tag_suggestions):
for tag_suggestion in tag_suggestions:
obj = tag_suggestion.object
Expand All @@ -23,58 +24,57 @@ def approve(admin, request, tag_suggestions):
def sum_add_two_dictionaries(target_dict, dict_to_add):
    """Merge ``dict_to_add`` into ``target_dict`` in place, summing values.

    Both arguments are assumed to map keys to numeric values.  For each key
    in ``dict_to_add``, its value is added to the existing value in
    ``target_dict`` (missing keys start from 0).  ``target_dict`` is mutated;
    nothing is returned.

    Note: the first parameter was renamed from ``dict`` to avoid shadowing
    the builtin; all callers in this commit pass it positionally.
    """
    # dict.get with a default of 0 collapses the "present vs. absent" branch.
    for key, value in dict_to_add.items():
        target_dict[key] = target_dict.get(key, 0) + value



# A list of prefix characters to use in tag extraction
prefixes = [u'ב', u'ו', u'ה', u'מ', u'מה', u'ל', u'']
_all_tags_names = []


def all_tags_names():
    """Lazily initialize and return the list of tag names in actual use.

    Only tags applied to at least one Vote, Bill or CommitteeMeeting are
    included, to avoid suggesting irrelevant tags.  The result is cached in
    the module-level ``_all_tags_names`` list, so the database is queried
    only on the first call.
    """
    global _all_tags_names
    if not _all_tags_names:
        # Extract only used tags, to avoid irrelevant tags
        vote_tags = Tag.objects.usage_for_model(Vote)
        bill_tags = Tag.objects.usage_for_model(Bill)
        cm_tags = Tag.objects.usage_for_model(CommitteeMeeting)
        all_tags = set(vote_tags).union(bill_tags).union(cm_tags)
        _all_tags_names = [tag.name for tag in all_tags]
    return _all_tags_names


def get_tags_in_text(text):
    """Return a dict mapping each tag name found in ``text`` to its
    number of occurrences.

    ``text`` may be None, which is treated as an empty text.  A word
    matches a tag if it equals the tag name with any of the known Hebrew
    prefixes (``prefixes``) prepended, including the empty prefix.
    """
    result_dict = {}
    words = text.split() if text is not None else []

    # look for each known tag among the words
    for tag in all_tags_names():
        # create tag variations according to prefixes
        tag_variations = [(p + tag) for p in prefixes]

        # count occurrences of this tag's variations over all words
        occurence_count = 0
        for word in words:
            if word in tag_variations:
                occurence_count += 1

        # if the tag was found at least once, accumulate the count
        if occurence_count > 0:
            result_dict[tag] = result_dict.get(tag, 0) + occurence_count

    return result_dict


def extract_suggested_tags(current_tags, text_list):
    """Return suggested tags for the given texts as (tag_name, count) pairs.

    Occurrences of every known tag are counted across all texts in
    ``text_list``; tags already present in ``current_tags`` are removed.
    The result is sorted from the most occurring tag to the least occurring.
    """
    tags_occurrences = {}

    # find occurrences of tags in every text
    for text_to_extract in text_list:
        sum_add_two_dictionaries(tags_occurrences, get_tags_in_text(text_to_extract))

    # remove tags that are already tagged on the object
    for tag in current_tags:
        if tag.name in tags_occurrences:
            del tags_occurrences[tag.name]

    # sort suggestions, most frequent first
    return sorted(tags_occurrences.iteritems(), key=operator.itemgetter(1), reverse=True)
5 changes: 2 additions & 3 deletions committees/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@
from lobbyists.models import LobbyistCorporation
from itertools import groupby
from hebrew_numbers import gematria_to_int
from mks.utils import get_all_mk_names
from knesset_data.protocols.committee import \
CommitteeMeetingProtocol as KnessetDataCommitteeMeetingProtocol

from knesset_data_django.committees import members_extended

COMMITTEE_PROTOCOL_PAGINATE_BY = 120
Expand Down Expand Up @@ -287,6 +285,7 @@ def reparse_protocol(self, redownload=True, mks=None, mk_names=None):
reparse_protocol(self, redownload, mks, mk_names)

def update_from_dataservice(self, dataservice_object=None):
# TODO: obviousely broken, not sure what was here originaly and where it moved
from committees.management.commands.scrape_committee_meetings import \
Command as ScrapeCommitteeMeetingCommand
from knesset_data.dataservice.committees import \
Expand Down
3 changes: 2 additions & 1 deletion committees/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,9 +287,10 @@ def _handle_add_protocol(self, cm, request):
if not cm.protocol_text: # don't override existing protocols
cm.protocol_text = request.POST.get('protocol_text')
cm.save()
cm.create_protocol_parts()
mks, mk_names = get_all_mk_names()
cm.find_attending_members(mks, mk_names)
cm.create_protocol_parts(mks=mks, mk_names=mk_names)


def _handle_remove_lobbyist(self, cm, request):
lobbyist_name = request.POST.get('lobbyist_name')
Expand Down
7 changes: 5 additions & 2 deletions deploy/crontab.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
45 03 * * * /oknesset_data/oknesset/Open-Knesset/manage.py parse_plenum_protocols --download --parse 2>&1 | /usr/bin/logger -t open_knesset
00 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py parse_future_plenum_meetings 2>&1 | /usr/bin/logger -t open_knesset
15 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py syncdata --update 2>&1 | /usr/bin/logger -t open_knesset
34 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py scrape_committee_meetings --from_days=100 2>&1 | /usr/bin/logger -t open_knesset
12 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py scrape_committees 2>&1 | /usr/bin/logger -t open_knesset

# the committee scrapers are handled as part of download_knesset_datapackage management command

# 34 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py scrape_committee_meetings --from_days=100 2>&1 | /usr/bin/logger -t open_knesset
# 12 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py scrape_committees 2>&1 | /usr/bin/logger -t open_knesset
59 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py send_email_to_editors 2>&1 | /usr/bin/logger -t open_knesset
00 05 * * * /oknesset_data/oknesset/Open-Knesset/manage.py notify --daily 2>&1 | /usr/bin/logger -t open_knesset
01 05 * * 5 /oknesset_data/oknesset/Open-Knesset/manage.py notify --weekly 2>&1 | /usr/bin/logger -t open_knesset
Expand Down
13 changes: 10 additions & 3 deletions events/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import json
import dateutil.parser

GOOGLE_CALENDAR_API_KEY = settings.GOOGLE_CALENDAR_API_KEY


class PersonsEventsScraper(BaseScraper):
"""
Expand All @@ -24,14 +26,14 @@ def __init__(self):
self.source = BaseSource()
self.storage = BaseStorage()

def _get_google_cal_page(self, calendar_id, sync_token, page_token=None):
api_key = settings.GOOGLE_CALENDAR_API_KEY
def _get_google_cal_page(self, calendar_id, sync_token, page_token=None, is_retry=False):

if page_token is not None:
param = '&pageToken=%s' % quote(page_token)
else:
param = '&syncToken=%s' % quote(sync_token) if sync_token is not None else ''
calendar_url = 'https://content.googleapis.com/calendar/v3/calendars/%s/events?showDeleted=true&singleEvents=true%s&key=%s' % (
quote(calendar_id), param, quote(api_key))
quote(calendar_id), param, quote(GOOGLE_CALENDAR_API_KEY))
try:
response = urllib2.urlopen(calendar_url)
data = json.load(response)
Expand All @@ -42,6 +44,11 @@ def _get_google_cal_page(self, calendar_id, sync_token, page_token=None):
res['items'] = data['items'] if 'items' in data else []
return res
except urllib2.HTTPError as e:
if e.code == 410 and not is_retry:
# Retry without sync token according to docs
self._getLogger().info(u'retrying calendar with sync token invalidated %s' % calendar_id)
return self._get_google_cal_page(calendar_id=calendar_id, sync_token=None, page_token=page_token,
is_retry=True)
self._getLogger().exception(
u'Exception in trying to fetch google calendar id %s with url %s' % (calendar_id, calendar_url))
return None
Expand Down
6 changes: 3 additions & 3 deletions ok_tag/tag_suggestions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,16 @@ def sum_add_two_dictionaries(dict, dict_to_add):

def all_tags_names():
    """Lazily build and return the cached list of tag names in actual use.

    Collects only tags that have been applied to a Vote, Bill or
    CommitteeMeeting (avoiding irrelevant tags) and memoizes the names in
    the module-level ``_all_tags_names`` list on first call.
    """
    global _all_tags_names
    if not _all_tags_names:
        # Extract only used tags, to avoid irrelevant tags
        vote_tags = Tag.objects.usage_for_model(Vote)
        bill_tags = Tag.objects.usage_for_model(Bill)
        cm_tags = Tag.objects.usage_for_model(CommitteeMeeting)
        all_tags = set(vote_tags).union(bill_tags).union(cm_tags)
        _all_tags_names = [tag.name for tag in all_tags]

    return _all_tags_names
Expand Down
14 changes: 8 additions & 6 deletions simple/management/commands/syncdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,17 @@

from django.conf import settings
from django.contrib.contenttypes.models import ContentType
from django.db.models import Max
from okscraper_django.management.base_commands import NoArgsDbLogCommand

from pyth.plugins.rtf15.reader import Rtf15Reader

from committees.models import Committee, CommitteeMeeting
from knesset.utils import cannonize
from knesset.utils import send_chat_notification
from laws.models import (Vote, VoteAction, Bill, Law, PrivateProposal,
from laws.models import (Vote, Bill, Law, PrivateProposal,
KnessetProposal, GovProposal, GovLegislationCommitteeDecision)
from links.models import Link
from mks.models import Member, Party, Membership, WeeklyPresence, Knesset
from mks.models import Member, WeeklyPresence, Knesset

from persons.models import Person, PersonAlias

Expand All @@ -37,7 +36,7 @@
from simple.parsers import parse_laws
from simple.parsers import parse_remote
from simple.parsers.parse_gov_legislation_comm import ParseGLC
from simple.parsers import mk_info_html_parser as mk_parser

from simple.parsers import parse_presence
from syncdata_globals import p_explanation, strong_explanation, explanation

Expand Down Expand Up @@ -97,7 +96,7 @@ def _handle_noargs(self, **options):
if all_options:
process = True

selected_options = [all_options, process, update, laws]
selected_options = [all_options, process, update, laws, presence]
if not any(selected_options):
logger.error(
"no arguments found. doing nothing. \ntry -h for help.\n--all to run the full syncdata flow.\n--update for an online dynamic update.")
Expand Down Expand Up @@ -437,7 +436,7 @@ def get_approved_bill_text_for_vote(self, vote):
logger.exception(u'Exception with approved bill text for vote %s title=%s' % (vote.id, vote.title))

def update_presence(self):
logger.debug("update presence")
logger.info("Starting to update presence")
try:
(presence, valid_weeks) = parse_presence.parse_presence(filename=os.path.join(DATA_ROOT, 'presence.txt.gz'))

Expand All @@ -451,6 +450,7 @@ def update_presence(self):
c = None

for member in Member.current_members.all():
logger.info("Trying to update presence for %s" % member.pk)
if member.id not in presence:
logger.error('member %s (id=%d) not found in presence data', member.name, member.id)
continue
Expand All @@ -477,6 +477,8 @@ def update_presence(self):
else:
date = iso_to_gregorian(*current_timestamp, iso_day=0)
current_timestamp = (date + datetime.timedelta(8)).isocalendar()[:2]
logger.info('Finished updating presence')


def update_private_proposal_content_html(self, pp):
html = parse_remote.rtf(pp.source_url)
Expand Down

0 comments on commit d3cd50c

Please sign in to comment.