Skip to content
This repository has been archived by the owner on Apr 25, 2023. It is now read-only.

Commit

Permalink
Merge pull request #839 from alonisser/master
Browse files Browse the repository at this point in the history
minor cleanup and adaptations + a part of #833
  • Loading branch information
alonisser authored Jul 8, 2017
2 parents b9c01f9 + 2a0d386 commit d3cd50c
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 46 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ npm-debug.log
*.egg-info
presence/presence.txt
presence/presence_log.txt
data/datapackage*
56 changes: 28 additions & 28 deletions auxiliary/tag_suggestions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import operator


def approve(admin, request, tag_suggestions):
for tag_suggestion in tag_suggestions:
obj = tag_suggestion.object
Expand All @@ -23,58 +24,57 @@ def approve(admin, request, tag_suggestions):
def sum_add_two_dictionaries(target_dict, dict_to_add):
    """Merge ``dict_to_add`` into ``target_dict`` in place, summing values.

    Both arguments are assumed to map keys to numeric values.  For each key
    in ``dict_to_add``, its value is added to the existing value in
    ``target_dict`` (missing keys start from 0).  ``target_dict`` is mutated;
    nothing is returned.

    Note: the first parameter was renamed from ``dict`` to avoid shadowing
    the builtin; all callers in this commit pass it positionally.
    """
    # dict.get with a default of 0 collapses the "present vs. absent" branch.
    for key, value in dict_to_add.items():
        target_dict[key] = target_dict.get(key, 0) + value



# A list of prefix characters to use in tag extraction
prefixes = [u'ב', u'ו', u'ה', u'מ', u'מה', u'ל', u'']
_all_tags_names = []


def all_tags_names():
    """Lazily initialize and return the list of tag names in actual use.

    Only tags applied to at least one Vote, Bill or CommitteeMeeting are
    included, to avoid suggesting irrelevant tags.  The result is cached in
    the module-level ``_all_tags_names`` list, so the database is queried
    only on the first call.
    """
    global _all_tags_names
    if not _all_tags_names:
        # Extract only used tags, to avoid irrelevant tags
        vote_tags = Tag.objects.usage_for_model(Vote)
        bill_tags = Tag.objects.usage_for_model(Bill)
        cm_tags = Tag.objects.usage_for_model(CommitteeMeeting)
        all_tags = set(vote_tags).union(bill_tags).union(cm_tags)
        _all_tags_names = [tag.name for tag in all_tags]
    return _all_tags_names


def get_tags_in_text(text):
    """Return a dict mapping each tag name found in ``text`` to its
    number of occurrences.

    ``text`` may be None, which is treated as an empty text.  A word
    matches a tag if it equals the tag name with any of the known Hebrew
    prefixes (``prefixes``) prepended, including the empty prefix.
    """
    result_dict = {}
    words = text.split() if text is not None else []

    # look for each known tag among the words
    for tag in all_tags_names():
        # create tag variations according to prefixes
        tag_variations = [(p + tag) for p in prefixes]

        # count occurrences of this tag's variations over all words
        occurence_count = 0
        for word in words:
            if word in tag_variations:
                occurence_count += 1

        # if the tag was found at least once, accumulate the count
        if occurence_count > 0:
            result_dict[tag] = result_dict.get(tag, 0) + occurence_count

    return result_dict


def extract_suggested_tags(current_tags, text_list):
    """Return suggested tags for the given texts as (tag_name, count) pairs.

    Occurrences of every known tag are counted across all texts in
    ``text_list``; tags already present in ``current_tags`` are removed.
    The result is sorted from the most occurring tag to the least occurring.
    """
    tags_occurrences = {}

    # find occurrences of tags in every text
    for text_to_extract in text_list:
        sum_add_two_dictionaries(tags_occurrences, get_tags_in_text(text_to_extract))

    # remove tags that are already tagged on the object
    for tag in current_tags:
        if tag.name in tags_occurrences:
            del tags_occurrences[tag.name]

    # sort suggestions, most frequent first
    return sorted(tags_occurrences.iteritems(), key=operator.itemgetter(1), reverse=True)
5 changes: 2 additions & 3 deletions committees/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@
from lobbyists.models import LobbyistCorporation
from itertools import groupby
from hebrew_numbers import gematria_to_int
from mks.utils import get_all_mk_names
from knesset_data.protocols.committee import \
CommitteeMeetingProtocol as KnessetDataCommitteeMeetingProtocol

from knesset_data_django.committees import members_extended

COMMITTEE_PROTOCOL_PAGINATE_BY = 120
Expand Down Expand Up @@ -287,6 +285,7 @@ def reparse_protocol(self, redownload=True, mks=None, mk_names=None):
reparse_protocol(self, redownload, mks, mk_names)

def update_from_dataservice(self, dataservice_object=None):
# TODO: obviousely broken, not sure what was here originaly and where it moved
from committees.management.commands.scrape_committee_meetings import \
Command as ScrapeCommitteeMeetingCommand
from knesset_data.dataservice.committees import \
Expand Down
3 changes: 2 additions & 1 deletion committees/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,9 +287,10 @@ def _handle_add_protocol(self, cm, request):
if not cm.protocol_text: # don't override existing protocols
cm.protocol_text = request.POST.get('protocol_text')
cm.save()
cm.create_protocol_parts()
mks, mk_names = get_all_mk_names()
cm.find_attending_members(mks, mk_names)
cm.create_protocol_parts(mks=mks, mk_names=mk_names)


def _handle_remove_lobbyist(self, cm, request):
lobbyist_name = request.POST.get('lobbyist_name')
Expand Down
7 changes: 5 additions & 2 deletions deploy/crontab.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
45 03 * * * /oknesset_data/oknesset/Open-Knesset/manage.py parse_plenum_protocols --download --parse 2>&1 | /usr/bin/logger -t open_knesset
00 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py parse_future_plenum_meetings 2>&1 | /usr/bin/logger -t open_knesset
15 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py syncdata --update 2>&1 | /usr/bin/logger -t open_knesset
34 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py scrape_committee_meetings --from_days=100 2>&1 | /usr/bin/logger -t open_knesset
12 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py scrape_committees 2>&1 | /usr/bin/logger -t open_knesset

# the committee scrapers are handled as part of download_knesset_datapackage management command

# 34 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py scrape_committee_meetings --from_days=100 2>&1 | /usr/bin/logger -t open_knesset
# 12 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py scrape_committees 2>&1 | /usr/bin/logger -t open_knesset
59 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py send_email_to_editors 2>&1 | /usr/bin/logger -t open_knesset
00 05 * * * /oknesset_data/oknesset/Open-Knesset/manage.py notify --daily 2>&1 | /usr/bin/logger -t open_knesset
01 05 * * 5 /oknesset_data/oknesset/Open-Knesset/manage.py notify --weekly 2>&1 | /usr/bin/logger -t open_knesset
Expand Down
13 changes: 10 additions & 3 deletions events/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import json
import dateutil.parser

GOOGLE_CALENDAR_API_KEY = settings.GOOGLE_CALENDAR_API_KEY


class PersonsEventsScraper(BaseScraper):
"""
Expand All @@ -24,14 +26,14 @@ def __init__(self):
self.source = BaseSource()
self.storage = BaseStorage()

def _get_google_cal_page(self, calendar_id, sync_token, page_token=None):
api_key = settings.GOOGLE_CALENDAR_API_KEY
def _get_google_cal_page(self, calendar_id, sync_token, page_token=None, is_retry=False):

if page_token is not None:
param = '&pageToken=%s' % quote(page_token)
else:
param = '&syncToken=%s' % quote(sync_token) if sync_token is not None else ''
calendar_url = 'https://content.googleapis.com/calendar/v3/calendars/%s/events?showDeleted=true&singleEvents=true%s&key=%s' % (
quote(calendar_id), param, quote(api_key))
quote(calendar_id), param, quote(GOOGLE_CALENDAR_API_KEY))
try:
response = urllib2.urlopen(calendar_url)
data = json.load(response)
Expand All @@ -42,6 +44,11 @@ def _get_google_cal_page(self, calendar_id, sync_token, page_token=None):
res['items'] = data['items'] if 'items' in data else []
return res
except urllib2.HTTPError as e:
if e.code == 410 and not is_retry:
# Retry without sync token according to docs
self._getLogger().info(u'retrying calendar with sync token invalidated %s' % calendar_id)
return self._get_google_cal_page(calendar_id=calendar_id, sync_token=None, page_token=page_token,
is_retry=True)
self._getLogger().exception(
u'Exception in trying to fetch google calendar id %s with url %s' % (calendar_id, calendar_url))
return None
Expand Down
6 changes: 3 additions & 3 deletions ok_tag/tag_suggestions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,16 @@ def sum_add_two_dictionaries(dict, dict_to_add):

def all_tags_names():
    """Lazily build and return the cached list of tag names in actual use.

    Collects only tags that have been applied to a Vote, Bill or
    CommitteeMeeting (avoiding irrelevant tags) and memoizes the names in
    the module-level ``_all_tags_names`` list on first call.
    """
    global _all_tags_names
    if not _all_tags_names:
        # Extract only used tags, to avoid irrelevant tags
        vote_tags = Tag.objects.usage_for_model(Vote)
        bill_tags = Tag.objects.usage_for_model(Bill)
        cm_tags = Tag.objects.usage_for_model(CommitteeMeeting)
        all_tags = set(vote_tags).union(bill_tags).union(cm_tags)
        _all_tags_names = [tag.name for tag in all_tags]

    return _all_tags_names
Expand Down
14 changes: 8 additions & 6 deletions simple/management/commands/syncdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,17 @@

from django.conf import settings
from django.contrib.contenttypes.models import ContentType
from django.db.models import Max
from okscraper_django.management.base_commands import NoArgsDbLogCommand

from pyth.plugins.rtf15.reader import Rtf15Reader

from committees.models import Committee, CommitteeMeeting
from knesset.utils import cannonize
from knesset.utils import send_chat_notification
from laws.models import (Vote, VoteAction, Bill, Law, PrivateProposal,
from laws.models import (Vote, Bill, Law, PrivateProposal,
KnessetProposal, GovProposal, GovLegislationCommitteeDecision)
from links.models import Link
from mks.models import Member, Party, Membership, WeeklyPresence, Knesset
from mks.models import Member, WeeklyPresence, Knesset

from persons.models import Person, PersonAlias

Expand All @@ -37,7 +36,7 @@
from simple.parsers import parse_laws
from simple.parsers import parse_remote
from simple.parsers.parse_gov_legislation_comm import ParseGLC
from simple.parsers import mk_info_html_parser as mk_parser

from simple.parsers import parse_presence
from syncdata_globals import p_explanation, strong_explanation, explanation

Expand Down Expand Up @@ -97,7 +96,7 @@ def _handle_noargs(self, **options):
if all_options:
process = True

selected_options = [all_options, process, update, laws]
selected_options = [all_options, process, update, laws, presence]
if not any(selected_options):
logger.error(
"no arguments found. doing nothing. \ntry -h for help.\n--all to run the full syncdata flow.\n--update for an online dynamic update.")
Expand Down Expand Up @@ -437,7 +436,7 @@ def get_approved_bill_text_for_vote(self, vote):
logger.exception(u'Exception with approved bill text for vote %s title=%s' % (vote.id, vote.title))

def update_presence(self):
logger.debug("update presence")
logger.info("Starting to update presence")
try:
(presence, valid_weeks) = parse_presence.parse_presence(filename=os.path.join(DATA_ROOT, 'presence.txt.gz'))

Expand All @@ -451,6 +450,7 @@ def update_presence(self):
c = None

for member in Member.current_members.all():
logger.info("Trying to update presence for %s" % member.pk)
if member.id not in presence:
logger.error('member %s (id=%d) not found in presence data', member.name, member.id)
continue
Expand All @@ -477,6 +477,8 @@ def update_presence(self):
else:
date = iso_to_gregorian(*current_timestamp, iso_day=0)
current_timestamp = (date + datetime.timedelta(8)).isocalendar()[:2]
logger.info('Finished updating presence')


def update_private_proposal_content_html(self, pp):
html = parse_remote.rtf(pp.source_url)
Expand Down

0 comments on commit d3cd50c

Please sign in to comment.