From 50243e6bb89e9d3f1e3d10a12707e4dd3b12b875 Mon Sep 17 00:00:00 2001
From: Nicolai Spohrer
Date: Sat, 12 Oct 2019 15:57:47 +0200
Subject: [PATCH 1/4] immobilienscout: Fixes. Add date retrieval. wggesucht: General fixes. Fix pagination. General: Add possibility to filter by date (immoscout does not offer this). GMaps stuff untested as it does not seem to work for free any longer.

---
 flathunter/crawl_immobilienscout.py | 34 +++++++++--
 flathunter/crawl_wggesucht.py       | 69 ++++++++++++----------
 flathunter/hunter.py                | 90 +++++++++++++++++++++++------
 requirements.txt                    |  2 +
 4 files changed, 141 insertions(+), 54 deletions(-)

diff --git a/flathunter/crawl_immobilienscout.py b/flathunter/crawl_immobilienscout.py
index d4ab9aff..665749f5 100644
--- a/flathunter/crawl_immobilienscout.py
+++ b/flathunter/crawl_immobilienscout.py
@@ -12,9 +12,9 @@ def __init__(self):
     def get_results(self, search_url):
         # convert to paged URL
         if '/P-' in search_url:
-            search_url = re.sub(r"/Suche/(.+?)/P-\d+", "/Suche/\1/P-%i", search_url)
+            search_url = re.sub(r"/Suche/(.+?)/P-\d+", r"/Suche/\1/P-[pageno]", search_url)
         else:
-            search_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-%i/", search_url)
+            search_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-[pageno]/", search_url)
         self.__log__.debug("Got search URL %s" % search_url)
 
         # load first page to get number of entries
@@ -29,16 +29,28 @@ def get_results(self, search_url):
         entries = self.extract_data(soup)
 
         # iterate over all remaining pages
-        while len(entries) < no_of_results:
+        num_empty_pages = 0
+        num_entries = len(entries)
+        while num_entries < no_of_results and num_empty_pages < 5:
             self.__log__.debug('Next Page')
             page_no += 1
             soup = self.get_page(search_url, page_no)
-            entries.extend(self.extract_data(soup))
+            new_entries = self.extract_data(soup)
+            num_entries += len(new_entries)
+
+            if not new_entries:
+                num_empty_pages += 1
+
+            entries.extend(new_entries)
 
         return entries
 
     def get_page(self, search_url, page_no):
-        resp = requests.get(search_url % page_no)
+        url = search_url.replace("[pageno]", str(page_no), 1)
+        return self.get_generic_page(url)
+
+    def get_generic_page(self, url):
+        resp = requests.get(url)
         if resp.status_code != 200:
             self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
         return BeautifulSoup(resp.content, 'html.parser')
@@ -68,3 +80,15 @@ def extract_data(self, soup):
         self.__log__.debug('extracted: ' + str(entries))
 
         return entries
+
+    def load_date(self, url):
+        # extract date of availability from the expose itself
+        soup = self.get_generic_page(url)
+
+        bezugsfrei_elements = soup.find_all(lambda e: e.has_attr("class") and "is24qa-bezugsfrei-ab" in e["class"])
+        bezugsfrei_date = "?"
+        if bezugsfrei_elements:
+            bezugsfrei_date = bezugsfrei_elements[0].text.strip()
+
+        return bezugsfrei_date
+
diff --git a/flathunter/crawl_wggesucht.py b/flathunter/crawl_wggesucht.py
index 97ffa76a..d2f41638 100644
--- a/flathunter/crawl_wggesucht.py
+++ b/flathunter/crawl_wggesucht.py
@@ -15,60 +15,69 @@ def get_results(self, search_url):
         self.__log__.debug("Got search URL %s" % search_url)
 
         # load first page
-        page_no = 0
-        soup = self.get_page(search_url, page_no)
-        no_of_pages = 0  # TODO get it from soup
-        self.__log__.info('Found pages: ' + str(no_of_pages))
+        soup = self.get_page(search_url)
+
+        # extract additional pages
+        page_urls = []
+        a_paginations = soup.find_all("a", class_="a-pagination")
+        for a_pagination in a_paginations:
+            # for each additional page
+            page_urls.append("https://www.wg-gesucht.de/" + a_pagination.get('href'))
+
+        self.__log__.info('Found pages: ' + str(len(page_urls)+1))
 
         # get data from first page
         entries = self.extract_data(soup)
         self.__log__.debug('Number of found entries: ' + str(len(entries)))
 
         # iterate over all remaining pages
-        while (page_no + 1) < no_of_pages:  # page_no starts with 0, no_of_pages with 1
-            page_no += 1
-            self.__log__.debug('Checking page %i' % page_no)
-            soup = self.get_page(search_url, page_no)
+        current_page_no = 2
+        for page_url in page_urls:
+            self.__log__.debug('Checking page %i' % current_page_no)
+            soup = self.get_page(page_url)
             entries.extend(self.extract_data(soup))
             self.__log__.debug('Number of found entries: ' + str(len(entries)))
+            current_page_no += 1
 
         return entries
 
-    def get_page(self, search_url, page_no):
-        resp = requests.get(search_url)  # TODO add page_no in url
+    def get_page(self, search_url):
+        # search_url must be specific page - cannot add page number manually
+        resp = requests.get(search_url)
         if resp.status_code != 200:
             self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')
 
     def extract_data(self, soup):
         entries = []
 
-        findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('ad--'))
+        findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('liste-'))
         existingFindings = list(
-            filter(lambda e: e.has_attr('class') and not 'listenansicht-inactive' in e['class'], findings))
+            filter(lambda e: e.has_attr('class') and not 'display-none' in e['class'], findings))
 
         baseurl = 'https://www.wg-gesucht.de/'
         for row in existingFindings:
-            url = baseurl + row['adid']  # u'wohnungen-in-Muenchen-Altstadt-Lehel.6038357.html'
-            id = int(url.split('.')[-2])
-            rooms = row.find(lambda e: e.has_attr('class') and 'ang_spalte_zimmer' in e['class']).text.strip()  # u'3'
-            price = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_miete' in e['class']).text.strip()  # u'433\u20ac'
-            size = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_groesse' in e['class']).text.strip()  # u'75m\xb2'
-            district = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_stadt' in e['class']).text.strip()  # u'Altstadt-Lehel'
-            date = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_freiab' in e['class']).text.strip()  # u'21.03.17'
+            infostring = row.find(
+                lambda e: e.name == "div" and e.has_attr('class') and 'list-details-panel-inner' in e[
+                    'class']).p.text.strip()
+            rooms = "1?" # re.findall(r'\d[-]Zimmer[-]Wohnung', infostring)[0][:1]
+            date = re.findall(r'\d{2}.\d{2}.\d{4}', infostring)[0]
+            detail = row.find_all(lambda e: e.name == "a" and e.has_attr('class') and 'detailansicht' in e['class']);
+            title = detail[2].text.strip()
+            url = baseurl + detail[0]["href"]
+            size_price = detail[0].text.strip()
+            price = re.findall(r'\d{2,4}\s€', size_price)[0]
+            size = re.findall(r'\d{2,4}\sm²', size_price)[0]
 
             details = {
                 'id': int(url.split('.')[-2]),
                 'url': url,
-                'title': "Wohnung in %s ab dem %s" % (district, date),
+                'title': title,
                 'price': price,
                 'size': size,
                 'rooms': rooms + " Zi.",
-                'address': url
+                'address': url,
+                'date': date,
             }
 
             entries.append(details)
@@ -78,9 +87,7 @@ def extract_data(self, soup):
 
     def load_address(self, url):
         # extract address from expose itself
-        exposeHTML = requests.get(url).content
-        exposeSoup = BeautifulSoup(exposeHTML, 'html.parser')
-        address_raw = exposeSoup.find(lambda e: e.has_attr('onclick') and '#map_tab' in e['onclick']).text
-        address = address_raw.strip().split('\n')[0] + ", " + address_raw.strip().split('\n')[-1].strip()
-
+        r = requests.get(url)
+        flat = BeautifulSoup(r.content, 'lxml')
+        address = ' '.join(flat.find('div', {"class": "col-sm-4 mb10"}).find("a", {"href": "#"}).text.strip().split())
         return address
diff --git a/flathunter/hunter.py b/flathunter/hunter.py
index a7eec058..f12aa48f 100644
--- a/flathunter/hunter.py
+++ b/flathunter/hunter.py
@@ -1,17 +1,21 @@
+import datetime
 import logging
-import requests
 import re
-import urllib
-import datetime
 import time
+import urllib
+from dateutil import parser
+import requests
+
 from flathunter.sender_telegram import SenderTelegram
+
 
 class Hunter:
     __log__ = logging.getLogger(__name__)
     GM_MODE_TRANSIT = 'transit'
     GM_MODE_BICYCLE = 'bicycling'
     GM_MODE_DRIVING = 'driving'
     GM_NO_KEY = 'YOUR_API_KEY'
+    GM_MODE_WALKING = 'walking'
 
     def hunt_flats(self, config, searchers, id_watch):
         sender = SenderTelegram(config)
@@ -21,6 +25,7 @@ def hunt_flats(self, config, searchers, id_watch):
 
         for url in config.get('urls', list()):
             self.__log__.debug('Processing URL: ' + url)
+            # TODO: improve control flow
             try:
                 for searcher in searchers:
                     if re.search(searcher.URL_PATTERN, url):
@@ -29,6 +34,9 @@ def hunt_flats(self, config, searchers, id_watch):
             except requests.exceptions.ConnectionError:
                 self.__log__.warning("Connection to %s failed. Retrying. " % url.split('/')[2])
                 continue
+            except Exception as e:
+                self.__log__.warning("Unknown error: {}".format(e), exc_info=True)
+                continue
 
             # on error, stop execution
             if not results:
@@ -43,7 +51,7 @@ def hunt_flats(self, config, searchers, id_watch):
 
             # to reduce traffic, some addresses need to be loaded on demand
             address = expose['address']
-            if address.startswith('http'):
+            if address.startswith('http'):  # ugh, TODO
                 url = address
                 for searcher in searchers:
                     if re.search(searcher.URL_PATTERN, url):
@@ -51,22 +59,69 @@ def hunt_flats(self, config, searchers, id_watch):
                         self.__log__.debug("Loaded address %s for url %s" % (address, url))
                         break
 
+            # filter districts
+            blacklist = config.get('blacklist', list())
+            address = ' '.join(filter(lambda x: x not in blacklist, address.split()))
+
+            # add to visited list already now so that we can actually skip if entry does not match date filter
+            id_watch.add(expose['id'])
+
+            # get date if necessary
+            if not "date" in expose:
+                for searcher in searchers:
+                    if re.search(searcher.URL_PATTERN, url):
+                        print(expose["url"])
+                        expose["date"] = searcher.load_date(expose["url"])
+                        self.__log__.debug("Loaded date {} for url {}".format(expose["date"], expose["url"]))
+                        break
+
+            date_filter = config.get('date_filter', dict())
+
+            mismatched_date = False
+            for blacklisted_phrase in date_filter.get("blacklist_phrases", []):
+                if blacklisted_phrase in expose["date"]:
+                    mismatched_date = True
+                    break
+
+            if mismatched_date:
+                # go to next expose
+                self.__log__.info("Skipping entry, date {} matches blacklist".format(expose["date"]))
+                continue
+
+            # try to parse date string
+            try:
+                parsed_date = parser.parse(expose["date"], dayfirst=True).date()
+
+                date_min = date_filter.get('date_min')
+                date_max = date_filter.get('date_max')
+
+                if date_min and parsed_date < date_min:
+                    self.__log__.info("Skipping entry, date {} too early".format(expose["date"]))
+                    continue
+                if date_max and parsed_date > date_max:
+                    self.__log__.info("Skipping entry, date {} too late".format(expose["date"]))
+                    continue
+
+            except (ValueError, OverflowError):
+                self.__log__.debug("Could not parse date {} for url {} - ignoring filters".format(expose["date"], url))
+
             # calculdate durations
             message = config.get('message', "").format(
                 title=expose['title'],
+                date=expose['date'],
                 rooms=expose['rooms'],
                 size=expose['size'],
                 price=expose['price'],
                 url=expose['url'],
+                address=address,
                 durations=self.get_formatted_durations(config, address)).strip()
 
             # send message to all receivers
             sender.send_msg(message)
 
             new_links = new_links + 1
-            id_watch.add(expose['id'])
 
-        self.__log__.info(str(new_links) + ' new offer found')
+        self.__log__.info(str(new_links) + ' new offers found')
 
     def get_formatted_durations(self, config, address):
         out = ""
@@ -75,7 +130,7 @@ def get_formatted_durations(self, config, address):
             dest = duration.get('destination')
             name = duration.get('name')
             for mode in duration.get('modes', list()):
-                if 'gm_id' in mode and 'title' in mode and 'key' in config.get('google_maps_api',dict()):
+                if 'gm_id' in mode and 'title' in mode and 'key' in config.get('google_maps_api', dict()):
                     duration = self.get_gmaps_distance(config, address, dest, mode['gm_id'])
                     out += "> %s (%s): %s\n" % (name, mode['title'], duration)
 
@@ -83,8 +138,8 @@ def get_formatted_durations(self, config, address):
 
     def get_gmaps_distance(self, config, address, dest, mode):
         # get timestamp for next monday at 9:00:00 o'clock
-        now = datetime.datetime.today().replace(hour=9,minute=0,second=0)
-        next_monday = now + datetime.timedelta(days=(7-now.weekday()))
+        now = datetime.datetime.today().replace(hour=9, minute=0, second=0)
+        next_monday = now + datetime.timedelta(days=(7 - now.weekday()))
         arrival_time = str(int(time.mktime(next_monday.timetuple())))
 
         # decode from unicode and url encode addresses
@@ -93,12 +148,12 @@ def get_gmaps_distance(self, config, address, dest, mode):
         self.__log__.debug("Got address: %s" % address)
 
         # get google maps config stuff
-        base_url = config.get('google_maps_api',dict()).get('url')
-        gm_key = config.get('google_maps_api',dict()).get('key')
+        base_url = config.get('google_maps_api', dict()).get('url')
+        gm_key = config.get('google_maps_api', dict()).get('key')
 
         if (not gm_key or self.GM_NO_KEY ) and mode != self.GM_MODE_DRIVING:
             self.__log__.warning("No Google Maps API key configured and without using a mode different from "
-                                 "'driving' is not allowed. Downgrading to mode 'drinving' thus. ")
+                                 "'driving' is not allowed. Downgrading to mode 'driving' thus. ")
             mode = 'driving'
             base_url = base_url.replace('&key={key}', '')
 
@@ -114,14 +169,13 @@ def get_gmaps_distance(self, config, address, dest, mode):
         for row in result['rows']:
             for element in row['elements']:
                 if 'status' in element and element['status'] != 'OK':
-                    self.__log__.warning("For address %s we got the status message: %s" % (address,element['status']))
+                    self.__log__.warning("For address %s we got the status message: %s" % (address, element['status']))
                     self.__log__.debug("We got this result: %s" % repr(result))
                     continue
-                self.__log__.debug("Got distance and duration: %s / %s (%i seconds)"
-                                   % (element['distance']['text'], element['duration']['text'],
-                                      element['duration']['value'])
-                                   )
+                self.__log__.debug("Got distance and duration: %s / %s (%i seconds)"
+                                   % (element['distance']['text'], element['duration']['text'],
+                                      element['duration']['value'])
+                                   )
                 distances[element['duration']['value']] = '%s (%s)' % \
                                                           (element['duration']['text'], element['distance']['text'])
         return distances[min(distances.keys())] if distances else None
-
diff --git a/requirements.txt b/requirements.txt
index a2d59628..a0248548 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,5 @@ idna==2.5
 PyYAML==3.12
 requests==2.18.1
 urllib3==1.21.1
+lxml==4.2.1
+python-dateutil==2.8.0

From 0cbf01bdcce1d8af37e4f23b9dcd0a2104209719 Mon Sep 17 00:00:00 2001
From: Nicolai Spohrer
Date: Sat, 12 Oct 2019 16:12:45 +0200
Subject: [PATCH 2/4] wggesucht: try/catch for address which cannot be found.

---
 flathunter/crawl_wggesucht.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/flathunter/crawl_wggesucht.py b/flathunter/crawl_wggesucht.py
index d2f41638..25265b70 100644
--- a/flathunter/crawl_wggesucht.py
+++ b/flathunter/crawl_wggesucht.py
@@ -89,5 +89,8 @@ def load_address(self, url):
         # extract address from expose itself
         r = requests.get(url)
         flat = BeautifulSoup(r.content, 'lxml')
-        address = ' '.join(flat.find('div', {"class": "col-sm-4 mb10"}).find("a", {"href": "#"}).text.strip().split())
+        try:
+            address = ' '.join(flat.find('div', {"class": "col-sm-4 mb10"}).find("a", {"href": "#"}).text.strip().split())
+        except:
+            address = "?"
         return address

From a4d8fc1e4095b23e9f44a6c98b65071b32191986 Mon Sep 17 00:00:00 2001
From: Nicolai Spohrer
Date: Sat, 12 Oct 2019 16:13:54 +0200
Subject: [PATCH 3/4] Update default config

Add date+address to message, add date_filter, comment out GMaps stuff
---
 config.yaml.dist | 60 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/config.yaml.dist b/config.yaml.dist
index c310ae81..d633fc8d 100644
--- a/config.yaml.dist
+++ b/config.yaml.dist
@@ -8,10 +8,16 @@ loop:
 # Currently supported services: www.immobilienscout24.de and
 # www.wg-gesucht.de. List the URLs in the following format:
 # urls:
-# - https://www.immobilienscout24.de/Suche/...
-# - https://www.wg-gesucht.de/...
+# - "https://www.immobilienscout24.de/Suche/..."
+# - "https://www.wg-gesucht.de/..."
 urls:
 
+# There are often city districts in the address which
+# Google Maps does not like. Use this blacklist to remove
+# districts from the search.
+blacklist:
+  - Innenstadt
+
 # If an expose includes an address, the bot is capable of
 # displaying the distance and time to travel (duration) to
 # some configured other addresses, for specific kinds of
@@ -22,6 +28,7 @@ urls:
 # - "bicyle"
 # - "transit" (public transport)
 # - "driving"
+# - "walking"
 #
 # The example configuration below includes a place for
 # "John", located at the main train station of munich.
@@ -29,21 +36,21 @@ urls:
 # each with a different label. Furthermore a place for
 # "Jane" is included, located at the given destination and
 # with the same kinds of travel.
-durations:
-    - name: John
-      destination: Hauptbahnhof, München
-      modes:
-        - gm_id: transit
-          title: "Öff."
-        - gm_id: bicycle
-          title: "Rad"
-    - name: Jane
-      destination: Karlsplatz, München
-      modes:
-        - gm_id: transit
-          title: "Öff."
-        - gm_id: driving
-          title: "Auto"
+#durations:
+#    - name: John
+#      destination: Hauptbahnhof, München
+#      modes:
+#        - gm_id: transit
+#          title: "Öff."
+#        - gm_id: bicycle
+#          title: "Rad"
+#    - name: Jane
+#      destination: Karlsplatz, München
+#      modes:
+#        - gm_id: transit
+#          title: "Öff."
+#        - gm_id: driving
+#          title: "Auto"
 
 # Multiline message (yes, the | is supposed to be there),
 # to format the message received from the Telegram bot.
@@ -54,15 +61,18 @@ durations:
 # - {price}: Price for the flat
 # - {durations}: Durations calculated by GMaps, see above
 # - {url}: URL to the expose
+# - {address}: address of the flat
+# - {date}: possible date of move
 message: |
-  {title}
+  {title} (ab {date})
   Zimmer: {rooms}
   Größe: {size}
   Preis: {price}
-  Anfahrt:
-  {durations}
+  Adresse: {address}
 
   {url}
+# Anfahrt:
+# {durations}
 
 # Calculating durations requires access to the Google Maps API.
 # Below you can configure the URL to access the API, with placeholders.
@@ -92,3 +102,13 @@ google_maps_api:
 telegram:
   bot_token:
   receiver_ids:
+
+# It is possible to filter entries by date of possible move. Three filters are available:
+# a minimum date, a maximum date and a blacklist. The blacklist is useful for dates
+# which cannot be parsed (e.g. "sofort").
+#date_filter:
+#  date_min: 2019-12-01
+#  date_max: 2020-01-01
+#  blacklist_phrases:
+#    - "sofort"
+date_filter:

From f15036ec13539be9d998bf768f7a1d19291578e9 Mon Sep 17 00:00:00 2001
From: Nicolai
Date: Sat, 12 Oct 2019 17:41:50 +0200
Subject: [PATCH 4/4] Remove unnecessary debug print

---
 flathunter/hunter.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/flathunter/hunter.py b/flathunter/hunter.py
index f12aa48f..e0ffb1e8 100644
--- a/flathunter/hunter.py
+++ b/flathunter/hunter.py
@@ -70,7 +70,6 @@ def hunt_flats(self, config, searchers, id_watch):
             if not "date" in expose:
                 for searcher in searchers:
                     if re.search(searcher.URL_PATTERN, url):
-                        print(expose["url"])
                         expose["date"] = searcher.load_date(expose["url"])
                         self.__log__.debug("Loaded date {} for url {}".format(expose["date"], expose["url"]))
                         break
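
Note for reviewers (not part of the patch series itself): the standalone sketch below illustrates how the new date_filter settings from config.yaml.dist are intended to interact with the parsing logic added to hunter.py in PATCH 1. It is a minimal approximation only; the helper name matches_date_filter and the sample values are invented for illustration and do not exist in the codebase.

import datetime

from dateutil import parser


def matches_date_filter(date_text, date_filter):
    """Return True if an expose with the availability text date_text passes the filter."""
    # Phrases that can never be parsed as a date (e.g. "sofort") are rejected explicitly.
    for phrase in date_filter.get("blacklist_phrases", []):
        if phrase in date_text:
            return False

    try:
        parsed_date = parser.parse(date_text, dayfirst=True).date()
    except (ValueError, OverflowError):
        # Unparseable dates are let through, mirroring the behaviour in hunter.py.
        return True

    date_min = date_filter.get("date_min")
    date_max = date_filter.get("date_max")
    if date_min and parsed_date < date_min:
        return False
    if date_max and parsed_date > date_max:
        return False
    return True


if __name__ == "__main__":
    # Example values only; in flathunter the dict would come from the loaded YAML config.
    example_filter = {
        "date_min": datetime.date(2019, 12, 1),
        "date_max": datetime.date(2020, 1, 1),
        "blacklist_phrases": ["sofort"],
    }
    print(matches_date_filter("15.12.2019", example_filter))  # True
    print(matches_date_filter("01.02.2020", example_filter))  # False: after date_max
    print(matches_date_filter("sofort", example_filter))      # False: blacklisted

A side note on the design: PyYAML turns unquoted values such as 2019-12-01 into datetime.date objects, which is why the straight comparison against the parsed expose date works without any extra conversion in the configuration example above.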