From 50243e6bb89e9d3f1e3d10a12707e4dd3b12b875 Mon Sep 17 00:00:00 2001
From: Nicolai Spohrer
Date: Sat, 12 Oct 2019 15:57:47 +0200
Subject: [PATCH 1/4] immobilienscout: Fixes. Add date retrieval. wggesucht: General fixes. Fix pagination. General: Add possibility to filter by date (immoscout does not offer this). GMaps stuff untested as it does not seem to work for free any longer.

---
 flathunter/crawl_immobilienscout.py | 34 +++++++++--
 flathunter/crawl_wggesucht.py       | 69 ++++++++++++----------
 flathunter/hunter.py                | 90 +++++++++++++++++++++++------
 requirements.txt                    |  2 +
 4 files changed, 141 insertions(+), 54 deletions(-)

diff --git a/flathunter/crawl_immobilienscout.py b/flathunter/crawl_immobilienscout.py
index d4ab9aff..665749f5 100644
--- a/flathunter/crawl_immobilienscout.py
+++ b/flathunter/crawl_immobilienscout.py
@@ -12,9 +12,9 @@ def __init__(self):
     def get_results(self, search_url):
         # convert to paged URL
         if '/P-' in search_url:
-            search_url = re.sub(r"/Suche/(.+?)/P-\d+", "/Suche/\1/P-%i", search_url)
+            search_url = re.sub(r"/Suche/(.+?)/P-\d+", r"/Suche/\1/P-[pageno]", search_url)
         else:
-            search_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-%i/", search_url)
+            search_url = re.sub(r"/Suche/(.+?)/", r"/Suche/\1/P-[pageno]/", search_url)
         self.__log__.debug("Got search URL %s" % search_url)
 
         # load first page to get number of entries
@@ -29,16 +29,28 @@ def get_results(self, search_url):
         entries = self.extract_data(soup)
 
         # iterate over all remaining pages
-        while len(entries) < no_of_results:
+        num_empty_pages = 0
+        num_entries = len(entries)
+        while num_entries < no_of_results and num_empty_pages < 5:
             self.__log__.debug('Next Page')
             page_no += 1
             soup = self.get_page(search_url, page_no)
-            entries.extend(self.extract_data(soup))
+            new_entries = self.extract_data(soup)
+            num_entries += len(new_entries)
+
+            if not new_entries:
+                num_empty_pages += 1
+
+            entries.extend(new_entries)
 
         return entries
 
     def get_page(self, search_url, page_no):
-        resp = requests.get(search_url % page_no)
+        url = search_url.replace("[pageno]", str(page_no), 1)
+        return self.get_generic_page(url)
+
+    def get_generic_page(self, url):
+        resp = requests.get(url)
         if resp.status_code != 200:
             self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
         return BeautifulSoup(resp.content, 'html.parser')
@@ -68,3 +80,15 @@ def extract_data(self, soup):
         self.__log__.debug('extracted: ' + str(entries))
 
         return entries
+
+    def load_date(self, url):
+        # extract date of availability from the expose itself
+        soup = self.get_generic_page(url)
+
+        bezugsfrei_elements = soup.find_all(lambda e: e.has_attr("class") and "is24qa-bezugsfrei-ab" in e["class"])
+        bezugsfrei_date = "?"
+        if bezugsfrei_elements:
+            bezugsfrei_date = bezugsfrei_elements[0].text.strip()
+
+        return bezugsfrei_date
+
diff --git a/flathunter/crawl_wggesucht.py b/flathunter/crawl_wggesucht.py
index 97ffa76a..d2f41638 100644
--- a/flathunter/crawl_wggesucht.py
+++ b/flathunter/crawl_wggesucht.py
@@ -15,60 +15,69 @@ def get_results(self, search_url):
         self.__log__.debug("Got search URL %s" % search_url)
 
         # load first page
-        page_no = 0
-        soup = self.get_page(search_url, page_no)
-        no_of_pages = 0  # TODO get it from soup
-        self.__log__.info('Found pages: ' + str(no_of_pages))
+        soup = self.get_page(search_url)
+
+        # extract additional pages
+        page_urls = []
+        a_paginations = soup.find_all("a", class_="a-pagination")
+        for a_pagination in a_paginations:
+            # for each additional page
+            page_urls.append("https://www.wg-gesucht.de/" + a_pagination.get('href'))
+
+        self.__log__.info('Found pages: ' + str(len(page_urls)+1))
 
         # get data from first page
         entries = self.extract_data(soup)
         self.__log__.debug('Number of found entries: ' + str(len(entries)))
 
         # iterate over all remaining pages
-        while (page_no + 1) < no_of_pages:  # page_no starts with 0, no_of_pages with 1
-            page_no += 1
-            self.__log__.debug('Checking page %i' % page_no)
-            soup = self.get_page(search_url, page_no)
+        current_page_no = 2
+        for page_url in page_urls:
+            self.__log__.debug('Checking page %i' % current_page_no)
+            soup = self.get_page(page_url)
             entries.extend(self.extract_data(soup))
             self.__log__.debug('Number of found entries: ' + str(len(entries)))
+            current_page_no += 1
 
         return entries
 
-    def get_page(self, search_url, page_no):
-        resp = requests.get(search_url)  # TODO add page_no in url
+    def get_page(self, search_url):
+        # search_url must be specific page - cannot add page number manually
+        resp = requests.get(search_url)
         if resp.status_code != 200:
             self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')
 
     def extract_data(self, soup):
         entries = []
 
-        findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('ad--'))
+        findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('liste-'))
         existingFindings = list(
-            filter(lambda e: e.has_attr('class') and not 'listenansicht-inactive' in e['class'], findings))
+            filter(lambda e: e.has_attr('class') and not 'display-none' in e['class'], findings))
 
         baseurl = 'https://www.wg-gesucht.de/'
         for row in existingFindings:
-            url = baseurl + row['adid']  # u'wohnungen-in-Muenchen-Altstadt-Lehel.6038357.html'
-            id = int(url.split('.')[-2])
-            rooms = row.find(lambda e: e.has_attr('class') and 'ang_spalte_zimmer' in e['class']).text.strip()  # u'3'
-            price = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_miete' in e['class']).text.strip()  # u'433\u20ac'
-            size = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_groesse' in e['class']).text.strip()  # u'75m\xb2'
-            district = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_stadt' in e['class']).text.strip()  # u'Altstadt-Lehel'
-            date = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_freiab' in e['class']).text.strip()  # u'21.03.17'
+            infostring = row.find(
+                lambda e: e.name == "div" and e.has_attr('class') and 'list-details-panel-inner' in e[
+                    'class']).p.text.strip()
+            rooms = "1?" # re.findall(r'\d[-]Zimmer[-]Wohnung', infostring)[0][:1]
+            date = re.findall(r'\d{2}.\d{2}.\d{4}', infostring)[0]
+            detail = row.find_all(lambda e: e.name == "a" and e.has_attr('class') and 'detailansicht' in e['class']);
+            title = detail[2].text.strip()
+            url = baseurl + detail[0]["href"]
+            size_price = detail[0].text.strip()
+            price = re.findall(r'\d{2,4}\s€', size_price)[0]
+            size = re.findall(r'\d{2,4}\sm²', size_price)[0]
 
             details = {
                 'id': int(url.split('.')[-2]),
                 'url': url,
-                'title': "Wohnung in %s ab dem %s" % (district, date),
+                'title': title,
                 'price': price,
                 'size': size,
                 'rooms': rooms + " Zi.",
-                'address': url
+                'address': url,
+                'date': date,
             }
 
             entries.append(details)
@@ -78,9 +87,7 @@ def extract_data(self, soup):
 
     def load_address(self, url):
         # extract address from expose itself
-        exposeHTML = requests.get(url).content
-        exposeSoup = BeautifulSoup(exposeHTML, 'html.parser')
-        address_raw = exposeSoup.find(lambda e: e.has_attr('onclick') and '#map_tab' in e['onclick']).text
-        address = address_raw.strip().split('\n')[0] + ", " + address_raw.strip().split('\n')[-1].strip()
-
+        r = requests.get(url)
+        flat = BeautifulSoup(r.content, 'lxml')
+        address = ' '.join(flat.find('div', {"class": "col-sm-4 mb10"}).find("a", {"href": "#"}).text.strip().split())
         return address
diff --git a/flathunter/hunter.py b/flathunter/hunter.py
index a7eec058..f12aa48f 100644
--- a/flathunter/hunter.py
+++ b/flathunter/hunter.py
@@ -1,17 +1,21 @@
+import datetime
 import logging
-import requests
 import re
-import urllib
-import datetime
 import time
+import urllib
+from dateutil import parser
+import requests
+
 from flathunter.sender_telegram import SenderTelegram
+
 
 class Hunter:
     __log__ = logging.getLogger(__name__)
     GM_MODE_TRANSIT = 'transit'
     GM_MODE_BICYCLE = 'bicycling'
     GM_MODE_DRIVING = 'driving'
     GM_NO_KEY = 'YOUR_API_KEY'
+    GM_MODE_WALKING = 'walking'
 
     def hunt_flats(self, config, searchers, id_watch):
         sender = SenderTelegram(config)
@@ -21,6 +25,7 @@ def hunt_flats(self, config, searchers, id_watch):
 
         for url in config.get('urls', list()):
             self.__log__.debug('Processing URL: ' + url)
+            # TODO: improve control flow
             try:
                 for searcher in searchers:
                     if re.search(searcher.URL_PATTERN, url):
@@ -29,6 +34,9 @@ def hunt_flats(self, config, searchers, id_watch):
             except requests.exceptions.ConnectionError:
                 self.__log__.warning("Connection to %s failed. Retrying. " % url.split('/')[2])
                 continue
+            except Exception as e:
+                self.__log__.warning("Unknown error: {}".format(e), exc_info=True)
+                continue
 
             # on error, stop execution
             if not results:
@@ -43,7 +51,7 @@ def hunt_flats(self, config, searchers, id_watch):
 
             # to reduce traffic, some addresses need to be loaded on demand
             address = expose['address']
-            if address.startswith('http'):
+            if address.startswith('http'):  # ugh, TODO
                 url = address
                 for searcher in searchers:
                     if re.search(searcher.URL_PATTERN, url):
@@ -51,22 +59,69 @@ def hunt_flats(self, config, searchers, id_watch):
                         self.__log__.debug("Loaded address %s for url %s" % (address, url))
                         break
 
+            # filter districts
+            blacklist = config.get('blacklist', list())
+            address = ' '.join(filter(lambda x: x not in blacklist, address.split()))
+
+            # add to visited list already now so that we can actually skip if entry does not match date filter
+            id_watch.add(expose['id'])
+
+            # get date if necessary
+            if not "date" in expose:
+                for searcher in searchers:
+                    if re.search(searcher.URL_PATTERN, url):
+                        print(expose["url"])
+                        expose["date"] = searcher.load_date(expose["url"])
+                        self.__log__.debug("Loaded date {} for url {}".format(expose["date"], expose["url"]))
+                        break
+
+            date_filter = config.get('date_filter', dict())
+
+            mismatched_date = False
+            for blacklisted_phrase in date_filter.get("blacklist_phrases", []):
+                if blacklisted_phrase in expose["date"]:
+                    mismatched_date = True
+                    break
+
+            if mismatched_date:
+                # go to next expose
+                self.__log__.info("Skipping entry, date {} matches blacklist".format(expose["date"]))
+                continue
+
+            # try to parse date string
+            try:
+                parsed_date = parser.parse(expose["date"], dayfirst=True).date()
+
+                date_min = date_filter.get('date_min')
+                date_max = date_filter.get('date_max')
+
+                if date_min and parsed_date < date_min:
+                    self.__log__.info("Skipping entry, date {} too early".format(expose["date"]))
+                    continue
+                if date_max and parsed_date > date_max:
+                    self.__log__.info("Skipping entry, date {} too late".format(expose["date"]))
+                    continue
+
+            except (ValueError, OverflowError):
+                self.__log__.debug("Could not parse date {} for url {} - ignoring filters".format(expose["date"], url))
+
             # calculdate durations
             message = config.get('message', "").format(
                 title=expose['title'],
+                date=expose['date'],
                 rooms=expose['rooms'],
                 size=expose['size'],
                 price=expose['price'],
                 url=expose['url'],
+                address=address,
                 durations=self.get_formatted_durations(config, address)).strip()
 
             # send message to all receivers
             sender.send_msg(message)
 
             new_links = new_links + 1
-            id_watch.add(expose['id'])
 
-        self.__log__.info(str(new_links) + ' new offer found')
+        self.__log__.info(str(new_links) + ' new offers found')
 
     def get_formatted_durations(self, config, address):
         out = ""
@@ -75,7 +130,7 @@ def get_formatted_durations(self, config, address):
             dest = duration.get('destination')
             name = duration.get('name')
             for mode in duration.get('modes', list()):
-                if 'gm_id' in mode and 'title' in mode and 'key' in config.get('google_maps_api',dict()):
+                if 'gm_id' in mode and 'title' in mode and 'key' in config.get('google_maps_api', dict()):
                     duration = self.get_gmaps_distance(config, address, dest, mode['gm_id'])
                     out += "> %s (%s): %s\n" % (name, mode['title'], duration)
 
@@ -83,8 +138,8 @@ def get_formatted_durations(self, config, address):
 
     def get_gmaps_distance(self, config, address, dest, mode):
         # get timestamp for next monday at 9:00:00 o'clock
-        now = datetime.datetime.today().replace(hour=9,minute=0,second=0)
-        next_monday = now + datetime.timedelta(days=(7-now.weekday()))
+        now = datetime.datetime.today().replace(hour=9, minute=0, second=0)
+        next_monday = now + datetime.timedelta(days=(7 - now.weekday()))
         arrival_time = str(int(time.mktime(next_monday.timetuple())))
 
         # decode from unicode and url encode addresses
@@ -93,12 +148,12 @@ def get_gmaps_distance(self, config, address, dest, mode):
         self.__log__.debug("Got address: %s" % address)
 
         # get google maps config stuff
-        base_url = config.get('google_maps_api',dict()).get('url')
-        gm_key = config.get('google_maps_api',dict()).get('key')
+        base_url = config.get('google_maps_api', dict()).get('url')
+        gm_key = config.get('google_maps_api', dict()).get('key')
 
         if (not gm_key or self.GM_NO_KEY ) and mode != self.GM_MODE_DRIVING:
             self.__log__.warning("No Google Maps API key configured and without using a mode different from "
-                                 "'driving' is not allowed. Downgrading to mode 'drinving' thus. ")
+                                 "'driving' is not allowed. Downgrading to mode 'driving' thus. ")
             mode = 'driving'
             base_url = base_url.replace('&key={key}', '')
 
@@ -114,14 +169,13 @@ def get_gmaps_distance(self, config, address, dest, mode):
         for row in result['rows']:
             for element in row['elements']:
                 if 'status' in element and element['status'] != 'OK':
-                    self.__log__.warning("For address %s we got the status message: %s" % (address,element['status']))
+                    self.__log__.warning("For address %s we got the status message: %s" % (address, element['status']))
                     self.__log__.debug("We got this result: %s" % repr(result))
                     continue
-                self.__log__.debug("Got distance and duration: %s / %s (%i seconds)"
-                                   % (element['distance']['text'], element['duration']['text'],
-                                      element['duration']['value'])
-                                   )
+                self.__log__.debug("Got distance and duration: %s / %s (%i seconds)"
+                                   % (element['distance']['text'], element['duration']['text'],
+                                      element['duration']['value'])
+                                   )
                 distances[element['duration']['value']] = '%s (%s)' % \
                                                           (element['duration']['text'], element['distance']['text'])
         return distances[min(distances.keys())] if distances else None
-
diff --git a/requirements.txt b/requirements.txt
index a2d59628..a0248548 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,5 @@ idna==2.5
 PyYAML==3.12
 requests==2.18.1
 urllib3==1.21.1
+lxml==4.2.1
+python-dateutil==2.8.0

From 0cbf01bdcce1d8af37e4f23b9dcd0a2104209719 Mon Sep 17 00:00:00 2001
From: Nicolai Spohrer
Date: Sat, 12 Oct 2019 16:12:45 +0200
Subject: [PATCH 2/4] wggesucht: try/catch for address which cannot be found.

---
 flathunter/crawl_wggesucht.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/flathunter/crawl_wggesucht.py b/flathunter/crawl_wggesucht.py
index d2f41638..25265b70 100644
--- a/flathunter/crawl_wggesucht.py
+++ b/flathunter/crawl_wggesucht.py
@@ -89,5 +89,8 @@ def load_address(self, url):
         # extract address from expose itself
         r = requests.get(url)
         flat = BeautifulSoup(r.content, 'lxml')
-        address = ' '.join(flat.find('div', {"class": "col-sm-4 mb10"}).find("a", {"href": "#"}).text.strip().split())
+        try:
+            address = ' '.join(flat.find('div', {"class": "col-sm-4 mb10"}).find("a", {"href": "#"}).text.strip().split())
+        except:
+            address = "?"
         return address

From a4d8fc1e4095b23e9f44a6c98b65071b32191986 Mon Sep 17 00:00:00 2001
From: Nicolai Spohrer
Date: Sat, 12 Oct 2019 16:13:54 +0200
Subject: [PATCH 3/4] Update default config

Add date+address to message, add date_filter, comment out GMaps stuff
---
 config.yaml.dist | 60 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/config.yaml.dist b/config.yaml.dist
index c310ae81..d633fc8d 100644
--- a/config.yaml.dist
+++ b/config.yaml.dist
@@ -8,10 +8,16 @@ loop:
 # Currently supported services: www.immobilienscout24.de and
 # www.wg-gesucht.de. List the URLs in the following format:
 # urls:
-# - https://www.immobilienscout24.de/Suche/...
-# - https://www.wg-gesucht.de/...
+# - "https://www.immobilienscout24.de/Suche/..."
+# - "https://www.wg-gesucht.de/..."
 urls:
 
+# There are often city districts in the address which
+# Google Maps does not like. Use this blacklist to remove
+# districts from the search.
+blacklist:
+  - Innenstadt
+
 # If an expose includes an address, the bot is capable of
 # displaying the distance and time to travel (duration) to
 # some configured other addresses, for specific kinds of
@@ -22,6 +28,7 @@ urls:
 # - "bicyle"
 # - "transit" (public transport)
 # - "driving"
+# - "walking"
 #
 # The example configuration below includes a place for
 # "John", located at the main train station of munich.
@@ -29,21 +36,21 @@ urls:
 # each with a different label. Furthermore a place for
 # "Jane" is included, located at the given destination and
 # with the same kinds of travel.
-durations:
-    - name: John
-      destination: Hauptbahnhof, München
-      modes:
-        - gm_id: transit
-          title: "Öff."
-        - gm_id: bicycle
-          title: "Rad"
-    - name: Jane
-      destination: Karlsplatz, München
-      modes:
-        - gm_id: transit
-          title: "Öff."
-        - gm_id: driving
-          title: "Auto"
+#durations:
+#    - name: John
+#      destination: Hauptbahnhof, München
+#      modes:
+#        - gm_id: transit
+#          title: "Öff."
+#        - gm_id: bicycle
+#          title: "Rad"
+#    - name: Jane
+#      destination: Karlsplatz, München
+#      modes:
+#        - gm_id: transit
+#          title: "Öff."
+#        - gm_id: driving
+#          title: "Auto"
 
 # Multiline message (yes, the | is supposed to be there),
 # to format the message received from the Telegram bot.
@@ -54,15 +61,18 @@ durations:
 # - {price}: Price for the flat
 # - {durations}: Durations calculated by GMaps, see above
 # - {url}: URL to the expose
+# - {address}: address of the flat
+# - {date}: possible date of move
 message: |
-  {title}
+  {title} (ab {date})
   Zimmer: {rooms}
   Größe: {size}
   Preis: {price}
-  Anfahrt:
-  {durations}
+  Adresse: {address}
 
   {url}
+# Anfahrt:
+# {durations}
 
 # Calculating durations requires access to the Google Maps API.
 # Below you can configure the URL to access the API, with placeholders.
@@ -92,3 +102,13 @@ google_maps_api:
 telegram:
   bot_token:
   receiver_ids:
+
+# It is possible to filter entries by date of possible move. Three filters are available:
+# a minimum date, a maximum date and a blacklist. The blacklist is useful for dates
+# which cannot be parsed (e.g. "sofort").
+#date_filter:
+#  date_min: 2019-12-01
+#  date_max: 2020-01-01
+#  blacklist_phrases:
+#    - "sofort"
+date_filter:

From f15036ec13539be9d998bf768f7a1d19291578e9 Mon Sep 17 00:00:00 2001
From: Nicolai
Date: Sat, 12 Oct 2019 17:41:50 +0200
Subject: [PATCH 4/4] Remove unnecessary debug print

---
 flathunter/hunter.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/flathunter/hunter.py b/flathunter/hunter.py
index f12aa48f..e0ffb1e8 100644
--- a/flathunter/hunter.py
+++ b/flathunter/hunter.py
@@ -70,7 +70,6 @@ def hunt_flats(self, config, searchers, id_watch):
             if not "date" in expose:
                 for searcher in searchers:
                     if re.search(searcher.URL_PATTERN, url):
-                        print(expose["url"])
                         expose["date"] = searcher.load_date(expose["url"])
                         self.__log__.debug("Loaded date {} for url {}".format(expose["date"], expose["url"]))
                         break
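
Note for reviewers (not part of the patch series itself): the standalone sketch below illustrates how the new date_filter settings from config.yaml.dist are intended to interact with the parsing logic added to hunter.py in PATCH 1. It is a minimal approximation only; the helper name matches_date_filter and the sample values are invented for illustration and do not exist in the codebase.

import datetime

from dateutil import parser


def matches_date_filter(date_text, date_filter):
    """Return True if an expose with the availability text date_text passes the filter."""
    # Phrases that can never be parsed as a date (e.g. "sofort") are rejected explicitly.
    for phrase in date_filter.get("blacklist_phrases", []):
        if phrase in date_text:
            return False

    try:
        parsed_date = parser.parse(date_text, dayfirst=True).date()
    except (ValueError, OverflowError):
        # Unparseable dates are let through, mirroring the behaviour in hunter.py.
        return True

    date_min = date_filter.get("date_min")
    date_max = date_filter.get("date_max")
    if date_min and parsed_date < date_min:
        return False
    if date_max and parsed_date > date_max:
        return False
    return True


if __name__ == "__main__":
    # Example values only; in flathunter the dict would come from the loaded YAML config.
    example_filter = {
        "date_min": datetime.date(2019, 12, 1),
        "date_max": datetime.date(2020, 1, 1),
        "blacklist_phrases": ["sofort"],
    }
    print(matches_date_filter("15.12.2019", example_filter))  # True
    print(matches_date_filter("01.02.2020", example_filter))  # False: after date_max
    print(matches_date_filter("sofort", example_filter))      # False: blacklisted

A side note on the design: PyYAML turns unquoted values such as 2019-12-01 into datetime.date objects, which is why the straight comparison against the parsed expose date works without any extra conversion in the configuration example above.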