diff --git a/park_api/app.py b/park_api/app.py
index 170e0fe..9b9bad2 100644
--- a/park_api/app.py
+++ b/park_api/app.py
@@ -1,36 +1,21 @@
 from datetime import datetime
 from os import getloadavg
-
 from flask import Flask, jsonify, abort, request
-import psycopg2
-from park_api import scraper, util, env, db
+
+from park_api import scraper, util, env, db
+from park_api.models import Lots
 from park_api.forecast import find_forecast
 from park_api.crossdomain import crossdomain
 
 app = Flask(__name__)
 
 
-def user_agent(request):
-    ua = request.headers.get("User-Agent")
-    return "no user-agent" if ua is None else ua
-
-
 @app.route("/")
 @crossdomain("*")
 def get_meta():
-    app.logger.info("GET / - " + user_agent(request))
-
     cities = {}
     for module in env.supported_cities().values():
         city = module.geodata.city
-        cities[city.id] = {
-            "name": city.name,
-            "coords": city.coords,
-            "source": city.source,
-            "url": city.url,
-            "active_support": city.active_support
-        }
-
+        cities[city.id] = city.as_json()
     return jsonify({
         "cities": cities,
         "api_version": env.API_VERSION,
@@ -55,8 +40,6 @@ def get_lots(city):
     if city == "favicon.ico" or city == "robots.txt":
         abort(404)
 
-    app.logger.info("GET /" + city + " - " + user_agent(request))
-
     city_module = env.supported_cities().get(city, None)
 
     if city_module is None:
@@ -66,27 +49,18 @@ def get_lots(city):
                   "' isn't supported at the current time.", 404)
 
     if env.LIVE_SCRAPE:
-        return jsonify(scraper._live(city_module))
+        lots = scraper.scrape_lots(city_module)
+        return jsonify(lots.as_json())
 
-    try:
-        with db.cursor() as cursor:
-            sql = "SELECT timestamp_updated, timestamp_downloaded, data" \
-                  " FROM parkapi WHERE city=%s ORDER BY timestamp_downloaded DESC LIMIT 1;"
-            cursor.execute(sql, (city,))
-            data = cursor.fetchall()[0]["data"]
-    except (psycopg2.OperationalError, psycopg2.ProgrammingError) as e:
-        app.logger.error("Unable to connect to database: " + str(e))
-        abort(500)
+    lots = Lots()
+    for lot in city_module.geodata.lots.values():
+        lots.append(lot)
+    with db.cursor() as cursor:
+        lots.load(cursor)
 
-    return jsonify(data)
+    return jsonify(lots.as_json())
 
 
 @app.route("/<city>/<lot_id>/timespan")
 @crossdomain("*")
 def get_longtime_forecast(city, lot_id):
-    app.logger.info("GET /%s/%s/timespan %s" %
-                    (city, lot_id, user_agent(request)))
-
     try:
         datetime.strptime(request.args["from"], '%Y-%m-%dT%H:%M:%S')
         datetime.strptime(request.args["to"], '%Y-%m-%dT%H:%M:%S')
@@ -103,8 +77,6 @@ def get_longtime_forecast(city, lot_id):
 
 @app.route("/coffee")
 def make_coffee():
-    app.logger.info("GET /coffee - " + user_agent(request))
-
     return """

 <h1>I'm a teapot</h1>
 <p>
 This server is a teapot, not a coffee machine.
 </p>
 <img src="http://i.imgur.com/xVpIC9N.gif"
@@ -112,3 +84,11 @@ def make_coffee():
          alt="British porn"
          title="British porn"/>
 """, 418
+
+
+@app.before_request
+def log_request():
+    ua = request.headers.get("User-Agent")
+    if not ua:
+        ua = "no user-agent"
+    app.logger.info("%s %s - %s" % (request.method, request.path, ua))
diff --git a/park_api/cities/Bonn.py b/park_api/cities/Bonn.py
index 3be6aec..2b02404 100644
--- a/park_api/cities/Bonn.py
+++ b/park_api/cities/Bonn.py
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
-from park_api.geodata import GeoData
-from park_api.util import convert_date
+from park_api.models import GeoData, Lots
+from park_api.util import parse_date
 
 geodata = GeoData(__file__)
 
@@ -22,21 +22,13 @@ def parse_html(html):
         "Expect to find 6 lots in Bonn, got: %d" % len(free_lots)
 
     time = soup.find("td", {"class": "stand"}).text.strip()
-    lots = []
+    lots = Lots()
+    updated_at = parse_date(time, "%d.%m.%y %H:%M:%S")
     for idx, free in enumerate(free_lots):
         lot = geodata.lot(lot_map[idx])
-        lots.append({
-            "name": lot.name,
-            "coords": lot.coords,
-            "free": int(free.text),
-            "address": lot.address,
-            "total": lot.total,
-            "state": "nodata",
-            "id": lot.id,
-            "forecast": False
-        })
+        lot.free = int(free.text)
+        lot.updated_at = updated_at
+        lot.state = "nodata"
+        lots.append(lot)
 
-    return {
-        "last_updated": convert_date(time, "%d.%m.%y %H:%M:%S"),
-        "lots": lots
-    }
+    return lots
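# Every converted city scraper now follows the same contract: build a Lots
# collection, fill in one Lot per garage, and return it. A minimal sketch of
# that contract, assuming a module-level geodata like the city modules above;
# the lot name, numbers, and date string are made up:
from park_api.models import GeoData, Lots
from park_api.util import parse_date

geodata = GeoData(__file__)


def parse_html(html):
    lots = Lots()
    updated_at = parse_date("24.12.15 13:37:00", "%d.%m.%y %H:%M:%S")
    lot = geodata.lot("Example Garage")  # hypothetical lot name
    lot.free = 42                        # would be parsed from `html`
    lot.state = "open"                   # one of: open, closed, nodata
    lot.updated_at = updated_at
    lots.append(lot)
    return lots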
diff --git a/park_api/cities/Dresden.py b/park_api/cities/Dresden.py
index 13a0fcd..e000608 100644
--- a/park_api/cities/Dresden.py
+++ b/park_api/cities/Dresden.py
@@ -1,63 +1,46 @@
-import os
 from bs4 import BeautifulSoup
-from park_api.geodata import GeoData
-from park_api.util import convert_date, get_most_lots_from_known_data
+from park_api.models import GeoData, Lots
+from park_api.util import parse_date
 
 geodata = GeoData(__file__)
 
 
 def parse_html(html):
     soup = BeautifulSoup(html, "html.parser")
-    date_field = soup.find(id="P1_LAST_UPDATE").text
-    last_updated = convert_date(date_field, "%d.%m.%Y %H:%M:%S")
-    data = {
-        "lots": [],
-        "last_updated": last_updated
-    }
+    date = soup.find(id="P1_LAST_UPDATE").text
+    lots = Lots()
+    updated_at = parse_date(date, "%d.%m.%Y %H:%M:%S")
 
     for table in soup.find_all("table"):
-        if table["summary"] != "":
-            region = table["summary"]
-
-        for lot_row in table.find_all("tr"):
-            if lot_row.find("th") is not None:
-                continue
-
-            cls = lot_row.find("div")["class"]
-            state = "nodata"
-            if "green" in cls or "yellow" in cls or "red" in cls:
-                state = "open"
-            elif "park-closed" in cls:
-                state = "closed"
-
-            lot_name = lot_row.find("td", {"headers": "BEZEICHNUNG"}).text
-
-            try:
-                col = lot_row.find("td", {"headers": "FREI"})
-                free = int(col.text)
-            except ValueError:
-                free = 0
-
-            try:
-                col = lot_row.find("td", {"headers": "KAPAZITAET"})
-                total = int(col.text)
-            except ValueError:
-                total = get_most_lots_from_known_data("Dresden", lot_name)
-
-            lot = geodata.lot(lot_name)
-            forecast = os.path.isfile("forecast_data/" + lot.id + ".csv")
-
-            data["lots"].append({
-                "coords": lot.coords,
-                "name": lot_name,
-                "total": total,
-                "free": free,
-                "state": state,
-                "id": lot.id,
-                "lot_type": lot.type,
-                "address": lot.address,
-                "forecast": forecast,
-                "region": region
-            })
-
-    return data
+        if table["summary"] == "":
+            continue
+        region = table["summary"]
+
+        for lot_row in table.find_all("tr"):
+            if lot_row.find("th") is not None:
+                continue
+
+            name = lot_row.find("td", {"headers": "BEZEICHNUNG"}).text
+            lot = geodata.lot(name)
+
+            col = lot_row.find("td", {"headers": "FREI"})
+            if col.text.strip() == "":
+                lot.free = 0
+            else:
+                lot.free = int(col.text)
+
+            cls = lot_row.find("div")["class"]
+            if "green" in cls or "yellow" in cls or "red" in cls:
+                lot.state = "open"
+            elif "park-closed" in cls:
+                lot.state = "closed"
+
+            col = lot_row.find("td", {"headers": "KAPAZITAET"})
+            try:
+                lot.total = int(col.text)
+            except ValueError:
+                pass
+            lot.region = region
+            lot.updated_at = updated_at
+            lots.append(lot)
+    return lots
diff --git a/park_api/cities/Ingolstadt.py b/park_api/cities/Ingolstadt.py
index 11998f7..019b5cb 100644
--- a/park_api/cities/Ingolstadt.py
+++ b/park_api/cities/Ingolstadt.py
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
-from park_api.geodata import GeoData
-from park_api.util import convert_date
+from park_api.models import GeoData, Lots
+from park_api.util import parse_date
 
 # Additional information for single lots:
 # http://www2.ingolstadt.de/Wirtschaft/Parken/Parkeinrichtungen_der_IFG/
@@ -10,34 +10,20 @@
 def parse_html(html):
     soup = BeautifulSoup(html, "html.parser")
 
-    data = {
-        "last_updated": convert_date(soup.p.string, "(%d.%m.%Y, %H.%M Uhr)"),
-        "lots": []
-    }
+    updated_at = parse_date(soup.p.string, "(%d.%m.%Y, %H.%M Uhr)")
+    lots = Lots()
 
-    # get all lots
-    raw_lots = soup.find_all("tr")
+    for raw_lot in soup.find_all("tr"):
+        tds = raw_lot.find_all("td")
 
-    for raw_lot in raw_lots:
-        elements = raw_lot.find_all("td")
-
-        state = "open"
         if "class" in raw_lot.attrs and raw_lot["class"][0] == "strike":
             state = "closed"
-
-        lot_name = elements[0].text
-
-        lot = geodata.lot(lot_name)
-        data["lots"].append({
-            "name": lot.name,
-            "free": int(elements[1].text),
-            "total": lot.total,
-            "lot_type": lot.type,
-            "address": lot.address,
-            "coords": lot.coords,
-            "state": state,
-            "id": lot.id,
-            "forecast": False
-        })
-
-    return data
+        else:
+            state = "open"
+
+        lot = geodata.lot(tds[0].text)
+        lot.free = int(tds[1].text)
+        lot.state = state
+        lot.updated_at = updated_at
+        lots.append(lot)
+    return lots
diff --git a/park_api/cities/Konstanz.py b/park_api/cities/Konstanz.py
index 8167fba..9539589 100644
--- a/park_api/cities/Konstanz.py
+++ b/park_api/cities/Konstanz.py
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
-from park_api.util import convert_date
-from park_api.geodata import GeoData
+from park_api.util import parse_date
+from park_api.models import GeoData, Lots
 
 geodata = GeoData(__file__)
 
@@ -11,48 +11,25 @@ def parse_html(html):
     # last update time (UTC)
     try:
         date_col = soup.select('p > strong')[-1].text
-        update_time = convert_date(date_col, "Stand: %d.%m.%Y - %H:%M:%S")
+        updated_at = parse_date(date_col, "Stand: %d.%m.%Y - %H:%M:%S")
     except ValueError:
         date_col = soup.select('p > strong')[-2].text
-        update_time = convert_date(date_col, "Stand: %d.%m.%Y - %H:%M:%S")
+        updated_at = parse_date(date_col, "Stand: %d.%m.%Y - %H:%M:%S")
 
-    data = {
-        "last_updated": update_time,
-        "lots": []
-    }
+    lots = Lots()
+    for lot_list in soup.find_all("div", {"class": "listing"}):
+        raw_lots = lot_list.select('tr + tr')
 
-    # get all tables with lots
-    raw_lot_list = soup.find_all("div", {"class": "listing"})
+        for raw_lot in raw_lots:
+            name = raw_lot.select('a')[0].text
+            lot = geodata.lot(name)
+            lot.updated_at = updated_at
+            lot.free = int(raw_lot.select('td + td')[0].text)
 
-    # get all lots
-    for lot_list in raw_lot_list:
-        raw_lots = lot_list.select('tr + tr')
+            if "green" in str(raw_lot.select("td + td")[0]):
+                lot.state = "open"
+            else:
+                lot.state = "closed"
 
-        for lot in raw_lots:
-            lot_name = lot.select('a')[0].text
-
-            try:
-                lot_free = int(lot.select('td + td')[0].text)
-            except ValueError:
-                lot_free = 0
-
-            try:
-                if "green" in str(lot.select("td + td")[0]):
-                    lot_state = "open"
-                else:
-                    lot_state = "closed"
-            except ValueError:
-                lot_state = "nodata"
-
-            lot = geodata.lot(lot_name)
-            data["lots"].append({
-                "name": lot_name,
-                "free": lot_free,
-                "total": lot.total,
-                "coords": lot.coords,
-                "state": lot_state,
-                "id": lot.id,
-                "forecast": False
-            })
-
-    return data
+            lots.append(lot)
+    return lots
diff --git a/park_api/cities/Luebeck.py b/park_api/cities/Luebeck.py
index d96a431..4f5956c 100644
--- a/park_api/cities/Luebeck.py
+++ b/park_api/cities/Luebeck.py
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
-from park_api.util import convert_date, get_most_lots_from_known_data
-from park_api.geodata import GeoData
+from park_api.util import parse_date
+from park_api.models import GeoData, Lots
 
 process_state_map = {
     "": "open",
@@ -17,16 +17,13 @@ def parse_html(html):
     soup = BeautifulSoup(html, "html.parser")
 
     date_field = soup.find("tr").find("strong").text
-    last_updated = convert_date(date_field, "Stand: %d.%m.%Y, %H:%M Uhr")
-    data = {
-        "last_updated": last_updated,
-        "lots": []
-    }
 
     rows = soup.find_all("tr")
     rows = rows[1:]
 
     region_header = ""
+    lots = Lots()
+    updated_at = parse_date(date_field, "Stand: %d.%m.%Y, %H:%M Uhr")
 
     for row in rows:
         if len(row.find_all("th")) > 0:
             # This is a header row, save it for later
@@ -39,31 +36,20 @@ def parse_html(html):
         raw_lot_data = row.find_all("td")
 
         type_and_name = process_name(raw_lot_data[0].text)
+        lot = geodata.lot(type_and_name[1])
 
         if len(raw_lot_data) == 2:
-            total = get_most_lots_from_known_data("Lübeck",
-                                                  type_and_name[1])
-            free = 0
-            state = process_state_map.get(raw_lot_data[1].text, "")
+            lot.state = process_state_map.get(raw_lot_data[1].text, "")
         elif len(raw_lot_data) == 4:
-            total = int(raw_lot_data[1].text)
-            free = int(raw_lot_data[2].text)
-            state = "open"
-
-        lot = geodata.lot(type_and_name[1])
-        data["lots"].append({
-            "name": lot.name,
-            "lot_type": type_and_name[0],
-            "total": total,
-            "free": free,
-            "region": region_header,
-            "state": state,
-            "coords": lot.coords,
-            "id": lot.id,
-            "forecast": False
-        })
-
-    return data
+            lot.total = int(raw_lot_data[1].text)
+            lot.free = int(raw_lot_data[2].text)
+            lot.state = "open"
+
+        lot.lot_type = type_and_name[0]
+        lot.region = region_header
+        lot.updated_at = updated_at
+        lots.append(lot)
+    return lots
 
 
 def process_name(name):
diff --git a/park_api/cities/Muenster.py b/park_api/cities/Muenster.py
index 483f81a..9341a35 100644
--- a/park_api/cities/Muenster.py
+++ b/park_api/cities/Muenster.py
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
-from park_api.util import convert_date
-from park_api.geodata import GeoData
+from park_api.util import parse_date
+from park_api.models import GeoData, Lots
 
 state_map = {
     "frei": "open",
@@ -17,27 +17,20 @@ def parse_html(html):
     lot_table_trs = soup.select("div#parkingList table")[0].find_all("tr")
     date_field = soup.find(id="lastRefresh").text.strip()
 
-    data = {
-        "last_updated": convert_date(date_field, "%d.%m.%Y %H:%M Uhr"),
-        "lots": []
-    }
-
+    lots = Lots()
+    updated_at = parse_date(date_field, "%d.%m.%Y %H:%M Uhr")
     for tr in lot_table_trs[1:-1]:
         tds = tr.find_all("td")
-        type_and_name = process_name(tds[0].text.strip())
-        lot = geodata.lot(tds[0].text.strip())
-        data["lots"].append({
-            "name": type_and_name[1].strip("\n"),
-            "lot_type": type_and_name[0],
-            "free": int(tds[1].text),
-            "total": lot.total,
-            "state": state_map.get(tds[2].text, ""),
-            "coords": lot.coords,
-            "id": lot.id,
-            "forecast": False
-        })
-
-    return data
+        description = tds[0].text.strip()
+        type_, name = process_name(description)
+        lot = geodata.lot(description)
+        lot.name = name.strip("\n")
+        lot.lot_type = type_
+        lot.free = int(tds[1].text)
+        lot.state = state_map.get(tds[2].text, "")
+        lot.updated_at = updated_at
+        lots.append(lot)
+    return lots
 
 
 def process_name(name):
diff --git a/park_api/cities/Oldenburg.py b/park_api/cities/Oldenburg.py
index defa6f9..a49467e 100644
--- a/park_api/cities/Oldenburg.py
+++ b/park_api/cities/Oldenburg.py
@@ -1,91 +1,63 @@
 from bs4 import BeautifulSoup
-from park_api.util import convert_date
-from park_api.geodata import GeoData
+from park_api.util import parse_date
+from park_api.models import GeoData, Lots
 
-# This loads the geodata for this city if .geojson
-# exists in the same directory as this file.
-# No need to remove this if there's no geodata (yet),
-# everything will still work.
 geodata = GeoData(__file__)
 
+status_map = {
+    "Offen": "open",
+    "Geschlossen": "closed"
+}
+
+# Oldenburg does not publish totals on its website,
+# so we take some values from a 2011 PDF:
+# http://www.oldenburg.de/fileadmin/oldenburg/Benutzer/PDF/41/414/Parkplatz_Uebersicht2.pdf
+# and http://gis4oldenburg.oldenburg.de/?es=C12S77
+# What could possibly go wrong ¯\_(ツ)_/¯
+lots_map = {
+    "Waffenplatz": [650, "Waffenplatz 3"],
+    "City": [440, "Staulinie 10"],
+    "Galeria Kaufhof": [326, "Ritterstraße"],
+    "Pferdemarkt": [401, "Pferdemarkt 13"],
+    # CCO 1 & 2 are only known together as 420,
+    # but they seem to be split somewhat like this
+    "CCO Parkdeck 1": [190, "Heiligengeiststraße 4"],
+    "CCO Parkdeck 2": [230, "Heiligengeiststraße 4"],
+    "Hbf/ZOB": [358, "Karlstraße"],
+    "Theaterwall": [125, "Theaterwall 4"],
+    "Theatergarage": [107, "Roonstraße"],
+    "Heiligengeist-Höfe": [275, "Georgstraße"],
+    "Schlosshöfe": [430, "Mühlenstraße"],
+}
+
 
-# This function is called by the scraper and
-# given the data of the page specified as data_url above.
-# It's supposed to return a dictionary,
-# containing everything the current spec expects.
-# Tests will fail if it doesn't ;)
 def parse_html(html):
-    # BeautifulSoup is a great and easy way to parse the html and
-    # find the bits and pieces we're looking for.
     soup = BeautifulSoup(html, "html.parser")
 
-    # last_updated is the date when the data on the page was last updated
     last_updated = str(soup.select("body"))
     start = str.find(last_updated, "Letzte Aktualisierung:") + 23
     last_updated = last_updated[start:start + 16] + ' Uhr'
 
-    data = {
-        # convert_date is a utility function
-        # you can use to turn this date into the correct string format
-        "last_updated": convert_date(last_updated, "%d.%m.%Y %H:%M Uhr"),
-        "lots": []
-    }
-
-    status_map = {
-        "Offen": "open",
-        "Geschlossen": "closed"
-    }
-
-    # Oldenburg does not send the totals on there website,
-    # so wie take some Values from a 2011st PDF:
-    # http://www.oldenburg.de/fileadmin/oldenburg/Benutzer/PDF/41/414/Parkplatz_Uebersicht2.pdf
-    # and http://gis4oldenburg.oldenburg.de/?es=C12S77
-    # what possible can go wrong ¯\_(ツ)_/¯
-    lots_map = {
-        "Waffenplatz": [650, "Waffenplatz 3"],
-        "City": [440, "Staulinie 10"],
-        "Galeria Kaufhof": [326, "Ritterstraße"],
-        "Pferdemarkt": [401, "Pferdemarkt 13"],
-        # CCO 1 & 2 are together only known together with 420,
-        # but they seem to be somewhat like this
-        "CCO Parkdeck 1": [190, "Heiligengeiststraße 4"],
-        "CCO Parkdeck 2": [230, "Heiligengeiststraße 4"],
-        "Hbf/ZOB": [358, "Karlstraße"],
-        "Theaterwall": [125, "Theaterwall 4"],
-        "Theatergarage": [107, "Roonstraße"],
-        "Heiligengeist-Höfe": [275, "Georgstraße"],
-        "Schlosshöfe": [430, "Mühlenstraße"],
-    }
-
+    lots = Lots()
+    updated_at = parse_date(last_updated, "%d.%m.%Y %H:%M Uhr")
     for tr in soup.find_all("tr"):
         if tr.td is None:
             continue
         td = tr.findAll('td')
-        lot_name = td[0].b.string
-        lot_free = int(td[1].b.text)
+        name = td[0].b.string
+        lot = geodata.lot(name)
+        lot.free = int(td[1].b.text)
+        lot.updated_at = updated_at
 
-        # take total and address from the map above; a missing lot
-        # should throw an exception -> error@parkenDD.de
-        lot_total = lots_map[lot_name][0]
-        lot_address = lots_map[lot_name][1]
-
-        # lot_type = tr.find("td").text
+        # take total and address from the map above; a missing lot
+        # should throw an exception -> error@parkenDD.de
+        lot.total = lots_map[name][0]
+        lot.address = lots_map[name][1]
 
         # please be careful about the state only being allowed to contain
-        # either open, closed or nodata should the page list other states,
+        # either open, closed or nodata; should the page list other states,
         # please map these into the three listed possibilities
-        state = status_map.get(td[3].text, "nodata")
+        lot.state = status_map.get(td[3].text, "nodata")
 
-        lot = geodata.lot(lot_name)
-        data["lots"].append({
-            "id": lot.id,
-            "name": lot.name,
-            "free": lot_free,
-            "state": state,
-            "total": lot_total,
-            "address": lot_address,
-            "coords": lot.coords,
-            # "type": lot_type,
-            "forecast": False
-        })
-    return data
+        lots.append(lot)
+    return lots
diff --git a/park_api/cities/Sample_City.py b/park_api/cities/Sample_City.py
index 89d2b4d..907a26f 100644
--- a/park_api/cities/Sample_City.py
+++ b/park_api/cities/Sample_City.py
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
-from park_api.util import convert_date
-from park_api.geodata import GeoData
+from park_api.util import parse_date
+from park_api.models import GeoData, Lots
 
 # This loads the geodata for this city if .geojson exists in the same directory as this file.
 # No need to remove this if there's no geodata (yet), everything will still work.
@@ -9,40 +9,27 @@
 # This function is called by the scraper and given the data of the page specified as source in geojson above.
-# It's supposed to return a dictionary containing everything the current spec expects. Tests will fail if it doesn't ;)
+# It's supposed to return a Lots collection containing everything the current spec expects. Tests will fail if it doesn't ;)
 def parse_html(html):
-    # BeautifulSoup is a great and easy way to parse the html and find the bits and pieces we're looking for.
     soup = BeautifulSoup(html, "html.parser")
 
     # last_updated is the date when the data on the page was last updated, it should be listed on most pages
     last_updated = soup.select("p#last_updated")[0].text
 
-    data = {
-        # convert_date is a utility function you can use to turn this date into the correct string format
-        "last_updated": convert_date(last_updated, "%d.%m.%Y %H:%M Uhr"),
-        # URL for the page where the scraper can gather the data
-        "lots": []
-    }
+    # parse_date is a utility function you can use to turn this date into a datetime
+    lots = Lots()
+    updated_at = parse_date(last_updated, "%d.%m.%Y %H:%M Uhr")
 
     for tr in soup.find_all("tr"):
-        lot_name = tr.find("td", {"class": "lot_name"}).text
-        lot_free = tr.find("td", {"class": "lot_free"}).text
-        lot_total = tr.find("td", {"class": "lot_total"}).text
+        name = tr.find("td", {"class": "lot_name"}).text
+        lot = geodata.lot(name)
+        lot.updated_at = updated_at
+        lot.free = int(tr.find("td", {"class": "lot_free"}).text)
+        lot.total = int(tr.find("td", {"class": "lot_total"}).text)
 
         # please be careful about the state only being allowed to contain either open, closed or nodata
         # should the page list other states, please map these into the three listed possibilities
-        state = tr.find("td", {"class": "lot_state"}).text
-
-        lot = geodata.lot(lot_name)
-        data["lots"].append({
-            "name": lot.name,
-            "free": lot_free,
-            "total": lot_total,
-            "address": lot.address,
-            "coords": lot.coords,
-            "state": state,
-            "lot_type": lot.type,
-            "id": lot.id,
-            "forecast": False,
-        })
-
-    return data
+        lot.state = tr.find("td", {"class": "lot_state"}).text
+
+        lots.append(lot)
+
+    return lots
diff --git a/park_api/cities/Zuerich.geojson b/park_api/cities/Zuerich.geojson
index e8e4a86..5a2932e 100644
--- a/park_api/cities/Zuerich.geojson
+++ b/park_api/cities/Zuerich.geojson
@@ -14,7 +14,7 @@
       "type": "city",
       "url": "https://www.stadt-zuerich.ch/portal/de/index/ogd/daten/parkleitsystem.html",
       "source": "http://www.pls-zh.ch/plsFeed/rss",
-      "active_support": true
+      "active_support": true
     }
   }, {
     "type": "Feature",
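# For orientation between the geojson above and the models introduced below:
# GeoData reads one feature per lot plus one city feature shaped roughly like
# this. A hand-written sketch; the URLs are taken from the Zuerich file above,
# the coordinates are made up:
zuerich_city_feature = {
    "type": "Feature",
    "geometry": {"type": "Point", "coordinates": [8.54, 47.37]},
    "properties": {
        "name": "Zuerich",
        "type": "city",
        "url": "https://www.stadt-zuerich.ch/portal/de/index/ogd/daten/parkleitsystem.html",
        "source": "http://www.pls-zh.ch/plsFeed/rss",
        "active_support": True,
    },
}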
diff --git a/park_api/cities/Zuerich.py b/park_api/cities/Zuerich.py
index fee6eac..f6e8caf 100644
--- a/park_api/cities/Zuerich.py
+++ b/park_api/cities/Zuerich.py
@@ -1,6 +1,6 @@
 import feedparser
-from park_api.geodata import GeoData
-
+from park_api.util import parse_date
+from park_api.models import GeoData, Lots
 
 # If anyone from the people behind OpenDataZürich ever reads this:
 # you are wonderful <3
@@ -10,48 +10,41 @@
 def parse_html(xml_data):
     feed = feedparser.parse(xml_data)
-    last_updated = feed["entries"][0]["updated"]
-    data = {
-        "lots": [],
-        # remove trailing timezone for consensistency
-        "last_updated": last_updated.replace("Z", "")
-    }
+    lots = Lots()
+    updated_at = parse_date(feed["entries"][0]["updated"],
+                            "%Y-%m-%dT%H:%M:%SZ")
 
     for entry in feed["entries"]:
-        summary = parse_summary(entry["summary"])
-        title_elements = parse_title(entry["title"])
-
-        lot_identifier = (title_elements[2] + " " + title_elements[0]).strip()
-        lot = geodata.lot(lot_identifier)
+        state, free = parse_summary(entry["summary"])
+        name, address, type_ = parse_title(entry["title"])
 
-        data["lots"].append({
-            "name": title_elements[0],
-            "address": title_elements[1],
-            "id": lot.id,
-            "state": summary[0],
-            "free": summary[1],
-            "total": lot.total,
-            "coords": lot.coords,
-            "forecast": False,
-            "type": title_elements[2]
-        })
+        identifier = ("%s %s" % (type_, name)).strip()
+        lot = geodata.lot(identifier)
+        lot.name = name
+        lot.address = address
+        lot.state = state
+        lot.free = free
+        lot.lot_type = type_
+        lot.updated_at = updated_at
+        lots.append(lot)
 
-    return data
+    return lots
 
 
 def parse_summary(summary):
     """Parse a string from the format 'open / 41' into both its params"""
     summary = summary.split("/")
 
-    summary[0] = summary[0].strip()
+    state = summary[0].strip()
     if "?" in summary[0]:
-        summary[0] = "nodata"
-
-    try:
-        summary[1] = int(summary[1])
-    except ValueError:
-        summary[1] = 0
-    return summary
+        state = "nodata"
+
+    s = summary[1].strip()
+    if "?" in s:
+        free = 0
+    else:
+        free = int(s)
+    return state, free
 
 
 def parse_title(title):
diff --git a/park_api/env.py b/park_api/env.py
index caf54e1..04f7884 100644
--- a/park_api/env.py
+++ b/park_api/env.py
@@ -1,9 +1,10 @@
 import os
 
-from park_api import structs, security
+from park_api import security
 import importlib
 import configparser
 import sys
+from collections import namedtuple
 
 API_VERSION = '1.0'
 SERVER_VERSION = '0.0.0'
@@ -24,6 +25,8 @@
     "database_uri": "postgres:///park_api",
 }
 
+ServerConf = namedtuple('ServerConf', ['port', 'host', 'debug'])
+
 
 def is_production():
     return ENV == "production"
@@ -82,9 +85,9 @@ def load_config():
         exit(1)
 
     global SERVER_CONF, DATABASE_URI, SUPPORTED_CITIES, LIVE_SCRAPE
-    SERVER_CONF = structs.ServerConf(host=raw_config.get('host'),
-                                     port=raw_config.getint("port"),
-                                     debug=raw_config.getboolean("debug"))
+    SERVER_CONF = ServerConf(host=raw_config.get('host'),
+                             port=raw_config.getint("port"),
+                             debug=raw_config.getboolean("debug"))
 
     LIVE_SCRAPE = raw_config.getboolean("live_scrape")
     DATABASE_URI = raw_config.get("database_uri")
props.get("active_support", None) - return City(name, - self.city_name, - lng, - lat, - url, - source, - active_support) - - def _lot_from_props(self, name, lng, lat, props): - address = props.get("address", None) - total = props.get("total", 0) - _type = props.get("type", None) - _id = generate_id(self.city_name + name) - return Lot(name, _id, _type, lng, lat, address, total) - - def _coords(self, feature): - geometry = feature.get("geometry", None) - if geometry is None: - return None, None - else: - lng, lat = geometry["coordinates"] - return lng, lat - - def lot(self, name): - lot = self.lots.get(name, None) - if lot is None: - _id = generate_id(self.city_name + name) - return Lot(name, _id, None, None, None, None, 0) - return lot diff --git a/park_api/models/__init__.py b/park_api/models/__init__.py new file mode 100644 index 0000000..7bfb7a9 --- /dev/null +++ b/park_api/models/__init__.py @@ -0,0 +1,5 @@ +from . import city, geodata, lots, lot +GeoData = geodata.GeoData +Lots = lots.Lots +Lot = lot.Lot +City = city.City diff --git a/park_api/models/city.py b/park_api/models/city.py new file mode 100644 index 0000000..ee7d10b --- /dev/null +++ b/park_api/models/city.py @@ -0,0 +1,29 @@ +class City: + def __init__(self, name, + id=None, + lng=None, + lat=None, + url=None, + source=None, + active_support=False): + self.name = name + self.id = id + self.lat = lat + self.lng = lng + self.url = url + self.source = source + self.active_support = active_support + + def _coords(self): + if self.lng is not None and self.lat is not None: + return {'lng': self.lng, 'lat': self.lat} + return None + + def as_json(self): + return { + "name": self.name, + "coords": self._coords, + "source": self.source, + "url": self.url, + "active_support": self.active_support + } diff --git a/park_api/models/geodata.py b/park_api/models/geodata.py new file mode 100644 index 0000000..b5320f4 --- /dev/null +++ b/park_api/models/geodata.py @@ -0,0 +1,76 @@ +import os +import json +from park_api import env +from park_api.util import remove_special_chars +from .lot import Lot +from .city import City + + +def generate_id(s): + return remove_special_chars(s.lower()) + + +class GeoData: + def __init__(self, city): + json_file = city[:-3] + ".geojson" + self.city_name = os.path.basename(city[:-3]) + json_path = os.path.join(env.APP_ROOT, "park_api", "cities", json_file) + try: + with open(json_path) as f: + self._process_json(json.load(f)) + except FileNotFoundError: + self.lots = {} + + def _process_json(self, json): + self.lots = {} + self.city = None + for f in json["features"]: + self._process_feature(f) + if self.city is None: + self.city = City(self.city_name, self.city_name) + + def _process_feature(self, feature): + props = feature["properties"] + _type = props.get("type", None) + name = props["name"] + lng, lat = self._coords(feature) + if _type == "city": + self.city = City(name=name, + id=self.city_name, + lng=lng, lat=lat, + url=props.get("url", None), + source=props.get("source", None), + active_support=props.get("active_support", None)) + else: + lot = Lot(name=name, + id=generate_id(self.city_name + name), + lng=lng, + lat=lat, + address=props.get("address", None), + total=props.get("total", 0), + lot_type=props.get("type", None), + free=None, + state=None) + self.lots[name] = lot + + def _coords(self, feature): + geometry = feature.get("geometry", None) + if geometry is None: + return None, None + else: + lng, lat = geometry["coordinates"] + return lng, lat + + def lot(self, name): + lot = 
diff --git a/park_api/models/geodata.py b/park_api/models/geodata.py
new file mode 100644
index 0000000..b5320f4
--- /dev/null
+++ b/park_api/models/geodata.py
@@ -0,0 +1,76 @@
+import os
+import json
+from park_api import env
+from park_api.util import remove_special_chars
+from .lot import Lot
+from .city import City
+
+
+def generate_id(s):
+    return remove_special_chars(s.lower())
+
+
+class GeoData:
+    def __init__(self, city):
+        json_file = city[:-3] + ".geojson"
+        self.city_name = os.path.basename(city[:-3])
+        json_path = os.path.join(env.APP_ROOT, "park_api", "cities", json_file)
+        try:
+            with open(json_path) as f:
+                self._process_json(json.load(f))
+        except FileNotFoundError:
+            self.lots = {}
+
+    def _process_json(self, json):
+        self.lots = {}
+        self.city = None
+        for f in json["features"]:
+            self._process_feature(f)
+        if self.city is None:
+            self.city = City(self.city_name, self.city_name)
+
+    def _process_feature(self, feature):
+        props = feature["properties"]
+        _type = props.get("type", None)
+        name = props["name"]
+        lng, lat = self._coords(feature)
+        if _type == "city":
+            self.city = City(name=name,
+                             id=self.city_name,
+                             lng=lng, lat=lat,
+                             url=props.get("url", None),
+                             source=props.get("source", None),
+                             active_support=props.get("active_support", None))
+        else:
+            lot = Lot(name=name,
+                      id=generate_id(self.city_name + name),
+                      lng=lng,
+                      lat=lat,
+                      address=props.get("address", None),
+                      total=props.get("total", 0),
+                      lot_type=props.get("type", None))
+            self.lots[name] = lot
+
+    def _coords(self, feature):
+        geometry = feature.get("geometry", None)
+        if geometry is None:
+            return None, None
+        else:
+            lng, lat = geometry["coordinates"]
+            return lng, lat
+
+    def lot(self, name):
+        lot = self.lots.get(name, None)
+        if lot is None:
+            return Lot(name=name,
+                       id=generate_id(self.city_name + name),
+                       lot_type=None,
+                       lng=None,
+                       lat=None,
+                       address=None,
+                       total=0,
+                       free=0)
+        return lot
diff --git a/park_api/models/lot.py b/park_api/models/lot.py
new file mode 100644
index 0000000..5782aa2
--- /dev/null
+++ b/park_api/models/lot.py
@@ -0,0 +1,49 @@
+import os
+from park_api import env
+
+
+class Lot:
+    def __init__(self,
+                 name,
+                 id=None,
+                 lot_type=None,
+                 lng=None,
+                 lat=None,
+                 address=None,
+                 total=None,
+                 downloaded_at=None,
+                 free=0,
+                 forecast=False,
+                 state="nodata"):
+        self.name = name
+        self.id = id
+        self.lot_type = lot_type
+        self.lng = lng
+        self.lat = lat
+        self.address = address
+        self.total = total
+        self.free = free
+        self.region = None
+        self.updated_at = None
+        self.forecast = forecast
+        self.state = state
+        self.downloaded_at = downloaded_at
+        self.forecast_path = os.path.join(env.APP_ROOT,
+                                          "forecast_data",
+                                          id + ".csv")
+        self.has_forecast = os.path.isfile(self.forecast_path)
+
+    def _coords(self):
+        if self.lng is not None and self.lat is not None:
+            return {'lng': self.lng, 'lat': self.lat}
+        return None
+
+    def as_json(self):
+        return {
+            "name": self.name,
+            "total": self.total,
+            "free": self.free,
+            "coords": self._coords(),
+            "state": self.state,
+            "id": self.id,
+            "lot_type": self.lot_type,
+            "address": self.address,
+            "forecast": self.forecast,
+            "region": self.region
+        }
diff --git a/park_api/models/lots.py b/park_api/models/lots.py
new file mode 100644
index 0000000..f0f9a08
--- /dev/null
+++ b/park_api/models/lots.py
@@ -0,0 +1,73 @@
+from datetime import datetime
+from park_api import util
+
+
+class Lots:
+    def __init__(self):
+        self._lots = {}
+
+    def append(self, lot):
+        self._lots[lot.id] = lot
+
+    def __setitem__(self, index, value):
+        self._lots[index] = value
+
+    def __getitem__(self, index):
+        return self._lots[index]
+
+    def __delitem__(self, index):
+        del self._lots[index]
+
+    def __len__(self):
+        return len(self._lots)
+
+    def __iter__(self):
+        for lot in self._lots.values():
+            yield lot
+
+    def save(self, cursor):
+        """Persist lots to database."""
+        inserts = []
+        downloaded_at = util.utc_now()
+        for lot in self._lots.values():
+            sql = "(%s, %s, %s, %s)"
+            values = (lot.id, lot.free, lot.updated_at, downloaded_at)
+            inserts.append(cursor.mogrify(sql, values).decode("utf-8"))
+        insert_sql = """
+        INSERT INTO free_lots(lot_id, free, updated_at, downloaded_at) VALUES
+        """
+        cursor.execute(insert_sql + ",".join(inserts))
+
+    def load(self, cursor):
+        """Load the latest scraped lot information from the database."""
+        sql = """
+        SELECT DISTINCT ON (fl.lot_id)
+            fl.lot_id,
+            fl.free,
+            COALESCE(l.total, l.seen_total),
+            COALESCE(fl.updated_at, fl.downloaded_at)
+        FROM free_lots AS fl
+        LEFT OUTER JOIN lots AS l ON fl.lot_id = l.id
+        WHERE fl.lot_id IN %s
+        ORDER BY fl.lot_id, fl.downloaded_at DESC
+        """
+        cursor.execute(sql, (tuple(self._lots.keys()),))
+        for row in cursor:
+            id, free, total, updated_at = row
+            lot = self._lots[id]
+            lot.free = free
+            lot.total = total
+            lot.updated_at = updated_at
+
+    def as_json(self):
+        lots = []
+        last_updated = datetime.fromtimestamp(0)
+        for lot in self._lots.values():
+            lots.append(lot.as_json())
+            if lot.updated_at > last_updated:
+                last_updated = lot.updated_at
+        return {
+            "last_updated": last_updated.isoformat(),
+            "lots": lots
+        }
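# Sketch of how the Lots collection above behaves, with made-up values
# (requires a configured park_api environment to import):
from datetime import datetime
from park_api.models import Lot, Lots

lots = Lots()
lot = Lot("Altmarkt", id="dresdenaltmarkt", total=400)
lot.free = 123
lot.updated_at = datetime(2016, 1, 1, 12, 0, 0)
lots.append(lot)  # keyed by lot.id internally

assert len(lots) == 1
assert lots["dresdenaltmarkt"].free == 123
# as_json() aggregates the newest updated_at over all lots:
assert lots.as_json()["last_updated"] == "2016-01-01T12:00:00"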
diff --git a/park_api/scraper.py b/park_api/scraper.py
index d618103..30b6c22 100644
--- a/park_api/scraper.py
+++ b/park_api/scraper.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-import json
 import traceback
 
 import requests
@@ -27,55 +26,11 @@ def get_html(city):
     return r.text
 
 
-def parse_html(city, html):
-    """Use a city module to parse its html"""
-    return city.parse_html(html)
-
-
-def add_metadata(data):
-    """Adds metadata to a scraped output dict"""
-    data["last_downloaded"] = util.utc_now()
-    return data
-
-
-def save_data_to_db(cursor, parking_data, city):
-    """Save the data given into the Postgres DB."""
-    timestamp_updated = parking_data["last_updated"]
-    timestamp_downloaded = util.utc_now()
-    json_data = json.dumps(parking_data)
-    sql = """
-    INSERT INTO parkapi(
-        timestamp_updated,
-        timestamp_downloaded,
-        city,
-        data)
-    VALUES (%(updated)s, %(downloaded)s, %(city)s, %(data)s)
-    RETURNING 'id';
-    """
-    cursor.execute(sql, {
-        "updated": timestamp_updated,
-        "downloaded": timestamp_downloaded,
-        "city": city,
-        "data": json_data
-    })
-
-    print("Saved " + city + " to DB.")
-
-
-def _live(module):
-    """
-    Scrape data for a given city pulling all data now
-    This function is only used in development mode
-    for debugging the server without a database present.
-    """
-    return add_metadata(module.parse_html(get_html(module.geodata.city)))
-
-
-def scrape_city(module):
-    city = module.geodata.city
-    data = add_metadata(module.parse_html(get_html(city)))
-    with db.cursor(commit=True) as cursor:
-        save_data_to_db(cursor, data, city.id)
+def scrape_lots(module):
+    html = get_html(module.geodata.city)
+    lots = module.parse_html(html)
+    lots.downloaded_at = util.utc_now()
+    return lots
 
 
 def main():
@@ -83,12 +38,14 @@ def main():
     Iterate over all cities in ./cities,
     scrape and save their data to the database
     """
-    # the catch-all enterprise loop
     db.setup()
-    for module in env.supported_cities().values():
-        try:
-            scrape_city(module)
-        except Exception as e:
-            print("Failed to scrape '%s': %s" %
-                  (module.geodata.city.name, e))
-            print(traceback.format_exc())
+    with db.cursor(commit=True) as cursor:
+        # the catch-all enterprise loop
+        for module in env.supported_cities().values():
+            try:
+                lots = scrape_lots(module)
+                lots.save(cursor)
+            except Exception as e:
+                print("Failed to scrape '%s': %s" %
+                      (module.geodata.city.name, e))
+                print(traceback.format_exc())
diff --git a/park_api/structs.py b/park_api/structs.py
deleted file mode 100644
index 94a799e..0000000
--- a/park_api/structs.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from collections import namedtuple
-
-ServerConf = namedtuple('ServerConf', ['port', 'host', 'debug'])
-
-Coords = namedtuple('Coords', ['lng', 'lat'])
diff --git a/park_api/util.py b/park_api/util.py
index 565e1e6..5565dec 100644
--- a/park_api/util.py
+++ b/park_api/util.py
@@ -1,47 +1,6 @@
 import pytz
 from datetime import datetime
 
-from park_api import db
-
-LOT_COUNTS_PER_CITY = {}
-
-
-def get_most_lots_from_known_data(city, lot_name):
-    """
-    Get the total value from the highest known value in the last saved JSON.
-    This is useful for cities that don't publish
-    total number of spaces for a parking lot.
-
-    Caveats:
-     - Returns 0 if not found.
-     - If a lot name exists twice only the last value is returned.
-
-    :param city:
-    :param lot_name:
-    :return:
-    """
-    global LOT_COUNTS_PER_CITY
-    # FIXME ugly work around, this should be really fixed in a different way
-    lot_counts = LOT_COUNTS_PER_CITY.get(city, {})
-    if lot_counts == {}:
-        with db.cursor() as cursor:
-            sql = """
-            SELECT data FROM parkapi
-            WHERE city=%s
-            ORDER BY timestamp_downloaded DESC LIMIT 600;
-            """
-            cursor.execute(sql, (city,))
-            all_data = cursor.fetchall()
-            for json_data in all_data:
-                lots = json_data[0]["lots"]
-                for lot in lots:
-                    highest_count = lot_counts.get(lot_name, 0)
-                    count = int(lot["free"])
-                    if count > highest_count:
-                        lot_counts[lot_name] = count
-        LOT_COUNTS_PER_CITY[city] = lot_counts
-    return lot_counts.get(lot_name, 0)
-
 
 def utc_now():
     """
@@ -80,7 +39,7 @@ def remove_special_chars(string):
     return string
 
 
-def convert_date(date_string, date_format, timezone="Europe/Berlin"):
+def parse_date(date_string, date_format, timezone="Europe/Berlin"):
     """
-    Convert a date into a ISO formatted UTC date string.
+    Parse a local date string into a naive UTC datetime.
     Timezone defaults to Europe/Berlin.
@@ -95,4 +54,4 @@ def convert_date(date_string, date_format, timezone="Europe/Berlin"):
     last_updated = local_timezone.localize(last_updated, is_dst=None)
     last_updated = last_updated.astimezone(pytz.utc).replace(tzinfo=None)
 
-    return last_updated.replace(microsecond=0).isoformat()
+    return last_updated.replace(microsecond=0)
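# parse_date (the renamed convert_date) now returns a naive UTC datetime
# instead of an ISO string; serialization happens in Lots.as_json. For
# example, 13:37 Berlin wall-clock time in winter (CET, UTC+1) comes back
# as 12:37 UTC:
from datetime import datetime
from park_api.util import parse_date

dt = parse_date("24.12.2015 13:37 Uhr", "%d.%m.%Y %H:%M Uhr")
assert dt == datetime(2015, 12, 24, 12, 37)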
diff --git a/schema/db/0_init.py b/schema/db/0_init.py
deleted file mode 100644
index 3a4df22..0000000
--- a/schema/db/0_init.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from yoyo import step
-
-step("""
-CREATE TABLE "public"."parkapi" (
-    "id" SERIAL,
-    "timestamp_updated" TIMESTAMP NOT NULL,
-    "timestamp_downloaded" TIMESTAMP NOT NULL,
-    "city" TEXT NOT NULL,"data" JSON NOT NULL,
-PRIMARY KEY ("id"))
-TABLESPACE "pg_default";
-""",
-     "create index latest_scrape_index on parkapi (city, timestamp_downloaded DESC);"
-)
diff --git a/schema/db/1_rational.py b/schema/db/1_rational.py
new file mode 100644
index 0000000..172d1b5
--- /dev/null
+++ b/schema/db/1_rational.py
@@ -0,0 +1,82 @@
+from yoyo import step
+
+step("""
+CREATE TABLE lots (
+    id text PRIMARY KEY,
+    total int,
+    seen_total int NOT NULL DEFAULT 0)
+    """,
+     """DROP TABLE lots;""")
+
+step("""
+CREATE TABLE free_lots (
+    lot_id text REFERENCES lots (id),
+    free int NOT NULL,
+    updated_at timestamp,
+    downloaded_at timestamp NOT NULL,
+    PRIMARY KEY (lot_id, downloaded_at))
+    """,
+     """DROP TABLE free_lots;""")
+
+step("""
+CREATE OR REPLACE FUNCTION partition_free_lots()
+RETURNS TRIGGER AS $PROC$
+DECLARE
+    year text;
+    tablename text;
+    start_date text;
+    end_date text;
+    create_table_sql text;
+BEGIN
+    year := to_char(NEW.downloaded_at, 'YYYY');
+    tablename := TG_TABLE_SCHEMA || '.' || TG_TABLE_NAME || '_' || year;
+
+    EXECUTE 'INSERT INTO ' || tablename || ' SELECT ($1).*' USING NEW;
+    RETURN NULL;
+EXCEPTION
+    WHEN undefined_table THEN
+        start_date := to_char(NEW.downloaded_at, 'YYYY-01-01');
+        end_date := to_char(NEW.downloaded_at + interval '1 year', 'YYYY-01-01');
+        create_table_sql := 'CREATE TABLE IF NOT EXISTS ' || tablename ||
+            ' (CHECK (downloaded_at >= timestamp ' || quote_literal(start_date) ||
+            ' AND downloaded_at < timestamp ' || quote_literal(end_date) ||
+            ')) INHERITS (' || TG_TABLE_SCHEMA || '.' || TG_TABLE_NAME || ')';
+
+        RAISE NOTICE 'CREATE NEW TABLE: %', create_table_sql;
+        EXECUTE create_table_sql;
+
+        EXECUTE 'INSERT INTO ' || tablename || ' SELECT ($1).*' USING NEW;
+        RETURN NULL;
+
+END;
+$PROC$ LANGUAGE plpgsql
+    """,
+     """DROP FUNCTION partition_free_lots;""")
+
+step("""
+CREATE TRIGGER partition_free_lots
+    BEFORE INSERT ON free_lots
+    FOR EACH ROW EXECUTE PROCEDURE partition_free_lots()
+    """,
+     """DROP TRIGGER partition_free_lots on free_lots""")
+
+step("""
+CREATE OR REPLACE FUNCTION update_total_lots()
+RETURNS TRIGGER AS $PROC$
+BEGIN
+EXECUTE 'INSERT INTO lots (id) SELECT ($1).lot_id WHERE NOT EXISTS (SELECT id FROM lots WHERE id = ($1).lot_id)' USING NEW;
+EXECUTE 'UPDATE lots SET seen_total = ($1).free WHERE id = ($1).lot_id AND seen_total < ($1).free' USING NEW;
+RETURN NEW;
+END;
+$PROC$ LANGUAGE plpgsql
+    """,
+     """DROP FUNCTION update_total_lots;""")
+
+step("""
+CREATE TRIGGER update_total_lots
+    BEFORE INSERT ON free_lots
+    FOR EACH ROW EXECUTE PROCEDURE update_total_lots()
+    """,
+     """DROP TRIGGER update_total_lots on free_lots""")
diff --git a/tests/test_cities.py b/tests/test_cities.py
index 9eb1ed6..eb68dc0 100644
--- a/tests/test_cities.py
+++ b/tests/test_cities.py
@@ -2,89 +2,46 @@
 import unittest
 import helpers
 import importlib
+import glob
 from datetime import datetime
-from park_api import db
+from park_api import db, env
 
 
-def scrape_city(city, extension=".html"):
-    path = os.path.join(helpers.TEST_ROOT,
-                        "fixtures",
-                        city.lower() + extension)
-    with open(path, 'rb') as f:
-        city = importlib.import_module("park_api.cities." + city)
-        return city.parse_html(f.read().decode('utf-8', 'replace'))
+def scrape_city(city):
+    pattern = os.path.join(helpers.TEST_ROOT,
+                           "fixtures",
+                           city.lower() + ".*")
+    for path in glob.glob(pattern):
+        with open(path, 'rb') as f:
+            module = importlib.import_module("park_api.cities." + city)
+            return module.parse_html(f.read().decode('utf-8', 'replace'))
+    raise Exception("no test input file found for %s: %s" % (city, pattern))
 
 
 class CityTestCase(unittest.TestCase):
     def setUp(self):
         db.setup()
 
-    def sanity_check(self, city_name, city):
-        self.assertIn("lots", city)
-        self.assertIn("last_updated", city)
-        last_updated = datetime.strptime(city["last_updated"],
-                                         "%Y-%m-%dT%H:%M:%S")
-        self.assertIsInstance(last_updated, datetime)
+    def sanity_check(self, city_name, lots):
+        self.assertGreater(len(lots), 1)
 
-        self.assertTrue(len(city["lots"]) > 0)
+        for lot in lots:
+            self.assertIsInstance(lot.updated_at, datetime)
+            self.assertIsInstance(lot.name, str)
 
-        for lot in city["lots"]:
-            self.assertIn("name", lot)
-
-            self.assertIn("coords", lot)
-
-            self.assertIn("state", lot)
-            self.assertIn(lot["state"],
+            self.assertIn(lot.state,
                           ["open", "closed", "nodata", "unknown"])
 
-            self.assertIn("id", lot)
-
-            self.assertIn("forecast", lot)
-            self.assertIs(type(lot["forecast"]), bool)
-
-            self.assertIn("free", lot)
-            self.assertIn("total", lot)
-            total, free = lot["total"], lot["free"]
-            if total < free:
+            self.assertIsInstance(lot.free, int)
+            self.assertIsInstance(lot.total, int)
+            if lot.total < lot.free:
                 msg = "\n[warn] total lots should be more than free lots:"\
                       " %d >= %d: %s => %s"
-                print(msg % (total, free, city_name, lot))
-            if "coords" in lot and lot["coords"] is not None:
-                self.assertIn("lat", lot["coords"])
-                self.assertIn("lng", lot["coords"])
-
-    def test_dresden(self):
-        city_name = "Dresden"
-        self.sanity_check(city_name, scrape_city(city_name))
-
-    def test_ingolstadt(self):
-        city_name = "Ingolstadt"
-        self.sanity_check(city_name, scrape_city(city_name))
-
-    def test_konstanz(self):
-        city_name = "Konstanz"
-        self.sanity_check(city_name, scrape_city(city_name))
-
-    def test_luebeck(self):
-        city_name = "Luebeck"
-        self.sanity_check(city_name, scrape_city(city_name))
-
-    def test_zuerich(self):
-        city_name = "Zuerich"
-        self.sanity_check(city_name, scrape_city(city_name, ".xml"))
-
-    def test_muenster(self):
-        city_name = "Muenster"
-        self.sanity_check(city_name, scrape_city(city_name))
-
-    def test_bonn(self):
-        city_name = "Bonn"
-        self.sanity_check(city_name, scrape_city(city_name))
-
-    def test_oldenburg(self):
-        city_name = "Oldenburg"
-        self.sanity_check(city_name, scrape_city(city_name))
-
-    def test_sample(self):
-        city_name = "Sample_City"
-        self.sanity_check(city_name, scrape_city(city_name))
+                print(msg % (lot.total, lot.free, city_name, lot))
+
+
+for city in env.supported_cities().keys():
+    def gen_test(city):
+        def test(self):
+            self.sanity_check(city, scrape_city(city))
+        return test
+    setattr(CityTestCase, "test_%s" % city.lower(), gen_test(city))
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
index c1a6445..d5c438d 100644
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py
@@ -1,13 +1,14 @@
 import os
 import unittest
 import helpers
-import requests
 import requests_mock
 
 from park_api import env, scraper, db
 
+
 class ScraperTestCase(unittest.TestCase):
     def setUp(self):
         db.setup()
+
     @requests_mock.Mocker()
     def test_insert(self, mock):
         path = os.path.join(helpers.TEST_ROOT, "fixtures", "dresden.html")
@@ -16,4 +17,4 @@ def test_insert(self, mock):
         with open(path) as f:
             src = module.geodata.city.source
             mock.get(src, text=f.read())
-        scraper.scrape_city(module)
+        scraper.scrape_lots(module)