From b4d7daf6270184f01860506e12cd857615e1677c Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sun, 10 Mar 2024 05:02:24 +0100 Subject: [PATCH] Add Qleverfile for osm-planet + more goodies 1. A random alphanumerical string is now appended automatically to the ACCESS_TOKEN when the Qleverfile is copied to the cwd 2. Values can now be of the form $$(...), in which case the ... part is evaluated and the value becomes the result of that evaluation --- src/qlever/Qleverfiles/Qleverfile.osm-planet | 38 ++++++++++++++++++++ src/qlever/commands/setup-config.py | 6 +++- src/qlever/commands/start.py | 10 +++--- src/qlever/qleverfile.py | 38 +++++++++++++++++--- src/qlever/util.py | 17 +++++++-- 5 files changed, 96 insertions(+), 13 deletions(-) create mode 100644 src/qlever/Qleverfiles/Qleverfile.osm-planet diff --git a/src/qlever/Qleverfiles/Qleverfile.osm-planet b/src/qlever/Qleverfiles/Qleverfile.osm-planet new file mode 100644 index 00000000..a0361963 --- /dev/null +++ b/src/qlever/Qleverfiles/Qleverfile.osm-planet @@ -0,0 +1,38 @@ +# Qleverfile for OSM Planet, use with the qlever script (pip install qlever) +# +# qlever get-data # takes ~50 mins to download .ttl.bz2 file of ~ 300 GB +# qlever index # takes ~12 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X) +# qlever start # takes a few seconds +# +# For the OSM data of a single country, do `qlever setup-config osm-country` +# and edit the Qleverfile to specify the country, + +[data] +NAME = osm-planet +DATA_URL = https://osm2rdf.cs.uni-freiburg.de/ttl/planet.osm.ttl.bz2 +GET_DATA_CMD = curl --location --fail --continue-at - --remote-time --output ${NAME}.ttl.bz2 ${DATA_URL} +VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y") +DESCRIPTION = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects) + +[index] +INPUT_FILES = ${data:NAME}.ttl.bz2 +CAT_INPUT_FILES = lbzcat -f -n 2 ${INPUT_FILES} +WITH_TEXT = false +STXXL_MEMORY = 20G +SETTINGS_JSON 
= { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 } + +[server] +PORT = 7007 +ACCESS_TOKEN = ${data:NAME} +MEMORY_FOR_QUERIES = 90G +CACHE_MAX_SIZE = 40G +CACHE_MAX_SIZE_SINGLE_ENTRY = 30G +TIMEOUT = 300s + +[runtime] +SYSTEM = docker +IMAGE = docker.io/adfreiburg/qlever:latest + +[ui] +UI_PORT = 7000 +UI_CONFIG = osm-planet diff --git a/src/qlever/commands/setup-config.py b/src/qlever/commands/setup-config.py index 0ced66ee..12afe984 100644 --- a/src/qlever/commands/setup-config.py +++ b/src/qlever/commands/setup-config.py @@ -3,6 +3,7 @@ from qlever.command import QleverCommand from qlever.log import log +from qlever.util import get_random_string class SetupConfigCommand(QleverCommand): @@ -35,7 +36,10 @@ def execute(self, args) -> bool: # Construct the command line and show it. qleverfile_path = (self.qleverfiles_path / f"Qleverfile.{args.config_name}") - setup_config_cmd = f"cp -a {qleverfile_path} Qleverfile" + setup_config_cmd = ( + f"cat {qleverfile_path}" + f" | sed -E 's/(^ACCESS_TOKEN.*)/\\1_{get_random_string(12)}/'" + f"> Qleverfile") self.show(setup_config_cmd, only_show=args.show) if args.show: return False diff --git a/src/qlever/commands/start.py b/src/qlever/commands/start.py index e52e063c..c2375299 100644 --- a/src/qlever/commands/start.py +++ b/src/qlever/commands/start.py @@ -27,7 +27,7 @@ def should_have_qleverfile(self) -> bool: return True def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: - return {"data": ["name", "index_description", "text_description"], + return {"data": ["name", "description", "text_description"], "server": ["server_binary", "port", "access_token", "memory_for_queries", "cache_max_size", "cache_max_size_single_entry", @@ -163,8 +163,8 @@ def execute(self, args) -> bool: # Set the access token if specified. 
access_arg = f"--data-urlencode \"access-token={args.access_token}\"" - if args.index_description: - desc = args.index_description + if args.description: + desc = args.description curl_cmd = (f"curl -Gs http://localhost:{port}/api" f" --data-urlencode \"index-description={desc}\"" f" {access_arg} > /dev/null") @@ -174,9 +174,9 @@ def execute(self, args) -> bool: except Exception as e: log.error(f"Setting the index description failed ({e})") if args.text_description: - desc = args.text_description + text_desc = args.text_description curl_cmd = (f"curl -Gs http://localhost:{port}/api" - f" --data-urlencode \"text-description={desc}\"" + f" --data-urlencode \"text-description={text_desc}\"" f" {access_arg} > /dev/null") log.debug(curl_cmd) try: diff --git a/src/qlever/qleverfile.py b/src/qlever/qleverfile.py index 55e1c9fe..b8150599 100644 --- a/src/qlever/qleverfile.py +++ b/src/qlever/qleverfile.py @@ -1,9 +1,12 @@ from __future__ import annotations +import re import socket +import subprocess from configparser import ConfigParser, ExtendedInterpolation from qlever.containerize import Containerize +from qlever.log import log class QleverfileException(Exception): @@ -43,12 +46,13 @@ def arg(*args, **kwargs): data_args["get_data_cmd"] = arg( "--get-data-cmd", type=str, required=True, help="The command to get the data") - data_args["index_description"] = arg( - "--index-description", type=str, required=True, - help="A concise description of the indexed dataset") + data_args["description"] = arg( + "--description", type=str, required=True, + help="A concise description of the dataset") data_args["text_description"] = arg( "--text-description", type=str, default=None, - help="A description of the indexed text if any") + help="A concice description of the addtional text data" + " if any") index_args["input_files"] = arg( "--input-files", type=str, required=True, @@ -200,12 +204,36 @@ def read(qleverfile_path): """ # Read the Qleverfile. 
- config = ConfigParser(interpolation=ExtendedInterpolation()) + defaults = {"random": "83724324hztz", "version": "01.01.01"} + config = ConfigParser(interpolation=ExtendedInterpolation(), + defaults=defaults) try: config.read(qleverfile_path) except Exception as e: raise QleverfileException(f"Error parsing {qleverfile_path}: {e}") + # Iterate over all sections and options and check if there are any + # values of the form $$(...) that need to be replaced. + for section in config.sections(): + for option in config[section]: + value = config[section][option] + match = re.match(r"^\$\((.*)\)$", value) + if match: + try: + value = subprocess.check_output( + match.group(1), shell=True, text=True, + stderr=subprocess.STDOUT) + except Exception as e: + log.info("") + log.error(f"Error evaluating {value} for option " + f"{section}.{option.upper()} in " + f"{qleverfile_path}:") + log.info("") + log.info(e.output if hasattr(e, "output") else e) + exit(1) + config[section][option] = value + log.info(f"Set {section}.{option} to {value}") + # Make sure that all the sections are there. for section in ["data", "index", "server", "runtime", "ui"]: if section not in config: diff --git a/src/qlever/util.py b/src/qlever/util.py index a7ae9fb4..91762bdc 100644 --- a/src/qlever/util.py +++ b/src/qlever/util.py @@ -1,10 +1,13 @@ from __future__ import annotations +import random +import re +import string import subprocess +from datetime import date, datetime from pathlib import Path + from qlever.log import log -import re -from datetime import datetime, date def get_total_file_size(patterns: list[str]) -> int: @@ -94,3 +97,13 @@ def show_table_line(pid, user, start_time, rss, cmdline): except Exception as e: log.error(f"Could not get process info: {e}") return False + + +def get_random_string(length: int) -> str: + """ + Helper function that returns a randomly chosen string of the given + length. Take the current time as seed. 
+ """ + random.seed(datetime.now()) + return "".join(random.choices(string.ascii_letters + string.digits, + k=length))