From b4d7daf6270184f01860506e12cd857615e1677c Mon Sep 17 00:00:00 2001
From: Hannah Bast <bast@cs.uni-freiburg.de>
Date: Sun, 10 Mar 2024 05:02:24 +0100
Subject: [PATCH] Add Qleverfile for osm-planet + more goodies

1. A random alphanumerical string is now appended automatically to the
   ACCESS_TOKEN when the Qleverfile is copied to the cwd

2. Values can now be of the form $$(...), in which case the ... part is
   evaluated and the value becomes the result of that evaluation
---
 src/qlever/Qleverfiles/Qleverfile.osm-planet | 38 ++++++++++++++++++++
 src/qlever/commands/setup-config.py          |  6 +++-
 src/qlever/commands/start.py                 | 10 +++---
 src/qlever/qleverfile.py                     | 38 +++++++++++++++++---
 src/qlever/util.py                           | 17 +++++++--
 5 files changed, 96 insertions(+), 13 deletions(-)
 create mode 100644 src/qlever/Qleverfiles/Qleverfile.osm-planet

diff --git a/src/qlever/Qleverfiles/Qleverfile.osm-planet b/src/qlever/Qleverfiles/Qleverfile.osm-planet
new file mode 100644
index 00000000..a0361963
--- /dev/null
+++ b/src/qlever/Qleverfiles/Qleverfile.osm-planet
@@ -0,0 +1,38 @@
+# Qleverfile for OSM Planet, use with the qlever script (pip install qlever)
+#
+# qlever get-data  # takes ~50 mins to download .ttl.bz2 file of ~ 300 GB
+# qlever index     # takes ~12 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X)
+# qlever start     # takes a few seconds
+#
+# For the OSM data of a single country, do `qlever setup-config osm-country`
+# and edit the Qleverfile to specify the country.
+
+[data]
+NAME         = osm-planet
+DATA_URL     = https://osm2rdf.cs.uni-freiburg.de/ttl/planet.osm.ttl.bz2
+GET_DATA_CMD = curl --location --fail --continue-at - --remote-time --output ${NAME}.ttl.bz2 ${DATA_URL}
+VERSION      = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y")
+DESCRIPTION  = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)
+
+[index]
+INPUT_FILES     = ${data:NAME}.ttl.bz2
+CAT_INPUT_FILES = lbzcat -f -n 2 ${INPUT_FILES}
+WITH_TEXT       = false
+STXXL_MEMORY    = 20G
+SETTINGS_JSON   = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
+
+[server]
+PORT                        = 7007
+ACCESS_TOKEN                = ${data:NAME}
+MEMORY_FOR_QUERIES          = 90G
+CACHE_MAX_SIZE              = 40G
+CACHE_MAX_SIZE_SINGLE_ENTRY = 30G
+TIMEOUT                     = 300s
+
+[runtime]
+SYSTEM = docker
+IMAGE = docker.io/adfreiburg/qlever:latest
+
+[ui]
+UI_PORT   = 7000
+UI_CONFIG = osm-planet
diff --git a/src/qlever/commands/setup-config.py b/src/qlever/commands/setup-config.py
index 0ced66ee..12afe984 100644
--- a/src/qlever/commands/setup-config.py
+++ b/src/qlever/commands/setup-config.py
@@ -3,6 +3,7 @@
 
 from qlever.command import QleverCommand
 from qlever.log import log
+from qlever.util import get_random_string
 
 
 class SetupConfigCommand(QleverCommand):
@@ -35,7 +36,10 @@ def execute(self, args) -> bool:
         # Construct the command line and show it.
         qleverfile_path = (self.qleverfiles_path
                            / f"Qleverfile.{args.config_name}")
-        setup_config_cmd = f"cp -a {qleverfile_path} Qleverfile"
+        setup_config_cmd = (
+                f"cat {qleverfile_path}"
+                f" | sed -E 's/(^ACCESS_TOKEN.*)/\\1_{get_random_string(12)}/'"
+                f"> Qleverfile")
         self.show(setup_config_cmd, only_show=args.show)
         if args.show:
             return False
diff --git a/src/qlever/commands/start.py b/src/qlever/commands/start.py
index e52e063c..c2375299 100644
--- a/src/qlever/commands/start.py
+++ b/src/qlever/commands/start.py
@@ -27,7 +27,7 @@ def should_have_qleverfile(self) -> bool:
         return True
 
     def relevant_qleverfile_arguments(self) -> dict[str: list[str]]:
-        return {"data": ["name", "index_description", "text_description"],
+        return {"data": ["name", "description", "text_description"],
                 "server": ["server_binary", "port", "access_token",
                            "memory_for_queries", "cache_max_size",
                            "cache_max_size_single_entry",
@@ -163,8 +163,8 @@ def execute(self, args) -> bool:
 
         # Set the access token if specified.
         access_arg = f"--data-urlencode \"access-token={args.access_token}\""
-        if args.index_description:
-            desc = args.index_description
+        if args.description:
+            desc = args.description
             curl_cmd = (f"curl -Gs http://localhost:{port}/api"
                         f" --data-urlencode \"index-description={desc}\""
                         f" {access_arg} > /dev/null")
@@ -174,9 +174,9 @@ def execute(self, args) -> bool:
             except Exception as e:
                 log.error(f"Setting the index description failed ({e})")
         if args.text_description:
-            desc = args.text_description
+            text_desc = args.text_description
             curl_cmd = (f"curl -Gs http://localhost:{port}/api"
-                        f" --data-urlencode \"text-description={desc}\""
+                        f" --data-urlencode \"text-description={text_desc}\""
                         f" {access_arg} > /dev/null")
             log.debug(curl_cmd)
             try:
diff --git a/src/qlever/qleverfile.py b/src/qlever/qleverfile.py
index 55e1c9fe..b8150599 100644
--- a/src/qlever/qleverfile.py
+++ b/src/qlever/qleverfile.py
@@ -1,9 +1,12 @@
 from __future__ import annotations
 
+import re
 import socket
+import subprocess
 from configparser import ConfigParser, ExtendedInterpolation
 
 from qlever.containerize import Containerize
+from qlever.log import log
 
 
 class QleverfileException(Exception):
@@ -43,12 +46,13 @@ def arg(*args, **kwargs):
         data_args["get_data_cmd"] = arg(
                 "--get-data-cmd", type=str, required=True,
                 help="The command to get the data")
-        data_args["index_description"] = arg(
-                "--index-description", type=str, required=True,
-                help="A concise description of the indexed dataset")
+        data_args["description"] = arg(
+                "--description", type=str, required=True,
+                help="A concise description of the dataset")
         data_args["text_description"] = arg(
                 "--text-description", type=str, default=None,
-                help="A description of the indexed text if any")
+                help="A concice description of the addtional text data"
+                     " if any")
 
         index_args["input_files"] = arg(
                 "--input-files", type=str, required=True,
@@ -200,12 +204,36 @@ def read(qleverfile_path):
         """
 
         # Read the Qleverfile.
-        config = ConfigParser(interpolation=ExtendedInterpolation())
+        defaults = {"random": "83724324hztz", "version": "01.01.01"}
+        config = ConfigParser(interpolation=ExtendedInterpolation(),
+                              defaults=defaults)
         try:
             config.read(qleverfile_path)
         except Exception as e:
             raise QleverfileException(f"Error parsing {qleverfile_path}: {e}")
 
+        # Iterate over all sections and options and check if there are any
+        # values of the form $$(...) that need to be replaced.
+        for section in config.sections():
+            for option in config[section]:
+                value = config[section][option]
+                match = re.match(r"^\$\((.*)\)$", value)
+                if match:
+                    try:
+                        value = subprocess.check_output(
+                                match.group(1), shell=True, text=True,
+                                stderr=subprocess.STDOUT)
+                    except Exception as e:
+                        log.info("")
+                        log.error(f"Error evaluating {value} for option "
+                                  f"{section}.{option.upper()} in "
+                                  f"{qleverfile_path}:")
+                        log.info("")
+                        log.info(e.output if hasattr(e, "output") else e)
+                        exit(1)
+                    config[section][option] = value
+                    log.info(f"Set {section}.{option} to {value}")
+
         # Make sure that all the sections are there.
         for section in ["data", "index", "server", "runtime", "ui"]:
             if section not in config:
diff --git a/src/qlever/util.py b/src/qlever/util.py
index a7ae9fb4..91762bdc 100644
--- a/src/qlever/util.py
+++ b/src/qlever/util.py
@@ -1,10 +1,13 @@
 from __future__ import annotations
 
+import random
+import re
+import string
 import subprocess
+from datetime import date, datetime
 from pathlib import Path
+
 from qlever.log import log
-import re
-from datetime import datetime, date
 
 
 def get_total_file_size(patterns: list[str]) -> int:
@@ -94,3 +97,13 @@ def show_table_line(pid, user, start_time, rss, cmdline):
     except Exception as e:
         log.error(f"Could not get process info: {e}")
         return False
+
+
+def get_random_string(length: int) -> str:
+    """
+    Return a random alphanumeric string of the given length, drawn from
+    the OS entropy source (unpredictable; global `random` state untouched).
+    """
+    # No seeding: random.seed(<datetime>) raises TypeError on Python >= 3.11.
+    return "".join(random.SystemRandom().choices(
+        string.ascii_letters + string.digits, k=length))