From f8ac361daed45f081f645f11394d31221c6479d6 Mon Sep 17 00:00:00 2001
From: Hannah Bast
Date: Sun, 27 Oct 2024 08:38:59 +0100
Subject: [PATCH 1/2] Update Qleverfile for OHM Planet

Add options `--add-way-node-order` and `--add-way-node-geometry` to
`osm2rdf`. This adds the complete (and extensive) information about which
ways contain which nodes, and the geometries of those nodes, even if the
nodes are used for nothing else and have no tags. Also clean up the
Qleverfile and update the information at the top.
---
 src/qlever/Qleverfiles/Qleverfile.ohm-planet | 23 +++++++++++---------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/qlever/Qleverfiles/Qleverfile.ohm-planet b/src/qlever/Qleverfiles/Qleverfile.ohm-planet
index 88bee455..1d1ed3f9 100644
--- a/src/qlever/Qleverfiles/Qleverfile.ohm-planet
+++ b/src/qlever/Qleverfiles/Qleverfile.ohm-planet
@@ -1,33 +1,36 @@
-# Qleverfile for OHM Planet, use with https://github.com/ad-freiburg/qlever-control
+# Qleverfile for Wikidata, use with the QLever CLI (`pip install qlever`)
 #
-# qlever get-data # ~20 mins (download PBF, convert to TTL, add GeoSPARQL triples)
-# qlever index # ~20 mins and ~5 GB RAM (on an AMD Ryzen 9 5900X)
-# qlever start # ~1 sec
+# qlever get-data # ~7 hours, ~110 GB (compressed), ~20 billion triples
+# qlever index # ~5 hours, ~20 GB RAM, ~500 GB index size on disk
+# qlever start # a few seconds, adjust MEMORY_FOR_QUERIES as needed
 #
-# For `qlever get-data` to work, `osm2rdf` must be installed and in the `PATH`.
+# Adding a text index takes an additional ~2 hours and ~50 GB of disk space
+#
+# Measured on an AMD Ryzen 9 5950X with 128 GB RAM and an NVMe SSD (18.10.2024)
 
 [data]
 NAME = ohm-planet
 GET_DATA_URL = https://planet.openhistoricalmap.org/planet
 CHECK_BINARIES = osm2rdf -h > /dev/null || (echo "osm2rdf not found, make sure that it's installed and in your PATH" && exit 1)
-GET_DATA_CMD_1 = curl -LRfC - -o ${NAME}.pbf $$(curl -s ${GET_DATA_URL}/state.txt) 2>&1 | tee ${NAME}.download-log.txt
-GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --cache . --add-hascentroid 2>&1 | tee ${NAME}.osm2rdf-log.txt
-GET_DATA_CMD = set -o pipefail && ${CHECK_BINARIES} && ${GET_DATA_CMD_1} && echo && ${GET_DATA_CMD_2}
+GET_DATA_CMD_1 = curl -LRf -o ${NAME}.pbf $$(curl -s ${GET_DATA_URL}/state.txt) 2>&1 | tee ${NAME}.download-log.txt
+GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --cache . --add-way-node-order --add-way-node-geometry 2>&1 | tee ${NAME}.osm2rdf-log.txt
+GET_DATA_CMD = ${CHECK_BINARIES} && ${GET_DATA_CMD_1} && echo && ${GET_DATA_CMD_2}
 VERSION = $$(date -r ${NAME}.pbf +%d.%m.%Y || echo "NO_DATE")
 DESCRIPTION = OHM Planet, data from ${GET_DATA_URL} version ${VERSION} (with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)
 
 [index]
 INPUT_FILES = ${data:NAME}.ttl.bz2
 CAT_INPUT_FILES = bzcat -f ${INPUT_FILES}
-SETTINGS_JSON = { "prefixes-external": [""], "ascii-prefixes-only": false, "parallel-parsing": true, "num-triples-per-batch": 5000000 }
+SETTINGS_JSON = { "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
 
 [server]
 PORT = 7037
 ACCESS_TOKEN = ${data:NAME}
 MEMORY_FOR_QUERIES = 10G
 CACHE_MAX_SIZE = 5G
+TIMEOUT = 600s
 CACHE_MAX_SIZE_SINGLE_ENTRY = 4G
-WARMUP_CMD = curl -s https://qlever.cs.uni-freiburg.de/mapui-petri/query --data-urlencode "query=PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX osm: <https://www.openstreetmap.org/> SELECT ?osm_id ?geometry WHERE { ?osm_id geo:hasGeometry/geo:asWKT ?geometry . ?osm_id rdf:type osm:node } LIMIT 1" --data-urlencode "backend=https://qlever.cs.uni-freiburg.de/api/${data:NAME}" > /dev/null
+WARMUP_CMD = curl -s https://qlever.cs.uni-freiburg.de/petrimaps/query --data-urlencode "query=PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX osm: <https://www.openstreetmap.org/> SELECT ?osm_id ?geometry WHERE { ?osm_id geo:hasGeometry/geo:asWKT ?geometry . ?osm_id rdf:type osm:node } LIMIT 1" --data-urlencode "backend=https://qlever.cs.uni-freiburg.de/api/${data:NAME}" > /dev/null
 
 [runtime]
 SYSTEM = docker
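
[Illustration, not part of the patch: with `--add-way-node-order` and
`--add-way-node-geometry`, the generated Turtle also describes the member
nodes of each way, their order, and their geometries, so a query can
reconstruct a way node by node. The membership predicates `osmway:node`
and `osmway:index` below are assumptions about the osm2rdf vocabulary, and
way 12345 is a made-up ID; check the generated `.ttl` for the exact names.

    PREFIX osmway: <https://www.openstreetmap.org/way/>
    PREFIX geo: <http://www.opengis.net/ont/geosparql#>
    SELECT ?node ?pos ?wkt WHERE {
      osmway:12345 osmway:node ?member .
      ?member osmway:node ?node .
      ?member osmway:index ?pos .
      ?node geo:hasGeometry/geo:asWKT ?wkt .
    }
    ORDER BY ASC(?pos)
]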
From 37bbb371ceb0e7f3211116750c95ee2acfea85e6 Mon Sep 17 00:00:00 2001
From: Hannah Bast
Date: Mon, 6 Jan 2025 12:12:16 +0100
Subject: [PATCH 2/2] Add option `parser-buffer-size` and improve
 `Qleverfile.ohm-planet`

---
 src/qlever/Qleverfiles/Qleverfile.ohm-planet | 22 +++++-----
 src/qlever/commands/index.py                 | 46 +++++++++++++++-----
 src/qlever/qleverfile.py                     |  8 +++-
 3 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/src/qlever/Qleverfiles/Qleverfile.ohm-planet b/src/qlever/Qleverfiles/Qleverfile.ohm-planet
index 1d1ed3f9..75748da0 100644
--- a/src/qlever/Qleverfiles/Qleverfile.ohm-planet
+++ b/src/qlever/Qleverfiles/Qleverfile.ohm-planet
@@ -1,27 +1,27 @@
-# Qleverfile for Wikidata, use with the QLever CLI (`pip install qlever`)
+# Qleverfile for OpenHistoricalMap, use with the QLever CLI (`pip install qlever`)
 #
-# qlever get-data # ~7 hours, ~110 GB (compressed), ~20 billion triples
-# qlever index # ~5 hours, ~20 GB RAM, ~500 GB index size on disk
+# qlever get-data # ~1 hour, ~14 GB (ttl.gz), ~3.4 B triples (with osm2rdf)
+# qlever index # ~1 hour, ~10 GB RAM, ~60 GB index size on disk
 # qlever start # a few seconds, adjust MEMORY_FOR_QUERIES as needed
 #
-# Adding a text index takes an additional ~2 hours and ~50 GB of disk space
-#
-# Measured on an AMD Ryzen 9 5950X with 128 GB RAM and an NVMe SSD (18.10.2024)
+# Measured on an AMD Ryzen 9 5900X with 128 GB RAM and an NVMe SSD (04.01.2025)
 
 [data]
 NAME = ohm-planet
 GET_DATA_URL = https://planet.openhistoricalmap.org/planet
 CHECK_BINARIES = osm2rdf -h > /dev/null || (echo "osm2rdf not found, make sure that it's installed and in your PATH" && exit 1)
-GET_DATA_CMD_1 = curl -LRf -o ${NAME}.pbf $$(curl -s ${GET_DATA_URL}/state.txt) 2>&1 | tee ${NAME}.download-log.txt
-GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --cache . --add-way-node-order --add-way-node-geometry 2>&1 | tee ${NAME}.osm2rdf-log.txt
+GET_DATA_CMD_1 = unbuffer wget -O ${NAME}.pbf $$(curl -s ${GET_DATA_URL}/state.txt) 2>&1 | tee ${NAME}.download-log.txt
+GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --cache . --add-way-node-order --no-untagged-nodes-geometric-relations 2>&1 | tee ${NAME}.osm2rdf-log.txt
 GET_DATA_CMD = ${CHECK_BINARIES} && ${GET_DATA_CMD_1} && echo && ${GET_DATA_CMD_2}
 VERSION = $$(date -r ${NAME}.pbf +%d.%m.%Y || echo "NO_DATE")
 DESCRIPTION = OHM Planet, data from ${GET_DATA_URL} version ${VERSION} (with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)
 
 [index]
-INPUT_FILES = ${data:NAME}.ttl.bz2
-CAT_INPUT_FILES = bzcat -f ${INPUT_FILES}
-SETTINGS_JSON = { "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
+INPUT_FILES = ${data:NAME}.ttl.bz2
+MULTI_INPUT_JSON = { "cmd": "lbzcat -n 4 ${INPUT_FILES}", "parallel": "true" }
+STXXL_MEMORY = 5G
+PARSER_BUFFER_SIZE = 20M
+SETTINGS_JSON = { "num-triples-per-batch": 5000000 }
 
 [server]
 PORT = 7037
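
[Illustration, not part of the patch: `MULTI_INPUT_JSON` replaces
`CAT_INPUT_FILES`. Per the validation added to `index.py` below, its value
must parse as a JSON array of objects, each with at least a `cmd` key; a
`{}` placeholder in a `cmd` additionally requires a `for-each` key. A
hypothetical two-file variant of the single-input setting above would be:

    MULTI_INPUT_JSON = [ { "cmd": "lbzcat -n 4 part1.ttl.bz2", "parallel": "true" },
                         { "cmd": "lbzcat -n 4 part2.ttl.bz2", "parallel": "true" } ]

where `part1.ttl.bz2` and `part2.ttl.bz2` are placeholder file names.]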
diff --git a/src/qlever/commands/index.py b/src/qlever/commands/index.py
index d8393564..56ba9701 100644
--- a/src/qlever/commands/index.py
+++ b/src/qlever/commands/index.py
@@ -2,13 +2,17 @@
 
 import glob
 import json
-import shlex
 import re
+import shlex
 
 from qlever.command import QleverCommand
 from qlever.containerize import Containerize
 from qlever.log import log
-from qlever.util import get_existing_index_files, get_total_file_size, run_command
+from qlever.util import (
+    get_existing_index_files,
+    get_total_file_size,
+    run_command,
+)
 
 
 class IndexCommand(QleverCommand):
@@ -39,6 +43,7 @@ def relevant_qleverfile_arguments(self) -> dict[str : list[str]]:
                 "use_patterns",
                 "text_index",
                 "stxxl_memory",
+                "parser_buffer_size",
             ],
             "runtime": ["system", "image", "index_container"],
         }
@@ -76,7 +81,8 @@ def get_input_options_for_json(self, args) -> str:
         # Check that it is an array of length at least one.
         if not isinstance(input_specs, list):
             raise self.InvalidInputJson(
-                "`MULTI_INPUT_JSON` must be a JSON array", args.multi_input_json
+                "`MULTI_INPUT_JSON` must be a JSON array",
+                args.multi_input_json,
             )
         if len(input_specs) == 0:
             raise self.InvalidInputJson(
@@ -90,13 +96,15 @@
             # Check that `input_spec` is a dictionary.
             if not isinstance(input_spec, dict):
                 raise self.InvalidInputJson(
-                    f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " "object",
+                    f"Element {i} in `MULTI_INPUT_JSON` must be a JSON "
+                    "object",
                     input_spec,
                 )
             # For each `input_spec`, we must have a command.
if "cmd" not in input_spec: raise self.InvalidInputJson( - f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`", + f"Element {i} in `MULTI_INPUT_JSON` must contain a " + "key `cmd`", input_spec, ) # If the command contains a `{}` placeholder, we need a `for-each` @@ -204,14 +212,23 @@ def execute(self, args) -> bool: index_cmd += " --only-pso-and-pos-permutations --no-patterns" if not args.use_patterns: index_cmd += " --no-patterns" - if args.text_index in ["from_text_records", "from_text_records_and_literals"]: + if args.text_index in [ + "from_text_records", + "from_text_records_and_literals", + ]: index_cmd += ( - f" -w {args.name}.wordsfile.tsv" f" -d {args.name}.docsfile.tsv" + f" -w {args.name}.wordsfile.tsv" + f" -d {args.name}.docsfile.tsv" ) - if args.text_index in ["from_literals", "from_text_records_and_literals"]: + if args.text_index in [ + "from_literals", + "from_text_records_and_literals", + ]: index_cmd += " --text-words-from-literals" if args.stxxl_memory: index_cmd += f" --stxxl-memory {args.stxxl_memory}" + if args.parser_buffer_size: + index_cmd += f" --parser-buffer-size {args.parser_buffer_size}" index_cmd += f" | tee {args.name}.index-log.txt" # If the total file size is larger than 10 GB, set ulimit (such that a @@ -234,7 +251,8 @@ def execute(self, args) -> bool: # Command for writing the settings JSON to a file. settings_json_cmd = ( - f"echo {shlex.quote(args.settings_json)} " f"> {args.name}.settings.json" + f"echo {shlex.quote(args.settings_json)} " + f"> {args.name}.settings.json" ) # Show the command line. @@ -279,9 +297,15 @@ def execute(self, args) -> bool: return False # Remove already existing container. - if args.system in Containerize.supported_systems() and args.overwrite_existing: + if ( + args.system in Containerize.supported_systems() + and args.overwrite_existing + ): if Containerize.is_running(args.system, args.index_container): - log.info("Another index process is running, trying to stop " "it ...") + log.info( + "Another index process is running, trying to stop " + "it ..." + ) log.info("") try: run_command(f"{args.system} rm -f {args.index_container}") diff --git a/src/qlever/qleverfile.py b/src/qlever/qleverfile.py index 7a5f8552..3d3012c4 100644 --- a/src/qlever/qleverfile.py +++ b/src/qlever/qleverfile.py @@ -113,10 +113,16 @@ def arg(*args, **kwargs): index_args["stxxl_memory"] = arg( "--stxxl-memory", type=str, - default="5G", help="The amount of memory to use for the index build " "(the name of the option has historical reasons)", ) + index_args["parser_buffer_size"] = arg( + "--parser-buffer-size", + type=str, + help="Each parser thread reads the input in batches of this size" + "; in parallel parsing, each batch that is not the last must be " + "larger enough to contain then end of at least one statement", + ) index_args["only_pso_and_pos_permutations"] = arg( "--only-pso-and-pos-permutations", action="store_true",