Update Qleverfile for OHM Planet #81

Open · wants to merge 3 commits into main
27 changes: 15 additions & 12 deletions src/qlever/Qleverfiles/Qleverfile.ohm-planet

@@ -1,33 +1,36 @@
-# Qleverfile for OHM Planet, use with https://github.com/ad-freiburg/qlever-control
+# Qleverfile for OpenHistoricalMap, use with the QLever CLI (`pip install qlever`)
 #
-# qlever get-data  # ~20 mins (download PBF, convert to TTL, add GeoSPARQL triples)
-# qlever index     # ~20 mins and ~5 GB RAM (on an AMD Ryzen 9 5900X)
-# qlever start     # ~1 sec
+# qlever get-data  # ~1 hour, ~14 GB (ttl.gz), ~3.4 B triples (with osm2rdf)
+# qlever index     # ~1 hour, ~10 GB RAM, ~60 GB index size on disk
+# qlever start     # a few seconds, adjust MEMORY_FOR_QUERIES as needed
 #
-# For `qlever get-data` to work, `osm2rdf` must be installed and in the `PATH`.
+# Measured on an AMD Ryzen 9 5900X with 128 GB RAM, and NVMe SSD (04.01.2025)
 
 [data]
 NAME = ohm-planet
 GET_DATA_URL = https://planet.openhistoricalmap.org/planet
 CHECK_BINARIES = osm2rdf -h > /dev/null || (echo "osm2rdf not found, make sure that it's installed and in your PATH" && exit 1)
-GET_DATA_CMD_1 = curl -LRfC - -o ${NAME}.pbf $$(curl -s ${GET_DATA_URL}/state.txt) 2>&1 | tee ${NAME}.download-log.txt
-GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --cache . --add-hascentroid 2>&1 | tee ${NAME}.osm2rdf-log.txt
-GET_DATA_CMD = set -o pipefail && ${CHECK_BINARIES} && ${GET_DATA_CMD_1} && echo && ${GET_DATA_CMD_2}
+GET_DATA_CMD_1 = unbuffer wget -O ${NAME}.pbf $$(curl -s ${GET_DATA_URL}/state.txt) 2>&1 | tee ${NAME}.download-log.txt
+GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --cache . --add-way-node-order --no-untagged-nodes-geometric-relations 2>&1 | tee ${NAME}.osm2rdf-log.txt
+GET_DATA_CMD = ${CHECK_BINARIES} && ${GET_DATA_CMD_1} && echo && ${GET_DATA_CMD_2}
 VERSION = $$(date -r ${NAME}.pbf +%d.%m.%Y || echo "NO_DATE")
 DESCRIPTION = OHM Planet, data from ${GET_DATA_URL} version ${VERSION} (with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)
 
 [index]
-INPUT_FILES = ${data:NAME}.ttl.bz2
-CAT_INPUT_FILES = bzcat -f ${INPUT_FILES}
-SETTINGS_JSON = { "prefixes-external": [""], "ascii-prefixes-only": false, "parallel-parsing": true, "num-triples-per-batch": 5000000 }
+INPUT_FILES = ${data:NAME}.ttl.bz2
+MULTI_INPUT_JSON = { "cmd": "lbzcat -n 4 ${INPUT_FILES}", "parallel": "true" }
+STXXL_MEMORY = 5G
+PARSER_BUFFER_SIZE = 20M
+SETTINGS_JSON = { "num-triples-per-batch": 5000000 }
 
 [server]
 PORT = 7037
 ACCESS_TOKEN = ${data:NAME}
 MEMORY_FOR_QUERIES = 10G
 CACHE_MAX_SIZE = 5G
 TIMEOUT = 600s
+CACHE_MAX_SIZE_SINGLE_ENTRY = 4G
-WARMUP_CMD = curl -s https://qlever.cs.uni-freiburg.de/mapui-petri/query --data-urlencode "query=PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX osm: <https://www.openstreetmap.org/> SELECT ?osm_id ?geometry WHERE { ?osm_id geo:hasGeometry/geo:asWKT ?geometry . ?osm_id rdf:type osm:node } LIMIT 1" --data-urlencode "backend=https://qlever.cs.uni-freiburg.de/api/${data:NAME}" > /dev/null
+WARMUP_CMD = curl -s https://qlever.cs.uni-freiburg.de/petrimaps/query --data-urlencode "query=PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX osm: <https://www.openstreetmap.org/> SELECT ?osm_id ?geometry WHERE { ?osm_id geo:hasGeometry/geo:asWKT ?geometry . ?osm_id rdf:type osm:node } LIMIT 1" --data-urlencode "backend=https://qlever.cs.uni-freiburg.de/api/${data:NAME}" > /dev/null
 
 [runtime]
 SYSTEM = docker
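
The biggest functional change in the `[index]` section is the switch from `CAT_INPUT_FILES` (a single `bzcat` stream) to `MULTI_INPUT_JSON`, which feeds the index builder from the output of an arbitrary command and marks the input as parseable in parallel; `lbzcat -n 4` decompresses the bzip2 input with four threads. The following is a minimal Python sketch of what that spec expresses; the hard-coded file name and the line-counting stand-in for the indexer are illustrative assumptions, not part of the PR:

    # Sketch: consume the MULTI_INPUT_JSON spec from the Qleverfile above.
    # Assumes `lbzcat` is on the PATH and `ohm-planet.ttl.bz2` exists locally.
    import json
    import shlex
    import subprocess

    spec = json.loads('{"cmd": "lbzcat -n 4 ohm-planet.ttl.bz2", "parallel": "true"}')

    # The index builder would read this stream; here we just count Turtle lines.
    with subprocess.Popen(shlex.split(spec["cmd"]), stdout=subprocess.PIPE) as proc:
        num_lines = sum(1 for _ in proc.stdout)
    print(f"{num_lines} lines decompressed, parallel parsing: {spec['parallel']}")
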
46 changes: 35 additions & 11 deletions src/qlever/commands/index.py

@@ -2,13 +2,17 @@
 
 import glob
 import json
-import shlex
 import re
+import shlex
 
 from qlever.command import QleverCommand
 from qlever.containerize import Containerize
 from qlever.log import log
-from qlever.util import get_existing_index_files, get_total_file_size, run_command
+from qlever.util import (
+    get_existing_index_files,
+    get_total_file_size,
+    run_command,
+)
 
 
 class IndexCommand(QleverCommand):
@@ -39,6 +43,7 @@ def relevant_qleverfile_arguments(self) -> dict[str : list[str]]:
                 "use_patterns",
                 "text_index",
                 "stxxl_memory",
+                "parser_buffer_size",
             ],
             "runtime": ["system", "image", "index_container"],
         }
@@ -76,7 +81,8 @@ def get_input_options_for_json(self, args) -> str:
         # Check that it is an array of length at least one.
         if not isinstance(input_specs, list):
             raise self.InvalidInputJson(
-                "`MULTI_INPUT_JSON` must be a JSON array", args.multi_input_json
+                "`MULTI_INPUT_JSON` must be a JSON array",
+                args.multi_input_json,
             )
         if len(input_specs) == 0:
             raise self.InvalidInputJson(
@@ -90,13 +96,15 @@ def get_input_options_for_json(self, args) -> str:
             # Check that `input_spec` is a dictionary.
             if not isinstance(input_spec, dict):
                 raise self.InvalidInputJson(
-                    f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " "object",
+                    f"Element {i} in `MULTI_INPUT_JSON` must be a JSON "
+                    "object",
                     input_spec,
                 )
             # For each `input_spec`, we must have a command.
             if "cmd" not in input_spec:
                 raise self.InvalidInputJson(
-                    f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`",
+                    f"Element {i} in `MULTI_INPUT_JSON` must contain a "
+                    "key `cmd`",
                     input_spec,
                 )
             # If the command contains a `{}` placeholder, we need a `for-each`
@@ -204,14 +212,23 @@ def execute(self, args) -> bool:
             index_cmd += " --only-pso-and-pos-permutations --no-patterns"
         if not args.use_patterns:
             index_cmd += " --no-patterns"
-        if args.text_index in ["from_text_records", "from_text_records_and_literals"]:
+        if args.text_index in [
+            "from_text_records",
+            "from_text_records_and_literals",
+        ]:
             index_cmd += (
-                f" -w {args.name}.wordsfile.tsv" f" -d {args.name}.docsfile.tsv"
+                f" -w {args.name}.wordsfile.tsv"
+                f" -d {args.name}.docsfile.tsv"
             )
-        if args.text_index in ["from_literals", "from_text_records_and_literals"]:
+        if args.text_index in [
+            "from_literals",
+            "from_text_records_and_literals",
+        ]:
             index_cmd += " --text-words-from-literals"
         if args.stxxl_memory:
             index_cmd += f" --stxxl-memory {args.stxxl_memory}"
+        if args.parser_buffer_size:
+            index_cmd += f" --parser-buffer-size {args.parser_buffer_size}"
         index_cmd += f" | tee {args.name}.index-log.txt"
 
         # If the total file size is larger than 10 GB, set ulimit (such that a
@@ -234,7 +251,8 @@ def execute(self, args) -> bool:
 
         # Command for writing the settings JSON to a file.
         settings_json_cmd = (
-            f"echo {shlex.quote(args.settings_json)} " f"> {args.name}.settings.json"
+            f"echo {shlex.quote(args.settings_json)} "
+            f"> {args.name}.settings.json"
         )
 
         # Show the command line.
@@ -279,9 +297,15 @@ def execute(self, args) -> bool:
             return False
 
         # Remove already existing container.
-        if args.system in Containerize.supported_systems() and args.overwrite_existing:
+        if (
+            args.system in Containerize.supported_systems()
+            and args.overwrite_existing
+        ):
             if Containerize.is_running(args.system, args.index_container):
-                log.info("Another index process is running, trying to stop " "it ...")
+                log.info(
+                    "Another index process is running, trying to stop "
+                    "it ..."
+                )
                 log.info("")
                 try:
                     run_command(f"{args.system} rm -f {args.index_container}")
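
For readers skimming this diff: the `MULTI_INPUT_JSON` checks in `get_input_options_for_json` boil down to requiring a non-empty JSON array of objects, each with a `cmd` key. Below is a condensed, self-contained sketch of just those checks; the function name is made up, `ValueError` stands in for the command's `InvalidInputJson`, the exact wording of the empty-array message is not visible in this diff, and the `{}`/`for-each` placeholder handling further down in the file is omitted:

    # Condensed sketch of the validation visible in this diff, not the real method.
    import json

    def check_multi_input_json(multi_input_json: str) -> list[dict]:
        input_specs = json.loads(multi_input_json)
        if not isinstance(input_specs, list):
            raise ValueError("`MULTI_INPUT_JSON` must be a JSON array")
        if len(input_specs) == 0:
            raise ValueError("`MULTI_INPUT_JSON` must not be empty")
        for i, input_spec in enumerate(input_specs):
            if not isinstance(input_spec, dict):
                raise ValueError(f"Element {i} in `MULTI_INPUT_JSON` must be a JSON object")
            if "cmd" not in input_spec:
                raise ValueError(f"Element {i} in `MULTI_INPUT_JSON` must contain a key `cmd`")
        return input_specs

    # The spec from the Qleverfile above, wrapped in an array:
    specs = check_multi_input_json('[{"cmd": "lbzcat -n 4 ohm-planet.ttl.bz2", "parallel": "true"}]')
    print(specs[0]["cmd"])
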
8 changes: 7 additions & 1 deletion src/qlever/qleverfile.py

@@ -113,10 +113,16 @@ def arg(*args, **kwargs):
         index_args["stxxl_memory"] = arg(
             "--stxxl-memory",
             type=str,
-            default="5G",
             help="The amount of memory to use for the index build "
             "(the name of the option has historical reasons)",
         )
+        index_args["parser_buffer_size"] = arg(
+            "--parser-buffer-size",
+            type=str,
+            help="Each parser thread reads the input in batches of this size"
+            "; in parallel parsing, each batch that is not the last must be "
+            "large enough to contain the end of at least one statement",
+        )
         index_args["only_pso_and_pos_permutations"] = arg(
             "--only-pso-and-pos-permutations",
             action="store_true",
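
Taken together, the new setting flows from the Qleverfile (`PARSER_BUFFER_SIZE = 20M`) through this `arg()` definition into `index.py`, which appends `--parser-buffer-size 20M` to the index command. A small `argparse` sketch of that surface follows; the wiring is an illustration (the real CLI assembles its parser from these `arg()` definitions), and only the option name, type, and help text are taken from the diff:

    # Illustrative wiring only: how the new option surfaces on `qlever index`.
    import argparse

    parser = argparse.ArgumentParser(prog="qlever index")
    parser.add_argument(
        "--parser-buffer-size",
        type=str,
        help="Each parser thread reads the input in batches of this size; "
        "in parallel parsing, each batch that is not the last must be "
        "large enough to contain the end of at least one statement",
    )
    args = parser.parse_args(["--parser-buffer-size", "20M"])
    if args.parser_buffer_size:
        # Mirrors index.py above: the flag is only added when a value is set.
        print(f" --parser-buffer-size {args.parser_buffer_size}")
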