Update Qleverfile for OHM Planet #81

Open · wants to merge 3 commits into main
27 changes: 15 additions & 12 deletions src/qlever/Qleverfiles/Qleverfile.ohm-planet

@@ -1,33 +1,36 @@
-# Qleverfile for OHM Planet, use with https://github.com/ad-freiburg/qlever-control
+# Qleverfile for OpenHistoricalMap, use with the QLever CLI (`pip install qlever`)
 #
-# qlever get-data  # ~20 mins (download PBF, convert to TTL, add GeoSPARQL triples)
-# qlever index     # ~20 mins and ~5 GB RAM (on an AMD Ryzen 9 5900X)
-# qlever start     # ~1 sec
+# qlever get-data  # ~1 hour, ~14 GB (ttl.gz), ~3.4 B triples (with osm2rdf)
+# qlever index     # ~1 hour, ~10 GB RAM, ~60 GB index size on disk
+# qlever start     # a few seconds, adjust MEMORY_FOR_QUERIES as needed
 #
-# For `qlever get-data` to work, `osm2rdf` must be installed and in the `PATH`.
+# Measured on an AMD Ryzen 9 5900X with 128 GB RAM, and NVMe SSD (04.01.2025)
 
 [data]
 NAME = ohm-planet
 GET_DATA_URL = https://planet.openhistoricalmap.org/planet
 CHECK_BINARIES = osm2rdf -h > /dev/null || (echo "osm2rdf not found, make sure that it's installed and in your PATH" && exit 1)
-GET_DATA_CMD_1 = curl -LRfC - -o ${NAME}.pbf $$(curl -s ${GET_DATA_URL}/state.txt) 2>&1 | tee ${NAME}.download-log.txt
-GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --cache . --add-hascentroid 2>&1 | tee ${NAME}.osm2rdf-log.txt
-GET_DATA_CMD = set -o pipefail && ${CHECK_BINARIES} && ${GET_DATA_CMD_1} && echo && ${GET_DATA_CMD_2}
+GET_DATA_CMD_1 = unbuffer wget -O ${NAME}.pbf $$(curl -s ${GET_DATA_URL}/state.txt) 2>&1 | tee ${NAME}.download-log.txt
+GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --cache . --add-way-node-order --no-untagged-nodes-geometric-relations 2>&1 | tee ${NAME}.osm2rdf-log.txt
+GET_DATA_CMD = ${CHECK_BINARIES} && ${GET_DATA_CMD_1} && echo && ${GET_DATA_CMD_2}
 VERSION = $$(date -r ${NAME}.pbf +%d.%m.%Y || echo "NO_DATE")
 DESCRIPTION = OHM Planet, data from ${GET_DATA_URL} version ${VERSION} (with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)
 
 [index]
-INPUT_FILES = ${data:NAME}.ttl.bz2
-CAT_INPUT_FILES = bzcat -f ${INPUT_FILES}
-SETTINGS_JSON = { "prefixes-external": [""], "ascii-prefixes-only": false, "parallel-parsing": true, "num-triples-per-batch": 5000000 }
+INPUT_FILES = ${data:NAME}.ttl.bz2
+MULTI_INPUT_JSON = { "cmd": "lbzcat -n 4 ${INPUT_FILES}", "parallel": "true" }
+STXXL_MEMORY = 5G
+PARSER_BUFFER_SIZE = 20M
+SETTINGS_JSON = { "num-triples-per-batch": 5000000 }
 
 [server]
 PORT = 7037
 ACCESS_TOKEN = ${data:NAME}
 MEMORY_FOR_QUERIES = 10G
 CACHE_MAX_SIZE = 5G
 TIMEOUT = 600s
+CACHE_MAX_SIZE_SINGLE_ENTRY = 4G
-WARMUP_CMD = curl -s https://qlever.cs.uni-freiburg.de/mapui-petri/query --data-urlencode "query=PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX osm: <https://www.openstreetmap.org/> SELECT ?osm_id ?geometry WHERE { ?osm_id geo:hasGeometry/geo:asWKT ?geometry . ?osm_id rdf:type osm:node } LIMIT 1" --data-urlencode "backend=https://qlever.cs.uni-freiburg.de/api/${data:NAME}" > /dev/null
+WARMUP_CMD = curl -s https://qlever.cs.uni-freiburg.de/petrimaps/query --data-urlencode "query=PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX osm: <https://www.openstreetmap.org/> SELECT ?osm_id ?geometry WHERE { ?osm_id geo:hasGeometry/geo:asWKT ?geometry . ?osm_id rdf:type osm:node } LIMIT 1" --data-urlencode "backend=https://qlever.cs.uni-freiburg.de/api/${data:NAME}" > /dev/null
 
 [runtime]
 SYSTEM = docker
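
The biggest functional change in the `[index]` section is the switch from `CAT_INPUT_FILES` (a single `bzcat` stream) to `MULTI_INPUT_JSON`, which feeds the index builder from the output of an arbitrary command and marks the input as parseable in parallel; `lbzcat -n 4` decompresses the bzip2 input with four threads. The following is a minimal Python sketch of what that spec expresses; the hard-coded file name and the line-counting stand-in for the indexer are illustrative assumptions, not part of the PR:

    # Sketch: consume the MULTI_INPUT_JSON spec from the Qleverfile above.
    # Assumes `lbzcat` is on the PATH and `ohm-planet.ttl.bz2` exists locally.
    import json
    import shlex
    import subprocess

    spec = json.loads('{"cmd": "lbzcat -n 4 ohm-planet.ttl.bz2", "parallel": "true"}')

    # The index builder would read this stream; here we just count Turtle lines.
    with subprocess.Popen(shlex.split(spec["cmd"]), stdout=subprocess.PIPE) as proc:
        num_lines = sum(1 for _ in proc.stdout)
    print(f"{num_lines} lines decompressed, parallel parsing: {spec['parallel']}")
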
46 changes: 35 additions & 11 deletions src/qlever/commands/index.py

@@ -2,13 +2,17 @@
 
 import glob
 import json
-import shlex
 import re
+import shlex
 
 from qlever.command import QleverCommand
 from qlever.containerize import Containerize
 from qlever.log import log
-from qlever.util import get_existing_index_files, get_total_file_size, run_command
+from qlever.util import (
+    get_existing_index_files,
+    get_total_file_size,
+    run_command,
+)
 
 
 class IndexCommand(QleverCommand):
@@ -39,6 +43,7 @@ def relevant_qleverfile_arguments(self) -> dict[str : list[str]]:
                 "use_patterns",
                 "text_index",
                 "stxxl_memory",
+                "parser_buffer_size",
             ],
             "runtime": ["system", "image", "index_container"],
         }
@@ -76,7 +81,8 @@ def get_input_options_for_json(self, args) -> str:
         # Check that it is an array of length at least one.
         if not isinstance(input_specs, list):
             raise self.InvalidInputJson(
-                "`MULTI_INPUT_JSON` must be a JSON array", args.multi_input_json
+                "`MULTI_INPUT_JSON` must be a JSON array",
+                args.multi_input_json,
             )
         if len(input_specs) == 0:
             raise self.InvalidInputJson(
@@ -90,13 +96,15 @@ def get_input_options_for_json(self, args) -> str:
             # Check that `input_spec` is a dictionary.
             if not isinstance(input_spec, dict):
                 raise self.InvalidInputJson(
-                    f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " "object",
+                    f"Element {i} in `MULTI_INPUT_JSON` must be a JSON "
+                    "object",
                     input_spec,
                 )
             # For each `input_spec`, we must have a command.
             if "cmd" not in input_spec:
                 raise self.InvalidInputJson(
-                    f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`",
+                    f"Element {i} in `MULTI_INPUT_JSON` must contain a "
+                    "key `cmd`",
                     input_spec,
                 )
             # If the command contains a `{}` placeholder, we need a `for-each`
@@ -204,14 +212,23 @@ def execute(self, args) -> bool:
             index_cmd += " --only-pso-and-pos-permutations --no-patterns"
         if not args.use_patterns:
             index_cmd += " --no-patterns"
-        if args.text_index in ["from_text_records", "from_text_records_and_literals"]:
+        if args.text_index in [
+            "from_text_records",
+            "from_text_records_and_literals",
+        ]:
             index_cmd += (
-                f" -w {args.name}.wordsfile.tsv" f" -d {args.name}.docsfile.tsv"
+                f" -w {args.name}.wordsfile.tsv"
+                f" -d {args.name}.docsfile.tsv"
             )
-        if args.text_index in ["from_literals", "from_text_records_and_literals"]:
+        if args.text_index in [
+            "from_literals",
+            "from_text_records_and_literals",
+        ]:
             index_cmd += " --text-words-from-literals"
         if args.stxxl_memory:
             index_cmd += f" --stxxl-memory {args.stxxl_memory}"
+        if args.parser_buffer_size:
+            index_cmd += f" --parser-buffer-size {args.parser_buffer_size}"
         index_cmd += f" | tee {args.name}.index-log.txt"
 
         # If the total file size is larger than 10 GB, set ulimit (such that a
@@ -234,7 +251,8 @@ def execute(self, args) -> bool:
 
         # Command for writing the settings JSON to a file.
         settings_json_cmd = (
-            f"echo {shlex.quote(args.settings_json)} " f"> {args.name}.settings.json"
+            f"echo {shlex.quote(args.settings_json)} "
+            f"> {args.name}.settings.json"
         )
 
         # Show the command line.
@@ -279,9 +297,15 @@ def execute(self, args) -> bool:
             return False
 
         # Remove already existing container.
-        if args.system in Containerize.supported_systems() and args.overwrite_existing:
+        if (
+            args.system in Containerize.supported_systems()
+            and args.overwrite_existing
+        ):
             if Containerize.is_running(args.system, args.index_container):
-                log.info("Another index process is running, trying to stop " "it ...")
+                log.info(
+                    "Another index process is running, trying to stop "
+                    "it ..."
+                )
                 log.info("")
                 try:
                     run_command(f"{args.system} rm -f {args.index_container}")
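
For readers skimming this diff: the `MULTI_INPUT_JSON` checks in `get_input_options_for_json` boil down to requiring a non-empty JSON array of objects, each with a `cmd` key. Below is a condensed, self-contained sketch of just those checks; the function name is made up, `ValueError` stands in for the command's `InvalidInputJson`, the exact wording of the empty-array message is not visible in this diff, and the `{}`/`for-each` placeholder handling further down in the file is omitted:

    # Condensed sketch of the validation visible in this diff, not the real method.
    import json

    def check_multi_input_json(multi_input_json: str) -> list[dict]:
        input_specs = json.loads(multi_input_json)
        if not isinstance(input_specs, list):
            raise ValueError("`MULTI_INPUT_JSON` must be a JSON array")
        if len(input_specs) == 0:
            raise ValueError("`MULTI_INPUT_JSON` must not be empty")
        for i, input_spec in enumerate(input_specs):
            if not isinstance(input_spec, dict):
                raise ValueError(f"Element {i} in `MULTI_INPUT_JSON` must be a JSON object")
            if "cmd" not in input_spec:
                raise ValueError(f"Element {i} in `MULTI_INPUT_JSON` must contain a key `cmd`")
        return input_specs

    # The spec from the Qleverfile above, wrapped in an array:
    specs = check_multi_input_json('[{"cmd": "lbzcat -n 4 ohm-planet.ttl.bz2", "parallel": "true"}]')
    print(specs[0]["cmd"])
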
8 changes: 7 additions & 1 deletion src/qlever/qleverfile.py

@@ -113,10 +113,16 @@ def arg(*args, **kwargs):
         index_args["stxxl_memory"] = arg(
             "--stxxl-memory",
             type=str,
-            default="5G",
             help="The amount of memory to use for the index build "
             "(the name of the option has historical reasons)",
         )
+        index_args["parser_buffer_size"] = arg(
+            "--parser-buffer-size",
+            type=str,
+            help="Each parser thread reads the input in batches of this size"
+            "; in parallel parsing, each batch that is not the last must be "
+            "large enough to contain the end of at least one statement",
+        )
         index_args["only_pso_and_pos_permutations"] = arg(
             "--only-pso-and-pos-permutations",
             action="store_true",
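
Taken together, the new setting flows from the Qleverfile (`PARSER_BUFFER_SIZE = 20M`) through this `arg()` definition into `index.py`, which appends `--parser-buffer-size 20M` to the index command. A small `argparse` sketch of that surface follows; the wiring is an illustration (the real CLI assembles its parser from these `arg()` definitions), and only the option name, type, and help text are taken from the diff:

    # Illustrative wiring only: how the new option surfaces on `qlever index`.
    import argparse

    parser = argparse.ArgumentParser(prog="qlever index")
    parser.add_argument(
        "--parser-buffer-size",
        type=str,
        help="Each parser thread reads the input in batches of this size; "
        "in parallel parsing, each batch that is not the last must be "
        "large enough to contain the end of at least one statement",
    )
    args = parser.parse_args(["--parser-buffer-size", "20M"])
    if args.parser_buffer_size:
        # Mirrors index.py above: the flag is only added when a value is set.
        print(f" --parser-buffer-size {args.parser_buffer_size}")
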