Skip to content

Commit

Permalink
Merge branch 'ad-freiburg:main' into improve_code_for_stop
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonL22 authored Dec 23, 2024
2 parents 695e59c + c9894be commit 3df9c0b
Show file tree
Hide file tree
Showing 41 changed files with 2,367 additions and 659 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Run the unit-test suite with pytest on every push / PR against main.
# NOTE(review): indentation was reconstructed from a flattened scrape of
# this workflow; structure follows standard GitHub Actions conventions.
name: Unit Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  unit_tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted so YAML does not read 3.10 as the float 3.1.
        python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{matrix.python-version}}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # Install the package itself, then the test tooling.
          python -m pip install .
          pip install pytest pytest-cov
      - name: Test with pytest
        run: |
          pytest -v
51 changes: 51 additions & 0 deletions .github/workflows/qleverfiles-check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Smoke-test every shipped Qleverfile: install the qlever CLI from this
# checkout, then run each command in --show mode to verify the file parses.
# NOTE(review): indentation was reconstructed from a flattened scrape of
# this workflow; structure follows standard GitHub Actions conventions.
name: Qleverfiles check

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  merge_group:

jobs:
  qleverfiles-check:
    runs-on: ${{matrix.os}}
    strategy:
      fail-fast: true
      matrix:
        os: [ubuntu-24.04]

    steps:
      - name: Checkout the repository for the qlever script
        # NOTE(review): actions/checkout@v3 here vs @v4 in pytest.yml —
        # consider upgrading for consistency (v3 runs on an older Node).
        uses: actions/checkout@v3
        with:
          path: qlever-control

      - name: Install the script locally
        working-directory: ${{github.workspace}}/qlever-control
        run: |
          # python3 -m pip install --upgrade pip setuptools wheel
          # python3 --version
          # pip3 --version
          # pip3 show setuptools wheel
          pip install -e .

      - name: Check that all the files in `src/qlever/Qleverfiles` parse.
        working-directory: ${{github.workspace}}/qlever-control
        run: |
          export QLEVER_ARGCOMPLETE_ENABLED=1
          for QLEVERFILE in src/qlever/Qleverfiles/Qleverfile.*; do
            echo
            echo -e "\x1b[1;34mChecking ${QLEVERFILE}\x1b[0m"
            echo
            # Config name is the Qleverfile suffix (e.g. "dblp").
            NAME=${QLEVERFILE##*.}
            rm -f Qleverfile
            qlever setup-config $NAME
            # --show only prints the commands, so no data is downloaded.
            qlever get-data --show
            qlever index --show
            qlever start --show
            qlever ui --show
            echo
            echo -e "\x1b[34mAll checks passed for ${QLEVERFILE}\x1b[0m"
            echo
          done
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "qlever"
description = "Script for using the QLever SPARQL engine."
version = "0.5.6"
version = "0.5.14"
authors = [
{ name = "Hannah Bast", email = "[email protected]" }
]
Expand Down Expand Up @@ -35,3 +35,8 @@ package-data = { "qlever" = ["Qleverfiles/*"] }

[tool.pytest.ini_options]
pythonpath = ["src"]

[tool.ruff]
line-length = 79
[tool.ruff.lint]
extend-select = ["I"]
24 changes: 14 additions & 10 deletions src/qlever/Qleverfiles/Qleverfile.dblp
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
# Qleverfile for DBLP, use with https://github.com/ad-freiburg/qlever-control
# Qleverfile for DBLP, use with QLever CLI (`pip install qlever`)
#
# qlever get-data # takes ~3 mins (downloads .ttl.gz file of size ~3 GB)
# qlever index # takes ~4 mins (on an AMD Ryzen 9 5900X)
# qlever start # takes a few seconds
# qlever get-data # ~1 min, ~5 GB compressed, 1.3 B triples
# qlever index # ~30 min, ~20 GB RAM, ~25 GB index size on disk
# qlever start # ~3 s, adjust MEMORY_FOR_QUERIES as needed
#
# Measured on an AMD Ryzen 9 5950X with 128 GB RAM, and NVMe SSD (25.10.2024)

[data]
NAME = dblp
GET_DATA_URL = https://dblp.org/rdf/dblp.ttl.gz
GET_DATA_CMD = curl -LRC - -O ${GET_DATA_URL} 2>&1 | tee ${data:NAME}.download-log.txt
DATA_TARFILE = dblp_KG_with_associated_data.tar
GET_DATA_URL = https://sparql.dblp.org/download/${DATA_TARFILE}
GET_DATA_CMD = (curl -LROC - ${GET_DATA_URL} && tar -xf ${DATA_TARFILE}) 2>&1 | tee ${NAME}.download-log.txt && rm -f ${DATA_TARFILE}
VERSION = $$(date -r dblp.ttl.gz +"%d.%m.%Y %H:%M" || echo "NO_DATE")
DESCRIPTION = DBLP computer science bibliography, data from ${GET_DATA_URL} (version ${VERSION})
DESCRIPTION = DBLP computer science bibliography + citations from OpenCitations, data from ${GET_DATA_URL} (version ${VERSION})
FORMAT = ttl

[index]
INPUT_FILES = dblp.ttl.gz
CAT_INPUT_FILES = zcat ${INPUT_FILES}
SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }
INPUT_FILES = *.gz
MULTI_INPUT_JSON = { "cmd": "zcat {}", "for-each": "*.gz" }
SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 5000000, "prefixes-external": [""] }

[server]
PORT = 7015
Expand Down
4 changes: 2 additions & 2 deletions src/qlever/Qleverfiles/Qleverfile.dblp-plus
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@
[data]
NAME = dblp-plus
GET_DATA_CMD = wget -nc -O dblp.ttl.gz https://dblp.org/rdf/dblp.ttl.gz
INDEX_DESCRIPTION = Publication data from https://dblp.org, with affiliations from https://www.wikidata.org and citations from https://opencitations.net
DESCRIPTION = Publication data from https://dblp.org, with affiliations from https://www.wikidata.org and citations from https://opencitations.net
TEXT_DESCRIPTION = All literals, search with FILTER KEYWORDS(?text, "...")

[index]
INPUT_FILES = dblp.ttl.gz affiliations.nt affiliations.additions.nt citations.nt
CAT_INPUT_FILES = zcat -f ${RDF_FILES}
CAT_INPUT_FILES = zcat -f ${INPUT_FILES}
SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [ "<https://w3id.org", "<https://doi.org", "<http://dx.doi.org" ] }
TEXT_INDEX = from_literals

Expand Down
2 changes: 1 addition & 1 deletion src/qlever/Qleverfiles/Qleverfile.default
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ SETTINGS_JSON = { "num-triples-per-batch": 1000000 }
# URL parameter `access_token`. It should not be easily guessable, unless you
# don't mind others to get privileged access to your server.
[server]
PORT =
PORT = 8888
ACCESS_TOKEN =

# Use SYSTEM = docker to run QLever inside a docker container; the Docker image
Expand Down
8 changes: 4 additions & 4 deletions src/qlever/Qleverfiles/Qleverfile.fbeasy
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ TEXT_DESCRIPTION = Sentences from Wikipedia that mention at least one Freebase

[index]
INPUT_FILES = fbeasy.nt
CAT_INPUT_FILES = cat ${RDF_FILES}
CAT_INPUT_FILES = cat ${INPUT_FILES}
SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 10000000 }

[server]
PORT = 7003
ACCESS_TOKEN = ${data:NAME}_12631403
MEMORY_FOR_QUERIES = 5G
PORT = 7003
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 5G

[runtime]
SYSTEM = docker
Expand Down
4 changes: 2 additions & 2 deletions src/qlever/Qleverfiles/Qleverfile.freebase
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ DESCRIPTION = RDF data from ${DATA_URL}, latest (and final) version from 09.08.

[index]
INPUT_FILES = freebase-rdf-latest.gz
CAT_INPUT_FILES = zcat ${RDF_FILES}
CAT_INPUT_FILES = zcat ${INPUT_FILES}
SETTINGS_JSON = { "languages-internal": [ "en" ], "prefixes-external": ["<"], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": false, "num-triples-per-batch": 10000000 }

[server]
PORT = 7002
ACCESS_TOKEN = ${data:NAME}_12631403
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 10G

[runtime]
Expand Down
2 changes: 1 addition & 1 deletion src/qlever/Qleverfiles/Qleverfile.imdb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 5G

[runtime]
SYSTEM = native
SYSTEM = docker
IMAGE = docker.io/adfreiburg/qlever:latest

[ui]
Expand Down
30 changes: 30 additions & 0 deletions src/qlever/Qleverfiles/Qleverfile.orkg
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Qleverfile for ORKG, use with the QLever CLI (`pip install qlever`)
#
# qlever get-data # Get the dataset
# qlever index # Build index data structures
# qlever start # Start the server

# Dataset download: a single Turtle dump fetched from the ORKG API.
[data]
NAME = orkg
GET_DATA_URL = https://orkg.org/api/rdf/dump
# curl -LR follows redirects and preserves the remote timestamp, which
# VERSION below reads back via `date -r`; falls back to NO_DATE if absent.
GET_DATA_CMD = curl -LR -o ${NAME}.ttl ${GET_DATA_URL} 2>&1 | tee ${NAME}.download-log.txt
VERSION = $$(date -r ${NAME}.ttl +%d.%m.%Y || echo "NO_DATE")
DESCRIPTION = The Open Research Knowledge Graph (ORKG) (data from ${GET_DATA_URL}, version ${VERSION})

# Index build: uncompressed Turtle, so a plain `cat` feeds the indexer.
[index]
INPUT_FILES = ${data:NAME}.ttl
CAT_INPUT_FILES = cat ${INPUT_FILES}
SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] }

# Server settings; ACCESS_TOKEN grants privileged access, so NAME alone
# is guessable — presumably fine for a local/demo setup, verify for prod.
[server]
PORT = 7053
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 10G
CACHE_MAX_SIZE = 5G

# Run QLever inside Docker using the official image.
[runtime]
SYSTEM = docker
IMAGE = docker.io/adfreiburg/qlever:latest

# QLever UI configuration preset to use for this dataset.
[ui]
UI_CONFIG = orkg
2 changes: 1 addition & 1 deletion src/qlever/Qleverfiles/Qleverfile.osm-planet
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
NAME = osm-planet
DATA_URL = https://osm2rdf.cs.uni-freiburg.de/ttl/planet.osm.ttl.bz2
GET_DATA_CMD = curl --location --fail --continue-at - --remote-time --output ${NAME}.ttl.bz2 ${DATA_URL}
VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y")
VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y" || echo "NO_DATE")
DESCRIPTION = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)

[index]
Expand Down
Loading

0 comments on commit 3df9c0b

Please sign in to comment.