diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 00000000..e902cf3c --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,29 @@ +name: Unit Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + unit_tests: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["pypy3.9", "pypy3.10", "3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{matrix.python-version}} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install . + pip install pytest pytest-cov + - name: Test with pytest + run: | + pytest -v diff --git a/.github/workflows/qleverfiles-check.yml b/.github/workflows/qleverfiles-check.yml index 528beb14..4cda5f8d 100644 --- a/.github/workflows/qleverfiles-check.yml +++ b/.github/workflows/qleverfiles-check.yml @@ -33,6 +33,7 @@ jobs: - name: Check that all the files in `src/qlever/Qleverfiles` parse. working-directory: ${{github.workspace}}/qlever-control run: | + export QLEVER_ARGCOMPLETE_ENABLED=1 for QLEVERFILE in src/qlever/Qleverfiles/Qleverfile.*; do echo echo -e "\x1b[1;34mChecking ${QLEVERFILE}\x1b[0m" diff --git a/pyproject.toml b/pyproject.toml index 2a1f7e95..5987f4f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "qlever" description = "Script for using the QLever SPARQL engine." -version = "0.5.8" +version = "0.5.14" authors = [ { name = "Hannah Bast", email = "bast@cs.uni-freiburg.de" } ] @@ -35,3 +35,8 @@ package-data = { "qlever" = ["Qleverfiles/*"] } [tool.pytest.ini_options] pythonpath = ["src"] + +[tool.ruff] +line-length = 79 +[tool.ruff.lint] +extend-select = ["I"] diff --git a/src/qlever/Qleverfiles/Qleverfile.dblp b/src/qlever/Qleverfiles/Qleverfile.dblp index 8a0e7f5d..574d1192 100644 --- a/src/qlever/Qleverfiles/Qleverfile.dblp +++ b/src/qlever/Qleverfiles/Qleverfile.dblp @@ -17,7 +17,7 @@ FORMAT = ttl [index] INPUT_FILES = *.gz -MULTI_INPUT_JSON = $$(ls *.gz | awk 'BEGIN { printf "[ " } NR > 1 { printf ", " } { printf "{\"cmd\": \"zcat " $$0 "\"}" } END { printf "]" }') +MULTI_INPUT_JSON = { "cmd": "zcat {}", "for-each": "*.gz" } SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 5000000, "prefixes-external": [""] } [server] diff --git a/src/qlever/Qleverfiles/Qleverfile.orkg b/src/qlever/Qleverfiles/Qleverfile.orkg new file mode 100644 index 00000000..dcfb7fe2 --- /dev/null +++ b/src/qlever/Qleverfiles/Qleverfile.orkg @@ -0,0 +1,30 @@ +# Qleverfile for ORKG, use with the QLever CLI (`pip install qlever`) +# +# qlever get-data # Get the dataset +# qlever index # Build index data structures +# qlever start # Start the server + +[data] +NAME = orkg +GET_DATA_URL = https://orkg.org/api/rdf/dump +GET_DATA_CMD = curl -LR -o ${NAME}.ttl ${GET_DATA_URL} 2>&1 | tee ${NAME}.download-log.txt +VERSION = $$(date -r ${NAME}.ttl +%d.%m.%Y || echo "NO_DATE") +DESCRIPTION = The Open Research Knowledge Graph (ORKG) (data from ${GET_DATA_URL}, version ${VERSION}) + +[index] +INPUT_FILES = ${data:NAME}.ttl +CAT_INPUT_FILES = cat ${INPUT_FILES} +SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000, "prefixes-external": [""] } + +[server] +PORT = 7053 +ACCESS_TOKEN = ${data:NAME} +MEMORY_FOR_QUERIES = 10G +CACHE_MAX_SIZE = 5G + +[runtime] +SYSTEM = docker +IMAGE = docker.io/adfreiburg/qlever:latest + +[ui] 
+UI_CONFIG = orkg diff --git a/src/qlever/Qleverfiles/Qleverfile.pubchem b/src/qlever/Qleverfiles/Qleverfile.pubchem index 098257db..25aef1ca 100644 --- a/src/qlever/Qleverfiles/Qleverfile.pubchem +++ b/src/qlever/Qleverfiles/Qleverfile.pubchem @@ -1,51 +1,127 @@ -# Qleverfile for PubChem, use with https://github.com/ad-freiburg/qlever-control +# Qleverfile for PubChem, use with the QLever CLI (`pip install qlever`) # -# Resource requirements (as of 18.08.2024, on an AMD Ryzen 9 5900X): -# -# qlever get-data # ~2 hours, ~150 GB, ~19 billion triples -# qlever index # ~7 hours, ~20 GB RAM, ~400 GB disk space +# qlever get-data # ~2 hours, ~120 GB, ~19 billion triples +# qlever index # ~6 hours, ~20 GB RAM, ~350 GB disk space (for the index) # qlever start # a few seconds # -# NOTE 1: `get-data` does not only download the PubChem RDF data, but also +# Measured on an AMD Ryzen 9 7950X with 128 GB RAM, and NVMe SSD (17.12.2024) + +# NOTE 1: `qlever get-data` downloads not only the PubChem RDF data, but also # a number of ontologies. These are very useful to obtain names for IRIs like # `sio:SIO_000008` or `obo:IAO_0000412` (otherwise very hard to understand). -# The ontologies BAO and NDF-RT are infrequently updated, for latest versions, +# The ontologies BAO and NDF-RT are occasionally updated; for the latest versions, # see the download links at https://bioportal.bioontology.org/ontologies/BAO # and https://bioportal.bioontology.org/ontologies/NDF-RT . -# -# NOTE 2: Many of the TTL files have generic prefix definitions in the middle -# of the file, like @prefix ns23: . -# See https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1197113953 -# This is allowed by the standard, but unusual. For use with QLever, we -# therefore convert the TTL files to NT when downloading them. # -# NOTE 3: The PubChem data contains several invalid IRIs, in particular, -# containing spaces. The previous version of this Qleverfile used a combination -# of `sed` and `awk` to fix this. In the meantime, QLever's default is to warn -# about such IRIs while indexing, but accept them anyway. +# NOTE 2: The `MULTI_INPUT_JSON` uses `zcat` to combine selected files into +# one input stream, because there are so many files that the command line +# built by `qlever index` would otherwise be too long. [data] NAME = pubchem GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF +ONTOLOGIES_DIR = RDF.ontologies +PUBCHEM_DIR = RDF.pubchem +ONTOLOGIES_CSV = ontologies.csv CHECK_REQUIREMENTS = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done -MAKE_GET_DATA_CMD_1 = DIR=DATA.ontologies && mkdir -p $$DIR && cat $$DIR/ontologies.csv | while IFS=',' read -r DESC FILE URL; do ERRFILE=$${FILE%.*}.jena-stderr; echo "echo \"Processing $$URL ($$FILE) ...\" && curl -sLRo $$DIR/$$FILE \"$$URL\" && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$DIR/$$FILE 2> $$DIR/$$ERRFILE | gzip -c > $$DIR/$${FILE%.*}.nt.gz && rm -f $$DIR/$$FILE && if [ !
-s $$DIR/$$ERRFILE ]; then rm -f $$DIR/$$ERRFILE; fi || echo \"ERROR processing $$URL ($$FILE)\""; done > pubchem.get-data-cmds.txt -MAKE_GET_DATA_CMD_2 = DIR=DATA.pubchem && mkdir -p $$DIR && curl -LRO ${GET_DATA_URL}/void.ttl && grep -oP '${GET_DATA_URL}/.*?\.ttl\.gz' void.ttl | while read URL; do FILE=$$(basename $$URL); echo "echo \"Processing $$URL ...\" && curl -sLRo $$DIR/$$FILE \"$$URL\" && docker run -i --rm -v $$(pwd):/data stain/jena turtle --output=NT /data/$$DIR/$$FILE | gzip -c > $$DIR/$${FILE%%.*}.nt.gz && rm -f $$DIR/$$FILE || echo \"ERROR processing $$URL\""; done >> pubchem.get-data-cmds.txt -GET_DATA_CMD = ${CHECK_REQUIREMENTS} && ${MAKE_GET_DATA_CMD_1} && ${MAKE_GET_DATA_CMD_2} && cat pubchem.get-data-cmds.txt | parallel --line-buffer 2>&1 | tee pubchem.get-data-log.txt +GET_DATA_CMD_1 = mkdir -p ${ONTOLOGIES_DIR} && cd ${ONTOLOGIES_DIR} && cat ${ONTOLOGIES_CSV} | parallel --colsep "," 'FILE={2} && URL={3} && ERRFILE=$${FILE%.*}.jena-stderr; echo "Processing $$URL ($$FILE) ..." && curl -sLRo $$FILE $$URL && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$FILE 2> $$ERRFILE | gzip -c > $${FILE%.*}.nt.gz && rm -f $$FILE; if [ -s $$ERRFILE ]; then grep -q "ERROR *riot" $$ERRFILE && echo "riot ERRORs in $$FILE, check $$ERRFILE"; else rm $$ERRFILE; fi' +GET_DATA_CMD_2 = mkdir -p ${PUBCHEM_DIR} && wget -r -nv -nH --cut-dirs=2 --no-parent -P ${PUBCHEM_DIR} ${GET_DATA_URL} +GET_DATA_CMD = ${CHECK_REQUIREMENTS} && ${GET_DATA_CMD_1} 2>&1 | tee pubchem.get-data-log.txt; ${GET_DATA_CMD_2} 2>&1 | tee -a pubchem.get-data-log.txt VERSION = $$(date -r void.ttl +%d.%m.%Y || echo "NO_DATE") DESCRIPTION = PubChem RDF from ${GET_DATA_URL} (version ${VERSION}) + associated ontologies (bao, bfo, biopax-level3, chebi, cheminf, cito, dublin_core_terms, fabio, go, iao, ncit, obi, pr, ro, sio, skos, so, uo) -MAKE_ONTOLOGIES_CSV = $$(mkdir -p DATA.ontologies && echo "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\n BioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\n CHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\n ChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\n CiTO,cito.nt,http://purl.org/spar/cito.nt\n DCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\n FaBiO,fabio.nt,http://purl.org/spar/fabio.nt\n GO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\n IAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\n NCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\n NDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\n OBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\n OWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\n PDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\n PR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\n RDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\n RDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\n RO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\n SIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\n 
SKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\n SO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\n UO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl" > DATA.ontologies/ontologies.csv) +MAKE_ONTOLOGIES_CSV = $$(mkdir -p ${ONTOLOGIES_DIR} && echo "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\nBioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\nCHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\nChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\nCiTO,cito.nt,http://purl.org/spar/cito.nt\nDCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\nFaBiO,fabio.nt,http://purl.org/spar/fabio.nt\nGO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\nIAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\nNCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\nNDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nOBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\nOWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\nPDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\nPR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\nRDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\nRDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\nRO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\nSIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\nSKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\nSO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\nUO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl" > ${ONTOLOGIES_DIR}/${ONTOLOGIES_CSV}) [index] -INPUT_FILES = DATA.ontologies/*.nt.gz DATA.pubchem/*.nt.gz -CAT_INPUT_FILES = zcat ${INPUT_FILES} -SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 } -STXXL_MEMORY = 10G +INPUT_FILES = ${data:ONTOLOGIES_DIR}/*.nt.gz ${data:PUBCHEM_DIR}/*/*.ttl.gz ${data:PUBCHEM_DIR}/*/*/*.ttl.gz +BASE_URL = http://rdf.ncbi.nlm.nih.gov/pubchem +MULTI_INPUT_JSON = [{ "cmd": "zcat ${data:ONTOLOGIES_DIR}/*.nt.gz", "graph": "${BASE_URL}/ruleset"}, + { "cmd": "zcat {}", "graph": "${BASE_URL}/anatomy", "for-each": "${data:PUBCHEM_DIR}/anatomy/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/author", "for-each": "${data:PUBCHEM_DIR}/author/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/bioassay", "for-each": "${data:PUBCHEM_DIR}/bioassay/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/book", "for-each": "${data:PUBCHEM_DIR}/book/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/cell", "for-each": "${data:PUBCHEM_DIR}/cell/*.ttl.gz" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*0.ttl.gz", "graph": "${BASE_URL}/compound" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*1.ttl.gz", "graph": "${BASE_URL}/compound" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*2.ttl.gz", "graph": "${BASE_URL}/compound" }, + { 
"cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*3.ttl.gz", "graph": "${BASE_URL}/compound" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*4.ttl.gz", "graph": "${BASE_URL}/compound" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*5.ttl.gz", "graph": "${BASE_URL}/compound" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*6.ttl.gz", "graph": "${BASE_URL}/compound" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*7.ttl.gz", "graph": "${BASE_URL}/compound" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*8.ttl.gz", "graph": "${BASE_URL}/compound" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*9.ttl.gz", "graph": "${BASE_URL}/compound" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/compound", "for-each": "${data:PUBCHEM_DIR}/compound/general/*[!0-9].ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/concept", "for-each": "${data:PUBCHEM_DIR}/concept/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/conserveddomain", "for-each": "${data:PUBCHEM_DIR}/conserveddomain/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/cooccurrence", "for-each": "${data:PUBCHEM_DIR}/cooccurrence/*.ttl.gz" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*0.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*1.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*2.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*3.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*4.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*5.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*6.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*7.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*8.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*9.ttl.gz", "graph": "${BASE_URL}/descriptor" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/descriptor", "for-each": "${data:PUBCHEM_DIR}/descriptor/compound/*[!0-9].ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/descriptor", "for-each": "${data:PUBCHEM_DIR}/descriptor/substance/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/disease", "for-each": "${data:PUBCHEM_DIR}/disease/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/endpoint", "for-each": "${data:PUBCHEM_DIR}/endpoint/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/gene", "for-each": "${data:PUBCHEM_DIR}/gene/*.ttl.gz"}, + { "cmd": "zcat {}", "graph": "${BASE_URL}/grant", "for-each": "${data:PUBCHEM_DIR}/grant/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/inchikey", "for-each": "${data:PUBCHEM_DIR}/inchikey/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/journal", "for-each": "${data:PUBCHEM_DIR}/journal/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/measuregroup", "for-each": "${data:PUBCHEM_DIR}/measuregroup/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/organization", "for-each": "${data:PUBCHEM_DIR}/organization/*.ttl.gz" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*0.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*1.ttl.gz", "graph": "${BASE_URL}/patent" }, + { 
"cmd": "zcat ${data:PUBCHEM_DIR}/patent/*2.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*3.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*4.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*5.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*6.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*7.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*8.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*9.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/cpc/*.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/ipc/*.ttl.gz", "graph": "${BASE_URL}/patent" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/patent", "for-each": "${data:PUBCHEM_DIR}/patent/*[!0-9].ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/pathway", "for-each": "${data:PUBCHEM_DIR}/pathway/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/protein", "for-each": "${data:PUBCHEM_DIR}/protein/*.ttl.gz" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*0.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*1.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*2.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*3.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*4.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*5.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*6.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*7.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*8.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*9.ttl.gz", "graph": "${BASE_URL}/reference" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/reference", "for-each": "${data:PUBCHEM_DIR}/reference/*[!0-9].ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/source", "for-each": "${data:PUBCHEM_DIR}/source/*.ttl.gz" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*0.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*1.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*2.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*3.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*4.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*5.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*6.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*7.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*8.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat ${data:PUBCHEM_DIR}/substance/*9.ttl.gz", "graph": "${BASE_URL}/substance" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/substance", "for-each": "${data:PUBCHEM_DIR}/substance/*[!0-9].ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/synonym", "for-each": 
"${data:PUBCHEM_DIR}/synonym/*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "${BASE_URL}/taxonomy", "for-each": "${data:PUBCHEM_DIR}/taxonomy/*.ttl.gz" }] +SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "num-triples-per-batch": 10000000 } +STXXL_MEMORY = 20G [server] PORT = 7023 ACCESS_TOKEN = ${data:NAME} MEMORY_FOR_QUERIES = 20G -TIMEOUT = 120s +TIMEOUT = 600s [runtime] SYSTEM = docker diff --git a/src/qlever/Qleverfiles/Qleverfile.uniprot b/src/qlever/Qleverfiles/Qleverfile.uniprot index 74169406..cddffe0f 100644 --- a/src/qlever/Qleverfiles/Qleverfile.uniprot +++ b/src/qlever/Qleverfiles/Qleverfile.uniprot @@ -1,30 +1,62 @@ # Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control # -# qlever get-data # takes ~ 30 hours and ~ 2 TB of disk (for the NT files) -# qlever index # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 5900X) +# qlever get-data # takes ~ 30 hours and ~ 1.6 TB of disk (for the TTL files) +# qlever index # takes ~ 40 hours and ~ 60 GB RAM (on an AMD Ryzen 9 9950X) # qlever start # starts the server (takes a few seconds) # -# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv +# Install packages: sudo apt install -y libxml2-utils parallel xz-utils wget # Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries) # # Set DATE to the date of the latest release. Build on SSD (requires ~ 7 TB -# during build, ~ 3 TB after build). The uniprot.index.???.meta files can be on -# HDD without significant performance loss (when running the server). +# during build, ~ 3 TB after build). [data] -NAME = uniprot -DATE = 2024-05-29 -DOWNLOAD_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf -GET_RDFXML_CMD = mkdir -p rdf.${DATE} && curl -s ${DOWNLOAD_URL}/RELEASE.meta4 | sed "s//" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" - | while read URL; do wget --no-verbose -P rdf.${DATE} $$URL 2>&1 | tee -a uniprot.download-log; done -RDFXML2NT_CMD = mkdir -p nt.${DATE} && for RDFXML in rdf.${DATE}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=nt 2> /dev/null | gzip -c > nt.${DATE}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/nt.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel -GET_DATA_CMD = rdfxml --help && date > ${NAME}.get-data.begin-date && ${GET_RDFXML_CMD} && ${RDFXML2NT_CMD} && date > ${NAME}.get-data.end-date -DESCRIPTION = Complete UniProt data from ${DOWNLOAD_URL}, version ${DATE} +NAME = uniprot +DATE = 2024-11-27 +RDFXML_DIR = rdf.${DATE} +TTL_DIR = ttl.${DATE} +UNIPROT_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf +RHEA_URL = https://ftp.expasy.org/databases/rhea/rdf +EXAMPLES_URL = https://github.com/sib-swiss/sparql-examples +GET_EXAMPLES_CMD = mkdir -p ${TTL_DIR} && git clone ${EXAMPLES_URL} && (cd sparql-examples && ./convertToOneTurtle.sh -p uniprot && gzip examples_uniprot.ttl && mv -f examples_uniprot.ttl.gz ../${TTL_DIR} && cd .. 
&& rm -rf sparql-examples) +GET_RDFXML_CMD = mkdir -p ${RDFXML_DIR} && (echo "${RHEA_URL}/chebi.owl.gz"; echo "${RHEA_URL}/rhea.rdf.gz"; curl -s ${UNIPROT_URL}/RELEASE.meta4 | sed "s//" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" -) | while read URL; do wget --no-verbose -P ${RDFXML_DIR} $$URL 2>&1 | tee -a uniprot.download-log; done +RDFXML2TTL_CMD = mkdir -p ${TTL_DIR} && for RDFXML in ${RDFXML_DIR}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=ttl -q 2> ${TTL_DIR}/$$(basename $$RDFXML).stderr | gzip -c > ${TTL_DIR}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/ttl.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel +GET_DATA_CMD = date > ${NAME}.get-data.begin-date && ${GET_EXAMPLES_CMD} && ${GET_RDFXML_CMD} && ${RDFXML2TTL_CMD} && date > ${NAME}.get-data.end-date +DESCRIPTION = Complete UniProt data from ${UNIPROT_URL}, with additional data from ${RHEA_URL} and ${EXAMPLES_URL} [index] -INPUT_FILES = nt.${data:DATE}/*.nt.gz -CAT_INPUT_FILES = parallel --tmpdir . -j 4 'zcat -f {}' ::: ${INPUT_FILES} | pv -q -B 5G -SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 } -STXXL_MEMORY = 60G +INPUT_FILES = ${data:TTL_DIR}/*.ttl.gz +MULTI_INPUT_JSON = [{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_reviewed_*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniprot", "for-each": "${data:TTL_DIR}/uniprotkb_unreviewed_*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniparc", "for-each": "${data:TTL_DIR}/uniparc_*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/uniref", "for-each": "${data:TTL_DIR}/uniref*.ttl.gz" }, + { "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/obsolete", "for-each": "${data:TTL_DIR}/uniprotkb_obsolete_*.ttl.gz" }, + { "cmd": "zcat ${data:TTL_DIR}/chebi.ttl.gz", "graph": "http://sparql.uniprot.org/chebi" }, + { "cmd": "zcat ${data:TTL_DIR}/citation_mapping.ttl.gz", "graph": "http://sparql.uniprot.org/citationmapping" }, + { "cmd": "zcat ${data:TTL_DIR}/citations.ttl.gz", "graph": "http://sparql.uniprot.org/citations" }, + { "cmd": "zcat ${data:TTL_DIR}/databases.ttl.gz", "graph": "http://sparql.uniprot.org/databases" }, + { "cmd": "zcat ${data:TTL_DIR}/diseases.ttl.gz", "graph": "http://sparql.uniprot.org/diseases" }, + { "cmd": "zcat ${data:TTL_DIR}/enzyme-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" }, + { "cmd": "zcat ${data:TTL_DIR}/enzyme.ttl.gz", "graph": "http://sparql.uniprot.org/enzymes" }, + { "cmd": "zcat ${data:TTL_DIR}/go-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/go" }, + { "cmd": "zcat ${data:TTL_DIR}/go.ttl.gz", "graph": "http://sparql.uniprot.org/go" }, + { "cmd": "zcat ${data:TTL_DIR}/journals.ttl.gz", "graph": "http://sparql.uniprot.org/journal" }, + { "cmd": "zcat ${data:TTL_DIR}/keywords-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" }, + { "cmd": "zcat ${data:TTL_DIR}/keywords.ttl.gz", "graph": "http://sparql.uniprot.org/keywords" }, + { "cmd": "zcat ${data:TTL_DIR}/locations-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/locations" }, + { "cmd": "zcat ${data:TTL_DIR}/locations.ttl.gz", "graph": "http://sparql.uniprot.org/locations" }, + { "cmd": "zcat ${data:TTL_DIR}/pathways-hierarchy*.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" 
}, + { "cmd": "zcat ${data:TTL_DIR}/pathways.ttl.gz", "graph": "http://sparql.uniprot.org/pathways" }, + { "cmd": "zcat ${data:TTL_DIR}/proteomes.ttl.gz", "graph": "http://sparql.uniprot.org/proteomes" }, + { "cmd": "zcat ${data:TTL_DIR}/taxonomy-hierarchy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" }, + { "cmd": "zcat ${data:TTL_DIR}/taxonomy.ttl.gz", "graph": "http://sparql.uniprot.org/taxonomy" }, + { "cmd": "zcat ${data:TTL_DIR}/tissues.ttl.gz", "graph": "http://sparql.uniprot.org/tissues" }, + { "cmd": "zcat ${data:TTL_DIR}/rhea.ttl.gz", "graph": "https://sparql.rhea-db.org/rhea" }, + { "cmd": "zcat ${data:TTL_DIR}/examples_uniprot.ttl.gz", "graph": "http://sparql.uniprot.org/.well-known/sparql-examples" }, + { "cmd": "zcat ${data:TTL_DIR}/core.ttl.gz", "graph": "http://purl.uniprot.org/core" }, + { "cmd": "zcat ${data:TTL_DIR}/void.ttl.gz", "graph": "http://rdfs.org/ns/void" }] +SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 } +STXXL_MEMORY = 60G [server] PORT = 7018 diff --git a/src/qlever/Qleverfiles/Qleverfile.wikidata b/src/qlever/Qleverfiles/Qleverfile.wikidata index 2924090a..e6ec6f6c 100644 --- a/src/qlever/Qleverfiles/Qleverfile.wikidata +++ b/src/qlever/Qleverfiles/Qleverfile.wikidata @@ -13,29 +13,26 @@ NAME = wikidata [data] GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities -GET_DATA_CMD = curl -LROC - ${GET_DATA_URL}/latest-all.ttl.bz2 ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt +GET_DATA_CMD = curl -LRC - -O ${GET_DATA_URL}/latest-all.ttl.bz2 -O ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt DATE_WIKIDATA = $$(date -r latest-all.ttl.bz2 +%d.%m.%Y || echo "NO_DATE") DATE_WIKIPEDIA = $$(date -r wikipedia-abstracts.nt +%d.%m.%Y || echo "NO_DATE") -DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA}) + English Wikipeda abstracts (version ${DATE_WIKIPEDIA}, available via schema:description) -TEXT_DESCRIPTION = All English and German literals + all sentences from the English Wikipedia (version ${DATE_WIKIPEDIA}), use with FILTER KEYWORDS(...) 
+DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA}) [index] -INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 wikipedia-abstracts.nt dcatap.nt +INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 dcatap.nt MULTI_INPUT_JSON = [{ "cmd": "lbzcat -n 4 latest-all.ttl.bz2", "format": "ttl", "parallel": "true" }, { "cmd": "lbzcat -n 1 latest-lexemes.ttl.bz2", "format": "ttl", "parallel": "false" }, - { "cmd": "cat wikipedia-abstracts.nt", "format": "nt", "parallel": "false" }, { "cmd": "cat dcatap.nt", "format": "nt", "parallel": "false" }] SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 5000000 } STXXL_MEMORY = 10G -TEXT_INDEX = from_text_records [server] PORT = 7001 -ACCESS_TOKEN = ${data:NAME}_3fz47hfzrbf64b -MEMORY_FOR_QUERIES = 40G -CACHE_MAX_SIZE = 30G +ACCESS_TOKEN = ${data:NAME} +MEMORY_FOR_QUERIES = 20G +CACHE_MAX_SIZE = 15G CACHE_MAX_SIZE_SINGLE_ENTRY = 5G -TIMEOUT = 300s +TIMEOUT = 600s [runtime] SYSTEM = docker diff --git a/src/qlever/commands/add_text_index.py b/src/qlever/commands/add_text_index.py index 15fa647b..f2401878 100644 --- a/src/qlever/commands/add_text_index.py +++ b/src/qlever/commands/add_text_index.py @@ -64,7 +64,7 @@ def execute(self, args) -> bool: # Show the command line. self.show(add_text_index_cmd, only_show=args.show) if args.show: - return False + return True # When running natively, check if the binary exists and works. if args.system == "native": @@ -74,6 +74,7 @@ def execute(self, args) -> bool: log.error(f"Running \"{args.index_binary}\" failed ({e}), " f"set `--index-binary` to a different binary or " f"use `--container_system`") + return False # Check if text index files already exist. existing_text_index_files = get_existing_index_files( diff --git a/src/qlever/commands/cache_stats.py b/src/qlever/commands/cache_stats.py index 28eb6e58..5b618a98 100644 --- a/src/qlever/commands/cache_stats.py +++ b/src/qlever/commands/cache_stats.py @@ -47,7 +47,7 @@ def execute(self, args) -> bool: self.show("\n".join([cache_stats_cmd, cache_settings_cmd]), only_show=args.show) if args.show: - return False + return True # Execute them. try: diff --git a/src/qlever/commands/clear_cache.py b/src/qlever/commands/clear_cache.py index 448aeb39..8732120c 100644 --- a/src/qlever/commands/clear_cache.py +++ b/src/qlever/commands/clear_cache.py @@ -48,7 +48,7 @@ def execute(self, args) -> bool: f"\"{args.access_token}\"") self.show(clear_cache_cmd, only_show=args.show) if args.show: - return False + return True # Execute the command. try: @@ -76,5 +76,7 @@ def execute(self, args) -> bool: # Show cache stats. 
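The recurring `- return False` / `+ return True` change in the `if args.show:` branches above (and in the commands further below) makes `--show` count as success rather than failure. A minimal sketch of the convention, using a hypothetical free-standing `execute` rather than qlever's actual `QleverCommand` API:

```python
def execute(cmd: str, show: bool) -> bool:
    # Always print the command line that would be run.
    print(cmd)
    if show:
        # With `--show`, stop after printing; this now counts as success,
        # so callers that check the return value (like `example-queries`
        # checking `ClearCacheCommand().execute(...)`) behave correctly.
        return True
    # ... otherwise actually run `cmd` and report whether it succeeded ...
    return True
```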
log.info("") args.detailed = False - CacheStatsCommand().execute(args) + if not CacheStatsCommand().execute(args): + log.error("Clearing the cache was successful, but showing the " + "cache stats failed {e}") return True diff --git a/src/qlever/commands/example_queries.py b/src/qlever/commands/example_queries.py index c70ded38..d5e4915b 100644 --- a/src/qlever/commands/example_queries.py +++ b/src/qlever/commands/example_queries.py @@ -21,10 +21,7 @@ class ExampleQueriesCommand(QleverCommand): """ def __init__(self): - self.presets = { - "virtuoso-wikidata": "https://wikidata.demo.openlinksw.com/sparql", - "qlever-wikidata": "https://qlever.cs.uni-freiburg.de/api/wikidata", - } + pass def description(self) -> str: return "Show how much of the cache is currently being used" @@ -41,8 +38,15 @@ def additional_arguments(self, subparser) -> None: ) subparser.add_argument( "--sparql-endpoint-preset", - choices=self.presets.keys(), - help="Shortcut for setting the SPARQL endpoint", + choices=[ + "https://qlever.dev/api/wikidata", + "https://qlever.dev/api/uniprot", + "https://qlever.dev/api/pubchem", + "https://qlever.dev/api/osm-planet", + "https://wikidata.demo.openlinksw.com/sparql", + "https://sparql.uniprot.org/sparql", + ], + help="SPARQL endpoint from fixed list (to save typing)", ) subparser.add_argument( "--get-queries-cmd", @@ -86,7 +90,7 @@ def additional_arguments(self, subparser) -> None: "application/sparql-results+json", "text/turtle", ], - default="text/tab-separated-values", + default="application/sparql-results+json", help="Accept header for the SPARQL query", ) subparser.add_argument( @@ -98,7 +102,7 @@ def additional_arguments(self, subparser) -> None: subparser.add_argument( "--width-query-description", type=int, - default=40, + default=70, help="Width for printing the query description", ) subparser.add_argument( @@ -113,6 +117,32 @@ def additional_arguments(self, subparser) -> None: default=14, help="Width for printing the result size", ) + subparser.add_argument( + "--show-query", + choices=["always", "never", "on-error"], + default="never", + help="Show the queries that will be executed (always, never, on error)", + ) + subparser.add_argument( + "--show-prefixes", + action="store_true", + default=False, + help="When showing the query, also show the prefixes", + ) + + def pretty_print_query(self, query: str, show_prefixes: bool) -> None: + remove_prefixes_cmd = " | sed '/^PREFIX /Id'" if not show_prefixes else "" + pretty_print_query_cmd = ( + f"echo {shlex.quote(query)}" + f" | docker run -i --rm sparqling/sparql-formatter" + f"{remove_prefixes_cmd} | grep -v '^$'" + ) + try: + query_pp = run_command(pretty_print_query_cmd, return_output=True) + log.info(colored(query_pp.rstrip(), "cyan")) + except Exception as e: + log.error(f"Failed to pretty-print query: {e}") + log.info(colored(query.rstrip(), "cyan")) def execute(self, args) -> bool: # We can't have both `--remove-offset-and-limit` and `--limit`. @@ -135,9 +165,8 @@ def execute(self, args) -> bool: return False # Handle shotcuts for SPARQL endpoint. - if args.sparql_endpoint_preset in self.presets: - args.sparql_endpoint = self.presets[args.sparql_endpoint_preset] - args.ui_config = args.sparql_endpoint_preset.split("-")[1] + if args.sparql_endpoint_preset: + args.sparql_endpoint = args.sparql_endpoint_preset # Limit only works with full result. 
if args.limit and args.download_or_count == "count": @@ -178,7 +207,7 @@ def execute(self, args) -> bool: only_show=args.show, ) if args.show: - return False + return True # Get the example queries. try: @@ -210,8 +239,11 @@ def execute(self, args) -> bool: if args.clear_cache == "yes": args.server_url = sparql_endpoint args.complete = False + clear_cache_successful = False with mute_log(): - ClearCacheCommand().execute(args) + clear_cache_successful = ClearCacheCommand().execute(args) + if not clear_cache_successful: + log.warn("Failed to clear the cache") # Remove OFFSET and LIMIT (after the last closing bracket). if args.remove_offset_and_limit or args.limit: @@ -262,6 +294,9 @@ def execute(self, args) -> bool: # A bit of pretty-printing. query = re.sub(r"\s+", " ", query) query = re.sub(r"\s*\.\s*\}", " }", query) + if args.show_query == "always": + log.info("") + self.pretty_print_query(query, args.show_prefixes) # Launch query. try: @@ -282,55 +317,81 @@ def execute(self, args) -> bool: params={"query": query}, result_file=result_file, ).strip() - if http_code != "200": - raise Exception( - f"HTTP code {http_code}" f" {Path(result_file).read_text()}" - ) - time_seconds = time.time() - start_time - error_msg = None + if http_code == "200": + time_seconds = time.time() - start_time + error_msg = None + else: + error_msg = { + "short": f"HTTP code: {http_code}", + "long": re.sub(r"\s+", " ", Path(result_file).read_text()), + } except Exception as e: if args.log_level == "DEBUG": traceback.print_exc() - error_msg = re.sub(r"\s+", " ", str(e)) + error_msg = { + "short": "Exception", + "long": re.sub(r"\s+", " ", str(e)), + } # Get result size (via the command line, in order to avoid loading # a potentially large JSON file into Python, which is slow). if error_msg is None: - try: - if args.download_or_count == "count": - if args.accept == "text/tab-separated-values": - result_size = run_command( - f"sed 1d {result_file}", return_output=True - ) - else: + # CASE 0: The result is empty despite a 200 HTTP code. + if Path(result_file).stat().st_size == 0: + result_size = 0 + error_msg = { + "short": "Empty result", + "long": "curl returned with code 200, " + "but the result is empty", + } + + # CASE 1: Just counting the size of the result (TSV or JSON). + elif args.download_or_count == "count": + if args.accept == "text/tab-separated-values": + result_size = run_command( + f"sed 1d {result_file}", return_output=True + ) + else: + try: + result_size = run_command( + f'jq -r ".results.bindings[0]' + f" | to_entries[0].value.value" + f' | tonumber" {result_file}', + return_output=True, + ) + except Exception as e: + error_msg = { + "short": "Malformed JSON", + "long": "curl returned with code 200, " + "but the JSON is malformed: " + + re.sub(r"\s+", " ", str(e)), + } + + # CASE 2: Downloading the full result (TSV, CSV, Turtle, JSON).
+ else: + if ( + args.accept == "text/tab-separated-values" + or args.accept == "text/csv" + ): + result_size = run_command( + f"sed 1d {result_file} | wc -l", return_output=True + ) + elif args.accept == "text/turtle": + result_size = run_command( + f"sed '1d;/^@prefix/d;/^\\s*$/d' " f"{result_file} | wc -l", + return_output=True, + ) else: - if ( - args.accept == "text/tab-separated-values" - or args.accept == "text/csv" - ): - result_size = run_command( - f"sed 1d {result_file} | wc -l", return_output=True - ) - elif args.accept == "text/turtle": - result_size = run_command( - f"sed '1d;/^@prefix/d;/^\\s*$/d' " - f"{result_file} | wc -l", - return_output=True, - ) - else: + try: result_size = run_command( f'jq -r ".results.bindings | length"' f" {result_file}", return_output=True, ) - result_size = int(result_size) - except Exception as e: - error_msg = str(e) + except Exception as e: + error_msg = { + "short": "Malformed JSON", + "long": re.sub(r"\s+", " ", str(e)), + } # Remove the result file (unless in debug mode). if args.log_level != "DEBUG": @@ -341,6 +402,7 @@ def execute(self, args) -> bool: description = description[: args.width_query_description - 3] description += "..." if error_msg is None: + result_size = int(result_size) log.info( f"{description:<{args.width_query_description}} " f"{time_seconds:6.2f} s " @@ -352,16 +414,24 @@ def execute(self, args) -> bool: num_failed += 1 if ( args.width_error_message > 0 - and len(error_msg) > args.width_error_message + and len(error_msg["long"]) > args.width_error_message and args.log_level != "DEBUG" + and args.show_query != "on-error" ): - error_msg = error_msg[: args.width_error_message - 3] - error_msg += "..." - log.error( + error_msg["long"] = ( + error_msg["long"][: args.width_error_message - 3] + "..." + ) + separator_short_long = "\n" if args.show_query == "on-error" else " " + log.info( f"{description:<{args.width_query_description}} " - f"failed " - f"{colored(error_msg, 'red')}" + f"{colored('FAILED ', 'red')}" + f"{colored(error_msg['short'], 'red'):>{args.width_result_size}}" + f"{separator_short_long}" + f"{colored(error_msg['long'], 'red')}" ) + if args.show_query == "on-error": + self.pretty_print_query(query, args.show_prefixes) + log.info("") # Check that each query has a time and a result size, or it failed. assert len(result_sizes) == len(query_times) diff --git a/src/qlever/commands/get_data.py b/src/qlever/commands/get_data.py index 4ae2bb7d..b27eca5f 100644 --- a/src/qlever/commands/get_data.py +++ b/src/qlever/commands/get_data.py @@ -31,7 +31,7 @@ def execute(self, args) -> bool: # Construct the command line and show it. self.show(args.get_data_cmd, only_show=args.show) if args.show: - return False + return True # Execute the command line.
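For reference, the result-size logic of `example-queries` above, which shells out to `sed` and `jq`, can be summarized in pure Python. This is an illustrative sketch only (the helper name is made up, and the real code deliberately avoids loading large JSON files into Python):

```python
import json

def result_size(result_file: str, accept: str, count_only: bool) -> int:
    # Mirrors CASE 1 and CASE 2 above: for `count`, the result holds a
    # single count value; for a full download, count the result rows.
    with open(result_file) as f:
        if accept == "application/sparql-results+json":
            bindings = json.load(f)["results"]["bindings"]
            if count_only:
                # jq: .results.bindings[0] | to_entries[0].value.value
                return int(next(iter(bindings[0].values()))["value"])
            return len(bindings)  # jq: .results.bindings | length
        lines = f.read().splitlines()
        if count_only:
            return int(lines[1])  # sed 1d: skip the header, keep the count
        if accept == "text/turtle":
            # sed '1d;/^@prefix/d;/^\s*$/d' | wc -l
            return sum(1 for line in lines[1:]
                       if line.strip() and not line.startswith("@prefix"))
        return len(lines) - 1  # TSV/CSV: sed 1d | wc -l
```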
try: diff --git a/src/qlever/commands/index.py b/src/qlever/commands/index.py index 1d8bb442..9b0b7286 100644 --- a/src/qlever/commands/index.py +++ b/src/qlever/commands/index.py @@ -3,12 +3,12 @@ import glob import json +import re import shlex from qlever.command import QleverCommand from qlever.containerize import Containerize from qlever.log import log -from qlever.util import (get_existing_index_files, get_total_file_size, - run_command) +from qlever.util import get_existing_index_files, get_total_file_size, run_command class IndexCommand(QleverCommand): @@ -20,24 +20,36 @@ def __init__(self): pass def description(self) -> str: - return ("Build the index for a given RDF dataset") + return "Build the index for a given RDF dataset" def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: - return {"data": ["name", "format"], - "index": ["input_files", "cat_input_files", "multi_input_json", - "settings_json", "index_binary", - "only_pso_and_pos_permutations", "use_patterns", - "text_index", "stxxl_memory"], - "runtime": ["system", "image", "index_container"]} + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: + return { + "data": ["name", "format"], + "index": [ + "input_files", + "cat_input_files", + "multi_input_json", + "parallel_parsing", + "settings_json", + "index_binary", + "only_pso_and_pos_permutations", + "use_patterns", + "text_index", + "stxxl_memory", + ], + "runtime": ["system", "image", "index_container"], + } def additional_arguments(self, subparser) -> None: subparser.add_argument( - "--overwrite-existing", action="store_true", - default=False, - help="Overwrite an existing index, think twice before using.") + "--overwrite-existing", + action="store_true", + default=False, + help="Overwrite an existing index, think twice before using.", + ) # Exception for invalid JSON. class InvalidInputJson(Exception): @@ -48,22 +60,29 @@ def __init__(self, error_message, additional_info): # Helper function to get command line options from JSON. def get_input_options_for_json(self, args) -> str: - # Parse the JSON. + # Parse the JSON. If `args.multi_input_json` looks like JSONL, turn + # it into a JSON array. try: + jsonl_line_regex = re.compile(r"^\s*\{.*\}\s*$") + jsonl_lines = args.multi_input_json.split("\n") + if all(re.match(jsonl_line_regex, line) for line in jsonl_lines): + args.multi_input_json = "[" + ", ".join(jsonl_lines) + "]" input_specs = json.loads(args.multi_input_json) except Exception as e: raise self.InvalidInputJson( - f"Failed to parse `MULTI_INPUT_JSON` ({e})", - args.multi_input_json) + f"Failed to parse `MULTI_INPUT_JSON` as either JSON or JSONL ({e})", + args.multi_input_json, + ) # Check that it is an array of length at least one. if not isinstance(input_specs, list): raise self.InvalidInputJson( - "`MULTI_INPUT_JSON` must be a JSON array", - args.multi_input_json) + "`MULTI_INPUT_JSON` must be a JSON array", args.multi_input_json ) if len(input_specs) == 0: raise self.InvalidInputJson( - "`MULTI_INPUT_JSON` must contain at least one element", - args.multi_input_json) + "`MULTI_INPUT_JSON` must contain at least one element", + args.multi_input_json, + ) # For each of the maps, construct the corresponding command-line # options to the index binary. input_options = [] @@ -71,35 +90,77 @@ def get_input_options_for_json(self, args) -> str: # Check that `input_spec` is a dictionary.
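The JSONL detection added above means `MULTI_INPUT_JSON` can now be written as one JSON object per line (as the new PubChem and UniProt Qleverfiles do) instead of a single JSON array. A self-contained sketch of just that parsing step, mirroring the code in `get_input_options_for_json` (the function name here is ad hoc):

```python
import json
import re

def parse_multi_input_json(value: str) -> list:
    # If every line looks like a JSON object, treat the value as JSONL
    # and wrap the lines into a JSON array before parsing.
    lines = value.split("\n")
    if all(re.match(r"^\s*\{.*\}\s*$", line) for line in lines):
        value = "[" + ", ".join(lines) + "]"
    return json.loads(value)

specs = parse_multi_input_json(
    '{ "cmd": "zcat {}", "for-each": "*.gz" }\n'
    '{ "cmd": "cat extra.nt", "format": "nt" }'
)
assert len(specs) == 2 and specs[0]["for-each"] == "*.gz"
```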
if not isinstance(input_spec, dict): raise self.InvalidInputJson( - f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " - "object", - input_spec) + f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " "object", + input_spec, + ) # For each `input_spec`, we must have a command. if "cmd" not in input_spec: raise self.InvalidInputJson( - f"Element {i} in `MULTI_INPUT_JSON` must contain a " - "key `cmd`", - input_spec) - input_cmd = input_spec["cmd"] + f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`", + input_spec, + ) + # If the command contains a `{}` placeholder, we need a `for-each` + # key specifying the pattern for the placeholder values, and vice + # versa. + if "{}" in input_spec["cmd"] and "for-each" not in input_spec: + raise self.InvalidInputJson( + f"Element {i} in `MULTI_INPUT_JSON` must contain a " + "key `for-each` if the command contains a placeholder " + "`{}`", + input_spec, + ) + if "for-each" in input_spec and "{}" not in input_spec["cmd"]: + raise self.InvalidInputJson( + f"Element {i} in `MULTI_INPUT_JSON` contains a " + "key `for-each`, but the command does not contain a " + "placeholder `{}`", + input_spec, + ) + # Get all commands. This is just the value of the `cmd` key if no + # `for-each` key is specified. Otherwise, we have a command for + # each file matching the pattern. + if "for-each" not in input_spec: + input_cmds = [input_spec["cmd"]] + else: + try: + files = sorted(glob.glob(input_spec["for-each"])) + except Exception as e: + raise self.InvalidInputJson( + f"Element {i} in `MULTI_INPUT_JSON` contains an " + f"invalid `for-each` pattern: {e}", + input_spec, + ) + input_cmds = [input_spec["cmd"].format(file) for file in files] # The `format`, `graph`, and `parallel` keys are optional. input_format = input_spec.get("format", args.format) input_graph = input_spec.get("graph", "-") input_parallel = input_spec.get("parallel", "false") # There must not be any other keys. - extra_keys = input_spec.keys() - {"cmd", "format", "graph", "parallel"} + extra_keys = input_spec.keys() - { + "cmd", + "format", + "graph", + "parallel", + "for-each", + } if extra_keys: raise self.InvalidInputJson( - f"Element {i} in `MULTI_INPUT_JSON` must only contain " - "the keys `format`, `graph`, and `parallel`. Contains " - "extra keys {extra_keys}.", - input_spec) + f"Element {i} in `MULTI_INPUT_JSON` must only contain " + "the keys `cmd`, `format`, `graph`, `parallel`, and " + f"`for-each`. Contains extra keys {extra_keys}.", + input_spec, + ) # Add the command-line options for this input stream. We use - # process substitution `<(...)` as a convenient way to handle - # an input stream just like a file. This is not POSIX compliant, - # but supported by various shells, including bash and zsh. - input_options.append( - f"-f <({input_cmd}) -F {input_format} " - f"-g \"{input_graph}\" -p {input_parallel}") + # process substitution `<(...)` as a convenient way to handle an + # input stream just like a file. This is not POSIX compliant, but + # supported by various shells, including bash and zsh. If + # `for-each` is specified, add one command for each matching file. + for input_cmd in input_cmds: + input_option = f"-f <({input_cmd}) -g {input_graph}" + input_option += f" -F {input_format}" + if input_parallel == "true": + input_option += " -p true" + input_options.append(input_option) # Return the concatenated command-line options. return " ".join(input_options) @@ -108,9 +169,13 @@ def execute(self, args) -> bool: # basename of the index, and the settings file).
There are two ways # to specify the input: via a single stream or via multiple streams. if args.cat_input_files and not args.multi_input_json: - index_cmd = (f"{args.cat_input_files} | {args.index_binary}" - f" -i {args.name} -s {args.name}.settings.json" - f" -F {args.format} -f -") + index_cmd = ( + f"{args.cat_input_files} | {args.index_binary}" + f" -i {args.name} -s {args.name}.settings.json" + f" -F {args.format} -f -" + ) + if args.parallel_parsing: + index_cmd += f" -p {args.parallel_parsing}" elif args.multi_input_json and not args.cat_input_files: try: input_options = self.get_input_options_for_json(args) @@ -119,13 +184,17 @@ def execute(self, args) -> bool: log.info("") log.info(e.additional_info) return False - index_cmd = (f"{args.index_binary}" - f" -i {args.name} -s {args.name}.settings.json" - f" {input_options}") + index_cmd = ( + f"{args.index_binary}" + f" -i {args.name} -s {args.name}.settings.json" + f" {input_options}" + ) else: - log.error("Specify exactly one of `CAT_INPUT_FILES` (for a " - "single input stream) or `MULTI_INPUT_JSON` (for " - "multiple input streams)") + log.error( + "Specify exactly one of `CAT_INPUT_FILES` (for a " + "single input stream) or `MULTI_INPUT_JSON` (for " + "multiple input streams)" + ) log.info("") log.info("See `qlever index --help` for more information") return False @@ -135,12 +204,11 @@ def execute(self, args) -> bool: index_cmd += " --only-pso-and-pos-permutations --no-patterns" if not args.use_patterns: index_cmd += " --no-patterns" - if args.text_index in \ - ["from_text_records", "from_text_records_and_literals"]: - index_cmd += (f" -w {args.name}.wordsfile.tsv" - f" -d {args.name}.docsfile.tsv") - if args.text_index in \ - ["from_literals", "from_text_records_and_literals"]: + if args.text_index in ["from_text_records", "from_text_records_and_literals"]: + index_cmd += ( + f" -w {args.name}.wordsfile.tsv" f" -d {args.name}.docsfile.tsv" + ) + if args.text_index in ["from_literals", "from_text_records_and_literals"]: index_cmd += " --text-words-from-literals" if args.stxxl_memory: index_cmd += f" --stxxl-memory {args.stxxl_memory}" @@ -148,38 +216,42 @@ def execute(self, args) -> bool: # If the total file size is larger than 10 GB, set ulimit (such that a # large number of open files is allowed). - total_file_size = get_total_file_size( - shlex.split(args.input_files)) + total_file_size = get_total_file_size(shlex.split(args.input_files)) if total_file_size > 1e10: index_cmd = f"ulimit -Sn 1048576; {index_cmd}" # Run the command in a container (if so desired). if args.system in Containerize.supported_systems(): index_cmd = Containerize().containerize_command( - index_cmd, - args.system, "run --rm", - args.image, - args.index_container, - volumes=[("$(pwd)", "/index")], - working_directory="/index") + index_cmd, + args.system, + "run --rm", + args.image, + args.index_container, + volumes=[("$(pwd)", "/index")], + working_directory="/index", + ) # Command for writing the settings JSON to a file. - settings_json_cmd = (f"echo {shlex.quote(args.settings_json)} " - f"> {args.name}.settings.json") + settings_json_cmd = ( + f"echo {shlex.quote(args.settings_json)} " f"> {args.name}.settings.json" + ) # Show the command line. self.show(f"{settings_json_cmd}\n{index_cmd}", only_show=args.show) if args.show: - return False + return True # When running natively, check if the binary exists and works. 
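To make the new `for-each` mechanism concrete: each `MULTI_INPUT_JSON` entry is expanded into `-f <(...) -g ... -F ...` options for the index binary, one per matching file. A condensed sketch of the expansion implemented in `get_input_options_for_json` above (helper name and defaults are illustrative):

```python
import glob

def options_for_spec(spec: dict, default_format: str = "ttl") -> str:
    # With `for-each`, emit one input stream per matching file (sorted);
    # without it, a single stream running `cmd` as given.
    if "for-each" in spec:
        files = sorted(glob.glob(spec["for-each"]))
        cmds = [spec["cmd"].format(file) for file in files]
    else:
        cmds = [spec["cmd"]]
    fmt = spec.get("format", default_format)
    graph = spec.get("graph", "-")
    options = []
    for cmd in cmds:
        option = f"-f <({cmd}) -g {graph} -F {fmt}"
        if spec.get("parallel", "false") == "true":
            option += " -p true"
        options.append(option)
    return " ".join(options)

# { "cmd": "zcat {}", "for-each": "*.gz" } with files a.gz and b.gz yields:
# -f <(zcat a.gz) -g - -F ttl -f <(zcat b.gz) -g - -F ttl
```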
if args.system == "native": try: run_command(f"{args.index_binary} --help") except Exception as e: - log.error(f"Running \"{args.index_binary}\" failed, " - f"set `--index-binary` to a different binary or " - f"set `--system to a container system`") + log.error( + f'Running "{args.index_binary}" failed, ' + f"set `--index-binary` to a different binary or " + f"set `--system` to a container system" + ) log.info("") log.info(f"The error message was: {e}") return False @@ -187,28 +259,29 @@ def execute(self, args) -> bool: # Check if all of the input files exist. for pattern in shlex.split(args.input_files): if len(glob.glob(pattern)) == 0: - log.error(f"No file matching \"{pattern}\" found") + log.error(f'No file matching "{pattern}" found') log.info("") - log.info("Did you call `qlever get-data`? If you did, check " - "GET_DATA_CMD and INPUT_FILES in the QLeverfile") + log.info( + "Did you call `qlever get-data`? If you did, check " + "GET_DATA_CMD and INPUT_FILES in the QLeverfile" + ) return False # Check if index files (name.index.*) already exist. existing_index_files = get_existing_index_files(args.name) if len(existing_index_files) > 0 and not args.overwrite_existing: log.error( - f"Index files for basename \"{args.name}\" found, if you " - f"want to overwrite them, use --overwrite-existing") + f'Index files for basename "{args.name}" found, if you ' + f"want to overwrite them, use --overwrite-existing" + ) log.info("") log.info(f"Index files found: {existing_index_files}") return False # Remove already existing container. - if args.system in Containerize.supported_systems() \ - and args.overwrite_existing: + if args.system in Containerize.supported_systems() and args.overwrite_existing: if Containerize.is_running(args.system, args.index_container): - log.info("Another index process is running, trying to stop " - "it ...") + log.info("Another index process is running, trying to stop " "it ...") log.info("") try: run_command(f"{args.system} rm -f {args.index_container}") diff --git a/src/qlever/commands/index_stats.py b/src/qlever/commands/index_stats.py index 975576ac..b997b8c7 100644 --- a/src/qlever/commands/index_stats.py +++ b/src/qlever/commands/index_stats.py @@ -18,32 +18,45 @@ def __init__(self): pass def description(self) -> str: - return ("Breakdown of the time and space used for the index build") + return "Breakdown of the time and space used for the index build" def should_have_qleverfile(self) -> bool: return False - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"data": ["name"]} def additional_arguments(self, subparser) -> None: - subparser.add_argument("--only-time", action="store_true", - default=False, - help="Show only the time used") - subparser.add_argument("--only-space", action="store_true", - default=False, - help="Show only the space used") - subparser.add_argument("--ignore-text-index", action="store_true", - default=False, - help="Ignore the text index") - subparser.add_argument("--time-unit", - choices=["s", "min", "h", "auto"], - default="auto", - help="The time unit") - subparser.add_argument("--size-unit", - choices=["B", "MB", "GB", "TB", "auto"], - default="auto", - help="The size unit") + subparser.add_argument( + "--only-time", + action="store_true", + default=False, + help="Show only the time used", + ) + subparser.add_argument( + "--only-space", + action="store_true", + default=False, + help="Show only the space used", + ) + subparser.add_argument(
"--ignore-text-index", + action="store_true", + default=False, + help="Ignore the text index", + ) + subparser.add_argument( + "--time-unit", + choices=["s", "min", "h", "auto"], + default="auto", + help="The time unit", + ) + subparser.add_argument( + "--size-unit", + choices=["B", "MB", "GB", "TB", "auto"], + default="auto", + help="The size unit", + ) def execute_time(self, args, log_file_name) -> bool: """ @@ -65,8 +78,9 @@ def execute_time(self, args, log_file_name) -> bool: with open(text_log_file_name, "r") as text_log_file: lines.extend(text_log_file.readlines()) except Exception as e: - log.error(f"Problem reading text index log file " - f"{text_log_file_name}: {e}") + log.error( + f"Problem reading text index log file " f"{text_log_file_name}: {e}" + ) return False # Helper function that finds the next line matching the given `regex`, @@ -95,12 +109,14 @@ def find_next_line(regex, update_current_line=True): if regex_match: try: return datetime.strptime( - re.match(timestamp_regex, line).group(), - timestamp_format), regex_match + re.match(timestamp_regex, line).group(), timestamp_format + ), regex_match except Exception as e: - log.error(f"Could not parse timestamp of form " - f"\"{timestamp_regex}\" from line " - f" \"{line.rstrip()}\" ({e})") + log.error( + f"Could not parse timestamp of form " + f'"{timestamp_regex}" from line ' + f' "{line.rstrip()}" ({e})' + ) # If we get here, we did not find a matching line. if not update_current_line: current_line = current_line_backup @@ -119,26 +135,32 @@ def find_next_line(regex, update_current_line=True): # file (old format: "Creating a pair" + names of permutations in # line "Writing meta data for ..."; new format: name of # permutations already in line "Creating permutations ..."). - perm_begin, _ = find_next_line(r"INFO:\s*Creating a pair", - update_current_line=False) + perm_begin, _ = find_next_line( + r"INFO:\s*Creating a pair", update_current_line=False + ) if perm_begin is None: perm_begin, perm_info = find_next_line( r"INFO:\s*Creating permutations ([A-Z]+ and [A-Z]+)", - update_current_line=False) + update_current_line=False, + ) else: _, perm_info = find_next_line( r"INFO:\s*Writing meta data for ([A-Z]+ and [A-Z]+)", - update_current_line=False) + update_current_line=False, + ) if perm_info is None: break perm_begin_and_info.append((perm_begin, perm_info)) - convert_end = (perm_begin_and_info[0][0] if - len(perm_begin_and_info) > 0 else None) + convert_end = ( + perm_begin_and_info[0][0] if len(perm_begin_and_info) > 0 else None + ) normal_end, _ = find_next_line(r"INFO:\s*Index build completed") - text_begin, _ = find_next_line(r"INFO:\s*Adding text index", - update_current_line=False) - text_end, _ = find_next_line(r"INFO:\s*Text index build comp", - update_current_line=False) + text_begin, _ = find_next_line( + r"INFO:\s*Adding text index", update_current_line=False + ) + text_end, _ = find_next_line( + r"INFO:\s*Text index build comp", update_current_line=False + ) if args.ignore_text_index: text_begin = text_end = None @@ -147,9 +169,11 @@ def find_next_line(regex, update_current_line=True): log.error("Missing line that index build has started") return False if overall_begin and not merge_begin: - log.error("According to the log file, the index build " - "has started, but is still in its first " - "phase (parsing the input)") + log.error( + "According to the log file, the index build " + "has started, but is still in its first " + "phase (parsing the input)" + ) return False # Helper function that shows the duration 
for a phase (if the start and @@ -187,22 +211,24 @@ def show_duration(heading, start_end_pairs): show_duration("Convert to global IDs", [(convert_begin, convert_end)]) for i in range(len(perm_begin_and_info)): perm_begin, perm_info = perm_begin_and_info[i] - perm_end = perm_begin_and_info[i + 1][0] if i + 1 < len( - perm_begin_and_info) else normal_end - perm_info_text = (perm_info.group(1).replace(" and ", " & ") - if perm_info else f"#{i + 1}") - show_duration(f"Permutation {perm_info_text}", - [(perm_begin, perm_end)]) + perm_end = ( + perm_begin_and_info[i + 1][0] + if i + 1 < len(perm_begin_and_info) + else normal_end + ) + perm_info_text = ( + perm_info.group(1).replace(" and ", " & ") if perm_info else f"#{i + 1}" + ) + show_duration(f"Permutation {perm_info_text}", [(perm_begin, perm_end)]) show_duration("Text index", [(text_begin, text_end)]) if text_begin and text_end: log.info("") - show_duration("TOTAL time", - [(overall_begin, normal_end), - (text_begin, text_end)]) + show_duration( + "TOTAL time", [(overall_begin, normal_end), (text_begin, text_end)] + ) elif normal_end: log.info("") - show_duration("TOTAL time", - [(overall_begin, normal_end)]) + show_duration("TOTAL time", [(overall_begin, normal_end)]) return True def execute_space(self, args) -> bool: @@ -252,24 +278,29 @@ def show_size(heading, size): return True def execute(self, args) -> bool: - ret_value = args.show + return_value = True # The "time" part of the command. if not args.only_space: log_file_name = f"{args.name}.index-log.txt" - self.show(f"Breakdown of the time used for " - f"building the index, based on the timestamps for key " - f"lines in \"{log_file_name}\"", only_show=args.show) + self.show( + f"Breakdown of the time used for " + f"building the index, based on the timestamps for key " + f'lines in "{log_file_name}"', + only_show=args.show, + ) if not args.show: - ret_value &= self.execute_time(args, log_file_name) + return_value &= self.execute_time(args, log_file_name) if not args.only_time: log.info("") # The "space" part of the command. if not args.only_time: - self.show("Breakdown of the space used for building the index", - only_show=args.show) + self.show( + "Breakdown of the space used for building the index", + only_show=args.show, + ) if not args.show: - ret_value &= self.execute_space(args) + return_value &= self.execute_space(args) - return ret_value + return return_value diff --git a/src/qlever/commands/log.py b/src/qlever/commands/log.py index 3b2599b4..816072bc 100644 --- a/src/qlever/commands/log.py +++ b/src/qlever/commands/log.py @@ -47,10 +47,20 @@ def execute(self, args) -> bool: log_cmd += f" {log_file}" self.show(log_cmd, only_show=args.show) if args.show: - return False + return True # Execute the command. 
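+ # A sketch of the resulting command, assuming the defaults (20 tail + # lines, following enabled) and a hypothetical NAME = olympics: + # tail -n 20 -f olympics.server-log.txt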
log.info(f"Follow log file {log_file}, press Ctrl-C to stop" f" following (will not stop the server)") log.info("") - subprocess.run(log_cmd, shell=True) + try: + subprocess.run(log_cmd, shell=True) + return True + except Exception as e: + log.error(e) + return False + + + + + diff --git a/src/qlever/commands/query.py b/src/qlever/commands/query.py index 163c0c8a..6b3acbf0 100644 --- a/src/qlever/commands/query.py +++ b/src/qlever/commands/query.py @@ -18,42 +18,82 @@ def __init__(self): pass def description(self) -> str: - return ("Send a query to a SPARQL endpoint") + return "Send a query to a SPARQL endpoint" def should_have_qleverfile(self) -> bool: return False - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: - return {"server": ["port"]} + def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + return {"server": ["port", "access_token"]} def additional_arguments(self, subparser) -> None: - subparser.add_argument("--query", type=str, - default="SELECT * WHERE { ?s ?p ?o } LIMIT 10", - help="SPARQL query to send") - subparser.add_argument("--sparql-endpoint", type=str, - help="URL of the SPARQL endpoint") - subparser.add_argument("--accept", type=str, - choices=["text/tab-separated-values", - "text/csv", - "application/sparql-results+json", - "application/sparql-results+xml", - "application/qlever-results+json"], - default="text/tab-separated-values", - help="Accept header for the SPARQL query") - subparser.add_argument("--no-time", action="store_true", - default=False, - help="Do not print the (end-to-end) time taken") + subparser.add_argument( + "query", + type=str, + nargs="?", + default="SELECT * WHERE { ?s ?p ?o } LIMIT 10", + help="SPARQL query to send", + ) + subparser.add_argument( + "--pin-to-cache", + action="store_true", + default=False, + help="Pin the query to the cache", + ) + subparser.add_argument( + "--sparql-endpoint", type=str, help="URL of the SPARQL endpoint" + ) + subparser.add_argument( + "--accept", + type=str, + choices=[ + "text/tab-separated-values", + "text/csv", + "application/sparql-results+json", + "application/sparql-results+xml", + "application/qlever-results+json", + ], + default="text/tab-separated-values", + help="Accept header for the SPARQL query", + ) + subparser.add_argument( + "--no-time", + action="store_true", + default=False, + help="Do not print the (end-to-end) time taken", + ) def execute(self, args) -> bool: + # When pinning to the cache, set `send=0` and request media type + # `application/qlever-results+json` so that we get the result size. + # Also, we need to provide the access token. + if args.pin_to_cache: + args.accept = "application/qlever-results+json" + curl_cmd_additions = ( + f" --data pinresult=true --data send=0" + f" --data access-token=" + f"{shlex.quote(args.access_token)}" + f" | jq .resultsize | numfmt --grouping" + f" | xargs -I {{}} printf" + f' "Result pinned to cache,' + f' number of rows: {{}}\\n"' + ) + else: + curl_cmd_additions = "" + # Show what the command will do. 
- sparql_endpoint = (args.sparql_endpoint if args.sparql_endpoint - else f"localhost:{args.port}") - curl_cmd = (f"curl -s {sparql_endpoint}" - f" -H \"Accept: {args.accept}\"" - f" --data-urlencode query={shlex.quote(args.query)}") + sparql_endpoint = ( + args.sparql_endpoint if args.sparql_endpoint else f"localhost:{args.port}" + ) + curl_cmd = ( + f"curl -s {sparql_endpoint}" + f' -H "Accept: {args.accept}"' + f" --data-urlencode query={shlex.quote(args.query)}" + f"{curl_cmd_additions}" + ) self.show(curl_cmd, only_show=args.show) if args.show: - return False + return True # Launch query. try: @@ -62,8 +102,7 @@ def execute(self, args) -> bool: time_msecs = round(1000 * (time.time() - start_time)) if not args.no_time and args.log_level != "NO_LOG": log.info("") - log.info(f"Query processing time (end-to-end):" - f" {time_msecs:,d} ms") + log.info(f"Query processing time (end-to-end): {time_msecs:,d} ms") except Exception as e: if args.log_level == "DEBUG": traceback.print_exc() diff --git a/src/qlever/commands/setup_config.py b/src/qlever/commands/setup_config.py index c1140c1b..0eff3b23 100644 --- a/src/qlever/commands/setup_config.py +++ b/src/qlever/commands/setup_config.py @@ -1,6 +1,7 @@ from __future__ import annotations import subprocess +from os import environ from pathlib import Path from qlever.command import QleverCommand @@ -15,9 +16,9 @@ class SetupConfigCommand(QleverCommand): def __init__(self): self.qleverfiles_path = Path(__file__).parent.parent / "Qleverfiles" - self.qleverfile_names = \ - [p.name.split(".")[1] - for p in self.qleverfiles_path.glob("Qleverfile.*")] + self.qleverfile_names = [ + p.name.split(".")[1] for p in self.qleverfiles_path.glob("Qleverfile.*") + ] def description(self) -> str: return "Get a pre-configured Qleverfile" @@ -25,57 +26,72 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return False - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {} def additional_arguments(self, subparser) -> None: subparser.add_argument( - "config_name", type=str, - choices=self.qleverfile_names, - help="The name of the pre-configured Qleverfile to create") + "config_name", + type=str, + choices=self.qleverfile_names, + help="The name of the pre-configured Qleverfile to create", + ) def execute(self, args) -> bool: + # Show a warning if `QLEVER_IS_RUNNING_IN_CONTAINER` is set. + qlever_is_running_in_container = environ.get("QLEVER_IS_RUNNING_IN_CONTAINER") + if qlever_is_running_in_container: + log.warning( + "The environment variable `QLEVER_IS_RUNNING_IN_CONTAINER` is set, " + "therefore the Qleverfile is modified to use `SYSTEM = native` " + "(since inside the container, QLever should run natively)" + ) + log.info("") + # Construct the command line and show it. 
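+ # Sketch of the effect of the pipeline built below: a line like + # ACCESS_TOKEN = olympics in the shipped Qleverfile becomes, e.g., + # ACCESS_TOKEN = olympics_a1B2c3D4e5F6 in the local copy (dataset name + # and random 12-character suffix here are made-up examples).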
- qleverfile_path = (self.qleverfiles_path - / f"Qleverfile.{args.config_name}") + qleverfile_path = self.qleverfiles_path / f"Qleverfile.{args.config_name}" setup_config_cmd = ( - f"cat {qleverfile_path}" - f" | sed -E 's/(^ACCESS_TOKEN.*)/\\1_{get_random_string(12)}/'" - f"> Qleverfile") + f"cat {qleverfile_path}" + f" | sed -E 's/(^ACCESS_TOKEN.*)/\\1_{get_random_string(12)}/'" + ) + if qlever_is_running_in_container: + setup_config_cmd += ( + " | sed -E 's/(^SYSTEM[[:space:]]*=[[:space:]]*).*/\\1native/'" + ) + setup_config_cmd += "> Qleverfile" self.show(setup_config_cmd, only_show=args.show) if args.show: - return False + return True # If there is already a Qleverfile in the current directory, exit. qleverfile_path = Path("Qleverfile") if qleverfile_path.exists(): log.error("`Qleverfile` already exists in current directory") log.info("") - log.info("If you want to create a new Qleverfile using " - "`qlever setup-config`, delete the existing Qleverfile " - "first") + log.info( + "If you want to create a new Qleverfile using " + "`qlever setup-config`, delete the existing Qleverfile " + "first" + ) return False # Copy the Qleverfile to the current directory. try: - subprocess.run(setup_config_cmd, shell=True, check=True, - stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL) + subprocess.run( + setup_config_cmd, + shell=True, + check=True, + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + ) except Exception as e: - log.error(f"Could not copy \"{qleverfile_path}\"" - f" to current directory: {e}") + log.error( + f'Could not copy "{qleverfile_path}"' f" to current directory: {e}" + ) return False # If we get here, everything went well. - log.info(f"Created Qleverfile for config \"{args.config_name}\"" - f" in current directory") + log.info( + f'Created Qleverfile for config "{args.config_name}"' + f" in current directory" + ) return True - - # if config_name == "default": - # log.info("Since this is the default Qleverfile, you need to " - # "edit it before you can continue") - # log.info("") - # log.info("Afterwards, run `qlever` without arguments to see " - # "which actions are available") - # else: - # show_available_action_names() - # log.info("") diff --git a/src/qlever/commands/status.py b/src/qlever/commands/status.py index 5d066660..a8efed54 100644 --- a/src/qlever/commands/status.py +++ b/src/qlever/commands/status.py @@ -35,7 +35,7 @@ def execute(self, args) -> bool: f"the command line matches {args.cmdline_regex}" f" using Python's psutil library", only_show=args.show) if args.show: - return False + return True # Show the results as a table. num_processes_found = 0 @@ -47,3 +47,4 @@ def execute(self, args) -> bool: num_processes_found += 1 if num_processes_found == 0: print("No processes found") + return True diff --git a/src/qlever/commands/stop.py b/src/qlever/commands/stop.py index f2f8e80a..82225304 100644 --- a/src/qlever/commands/stop.py +++ b/src/qlever/commands/stop.py @@ -49,7 +49,7 @@ def execute(self, args) -> bool: f"\"{args.server_container}\"") self.show(description, only_show=args.show) if args.show: - return False + return True # First check if there is container running and if yes, stop and remove # it (unless the user has specified `--no-containers`). @@ -90,14 +90,12 @@ def execute(self, args) -> bool: return False return True - # No matching process found. + # If no matching process found, show a message and the output of the + # status command. 
message = "No matching process found" if args.no_containers else \ "No matching process or container found" log.error(message) - - # Show output of status command. args.cmdline_regex = "^ServerMain.* -i [^ ]*" log.info("") StatusCommand().execute(args) - - return False + return True diff --git a/src/qlever/commands/system_info.py b/src/qlever/commands/system_info.py index 0d1ed167..1e45d1bd 100644 --- a/src/qlever/commands/system_info.py +++ b/src/qlever/commands/system_info.py @@ -58,7 +58,7 @@ def execute(self, args) -> bool: # Say what the command is doing. self.show("Show system information and Qleverfile", only_show=args.show) if args.show: - return False + return True # Show system information. show_heading("System Information") diff --git a/src/qlever/commands/ui.py b/src/qlever/commands/ui.py index 3de7c177..23dea87a 100644 --- a/src/qlever/commands/ui.py +++ b/src/qlever/commands/ui.py @@ -1,6 +1,7 @@ from __future__ import annotations import subprocess +from os import environ from qlever.command import QleverCommand from qlever.containerize import Containerize @@ -17,46 +18,72 @@ def __init__(self): pass def description(self) -> str: - return ("Launch the QLever UI web application") + return "Launch the QLever UI web application" def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: - return {"data": ["name"], - "server": ["host_name", "port"], - "ui": ["ui_port", "ui_config", - "ui_system", "ui_image", "ui_container"]} + def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + return { + "data": ["name"], + "server": ["host_name", "port"], + "ui": ["ui_port", "ui_config", "ui_system", "ui_image", "ui_container"], + } def additional_arguments(self, subparser) -> None: pass def execute(self, args) -> bool: + # If QLEVER_OVERRIDE_DISABLE_UI is set, this command is disabled. + qlever_is_running_in_container = environ.get("QLEVER_IS_RUNNING_IN_CONTAINER") + if qlever_is_running_in_container: + log.error( + "The environment variable `QLEVER_OVERRIDE_DISABLE_UI` is set, " + "therefore `qlever ui` is not available (it should not be called " + "from inside a container)" + ) + log.info("") + if not args.show: + log.info( + "For your information, showing the commands that are " + "executed when `qlever ui` is available:" + ) + log.info("") + # Construct commands and show them. server_url = f"http://{args.host_name}:{args.port}" ui_url = f"http://{args.host_name}:{args.ui_port}" pull_cmd = f"{args.ui_system} pull -q {args.ui_image}" - run_cmd = f"{args.ui_system} run -d " \ - f"--publish {args.ui_port}:7000 " \ - f"--name {args.ui_container} " \ - f"{args.ui_image}" - exec_cmd = f"{args.ui_system} exec -it " \ - f"{args.ui_container} " \ - f"bash -c \"python manage.py configure " \ - f"{args.ui_config} {server_url}\"" - self.show("\n".join(["Stop running containers", - pull_cmd, run_cmd, exec_cmd]), only_show=args.show) - if args.show: + run_cmd = ( + f"{args.ui_system} run -d " + f"--publish {args.ui_port}:7000 " + f"--name {args.ui_container} " + f"{args.ui_image}" + ) + exec_cmd = ( + f"{args.ui_system} exec -it " + f"{args.ui_container} " + f'bash -c "python manage.py configure ' + f'{args.ui_config} {server_url}"' + ) + self.show( + "\n".join(["Stop running containers", pull_cmd, run_cmd, exec_cmd]), + only_show=args.show, + ) + if qlever_is_running_in_container: return False + if args.show: + return True # Stop running containers. 
for container_system in Containerize.supported_systems(): - Containerize.stop_and_remove_container( - container_system, args.ui_container) + Containerize.stop_and_remove_container(container_system, args.ui_container) # Check if the UI port is already being used. if is_port_used(args.ui_port): - log.warning(f"It looks like the specified port for the UI ({args.ui_port}) is already in use. You can set another port in the Qleverfile in the [ui] section with the UI_PORT variable.") + log.warning( + f"It looks like the specified port for the UI " + f"({args.ui_port}) is already in use. You can set another " + f"port in the Qleverfile in the [ui] section with the " + f"UI_PORT variable." + ) # Try to start the QLever UI. try: @@ -68,7 +95,9 @@ def execute(self, args) -> bool: return False # Success. - log.info(f"The QLever UI should now be up at {ui_url} ..." - f"You can log in as QLever UI admin with username and " - f"password \"demo\"") + log.info( + f"The QLever UI should now be up at {ui_url} ... " + f"You can log in as QLever UI admin with username and " + f'password "demo"' + ) return True diff --git a/src/qlever/commands/warmup.py b/src/qlever/commands/warmup.py index 7a7041fb..49150262 100644 --- a/src/qlever/commands/warmup.py +++ b/src/qlever/commands/warmup.py @@ -30,7 +30,7 @@ def execute(self, args) -> bool: # Show what the command is doing. self.show(args.warmup_cmd, only_show=args.show) if args.show: - return False + return True # Execute the command. try: diff --git a/src/qlever/qlever_main.py b/src/qlever/qlever_main.py index a936abd5..d93e4549 100644 --- a/src/qlever/qlever_main.py +++ b/src/qlever/qlever_main.py @@ -35,31 +35,38 @@ def main(): log.info("") log.info(colored(f"Command: {args.command}", attrs=["bold"])) log.info("") - command_object.execute(args) + command_was_successful = command_object.execute(args) log.info("") + if not command_was_successful: + exit(1) except KeyboardInterrupt: log.info("") log.info("Ctrl-C pressed, exiting ...") log.info("") - exit(0) + exit(1) except Exception as e: # Check if it's a certain kind of `AttributeError` and give a hint in # that case. 
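+ # A hypothetical example of a message matched here: + # 'Namespace' object has no attribute 'ui_port' + # which the code below turns into a hint naming the command file and + # line that accessed the undefined argument.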
match_error = re.search(r"object has no attribute '(.+)'", str(e)) - match_trace = re.search(r"(qlever/commands/.+\.py)\", line (\d+)", - traceback.format_exc()) + match_trace = re.search( + r"(qlever/commands/.+\.py)\", line (\d+)", traceback.format_exc() + ) if isinstance(e, AttributeError) and match_error and match_trace: attribute = match_error.group(1) trace_command = match_trace.group(1) trace_line = match_trace.group(2) log.error(f"{e} in `{trace_command}` at line {trace_line}") log.info("") - log.info(f"Likely cause: you used `args.{attribute}`, but it was " - f"neither defined in `relevant_qleverfile_arguments` " - f"nor in `additional_arguments`") + log.info( + f"Likely cause: you used `args.{attribute}`, but it was " + f"neither defined in `relevant_qleverfile_arguments` " + f"nor in `additional_arguments`" + ) log.info("") - log.info(f"If you did not implement `{trace_command}` yourself, " - f"please report this issue") + log.info( + f"If you did not implement `{trace_command}` yourself, " + f"please report this issue" + ) log.info("") else: log.error(f"An unexpected error occurred: {e}") diff --git a/src/qlever/qleverfile.py b/src/qlever/qleverfile.py index 6afc9143..7a5f8552 100644 --- a/src/qlever/qleverfile.py +++ b/src/qlever/qleverfile.py @@ -40,170 +40,277 @@ def arg(*args, **kwargs): ui_args = all_args["ui"] = {} data_args["name"] = arg( - "--name", type=str, required=True, - help="The name of the dataset") + "--name", type=str, required=True, help="The name of the dataset" + ) data_args["get_data_cmd"] = arg( - "--get-data-cmd", type=str, required=True, - help="The command to get the data") + "--get-data-cmd", + type=str, + required=True, + help="The command to get the data", + ) data_args["description"] = arg( - "--description", type=str, required=True, - help="A concise description of the dataset") + "--description", + type=str, + required=True, + help="A concise description of the dataset", + ) data_args["text_description"] = arg( - "--text-description", type=str, default=None, - help="A concise description of the additional text data" - " if any") + "--text-description", + type=str, + default=None, + help="A concise description of the additional text data" " if any", + ) data_args["format"] = arg( - "--format", type=str, default="ttl", - choices=["ttl", "nt", "nq"], - help="The format of the data") + "--format", + type=str, + default="ttl", + choices=["ttl", "nt", "nq"], + help="The format of the data", + ) index_args["input_files"] = arg( - "--input-files", type=str, required=True, - help="A space-separated list of patterns that match " - "all the files of the dataset") + "--input-files", + type=str, + required=True, + help="A space-separated list of patterns that match " + "all the files of the dataset", + ) index_args["cat_input_files"] = arg( - "--cat-input-files", type=str, - help="The command that produces the input") + "--cat-input-files", type=str, help="The command that produces the input" + ) index_args["multi_input_json"] = arg( - "--multi-input-json", type=str, default=None, - help="JSON to specify multiple input files, each with a " - "`cmd` (command that writes the triples to stdout), " - "`format` (format like for the `--format` option), " - "`graph` (name of the graph, use `-` for the default graph), " - "`parallel` (parallel parsing for large files, where all " - "prefix declaration are at the beginning)") + "--multi-input-json", + type=str, + default=None, + help="JSON to specify multiple input files, each with a " + "`cmd` (command that writes 
the triples to stdout), " + "`format` (format like for the `--format` option), " + "`graph` (name of the graph, use `-` for the default graph), " + "`parallel` (parallel parsing for large files, where all " + "prefix declaration are at the beginning)", + ) + index_args["parallel_parsing"] = arg( + "--parallel-parsing", + type=str, + choices=["true", "false"], + help="Use parallel parsing (recommended for large files, " + "but it requires that all prefix declarations are at the " + "beginning of the file)", + ) index_args["settings_json"] = arg( - "--settings-json", type=str, default="{}", - help="The `.settings.json` file for the index") + "--settings-json", + type=str, + default="{}", + help="The `.settings.json` file for the index", + ) index_args["index_binary"] = arg( - "--index-binary", type=str, default="IndexBuilderMain", - help="The binary for building the index (this requires " - "that you have compiled QLever on your machine)") + "--index-binary", + type=str, + default="IndexBuilderMain", + help="The binary for building the index (this requires " + "that you have compiled QLever on your machine)", + ) index_args["stxxl_memory"] = arg( - "--stxxl-memory", type=str, default="5G", - help="The amount of memory to use for the index build " - "(the name of the option has historical reasons)") + "--stxxl-memory", + type=str, + default="5G", + help="The amount of memory to use for the index build " + "(the name of the option has historical reasons)", + ) index_args["only_pso_and_pos_permutations"] = arg( - "--only-pso-and-pos-permutations", action="store_true", - default=False, - help="Only create the PSO and POS permutations") + "--only-pso-and-pos-permutations", + action="store_true", + default=False, + help="Only create the PSO and POS permutations", + ) index_args["use_patterns"] = arg( - "--use-patterns", action="store_true", default=True, - help="Precompute so-called patterns needed for fast processing" - " of queries like SELECT ?p (COUNT(DISTINCT ?s) AS ?c) " - "WHERE { ?s ?p [] ... } GROUP BY ?p") + "--use-patterns", + action="store_true", + default=True, + help="Precompute so-called patterns needed for fast processing" + " of queries like SELECT ?p (COUNT(DISTINCT ?s) AS ?c) " + "WHERE { ?s ?p [] ... 
} GROUP BY ?p", + ) index_args["text_index"] = arg( - "--text-index", - choices=["none", "from_text_records", "from_literals", - "from_text_records_and_literals"], - default="none", - help="Whether to also build an index for text search" - "and for which texts") + "--text-index", + choices=[ + "none", + "from_text_records", + "from_literals", + "from_text_records_and_literals", + ], + default="none", + help="Whether to also build an index for text search" "and for which texts", + ) index_args["text_words_file"] = arg( - "--text-words-file", type=str, default=None, - help="File with the words for the text index (one line " - "per word, format: `word or IRI\t0 or 1\tdoc id\t1`)") + "--text-words-file", + type=str, + default=None, + help="File with the words for the text index (one line " + "per word, format: `word or IRI\t0 or 1\tdoc id\t1`)", + ) index_args["text_docs_file"] = arg( - "--text-docs-file", type=str, default=None, - help="File with the documents for the text index (one line " - "per document, format: `id\tdocument text`)") + "--text-docs-file", + type=str, + default=None, + help="File with the documents for the text index (one line " + "per document, format: `id\tdocument text`)", + ) server_args["server_binary"] = arg( - "--server-binary", type=str, default="ServerMain", - help="The binary for starting the server (this requires " - "that you have compiled QLever on your machine)") + "--server-binary", + type=str, + default="ServerMain", + help="The binary for starting the server (this requires " + "that you have compiled QLever on your machine)", + ) server_args["host_name"] = arg( - "--host-name", type=str, default="localhost", - help="The name of the host on which the server listens for " - "requests") + "--host-name", + type=str, + default="localhost", + help="The name of the host on which the server listens for " "requests", + ) server_args["port"] = arg( - "--port", type=int, - help="The port on which the server listens for requests") + "--port", type=int, help="The port on which the server listens for requests" + ) server_args["access_token"] = arg( - "--access-token", type=str, default=None, - help="The access token for privileged operations") + "--access-token", + type=str, + default=None, + help="The access token for privileged operations", + ) server_args["memory_for_queries"] = arg( - "--memory-for-queries", type=str, default="5G", - help="The maximal amount of memory used for query processing" - " (if a query needs more than what is available, the " - "query will not be processed)") + "--memory-for-queries", + type=str, + default="5G", + help="The maximal amount of memory used for query processing" + " (if a query needs more than what is available, the " + "query will not be processed)", + ) server_args["cache_max_size"] = arg( - "--cache-max-size", type=str, default="2G", - help="The maximal amount of memory used for caching") + "--cache-max-size", + type=str, + default="2G", + help="The maximal amount of memory used for caching", + ) server_args["cache_max_size_single_entry"] = arg( - "--cache-max-size-single-entry", type=str, default="1G", - help="The maximal amount of memory used for caching a single " - "query result") + "--cache-max-size-single-entry", + type=str, + default="1G", + help="The maximal amount of memory used for caching a single " + "query result", + ) server_args["cache_max_num_entries"] = arg( - "--cache-max-num-entries", type=int, default=200, - help="The maximal number of entries in the cache" - " (the eviction policy when the cache is full 
is LRU)") + "--cache-max-num-entries", + type=int, + default=200, + help="The maximal number of entries in the cache" + " (the eviction policy when the cache is full is LRU)", + ) server_args["timeout"] = arg( - "--timeout", type=str, default="30s", - help="The maximal time in seconds a query is allowed to run" - " (can be increased per query with the URL parameters " - "`timeout` and `access_token`)") + "--timeout", + type=str, + default="30s", + help="The maximal time in seconds a query is allowed to run" + " (can be increased per query with the URL parameters " + "`timeout` and `access_token`)", + ) server_args["num_threads"] = arg( - "--num-threads", type=int, default=8, - help="The number of threads used for query processing") + "--num-threads", + type=int, + default=8, + help="The number of threads used for query processing", + ) server_args["only_pso_and_pos_permutations"] = arg( - "--only-pso-and-pos-permutations", action="store_true", - default=False, - help="Only use the PSO and POS permutations (then each " - "triple pattern must have a fixed predicate)") + "--only-pso-and-pos-permutations", + action="store_true", + default=False, + help="Only use the PSO and POS permutations (then each " + "triple pattern must have a fixed predicate)", + ) server_args["use_patterns"] = arg( - "--use-patterns", action="store_true", default=True, - help="Use the patterns precomputed during the index build" - " (see `qlever index --help` for their utility)") + "--use-patterns", + action="store_true", + default=True, + help="Use the patterns precomputed during the index build" + " (see `qlever index --help` for their utility)", + ) server_args["use_text_index"] = arg( - "--use-text-index", choices=["yes", "no"], default="no", - help="Whether to use the text index (requires that one was " - "built, see `qlever index`)") + "--use-text-index", + choices=["yes", "no"], + default="no", + help="Whether to use the text index (requires that one was " + "built, see `qlever index`)", + ) server_args["warmup_cmd"] = arg( - "--warmup-cmd", type=str, - help="Command executed after the server has started " - " (executed as part of `qlever start` unless " - " `--no-warmup` is specified, or with `qlever warmup`)") + "--warmup-cmd", + type=str, + help="Command executed after the server has started " + " (executed as part of `qlever start` unless " + " `--no-warmup` is specified, or with `qlever warmup`)", + ) runtime_args["system"] = arg( - "--system", type=str, - choices=Containerize.supported_systems() + ["native"], - default="docker", - help=("Whether to run commands like `index` or `start` " - "natively or in a container, and if in a container, " - "which system to use")) + "--system", + type=str, + choices=Containerize.supported_systems() + ["native"], + default="docker", + help=( + "Whether to run commands like `index` or `start` " + "natively or in a container, and if in a container, " + "which system to use" + ), + ) runtime_args["image"] = arg( - "--image", type=str, - default="docker.io/adfreiburg/qlever", - help="The name of the image when running in a container") + "--image", + type=str, + default="docker.io/adfreiburg/qlever", + help="The name of the image when running in a container", + ) runtime_args["index_container"] = arg( - "--index-container", type=str, - help="The name of the container used by `qlever index`") + "--index-container", + type=str, + help="The name of the container used by `qlever index`", + ) runtime_args["server_container"] = arg( - "--server-container", type=str, - help="The 
name of the container used by `qlever start`") + "--server-container", + type=str, + help="The name of the container used by `qlever start`", + ) ui_args["ui_port"] = arg( - "--ui-port", type=int, default=8176, - help="The port of the Qlever UI when running `qlever ui`") + "--ui-port", + type=int, + default=8176, + help="The port of the Qlever UI when running `qlever ui`", + ) ui_args["ui_config"] = arg( - "--ui-config", type=str, default="default", - help="The name of the backend configuration for the QLever UI" - " (this determines AC queries and example queries)") + "--ui-config", + type=str, + default="default", + help="The name of the backend configuration for the QLever UI" + " (this determines AC queries and example queries)", + ) ui_args["ui_system"] = arg( - "--ui-system", type=str, - choices=Containerize.supported_systems(), - default="docker", - help="Which container system to use for `qlever ui`" - " (unlike for `qlever index` and `qlever start`, " - " \"native\" is not yet supported here)") + "--ui-system", + type=str, + choices=Containerize.supported_systems(), + default="docker", + help="Which container system to use for `qlever ui`" + " (unlike for `qlever index` and `qlever start`, " + ' "native" is not yet supported here)', + ) ui_args["ui_image"] = arg( - "--ui-image", type=str, - default="docker.io/adfreiburg/qlever-ui", - help="The name of the image used for `qlever ui`") + "--ui-image", + type=str, + default="docker.io/adfreiburg/qlever-ui", + help="The name of the image used for `qlever ui`", + ) ui_args["ui_container"] = arg( - "--ui-container", type=str, - help="The name of the container used for `qlever ui`") + "--ui-container", + type=str, + help="The name of the container used for `qlever ui`", + ) return all_args @@ -221,8 +328,7 @@ def read(qleverfile_path): # Read the Qleverfile. defaults = {"random": "83724324hztz", "version": "01.01.01"} - config = ConfigParser(interpolation=ExtendedInterpolation(), - defaults=defaults) + config = ConfigParser(interpolation=ExtendedInterpolation(), defaults=defaults) try: config.read(qleverfile_path) except Exception as e: @@ -237,13 +343,18 @@ def read(qleverfile_path): if match: try: value = subprocess.check_output( - match.group(1), shell=True, text=True, - stderr=subprocess.STDOUT).strip() + match.group(1), + shell=True, + text=True, + stderr=subprocess.STDOUT, + ).strip() except Exception as e: log.info("") - log.error(f"Error evaluating {value} for option " - f"{section}.{option.upper()} in " - f"{qleverfile_path}:") + log.error( + f"Error evaluating {value} for option " + f"{section}.{option.upper()} in " + f"{qleverfile_path}:" + ) log.info("") log.info(e.output if hasattr(e, "output") else e) exit(1) @@ -270,8 +381,8 @@ def read(qleverfile_path): if "text_docs_file" not in index: index["text_docs_file"] = f"{name}.docsfile.tsv" server = config["server"] - if index.get("text_index", "none") != "none": - server["use_text_index"] = "yes" + if index.get("text_index", "none") != "none": + server["use_text_index"] = "yes" # Return the parsed Qleverfile with the added inherited values. 
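+ # At this point, values of the form $$(...) (as used in the shipped + # Qleverfiles) have been replaced by the output of the corresponding + # shell command, e.g. a hypothetical VERSION = $$(date +%d.%m.%Y) would + # now hold the current date, and derived defaults like text_docs_file + # have been filled in.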
return config diff --git a/test/qlever/commands/test_cache_stats_execute.py b/test/qlever/commands/test_cache_stats_execute.py new file mode 100644 index 00000000..af60fe14 --- /dev/null +++ b/test/qlever/commands/test_cache_stats_execute.py @@ -0,0 +1,214 @@ +from __future__ import annotations + +import unittest +from unittest.mock import MagicMock, patch + +from qlever.commands.cache_stats import CacheStatsCommand + + +class TestCacheStatsCommand(unittest.TestCase): + def setUp(self): + self.command = CacheStatsCommand() + + @patch("qlever.commands.cache_stats.subprocess.check_output") + @patch("qlever.commands.cache_stats.json.loads") + @patch("qlever.commands.cache_stats.log") + # Test execute of cache stats command for basic case with successful + # execution + def test_execute_successful_basic_cache_stats( + self, mock_log, mock_json_loads, mock_check_output + ): + # Mock arguments for basic cache stats + args = MagicMock() + args.server_url = None + args.port = 1234 + args.show = False + args.detailed = False + + # Mock `subprocess.check_output` and `json.loads` as encoded bytes + mock_check_output.side_effect = [ + # Mock cache_stats + b'{"pinned-size": 1e9, "non-pinned-size": 3e9}', + # Mock cache_settings + b'{"cache-max-size": "10 GB"}', + ] + # mock cache_stats_dict and cache_settings_dict as a dictionary + mock_json_loads.side_effect = [ + {"pinned-size": 1e9, "non-pinned-size": 3e9}, + {"cache-max-size": "10 GB"}, + ] + + # Execute the command + result = self.command.execute(args) + + # Assertions + expected_stats_call = ( + f"curl -s localhost:{args.port} " + f'--data-urlencode "cmd=cache-stats"' + ) + expected_settings_call = ( + f"curl -s localhost:{args.port} " + f'--data-urlencode "cmd=get-settings"' + ) + + mock_check_output.assert_any_call(expected_stats_call, shell=True) + mock_check_output.assert_any_call(expected_settings_call, shell=True) + + # Verify the correct information logs + mock_log.info.assert_any_call( + "Pinned queries : 1.0 GB of 10.0 GB [10.0%]" + ) + mock_log.info.assert_any_call( + "Non-pinned queries : 3.0 GB of 10.0 GB [30.0%]" + ) + mock_log.info.assert_any_call( + "FREE : 6.0 GB of 10.0 GB [60.0%]" + ) + + self.assertTrue(result) + + @patch("qlever.commands.cache_stats.subprocess.check_output") + @patch("qlever.commands.cache_stats.json.loads") + @patch("qlever.commands.cache_stats.log") + # Test for show_dict_as_table function. Reached if 'args.detailed = True'. 
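+ # (The assertions below suggest that show_dict_as_table pads the keys + # to equal width and formats integer values with thousands separators; + # this reading is inferred from the test data, not from the + # implementation itself.)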
+ def test_execute_detailed_cache_stats( + self, mock_log, mock_json_loads, mock_check_output + ): + # Mock arguments for detailed cache stats + args = MagicMock() + args.server_url = "http://testlocalhost:1234" + args.show = False + args.detailed = True + + # Mock the responses from `subprocess.check_output` and `json.loads` + mock_check_output.side_effect = [ + b'{"pinned-size": 2e9, "non-pinned-size": 1e9, "test-stat": 500}', + b'{"cache-max-size": "10 GB", "test-setting": 1000}', + ] + # CAREFUL: if value is float you will get an error in re.match + mock_json_loads.side_effect = [ + { + "pinned-size": int(2e9), + "non-pinned-size": int(1e9), + "test-stat": 500, + }, + {"cache-max-size": "10 GB", "test-setting": 1000}, + ] + + # Execute the command + result = self.command.execute(args) + + # Assertions + expected_stats_call = ( + f"curl -s {args.server_url} " f'--data-urlencode "cmd=cache-stats"' + ) + expected_settings_call = ( + f"curl -s {args.server_url} " + f'--data-urlencode "cmd=get-settings"' + ) + + mock_check_output.assert_any_call(expected_stats_call, shell=True) + mock_check_output.assert_any_call(expected_settings_call, shell=True) + + # Verify that detailed stats and settings were logged as a table + mock_log.info.assert_any_call("pinned-size : 2,000,000,000") + mock_log.info.assert_any_call("non-pinned-size : 1,000,000,000") + mock_log.info.assert_any_call("test-stat : 500") + mock_log.info.assert_any_call("cache-max-size : 10 GB") + mock_log.info.assert_any_call("test-setting : 1,000") + + self.assertTrue(result) + + @patch("qlever.commands.cache_stats.subprocess.check_output") + @patch("qlever.commands.cache_stats.log") + # Checking if correct error message is given for unsuccessful try/except + # block. + def test_execute_failed_cache_stats(self, mock_log, mock_check_output): + # Mock arguments for basic cache stats + args = MagicMock() + args.server_url = "http://testlocalhost:1234" + args.show = False + args.detailed = False + + # Simulate a command execution failure + mock_check_output.side_effect = Exception("Mocked command failure") + + # Execute the command + result = self.command.execute(args) + + # Assertions to verify that error was logged + mock_log.error.assert_called_once_with( + "Failed to get cache stats and settings: Mocked command failure" + ) + + self.assertFalse(result) + + @patch("qlever.commands.cache_stats.subprocess.check_output") + @patch("qlever.commands.cache_stats.json.loads") + @patch("qlever.commands.cache_stats.log") + # Checking if correct error message is given for invalid cache_size + def test_execute_invalid_cache_size_format( + self, mock_log, mock_json_loads, mock_check_output + ): + # Mock arguments for basic cache stats + args = MagicMock() + args.server_url = None + args.port = 1234 + args.show = False + args.detailed = False + + # Mock the responses with invalid cache size format + mock_check_output.side_effect = [ + b'{"pinned-size": 2e9, "non-pinned-size": 1e9}', + # Mock cache stats with invalid cache settings + b'{"cache-max-size": "1000 MB"}', + ] + mock_json_loads.side_effect = [ + {"pinned-size": 2e9, "non-pinned-size": 1e9}, + {"cache-max-size": "1000 MB"}, + ] + + # Execute the command + result = self.command.execute(args) + + # Assertions to verify that error was logged + mock_log.error.assert_called_once_with( + "Cache size 1000 MB is not in GB, QLever should return " + "bytes instead" + ) + + self.assertFalse(result) + + @patch("qlever.commands.cache_stats.subprocess.check_output") + 
@patch("qlever.commands.cache_stats.json.loads") + @patch("qlever.commands.cache_stats.log") + # Checking if correct log message is given for empty cache_size + def test_execute_empty_cache_size( + self, mock_log, mock_json_loads, mock_check_output + ): + # Mock arguments for basic cache stats + args = MagicMock() + args.server_url = None + args.port = 1234 + args.show = False + args.detailed = False + + # Mock the responses with empty cache size + mock_check_output.side_effect = [ + b'{"pinned-size": 0, "non-pinned-size": 0}', + b'{"cache-max-size": "10 GB"}', + ] + mock_json_loads.side_effect = [ + {"pinned-size": 0, "non-pinned-size": 0}, + {"cache-max-size": "10 GB"}, + ] + + # Execute the command + result = self.command.execute(args) + + # Assertions to verify that log.info was called correctly + mock_log.info.assert_called_once_with( + "Cache is empty, all 10.0 GB available" + ) + + self.assertTrue(result) diff --git a/test/qlever/commands/test_cache_stats_other_methods.py b/test/qlever/commands/test_cache_stats_other_methods.py new file mode 100644 index 00000000..45b5681d --- /dev/null +++ b/test/qlever/commands/test_cache_stats_other_methods.py @@ -0,0 +1,53 @@ +import argparse +import unittest + +from qlever.commands.cache_stats import CacheStatsCommand + + +class TestStartCommand(unittest.TestCase): + def test_description(self): + self.assertEqual( + "Show how much of the cache is currently being " "used", + CacheStatsCommand().description(), + ) + + def test_should_have_qleverfile(self): + assert not CacheStatsCommand().should_have_qleverfile() + + def test_relevant_qleverfile_arguments(self): + testdict = {"server": ["host_name", "port"]} + self.assertEqual( + testdict, CacheStatsCommand().relevant_qleverfile_arguments() + ) + + def test_additional_arguments(self): + # Create an instance of CacheStatsCommand + csc = CacheStatsCommand() + + # Create a parser and a subparser + parser = argparse.ArgumentParser() + subparser = parser.add_argument_group("test") + # Call the method + csc.additional_arguments(subparser) + # Parse an empty argument list to see the default + args = parser.parse_args([]) + + # Test that the default value for server-url is set correctly + """Why is there no default="localhost:{port}"? 
""" + self.assertEqual(args.server_url, None) + + # Test that the help text for server-url is correctly set + argument_help = subparser._group_actions[-2].help + self.assertEqual( + "URL of the QLever server, default is " "localhost:{port}", + argument_help, + ) + + # Test that the default value for --detailed is set correctly + self.assertEqual(False, args.detailed) + + # Test that the help text for --detailed is correctly set + argument_help = subparser._group_actions[-1].help + self.assertEqual( + "Show detailed statistics and settings", argument_help + ) diff --git a/test/qlever/commands/test_log_execute.py b/test/qlever/commands/test_log_execute.py new file mode 100644 index 00000000..ef5db83f --- /dev/null +++ b/test/qlever/commands/test_log_execute.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import unittest +from unittest.mock import MagicMock, call, patch + +from qlever.commands.log import LogCommand + + +class TestLogCommand(unittest.TestCase): + @patch("subprocess.run") + @patch("qlever.commands.log.log") + # Test execute of index command for basic case with successful execution + def test_execute_beginning_without_no_follow(self, mock_log, mock_run): + # Setup args + args = MagicMock() + args.name = "TestName" + args.from_beginning = True + args.no_follow = False + args.show = False + + # Instantiate LogCommand and execute the function + result = LogCommand().execute(args) + + # Assertions + log_file = f"{args.name}.server-log.txt" + expected_log_cmd = f"tail -n +1 -f {log_file}" + expected_log_msg = ( + f"Follow log file {log_file}, press Ctrl-C " + f"to stop following (will not stop the server)" + ) + # Check that the info log contains the exception message + mock_log.info.assert_has_calls( + [call(expected_log_msg), call("")], any_order=False + ) + + # Checking if run_command was only called once + mock_run.assert_called_once_with(expected_log_cmd, shell=True) + + assert result + + @patch("subprocess.run") + @patch("qlever.commands.log.log") + # tests execute with args.no_follow = True + def test_execute_without_beginning_with_no_follow( + self, mock_log, mock_run + ): + # Setup args + args = MagicMock() + args.name = "TestName" + args.from_beginning = False + args.no_follow = True + args.show = False + args.tail_num_lines = 50 + # Instantiate LogCommand and execute the function + result = LogCommand().execute(args) + + # Assertions + log_file = f"{args.name}.server-log.txt" + expected_log_cmd = f"tail -n {args.tail_num_lines} {log_file}" + expected_log_msg = ( + f"Follow log file {log_file}, press Ctrl-C " + f"to stop following (will not stop the server)" + ) + # Check that the info log contains the exception message + mock_log.info.assert_has_calls( + [call(expected_log_msg), call("")], any_order=False + ) + + # Checking if run_command was only called once + mock_run.assert_called_once_with(expected_log_cmd, shell=True) + + assert result + + @patch("qlever.commands.log.LogCommand.show") + # test if execute returns true for args.show = true + def test_execute_show(self, mock_show): + # Setup args + args = MagicMock() + args.name = "TestName" + args.from_beginning = True + args.no_follow = True + args.show = True + # Instantiate LogCommand and execute the function + result = LogCommand().execute(args) + + # Assertions + log_file = f"{args.name}.server-log.txt" + expected_log_cmd = f"tail -n +1 {log_file}" + + # Check that show is executed with correct arguments + mock_show.assert_called_once_with( + expected_log_cmd, only_show=args.show + ) + assert result + + 
@patch("subprocess.run") + @patch("qlever.commands.log.log") + # test for failed subprocess.run + def test_execute_failed_to_run_subprocess(self, mock_log, mock_run): + # Setup args + args = MagicMock() + args.name = "TestName" + args.from_beginning = False + args.no_follow = True + args.show = False + args.tail_num_lines = 50 + + # Assertions + # Simulate a command execution failure + error_msg = Exception("Failed to run subprocess.run") + mock_run.side_effect = error_msg + + # Instantiate LogCommand and execute the function + result = LogCommand().execute(args) + + log_file = f"{args.name}.server-log.txt" + expected_log_cmd = f"tail -n {args.tail_num_lines} {log_file}" + expected_log_msg = ( + f"Follow log file {log_file}, press Ctrl-C " + f"to stop following (will not stop the server)" + ) + + # Check that the info log contains the exception message + mock_log.info.assert_has_calls( + [call(expected_log_msg), call("")], any_order=False + ) + + # Checking if run_command was only called once + mock_run.assert_called_once_with(expected_log_cmd, shell=True) + + # Assertions to verify that error was logged + mock_log.error.assert_called_once_with(error_msg) + + assert not result diff --git a/test/qlever/commands/test_log_other_methods.py b/test/qlever/commands/test_log_other_methods.py new file mode 100644 index 00000000..b5a9f2a0 --- /dev/null +++ b/test/qlever/commands/test_log_other_methods.py @@ -0,0 +1,58 @@ +import argparse +import unittest + +from qlever.commands.log import LogCommand + + +class TestStartCommand(unittest.TestCase): + def test_description(self): + self.assertEqual( + "Show the last lines of the server log file and " "follow it", + LogCommand().description(), + ) + + def test_should_have_qleverfile(self): + assert not LogCommand().should_have_qleverfile() + + def test_relevant_qleverfile_arguments(self): + testdict = {"data": ["name"]} + self.assertEqual( + testdict, LogCommand().relevant_qleverfile_arguments() + ) + + def test_additional_arguments(self): + # Create an instance of StopCommand + lc = LogCommand() + + # Create a parser and a subparser + parser = argparse.ArgumentParser() + subparser = parser.add_argument_group("test") + # Call the method + lc.additional_arguments(subparser) + # Parse an empty argument list to see the default + args = parser.parse_args([]) + + # Test that the default value for tail-num-lines is set correctly + self.assertEqual(args.tail_num_lines, 20) + + # Test that the help text for + # --tail-num-lines is correctly set + argument_help = subparser._group_actions[-3].help + self.assertEqual( + "Show this many of the last lines of the log " "file", + argument_help, + ) + + # Test that the default value for --from-beginning is set correctly + self.assertEqual(False, args.from_beginning) + + # Test that the help text for --from-beginning is correctly set + argument_help = subparser._group_actions[-2].help + self.assertEqual("Show all lines of the log file", argument_help) + + # Test that the default value for -no-follow is set correctly + self.assertEqual(False, args.no_follow) + + # Test that the help text for --no-follow is correctly set + argument_help = subparser._group_actions[-1].help + self.assertEqual(argument_help, "Don't follow the log file") diff --git a/test/qlever/commands/test_status_execute.py b/test/qlever/commands/test_status_execute.py new file mode 100644 index 00000000..7f993c81 --- /dev/null +++ b/test/qlever/commands/test_status_execute.py @@ -0,0 +1,140 @@ +import sys +import unittest +from io import StringIO +from 
unittest.mock import MagicMock, call, patch + +import qlever.command +from qlever.commands.status import StatusCommand + + +def get_mock_args(only_show): + args = MagicMock() + args.cmdline_regex = "^(ServerMain|IndexBuilderMain)" + args.show = only_show + return [args, args.cmdline_regex, args.show] + + +class TestStatusCommand(unittest.TestCase): + @patch("qlever.commands.status.show_process_info") + @patch("psutil.process_iter") + # Testing execute for 3 processes; only the second one is a qlever process. + # Mocking the process_iter and show_process_info method and testing + # if the methods are called correctly. + def test_execute_processes_found( + self, mock_process_iter, mock_show_process_info + ): + # Mocking the input for the execute function + [args, args.cmdline_regex, args.show] = get_mock_args(False) + + # Creating mock psutil.Process objects with necessary attributes + mock_process1 = MagicMock() + mock_process1.as_dict.return_value = {"test": [1]} + # to test with real psutil.process objects use this: + """mock_process1.as_dict.return_value = { + 'cmdline': ['cmdline1'], + 'pid': 1, + 'username': 'user1', + 'create_time': datetime.now().timestamp(), + 'memory_info': MagicMock(rss=512 * 1024 * 1024) # 512 MB + }""" + + mock_process2 = MagicMock() + mock_process2.as_dict.return_value = {"test": [2]} + # to test with real psutil.process objects use this: + """mock_process2.as_dict.return_value = { + 'cmdline': ['cmdline2'], + 'pid': 2, + 'username': 'user2', + 'create_time': datetime.now().timestamp(), + 'memory_info': MagicMock(rss=1024 * 1024 * 1024) # 1 GB + }""" + + mock_process3 = MagicMock() + mock_process3.as_dict.return_value = {"test": [3]} + + # Mock the return value of process_iter + # to be a list of these mocked process objects + mock_process_iter.return_value = [ + mock_process1, + mock_process2, + mock_process3, + ] + + # Simulate show_process_info returning False for the first, + # True for the second, and False for the third process + mock_show_process_info.side_effect = [False, True, False] + + sc = StatusCommand() + + # Execute the function + result = sc.execute(args) + + # Assert that process_iter was called once + mock_process_iter.assert_called_once() + + # Assert that show_process_info was called 3 times + # in correct order with the correct arguments + expected_calls = [ + call(mock_process1, args.cmdline_regex, show_heading=True), + call(mock_process2, args.cmdline_regex, show_heading=True), + call(mock_process3, args.cmdline_regex, show_heading=False), + ] + mock_show_process_info.assert_has_calls( + expected_calls, any_order=False + ) + self.assertTrue(result) + + @patch("qlever.util.show_process_info") + @patch("psutil.process_iter") + def test_execute_no_processes_found( + self, mock_process_iter, mock_show_process_info + ): + # Mocking the input for the execute function + [args, args.cmdline_regex, args.show] = get_mock_args(False) + + # Mock process_iter to return an empty list, + # simulating that no matching processes are found + mock_process_iter.return_value = [] + + # Capture the string-output + captured_output = StringIO() + sys.stdout = captured_output + + # Instantiate the StatusCommand + status_command = StatusCommand() + + # Execute the function + result = status_command.execute(args) + + # Reset redirect + sys.stdout = sys.__stdout__ + + # Assert that process_iter was called once + mock_process_iter.assert_called_once() + + # Assert that show_process_info was never called + # since there are no processes + 
mock_show_process_info.assert_not_called() + + self.assertTrue(result) + + # Verify the correct output was printed + self.assertIn("No processes found", captured_output.getvalue()) + + @patch.object(qlever.command.QleverCommand, "show") + def test_execute_show_action_description(self, mock_show): + # Mocking the input for the execute function + [args, args.cmdline_regex, args.show] = get_mock_args(True) + + # Execute the function + result = StatusCommand().execute(args) + + # Assert that show was called with the correct parameters + mock_show.assert_any_call( + f"Show all processes on this machine where " + f"the command line matches {args.cmdline_regex}" + f" using Python's psutil library", + only_show=args.show, + ) + + self.assertTrue(result) diff --git a/test/qlever/commands/test_status_other_methods.py b/test/qlever/commands/test_status_other_methods.py new file mode 100644 index 00000000..c1954000 --- /dev/null +++ b/test/qlever/commands/test_status_other_methods.py @@ -0,0 +1,41 @@ +import argparse +import unittest + +from qlever.commands.status import StatusCommand + + +class TestStatusCommand(unittest.TestCase): + def test_description(self): + result = StatusCommand().description() + self.assertEqual( + result, "Show QLever processes running on this machine" + ) + + def test_should_have_qleverfile(self): + self.assertFalse(StatusCommand().should_have_qleverfile()) + + def test_relevant_qleverfile_arguments(self): + result = StatusCommand().relevant_qleverfile_arguments() + self.assertEqual(result, {}) + + def test_additional_arguments(self): + # Create an instance of StatusCommand + sc = StatusCommand() + + # Create a parser and a subparser + parser = argparse.ArgumentParser() + subparser = parser.add_argument_group("test") + # Call the method + sc.additional_arguments(subparser) + # Parse an empty argument list to see the default + args = parser.parse_args([]) + + # Test that the default value is set correctly + self.assertEqual(args.cmdline_regex, "^(ServerMain|IndexBuilderMain)") + + # Test that the help text is correctly set + argument_help = subparser._group_actions[-1].help + self.assertEqual( + argument_help, + "Show only processes where the command line matches this regex", + ) diff --git a/test/qlever/commands/test_stop_execute.py b/test/qlever/commands/test_stop_execute.py new file mode 100644 index 00000000..6adceab5 --- /dev/null +++ b/test/qlever/commands/test_stop_execute.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +import unittest +from unittest.mock import MagicMock, patch + +from qlever.commands.stop import StopCommand + + +class TestStopCommand(unittest.TestCase): + @patch("qlever.commands.stop.StatusCommand.execute") + @patch("psutil.process_iter") + @patch("qlever.containerize.Containerize.stop_and_remove_container") + @patch("qlever.commands.stop.StopCommand.show") + def test_execute_no_matching_processes_or_containers( + self, + mock_show, + mock_stop_and_remove_container, + mock_process_iter, + mock_status_execute, + ): + # Setup args + args = MagicMock() + args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%" + args.name = "TestName" + args.no_containers = True + args.server_container = "test_container" + args.show = False + + # Replace the regex placeholder + expected_regex = args.cmdline_regex.replace("%%NAME%%", args.name) + + # Mock process_iter to return no matching processes + mock_process_iter.return_value = [] + + # Instantiate the StopCommand + sc = StopCommand() + + # Execute the function + result = sc.execute(args) + + 
# Assertions + mock_show.assert_called_once_with( + f'Checking for processes matching "{expected_regex}"', + only_show=False, + ) + mock_process_iter.assert_called_once() + mock_stop_and_remove_container.assert_not_called() + mock_status_execute.assert_called_once_with(args) + self.assertTrue(result) + + @patch("qlever.commands.stop.StatusCommand.execute") + @patch("psutil.process_iter") + @patch("qlever.containerize.Containerize.stop_and_remove_container") + @patch("qlever.commands.stop.StopCommand.show") + def test_execute_with_matching_process( + self, + mock_show, + mock_stop_and_remove_container, + mock_process_iter, + mock_status_execute, + ): + # Setup args + args = MagicMock() + args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%" + args.name = "TestName" + args.no_containers = True + args.server_container = "test_container" + args.show = False + + # Replace the regex placeholder + expected_regex = args.cmdline_regex.replace("%%NAME%%", args.name) + + # Creating mock psutil.Process objects with necessary attributes + mock_process = MagicMock() + # to test with real psutil.process objects use this: + + mock_process.as_dict.return_value = { + "cmdline": ["ServerMain", "-i", "/some/path/TestName"], + "pid": 1234, + "username": "test_user", + } + + mock_process_iter.return_value = [mock_process] + + # Mock process.kill to simulate successful process termination + mock_process.kill.return_value = None + + # Instantiate the StopCommand + sc = StopCommand() + + # Execute the function + result = sc.execute(args) + + # Assertions + mock_show.assert_called_once_with( + f'Checking for processes matching "{expected_regex}"', + only_show=False, + ) + mock_process_iter.assert_called_once() + mock_stop_and_remove_container.assert_not_called() + mock_process.kill.assert_called_once() + mock_status_execute.assert_not_called() + self.assertTrue(result) + + @patch("qlever.commands.stop.StatusCommand.execute") + @patch("psutil.process_iter") + @patch("qlever.containerize.Containerize.stop_and_remove_container") + @patch("qlever.commands.stop.StopCommand.show") + def test_execute_with_containers( + self, + mock_show, + mock_stop_and_remove_container, + mock_process_iter, + mock_status_execute, + ): + # Setup args + args = MagicMock() + args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%" + args.name = "TestName" + args.no_containers = False + args.server_container = "test_container" + args.show = False + + # Replace the regex placeholder + expected_regex = args.cmdline_regex.replace("%%NAME%%", args.name) + + # Mocking container stop and removal + mock_stop_and_remove_container.return_value = True + + # Instantiate the StopCommand + sc = StopCommand() + + # Execute the function + result = sc.execute(args) + + # Assertions + mock_show.assert_called_once_with( + f'Checking for processes matching "{expected_regex}" and for' + f' Docker container with name "{args.server_container}"', + only_show=False, + ) + mock_process_iter.assert_not_called() + mock_stop_and_remove_container.assert_called_once() + mock_status_execute.assert_not_called() + self.assertTrue(result) + + @patch("qlever.commands.stop.StatusCommand.execute") + @patch("psutil.process_iter") + @patch("qlever.containerize.Containerize.stop_and_remove_container") + @patch("qlever.commands.stop.StopCommand.show") + def test_execute_with_no_containers_and_no_matching_process( + self, + mock_show, + mock_stop_and_remove_container, + mock_process_iter, + mock_status_execute, + ): + # Setup args + args = MagicMock() + args.cmdline_regex = "ServerMain.* 
+    @patch("qlever.commands.stop.StatusCommand.execute")
+    @patch("psutil.process_iter")
+    @patch("qlever.containerize.Containerize.stop_and_remove_container")
+    @patch("qlever.commands.stop.StopCommand.show")
+    def test_execute_with_no_containers_and_no_matching_process(
+        self,
+        mock_show,
+        mock_stop_and_remove_container,
+        mock_process_iter,
+        mock_status_execute,
+    ):
+        # Setup args
+        args = MagicMock()
+        args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%"
+        args.name = "TestName"
+        args.no_containers = False
+        args.server_container = "test_container"
+        args.show = False
+
+        # Replace the regex placeholder
+        expected_regex = args.cmdline_regex.replace("%%NAME%%", args.name)
+
+        # Mock process_iter to return no matching processes
+        mock_process_iter.return_value = []
+
+        # Mock container stop and removal to return False (no container found)
+        mock_stop_and_remove_container.return_value = False
+
+        # Instantiate the StopCommand
+        sc = StopCommand()
+
+        # Execute the function
+        result = sc.execute(args)
+
+        # Assertions
+        mock_show.assert_called_once_with(
+            f'Checking for processes matching "{expected_regex}" and for'
+            f' Docker container with name "{args.server_container}"',
+            only_show=False,
+        )
+        mock_process_iter.assert_called_once()
+        mock_stop_and_remove_container.assert_called()
+        mock_status_execute.assert_called_once_with(args)
+        self.assertTrue(result)
+
+    @patch("qlever.commands.stop.StatusCommand.execute")
+    @patch("psutil.process_iter")
+    @patch("qlever.containerize.Containerize.stop_and_remove_container")
+    @patch("qlever.commands.stop.StopCommand.show")
+    @patch("qlever.commands.stop.show_process_info")
+    def test_execute_with_error_killing_process(
+        self,
+        mock_show_process_info,
+        mock_show,
+        mock_stop_and_remove_container,
+        mock_process_iter,
+        mock_status_execute,
+    ):
+        # Setup args
+        args = MagicMock()
+        args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%"
+        args.name = "TestName"
+        args.no_containers = True
+        args.server_container = "test_container"
+        args.show = False
+
+        # Replace the regex placeholder
+        expected_regex = args.cmdline_regex.replace("%%NAME%%", args.name)
+
+        # Create a mock psutil.Process object; `create_time` and
+        # `memory_info` are included here because `show_process_info`
+        # reports them when killing the process fails
+        mock_process = MagicMock()
+        mock_process.as_dict.return_value = {
+            "cmdline": ["ServerMain", "-i", "/some/path/TestName"],
+            "pid": 1234,
+            "create_time": 1234567890,
+            "memory_info": MagicMock(rss=1024 * 1024 * 512),
+            "username": "test_user",
+        }
+        mock_process_iter.return_value = [mock_process]
+
+        # Mock process.kill to raise an exception
+        mock_process.kill.side_effect = Exception("Test")
+
+        # Instantiate the StopCommand
+        sc = StopCommand()
+
+        # Execute the function
+        result = sc.execute(args)
+
+        # Assertions
+        mock_show.assert_called_once_with(
+            f'Checking for processes matching "{expected_regex}"',
+            only_show=False,
+        )
+        mock_process_iter.assert_called_once()
+        mock_stop_and_remove_container.assert_not_called()
+        mock_process.kill.assert_called_once()
+        mock_show_process_info.assert_called_once_with(
+            mock_process, "", show_heading=True
+        )
+        mock_status_execute.assert_not_called()
+        self.assertFalse(result)
diff --git a/test/qlever/commands/test_stop_other_methods.py b/test/qlever/commands/test_stop_other_methods.py
new file mode 100644
index 00000000..7c0b4ff6
--- /dev/null
+++ b/test/qlever/commands/test_stop_other_methods.py
@@ -0,0 +1,60 @@
+import argparse
+import unittest
+
+from qlever.commands.stop import StopCommand
+
+
+class TestStopCommand(unittest.TestCase):
+    def test_description(self):
+        result = StopCommand().description()
+        self.assertEqual(
+            result, "Stop QLever server for a given dataset or port"
+        )
+
+    def test_should_have_qleverfile(self):
+        self.assertTrue(StopCommand().should_have_qleverfile())
+
+    def test_relevant_qleverfile_arguments(self):
+        result = StopCommand().relevant_qleverfile_arguments()
+        self.assertEqual(
+            result,
+            {
+                "data": ["name"],
+                "server": ["port"],
+                "runtime": ["server_container"],
+            },
+        )
+
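+    # The test below pokes into argparse internals: `_group_actions` is a
+    # private attribute of the argument group, so it may break with future
+    # Python versions, but it is the most direct way to inspect the
+    # registered defaults and help texts.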
+    def test_additional_arguments(self):
+        # Create an instance of StopCommand
+        sc = StopCommand()
+
+        # Create a parser and an argument group for the command's arguments
+        parser = argparse.ArgumentParser()
+        subparser = parser.add_argument_group("test")
+        # Let StopCommand register its additional arguments in the group
+        sc.additional_arguments(subparser)
+        # Parse an empty argument list to see the defaults
+        args = parser.parse_args([])
+
+        # Test that the default value for cmdline_regex is set correctly
+        self.assertEqual(
+            args.cmdline_regex, "ServerMain.* -i " "[^ ]*%%NAME%%"
+        )
+
+        # Test that the help text for cmdline_regex is correctly set
+        argument_help = subparser._group_actions[-2].help
+        self.assertEqual(
+            argument_help,
+            "Show only processes where " "the command line matches this regex",
+        )
+
+        # Test that the default value for no-containers is set correctly
+        self.assertEqual(args.no_containers, False)
+
+        # Test that the help text for no-containers is correctly set
+        argument_help = subparser._group_actions[-1].help
+        self.assertEqual(
+            argument_help,
+            "Do not look for containers, " "only for native processes",
+        )
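+
+
+if __name__ == "__main__":
+    # Convenience entry point, added here as a sketch (the CI runs these
+    # tests via `pytest -v`): allows running this file directly.
+    unittest.main()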