diff --git a/Qleverfiles/Qleverfile.pubchem b/Qleverfiles/Qleverfile.pubchem
index 01010aba..98456d6d 100644
--- a/Qleverfiles/Qleverfile.pubchem
+++ b/Qleverfiles/Qleverfile.pubchem
@@ -4,40 +4,41 @@
 # qlever index # takes ~20 hours and ~40 GB RAM on an AMD Ryzen 9 5900X
 # qlever start # starts the server (takes around 2 minutes)
 #
-# When rebuilding the index and you already have pubchem.prefix-definitions, you
-# can comment out EXTRACT_PREFIXES. See the variable settings below for more
-# details.
-
-# Indexer settings
-DB = pubchem
-RDF_FILES = "ttl/*.ttl.gz"
-EXTRACT_PREFIXES = "for F in ${RDF_FILES}; do zcat -f \$F | \grep ^@prefix; done | sort -u > ${DB}.prefix-definitions"
-CAT_FILES = "zcat -f ${DB}.prefix-definitions ${RDF_FILES}"
+# TODO: The instance on https://qlever.cs.uni-freiburg.de/pubchem also contains
+# the following ontologies, which are very useful for resolving names but which
+# are not yet part of what is downloaded with GET_DATA_CMD.
+#
+# bao bfo biopax-level3 chebi cheminf cito dublin_core_terms fabio go iao ncit
+# obi pr ro sio skos so uo
+
+
+[DEFAULT]
+NAME = pubchem
+
+[data]
+GET_DATA_URL = https://ftp.ncbi.nlm.nih.gov/pubchem/RDF
+# GET_DATA_CMD: mirror the RDF dump into ttl/ (without nbr2d and nbr3d), then
+# convert each .ttl or .ttl.gz file to nt/<name>.nt.gz with Jena riot in Docker.
+GET_DATA_CMD = wget --recursive --exclude-directories=nbr2d,nbr3d --no-host-directories --no-clobber --cut-dirs=2 --directory-prefix=ttl ${GET_DATA_URL} && mkdir -p nt && find ttl \( -name "*.ttl.gz" -o -name "*.ttl" \) | parallel 'zcat -f {} | docker run --rm -i stain/jena riot --syntax=TTL --output=NT /dev/stdin 2> pubchem.ttl2nt.stderr | gzip > nt/$$(basename -s .ttl $$(basename -s .gz {})).nt.gz'
+INDEX_DESCRIPTION = PubChem RDF from ${GET_DATA_URL}, version 29.10.2023 (all folders except nbr2d and nbr3d)
+
+[index]
+FILE_NAMES = nt/*.nt.gz nt/*/*.nt.gz nt/*/*/*.nt.gz
+CAT_FILES = zcat ${FILE_NAMES}
 WITH_TEXT_INDEX = false
 STXXL_MEMORY_GB = 10
 SETTINGS_JSON = '{ "languages-internal": [""], "prefixes-external": [ "