diff --git a/Qleverfiles/Qleverfile.pubchem b/Qleverfiles/Qleverfile.pubchem index 0b45540e..0eff8c97 100644 --- a/Qleverfiles/Qleverfile.pubchem +++ b/Qleverfiles/Qleverfile.pubchem @@ -1,7 +1,7 @@ # Qleverfile for PubChem, use with https://github.com/ad-freiburg/qlever-control # -# qlever get-data # downloads+converts .gz files of size ~114 GB, takes ~5 h -# qlever index # takes ~5 h and ~20 GB RAM on an AMD Ryzen 9 5900X +# qlever get-data # downloads .gz files of total size 114 GB; see NOTES 2, 3, 4 +# qlever index # takes ~5 hours and ~20 GB RAM on an AMD Ryzen 9 5900X # qlever start # starts the server (a few seconds) # # IMPORTANT NOTES: @@ -16,21 +16,24 @@ # obi pr ro sio skos so uo # # NOTE 2: The robots.txt file from https://ftp.ncbi.nlm.nih.gov currently -# disallows downloading the PubChem RDF data using `wget --recursive`. As a -# workaround, the `GET_DATA_CMD` below uses `curl` to download the `void.ttl` -# and from that file extracts the URLs of the TTL files, which are then -# downloaded one by one. Alternatively, `wget --recursive` does work for the -# URL `ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF` (`ftp` instead of `https`). +# disallows downloading the PubChem RDF data using `wget --recursive` as in the +# GET_DATA_CMD below. As a workaround, you can write a simple Python script +# (using `BeautifulSoup` and `urllib.parse`) to scrape the URLs from the HTML +# pages and download the files individually. This was done for the latest +# version of https://qlever.cs.uni-freiburg.de/pubchem . # # NOTE 3: Many of the TTL files have generic prefix definitions in the middle -# of the file, like `@prefix ns23: <http://identifiers.org/biocyc/ARACYC:>`. +# of the file, like @prefix ns23: <http://identifiers.org/biocyc/ARACYC:> . # See https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1197113953 -# This is allowed by the standard, but very unusual. For use with QLever, the -# `GET_DATA_CMD` therefore converts the TTL files to NT before indexing. +# This is allowed by the standard, but VERY unusual. 
For use with QLever, +# convert the TTL files to NT before indexing, see GET_DATA_CMD below. # # NOTE 4: Many of the files (TTL as well as NT) contain invalid IRIs because -# spaces and braces are not properly escaped. The `GET_DATA_CMD` below -# percent-encodes each occurrence of a space, `[`, `]`, `{`, and `}` in an IRI. +# spaces and braces are not properly escaped. Here is a simple awk-based script +# to percent-encode spaces and braces in all IRIs in the NT files: +# +# for NTGZ in nt.${DATE}/*.nt.gz; do echo "zcat $NTGZ | sed 's/> />\t/1; s/> />\t/1; s/ \.\$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$i ~ /^<.*>\$/) { gsub(/ /, \"%20\", \$i); gsub(/\[/, \"%5B\", \$i); gsub(/\]/, \"%5D\", \$i); gsub(/{/, \"%7B\", \$i); gsub(/}/, \"%7D\", \$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}.FIXED/$(basename $NTGZ)"; done > fix-nt.commands.txt +# cat fix-nt.commands.txt | parallel [DEFAULT] @@ -44,7 +47,7 @@ GET_DATA_CMD = mkdir -p ttl.${DATE} && mkdir -p nt.${DATE} && ${MAKE_GET_DA INDEX_DESCRIPTION = PubChem RDF from ${GET_DATA_URL}, version ${DATE} (all folders except nbr2d and nbr3d) [index] -FILE_NAMES = nt.ONTOLOGIES/*.nt.gz nt.${DATE}/*.nt.gz +FILE_NAMES = pubchem.additional-ontologies.nt.gz nt.${DATE}/*.nt.gz CAT_FILES = zcat ${FILE_NAMES} WITH_TEXT_INDEX = false STXXL_MEMORY = 10G