From 02b2869008d845af9304a22edbce2da9a9180a2d Mon Sep 17 00:00:00 2001
From: Hannah Bast
Date: Sat, 16 Nov 2024 08:40:20 +0100
Subject: [PATCH] Improve `GET_DATA_CMD` and fix graph errors

Download the PubChem files listed in `void.ttl` with `curl` and GNU
`parallel` instead of a recursive `wget`. Write the ontology list to
`${ONTOLOGIES_DIR}/${ONTOLOGIES_CSV}` (using `echo -e`, so that each `\n`
becomes a line break) and process it with `parallel`.

`GET_DATA_CMD` checks the requirements, creates the ontology list, and
then runs both download steps, each in its own subshell because each
one changes the working directory.

Use the top-level graphs `compound`, `descriptor`, and `patent` for the
files from the respective subdirectories.
---
 src/qlever/Qleverfiles/Qleverfile.pubchem | 30 ++++++++++++-----------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/qlever/Qleverfiles/Qleverfile.pubchem b/src/qlever/Qleverfiles/Qleverfile.pubchem
index 159da7e5..497402df 100644
--- a/src/qlever/Qleverfiles/Qleverfile.pubchem
+++ b/src/qlever/Qleverfiles/Qleverfile.pubchem
@@ -14,17 +14,19 @@
 # and https://bioportal.bioontology.org/ontologies/NDF-RT .
 
 [data]
-NAME = pubchem
-GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
-ONTOLOGIES_DIR = RDF.ontologies
-PUBCHEM_DIR = RDF.pubchem
-CHECK_REQUIREMENTS = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done
-GET_DATA_CMD_1 = mkdir -p ${ONTOLOGIES_DIR} && (cd ${ONTOLOGIES_DIR} && echo -e ${ONTOLOGIES_CSV} | while IFS=',' read -r DESC FILE URL; do ERRFILE=$${FILE%.*}.jena-stderr; echo "Processing $$URL ($$FILE) ..." && curl -sLRo $$FILE $$URL && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$FILE 2> $$ERRFILE | gzip -c > $${FILE%.*}.nt.gz && rm -f $$FILE; if [ -s $$ERRFILE ]; then grep -q "ERROR *riot" $$ERRFILE && echo "riot ERRORs in $$FILE, check $$ERRFILE"; else rm $$ERRFILE; fi; done)
-GET_DATA_CMD_2 = mkdir -p ${PUBCHEM_DIR} && wget --no-verbose -r -nH --cut-dirs=2 -A '*.ttl.gz' ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF -P RDF.pubchem
-GET_DATA_CMD = ${CHECK_REQUIREMENTS} && (${GET_DATA_CMD_1} && ${GET_DATA_CMD_2}) 2>&1 | tee pubchem.get-data-log.txt
+NAME = pubchem
+GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
+ONTOLOGIES_DIR = RDF.ontologies
+PUBCHEM_DIR = RDF.pubchem
+ONTOLOGIES_CSV = ontologies.csv
+CHECK_REQUIREMENTS = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done
+GET_DATA_CMD_1 = mkdir -p ${ONTOLOGIES_DIR} && cd ${ONTOLOGIES_DIR} && cat ${ONTOLOGIES_CSV} | parallel --colsep "," 'FILE={2} && URL={3} && ERRFILE=$${FILE%.*}.jena-stderr; echo "Processing $$URL ($$FILE) ..." && curl -sLRo $$FILE $$URL && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$FILE 2> $$ERRFILE | gzip -c > $${FILE%.*}.nt.gz && rm -f $$FILE; if [ -s $$ERRFILE ]; then grep -q "ERROR *riot" $$ERRFILE && echo "riot ERRORs in $$FILE, check $$ERRFILE"; else rm $$ERRFILE; fi'
+GET_DATA_CMD_2 = mkdir -p ${PUBCHEM_DIR} && cd ${PUBCHEM_DIR} && curl -LRO ${GET_DATA_URL}/void.ttl && grep -oP "${GET_DATA_URL}/.*?\.ttl\.gz" void.ttl | parallel -j 8 'URL={1} && FILE=$$(echo $$URL | sed "s,^${GET_DATA_URL}/,,") && curl --retry 10 --create-dirs -sLRo "$$FILE" "$$URL" && echo "DONE downloading $$URL" || echo "ERROR downloading $$URL [error code $$?]"'
+# Steps 1 and 2 each `cd`, so each gets its own subshell; `;` (not `&&`) so that a non-zero exit from step 1 does not skip step 2.
+GET_DATA_CMD = ${CHECK_REQUIREMENTS} && (${MAKE_ONTOLOGIES_CSV} && (${GET_DATA_CMD_1}); (${GET_DATA_CMD_2})) 2>&1 | tee pubchem.get-data-log.txt
 VERSION = $$(date -r void.ttl +%d.%m.%Y || echo "NO_DATE")
 DESCRIPTION = PubChem RDF from ${GET_DATA_URL} (version ${VERSION}) + associated ontologies (bao, bfo, biopax-level3, chebi, cheminf, cito, dublin_core_terms, fabio, go, iao, ncit, obi, pr, ro, sio, skos, so, uo)
-ONTOLOGIES_CSV = "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\nBioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\nCHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\nChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\nCiTO,cito.nt,http://purl.org/spar/cito.nt\nDCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\nFaBiO,fabio.nt,http://purl.org/spar/fabio.nt\nGO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\nIAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\nNCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\nNDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nOBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\nOWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\nPDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\nPR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\nRDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\nRDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\nRO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\nSIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\nSKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\nSO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\nUO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl"
+MAKE_ONTOLOGIES_CSV = $$(mkdir -p ${ONTOLOGIES_DIR} && echo -e "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\nBioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\nCHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\nChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\nCiTO,cito.nt,http://purl.org/spar/cito.nt\nDCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\nFaBiO,fabio.nt,http://purl.org/spar/fabio.nt\nGO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\nIAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\nNCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\nNDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nOBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\nOWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\nPDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\nPR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\nRDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\nRDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\nRO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\nSIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\nSKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\nSO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\nUO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl" > ${ONTOLOGIES_DIR}/${ONTOLOGIES_CSV})
 
 [index]
 INPUT_FILES = ${data:ONTOLOGIES_DIR}/*.nt.gz ${data:PUBCHEM_DIR}/*/*.ttl.gz ${data:PUBCHEM_DIR}/*/*/*.ttl.gz
@@ -35,12 +37,12 @@ MULTI_INPUT_JSON = [{ "cmd": "zcat ${data:ONTOLOGIES_DIR}/*.nt.gz", "graph": "${
  { "cmd": "zcat ${data:PUBCHEM_DIR}/bioassay/*.ttl.gz", "graph": "${BASE_URL}/bioassay"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/book/*.ttl.gz", "graph": "${BASE_URL}/book"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/cell/*.ttl.gz", "graph": "${BASE_URL}/cell"},
- { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*.ttl.gz", "graph": "${BASE_URL}/compound/general"},
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*.ttl.gz", "graph": "${BASE_URL}/compound"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/concept/*.ttl.gz", "graph": "${BASE_URL}/concept"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/conserveddomain/*.ttl.gz", "graph": "${BASE_URL}/conserveddomain"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/cooccurrence/*.ttl.gz", "graph": "${BASE_URL}/cooccurrence"},
- { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*.ttl.gz", "graph": "${BASE_URL}/descriptor/compound"},
- { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/substance/*.ttl.gz", "graph": "${BASE_URL}/descriptor/substance"},
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*.ttl.gz", "graph": "${BASE_URL}/descriptor"},
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/substance/*.ttl.gz", "graph": "${BASE_URL}/descriptor"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/disease/*.ttl.gz", "graph": "${BASE_URL}/disease"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/endpoint/*.ttl.gz", "graph": "${BASE_URL}/endpoint"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/gene/*.ttl.gz", "graph": "${BASE_URL}/gene"},
@@ -50,8 +52,8 @@ MULTI_INPUT_JSON = [{ "cmd": "zcat ${data:ONTOLOGIES_DIR}/*.nt.gz", "graph": "${
  { "cmd": "zcat ${data:PUBCHEM_DIR}/measuregroup/*.ttl.gz", "graph": "${BASE_URL}/measuregroup"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/organization/*.ttl.gz", "graph": "${BASE_URL}/organization"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*.ttl.gz", "graph": "${BASE_URL}/patent"},
- { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/cpc/*.ttl.gz", "graph": "${BASE_URL}/patent/cpc"},
- { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/ipc/*.ttl.gz", "graph": "${BASE_URL}/patent/ipc"},
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/cpc/*.ttl.gz", "graph": "${BASE_URL}/patent"},
+ { "cmd": "zcat ${data:PUBCHEM_DIR}/patent/ipc/*.ttl.gz", "graph": "${BASE_URL}/patent"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/pathway/*.ttl.gz", "graph": "${BASE_URL}/pathway"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/protein/*.ttl.gz", "graph": "${BASE_URL}/protein"},
  { "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*.ttl.gz", "graph": "${BASE_URL}/reference"},