Skip to content

Commit

Permalink
Improve GET_DATA_CMD and fix graph errors
Browse files (browse the repository at this point in the history)
  • Loading branch information
Hannah Bast committed Nov 16, 2024
1 parent a599533 commit 02b2869
Showing 1 changed file with 16 additions and 14 deletions.
30 changes: 16 additions & 14 deletions src/qlever/Qleverfiles/Qleverfile.pubchem
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,19 @@
# and https://bioportal.bioontology.org/ontologies/NDF-RT .

[data]
# NOTE(review): every key in this group (NAME ... GET_DATA_CMD) is defined a
# second time further down in this [data] section. Given the hunk header
# "-14,17 +14,19", this group looks like the removed (pre-commit) side of the
# diff rendered without "-" markers -- confirm before treating both copies as
# live configuration.
NAME = pubchem
GET_DATA_URL = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
ONTOLOGIES_DIR = RDF.ontologies
PUBCHEM_DIR = RDF.pubchem
# Prints an install hint for each of `docker` / `parallel` that does not
# respond to `--version`.
CHECK_REQUIREMENTS = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done
# Inside a subshell in ONTOLOGIES_DIR: reads "description,file,url" rows from
# `echo -e` of ONTOLOGIES_CSV; per row downloads the file with curl, converts
# it to N-Triples with `riot` from the stain/jena Docker image, gzips the
# result to <basename>.nt.gz and removes the download. riot's stderr goes to
# <basename>.jena-stderr, which is deleted when empty and otherwise checked
# for "ERROR *riot".
GET_DATA_CMD_1 = mkdir -p ${ONTOLOGIES_DIR} && (cd ${ONTOLOGIES_DIR} && echo -e ${ONTOLOGIES_CSV} | while IFS=',' read -r DESC FILE URL; do ERRFILE=$${FILE%.*}.jena-stderr; echo "Processing $$URL ($$FILE) ..." && curl -sLRo $$FILE $$URL && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$FILE 2> $$ERRFILE | gzip -c > $${FILE%.*}.nt.gz && rm -f $$FILE; if [ -s $$ERRFILE ]; then grep -q "ERROR *riot" $$ERRFILE && echo "riot ERRORs in $$FILE, check $$ERRFILE"; else rm $$ERRFILE; fi; done)
# Recursive wget of all *.ttl.gz files; the URL and target directory are
# written out literally here instead of using GET_DATA_URL / PUBCHEM_DIR.
GET_DATA_CMD_2 = mkdir -p ${PUBCHEM_DIR} && wget --no-verbose -r -nH --cut-dirs=2 -A '*.ttl.gz' ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF -P RDF.pubchem
# Requirements check, then both download steps, with combined stdout/stderr
# copied to pubchem.get-data-log.txt via tee.
GET_DATA_CMD = ${CHECK_REQUIREMENTS} && (${GET_DATA_CMD_1} && ${GET_DATA_CMD_2}) 2>&1 | tee pubchem.get-data-log.txt
# Dataset name, base URL of the PubChem RDF download, and the two working
# directories used by the download commands in this section.
NAME                = pubchem
GET_DATA_URL        = ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF
ONTOLOGIES_DIR      = RDF.ontologies
PUBCHEM_DIR         = RDF.pubchem
# File name of the ontology list; GET_DATA_CMD_1 reads it with `cat` after
# changing into ONTOLOGIES_DIR, and MAKE_ONTOLOGIES_CSV writes it there.
ONTOLOGIES_CSV      = ontologies.csv
# Prints an install hint for each of `docker` / `parallel` that does not
# respond to `--version`. `$$` is presumably the escape for a literal `$`
# (configparser-style extended interpolation) -- confirm against the parser.
# NOTE(review): a shell `for` loop returns the status of its last iteration,
# so a missing `docker` followed by a working `parallel` still yields success
# and a following `&&` chain would continue -- confirm whether that is intended.
CHECK_REQUIREMENTS  = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done
# Step 1 (ontologies): changes into ONTOLOGIES_DIR and feeds the rows of the
# ontology CSV to `parallel`, splitting columns on "," (column 2 = file name,
# column 3 = URL). Each job downloads the file with curl, converts it to
# N-Triples with `riot` from the stain/jena Docker image, gzips the result to
# <basename>.nt.gz and removes the download. riot's stderr is written to
# <basename>.jena-stderr; that file is deleted when empty, otherwise it is
# kept and a message is printed if it matches "ERROR *riot".
# NOTE(review): the `cd` is not wrapped in a subshell, so any command chained
# after this one with `&&` (as in the commented-out GET_DATA_CMD) would start
# inside ONTOLOGIES_DIR -- confirm.
# NOTE(review): nothing visible in this section runs MAKE_ONTOLOGIES_CSV before
# this command, so the CSV file presumably has to exist already -- confirm.
GET_DATA_CMD_1      = mkdir -p ${ONTOLOGIES_DIR} && cd ${ONTOLOGIES_DIR} && cat ${ONTOLOGIES_CSV} | parallel --colsep "," 'FILE={2} && URL={3} && ERRFILE=$${FILE%.*}.jena-stderr; echo "Processing $$URL ($$FILE) ..." && curl -sLRo $$FILE $$URL && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$FILE 2> $$ERRFILE | gzip -c > $${FILE%.*}.nt.gz && rm -f $$FILE; if [ -s $$ERRFILE ]; then grep -q "ERROR *riot" $$ERRFILE && echo "riot ERRORs in $$FILE, check $$ERRFILE"; else rm $$ERRFILE; fi'
# Step 2 (PubChem RDF): changes into PUBCHEM_DIR, downloads void.ttl, extracts
# from it every URL under GET_DATA_URL that ends in .ttl.gz (`grep -oP`, i.e.
# Perl-style regex support is required), and downloads those URLs with 8
# parallel jobs. Each file is stored under its path relative to GET_DATA_URL
# (curl creates missing directories and retries up to 10 times), and one
# "DONE downloading" or "ERROR downloading" line is printed per URL.
# NOTE(review): because of the trailing `|| echo ...`, a job ends with success
# even when its download failed, so failures are visible only in the output.
GET_DATA_CMD_2      = mkdir -p ${PUBCHEM_DIR} && cd ${PUBCHEM_DIR} && curl -LRO ${GET_DATA_URL}/void.ttl && grep -oP "${GET_DATA_URL}/.*?\.ttl\.gz" void.ttl | parallel -j 8 'URL={1} && FILE=$$(echo $$URL | sed "s,^${GET_DATA_URL}/,,") && curl --retry 10 --create-dirs -sLRo "$$FILE" "$$URL" && echo "DONE downloading $$URL" || echo "ERROR downloading $$URL [error code $$?]"'
# The active setting runs only step 2 (the PubChem download). The full
# pipeline -- requirements check, both steps, output copied to
# pubchem.get-data-log.txt -- is kept on the commented-out line below.
# NOTE(review): with the active setting, CHECK_REQUIREMENTS is never run and
# nothing produces the *.nt.gz ontology files that INPUT_FILES in [index]
# globs for; this looks like a temporary setting -- confirm.
GET_DATA_CMD        = ${GET_DATA_CMD_2}
# GET_DATA_CMD        = ${CHECK_REQUIREMENTS} && (${GET_DATA_CMD_1} && ${GET_DATA_CMD_2}) 2>&1 | tee pubchem.get-data-log.txt
# Shell command substitution: modification time of void.ttl formatted as
# day.month.year, or the literal NO_DATE when `date` fails.
# NOTE(review): GET_DATA_CMD_2 downloads void.ttl into PUBCHEM_DIR, but the
# path here is relative to wherever this expression is evaluated -- confirm
# that it resolves to the downloaded file.
VERSION             = $$(date -r void.ttl +%d.%m.%Y || echo "NO_DATE")
# Human-readable description embedding the download URL and VERSION.
# NOTE(review): the ontology list in parentheses does not mention every row of
# the ontology CSV in this section (NDF-RT, OWL, PDBo, RDF Schema and RDF are
# absent) -- confirm whether that is deliberate.
DESCRIPTION         = PubChem RDF from ${GET_DATA_URL} (version ${VERSION}) + associated ontologies (bao, bfo, biopax-level3, chebi, cheminf, cito, dublin_core_terms, fabio, go, iao, ncit, obi, pr, ro, sio, skos, so, uo)
# Inline ontology list: "description,file,url" rows separated by "\n", the form
# consumed by the `echo -e ... | while read` variant of GET_DATA_CMD_1.
# NOTE(review): ONTOLOGIES_CSV is also set to a plain file name earlier in this
# section; this inline definition looks like the removed side of the diff --
# confirm which one is live.
# NOTE(review): two of the URLs carry an `apikey` query parameter, i.e. a
# credential stored in plain text.
ONTOLOGIES_CSV = "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\nBioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\nCHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\nChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\nCiTO,cito.nt,http://purl.org/spar/cito.nt\nDCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\nFaBiO,fabio.nt,http://purl.org/spar/fabio.nt\nGO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\nIAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\nNCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\nNDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nOBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\nOWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\nPDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\nPR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\nRDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\nRDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\nRO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\nSIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\nSKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\nSO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\nUO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl"
# Shell command substitution that creates ONTOLOGIES_DIR and writes the
# ontology list ("description,file,url" rows separated by "\n") to the file
# ONTOLOGIES_CSV inside that directory.
# NOTE(review): this uses plain `echo` without `-e`; whether "\n" is turned
# into real line breaks depends on the shell's `echo` -- confirm that the file
# ends up with one row per line, which GET_DATA_CMD_1 relies on.
# NOTE(review): no command visible in this section references
# MAKE_ONTOLOGIES_CSV -- confirm where it is invoked.
# NOTE(review): two of the URLs carry an `apikey` query parameter, i.e. a
# credential stored in plain text.
MAKE_ONTOLOGIES_CSV = $$(mkdir -p ${ONTOLOGIES_DIR} && echo "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\nBioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\nCHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\nChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\nCiTO,cito.nt,http://purl.org/spar/cito.nt\nDCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\nFaBiO,fabio.nt,http://purl.org/spar/fabio.nt\nGO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\nIAO - Information Artifact Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\nNCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\nNDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nOBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\nOWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\nPDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\nPR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\nRDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\nRDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\nRO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\nSIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\nSKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\nSO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\nUO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl" > ${ONTOLOGIES_DIR}/${ONTOLOGIES_CSV})

[index]
# Index input globs: the gzipped N-Triples files in the ontologies directory
# plus the gzipped Turtle files one and two directory levels below the PubChem
# directory (both directory names come from the [data] section).
INPUT_FILES      = ${data:ONTOLOGIES_DIR}/*.nt.gz ${data:PUBCHEM_DIR}/*/*.ttl.gz ${data:PUBCHEM_DIR}/*/*/*.ttl.gz
Expand All @@ -35,12 +37,12 @@ MULTI_INPUT_JSON = [{ "cmd": "zcat ${data:ONTOLOGIES_DIR}/*.nt.gz", "graph": "${
{ "cmd": "zcat ${data:PUBCHEM_DIR}/bioassay/*.ttl.gz", "graph": "${BASE_URL}/bioassay"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/book/*.ttl.gz", "graph": "${BASE_URL}/book"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/cell/*.ttl.gz", "graph": "${BASE_URL}/cell"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*.ttl.gz", "graph": "${BASE_URL}/compound/general"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/compound/general/*.ttl.gz", "graph": "${BASE_URL}/compound"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/concept/*.ttl.gz", "graph": "${BASE_URL}/concept"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/conserveddomain/*.ttl.gz", "graph": "${BASE_URL}/conserveddomain"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/cooccurrence/*.ttl.gz", "graph": "${BASE_URL}/cooccurrence"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*.ttl.gz", "graph": "${BASE_URL}/descriptor/compound"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/substance/*.ttl.gz", "graph": "${BASE_URL}/descriptor/substance"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/compound/*.ttl.gz", "graph": "${BASE_URL}/descriptor"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/descriptor/substance/*.ttl.gz", "graph": "${BASE_URL}/descriptor"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/disease/*.ttl.gz", "graph": "${BASE_URL}/disease"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/endpoint/*.ttl.gz", "graph": "${BASE_URL}/endpoint"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/gene/*.ttl.gz", "graph": "${BASE_URL}/gene"},
Expand All @@ -50,8 +52,8 @@ MULTI_INPUT_JSON = [{ "cmd": "zcat ${data:ONTOLOGIES_DIR}/*.nt.gz", "graph": "${
{ "cmd": "zcat ${data:PUBCHEM_DIR}/measuregroup/*.ttl.gz", "graph": "${BASE_URL}/measuregroup"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/organization/*.ttl.gz", "graph": "${BASE_URL}/organization"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/patent/*.ttl.gz", "graph": "${BASE_URL}/patent"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/patent/cpc/*.ttl.gz", "graph": "${BASE_URL}/patent/cpc"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/patent/ipc/*.ttl.gz", "graph": "${BASE_URL}/patent/ipc"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/patent/cpc/*.ttl.gz", "graph": "${BASE_URL}/patent"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/patent/ipc/*.ttl.gz", "graph": "${BASE_URL}/patent"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/pathway/*.ttl.gz", "graph": "${BASE_URL}/pathway"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/protein/*.ttl.gz", "graph": "${BASE_URL}/protein"},
{ "cmd": "zcat ${data:PUBCHEM_DIR}/reference/*.ttl.gz", "graph": "${BASE_URL}/reference"},
Expand Down

0 comments on commit 02b2869

Please sign in to comment.