"""Configuration constants for the summary-statistics service (config.py).

Many settings fall back to a built-in default and can be overridden through
environment variables.
"""
# flake8: noqa: W291,E501
import os
from enum import Enum
def _env_variable_else(env_var_name, default):
    """Return the value of an environment variable, or a fallback.

    Args:
        env_var_name: Name of the environment variable to look up.
        default: Value returned when the variable is unset or set to an
            empty string.

    Returns:
        The variable's string value when it is set and non-empty,
        otherwise ``default``.
    """
    # Read the environment once; `or` keeps the rule that an empty string
    # counts as "not set".
    return os.environ.get(env_var_name) or default
# --- Filesystem locations --- #
# Settings wrapped in _env_variable_else read the named environment variable
# and fall back to the second argument when it is unset or empty.
DB_PATH = "./data/sumstats_meta.db"
STORAGE_PATH = _env_variable_else("STORAGE_PATH", "./data")
LOGGING_PATH = "../logs"
STAGING_PATH = _env_variable_else("STAGING_PATH", "depo_ss_staging")
VALIDATED_PATH = _env_variable_else("VALIDATED_PATH", "depo_ss_validated")
SW_PATH = _env_variable_else("SW_PATH", "./bin")
DEPO_PATH = _env_variable_else("DEPO_PATH", "./depo_data")
# NOTE(review): the fallback is the same "./depo_data" path used for DEPO_PATH
# above — confirm this default is intended for CONTAINERISE.
CONTAINERISE = _env_variable_else("CONTAINERISE", "./depo_data")
FTP_STAGING_PATH = _env_variable_else("FTP_STAGING_PATH", "depo_ss_staging_ftp")
# --- Rabbit and Celery --- #
# Broker scheme, host and port are fixed here (not read from the environment).
BROKER = "amqp"
BROKER_HOST = "rabbitmq"
BROKER_PORT = 5672
# The first two queues below were required for the EHK + EBI SLURM cluster
# install (pre-validation and post-validation) but are not required
# if the worker is able to perform validation and see the database.
CELERY_QUEUE1 = _env_variable_else("CELERY_QUEUE1", "preval")
CELERY_QUEUE2 = _env_variable_else("CELERY_QUEUE2", "postval")
# Third queue; its default name is "metadata-yml-update".
CELERY_QUEUE3 = _env_variable_else("CELERY_QUEUE3", "metadata-yml-update")
# --- Remote --- #
# NOTE(review): defaults to the boolean False, but when VALIDATE_WITH_SSH is
# set in the environment the value is a non-empty string, which is truthy
# even for "False" or "0" — confirm how consumers interpret it.
VALIDATE_WITH_SSH = _env_variable_else("VALIDATE_WITH_SSH", False)
# Login node and username have no built-in default (None when unset).
COMPUTE_FARM_LOGIN_NODE = _env_variable_else("COMPUTE_FARM_LOGIN_NODE", None)
COMPUTE_FARM_USERNAME = _env_variable_else("COMPUTE_FARM_USERNAME", None)
# The standard queue name is fixed; only the long queue is overridable.
COMPUTE_FARM_QUEUE = "production"
COMPUTE_FARM_QUEUE_LONG = _env_variable_else("COMPUTE_FARM_QUEUE_LONG", "production")
# Proxy settings default to None when the variables are unset.
REMOTE_HTTP_PROXY = _env_variable_else("REMOTE_HTTP_PROXY", None)
REMOTE_HTTPS_PROXY = _env_variable_else("REMOTE_HTTPS_PROXY", None)
# Singularity image name and tag.
SINGULARITY_IMAGE = _env_variable_else("SINGULARITY_IMAGE", "gwas-sumstats-service")
SINGULARITY_TAG = _env_variable_else("SINGULARITY_TAG", "latest")
# --- MONGO DB --- #
# URI and database name default to None; user and password default to the
# empty string.
MONGO_URI = _env_variable_else("MONGO_URI", None)
MONGO_USER = _env_variable_else("MONGO_USER", "")
MONGO_PASSWORD = _env_variable_else("MONGO_PASSWORD", "")
MONGO_DB = _env_variable_else("MONGO_DB", None)
# Sentinel string "NR"; the same value appears in VALID_ASSEMBLIES.
# Presumably stands for "not reported" — TODO confirm.
NR = "NR"
# --- File transfer (FTP and Globus) config --- #
# FTP server and credentials have no built-in default (None when unset).
FTP_SERVER = _env_variable_else("FTP_SERVER", None)
FTP_USERNAME = _env_variable_else("FTP_USERNAME", None)
FTP_PASSWORD = _env_variable_else("FTP_PASSWORD", None)
FTP_SERVER_EBI = _env_variable_else("FTP_SERVER_EBI", "ftp.ebi.ac.uk")
FTP_PREFIX = _env_variable_else("FTP_PREFIX", "/pub/databases/gwas/summary_statistics")
# Fixed Globus auth values (not overridable from the environment).
TOKEN_FILE = "refresh-tokens.json"
REDIRECT_URI = "https://auth.globus.org/v2/web/auth-code"
# The two adjacent literals are concatenated into one space-separated string.
SCOPES = "openid email profile " "urn:globus:auth:scope:transfer.api.globus.org:all"
# Identifiers and secrets below default to None when unset.
GWAS_ENDPOINT_ID = _env_variable_else("GWAS_ENDPOINT_ID", None)
CLIENT_SECRET = _env_variable_else("CLIENT_SECRET", None)
CLIENT_ID = _env_variable_else("CLIENT_ID", None)
TRANSFER_CLIENT_ID = _env_variable_else("TRANSFER_CLIENT_ID", None)
GWAS_GLOBUS_GROUP = _env_variable_else("GWAS_GLOBUS_GROUP", None)
GLOBUS_HOSTNAME = _env_variable_else("GLOBUS_HOSTNAME", None)
DEPO_API_AUTH_TOKEN = _env_variable_else("DEPO_API_AUTH_TOKEN", None)
OUTPUT_PATH = _env_variable_else("OUTPUT_PATH", "metadata/output")
MAPPED_COLLECTION_ID = _env_variable_else("MAPPED_COLLECTION_ID", None)
STORAGE_GATEWAY_ID = _env_variable_else("STORAGE_GATEWAY_ID", None)
# Falls back to a hard-coded UUID when GWAS_IDENTITY is unset.
GWAS_IDENTITY = _env_variable_else(
"GWAS_IDENTITY", "66dab3b3-b880-4017-b496-9643da909b89"
)
# --- Mail --- #
# NOTE(review): the "[email protected]" defaults look like addresses that were
# scrubbed by an e-mail obfuscation filter — confirm the real default
# sender/recipient before relying on them.
MAIL_FROM = _env_variable_else("MAIL_FROM", "[email protected]")
MAIL_TO = _env_variable_else("MAIL_TO", "[email protected]")
MAIL_SERVER = _env_variable_else("MAIL_SERVER", "outgoing.ebi.ac.uk")
# The port is kept as a string, both for the default and when read from the
# environment.
MAIL_PORT = _env_variable_else("MAIL_PORT", "587")
# --- SQLite schema --- #
# Idempotent bootstrap script: creates the `studies` and `errors` tables if
# they are missing and seeds `errors` with codes 1-5 (INSERT OR IGNORE makes
# re-running it a no-op).
# The seed values are written as single-quoted SQL string literals. In SQL,
# double quotes denote identifiers; SQLite only accepts them as strings through
# a legacy fallback that can be disabled at build time, so single quotes are
# the portable form.
# NOTE(review): only codes 1-5 are seeded here while VALIDATION_ERRORS lists
# ids 1-12 — confirm whether the SQLite table is expected to hold the rest.
DB_SCHEMA = """
PRAGMA foreign_keys = ON;
CREATE TABLE IF NOT EXISTS studies (
studyID TEXT NOT NULL UNIQUE,
callbackID TEXT,
filePath TEXT,
md5 TEXT,
assembly TEXT,
retrieved INT CHECK (retrieved IN (0,1)),
dataValid INT CHECK (dataValid IN (0,1)),
errorCode INT,
readme TEXT,
entryUUID TEXT,
FOREIGN KEY(errorCode) REFERENCES errors(id)
);
CREATE TABLE IF NOT EXISTS errors (
id INTEGER PRIMARY KEY,
errorText TEXT UNIQUE
);
BEGIN TRANSACTION;
INSERT OR IGNORE INTO errors(id, errorText) VALUES(1, 'The summary statistics file cannot be found'); -- 1
INSERT OR IGNORE INTO errors(id, errorText) VALUES(2, 'The md5sum of the summary statistics file does not match the one provided'); -- 2
INSERT OR IGNORE INTO errors(id, errorText) VALUES(3, 'Summary statistics file validation failed, please run the validator on your file to see the errors (available here: https://pypi.org/project/gwas-sumstats-tools/)'); -- 3
INSERT OR IGNORE INTO errors(id, errorText) VALUES(4, 'Missing mandatory field, you must provide (i) file path/URL, (ii) md5 sum and (iii) genome assembly for each file'); -- 4
INSERT OR IGNORE INTO errors(id, errorText) VALUES(5, 'Genome assembly invalid - please see documentation for valid assemblies'); -- 5
COMMIT;
"""
# Catalogue of validation errors as {"id", "errorText"} records. Ids are
# assigned consecutively from 1 in the order the messages are listed.
VALIDATION_ERRORS = [
    {"id": error_id, "errorText": error_text}
    for error_id, error_text in enumerate(
        (
            "The summary statistics file cannot be found",
            "The md5sum of the summary statistics file does not match the one provided",
            "Summary statistics file validation failed, please run the validator on your file to see the errors (available here: https://pypi.org/project/gwas-sumstats-tools/)",
            "Missing mandatory field, you must provide (i) file path/URL, (ii) md5 sum and (iii) genome assembly for each file",
            "Genome assembly invalid - please see documentation for valid assemblies",
            "Summary statistics file validation failed: File extension error, please run the validator on your file to see the errors (available here: https://pypi.org/project/gwas-sumstats-tools/)",
            "Summary statistics file validation failed: File header error, please run the validator on your file to see the errors (available here: https://pypi.org/project/gwas-sumstats-tools/)",
            "Summary statistics file validation failed: File squareness error, please run the validator on your file to see the errors (available here: https://pypi.org/project/gwas-sumstats-tools/)",
            "Summary statistics file validation failed: File contains fewer than 100,000 rows. If you have fewer than 100,000 variants in your dataset, please contact [email protected] for further advice.",
            "There is a problem on our side, please contact [email protected] for further advice.",
            "The raw sumstats file can not be found",
            "Analysis software must be provided in the metadata template for summary statistics containing p-values equal to zero.",
        ),
        start=1,
    )
]
# Accepted genome assembly labels; the list also contains the "NR" sentinel.
VALID_ASSEMBLIES = ["GRCh38", "GRCh37", "NCBI36", "NCBI35", "NCBI34", "NR"]
LATEST_ASSEMBLY = "GRCh38"
# Presumably harmonisation ("HM") settings — TODO confirm against consumers.
HM_COORDINATE_SYSTEM = "1-based"
HM_REFERENCE = "ftp://ftp.ensembl.org/pub/release-95/fasta/homo_sapiens/dna/"
# Nextflow configuration text for the SLURM executor. The Singularity cache
# directory is filled in from SINGULARITY_CACHEDIR, falling back to
# "./singularity_cache".
NEXTFLOW_CONFIG = "".join(
    [
        "executor.name = 'slurm'\n",
        "process.executor = 'slurm'\n",
        "executor.queueSize = 100\n",
        "singularity.cacheDir = '{sing_cache_dir}'\n",
    ]
).format_map(
    {
        "sing_cache_dir": _env_variable_else(
            "SINGULARITY_CACHEDIR", "./singularity_cache"
        )
    }
)
# Maps submission-template column headers to metadata field names.
SUBMISSION_TEMPLATE_HEADER_MAP = dict(
    (
        ("Genotyping technology", "genotyping_technology"),
        ("Number of individuals", "sample_size"),
        ("Ancestry category", "sample_ancestry"),
        ("Reported trait", "trait_description"),
        ("MAF lower limit", "minor_allele_freq_lower_limit"),
        ("Ancestry method", "ancestry_method"),
        ("Case control study", "case_control_study"),
        ("Number of cases", "case_count"),
        ("Number of controls", "control_count"),
        ("Summary statistics assembly", "genome_assembly"),
        ("Analysis Software", "analysis_software"),
        ("Imputation panel", "imputation_panel"),
        ("Imputation software", "imputation_software"),
        ("Adjusted covariates", "adjusted_covariates"),
        ("Mapped trait", "ontology_mapping"),
        ("Readme text", "author_notes"),
        ("Coordinate system", "coordinate_system"),
        ("Sex", "sex"),
    )
)
# Same mapping with the "Readme text" header replaced by "Readme file"
# (same position, same target field); every other entry is unchanged.
SUBMISSION_TEMPLATE_HEADER_MAP_pre1_8 = dict(
    ("Readme file", field) if header == "Readme text" else (header, field)
    for header, field in SUBMISSION_TEMPLATE_HEADER_MAP.items()
)
# Field-name groups. Judging by the names, the *_TO_SPLIT fields hold
# multi-valued text and the *_BOOLS fields hold booleans — TODO confirm
# against the code that consumes them.
STUDY_FIELD_TO_SPLIT = (
"genotyping_technology",
"trait_description",
"ontology_mapping",
"adjusted_covariates",
)
STUDY_FIELD_BOOLS = ("is_harmonised", "is_sorted")
SAMPLE_FIELD_TO_SPLIT = ("ancestry_method", "sample_ancestry")
# NOTE(review): unlike the three tuples above this is a bare string, so
# `x in SAMPLE_FIELD_BOOLS` is a substring test and iterating it yields single
# characters — confirm whether a one-element tuple was intended.
SAMPLE_FIELD_BOOLS = "case_control_study"
# File-type label; note the environment variable is SSF_VERSION, not
# SUMSTATS_FILE_TYPE.
SUMSTATS_FILE_TYPE = _env_variable_else("SSF_VERSION", "GWAS-SSF v1.0")
# REST endpoints: the study URL is fixed, the deposition API URL is
# overridable via GWAS_DEPO_REST_API_URL.
GWAS_CATALOG_REST_API_STUDY_URL = "https://www.ebi.ac.uk/gwas/rest/api/studies/"
GWAS_DEPO_REST_API_URL = _env_variable_else(
"GWAS_DEPO_REST_API_URL", "https://www.ebi.ac.uk/gwas/deposition/api/v1/"
)
class MetadataYamlStatus(Enum):
    """Closed set of status values, each backed by a lowercase string.

    Members can be looked up from their string value, e.g.
    ``MetadataYamlStatus("in-progress")``. Presumably these track the state
    of a metadata YAML update — confirm against the code that uses them.
    """

    PENDING = "pending"
    IN_PROGRESS = "in-progress"  # hyphenated value, unlike the member name
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"