forked from vidboda/nenufaar
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmnenufaar.sh
executable file
·385 lines (315 loc) · 12.4 KB
/
mnenufaar.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
#!/usr/bin/sh
###########################################################################
######### ###########
######### Meta nenufaar ###########
######### @uthor : D Baux david.baux<at>inserm.fr ###########
######### Date : 12/05/2016 ###########
######### ###########
###########################################################################
VERSION=0.1.1
USAGE="
Program: mnenufaar
Version: ${VERSION}
Contact: Baux David <david.baux<at>inserm.fr>
Usage: $(basename "$0") [options] -- program to perform an nenufaar analyses of families: merge BAMs and call variants on merged BAM. Single VCF output for all samples.
Example:
sh mnenufaar.sh -i=path/to/run/folder/
Arguments:
-i, --input_path set the absolute path to input directory (must be created before script execution)
Options:
-h, --help shows this help text
-c, --caller Chooses which GATK variant caller to use: ug: UnifiedGenotyper or hc, HaplotypeCaller default hc
-up, --use_platypus true, false Platypus variant calling - Default behaviour is GATK variant calling, then Platypus and VCF merging using GATK CombineVariants
-dcov, --downsample_to_coverage GATK option default 1000 - selects the variant caller, HaplotypeCaller if dcov <= 1000, or UnifiedGenotyper - DEPRECATED IN THIS VERSION
-a, --annotator Name of annotator: cava (output to textfile), annovar (output to both text and vcf file), vep, snpeff (both output to VCF), or merge to generate a merged cava/annovar file (hg19), default no annotation
-f, --filter combined with annovar only, filters out variants with MAF > 1% in ExAC, ESP or 1KG, true/false, default false. Warning: does not produce the annotated VCF, only tab delimited file.
-g, --genome Version of genome (assembly), either hg19 or hg38, default hg19
-p, --protocol Protocol used to select sequences: capture/amplicon/wgs, default capture
-o, --output_path Sets the absolute path to output directory (must be created before script execution)
-r, --reference Path to genome fasta reference file
-snp, --snp Sets the absolute path to vcf file for dbSNP
-indel1, --indel1 Sets the absolute path to vcf file for indels reference (1000G_phase1.indels sor Mills_and_1000G_gold_standard)
-indel2, --indel2 Sets the absolute path to vcf file for indels reference (1000G_phase1.indels or Mills_and_1000G_gold_standard)S
-hsm, --hsmetrics Boolean true,false: Asks for Picard HsMetrics calculation: necessitates a Picard.intervals.list (target picard file) file at the root of the run folder, default false - can also take a baits interval file named Picard.baits.intervals.list
-l, --gene_list path to a txt file with a #NAME and a list of genes to be marked in a annovar file
Docs:
http://bio-bwa.sourceforge.net/bwa.shtml
http://www.htslib.org/doc/samtools-1.2.html
http://broadinstitute.github.io/picard/command-line-overview.html
https://www.broadinstitute.org/gatk/guide/best-practices?bpm=DNAseq
https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_engine_CommandLineGATK.php#--downsample_to_coverage
http://www.well.ox.ac.uk/cava
http://annovar.openbioinformatics.org/en/latest/
https://github.com/lindenb/jvarkit
# <my run folder>
# |
# <Intervals.list>
# |
# <Picard.intervals.list> #optionnal, mandatory if -hsm=true
# |
# <sample_1>
# | |
# | |
# | <sample_1.R1.fastq.gz>
# | |
# | <sample_1.R2.fastq.gz>
# | |
# |
# <sample_2>
# | |
# | |
# | <sample_2.R1.fastq.gz>
# | |
# | <sample_2.R2.fastq.gz>
# |
# |
# <etc> ....
"
############## If no options are given, print help message #################################
if [ $# -eq 0 ]; then
echo "${USAGE}"
echo "Error Message : No arguments provided"
echo ""
exit 1
fi
############### Get options from conf file #################################
CONFIG_FILE=mnenufaar.conf
#we check params against regexp
UNKNOWN=`cat ${CONFIG_FILE} | grep -Evi "^(#.*|[A-Z0-9_]*=[a-z0-9_ \.\/\$\{\}\(\)\"\'=-]*)$"`
if [ -n "${UNKNOWN}" ]; then
echo "Error in config file. Not allowed lines:"
echo ${UNKNOWN}
exit 1
fi
source ./${CONFIG_FILE}
############### Get arguments from command line #################################
while [[ "$#" -gt 0 ]]
do
KEY="$1"
case "${KEY}" in
-i|--input) #mandatory
INPUT_PATH="$2"
shift
;;
-o|--output)
OUTPUT_PATH="$2"
shift
;;
-r|--reference)
REF_PATH="$2"
shift
;;
-snp|--snp)
SNP_PATH="$2"
shift
;;
-indel1|--indel1)
INDEL1="$2"
shift
;;
-indel2|--indel2)
INDEL2="$2"
shift
;;
-dcov|--downsample_to_coverage)
DCOV="$2"
shift
;;
-g|--genome) #default hg19
GENOME="$2"
shift
;;
-c|--caller) #default HC
CALLER="$2"
shift
;;
-p|--protocol) #default capture
PROTOCOL="$2"
shift
;;
-hsm|--hsmetrics) #default false
HSMETRICS="$2"
shift
;;
-a|--annotator)
ANNOTATOR="$2"
shift
;;
-f|--filter) #default false
FILTER="$2"
shift
;;
-up|--use_platypus) #default true
USE_PLATYPUS="$2"
shift
;;
-h|--help)
echo "${USAGE}"
exit 1
;;
-l|--gene_list)
LIST="$2"
shift
;;
-cu|--clean_up) #default true
CLEAN_UP="$2"
shift
;;
-log|--log-file)
LOG_FILE="$2"
shift
;;
*)
echo "Error Message : Unknown option ${KEY}" # unknown option
exit
;;
esac
shift
done
mkdir ${OUTPUT_PATH}
if [ ${#LOG_FILE} -eq 0 ];then
LOG_FILE="${OUTPUT_PATH}mnenufaar_${ID}.log"
fi
echo ""
echo "#############################################################################################"
echo "mnenufaar ${VERSION} will be launched. tail -f ${LOG_FILE} for detailed information. "
echo "#############################################################################################"
touch ${LOG_FILE}
exec &>${LOG_FILE}
echo ""
echo "#############################################################################################"
echo "Config File ${CONFIG_FILE} successfully loaded - `date`"
echo "##############################################################################################"
######## add / to INPUT_PATH if needed
if [[ "${INPUT_PATH}" =~ .+[^\/]$ ]];then
INPUT_PATH="${INPUT_PATH}/"
fi
######## change assembly if necessary ########
if [[ "${GENOME}" == 'hg38' ]]; then
REF_PATH='refData/genome/hg38/hg38.fa'
SNP_PATH='refData/dbSNP/144_hg38/dbsnp_144.hg38.vcf.gz'
INDEL1='refData/indelReference/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'
INDEL2='refData/indelReference/hg38/Homo_sapiens_assembly38.known_indels.vcf.gz'
fi
validate_genome() { echo "hg19 hg38" | grep -F -q -w "$1"; }
if [ "${GENOME}" != 0 ]; then
validate_genome "${GENOME}" && echo "VALID GENOME OPTION = ${GENOME}" || { echo "INVALID GENOME OPTION = ${GENOME} -> see help (-h)" && exit 1; }
fi
if [ "${LIST}" != '' ] && [ ! -e "${LIST}" ]; then
echo 'GENE LIST FILE ${LIST} DOES NOT EXIST -> see help (-h)' && exit 1;
fi
if [ "${LIST}" != '' ]; then
LIST="-l ${LIST}"
fi
DATE1=$(date +"%s")
##########################################################################
################## Functions Declaration #######################
###########################################################################
#https://wikis.utexas.edu/display/bioiteam/Example+BWA+alignment+script
# general function that exits after printing its text argument
# in a standard format which can be easily grep'd.
err() {
echo "[$(basename "$0")]...The script has terminated unexpectedly $1";
echo "[$(basename "$0")]...The script has terminated unexpectedly $1";
exit 1 # any non-0 exit code signals an error
}
# function to check return code of programs.
# exits with standard message if code is non-zero;
# otherwise displays completiong message and date.
# arg 1 is the return code (usually $?)
# arg2 is text describing what ran
ckRes() {
if [ "$1" == '0' ]; then
echo "[$(basename "$0")]...Done - $2 - `date`";
else
err "[$(basename "$0")]...$2 returned non-0 exit code $1";
fi
}
# function that checks if a file exists
# arg 1 is the file name
# arg2 is text describing the file (optional)
ckFile() {
if [ ! -e "$1" ]; then
err "[$(basename "$0")]...$2 File '$1' not found";
fi
}
# function that checks if a file exists and
# that it has non-0 length. needed because
# programs don't always return non-0 return
# codes, and worse, they also create their
# output file with 0 length so that just
# checking for its existence is not enough
# to ensure the program ran properly
ckFileSz() {
ckFile $1 $2;
SZ=$(ls -l $1 | awk '{print $5}');
if [ "$SZ" == '0' ]; then
err "[$(basename "$0")]...$2 File '$1' is zero length";
fi
}
###########################################################################
###########################################################################
###########################################################################
###
###nenufaar generate BAM for each sample
###
echo "COMMAND: nohup ${SHELL} ${NENUFAAR} -i ${INPUT_PATH} -o ${OUTPUT_PATH} -r ${REF_PATH} -snp ${SNP_PATH} -indel1 ${INDEL1} -indel2 ${INDEL2} -g ${GENOME} -dcov ${DCOV} -c ${CALLER} -p ${PROTOCOL} -hsm ${HSMETRICS} -id ${ID} -b "
nohup ${SHELL} ${NENUFAAR} -i ${INPUT_PATH} -o ${OUTPUT_PATH} -r ${REF_PATH} -snp ${SNP_PATH} -indel1 ${INDEL1} -indel2 ${INDEL2} -g ${GENOME} -dcov ${DCOV} -c ${CALLER} -p ${PROTOCOL} -hsm ${HSMETRICS} -id ${ID} -b
#I often have an error wifth a fi at the end of nenufaar script, so commented until fixed
STATUS=$?
if [ "${STATUS}" -ne 0 ];then
echo "###########################################################################"
echo "There has been an issue with BAM generation - check log files - mnenufaar will exit"
echo "###########################################################################"
exit 1
fi
###
###merge all BAMs - first move them in a common directory
###
RUN_BASEDIR_NAME=$(basename "${INPUT_PATH}")
BAMS_PARENT_DIR=${OUTPUT_PATH}${RUN_BASEDIR_NAME};
BAMS_DIR=$(find ${BAMS_PARENT_DIR} -mindepth 1 -maxdepth 1 -type d)
mkdir -p ${OUTPUT_PATH}${RUN_BASEDIR_NAME}/MERGED_SAMPLES
MERGED_DIR=${OUTPUT_PATH}${RUN_BASEDIR_NAME}/MERGED_SAMPLES/
for CURRENT_BAM_DIR in ${BAMS_DIR[@]}
do
mv ${CURRENT_BAM_DIR}/${ID}/*.ba* ${MERGED_DIR}
done
BAMS_FILES=(${MERGED_DIR}*.bam)
BAIS_FILES=(${MERGED_DIR}*.bai)
echo "#############################################################################################"
echo "SAMBAMBA : merge - `date` ID_ANALYSE : ${ID} - Run : ${RUN_BASEDIR_NAME} - SAMPLES : ${BAMS_FILES[@]}"
echo "#############################################################################################"
echo "COMMAND: ${SRUN_24_COMMAND} ${SAMBAMBA} merge -t ${NB_THREAD} ${MERGED_DIR}MERGED_SAMPLES.bam ${BAMS_FILES[@]}"
${SRUN_24_COMMAND} ${SAMBAMBA} merge -t ${NB_THREAD} ${MERGED_DIR}MERGED_SAMPLES.bam ${BAMS_FILES[@]}
STATUS=$?
if [ "${STATUS}" -ne 0 ];then
echo "###########################################################################"
echo "There has been an issue with BAM merging - check log files - mnenufaar will exit"
echo "###########################################################################"
exit 1
fi
rm ${BAMS_FILES[@]}
rm ${BAIS_FILES[@]}
###
###nenufaar generate single VCF for all samples
###
echo "COMMAND: nohup ${SHELL} nenufaar.sh -i ${INPUT_PATH} -o ${OUTPUT_PATH} -r ${REF_PATH} -snp ${SNP_PATH} -indel1 ${INDEL1} -indel2 ${INDEL2} -g ${GENOME} -dcov ${DCOV} -c ${CALLER} -p ${PROTOCOL} -hsm ${HSMETRICS} -a ${ANNOTATOR} -f ${FILTER} -up ${USE_PLATYPUS} ${LIST} -vc"
nohup ${SHELL} ${NENUFAAR} -i ${INPUT_PATH} -o ${OUTPUT_PATH} -r ${REF_PATH} -snp ${SNP_PATH} -indel1 ${INDEL1} -indel2 ${INDEL2} -g ${GENOME} -dcov ${DCOV} -c ${CALLER} -p ${PROTOCOL} -hsm ${HSMETRICS} -a ${ANNOTATOR} -f ${FILTER} -up ${USE_PLATYPUS} ${LIST} -vc
STATUS=$?
if [ "${STATUS}" -ne '0' ];then
echo "###########################################################################"
echo "There has been an issue with VCF generation - check log files - mnenufaar will exit"
echo "###########################################################################"
exit 1
fi
DATE2=$(date +"%s")
DIFF=$((${DATE2}-${DATE1}))
echo "#############################################################################################"
echo "#############################################################################################"
echo "META NENUFAAR MULTI-SAMPLE RUN: ${ID} COMPLETED WITH SUCCESS"
echo "EXECUTION TIME:"
printf '%dh:%dm:%ds\n' $((${DIFF}/3600)) $((${DIFF}%3600/60)) $((${DIFF}%60))
echo "#############################################################################################"
echo "#############################################################################################"
exit 0