-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSVprotocol_part1.sh
465 lines (426 loc) · 18 KB
/
SVprotocol_part1.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
#!/usr/bin/sh
#===============================================================================
#* This pipeline is used to detect SV in one sample using three software
#TODO 110: mapping short-read sequence to reference genome
#TODO 120: Remove duplicates
#TODO 130: SV detection for one sample using three software
#===============================================================================
function usage()
{
echo "
******This pipeline is used to detect SV in one sample using three software
Usage: SVprotocol_part1.sh <The long option is recommended>
-s, --sample (required) sample id
-c, --config (required) a config file including basic setting.
-j, --thread (optional) the number of thread, default = 5
-f, --force (optional) removes output subdirectory if it exists, run all process again
-h, --help (optional) This small usage guide
1) In your_analysis_dir, you must have a subdirectory named as fastq
<sample_name> including two FASTQ file (<sample_name>_R1.fastq.gz and <sample_name>_R2.fastq.gz)
2) Please prepare a config file following the example file
1. Containing the path of software to be used
2. You must use the variable name shown in the example file because, in this pipeline, we will use these variable names
3) Please just include chromosome-level sequence or sequence with size larger than 250kb in the reference genome fasta file
4) Before running this pipeline, please create dictionary for genome fasta
>java -jar picard.jar CreateSequenceDictionary R=Rhipicephalus_microplus.chromosome.fasta O=Rhipicephalus_microplus.chromosome.dict
5) ref_genome_fasta = <prefix>.fasta. Before you run SVseq2, please
split genome-wide sequence fasta to single chromosome sequence fasta named <prefix>.<chr>.fasta, with index file named <prefix>.<chr>.fasta.fai
6) Before running this pipeline, please index your reference genome by yourself
>bwa index Rhipicephalus_microplus.chromosome.fasta
Real time: 2288.064 sec; CPU: 2281.825 secs
Rhipicephalus_microplus.chromosome.fasta
Rhipicephalus_microplus.chromosome.fasta.amb
Rhipicephalus_microplus.chromosome.fasta.ann
Rhipicephalus_microplus.chromosome.fasta.bwt
Rhipicephalus_microplus.chromosome.fasta.fai
Rhipicephalus_microplus.chromosome.fasta.pac
Rhipicephalus_microplus.chromosome.fasta.sa
7) Example:
SVprotocol_110_120_130.sh --sample AHRm.725
--config example.config
--thread 2
"
}
OPTIONS=hs:c:j:f
LONGOPTIONS=help,sample:,config:,thread:,force
PARSED=$(getopt --options=$OPTIONS --longoptions=$LONGOPTIONS --name "$0" -- "$@")
if [ $? != 0 ]; then
echo "ERROR: Terminating..."
exit 2
fi
eval set -- "${PARSED}"
FLAG_sample=0
FLAG_config=0
thread=5
force="n"
while true
do
case "$1" in
-s|--sample)
sample=$2
FLAG_sample=1
shift 2
;;
-c|--config)
config=$2
FLAG_config=1
shift 2
;;
-j|--thread)
thread=$2
shift 2
;;
-f|--force)
force="y"
shift
;;
-h|--help)
usage
shift
exit 0
;;
--)
shift
break
;;
*)
echo "ERROR: Unknown argument $1 . Please check again."
exit 3
;;
esac
done
until [ $FLAG_sample -eq 1 -a $FLAG_config -eq 1 ]
do
echo "ERROR: missing required variables. Please check again."
exit 1
done
#===============================================================================
#****/ Main
#===============================================================================
source $config
cd $your_analysis_dir
echo "SVprotocol_part1.sh"
printf "\t--sample\t%s\n" "$sample"
printf "\t--config\t%s\n" "$config"
printf "\t--thread\t%s\n" "$thread"
printf "\t--force\t%s\n" "$force"
printf "%-20s\t%-s\n" "Hostname:" "`hostname`"
printf "%-20s\t%-s\n" "Working directory:" "`pwd`"
printf "INFO\t%-s\t%-s\n" "`date`" "Start time"
#****/ Check required files
if [ ! -f ${ref_genome_fasta}.ann -o ! -f ${ref_genome_fasta}.fai ]
then
echo "Please index your fasta using bwa with command: bwa index ref.fasta"
exit 1
fi
if [ ! -f ${ref_genome_fasta/.fasta/}.dict -o ! -f ${ref_genome_fasta/.gz/}.dict ]
then
echo "Please create dictionary for genome fasta: picard.jar CreateSequenceDictionary "
exit 1
fi
if [ "$force" == "y" ]
then
rm -rf 110.raw.reads.mapping/$sample >/dev/null
rm -rf 120.remove.duplicates/$sample >/dev/null
rm -rf 130.sv.detection/Manta/$sample >/dev/null
rm -rf 130.sv.detection/Lumpy/$sample >/dev/null
rm -rf 130.sv.detection/SVseq2/$sample >/dev/null
rm -rf 130.sv.detection/svimmer_1sample/$sample >/dev/null
printf "INFO\t%-s\t%-s\n" "`date`" "Forced delete existed files, including 110.raw.reads.mapping/$sample, 120.remove.duplicates/$sample, 130.sv.detection/Manta/$sample, 130.sv.detection/Lumpy/$sample, 130.sv.detection/SVseq2/$sample, 130.sv.detection/svimmer_1sample/$sample"
fi
function check_existed_file()
{
local file_name=$1
if [ -f $file_name ]
then
file_size=`ls -l $file_name |awk '{print $5}'`
if [ $file_size -gt 0 ]
then
return 0
else
return 1
fi
else
return 1
fi
}
#************/ Create directory
mkdir -p 110.raw.reads.mapping/$sample >/dev/null
mkdir -p 120.remove.duplicates/$sample >/dev/null
mkdir -p 130.sv.detection/Manta/$sample >/dev/null
mkdir -p 130.sv.detection/Lumpy/$sample >/dev/null
mkdir -p 130.sv.detection/SVseq2/$sample >/dev/null
mkdir -p 130.sv.detection/svimmer_1sample/$sample >/dev/null
existed_file=$your_analysis_dir/120.remove.duplicates/$sample/$sample.dedup.bam
check_existed_file $existed_file
if [ $? -eq 1 ]
then
#************/ 110.raw.reads.mapping /***********************************
existed_file=$your_analysis_dir/110.raw.reads.mapping/$sample/$sample.bam
check_existed_file $existed_file
if [ $? -eq 1 ]
then
#* 111.mapping (9h30min) / 12 CPUs
$bwa mem -t $thread -B 4 -O 6 -E 1 -M -R "@RG\tID:$sample\tSM:$sample\tLB:$sample\tPU:$sample\tPL:ILLUMINA" $ref_genome_fasta fastq/$sample/${sample}_R1.fastq.gz fastq/$sample/${sample}_R2.fastq.gz | $samtools view -@ $thread -bS - > 110.raw.reads.mapping/$sample/$sample.bam
if [ $? -ne 0 ]
then
printf "ERROR%-s\t%-s\n" "`date`" "Failed: bwa mem $sample"
exit 1
fi
#* 112.Sort bam (20min)
#* -m INT: Approximately the maximum required memory per thread, specified either in bytes or with a K, M, or G suffix. [768 MiB];To prevent sort from creating a huge number of temporary files, it enforces a minimum value of 1M for this setting.
$samtools sort -@ $thread -m 1G -o 110.raw.reads.mapping/$sample/$sample.sorted.bam 110.raw.reads.mapping/$sample/$sample.bam && mv 110.raw.reads.mapping/$sample/$sample.sorted.bam 110.raw.reads.mapping/$sample/$sample.bam
#* 113.Create index of bam file (1min)
$samtools index -@ $thread 110.raw.reads.mapping/$sample/$sample.bam
if [ $? != 0 ]
then
printf "%-20s\t%-s\n" "ERROR:" "samtools sort & index $sample"
exit 1
fi
printf "INFO\t%-s\t%-s\n" "`date`" "BWA finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "BWA has been executed before, we skip this step."
fi
#************/ 120: Remove duplicates (2h) /***********************************
cd $your_analysis_dir/120.remove.duplicates
existed_file=$your_analysis_dir/120.remove.duplicates/$sample/$sample.dedup.bam
check_existed_file $existed_file
if [ $? -eq 1 ]
then
$gatk --java-options "-Xmx4G -XX:ParallelGCThreads=$thread" MarkDuplicates \
-I ../110.raw.reads.mapping/$sample/$sample.bam \
-O $sample/$sample.dedup.bam \
--REMOVE_DUPLICATES false \
-M $sample/metric.$sample.txt \
--ASSUME_SORT_ORDER coordinate \
--COMPRESSION_LEVEL 5 \
--VALIDATION_STRINGENCY SILENT \
--CREATE_INDEX true \
--TMP_DIR $sample/tmp/
#** remove file
#** $?: the return value of last command,success is 0, failed is 1
if [ $? == 0 ]
then
rm -rf $your_analysis_dir/120.remove.duplicates/$sample/tmp 2>/dev/null
fi
printf "INFO\t%-s\t%-s\n" "`date`" "Remove Duplicated finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "Remove Duplicated has been executed before, we skip this step."
fi
else
printf "INFO\t%-s\t%-s\n" "`date`" "BWA has been executed before, we skip this step."
printf "INFO\t%-s\t%-s\n" "`date`" "Remove Duplicated has been executed before, we skip this step."
fi
#************/ 130: SV detection /***********************************
#****/ 131: Manta
manta_analysis_path=$your_analysis_dir/130.sv.detection/Manta/$sample
cd $manta_analysis_path
existed_file=$manta_analysis_path/results/variants/diploidSV.vcf.gz
check_existed_file $existed_file
if [ $? -eq 1 ]
then
manta_input_bam=${your_analysis_dir}/120.remove.duplicates/$sample/${sample}.dedup.bam
#* 131.1: Configuration Manta
#* Single Diploid Sample Analysis
${manta_install_path}/bin/configManta.py \
--bam $manta_input_bam \
--referenceFasta $ref_genome_fasta \
--runDir ${manta_analysis_path}
# Successfully created workflow run script.
# To execute the workflow, run the following script and set appropriate options:
# /picb/humpopg-bigdata5/liuqi/SVprotocol/130.sv.detection/Manta/AHRm.725/runWorkflow.py
#* 131.2: workflow execution (1h)
#! Please use python2 to run Manta
manta_log=$your_analysis_dir/130.sv.detection/Manta/$sample/${sample}.Manta.log
$python2 ${manta_analysis_path}/runWorkflow.py -j $thread 1>$manta_log 2>&1 &
else
printf "INFO\t%-s\t%-s\n" "`date`" "Manta has been executed before, we skip this step."
fi
#****/ 132: Lumpy
#* Lumpy can't use thread
lumpy_analysis_path=$your_analysis_dir/130.sv.detection/Lumpy/$sample
cd $lumpy_analysis_path
existed_file=$lumpy_analysis_path/${sample}.Lumpy.vcf
check_existed_file $existed_file
if [ $? -eq 1 ]
then
#**** 132.1: Extract the discordant paired-end alignments (20min)
lumpy_input_bam=${your_analysis_dir}/120.remove.duplicates/$sample/${sample}.dedup.bam
$samtools view -@ $thread -b -F 1294 $lumpy_input_bam > ${sample}.discordants.unsorted.bam
#**** 132.2: Extract the split-read alignments (20min)
$samtools view -h $lumpy_input_bam | $python2 ${lumpy_install_path}/scripts/extractSplitReads_BwaMem -i stdin | $samtools view -@ $thread -Sb - > ${sample}.splitters.unsorted.bam
#**** 132.3: Sort both alignments (30min)
$samtools sort -@ $thread ${sample}.discordants.unsorted.bam -o ${sample}.discordants.bam
$samtools sort -@ $thread ${sample}.splitters.unsorted.bam -o ${sample}.splitters.bam
#**** 132.4: Run LUMPY Express (44min)
lumpy_log=$your_analysis_dir/130.sv.detection/Lumpy/$sample/${sample}.Lumpy.log
${lumpy_install_path}/bin/lumpyexpress \
-B $lumpy_input_bam \
-S ${sample}.splitters.bam \
-D ${sample}.discordants.bam \
-o ${sample}.Lumpy.vcf 1>$lumpy_log 2>&1 &
#**** Wait for the above program to run
wait
printf "INFO\t%-s\t%-s\n" "`date`" "Manta&Lumpy finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "Lumpy has been executed before, we skip this step."
fi
#****/ 133: SVseq2
svseq2_input_bam=${your_analysis_dir}/120.remove.duplicates/$sample/${sample}.dedup.bam
svseq2_analysis_path=$your_analysis_dir/130.sv.detection/SVseq2/$sample
cd $svseq2_analysis_path
existed_file=$svseq2_analysis_path/${sample}.SVseq2.vcf
check_existed_file $existed_file
if [ $? -eq 1 ]
then
#**** 133.1: For each chromosome (8h)
#**** We extract chromosome name from genome.fasta.fai
#**** GWHAMMN00000001 325144201 17 60 61
genome_fai=${ref_genome_fasta}.fai
rm -f tmp_multiRun.sh >/dev/null 2>&1
minMQ=20
cat $genome_fai | while read line
do
arr=($line)
chr=${arr[0]}
echo "$samtools view -@ $thread -bS -q $minMQ -o ${sample}.$chr.bam $svseq2_input_bam $chr:300
$samtools index -@ $thread ${sample}.$chr.bam
$svseq2 -r $ref_genome_fasta -c $chr -b ${sample}.$chr.bam --c 3 --o ${sample}.$chr.svseq2.DEL.out > ${sample}.$chr.logfile " >${sample}.$chr.sh
echo "sh ${sample}.$chr.sh" >>tmp_multiRun.sh
done
sh ${this_protocol_script_path}/shell_multipleRun.sh tmp_multiRun.sh $thread
#**** Wait for the above program to run
wait
#** Check to SVseq2 out
cat $genome_fai | while read line
do
arr=($line)
chr=${arr[0]}
if [ ! -f ${sample}.${chr}.svseq2.DEL.out ]
then
printf "ERROR\t%-s\t%-s\n" "`date`" "SVseq2 failed, Please rerun SVseq2 in chr:$chr"
exit 1
elif [[ `ls -l ${sample}.${chr}.svseq2.DEL.out |awk '{print $5}'` == 0 ]]
then
printf "ERROR\t%-s\t%-s\n" "`date`" "The size of result of SVseq2 is 0, Please rerun SVseq2 in chr:$chr"
exit 1
else
printf "INFO\t%-s\t%-s\n" "`date`" "SVseq2 successed in chr:$chr"
fi
done
#****/ Merging results in all chromosomes and convert the result to a VCF file
cd $svseq2_analysis_path
ls *DEL.out > del.out.list
$perl ${this_protocol_script_path}/code_convertSVseq22vcf.pl -i del.out.list -o ${sample} -s ${sample}
printf "INFO\t%-s\t%-s\n" "`date`" "SVseq2 finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "SVseq2 has been executed before, we skip this step."
fi
#************/ Sort VCF /***************************************************
#****/ 131: Manta
cd $manta_analysis_path
existed_file=$manta_analysis_path/${sample}.chr.sort.Manta.vcf.gz
check_existed_file $existed_file
if [ $? -eq 1 ]
then
manta_out_vcf=$manta_analysis_path/results/variants/diploidSV.vcf.gz
#* 1- Inversion in Manta should be convert format:
#* https://github.com/Illumina/manta/tree/master/src/python/libexec/convertInversion.py
manta_out_convertINV_vcf=$manta_analysis_path/${sample}.Manta.vcf.gz
$python2 $code_convertInversion $samtools $ref_genome_fasta $manta_out_vcf | $bgzip -c > $manta_out_convertINV_vcf && $tabix -p vcf -f $manta_out_convertINV_vcf
#* Sort Manta result
genome_dict=${ref_genome_fasta/.fasta/}.dict
$java -jar $picard SortVcf \
I=$manta_out_convertINV_vcf \
O=${sample}.chr.sort.Manta.vcf \
SEQUENCE_DICTIONARY=$genome_dict \
VALIDATION_STRINGENCY=SILENT \
CREATE_INDEX=false
#**** Compressed VCF and create index
$bgzip ${sample}.chr.sort.Manta.vcf && $tabix -p vcf -f ${sample}.chr.sort.Manta.vcf.gz
printf "INFO\t%-s\t%-s\n" "`date`" "Sort Manta finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "Sort Manta has been executed before, we skip this step."
fi
#****/ 132: Sort Lumpy
cd $lumpy_analysis_path
existed_file=$lumpy_analysis_path/${sample}.chr.sort.Lumpy.vcf.gz
check_existed_file $existed_file
if [ $? -eq 1 ]
then
genome_dict=${ref_genome_fasta/.fasta/}.dict
$java -jar $picard SortVcf \
I=${sample}.Lumpy.vcf \
O=${sample}.chr.sort.Lumpy.vcf \
SEQUENCE_DICTIONARY=$genome_dict \
VALIDATION_STRINGENCY=SILENT \
CREATE_INDEX=false
#**** Compressed VCF and create index
$bgzip ${sample}.chr.sort.Lumpy.vcf && $tabix -p vcf -f ${sample}.chr.sort.Lumpy.vcf.gz
#**** remove temporary files
rm -f ${sample}.splitters.* ${sample}.discordants.* >/dev/null 2>&1
printf "INFO\t%-s\t%-s\n" "`date`" "Sort Lumpy finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "Sort Lumpy has been executed before, we skip this step."
fi
#****/ 133: Sort SVseq2
cd $svseq2_analysis_path
existed_file=$svseq2_analysis_path/${sample}.chr.sort.SVseq2.vcf.gz
check_existed_file $existed_file
if [ $? -eq 1 ]
then
genome_dict=${ref_genome_fasta/.fasta/}.dict
$java -jar $picard SortVcf \
I=${sample}.SVseq2.vcf \
O=${sample}.chr.sort.SVseq2.vcf \
SEQUENCE_DICTIONARY=$genome_dict \
VALIDATION_STRINGENCY=SILENT \
CREATE_INDEX=false
#**** Compressed VCF and create index
$bgzip ${sample}.chr.sort.SVseq2.vcf && $tabix -p vcf -f ${sample}.chr.sort.SVseq2.vcf.gz
#**** remove temporary files
rm -f tmp_multiRun.sh ${sample}.*.sh >/dev/null 2>&1
rm ${sample}.*.bam* >/dev/null 2>&1
rm -f del.out.list *DEL.out >/dev/null 2>&1
printf "INFO\t%-s\t%-s\n" "`date`" "Sort SVseq2 finished"
else
printf "INFO\t%-s\t%-s\n" "`date`" "Sort SVseq2 has been executed before, we skip this step."
fi
#************/ Filtering SVs with sizes larger than 50bp and smaller than 2Mb (1min)
svimmer1_analysis_path=$your_analysis_dir/130.sv.detection/svimmer_1sample/$sample
mkdir -p $svimmer1_analysis_path
cd $svimmer1_analysis_path
rm -f vcf.list >/dev/null 2>&1
for software in Manta Lumpy SVseq2
do
ln -s $your_analysis_dir/130.sv.detection/$software/$sample/${sample}.chr.sort.${software}.vcf.gz* ./ >/dev/null 2>&1
$perl ${this_protocol_script_path}/code_SV_filtering.pl --tool $software --vcf ${sample}.chr.sort.${software}.vcf.gz --len 50 --xlen 2000000
$tabix -p vcf -f ${sample}.chr.sort.${software}.filtered.vcf.gz
echo "${sample}.chr.sort.${software}.filtered.vcf.gz" >> vcf.list
done
#**** Merging SVs discovered by Manta, Lumpy, and SVseq2 using svimmer software
#**** We extract chromosome name from genome.fasta.fai
#**** GWHAMMN00000001 325144201 17 60 61
chr=""
genome_fai=${ref_genome_fasta}.fai
while read line
do
arr=($line)
chr_i=${arr[0]}
chr="$chr $chr_i"
done < $genome_fai
$python3_9 $svimmer --threads $thread vcf.list $chr | $bgzip -c > ${sample}.chr.svimmer.vcf.gz
printf "INFO\t%-s\t%-s\n" "`date`" "svimmer 1 sample finished"
#**** Filtering SVs with sizes larger than 50bp and smaller than 2Mb
$perl ${this_protocol_script_path}/code_SV_filtering.pl --tool svimmer --vcf ${sample}.chr.svimmer.vcf.gz --len 50 --xlen 2000000
$tabix -p vcf -f ${sample}.chr.svimmer.filtered.vcf.gz
#**** remove temporary files
# rm -f ${sample}.chr.sort.*.filtered.vcf.gz* ${sample}.chr.svimmer.vcf.gz* vcf.list >/dev/null 2>&1
# for software in Manta Lumpy SVseq2
# do
# rm -f ${sample}.chr.sort.${software}.vcf.gz* >/dev/null 2>&1
# done
printf "INFO\t%-s\t%-s\n" "`date`" "SV filtering for one sample finished"
#************/ Ending
printf "INFO\t%-s\t%-s\n" "`date`" "Congratulations"
printf "INFO\t%-s\t%-s\n" "`date`" "End"