-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrepeatmasker.sh
executable file
·119 lines (108 loc) · 5.46 KB
/
repeatmasker.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/bin/bash
#######################################################################################
### ###
### Copyright (C) 2017 Pawel Krawczyk ([email protected]) ###
### ###
### This program is free software: you can redistribute it and/or modify ###
### it under the terms of the GNU General Public License as published by ###
### the Free Software Foundation, either version 3 of the License, or ###
### (at your option) any later version. ###
### ###
### This program is distributed in the hope that it will be useful, ###
### but WITHOUT ANY WARRANTY; without even the implied warranty of ###
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ###
### GNU General Public License for more details. ###
### ###
### You should have received a copy of the GNU General Public License ###
### along with this program. If not, see <http://www.gnu.org/licenses/>. ###
### ###
#######################################################################################
# Load the fastx_toolkit module so that its tools can be found in the PATH.
module load fastx_toolkit

# ---- Software and library locations: adjust these before the first run ----
# RepeatMasker executable, as reported by `which`
repeat_masker=$(which RepeatMasker)
# HMM library handed to RepeatMasker through its -lib option
lib_location="/home/smaegol/storage/soft/repeatmasker/RepeatMasker/Libraries/Dfam_2.0/homo_sapiens/LINEs/masklib.hmm"
# Parser command line: the script path followed by the -fast option
repeat_masker_parsing="/home/smaegol/storage/soft/repeatmasker/Parsing-RepeatMasker-Outputs/parseRM_simple.pl -fast"
# Soft-clipping scripts are taken from the directory the script is started in
clip_rmasker="$(pwd)/identify_LINE_repeatmasker_softclip.py"
clip_rmasker_R3="$(pwd)/identify_LINE_repeatmasker_softclip_R3.py"

# ---- RepeatMasker options ----
# value for -div
max_divergence=10
# value for -pa
threads=5
# check if required software is accessible (in the PATH)

#######################################
# Abort the script unless a command resolves to an executable file.
# Arguments: $1 - command name or path to look up
#            $2 - message printed to stderr when the lookup fails
# Returns:   normally when the command is executable; otherwise the whole
#            script exits with status 1
#######################################
require_executable() {
    if ! [ -x "$(command -v -- "$1")" ]; then
        echo "$2" >&2
        exit 1
    fi
}

require_executable "$repeat_masker" 'Error: RepeatMasker is not accessible.'
# $repeat_masker_parsing holds the script path followed by its options;
# only the first word names the executable that has to exist
require_executable "${repeat_masker_parsing%% *}" 'Error: RepeatMasker Parsing script is not accessible.'
require_executable fastq_to_fasta 'Error: fastq_to_fasta from fastx_toolkit is not accessible.'
#find sequence files in the current folder (and below) based on name
#(for LINE RACE seqs - begin with L1); we start processing with R5 files

#######################################
# Print the 4-character primer tag that starts a file's base name.
# The directory part is stripped first, so a file found in a subfolder
# yields the same tag as one in the current folder.
# Arguments: $1 - path to a sequence file
# Outputs:   the tag (e.g. L1MM) on stdout
#######################################
primer_of() {
    local name=${1##*/}
    printf '%s\n' "${name:0:4}"
}

#######################################
# Run the conversion / RepeatMasker / parsing / clipping chain for one file.
# Steps whose output file already exists (fasta, RepeatMasker .out) are skipped.
# Globals:   repeat_masker, repeat_masker_parsing, lib_location,
#            max_divergence, threads (all read)
# Arguments: $1 - input fastq file
#            $2 - fasta file (created from $1 when it does not exist yet)
#            $3 - clipping script run on the parsed RepeatMasker output
#            $4 - primer tag; L1MM runs RepeatMasker with -species mouse,
#                 anything else uses the library given in $lib_location
#######################################
process_reads() {
    local fastq=$1 fasta=$2 clip_script=$3 primer=$4
    local rmasker_out="${fasta}.out"
    # NOTE(review): the full fasta path is used on both sides of the slash, as
    # before; confirm this matches the parser's output layout for subfolders
    local rmasker_parsed="${rmasker_out}.parsed1/${rmasker_out}.length.tab"
    local clipped_output="${fastq}.sam.clipped.fasta"
    local clipped_rmasker_output="${fastq}.sam.clipped.fasta.rmasker.fasta"

    touch "$clipped_output"
    if ! [ -f "$fasta" ]; then
        #convert fastq to fasta using fastx_toolkit
        echo "Converting $fastq to fasta ($fasta)"
        fastq_to_fasta -i "$fastq" -o "$fasta" -Q33
    fi
    if ! [ -f "$rmasker_out" ]; then
        #run repeatmasker using specified library
        echo "Running repeatmasker on input fasta file"
        if [ "$primer" == 'L1MM' ]; then
            "$repeat_masker" -div "$max_divergence" -species mouse -u -source -xsmall -norna -nolow -qq -pa "$threads" "$fasta"
        else
            "$repeat_masker" -lib "$lib_location" -div "$max_divergence" -u -source -xsmall -norna -nolow -qq -pa "$threads" "$fasta"
        fi
    fi
    #parse repeatmasker output using ParseRM_simple.pl from Parsing-RepeatMasker-Outputs
    echo "Parsing repeatmasker output"
    # left unquoted on purpose: the variable carries the script path AND its options
    # shellcheck disable=SC2086
    $repeat_masker_parsing -RMout "$rmasker_out" -genfile "$fasta"
    #clip fragments outside of identified LINE
    echo "Getting soft-clipped sequences"
    "$clip_script" --input "$rmasker_parsed" --fasta "$fasta" --output "$clipped_rmasker_output"
}

# Collect the matching R5 files before processing starts. The list is read
# NUL-delimited so that paths containing whitespace or glob characters stay intact.
r5_files=()
while IFS= read -r -d '' f; do
    r5_files+=("$f")
done < <(find . -name "L1*ENDO**R5.fastq" -print0)

for f in "${r5_files[@]}"; do
    echo "Processing file: $f "
    # split the path around its last "R5" to derive the names of the mate files
    FILENAME_PREFIX=${f%R5*}
    FILENAME_SUFFIX=${f##*R5}
    PRIMER=$(primer_of "$f")
    process_reads "$f" "${FILENAME_PREFIX}R5.fasta" "$clip_rmasker" "$PRIMER"

    #process R3 file - the same way like for R5
    R3_FILE="${FILENAME_PREFIX}R3${FILENAME_SUFFIX}"
    echo "$R3_FILE"
    process_reads "$R3_FILE" "${FILENAME_PREFIX}R3.fasta" "$clip_rmasker_R3" "$PRIMER"
done
echo "DONE ALL"