-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFASTAindexSPLITTERinEQUALsize.sh
87 lines (50 loc) · 3.62 KB
/
FASTAindexSPLITTERinEQUALsize.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
###FASTA INDEX SPLITTER-NEARLY EQUALLY BASED ON LENGTHS####
#########written with love by fatih sarigol################
#####EXPLANATION#######
##Imagine you have 30000 scaffolds(or contigs or chromosomes etc) in your reference genome and you want to run some analyses on it that take so much time
##You can manually subset it to smaller pieces, for example by splitting its index into 100 pieces with 300 scaffolds in each
##But distribution of lengths of the scaffolds always changes, and doing so may lead to some subsets taking much longer time than some others
##This code calculates the total length of the fasta file, and splits its index into subsets with almost equal total genome lengths
##You can choose how many subsets you want, which you can use in a job array easily. The names of the subsets will be INDEX1 INDEX2 etc.
##A scaffold will always stay in a single subset and will never be split into 2 different subsets.
###INSTRUCTION OF USAGE###
##AFTER MAKING IT EXECUTABLE (such as by "chmod +x FASTAindexSPLITTERinEQUALsize.sh"), RUN THIS LITTLE PROGRAM AS:
#./FASTAindexSPLITTERinEQUALsize.sh samtoolsExecutable fastaFile numberOfDivisions
##example usage; if we want to split the fasta index into 15 nearly equal subsets:
#./FASTAindexSPLITTERinEQUALsize.sh /MyPrograms/samtools1.9/samtools MyGenome.fa 15
###THE CODE###
#!/bin/bash
#Following 3 definitions are variables you define on the command line after the program executable:
#Define the location of samtools program executable (if it is in your environment -such as via conda- simply type samtools on the command line)
samtools=$1
#Choose your fasta file for indexing (and splitting its index)
MYFASTA=$2
#Choose the number you want to divide your fasta index to, you will have 15 files with nearly equal total lengths for example
DIVIDE=$3
#Get fasta index using samtools
$samtools faidx $MYFASTA
#Get first 2 columns with Scaffold names and lengths
cut -f1,2 $MYFASTA.fai > indexCOL12
#Calculate total length of the fasta file
awk '{total += $2} END {print total}' indexCOL12 > totalSIZE
#Define a bash variable using totalSIZE result
TOTALsize=$(cat totalSIZE)
#Define TOTALsize and DIVIDE variables inside the awk program, add the lengths of Scaffolds, when the total reaches the division of TOTALsize/DIVIDE print SPLIThere
#If the total didn't reach that value yet, simply print the line as it is. Once the total reaches that value, restart the total to zero and start counting again
awk -v TOTSIZE="$TOTALsize" -v DIV="$DIVIDE" 'BEGIN {s = 0}{s += $2; if (s >= TOTSIZE/DIV ) {print $0 "\t" "SPLIThere"; s = 0} else print $0}' indexCOL12 > FastaIndexToSplit
#Add SPLIThere as 3rd column to first line (We need the 2 following steps for csplit to make it start our indexes from 1 instead of 0)
sed -i '1s/$/\tSPLIThere/' FastaIndexToSplit
#Add a first line with zero (anything else would also work)
sed -i '1 i\0' FastaIndexToSplit
#Split from lines with the word "SPLIThere" in it, name the new split files starting with "INDEX"
csplit -s -f INDEX -z FastaIndexToSplit /SPLIThere/ {*}
#Clear the third column which has SPLIThere in some files
sed -i -r 's/(\s+)?\S+//3' INDEX*
#Rename the INDEX files to be accepted by SGE array jobs
for f in *X0*; do mv -- "$f" "${f:0:6-1}${f:6}"; done
#Add a second column with zero to convert the index lines into bed file
for i in *INDEX*; do awk '{print $1 "\t" "0" "\t" $2}' ${i} > TEMP ; mv TEMP ${i}; done
#Your new index bed files are ready to go!
#Remove intermediate files
rm indexCOL12 totalSIZE FastaIndexToSplit INDEX0 $MYFASTA.fai
echo "Your fasta index file has been split to nearly equal total lengths of $3 subsets in bed format! :)"