forked from biobenkj/SPARTA_Linux
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSPARTA.py
163 lines (101 loc) · 7.07 KB
/
SPARTA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
__author__ = 'benkjohnson'
import check_dependencies_linux
import optparse
import sys
cd = check_dependencies_linux.CheckDependencies()
optParser = optparse.OptionParser(
usage="python SPARTA.py [options]",
description="Simple Program for Automated reference-based bacterial RNA-seq Transcriptome Analysis (SPARTA)",
epilog="Written by Benjamin K. Johnson ([email protected]), Michigan State University Department of " +
"Microbiology and Molecular Genetics. (c) 2015")
optParser.add_option("--cleanup", help="Clean up the intermediate files to save space. Default action is to retain the intermediate files.", action="store_true", dest="cleanup")
optParser.add_option("--verbose", help="Display more output for each step of the analysis.", action="store_true", dest="verbose")
optParser.add_option("--noninteractive", help="Non-interactive mode. This is for running SPARTA without any user input. Assumes data is on the desktop. If this"
" option is specified, you must fill out the configuration file (ConfigFile.txt) with the appropriate experimental conditions in the SPARTA folder.", action="store_true", dest="noninteractive")
optParser.add_option("--threads", help="Define the number of threads that SPARTA should run with. This will enable some speed-up on multi-processor machines. As a generality, define the number of threads as the same number of cores in your computer. Default is 2.",
action="store", type="int", default=2, dest="threads")
optParser.add_option("--procs", help="Assign how many processors to use. This is most useful with --noninteractive and on a high performance computing environment. Usage: --procs=<integer value of processors>",
action="store", type="int", default=1, dest="procs")
trim = optparse.OptionGroup(optParser, 'Trimmomatic options', "The order the options will be run are: ILLUMINACLIP, LEADING, TRAILING, SLIDINGWINDOW, MINLEN")
trim.add_option("--clip", help="ILLUMINACLIP options. MiSeq & HiSeq usually TruSeq3.fa; GAII usually TruSeq2.fa. Default is ILLUMINACLIP:TruSeq3-SE.fa:2:30:10. Usage: --clip=<adapterseqs>:<seed mismatches>:<palindrome clip threshold>:<simple clip threshold>",
action="store", default="TruSeq3-SE.fa:2:30:10", dest="illuminaclip")
trim.add_option("--lead", help="Set the minimun quality required to keep a base. Default is LEADING=3. Usage: --lead=<quality>",
action="store", type="int", default=3, dest="leading")
trim.add_option("--trail", help="Set the minimum quality required to keep a base. Default is TRAILING=3. Usage: --trail=<quality>",
action="store", type="int", default=3, dest="trailing")
trim.add_option("--slidewin", help="SLIDINGWINDOW options. Default is SLIDINGWINDOW:4:15. Usage: --slidewin=<window_size>:<required_quality>",
action="store", default="4:15", dest="slidingwindow")
trim.add_option("--minlentrim", help="Set the minimum read length to keep in base pairs. Default is 36. Usage: --minlentrim=<readlength>",
action="store", type="int", default=36, dest="minlentrim")
bowtie = optparse.OptionGroup(optParser, 'Bowtie options')
bowtie.add_option("--mismatch", help="Output alignments with at most a defined number of mismatches. Usage: --mismatch=<integer_value>",
action="store", type="int", dest="mismatch")
bowtie.add_option("--otherbowtieoptions", help="Bowtie has so many options that it is not worth listing them here. Go to http://bowtie-bio.sourceforge.net/manual.shtml#command-line for the manual and all available options. Usage: --otherbowtieoptions='all options inputed as a string (note the quotes!)'",
action="store", type="str", dest="otherbowtieoptions")
htseq = optparse.OptionGroup(optParser, 'HTSeq options')
htseq.add_option("--stranded", help="Stranded options: yes, no, reverse. Default is --stranded=reverse. Usage: --stranded=yes/no/reverse",
action="store", default="reverse", dest="stranded")
htseq.add_option("--order", help="Order options: name, pos. Usage: --order=name/pos.", action="store", type="str", default="name", dest="order")
htseq.add_option("--minqual", help="Skip all reads with quality lower than the given value. Default is --minqual=10. Usage: --minqual=<value>",
action="store", type="int", default=10, dest="minqual")
htseq.add_option("--type", help="The feature type (3rd column in GTF file) to be used. Default is --type=exon (suitable for RNA-seq analysis)",
action="store", type="str", default="exon", dest="type")
htseq.add_option("--idattr", help="Feature ID from the GTF file to identify counts in the output table Default is --idattr=gene_id. Usage: --idattr=<id attribute>",
action="store", default="gene_id", dest="idattr")
htseq.add_option("--mode", help="Mode to handle reads overlapping more than one feature. Default is --mode=union. Usage: --mode=union/intersection-strict/intersection-nonempty",
action="store", default="union", dest="mode")
optParser.add_option_group(trim)
optParser.add_option_group(bowtie)
optParser.add_option_group(htseq)
(options, args) = optParser.parse_args()
# args = parser.parse_args()
if options.noninteractive:
cond_lst, data_path = cd.parseConfigFile(options)
else:
#Welcome the user to the software and check dependencies
print "Welcome to SPARTA!"
print "Let's make sure we have everything we need to get started..."
print "Now checking dependencies...\n"
#Check for Java, R, and NumPy
javacheck = cd.checkjava(options)
Rcheck = cd.checkR(options)
numpycheck = cd.checknumpy(options)
print "Everything appears to be fine. Moving on.\n"
import qc_analysis
import mapping_and_counting
import differential_expression
qc = qc_analysis.QC_analysis()
mac = mapping_and_counting.Mapping_and_Counting()
de = differential_expression.DifferentialExpression()
if options.noninteractive:
#Create a folder called RNAseq_Data with which to store all of the data analysis
subfolderpath = qc.create_folder()
#Read in the data and sort out the genome feature file and genome sequence files
qc.findreferencefiles(data_path)
#Trimmomatic
qc.trimmomatic(data_path, subfolderpath, options)
#FastQC
qc.fastqc(data_path, subfolderpath, options)
#Bowtie
mac.bowtie(data_path, subfolderpath, options)
#HTSeq
mac.htseq(subfolderpath, options)
#edgeR
de.de_analysis_noninteractive(subfolderpath, cond_lst)
else:
#Create a folder called RNAseq_Data with which to store all of the data analysis
subfolderpath = qc.create_folder()
#Find the RNAseq data folder location
rawdatapath = qc.finddata()
#Read in the data and sort out the genome feature file and genome sequence files
qc.findreferencefiles(rawdatapath)
#Trimmomatic
qc.trimmomatic(rawdatapath, subfolderpath, options)
#FastQC
qc.fastqc(rawdatapath, subfolderpath, options)
#Bowtie
mac.bowtie(rawdatapath, subfolderpath, options)
#HTSeq
mac.htseq(subfolderpath, options)
#edgeR
de.de_analysis(subfolderpath)