-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmakeExonDB.py
executable file
·171 lines (140 loc) · 8.11 KB
/
makeExonDB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# Given a gtf genome annotation file and a genome fasta file, create and pickle
# dicts of exon objects. 1 file is created with a dict for each chromosome, and
# each dict contains 2 lists, 1 for forward strand exons on this chromosome
# and one for reverse strand exons. These can then be used to create junction objects
# and junction fasta files.
# Output directories will be created if they do not exist, and default values
# are provided. The genes, exons, and records directories will be created as
# subdirectories of the output directory.
# usage: python makeExonDB3.py -f genome.fasta -a genome.gtf
# [-o outDir] [-e exonDir] [-r recordDir] [-g geneDir] [-v]
### TODO:
### 1. try catching the deprecation warning from Biopython
### 2. allow single file option (as currently implemented) or to loop through all files in a directory
### 4. streamline code
### 5. add logging instead of printing to stdout?
### 6. create script to combine all genes so I can easily find data for any gene in 1 structure
import argparse
from Bio import SeqIO
from BCBio import GFF
import cPickle as pickle
import utils_os
import sys
# not currently using genes, but figured it may be useful to have these objects
# around for future analysis if necessary. Each gene contains a chromosome id
# and a SeqFeature object which contains all of the exons for the gene. Sequence
# data is not stored in this object, just positional info for each exon.
class gene:
def __init__(self, chr, obj):
self.chr = chr # string chromosome id
self.feature = obj # SeqFeature for entire gene, contains exons as subfeatures
def __str__(self):
msg = "chr: " + str(self.chr)
msg += " feature: " + str(self.feature)
return msg
# tries getting primary gene name, if that doesn't exist tries getting secondary gene name.
# if neither primary or secondary gene name fields exist, an exception will be thrown
# which is caught and handled in the calling function
def getGeneName(use_feature):
if args.name1 in use_feature.qualifiers:
use_name = args.name1
else:
use_name = args.name2
return use_feature.qualifiers[use_name][0]
# the workhorse function.
# param chrSeqRecord: a SeqRecord for 1 chromosome
def parseChrFeatures(chrSeqRecord):
chrExonsByDirection = [] # each element will be a (chrID, strand, dict with (exon start, stop):SeqFeature, 2 per chromosome (1 per strand)
exonsPos = {} # all positive strand exons
exonsNeg = {} # all neg strand exons
geneExons = {} # gene_name: RecFeature. don't actually use these right now, but may be useful at some point
for ftr in chrSeqRecord.features: # this is 1 entire gene
try:
sftr = None # genes with a single exon are at the top level ftr
for sftr in ftr.sub_features:
# add exon to the exon dict for the current chr
# keeping in separate dicts for now so we are only looking at 1 direction with 100kb
if sftr.strand == 1:
exonsPos[(int(sftr.location.start), int(sftr.location.end))] = sftr
else:
exonsNeg[(int(sftr.location.start), int(sftr.location.end))] = sftr
# gene names are tied to the subrecords, and chromosome to SeqRecords, so need to track this info with the feature object
if sftr:
geneExons[getGeneName(sftr)] = gene(chrSeqRecord.id, ftr)
else:
# this is a single exon gene so gene name is in the feature instead of subfeatures
if args.verbose:
print "single exon gene " + str(getGeneName(ftr))
geneExons[getGeneName(ftr)] = gene(chrSeqRecord.id, ftr)
# and need to add the feature to the appropriate exonDict
if ftr.strand == 1:
exonsPos[(int(ftr.location.start), int(ftr.location.end))] = ftr
else:
exonsNeg[(int(ftr.location.start), int(ftr.location.end))] = ftr
except Exception as e:
print "Exception"
print e
print "error:", sys.exc_info()[0]
print "parsing features for", ftr
if len(exonsPos) > 0:
chrExonsByDirection.append((chrSeqRecord.id, 1, exonsPos)) # chrSeqRecord.id is chr1 for example, 1 indicates pos strand
if len(exonsNeg) > 0:
chrExonsByDirection.append((chrSeqRecord.id, -1, exonsNeg)) # -1 indicates neg strand
########### save these objects for future use #############
pk_geneExons = open(geneOutFullPath + '/geneExons_' + chrSeqRecord.id + '.pkl', 'wb')
pickle.dump(geneExons, pk_geneExons)
pk_geneExons.close()
if args.verbose:
print "num genes: " + str(len(geneExons))
pk_exonsByDirection = open(exonOutFullPath + '/exonsByStrand_' + chrSeqRecord.id + '.pkl', 'wb')
pickle.dump(chrExonsByDirection, pk_exonsByDirection)
pk_exonsByDirection.close()
if args.verbose:
for elem in chrExonsByDirection:
print "chromosome:" + str(elem[0])
print "strand: " + str(elem[1])
print "number of exons: " + str(len(elem[2]))
pk_rec = open(recOutFullPath + '/rec_' + chrSeqRecord.id + '.pkl', 'wb')
pickle.dump(chrSeqRecord, pk_rec)
pk_rec.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fastaFile', required=True, help='path to fasta file with chromosome sequences')
parser.add_argument('-a', '--annotationFile', required=True, help='path to gff file with exon annotations')
parser.add_argument('-o', '--outDir', help='directory to output files, will be created if it does not exist', default='output')
parser.add_argument('-n1', '--name1', help='name of field in gtf to use for gene names', default='gene_name')
parser.add_argument('-n2', '--name2', help='name of field in gtf to use for gene names if n1 does not exist', default='gene_id')
parser.add_argument('-e', '--exonOutDir', help='directory to output exon pickle files, will be created within outDir if it does not exist', default='exons')
parser.add_argument('-g', '--geneOutDir', help='directory to output gene pickle files, will be created within outDir if it does not exist', default='genes')
parser.add_argument('-r', '--recOutDir', help='directory to output seq record pickle files, will be created within outDir if it does not exist', default='records')
parser.add_argument('-v', '--verbose', help='print info about data obtained', action='store_true')
args = parser.parse_args()
exonOutFullPath = '/'.join([args.outDir, args.exonOutDir])
geneOutFullPath = '/'.join([args.outDir, args.geneOutDir])
recOutFullPath = '/'.join([args.outDir, args.recOutDir])
# create output directories if necessary
utils_os.createDirectory(args.outDir)
utils_os.createDirectory(exonOutFullPath)
utils_os.createDirectory(geneOutFullPath)
utils_os.createDirectory(recOutFullPath)
# read in the sequences
f_handle = open(args.fastaFile, "rU")
f_dict = SeqIO.to_dict(SeqIO.parse(f_handle, "fasta"))
f_handle.close()
# only want exons for now. unfortunately can't limit by strand so have to do that after the fact
# we can speed things up a bit by limiting to just the chromosomes we have sequences for
limit_info = dict(
gff_id = f_dict.keys(),
gff_type = ["exon"])
if args.verbose:
print "data we are searching for in gff: " + str(limit_info)
########### read in the annotations, adding annotations to the sequences
a_handle = open(args.annotationFile, "rU")
if args.verbose:
print "opening annotation file", args.annotationFile
for rec in GFF.parse(a_handle, base_dict=f_dict, limit_info=limit_info): # each rec is a SeqRecord (for 1 chromosome)
if args.verbose:
print "####### starting new chromosome: " + str(rec.id)
# populate the data structures with genes and exons from this chromosome and pickle the objects
parseChrFeatures(rec)
a_handle.close()