-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_orf.py
135 lines (123 loc) · 4.7 KB
/
read_orf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def readingFrames(sequence):
    """
    Split a DNA sequence into codons for all six reading frames.

    Frames 1-3 read the given strand starting at offsets 0, 1 and 2; frames
    4-6 read the reverse complement at the same three offsets. Each frame is
    a list of consecutive 3-character slices. The last slice of a frame may
    be shorter than 3 characters when the remaining length is not a multiple
    of three.

    Only uppercase ``A``, ``T``, ``G`` and ``C`` are complemented. Any other
    character (for example ``N``, lowercase bases or whitespace) is left out
    of the reverse complement, so the reverse strand can be shorter than the
    input and its frames shift accordingly.

    :param sequence: the DNA sequence to split
    :return: the list ``[RF1, RF2, RF3, RF4, RF5, RF6]`` of codon lists
    Created by Ashwin Mukund
    """
    complement_of = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}

    def split_codons(strand, offset):
        # Consecutive triplets starting at ``offset``; the tail may be partial.
        return [strand[i:i + 3] for i in range(offset, len(strand), 3)]

    # Reverse complement. join() builds the strand in one pass instead of
    # growing a string with ``+=`` for every base.
    complement = ''.join(
        complement_of[base] for base in sequence if base in complement_of
    )
    reversecomplement = complement[::-1]

    # Forward frames first (offsets 0, 1, 2), then the reverse-strand frames.
    return [
        split_codons(strand, offset)
        for strand in (sequence, reversecomplement)
        for offset in range(3)
    ]
def ORFData(readingframes):
    """
    Locate every start and stop codon in each reading frame.

    A start codon is ``ATG``; a stop codon is ``TAG``, ``TAA`` or ``TGA``.
    Positions are codon indices within their own frame (index ``k`` refers to
    ``readingframes[j][k]``), not nucleotide offsets in the sequence.

    Every frame that is passed in is processed, so the function works for
    the usual six frames as well as for any other number of frames.

    :param readingframes: list of reading frames, each a list of codons
    :return: tuple ``(startcodonReadingFrames, stopcodonReadingFrames)``;
        each element holds one list of codon indices per reading frame, in
        ascending order
    Created by Ashwin Mukund
    """
    stop_codons = ('TAG', 'TAA', 'TGA')
    startcodonReadingFrames = []
    stopcodonReadingFrames = []
    for frame in readingframes:
        starts = []
        stops = []
        for position, codon in enumerate(frame):
            if codon == 'ATG':
                starts.append(position)
            elif codon in stop_codons:
                stops.append(position)
        startcodonReadingFrames.append(starts)
        stopcodonReadingFrames.append(stops)
    return startcodonReadingFrames, stopcodonReadingFrames
def printORFs(startData, stopData, min_aa=100):
    """
    Pair every start codon with the first stop codon that follows it.

    An ORF runs from a start codon to the nearest stop codon located after it
    in the same reading frame. A pair is kept only when the stop lies at
    least ``min_aa`` codon positions after the start. Despite its name, this
    function prints nothing; it returns the pairs.

    A frame without any stop codon, or a start codon with no stop codon after
    it, contributes no ORF.

    :param startData: one list of start-codon indices per reading frame
    :param stopData: one list of stop-codon indices per reading frame, in the
        same frame order as ``startData``
    :param min_aa: minimum distance, in codons, between start and stop
    :return: one list per reading frame of ``(start, stop)`` index tuples,
        in the order the start codons were given
    Created by Ashwin Mukund and Haowen Zhou
    """
    from bisect import bisect_right

    readingFrameORFs = []
    for j, starts in enumerate(startData):
        # Sort once per frame so each start needs only a binary search rather
        # than a scan of the whole stop list.
        ordered_stops = sorted(stopData[j])
        frame_orfs = []
        for start in starts:
            # Index of the first stop strictly greater than ``start``.
            next_stop = bisect_right(ordered_stops, start)
            if next_stop == len(ordered_stops):
                continue  # no stop codon after this start
            nearest_stop = ordered_stops[next_stop]
            if nearest_stop - start >= min_aa:
                frame_orfs.append((start, nearest_stop))
        readingFrameORFs.append(frame_orfs)
    return readingFrameORFs
def sequenceORFs(ORFpairs, readingframes):
    """
    Extract the codons of every ORF from its reading frame.

    Each ``(start, stop)`` pair is a pair of codon indices into the reading
    frame at the same position in ``readingframes``. The returned slice
    includes both the codon at ``start`` and the codon at ``stop``.

    :param ORFpairs: one list of ``(start, stop)`` tuples per reading frame
    :param readingframes: list of reading frames, each a list of codons
    :return: one list per reading frame, holding one codon list per ORF
    Created by Ashwin Mukund
    """
    ORFs = []
    for j, pairs in enumerate(ORFpairs):
        # ``stop + 1`` keeps the stop codon itself in the extracted ORF.
        ORFs.append(
            [readingframes[j][start:stop + 1] for start, stop in pairs]
        )
    return ORFs