-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_loc_report.py
142 lines (128 loc) · 5.01 KB
/
generate_loc_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import argparse
import RNA
import localcider
from Bio.Seq import Seq
import shutil
def nascent_polypeptide(fa, proline_threshold):
# Checks upstream bases for nascent polypeptide arrest sequence
# Input: upstream bases, % proline threshold
# Output: dictionary with info about nascent chain
with open(fa) as f:
seq = f.read()
dna = Seq(seq)
amino_acids = dna.translate()
SeqOb = localcider.sequenceParameters.SequenceParameters(str(amino_acids))
kappa = SeqOb.get_kappa()
NCPR = SeqOb.get_NCPR()
prolines = 0
for char in amino_acids:
if char == 'P':
prolines = prolines + 1
pct_proline = 100 * prolines/len(amino_acids)
positive_residues = False
if (kappa > .5) ^ (NCPR > 0):
positive_residues = True
nascent_info = {}
if positive_residues or (pct_proline > proline_threshold):
nascent_info["Arrest Sequence"] = str(amino_acids)
nascent_info["Kappa"] = kappa
nascent_info["NCPR"] = NCPR
nascent_info["Percent Prolines"] = pct_proline
return nascent_info
else:
nascent_info["Arrest Sequence"] = "-"
nascent_info["Kappa"] = "-"
nascent_info["NCPR"] = "-"
nascent_info["Percent Prolines"] = "-"
return nascent_info
def calc_free_energy(segment, aSD):
# Finds SD interactions in a given sequence
# Input: segment sequence, anti-SD sequence
# Output: list of dictionaries with SD interactions
fc = RNA.fold_compound(segment[0] + '&' + aSD)
# compute MFE and MFE structure
(mfe_struct, mfe) = fc.mfe()
SD_interactions = []
if mfe <= -4.4:
SD_interactions.append({"SD-like Sequence": segment[0], "SD distance from shift": segment[1],"SD MFE": mfe})
return SD_interactions
def SD(sequence, aSD, candidate_SD_region_len):
# Finds the minimum MFE SD interaction in candidate SD region
# Input: upstream sequeence, anti-SD sequence, candidate region length
# Output: dictionary with most likely SD interaction
with open(sequence) as f:
reference = f.read()
f.close()
candidate_SD_region = reference[-candidate_SD_region_len:]
for i in range(len(candidate_SD_region) - 4):
loc = len(candidate_SD_region) - i #define starting loc for possible SD motif, this is also distance until slippery site
SD_interactions = calc_free_energy((candidate_SD_region[i:i+4], loc), aSD)
if SD_interactions == []:
return {"SD-likesequence": "-", "SD distance from shift": "-","SD MFE": "-"}
min_mfe = 0
min_mfe_dict = {}
# need to account for ties later
for SD_dict in SD_interactions:
if SD_dict["SD MFE"] < min_mfe:
min_mfe_dict = SD_dict
return min_mfe_dict
def eval_downstream_structure(sto, ref, bases):
# Looks for downstream structure
# Input: CaCoFold fold output, original sequence, number of unpaired bases
# Output: Whether there is a downstream structure that will stall translation, and CaCoFold structure
# Read in reference
with open(ref) as f:
reference = f.read()
f.close()
# Read in CaCoFold output
with open(sto) as f:
CaCoFold_out = f.read()
f.close()
#evaluate structure
for line in CaCoFold_out.split('#'):
if line[0:11] == "=GC SS_cons":
for string in line.split(" "):
if len(string) > 7:
structure = string
check = structure[0:bases]
struct = False
for char in check:
if char != ":":
struct = True
break
return struct, structure
def generate_output(sto, ref, num_bases, upstream_bases, len_SD_region, aSD, proline_threshold, report_file, loc):
# Generates dictionary with information about this slippery sequence
# Only writes to output if there is a downstream structure that will stall translation
struct, structure_string = eval_downstream_structure(sto, ref, num_bases)
if struct == True:
# Look for SD-like sequence
min_mfe_dict = SD(upstream_bases, aSD, len_SD_region)
# Look for nascent polypeptide arrest sequence
nascent = nascent_polypeptide(upstream_bases, proline_threshold)
loc_dict = nascent.copy()
for key, value in min_mfe_dict.items():
loc_dict[key] = value
loc_dict["Slippery Sequence Start"] = loc.split('-')[2]
loc_dict["cDNA"] = loc.split('/')[1]
loc_dict["Structure"] = structure_string[:-1]
with open(report_file, 'a') as f:
f.write(str(loc_dict))
f.write("\n")
f.close()
return
else:
return
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-S', default=None, type=str, help='Path to CaCoFold output')
parser.add_argument('-F', default=None, type=str, help='Path to reference sequence')
parser.add_argument('-B', default=20, type=int, help='# downstream bases to evaluate for structure')
parser.add_argument('-U', default=None, type=str, help='Path to upstream bases')
parser.add_argument('-R', default=20, type=int, help='Length of candidate SD region')
parser.add_argument('-A', default='AUCACCUCCUUU', type=str, help='Anti SD sequence')
parser.add_argument('-P', default=None, type=int, help='Proline threshold')
parser.add_argument('-O', default=None, type=str, help='Path to report file')
parser.add_argument('-L', default=None, type=str, help='Slippery sequence loc')
args = parser.parse_args()
generate_output(args.S, args.F, args.B, args.U, args.R, args.A, args.P, args.O, args.L)