-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdigestion.py
executable file
·227 lines (167 loc) · 8 KB
/
digestion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ------------------------------------------------------------------------------
"""
This script define the class Digestion which perform the in silico digestion and
check the uniqueness of peptide sequences.
It contains also the lines for command line usage.
"""
# ------------------------------------------------------------------------------
import re
from textwrap import dedent
import numpy as np
import reader
import enzyme
import mass
import cli
# ------------------------------------------------------------------------------
class Digestion:
""" Instantiate this class to perform in silico digestion and find unique
peptide """
# Default parameter. Can be change with self.set<param>(value)
_enzyme = 'trypsin' # See enzyme.py for more enzymes choice
_min_length = 7
_max_length = None
_max_miscleavages = 1
_max_mass = 4600 # in dalton
def __init__(self, fasta):
self.fasta = fasta
self.params = {'enzyme': Digestion._enzyme,
'min_length': Digestion._min_length,
'max_length': Digestion._max_length,
'max_miscleavages': Digestion._max_miscleavages,
'max_mass': Digestion._max_mass,
'clip_nterm_m': False
}
self.proteins = reader.fasta_to_dict(self.fasta)
self.peptides = {id: [] for id in self.proteins}
self.outdir = f'{fasta}_digestedPeptides.csv'
self.is_unique = {}
def __str__(self):
return dedent(f"""\
Parameters:
Enzyme: {self.params['enzyme']}
Minimum peptide sequence length: {self.params['min_length']}
Maximum peptide sequence length: {self.params['max_length']}
Maximum of miscleavages: {self.params['max_miscleavages']}
Maximum peptide's mass: {self.params['max_mass']}
""")
def set_enzyme(self, enzyme_name):
self.params['enzyme'] = enzyme_name.lower()
def set_min_length(self, minimum):
self.params['min_length'] = int(minimum)
def set_max_length(self, maximum):
self.params['max_length'] = int(maximum)
def set_max_miscleavages(self, miscleavages):
self.params['max_miscleavages'] = int(miscleavages)
def set_max_mass(self, _mass):
self.params['max_mass'] = float(_mass)
def set_clip_nterm_m(self, bool_):
self.params['clip_nterm_m'] = bool_
def set_outdir(self, dir_):
self.outdir = dir_
def _is_cleaved(self):
return len([pep for peptides in self.peptides.values() for pep in peptides]) != 0
def _get_mass(self, seq):
"""Sum the mass of all residues and add the mass of H (N-terminal) and OH
(C-terminus) to obtain the sequence mass"""
_mass = sum((mass.aminoAcid[aa] for aa in seq)) + mass.atom['H']*2 + mass.atom['O']
return round(_mass, 4)
def _join_sequences(self, base_sequences, miss):
""" Iterate through base sequences (fully digested peptide) and join miss+1
adjacent sequences """
num_sequences = len(base_sequences)-miss # Number of sequences to loop on
return [''.join(base_sequences[n:n+miss+1]) for n in range(num_sequences)]
def _is_good_peptide(self, seq):
""" Filter sequences with given threshold(s)"""
if self.params['min_length'] is not None and \
self.params['min_length'] > len(seq):
return False
if self.params['max_length'] is not None and \
self.params['max_length'] < len(seq):
return False
try:
if self.params['max_mass'] is not None and \
self.params['max_mass'] < self._get_mass(seq):
return False
# Catch non-amino acid characters (such as X)
except KeyError:
return False
return True
def _format_line(self, peptide, id_, include_unique_flag=False):
if include_unique_flag:
uniqueness = 'unique' if self.is_unique[peptide] else 'non-unique'
return f'{peptide}, {id_}, {self._get_mass(peptide)}, {uniqueness}\n'
return f'{peptide}, {id_}, {self._get_mass(peptide)}\n'
def write(self, outdir=None):
""" Write results to a csv file """
if not self._is_cleaved():
raise ValueError('No sequences found. Please use cleave_proteins() to generate sequences')
if outdir is not None:
self.set_outdir(outdir)
include_unique_flag = len(self.is_unique) != 0
with open(self.outdir, 'w', encoding='UTF-8') as out_file:
for id_ in self.peptides:
for peptide in self.peptides[id_]:
out_file.write(self._format_line(peptide, id_, include_unique_flag))
def cleave_proteins(self):
""" Perform an in silico digestion of proteins with previously selected
or default parameters """
regexp = enzyme.cleavage_site[self.params['enzyme']]
for id_, protein_sequences in self.proteins.items():
# split protein into perfectly digested sequences
base_sequences = re.split(regexp, protein_sequences)
miss = 0
while miss <= self.params['max_miscleavages']:
# Join base sequences in function of actual miscleavage value
all_peptides = self._join_sequences(base_sequences, miss)
if all_peptides == []:
miss += 1
continue
# Also consider N-terminal peptide without its methionine
if not self.params['clip_nterm_m']:
first_pep = all_peptides[0]
all_peptides.append(re.sub(r'^[Mm]', '', first_pep))
# Filter all peptide with given threshold(s)
if len(all_peptides) > 0:
self.peptides[id_].extend([pep for pep in all_peptides
if self._is_good_peptide(pep)])
miss += 1
def check_peptide_uniqueness(self):
""" Check if peptide sequences are unique in the whole fasta file (i.e.
has no identical match in other proteins)"""
# Get all peptides. Use set() to remove peptide duplicates from same protein (if any)
all_peptides = [peptide for peptides in self.peptides.values()
for peptide in set(peptides)]
# Find frequency of each peptide (does not account peptide duplicates from same protein)
peptides, count = np.unique(all_peptides, return_counts=True)
peptides_frequency = dict(zip(peptides, count))
self.is_unique = {pep: (peptides_frequency[pep] == 1)
for peptides in self.peptides.values() for pep in peptides}
# ------------------------------------------------------------------------------
# For command line usage
if __name__=='__main__':
args = cli.parse_args()
run = Digestion(args.fasta_file)
run.set_min_length(args.min_length)
if args.max_length is not None:
run.set_max_length(args.max_length)
run.set_max_miscleavages(args.miscleavages)
run.set_max_mass(args.mass)
run.set_clip_nterm_m(args.clip_nterm_m)
run.set_enzyme(args.enzyme)
if args.output is not None:
run.set_outdir(args.output)
print(dedent(f"""
Performing digestion with:
min_length = {run.params['min_length']}
max_length = {run.params['max_length']}
enzyme = '{run.params['enzyme']}'
max_mass = {run.params['max_mass']}
max_miscleavages = {run.params['max_miscleavages']}
"""))
run.cleave_proteins()
if args.unique is True:
run.check_peptide_uniqueness()
run.write()
print(f'List of peptide sequences saved as {run.outdir}')