-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathchecks.py
357 lines (273 loc) · 14.1 KB
/
checks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
"""In this file multiple checks are defined for the protein.
"""
import os
import math
import logging
check_logger = logging.getLogger('pymemdyn.checks')
try:
import modeller
from modeller import automodel
except:
check_logger.warning("""!! WARNING !! : No installation of MODELLER was found.
Missing loops and/or sidechains cannot be remodelled and will cause errors.""")
from aminoAcids import AminoAcids
from Bio.PDB import PDBParser, PDBIO, Select
from collections import Counter
import logging
check_logger = logging.getLogger('pymemdyn.checks')
class CheckProtein():
def __init__(self, *args, **kwargs):
self.pdb = kwargs['pdb']
self.chains = kwargs['chains']
self.loop_fill = kwargs['loop_fill']
self.logger = logging.getLogger('pymemdyn.checks.checkProtein')
self.aa = AminoAcids()
self.nr_missing = 0
def find_missingLoops(self, pdb_chain, chain):
"""
Check if the residue numbering is continuous, write sequence.
If loops are missing, return missingLoc.
"""
with open(pdb_chain, 'r') as inf:
lines = inf.readlines()
# Initialize params
missingLoc = {}
resID_prev = 0
prev_chain = ''
first_res = True
first_chain_res = False
pdbseq = ''
aa_d = self.aa.codes321 # 3-letter code to 1 letter code dict is now aa_d
# Loop over protein.pdb
for i, line in enumerate(lines):
splitted = line.split() # Careful here, sometimes columns are stuck together!
# Determine if line is useful
if not splitted[0] == 'ATOM':
if splitted[0] == 'TER': # End of chain
first_chain_res = True
self.logger.debug('found TER at line {}'.format(i))
pdbseq += '/'
continue # Skip to next line
else: # Useful line
try:
resID = int(line[22:26])
# Save ID of the first residue
if first_res:
self.first_res_ID = int(line[22:26])
self.logger.debug('first resID: {}'.format(self.first_res_ID))
resID_prev = resID - 1
first_res = False
chainID = line[21]
except:
raise Exception("Cannot read resID or chain ID in the following line: \n{}".format(line))
# Check for non-std res
if line[17:20] not in list(aa_d.keys()):
self.logger.exception("Residue {}, {} is not a standard amino acid. Non-standard amino acids are not supported.".format(resID, splitted[3]))
raise Exception("Residue {}, {} is not a standard amino acid. Non-standard amino acids are not supported.".format(resID, splitted[3]))
resIDchain = chainID + str(resID)
if (resID != resID_prev) and (resID != resID_prev + 1): # missing
if first_chain_res:
first_chain_res = False
self.logger.debug('Ignoring residue {} for missing residue check (first residue of chain)'.format(resID))
else:
missingLoc[prev_chain+str(resID_prev)] = resIDchain
self.nr_missing = resID - resID_prev -1
pdbseq += '-'*self.nr_missing # for missing residue
if resID != resID_prev: # next res
pdbseq += aa_d[line[17:20]] # 1 letter code
resID_prev = resID
prev_chain = chainID
# End of for-loop
self.logger.debug('length missingLoc: {}'.format(len(missingLoc)))
# Generate fasta file (not used, but nice for debugging)
seq = open(f"seq_original_{chain}.fasta", "w")
seq.write(">Generated automatically from {}\n".format(self.pdb))
seq.write(pdbseq)
self.seq = pdbseq
return missingLoc
def find_missingSideChains(self, pdb_chain, chain):
self.structure = PDBParser(QUIET=True).get_structure('protein', pdb_chain)
missing_sideChains = []
residues_to_exclude = []
residues_to_keep = []
for model in self.structure:
for chains in model:
for residue in chains:
atom_count = Counter()
current_atoms = [atom.element for atom in residue if atom.element != 'H'] # Exclude hydrogen
for atom in current_atoms:
atom_count[atom] += 1
amino = residue.resname
self.logger.debug('Residue {}: {} atom counts ({}) | reference atom counts ({}).'.format(residue.id[1], amino, self.aa.sideChains[amino], atom_count))
if self.aa.sideChains[amino] != atom_count:
self.logger.info('Residue {}: {} atom counts ({}) does not match with reference atom counts ({}). It will be deleted from your pdb and replaced with MODELLER'.format(residue.id[1], amino, self.aa.sideChains[amino], atom_count))
missing_sideChains.append((residue.id[1], amino))
residues_to_exclude.append(residue)
else:
residues_to_keep.append(residue)
class ResiduesSelect(Select):
def __init__(self, residues):
self.residues = residues
"""A custom selector that excludes specific residues."""
def accept_residue(self, residue):
return residue in self.residues
io = PDBIO()
io.set_structure(self.structure)
io.save(f'{self.pdb[:-4]}_{chain}.pdb', ResiduesSelect(residues_to_keep))
return missing_sideChains
def make_ml_pir(self, **kwargs):
"""
make_ml_pr: Modify missing regions and create a MODELLER alignment file (.pir)
:kwarg work_dir: working directory
:kwarg tgt1: alignment.pir
"""
with open(os.path.join(kwargs["work_dir"], self.pdb), 'r') as in_pdb:
lines_pdb = in_pdb.readlines()
self.logger.info(f'Identified chains:\t{self.chains}')
broken_chains = []
for chain in self.chains:
self.logger.info(f'Checking chain:\t{chain}')
lines_chain = []
for line in lines_pdb:
try:
if (line[21] == chain):
lines_chain.append(line)
except:
pass
self.chain_pdb = f'{self.pdb[:-4]}_{chain}.pdb'
with open(self.chain_pdb, 'w') as file:
for line in lines_chain:
file.write(line)
missingLoc = self.find_missingLoops(self.chain_pdb, chain)
if len(missingLoc) == 0:
self.logger.info('No missing loops found')
else:
self.logger.info('Missing loops found in locations {}. Will be filled with poly-ala.'.format(list(missingLoc.keys())))
missing_SideChains = self.find_missingSideChains(self.chain_pdb, chain)
if len(missing_SideChains) == len(missingLoc) == 0:
continue
else:
broken_chains.append(chain)
pdbseq = self.seq
self.logger.debug('pdbseq: {}'.format(pdbseq))
if pdbseq.endswith("/"):
pdbseq = pdbseq[:-1] # looses trailing '/'
mod_seq = pdbseq
tmpl_seq = pdbseq
for sc in missing_SideChains:
loc = sc[0] - self.first_res_ID
code = self.aa.codes321[sc[1]]
tmpl_seq = tmpl_seq[:loc] + '-' + tmpl_seq[loc+1:]
mod_seq = mod_seq[:loc] + code + mod_seq[loc+1:]
target = ''
self.logger.debug(list(missingLoc.keys()))
first_res_ID = self.first_res_ID
for loopstart in list(missingLoc.keys()):
loop = int(loopstart[1:])
self.logger.debug('loop: {}\n'.format(loop))
chain = loopstart[0]
self.logger.debug('chain: {}\n'.format(chain))
for line in lines_pdb:
if line[:4] == "ATOM":
if int(line[22:26]) == loop and \
"".join(line[11:17]).strip() == "C" and line[21] == chain:
start = [float(line[30:38]), float(line[38:46]), float(line[46:54])]
target = int(missingLoc[loopstart][1:])
self.logger.debug('loop: {}\n'.format(loop))
if int(line[22:26]) == target and \
"".join(line[11:17]).strip() == "N" and line[21] == chain:
end = [float(line[30:38]), float(line[38:46]), float(line[46:54])]
self.logger.debug('target: {}\n'.format(target))
aa_dist = float(self.loop_fill)
x = abs(start[0] - end[0])
y = abs(start[1] - end[1])
z = abs(start[2] - end[2])
dist = math.sqrt(x*x + y*y + z*z)
num_aa = math.ceil(dist / aa_dist)
self.logger.debug('\tnum_aa: {}\n'.format(num_aa))
# Determine which chain we are in (and how many '/'-signs we need to ignore)
ignore_chain_characters = self.chains.index(chain)
start_seq_index = loop - first_res_ID +1 - ignore_chain_characters # (we ignored the beginloop)
end_seq_index = target - first_res_ID - ignore_chain_characters
self.logger.debug('\tstart: {}'.format(start_seq_index))
self.logger.debug('\tend: {}\n'.format(end_seq_index))
loop_seq_dash = pdbseq[start_seq_index:end_seq_index] # '----'
before = pdbseq[start_seq_index-2:start_seq_index] # 'AB'
after = pdbseq[end_seq_index:end_seq_index+2] # 'BA'
loop_seq = before + loop_seq_dash + after # 'AB----BA'
self.logger.debug('\tbefore: {}, dash: {}, after: {}'.format(before, loop_seq_dash, after))
self.logger.debug('\tloop_seq: {}\n'.format(loop_seq))
gaps = before + "A" * num_aa + after # 'ABAABA'
loop_replace = before + num_aa *'-' + after # 'AB--BA'
self.logger.debug('\tgaps: {}\n'.format(gaps))
self.logger.debug('\tloop_rep: {}\n'.format(loop_replace))
mod_seq = mod_seq.replace(loop_seq, gaps)
tmpl_seq = tmpl_seq.replace(loop_seq, loop_replace)
# while tmpl_seq.endswith('-'):
# tmpl_seq = tmpl_seq[:-1]
# mod_seq = mod_seq[:-1]
mod_seq += "*"
tmpl_seq += "*"
self.logger.debug(mod_seq)
self.logger.debug(tmpl_seq)
self.logger.debug("these lengths should be equal: {}, {}".format(len(mod_seq), len(tmpl_seq)))
# Write to .pir file
# tgt is the alignment.pir file
tgt1 = open(os.path.join(kwargs['work_dir'], f'alignment_{chain}.pir'), "w")
tgt1.write(f'>P1;{self.chain_pdb}\n')
firstResID_tmpl = int(first_res_ID) + (len(tmpl_seq) - len(tmpl_seq.lstrip('-')))
finalResID = int((len(pdbseq) + 1))
tgt1.write(f'structure:{self.chain_pdb}:{firstResID_tmpl}:{chain}:{finalResID+first_res_ID-1}:{chain}::: 2.10:-1.00\n')
new_line = 0
for aa in tmpl_seq:
tgt1.write(aa)
new_line += 1
if (new_line % 60) == 0:
tgt1.write("\n")
tgt1.write(f"\n>P1;refined_{chain}\n")
tgt1.write(f"sequence:::::::::\n")
new_line = 0
for aa in mod_seq:
tgt1.write(aa)
new_line += 1
if (new_line % 60) == 0:
tgt1.write("\n")
tgt1.close()
return broken_chains
def refine_protein(self, **kwargs):
"""
Refine protein structure using MODELLER
:kwarg knowns: the .pdb file to be refined
:returns: .pdb file of refined protein
"""
first_res = self.first_res_ID
self.logger.debug('self.chains: {}{}'.format(self.chains, type(self.chains[0])))
chains = self.chains
# class MyModel(automodel.AutoModel):
# def special_patches(self, aln):
# # Renumber residues in chains
# self.rename_segments(segment_ids=chains,
# renumber_residues=[first_res, first_res, first_res, first_res])
env = modeller.Environ()
env.io.atom_files_directory = ['.', '../atom_files']
a = automodel.AutoModel(env,
alnfile = f'alignment_{kwargs["chain"]}.pir',
knowns = kwargs["knowns"],
sequence = f'refined_{kwargs["chain"]}')
a.starting_model= 1
a.ending_model = 1
a.md_level = None
a.make()
# Get new file name
refined_pdb = a.get_model_filename(root_name=f'refined_{kwargs["chain"]}', id1=9999, id2=1, file_ext='.pdb')
# set chain ID
with open(refined_pdb, "r") as src:
tgt_prot = open('refined_tmp.pdb', "w")
for line in src:
if line.startswith("ATOM"):
# correct protein chain IDs
if line[21] != kwargs["chain"]:
line = line[:21] + kwargs["chain"] + line[22:]
tgt_prot.write(line)
os.rename('refined_tmp.pdb', refined_pdb)
return refined_pdb