-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprotein_modeling.py
120 lines (114 loc) · 5.49 KB
/
protein_modeling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os, glob, csv, sys
import pandas as pd
from collections import defaultdict
import pyrosetta
pyrosetta.init()
EMBOSS = '/usr/local/emboss/bin/needle'
template_sequence_library = glob.glob(os.path.join(os.getcwd(), 'pdb_sequeuce_fasta/*.fasta'))
aligned_hits = 'template_hits.csv'
template_pdb_files = os.path.join(os.getcwd(), 'pdb_structure_files')
def sequence_alignment(target_sequence, template_sequence):
'''Protein sequence alignment using EMBOSS program'''
for template_seq in template_sequence:
target_seq_id = os.path.basename(target_sequence).split('.')[0]
template_seq_id = os.path.basename(template_seq).split('.')[0]
os.system('{0} -sid1 {1} -asequence {2} -sid2 {3} -bsequence {4} -gapopen 10.0 -gapextend 0.5 -aformat3 markx3 -outfile {1}_{3}.needle'.format(EMBOSS, target_seq_id, target_sequence, template_seq_id, template_seq))
def top_hits_from_sequence_alignment(alignend_sequence):
'''Selecting the template hits based on the ranking of score value'''
result_table = pd.DataFrame(columns = ('query', 'template', 'length', 'identity', 'similarity', 'gaps', 'score'))
emboss_ind_title = 1
for alignment_file in alignend_sequence:
with open(alignment_file, 'r') as readFILE:
for line in readFILE:
line = line.strip()
if '# 1:' in line:
query = line.split()[-1]
elif '# 2:' in line:
template = line.split()[-1]
elif '# Length:' in line:
length = line.split()[-1]
elif '# Identity:' in line:
identity = line.split()[-1].replace('(', '').replace(')', '').replace('%', '')
elif '# Similarity:' in line:
similarity = line.split()[-1].replace('(', '').replace(')', '').replace('%', '')
elif '# Gaps:' in line:
gaps = line[24:].replace('(', '').replace(')', '').replace('%', '')
elif '# Score:' in line:
score = line.split()[-1].replace('(', '').replace(')', '').replace('%', '')
result_table.loc[emboss_ind_title] = query, template, float(length), float(identity), float(similarity), float(gaps), float(score)
emboss_ind_title += 1
result_table['score_rank'] = result_table['score'].rank(method = 'dense', ascending=0)
top_hit_table = result_table.sort_values('score_rank', ascending=True)
top_hit_table[:10].to_csv(aligned_hits, sep=',', index=False)
def modeling(template_hits, template_pdb_path, target_seq_path):
'''Modeling of protein structure using PyRosetta pose manipulation application'''
top_hit_template_file_path = []
with open(template_hits, newline='') as csvFile:
reader = csv.DictReader(csvFile)
for row in reader:
target_seq = os.path.basename(target_seq_path).split('.')[0]
alignment_file_name = '{}_{}.needle'.format(target_seq, row['template'])
alignment_file_path_with_name = os.path.join(os.getcwd(), alignment_file_name)
top_hit_template_file_path.append(alignment_file_path_with_name)
aligned_seq = defaultdict(list)
for path in top_hit_template_file_path:
target_template_file_name = os.path.splitext(os.path.basename(path))[0]
target_name_fasta_format = '>{} ..'.format('_'.join(target_template_file_name.split('_')[0:2]))
template_name_fasta_format = '>{} ..'.format('_'.join(target_template_file_name.split('_')[2:]))
target_aligned_seq = ''
template_aligned_seq = ''
with open (path, 'r') as readFile:
parse = False
parse2 = False
for line in readFile:
line = line.strip()
if not parse:
if line.startswith(target_name_fasta_format):
parse = True
elif line.startswith(template_name_fasta_format):
parse = False
else:
target_aligned_seq+=line
if not parse2:
if line.startswith(template_name_fasta_format):
parse2 = True
elif line.startswith('#'):
parse2 = False
else:
template_aligned_seq += line
aligned_seq[target_template_file_name].append(target_aligned_seq)
aligned_seq[target_template_file_name].append(template_aligned_seq)
target_seq_for_modeling = {}
for name, alignment_file in aligned_seq.items():
# top_hits_alignment = '{}\n{}\n{}\n\n'.format(name, alignment_file[0], alignment_file[1])
# with open('top_hits_alignment.txt', 'a') as writeFile:
# writeFile.write(top_hits_alignment)
target_seq_based_on_temp_pdb = ''
for i in range(len(alignment_file[0])):
if not alignment_file[1][i] == '-':
target_seq_based_on_temp_pdb += alignment_file[0][i]
target_seq_for_modeling[name]=target_seq_based_on_temp_pdb
final_target_template_for_modeling = {}
for target_template, target_final_seq in target_seq_for_modeling.items():
template_name = '_'.join(target_template.split('_')[2:])
temp_list_dir = os.listdir(template_pdb_path)
for template_hit in temp_list_dir:
if template_name in template_hit:
final_target_template_for_modeling[template_hit] = target_final_seq
for template_pdb, target_seq in final_target_template_for_modeling.items():
output_model_name = '{}_{}.pdb'.format(target_seq_path.split('.')[0], '_'.join(template_pdb.split('_')[0:2]))
join_apo_dir_path = os.path.join(template_pdb_path, template_pdb)
pose = pyrosetta.pose_from_file(join_apo_dir_path)
assert(pose.size() == len(target_seq))
scorefxn = pyrosetta.get_fa_scorefxn()
for i in range(len(target_seq)):
seqpos = i + 1
name1 = target_seq[i]
if name1 == "-":
continue
pyrosetta.rosetta.protocols.toolbox.pose_manipulation.repack_this_residue(seqpos, pose, scorefxn, True, name1)
pose.dump_pdb(output_model_name)
if __name__ == "__main__":
sequence_alignment(sys.argv[1], template_sequence_library)
top_hits_from_sequence_alignment(glob.glob('*.needle'))
modeling(aligned_hits, template_pdb_files, sys.argv[1])