forked from amirbitran/dbfold
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_snapshot_configs.py
110 lines (82 loc) · 7.83 KB
/
get_snapshot_configs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 16 08:13:37 2018
Loops through desired snapshots. For each native substructure, computes average
distance between residues participating in that substrucutre in the current
snapshot. This value is also refered to as a "score"
In general, you can apply substructures from a protein of length L to analyze
a smaller protein of length L'<=L (e.g. a truncation of the larger protein)
If so, then we truncate the contact map/substructures at L' residues
Example instances, assuming the directory to analyze is located at a path ../1igd/MultiUmbrella17:
python get_snapshot_configs.py --directory=../1igd/MultiUmbrella17 --root='1igd' --native_file=../1igd/files/1igd_0.200_20_Emin.pdb --d_cutoff=6.5 --min_seq_separation=3 --contact_sep_thresh=7
python get_snapshot_configs.py --directory=../1igd/MultiUmbrella17 --root='1igd' --native_file=../1igd/files/1igd_0.200_20_Emin.pdb --substructure_file=1igd_data/Substructures.dat
python get_snapshot_configs.py --directory=../1igd/MultiUmbrella17 --root='1igd' --native_file=../1igd/files/1igd_0.200_20_Emin.pdb --substructure_file=1igd_data/Substructures.dat --temperatures='0.800, 0.825' --output_filename='SS_test.dat'
python get_snapshot_configs.py --directory=../1igd/MultiUmbrella17 --root='1igd' --native_file=../1igd/files/1igd_0.200_20_Emin.pdb --substructure_file=1igd_data/Substructures.dat --temperatures='0.800, 0.825' --output_filename='SS_test.dat' --min_step=0 --max_step=100000000
@author: amirbitran
"""
import numpy as np
import joblib
import dbfold
import dbfold.analyze_structures
import dbfold.utils
import glob
import argparse
parser = argparse.ArgumentParser(description='Hi')
parser.add_argument("--directory", help = 'This is the path to the directory containing the PDB files we want to analyze...' )
parser.add_argument("--root", help = 'The root for PDB files...all PDB files will be named {root}_{temperature}_{setpoint}.{MC timestep} for equilibrium simulations, and {root}_{temperature}_{traj number}.{MC timestep} for unfolding simulations')
parser.add_argument("--native_file", help = 'Path to the native file for the protein whose snapshots you want to analyze. Must be specified even if you provide your own substructures in advance')
parser.add_argument("--d_cutoff", default = '6.5', help = 'distance cutoff (in angstroms) used to compute contacts. If not specified, then --substructure_file must be specifed instead so that pre-computed substructures can be used. Defaults to 6.5.')
parser.add_argument("--min_seq_separation", default = '8', help = 'min separation in sequence between two residues for them to be counted as being in contact. If not specified, then --substructure_file must be specifed instead so that pre-computed substructures can be used. Defaults to 8')
parser.add_argument("--min_clustersize", default = '5', help = 'mininmum size of island of contacts for it to be considered a substructure. If not specified, then --substructure_file must be specifed instead so that pre-computed substructures can be used. Defaults to 5')
parser.add_argument("--contact_sep_thresh", default = '5', help = 'two contacts cannot be separated by a Manhattan distance greater than this on the contact map for them to be grouped into an island. If not specified, then --substructure_file must be specifed instead so that pre-computed substructures can be used. Defaults to 5')
parser.add_argument("--substructure_file", required = False, help = 'Path to a .dat file which contains substructures and native distances. This could have been generated by creating a Protein class and running generate_subs. If this is not specified, then the arguments --d_cutoff, --min_seq_separation, --min_islesize, and --max_contact_sep must be specified instead so that substructures can be computed de novo. Note that if a substructure_file is provided, the d_cutoff and min_seq_separation values there take precedence over any entered as command-line arguments')
parser.add_argument("--min_step", default = '0', help = 'minimum MC step to analyze. Defaults to 0.')
parser.add_argument("--max_step", default = 'inf', help = 'maximum MC step to analyze. Defaujlts to infinity.')
parser.add_argument("--temperatures", default = "*.***", type = str, help = "Temperatures at which you want to run analysis, as a comma-separated string. For instnace you can type --temperatures='0.800, 0.900' or --temperatures = '0.8**' or --temperatures='*.***', the latter being the default ")
parser.add_argument("--output_filename", default = "Substructure_scores.dat", help = "A file with this name, which contains scores for all snapshots, will be saved in directory. Defaults to Substructure_scores.dat ")
args = parser.parse_args()
directory = args.directory
fileroot = args.root
native_file=args.native_file
d_cutoff = float(args.d_cutoff)
min_seq_separation=int(args.min_seq_separation)
min_clustersize=int(args.min_clustersize)
contact_sep_thresh=int(args.contact_sep_thresh)
min_step=float(args.min_step)
max_step=float(args.max_step)
temperatures = [item for item in args.temperatures.split(',')]
output_filename=args.output_filename
if args.substructure_file:
substructure_file=args.substructure_file
print('Pre-made substructures provided in {}'.format(substructure_file))
native_substructures, native_distances, d_cutoff, min_seq_separation = joblib.load(substructure_file)
print('Pre-made substructures loaded..')
#native_contacts=np.zeros(np.shape(native_distances))
#native_contacts[np.where((native_distances<d_cutoff) & (native_distances!=0))]=1
#note, by native_distances and native_contacts, we actually mean the distances/contacts that are found in the protein used to produce the substructure file
#So for instance, if the substructure file is the full version of the protein of interest, then we define the contacts/distnaces in the full protein as the "native" ones
#now, load the protein of interest's contacts to figure out if we need to truncate the native contacts
zz = dbfold.analyze_structures.find_native_contacts(native_file, d_cutoff, min_seq_separation)
length=np.shape(zz)[0]
if length<np.shape(native_substructures)[0]:
native_substructures = native_substructures[0:length, 0:length, :] #only keep whatever portions of substructures are acutally found in the current protein
native_distances=native_distances[0:length, 0:length] #same for native distances and contacts
#native_contacts=native_contacts[0:length, 0:length]
else:
print('Identifying substructures de novo!')
native_contacts, native_substructures=dbfold.analyze_structures.identify_native_substructures(native_file, d_cutoff, min_seq_separation, contact_sep_thresh, min_clustersize, plot=False )
native_distances=dbfold.analyze_structures.find_native_contacts(native_file, d_cutoff, min_seq_separation, mode = 'distances')
print('{} snapshots identified!'.format(np.shape(native_substructures)[2]))
PDB_files=[]
for temp in temperatures: PDB_files+=glob.glob('{}/{}_{}*.*0'.format(directory, fileroot, temp))
times = dbfold.utils.get_times(PDB_files)
PDB_files = [file for f, file in enumerate(PDB_files) if times[f]>=min_step and times[f]<max_step ]
scores=np.zeros((len(PDB_files), np.shape(native_substructures)[2])) #scores[f, n] tells you, for file f, what is the average distance between alpha carbons that were assigned to native substructure n
print('Time to assign scores! \n {} files total'.format(len(PDB_files)))
for f, file in enumerate(PDB_files):
if np.mod(f,500)==0:
print('{} files completed'.format(f))
scores[f,:], distances = dbfold.analyze_structures.score_snapshot(file, native_substructures, min_seq_separation = min_seq_separation)
joblib.dump([scores, PDB_files, native_distances, native_substructures], open('{}/{}'.format(directory, output_filename), "wb"), compress=3 )
print('Scores obtained and saved! Have a good day!')