-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathextractor.py
69 lines (58 loc) · 3.05 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Borrowed code from ForestMHC

# Hydropathy value for each residue, keyed by one-letter amino-acid code.
hydropathy_score = {
    'G': -0.4, 'A': 1.8, 'P': -1.6, 'V': 4.2, 'L': 3.8,
    'I': 4.5, 'M': 1.9, 'F': 2.8, 'Y': -1.3, 'W': -0.9,
    'S': -0.8, 'T': -0.7, 'C': 2.5, 'N': -3.5, 'Q': -3.5,
    'K': -3.9, 'H': -3.2, 'R': -4.5, 'D': -3.5, 'E': -3.5,
}

# Molar mass for each residue, keyed by one-letter amino-acid code.
molar_mass = {
    'G': 75, 'A': 89, 'P': 115, 'V': 117, 'L': 131,
    'I': 131, 'M': 149, 'F': 165, 'Y': 181, 'W': 204,
    'S': 105, 'T': 119, 'C': 121, 'N': 132, 'Q': 146,
    'K': 146, 'H': 155, 'R': 174, 'D': 133, 'E': 147,
}

# Residue codes in the key order of ``molar_mass``; the position of a code
# in this list is the index that make_hot() sets to 1.
amino_acids = list(molar_mass)

# Residues treated as aromatic by make_aromatic().
aromatics = list('FYW')

# Expected number of entries in every per-residue lookup table.
num_aa = 20
def make_kidera():
    """Build the per-residue Kidera-factor lookup table.

    The factors are stored inline as one string: residue records are
    separated by ``;`` and, within a record, the one-letter residue code
    comes first followed by its comma-separated factor values.

    Returns:
        dict: maps each one-letter residue code to a new list of ten
        floats, in the order the values appear in the record.
    """
    kidera = "A,-1.56,-1.67,-0.97,-0.27,-0.93,-0.78,-0.2,-0.08,0.21,-0.48;R,0.22,1.27,1.37,1.87,-1.7,0.46,0.92,-0.39,0.23,0.93;N,1.14,-0.07,-0.12,0.81,0.18,0.37,-0.09,1.23,1.1,-1.73;D,0.58,-0.22,-1.58,0.81,-0.92,0.15,-1.52,0.47,0.76,0.7;C,0.12,-0.89,0.45,-1.05,-0.71,2.41,1.52,-0.69,1.13,1.1;Q,-0.47,0.24,0.07,1.1,1.1,0.59,0.84,-0.71,-0.03,-2.33;E,-1.45,0.19,-1.61,1.17,-1.31,0.4,0.04,0.38,-0.35,-0.12;G,1.46,-1.96,-0.23,-0.16,0.1,-0.11,1.32,2.36,-1.66,0.46;H,-0.41,0.52,-0.28,0.28,1.61,1.01,-1.85,0.47,1.13,1.63;I,-0.73,-0.16,1.79,-0.77,-0.54,0.03,-0.83,0.51,0.66,-1.78;L,-1.04,0,-0.24,-1.1,-0.55,-2.05,0.96,-0.76,0.45,0.93;K,-0.34,0.82,-0.23,1.7,1.54,-1.62,1.15,-0.08,-0.48,0.6;M,-1.4,0.18,-0.42,-0.73,2,1.52,0.26,0.11,-1.27,0.27;F,-0.21,0.98,-0.36,-1.43,0.22,-0.81,0.67,1.1,1.71,-0.44;P,2.06,-0.33,-1.15,-0.75,0.88,-0.45,0.3,-2.3,0.74,-0.28;S,0.81,-1.08,0.16,0.42,-0.21,-0.43,-1.89,-1.15,-0.97,-0.23;T,0.26,-0.7,1.21,0.63,-0.1,0.21,0.24,-1.15,-0.56,0.19;W,0.3,2.1,-0.72,-1.57,-1.16,0.57,-0.48,-0.4,-2.3,-0.6;Y,1.38,1.48,0.8,-0.56,0,-0.68,-0.31,1.03,-0.05,0.53;V,-0.74,-0.71,2.04,-0.4,0.5,-0.81,-1.07,0.06,-0.46,0.65"
    aa_dict = {}
    for record in kidera.split(";"):
        fields = record.split(",")
        # fields[0] is the residue code; everything after it is numeric.
        # float() also normalises bare integers such as "0" and "2".
        aa_dict[fields[0]] = [float(value) for value in fields[1:]]
    return aa_dict
def make_hot():
    """Build the one-hot encoding table for the residues in ``amino_acids``.

    Returns:
        dict: maps each residue code to a list of ``num_aa`` integers that
        is all zeros except for a 1 at the residue's index in
        ``amino_acids``.
    """
    def unit_vector(position):
        # Start from all zeros and switch on the slot owned by this residue.
        vector = [0] * num_aa
        vector[position] = 1
        return vector

    return {
        residue: unit_vector(position)
        for position, residue in enumerate(amino_acids)
    }
def make_hydropathy():
    """Build the per-residue hydropathy feature table.

    The values come from the module-level ``hydropathy_score`` table, which
    the original author describes as the Gibbs free energy of transfer upon
    solvation in water (negative being favorable), citing Kyte & Doolittle,
    J. Mol. Biol. 1982.

    Returns:
        dict: maps each residue code to a single-element list holding its
        hydropathy score, so each residue contributes one feature value.
    """
    table = {}
    for residue, score in hydropathy_score.items():
        # Wrapped in a list so it can be concatenated like the other recipes.
        table[residue] = [score]
    return table
def make_mass():
    """Build the per-residue molar-mass feature table.

    The values come from the module-level ``molar_mass`` table, which the
    original author gives in g/mol, including water.

    Returns:
        dict: maps each residue code to a single-element list holding its
        molar mass, so each residue contributes one feature value.
    """
    table = {}
    for residue, mass in molar_mass.items():
        table[residue] = [mass]
    return table
def make_aromatic():
    """Build the per-residue aromatic-membership feature table.

    Returns:
        dict: maps each residue code in ``amino_acids`` to a single-element
        list holding ``True`` when the residue is listed in ``aromatics``
        and ``False`` otherwise, so each residue contributes one value.
    """
    flags = {}
    for residue in amino_acids:
        is_aromatic = residue in aromatics
        flags[residue] = [is_aromatic]
    return flags
class Extractor():
    """Turn a peptide into a flat list of per-residue features.

    Each "recipe" is a dict mapping a one-letter residue code to a list of
    feature values. For every residue of the peptide, the values of all
    recipes are concatenated in recipe order.
    """

    def __init__(self):
        """Build the recipe tables and sanity-check their size."""
        # The order of this list fixes the feature layout within each residue.
        self.recipes = [
            make_kidera(),
            make_hot(),
            make_hydropathy(),
            make_mass(),
            make_aromatic(),
        ]
        # Internal consistency check: every table must cover num_aa residues.
        for recipe in self.recipes:
            assert len(recipe) == num_aa

    def extract(self, peptide):
        """Return the concatenated feature values for ``peptide``.

        Args:
            peptide: iterable of one-letter residue codes (typically a str).

        Returns:
            list: a new flat list, residue by residue, with each residue
            contributing the values of every recipe in ``self.recipes``
            order. An empty peptide yields an empty list.

        Raises:
            KeyError: if a residue is missing from a recipe table; the
                message names the residue, its position and the peptide.
        """
        features = []
        for position, aa in enumerate(peptide):
            for recipe in self.recipes:
                try:
                    values = recipe[aa]
                except KeyError:
                    # Same exception type as a plain lookup, but with enough
                    # context to locate the offending residue in the input.
                    raise KeyError(
                        "unsupported residue {!r} at position {} of peptide {!r}"
                        .format(aa, position, peptide)
                    )
                # Extend in place rather than building a list of lists first.
                features.extend(values)
        return features