-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
166 lines (126 loc) · 4.19 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
##
# @author Lukáš Plevač <[email protected]>
# @date 12.5.2023
# Semestral project to BIF on BUT FIT
# Create ancestrals amino acides seqvences for Phylogenetic tree
#
from Bio import Phylo
from Bio import AlignIO
import pandas as pd
DEBUG = False
TREE_FILE = "tree.tre"
MSA_FILE = "msa.fasta"
ANCESTRALS_FILE = "ancestrals.csv"
OUT_DIR = "output"
##
## FILE LOAD PART
##
# first load phylo genetic tree
PhyloTree = Phylo.read(TREE_FILE, "newick")
# second load fasta
FastaAlign = AlignIO.read(MSA_FILE, "fasta")
# third load ancestrals
ancestrals = pd.read_csv(ANCESTRALS_FILE, sep = ",")
# create dict for simple fasta indexing by fasta ID
FastaAlignDict = {}
for fasta in FastaAlign:
FastaAlignDict[fasta.id] = fasta
##
## Check if tree file coresponding with ancestrals
##
## get all nodes from ancestrals
a_nodes = []
for node in ancestrals['node']:
if node not in a_nodes:
a_nodes.append(node)
a_nodes.sort()
## get all nodes from tree
t_nodes = []
for clade in PhyloTree.find_clades():
if clade.confidence is not None:
t_nodes.append(clade.confidence)
t_nodes.sort()
if a_nodes != t_nodes:
print("Error not same nodes in tree and ancestrals files")
print("Nodes in tree file: {}".format(t_nodes))
print("Nodes in ancestrals file: {}".format(a_nodes))
exit(1)
##
## Create acentral seqvence without spaces
## using posterior probability form csv file
##
seq = {}
for node in a_nodes:
seq[node] = {}
ac_node = ancestrals[ancestrals['node'] == node]
ac_node.reset_index()
for _, row in ac_node.iterrows():
amino = row[['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']]
amino = pd.to_numeric(amino, errors ='coerce').fillna(0).astype('float')
seq[node][row['position'] - 1] = amino.idxmax(axis=0)
##
## Space placement part
##
##
# Get all leaves nodes for clade and distance to it
#
# @param clade clade to find all leafs nodes
# @return array of dict of leaf clade and total_branch_len from param clade
def getLeaves(clade):
opend = [{'clade': clade, 'total_len': 0}]
closed = []
while len(opend) > 0:
parrent = opend.pop()
if parrent['clade'].name is not None:
closed.append(parrent)
else:
for kid in parrent['clade']:
opend.append({'clade': kid, 'total_len': parrent['total_len'] + kid.branch_length})
return closed
## Naive space place by leaves but not length
## commented not used
'''
for clade in PhyloTree.find_clades():
if clade.confidence is not None:
leaves = getLeaves(clade)
leaves_seqs = [FastaAlignDict[leaf['clade'].name] for leaf in leaves]
for pos in seq[clade.confidence].keys():
spaces = 0
for leave_seq in leaves_seqs:
if leave_seq[pos] == "-":
spaces += 1
space_prob = spaces / len(leave_seq)
if (space_prob >= 0.5):
seq[clade.confidence][pos] = '-'
'''
## space placement by leaves and count with length of branches
for clade in PhyloTree.find_clades():
if clade.confidence is not None:
leaves = getLeaves(clade)
leaves_seqs = [{'seq': FastaAlignDict[leaf['clade'].name], 'len': leaf['total_len']} for leaf in leaves]
for pos in seq[clade.confidence].keys():
space_prob = 0
len_sum = 0
for leave_seq in leaves_seqs:
len_sum += leave_seq['len']
for leave_seq in leaves_seqs:
if leave_seq['seq'][pos] == "-":
space_prob += leave_seq['len'] / len_sum
if (space_prob >= 0.5):
seq[clade.confidence][pos] = '-'
##
## Save seq to files
##
# first convert seq dict to string
for name, sq in seq.items():
str_seq = ['-'] * len(sq)
for index, amino in sq.items():
str_seq[index] = amino
seq[name] = "".join(str_seq)
if DEBUG:
print("{}: {}".format(name, seq[name]))
# save to files
for name in seq.keys():
text_file = open("{}/node_{}.fas".format(OUT_DIR, name), "wt")
text_file.write(seq[name])
text_file.close()