# Given a starting graph of node pairs, find all paths between them to create a subgraph
from calculate_information_content import drop_low_information_content_nodes
from constants import FILTER_ONTOLOGIES_BY_INFORMATION_CONTENT #, PHEKNOWLATOR_BROAD_NODES_DICT
from find_path import (find_shortest_path, find_shortest_path_pattern,
                       prioritize_path_cs, prioritize_path_pdp,
                       calc_cosine_sim_from_label_list, calc_cosine_sim_from_uri_list,
                       generate_comparison_terms_dict, unique_nodes)
import pandas as pd
from tqdm import tqdm
from evaluation import output_path_lists
from evaluation import output_num_paths_pairs
import os
import glob
import sys
import logging.config
from pythonjsonlogger import jsonlogger
import networkx as nx
import igraph
# logging
log_dir, log, log_config = 'builds/logs', 'cartoomics_log.log', glob.glob('**/logging.ini', recursive=True)
try:
    if not os.path.exists(log_dir): os.mkdir(log_dir)
except FileNotFoundError:
    log_dir, log_config = '../builds/logs', glob.glob('../builds/logging.ini', recursive=True)
    if not os.path.exists(log_dir): os.mkdir(log_dir)
logger = logging.getLogger(__name__)
logging.config.fileConfig(log_config[0], disable_existing_loggers=False, defaults={'log_file': log_dir + '/' + log})

def subgraph_shortest_path(input_nodes_df,graph,weights,search_type,kg_type):
    """Find the shortest path between each source/target label pair and
    concatenate all path triples into a single subgraph dataframe."""
    input_nodes_df.columns = input_nodes_df.columns.str.lower()
    all_paths = []
    for i in range(len(input_nodes_df)):
        start_node = input_nodes_df.iloc[i].loc['source_label']
        end_node = input_nodes_df.iloc[i].loc['target_label']
        shortest_path_df = find_shortest_path(start_node,end_node,graph,weights,search_type,kg_type,input_nodes_df)
        all_paths.append(shortest_path_df)
    df = pd.concat(all_paths)
    df.reset_index(drop=True, inplace=True)
    #Remove duplicate edges
    df = df.drop_duplicates(subset=['S','P','O'])
    return df

def subgraph_shortest_path_pattern(input_nodes_df,graph,weights,search_type,kg_type,manually_chosen_uris):
    """Find shortest paths matching a metapath pattern for each source/target
    pair; returns the subgraph triples and the updated manually chosen URIs."""
    input_nodes_df.columns = input_nodes_df.columns.str.lower()
    all_paths = []
    for i in range(len(input_nodes_df)):
        start_node = input_nodes_df.iloc[i].loc['source']
        end_node = input_nodes_df.iloc[i].loc['target']
        shortest_path_df,manually_chosen_uris = find_shortest_path_pattern(start_node,end_node,graph,weights,search_type,kg_type,input_nodes_df,manually_chosen_uris)
        if len(shortest_path_df) > 0:
            all_paths.append(shortest_path_df)
    if len(all_paths) > 0:
        df = pd.concat(all_paths)
        df.reset_index(drop=True, inplace=True)
        #Remove duplicate edges
        df = df.drop_duplicates(subset=['S','P','O'])
    else:
        df = pd.DataFrame()
    return df,manually_chosen_uris

# Have the user choose edges to upweight (penalize) so the path search avoids them - igraph type only
def user_defined_edge_weights(graph,triples_df,kg_type):
    to_weight = []
    if kg_type == 'pkl':
        edges = graph.labels_all[graph.labels_all['entity_type'] == 'RELATIONS'].label.tolist()
        print("### Unique Edges in Knowledge Graph ###")
        print('\n'.join(edges))
        still_adding = True
        print('\n')
        print('Input the edges to avoid in the path search (if possible). When finished input "Done".')
        while still_adding:
            user_input = input('Edge or "Done": ')
            if user_input == 'Done':
                still_adding = False
            else:
                to_weight.append(user_input)
        to_weight = graph.labels_all[graph.labels_all['label'].isin(to_weight)]['entity_uri'].tolist()
    if kg_type == 'kg-covid19':
        edges = set(graph.graph_object.es['predicate'])
        print("### Unique Edges in Knowledge Graph ###")
        print('\n'.join(edges))
        still_adding = True
        print('\n')
        print('Input the edges to avoid in the path search (if possible). When finished input "Done".')
        while still_adding:
            user_input = input('Edge or "Done": ')
            if user_input == 'Done':
                still_adding = False
            else:
                to_weight.append(user_input)
    edges = graph.graph_object.es['predicate']
    # Penalize the chosen edges so a weighted search routes around them when possible
    graph.graph_object.es['weight'] = [10 if x in to_weight else 1 for x in edges]
    return graph
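
# A toy sketch (not part of the pipeline) of why the 10-vs-1 weighting above
# steers igraph's weighted shortest-path search away from penalized predicates;
# the predicate names here are hypothetical:
def _edge_weight_demo():
    g = igraph.Graph([(0, 1), (1, 2), (0, 2)])
    g.es['predicate'] = ['interacts_with', 'interacts_with', 'subClassOf']
    # Penalize 'subClassOf' exactly as user_defined_edge_weights does
    g.es['weight'] = [10 if p == 'subClassOf' else 1 for p in g.es['predicate']]
    # The direct 0->2 edge costs 10, so the search detours through vertex 1 (total cost 2)
    return g.get_shortest_paths(0, 2, weights='weight')  # [[0, 1, 2]]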

# Have the user choose edges to delete from the graph entirely - igraph type only
def user_defined_edge_exclusion(graph,kg_type):
    to_drop = []
    if kg_type == 'pkl':
        edges = graph.labels_all[graph.labels_all['entity_type'] == 'RELATIONS'].label.tolist()
        print("### Unique Edges in Knowledge Graph ###")
        print('\n'.join(edges))
        still_adding = True
        print('\n')
        print('Input the edges to remove from the path search. When finished input "Done".')
        while still_adding:
            user_input = input('Edge or "Done": ')
            if user_input == 'Done':
                still_adding = False
            else:
                to_drop.append(user_input)
        to_drop = graph.labels_all[graph.labels_all['label'].isin(to_drop)]['entity_uri'].tolist()
    if kg_type == 'kg-covid19':
        edges = set(graph.graph_object.es['predicate'])
        print("### Unique Edges in Knowledge Graph ###")
        print('\n'.join(edges))
        still_adding = True
        print('\n')
        print('Input the edges to remove from the path search. When finished input "Done".')
        while still_adding:
            user_input = input('Edge or "Done": ')
            if user_input == 'Done':
                still_adding = False
            else:
                to_drop.append(user_input)
    for edge in to_drop:
        graph.graph_object.delete_edges(graph.graph_object.es.select(predicate = edge))
    return graph

# Edges to remove
def automatic_defined_edge_exclusion(graph,kg_type):
    if kg_type == 'pkl':
        to_drop = ['http://purl.obolibrary.org/obo/RO_0002160',
                   'http://purl.obolibrary.org/obo/BFO_0000050',
                   'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
                   'http://purl.obolibrary.org/obo/RO_0001025',
                   'http://purl.obolibrary.org/obo/RO_0000087']
    else:
        to_drop = ['biolink:category','biolink:in_taxon']
    for edge in to_drop:
        # Remove from graph object
        if isinstance(graph.graph_object, igraph.Graph):
            graph.graph_object.delete_edges(graph.graph_object.es.select(predicate = edge))
        if isinstance(graph.graph_object, nx.Graph):
            edges_to_delete = [(source, target) for source, target, data in graph.graph_object.edges(data=True) if data.get("predicate") == edge]
            graph.graph_object.remove_edges_from(edges_to_delete)
        # Remove from df
        graph.edgelist = graph.edgelist[graph.edgelist["predicate"] != edge]
    return graph
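
# A toy check (not part of the pipeline) that predicate-based deletion removes
# only matching edges; assumes a 'predicate' edge attribute as the KG objects
# here carry:
def _edge_exclusion_demo():
    g = igraph.Graph([(0, 1), (1, 2)])
    g.es['predicate'] = ['biolink:in_taxon', 'biolink:interacts_with']
    g.delete_edges(g.es.select(predicate='biolink:in_taxon'))
    return g.ecount()  # 1 - only the non-excluded edge remains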

# Nodes to remove
def automatic_defined_node_exclusion(graph,kg_type,output_dir,threshold = 0.3):
    to_drop = []
    if kg_type == 'pkl':
        # To drop manually defined nodes
        #to_drop = list(PHEKNOWLATOR_BROAD_NODES_DICT.values())
        # To drop nodes based on Information Content
        for ont in tqdm(FILTER_ONTOLOGIES_BY_INFORMATION_CONTENT):
            to_drop = drop_low_information_content_nodes(to_drop,ont,output_dir,threshold)
    print("Removing nodes from KG")
    # Remove from graph object
    if isinstance(graph.graph_object, igraph.Graph):
        for uri in tqdm(to_drop):
            # Get the indices of vertices with the corresponding label
            indices_to_delete = [v.index for v in graph.graph_object.vs if v["name"] == uri]
            # Remove the vertices by their indices
            try:
                graph.graph_object.delete_vertices(indices_to_delete)
            except KeyError:
                print('Specified node to be removed does not exist. Update PHEKNOWLATOR_BROAD_NODES_DICT in constants.py.')
                logging.error('Specified node to be removed does not exist. Update PHEKNOWLATOR_BROAD_NODES_DICT in constants.py.')
                sys.exit(1)
    if isinstance(graph.graph_object, nx.Graph):
        # Remove nodes by their uri
        try:
            graph.graph_object.remove_nodes_from(to_drop)
        except KeyError:
            print('Specified node to be removed does not exist. Update PHEKNOWLATOR_BROAD_NODES_DICT in constants.py.')
            logging.error('Specified node to be removed does not exist. Update PHEKNOWLATOR_BROAD_NODES_DICT in constants.py.')
            sys.exit(1)
    # Remove from dfs
    graph.labels_all = graph.labels_all[~graph.labels_all["entity_uri"].isin(to_drop)]
    graph.edgelist = graph.edgelist[~(graph.edgelist["subject"].isin(to_drop) | graph.edgelist["object"].isin(to_drop))]
    # Report the remaining node count (guarded: nx.number_of_nodes fails on an igraph object)
    if isinstance(graph.graph_object, igraph.Graph):
        print(graph.graph_object.vcount())
    if isinstance(graph.graph_object, nx.Graph):
        print(nx.number_of_nodes(graph.graph_object))
    return graph
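
# A toy check (not part of the pipeline) of the name-based vertex removal used
# above; the URIs are hypothetical:
def _node_exclusion_demo():
    g = igraph.Graph()
    g.add_vertices(['obo:GO_0000001', 'obo:GO_0000002'])
    g.delete_vertices([v.index for v in g.vs if v['name'] == 'obo:GO_0000001'])
    return g.vs['name']  # ['obo:GO_0000002']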

def subgraph_prioritized_path_cs(input_nodes_df,graph,weights,search_type,triples_file,output_dir,input_dir,embedding_dimensions,kg_type,find_graph_similarity = False,existing_path_nodes = 'none'):
    """For each source/target pair, prioritize among all shortest paths using
    cosine similarity of node embeddings and collect the chosen paths' triples
    into a subgraph."""
    input_nodes_df.columns = input_nodes_df.columns.str.lower()
    all_paths = []
    num_paths_df = pd.DataFrame(columns = ['source_node','target_node','num_paths'])
    #List of all chosen paths for subgraph
    #all_chosen_path_nodes = []
    #Dict of all shortest paths for subgraph
    all_path_nodes = {}
    for i in tqdm(range(len(input_nodes_df))):
        df_paths = pd.DataFrame()
        start_node = input_nodes_df.iloc[i].loc['source_label']
        end_node = input_nodes_df.iloc[i].loc['target_label']
        if existing_path_nodes != 'none':
            pair_path_nodes = existing_path_nodes[start_node + end_node]
        else:
            pair_path_nodes = 'none'
        node_pair = input_nodes_df.iloc[i]
        path_nodes,cs_shortest_path_df,all_paths_cs_values,chosen_path_nodes_cs = prioritize_path_cs(input_nodes_df,node_pair,graph,weights,search_type,triples_file,input_dir,embedding_dimensions,kg_type,pair_path_nodes)
        all_paths.append(cs_shortest_path_df)
        df_paths['source_node'] = [start_node]
        df_paths['target_node'] = [end_node]
        df_paths['num_paths'] = [len(path_nodes)]
        num_paths_df = pd.concat([num_paths_df,df_paths],axis=0)
        #Output path list to file where the index will match the pair # in the _Input_Nodes_.csv
        #Get sum of all cosine values in value_list
        path_list = list(map(sum, all_paths_cs_values))
        output_path_lists(output_dir,path_list,'CosineSimilarity',i)
        #all_chosen_path_nodes.append(chosen_path_nodes_cs)
        all_path_nodes[start_node + end_node] = path_nodes
    df = pd.concat(all_paths)
    df.reset_index(drop=True, inplace=True)
    #Remove duplicate edges
    df = df.drop_duplicates(subset=['S_ID','P_ID','O_ID','S','P','O'])
    output_num_paths_pairs(output_dir,num_paths_df,'CosineSimilarity')
    return df,all_paths_cs_values,all_path_nodes #all_chosen_path_nodes

def subgraph_prioritized_path_pdp(input_nodes_df,graph,weights,search_type,pdp_weight,output_dir,kg_type,existing_path_nodes = 'none'):
    """For each source/target pair, prioritize among all shortest paths using
    the path-degree product (PDP) and collect the chosen paths' triples into a
    subgraph."""
    input_nodes_df.columns = input_nodes_df.columns.str.lower()
    all_paths = []
    num_paths_df = pd.DataFrame(columns = ['source_node','target_node','num_paths'])
    #List of all chosen paths for subgraph
    #all_chosen_path_nodes = []
    #Dict of all shortest paths for subgraph
    all_path_nodes = {}
    for i in tqdm(range(len(input_nodes_df))):
        df_paths = pd.DataFrame()
        start_node = input_nodes_df.iloc[i].loc['source_label']
        end_node = input_nodes_df.iloc[i].loc['target_label']
        if existing_path_nodes != 'none':
            pair_path_nodes = existing_path_nodes[start_node + end_node]
        else:
            pair_path_nodes = 'none'
        node_pair = input_nodes_df.iloc[i]
        path_nodes,pdp_shortest_path_df,paths_pdp,chosen_path_nodes_pdp = prioritize_path_pdp(input_nodes_df,node_pair,graph,weights,search_type,pdp_weight,kg_type,pair_path_nodes)
        all_paths.append(pdp_shortest_path_df)
        df_paths['source_node'] = [start_node]
        df_paths['target_node'] = [end_node]
        df_paths['num_paths'] = [len(path_nodes)]
        num_paths_df = pd.concat([num_paths_df,df_paths],axis=0)
        #Output path list to file where the index will match the pair # in the _Input_Nodes_.csv
        output_path_lists(output_dir,paths_pdp,'PDP',i)
        #all_chosen_path_nodes.append(chosen_path_nodes_pdp)
        all_path_nodes[start_node + end_node] = path_nodes
    df = pd.concat(all_paths)
    df.reset_index(drop=True, inplace=True)
    #Remove duplicate edges
    df = df.drop_duplicates(subset=['S_ID','P_ID','O_ID','S','P','O'])
    output_num_paths_pairs(output_dir,num_paths_df,'PDP')
    return df,paths_pdp,all_path_nodes #all_chosen_path_nodes

def subgraph_prioritized_path_guiding_term(input_nodes_df,term_row,graph,g_nodes,labels_all,triples_df,weights,search_type,triples_file,output_dir,input_dir,embedding_dimensions,kg_type,existing_path_nodes = 'none'):
    """Prioritize paths for each pair by cosine similarity to a single guiding
    term, writing per-pair outputs into a folder named for that term."""
    input_nodes_df.columns = input_nodes_df.columns.str.lower()
    all_paths = []
    num_paths_df = pd.DataFrame(columns = ['source_node','target_node','num_paths'])
    term_foldername = 'Guiding_Term_'+term_row['term_label'].replace(" ","_").replace(".","_").replace(":","_").replace("'",'')
    for i in tqdm(range(len(input_nodes_df))):
        df_paths = pd.DataFrame()
        start_node = input_nodes_df.iloc[i].loc['source_label']
        end_node = input_nodes_df.iloc[i].loc['target_label']
        if existing_path_nodes != 'none':
            pair_path_nodes = existing_path_nodes[start_node + end_node]
        else:
            pair_path_nodes = 'none'
        node_pair = input_nodes_df.iloc[i]
        path_nodes,cs_shortest_path_df,all_paths_cs_values,chosen_path_nodes_cs = prioritize_path_cs(input_nodes_df,node_pair,graph,weights,search_type,triples_file,input_dir,embedding_dimensions,kg_type,pair_path_nodes,term_row)
        all_paths.append(cs_shortest_path_df)
        df_paths['source_node'] = [start_node]
        df_paths['target_node'] = [end_node]
        df_paths['num_paths'] = [len(path_nodes)]
        num_paths_df = pd.concat([num_paths_df,df_paths],axis=0)
        #Output path list to file where the index will match the pair # in the _Input_Nodes_.csv
        #Get sum of all cosine values in value_list
        path_list = list(map(sum, all_paths_cs_values))
        output_path_lists(output_dir,path_list,term_foldername,i)
    df = pd.concat(all_paths)
    df.reset_index(drop=True, inplace=True)
    #Remove duplicate edges
    df = df.drop_duplicates(subset=['S_ID','P_ID','O_ID','S','P','O'])
    output_num_paths_pairs(output_dir,num_paths_df,term_foldername)
    return df,all_paths_cs_values,term_foldername

def get_cosine_sim_one_pathway(g,comparison_terms_df,kg_type,embeddings,algorithm,emb,entity_map,wikipathway,subgraph_nodes,annotated_nodes,all_subgraphs_cosine_sim,node_type,compared_pathway):
    #For each guiding term, calculate cosine similarity to all nodes in the subgraph
    for t in range(len(comparison_terms_df)):
        term_row = comparison_terms_df.iloc[t]
        if node_type == 'labels':
            avg_cosine_sim,embeddings = calc_cosine_sim_from_label_list(emb,entity_map,subgraph_nodes,annotated_nodes,g.labels_all,kg_type,embeddings,term_row)
        elif node_type == 'uris':
            avg_cosine_sim,embeddings = calc_cosine_sim_from_uri_list(emb,entity_map,subgraph_nodes,g.labels_all,kg_type,embeddings,term_row)
        #Organize all path cosine similarity values into a dictionary per term
        all_subgraphs_cosine_sim = generate_comparison_terms_dict(all_subgraphs_cosine_sim,term_row,avg_cosine_sim,algorithm,wikipathway,compared_pathway)
    return all_subgraphs_cosine_sim,embeddings

def compare_subgraph_guiding_terms(s,subgraph_df,g,comparison_terms,kg_type,embeddings,algorithm,emb,entity_map,wikipathway,all_subgraphs_cosine_sim,node_type):
    #Get all unique nodes from the subgraph edges
    subgraph_nodes = unique_nodes(subgraph_df[['S_ID','O_ID']])
    #input_nodes = unique_nodes(s[['source_id','target_id']])
    #If comparing to intermediate terms only in subgraph
    #intermediate_nodes = [i for i in subgraph_nodes if i not in input_nodes]
    #When passed only the terms of that wikipathway abstract
    if isinstance(comparison_terms,pd.DataFrame):
        all_subgraphs_cosine_sim,embeddings = get_cosine_sim_one_pathway(g,comparison_terms,kg_type,embeddings,algorithm,emb,entity_map,wikipathway,subgraph_nodes,s,all_subgraphs_cosine_sim,node_type,wikipathway)
    #When passed the terms of all wikipathway abstracts as a dictionary
    elif isinstance(comparison_terms,dict):
        for w in comparison_terms.keys():
            w_comparison_terms_df = comparison_terms[w]
            all_subgraphs_cosine_sim,embeddings = get_cosine_sim_one_pathway(g,w_comparison_terms_df,kg_type,embeddings,algorithm,emb,entity_map,wikipathway,subgraph_nodes,s,all_subgraphs_cosine_sim,node_type,w)
    return all_subgraphs_cosine_sim,embeddings

def get_wikipathways_subgraph(annotated_wikipathways_subgraph_df):
    """Reduce an annotated wikipathways edge table to S_ID/O_ID columns for comparison."""
    wikipathways_subgraph_df = annotated_wikipathways_subgraph_df[['source_id', 'target_id']]
    wikipathways_subgraph_df = wikipathways_subgraph_df.rename(columns={'source_id' : 'S_ID', 'target_id': 'O_ID'})
    return wikipathways_subgraph_df
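
# A minimal usage sketch. The knowledge-graph wrapper is built elsewhere in the
# pipeline and must expose .graph_object, .labels_all, and .edgelist as the
# functions above assume; the node pair below is hypothetical.
if __name__ == '__main__':
    example_pairs = pd.DataFrame({'source_label': ['glucose'],
                                  'target_label': ['insulin receptor']})
    # graph = ...  # load the KG wrapper (e.g. a PheKnowLator build) first
    # graph = automatic_defined_edge_exclusion(graph, kg_type='pkl')
    # subgraph_df = subgraph_shortest_path(example_pairs, graph, weights=False,
    #                                      search_type='all', kg_type='pkl')
    # print(subgraph_df)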