-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgraph_experiments.py
126 lines (86 loc) · 5.14 KB
/
graph_experiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from inputs import *
from create_graph import create_graph,create_igraph_graph,create_graph
from create_subgraph import subgraph_shortest_path
from visualize_subgraph import output_visualization
from evaluation import *
from find_path import get_template_based_paths,template_based_subgraph_output,return_all_simple_paths, get_node_namespace
from graph import KnowledgeGraph
from tqdm import tqdm
def check_for_existance(source_node, target_node, output_dir):
    """Report whether the subgraph CSV for a source/target pair is already on disk.

    The path checked is ``<output_dir>/<source_node>_<target_node>_Subgraph.csv``.

    Returns:
        The string 'true' if that file exists, otherwise the string 'false'.
        These are strings, not booleans; callers compare against 'false'.
    """
    subgraph_csv = output_dir + "/" + source_node + "_" + target_node + "_Subgraph.csv"
    return 'true' if os.path.exists(subgraph_csv) else 'false'
def get_nodes_from_input(input_df, s):
    """Append 'source_label' and 'target_label' columns to ``input_df``.

    For every row of ``input_df`` the label is taken from the first row of
    ``s`` whose 'source' (resp. 'target') value equals that row's 'source'
    (resp. 'target') value.

    Args:
        input_df: DataFrame with 'source' and 'target' columns.
        s: DataFrame with 'source', 'target', 'source_label' and
            'target_label' columns, used as the lookup table.

    Returns:
        ``input_df`` itself when it has no rows; otherwise a new DataFrame
        made of ``input_df`` plus the two label columns.

    Raises:
        IndexError: if a source or target value has no matching row in ``s``.
    """
    #Commented out work for pkl and uniprot, use if need to convert labels??
    if len(input_df) == 0:
        return input_df

    def _first_label(id_column, label_column, node_id):
        # First label in s whose id column matches node_id.
        return s.loc[s[id_column] == node_id, label_column].values[0]

    # Build the labels on input_df's own index: concat(axis=1) aligns on the
    # index, so a fresh 0..n-1 index would misalign a filtered input_df.
    # Every row is labelled, not only the first two.
    labels = pd.DataFrame(
        {
            'source_label': [
                _first_label('source', 'source_label', node_id)
                for node_id in input_df['source']
            ],
            'target_label': [
                _first_label('target', 'target_label', node_id)
                for node_id in input_df['target']
            ],
        },
        index=input_df.index,
    )
    return pd.concat([input_df, labels], axis=1)
# input_df holds the selected nodes we want to search; s is the original mapped file with all source/target/source_label values
def one_path_search(input_df, s, igraph, igraph_nodes, labels_all, edgelist, search_type, triples_file, output_dir, kg_type):
    """Run a shortest-path subgraph search for the first source/target pair.

    File-name parts are derived from the 'source_label' and 'target_label'
    of the first row of ``input_df``. If the matching ``*_Subgraph.csv``
    is already present under ``<output_dir>/<kg_type>_shortest_path``,
    nothing is done. Otherwise the subgraph is computed with
    ``subgraph_shortest_path`` and, when it is non-empty, handed to
    ``output_visualization`` together with that directory.

    ``s`` and ``triples_file`` are accepted but not used by this function.
    Returns None.
    """
    #input_df = get_nodes_from_input(input_df,s)
    first_pair = input_df.iloc[0]

    # Turn the labels into file-name-friendly strings.
    # NOTE(review): assumes only the 'CONTEXTUAL ' strip and the space
    # replacement are pkl-specific, while ':' is replaced for every
    # kg_type -- confirm the intended nesting of these replacements.
    source_name = first_pair.loc['source_label']
    if kg_type == 'pkl':
        source_name = source_name.replace('CONTEXTUAL ', '').replace(' ', '_')
    source_name = source_name.replace(':', '_')
    target_name = first_pair.loc['target_label'].replace(':', '_')

    shortest_path_dir = output_dir + '/' + kg_type + '_shortest_path'

    # Skip pairs whose subgraph file was produced by an earlier run.
    if check_for_existance(source_name, target_name, shortest_path_dir) != 'false':
        return

    subgraph_sp = subgraph_shortest_path(input_df, igraph, igraph_nodes, labels_all, edgelist, False, search_type, kg_type)
    if len(subgraph_sp) > 0:
        output_visualization(input_df, source_name, target_name, subgraph_sp, shortest_path_dir)
def two_path_search(input_df, s, igraph, igraph_nodes, labels_all, edgelist, search_type, triples_file, output_dir, kg_type):
    """Run a shortest-path subgraph search across two consecutive pairs.

    File-name parts come from the first row's 'source_label' and from the
    'target_label' of the first and second rows (joined as
    ``<middle>_<target>``). If the matching ``*_Subgraph.csv`` is already
    present under ``<output_dir>/<kg_type>_shortest_path``, no search is
    run. Otherwise the subgraph is computed with ``subgraph_shortest_path``
    and handed to ``output_visualization`` together with that directory.

    ``s`` and ``triples_file`` are accepted but not used by this function.
    Returns None.

    Raises:
        IndexError: if ``input_df`` has fewer than two rows.
    """
    #input_df = get_nodes_from_input(input_df,s)
    # Define output filenames for s
    # NOTE(review): assumes only the 'CONTEXTUAL ' strip and the space
    # replacement are pkl-specific, while ':' is replaced for every
    # kg_type -- confirm the intended nesting of these replacements.
    source_name = input_df.iloc[0].loc['source_label']
    if kg_type == 'pkl':
        source_name = source_name.replace('CONTEXTUAL ', '')
        source_name = source_name.replace(' ', '_')
    source_name = source_name.replace(':', '_')
    middle_name = input_df.iloc[0].loc['target_label'].replace(' ', '_')
    target_name = input_df.iloc[1].loc['target_label'].replace(' ', '_')
    target_name = middle_name + '_' + target_name

    shortest_path_dir = output_dir + '/' + kg_type + '_shortest_path'

    print("Finding subgraph using user input and 1 shortest path......")
    exists = check_for_existance(source_name, target_name, shortest_path_dir)
    if exists == 'false':
        subgraph_sp = subgraph_shortest_path(input_df, igraph, igraph_nodes, labels_all, edgelist, False, search_type, kg_type)
        print("Outputting CS visualization......")
        # Pass input_df first, matching the other output_visualization call
        # sites in this module; omitting it shifted every argument by one.
        output_visualization(input_df, source_name, target_name, subgraph_sp, shortest_path_dir)
def template_based_search(kg_type, g, search_type, output_dir):
    """Find paths matching the node-type template for ``kg_type`` and output them.

    The template (an ordered list of node categories) is chosen from
    ``kg_type``. Paths are obtained from ``get_template_based_paths`` and
    then passed to ``output_visualization`` and
    ``template_based_subgraph_output``, both using the subfolder
    ``<kg_type>_<search_type>_template_based`` of ``output_dir``.

    Args:
        kg_type: 'pkl' or 'uniprot_kg'.
        g: graph object forwarded to ``get_template_based_paths``.
        search_type: search mode; forwarded to the path search and used in
            the output subfolder name.
        output_dir: base output directory.

    Raises:
        ValueError: if ``kg_type`` has no template defined here.
    """
    if kg_type == 'pkl':
        template = ['microbe','metabolite','gene','protein','process','metabolite','disease']
    elif kg_type == 'uniprot_kg':
        # Alternative template: ['microbe','protein','reaction','chemical','process','disease']
        template = ['microbe','protein','reaction','process','disease']
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("No template defined for kg_type: " + repr(kg_type))

    # The result was previously never assigned (both assignments were
    # commented out), so the calls below always failed with NameError.
    template_based_paths_df = get_template_based_paths(template, kg_type, g, search_type)
    #To run with file previously generated
    #template_file = '~/pkl_uniprot_kg/Outputs_allPairs/uniprot_kg_all_template_based/microbe_protein_reaction_process_disease__Subgraph.csv'
    #template_based_paths_df = pd.read_csv(template_file,sep='|')

    name = '_'.join(template)
    subfolder_name = kg_type + '_' + search_type + '_template_based'
    output_visualization(pd.DataFrame(), name, '', template_based_paths_df, output_dir + '/' + subfolder_name)
    template_based_subgraph_output(output_dir, kg_type, template, template_based_paths_df, subfolder_name)