# assign_nodes.py
import pandas as pd
import re
import os
import sys
import glob
import logging.config
from pythonjsonlogger import jsonlogger
import itertools
import requests
import csv
from itertools import combinations
from constants import (
WIKIPATHWAYS_SUBFOLDER,
WIKIPATHWAYS_METADATA_FILESTRING,
WIKIPATHWAYS_UNKNOWN_PREFIXES,
PKL_PREFIXES,
NODE_PREFIX_MAPPINGS,
NODE_NORMALIZER_URL,
PKL_GENE_URI,
PKL_OBO_URI
)
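# Overview: this module maps free-text node names from pathway diagrams to
# knowledge graph identifiers, either automatically (exact and synonym label
# matches, WikiPathways metadata, the Node Normalizer API) or through an
# interactive search prompt.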
# logging
log_dir, log, log_config = 'builds/logs', 'cartoomics_log.log', glob.glob('**/logging.ini', recursive=True)
try:
if not os.path.exists(log_dir): os.mkdir(log_dir)
except FileNotFoundError:
log_dir, log_config = '../builds/logs', glob.glob('../builds/logging.ini', recursive=True)
if not os.path.exists(log_dir): os.mkdir(log_dir)
logger = logging.getLogger(__name__)
logging.config.fileConfig(log_config[0], disable_existing_loggers=False, defaults={'log_file': log_dir + '/' + log})
# display all rows when printing candidate matches
pd.set_option('display.max_rows', None)
#Added here to avoid circular import
def get_label(labels,value,kg_type):
    if kg_type == 'pkl':
        label = labels.loc[labels['entity_uri'] == value,'label'].values[0]
    elif kg_type == 'kg-covid19':
        label = labels.loc[labels['id'] == value,'label'].values[0]
    else:
        raise ValueError('Unsupported kg_type: ' + str(kg_type))
    return label
# Read in the user example file and output as a pandas dataframe
def read_user_input(user_example_file, guiding_term = False):
if guiding_term:
examples = pd.read_csv(user_example_file)
if len(examples.columns) > 1:
print('Error in format of ' + user_example_file + ', ensure that only "term" column exists in each row.')
logging.error('Error in format of %s, ensure that only "term" column exists in each row.',user_example_file)
sys.exit(1)
elif examples.columns[0] != "term":
print('Error in format of ' + user_example_file + ', ensure that only "term" column exists in each row.')
logging.error('Error in format of %s, ensure that only "term" column exists in each row.',user_example_file)
sys.exit(1)
return examples
elif not guiding_term:
try:
examples = pd.read_csv(user_example_file, sep= "|")
#Check for poorly formatted file
except pd.errors.ParserError:
print('Error in format of ' + user_example_file + ', ensure that only "source" and "target" columns exist in each row.')
logging.error('Error in format of %s, ensure that only "source" and "target" columns exist in each row.',user_example_file)
sys.exit(1)
#Check for extra columns or blank values or absence of source/target columns
if (len(examples.columns) != 2) | (examples.isna().values.any()) | (len([item for item in examples.columns if item not in ['source','target']]) > 0):
print('Error in format of ' + user_example_file + ', ensure that only "source" and "target" columns exist in each row.')
logging.error('Error in format of %s, ensure that only "source" and "target" columns exist in each row.',user_example_file)
sys.exit(1)
return(examples)
def read_ocr_input(user_input_file):
df = pd.read_csv(user_input_file, sep = "\t")
if "genes" in user_input_file:
df = df.loc[df["organism_name"] == "Homo sapiens"]
return(df)
# Get list of unique nodes
# Inputs: examples pandas dataframe of user input examples.
# Outputs: nodes set of unique nodes
def unique_nodes(examples):
# get unique node values
nodes = list(set(pd.melt(examples)["value"].values))
return(nodes)
# Search through labels to find nodes based on input feature
# Inputs: node string for user input example.
# kg knowledge graph of class Knowledge Graph
# ontology specific ontology to restrict search of nodes
def find_node(node, kg, ontology = ""):
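    #Search cascade: exact label/uri match, then exact "<label> (human)" gene
    #match, then exact synonym match, with fuzzy substring match as the fallback.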
no_match = True
### Check for exact matches and return only those
matches,exact_match,no_match = exact_match_identification(kg.labels_all,node)
#Do not continue if match found
if not no_match:
return matches,exact_match
### For genes check for exact matches by appending (human)
matches,exact_match,no_match = exact_gene_match_identification(kg.labels_all,node)
#Do not continue if match found
if not no_match:
return matches,exact_match
### Check for exact matches in synonym and return only those
matches,exact_match,no_match = exact_synonym_match_identification(kg.labels_all,node)
#Do not continue if match found
if not no_match:
return matches,exact_match
    ### Fuzzy match as the final fallback (always returns a result set, possibly empty)
    matches,exact_match,no_match = fuzzy_match_identification(kg.labels_all,node)
    return matches,exact_match
def exact_match_identification(nodes,node):
### Check for exact matches and return only those
exact_matches = nodes[(nodes["label"].str.lower() == node.lower())|(nodes["entity_uri"].str.lower() == node.lower())][["label", "entity_uri"]]
if len(exact_matches) == 1:
#Return node label if exact match is identified
node_label = exact_matches.iloc[0][["label"]].values[0]
exact_match = True
no_match = False
return node_label,exact_match,no_match
#Return full df of exact matches if more than 1
if len(exact_matches) > 1:
exact_match = False
no_match = False
return exact_matches,exact_match,no_match
#Return flag if no exact matches
else:
no_match = True
return "","",no_match
def exact_gene_match_identification(nodes,node):
### For genes check for exact matches by appending (human)
human_gene_match = node.lower() + " (human)"
exact_gene_matches = nodes[(nodes["label"].str.lower() == human_gene_match)][["label", "entity_uri"]]
if len(exact_gene_matches) == 1:
#Return node label if exact match is identified
node_label = exact_gene_matches.iloc[0][["label"]].values[0]
exact_gene_match = True
no_match = False
return node_label,exact_gene_match,no_match
if len(exact_gene_matches) > 1:
exact_gene_match = False
no_match = False
return exact_gene_matches,exact_gene_match,no_match
#Return flag if no exact matches
else:
no_match = True
return "","",no_match
def exact_synonym_match_identification(nodes,node):
    ### Check for exact matches in synonym and return only those; treat the node as a literal string, not a regex
    #case=False gives a case-insensitive literal match; re flags are not applied when regex=False
    exact_synonym_matches = nodes[nodes["synonym"].str.contains(node,case=False,na=False,regex=False)][["label", "entity_uri", "synonym"]]
if len(exact_synonym_matches) > 0:
for i in range(len(exact_synonym_matches)):
synonym_list = exact_synonym_matches.iloc[i].loc["synonym"].split("|")
synonym_match = [i for i in synonym_list if i.lower() == node.lower()]
if len(synonym_match) == 1:
node_label = exact_synonym_matches.iloc[i][["label"]].values[0]
exact_match = True
no_match = False
return node_label,exact_match,no_match
if len(exact_synonym_matches) > 0:
exact_match = False
no_match = False
return exact_synonym_matches,exact_match,no_match
#Return flag if no exact matches
else:
no_match = True
return "","",no_match
def fuzzy_match_identification(nodes,node):
    #case=False gives a case-insensitive literal match; re flags are not applied when regex=False
    text_match = (nodes["label"].str.contains(node,case=False,na=False,regex=False)
                  | nodes["synonym"].str.contains(node,case=False,na=False,regex=False)
                  | nodes["description/definition"].str.contains(node,case=False,na=False,regex=False))
    ### All caps input is probably a gene or protein, so restrict the match to gene/PR/GO identifiers
    if node.isupper(): #likely a gene or protein
        #"gene|PR|GO" is a regex alternation and needs regex=True to match
        results = nodes[text_match & nodes["entity_uri"].str.contains("gene|PR|GO",case=False,na=False,regex=True)][["label", "entity_uri"]]
    else:
        results = nodes[text_match][["label", "entity_uri"]]
# sort results by ontology
results = results.sort_values(['entity_uri'])
no_match = False
exact_match = False
return results, exact_match, no_match
def map_input_to_nodes(node,kg,enable_skipping):
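    #Repeat the search until at least one candidate (or an exact match) is found,
    #prompting the user for a new search term when nothing is returned.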
search_loop = True
exact_match = False
while(search_loop):
#print("User Search Node: ", node)
found_nodes,exact_match = find_node(node,kg)
#Handle when node label is returned for exact match, which will be a string not a df
if isinstance(found_nodes, str):
search_loop = False
nrow = 1
logging.info('Found exact match in KG')
#Do not require more user input when enable skipping true
elif enable_skipping:
nrow = found_nodes.shape[0]
#print("No exact search terms returned, skipping enabled")
logging.info("No exact search terms returned, skipping enabled: %s",node)
return found_nodes,nrow,exact_match
else:
nrow = found_nodes.shape[0]
if nrow == 0:
print("No search terms returned")
node = input("Please try another input term: ")
else:
search_loop = False
#print("Found", nrow, "features in KG")
logging.info('Found %s features in KG',nrow)
return found_nodes,nrow,exact_match
def manage_user_input(found_nodes,user_input,kg,exact_match):
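    #Resolve the chosen label to a single entity_uri, asking the user to pick an
    #ID when several entities share the same label.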
#Only continue search if node_label match not found according to exact_match flag
if exact_match:
node_label = found_nodes
user_id_input = kg.labels_all.loc[kg.labels_all['label'] == node_label]['entity_uri'].values[0]
bad_input = False
else:
user_id_input = 'none'
if node_in_search(found_nodes,user_input):
#Manage if there are 2 duplicate label names
if len(found_nodes[found_nodes['label'] == user_input][['label','entity_uri']]) > 1:
dup_node = True
logging.info('Duplicate label names found: %s',user_input)
while(dup_node):
#Get all uris with the duplicate labels
l = found_nodes[found_nodes['label'] == user_input]['entity_uri'].values.tolist()
print('Select from the following options: ')
#Show options as numeric, ask for numeric return
for i in range(len(l)):
print(str(i+1),': ',l[i])
option_input = input("Input option #: ")
                    try:
                        option_index = int(option_input) - 1
                        if option_index not in range(len(l)):
                            print("Option # out of range.... try again")
                            continue
                        #Return the actual uri, not the numeric entry
                        user_id_input = l[option_index]
                    except ValueError:
                        print("Option # must be numeric.... try again")
                        continue
#Convert uri back to label and store as node_label
if user_id_input in found_nodes[found_nodes['label'] == user_input]['entity_uri'].values.tolist():
node_label = kg.labels_all.loc[kg.labels_all['entity_uri'] == user_id_input,'label'].values[0]
bad_input = False
dup_node = False
logging.info('ID chosen: %s',user_id_input)
else:
print("Input id does not correspond with selected label.... try again")
logging.info('Input id does not correspond with selected label: %s',user_id_input)
else:
node_label = user_input
bad_input = False
dup_node = False
elif node_in_labels(kg,user_input):
            node_label = user_input
            #Still return the node ID even though an exact label was given
            user_id_input = kg.labels_all.loc[kg.labels_all['label'] == user_input,'entity_uri'].values[0]
bad_input = False
else:
print("Input not in search results.... try again")
logging.info('Input not in search results: %s',user_input)
node_label = ""
bad_input = True
return node_label,bad_input,user_id_input
def search_node(node, kg, examples, enable_skipping, guiding_term = False):
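    #Interactive search for one node: pages through candidate matches
    #vals_per_page at a time ('f' forward, 'b' back) or re-searches with 'u'.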
vals_per_page = 20
skip_node = False
logging.info('Searching for node: %s',node)
bad_input = True
found_nodes,nrow,exact_match = map_input_to_nodes(node,kg,enable_skipping)
if exact_match:
node_label,bad_input,id_given = manage_user_input(found_nodes,found_nodes,kg,exact_match)
elif not exact_match and enable_skipping:
skip_node = True
node_label = ''
id_given = ''
else:
i = 1
while(bad_input):
high = min(nrow,(i)*vals_per_page)
print(found_nodes.iloc[(i-1)*vals_per_page:high,].to_string())
user_input = input("Input node 'label' or 'f' for the next " + str(vals_per_page) + " features, 'b' for the previous " + str(vals_per_page) + ", or 'u' to update the node search term: ")
if user_input == 'f':
                if nrow > i * vals_per_page:
i+=1
elif user_input == 'b':
if i > 1:
i-=1
elif user_input == 'u':
#Will replace the original node label in examples file with new one
examples = examples.replace([node],'REPLACE')
node = input("Input new node search term: ")
logging.info('Input new node search term: %s.',node)
examples = examples.replace(['REPLACE'],node)
found_nodes,nrow,exact_match = map_input_to_nodes(node,kg,enable_skipping)
i = 1
else:
node_label,bad_input,id_given = manage_user_input(found_nodes,user_input,kg,exact_match)
logging.info('Node label chosen for %s: %r',node,node_label)
if exact_match:
#Update node label to be identical to given node
node_label = node
logging.info('Exact match found, using original label %s',node_label)
return node_label,id_given,examples,skip_node
def create_annotated_df(examples,node,node_label,id_given,guiding_term):
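    #Write the chosen label and identifier into every row where the node appears,
    #either as a guiding term or as an edge source/target.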
if guiding_term:
examples.loc[examples["term"] == node,"term_label"] = node_label
examples.loc[examples["term"] == node,"term_id"] = id_given
else:
examples.loc[examples["source"] == node,"source_label"] = node_label
examples.loc[examples["target"] == node,"target_label"] = node_label
examples.loc[examples["source"] == node,"source_id"] = id_given
examples.loc[examples["target"] == node,"target_id"] = id_given
return examples
# Check if search input is in the list of integer_ids
def node_in_search(found_nodes, user_input):
if user_input in found_nodes[["label"]].values:
return(True)
else:
return(False)
# Check if search input is in the list of integer_ids
def node_id_in_search(found_nodes, user_input):
if user_input in found_nodes[["entity_uri"]].values:
return(True)
else:
return(False)
# Check if search input is in the all nodes
def node_in_labels(kg, user_input):
labels = kg.labels_all
if user_input in labels[["label"]].values:
return(True)
else:
return(False)
#examples is a dataframe with source,target headers, written | delimited
def create_input_file(examples,output_dir,input_type):
input_file = output_dir+"/_" + input_type + "_Input_Nodes_.csv"
logging.info('Input file created: %s',input_file)
examples.to_csv(input_file, sep = "|", index = False)
#Takes in a list of skipped nodes for a given pathway
def create_skipped_node_file(skipped_nodes,output_dir,filename=''):
#Only output file if nodes are skipped
if len(skipped_nodes) > 0:
skipped_node_file = output_dir+"/" + filename + "skipped_nodes.csv"
logging.info('Skipped file created: %s',skipped_node_file)
        #Overwrite any previous skipped nodes file; newline='' is the csv module convention
        with open(skipped_node_file,"w",newline='') as e:
            writer = csv.writer(e, delimiter="\t")
            for n in set(skipped_nodes):
                writer.writerow([n])
# Check if the input_nodes file already exists
def check_input_existence(output_dir,input_type):
    exists = False
    mapped_file = ''
    for fname in os.listdir(output_dir):
        if bool(re.match("_" + input_type + "_Input_Nodes_.csv",fname)):
            exists = True
            mapped_file = fname
return exists,mapped_file
def get_wikipathway_id(node,wikipathway_input_folder,kg_type):
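    #Look the node up in the WikiPathways metadata file and build a curie from its
    #database prefix and ID; fall back to manual indexing for unknown node types.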
for fname in os.listdir(wikipathway_input_folder):
if WIKIPATHWAYS_METADATA_FILESTRING in fname:
df = pd.read_csv(wikipathway_input_folder + "/" + fname,sep=',')
try:
database = df.loc[df['textlabel'] == node]['database'].values[0]
database_id = df.loc[df['textlabel'] == node]['databaseID'].values[0]
if database not in WIKIPATHWAYS_UNKNOWN_PREFIXES:
                    #Chebi nodes have the full curie given as the ID
if database.lower() == 'chebi' and 'chebi' in database_id.lower():
database_id = database_id.split(':')[1]
prefix = NODE_PREFIX_MAPPINGS[database]
#Create node curie from prefix and id, using string of database_id when column is read in as int (when all gene IDs)
node_curie = prefix + ":" + str(database_id)
                    if kg_type == 'kg-covid19':
return node_curie
if kg_type == 'pkl':
normalized_node = normalize_node_api(node_curie)
normalized_node_uri = convert_to_uri(normalized_node)
return normalized_node_uri
#Handle case where node type is unknown
else:
logging.info('Input node type: %s, indexing manually',database)
return node
#Handle case where node is not in metadata file
except IndexError:
logging.info('Input node not in database metadata file: %s, indexing manually',wikipathway_input_folder + "/" + fname)
return node
def convert_to_uri(curie):
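    #Illustrative mapping (assuming the PKL URI constants): "CHEBI:15377" becomes
    #PKL_OBO_URI + "CHEBI_15377", while a gene curie becomes PKL_GENE_URI + its bare ID.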
if 'gene' in curie.lower():
uri = PKL_GENE_URI + curie.split(":")[1]
else:
uri = PKL_OBO_URI + curie.replace(":","_")
return uri
#Takes a curie in the form PREFIX:ID
def normalize_node_api(node_curie):
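    #Query the Node Normalizer service and return the first equivalent identifier
    #whose prefix is supported in PKL; fall back to the input curie otherwise.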
url = NODE_NORMALIZER_URL + node_curie
# Make the HTTP request to NodeNormalizer
response = requests.get(url, timeout=30)
response.raise_for_status()
    # Parse the JSON response for the queried curie
    entries = response.json()[node_curie]
    try:
        if len(entries) > 1:
for iden in entries['equivalent_identifiers']:
if iden['identifier'].split(':')[0] in PKL_PREFIXES:
norm_node = iden['identifier']
return norm_node
#Handle case where node normalizer returns nothing
except TypeError:
return node_curie
else:
return node_curie
# Wrapper function
def interactive_search_wrapper(g,user_input_file, output_dir, input_type,kg_type,enable_skipping,input_dir="",input_substring=""):
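    #Dispatch on input_type: 'annotated_diagram' reads a source|target edgelist,
    #'pathway_ocr' builds pairwise edges from OCR output, and
    #'literature_comparison'/'guiding_term' read single-column term files.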
#Check for existence based on input type
exists = check_input_existence(output_dir,input_type)
if enable_skipping:
skipped_nodes = []
guiding_term_skipped_nodes = []
    if not exists[0]:
print('Interactive Node Search')
logging.info('Interactive Node Search')
#Interactively assign node
if input_type == 'annotated_diagram':
#Creates examples df without source_label and target_label
u = read_user_input(user_input_file[0])
n = unique_nodes(u)
u['source_label'] = 'none'
u['target_label'] = 'none'
u['source_id'] = 'none'
u['target_id'] = 'none'
examples = u
#For wikipathways diagram, search for metadata file for matches
if WIKIPATHWAYS_SUBFOLDER in output_dir:
wikipathway_input_folder = output_dir.split("_output")[0]
for node in n:
node_uri = get_wikipathway_id(node,wikipathway_input_folder,kg_type)
try:
#Ensure uri exist in PKL
node_label = get_label(g.labels_all,node_uri,kg_type)
#Use original label for graph similarity
examples = create_annotated_df(examples,node,node,node_uri,False)
logging.info('Found node id for %r: %s',node,node_uri)
except IndexError:
#Otherwise perform search through graph
node_label,id_given,examples,skip_node = search_node(node,g,examples,enable_skipping)
#Only add node to examples df if not skipped
if not skip_node:
examples = create_annotated_df(examples,node,node_label,id_given,False)
#Drop any triples that contain that node
elif skip_node:
examples = skip_node_in_edgelist(examples,[node])
skipped_nodes.append(node)
logging.info('All input nodes searched.')
else:
for node in n:
node_label,id_given,examples,skip_node = search_node(node,g,u,enable_skipping)
#Only add node to examples df if not skipped
if not skip_node:
examples = create_annotated_df(examples,node,node_label,id_given,False)
#Drop any triples that contain that node
elif skip_node:
u = skip_node_in_edgelist(u,[node])
skipped_nodes.append(node)
logging.info('All input nodes searched.')
if input_type == 'pathway_ocr':
#List of entities that are already mapped to identifiers
n = {}
#List of entities that need to be manually mapped to identifiers
n_manual = []
            for ocr_file in user_input_file:
                ocr_frame = read_ocr_input(ocr_file)
                if "genes" in ocr_file:
                    for row in range(len(ocr_frame)):
                        try:
                            gene_id = str(ocr_frame.iloc[row].loc['ncbigene_id'])
                            pkl_id = "http://www.ncbi.nlm.nih.gov/gene/" + str(gene_id)
                            n[ocr_frame.iloc[row].loc['word']] = get_label(g.labels_all,pkl_id,kg_type)
                        except IndexError:
                            try:
                                n_manual.append(ocr_frame.iloc[row].loc['ncbigene_symbol'])
                            #KeyError covers frames without an ncbigene_symbol column
                            except (IndexError, KeyError):
                                n_manual.append(ocr_frame.iloc[row].loc['word'])
                elif 'chemicals' in ocr_file:
                    for row in range(len(ocr_frame)):
                        try:
                            chebi_id = str(ocr_frame.iloc[row].loc['chebi']).upper().split(':')[1]
                            pkl_id = 'http://purl.obolibrary.org/obo/CHEBI_'+str(chebi_id)
                            n[ocr_frame.iloc[row].loc['word']] = get_label(g.labels_all,pkl_id,kg_type)
                        #IndexError expected when there is no chebi value, since splitting on ":" fails first
                        except IndexError:
                            n_manual.append(ocr_frame.iloc[row].loc['word'])
                else:
                    nodes = unique_nodes(ocr_frame["word"].to_frame())
                    for word in nodes:
                        n_manual.append(word)
n_existing = list(n.keys())
n_both = n_manual + n_existing
#Creates examples df without source_label and target_label
u = pd.DataFrame(itertools.permutations(n_both,2))
u = u.rename(columns = {0: "source", 1:"target"})
u[['source_label','target_label']] = pd.DataFrame([['','']],index=u.index)
for i in range(len(u)):
try:
                    u.at[i,'source_label'] = n[u.iloc[i].loc['source']]
except KeyError:
pass
try:
u.at[i,'target_label'] = n[u.iloc[i].loc['target']]
except KeyError:
pass
            #Interactively search the entities that could not be mapped automatically
            for node in n_manual:
node_label,id_given,examples,skip_node = search_node(node,g,u,enable_skipping)
#Only add node to examples df if not skipped
if not skip_node:
examples = create_annotated_df(examples,node,node_label,id_given,False)
#Drop any triples that contain that node
elif skip_node:
u = skip_node_in_edgelist(u,[node])
skipped_nodes.append(node)
logging.info('All input nodes searched.')
if input_type == 'literature_comparison' or input_type == 'guiding_term':
if input_type == 'literature_comparison':
term_file = input_dir+'/literature_comparison/' + input_substring + '_Literature_Comparison_Terms.csv'
elif input_type == 'guiding_term':
term_file = input_dir+'/Guiding_Terms.csv'
#Creates examples df without source_label and target_label
u = read_user_input(term_file,True)
n = unique_nodes(u)
for node in n:
node_label,id_given,examples,skip_node = search_node(node,g,u,enable_skipping,True)
#Only add node to examples df if not skipped
if not skip_node:
examples = create_annotated_df(examples,node,node_label,id_given,True)
#Drop any triples that contain that node
elif skip_node:
examples.drop(examples[examples['term'] == node].index, inplace = True)
guiding_term_skipped_nodes.append(node)
logging.info('All input nodes searched.')
create_input_file(examples,output_dir,input_type)
#Creates a skipped_node file per input diagram
if enable_skipping:
create_skipped_node_file(skipped_nodes,output_dir)
create_skipped_node_file(guiding_term_skipped_nodes,output_dir,'guidingTerms')
else:
print('Node mapping file exists... moving to embedding creation')
logging.info('Node mapping file exists... moving to embedding creation')
mapped_file = output_dir + '/'+ exists[1]
examples = pd.read_csv(mapped_file, sep = "|")
logging.info('Node mapping file: %s',mapped_file)
return(examples)
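# A minimal usage sketch (hypothetical paths; `kg` stands for the knowledge graph
# object with a labels_all dataframe, constructed elsewhere in this codebase):
#   examples = interactive_search_wrapper(kg, ['input/examples.csv'], 'output',
#                                         'annotated_diagram', 'pkl', True)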
## dropping a row in the case of self loops based on node ID
def skip_self_loops(input_df):
    #Boolean mask keeps only rows whose source and target IDs differ
    input_df = input_df[input_df["source_id"] != input_df["target_id"]]
    return(input_df)
#Takes in a df of the edgelist and a list of all node labels to remove. Edgelist format is source,target columns
def skip_node_in_edgelist(edgelist_df,removed_nodes):
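    #Rewiring: connect every subject of the removed node directly to every object
    #of the removed node so paths through it are preserved; when the node has only
    #objects (or only subjects), connect those pairwise instead.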
other_columns = ['source_label','target_label','source_id','target_id']
for node in removed_nodes:
new_edges = []
node_objects = list(set(edgelist_df.loc[edgelist_df.source == node,'target'].tolist()))
node_subjects = list(set(edgelist_df.loc[edgelist_df.target == node,'source'].tolist()))
#Remove self from lists in the case of self loops
if node in node_objects: node_objects.remove(node)
if node in node_subjects: node_subjects.remove(node)
if len(node_objects) > 0 and len(node_subjects) > 0:
for s in node_subjects:
for o in node_objects:
if all(w in edgelist_df.columns.tolist() for w in other_columns):
s_label = edgelist_df.loc[edgelist_df['source'] == s,'source_label'].values[0]
o_label = edgelist_df.loc[edgelist_df['target'] == o,'target_label'].values[0]
s_id = edgelist_df.loc[edgelist_df['source'] == s,'source_id'].values[0]
o_id = edgelist_df.loc[edgelist_df['target'] == o,'target_id'].values[0]
new_edges.append([s,o,s_label,o_label,s_id,o_id])
else:
new_edges.append([s,o])
        elif len(node_objects) > 1:
            all_pairs = list(combinations(node_objects, 2))
            for s,o in all_pairs:
if all(w in edgelist_df.columns.tolist() for w in other_columns):
s_label = edgelist_df.loc[edgelist_df['source'] == s,'source_label'].values[0]
o_label = edgelist_df.loc[edgelist_df['target'] == o,'target_label'].values[0]
s_id = edgelist_df.loc[edgelist_df['source'] == s,'source_id'].values[0]
o_id = edgelist_df.loc[edgelist_df['target'] == o,'target_id'].values[0]
new_edges.append([s,o,s_label,o_label,s_id,o_id])
else:
new_edges.append([s,o])
        elif len(node_subjects) > 1:
            all_pairs = list(combinations(node_subjects, 2))
            for s,o in all_pairs:
#All node_subjects can be indexed to row that contains them as a subject
if all(w in edgelist_df.columns.tolist() for w in other_columns):
s_label = edgelist_df.loc[edgelist_df['source'] == s,'source_label'].values[0]
o_label = edgelist_df.loc[edgelist_df['source'] == o,'source_label'].values[0]
s_id = edgelist_df.loc[edgelist_df['source'] == s,'source_id'].values[0]
o_id = edgelist_df.loc[edgelist_df['source'] == o,'source_id'].values[0]
new_edges.append([s,o,s_label,o_label,s_id,o_id])
else:
new_edges.append([s,o])
#Remove triples with node
edgelist_df = edgelist_df.drop(edgelist_df.loc[edgelist_df['source'] == node].index)
        edgelist_df = edgelist_df.drop(edgelist_df.loc[edgelist_df['target'] == node].index)
edgelist_df = edgelist_df.reset_index(drop=True)
#Add new triples to edgelist
if len(new_edges) > 0:
new_triples_df = pd.DataFrame(new_edges, columns = edgelist_df.columns.tolist())
edgelist_df = pd.concat([edgelist_df,new_triples_df], axis=0)
edgelist_df = edgelist_df.reset_index(drop=True)
#Remove duplicate node pairs that are in different order
edgelist_df = edgelist_df.groupby(edgelist_df.apply(frozenset, axis=1), as_index=False).first()
return edgelist_df