# main_N2V_inf.py
# -*- coding: UTF-8 -*-
import tensorflow as tf
import numpy as np
from scipy.optimize import fmin_ncg
import time
import os
from my_utils import load_data_for_N2V
from hessians import hessian_vector_product

# The placeholders and tf.gradients calls below require graph mode when
# running under TensorFlow 2.x.
tf.compat.v1.disable_eager_execution()
def add_layer(input_data, in_size, out_size, act_func=None, name=None):
    """A fully connected layer: act_func(input_data @ W + b)."""
    weights = tf.Variable(tf.random.normal([in_size, out_size]), name=name + '_weight')
    biases = tf.Variable(tf.random.normal([1, out_size]) + 0.001, name=name + '_biases')
    result = tf.matmul(input_data, weights) + biases
    if act_func is None:
        outputs = result
    else:
        outputs = act_func(result)
    return outputs
# def geo_eval(y_pred, U_true, classLatMedian, classLonMedian, userLocation):
#     assert len(y_pred) == len(U_true), "#preds: %d, #users: %d" % (len(y_pred), len(U_true))
#
#     distances = []
#     latlon_pred = []
#     latlon_true = []
#     for i in range(0, len(y_pred)):
#         user = U_true[i]
#         location = userLocation[user].split(',')
#         lat, lon = float(location[0]), float(location[1])
#         latlon_true.append([lat, lon])
#         prediction = str(y_pred[i])
#         lat_pred, lon_pred = classLatMedian[prediction], classLonMedian[prediction]
#         latlon_pred.append([lat_pred, lon_pred, y_pred[i]])
#         distance = haversine((lat, lon), (lat_pred, lon_pred))
#         distances.append(distance)
#
#     acc_at_161 = 100 * len([d for d in distances if d < 161]) / float(len(distances))
#     return np.mean(distances), np.median(distances), acc_at_161, distances, latlon_true, latlon_pred
""" load data for NLP model """
dump_file = "./dataset_cmu/dump_doc_dim_512.pkl"
edge_file = "./dataset_cmu/node2vec_dim128.npy"
data = load_data_for_N2V(dump_file, edge_file, feature_norm='None')
features, labels, idx_train, idx_val, idx_test, U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation, edge_emb = data
""" MLP + influence by using tensorflow. """
learning_rate = 0.01
graph_emb_size = 128
content_emb_size = 512
class_num = 129
training_epochs = 100
display_epoch = 10
patience = 10
x_input = tf.compat.v1.placeholder(tf.float32, [None, content_emb_size], name='contentEmbedding')
y_label = tf.compat.v1.placeholder(tf.float32, [None, class_num], name='LabelData')  # one-hot labels (float, to match the logits' dtype in the cross-entropy op)
x_input_emb = tf.compat.v1.placeholder(tf.float32, [None, graph_emb_size], name='nodeEmbedding')
hidden_1 = add_layer(x_input, content_emb_size, 512, act_func=tf.nn.relu, name='content_MLP')
hidden_2 = add_layer(x_input_emb, graph_emb_size, 512, act_func=tf.nn.relu, name='graph_MLP')
x_3 = tf.concat((hidden_1, hidden_2), axis=1)
output_x = add_layer(x_3, 1024, class_num, act_func=None, name='output_MLP')
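# Architecture summary: 512-d content features -> 512-unit ReLU layer;
# 128-d node2vec embedding -> 512-unit ReLU layer; the two hidden vectors are
# concatenated (1024-d) and mapped linearly to the 129 location classes.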
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_label, logits=output_x))
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(loss)
# pred = tf.argmax(output_x, axis=1)
# acc = tf.equal(tf.argmax(output_x, 1), tf.argmax(y_label, 1))
# acc = tf.reduce_mean(tf.cast(acc, tf.float32))
'''calculate influence '''
all_params = tf.compat.v1.trainable_variables()
params = [all_params[4]]  # only the last layer's weight matrix
gradients = tf.gradients(loss, params)
v_placeholder = params  # reuse the variable itself as the HVP input slot (its value is overridden via feed_dict)
hessian_vector = hessian_vector_product(loss, params, v_placeholder)
'''calculate influence '''
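# Background (Koh & Liang, 2017, "Understanding Black-box Predictions via
# Influence Functions"): the influence of upweighting a training point z on
# the loss at a test point z_test is
#     I_up,loss(z, z_test) = -grad L(z_test)^T  H^{-1}  grad L(z),
# where H is the Hessian of the training loss w.r.t. the selected parameters.
# get_influence() below computes the inverse-Hessian-vector product ("s_test")
# once per test point, then dots it with each training point's gradient (the
# code keeps the opposite sign convention and divides by the training-set size).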
# Initialize the variables (i.e. assign their default value)
init = tf.compat.v1.global_variables_initializer()
# 'Saver' op to save and restore all the variables
saver = tf.compat.v1.train.Saver()
def get_influence(test_x, test_x_emb, test_y):
    """For one test point, estimate the influence of every training point.

    test_x:     content features of the test point
    test_x_emb: graph (node2vec) embedding of the test point
    test_y:     one-hot label of the test point
    """
    # Step 1: s_test = H^{-1} grad L(z_test), via conjugate-gradient optimization.
    inverse_hvp = get_inverse_hvp_cg(get_test_grad_loss(test_x, test_x_emb, test_y)[0])
    # Step 2: for each training point z, dot(s_test, grad L(z)) / n.
    num_to_remove = len(idx_train)
    predicted_loss_diffs = list()
    for idx_to_remove in range(0, num_to_remove):
        single_train_feed_dict = fill_feed_dict_with_one_ex(idx_to_remove)
        train_grad_loss_val = sess.run(gradients, feed_dict=single_train_feed_dict)
        predicted_loss_diffs.append(np.dot(inverse_hvp, train_grad_loss_val[0].flatten()) / num_to_remove)
    return np.array(predicted_loss_diffs)
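# Usage sketch (mirrors the call in the main loop below): pass batch-of-one
# inputs for a single test point, e.g.
#     inf = get_influence([features_test[i]], [nodeEmbed_test[i]],
#                         get_one_hot([labels_test[i]]))
# The result holds one scalar per training example (len(idx_train) values).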
def get_test_grad_loss(test_x, test_x_emb, test_y):
    return sess.run(gradients, {x_input: test_x, x_input_emb: test_x_emb, y_label: test_y})
def get_inverse_hvp_cg(v):
    """Solve H x = v with Newton-CG on the quadratic 0.5 x^T H x - v^T x."""
    fmin_loss_fn = get_fmin_loss_fn(v)
    fmin_grad_fn = get_fmin_grad_fn(v)
    fmin_results = fmin_ncg(
        f=fmin_loss_fn,
        x0=np.concatenate(v),
        fprime=fmin_grad_fn,   # gradient: H x - v
        fhess_p=get_fmin_hvp,  # Hessian-vector product: H p
        callback=None,
        avextol=1e-8,
        maxiter=20)
    return get_vec_to_list_fn()(fmin_results)
def get_fmin_loss_fn(v):
    def get_fmin_loss(x):
        hessian_vector_val = minibatch_hessian_vector_val(get_vec_to_list_fn()(x))
        return 0.5 * np.dot(np.concatenate(hessian_vector_val), x) - np.dot(np.concatenate(v), x)
    return get_fmin_loss
def get_fmin_grad_fn(v):
    def get_fmin_grad(x):
        # grad f(x) = H x - v
        hessian_vector_val = minibatch_hessian_vector_val(get_vec_to_list_fn()(x))
        return np.concatenate(hessian_vector_val) - np.concatenate(v)
    return get_fmin_grad
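# The two closures above implement the standard conjugate-gradient trick:
# the quadratic f(x) = 0.5 * x^T H x - v^T x has gradient H x - v, which
# vanishes exactly at x = H^{-1} v, so minimizing f yields the
# inverse-Hessian-vector product without ever materializing H.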
def minibatch_hessian_vector_val(v):
    feed_dict = fill_feed_dict_with_all_ex()
    # Can optimize this
    feed_dict = update_feed_dict_with_v_placeholder(feed_dict, v)
    hessian_vector_val = sess.run(hessian_vector, feed_dict=feed_dict)
    hessian_vector_val = np.reshape(hessian_vector_val,
                                    np.shape(hessian_vector_val[0])[0] * np.shape(hessian_vector_val[0])[1])
    return [hessian_vector_val]
def get_fmin_hvp(x, p):
    # fmin_ncg calls this as fhess_p(x, p); it must return H p.
    hessian_vector_val = minibatch_hessian_vector_val(get_vec_to_list_fn()(p))
    return np.concatenate(hessian_vector_val)
def fill_feed_dict_with_all_ex():
    feed_dict = {
        x_input: features[idx_train],
        x_input_emb: edge_emb[idx_train],
        y_label: get_one_hot(labels[idx_train])
    }
    return feed_dict
def fill_feed_dict_with_one_ex(target_idx):
    # target_idx is a position within the training set, so index through idx_train.
    train_idx = idx_train[target_idx]
    feed_dict = {
        x_input: [features[train_idx]],
        x_input_emb: [edge_emb[train_idx]],
        y_label: get_one_hot([labels[train_idx]])
    }
    return feed_dict
def update_feed_dict_with_v_placeholder(feed_dict, vec):
    for pl_block, vec_block in zip(v_placeholder, [np.reshape(vec, v_placeholder[0].get_shape())]):
        feed_dict[pl_block] = vec_block
    return feed_dict
def get_vec_to_list_fn():
    def vec_to_list(v):
        return v  # only one parameter block is used, so the flat vector is returned unchanged
    return vec_to_list
def get_one_hot(y):
    """Convert integer class labels to one-hot rows (renamed from y_label to avoid shadowing the placeholder)."""
    one_hot_index = np.arange(len(y)) * class_num + y
    one_hot = np.zeros((len(y), class_num))
    one_hot.flat[one_hot_index] = 1
    return one_hot
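# For example, with class_num = 129, get_one_hot([3, 0]) returns a (2, 129)
# array whose first row is 1 at column 3 and whose second row is 1 at column 0.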
""" start running the framework ..."""
tf_config = tf.compat.v1.ConfigProto()
tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5 # 分配50%
tf_config.gpu_options.allow_growth = True # 自适应
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
""""for each test sample, calculate it's influence on each training sample, i.e. inf_of_a_test_point"""
features_test, nodeEmbed_test, labels_test = features[idx_test], edge_emb[idx_test], labels[idx_test]
error_index = list() # !!! store the error index which should rerun after.
for i in range(0, len(idx_test)):
with tf.compat.v1.Session(config=tf_config) as sess:
sess.run(init)
try:
inf_of_a_test_point = get_influence([features_test[i]], [nodeEmbed_test[i]], get_one_hot([labels_test[i]]))
except Exception:
error_index.append(i)
print("-----------------------------------------There is a RuntimeWarning at index:", i)
with open("./error_index.txt", 'a') as f:
f.write("\nTime:" + str(time.asctime(time.localtime(time.time()))) + "\t\tError_at_index:" + str(i))
continue
else:
np.savetxt("./Res_inf_N2V/inf_of_a_test_point{}.txt".format(i), inf_of_a_test_point)
print("Time:", time.asctime(time.localtime(time.time())),
"has done ---------------------------- {}".format(i))
# show and save the whole error_index
error_index_str = "\n\nTime:" + str(time.asctime(time.localtime(time.time()))) + \
                  " \t\tModel:N2V \nAll_Error_index:" + str(error_index)
print(error_index_str)
with open("./error_index.txt", 'a') as f:
    f.write(error_index_str)
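
# A minimal post-processing sketch (an addition, not part of the original
# pipeline): read back one test point's saved influences and rank training
# points by magnitude. The path mirrors the np.savetxt call above.
def load_top_influential(test_idx, k=10):
    diffs = np.loadtxt("./Res_inf_N2V/inf_of_a_test_point{}.txt".format(test_idx))
    order = np.argsort(-np.abs(diffs))  # most influential (by |value|) first
    return order[:k], diffs[order[:k]]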