# active_learning_functions.py
from __future__ import division
from p_vae import *
from codings import *
import numpy as np
import tensorflow as tf
from scipy.stats import bernoulli
import argparse
import os
import random
from random import sample
#### parser configurations
parser = argparse.ArgumentParser(
    description='EDDI')
parser.add_argument(
    '--epochs',
    type=int,
    default=3000,
    metavar='N_eps',
    help='number of epochs to train (default: 3000)')
parser.add_argument(
    '--latent_dim',
    type=int,
    default=10,
    metavar='LD',
    help='latent dimension (default: 10)')
parser.add_argument(
    '--p',
    type=float,
    default=0.7,
    metavar='probability',
    help='dropout probability of artificial missingness during training')
parser.add_argument(
    '--iteration',
    type=int,
    default=-1,
    metavar='it',
    help='iterations (mini-batches) per epoch. Set to -1 to run the full epoch.')
parser.add_argument(
    '--batch_size',
    type=int,
    default=100,
    metavar='batch',
    help='Mini-batch size')
parser.add_argument(
    '--K',
    type=int,
    default=20,
    metavar='K',
    help='Dimension of the PNP feature map')
parser.add_argument(
    '--M',
    type=int,
    default=50,
    metavar='M',
    help='Number of MC samples used for imputation')
parser.add_argument(
    '--eval',
    type=str,
    default='rmse',
    metavar='eval',
    help="evaluation metric of active learning: 'rmse' (root mean squared error) or 'nllh' (negative log-likelihood)")
parser.add_argument(
    '--repeat',
    type=int,
    default=5,
    metavar='repeat',
    help='Number of repeats of the active learning experiment')
parser.add_argument(
    '--output_dir',
    type=str,
    default=os.getenv('PT_OUTPUT_DIR', '/tmp'))
parser.add_argument(
    '--data_dir',
    type=str,
    default=os.getenv('PT_DATA_DIR', 'data'),
    help='Directory where the UCI dataset is stored.')
args = parser.parse_args()
#### Set directories
UCI = args.data_dir
ENCODER_WEIGHTS = os.path.join(args.output_dir, 'encoder.tensorflow')
FINETUNED_DECODER_WEIGHTS = os.path.join(args.output_dir, 'generator.tensorflow')
rs = 42 # random seed
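# The flags above are read from the command line when this module (or a driver script that imports it)
# is executed; a sketch of a typical invocation, using only the flags defined above:
#
#   python active_learning_functions.py --epochs 3000 --latent_dim 10 --p 0.7 \
#       --batch_size 100 --K 20 --M 50 --eval rmse --repeat 5 \
#       --data_dir data --output_dir /tmp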
def p_vae_active_learning(Data_train,mask_train,Data_test,mask_test,epochs,latent_dim,batch_size,p,K,M,eval,Repeat,estimation_method=0):
    '''
    This function trains a partial VAE (p-VAE) model and performs active variable selection on the
    test set using three strategies: a random ordering, a single global ordering (SING), and the
    per-instance EDDI chain-rule strategy (CHAI).
    Note that we assume that the last column of x is the target variable of interest.
    :param Data_train: training data matrix
    :param mask_train: mask matrix that indicates the missingness of training data. 1 = observed, 0 = missing
    :param Data_test: test data matrix
    :param mask_test: mask matrix that indicates the missingness of test data. 1 = observed, 0 = missing
    :param epochs: number of epochs used to train the partial VAE
    :param latent_dim: latent dimension of the partial VAE
    :param batch_size: mini-batch size used during training
    :param p: dropout probability of artificial missingness during training
    :param K: dimension of the feature map of the PNP encoder
    :param M: number of samples used for MC sampling
    :param eval: evaluation metric of active learning. 'rmse': root mean squared error; 'nllh': negative log-likelihood
    :param Repeat: number of repeats of the experiment
    :param estimation_method: which method to use for single-ordering information reward estimation.
        To calculate the single best ordering, we need to marginalize (average) the information reward
        over the data set (in this case, the test set). We provide two methods of marginalization:
        - estimation_method = 0: information reward marginalized using the model distribution p_{vae_model}(x_o).
        - estimation_method = 1: information reward marginalized using the data distribution p_{data}(x_o).
    :return: None (active learning results are saved to args.output_dir)
    '''
    for r in range(Repeat):
        ## train the partial VAE
        tf.reset_default_graph()
        vae = train_p_vae(Data_train, mask_train, epochs, latent_dim, batch_size, p, K, 10)  # note: uses 10 mini-batches per epoch (the iteration argument)
        n_test = Data_test.shape[0]
        n_train = Data_train.shape[0]
        OBS_DIM = Data_test.shape[1]
        # kwargs = {
        #     'K': K,
        #     'obs_distrib': "Gaussian",
        #     'latent_dim': latent_dim,
        #     'encoder': PNP_fc_uci_encoder,
        #     'decoder': fc_uci_decoder,
        #     'obs_dim': OBS_DIM,
        #     'load_model': 1,
        #     'decoder_path': FINETUNED_DECODER_WEIGHTS,
        #     'encoder_path': ENCODER_WEIGHTS,
        # }
        # vae = PN_Plus_VAE(**kwargs)
        ## create arrays to store results
        if r == 0:
            # information curves
            information_curve_RAND = np.zeros(
                (Repeat, n_test, OBS_DIM - 1 + 1))
            information_curve_SING = np.zeros(
                (Repeat, n_test, OBS_DIM - 1 + 1))
            information_curve_CHAI = np.zeros(
                (Repeat, n_test, OBS_DIM - 1 + 1))
            # history of optimal actions
            action_SING = np.zeros((Repeat, n_test, OBS_DIM - 1))
            action_CHAI = np.zeros((Repeat, n_test, OBS_DIM - 1))
            # history of information reward values
            R_hist_SING = np.zeros(
                (Repeat, OBS_DIM - 1, n_test, OBS_DIM - 1))
            R_hist_CHAI = np.zeros(
                (Repeat, OBS_DIM - 1, n_test, OBS_DIM - 1))
            # history of posterior samples of partial inference
            im_SING = np.zeros((Repeat, OBS_DIM - 1, M, n_test, OBS_DIM))
            im_CHAI = np.zeros((Repeat, OBS_DIM - 1, M, n_test, OBS_DIM))
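            # shapes: the information curves are (Repeat, n_test, number of query steps + 1), so index 0
            # stores the evaluation metric before any feature has been acquired; im_SING / im_CHAI store
            # M posterior samples of the completed data at every step.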
        ## Perform active variable selection with the random, SING (single ordering), and CHAI (EDDI) strategies
        for strategy in range(3):
            if strategy == 0:  ### random strategy
                ## create arrays to store data and missingness
                x = Data_test[:, :]
                x = np.reshape(x, [n_test, OBS_DIM])
                mask = np.zeros((n_test, OBS_DIM))
                mask[:, -1] = 0  # we will never observe the target value
                ## initialize the array that stores the query order (one random ordering per test point)
                i_optimal = np.tile(np.arange(OBS_DIM - 1), [n_test, 1])
                for row in i_optimal:
                    random.shuffle(row)  # shuffle each row in place for an independent random ordering
                ## evaluate the likelihood at the initial stage (no observations)
                negative_predictive_llh, uncertainty = vae.predictive_loss(
                    x, mask, eval, M)
                information_curve_RAND[r, :, 0] = negative_predictive_llh
                for t in range(OBS_DIM - 1):
                    print("Repeat = {:.1f}".format(r))
                    print("Strategy = {:.1f}".format(strategy))
                    print("Step = {:.1f}".format(t))
                    io = np.eye(OBS_DIM)[i_optimal[:, t]]
                    mask = mask + io
                    negative_predictive_llh, uncertainty = vae.predictive_loss(
                        x, mask, eval, M)
                    information_curve_RAND[r, :, t + 1] = negative_predictive_llh
            elif strategy == 1:  ### single ordering strategy "SING"
                # SING is obtained by maximizing the mean information reward over the test set at each step,
                # to be consistent with the description in the paper.
                # We could also obtain this ordering from a subset of the training set and then apply it to the test set.
                x = Data_test[:, :]
                x = np.reshape(x, [n_test, OBS_DIM])
                mask = np.zeros((n_test, OBS_DIM))  # mask of missingness (from both test data missingness and features not yet selected during active learning)
                mask2 = np.zeros((n_test, OBS_DIM))  # mask indicating which features have been selected for each data point
                mask[:, -1] = 0  # note that however the mask is initialized, we always keep the target variable (last column) unobserved
                negative_predictive_llh, uncertainty = vae.predictive_loss(
                    x, mask, eval, M)
                information_curve_SING[r, :, 0] = negative_predictive_llh
                for t in range(OBS_DIM - 1):  # t indexes the query step
                    print("Repeat = {:.1f}".format(r))
                    print("Strategy = {:.1f}".format(strategy))
                    print("Step = {:.1f}".format(t))
                    ## note that for the single ordering strategy there are two rewards:
                    # the first one (R) is calculated based on no observations;
                    # it is used for the active learning phase, since a single ordering should not depend on observations.
                    # the second one (R_eval) is calculated in the same way as the chain-rule approximation and is only used for visualization.
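                    # (R_lindley_chain, imported above, estimates the Lindley information reward of querying
                    # feature u: roughly, the expected information gain about the target from observing x_u,
                    # computed with the chain-rule approximation from the partial VAE posterior and the
                    # imputed samples im / im_0; see the EDDI paper for the exact expression.)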
                    R = -1e4 * np.ones((n_test, OBS_DIM - 1))
                    im_0 = completion(x, mask * 0, M, vae)  # sample from the model prior (no observations)
                    im = completion(x, mask, M, vae)  # sample conditioned on the current observations
                    im_SING[r, t, :, :, :] = im
                    for u in range(OBS_DIM - 1):  # u indexes candidate features; compute the reward for each candidate
                        loc = np.where(mask2[:, u] == 0)[0]
                        if estimation_method == 0:
                            R[loc, u] = R_lindley_chain(u, x, mask, M, vae, im_0, loc)
                        else:
                            R[loc, u] = R_lindley_chain(u, x, mask, M, vae, im, loc)
                    R_hist_SING[r, t, :, :] = R
                    i_optimal = (R.mean(axis=0)).argmax()  # optimal decision based on the reward averaged over all test points
                    i_optimal = np.tile(i_optimal, [n_test])
                    io = np.eye(OBS_DIM)[i_optimal]
                    action_SING[r, :, t] = i_optimal
                    mask = mask + io * mask_test  # this mask accounts for both data missingness and features not yet selected
                    negative_predictive_llh, uncertainty = vae.predictive_loss(
                        x, mask, eval, M)
                    mask2 = mask2 + io  # this mask only records which features have been selected for each data point
                    information_curve_SING[r, :, t + 1] = negative_predictive_llh
            elif strategy == 2:  ### EDDI strategy (chain-rule approximation)
                # personalized active feature selection strategy
                ## create arrays to store data and missingness
                x = Data_test[:, :]
                x = np.reshape(x, [n_test, OBS_DIM])
                mask = np.zeros((n_test, OBS_DIM))  # mask of missingness (from both test data missingness and features not yet selected during active learning)
                mask2 = np.zeros((n_test, OBS_DIM))  # mask indicating which features have been selected for each data point
                mask[:, -1] = 0  # note that however the mask is initialized, we always keep the target variable (last column) unobserved
                ## evaluate the likelihood at the initial stage (no observations)
                negative_predictive_llh, uncertainty = vae.predictive_loss(
                    x, mask, eval, M)
                information_curve_CHAI[r, :, 0] = negative_predictive_llh
                for t in range(OBS_DIM - 1):  # t indexes the query step
                    print("Repeat = {:.1f}".format(r))
                    print("Strategy = {:.1f}".format(strategy))
                    print("Step = {:.1f}".format(t))
                    R = -1e4 * np.ones((n_test, OBS_DIM - 1))
                    im = completion(x, mask, M, vae)
                    im_CHAI[r, t, :, :, :] = im
                    for u in range(OBS_DIM - 1):  # u indexes candidate features; compute the reward for each candidate
                        loc = np.where(mask[:, u] == 0)[0]
                        R[loc, u] = R_lindley_chain(u, x, mask, M, vae, im, loc)
                    R_hist_CHAI[r, t, :, :] = R
                    i_optimal = R.argmax(axis=1)  # per-instance optimal decision
                    io = np.eye(OBS_DIM)[i_optimal]
                    action_CHAI[r, :, t] = i_optimal
                    mask = mask + io  # this mask accounts for both data missingness and features not yet selected
                    negative_predictive_llh, uncertainty = vae.predictive_loss(
                        x, mask, eval, M)
                    mask2 = mask2 + io  # this mask only records which features have been selected for each data point
                    print(mask2[0:5, :])
                    information_curve_CHAI[r, :, t + 1] = negative_predictive_llh
        # Save results
        np.savez(
            os.path.join(args.output_dir, 'UCI_information_curve_RAND.npz'),
            information_curve=information_curve_RAND)
        np.savez(
            os.path.join(args.output_dir, 'UCI_information_curve_SING.npz'),
            information_curve=information_curve_SING)
        np.savez(
            os.path.join(args.output_dir, 'UCI_information_curve_CHAI.npz'),
            information_curve=information_curve_CHAI)
        np.savez(
            os.path.join(args.output_dir, 'UCI_action_SING.npz'),
            action=action_SING)
        np.savez(
            os.path.join(args.output_dir, 'UCI_action_CHAI.npz'),
            action=action_CHAI)
        np.savez(
            os.path.join(args.output_dir, 'UCI_R_hist_SING.npz'),
            R_hist=R_hist_SING)
        np.savez(
            os.path.join(args.output_dir, 'UCI_R_hist_CHAI.npz'),
            R_hist=R_hist_CHAI)
        np.savez(
            os.path.join(args.output_dir, 'UCI_im_SING.npz'),
            im=im_SING)
        np.savez(
            os.path.join(args.output_dir, 'UCI_im_CHAI.npz'),
            im=im_CHAI)
    return None
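# A minimal usage sketch for p_vae_active_learning (hypothetical data loading; it assumes a fully
# observed UCI matrix whose last column is the prediction target, as described in the docstring above):
#
#   data = np.loadtxt(os.path.join(UCI, 'your_dataset.csv'), delimiter=',')   # hypothetical file name
#   n = data.shape[0]
#   perm = np.random.permutation(n)
#   Data_train, Data_test = data[perm[:int(0.9 * n)]], data[perm[int(0.9 * n):]]
#   mask_train, mask_test = np.ones_like(Data_train), np.ones_like(Data_test)  # 1 = observed
#   p_vae_active_learning(Data_train, mask_train, Data_test, mask_test,
#                         args.epochs, args.latent_dim, args.batch_size, args.p,
#                         args.K, args.M, args.eval, args.repeat)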
def train_p_vae(Data_train,mask_train, epochs, latent_dim,batch_size, p, K,iteration):
    '''
    This function trains the partial VAE.
    :param Data_train: training data matrix, N by D
    :param mask_train: mask matrix that indicates the missingness. 1 = observed, 0 = missing
    :param epochs: number of epochs of training
    :param latent_dim: latent dimension of the partial VAE model
    :param batch_size: mini-batch size
    :param p: dropout rate for creating additional missingness during training
    :param K: dimension of the feature map of the PNP encoder
    :param iteration: how many mini-batches are used per epoch; set to -1 to run the full epoch
    :return: the trained partial VAE model
    '''
    obs_dim = Data_train.shape[1]
    n_train = Data_train.shape[0]
    list_train = np.arange(n_train)
    ####### construct
    kwargs = {
        'K': K,
        'obs_distrib': "Gaussian",
        'latent_dim': latent_dim,
        'batch_size': batch_size,
        'encoder': PNP_fc_uci_encoder,
        'decoder': fc_uci_decoder,
        'obs_dim': obs_dim,
        'load_model': 0,
        'decoder_path': FINETUNED_DECODER_WEIGHTS,
        'encoder_path': ENCODER_WEIGHTS,
    }
    vae = PN_Plus_VAE(**kwargs)
    if iteration == -1:
        n_it = int(np.ceil(n_train / float(kwargs['batch_size'])))
    else:
        n_it = iteration
    for epoch in range(epochs):
        training_loss_full = 0.
        # test_loss, test_kl, test_recon = vae.full_batch_loss(Data_test, mask_test)
        # test_loss = test_loss
        # test_kl = test_kl / n_test
        # test_recon = test_recon / n_test
        # iterate through batches
        # np.random.shuffle(list_train)
        for it in range(n_it):
            if iteration == -1:
                batch_indices = list_train[it * kwargs['batch_size']:(it + 1) * kwargs['batch_size']]
            else:
                batch_indices = sample(range(n_train), kwargs['batch_size'])
            x = Data_train[batch_indices, :]
            mask_train_batch = mask_train[batch_indices, :]
            # DROPOUT_TRAIN = np.minimum(np.random.rand(1), p)
            # while True:
            #     mask_drop = np.array([bernoulli.rvs(1 - DROPOUT_TRAIN, size=obs_dim)] *
            #                          kwargs['batch_size'])
            #     if np.sum(mask_drop > 0):
            #         break
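            # artificial missingness: each entry of the batch gets an independent drop probability of
            # at most p; mask_drop[i, j] = 1 means entry (i, j) is kept as observed. Re-sample until
            # at least one entry is observed so the training batch is not entirely missing.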
            DROPOUT_TRAIN = np.minimum(np.random.rand(mask_train_batch.shape[0], obs_dim), p)
            while True:
                # mask_drop = np.array([bernoulli.rvs(1 - DROPOUT_TRAIN)])
                mask_drop = bernoulli.rvs(1 - DROPOUT_TRAIN)
                if np.sum(mask_drop > 0):
                    break
            mask_drop = mask_drop.reshape([x.shape[0], obs_dim])
            _ = vae.update(x, mask_drop * mask_train_batch)
            loss_full, _, _ = vae.full_batch_loss(x, mask_drop * mask_train_batch)
            training_loss_full += loss_full
        # average loss over the most recent epoch
        training_loss_full /= n_it
        print(
            'Epoch: {} \tnegative training ELBO per observed feature: {:.2f}'
            .format(epoch, training_loss_full))
    vae.save_generator(FINETUNED_DECODER_WEIGHTS)
    vae.save_encoder(ENCODER_WEIGHTS)
    return vae
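# The partial VAE can also be trained and used on its own, e.g. for imputation (a sketch, reusing the
# hypothetical Data_train / mask_train / Data_test / mask_test from the sketch above; `completion` is
# the same helper used in p_vae_active_learning and returns M samples of the completed data):
#
#   tf.reset_default_graph()
#   vae = train_p_vae(Data_train, mask_train, epochs=args.epochs, latent_dim=args.latent_dim,
#                     batch_size=args.batch_size, p=args.p, K=args.K, iteration=args.iteration)
#   imputations = completion(Data_test, mask_test, args.M, vae)   # shape (M, n_test, obs_dim)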