# 1_ecotox_data_prep.py
from args_parser import arg_parser_ecotox
import numpy as np
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import random_split
from utils_ecotox import compute_fingerprint_all, get_word_embedding
from eco_dataloader import EtoxDataset
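
# Pipeline (as implemented below): read the ECOTOX csv, optionally compute
# species word embeddings and chemical fingerprints, build the EtoxDataset,
# split it 80/10/10, fit StandardScalers on the training split only,
# normalize all splits, and save the interaction records as csv.
# Each dataset item is a 9-tuple; judging from the indexing below, index 4
# holds the duration (edge feature), index 7 the species node features, and
# index 8 the chemical node features.
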
def func_normalize_fit(train_data):
    # We normalize the three feature types independently: species node
    # features, chemical node features, and edge features (duration).
    scaler_duration = StandardScaler()
    scaler_sp_feature = StandardScaler()
    scaler_chem_feature = StandardScaler()
    train_data_temp = [
        (*item[:4],
         torch.tensor(item[4].astype(np.float32)),
         item[5], item[6],
         torch.tensor(item[7].astype(np.float32)),
         torch.tensor(item[8].astype(np.float32)))
        for item in train_data
    ]
    durations = torch.cat([item[4].reshape(-1, 1) for item in train_data_temp], dim=0)
    sp_features = torch.cat([item[7] for item in train_data_temp], dim=0)
    chem_features = torch.cat([item[8] for item in train_data_temp], dim=0)
    scaler_duration.fit(durations.to(torch.float32))
    scaler_sp_feature.fit(sp_features.to(torch.float32))
    scaler_chem_feature.fit(chem_features.to(torch.float32))
    return scaler_duration, scaler_sp_feature, scaler_chem_feature

def func_normalize_transform(scaler_duration, scaler_sp_feature, scaler_chem_feature, data):
    # Apply the fitted scalers to a data split. The normalized duration is
    # kept as a numpy array; the node features are converted back to tensors.
    norm_data = [
        (item[0], item[1], item[2], item[3],
         scaler_duration.transform(item[4].reshape(-1, 1).astype(np.float32)),
         item[5], item[6],
         torch.tensor(scaler_sp_feature.transform(item[7].astype(np.float32))),
         torch.tensor(scaler_chem_feature.transform(item[8].astype(np.float32))))
        for item in data
    ]
    return norm_data
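
# Typical use of the two helpers above (a sketch mirroring __main__ below;
# the variable names here are illustrative):
#   scaler_d, scaler_sp, scaler_ch = func_normalize_fit(train_split)
#   train_norm = func_normalize_transform(scaler_d, scaler_sp, scaler_ch, train_split)
#   val_norm = func_normalize_transform(scaler_d, scaler_sp, scaler_ch, val_split)
# Fitting on the training split only avoids leaking statistics from the
# validation/test data into the scalers.
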
def func_prepare_data(tuple_list):
    data_selection = []
    for rows in tuple_list:
        data_row = []
        data_row.extend([rows[2], rows[3], rows[4][0, 0], rows[5], rows[6]])
        data_row.extend(rows[7].numpy().tolist()[0])
        data_row.extend(rows[8].numpy().tolist()[0])
        data_selection.append(data_row)
    return np.array(data_selection)
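
# Row layout produced by func_prepare_data (inferred from the indexing above):
# columns 0-4 are the interaction record saved by the np.savetxt calls at the
# end of the script (two node identifiers, the scalar normalized duration, and
# the two remaining tuple entries); the rest are the flattened species and
# chemical feature vectors.
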
if __name__ == '__main__':
    args = arg_parser_ecotox()
    df = pd.read_csv(args.ecotox_file)
    if args.compute_feats == 1:
        # Discard the second return value (likely the embedding-matrix shape)
        sp_fmat, _, word_embeddings, word_embeddings_df = get_word_embedding(df)
        word_embeddings_df.to_csv(args.u_filename_raw, index=False)
        fingerprints = compute_fingerprint_all(df)
        fingerprints.to_csv(args.v_filename_raw, index=False)
    word_embeddings_df1 = pd.read_csv(args.u_filename_raw)
    fingerprints = pd.read_csv(args.v_filename_raw)
    dataset = EtoxDataset(args.ecotox_file, args.conc_threshold, args.u_filename_raw, args.v_filename_raw)
    # Define sizes for the train, validation and test splits (80/10/10)
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    # Extract the word-embedding and fingerprint feature columns as arrays
    spec_feats_arr = dataset.spec_feats.loc[:, 'we_0':'we_' + str(word_embeddings_df1.shape[1] - 2)].values
    chem_feats_arr = dataset.chem_feats.loc[:, 'fp_0':'fp_' + str(fingerprints.shape[1] - 2)].values
    if args.save_feats == 1:
        np.save(args.u_filename, spec_feats_arr)
        np.save(args.v_filename, chem_feats_arr)
    # manual_seed(0) for split 1, manual_seed(10) for split 2 and manual_seed(100) for split 3
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(0))
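    # Note: random_split returns torch.utils.data.Subset objects, which the
    # normalization helpers above consume by plain iteration.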
    # Fit the scalers on the training split only, then apply them to all splits
    scaler_d, scaler_sp, scaler_ch = func_normalize_fit(train_dataset)
    train_data_normalized = func_normalize_transform(scaler_d, scaler_sp, scaler_ch, train_dataset)
    val_data_normalized = func_normalize_transform(scaler_d, scaler_sp, scaler_ch, val_dataset)
    test_data_normalized = func_normalize_transform(scaler_d, scaler_sp, scaler_ch, test_dataset)
    # If needed - but memory intensive: save the full normalized splits with pickle
    # with open(args.train_norm_pkl, 'wb') as file:
    #     pickle.dump(train_data_normalized, file, protocol=4)
    # with open(args.val_norm_pkl, 'wb') as file:
    #     pickle.dump(val_data_normalized, file, protocol=4)
    # with open(args.test_norm_pkl, 'wb') as file:
    #     pickle.dump(test_data_normalized, file, protocol=4)
    # Prepare the data splits with normalized features as np.array
    train_select = func_prepare_data(train_data_normalized)
    val_select = func_prepare_data(val_data_normalized)
    test_select = func_prepare_data(test_data_normalized)
    # If needed - but memory intensive:
    # save all data including concatenated features
    # np.savetxt(args.train_norm_all_file, train_select, delimiter=',')
    # np.savetxt(args.val_norm_all_file, val_select, delimiter=',')
    # np.savetxt(args.test_norm_all_file, test_select, delimiter=',')
    # Save only the interactions and duration (edge feature)
    np.savetxt(args.train_file, train_select[:, :5], delimiter=',')
    np.savetxt(args.val_file, val_select[:, :5], delimiter=',')
    np.savetxt(args.test_file, test_select[:, :5], delimiter=',')
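    # The saved csv splits can be reloaded later with, e.g.,
    # np.loadtxt(args.train_file, delimiter=',').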