# partitioning_data.py
"""
Adapted from https://github.com/hadarser/SetToGraphPaper/blob/master/dataloaders/jets_loader.py (Apache-2.0 License)
"""
import os
from datetime import datetime

import numpy as np
import torch
import uproot
from torch.utils.data import DataLoader, Dataset, Sampler

MAX_EDGES = 10
data_dir = 'data/'
node_features_list = ['trk_d0', 'trk_z0', 'trk_phi', 'trk_ctgtheta', 'trk_pt', 'trk_charge']
jet_features_list = ['jet_pt', 'jet_eta', 'jet_phi', 'jet_M']
true_vtx_list = ['true_vtx_x', 'true_vtx_y', 'true_vtx_z']


def get_data_loader(which_set, batch_size, debug_load=False):
    """Build a DataLoader whose batches contain jets of equal track count."""
    jets_data = JetGraphDataset(which_set, debug_load)
    batch_sampler = JetsBatchSampler(jets_data.n_nodes, batch_size)
    return DataLoader(jets_data, batch_sampler=batch_sampler)
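
# A minimal usage sketch (hedged: it assumes the ROOT files described below
# exist under data/{train,validation,test}/):
#
#     train_loader = get_data_loader('train', batch_size=128)
#     for sets, partitions, targets in train_loader:
#         # sets: (B, n_tracks, 10); partitions: (B, n_tracks);
#         # targets: (B, MAX_EDGES, n_tracks). All jets in a batch share n_tracks.
#         ...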


def transform_features(transform_list, arr):
    """Standardize each feature row of arr with the matching (mean, std) pair."""
    new_arr = np.zeros_like(arr)
    for feat_i, (mean, std) in enumerate(transform_list):
        new_arr[feat_i, :] = (arr[feat_i, :] - mean) / std
    return new_arr
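
# A tiny, hypothetical check of transform_features (not part of the original
# pipeline): each row of arr is one feature, standardized row-wise.
#
#     stats = [(0.0, 2.0), (1.0, 0.5)]
#     arr = np.array([[2.0, 4.0], [1.5, 2.0]])
#     transform_features(stats, arr)
#     # -> [[1.0, 2.0],   row 0: (x - 0.0) / 2.0
#     #     [1.0, 2.0]]   row 1: (x - 1.0) / 0.5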


class JetGraphDataset(Dataset):
    def __init__(self, which_set, debug_load=False, random_permutation=True):
"""
Initialization
:param which_set: either "train", "validation" or "test"
:param debug_load: if True, will load only a small subset
:param random_permutation: if True, apply random permutation to the order of the nodes/vertices.
"""
assert which_set in ['train', 'validation', 'test']
fname = {'train': 'training', 'validation': 'valid', 'test': 'test'}
self.random_permutation = random_permutation
self.filename = os.path.join(data_dir, which_set, fname[which_set]+'_data.root')
        # Note: this file uses the uproot3 API (tree.numentries, byte-string
        # branch names); uproot4+ renamed these (num_entries, plain-str keys).
        with uproot.open(self.filename) as f:
            tree = f['tree']
            self.n_jets = int(tree.numentries)
            self.n_nodes = np.array([len(x) for x in tree.array(b'trk_vtx_index')])
            self.jet_arrays = tree.arrays(
                jet_features_list + node_features_list + true_vtx_list + ['trk_vtx_index'])
self.sets, self.partitions, self.partitions_as_graphs = [], [], []
if debug_load:
self.n_jets = 100
self.n_nodes = self.n_nodes[:100]
start_load = datetime.now()
for set_, partition, partition_as_graph in self.get_all_items():
if torch.cuda.is_available():
set_ = torch.tensor(set_, dtype=torch.float, device='cuda')
partition = torch.tensor(partition, dtype=torch.long, device='cuda')
partition_as_graph = torch.tensor(partition_as_graph, dtype=torch.float, device='cuda')
self.sets.append(set_)
self.partitions.append(partition)
self.partitions_as_graphs.append(partition_as_graph)
        if not torch.cuda.is_available():
            # Jets have varying track counts, so these lists are ragged;
            # recent NumPy requires dtype=object for ragged input.
            self.sets = np.array(self.sets, dtype=object)
            self.partitions = np.array(self.partitions, dtype=object)
            self.partitions_as_graphs = np.array(self.partitions_as_graphs, dtype=object)
        print(f'Loaded {self.n_jets} jets in {str(datetime.now() - start_load).split(".")[0]}', flush=True)

    def __len__(self):
"""Returns the length of the dataset"""
return self.n_jets

    def get_all_items(self):
        """Yield one (set, partition, incidence target) triple per jet."""
node_feats = np.array([np.asarray(self.jet_arrays[str.encode(x)]) for x in node_features_list])
jet_feats = np.array([np.asarray(self.jet_arrays[str.encode(x)]) for x in jet_features_list])
vtx_feats = np.array([np.asarray(self.jet_arrays[str.encode(x)]) for x in true_vtx_list])
n_labels = np.array(self.jet_arrays[b'trk_vtx_index'])
for i in range(self.n_jets):
n_nodes = self.n_nodes[i]
node_feats_i = np.stack(node_feats[:, i], axis=0) # shape (6, n_nodes)
jet_feats_i = jet_feats[:, i] # shape (4, )
jet_feats_i = jet_feats_i[:, np.newaxis] # shape (4, 1)
vtx_feats_i = np.stack(vtx_feats[:, i], axis=0)
node_feats_i = transform_features(FeatureTransform.node_feature_transform_list, node_feats_i)
jet_feats_i = transform_features(FeatureTransform.jet_features_transform_list, jet_feats_i)
vtx_feats_i = transform_features(FeatureTransform.true_vtx_transform_list, vtx_feats_i)
jet_feats_i = np.repeat(jet_feats_i, n_nodes, axis=1) # change shape to (4, n_nodes)
set_i = np.concatenate([node_feats_i, jet_feats_i]).T # shape (n_nodes, 10)
partition_i = n_labels[i]
incidence_i = (np.arange(MAX_EDGES)[:,np.newaxis] == n_labels[i]).astype("float")
            # Pad true-vertex features to MAX_EDGES rows; computed here but not
            # part of the yielded triple below.
            vtx_feats_i = np.concatenate((
                vtx_feats_i.T, np.zeros((MAX_EDGES - vtx_feats_i.shape[1], 3))
            ))
            if self.random_permutation:
                perm = np.random.permutation(n_nodes)
                set_i = set_i[perm]  # permute the track order consistently
                partition_i = partition_i[perm]
                incidence_i = incidence_i[:, perm]
            target_i = incidence_i
yield set_i, partition_i, target_i
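
    # Illustration of the incidence construction above, with hypothetical
    # labels (not taken from the data):
    #
    #     labels = np.array([0, 2, 0])  # vertex id of each track
    #     (np.arange(4)[:, np.newaxis] == labels).astype("float")
    #     # -> [[1., 0., 1.],   vertex 0 contains tracks 0 and 2
    #     #     [0., 0., 0.],
    #     #     [0., 1., 0.],   vertex 2 contains track 1
    #     #     [0., 0., 0.]]
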
def __getitem__(self, idx):
"""Generates a single instance of data"""
return self.sets[idx], self.partitions[idx], self.partitions_as_graphs[idx]


class JetsBatchSampler(Sampler):
def __init__(self, n_nodes_array, batch_size):
"""
Initialization
:param n_nodes_array: array of sizes of the jets
:param batch_size: batch size
"""
        super().__init__(n_nodes_array.size)  # the base Sampler does not use this argument
self.dataset_size = n_nodes_array.size
self.batch_size = batch_size
self.index_to_batch = {}
self.node_size_idx = {}
        running_idx = -1
        for n_nodes_i in set(n_nodes_array):
            if n_nodes_i <= 1:
                continue  # skip single-track jets: there is nothing to partition
            self.node_size_idx[n_nodes_i] = np.where(n_nodes_array == n_nodes_i)[0]
            n_of_size = len(self.node_size_idx[n_nodes_i])
            # ceil instead of truncating float division, so no batch exceeds batch_size
            n_batches = max(int(np.ceil(n_of_size / self.batch_size)), 1)
            self.node_size_idx[n_nodes_i] = np.array_split(
                np.random.permutation(self.node_size_idx[n_nodes_i]), n_batches)
for batch in self.node_size_idx[n_nodes_i]:
running_idx += 1
self.index_to_batch[running_idx] = batch
self.n_batches = running_idx + 1

    def __len__(self):
return self.n_batches

    def __iter__(self):
        batch_order = np.random.permutation(self.n_batches)
for i in batch_order:
yield self.index_to_batch[i]
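
# Bucketing behaviour of JetsBatchSampler, sketched with hypothetical sizes:
# for n_nodes_array = [3, 3, 5, 3, 5] and batch_size = 2, it yields index
# batches such as [0, 1], [3] (the 3-track jets) and [2, 4] (the 5-track jets)
# in random order, so every batch contains jets of a single track count.
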
class FeatureTransform(object):
    # (mean, std) pairs computed on the TRAINING set only, listed in the same
    # order as node_features_list, jet_features_list and true_vtx_list above.
node_feature_transform_list = [
(0.0006078152, 14.128961),
(0.0038490593, 10.688491),
(-0.0026713554, 1.8167108),
(0.0047640945, 1.889725),
(5.237357, 7.4841413),
(-0.00015662189, 1.0)]
jet_features_transform_list = [
(75.95093, 49.134453),
(0.0022607117, 1.2152709),
(-0.0023569583, 1.8164033),
(9.437994, 6.765137)]
true_vtx_transform_list = [
(0.15225221, 91.49316),
(-0.013348891, 91.526245),
(0.09531866, 221.47397)
]
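

if __name__ == '__main__':
    # Hedged smoke test: it only runs end-to-end if the expected ROOT files
    # are present under data/; debug_load restricts loading to 100 jets.
    loader = get_data_loader('train', batch_size=32, debug_load=True)
    sets, partitions, targets = next(iter(loader))
    print('sets:', tuple(sets.shape))              # (B, n_tracks, 10)
    print('partitions:', tuple(partitions.shape))  # (B, n_tracks)
    print('targets:', tuple(targets.shape))        # (B, MAX_EDGES, n_tracks)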