From 058672e20d02a5073609f9dfa4ac0a1611ec9efb Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 2 Aug 2022 15:30:58 -0700 Subject: [PATCH 01/52] Trainling spaces removed --- mlreco/models/layers/common/momentum.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mlreco/models/layers/common/momentum.py b/mlreco/models/layers/common/momentum.py index d0d1c04f..ac9d4c11 100644 --- a/mlreco/models/layers/common/momentum.py +++ b/mlreco/models/layers/common/momentum.py @@ -44,8 +44,8 @@ class VertexNet(MomentumNet): Small MLP for handling vertex regression and particle primary prediction. ''' def __init__(self, num_input, num_output=1, num_hidden=128): - super(VertexNet, self).__init__(num_input, num_output=num_output, - num_hidden=num_hidden, + super(VertexNet, self).__init__(num_input, num_output=num_output, + num_hidden=num_hidden, positive_outputs=False) def forward(self, x): # if x.shape[0] > 1: @@ -72,7 +72,7 @@ class DeepVertexNet(nn.Module): node_y = torch.randn(16, 5) edge_feature_x2y = net(node_x, node_y) # (16, 5) ''' - def __init__(self, num_input, num_output=1, num_hidden=512, num_layers=5, + def __init__(self, num_input, num_output=1, num_hidden=512, num_layers=5, positive_outputs=False): super(DeepVertexNet, self).__init__() self.linear = nn.ModuleList() @@ -104,7 +104,7 @@ def forward(self, x): class EvidentialMomentumNet(nn.Module): - def __init__(self, num_input, num_output=4, + def __init__(self, num_input, num_output=4, num_hidden=128, eps=0.0, logspace=False): super(EvidentialMomentumNet, self).__init__() self.linear1 = nn.Linear(num_input, num_hidden) @@ -137,10 +137,10 @@ def forward(self, x): vab = self.softplus(x[:, :3]) + self.eps alpha = torch.clamp(vab[:, 1] + 1.0, min=1.0).view(-1, 1) gamma = 2.0 * self.gamma(x[:, 3]).view(-1, 1) - out = torch.cat([gamma, vab[:, 0].view(-1, 1), + out = torch.cat([gamma, vab[:, 0].view(-1, 1), alpha, vab[:, 2].view(-1, 1)], dim=1) if not self.logspace: evidence = torch.clamp(out, min=self.eps) else: - evidence = out - return evidence \ No newline at end of file + evidence = out + return evidence From fbb299e2a48e2b88766da3841d5720a9da94cb2b Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 2 Aug 2022 17:10:09 -0700 Subject: [PATCH 02/52] Handle empty edge_index case gracefully in the edge_assignment_score function --- mlreco/utils/gnn/evaluation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mlreco/utils/gnn/evaluation.py b/mlreco/utils/gnn/evaluation.py index eb0b0346..978d4f34 100644 --- a/mlreco/utils/gnn/evaluation.py +++ b/mlreco/utils/gnn/evaluation.py @@ -199,7 +199,7 @@ def grouping_loss(pred_mat: nb.float32[:], @nb.njit(cache=True) def edge_assignment_score(edge_index: nb.int64[:,:], edge_scores: nb.float32[:,:], - n: nb.int64) -> (nb.int64[:,:], nb.float32): + n: nb.int64) -> (nb.int64[:,:], nb.int64[:], nb.float32): """ Function that finds the graph that produces the lowest grouping score iteratively adding the most likely edges, @@ -208,11 +208,16 @@ def edge_assignment_score(edge_index: nb.int64[:,:], Args: edge_index (np.ndarray) : (E,2) Incidence matrix edge_scores (np.ndarray): (E,2) Two-channel edge score - n (int) : Total number of clusters C + n (int) : Total number of clusters C Returns: np.ndarray: (E',2) Optimal incidence matrix + np.ndarray: (C) Optimal group ID for each node float : Score for the optimal incidence matrix """ + # If there is no edge, do not bother + if not len(edge_index): + return np.empty((2,0), 
dtype=np.int64), np.zeros(n, dtype=np.int64), 0. + # Build an input adjacency matrix to constrain the edge selection to the input graph adj_mat = adjacency_matrix(edge_index, n) From 113dec6435afab1be2241c4e5f9ed755e32092e1 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 2 Aug 2022 23:40:42 -0700 Subject: [PATCH 03/52] Typing bug fix for images with no particles in cluster parser --- mlreco/iotools/parsers/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/iotools/parsers/cluster.py b/mlreco/iotools/parsers/cluster.py index cb2dab54..d139f301 100644 --- a/mlreco/iotools/parsers/cluster.py +++ b/mlreco/iotools/parsers/cluster.py @@ -146,7 +146,7 @@ def parse_cluster3d(cluster_event, labels['vtx_x'] = np.array([p.ancestor_position().x() for p in particles_asis_v]) labels['vtx_y'] = np.array([p.ancestor_position().y() for p in particles_asis_v]) labels['vtx_z'] = np.array([p.ancestor_position().z() for p in particles_asis_v]) - labels['primary_group'] = np.array((nu_ids > 0) & np.array([p.group_id()==p.parent_id() for p in particles_v]), dtype=np.float32) + labels['primary_group'] = np.array((nu_ids > 0) & np.array([p.group_id()==p.parent_id() for p in particles_v], dtype=bool), dtype=np.float32) labels['sem'] = np.array([p.shape() for p in particles_v]) # Loop over clusters, store info From d1a410580c8c439af63efb2f810dc5971a0afe9b Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Fri, 5 Aug 2022 09:44:18 -0700 Subject: [PATCH 04/52] Fix segfault related to empty input to UResNet in the full chian --- mlreco/models/full_chain.py | 4 ++-- mlreco/models/uresnet.py | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/mlreco/models/full_chain.py b/mlreco/models/full_chain.py index bb523231..85576788 100644 --- a/mlreco/models/full_chain.py +++ b/mlreco/models/full_chain.py @@ -213,7 +213,7 @@ def full_chain_cnn(self, input): input[0][deghost, 4] = charges result.update({'input_rescaled':[input[0][deghost,:5]]}) if self.enable_uresnet: - if self.enable_charge_rescaling: + if self.enable_charge_rescaling and deghost.sum() > 0: assert not self.uresnet_lonely.ghost result.update(self.uresnet_lonely([input[0][deghost, :4+self.input_features]])) else: @@ -232,7 +232,7 @@ def full_chain_cnn(self, input): ppn_input['decoderTensors'][0]) result.update(ppn_output) - if self.enable_charge_rescaling: + if self.enable_charge_rescaling and deghost.sum() > 0: # Reshape output tensors of UResNet and PPN to be of the original shape for key in ['segmentation', 'points', 'classify_endpoints', 'mask_ppn', 'ppn_coords', 'ppn_layers']: res = result[key][0] if isinstance(result[key][0], torch.Tensor) else result[key][0][-1] diff --git a/mlreco/models/uresnet.py b/mlreco/models/uresnet.py index c103f5cc..337ecc43 100644 --- a/mlreco/models/uresnet.py +++ b/mlreco/models/uresnet.py @@ -282,19 +282,19 @@ def forward(self, result, label, weights=None): if self._ghost: results = { - 'accuracy': uresnet_acc/count, - 'loss': (self._alpha * uresnet_loss + self._beta * mask_loss)/count, - 'ghost_mask_acc': mask_acc / count, - 'ghost_mask_loss': self._beta * mask_loss / count, - 'uresnet_loss': self._alpha * uresnet_loss / count, - 'uresnet_acc': uresnet_acc / count, - 'ghost2ghost': ghost2ghost / count, - 'nonghost2nonghost': nonghost2nonghost / count + 'accuracy': uresnet_acc/count if count else 0., + 'loss': (self._alpha * uresnet_loss + self._beta * mask_loss)/count if count else (self._alpha * uresnet_loss + self._beta * mask_loss), + 
'ghost_mask_acc': mask_acc / count if count else 0., + 'ghost_mask_loss': self._beta * mask_loss / count if count else self._beta * mask_los, + 'uresnet_loss': self._alpha * uresnet_loss / count if count else self._alpha * uresnet_loss, + 'uresnet_acc': uresnet_acc / count if count else 0., + 'ghost2ghost': ghost2ghost / count if count else 0., + 'nonghost2nonghost': nonghost2nonghost / count if count else 0. } else: results = { - 'accuracy': uresnet_acc/count, - 'loss': uresnet_loss/count + 'accuracy': uresnet_acc/count if count else 0., + 'loss': uresnet_loss/count if count else uresnet_loss } for c in range(self._num_classes): if count_class[c] > 0: From e356ec3aa37053cb13571d3d4fbdc89222030f44 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 11 Aug 2022 00:33:54 -0700 Subject: [PATCH 05/52] Fixed segfault in edge_assignment_score when the edge_index is empty --- mlreco/models/full_chain.py | 4 +++- mlreco/utils/gnn/data.py | 2 +- mlreco/utils/gnn/evaluation.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mlreco/models/full_chain.py b/mlreco/models/full_chain.py index 85576788..e1334d32 100644 --- a/mlreco/models/full_chain.py +++ b/mlreco/models/full_chain.py @@ -235,6 +235,7 @@ def full_chain_cnn(self, input): if self.enable_charge_rescaling and deghost.sum() > 0: # Reshape output tensors of UResNet and PPN to be of the original shape for key in ['segmentation', 'points', 'classify_endpoints', 'mask_ppn', 'ppn_coords', 'ppn_layers']: + if key not in result: continue res = result[key][0] if isinstance(result[key][0], torch.Tensor) else result[key][0][-1] tensor = torch.zeros((input[0].shape[0], res.shape[1]), dtype=res.dtype, device=res.device) tensor[deghost] = res @@ -242,7 +243,8 @@ def full_chain_cnn(self, input): result[key][0] = tensor else: result[key][0][-1] = tensor - result['ppn_output_coordinates'][0] = input[0][:,:4].type(result['ppn_output_coordinates'][0].dtype) + if 'ppn_output_coordinates' in result: + result['ppn_output_coordinates'][0] = input[0][:,:4].type(result['ppn_output_coordinates'][0].dtype) # The rest of the chain only needs 1 input feature if self.input_features > 1: diff --git a/mlreco/utils/gnn/data.py b/mlreco/utils/gnn/data.py index e71982c3..c8b2d8f2 100644 --- a/mlreco/utils/gnn/data.py +++ b/mlreco/utils/gnn/data.py @@ -304,7 +304,7 @@ def split_edge_index(edge_index: nb.int64[:,:], """ # If the input is empty, simply return defaults if not edge_index.shape[1]: - return [np.empty((2,0), dtype=np.int64) for b in batches], [np.empty(0, dtype=np.int64) for b in batches] + return [np.empty((0,2), dtype=np.int64) for b in batches], [np.empty(0, dtype=np.int64) for b in batches] # For each batch ID, get the list of edges that belong to it ebids = [np.where(batch_ids[edge_index[0]] == b)[0] for b in batches] diff --git a/mlreco/utils/gnn/evaluation.py b/mlreco/utils/gnn/evaluation.py index 978d4f34..fca4327a 100644 --- a/mlreco/utils/gnn/evaluation.py +++ b/mlreco/utils/gnn/evaluation.py @@ -216,7 +216,7 @@ def edge_assignment_score(edge_index: nb.int64[:,:], """ # If there is no edge, do not bother if not len(edge_index): - return np.empty((2,0), dtype=np.int64), np.zeros(n, dtype=np.int64), 0. + return np.empty((0,2), dtype=np.int64), np.zeros(n, dtype=np.int64), 0. 
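# [Illustrative sketch added for context; not part of the original patch.]
# Why the empty default is shaped (0, 2) rather than (2, 0): the function's documented
# return value is an (E', 2) incidence matrix and len() counts rows, so only the
# (0, 2) form reads as "zero edges" to a check like `if not len(edge_index)`.
# Minimal standalone check, assuming only numpy:
import numpy as np
assert len(np.empty((0, 2), dtype=np.int64)) == 0   # zero edges, as intended
assert len(np.empty((2, 0), dtype=np.int64)) == 2   # would read as two edge rows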
# Build an input adjacency matrix to constrain the edge selection to the input graph adj_mat = adjacency_matrix(edge_index, n) From 5b01f00fe6fc6e8fad341bb2e4d2306dd79f9945 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 24 Aug 2022 15:43:51 -0700 Subject: [PATCH 06/52] Heuristic patch for shower start point prediction --- mlreco/utils/gnn/data.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mlreco/utils/gnn/data.py b/mlreco/utils/gnn/data.py index e71982c3..8166a6d1 100644 --- a/mlreco/utils/gnn/data.py +++ b/mlreco/utils/gnn/data.py @@ -209,15 +209,13 @@ def _get_extra_gnn_features(fragments, end_points = get_track_endpoints_geo(input[0], f, points_tensor) ppn_points = torch.cat((ppn_points, end_points.reshape(1,-1)), dim=0) else: - dmask = torch.nonzero(torch.max( - torch.abs(points_tensor[f,:3]), dim=1).values < 1., - as_tuple=True)[0] + scores = torch.softmax(points_tensor[f, -2:], dim=1)[:,-1] # scores = torch.sigmoid(points_tensor[f, -1]) - # argmax = dmask[torch.argmax(scores[dmask])] \ - # if len(dmask) else torch.argmax(scores) - scores = torch.softmax(points_tensor[f, -2:], dim=1) - argmax = dmask[torch.argmax(scores[dmask, -1])] \ - if len(dmask) else torch.argmax(scores[:, -1]) + dmask = torch.nonzero((scores > 0.5) & (torch.max( + torch.abs(points_tensor[f,:3]), dim=1).values < 1.), + as_tuple=True)[0] + argmax = dmask[torch.argmax(scores[dmask])] \ + if len(dmask) else torch.argmax(scores) start = input[0][f][argmax,1:4] + \ points_tensor[f][argmax,:3] + 0.5 ppn_points = torch.cat((ppn_points, @@ -261,7 +259,7 @@ def split_clusts(clusts, batch_ids, batches, counts): [np.ndarray] : (B) List of cluster IDs in each batch """ clusts_split, cbids = _split_clusts(clusts, batch_ids, batches, counts) - + # Cast the list of clusters to np.array (object type) same_length = [np.all([len(c) == len(bclusts[0]) for c in bclusts]) for bclusts in clusts_split] return [np.array(clusts_split[b], dtype=object if not sl else np.int64) for b, sl in enumerate(same_length)], cbids From a0cc9ba7d5302de2b030703074922fe842cbf285 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 25 Aug 2022 15:49:20 -0700 Subject: [PATCH 07/52] Changed default GrapPA accuracy from 0 to 1 (e.g. 
no node -> accuracy of 1) --- .../models/layers/gnn/losses/edge_channel.py | 12 +--- .../layers/gnn/losses/node_kinematics.py | 63 +++++-------------- .../models/layers/gnn/losses/node_primary.py | 12 +--- mlreco/models/layers/gnn/losses/node_type.py | 12 +--- 4 files changed, 22 insertions(+), 77 deletions(-) diff --git a/mlreco/models/layers/gnn/losses/edge_channel.py b/mlreco/models/layers/gnn/losses/edge_channel.py index 713f0da8..12b587d0 100644 --- a/mlreco/models/layers/gnn/losses/edge_channel.py +++ b/mlreco/models/layers/gnn/losses/edge_channel.py @@ -172,16 +172,8 @@ def forward(self, out, clusters, graph=None): # Increment the number of edges n_edges += len(edge_pred) - # Handle the case where no cluster/edge were found - if not n_edges: - return { - 'accuracy': 0., - 'loss': torch.tensor(0., requires_grad=True, device=clusters[0].device), - 'n_edges': n_edges - } - return { - 'accuracy': total_acc/n_edges, - 'loss': total_loss/n_edges, + 'accuracy': total_acc/n_edges if n_edges else 1., + 'loss': total_loss/n_edges if n_edges else torch.tensor(0., requires_grad=True, device=clusters[0].device), 'n_edges': n_edges } diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index cc8f2e5a..23afe964 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -146,8 +146,7 @@ def forward(self, out, types): compute_momentum = False compute_vtx = 'node_pred_vtx' in out - anchors_list = [] - vertex_labels = [] + vtx_anchors, vtx_labels = [], [] for i in range(len(types)): @@ -269,7 +268,7 @@ def forward(self, out, types): vtx_label = torch.tensor(node_assn_vtx[valid_mask_vtx][pos_mask_vtx], dtype=node_pred_vtx.dtype, device=node_pred_vtx.device) if self.normalize_vtx_label: # If requested, bring vertex labels in the range [0,1 ] vtx_label = vtx_label/self.spatial_size - vertex_labels.append(vtx_label.detach().cpu().numpy()) + vtx_labels.append(vtx_label.detach().cpu().numpy()) vtx_pred = node_pred_vtx[pos_mask_vtx,:3] if self.use_anchor_points: # If requested, predict positions with respect to anchor points (end points of particles) @@ -278,7 +277,7 @@ def forward(self, out, types): min_dist = torch.argmin(dist_to_anchor, dim=1) range_index = torch.arange(end_points.shape[0]).to(device=end_points.device).long() anchors = end_points[range_index, min_dist, :] - anchors_list.append(anchors.detach().cpu().numpy()) + vtx_anchors.append(anchors.detach().cpu().numpy()) vtx_pred = vtx_pred + anchors loss2 = torch.mean(torch.clamp(torch.sum(self.vtx_position_loss(vtx_pred, vtx_label), dim=1), @@ -293,7 +292,7 @@ def forward(self, out, types): n_clusts_vtx += len(valid_mask_vtx) n_clusts_vtx_pos += len(pos_mask_vtx) else: - vertex_labels.append(np.empty((0,3))) + vtx_labels.append(np.empty((0,3))) if self.use_anchor_points: anchors.append(np.empty((0,3))) # Compute the accuracy of assignment (fraction of correctly assigned nodes) @@ -310,63 +309,33 @@ def forward(self, out, types): n_clusts = n_clusts_type + n_clusts_momentum + n_clusts_vtx + n_clusts_vtx_pos - # Handle the case where no cluster/edge were found - if not n_clusts: - result = { - 'accuracy': 0., - 'loss': torch.tensor(0., requires_grad=True, device=types[0].device if len(types) and torch.is_tensor(types[0]) else 'cpu', dtype=torch.float), - 'n_clusts_momentum': n_clusts_momentum, - 'n_clusts_type': n_clusts_type, - 'n_clusts_vtx': n_clusts_vtx, - 'n_clusts_vtx_positives': n_clusts_vtx_pos - } - if 
compute_type: - result.update({ - 'type_loss': 0., - 'type_accuracy': 0., - }) - if compute_momentum: - result.update({ - 'p_loss': 0., - 'p_accuracy': 0., - }) - if compute_vtx: - result.update({ - 'vtx_position_loss': 0., - 'vtx_score_loss': 0., - 'vtx_position_acc': 0., - 'vtx_score_acc': 0., - }) - return result - result = { - 'accuracy': (type_acc+p_acc+vtx_position_acc+vtx_score_acc)/n_clusts, - 'loss': total_loss/n_clusts, + 'accuracy': (type_acc + p_acc + vtx_position_acc + vtx_score_acc)/n_clusts if n_clusts else 1., + 'loss': total_loss/n_clusts if n_clusts else torch.tensor(0., requires_grad=True, device=types[0].device, dtype=torch.float), 'n_clusts_momentum': n_clusts_momentum, 'n_clusts_type': n_clusts_type, 'n_clusts_vtx': n_clusts_vtx, 'n_clusts_vtx_positives': n_clusts_vtx_pos } - result['anchors'] = anchors_list - result['vertex_labels'] = vertex_labels - if compute_type: result.update({ - 'type_accuracy': 0. if not n_clusts_type else type_acc/n_clusts_type, - 'type_loss': 0. if not n_clusts_type else type_loss/n_clusts_type, + 'type_accuracy': type_acc/n_clusts_type if n_clusts_type else 1., + 'type_loss': type_loss/n_clusts_type if n_clusts_type else 0. }) if compute_momentum: result.update({ - 'p_accuracy': 0. if not n_clusts_momentum else p_acc/n_clusts_momentum, - 'p_loss': 0. if not n_clusts_momentum else p_loss/n_clusts_momentum, + 'p_accuracy': p_acc/n_clusts_momentum if n_clusts_momentum else 1., + 'p_loss': p_loss/n_clusts_momentum if p_loss else 0. }) if compute_vtx: result.update({ - 'vtx_score_loss': 0. if not n_clusts_vtx else vtx_score_loss/n_clusts_vtx, - 'vtx_score_acc': 0. if not n_clusts_vtx else vtx_score_acc/n_clusts_vtx, - 'vtx_position_loss': 0. if not n_clusts_vtx_pos else vtx_position_loss/n_clusts_vtx_pos, - 'vtx_position_acc': 0. if not n_clusts_vtx_pos else vtx_position_acc/n_clusts_vtx_pos, + 'vtx_anchors': vtx_anchors, + 'vtx_labels': vtx_labels, + 'vtx_score_loss': vtx_score_loss/n_clusts_vtx if n_clusts_vtx else 0., + 'vtx_score_acc': vtx_score_acc/n_clusts_vtx if n_clusts_vtx else 1., + 'vtx_position_loss': vtx_position_loss/n_clusts_vtx_pos if n_clusts_vtx_pos else 0., + 'vtx_position_acc': vtx_position_acc/n_clusts_vtx_pos if n_clusts_vtx_pos else 1. 
}) return result diff --git a/mlreco/models/layers/gnn/losses/node_primary.py b/mlreco/models/layers/gnn/losses/node_primary.py index 8629c242..73d91520 100644 --- a/mlreco/models/layers/gnn/losses/node_primary.py +++ b/mlreco/models/layers/gnn/losses/node_primary.py @@ -127,16 +127,8 @@ def forward(self, out, clusters): # Increment the number of nodes n_clusts += len(clusts) - # Handle the case where no cluster/edge were found - if not n_clusts: - return { - 'accuracy': 0., - 'loss': torch.tensor(0., requires_grad=True, device=clusters[0].device), - 'n_clusts': n_clusts - } - return { - 'accuracy': total_acc/n_clusts, - 'loss': total_loss/n_clusts, + 'accuracy': total_acc/n_clusts if n_clusts else 1., + 'loss': total_loss/n_clusts if n_clusts else torch.tensor(0., requires_grad=True, device=clusters[0].device), 'n_clusts': n_clusts } diff --git a/mlreco/models/layers/gnn/losses/node_type.py b/mlreco/models/layers/gnn/losses/node_type.py index b3554da5..a6839c68 100644 --- a/mlreco/models/layers/gnn/losses/node_type.py +++ b/mlreco/models/layers/gnn/losses/node_type.py @@ -105,16 +105,8 @@ def forward(self, out, types): # Increment the number of nodes n_clusts += len(node_mask) - # Handle the case where no cluster/edge were found - if not n_clusts: - return { - 'accuracy': 0., - 'loss': torch.tensor(0., requires_grad=True, device=types[0].device), - 'n_clusts': n_clusts - } - return { - 'accuracy': total_acc/n_clusts, - 'loss': total_loss/n_clusts, + 'accuracy': total_acc/n_clusts if n_clusts else 1., + 'loss': total_loss/n_clusts if n_clusts else torch.tensor(0., requires_grad=True, device=types[0].device), 'n_clusts': n_clusts } From 24d67ff6e68939594c0c2f82a1060ab549c42d85 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 1 Sep 2022 13:22:14 -0700 Subject: [PATCH 08/52] Fixed bug in label end points used in the standalone GrapPA model --- mlreco/utils/gnn/cluster.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlreco/utils/gnn/cluster.py b/mlreco/utils/gnn/cluster.py index 46a44a84..7136f14d 100644 --- a/mlreco/utils/gnn/cluster.py +++ b/mlreco/utils/gnn/cluster.py @@ -373,12 +373,12 @@ def _get_cluster_points_label(data: nb.float64[:,:], # Get start and end points (one and the same for all but track class) batch_ids = _get_cluster_batch(data, clusts) points = np.empty((len(clusts), 6), dtype=data.dtype) - for i, c in enumerate(clusts): # Here clusters are groups + for i, c in enumerate(clusts): batch_mask = np.where(particles[:,batch_col] == batch_ids[i])[0] clust_ids = np.unique(data[c, 5]).astype(np.int64) minid = np.argmin(particles[batch_mask][clust_ids,-2]) # Pick the first cluster in time - order = np.array([0, 1, 2, 4, 5, 6]) if (np.random.choice(2) or not random_order) else np.array([4, 5, 6, 0, 1, 2]) - points[i] = particles[batch_mask][clust_ids[minid]][order] + order = np.arange(6) if (np.random.choice(2) or not random_order) else np.array([3, 4, 5, 0, 1, 2]) + points[i] = particles[batch_mask][clust_ids[minid]][order+1] # The first column is the batch ID # Bring the start points to the closest point in the corresponding cluster for i, c in enumerate(clusts): From 382e529b808de5b166cbc9c06f1482fe1b5febd7 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 1 Sep 2022 23:29:11 -0700 Subject: [PATCH 09/52] Include MPR primaries into the primary particle target --- mlreco/iotools/parsers/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/iotools/parsers/cluster.py b/mlreco/iotools/parsers/cluster.py 
index d139f301..5f852e82 100644 --- a/mlreco/iotools/parsers/cluster.py +++ b/mlreco/iotools/parsers/cluster.py @@ -146,7 +146,7 @@ def parse_cluster3d(cluster_event, labels['vtx_x'] = np.array([p.ancestor_position().x() for p in particles_asis_v]) labels['vtx_y'] = np.array([p.ancestor_position().y() for p in particles_asis_v]) labels['vtx_z'] = np.array([p.ancestor_position().z() for p in particles_asis_v]) - labels['primary_group'] = np.array((nu_ids > 0) & np.array([p.group_id()==p.parent_id() for p in particles_v], dtype=bool), dtype=np.float32) + labels['primary_group'] = np.array([p.group_id()==p.parent_id() for p in particles_v], dtype=np.float32) labels['sem'] = np.array([p.shape() for p in particles_v]) # Loop over clusters, store info From d9a4e6dc75d35ddf4a78f4108ce8ef4a077df32f Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 8 Sep 2022 14:18:53 -0700 Subject: [PATCH 10/52] Fix in docstring of neutrino parser --- mlreco/iotools/parsers/particles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/iotools/parsers/particles.py b/mlreco/iotools/parsers/particles.py index 32934e0d..b29a7203 100644 --- a/mlreco/iotools/parsers/particles.py +++ b/mlreco/iotools/parsers/particles.py @@ -56,7 +56,7 @@ def parse_neutrino_asis(neutrino_event, cluster_event): schema: neutrino_asis: parser: parse_neutrino_asis - particle_asis: + args: neutrino_event: neutrino_mpv cluster_event: cluster3d_pcluster From 3aa4ec998de73aa4f3a0ae657d8e06c4eac3b669 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Fri, 9 Sep 2022 10:34:02 -0700 Subject: [PATCH 11/52] Include more details in UResNet+PPN log, harmonize PPN metric output --- mlreco/models/layers/common/gnn_full_chain.py | 6 +++--- mlreco/models/layers/common/ppnplus.py | 8 ++++---- mlreco/models/uresnet_ppn_chain.py | 10 ++++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 82f05ff1..a9aa8204 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -669,8 +669,8 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics for key in res_ppn: res['ppn_' + key] = res_ppn[key] - accuracy += res_ppn['ppn_acc'] - loss += self.ppn_weight*res_ppn['ppn_loss'] + accuracy += res_ppn['accuracy'] + loss += self.ppn_weight*res_ppn['loss'] if self.enable_ghost and (self.enable_cnn_clust or \ self.enable_gnn_track or \ @@ -895,7 +895,7 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics if self.enable_uresnet: print('Segmentation Accuracy: {:.4f}'.format(res_seg['accuracy'])) if self.enable_ppn: - print('PPN Accuracy: {:.4f}'.format(res_ppn['ppn_acc'])) + print('PPN Accuracy: {:.4f}'.format(res_ppn['accuracy'])) if self.enable_cnn_clust and ('graph' in out or 'embeddings' in out): if not self._enable_graph_spice: print('Clustering Embedding Accuracy: {:.4f}'.format(res_cnn_clust['accuracy'])) diff --git a/mlreco/models/layers/common/ppnplus.py b/mlreco/models/layers/common/ppnplus.py index b07c3f07..90bc06fc 100644 --- a/mlreco/models/layers/common/ppnplus.py +++ b/mlreco/models/layers/common/ppnplus.py @@ -436,7 +436,7 @@ def forward(self, result, segment_label, particles_label): 'mask_loss': 0., 'type_loss': 0., 'classify_endpoints_loss': 0., - 'classify_endpoints_acc': 0. + 'classify_endpoints_accuracy': 0. 
} # Semantic Segmentation Loss for igpu in range(len(segment_label)): @@ -560,7 +560,7 @@ def forward(self, result, segment_label, particles_label): acc_classify_endpoints = acc_point_class / point_class_count #total_loss += loss_classify_endpoints.float() res['classify_endpoints_loss'] += float(loss_classify_endpoints) / num_batches - res['classify_endpoints_acc'] += float(acc_classify_endpoints) / num_batches + res['classify_endpoints_accuracy'] += float(acc_classify_endpoints) / num_batches # --- end of Endpoint classification # Distance Loss @@ -579,6 +579,6 @@ def forward(self, result, segment_label, particles_label): total_loss += loss_gpu total_acc /= num_batches - res['ppn_loss'] = total_loss - res['ppn_acc'] = float(total_acc) + res['loss'] = total_loss + res['accuracy'] = float(total_acc) return res diff --git a/mlreco/models/uresnet_ppn_chain.py b/mlreco/models/uresnet_ppn_chain.py index 3ee0f616..9f8f24d0 100644 --- a/mlreco/models/uresnet_ppn_chain.py +++ b/mlreco/models/uresnet_ppn_chain.py @@ -142,9 +142,11 @@ def forward(self, outputs, segment_label, particles_label, weights=None): outputs, segment_label, particles_label) res = { - 'loss': res_segmentation['loss'] + res_ppn['ppn_loss'], - 'accuracy': (res_segmentation['accuracy'] + res_ppn['ppn_acc']) / 2.0, - 'reg_loss': res_ppn['reg_loss'], - 'type_loss': res_ppn['type_loss'] + 'loss': res_segmentation['loss'] + res_ppn['loss'], + 'accuracy': (res_segmentation['accuracy'] + res_ppn['accuracy'])/2 } + + res.update({'segmentation_'+k:v for k, v in res_segmentation.items()}) + res.update({'ppn_'+k:v for k, v in res_ppn.items()}) + return res From 5e8e490ebf44751d469009240d73ea46b099f919 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Fri, 9 Sep 2022 10:44:05 -0700 Subject: [PATCH 12/52] More semantic harmonization in the full chain metric output --- mlreco/models/layers/common/gnn_full_chain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index a9aa8204..e80d5618 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -658,7 +658,7 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics else: res_seg = self.uresnet_loss({'segmentation':[out['segmentation'][0][deghost]]}, [seg_label[0][deghost]]) for key in res_seg: - res['uresnet_' + key] = res_seg[key] + res['segmentation_' + key] = res_seg[key] accuracy += res_seg['accuracy'] loss += self.segmentation_weight*res_seg['loss'] #print('uresnet ', self.segmentation_weight, res_seg['loss'], loss) From f9ffc378929a78c023c2a6d94ee423762428a274 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Mon, 12 Sep 2022 09:17:25 -0700 Subject: [PATCH 13/52] Constrain full chain fragment end points to fragment voxel set --- mlreco/utils/gnn/data.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/mlreco/utils/gnn/data.py b/mlreco/utils/gnn/data.py index 65e21d88..acdeb9e2 100644 --- a/mlreco/utils/gnn/data.py +++ b/mlreco/utils/gnn/data.py @@ -163,7 +163,10 @@ def _get_extra_gnn_features(fragments, input, result, use_ppn=False, - use_supp=False): + use_supp=False, + enhance=False, + allow_outside=False, + coords_col=(1, 4)): """ Extracting extra features to feed into the GNN particle aggregators @@ -171,6 +174,14 @@ def _get_extra_gnn_features(fragments, end points for tracks (+ direction estimate) - Supplemental: Mean/RMS energy in 
the fragment + semantic class + If the `enhance` parameter is `True`, tracks leverage PPN predictions + to provide a more accurate estimate of the end points. This needs to be + avoided for track fragments, as PPN is not trained to find end points for them. + If set to `False`, the two voxels furthest away from each other are picked. + + If the `allow_outside` parameter is `True`, the end point estimates + are *not* brought back to the closest fragment voxel. + Parameters ========== fragments: np.ndarray @@ -180,6 +191,8 @@ def _get_extra_gnn_features(fragments, result: dictionary use_ppn: bool use_supp: bool + enhance: bool + allow_outside: bool Returns ======= @@ -205,9 +218,9 @@ def _get_extra_gnn_features(fragments, dtype=torch.double) points_tensor = result['points'][0].detach().double() for i, f in enumerate(fragments[mask]): + fragment_voxels = input[0][f][:,coords_col[0]:coords_col[1]] if frag_seg[mask][i] == 1: - end_points = get_track_endpoints_geo(input[0], f, points_tensor) - ppn_points = torch.cat((ppn_points, end_points.reshape(1,-1)), dim=0) + end_points = get_track_endpoints_geo(input[0], f, points_tensor if enhance else None) else: scores = torch.softmax(points_tensor[f, -2:], dim=1)[:,-1] # scores = torch.sigmoid(points_tensor[f, -1]) @@ -216,10 +229,15 @@ def _get_extra_gnn_features(fragments, as_tuple=True)[0] argmax = dmask[torch.argmax(scores[dmask])] \ if len(dmask) else torch.argmax(scores) - start = input[0][f][argmax,1:4] + \ - points_tensor[f][argmax,:3] + 0.5 - ppn_points = torch.cat((ppn_points, - torch.cat([start, start]).reshape(1,-1)), dim=0) + start = fragment_voxels[argmax] + points_tensor[f][argmax,:3] + 0.5 + end_points = torch.cat([start, start]) + + if not allow_outside: + dist_mat = torch.cdist(end_points.reshape(-1,3), fragment_voxels) + argmins = torch.argmin(dist_mat, dim=1) + end_points = torch.cat([fragment_voxels[argmins[0]], fragment_voxels[argmins[1]]]) + + ppn_points = torch.cat((ppn_points, end_points.reshape(1,-1)), dim=0) kwargs['points'] = ppn_points From 6871d05ca5a5919eb8dea25ceb3398a7255cf639 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Mon, 12 Sep 2022 13:39:16 -0700 Subject: [PATCH 14/52] Save time on track fragment end point heuristic --- mlreco/utils/gnn/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/utils/gnn/data.py b/mlreco/utils/gnn/data.py index acdeb9e2..cbb8c45a 100644 --- a/mlreco/utils/gnn/data.py +++ b/mlreco/utils/gnn/data.py @@ -232,7 +232,7 @@ def _get_extra_gnn_features(fragments, start = fragment_voxels[argmax] + points_tensor[f][argmax,:3] + 0.5 end_points = torch.cat([start, start]) - if not allow_outside: + if not allow_outside and (frag_seg[mask][i] != 1 or (frag_seg[mask][i] == 1 and enhance)): dist_mat = torch.cdist(end_points.reshape(-1,3), fragment_voxels) argmins = torch.argmin(dist_mat, dim=1) end_points = torch.cat([fragment_voxels[argmins[0]], fragment_voxels[argmins[1]]]) From aeae78a786c62f287b500350339e9f21b59b5261 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 13 Sep 2022 10:50:24 -0700 Subject: [PATCH 15/52] Fixed bug where unwrapper fails when use_anchor_points the GNN node kinematics loss is False --- mlreco/models/layers/gnn/losses/node_kinematics.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index 23afe964..2c26e8d7 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ 
b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -166,11 +166,9 @@ def forward(self, out, types): clusts = out['clusts'][i][j] # Increment the type loss, balance classes if requested - if compute_type: + if compute_type and out['node_pred_type'][i][j].shape: # Get the type predictions and true types from the specified columns node_pred_type = out['node_pred_type'][i][j] - if not node_pred_type.shape[0]: - continue node_assn_type = get_cluster_label(labels, clusts, column=self.type_col) # Do not apply loss to nodes labeled -1 (unknown class) @@ -207,11 +205,9 @@ def forward(self, out, types): n_clusts_type += len(valid_mask_type) # Increment the momentum loss - if compute_momentum: + if compute_momentum and out['node_pred_p'][i][j].shape: # Get the momentum predictions and true momenta from the specified columns node_pred_p = out['node_pred_p'][i][j] - if not node_pred_p.shape[0]: - continue node_assn_p = get_momenta_label(labels, clusts, column=self.momentum_col) # Do not apply loss to nodes labeled -1 (unknown class) @@ -236,12 +232,10 @@ def forward(self, out, types): # Increment the number of nodes n_clusts_momentum += len(clusts) - if compute_vtx: + if compute_vtx and out['node_pred_vtx'][i][j].shape: # Get the vertex predictions, node features and true vertices from the specified columns node_pred_vtx = out['node_pred_vtx'][i][j] input_node_features = out['input_node_features'][i][j] - if not node_pred_vtx.shape[0]: - continue node_assn_vtx = np.stack([get_cluster_label(labels, clusts, column=c) for c in range(self.vtx_col, self.vtx_col+3)], axis=1) node_assn_vtx_pos = get_cluster_label(labels, clusts, column=self.vtx_positives_col) @@ -330,13 +324,13 @@ def forward(self, out, types): }) if compute_vtx: result.update({ - 'vtx_anchors': vtx_anchors, 'vtx_labels': vtx_labels, 'vtx_score_loss': vtx_score_loss/n_clusts_vtx if n_clusts_vtx else 0., 'vtx_score_acc': vtx_score_acc/n_clusts_vtx if n_clusts_vtx else 1., 'vtx_position_loss': vtx_position_loss/n_clusts_vtx_pos if n_clusts_vtx_pos else 0., 'vtx_position_acc': vtx_position_acc/n_clusts_vtx_pos if n_clusts_vtx_pos else 1. }) + if self.use_anchor_points: result['vtx_anchors'] = vtx_anchors return result From 9fa6a942fe29ae731eda2e506a9b797fd58aa096 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 13 Sep 2022 13:12:31 -0700 Subject: [PATCH 16/52] Updated default list of results to be concatenated --- mlreco/main_funcs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mlreco/main_funcs.py b/mlreco/main_funcs.py index 0f33ff85..822f65bc 100644 --- a/mlreco/main_funcs.py +++ b/mlreco/main_funcs.py @@ -12,6 +12,7 @@ # happens in the process_config function before anything # else is allowed to happen. 
+ class Handlers: cfg = None data_io = None @@ -45,18 +46,21 @@ def inference(cfg, event_list=None): def process_config(cfg, verbose=True): - # Set GPUS to be used if 'trainval' in cfg: + # Set GPUs to be used os.environ['CUDA_VISIBLE_DEVICES'] = cfg['trainval']['gpus'] cfg['trainval']['gpus'] = list(range(len([int(a) for a in cfg['trainval']['gpus'].split(',') if a.isdigit()]))) + # Update seed if cfg['trainval']['seed'] < 0: import time cfg['trainval']['seed'] = int(time.time()) else: cfg['trainval']['seed'] = int(cfg['trainval']['seed']) + # Set MinkowskiEngine number of threads os.environ['OMP_NUM_THREADS'] = '16' # default value + # Set default concat_result default_concat_result = ['input_edge_features', 'input_node_features','points', 'coordinates', 'particle_node_features', 'particle_edge_features', @@ -71,7 +75,7 @@ def process_config(cfg, verbose=True): 'particle_edge_pred', 'particle_group_pred', 'particles', 'inter_edge_index', 'inter_node_pred', 'inter_edge_pred', 'inter_group_pred', 'inter_particles', 'node_pred_p', 'node_pred_type', - 'vertex_labels', 'anchors', 'grappa_inter_vertex_labels', 'grappa_inter_anchors', + 'vtx_labels', 'vtx_anchors', 'grappa_inter_vtx_labels', 'grappa_inter_vtx_anchors', 'kinematics_node_pred_p', 'kinematics_node_pred_type', 'flow_edge_pred', 'kinematics_particles', 'kinematics_edge_index', 'clust_fragments', 'clust_frag_seg', 'interactions', 'inter_cosmic_pred', From 2e5bd1184fa59c251f72b7d7e14d55b9cf5403a7 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 15 Sep 2022 12:00:32 -0700 Subject: [PATCH 17/52] Added better handling of empty input in UResNet and PPN loss. Skip full chain GNN section if no voxels --- mlreco/models/layers/common/gnn_full_chain.py | 2 +- mlreco/models/layers/common/ppnplus.py | 14 +++++++------- mlreco/models/uresnet.py | 18 +++++++++--------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index e80d5618..6cda0c5e 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -582,7 +582,7 @@ def forward(self, input): """ result, input, revert_func = self.full_chain_cnn(input) - if self.process_fragments and (self.enable_gnn_track or self.enable_gnn_shower or self.enable_gnn_inter or self.enable_gnn_particle): + if len(input[0]) and self.process_fragments and (self.enable_gnn_track or self.enable_gnn_shower or self.enable_gnn_inter or self.enable_gnn_particle): result = self.full_chain_gnn(result, input) result = revert_func(result) diff --git a/mlreco/models/layers/common/ppnplus.py b/mlreco/models/layers/common/ppnplus.py index 90bc06fc..bcd689f9 100644 --- a/mlreco/models/layers/common/ppnplus.py +++ b/mlreco/models/layers/common/ppnplus.py @@ -566,19 +566,19 @@ def forward(self, result, segment_label, particles_label): # Distance Loss d2, _ = torch.min(distance_positives, dim=0) reg_loss = d2.mean() - res['reg_loss'] += float(reg_loss) / num_batches - res['type_loss'] += float(type_loss) / num_batches - res['mask_loss'] += float(mask_loss_final) / num_batches - total_loss += (reg_loss + type_loss + mask_loss_final) / num_batches + res['reg_loss'] += float(reg_loss) / num_batches if num_batches else float(reg_loss) + res['type_loss'] += float(type_loss) / num_batches if num_batches else float(type_loss) + res['mask_loss'] += float(mask_loss_final) / num_batches if num_batches else float(mask_loss_final) + total_loss += (reg_loss + type_loss + 
mask_loss_final) / num_batches if num_batches else reg_loss + type_loss + mask_loss_final if self._classify_endpoints: - total_loss += loss_classify_endpoints / num_batches + total_loss += loss_classify_endpoints / num_batches if num_batches else loss_classify_endpoints - loss_layer /= num_batches + loss_layer /= max(1, num_batches) loss_gpu += loss_layer loss_gpu /= len(ppn_layers) total_loss += loss_gpu - total_acc /= num_batches + total_acc = total_acc / num_batches if num_batches else 1. res['loss'] = total_loss res['accuracy'] = float(total_acc) return res diff --git a/mlreco/models/uresnet.py b/mlreco/models/uresnet.py index 337ecc43..036762ec 100644 --- a/mlreco/models/uresnet.py +++ b/mlreco/models/uresnet.py @@ -282,23 +282,23 @@ def forward(self, result, label, weights=None): if self._ghost: results = { - 'accuracy': uresnet_acc/count if count else 0., - 'loss': (self._alpha * uresnet_loss + self._beta * mask_loss)/count if count else (self._alpha * uresnet_loss + self._beta * mask_loss), - 'ghost_mask_acc': mask_acc / count if count else 0., - 'ghost_mask_loss': self._beta * mask_loss / count if count else self._beta * mask_los, + 'accuracy': uresnet_acc/count if count else 1., + 'loss': (self._alpha * uresnet_loss + self._beta * mask_loss)/count if count else self._alpha * uresnet_loss + self._beta * mask_loss, + 'ghost_mask_accuracy': mask_acc / count if count else 1., + 'ghost_mask_loss': self._beta * mask_loss / count if count else self._beta * mask_loss, + 'uresnet_accuracy': uresnet_acc / count if count else 1., 'uresnet_loss': self._alpha * uresnet_loss / count if count else self._alpha * uresnet_loss, - 'uresnet_acc': uresnet_acc / count if count else 0., - 'ghost2ghost': ghost2ghost / count if count else 0., - 'nonghost2nonghost': nonghost2nonghost / count if count else 0. + 'ghost2ghost': ghost2ghost / count if count else 1., + 'nonghost2nonghost': nonghost2nonghost / count if count else 1. } else: results = { - 'accuracy': uresnet_acc/count if count else 0., + 'accuracy': uresnet_acc/count if count else 1., 'loss': uresnet_loss/count if count else uresnet_loss } for c in range(self._num_classes): if count_class[c] > 0: results['accuracy_class_%d' % c] = uresnet_acc_class[c]/count_class[c] else: - results['accuracy_class_%d' % c] = -1. + results['accuracy_class_%d' % c] = 1. 
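# [Illustrative note added for context; not part of the original patch.]
# Convention adopted across these patches for empty inputs: accuracy-like metrics
# fall back to a perfect score of 1 (there is nothing left to get wrong), while
# loss terms fall back to a differentiable zero-valued tensor rather than a plain
# float, either the un-normalized loss itself or torch.tensor(0., requires_grad=True):
#     'accuracy': uresnet_acc / count if count else 1.,
#     'loss':     uresnet_loss / count if count else uresnet_loss,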
return results From 98a4186575d5117274aea2d0d80d1319aabbbca3 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 15 Sep 2022 14:59:28 -0700 Subject: [PATCH 18/52] Fixed PPN postprocessing for batch_size>1 (was broken by not unwrapping points output) --- mlreco/models/full_chain.py | 34 +++++-------------- mlreco/models/layers/common/gnn_full_chain.py | 12 +++++-- mlreco/utils/ppn.py | 24 ++++++------- 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/mlreco/models/full_chain.py b/mlreco/models/full_chain.py index e1334d32..c01eacbc 100644 --- a/mlreco/models/full_chain.py +++ b/mlreco/models/full_chain.py @@ -210,14 +210,13 @@ def full_chain_cnn(self, input): # Rescale the charge column, store it charges = compute_rescaled_charge(input[0], deghost, last_index=last_index) - input[0][deghost, 4] = charges - result.update({'input_rescaled':[input[0][deghost,:5]]}) + full_n = len(input[0]) + input[0] = input[0][deghost] + input[0][:, 4] = charges + result.update({'input_rescaled':[input[0][:,:5]]}) + if self.enable_uresnet: - if self.enable_charge_rescaling and deghost.sum() > 0: - assert not self.uresnet_lonely.ghost - result.update(self.uresnet_lonely([input[0][deghost, :4+self.input_features]])) - else: - result.update(self.uresnet_lonely([input[0][:, :4+self.input_features]])) + result.update(self.uresnet_lonely([input[0][:, :4+self.input_features]])) if self.enable_ppn: ppn_input = {} @@ -232,27 +231,13 @@ def full_chain_cnn(self, input): ppn_input['decoderTensors'][0]) result.update(ppn_output) - if self.enable_charge_rescaling and deghost.sum() > 0: - # Reshape output tensors of UResNet and PPN to be of the original shape - for key in ['segmentation', 'points', 'classify_endpoints', 'mask_ppn', 'ppn_coords', 'ppn_layers']: - if key not in result: continue - res = result[key][0] if isinstance(result[key][0], torch.Tensor) else result[key][0][-1] - tensor = torch.zeros((input[0].shape[0], res.shape[1]), dtype=res.dtype, device=res.device) - tensor[deghost] = res - if isinstance(result[key][0], torch.Tensor): - result[key][0] = tensor - else: - result[key][0][-1] = tensor - if 'ppn_output_coordinates' in result: - result['ppn_output_coordinates'][0] = input[0][:,:4].type(result['ppn_output_coordinates'][0].dtype) - # The rest of the chain only needs 1 input feature if self.input_features > 1: input[0] = input[0][:, :-self.input_features+1] cnn_result = {} - if self.enable_ghost: + if self.enable_ghost and not self.enable_charge_rescaling: # Update input based on deghosting results # if self.cheat_ghost: @@ -315,8 +300,7 @@ def full_chain_cnn(self, input): if self._gspice_use_true_labels: semantic_labels = label_seg[0][:, -1] else: - semantic_labels = torch.argmax(cnn_result['segmentation'][0], - dim=1).flatten() + semantic_labels = torch.argmax(cnn_result['segmentation'][0], dim=1).flatten() if self.enable_cnn_clust: if label_clustering is None and self.training: @@ -390,7 +374,7 @@ def full_chain_cnn(self, input): # cnn_result['true_points'] = coords def return_to_original(result): - if self.enable_ghost: + if self.enable_ghost and not self.enable_charge_rescaling: result['segmentation'][0] = segmentation return result diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 6cda0c5e..7d3f3d74 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -650,12 +650,17 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics 
res['deghost_' + key] = res_deghost[key] accuracy += res_deghost['accuracy'] loss += self.deghost_weight*res_deghost['loss'] - deghost = (seg_label[0][:,-1] < 5) & (out['ghost'][0][:,0] > out['ghost'][0][:,1]) # Only non-ghost (both true and pred) can go in semseg eval + deghost = out['ghost'][0][:,0] > out['ghost'][0][:,1] if self.enable_uresnet: if not self.enable_charge_rescaling: res_seg = self.uresnet_loss(out, seg_label) else: + seg = out['segmentation'][0] + full_seg = torch.zeros((seg_label[0].shape[0], seg.shape[1]), dtype=seg.dtype, device=seg.device) + full_seg[deghost] = seg + out['segmentation'][0] = full_seg + deghost &= seg_label[0][:,-1] < 5 # Only apply loss to true non-ghosts res_seg = self.uresnet_loss({'segmentation':[out['segmentation'][0][deghost]]}, [seg_label[0][deghost]]) for key in res_seg: res['segmentation_' + key] = res_seg[key] @@ -672,7 +677,8 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics accuracy += res_ppn['accuracy'] loss += self.ppn_weight*res_ppn['loss'] - if self.enable_ghost and (self.enable_cnn_clust or \ + if self.enable_ghost and not self.enable_charge_rescaling and \ + (self.enable_cnn_clust or \ self.enable_gnn_track or \ self.enable_gnn_shower or \ self.enable_gnn_inter or \ @@ -726,7 +732,7 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics segmentation_pred = out['segmentation'][0] - if self.enable_ghost: + if self.enable_ghost and not self.enable_charge_rescaling: segmentation_pred = segmentation_pred[deghost] if self._gspice_use_true_labels: gs_seg_label = torch.cat([cluster_label[0][:, :4], segment_label[:, None]], dim=1) diff --git a/mlreco/utils/ppn.py b/mlreco/utils/ppn.py index b88021c6..c600de63 100644 --- a/mlreco/utils/ppn.py +++ b/mlreco/utils/ppn.py @@ -302,15 +302,15 @@ def uresnet_ppn_type_point_selector(data, out, score_threshold=0.5, type_score_t # If 'points' is specified in `concat_result`, # then it won't be unwrapped. if len(points) == len(ppn_coords[-1]): - #pass + pass # print(entry, np.unique(ppn_coords[-1][:, 0], return_counts=True)) - points = points[ppn_coords[-1][:, 0] == entry, :] + #points = points[ppn_coords[-1][:, 0] == entry, :] else: # in case it has been unwrapped (possible in no-ghost scenario) points = out['points'][entry] enable_classify_endpoints = 'classify_endpoints' in out if enable_classify_endpoints: - classify_endpoints = out['classify_endpoints'][0][ppn_coords[-1][:, 0] == entry, :]#[entry] + classify_endpoints = out['classify_endpoints'][0] mask_ppn = out['mask_ppn'][-1] # predicted type labels @@ -349,7 +349,7 @@ def uresnet_ppn_type_point_selector(data, out, score_threshold=0.5, type_score_t batch_index = batch_ids == b batch_index2 = ppn_coords[-1][:, 0] == b # print(batch_index.shape, batch_index2.shape, mask_ppn.shape, scores.shape) - mask = ((~(mask_ppn[batch_index2] == 0)).any(axis=1)) & (scores[batch_index][:, 1] > score_threshold) + mask = ((~(mask_ppn[batch_index2] == 0)).any(axis=1)) & (scores[batch_index2][:, 1] > score_threshold) # If we want to restrict the postprocessing to specific voxels # (e.g. within a particle cluster, not the full event) # then use the argument `selection`. 
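# [Illustrative note added for context; not part of the original patch.]
# Two distinct batch masks coexist in uresnet_ppn_type_point_selector:
#     batch_index  = batch_ids == b             (rows of the input event tensor)
#     batch_index2 = ppn_coords[-1][:, 0] == b  (rows of the PPN output, which is
#                                                no longer unwrapped per entry)
# The hunks above and below therefore index points, scores and classify_endpoints
# with batch_index2, and keep batch_index only for event_data and the UResNet
# predictions; mixing the two masks is what broke batch_size > 1.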
@@ -362,26 +362,26 @@ def uresnet_ppn_type_point_selector(data, out, score_threshold=0.5, type_score_t new_mask[indices] = mask[indices] mask = new_mask - ppn_type_predictions = np.argmax(scipy.special.softmax(points[batch_index][mask][:, type_col[0]:type_col[1]], axis=1), axis=1) - ppn_type_softmax = scipy.special.softmax(points[batch_index][mask][:, type_col[0]:type_col[1]], axis=1) + ppn_type_predictions = np.argmax(scipy.special.softmax(points[batch_index2][mask][:, type_col[0]:type_col[1]], axis=1), axis=1) + ppn_type_softmax = scipy.special.softmax(points[batch_index2][mask][:, type_col[0]:type_col[1]], axis=1) if enable_classify_endpoints: - ppn_classify_endpoints = scipy.special.softmax(classify_endpoints[batch_index][mask], axis=1) + ppn_classify_endpoints = scipy.special.softmax(classify_endpoints[batch_index2][mask], axis=1) if enforce_type: for c in range(num_classes): uresnet_points = uresnet_predictions[batch_index][mask] == c ppn_points = ppn_type_softmax[:, c] > type_score_threshold #ppn_type_predictions == c if np.count_nonzero(ppn_points) > 0 and np.count_nonzero(uresnet_points) > 0: - d = scipy.spatial.distance.cdist(points[batch_index][mask][ppn_points][:, :3] + event_data[batch_index][mask][ppn_points][:, coords_col[0]:coords_col[1]] + 0.5, event_data[batch_index][mask][uresnet_points][:, coords_col[0]:coords_col[1]]) + d = scipy.spatial.distance.cdist(points[batch_index2][mask][ppn_points][:, :3] + event_data[batch_index][mask][ppn_points][:, coords_col[0]:coords_col[1]] + 0.5, event_data[batch_index][mask][uresnet_points][:, coords_col[0]:coords_col[1]]) ppn_mask = (d < type_threshold).any(axis=1) - final_points.append(points[batch_index][mask][ppn_points][ppn_mask][:, :3] + 0.5 + event_data[batch_index][mask][ppn_points][ppn_mask][:, coords_col[0]:coords_col[1]]) - final_scores.append(scores[batch_index][mask][ppn_points][ppn_mask]) + final_points.append(points[batch_index2][mask][ppn_points][ppn_mask][:, :3] + 0.5 + event_data[batch_index][mask][ppn_points][ppn_mask][:, coords_col[0]:coords_col[1]]) + final_scores.append(scores[batch_index2][mask][ppn_points][ppn_mask]) final_types.append(ppn_type_predictions[ppn_points][ppn_mask]) final_softmax.append(ppn_type_softmax[ppn_points][ppn_mask]) if enable_classify_endpoints: final_endpoints.append(ppn_classify_endpoints[ppn_points][ppn_mask]) else: - final_points = [points[batch_index][mask][:, :3] + 0.5 + event_data[batch_index][mask][:, coords_col[0]:coords_col[1]]] - final_scores = [scores[batch_index][mask]] + final_points = [points[batch_index2][mask][:, :3] + 0.5 + event_data[batch_index][mask][:, coords_col[0]:coords_col[1]]] + final_scores = [scores[batch_index2][mask]] final_types = [ppn_type_predictions] final_softmax = [ppn_type_softmax] if enable_classify_endpoints: From 7626cf8bdb3e65e51912cced831b35d2a9f056aa Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Fri, 16 Sep 2022 11:35:44 -0700 Subject: [PATCH 19/52] Fix bug with deghosted tensor indexing and cluster labeling --- mlreco/models/full_chain.py | 21 ++++++++++++------- mlreco/models/layers/common/gnn_full_chain.py | 10 ++------- .../layers/gnn/losses/node_kinematics.py | 12 +++++------ 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/mlreco/models/full_chain.py b/mlreco/models/full_chain.py index c01eacbc..203373b4 100644 --- a/mlreco/models/full_chain.py +++ b/mlreco/models/full_chain.py @@ -210,13 +210,18 @@ def full_chain_cnn(self, input): # Rescale the charge column, store it charges = compute_rescaled_charge(input[0], 
deghost, last_index=last_index) - full_n = len(input[0]) - input[0] = input[0][deghost] - input[0][:, 4] = charges - result.update({'input_rescaled':[input[0][:,:5]]}) + input[0][deghost, 4] = charges + result.update({'input_rescaled':[input[0][deghost,:5]]}) if self.enable_uresnet: - result.update(self.uresnet_lonely([input[0][:, :4+self.input_features]])) + if not self.enable_charge_rescaling: + result.update(self.uresnet_lonely([input[0][:, :4+self.input_features]])) + else: + result.update(self.uresnet_lonely([input[0][deghost, :4+self.input_features]])) + seg = result['segmentation'][0] + full_seg = torch.zeros((input[0].shape[0], seg.shape[1]), dtype=seg.dtype, device=seg.device) + full_seg[deghost] = seg + result['segmentation'][0] = full_seg if self.enable_ppn: ppn_input = {} @@ -237,7 +242,7 @@ def full_chain_cnn(self, input): cnn_result = {} - if self.enable_ghost and not self.enable_charge_rescaling: + if self.enable_ghost: # Update input based on deghosting results # if self.cheat_ghost: @@ -268,7 +273,7 @@ def full_chain_cnn(self, input): deghost_result.update(result) deghost_result.pop('ghost') deghost_result['segmentation'][0] = result['segmentation'][0][deghost] - if self.enable_ppn: + if self.enable_ppn and not self.enable_charge_rescaling: deghost_result['points'] = [result['points'][0][deghost]] if 'classify_endpoints' in deghost_result: deghost_result['classify_endpoints'] = [result['classify_endpoints'][0][deghost]] @@ -374,7 +379,7 @@ def full_chain_cnn(self, input): # cnn_result['true_points'] = coords def return_to_original(result): - if self.enable_ghost and not self.enable_charge_rescaling: + if self.enable_ghost: result['segmentation'][0] = segmentation return result diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 7d3f3d74..26d4a08d 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -650,17 +650,12 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics res['deghost_' + key] = res_deghost[key] accuracy += res_deghost['accuracy'] loss += self.deghost_weight*res_deghost['loss'] - deghost = out['ghost'][0][:,0] > out['ghost'][0][:,1] + deghost = (out['ghost'][0][:,0] > out['ghost'][0][:,1]) & (seg_label[0][:,-1] < 5) # Only apply loss to reco/true non-ghosts if self.enable_uresnet: if not self.enable_charge_rescaling: res_seg = self.uresnet_loss(out, seg_label) else: - seg = out['segmentation'][0] - full_seg = torch.zeros((seg_label[0].shape[0], seg.shape[1]), dtype=seg.dtype, device=seg.device) - full_seg[deghost] = seg - out['segmentation'][0] = full_seg - deghost &= seg_label[0][:,-1] < 5 # Only apply loss to true non-ghosts res_seg = self.uresnet_loss({'segmentation':[out['segmentation'][0][deghost]]}, [seg_label[0][deghost]]) for key in res_seg: res['segmentation_' + key] = res_seg[key] @@ -677,8 +672,7 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics accuracy += res_ppn['accuracy'] loss += self.ppn_weight*res_ppn['loss'] - if self.enable_ghost and not self.enable_charge_rescaling and \ - (self.enable_cnn_clust or \ + if self.enable_ghost and (self.enable_cnn_clust or \ self.enable_gnn_track or \ self.enable_gnn_shower or \ self.enable_gnn_inter or \ diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index 2c26e8d7..a8f815f2 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ 
b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -166,7 +166,7 @@ def forward(self, out, types): clusts = out['clusts'][i][j] # Increment the type loss, balance classes if requested - if compute_type and out['node_pred_type'][i][j].shape: + if compute_type and out['node_pred_type'][i][j].shape[0]: # Get the type predictions and true types from the specified columns node_pred_type = out['node_pred_type'][i][j] node_assn_type = get_cluster_label(labels, clusts, column=self.type_col) @@ -205,7 +205,7 @@ def forward(self, out, types): n_clusts_type += len(valid_mask_type) # Increment the momentum loss - if compute_momentum and out['node_pred_p'][i][j].shape: + if compute_momentum and out['node_pred_p'][i][j].shape[0]: # Get the momentum predictions and true momenta from the specified columns node_pred_p = out['node_pred_p'][i][j] node_assn_p = get_momenta_label(labels, clusts, column=self.momentum_col) @@ -232,7 +232,7 @@ def forward(self, out, types): # Increment the number of nodes n_clusts_momentum += len(clusts) - if compute_vtx and out['node_pred_vtx'][i][j].shape: + if compute_vtx and out['node_pred_vtx'][i][j].shape[0]: # Get the vertex predictions, node features and true vertices from the specified columns node_pred_vtx = out['node_pred_vtx'][i][j] input_node_features = out['input_node_features'][i][j] @@ -291,13 +291,13 @@ def forward(self, out, types): # Compute the accuracy of assignment (fraction of correctly assigned nodes) # and the accuracy of momentum estimation (RMS relative residual) - if compute_type and len(valid_mask_type): + if compute_type and out['node_pred_type'][i][j].shape[0] and len(valid_mask_type): type_acc += float(torch.sum(torch.argmax(node_pred_type, dim=1) == node_assn_type)) - if compute_momentum and len(valid_mask_p): + if compute_momentum and out['node_pred_p'][i][j].shape[0] and len(valid_mask_p): p_acc += float(torch.sum(1.- torch.abs(node_pred_p.squeeze()-node_assn_p)/node_assn_p)) # 1-MAPE - if compute_vtx and len(pos_mask_vtx): + if compute_vtx and out['node_pred_vtx'][i][j].shape[0] and len(pos_mask_vtx): vtx_position_acc += float(torch.sum(1. - torch.abs(vtx_pred - vtx_label)/(torch.abs(vtx_pred) + torch.abs(vtx_label))))/3. vtx_score_acc += float(torch.sum(torch.argmax(node_pred_vtx[:,3:], dim=1) == node_assn_vtx_pos)) From 836d1e6f523f4983730da61c26aa429fba979003 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Mon, 19 Sep 2022 15:30:51 -0700 Subject: [PATCH 20/52] _asis parsers now leave coordinates unchanged. 
New parse_particles and parse_neutrinos introduced which rescale the coordinates to voxel coordinates by default, and behave like _asis parsers if voxel_coordinates is set to False --- analysis/classes/ui.py | 2 +- config/chain/metrics.cfg | 4 +- docs/source/HowTo.rst | 14 ++-- mlreco/iotools/parsers/__init__.py | 10 +-- mlreco/iotools/parsers/cluster.py | 10 +-- mlreco/iotools/parsers/particles.py | 108 +++++++++++++++++----------- test/test_parser.py | 2 +- 7 files changed, 89 insertions(+), 61 deletions(-) diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index f2277c89..0536ff9a 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -752,7 +752,7 @@ class FullChainEvaluator(FullChainPredictor): - particle_corrected - cluster3d_pcluster particles_asis: - - parse_particle_asis + - parse_particles - particle_pcluster - cluster3d_pcluster diff --git a/config/chain/metrics.cfg b/config/chain/metrics.cfg index 3e53ff5e..6c109f58 100644 --- a/config/chain/metrics.cfg +++ b/config/chain/metrics.cfg @@ -46,8 +46,8 @@ iotool: - parse_particle_graph_corrected - particle_corrected - cluster3d_pcluster - particles_asis: - - parse_particle_asis + particles: + - parse_particles - particle_pcluster - cluster3d_pcluster meta: diff --git a/docs/source/HowTo.rst b/docs/source/HowTo.rst index 9df97d93..ce9c5ff1 100644 --- a/docs/source/HowTo.rst +++ b/docs/source/HowTo.rst @@ -48,24 +48,24 @@ data blob: How to get true particle information ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -You need to use the parser ``particles_asis``. For example: +You need to use the parser ``parse_particles``. For example: .. code-block:: yaml iotool: dataset: schema: - particles_asis: - - parse_particle_asis + particles: + - parse_particles - particle_pcluster - cluster3d_pcluster -Then you will be able to access ``data['particles_asis'][entry]`` +Then you will be able to access ``data['particles'][entry]`` which is a list of objects of type ``larcv::Particle``. .. code-block:: python - for p in data['particles_asis'][entry]: + for p in data['particles'][entry]: mom = np.array([p.px(), p.py(), p.pz()]) print(p.id(), p.num_voxels(), mom/np.linalg.norm(mom)) @@ -107,7 +107,7 @@ How to get true neutrino information Assuming you are either using a Singularity container that has the right larcv2 compiled or you followed the note above explaining how to get it -by yourself, you can use the ``parse_neutrino_asis`` parser of ``lartpc_mlreco3d``. +by yourself, you can use the ``parse_neutrinos`` parser of ``lartpc_mlreco3d``. .. code-block:: yaml @@ -116,7 +116,7 @@ by yourself, you can use the ``parse_neutrino_asis`` parser of ``lartpc_mlreco3d dataset: schema: neutrinos: - - parse_neutrino_asis + - parse_neutrinos - neutrino_mpv - cluster3d_pcluster diff --git a/mlreco/iotools/parsers/__init__.py b/mlreco/iotools/parsers/__init__.py index a3ea33a1..a32f7fb8 100644 --- a/mlreco/iotools/parsers/__init__.py +++ b/mlreco/iotools/parsers/__init__.py @@ -24,8 +24,8 @@ .. csv-table:: Particle parsers :header: Parser name, Description - ``parse_particle_asis``, Retrieve array of larcv::Particle - ``parse_neutrino_asis``, Retrieve array of larcv::Neutrino + ``parse_particles``, Retrieve array of larcv::Particle + ``parse_neutrinos``, Retrieve array of larcv::Neutrino ``parse_particle_points``, Retrieve array of larcv::Particle ground truth points tensor ``parse_particle_coords``, Retrieve array of larcv::Particle coordinates (start and end) and start time ``parse_particle_graph``, Construct edges between particles (i.e. 
clusters) from larcv::EventParticle @@ -87,13 +87,15 @@ ) from mlreco.iotools.parsers.particles import ( - parse_particle_asis, - parse_neutrino_asis, + parse_particles, + parse_neutrinos, parse_particle_points, parse_particle_coords, parse_particle_graph, parse_particle_singlep_pdg, parse_particle_singlep_einit, + parse_particle_asis, # Deprecated + parse_neutrino_asis, # Deprecated parse_particle_points_with_tagging, # Deprecated parse_particle_graph_corrected # Deprecated ) diff --git a/mlreco/iotools/parsers/cluster.py b/mlreco/iotools/parsers/cluster.py index 5f852e82..2e954ba6 100644 --- a/mlreco/iotools/parsers/cluster.py +++ b/mlreco/iotools/parsers/cluster.py @@ -4,7 +4,7 @@ from mlreco.utils.groups import get_interaction_id, get_nu_id, get_particle_id, get_primary_id from mlreco.utils.groups import type_labels as TYPE_LABELS from mlreco.iotools.parsers.sparse import parse_sparse3d -from mlreco.iotools.parsers.particles import parse_particle_asis +from mlreco.iotools.parsers.particles import parse_particles from mlreco.iotools.parsers.clean_data import clean_sparse_data @@ -140,12 +140,12 @@ def parse_cluster3d(cluster_event, labels['type'] = get_particle_id(particles_v, nu_ids, include_mpr=type_include_mpr) labels['primary'] = get_primary_id(cluster_event, particles_v) if add_kinematics_info: - particles_asis_v = parse_particle_asis(particle_event, cluster_event) + particles_v = parse_particles(particle_event, cluster_event) labels['type'] = get_particle_id(particles_v, nu_ids, include_mpr=type_include_mpr) labels['p'] = np.array([(p.px()**2+p.py()**2+p.pz()**2)/1e3 for p in particles_v]) - labels['vtx_x'] = np.array([p.ancestor_position().x() for p in particles_asis_v]) - labels['vtx_y'] = np.array([p.ancestor_position().y() for p in particles_asis_v]) - labels['vtx_z'] = np.array([p.ancestor_position().z() for p in particles_asis_v]) + labels['vtx_x'] = np.array([p.ancestor_position().x() for p in particles_v]) + labels['vtx_y'] = np.array([p.ancestor_position().y() for p in particles_v]) + labels['vtx_z'] = np.array([p.ancestor_position().z() for p in particles_v]) labels['primary_group'] = np.array([p.group_id()==p.parent_id() for p in particles_v], dtype=np.float32) labels['sem'] = np.array([p.shape() for p in particles_v]) diff --git a/mlreco/iotools/parsers/particles.py b/mlreco/iotools/parsers/particles.py index b29a7203..48661ef9 100644 --- a/mlreco/iotools/parsers/particles.py +++ b/mlreco/iotools/parsers/particles.py @@ -4,89 +4,103 @@ from mlreco.utils.groups import type_labels as TYPE_LABELS -def parse_particle_asis(particle_event, cluster_event): +def parse_particles(particle_event, cluster_event, voxel_coordinates=True): """ - A function to copy construct & return an array of larcv::Particle + A function to copy construct & return an array of larcv::Particle. + + If `voxel_coordinates` is set to `True`, the parser rescales the truth + positions (start, end, etc.) to voxel coordinates. .. 
code-block:: yaml schema: - particle_asis: - parser: parse_particle_asis + particles: + parser: parse_particles args: particle_event: particle_pcluster cluster_event: cluster3d_pcluster + voxel_coordinates: True Configuration ------------- particle_event: larcv::EventParticle cluster_event: larcv::EventClusterVoxel3D to translate coordinates + voxel_coordinates: bool Returns ------- list - a python list of larcv::Particle object + a python list of larcv::Particle objects """ particles = [larcv.Particle(p) for p in particle_event.as_vector()] - meta = cluster_event.meta() - funcs = ['first_step', 'last_step', 'position', 'end_position', 'ancestor_position'] - for p in particles: - for f in funcs: - pos = getattr(p,f)() - x = (pos.x() - meta.min_x()) / meta.size_voxel_x() - y = (pos.y() - meta.min_y()) / meta.size_voxel_y() - z = (pos.z() - meta.min_z()) / meta.size_voxel_z() - # x = (pos.x() - meta.origin().x) / meta.size_voxel_x() - # y = (pos.y() - meta.origin().y) / meta.size_voxel_y() - # z = (pos.z() - meta.origin().z) / meta.size_voxel_z() - # x = pos.x() * meta.size_voxel_x() + meta.origin().x - # y = pos.y() * meta.size_voxel_y() + meta.origin().y - # z = pos.z() * meta.size_voxel_z() + meta.origin().z - getattr(p,f)(x,y,z,pos.t()) + if voxel_coordinates: + meta = cluster_event.meta() + funcs = ['first_step', 'last_step', 'position', 'end_position', 'ancestor_position'] + for p in particles: + for f in funcs: + pos = getattr(p,f)() + x = (pos.x() - meta.min_x()) / meta.size_voxel_x() + y = (pos.y() - meta.min_y()) / meta.size_voxel_y() + z = (pos.z() - meta.min_z()) / meta.size_voxel_z() + # x = (pos.x() - meta.origin().x) / meta.size_voxel_x() + # y = (pos.y() - meta.origin().y) / meta.size_voxel_y() + # z = (pos.z() - meta.origin().z) / meta.size_voxel_z() + # x = pos.x() * meta.size_voxel_x() + meta.origin().x + # y = pos.y() * meta.size_voxel_y() + meta.origin().y + # z = pos.z() * meta.size_voxel_z() + meta.origin().z + getattr(p,f)(x,y,z,pos.t()) + return particles -def parse_neutrino_asis(neutrino_event, cluster_event): +def parse_neutrinos(neutrino_event, cluster_event, voxel_coordinates=True): """ - A function to copy construct & return an array of larcv::Neutrino + A function to copy construct & return an array of larcv::Neutrino. + + If `voxel_coordinates` is set to `True`, the parser rescales the truth + position information to voxel coordinates. .. 
code-block:: yaml schema: - neutrino_asis: - parser: parse_neutrino_asis + neutrinos: + parser: parse_neutrinos args: neutrino_event: neutrino_mpv cluster_event: cluster3d_pcluster + voxel_coordinates: True Configuration ------------- neutrino_pcluster: larcv::EventNeutrino cluster3d_pcluster: larcv::EventClusterVoxel3D to translate coordinates + voxel_coordinates: bool Returns ------- list - a python list of larcv::Neutrino object + a python list of larcv::Neutrino objects """ neutrinos = [larcv.Neutrino(p) for p in neutrino_event.as_vector()] - meta = cluster_event.meta() - funcs = ['position'] - for p in neutrinos: - for f in funcs: - pos = getattr(p,f)() - x = (pos.x() - meta.min_x()) / meta.size_voxel_x() - y = (pos.y() - meta.min_y()) / meta.size_voxel_y() - z = (pos.z() - meta.min_z()) / meta.size_voxel_z() - # x = (pos.x() - meta.origin().x) / meta.size_voxel_x() - # y = (pos.y() - meta.origin().y) / meta.size_voxel_y() - # z = (pos.z() - meta.origin().z) / meta.size_voxel_z() - # x = pos.x() * meta.size_voxel_x() + meta.origin().x - # y = pos.y() * meta.size_voxel_y() + meta.origin().y - # z = pos.z() * meta.size_voxel_z() + meta.origin().z - getattr(p,f)(x,y,z,pos.t()) + if voxel_coordinates: + meta = cluster_event.meta() + funcs = ['position'] + for p in neutrinos: + for f in funcs: + pos = getattr(p,f)() + x = (pos.x() - meta.min_x()) / meta.size_voxel_x() + y = (pos.y() - meta.min_y()) / meta.size_voxel_y() + z = (pos.z() - meta.min_z()) / meta.size_voxel_z() + # x = (pos.x() - meta.origin().x) / meta.size_voxel_x() + # y = (pos.y() - meta.origin().y) / meta.size_voxel_y() + # z = (pos.z() - meta.origin().z) / meta.size_voxel_z() + # x = pos.x() * meta.size_voxel_x() + meta.origin().x + # y = pos.y() * meta.size_voxel_y() + meta.origin().y + # z = pos.z() * meta.size_voxel_z() + meta.origin().z + getattr(p,f)(x,y,z,pos.t()) + return neutrinos @@ -166,7 +180,7 @@ def parse_particle_coords(particle_event, cluster_event): last_step_x, last_step_y, last_step_z, first_step_t, shape_id] ''' # Scale particle coordinates to image size - particles = parse_particle_asis(particle_event, cluster_event) + particles = parse_particles(particle_event, cluster_event) # Make features particle_feats = [] @@ -322,6 +336,18 @@ def parse_particle_singlep_einit(particle_event): return -1 +def parse_particle_asis(particle_event, cluster_event): + from warnings import warn + warn("Deprecated: parse_particle_asis is deprecated, use parse_particles with voxel_coordinates set to False", DeprecationWarning) + return parse_particles(particle_event, cluster_event, voxel_coordinates=False) + + +def parse_neutrino_asis(neutrino_event, cluster_event): + from warnings import warn + warn("Deprecated: parse_neutrino_asis is deprecated, use parse_neutrinos with voxel_coordinates set to False", DeprecationWarning) + return parse_neutrinos(neutrino_event, cluster_event, voxel_coordinates=False) + + def parse_particle_points_with_tagging(sparse_event, particle_event): from warnings import warn warn("Deprecated: parse_particle_points_with_tagging deprecated, use parse_particle_points instead", DeprecationWarning) diff --git a/test/test_parser.py b/test/test_parser.py index a19a1929..c24bb75e 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -261,7 +261,7 @@ def test_parse_semantics(): def test_parse_weights(): pass -def test_parse_particle_asis(): +def test_parse_particles(): pass @pytest.mark.parametrize("event_cluster3d", [3], indirect=True) From 5928268d6653013cd58e7f28cab64b94476fd642 Mon Sep 17 
00:00:00 2001 From: Francois Drielsma Date: Fri, 23 Sep 2022 09:57:59 -0700 Subject: [PATCH 21/52] Added a vertex prediction heuristic if an interaction has a single primary --- mlreco/utils/vertex.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/mlreco/utils/vertex.py b/mlreco/utils/vertex.py index f1b2424f..42617041 100644 --- a/mlreco/utils/vertex.py +++ b/mlreco/utils/vertex.py @@ -308,9 +308,29 @@ def predict_vertex(inter_idx, data_idx, input_data, res, all_voxels = input_data[data_idx] if 'ghost' in res and apply_deghosting: - mask_ghost = np.argmax(res['ghost'][data_idx], axis=1) == 0 - all_voxels = input_data[data_idx][mask_ghost] - + if 'input_rescaled' not in res: + mask_ghost = np.argmax(res['ghost'][data_idx], axis=1) == 0 + all_voxels = input_data[data_idx][mask_ghost] + else: + all_voxels = res['input_rescaled'][data_idx] + + # Handle the case where only a single primary is available + if len(ppn_candidates) == 1: + particle_seg = res['particles_seg'][data_idx][inter_mask][c_indices[0]] + end_points = res['particle_node_features'][data_idx][inter_mask][primary_particles][c_indices[0], -9:-3].reshape(-1,3) + if particle_seg != 1: + # If there's a single shower object, pick the shower start point + return end_points[0] + else: + # If there's a single track, pick the end point with the lowest local charge density + voxels = all_voxels[c_candidates[0], coords_col[0]:coords_col[1]] + dist_mat = scipy.spatial.distance.cdist(end_points, voxels) + mask = dist_mat < 5 + charges = all_voxels[c_candidates[0],4] + locald = [np.sum(charges[mask[0]]), np.sum(charges[mask[1]])] + return end_points[np.argmin(locald)] + + # Handle all other cases ppn_candidates2 = [] directions = [] distances_others, distances_primaries = [], [] From 5e33fa9c5c415e57efc7f7271c6b934abcd1575f Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 28 Sep 2022 16:44:32 -0700 Subject: [PATCH 22/52] Fixed inter_cluster_distance function, added distance method option --- mlreco/utils/gnn/network.py | 43 +++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/mlreco/utils/gnn/network.py b/mlreco/utils/gnn/network.py index 51b5c934..c202a8b8 100644 --- a/mlreco/utils/gnn/network.py +++ b/mlreco/utils/gnn/network.py @@ -6,7 +6,7 @@ from scipy.spatial import Delaunay from scipy.sparse.csgraph import minimum_spanning_tree -from mlreco.utils.numba import numba_wrapper, submatrix_nb, cdist_nb +from mlreco.utils.numba import numba_wrapper, submatrix_nb, cdist_nb, mean_nb @nb.njit(cache=True) @@ -421,7 +421,7 @@ def _get_edge_distances(voxels: nb.float32[:,:], @numba_wrapper(cast_args=['voxels'], list_args=['clusts']) -def inter_cluster_distance(voxels, clsuts, batch_ids): +def inter_cluster_distance(voxels, clusts, batch_ids=None, mode='voxel'): """ Finds the inter-cluster distance between every pair of clusters within each batch, returned as a block-diagonal matrix. 
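For intuition, here is a plain NumPy/SciPy sketch of the two distance conventions offered by this function (`voxel` closest-approach vs `centroid`); it is illustrative only, not the numba-compiled implementation added in this patch:

```python
import numpy as np
from scipy.spatial.distance import cdist

def toy_inter_cluster_distance(voxels, clusts, mode='voxel'):
    # Illustration only: single batch, plain O(C^2) Python loop
    n = len(clusts)
    dist = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            if mode == 'voxel':
                # Distance of closest approach between the two voxel sets
                d = cdist(voxels[clusts[i]], voxels[clusts[j]]).min()
            else:
                # Euclidean distance between the two cluster centroids
                d = np.linalg.norm(voxels[clusts[i]].mean(0) - voxels[clusts[j]].mean(0))
            dist[i, j] = dist[j, i] = d
    return dist

voxels = np.random.rand(12, 3)
clusts = [np.arange(0, 5), np.arange(5, 8), np.arange(8, 12)]
print(toy_inter_cluster_distance(voxels, clusts, mode='centroid'))
```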
@@ -430,24 +430,45 @@ def inter_cluster_distance(voxels, clsuts, batch_ids): voxels (torch.tensor) : (N,3) Tensor of voxel coordinates clusts ([np.ndarray]) : (C) List of arrays of voxel IDs in each cluster batch_ids (np.ndarray): (C) List of cluster batch IDs + mode (str) : Eiher use closest voxel distance (`voxel`) or centroid distance (`centroid`) Returns: torch.tensor: (C,C) Tensor of pair-wise cluster distances """ + # If there is no batch_ids provided, assign 0 to all clusters + if batch_ids is None: + batch_ids = np.zeros(len(clusts), dtype=np.int64) + return _inter_cluster_distance(voxels, clusts, batch_ids, mode) -@nb.njit(parallel=True, cache=True) +@nb.njit(parallel=True) def _inter_cluster_distance(voxels: nb.float64[:,:], clusts: nb.types.List(nb.int64[:]), - batch_ids: nb.int64[:]) -> nb.float64[:,:]: + batch_ids: nb.int64[:], + mode: str = 'voxel') -> nb.float64[:,:]: + assert len(clusts) == len(batch_ids) dist_mat = np.zeros((len(batch_ids), len(batch_ids)), dtype=voxels.dtype) - for i in nb.prange(len(batch_ids)): - for j in range(len(batch_ids)): - if batch_ids[i] == batch_ids[j]: - if i < j: - dist_mat[i,j] = np.min(cdist_nb(voxels[clusts[i]], voxels[clusts[j]])) - elif i > j: - dist_mat[i,j] = dist_mat[j,i] + if mode == 'voxel': + for i in nb.prange(len(batch_ids)): + for j in range(len(batch_ids)): + if batch_ids[i] == batch_ids[j]: + if i < j: + dist_mat[i,j] = np.min(cdist_nb(voxels[clusts[i]], voxels[clusts[j]])) + elif i > j: + dist_mat[i,j] = dist_mat[j,i] + elif mode == 'centroid': + centroids = np.empty((len(batch_ids), voxels.shape[1]), dtype=voxels.dtype) + for i in nb.prange(len(batch_ids)): + centroids[i] = mean_nb(voxels[clusts[i]], axis=0) + for i in nb.prange(len(batch_ids)): + for j in range(len(batch_ids)): + if batch_ids[i] == batch_ids[j]: + if i < j: + dist_mat[i,j] = np.sqrt(np.sum((centroids[j]-centroids[i])**2)) + else: + dist_mat[i,j] = dist_mat[j,i] + else: + raise ValueError('Inter-cluster distance mode not supported') return dist_mat From 1a9f1f51d0ece3e0de67069025b0a8c5d26956f7 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 28 Sep 2022 23:19:55 -0700 Subject: [PATCH 23/52] Bug fix in vertex heuristic for n_primary=1 case --- mlreco/utils/vertex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/utils/vertex.py b/mlreco/utils/vertex.py index 42617041..6c6527b9 100644 --- a/mlreco/utils/vertex.py +++ b/mlreco/utils/vertex.py @@ -316,7 +316,7 @@ def predict_vertex(inter_idx, data_idx, input_data, res, # Handle the case where only a single primary is available if len(ppn_candidates) == 1: - particle_seg = res['particles_seg'][data_idx][inter_mask][c_indices[0]] + particle_seg = res['particles_seg'][data_idx][inter_mask][primary_particles][c_indices[0]] end_points = res['particle_node_features'][data_idx][inter_mask][primary_particles][c_indices[0], -9:-3].reshape(-1,3) if particle_seg != 1: # If there's a single shower object, pick the shower start point From 3336fa86444983f1a39cdda2f1cf4166054fa0ae Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 19 Oct 2022 17:12:51 -0700 Subject: [PATCH 24/52] Bug fix in network topology visualization tool cluster label casting --- mlreco/visualization/gnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/visualization/gnn.py b/mlreco/visualization/gnn.py index 938db03f..fd4e2d80 100644 --- a/mlreco/visualization/gnn.py +++ b/mlreco/visualization/gnn.py @@ -57,7 +57,7 @@ def network_topology(voxels, clusters, edge_index=[], 
clust_labels=[], edge_labe # Define the node features (label, color) n = len(clusters) if not len(clust_labels): clust_labels = np.ones(n) - if len(clust_labels) and isinstance(clust_labels[0], float): + if len(clust_labels) and not float(clust_labels[0]).is_integer(): node_labels = ['Instance ID: %d
<br>Group ID: %0.3f<br>Centroid: (%0.1f, %0.1f, %0.1f)' % (i, clust_labels[i], pos[i,0], pos[i,1], pos[i,2]) for i in range(n)] else: node_labels = ['Instance ID: %d<br>Group ID: %d<br>
Centroid: (%0.1f, %0.1f, %0.1f)' % (i, clust_labels[i], pos[i,0], pos[i,1], pos[i,2]) for i in range(n)] From 62f7d6564445ac2c60d713540ec87cccd7a667e7 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 19 Oct 2022 20:27:04 -0700 Subject: [PATCH 25/52] Fix inter_cluster_distance function (was parallelized but not thread safe) --- mlreco/utils/gnn/network.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mlreco/utils/gnn/network.py b/mlreco/utils/gnn/network.py index c202a8b8..0f50b2f8 100644 --- a/mlreco/utils/gnn/network.py +++ b/mlreco/utils/gnn/network.py @@ -440,7 +440,7 @@ def inter_cluster_distance(voxels, clusts, batch_ids=None, mode='voxel'): return _inter_cluster_distance(voxels, clusts, batch_ids, mode) -@nb.njit(parallel=True) +@nb.njit(parallel=True, cache=True) def _inter_cluster_distance(voxels: nb.float64[:,:], clusts: nb.types.List(nb.int64[:]), batch_ids: nb.int64[:], @@ -448,25 +448,20 @@ def _inter_cluster_distance(voxels: nb.float64[:,:], assert len(clusts) == len(batch_ids) dist_mat = np.zeros((len(batch_ids), len(batch_ids)), dtype=voxels.dtype) + indxi, indxj = np.triu_indices(len(batch_ids), 1) if mode == 'voxel': - for i in nb.prange(len(batch_ids)): - for j in range(len(batch_ids)): - if batch_ids[i] == batch_ids[j]: - if i < j: - dist_mat[i,j] = np.min(cdist_nb(voxels[clusts[i]], voxels[clusts[j]])) - elif i > j: - dist_mat[i,j] = dist_mat[j,i] + for k in nb.prange(len(indxi)): + i, j = indxi[k], indxj[k] + if batch_ids[i] == batch_ids[j]: + dist_mat[i,j] = dist_mat[j,i] = np.min(cdist_nb(voxels[clusts[i]], voxels[clusts[j]])) elif mode == 'centroid': centroids = np.empty((len(batch_ids), voxels.shape[1]), dtype=voxels.dtype) for i in nb.prange(len(batch_ids)): centroids[i] = mean_nb(voxels[clusts[i]], axis=0) - for i in nb.prange(len(batch_ids)): - for j in range(len(batch_ids)): - if batch_ids[i] == batch_ids[j]: - if i < j: - dist_mat[i,j] = np.sqrt(np.sum((centroids[j]-centroids[i])**2)) - else: - dist_mat[i,j] = dist_mat[j,i] + for k in nb.prange(len(indxi)): + i, j = indxi[k], indxj[k] + if batch_ids[i] == batch_ids[j]: + dist_mat[i,j] = dist_mat[j,i] = np.sqrt(np.sum((centroids[j]-centroids[i])**2)) else: raise ValueError('Inter-cluster distance mode not supported') From 0ce6ea85224454ad3bd5709f8d7f796338c7233b Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 20 Oct 2022 19:25:11 -0700 Subject: [PATCH 26/52] Small fix to Francois' changes --- mlreco/models/layers/common/gnn_full_chain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 26d4a08d..bf26c796 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -726,7 +726,7 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics segmentation_pred = out['segmentation'][0] - if self.enable_ghost and not self.enable_charge_rescaling: + if self.enable_ghost: #and not self.enable_charge_rescaling: segmentation_pred = segmentation_pred[deghost] if self._gspice_use_true_labels: gs_seg_label = torch.cat([cluster_label[0][:, :4], segment_label[:, None]], dim=1) From b2c21882189dce743b2675717cb9f6f84fdd9321 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 25 Oct 2022 10:15:15 -0700 Subject: [PATCH 27/52] Added routines useful to draw a training curve and its corresponding validation points --- mlreco/visualization/training.py | 301 
+++++++++++++++++++++++++++++++ 1 file changed, 301 insertions(+) create mode 100644 mlreco/visualization/training.py diff --git a/mlreco/visualization/training.py b/mlreco/visualization/training.py new file mode 100644 index 00000000..0cecd28e --- /dev/null +++ b/mlreco/visualization/training.py @@ -0,0 +1,301 @@ +import glob +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib as mpl +from matplotlib import pyplot as plt +from plotly.offline import iplot +from plotly import graph_objs as go +from plotly import subplots as psubplots +from plotly import colors as pcolors + +def set_size(width, fraction=1): + """ + Returns optimal figure dimension for a latex + plot, depending on the requested width. + + Args: + width (int) : Width of the figure + fraction (float): Fraction of the width + """ + # Width of figure (in pts) + fig_width_pt = width * fraction + + # Convert from pt to inches + inches_per_pt = 1 / 72.27 + + # Golden ratio to set aesthetic figure height + # https://disq.us/p/2940ij3 + golden_ratio = (5**.5 - 1) / 2 + + # Figure width in inches + fig_width_in = fig_width_pt * inches_per_pt + + # Figure height in inches + fig_height_in = fig_width_in * golden_ratio + + return (fig_width_in, fig_height_in) + + +def apply_latex_style(): + """ + Sets the necessary matplotlib and seaborn parameters + to draw a plot using latex. + """ + sns.set(rc={'figure.figsize':set_size(250), + 'text.usetex':True, + 'font.family': 'serif', + 'axes.labelsize': 8, + 'font.size': 8, + 'legend.fontsize': 8, + 'legend.labelspacing': 0.25, + 'legend.columnspacing': 0.25, + 'xtick.labelsize': 8, + 'ytick.labelsize': 8,}, context='paper') + sns.set_style('white') + sns.set_style(rc={'axes.grid':True, 'font.family': 'serif'}) + mpl.rcParams['text.latex.preamble'] = [r"\usepackage{amsmath,bm}"] + + +def find_key(df, key_list, separator=':'): + """ + Checks if a DataFrame contains any of the keys listed + in a character-separated string. + + Args: + df (pandas.DataFrame): Pandas dataframe (or dictionary) containing data + key_lit (str) : Character-separated list of keys + separator (str) : Separation character between keys + Returns: + str: Key found + str: Name of the first key (for legend purposes) + """ + key_list = key_list.split(separator) + key_name = key_list[0] + key_found = np.array([k in df.keys() for k in key_list]) + if not np.any(key_found): + raise KeyError('Could not find the keys provided:', key_list) + key = key_list[np.where(key_found)[0][0]] + return key, key_name + + +def get_training_df(log_dir, prefix='train'): + """ + Finds all training log files inside the specified directory + and concatenates them. If the range of iterations overlap, keep only + that from the file started further in the training. + + Assumes that the formatting of the log file names is of the form + `prefix-x.csv`, with `x` the number of iterations. 
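As a concrete toy example of the log-file naming and overlap handling described above (file names and values are invented; assumes the `mlreco.visualization.training` module added by this patch):

```python
import tempfile
import pandas as pd
from mlreco.visualization.training import get_training_df

# Fake two training runs, the second restarted from iteration 3
tmp = tempfile.mkdtemp()
pd.DataFrame({'iter': range(0, 6), 'loss': [5.0, 4.0, 3.0, 2.9, 2.8, 2.7]}).to_csv(f'{tmp}/train-0.csv', index=False)
pd.DataFrame({'iter': range(3, 8), 'loss': [2.5, 2.0, 1.8, 1.6, 1.5]}).to_csv(f'{tmp}/train-3.csv', index=False)

log = get_training_df(tmp, prefix='train')
# Iterations 3-5 of train-0.csv are dropped in favour of the restarted file
print(log['iter'].tolist())   # expected: [0, 1, 2, 3, 4, 5, 6, 7]
```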
+ + Args: + log_dir (str): Path to the directory that contains the training log files + prefix (str) : Prefix shared between training file names (default: `train`) + Returns: + pandas.DataFrame: Combined training log data + """ + log_files = np.array(glob.glob(f'{log_dir}/{prefix}*')) + end_points = np.array([int(f.split('-')[-1].split('.csv')[0]) for f in log_files]) + order = np.argsort(end_points) + end_points = np.append(end_points[order], 1e12) + return pd.concat([pd.read_csv(f, nrows=end_points[i+1]-end_points[i]) for i, f in enumerate(log_files[order])], sort=True) + + +def get_validation_df(log_dir, keys, prefix='inference'): + """ + Finds all validation log files inside the specified directory + and build a single dataframe out of them. It returns the mean and + std of the requested keys for each file. + + Assumes that the formatting of the log file names is of the form + `prefix-x.csv`, with `x` the number of iterations. + + The key list allows for `:`-separated names, in case separate files + use different names for the same quantity. + + Args: + log_dir (str): Path to the directory that contains the validation log files + keys (list) : List of quantities to get mean/std for + prefix (str) : Prefix shared between validation file names (default: `inference`) + Returns: + pandas.DataFrame: Combined validation log data + """ + # Initialize a dictionary + val_data = {'iter':[]} + for key in keys: + key_name = key.split(':')[0] + val_data[key_name+'_mean'] = [] + val_data[key_name+'_err'] = [] + + # Loop over validation log files + log_files = np.array(glob.glob(f'{log_dir}/{prefix}*')) + for log_file in log_files: + df = pd.read_csv(log_file) + it = int(log_file.split('/')[-1].split('-')[-1].split('.')[0]) + val_data['iter'].append(it) + for key_list in keys: + key, key_name = find_key(df, key_list) + val_data[f'{key_name}_mean'].append(df[key].mean()) + val_data[f'{key_name}_err'].append(df[key].std()/np.sqrt(len(df[key]))) + + args = np.argsort(val_data['iter']) + for key, val in val_data.items(): + val_data[key] = np.array(val)[args] + + return pd.DataFrame(val_data) + + +def draw_training_curves(log_dir, models, metrics, + limits={}, model_names={}, metric_names={}, + max_iter=-1, step=1, smoothing=1, print_min=False, print_max=False, + plotly=True, same_plot=True, paper=False, leg_ncols=1, + figure_name='', train_prefix='train', val_prefix='inference'): + """ + Finds all training and validation log files inside the specified + directory and draws an evolution plot of the request quantities. 
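A hypothetical call (directory layout, model folder and metric names are all invented) might look like:

```python
from mlreco.visualization.training import draw_training_curves

# Assumes logs/uresnet/ contains train-*.csv and inference-*.csv log files
draw_training_curves('logs/', ['uresnet'], ['accuracy', 'loss'],
                     model_names={'uresnet': 'UResNet baseline'},
                     metric_names={'accuracy': 'Accuracy', 'loss': 'Loss'},
                     smoothing=50, step=10, same_plot=False)
```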
+ + Args: + log_dir (str) : Path to the directory that contains the folder with log files + models (list) : List of model (folder) names under the main directory + metrics (list) : List of quantities to draw + limits (list/dict) : List of y boundaries for the plot (or dictionary of y boundaries, one per metric) + model_names (dict) : Dictionary which maps raw model names to model labels (default: empty dict) + metric_names (dict): Dictionary which maps raw metric names to metric labels (default: empty dict) + max_iter (int) : Maximum number of interation to include in the plot (default: -1) + step (int) : Step between two successive iterations that are represented (default: 1) + smoothing (int) : Number of iteration over which to average the metric value (default: 1) + plotly (bool) : Use plotly to draw (interactive) + same_plot (bool) : Draw all model/metric pairs on a single plot (default: True) + paper (bool) : Format plot for paper (use latex) + leg_ncols (int) : Number of columns in the legend (default: 3) + figure_name (str) : Name of the figure. If specified, figure is saved (default: '') + train_prefix (str) : Prefix shared between training file names (default: `train`) + val_prefix (str) : Prefix shared between validation file names (default: `inference`) + """ + # Set the style + plotly_colors = pcolors.convert_colors_to_same_type(pcolors.DEFAULT_PLOTLY_COLORS, 'tuple')[0] + if not plotly: + cr_char = '\n' + if paper: + apply_latex_style() + linewidth = 0.5 + markersize = 1 + else: + sns.set(rc={'figure.figsize':(16,9)}, context='notebook', font_scale=2) + sns.set_style('white') + sns.set_style(rc={'axes.grid':True}) + linewidth = 2 + markersize = 10 + else: + graphs = [] + cr_char = '
' + converter = lambda color: 'rgba({}, {}, {}, 0.5)'.format(*color) + plotly_colors = pcolors.color_parser(plotly_colors, converter) + layout = go.Layout(template='plotly_white', width=1000, height=500, margin=dict(r=20, l=20, b=20, t=20), + xaxis=dict(title=dict(text='Epochs', font=dict(size=20)), tickfont=dict(size=20), linecolor='black', mirror=True), + yaxis=dict(title=dict(text='Metric', font=dict(size=20)), tickfont=dict(size=20), linecolor='black', mirror=True), + legend=dict(font=dict(size=20))) + if len(models) == 1 and same_plot: + layout['legend']['title'] = model_names[models[0]] if models[0] in model_names else models[0] + + + # If there is >1 subplot, prepare the canvas + if not same_plot: + if not plotly: + fig, axes = plt.subplots(len(metrics), sharex=True) + fig.subplots_adjust(hspace=0) + for axis in axes: + axis.set_facecolor('white') + else: + fig = psubplots.make_subplots(rows=len(metrics), shared_xaxes=True, vertical_spacing=0) + for i in range(len(metrics)): + if i > 0: + layout[f'xaxis{i+1}'] = layout['xaxis'] + layout[f'yaxis{i+1}'] = layout['yaxis'] + layout[f'xaxis{i+1}']['title']['text'] = '' if i < len(metrics)-1 else 'Epochs' + layout[f'yaxis{i+1}']['title']['text'] = metric_names[metrics[i]] if metrics[i] in metric_names else metrics[i] + if metrics[i] in limits and len(limits[metrics[i]]) == 2: + layout[f'yaxis{i+1}']['range'] = limits[metrics[i]] + + fig.update_layout(layout) + + elif plotly: + if isinstance(limits, list) and len(limits) == 2: + layout['yaxis']['range'] = limits + fig = go.Figure(layout=layout) + + # Get the DataFrames for the requested models/metrics + dfs, val_dfs, colors = {}, {}, {} + for i, key in enumerate(models): + log_subdir = log_dir+key + dfs[key] = get_training_df(log_subdir, train_prefix) + val_dfs[key] = get_validation_df(log_subdir, metrics, val_prefix) + colors[key] = plotly_colors[i] + + # Draw the requested metrics + for i, metric_list in enumerate(metrics): + # Draw the training curves + metric, metric_name = find_key(dfs[key], metric_list) + for j, key in enumerate(dfs.keys()): + # Get the necessary data + epoch_train = dfs[key]['epoch'][:max_iter:step] + metric_train = dfs[key][metric][:max_iter:step] if smoothing == 1 else dfs[key][metric][:max_iter].rolling(smoothing, min_periods=1, center=True).mean()[::step] + draw_val = bool(len(val_dfs[key]['iter'])) + if draw_val: + mask_val = val_dfs[key]['iter'] < max_iter if max_iter > -1 else val_dfs[key]['iter'] < 1e12 + iter_val = val_dfs[key]['iter'][mask_val] + epoch_val = [float(dfs[key]['epoch'][dfs[key]['iter'] == it]) for it in iter_val] + metricm_val = val_dfs[key][metric_name+'_mean'][mask_val] + metrice_val = val_dfs[key][metric_name+'_err'][mask_val] + + # Pick a label for this specific model/metric pair + if not same_plot: + label = model_names[key] if key in model_names else key + else: + if len(models) == 1: + label = metric_names[metric_name] if metric_name in metric_names else metric_name + else: + label = f'{metric_names[metric_name] if metric_name in metric_names else metric_name} ({model_names[key] if key in model_names else key})' + if print_min and draw_val: + label += f'{cr_char}Min: {iter_val[np.argmin(metricm_val)]:d}' + if print_max and draw_val: + label += f'{cr_char}Max: {iter_val[np.argmax(metricm_val)]:d}' + + # Prepare the relevant plots + color = colors[key] if not same_plot else plotly_colors[i*len(models)+j] + if not plotly: + axis = plt if same_plot else axes[i] + axis.plot(epoch_train, metric_train, label=label, color=color, alpha=0.5, 
linewidth=linewidth) + if draw_val: + axis.errorbar(epoch_val, metricm_val, yerr=metrice_val, fmt='.', color=color, linewidth=linewidth, markersize=markersize) + else: + graphs += [go.Scatter(x=epoch_train, y=metric_train, name=label, line=dict(color=color), showlegend=(same_plot | (not same_plot and not i)))] + if draw_val: + hovertext = [f'(Iteration: {iter_val[i]:d})' for i in range(len(iter_val))] + # hovertext = [f'(Iteration: {iter_val[i]:d}, Epoch: {epoch_val[i]:0.3f}, Metric: {metricm_val[i]:0.3f})' for i in range(len(iter_val))] + graphs += [go.Scatter(x=epoch_val, y=metricm_val, error_y_array=metrice_val, mode='markers', hovertext=hovertext, marker=dict(color=color), showlegend=False)] + + if not plotly: + if not same_plot: + for i, metric in enumerate(metrics): + metric_name = metric.split(':')[0] + axes[i].set_xlabel('Epochs') + axes[i].set_ylabel(metric_names[metric_name] if metric_name in metric_names else metric_name) + if metric_name in limits and len(limits[metric_name]) == 2: + axes[i].set_ylim(limits[metric_name]) + axes[0].legend(ncol=leg_ncols) + else: + plt.xlabel('Epochs') + plt.ylabel('Metric') + plt.gca().set_ylim(limits) + legend_title = model_names[models[0]] if models[0] in model_names else models[0] + plt.legend(ncol=leg_ncols, title=legend_title if len(models)==1 else None) + if len(figure_name): + plt.savefig(f'{figure_name}.pdf', bbox_inches='tight') + plt.show() + else: + if not same_plot: + fig.add_traces(graphs, rows=list(np.arange(len(metrics), step=1./(2*len(models))).astype(int)+1), cols=list(np.ones(2*len(models)*len(metrics), dtype=int))) + else: + fig.add_traces(graphs) + iplot(fig) From cac961c06172eac62891b7f7ed74adb47b5a6876 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 25 Oct 2022 10:52:42 -0700 Subject: [PATCH 28/52] Tweaks to the training curve drawer --- mlreco/visualization/training.py | 41 ++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/mlreco/visualization/training.py b/mlreco/visualization/training.py index 0cecd28e..879f5674 100644 --- a/mlreco/visualization/training.py +++ b/mlreco/visualization/training.py @@ -148,7 +148,7 @@ def get_validation_df(log_dir, keys, prefix='inference'): def draw_training_curves(log_dir, models, metrics, limits={}, model_names={}, metric_names={}, max_iter=-1, step=1, smoothing=1, print_min=False, print_max=False, - plotly=True, same_plot=True, paper=False, leg_ncols=1, + interactive=True, same_plot=True, paper=False, leg_ncols=1, figure_name='', train_prefix='train', val_prefix='inference'): """ Finds all training and validation log files inside the specified @@ -159,22 +159,22 @@ def draw_training_curves(log_dir, models, metrics, models (list) : List of model (folder) names under the main directory metrics (list) : List of quantities to draw limits (list/dict) : List of y boundaries for the plot (or dictionary of y boundaries, one per metric) - model_names (dict) : Dictionary which maps raw model names to model labels (default: empty dict) - metric_names (dict): Dictionary which maps raw metric names to metric labels (default: empty dict) - max_iter (int) : Maximum number of interation to include in the plot (default: -1) - step (int) : Step between two successive iterations that are represented (default: 1) - smoothing (int) : Number of iteration over which to average the metric value (default: 1) - plotly (bool) : Use plotly to draw (interactive) - same_plot (bool) : Draw all model/metric pairs on a single plot (default: True) - paper (bool) 
: Format plot for paper (use latex) - leg_ncols (int) : Number of columns in the legend (default: 3) - figure_name (str) : Name of the figure. If specified, figure is saved (default: '') + model_names (dict) : Dictionary which maps raw model names to model labels (default: `{}`) + metric_names (dict): Dictionary which maps raw metric names to metric labels (default: `{}`) + max_iter (int) : Maximum number of interation to include in the plot (default: `-1`) + step (int) : Step between two successive iterations that are represented (default: `1`) + smoothing (int) : Number of iteration over which to average the metric value (default: `1`) + interactive (bool) : Use plotly to draw (default: `True`) + same_plot (bool) : Draw all model/metric pairs on a single plot (default: `True`) + paper (bool) : Format plot for paper, using latex (default: `False`) + leg_ncols (int) : Number of columns in the legend (default: `1`) + figure_name (str) : Name of the figure. If specified, figure is saved (default: `''`) train_prefix (str) : Prefix shared between training file names (default: `train`) val_prefix (str) : Prefix shared between validation file names (default: `inference`) """ # Set the style plotly_colors = pcolors.convert_colors_to_same_type(pcolors.DEFAULT_PLOTLY_COLORS, 'tuple')[0] - if not plotly: + if not interactive: cr_char = '\n' if paper: apply_latex_style() @@ -201,7 +201,7 @@ def draw_training_curves(log_dir, models, metrics, # If there is >1 subplot, prepare the canvas if not same_plot: - if not plotly: + if not interactive: fig, axes = plt.subplots(len(metrics), sharex=True) fig.subplots_adjust(hspace=0) for axis in axes: @@ -218,10 +218,11 @@ def draw_training_curves(log_dir, models, metrics, layout[f'yaxis{i+1}']['range'] = limits[metrics[i]] fig.update_layout(layout) - - elif plotly: + elif interactive: if isinstance(limits, list) and len(limits) == 2: layout['yaxis']['range'] = limits + if len(metrics) == 1: + layout['yaxis']['title']['text'] = metric_names[metrics[0]] if metrics[0] in metric_names else metrics[0] fig = go.Figure(layout=layout) # Get the DataFrames for the requested models/metrics @@ -254,6 +255,8 @@ def draw_training_curves(log_dir, models, metrics, else: if len(models) == 1: label = metric_names[metric_name] if metric_name in metric_names else metric_name + elif len(metrics) == 1: + label = model_names[key] if key in model_names else key else: label = f'{metric_names[metric_name] if metric_name in metric_names else metric_name} ({model_names[key] if key in model_names else key})' if print_min and draw_val: @@ -263,7 +266,7 @@ def draw_training_curves(log_dir, models, metrics, # Prepare the relevant plots color = colors[key] if not same_plot else plotly_colors[i*len(models)+j] - if not plotly: + if not interactive: axis = plt if same_plot else axes[i] axis.plot(epoch_train, metric_train, label=label, color=color, alpha=0.5, linewidth=linewidth) if draw_val: @@ -275,7 +278,7 @@ def draw_training_curves(log_dir, models, metrics, # hovertext = [f'(Iteration: {iter_val[i]:d}, Epoch: {epoch_val[i]:0.3f}, Metric: {metricm_val[i]:0.3f})' for i in range(len(iter_val))] graphs += [go.Scatter(x=epoch_val, y=metricm_val, error_y_array=metrice_val, mode='markers', hovertext=hovertext, marker=dict(color=color), showlegend=False)] - if not plotly: + if not interactive: if not same_plot: for i, metric in enumerate(metrics): metric_name = metric.split(':')[0] @@ -286,7 +289,9 @@ def draw_training_curves(log_dir, models, metrics, axes[0].legend(ncol=leg_ncols) else: 
plt.xlabel('Epochs') - plt.ylabel('Metric') + ylabel = metric_names[metrics[0]] if metrics[0] in metric_names else metrics[0] + print(ylabel) + plt.ylabel(ylabel if len(metrics) == 1 else 'Metric') plt.gca().set_ylim(limits) legend_title = model_names[models[0]] if models[0] in model_names else models[0] plt.legend(ncol=leg_ncols, title=legend_title if len(models)==1 else None) From 349834478bcaf5d53346bc86beb1888289ce7eac Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 27 Oct 2022 12:13:10 -0700 Subject: [PATCH 29/52] Add volume boundary split functionality in I/O tools --- mlreco/iotools/collates.py | 136 +++++++++++++++++- mlreco/iotools/factories.py | 10 +- mlreco/iotools/parsers/sparse.py | 57 +++++--- mlreco/models/full_chain.py | 3 - .../layers/gnn/losses/node_kinematics.py | 1 + mlreco/utils/deghosting.py | 3 - 6 files changed, 186 insertions(+), 24 deletions(-) diff --git a/mlreco/iotools/collates.py b/mlreco/iotools/collates.py index f9f5de54..9809411a 100644 --- a/mlreco/iotools/collates.py +++ b/mlreco/iotools/collates.py @@ -7,7 +7,124 @@ import numpy as np -def CollateSparse(batch): +class VolumeBoundaries: + """ + VolumeBoundaries is a helper class to deal with multiple detector volumes. Assume you have N + volumes that you want to process independently, but your input data file does not separate + between them (maybe it is hard to make the separation at simulation level, e.g. in Supera). + You can specify in the configuration of the collate function where the volume boundaries are + and this helper class will take care of the following: + + 1. Relabel batch ids: this will introduce "virtual" batch ids to account for each volume in + each batch. + + 2. Shift coordinates: voxel coordinates are shifted such that the origin is always the bottom + left corner of a volume. In other words, it ensures the voxel coordinate phase space is the + same regardless of which volume we are processing. That way you can train on a single volume + (subpart of the detector, e.g. cryostat or TPC) and process later however many volumes make up + your detector. + + 3. Sort coordinates: there is no guarantee that concatenating coordinates of N volumes vs the + stored coordinates for label tensors which cover all volumes already by default will yield the + same ordering. Hence we do a np.lexsort on coordinates after 1. and 2. have happened. We sort + by: batch id, z, y, x in this order. + + An example of configuration would be : + + ```yaml + collate: + collate_fn: Collatesparse + boundaries: [[1376.3], None, None] + ``` + + `boundaries` is what defines the different volumes. It has a length equal to the spatial dimension. + For each spatial dimension, `None` means that there is no boundary along that axis. + A list of floating numbers specifies the volume boundaries along that axis in voxel units. + The list of volumes will be inferred from this list of boundaries ("meshgrid" style, taking + all possible combinations of the boundaries to generate all the volumes). + """ + def __init__(self, definitions): + """ + See explanation of `boundaries` above. 
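A toy sketch of the splitting behaviour described above (boundary value and coordinates are invented; assumes the VolumeBoundaries class added to mlreco/iotools/collates.py by this patch):

```python
import numpy as np
from mlreco.iotools.collates import VolumeBoundaries

# Two volumes separated along x at 1376.3 voxels; 'None' (as a string) means no split along that axis
vb = VolumeBoundaries([[1376.3], 'None', 'None'])

# Rows are (batch_id, x, y, z); the second point lies in the second volume
voxels = np.array([[0.,  100., 50., 50.],
                   [0., 1400., 50., 50.]])
new_voxels, perm = vb.split(voxels)
# new_voxels[:, 0] now holds virtual batch ids (0 and 1 here), and the x coordinate
# of the second point is shifted so both volumes share the same voxel origin
print(new_voxels[perm])
```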
+ + Parameters + ========== + definitions: list + """ + self.dim = len(definitions) + self.boundaries = definitions + + # Quick sanity check + for i in range(self.dim): + assert self.boundaries[i] == 'None' or (isinstance(self.boundaries[i], list) and len(self.boundaries[i]) > 0) + if self.boundaries[i] == 'None': + self.boundaries[i] = None + continue + self.boundaries[i].sort() # Ascending order + + + def split(self, voxels): + """ + Parameters + ========== + voxels: np.array, shape (N, 4) + It should contain (batch id, x, y, z) coordinates in this order (as an example if you are working in 3D). + + Returns + ======= + new_voxels: np.array, shape (N, 4) + The array contains voxels with shifted coordinates + virtual batch ids. This array is not yet permuted + to obey the lexsort. + perm: np.array, shape (N,) + This is a permutation mask which can be used to apply the lexsort to both the new voxels and the features + or data tensor (which is not passed to this function). + """ + coords = voxels[:, 1:] + assert len(coords.shape) == 2 + assert self.dim == coords.shape[1] + + all_boundaries, shifts = [], [] + n_boundaries =[] + for n in range(self.dim): + if self.boundaries[n] is None: + all_boundaries.append([np.ones((coords.shape[0],), dtype=bool)]) + shifts.append([0.]) + n_boundaries.append(0) + continue + dim_boundaries = [] + dim_shifts = [] + for i in range(len(self.boundaries[n])): + dim_boundaries.append( coords[:, n] < self.boundaries[n][i] ) + dim_shifts.append(self.boundaries[n][i-1] if i > 0 else 0.) + dim_boundaries.append( coords[:, n] >= self.boundaries[n][-1] ) + dim_shifts.append(self.boundaries[n][-1]) + all_boundaries.append(dim_boundaries) + shifts.append(dim_shifts) + n_boundaries.append(len(self.boundaries[n])) + + #n_volumes = np.prod([len(x) for x in all_boundaries]) + # Generate indices + all_index = [] + for n in range(self.dim): + all_index.append(np.arange(n_boundaries[n]+1)) + combo = np.array(np.meshgrid(*tuple(all_index))).T.reshape(-1, self.dim) + + virtual_batch_ids = np.zeros((coords.shape[0],), dtype=np.int32) + new_coords = coords.copy() + for idx, c in enumerate(combo): + m = all_boundaries[0][c[0]] + for n in range(1, self.dim): + m = np.logical_and(m, all_boundaries[n][c[n]]) + virtual_batch_ids[m] = idx + for n in range(self.dim): + new_coords[m, n] -= int(shifts[n][c[n]]) + + new_voxels = np.concatenate([virtual_batch_ids[:, None], new_coords], axis=1) + perm = np.lexsort(new_voxels.T[list(range(1, self.dim+1)) + [0], :]) + return new_voxels, perm + + +def CollateSparse(batch, **kwargs): ''' Collate sparse input. @@ -15,6 +132,9 @@ def CollateSparse(batch): ---------- batch : a list of dictionary Each list element (single dictionary) is a minibatch data = key-value pairs where a value is a parser function return. + boundaries: list, optional, default is None + This contains a list of volume boundaries if you want to process distinct volumes independently. See VolumeBoundaries + documentation for more details and explanations. Returns ------- @@ -29,6 +149,10 @@ def CollateSparse(batch): - The dictionaries in the input batch tuple are assumed to have identical list of keys. 
''' import MinkowskiEngine as ME + + split_boundaries = 'boundaries' in kwargs + vb = VolumeBoundaries(kwargs['boundaries']) if split_boundaries else None + result = {} concat = np.concatenate for key in batch[0].keys(): @@ -54,7 +178,11 @@ def CollateSparse(batch): coords_minibatch.append(batched_coords) #coords = torch.Tensor(concat(coords_minibatch, axis=0)) + dim = coords[0].shape[1] coords = concat(coords_minibatch, axis=0) + if split_boundaries: + coords[:, :dim+1], perm = vb.split(coords[:, :dim+1]) + coords = coords[perm] result[key] = coords else: @@ -89,6 +217,12 @@ def CollateSparse(batch): axis=1 ) for batch_id, sample in enumerate(batch) ], axis = 0) data = concat([sample[key][1] for sample in batch], axis=0) + + if split_boundaries: + voxels, perm = vb.split(voxels) + voxels = voxels[perm] + data = data[perm] + result[key] = concat([voxels, data], axis=1) elif isinstance(batch[0][key],np.ndarray) and \ len(batch[0][key].shape) == 1: diff --git a/mlreco/iotools/factories.py b/mlreco/iotools/factories.py index 9963736f..217f0632 100644 --- a/mlreco/iotools/factories.py +++ b/mlreco/iotools/factories.py @@ -44,6 +44,13 @@ def loader_factory(cfg,event_list=None): shuffle = True if not 'shuffle' in params else bool(params['shuffle' ]) num_workers = 1 if not 'num_workers' in params else int (params['num_workers']) collate_fn = None if not 'collate_fn' in params else str (params['collate_fn' ]) + collate_kwargs = {} + + if collate_fn is None: + collate_params = params.get('collate', {}) + collate_fn = None if not 'collate_fn' in collate_params else str(collate_params['collate_fn']) + collate_params.pop('collate_fn', None) + collate_kwargs = collate_params if not int(params['batch_size']) % int(params['minibatch_size']) == 0: print('iotools.batch_size (',params['batch_size'],'must be divisble by iotools.minibatch_size',params['minibatch_size']) @@ -51,6 +58,7 @@ def loader_factory(cfg,event_list=None): import mlreco.iotools.collates import mlreco.iotools.samplers + from functools import partial ds = dataset_factory(cfg,event_list) sampler = None @@ -59,7 +67,7 @@ def loader_factory(cfg,event_list=None): sam_cfg['minibatch_size']=cfg['iotool']['minibatch_size'] sampler = getattr(mlreco.iotools.samplers,sam_cfg['name']).create(ds,sam_cfg) if collate_fn is not None: - collate_fn = getattr(mlreco.iotools.collates,collate_fn) + collate_fn = partial(getattr(mlreco.iotools.collates,collate_fn), **collate_kwargs) loader = DataLoader(ds, batch_size = minibatch_size, shuffle = shuffle, diff --git a/mlreco/iotools/parsers/sparse.py b/mlreco/iotools/parsers/sparse.py index 01a18f9d..fc728159 100644 --- a/mlreco/iotools/parsers/sparse.py +++ b/mlreco/iotools/parsers/sparse.py @@ -57,7 +57,7 @@ def parse_sparse2d(sparse_event_list): return np_voxels, np.concatenate(output, axis=-1) -def parse_sparse3d(sparse_event_list): +def parse_sparse3d(sparse_event_list, features=None): """ A function to retrieve sparse tensor input from larcv::EventSparseTensor3D object @@ -78,6 +78,15 @@ def parse_sparse3d(sparse_event_list): ------------- sparse_event_list: list of larcv::EventSparseTensor3D Can be repeated to load more features (one per feature). + features: int, optional + Default is None (ignored). If a positive integer is specified, + the sparse_event_list will be split in equal lists of length + `features`. Each list will be concatenated along the feature + dimension separately. Then all lists are concatenated along the + first dimension (voxels). 
For example, this lets you work with + distinct detector volumes whose input data is stored in separate + TTrees.`features` is required to be a divider of the `sparse_event_list` + length. Returns ------- @@ -86,21 +95,37 @@ def parse_sparse3d(sparse_event_list): data: numpy array(float32) with shape (N,C) Pixel values/channels, as many channels as specified larcv::EventSparseTensor3D. """ - meta = None - output = [] - np_voxels = None - for sparse_event in sparse_event_list: - num_point = sparse_event.as_vector().size() - if meta is None: - meta = sparse_event.meta() - np_voxels = np.empty(shape=(num_point, 3), dtype=np.int32) - larcv.fill_3d_voxels(sparse_event, np_voxels) - else: - assert meta == sparse_event.meta() - np_data = np.empty(shape=(num_point, 1), dtype=np.float32) - larcv.fill_3d_pcloud(sparse_event, np_data) - output.append(np_data) - return np_voxels, np.concatenate(output, axis=-1) + split_sparse_event_list = [sparse_event_list] + if features is not None and features > 0: + if len(sparse_event_list) % features > 0: + raise Exception("features number in parse_sparse3d should be a divider of the sparse_event_list length.") + split_sparse_event_list = np.split(np.array(sparse_event_list), len(sparse_event_list) / features) + + voxels, features = [], [] + features_count = None + for sparse_event_list in split_sparse_event_list: + if features_count is None: + features_count = len(sparse_event_list) + assert len(sparse_event_list) == features_count + + meta = None + output = [] + np_voxels = None + for sparse_event in sparse_event_list: + num_point = sparse_event.as_vector().size() + if meta is None: + meta = sparse_event.meta() + np_voxels = np.empty(shape=(num_point, 3), dtype=np.int32) + larcv.fill_3d_voxels(sparse_event, np_voxels) + else: + assert meta == sparse_event.meta() + np_data = np.empty(shape=(num_point, 1), dtype=np.float32) + larcv.fill_3d_pcloud(sparse_event, np_data) + output.append(np_data) + voxels.append(np_voxels) + features.append(np.concatenate(output, axis=-1)) + + return np.concatenate(voxels, axis=0), np.concatenate(features, axis=0) def parse_sparse3d_ghost(sparse_event_semantics): diff --git a/mlreco/models/full_chain.py b/mlreco/models/full_chain.py index 203373b4..06814fe4 100644 --- a/mlreco/models/full_chain.py +++ b/mlreco/models/full_chain.py @@ -257,9 +257,6 @@ def full_chain_cnn(self, input): input = [input[0][deghost]] if label_seg is not None and label_clustering is not None: - - #print(label_seg[0].shape, label_clustering[0].shape) - # ME uses 0 for batch column, so need to compensate label_clustering = adapt_labels(result, label_seg, diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index a8f815f2..96170cb9 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -175,6 +175,7 @@ def forward(self, out, types): valid_mask_type = node_assn_type > -1 # Do not apply loss if the logit corresponding to the true class is -inf (forbidden prediction) + # Not a problem is node_assn_type is -1, as these rows will already be excluded by previous mask valid_mask_type &= (node_pred_type[np.arange(len(node_assn_type)),node_assn_type] != -float('inf')).detach().cpu().numpy() # If high purity is requested, do not include broken particle in the loss diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index 5540151b..8333d239 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -124,9 
+124,6 @@ def adapt_labels_knn(result, label_seg, label_clustering, assert true_mask.shape[0] == label_seg[0].shape[0] c3 = max(c2, batch_column+1) - indices = "2762 2763 2767 2769 4821 4822 4831 4832 4833 4834 4835 4844 4857 6617 12095 12096 12097".split() - indices = np.array([int(i) for i in indices]) - for i in range(len(label_seg)): coords = label_seg[i][:, :c3] label_c = [] From 8eb58694f9505c4dd3ec70dc112aa1f75202c0a1 Mon Sep 17 00:00:00 2001 From: Temigo Date: Fri, 28 Oct 2022 15:17:07 -0700 Subject: [PATCH 30/52] Changes to analysis tools (and small fixes) for volume splitting --- analysis/classes/Interaction.py | 99 ++ analysis/classes/Particle.py | 98 ++ analysis/classes/ParticleFragment.py | 63 + analysis/classes/TruthInteraction.py | 31 + analysis/classes/TruthParticle.py | 70 + analysis/classes/TruthParticleFragment.py | 24 + analysis/classes/__init__.py | 6 + analysis/classes/particle.py | 343 +---- analysis/classes/ui.py | 1153 ++++++++++------- mlreco/iotools/collates.py | 99 +- mlreco/models/layers/common/gnn_full_chain.py | 2 + mlreco/utils/cluster/fragmenter.py | 1 + mlreco/utils/gnn/data.py | 1 + 13 files changed, 1182 insertions(+), 808 deletions(-) create mode 100644 analysis/classes/Interaction.py create mode 100644 analysis/classes/Particle.py create mode 100644 analysis/classes/ParticleFragment.py create mode 100644 analysis/classes/TruthInteraction.py create mode 100644 analysis/classes/TruthParticle.py create mode 100644 analysis/classes/TruthParticleFragment.py diff --git a/analysis/classes/Interaction.py b/analysis/classes/Interaction.py new file mode 100644 index 00000000..a22a8f77 --- /dev/null +++ b/analysis/classes/Interaction.py @@ -0,0 +1,99 @@ +import numpy as np +import pandas as pd + +from typing import Counter, List, Union +from collections import defaultdict, Counter +from . import Particle + + +class Interaction: + """ + Data structure for managing interaction-level + full chain output information. + + Attributes + ---------- + id: int + Unique ID (Interaction ID) of this interaction. + particles: List[Particle] + List of objects that belong to this Interaction. + vertex: (1,3) np.array (Optional) + 3D coordinates of the predicted interaction vertex + nu_id: int (Optional, TODO) + Label indicating whether this interaction is a neutrino interaction + WARNING: The nu_id label is most likely unreliable. Don't use this + in reconstruction (used for debugging) + num_particles: int + total number of particles in this interaction. 
+ """ + def __init__(self, interaction_id, particles, vertex=None, nu_id=-1, volume=0): + self.id = interaction_id + self.particles = particles + self.match = [] + self._match_counts = {} + self.check_validity() + # Voxel indices of an interaction is defined by the union of + # constituent particle voxel indices + self.voxel_indices = [] + for p in self.particles: + self.voxel_indices.append(p.voxel_indices) + assert p.interaction_id == interaction_id + self.voxel_indices = np.hstack(self.voxel_indices) + self.size = self.voxel_indices.shape[0] + self.num_particles = len(self.particles) + + self.pid_keys = { + 0: 'Photon', + 1: 'Electron', + 2: 'Muon', + 3: 'Pion', + 4: 'Proton' + } + + self.get_particles_summary() + + self.vertex = vertex + if self.vertex is None: + self.vertex = np.array([-1, -1, -1]) + + self.nu_id = nu_id + self.volume = volume + + self.particle_ids = [p.id for p in self.particles] + self.particle_counts = Counter({ self.pid_keys[i] : 0 for i in range(len(self.pid_keys))}) + self.particle_counts.update([self.pid_keys[p.pid] for p in self.particles]) + + self.primary_particle_counts = Counter({ self.pid_keys[i] : 0 for i in range(len(self.pid_keys))}) + self.primary_particle_counts.update([self.pid_keys[p.pid] for p in self.particles if p.is_primary]) + + if sum(self.primary_particle_counts.values()) == 0: + # print("Interaction {} has no primary particles!".format(self.id)) + self.is_valid = False + else: + self.is_valid = True + + def check_validity(self): + for p in self.particles: + assert isinstance(p, Particle) + + def get_particles_summary(self): + self.particles_summary = "" + self.particles = sorted(self.particles, key=lambda x: x.id) + for p in self.particles: + pmsg = " - Particle {}: PID = {}, Size = {}, Match = {} \n".format( + p.id, self.pid_keys[p.pid], p.points.shape[0], str(p.match)) + self.particles_summary += pmsg + + + def __repr__(self): + + self.get_particles_summary() + msg = "Interaction {}, Valid: {}, Vertex: x={:.2f}, y={:.2f}, z={:.2f}\n"\ + "--------------------------------------------------------------------\n".format( + self.id, self.is_valid, self.vertex[0], self.vertex[1], self.vertex[2]) + return msg + self.particles_summary + + def __str__(self): + return "Interaction(id={}, vertex={}, nu_id={}, Particles={})".format( + self.id, str(self.vertex), self.nu_id, str(self.particle_ids)) + diff --git a/analysis/classes/Particle.py b/analysis/classes/Particle.py new file mode 100644 index 00000000..252e0107 --- /dev/null +++ b/analysis/classes/Particle.py @@ -0,0 +1,98 @@ +import numpy as np +import pandas as pd + +from typing import Counter, List, Union + + +class Particle: + ''' + Data Structure for managing Particle-level + full chain output information + + Attributes + ---------- + id: int + Unique ID of the particle + points: (N, 3) np.array + 3D coordinates of the voxels that belong to this particle + size: int + Total number of voxels that belong to this particle + depositions: (N, 1) np.array + Array of energy deposition values for each voxel (rescaled, ADC units) + voxel_indices: (N, ) np.array + Numeric integer indices of voxel positions of this particle + with respect to the total array of point in a single image. + semantic_type: int + Semantic type (shower fragment (0), track (1), + michel (2), delta (3), lowE (4)) of this particle. + pid: int + PDG Type (Photon (0), Electron (1), Muon (2), + Charged Pion (3), Proton (4)) of this particle. 
+ pid_conf: float + Softmax probability score for the most likely pid prediction + interaction_id: int + Integer ID of the particle's parent interaction + image_id: int + ID of the image in which this particle resides in + is_primary: bool + Indicator whether this particle is a primary from an interaction. + match: List[int] + List of TruthParticle IDs for which this particle is matched to + + startpoint: (1,3) np.array + (1, 3) array of particle's startpoint, if it could be assigned + endpoint: (1,3) np.array + (1, 3) array of particle's endpoint, if it could be assigned + ''' + def __init__(self, coords, group_id, semantic_type, interaction_id, + pid, image_id=0, voxel_indices=None, depositions=None, volume=0, **kwargs): + self.id = group_id + self.points = coords + self.size = coords.shape[0] + self.depositions = depositions # In rescaled ADC + self.voxel_indices = voxel_indices + self.semantic_type = semantic_type + self.pid = pid + self.pid_conf = kwargs.get('pid_conf', None) + self.interaction_id = interaction_id + self.image_id = image_id + self.is_primary = kwargs.get('is_primary', False) + self.match = [] + self._match_counts = {} +# self.fragments = fragment_ids + self.semantic_keys = { + 0: 'Shower Fragment', + 1: 'Track', + 2: 'Michel Electron', + 3: 'Delta Ray', + 4: 'LowE Depo' + } + + self.pid_keys = { + -1: 'None', + 0: 'Photon', + 1: 'Electron', + 2: 'Muon', + 3: 'Pion', + 4: 'Proton' + } + + self.sum_edep = np.sum(self.depositions) + self.volume = volume + + def __str__(self): + return self.__repr__() + + def __repr__(self): + fmt = "Particle( Image ID={:<3} | Particle ID={:<3} | Semantic_type: {:<15}"\ + " | PID: {:<8} | Primary: {:<2} | Score = {:.2f}% | Interaction ID: {:<2} | Size: {:<5} | Volume: {:<2} )" + msg = fmt.format(self.image_id, self.id, + self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", + self.pid_keys[self.pid] if self.pid in self.pid_keys else "None", + self.is_primary, + self.pid_conf * 100, + self.interaction_id, + self.points.shape[0], + self.volume) + return msg + diff --git a/analysis/classes/ParticleFragment.py b/analysis/classes/ParticleFragment.py new file mode 100644 index 00000000..07cf40d7 --- /dev/null +++ b/analysis/classes/ParticleFragment.py @@ -0,0 +1,63 @@ +import numpy as np +import pandas as pd + +from typing import Counter, List, Union +from . import Particle + + +class ParticleFragment(Particle): + ''' + Data structure for managing fragment-level + full chain output information + + Attributes + ---------- + See documentation for shared attributes. + Below are attributes exclusive to ParticleFragment + + id: int + fragment ID of this particle fragment (different from particle id) + group_id: int + Group ID (alias for Particle ID) for which this fragment belongs to. + is_primary: bool + If True, then this particle fragment corresponds to + a primary ionization trajectory within the group of fragments that + compose a particle. 
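A fragment keeps its own fragment ID alongside the ID of the particle (group) it belongs to. A minimal sketch with toy values, using the constructor defined just below:

import numpy as np
from analysis.classes import ParticleFragment

frag = ParticleFragment(np.random.rand(5, 3) * 768, fragment_id=3, semantic_type=0,
                        interaction_id=0, group_id=1, voxel_indices=np.arange(5),
                        depositions=np.ones(5), is_primary=True)
print(frag.id, frag.group_id)  # 3 1 -> fragment 3 belongs to particle (group) 1
print(frag)                    # one-line summary from __repr__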
+ ''' + def __init__(self, coords, fragment_id, semantic_type, interaction_id, + group_id, image_id=0, voxel_indices=None, + depositions=None, volume=0, **kwargs): + self.id = fragment_id + self.points = coords + self.size = coords.shape[0] + self.depositions = depositions # In rescaled ADC + self.voxel_indices = voxel_indices + self.semantic_type = semantic_type + self.group_id = group_id + self.interaction_id = interaction_id + self.image_id = image_id + self.is_primary = kwargs.get('is_primary', False) + self.semantic_keys = { + 0: 'Shower Fragment', + 1: 'Track', + 2: 'Michel Electron', + 3: 'Delta Ray', + 4: 'LowE Depo' + } + self.volume = volume + + def __str__(self): + return self.__repr__() + + def __repr__(self): + fmt = "ParticleFragment( Image ID={:<3} | Fragment ID={:<3} | Semantic_type: {:<15}"\ + " | Group ID: {:<3} | Primary: {:<2} | Interaction ID: {:<2} | Size: {:<5} | Volume: {:<2})" + msg = fmt.format(self.image_id, self.id, + self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", + self.group_id, + self.is_primary, + self.interaction_id, + self.points.shape[0], + self.volume) + return msg + diff --git a/analysis/classes/TruthInteraction.py b/analysis/classes/TruthInteraction.py new file mode 100644 index 00000000..d76f863a --- /dev/null +++ b/analysis/classes/TruthInteraction.py @@ -0,0 +1,31 @@ +import numpy as np +import pandas as pd + +from . import Interaction, TruthParticle + + +class TruthInteraction(Interaction): + """ + Analogous data structure for Interactions retrieved from true labels. + """ + def __init__(self, *args, **kwargs): + super(TruthInteraction, self).__init__(*args, **kwargs) + self.match = [] + self._match_counts = {} + + def check_validity(self): + for p in self.particles: + assert isinstance(p, TruthParticle) + + def __repr__(self): + + self.get_particles_summary() + msg = "TruthInteraction {}, Vertex: x={:.2f}, y={:.2f}, z={:.2f}\n"\ + "-----------------------------------------------\n".format( + self.id, self.vertex[0], self.vertex[1], self.vertex[2]) + return msg + self.particles_summary + + def __str__(self): + return "TruthInteraction(id={}, vertex={}, nu_id={}, Particles={})".format( + self.id, str(self.vertex), self.nu_id, str(self.particle_ids)) + diff --git a/analysis/classes/TruthParticle.py b/analysis/classes/TruthParticle.py new file mode 100644 index 00000000..7757b13b --- /dev/null +++ b/analysis/classes/TruthParticle.py @@ -0,0 +1,70 @@ +import numpy as np +import pandas as pd + +from typing import Counter, List, Union +from . import Particle + + +class TruthParticle(Particle): + ''' + Data structure mirroring , reserved for true particles + derived from true labels / true MC information. + + Attributes + ---------- + See documentation for shared attributes. + Below are attributes exclusive to TruthParticle + + asis: larcv.Particle C++ object (Optional) + Raw larcv.Particle C++ object as retrived from parse_particles_asis. + match: List[int] + List of Particle IDs that match to this TruthParticle + coords_noghost: + Coordinates using true labels (not adapted to deghosting output) + depositions_noghost: + Depositions using true labels (not adapted to deghosting output), in MeV. + depositions_MeV: + Similar as `depositions`, i.e. using adapted true labels. + Using true MeV energy deposits instead of rescaled ADC units. 
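Matching between true and reconstructed particles is overlap-based; `purity_efficiency` (defined below) reduces to set intersections on `voxel_indices`. With toy indices:

import numpy as np

true_idx = np.array([1, 2, 3, 4, 5])   # voxel_indices of a TruthParticle
pred_idx = np.array([3, 4, 5, 6])      # voxel_indices of a candidate reconstructed Particle
overlap = len(np.intersect1d(true_idx, pred_idx))
print({'purity': overlap / len(pred_idx),       # 3/4 = 0.75
       'efficiency': overlap / len(true_idx)})  # 3/5 = 0.60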
+ ''' + def __init__(self, *args, particle_asis=None, coords_noghost=None, depositions_noghost=None, + depositions_MeV=None, **kwargs): + super(TruthParticle, self).__init__(*args, **kwargs) + self.asis = particle_asis + self.match = [] + self._match_counts = {} + self.coords_noghost = coords_noghost + self.depositions_noghost = depositions_noghost + self.depositions_MeV = depositions_MeV + + def __repr__(self): + fmt = "TruthParticle( Image ID={:<3} | Particle ID={:<3} | Semantic_type: {:<15}"\ + " | PID: {:<8} | Primary: {:<2} | Interaction ID: {:<2} | Size: {:<5} | Volume: {:<2} )" + msg = fmt.format(self.image_id, self.id, + self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", + self.pid_keys[self.pid] if self.pid in self.pid_keys else "None", + self.is_primary, + self.interaction_id, + self.points.shape[0], + self.volume) + return msg + + + def is_contained(self, spatial_size): + + p = self.particle_asis + check_contained = p.position().x() >= 0 and p.position().x() <= spatial_size \ + and p.position().y() >= 0 and p.position().y() <= spatial_size \ + and p.position().z() >= 0 and p.position().z() <= spatial_size \ + and p.end_position().x() >= 0 and p.end_position().x() <= spatial_size \ + and p.end_position().y() >= 0 and p.end_position().y() <= spatial_size \ + and p.end_position().z() >= 0 and p.end_position().z() <= spatial_size + return check_contained + + def purity_efficiency(self, other_particle): + overlap = len(np.intersect1d(self.voxel_indices, other_particle.voxel_indices)) + return { + "purity": overlap / len(other_particle.voxel_indices), + "efficiency": overlap / len(self.voxel_indices) + } + diff --git a/analysis/classes/TruthParticleFragment.py b/analysis/classes/TruthParticleFragment.py new file mode 100644 index 00000000..9df9366b --- /dev/null +++ b/analysis/classes/TruthParticleFragment.py @@ -0,0 +1,24 @@ +import numpy as np +import pandas as pd + +from typing import Counter, List, Union +from . 
import ParticleFragment + + +class TruthParticleFragment(ParticleFragment): + + def __init__(self, *args, depositions_MeV=None, **kwargs): + super(TruthParticleFragment, self).__init__(*args, **kwargs) + self.depositions_MeV = depositions_MeV + + def __repr__(self): + fmt = "TruthParticleFragment( Image ID={:<3} | Fragment ID={:<3} | Semantic_type: {:<15}"\ + " | Group ID: {:<3} | Primary: {:<2} | Interaction ID: {:<2} | Size: {:<5} | Volume: {:<2})" + msg = fmt.format(self.image_id, self.id, + self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", + self.group_id, + self.is_primary, + self.interaction_id, + self.points.shape[0], + self.volume) + return msg diff --git a/analysis/classes/__init__.py b/analysis/classes/__init__.py index e69de29b..6902e665 100644 --- a/analysis/classes/__init__.py +++ b/analysis/classes/__init__.py @@ -0,0 +1,6 @@ +from .Particle import Particle +from .ParticleFragment import ParticleFragment +from .TruthParticle import TruthParticle +from .TruthParticleFragment import TruthParticleFragment +from .Interaction import Interaction +from .TruthInteraction import TruthInteraction diff --git a/analysis/classes/particle.py b/analysis/classes/particle.py index 1645307f..8d4dc2f0 100644 --- a/analysis/classes/particle.py +++ b/analysis/classes/particle.py @@ -8,348 +8,7 @@ from pprint import pprint - -class Particle: - ''' - Data Structure for managing Particle-level - full chain output information - - Attributes - ---------- - id: int - Unique ID of the particle - points: (N, 3) np.array - 3D coordinates of the voxels that belong to this particle - size: int - Total number of voxels that belong to this particle - depositions: (N, 1) np.array - Array of energy deposition values for each voxel (rescaled, ADC units) - voxel_indices: (N, ) np.array - Numeric integer indices of voxel positions of this particle - with respect to the total array of point in a single image. - semantic_type: int - Semantic type (shower fragment (0), track (1), - michel (2), delta (3), lowE (4)) of this particle. - pid: int - PDG Type (Photon (0), Electron (1), Muon (2), - Charged Pion (3), Proton (4)) of this particle. - pid_conf: float - Softmax probability score for the most likely pid prediction - interaction_id: int - Integer ID of the particle's parent interaction - image_id: int - ID of the image in which this particle resides in - is_primary: bool - Indicator whether this particle is a primary from an interaction. 
- match: List[int] - List of TruthParticle IDs for which this particle is matched to - - startpoint: (1,3) np.array - (1, 3) array of particle's startpoint, if it could be assigned - endpoint: (1,3) np.array - (1, 3) array of particle's endpoint, if it could be assigned - ''' - def __init__(self, coords, group_id, semantic_type, interaction_id, - pid, image_id=0, voxel_indices=None, depositions=None, **kwargs): - self.id = group_id - self.points = coords - self.size = coords.shape[0] - self.depositions = depositions # In rescaled ADC - self.voxel_indices = voxel_indices - self.semantic_type = semantic_type - self.pid = pid - self.pid_conf = kwargs.get('pid_conf', None) - self.interaction_id = interaction_id - self.image_id = image_id - self.is_primary = kwargs.get('is_primary', False) - self.match = [] - self._match_counts = {} -# self.fragments = fragment_ids - self.semantic_keys = { - 0: 'Shower Fragment', - 1: 'Track', - 2: 'Michel Electron', - 3: 'Delta Ray', - 4: 'LowE Depo' - } - - self.pid_keys = { - -1: 'None', - 0: 'Photon', - 1: 'Electron', - 2: 'Muon', - 3: 'Pion', - 4: 'Proton' - } - - self.sum_edep = np.sum(self.depositions) - - def __str__(self): - return self.__repr__() - - def __repr__(self): - fmt = "Particle( Image ID={:<3} | Particle ID={:<3} | Semantic_type: {:<15}"\ - " | PID: {:<8} | Primary: {:<2} | Score = {:.2f}% | Interaction ID: {:<2} | Size: {:<5} )" - msg = fmt.format(self.image_id, self.id, - self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", - self.pid_keys[self.pid] if self.pid in self.pid_keys else "None", - self.is_primary, - self.pid_conf * 100, - self.interaction_id, - self.points.shape[0]) - return msg - - -class ParticleFragment(Particle): - ''' - Data structure for managing fragment-level - full chain output information - - Attributes - ---------- - See documentation for shared attributes. - Below are attributes exclusive to ParticleFragment - - id: int - fragment ID of this particle fragment (different from particle id) - group_id: int - Group ID (alias for Particle ID) for which this fragment belongs to. - is_primary: bool - If True, then this particle fragment corresponds to - a primary ionization trajectory within the group of fragments that - compose a particle. 
- ''' - def __init__(self, coords, fragment_id, semantic_type, interaction_id, - group_id, image_id=0, voxel_indices=None, - depositions=None, **kwargs): - self.id = fragment_id - self.points = coords - self.size = coords.shape[0] - self.depositions = depositions # In rescaled ADC - self.voxel_indices = voxel_indices - self.semantic_type = semantic_type - self.group_id = group_id - self.interaction_id = interaction_id - self.image_id = image_id - self.is_primary = kwargs.get('is_primary', False) - self.semantic_keys = { - 0: 'Shower Fragment', - 1: 'Track', - 2: 'Michel Electron', - 3: 'Delta Ray', - 4: 'LowE Depo' - } - - def __str__(self): - return self.__repr__() - - def __repr__(self): - fmt = "ParticleFragment( Image ID={:<3} | Fragment ID={:<3} | Semantic_type: {:<15}"\ - " | Group ID: {:<3} | Primary: {:<2} | Interaction ID: {:<2} | Size: {:<5} )" - msg = fmt.format(self.image_id, self.id, - self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", - self.group_id, - self.is_primary, - self.interaction_id, - self.points.shape[0]) - return msg - - -class TruthParticleFragment(ParticleFragment): - - def __init__(self, *args, depositions_MeV=None, **kwargs): - super(TruthParticleFragment, self).__init__(*args, **kwargs) - self.depositions_MeV = depositions_MeV - - def __repr__(self): - fmt = "TruthParticleFragment( Image ID={:<3} | Fragment ID={:<3} | Semantic_type: {:<15}"\ - " | Group ID: {:<3} | Primary: {:<2} | Interaction ID: {:<2} | Size: {:<5} )" - msg = fmt.format(self.image_id, self.id, - self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", - self.group_id, - self.is_primary, - self.interaction_id, - self.points.shape[0]) - return msg - - -class TruthParticle(Particle): - ''' - Data structure mirroring , reserved for true particles - derived from true labels / true MC information. - - Attributes - ---------- - See documentation for shared attributes. - Below are attributes exclusive to TruthParticle - - asis: larcv.Particle C++ object (Optional) - Raw larcv.Particle C++ object as retrived from parse_particles_asis. - match: List[int] - List of Particle IDs that match to this TruthParticle - coords_noghost: - Coordinates using true labels (not adapted to deghosting output) - depositions_noghost: - Depositions using true labels (not adapted to deghosting output), in MeV. - depositions_MeV: - Similar as `depositions`, i.e. using adapted true labels. - Using true MeV energy deposits instead of rescaled ADC units. 
- ''' - def __init__(self, *args, particle_asis=None, coords_noghost=None, depositions_noghost=None, - depositions_MeV=None, **kwargs): - super(TruthParticle, self).__init__(*args, **kwargs) - self.asis = particle_asis - self.match = [] - self._match_counts = {} - self.coords_noghost = coords_noghost - self.depositions_noghost = depositions_noghost - self.depositions_MeV = depositions_MeV - - def __repr__(self): - fmt = "TruthParticle( Image ID={:<3} | Particle ID={:<3} | Semantic_type: {:<15}"\ - " | PID: {:<8} | Primary: {:<2} | Interaction ID: {:<2} | Size: {:<5} )" - msg = fmt.format(self.image_id, self.id, - self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", - self.pid_keys[self.pid] if self.pid in self.pid_keys else "None", - self.is_primary, - self.interaction_id, - self.points.shape[0]) - return msg - - - def is_contained(self, spatial_size): - - p = self.particle_asis - check_contained = p.position().x() >= 0 and p.position().x() <= spatial_size \ - and p.position().y() >= 0 and p.position().y() <= spatial_size \ - and p.position().z() >= 0 and p.position().z() <= spatial_size \ - and p.end_position().x() >= 0 and p.end_position().x() <= spatial_size \ - and p.end_position().y() >= 0 and p.end_position().y() <= spatial_size \ - and p.end_position().z() >= 0 and p.end_position().z() <= spatial_size - return check_contained - - def purity_efficiency(self, other_particle): - overlap = len(np.intersect1d(self.voxel_indices, other_particle.voxel_indices)) - return { - "purity": overlap / len(other_particle.voxel_indices), - "efficiency": overlap / len(self.voxel_indices) - } - -class Interaction: - """ - Data structure for managing interaction-level - full chain output information. - - Attributes - ---------- - id: int - Unique ID (Interaction ID) of this interaction. - particles: List[Particle] - List of objects that belong to this Interaction. - vertex: (1,3) np.array (Optional) - 3D coordinates of the predicted interaction vertex - nu_id: int (Optional, TODO) - Label indicating whether this interaction is a neutrino interaction - WARNING: The nu_id label is most likely unreliable. Don't use this - in reconstruction (used for debugging) - num_particles: int - total number of particles in this interaction. 
- """ - def __init__(self, interaction_id, particles, vertex=None, nu_id=-1): - self.id = interaction_id - self.particles = particles - self.match = [] - self._match_counts = {} - self.check_validity() - # Voxel indices of an interaction is defined by the union of - # constituent particle voxel indices - self.voxel_indices = [] - for p in self.particles: - self.voxel_indices.append(p.voxel_indices) - assert p.interaction_id == interaction_id - self.voxel_indices = np.hstack(self.voxel_indices) - self.size = self.voxel_indices.shape[0] - self.num_particles = len(self.particles) - - self.pid_keys = { - 0: 'Photon', - 1: 'Electron', - 2: 'Muon', - 3: 'Pion', - 4: 'Proton' - } - - self.get_particles_summary() - - self.vertex = vertex - if self.vertex is None: - self.vertex = np.array([-1, -1, -1]) - - self.nu_id = nu_id - - self.particle_ids = [p.id for p in self.particles] - self.particle_counts = Counter({ self.pid_keys[i] : 0 for i in range(len(self.pid_keys))}) - self.particle_counts.update([self.pid_keys[p.pid] for p in self.particles]) - - self.primary_particle_counts = Counter({ self.pid_keys[i] : 0 for i in range(len(self.pid_keys))}) - self.primary_particle_counts.update([self.pid_keys[p.pid] for p in self.particles if p.is_primary]) - - if sum(self.primary_particle_counts.values()) == 0: - # print("Interaction {} has no primary particles!".format(self.id)) - self.is_valid = False - else: - self.is_valid = True - - def check_validity(self): - for p in self.particles: - assert isinstance(p, Particle) - - def get_particles_summary(self): - self.particles_summary = "" - self.particles = sorted(self.particles, key=lambda x: x.id) - for p in self.particles: - pmsg = " - Particle {}: PID = {}, Size = {}, Match = {} \n".format( - p.id, self.pid_keys[p.pid], p.points.shape[0], str(p.match)) - self.particles_summary += pmsg - - - def __repr__(self): - - self.get_particles_summary() - msg = "Interaction {}, Valid: {}, Vertex: x={:.2f}, y={:.2f}, z={:.2f}\n"\ - "--------------------------------------------------------------------\n".format( - self.id, self.is_valid, self.vertex[0], self.vertex[1], self.vertex[2]) - return msg + self.particles_summary - - def __str__(self): - return "Interaction(id={}, vertex={}, nu_id={}, Particles={})".format( - self.id, str(self.vertex), self.nu_id, str(self.particle_ids)) - - -class TruthInteraction(Interaction): - """ - Analogous data structure for Interactions retrieved from true labels. - """ - def __init__(self, *args, **kwargs): - super(TruthInteraction, self).__init__(*args, **kwargs) - self.match = [] - self._match_counts = {} - - def check_validity(self): - for p in self.particles: - assert isinstance(p, TruthParticle) - - def __repr__(self): - - self.get_particles_summary() - msg = "TruthInteraction {}, Vertex: x={:.2f}, y={:.2f}, z={:.2f}\n"\ - "-----------------------------------------------\n".format( - self.id, self.vertex[0], self.vertex[1], self.vertex[2]) - return msg + self.particles_summary - - def __str__(self): - return "TruthInteraction(id={}, vertex={}, nu_id={}, Particles={})".format( - self.id, str(self.vertex), self.nu_id, str(self.particle_ids)) +from . 
import Particle, TruthParticle, Interaction, TruthInteraction def matrix_counts(particles_x, particles_y): diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index 0536ff9a..1f0d0158 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -8,7 +8,10 @@ from collections import defaultdict from scipy.special import softmax -from analysis.classes.particle import * +from analysis.classes import Particle, ParticleFragment, TruthParticleFragment, \ + TruthParticle, Interaction, TruthInteraction +from analysis.classes.particle import matrix_counts, matrix_iou, \ + match_particles_fn, match_interactions_fn, group_particles_to_interactions_fn from analysis.algorithms.point_matching import * from mlreco.utils.groups import type_labels as TYPE_LABELS @@ -16,6 +19,7 @@ from mlreco.utils.deghosting import deghost_labels_and_predictions, compute_rescaled_charge from mlreco.utils.gnn.cluster import get_cluster_label +from mlreco.iotools.collates import VolumeBoundaries class FullChainPredictor: @@ -52,6 +56,7 @@ class FullChainPredictor: ''' def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False): self.module_config = cfg['model']['modules'] + self.cfg = cfg # Handle deghosting before anything and save deghosting specific # quantities separately from data_blob and result @@ -99,8 +104,18 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False): self.volume_boundaries[1, :] = (self.volume_boundaries[1, :] - min_y) / size_voxel_y self.volume_boundaries[2, :] = (self.volume_boundaries[2, :] - min_z) / size_voxel_z + # Determine whether we need to account for several distinct volumes + # split over "virtual" batch ids + boundaries = cfg['iotool'].get('collate', {}).get('boundaries', None) + if boundaries is not None: + self.vb = VolumeBoundaries(boundaries) + self._num_volumes = self.vb.num_volumes() + else: + self.vb = None + self._num_volumes = 1 + def __repr__(self): - msg = "FullChainEvaluator(num_images={})".format(self.num_images) + msg = "FullChainEvaluator(num_images={})".format(int(self.num_images/self._num_volumes)) return msg @@ -381,11 +396,59 @@ def _fit_predict_vertex_info(self, entry, inter_idx): return vertex_info + def _get_entries(self, entry, volume): + """ + Make a list of actual entries in the batch ids. This accounts for potential + virtual batch ids in case we used volume boundaries to process several volumes + separately. + + Parameters + ========== + entry: int + Which entry of the original dataset you want to access. + volume: int or None + Which volume you want to access. None means all of them. + + Returns + ======= + list + List of integers = actual batch ids in the tensors (potentially virtual batch ids). + """ + entries = [entry] # default behavior + if self.vb is not None: # in case we defined virtual batch ids (volume boundaries) + entries = self.vb.virtual_batch_ids(entry) # these are ALL the virtual batch ids corresponding to this entry + if volume is not None: # maybe we wanted to select a specific volume + entries = [entries[volume]] + return entries + + def _check_volume(self, volume): + """ + Basic sanity check that the volume given makes sense given the config. 
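The volume bookkeeping used throughout the rewritten methods assumes one block of consecutive virtual batch ids per original entry, which is consistent with the `entry % self._num_volumes` lookups below. A self-contained sketch of that convention (illustrative only, not the actual VolumeBoundaries API):

# Assumption: entry i with N volumes maps to virtual batch ids [i*N, ..., i*N + N - 1]
def virtual_batch_ids(entry, num_volumes):
    return [entry * num_volumes + v for v in range(num_volumes)]

num_volumes = 2
entries = virtual_batch_ids(3, num_volumes)   # [6, 7]
print([e % num_volumes for e in entries])     # [0, 1] -> volume index of each virtual id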
+ + Parameters + ========== + volume: int or None + + Returns + ======= + Nothing + """ + if volume is not None and self.vb is None: + raise Exception("You need to specify volume boundaries in your I/O config (collate section).") + if volume is not None: + assert isinstance(volume, (int, np.int64, np.int32)) and volume >= 0 + + def _translate(self, voxels, volume): + if self.vb is None: + return voxels + else: + return self.vb.translate(voxels, volume) def get_fragments(self, entry, only_primaries=False, min_particle_voxel_count=-1, attaching_threshold=2, - semantic_type=None, verbose=False, true_id=False) -> List[Particle]: + semantic_type=None, verbose=False, + true_id=False, volume=None) -> List[Particle]: ''' Method for retriving fragment list for given batch index. @@ -394,119 +457,141 @@ def get_fragments(self, entry, only_primaries=False, Method also performs startpoint prediction for shower fragments. - Inputs: - - entry: Batch number to retrieve example. - - semantic_type (optional): if True, only ppn candiates with the + Parameters + ========== + entry: int + Batch number to retrieve example. + only_primaries: bool, default False + min_particle_voxel_count: int, default -1 + attaching_threshold: float, default 2 + threshold distance to attach ppn point to particle. + semantic_type: int, default None + If True, only ppn candiates with the same predicted semantic type will be matched to its corresponding particle. - - threshold (float, optional): threshold distance to attach - ppn point to particle. + verbose: bool, default False + true_id: bool, default False + volume: int, default None - Returns: - - out: List of instances (see Particle class definition). + Returns + ======= + list + List of instances (see Particle class definition). ''' + self._check_volume(volume) + if min_particle_voxel_count < 0: min_particle_voxel_count = self.min_particle_voxel_count - point_cloud = self.data_blob['input_data'][entry][:, 1:4] - depositions = self.result['input_rescaled'][entry][:, 4] - fragments = self.result['fragments'][entry] - fragments_seg = self.result['fragments_seg'][entry] + entries = self._get_entries(entry, volume) - shower_mask = fragments_seg == 0 - shower_frag_primary = np.argmax(self.result['shower_node_pred'][entry], axis=1) + out_fragment_list = [] + for entry in entries: + volume = entry % self._num_volumes - if 'shower_node_features' in self.result: - shower_node_features = self.result['shower_node_features'][entry] - if 'track_node_features' in self.result: - track_node_features = self.result['track_node_features'][entry] + point_cloud = self.data_blob['input_data'][entry][:, 1:4] + depositions = self.result['input_rescaled'][entry][:, 4] + fragments = self.result['fragments'][entry] + fragments_seg = self.result['fragments_seg'][entry] - assert len(fragments_seg) == len(fragments) + shower_mask = np.isin(fragments_seg, self.module_config['grappa_shower']['base']['node_type']) + shower_frag_primary = np.argmax(self.result['shower_node_pred'][entry], axis=1) - temp = [] + if 'shower_node_features' in self.result: + shower_node_features = self.result['shower_node_features'][entry] + if 'track_node_features' in self.result: + track_node_features = self.result['track_node_features'][entry] - if ('inter_group_pred' in self.result) and ('particles' in self.result) and len(fragments) > 0: + assert len(fragments_seg) == len(fragments) - group_labels = self._fit_predict_groups(entry) - inter_labels = self._fit_predict_interaction_labels(entry) - group_ids = 
get_cluster_label(group_labels.reshape(-1, 1), fragments, column=0) - inter_ids = get_cluster_label(inter_labels.reshape(-1, 1), fragments, column=0) + temp = [] + + if ('inter_group_pred' in self.result) and ('particles' in self.result) and len(fragments) > 0: + + group_labels = self._fit_predict_groups(entry) + inter_labels = self._fit_predict_interaction_labels(entry) + group_ids = get_cluster_label(group_labels.reshape(-1, 1), fragments, column=0) + inter_ids = get_cluster_label(inter_labels.reshape(-1, 1), fragments, column=0) + + else: + group_ids = np.ones(len(fragments)).astype(int) * -1 + inter_ids = np.ones(len(fragments)).astype(int) * -1 - else: - group_ids = np.ones(len(fragments)).astype(int) * -1 - inter_ids = np.ones(len(fragments)).astype(int) * -1 - - if true_id: - true_fragment_labels = self.data_blob['cluster_label'][entry][:, 5] - - - for i, p in enumerate(fragments): - voxels = point_cloud[p] - seg_label = fragments_seg[i] - part = ParticleFragment(voxels, i, seg_label, - interaction_id=inter_ids[i], - group_id=group_ids[i], - image_id=entry, - voxel_indices=p, - depositions=depositions[p], - is_primary=False, - pid_conf=-1, - alias='Fragment') - temp.append(part) if true_id: - fid = true_fragment_labels[p] - fids, counts = np.unique(fid.astype(int), return_counts=True) - part.true_ids = fids - part.true_counts = counts - - # Label shower fragments as primaries and attach startpoint - shower_counter = 0 - for p in temp: - if p.semantic_type == 0: + true_fragment_labels = self.data_blob['cluster_label'][entry][:, 5] + + + for i, p in enumerate(fragments): + voxels = point_cloud[p] + seg_label = fragments_seg[i] + part = ParticleFragment(self._translate(voxels, volume), + i, seg_label, + interaction_id=inter_ids[i], + group_id=group_ids[i], + image_id=entry, + voxel_indices=p, + depositions=depositions[p], + is_primary=False, + pid_conf=-1, + alias='Fragment', + volume=volume) + temp.append(part) + if true_id: + fid = true_fragment_labels[p] + fids, counts = np.unique(fid.astype(int), return_counts=True) + part.true_ids = fids + part.true_counts = counts + + # Label shower fragments as primaries and attach startpoint + shower_counter = 0 + for p in np.array(temp)[shower_mask]: is_primary = shower_frag_primary[shower_counter] p.is_primary = bool(is_primary) p.startpoint = shower_node_features[shower_counter][19:22] # p.group_id = int(shower_group_pred[shower_counter]) shower_counter += 1 - assert shower_counter == shower_frag_primary.shape[0] - - # Attach endpoint to track fragments - track_counter = 0 - for p in temp: - if p.semantic_type == 1: - # p.group_id = int(track_group_pred[track_counter]) - p.startpoint = track_node_features[track_counter][19:22] - p.endpoint = track_node_features[track_counter][22:25] - track_counter += 1 - # assert track_counter == track_group_pred.shape[0] - - # Apply fragment voxel cut - out = [] - for p in temp: - if p.points.shape[0] < min_particle_voxel_count: - continue - out.append(p) + assert shower_counter == shower_frag_primary.shape[0] + + # Attach endpoint to track fragments + track_counter = 0 + for p in temp: + if p.semantic_type == 1: + # p.group_id = int(track_group_pred[track_counter]) + p.startpoint = track_node_features[track_counter][19:22] + p.endpoint = track_node_features[track_counter][22:25] + track_counter += 1 + # assert track_counter == track_group_pred.shape[0] + + # Apply fragment voxel cut + out = [] + for p in temp: + if p.points.shape[0] < min_particle_voxel_count: + continue + out.append(p) - # Check 
primaries and assign ppn points - if only_primaries: - out = [p for p in out if p.is_primary] + # Check primaries and assign ppn points + if only_primaries: + out = [p for p in out if p.is_primary] - if semantic_type is not None: - out = [p for p in out if p.semantic_type == semantic_type] + if semantic_type is not None: + out = [p for p in out if p.semantic_type == semantic_type] - if len(out) == 0: - return out + if len(out) == 0: + return out - ppn_results = self._fit_predict_ppn(entry) - match_points_to_particles(ppn_results, out, - ppn_distance_threshold=attaching_threshold) + ppn_results = self._fit_predict_ppn(entry) + match_points_to_particles(ppn_results, out, + ppn_distance_threshold=attaching_threshold) - return out + out_fragment_list.extend(out) + + return out_fragment_list def get_particles(self, entry, only_primaries=True, min_particle_voxel_count=-1, - attaching_threshold=2) -> List[Particle]: + attaching_threshold=2, + volume=None) -> List[Particle]: ''' Method for retriving particle list for given batch index. @@ -527,104 +612,124 @@ def get_particles(self, entry, only_primaries=True, with the closest Hausdorff distance to the particle point cloud (smallest point-to-set distance) - Inputs: - - entry: Batch number to retrieve example. - - primaries: If set to True, only retrieve predicted primaries. - Returns: - - out: List of instances (see Particle class definition). + Parameters + ========== + entry: int + Batch number to retrieve example. + only_primaries: bool, default True + If set to True, only retrieve predicted primaries. + min_particle_voxel_count: int, default -1 + attaching_threshold: int, default 2 + volume: int, default None + + Returns + ======= + list + List of instances (see Particle class definition). ''' + self._check_volume(volume) + if min_particle_voxel_count < 0: min_particle_voxel_count = self.min_particle_voxel_count - point_cloud = self.data_blob['input_data'][entry][:, 1:4] - depositions = self.result['input_rescaled'][entry][:, 4] - particles = self.result['particles'][entry] - # inter_group_pred = self.result['inter_group_pred'][entry] - #print(point_cloud.shape, depositions.shape, len(particles)) - particles_seg = self.result['particles_seg'][entry] + entries = self._get_entries(entry, volume) - type_logits = self.result['node_pred_type'][entry] - input_node_features = [None] * type_logits.shape[0] - if 'particle_node_features' in self.result: - input_node_features = self.result['particle_node_features'][entry] - pids = np.argmax(type_logits, axis=1) - - out = [] - if point_cloud.shape[0] == 0: - return out - assert len(particles_seg) == len(particles) - assert len(pids) == len(particles) - assert len(input_node_features) == len(particles) - assert point_cloud.shape[0] == depositions.shape[0] + out_particle_list = [] + for entry in entries: + volume = entry % self._num_volumes - node_pred_vtx = self.result['node_pred_vtx'][entry] + point_cloud = self.data_blob['input_data'][entry][:, 1:4] + depositions = self.result['input_rescaled'][entry][:, 4] + particles = self.result['particles'][entry] + # inter_group_pred = self.result['inter_group_pred'][entry] + #print(point_cloud.shape, depositions.shape, len(particles)) + particles_seg = self.result['particles_seg'][entry] - assert node_pred_vtx.shape[0] == len(particles) - - if ('inter_group_pred' in self.result) and ('particles' in self.result) and len(particles) > 0: - - assert len(self.result['inter_group_pred'][entry]) == len(particles) - inter_labels = 
self._fit_predict_interaction_labels(entry) - inter_ids = get_cluster_label(inter_labels.reshape(-1, 1), particles, column=0) - - else: - inter_ids = np.ones(len(particles)).astype(int) * -1 + type_logits = self.result['node_pred_type'][entry] + input_node_features = [None] * type_logits.shape[0] + if 'particle_node_features' in self.result: + input_node_features = self.result['particle_node_features'][entry] + pids = np.argmax(type_logits, axis=1) - for i, p in enumerate(particles): - voxels = point_cloud[p] - if voxels.shape[0] < min_particle_voxel_count: - continue - seg_label = particles_seg[i] - pid = pids[i] - if seg_label == 2 or seg_label == 3: - pid = 1 - interaction_id = inter_ids[i] - is_primary = bool(np.argmax(node_pred_vtx[i][3:])) - part = Particle(voxels, i, seg_label, interaction_id, - pid, - batch_id=entry, - voxel_indices=p, - depositions=depositions[p], - is_primary=is_primary, - pid_conf=softmax(type_logits[i])[pids[i]]) - - part._node_features = input_node_features[i] - out.append(part) + out = [] + if point_cloud.shape[0] == 0: + return out + assert len(particles_seg) == len(particles) + assert len(pids) == len(particles) + assert len(input_node_features) == len(particles) + assert point_cloud.shape[0] == depositions.shape[0] - if only_primaries: - out = [p for p in out if p.is_primary] + node_pred_vtx = self.result['node_pred_vtx'][entry] - if len(out) == 0: - return out + assert node_pred_vtx.shape[0] == len(particles) - ppn_results = self._fit_predict_ppn(entry) + if ('inter_group_pred' in self.result) and ('particles' in self.result) and len(particles) > 0: - # Get ppn candidates for particle - match_points_to_particles(ppn_results, out, - ppn_distance_threshold=attaching_threshold) + assert len(self.result['inter_group_pred'][entry]) == len(particles) + inter_labels = self._fit_predict_interaction_labels(entry) + inter_ids = get_cluster_label(inter_labels.reshape(-1, 1), particles, column=0) - # Attach startpoint and endpoint - # as done in full chain geometric encoder - for p in out: - if p.size < min_particle_voxel_count: - continue - if p.semantic_type == 0: - pt = p._node_features[19:22] - # Check startpoint is replicated - assert(np.sum( - np.abs(pt - p._node_features[22:25])) < 1e-12) - p.startpoint = pt - elif p.semantic_type == 1: - startpoint, endpoint = p._node_features[19:22], p._node_features[22:25] - p.startpoint = startpoint - p.endpoint = endpoint else: - continue + inter_ids = np.ones(len(particles)).astype(int) * -1 - return out + for i, p in enumerate(particles): + voxels = point_cloud[p] + if voxels.shape[0] < min_particle_voxel_count: + continue + seg_label = particles_seg[i] + pid = pids[i] + if seg_label == 2 or seg_label == 3: + pid = 1 + interaction_id = inter_ids[i] + is_primary = bool(np.argmax(node_pred_vtx[i][3:])) + part = Particle(self._translate(voxels, volume), + i, seg_label, interaction_id, + pid, + batch_id=entry, + voxel_indices=p, + depositions=depositions[p], + is_primary=is_primary, + pid_conf=softmax(type_logits[i])[pids[i]], + volume=volume) + + part._node_features = input_node_features[i] + out.append(part) + + if only_primaries: + out = [p for p in out if p.is_primary] + + if len(out) == 0: + return out + + ppn_results = self._fit_predict_ppn(entry) + + # Get ppn candidates for particle + match_points_to_particles(ppn_results, out, + ppn_distance_threshold=attaching_threshold) + + # Attach startpoint and endpoint + # as done in full chain geometric encoder + for p in out: + if p.size < min_particle_voxel_count: + 
continue + if p.semantic_type == 0: + pt = p._node_features[19:22] + # Check startpoint is replicated + assert(np.sum( + np.abs(pt - p._node_features[22:25])) < 1e-12) + p.startpoint = pt + elif p.semantic_type == 1: + startpoint, endpoint = p._node_features[19:22], p._node_features[22:25] + p.startpoint = startpoint + p.endpoint = endpoint + else: + continue + out_particle_list.extend(out) + return out_particle_list - def get_interactions(self, entry, drop_nonprimary_particles=True) -> List[Interaction]: + + def get_interactions(self, entry, drop_nonprimary_particles=True, volume=None) -> List[Interaction]: ''' Method for retriving interaction list for given batch index. @@ -644,36 +749,63 @@ def get_interactions(self, entry, drop_nonprimary_particles=True) -> List[Intera Returns: - out: List of instances (see particle.Interaction). ''' - particles = self.get_particles(entry, only_primaries=drop_nonprimary_particles) - out = group_particles_to_interactions_fn(particles) - for ia in out: - ia.vertex = self._fit_predict_vertex_info(entry, ia.id) - return out + self._check_volume(volume) + + entries = self._get_entries(entry, volume) + + out_interaction_list = [] + for e in entries: + volume = e % self._num_volumes + particles = self.get_particles(entry, only_primaries=drop_nonprimary_particles, volume=volume) + out = group_particles_to_interactions_fn(particles) + for ia in out: + ia.vertex = self._fit_predict_vertex_info(e, ia.id) + ia.volume = volume + out_interaction_list.extend(out) + + return out_interaction_list - def fit_predict_labels(self, entry): + def fit_predict_labels(self, entry, volume=None): ''' Predict all labels of a given batch index . We define to be 1d tensors that annotate voxels. ''' - pred_seg = self._fit_predict_semantics(entry) - pred_fragments = self._fit_predict_fragments(entry) - pred_groups = self._fit_predict_groups(entry) - pred_interaction_labels = self._fit_predict_interaction_labels(entry) - pred_pids = self._fit_predict_pids(entry) - - pred = { - 'segment': pred_seg, - 'fragment': pred_fragments, - 'group': pred_groups, - 'interaction': pred_interaction_labels, - 'pdg': pred_pids + self._check_volume(volume) + entries = self._get_entries(entry, volume) + + all_pred = { + 'segment': [], + 'fragment': [], + 'group': [], + 'interaction': [], + 'pdg': [] } + for entry in entries: + pred_seg = self._fit_predict_semantics(entry) + pred_fragments = self._fit_predict_fragments(entry) + pred_groups = self._fit_predict_groups(entry) + pred_interaction_labels = self._fit_predict_interaction_labels(entry) + pred_pids = self._fit_predict_pids(entry) + + pred = { + 'segment': pred_seg, + 'fragment': pred_fragments, + 'group': pred_groups, + 'interaction': pred_interaction_labels, + 'pdg': pred_pids + } + + for key in pred: + if len(all_pred[key]) == 0: + all_pred[key] = pred[key] + else: + all_pred[key] = np.concatenate([all_pred[key], pred[key]], axis=0) - self._pred = pred + self._pred = all_pred - return pred + return all_pred def fit_predict(self, **kwargs): @@ -693,7 +825,7 @@ def fit_predict(self, **kwargs): labels = [] list_particles, list_interactions = [], [] - for entry in range(self.num_images): + for entry in range(int(self.num_images / self._num_volumes)): pred_dict = self.fit_predict_labels(entry) labels.append(pred_dict) @@ -778,17 +910,56 @@ def __init__(self, data_blob, result, cfg, processor_cfg={}, **kwargs): super(FullChainEvaluator, self).__init__(data_blob, result, cfg, processor_cfg, **kwargs) self.michel_primary_ionization_only = 
processor_cfg.get('michel_primary_ionization_only', False) - def get_true_label(self, entry, name, schema='cluster_label'): + def get_true_label(self, entry, name, schema='cluster_label', volume=None): + """ + Retrieve tensor in data blob, labelled with `schema`. + + Parameters + ========== + entry: int + name: str + Must be a predefined name within `['segment', 'fragment', 'group', + 'interaction', 'pdg', 'nu']`. + schema: str + Key for dataset schema to retrieve the info from. + volume: int, default None + + Returns + ======= + np.array + """ if name not in self.LABEL_TO_COLUMN: raise KeyError("Invalid label identifier name: {}. "\ "Available column names = {}".format( name, str(list(self.LABEL_TO_COLUMN.keys())))) column_idx = self.LABEL_TO_COLUMN[name] - return self.data_blob[schema][entry][:, column_idx] + self._check_volume(volume) + + entries = self._get_entries(entry, volume) + out = [] + for entry in entries: + out.append(self.data_blob[schema][entry][:, column_idx]) + return np.concatenate(out, axis=0) + + + def get_predicted_label(self, entry, name, volume=None): + """ + Returns predicted quantities to label a plot. + + Parameters + ========== + entry: int + name: str + Must be a predefined name within `['segment', 'fragment', 'group', + 'interaction', 'pdg', 'nu']`. + volume: int, default None - def get_predicted_label(self, entry, name): - pred = self.fit_predict_labels(entry) + Returns + ======= + np.array + """ + pred = self.fit_predict_labels(entry, volume=volume) return pred[name] @@ -814,93 +985,104 @@ def _apply_true_voxel_cut(self, entry): return set(particles_exclude) - def get_true_fragments(self, entry, verbose=False) -> List[TruthParticleFragment]: + def get_true_fragments(self, entry, verbose=False, volume=None) -> List[TruthParticleFragment]: ''' Get list of instances for given batch id. 
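When a label column is not unique across a fragment's voxels, the code below falls back to a majority vote; the recurring pattern is:

import numpy as np

values = np.array([7, 7, 7, 3])                 # e.g. group IDs of one fragment's voxels
ids, counts = np.unique(values, return_counts=True)
label = ids[counts.argmax()] if ids.shape[0] > 1 else ids[0]
print(label)  # 7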
''' - # Both are "adapted" labels - labels = self.data_blob['cluster_label'][entry] - segment_label = self.data_blob['segment_label'][entry][:, -1] - rescaled_input_charge = self.result['input_rescaled'][entry][:, 4] - - fragment_ids = set(list(np.unique(labels[:, 5]).astype(int))) - fragments = [] - - for fid in fragment_ids: - mask = labels[:, 5] == fid - - semantic_type, counts = np.unique(labels[:, -1][mask], return_counts=True) - if semantic_type.shape[0] > 1: - if verbose: - print("Semantic Type of Fragment {} is not "\ - "unique: {}, {}".format(fid, - str(semantic_type), - str(counts))) - perm = counts.argmax() - semantic_type = semantic_type[perm] - else: - semantic_type = semantic_type[0] - - points = labels[mask][:, 1:4] - size = points.shape[0] - depositions = rescaled_input_charge[mask] - depositions_MeV = labels[mask][:, 4] - voxel_indices = np.where(mask)[0] - - group_id, counts = np.unique(labels[:, 6][mask].astype(int), return_counts=True) - if group_id.shape[0] > 1: - if verbose: - print("Group ID of Fragment {} is not "\ - "unique: {}, {}".format(fid, - str(group_id), - str(counts))) - perm = counts.argmax() - group_id = group_id[perm] - else: - group_id = group_id[0] - - interaction_id, counts = np.unique(labels[:, 7][mask].astype(int), return_counts=True) - if interaction_id.shape[0] > 1: - if verbose: - print("Interaction ID of Fragment {} is not "\ - "unique: {}, {}".format(fid, - str(interaction_id), - str(counts))) - perm = counts.argmax() - interaction_id = interaction_id[perm] - else: - interaction_id = interaction_id[0] - - - is_primary, counts = np.unique(labels[:, -2][mask].astype(bool), return_counts=True) - if is_primary.shape[0] > 1: - if verbose: - print("Primary label of Fragment {} is not "\ - "unique: {}, {}".format(fid, - str(is_primary), - str(counts))) - perm = counts.argmax() - is_primary = is_primary[perm] - else: - is_primary = is_primary[0] + self._check_volume(volume) + + entries = self._get_entries(entry, volume) + + out_fragments_list = [] + for entry in entries: + volume = entry % self._num_volumes + + # Both are "adapted" labels + labels = self.data_blob['cluster_label'][entry] + segment_label = self.data_blob['segment_label'][entry][:, -1] + rescaled_input_charge = self.result['input_rescaled'][entry][:, 4] + + fragment_ids = set(list(np.unique(labels[:, 5]).astype(int))) + fragments = [] + + for fid in fragment_ids: + mask = labels[:, 5] == fid + + semantic_type, counts = np.unique(labels[:, -1][mask], return_counts=True) + if semantic_type.shape[0] > 1: + if verbose: + print("Semantic Type of Fragment {} is not "\ + "unique: {}, {}".format(fid, + str(semantic_type), + str(counts))) + perm = counts.argmax() + semantic_type = semantic_type[perm] + else: + semantic_type = semantic_type[0] + + points = labels[mask][:, 1:4] + size = points.shape[0] + depositions = rescaled_input_charge[mask] + depositions_MeV = labels[mask][:, 4] + voxel_indices = np.where(mask)[0] + + group_id, counts = np.unique(labels[:, 6][mask].astype(int), return_counts=True) + if group_id.shape[0] > 1: + if verbose: + print("Group ID of Fragment {} is not "\ + "unique: {}, {}".format(fid, + str(group_id), + str(counts))) + perm = counts.argmax() + group_id = group_id[perm] + else: + group_id = group_id[0] + + interaction_id, counts = np.unique(labels[:, 7][mask].astype(int), return_counts=True) + if interaction_id.shape[0] > 1: + if verbose: + print("Interaction ID of Fragment {} is not "\ + "unique: {}, {}".format(fid, + str(interaction_id), + str(counts))) + perm = 
counts.argmax() + interaction_id = interaction_id[perm] + else: + interaction_id = interaction_id[0] + + + is_primary, counts = np.unique(labels[:, -2][mask].astype(bool), return_counts=True) + if is_primary.shape[0] > 1: + if verbose: + print("Primary label of Fragment {} is not "\ + "unique: {}, {}".format(fid, + str(is_primary), + str(counts))) + perm = counts.argmax() + is_primary = is_primary[perm] + else: + is_primary = is_primary[0] - part = TruthParticleFragment(points, fid, semantic_type, - interaction_id=interaction_id, - group_id=group_id, - image_id=entry, - voxel_indices=voxel_indices, - depositions=depositions, - depositions_MeV=depositions_MeV, - is_primary=is_primary, - alias='Fragment') + part = TruthParticleFragment(self._translate(points, volume), + fid, semantic_type, + interaction_id=interaction_id, + group_id=group_id, + image_id=entry, + voxel_indices=voxel_indices, + depositions=depositions, + depositions_MeV=depositions_MeV, + is_primary=is_primary, + alias='Fragment', + volume=volume) - fragments.append(part) + fragments.append(part) + out_fragments_list.extend(fragments) - return fragments + return out_fragments_list def get_true_particles(self, entry, only_primaries=True, - verbose=False) -> List[TruthParticle]: + verbose=False, volume=None) -> List[TruthParticle]: ''' Get list of instances for given batch id. @@ -918,217 +1100,298 @@ def get_true_particles(self, entry, only_primaries=True, id number p: true momentum vector ''' - labels = self.data_blob['cluster_label'][entry] - if self.deghosting: - labels_noghost = self.data_blob['cluster_label_noghost'][entry] - segment_label = self.data_blob['segment_label'][entry][:, -1] - particle_ids = set(list(np.unique(labels[:, 6]).astype(int))) - rescaled_input_charge = self.result['input_rescaled'][entry][:, 4] + self._check_volume(volume) - particles = [] - exclude_ids = set([]) + entries = self._get_entries(entry, volume) - for idx, p in enumerate(self.data_blob['particles_asis'][entry]): - pid = int(p.id()) - # 1. Check if current pid is one of the existing group ids - if pid not in particle_ids: - # print("PID {} not in particle_ids".format(pid)) - continue - is_primary = p.group_id() == p.parent_id() - if p.pdg_code() not in TYPE_LABELS: - # print("PID {} not in TYPE LABELS".format(pid)) - continue - # For deghosting inputs, perform voxel cut with true nonghost coords. - if self.deghosting: - exclude_ids = self._apply_true_voxel_cut(entry) - if pid in exclude_ids: - # Skip this particle if its below the voxel minimum requirement - # print("PID {} was excluded from the list of particles due"\ - # " to true nonghost voxel cut. Exclude IDS = {}".format( - # p.id(), str(exclude_ids) - # )) - continue + out_particles_list = [] + global_entry = entry + for entry in entries: + volume = entry % self._num_volumes - pdg = TYPE_LABELS[p.pdg_code()] - mask = labels[:, 6].astype(int) == pid + labels = self.data_blob['cluster_label'][entry] if self.deghosting: - mask_noghost = labels_noghost[:, 6].astype(int) == pid - # If particle is Michel electron, we have the option to - # only consider the primary ionization. - # Semantic labels only label the primary ionization as Michel. - # Cluster labels will have the entire Michel together. 
- if self.michel_primary_ionization_only and 2 in labels[mask][:, -1].astype(int): - mask = mask & (labels[:, -1].astype(int) == 2) + labels_noghost = self.data_blob['cluster_label_noghost'][entry] + segment_label = self.data_blob['segment_label'][entry][:, -1] + particle_ids = set(list(np.unique(labels[:, 6]).astype(int))) + rescaled_input_charge = self.result['input_rescaled'][entry][:, 4] + + particles = [] + exclude_ids = set([]) + + for idx, p in enumerate(self.data_blob['particles_asis'][global_entry]): + pid = int(p.id()) + # 1. Check if current pid is one of the existing group ids + if pid not in particle_ids: + # print("PID {} not in particle_ids".format(pid)) + continue + is_primary = p.group_id() == p.parent_id() + if p.pdg_code() not in TYPE_LABELS: + # print("PID {} not in TYPE LABELS".format(pid)) + continue + # For deghosting inputs, perform voxel cut with true nonghost coords. if self.deghosting: - mask_noghost = mask_noghost & (labels_noghost[:, -1].astype(int) == 2) - - # Check semantics - semantic_type, sem_counts = np.unique( - labels[mask][:, -1].astype(int), return_counts=True) - - if semantic_type.shape[0] > 1: - if verbose: - print("Semantic Type of Particle {} is not "\ - "unique: {}, {}".format(pid, - str(semantic_type), - str(sem_counts))) - perm = sem_counts.argmax() - semantic_type = semantic_type[perm] - else: - semantic_type = semantic_type[0] + exclude_ids = self._apply_true_voxel_cut(global_entry) + if pid in exclude_ids: + # Skip this particle if its below the voxel minimum requirement + # print("PID {} was excluded from the list of particles due"\ + # " to true nonghost voxel cut. Exclude IDS = {}".format( + # p.id(), str(exclude_ids) + # )) + continue + + pdg = TYPE_LABELS[p.pdg_code()] + mask = labels[:, 6].astype(int) == pid + if self.deghosting: + mask_noghost = labels_noghost[:, 6].astype(int) == pid + # If particle is Michel electron, we have the option to + # only consider the primary ionization. + # Semantic labels only label the primary ionization as Michel. + # Cluster labels will have the entire Michel together. 
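In other words, the cluster mask is narrowed to the voxels that the semantic labels call Michel (toy arrays, illustration only):

import numpy as np

semantic = np.array([1, 1, 2, 2, 4])   # 2 = Michel, labels only the primary ionization
cluster  = np.array([9, 9, 9, 9, 9])   # cluster labels keep the whole Michel together
mask = cluster == 9
if 2 in semantic[mask]:
    mask = mask & (semantic == 2)      # keep only the primary ionization voxels
print(np.where(mask)[0])               # [2 3]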
+ if self.michel_primary_ionization_only and 2 in labels[mask][:, -1].astype(int): + mask = mask & (labels[:, -1].astype(int) == 2) + if self.deghosting: + mask_noghost = mask_noghost & (labels_noghost[:, -1].astype(int) == 2) + + # Check semantics + semantic_type, sem_counts = np.unique( + labels[mask][:, -1].astype(int), return_counts=True) + + if semantic_type.shape[0] > 1: + if verbose: + print("Semantic Type of Particle {} is not "\ + "unique: {}, {}".format(pid, + str(semantic_type), + str(sem_counts))) + perm = sem_counts.argmax() + semantic_type = semantic_type[perm] + else: + semantic_type = semantic_type[0] - coords = self.data_blob['input_data'][entry][mask][:, 1:4] + coords = self.data_blob['input_data'][entry][mask][:, 1:4] - interaction_id, int_counts = np.unique(labels[mask][:, 7].astype(int), - return_counts=True) - if interaction_id.shape[0] > 1: - if verbose: - print("Interaction ID of Particle {} is not "\ - "unique: {}".format(pid, str(interaction_id))) - perm = int_counts.argmax() - interaction_id = interaction_id[perm] - else: - interaction_id = interaction_id[0] - - nu_id, nu_counts = np.unique(labels[mask][:, 8].astype(int), - return_counts=True) - if nu_id.shape[0] > 1: - if verbose: - print("Neutrino ID of Particle {} is not "\ - "unique: {}".format(pid, str(nu_id))) - perm = nu_counts.argmax() - nu_id = nu_id[perm] - else: - nu_id = nu_id[0] + interaction_id, int_counts = np.unique(labels[mask][:, 7].astype(int), + return_counts=True) + if interaction_id.shape[0] > 1: + if verbose: + print("Interaction ID of Particle {} is not "\ + "unique: {}".format(pid, str(interaction_id))) + perm = int_counts.argmax() + interaction_id = interaction_id[perm] + else: + interaction_id = interaction_id[0] + + nu_id, nu_counts = np.unique(labels[mask][:, 8].astype(int), + return_counts=True) + if nu_id.shape[0] > 1: + if verbose: + print("Neutrino ID of Particle {} is not "\ + "unique: {}".format(pid, str(nu_id))) + perm = nu_counts.argmax() + nu_id = nu_id[perm] + else: + nu_id = nu_id[0] - fragments = np.unique(labels[mask][:, 5].astype(int)) - depositions_MeV = labels[mask][:, 4] - depositions = rescaled_input_charge[mask] # Will be in ADC - coords_noghost, depositions_noghost = None, None - if self.deghosting: - coords_noghost = labels_noghost[mask_noghost][:, 1:4] - depositions_noghost = labels_noghost[mask_noghost][:, 4].squeeze() - - particle = TruthParticle(coords, pid, - semantic_type, interaction_id, pdg, - particle_asis=p, - batch_id=entry, - depositions=depositions, - is_primary=is_primary, - coords_noghost=coords_noghost, - depositions_noghost=depositions_noghost, - depositions_MeV=depositions_MeV) - - particle.p = np.array([p.px(), p.py(), p.pz()]) - particle.fragments = fragments - particle.particle_asis = p - particle.nu_id = nu_id - particle.voxel_indices = np.where(mask)[0] - - particle.startpoint = np.array([p.first_step().x(), - p.first_step().y(), - p.first_step().z()]) - - if semantic_type == 1: - particle.endpoint = np.array([p.last_step().x(), - p.last_step().y(), - p.last_step().z()]) - - if particle.voxel_indices.shape[0] >= self.min_particle_voxel_count: - particles.append(particle) + fragments = np.unique(labels[mask][:, 5].astype(int)) + depositions_MeV = labels[mask][:, 4] + depositions = rescaled_input_charge[mask] # Will be in ADC + coords_noghost, depositions_noghost = None, None + if self.deghosting: + coords_noghost = labels_noghost[mask_noghost][:, 1:4] + depositions_noghost = labels_noghost[mask_noghost][:, 4].squeeze() + + particle = 
TruthParticle(self._translate(coords, volume), + pid, + semantic_type, interaction_id, pdg, + particle_asis=p, + batch_id=entry, + depositions=depositions, + is_primary=is_primary, + coords_noghost=coords_noghost, + depositions_noghost=depositions_noghost, + depositions_MeV=depositions_MeV, + volume=entry % self._num_volumes) + + particle.p = np.array([p.px(), p.py(), p.pz()]) + particle.fragments = fragments + particle.particle_asis = p + particle.nu_id = nu_id + particle.voxel_indices = np.where(mask)[0] + + particle.startpoint = np.array([p.first_step().x(), + p.first_step().y(), + p.first_step().z()]) + + if semantic_type == 1: + particle.endpoint = np.array([p.last_step().x(), + p.last_step().y(), + p.last_step().z()]) + + if particle.voxel_indices.shape[0] >= self.min_particle_voxel_count: + particles.append(particle) + + out_particles_list.extend(particles) if only_primaries: - particles = [p for p in particles if p.is_primary] + out_particles_list = [p for p in out_particles_list if p.is_primary] - return particles + return out_particles_list def get_true_interactions(self, entry, drop_nonprimary_particles=True, - min_particle_voxel_count=-1) -> List[Interaction]: + min_particle_voxel_count=-1, + volume=None) -> List[Interaction]: + self._check_volume(volume) if min_particle_voxel_count < 0: min_particle_voxel_count = self.min_particle_voxel_count - true_particles = self.get_true_particles(entry, only_primaries=drop_nonprimary_particles) - out = group_particles_to_interactions_fn(true_particles, - get_nu_id=True, mode='truth') - vertices = self.get_true_vertices(entry) - for ia in out: - ia.vertex = vertices[ia.id] - return out + entries = self._get_entries(entry, volume) + out_interactions_list = [] + for e in entries: + volume = e % self._num_volumes + true_particles = self.get_true_particles(entry, only_primaries=drop_nonprimary_particles, volume=volume) + out = group_particles_to_interactions_fn(true_particles, + get_nu_id=True, mode='truth') + vertices = self.get_true_vertices(entry, volume=volume) + for ia in out: + ia.vertex = vertices[ia.id] + ia.volume = volume + out_interactions_list.extend(out) + + return out_interactions_list + + def get_true_vertices(self, entry, volume=None): + """ + Parameters + ========== + entry: int + volume: int, default None + + Returns + ======= + dict + Keys are true interactions ids, values are np.array of shape (N, 3) + with true vertices coordinates. 
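Typical usage of this accessor, with a hypothetical `evaluator` standing in for an already-constructed FullChainEvaluator:

# `evaluator`: assumed FullChainEvaluator instance, built from data_blob/result/cfg as usual
vertices = evaluator.get_true_vertices(entry=0)             # all volumes of entry 0
vtx_vol0 = evaluator.get_true_vertices(entry=0, volume=0)   # restrict to volume 0
for inter_id, vtx in vertices.items():
    print(inter_id, vtx)   # true interaction ID -> vertex coordinates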
+ """ + self._check_volume(volume) - def get_true_vertices(self, entry): - inter_idxs = np.unique( - self.data_blob['cluster_label'][entry][:, 7].astype(int)) + entries = self._get_entries(entry, volume) out = {} - for inter_idx in inter_idxs: - if inter_idx < 0: - continue - vtx = get_vertex(self.data_blob['kinematics_label'], - self.data_blob['cluster_label'], - data_idx=entry, - inter_idx=inter_idx) - out[inter_idx] = vtx + for entry in entries: + volume = entry % self._num_volumes + inter_idxs = np.unique( + self.data_blob['cluster_label'][entry][:, 7].astype(int)) + for inter_idx in inter_idxs: + if inter_idx < 0: + continue + vtx = get_vertex(self.data_blob['kinematics_label'], + self.data_blob['cluster_label'], + data_idx=entry, + inter_idx=inter_idx) + out[inter_idx] = self._translate(vtx, volume) + return out def match_particles(self, entry, only_primaries=False, - mode='pred_to_true', **kwargs): + mode='pred_to_true', + volume=None, **kwargs): ''' Returns (, None) if no match was found + + Parameters + ========== + entry: int + only_primaries: bool, default False + mode: str, default 'pred_to_true' + Must be either 'pred_to_true' or 'true_to_pred' + volume: int, default None ''' - if mode == 'pred_to_true': - # Match each pred to one in true - particles_from = self.get_particles(entry, only_primaries=only_primaries) - particles_to = self.get_true_particles(entry, only_primaries=only_primaries) - elif mode == 'true_to_pred': - # Match each true to one in pred - particles_to = self.get_particles(entry, only_primaries=only_primaries) - particles_from = self.get_true_particles(entry, only_primaries=only_primaries) - else: - raise ValueError("Mode {} is not valid. For matching each"\ - " prediction to truth, use 'pred_to_true' (and vice versa).".format(mode)) - matched_pairs, _, _ = match_particles_fn(particles_from, particles_to, - min_overlap=self.min_overlap_count, - overlap_mode=self.overlap_mode, - **kwargs) - return matched_pairs + self._check_volume(volume) + + entries = self._get_entries(entry, volume) + all_matches = [] + for e in entries: + volume = e % self._num_volumes + if mode == 'pred_to_true': + # Match each pred to one in true + particles_from = self.get_particles(entry, only_primaries=only_primaries, volume=volume) + particles_to = self.get_true_particles(entry, only_primaries=only_primaries, volume=volume) + elif mode == 'true_to_pred': + # Match each true to one in pred + particles_to = self.get_particles(entry, only_primaries=only_primaries, volume=volume) + particles_from = self.get_true_particles(entry, only_primaries=only_primaries, volume=volume) + else: + raise ValueError("Mode {} is not valid. 
For matching each"\ + " prediction to truth, use 'pred_to_true' (and vice versa).".format(mode)) + matched_pairs, _, _ = match_particles_fn(particles_from, particles_to, + min_overlap=self.min_overlap_count, + overlap_mode=self.overlap_mode, + **kwargs) + all_matches.extend(matched_pairs) + return all_matches def match_interactions(self, entry, mode='pred_to_true', drop_nonprimary_particles=True, match_particles=True, - return_counts=False, **kwargs): - if mode == 'pred_to_true': - ints_from = self.get_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles) - ints_to = self.get_true_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles) - elif mode == 'true_to_pred': - ints_to = self.get_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles) - ints_from = self.get_true_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles) - else: - raise ValueError("Mode {} is not valid. For matching each"\ - " prediction to truth, use 'pred_to_true' (and vice versa).".format(mode)) + return_counts=False, + volume=None, **kwargs): + """ + Parameters + ========== + entry: int + mode: str, default 'pred_to_true' + Must be either 'pred_to_true' or 'true_to_pred'. + drop_nonprimary_particles: bool, default True + match_particles: bool, default True + return_counts: bool, default False + volume: int, default None - matched_interactions, _, counts = match_interactions_fn(ints_from, ints_to, + Returns + ======= + List[Tuple[Interaction, Interaction]] + List of tuples, indicating the matched interactions. + """ + self._check_volume(volume) + + entries = self._get_entries(entry, volume) + all_matches, all_counts = [], [] + for e in entries: + volume = e % self._num_volumes + if mode == 'pred_to_true': + ints_from = self.get_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles, volume=volume) + ints_to = self.get_true_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles, volume=volume) + elif mode == 'true_to_pred': + ints_to = self.get_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles, volume=volume) + ints_from = self.get_true_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles, volume=volume) + else: + raise ValueError("Mode {} is not valid. 
For matching each"\ + " prediction to truth, use 'pred_to_true' (and vice versa).".format(mode)) + + matched_interactions, _, counts = match_interactions_fn(ints_from, ints_to, + min_overlap=self.min_overlap_count, + **kwargs) + + if match_particles: + for interactions in matched_interactions: + domain, codomain = interactions + if codomain is None: + domain_particles, codomain_particles = domain.particles, [] + else: + domain_particles, codomain_particles = domain.particles, codomain.particles + # continue + matched_particles, _, _ = match_particles_fn(domain_particles, codomain_particles, min_overlap=self.min_overlap_count, - **kwargs) - - if match_particles: - for interactions in matched_interactions: - domain, codomain = interactions - if codomain is None: - domain_particles, codomain_particles = domain.particles, [] - else: - domain_particles, codomain_particles = domain.particles, codomain.particles - # continue - matched_particles, _, _ = match_particles_fn(domain_particles, codomain_particles, - min_overlap=self.min_overlap_count, - overlap_mode=self.overlap_mode) + overlap_mode=self.overlap_mode) + all_matches.extend(matched_interactions) + all_counts.extend(counts) if return_counts: - return matched_interactions, counts + return all_matches, all_counts else: - return matched_interactions + return all_matches diff --git a/mlreco/iotools/collates.py b/mlreco/iotools/collates.py index 9809411a..24306037 100644 --- a/mlreco/iotools/collates.py +++ b/mlreco/iotools/collates.py @@ -62,6 +62,73 @@ def __init__(self, definitions): continue self.boundaries[i].sort() # Ascending order + n_boundaries = [len(self.boundaries[n]) if self.boundaries[n] is not None else 0 for n in range(self.dim)] + # Generate indices that describe all volumes + all_index = [] + for n in range(self.dim): + all_index.append(np.arange(n_boundaries[n]+1)) + self.combo = np.array(np.meshgrid(*tuple(all_index))).T.reshape(-1, self.dim) + + # Generate coordinate shifts for each volume + # List of list (1st dim is spatial dimension, 2nd is volume splits in a given spatial dimension) + shifts = [] + for n in range(self.dim): + if self.boundaries[n] is None: + shifts.append([0.]) + continue + dim_shifts = [] + for i in range(len(self.boundaries[n])): + dim_shifts.append(self.boundaries[n][i-1] if i > 0 else 0.) + dim_shifts.append(self.boundaries[n][-1]) + shifts.append(dim_shifts) + self.shifts = shifts + + def num_volumes(self): + """ + Returns + ======= + int + """ + return len(self.combo) + + def virtual_batch_ids(self, entry=0): + """ + Parameters + ========== + entry: int, optional + Which entry of the dataset you are trying to access. + + Returns + ======= + list + List of virtual batch ids that correspond to this entry. + """ + return np.arange(len(self.combo)) + entry * self.num_volumes() + + def translate(self, voxels, volume): + """ + Meant to reverse what the split method does: for voxels coordinates initially in the range of volume 0, + translate to the range of a specific volume given in argument. + + Parameters + ========== + voxels: np.ndarray + Expected shape is (D_0, ..., D_N, self.dim) with N >=0. In other words, voxels can be a list of + coordinate or a single coordinate with shape (d,). + volume: int + + Returns + ======= + np.ndarray + Translated voxels array, using internally computed shifts. 
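+
+        Example
+        =======
+        A minimal sketch, assuming the boundaries define a single split of the
+        first coordinate at 700 (two volumes, with volume 1 shifted by +700) and
+        `vb` is a VolumeBoundaries instance built with that configuration:
+
+        .. code-block:: python
+
+            voxels = np.array([[10, 20, 30]])
+            vb.translate(voxels, volume=0)  # unchanged, volume 0 is the reference
+            vb.translate(voxels, volume=1)  # -> array([[710, 20, 30]])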
+ """ + assert volume >= 0 and volume < self.num_volumes() + assert voxels.shape[-1] == self.dim + + new_voxels = voxels.copy() + for n in range(self.dim): + new_voxels[..., n] += int(self.shifts[n][self.combo[volume][n]]) + return new_voxels def split(self, voxels): """ @@ -79,45 +146,35 @@ def split(self, voxels): This is a permutation mask which can be used to apply the lexsort to both the new voxels and the features or data tensor (which is not passed to this function). """ + assert len(voxels.shape) == 2 + batch_ids = voxels[:, 0] coords = voxels[:, 1:] - assert len(coords.shape) == 2 assert self.dim == coords.shape[1] - all_boundaries, shifts = [], [] - n_boundaries =[] + # This will contain the list of boolean masks corresponding to each boundary + # in each spatial dimension (so, list of list) + all_boundaries = [] for n in range(self.dim): if self.boundaries[n] is None: all_boundaries.append([np.ones((coords.shape[0],), dtype=bool)]) - shifts.append([0.]) - n_boundaries.append(0) continue dim_boundaries = [] - dim_shifts = [] for i in range(len(self.boundaries[n])): dim_boundaries.append( coords[:, n] < self.boundaries[n][i] ) - dim_shifts.append(self.boundaries[n][i-1] if i > 0 else 0.) dim_boundaries.append( coords[:, n] >= self.boundaries[n][-1] ) - dim_shifts.append(self.boundaries[n][-1]) all_boundaries.append(dim_boundaries) - shifts.append(dim_shifts) - n_boundaries.append(len(self.boundaries[n])) - - #n_volumes = np.prod([len(x) for x in all_boundaries]) - # Generate indices - all_index = [] - for n in range(self.dim): - all_index.append(np.arange(n_boundaries[n]+1)) - combo = np.array(np.meshgrid(*tuple(all_index))).T.reshape(-1, self.dim) virtual_batch_ids = np.zeros((coords.shape[0],), dtype=np.int32) new_coords = coords.copy() - for idx, c in enumerate(combo): - m = all_boundaries[0][c[0]] + for idx, c in enumerate(self.combo): # Looping over volumes + m = all_boundaries[0][c[0]] # Building a boolean mask for this volume for n in range(1, self.dim): m = np.logical_and(m, all_boundaries[n][c[n]]) - virtual_batch_ids[m] = idx + # Now defining virtual batch id + # We need to take into account original batch id + virtual_batch_ids[m] = idx + batch_ids[m] * self.num_volumes() for n in range(self.dim): - new_coords[m, n] -= int(shifts[n][c[n]]) + new_coords[m, n] -= int(self.shifts[n][c[n]]) new_voxels = np.concatenate([virtual_batch_ids[:, None], new_coords], axis=1) perm = np.lexsort(new_voxels.T[list(range(1, self.dim+1)) + [0], :]) diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index bf26c796..1a0af553 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -186,6 +186,7 @@ def run_fragment_gnns(self, result, input): frag_dict = self.get_all_fragments(result, input) fragments = frag_dict['frags'] frag_seg = frag_dict['frag_seg'] + print('run fragment gnns', np.unique(frag_seg, return_counts=True), len(fragments)) if self.enable_gnn_shower: @@ -217,6 +218,7 @@ def run_fragment_gnns(self, result, input): fragments[em_mask], output_keys, kwargs) + print('run fragment gnns', len(result['shower_fragments'][0][0]), len(fragments[em_mask]), len(fragments)) if self.enable_gnn_track: diff --git a/mlreco/utils/cluster/fragmenter.py b/mlreco/utils/cluster/fragmenter.py index 4d38dc26..537834fa 100644 --- a/mlreco/utils/cluster/fragmenter.py +++ b/mlreco/utils/cluster/fragmenter.py @@ -23,6 +23,7 @@ def format_fragments(fragments, frag_batch_ids, frag_seg, 
batch_column, batch_si dtype=object if not same_length else np.int64) frag_batch_ids_np = np.array(frag_batch_ids) frag_seg_np = np.array(frag_seg) + print('format fragments', np.unique(frag_batch_ids_np, return_counts=True), np.unique(frag_seg_np, return_counts=True)) batches, counts = torch.unique(batch_column, return_counts=True) # In case one of the events is "missing" and len(counts) < batch_size diff --git a/mlreco/utils/gnn/data.py b/mlreco/utils/gnn/data.py index cbb8c45a..1c91eecc 100644 --- a/mlreco/utils/gnn/data.py +++ b/mlreco/utils/gnn/data.py @@ -208,6 +208,7 @@ def _get_extra_gnn_features(fragments, for c in classes: mask |= (frag_seg == c) mask = np.where(mask)[0] + print('mask', np.unique(frag_seg, return_counts=True), frag_seg.shape, len(mask), classes) #print("INPUT = ", input) From 308ccd2fd0b89fea47ecc76df1378c0365e5dcfb Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 1 Nov 2022 13:17:11 -0700 Subject: [PATCH 31/52] Temporarly add parsers which compute rescaled charge on the fly. Will be removed later --- mlreco/iotools/parsers/__init__.py | 2 ++ mlreco/iotools/parsers/cluster.py | 21 +++++++++++++++++++++ mlreco/iotools/parsers/sparse.py | 16 ++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/mlreco/iotools/parsers/__init__.py b/mlreco/iotools/parsers/__init__.py index a32f7fb8..f63dc3e4 100644 --- a/mlreco/iotools/parsers/__init__.py +++ b/mlreco/iotools/parsers/__init__.py @@ -75,6 +75,7 @@ parse_sparse2d, parse_sparse3d, parse_sparse3d_ghost, + parse_sparse3d_charge_rescaled, # TEMPORARY parse_sparse2d_scn, # Deprecated parse_sparse3d_scn # Depreacted ) @@ -82,6 +83,7 @@ from mlreco.iotools.parsers.cluster import ( parse_cluster2d, parse_cluster3d, + parse_cluster3d_charge_rescaled, # TEMPORARY parse_cluster3d_kinematics_clean, # Deprecated parse_cluster3d_clean_full # Depreacted ) diff --git a/mlreco/iotools/parsers/cluster.py b/mlreco/iotools/parsers/cluster.py index 2e954ba6..abc44f05 100644 --- a/mlreco/iotools/parsers/cluster.py +++ b/mlreco/iotools/parsers/cluster.py @@ -192,6 +192,27 @@ def parse_cluster3d(cluster_event, return np_voxels, np_features +def parse_cluster3d_charge_rescaled(cluster_event, + particle_event = None, + particle_mpv_event = None, + sparse_semantics_event = None, + sparse_value_event_list = None, + add_particle_info = False, + add_kinematics_info = False, + clean_data = True, + precedence = [1,2,0,3,4], + type_include_mpr = False): + # Produces cluster3d labels with sparse3d_reco_rescaled on the fly on datasets that do not have it + np_voxels, np_features = parse_cluster3d(cluster_event, particle_event, particle_mpv_event, sparse_semantics_event, None, + add_particle_info, add_kinematics_info, clean_data, precedence, type_include_mpr) + + from .sparse import parse_sparse3d_charge_rescaled + _, val_features = parse_sparse3d_charge_rescaled(sparse_value_event_list) + np_features[:,0] = val_features[:,-1] + + return np_voxels, np_features + + def parse_cluster3d_clean_full(cluster_event, particle_event, particle_mpv_event=None, sparse_semantics_event=None): from warnings import warn warn("Deprecated: parse_cluster3d_clean_full deprecated, use parse_cluster3d instead", DeprecationWarning) diff --git a/mlreco/iotools/parsers/sparse.py b/mlreco/iotools/parsers/sparse.py index 01a18f9d..b50e2872 100644 --- a/mlreco/iotools/parsers/sparse.py +++ b/mlreco/iotools/parsers/sparse.py @@ -131,6 +131,22 @@ def parse_sparse3d_ghost(sparse_event_semantics): return np_voxels, (np_data==5).astype(np.float32) +def 
parse_sparse3d_charge_rescaled(sparse_event_list): + # Produces sparse3d_reco_rescaled on the fly on datasets that do not have it + np_voxels, output = parse_sparse3d(sparse_event_list) + + deghost = output[:, -1] < 5 + hit_charges = output[deghost, :3] + hit_ids = output[deghost, 3:6] + pmask = hit_ids > -1 + + _, inverse, counts = np.unique(hit_ids, return_inverse=True, return_counts=True) + multiplicity = counts[inverse].reshape(-1,3) + charges = np.sum((hit_charges*pmask)/multiplicity, axis=1)/np.sum(pmask, axis=1) + + return np_voxels[deghost], charges.reshape(-1,1) + + def parse_sparse2d_scn(sparse_event_list): from warnings import warn warn("Deprecated: parse_sparse2d_scn deprecated, use parse_sparse2d instead", DeprecationWarning) From 5b4e4ad8905bd188389c353ed15546f33fa6b842 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 1 Nov 2022 16:23:53 -0700 Subject: [PATCH 32/52] Bug fix in training curve drawing tool --- mlreco/visualization/training.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mlreco/visualization/training.py b/mlreco/visualization/training.py index 879f5674..b61b5770 100644 --- a/mlreco/visualization/training.py +++ b/mlreco/visualization/training.py @@ -197,7 +197,6 @@ def draw_training_curves(log_dir, models, metrics, legend=dict(font=dict(size=20))) if len(models) == 1 and same_plot: layout['legend']['title'] = model_names[models[0]] if models[0] in model_names else models[0] - # If there is >1 subplot, prepare the canvas if not same_plot: @@ -233,13 +232,13 @@ def draw_training_curves(log_dir, models, metrics, val_dfs[key] = get_validation_df(log_subdir, metrics, val_prefix) colors[key] = plotly_colors[i] - # Draw the requested metrics + # Loop over the requested metrics for i, metric_list in enumerate(metrics): - # Draw the training curves - metric, metric_name = find_key(dfs[key], metric_list) + # Get a graph per training campaign for j, key in enumerate(dfs.keys()): # Get the necessary data epoch_train = dfs[key]['epoch'][:max_iter:step] + metric, metric_name = find_key(dfs[key], metric_list) metric_train = dfs[key][metric][:max_iter:step] if smoothing == 1 else dfs[key][metric][:max_iter].rolling(smoothing, min_periods=1, center=True).mean()[::step] draw_val = bool(len(val_dfs[key]['iter'])) if draw_val: @@ -290,7 +289,6 @@ def draw_training_curves(log_dir, models, metrics, else: plt.xlabel('Epochs') ylabel = metric_names[metrics[0]] if metrics[0] in metric_names else metrics[0] - print(ylabel) plt.ylabel(ylabel if len(metrics) == 1 else 'Metric') plt.gca().set_ylim(limits) legend_title = model_names[models[0]] if models[0] in model_names else models[0] From 6cbcad70fc4cf365523cac221c82b4a35cd2b30d Mon Sep 17 00:00:00 2001 From: Temigo Date: Wed, 2 Nov 2022 12:04:31 -0700 Subject: [PATCH 33/52] Remove debugging print statements + small fix in VolumeBoundaries --- mlreco/iotools/collates.py | 3 ++- mlreco/models/layers/common/gnn_full_chain.py | 2 -- mlreco/utils/cluster/fragmenter.py | 1 - mlreco/utils/gnn/data.py | 1 - 4 files changed, 2 insertions(+), 5 deletions(-) diff --git a/mlreco/iotools/collates.py b/mlreco/iotools/collates.py index 24306037..ee040b03 100644 --- a/mlreco/iotools/collates.py +++ b/mlreco/iotools/collates.py @@ -56,10 +56,11 @@ def __init__(self, definitions): # Quick sanity check for i in range(self.dim): - assert self.boundaries[i] == 'None' or (isinstance(self.boundaries[i], list) and len(self.boundaries[i]) > 0) + assert self.boundaries[i] == 'None' or self.boundaries[i] is None or 
(isinstance(self.boundaries[i], list) and len(self.boundaries[i]) > 0) if self.boundaries[i] == 'None': self.boundaries[i] = None continue + if self.boundaries[i] is None: continue self.boundaries[i].sort() # Ascending order n_boundaries = [len(self.boundaries[n]) if self.boundaries[n] is not None else 0 for n in range(self.dim)] diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 1a0af553..bf26c796 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -186,7 +186,6 @@ def run_fragment_gnns(self, result, input): frag_dict = self.get_all_fragments(result, input) fragments = frag_dict['frags'] frag_seg = frag_dict['frag_seg'] - print('run fragment gnns', np.unique(frag_seg, return_counts=True), len(fragments)) if self.enable_gnn_shower: @@ -218,7 +217,6 @@ def run_fragment_gnns(self, result, input): fragments[em_mask], output_keys, kwargs) - print('run fragment gnns', len(result['shower_fragments'][0][0]), len(fragments[em_mask]), len(fragments)) if self.enable_gnn_track: diff --git a/mlreco/utils/cluster/fragmenter.py b/mlreco/utils/cluster/fragmenter.py index 537834fa..4d38dc26 100644 --- a/mlreco/utils/cluster/fragmenter.py +++ b/mlreco/utils/cluster/fragmenter.py @@ -23,7 +23,6 @@ def format_fragments(fragments, frag_batch_ids, frag_seg, batch_column, batch_si dtype=object if not same_length else np.int64) frag_batch_ids_np = np.array(frag_batch_ids) frag_seg_np = np.array(frag_seg) - print('format fragments', np.unique(frag_batch_ids_np, return_counts=True), np.unique(frag_seg_np, return_counts=True)) batches, counts = torch.unique(batch_column, return_counts=True) # In case one of the events is "missing" and len(counts) < batch_size diff --git a/mlreco/utils/gnn/data.py b/mlreco/utils/gnn/data.py index 1c91eecc..cbb8c45a 100644 --- a/mlreco/utils/gnn/data.py +++ b/mlreco/utils/gnn/data.py @@ -208,7 +208,6 @@ def _get_extra_gnn_features(fragments, for c in classes: mask |= (frag_seg == c) mask = np.where(mask)[0] - print('mask', np.unique(frag_seg, return_counts=True), frag_seg.shape, len(mask), classes) #print("INPUT = ", input) From 52510f3cd6c451511b1f1c04606cdb2d3712e853 Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 3 Nov 2022 16:45:29 -0700 Subject: [PATCH 34/52] FlashManager + fix get_nu_id + parse_opflash --- analysis/classes/FlashManager.py | 206 +++++++++++++++++++++++++++++ analysis/classes/Interaction.py | 7 + analysis/classes/__init__.py | 1 + mlreco/iotools/parsers/__init__.py | 3 +- mlreco/iotools/parsers/misc.py | 22 +++ mlreco/utils/groups.py | 10 +- 6 files changed, 246 insertions(+), 3 deletions(-) create mode 100644 analysis/classes/FlashManager.py diff --git a/analysis/classes/FlashManager.py b/analysis/classes/FlashManager.py new file mode 100644 index 00000000..2ddf6326 --- /dev/null +++ b/analysis/classes/FlashManager.py @@ -0,0 +1,206 @@ +import os, sys + + +class FlashManager: + def __init__(self, cfg, cfg_fmatch, meta=None, detector_specs=None): + + # Setup OpT0finder + basedir = os.getenv('FMATCH_BASEDIR') + if basedir is None: + raise Exception("You need to source OpT0Finder configure.sh first, or set the FMATCH_BASEDIR environment variable.") + + sys.path.append(os.path.join(basedir, 'python')) + #print(os.getenv('LD_LIBRARY_PATH'), os.getenv('ROOT_INCLUDE_PATH')) + os.environ['LD_LIBRARY_PATH'] = "%s:%s" % (os.path.join(basedir, 'build/lib'), os.environ['LD_LIBRARY_PATH']) + #os.environ['ROOT_INCLUDE_PATH'] = os.path.join(basedir, 
'build/include') + #print(os.environ['LD_LIBRARY_PATH'], os.environ['ROOT_INCLUDE_PATH']) + if 'FMATCH_DATADIR' not in os.environ: # needed for loading detector specs + os.environ['FMATCH_DATADIR'] = os.path.join(basedir, 'dat') + import ROOT + + import flashmatch + from flashmatch.visualization import plotly_layout3d, plot_track, plot_flash, plot_qcluster + from flashmatch import flashmatch, geoalgo + + # Setup meta + self.cfg = cfg + + self.min_x, self.min_y, self.min_z = None, None, None + self.size_voxel_x, self.size_voxel_y, self.size_voxel_z = None, None, None + if meta is not None: + self.min_x = meta[0] + self.min_y = meta[1] + self.min_z = meta[2] + self.size_voxel_x = meta[6] + self.size_voxel_y = meta[7] + self.size_voxel_z = meta[8] + + # Setup flash matching + print('Setting up OpT0Finder for flash matching...') + self.mgr = flashmatch.FlashMatchManager() + cfg = flashmatch.CreatePSetFromFile(cfg_fmatch) + if detector_specs is None: + self.det = flashmatch.DetectorSpecs.GetME(os.path.join(basedir, 'dat/detector_specs.cfg')) + else: + assert isinstance(detector_specs, str) + if not os.path.exists(detector_specs): + raise Exception("Detector specs file not found") + + self.det = flashmatch.DetectorSpecs.GetME(detector_specs) + self.mgr.Configure(cfg) + print('...done.') + + self.all_matches = None + self.pmt_v, self.tpc_v = None, None + + def get_flash(self, flash_id, array=False): + from flashmatch import flashmatch + + if self.pmt_v is None: + raise Exception("self.pmt_v is None") + + for flash in self.pmt_v: + if flash.idx != flash_id: continue + if array: return flashmatch.as_ndarray(flash) + else: return flash + + raise Exception("Flash %d does not exist in self.pmt_v" % flash_id) + + def get_qcluster(self, tpc_id, array=False): + from flashmatch import flashmatch + + if self.tpc_v is None: + raise Exception("self.tpc_v is None") + + for tpc in self.tpc_v: + if tpc.idx != tpc_id: continue + if array: return flashmatch.as_ndarray(tpc) + else: return tpc + + raise Exception("TPC object %d does not exist in self.tpc_v" % tpc_id) + + def make_qcluster(self, interactions): + from flashmatch import flashmatch + + if self.min_x is None: + raise Exception('min_x is None') + + tpc_v = [] + for p in interactions: + qcluster = flashmatch.QCluster_t() + qcluster.idx = int(p.id) # Assign a unique index + qcluster.time = 0 # assumed time w.r.t. 
trigger for reconstruction + for i in range(p.size): + # Create a geoalgo::QPoint_t + qpoint = flashmatch.QPoint_t( + p.points[i, 0] * self.size_voxel_x + self.min_x, + p.points[i, 1] * self.size_voxel_y + self.min_y, + p.points[i, 2] * self.size_voxel_z + self.min_z, + p.depositions[i]) + # Add it to geoalgo::QCluster_t + qcluster.push_back(qpoint) + tpc_v.append(qcluster) + + if self.tpc_v is not None: + print("Warning: overwriting internal list of particles.") + self.tpc_v = tpc_v + return tpc_v + + def make_flash(self, larcv_flashes): + """ + Parameters + ========== + larcv_flashes: list of list of larcv::Flash + + Returns + ======= + list of flashmatch::Flash_t + """ + from flashmatch import flashmatch + + flashes = [] + for branch in larcv_flashes: + flashes.extend(branch) + + pmt_v = [] + for idx, f in enumerate(flashes): + # f is an object of type larcv::Flash + flash = flashmatch.Flash_t() + flash.idx = f.id() # Assign a unique index + flash.time = f.time() # Flash timing, a candidate T0 + + # Assign the flash position and error on this position + flash.x, flash.y, flash.z = 0, 0, 0 + flash.x_err, flash.y_err, flash.z_err = 0, 0, 0 + + # PE distribution over the 360 photodetectors + #flash.pe_v = f.PEPerOpDet() + #for i in range(360): + offset = 0 if len(f.PEPerOpDet()) == 180 else 180 + for i in range(180): + flash.pe_v.push_back(f.PEPerOpDet()[i + offset]) + flash.pe_err_v.push_back(0.) + pmt_v.append(flash) + if self.pmt_v is not None: + print("Warning: overwriting internal list of flashes.") + self.pmt_v = pmt_v + return pmt_v + + def run_flash_matching(self, flashes=None, interactions=None): + if self.tpc_v is None: + if interactions is None: + raise Exception('You need to specify `interactions`, or to run make_qcluster.') + if interactions is not None: + self.make_qcluster(interactions) + + + if self.pmt_v is None: + if flashes is None: + raise Exception("PMT objects need to be defined. 
Either specify `flashes`, or run make_flash.") + if flashes is not None: + self.make_flash(flashes) + + assert self.tpc_v is not None and self.pmt_v is not None + + self.mgr.Reset() + + # First register all objects in manager + for x in self.tpc_v: + self.mgr.Add(x) + for x in self.pmt_v: + self.mgr.Add(x) + + # Run the matching + if self.all_matches is not None: + print("Warning: overwriting internal list of matches.") + self.all_matches = self.mgr.Match() + return self.all_matches + + def get_match(self, idx, matches=None): + if matches is None: + if self.all_matches is None: + raise Exception("Need to run flash matching first with run_flash_matching.") + matches = self.all_matches + + for m in self.all_matches: + if self.tpc_v[m.tpc_id].idx != idx: continue + return m + + return None + + def get_matched_flash(self, idx, matches=None): + m = self.get_match(idx, matches=matches) + if m is None: return None + + flash_id = m.flash_id + if flash_id is None: return None + + if flash_id > len(self.pmt_v): + raise Exception("Could not find flash id %d in self.pmt_v" % flash_id) + + return self.pmt_v[flash_id] + + + def get_t0(self, idx, matches=None): + flash = self.get_matched_flash(idx, matches=matches) + return None if flash is None else flash.time diff --git a/analysis/classes/Interaction.py b/analysis/classes/Interaction.py index a22a8f77..3e85df18 100644 --- a/analysis/classes/Interaction.py +++ b/analysis/classes/Interaction.py @@ -35,10 +35,17 @@ def __init__(self, interaction_id, particles, vertex=None, nu_id=-1, volume=0): # Voxel indices of an interaction is defined by the union of # constituent particle voxel indices self.voxel_indices = [] + self.points = [] + self.depositions = [] for p in self.particles: self.voxel_indices.append(p.voxel_indices) + self.points.append(p.points) + self.depositions.append(p.depositions) assert p.interaction_id == interaction_id self.voxel_indices = np.hstack(self.voxel_indices) + self.points = np.concatenate(self.points, axis=0) + self.depositions = np.hstack(self.depositions) + self.size = self.voxel_indices.shape[0] self.num_particles = len(self.particles) diff --git a/analysis/classes/__init__.py b/analysis/classes/__init__.py index 6902e665..c4fb0f0f 100644 --- a/analysis/classes/__init__.py +++ b/analysis/classes/__init__.py @@ -4,3 +4,4 @@ from .TruthParticleFragment import TruthParticleFragment from .Interaction import Interaction from .TruthInteraction import TruthInteraction +from .FlashManager import FlashManager diff --git a/mlreco/iotools/parsers/__init__.py b/mlreco/iotools/parsers/__init__.py index a32f7fb8..00e13ab2 100644 --- a/mlreco/iotools/parsers/__init__.py +++ b/mlreco/iotools/parsers/__init__.py @@ -103,5 +103,6 @@ from mlreco.iotools.parsers.misc import ( parse_meta2d, parse_meta3d, - parse_run_info + parse_run_info, + parse_opflash ) diff --git a/mlreco/iotools/parsers/misc.py b/mlreco/iotools/parsers/misc.py index 4918d138..e8964148 100644 --- a/mlreco/iotools/parsers/misc.py +++ b/mlreco/iotools/parsers/misc.py @@ -122,3 +122,25 @@ def parse_run_info(sparse_event): (run, subrun, event) """ return sparse_event.run(), sparse_event.subrun(), sparse_event.event() + + +def parse_opflash(opflash_event): + """ + Copy construct OpFlash and return an array of larcv::Flash. + + .. 
code-block:: yaml + schema: + opflash_cryoE: + parser:parse_opflash + opflash_event: opflash_cryoE + + Configuration + ------------- + opflash_event: larcv::EventFlash + + Returns + ------- + list + """ + opflashes = [larcv.Flash(f) for f in opflash_event.as_vector()] + return opflashes diff --git a/mlreco/utils/groups.py b/mlreco/utils/groups.py index a945fb03..bd998a8a 100644 --- a/mlreco/utils/groups.py +++ b/mlreco/utils/groups.py @@ -278,10 +278,16 @@ def get_nu_id(cluster_event, particle_v, interaction_ids, particle_mpv=None): else: # Find mpv particles is_mpv = np.zeros((len(particle_v),)) - mpv_ids = [p.id() for p in particle_mpv] + # mpv_ids = [p.id() for p in particle_mpv] + mpv_pdg = np.array([p.pdg_code() for p in particle_mpv]) + mpv_energy = np.array([p.energy_init() for p in particle_mpv]) for idx, part in enumerate(particle_v): # track_id - 1 in `particle_pcluster_tree` corresponds to id (or track_id) in `particle_mpv_tree` - if (part.track_id()-1) in mpv_ids or (part.ancestor_track_id()-1) in mpv_ids: + # if (part.track_id()-1) in mpv_ids or (part.ancestor_track_id()-1) in mpv_ids: + # FIXME the above was wrong I think. + close = np.isclose(part.energy_init()*1e-3, mpv_energy) + pdg = part.pdg_code() == mpv_pdg + if close.any() and pdg.any() and (np.where(close)[0] == np.where(pdg)[0]).any(): is_mpv[idx] = 1. # else: # print("fake cosmic", part.pdg_code(), part.shape(), part.creation_process(), part.track_id(), part.ancestor_track_id(), mpv_ids) From 535f88dcc3df869b282ec634b70c81ce7d981b06 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 3 Nov 2022 17:49:40 -0700 Subject: [PATCH 35/52] Added option in GrapPA to do edge length selection based on semantic type --- mlreco/models/grappa.py | 49 +++++++++---- mlreco/utils/gnn/cluster.py | 54 +++++++++++++-- mlreco/utils/gnn/network.py | 132 +++++++++++++++++++----------------- mlreco/utils/numba.py | 56 +++++++-------- 4 files changed, 179 insertions(+), 112 deletions(-) diff --git a/mlreco/models/grappa.py b/mlreco/models/grappa.py index d6a3ed7c..38d8b74f 100644 --- a/mlreco/models/grappa.py +++ b/mlreco/models/grappa.py @@ -8,8 +8,8 @@ from mlreco.models.layers.gnn import gnn_model_construct, node_encoder_construct, edge_encoder_construct, node_loss_construct, edge_loss_construct from mlreco.utils.gnn.data import merge_batch, split_clusts, split_edge_index -from mlreco.utils.gnn.cluster import form_clusters, get_cluster_batch, get_cluster_label, get_cluster_points_label, get_cluster_directions, get_cluster_dedxs -from mlreco.utils.gnn.network import complete_graph, delaunay_graph, mst_graph, bipartite_graph, inter_cluster_distance, knn_graph +from mlreco.utils.gnn.cluster import form_clusters, get_cluster_batch, get_cluster_label, get_cluster_primary_label, get_cluster_points_label, get_cluster_directions, get_cluster_dedxs +from mlreco.utils.gnn.network import complete_graph, delaunay_graph, mst_graph, bipartite_graph, inter_cluster_distance, knn_graph, restrict_graph class GNN(torch.nn.Module): """ @@ -150,9 +150,18 @@ def __init__(self, cfg, name='grappa', batch_col=0, coords_col=(1, 4)): # Choose what type of network to use self.network = base_config.get('network', 'complete') self.edge_max_dist = base_config.get('edge_max_dist', -1) - self.edge_dist_metric = base_config.get('edge_dist_metric', 'set') + self.edge_dist_metric = base_config.get('edge_dist_metric', 'voxel') self.edge_knn_k = base_config.get('edge_knn_k', 5) + # Turn the edge_max_dist value into a matrix + if not isinstance(self.edge_max_dist, 
list): self.edge_max_dist = [self.edge_max_dist] + mat_size = int((np.sqrt(8*len(self.edge_max_dist)+1)-1)/2) + max_dist_mat = np.zeros((mat_size, mat_size), dtype=float) + max_dist_mat[np.triu_indices(mat_size)] = self.edge_max_dist + max_dist_mat += max_dist_mat.T - np.diag(np.diag(max_dist_mat)) + self.edge_max_dist = max_dist_mat + print('edge_max_dist matrix', self.edge_max_dist) + # If requested, merge images together within the batch self.merge_batch = base_config.get('merge_batch', False) self.merge_batch_mode = base_config.get('merge_batch_mode', 'const') @@ -300,26 +309,25 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, # If necessary, compute the cluster distance matrix dist_mat = None - if self.edge_max_dist > 0 or self.network == 'mst' or self.network == 'knn': - dist_mat = inter_cluster_distance(cluster_data[:,self.coords_index[0]:self.coords_index[1]], clusts, batch_ids, self.edge_dist_metric) + if np.any(self.edge_max_dist > -1) or self.network == 'mst' or self.network == 'knn': + dist_mat = inter_cluster_distance(cluster_data[:,self.coords_index[0]:self.coords_index[1]].float(), clusts, batch_ids, self.edge_dist_metric) # Form the requested network if len(clusts) == 1: edge_index = np.empty((2,0), dtype=np.int64) elif self.network == 'complete': - edge_index = complete_graph(batch_ids, dist_mat, self.edge_max_dist) + edge_index = complete_graph(batch_ids) elif self.network == 'delaunay': import numba as nb - edge_index = delaunay_graph(cluster_data.cpu().numpy(), nb.typed.List(clusts), batch_ids, dist_mat, self.edge_max_dist, - batch_col=self.batch_index, coords_col=self.coords_index) + edge_index = delaunay_graph(cluster_data.cpu().numpy(), nb.typed.List(clusts), batch_ids, self.batch_index, self.coords_index) elif self.network == 'mst': - edge_index = mst_graph(batch_ids, dist_mat, self.edge_max_dist) + edge_index = mst_graph(batch_ids, dist_mat) elif self.network == 'knn': edge_index = knn_graph(batch_ids, self.edge_knn_k, dist_mat) elif self.network == 'bipartite': clust_ids = get_cluster_label(cluster_data, clusts, self.source_col) group_ids = get_cluster_label(cluster_data, clusts, self.target_col) - edge_index = bipartite_graph(batch_ids, clust_ids==group_ids, dist_mat, self.edge_max_dist) + edge_index = bipartite_graph(batch_ids, clust_ids==group_ids, dist_mat) else: raise ValueError('Network type not recognized: '+self.network) @@ -328,6 +336,15 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, mask = groups[edge_index[0]] == groups[edge_index[1]] edge_index = edge_index[:,mask] + # Restrict the input graph based on edge distance, if requested + if np.any(self.edge_max_dist > -1): + if self.edge_max_dist.shape[0] == 1: + edge_index = restrict_graph(edge_index, dist_mat, self.edge_max_dist) + else: + # Here get_cluster_primary_label is used to ensure that Michel/Delta showers are given the appropriate semantic label + classes = extra_feats[:,-1].cpu().numpy().astype(int) if extra_feats is not None else get_cluster_primary_label(cluster_data, clusts, -1).astype(int) + edge_index = restrict_graph(edge_index, dist_mat, self.edge_max_dist, classes) + # Update result with a list of edges for each batch id edge_index_split, ebids = split_edge_index(edge_index, batch_ids, batches) result['edge_index'] = [edge_index_split] @@ -346,11 +363,15 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, points = get_cluster_points_label(cluster_data, particles, clusts, 
coords_index=self.coords_index) x = torch.cat([x, points.float()], dim=1) if self.add_start_dir: - dirs = get_cluster_directions(cluster_data[:, self.coords_index[0]:self.coords_index[1]], points[:,:3], clusts, self.start_dir_max_dist, self.start_dir_opt) - x = torch.cat([x, dirs.float()], dim=1) + dirs_start = get_cluster_directions(cluster_data[:, self.coords_index[0]:self.coords_index[1]], points[:,:3], clusts, self.start_dir_max_dist, self.start_dir_opt) + dirs_end = get_cluster_directions(cluster_data[:, self.coords_index[0]:self.coords_index[1]], points[:,3:6], clusts, self.start_dir_max_dist, self.start_dir_opt) + #x = torch.cat([x, dirs_start.float(), dirs_end.float()], dim=1) + x = torch.cat([x, dirs_start.float()], dim=1) if self.add_start_dedx: - dedxs = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,:3], clusts, self.start_dir_max_dist) - x = torch.cat([x, dedxs.reshape(-1,1).float()], dim=1) + dedxs_start = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,:3], clusts, self.start_dir_max_dist) + dedxs_end = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,3:6], clusts, self.start_dir_max_dist) + #x = torch.cat([x, dedxs_start.reshape(-1,1).float(), dedxs_end.reshape(-1,1).float()], dim=1) + x = torch.cat([x, dedxs_start.reshape(-1,1).float()], dim=1) # Bring edge_index and batch_ids to device index = torch.tensor(edge_index, device=cluster_data.device, dtype=torch.long) diff --git a/mlreco/utils/gnn/cluster.py b/mlreco/utils/gnn/cluster.py index 7136f14d..d933ce93 100644 --- a/mlreco/utils/gnn/cluster.py +++ b/mlreco/utils/gnn/cluster.py @@ -123,10 +123,9 @@ def get_cluster_label(data, clusts, column=5): Args: data (np.ndarray) : (N,8) [x, y, z, batchid, value, id, groupid, shape] clusts ([np.ndarray]): (C) List of arrays of voxel IDs in each cluster - column (int) : Column which specifies the cluster ID - dtype (dtype) + column (int) : Column which specifies the cluster label Returns: - np.ndarray: (C) List of cluster IDs + np.ndarray: (C) List of cluster labels """ return _get_cluster_label(data, clusts, column) @@ -141,6 +140,47 @@ def _get_cluster_label(data: nb.float64[:,:], labels[i] = v[np.argmax(np.array(cts))] return labels + +@numba_wrapper(cast_args=['data'], list_args=['clusts']) +def get_cluster_primary_label(data, clusts, column, cluster_column=5, group_column=6): + """ + Function that returns the majority label of the primary component + of a cluster, as specified in the requested data column. + + The primary component is identified by picking the set of label + voxels that have a cluster_id identical to the cluster group_id. 
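+    If a cluster contains no voxel whose cluster_id matches its group_id,
+    the majority label of the whole cluster is used as a fallback.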
+ + Args: + data (np.ndarray) : (N,8) [x, y, z, batchid, value, id, groupid, shape] + clusts ([np.ndarray]): (C) List of arrays of voxel IDs in each cluster + column (int) : Column which specifies the cluster label + cluster_column (int) : Column which specifies the cluster ID + group_column (int) : Column which specifies the cluster group ID + Returns: + np.ndarray: (C) List of cluster primary labels + """ + return _get_cluster_primary_label(data, clusts, column, cluster_column, group_column) + +@nb.njit(cache=True) +def _get_cluster_primary_label(data: nb.float64[:,:], + clusts: nb.types.List(nb.int64[:]), + column: nb.int64, + cluster_column: nb.int64 = 5, + group_column: nb.int64 = 6) -> nb.float64[:]: + labels = np.empty(len(clusts), dtype=data.dtype) + group_ids = _get_cluster_label(data, clusts, group_column) + for i in range(len(clusts)): + cluster_ids = data[clusts[i], cluster_column] + primary_mask = cluster_ids == group_ids[i] + if len(data[clusts[i][primary_mask]]): + v, cts = unique_nb(data[clusts[i][primary_mask], column]) + else: # If the primary is empty, use group + v, cts = unique_nb(data[clusts[i], column]) + labels[i] = v[np.argmax(np.array(cts))] + + return labels + + @numba_wrapper(cast_args=['data'], list_args=['clusts'], keep_torch=True, ref_arg='data') def get_momenta_label(data, clusts, column=8): """ @@ -350,13 +390,13 @@ def get_cluster_points_label(data, particles, clusts, random_order=True, batch_c and end points of tracks if track. Args: - data (torch.tensor) : (N,6) Voxel coordinates [x, y, z, batch_id, value, clust_id, group_id] - particles (torch.tensor): (N,9) Point coordinates [start_x, start_y, start_z, batch_id, last_x, last_y, last_z, start_t, shape_id] + data (torch.tensor) : (N,X) Voxel coordinates [batch_id, x, y, z, ...] + particles (torch.tensor): (N,9) Point coordinates [batch_id, start_x, start_y, start_z, last_x, last_y, last_z, start_t, shape_id] (obtained with parse_particle_coords) clusts ([np.ndarray]) : (C) List of arrays of voxel IDs in each cluster random_order (bool) : Whether or not to shuffle the start and end points randomly Returns: - np.ndarray: (N,3/6) particle wise start (and end points in RANDOMIZED ORDER) + np.ndarray: (N,6) cluster-wise start and end points (in RANDOMIZED ORDER by default) """ return _get_cluster_points_label(data, particles, clusts, random_order, batch_col=batch_col, @@ -528,7 +568,7 @@ def cluster_direction(voxels: nb.float64[:,:], Args: voxels (torch.tensor): (N,3) Voxel coordinates [x, y, z] - starts (torch.tensor): (C,3) Coordinates of the start points + start (torch.tensor) : (C,3) Coordinates of the start point max_dist (float) : Max distance between start voxel and other voxels optimize (bool) : Optimizes the number of points involved in the estimate Returns: diff --git a/mlreco/utils/gnn/network.py b/mlreco/utils/gnn/network.py index 0f50b2f8..11466c49 100644 --- a/mlreco/utils/gnn/network.py +++ b/mlreco/utils/gnn/network.py @@ -30,16 +30,14 @@ def loop_graph(n: nb.int64) -> nb.int64[:,:]: @nb.njit(cache=True) def complete_graph(batch_ids: nb.int64[:], - dist_mat: nb.float64[:,:] = None, - max_dist: nb.float64 = -1.) -> nb.int64[:,:]: + directed: bool = False) -> nb.int64[:,:]: """ Function that returns an incidence matrix of a complete graph that connects every node with ever other node. 
Args: batch_ids (np.ndarray): (C) List of batch ids - dist_mat (np.ndarray) : (C,C) Tensor of pair-wise cluster distances - max_dist (double) : Maximal edge length + directed (bool) : If directed, only keep edges [i,j] for which j>=i Returns: np.ndarray: (2,E) Tensor of edges """ @@ -60,16 +58,9 @@ def complete_graph(batch_ids: nb.int64[:], ret[k] = [i,j] k += 1 - # If requested, remove the edges above a certain length threshold - if max_dist > -1: - assert dist_mat is not None - dists = np.empty(len(ret), dtype=dist_mat.dtype) - for k, e in enumerate(ret): - dists[k] = dist_mat[e[0],e[1]] - ret = ret[dists < max_dist] - - # Add the reciprocal edges as to create an undirected graph - ret = np.vstack((ret, ret[:,::-1])) + # Add the reciprocal edges as to create an undirected graph, if requested + if not directed: + ret = np.vstack((ret, ret[:,::-1])) return ret.T @@ -77,8 +68,7 @@ def complete_graph(batch_ids: nb.int64[:], def delaunay_graph(data: nb.float64[:,:], clusts: nb.types.List(nb.int64[:]), batch_ids: nb.int64[:], - dist_mat: nb.float64[:,:] = None, - max_dist: nb.float64 = -1., + directed: bool = False, batch_col: nb.int64 = 0, coords_col: nb.types.List(nb.int64[:]) = (1, 4)) -> nb.int64[:,:]: """ @@ -89,8 +79,7 @@ def delaunay_graph(data: nb.float64[:,:], data (np.ndarray) : (N,4) [x, y, z, batchid] clusts ([np.ndarray]) : (C) List of arrays of voxel IDs in each cluster batch_ids (np.ndarray): (C) List of batch ids - dist_mat (np.ndarray) : (C,C) Tensor of pair-wise cluster distances - max_dist (double) : Maximal edge length + directed (bool) : If directed, only keep edges [i,j] for which j>=i Returns: np.ndarray: (2,E) Tensor of edges """ @@ -115,24 +104,17 @@ def delaunay_graph(data: nb.float64[:,:], edges = np.vstack((clust_ids[edges[0]],clust_ids[edges[1]])).T ret = np.vstack((ret, edges)) - # If requested, remove the edges above a certain length threshold - if max_dist > -1: - assert dist_mat is not None - dists = np.empty(len(ret), dtype=dist_mat.dtype) - for k, e in enumerate(ret): - dists[k] = dist_mat[e[0],e[1]] - ret = ret[dists < max_dist] - - # Add the reciprocal edges as to create an undirected graph - ret = np.vstack((ret, ret[:,::-1])) + # Add the reciprocal edges as to create an undirected graph, if requested + if not directed: + ret = np.vstack((ret, ret[:,::-1])) return ret.T @nb.njit(cache=True) def mst_graph(batch_ids: nb.int64[:], - dist_mat: nb.float64[:,:] = None, - max_dist: nb.float64 = -1.) -> nb.int64[:,:]: + dist_mat: nb.float64[:,:], + directed: bool = False) -> nb.int64[:,:]: """ Function that returns an incidence matrix that connects nodes that share an edge in their corresponding Euclidean Minimum Spanning Tree (MST). 
@@ -140,7 +122,7 @@ def mst_graph(batch_ids: nb.int64[:], Args: batch_ids (np.ndarray): (C) List of batch ids dist_mat (np.ndarray) : (C,C) Tensor of pair-wise cluster distances - max_dist (double) : Maximal edge length + directed (bool) : If directed, only keep edges [i,j] for which j>=i Returns: np.ndarray: (2,E) Tensor of edges """ @@ -156,16 +138,9 @@ def mst_graph(batch_ids: nb.int64[:], edges = np.vstack((clust_ids[edges[0]],clust_ids[edges[1]])).T ret = np.vstack((ret, edges)) - # If requested, remove the edges above a certain length threshold - if max_dist > -1: - assert dist_mat is not None - dists = np.empty(len(ret), dtype=dist_mat.dtype) - for k, e in enumerate(ret): - dists[k] = dist_mat[e[0],e[1]] - ret = ret[dists < max_dist] - - # Add the reciprocal edges as to create an undirected graph - ret = np.vstack((ret, ret[:,::-1])) + # Add the reciprocal edges as to create an undirected graph, if requested + if not directed: + ret = np.vstack((ret, ret[:,::-1])) return ret.T @@ -173,7 +148,8 @@ def mst_graph(batch_ids: nb.int64[:], @nb.njit(cache=True) def knn_graph(batch_ids: nb.int64[:], k: nb.int64, - dist_mat: nb.float64[:,:] = None) -> nb.int64[:,:]: + dist_mat: nb.float64[:,:], + directed: bool = False) -> nb.int64[:,:]: """ Function that returns an incidence matrix that connects nodes that are k nearest neighbors. Sorts the distance matrix. @@ -182,6 +158,7 @@ def knn_graph(batch_ids: nb.int64[:], batch_ids (np.ndarray): (C) List of batch ids k (int) : Number of connected neighbors for each node dist_mat (np.ndarray) : (C,C) Tensor of pair-wise cluster distances + directed (bool) : If directed, only keep edges [i,j] for which j>=i Returns: np.ndarray: (2,E) Tensor of edges """ @@ -200,8 +177,9 @@ def knn_graph(batch_ids: nb.int64[:], if len(edges): ret = np.vstack((ret, edges)) - # Add the reciprocal edges as to create an undirected graph - ret = np.vstack((ret, ret[:,::-1])) + # Add the reciprocal edges as to create an undirected graph, if requested + if not directed: + ret = np.vstack((ret, ret[:,::-1])) return ret.T @@ -209,8 +187,6 @@ def knn_graph(batch_ids: nb.int64[:], @nb.njit(cache=True) def bipartite_graph(batch_ids: nb.int64[:], primaries: nb.boolean[:], - dist_mat: nb.float64[:,:] = None, - max_dist: nb.float64 = -1, directed: nb.boolean = True, directed_to: str = 'secondary') -> nb.int64[:,:]: """ @@ -220,8 +196,8 @@ def bipartite_graph(batch_ids: nb.int64[:], Args: batch_ids (np.ndarray): (C) List of batch ids primaries (np.ndarray): (C) Primary mask (True if primary) - dist_mat (np.ndarray) : (C,C) Tensor of pair-wise cluster distances - max_dist (double) : Maximal edge length + directed (bool) : True if edges only exist in one direction + directed_to (str) : Whether to point the edges to the primaries or the secondaries Returns: np.ndarray: (2,E) Tensor of edges """ @@ -232,14 +208,6 @@ def bipartite_graph(batch_ids: nb.int64[:], if batch_ids[i] == batch_ids[j]: ret = np.vstack((ret, np.array([[i,j]]))) - # If requested, remove the edges above a certain length threshold - if max_dist > -1: - assert dist_mat is not None - dists = np.empty(len(ret), dtype=dist_mat.dtype) - for k, e in enumerate(ret): - dists[k] = dist_mat[e[0],e[1]] - ret = ret[dists < max_dist] - # Handle directedness, by default graph is directed towards secondaries if directed: if directed_to == 'primary': @@ -252,6 +220,44 @@ def bipartite_graph(batch_ids: nb.int64[:], return ret.T +@nb.njit(cache=True) +def restrict_graph(edge_index: nb.int64[:,:], + dist_mat: nb.float64[:,:], + max_dist: 
nb.float64[:,:], + classes: nb.int64[:] = None) -> nb.int64[:,:]: + """ + Function that restricts an incidence matrix of a graph + to the edges below a certain length. + + If `classes` are specified, the maximum edge length must be provided + for each possible combination of node classes. + + Args: + edge_index (np.ndarray): (2,E) Tensor of edges + dist_mat (np.ndarray) : (C,C) Tensor of pair-wise cluster distances + max_dist (np.ndarray) : (N_c, N_c) Maximum edge length for each class type + classes (np.ndarray) : (C) List of class for each cluster in the graph + Returns: + np.ndarray: (2,E) Restricted tensor of edges + """ + if classes is None: + assert max_dist.shape[0] == max_dist.shape[1] == 1 + max_dist = max_dist[0][0] + edge_dists = np.empty(edge_index.shape[1], dtype=dist_mat.dtype) + for k in range(edge_index.shape[1]): + i, j = edge_index[0,k], edge_index[1,k] + edge_dists[k] = dist_mat[i, j] + return edge_index[:, edge_dists < max_dist] + else: + edge_max_dists = np.empty(edge_index.shape[1], dtype=dist_mat.dtype) + edge_dists = np.empty(edge_index.shape[1], dtype=dist_mat.dtype) + for k in range(edge_index.shape[1]): + i, j = edge_index[0,k], edge_index[1,k] + edge_max_dists[k] = max_dist[classes[i], classes[j]] + edge_dists[k] = dist_mat[i, j] + return edge_index[:, edge_dists < edge_max_dists] + + @numba_wrapper(cast_args=['data'], list_args=['clusts'], keep_torch=True, ref_arg='data') def get_cluster_edge_features(data, clusts, edge_index, batch_col=0, coords_col=(1, 4)): """ @@ -312,7 +318,7 @@ def _get_cluster_edge_features_vec(data: nb.float32[:,:], coords_col: nb.types.List(nb.int64[:]) = (1, 4)) -> nb.float32[:,:]: # Get the closest points of approach IDs for each edge - lend, idxs1, idxs2 = _get_edge_distances(data[:,:3], clusts, edge_index) + lend, idxs1, idxs2 = _get_edge_distances(data[:,coords_col[0]:coords_col[1]], clusts, edge_index) # Get the points that correspond to the first voxels v1 = data[idxs1, coords_col[0]:coords_col[1]] @@ -380,7 +386,7 @@ def _get_voxel_edge_features(data: nb.float32[:,:], return feats -@numba_wrapper(cast_args=['voxels'], list_args='clusts') +@numba_wrapper(cast_args=['voxels'], list_args=['clusts']) def get_edge_distances(voxels, clusts, edge_index): """ For each edge, finds the closest points of approach (CPAs) between the @@ -441,27 +447,25 @@ def inter_cluster_distance(voxels, clusts, batch_ids=None, mode='voxel'): return _inter_cluster_distance(voxels, clusts, batch_ids, mode) @nb.njit(parallel=True, cache=True) -def _inter_cluster_distance(voxels: nb.float64[:,:], +def _inter_cluster_distance(voxels: nb.float32[:,:], clusts: nb.types.List(nb.int64[:]), batch_ids: nb.int64[:], mode: str = 'voxel') -> nb.float64[:,:]: assert len(clusts) == len(batch_ids) dist_mat = np.zeros((len(batch_ids), len(batch_ids)), dtype=voxels.dtype) - indxi, indxj = np.triu_indices(len(batch_ids), 1) + indxi, indxj = complete_graph(batch_ids, directed=True) if mode == 'voxel': for k in nb.prange(len(indxi)): i, j = indxi[k], indxj[k] - if batch_ids[i] == batch_ids[j]: - dist_mat[i,j] = dist_mat[j,i] = np.min(cdist_nb(voxels[clusts[i]], voxels[clusts[j]])) + dist_mat[i,j] = dist_mat[j,i] = np.min(cdist_nb(voxels[clusts[i]], voxels[clusts[j]])) elif mode == 'centroid': centroids = np.empty((len(batch_ids), voxels.shape[1]), dtype=voxels.dtype) for i in nb.prange(len(batch_ids)): centroids[i] = mean_nb(voxels[clusts[i]], axis=0) for k in nb.prange(len(indxi)): i, j = indxi[k], indxj[k] - if batch_ids[i] == batch_ids[j]: - dist_mat[i,j] = dist_mat[j,i] = 
np.sqrt(np.sum((centroids[j]-centroids[i])**2)) + dist_mat[i,j] = dist_mat[j,i] = np.sqrt(np.sum((centroids[j]-centroids[i])**2)) else: raise ValueError('Inter-cluster distance mode not supported') diff --git a/mlreco/utils/numba.py b/mlreco/utils/numba.py index a4aafede..d5b6e60e 100644 --- a/mlreco/utils/numba.py +++ b/mlreco/utils/numba.py @@ -10,8 +10,10 @@ def numba_wrapper(cast_args=[], list_args=[], keep_torch=False, ref_arg=None): to make the relevant conversions to numpy where necessary. Args: - type_arg (str) : Argument name which determines the data type and device location - list_args ([str]) : List of arguments which need to be cast to a numba list + cast_args ([str]): List of arguments to be cast to numpy + list_args ([str]): List of arguments which need to be cast to a numba typed list + keep_torch (bool): Make the output a torch object, if the reference argument is one + ref_arg (str) : Reference argument used to assign a type and device to the torch output Returns: Function ''' @@ -63,7 +65,7 @@ def inner(*args, **kwargs): @nb.njit(cache=True) -def unique_nb(x: nb.int64[:]) -> (nb.int64[:], nb.int64[:]): +def unique_nb(x: nb.int32[:]) -> (nb.int32[:], nb.int32[:]): b = np.sort(x.flatten()) unique = list(b[:1]) counts = [1 for _ in unique] @@ -77,9 +79,9 @@ def unique_nb(x: nb.int64[:]) -> (nb.int64[:], nb.int64[:]): @nb.njit(cache=True) -def submatrix_nb(x:nb.float64[:,:], - index1: nb.int64[:], - index2: nb.int64[:]) -> nb.float64[:,:]: +def submatrix_nb(x:nb.float32[:,:], + index1: nb.int32[:], + index2: nb.int32[:]) -> nb.float32[:,:]: """ Numba implementation of matrix subsampling """ @@ -91,8 +93,8 @@ def submatrix_nb(x:nb.float64[:,:], @nb.njit(cache=True) -def cdist_nb(x1: nb.float64[:,:], - x2: nb.float64[:,:]) -> nb.float64[:,:]: +def cdist_nb(x1: nb.float32[:,:], + x2: nb.float32[:,:]) -> nb.float32[:,:]: """ Numba implementation of Eucleadian cdist in 3D. 
""" @@ -104,8 +106,8 @@ def cdist_nb(x1: nb.float64[:,:], @nb.njit(cache=True) -def mean_nb(x: nb.float64[:,:], - axis: nb.int64) -> nb.float64[:]: +def mean_nb(x: nb.float32[:,:], + axis: nb.int32) -> nb.float32[:]: """ Numba implementation of np.mean(x, axis) """ @@ -121,13 +123,13 @@ def mean_nb(x: nb.float64[:,:], @nb.njit(cache=True) -def argmin_nb(x: nb.float64[:,:], - axis: nb.int64) -> nb.int64[:]: +def argmin_nb(x: nb.float32[:,:], + axis: nb.int32) -> nb.int32[:]: """ Numba implementation of np.argmin(x, axis) """ assert axis == 0 or axis == 1 - argmin = np.empty(x.shape[1-axis], dtype=np.int64) + argmin = np.empty(x.shape[1-axis], dtype=np.int32) if axis == 0: for i in range(len(argmin)): argmin[i] = np.argmin(x[:,i]) @@ -138,13 +140,13 @@ def argmin_nb(x: nb.float64[:,:], @nb.njit(cache=True) -def argmax_nb(x: nb.float64[:,:], - axis: nb.int64) -> nb.int64[:]: +def argmax_nb(x: nb.float32[:,:], + axis: nb.int32) -> nb.int32[:]: """ Numba implementation of np.argmax(x, axis) """ assert axis == 0 or axis == 1 - argmax = np.empty(x.shape[1-axis], dtype=np.int64) + argmax = np.empty(x.shape[1-axis], dtype=np.int32) if axis == 0: for i in range(len(argmax)): argmax[i] = np.argmax(x[:,i]) @@ -155,13 +157,13 @@ def argmax_nb(x: nb.float64[:,:], @nb.njit(cache=True) -def min_nb(x: nb.float64[:,:], - axis: nb.int64) -> nb.float64[:]: +def min_nb(x: nb.float32[:,:], + axis: nb.int32) -> nb.float32[:]: """ Numba implementation of np.max(x, axis) """ assert axis == 0 or axis == 1 - xmin = np.empty(x.shape[1-axis], dtype=np.int64) + xmin = np.empty(x.shape[1-axis], dtype=np.int32) if axis == 0: for i in range(len(xmin)): xmin[i] = np.min(x[:,i]) @@ -172,13 +174,13 @@ def min_nb(x: nb.float64[:,:], @nb.njit(cache=True) -def max_nb(x: nb.float64[:,:], - axis: nb.int64) -> nb.float64[:]: +def max_nb(x: nb.float32[:,:], + axis: nb.int32) -> nb.float32[:]: """ Numba implementation of np.max(x, axis) """ assert axis == 0 or axis == 1 - xmax = np.empty(x.shape[1-axis], dtype=np.int64) + xmax = np.empty(x.shape[1-axis], dtype=np.int32) if axis == 0: for i in range(len(xmax)): xmax[i] = np.max(x[:,i]) @@ -189,8 +191,8 @@ def max_nb(x: nb.float64[:,:], @nb.njit(cache=True) -def all_nb(x: nb.float64[:,:], - axis: nb.int64) -> nb.int64[:]: +def all_nb(x: nb.float32[:,:], + axis: nb.int32) -> nb.int32[:]: """ Numba implementation of np.all(x, axis) """ @@ -206,8 +208,8 @@ def all_nb(x: nb.float64[:,:], @nb.njit(cache=True) -def softmax_nb(x: nb.float64[:,:], - axis: nb.int64) -> nb.float64[:,:]: +def softmax_nb(x: nb.float32[:,:], + axis: nb.int32) -> nb.float32[:,:]: assert axis == 0 or axis == 1 if axis == 0: xmax = max_nb(x, axis=0) @@ -220,7 +222,7 @@ def softmax_nb(x: nb.float64[:,:], @nb.njit(cache=True) -def log_loss_nb(x1: nb.boolean[:], x2: nb.float64[:]) -> nb.float64: +def log_loss_nb(x1: nb.boolean[:], x2: nb.float32[:]) -> nb.float32: if len(x1) > 0: return -(np.sum(np.log(x2[x1])) + np.sum(np.log(1.-x2[~x1])))/len(x1) else: From 4d77afa31025b6bbacf034fd1429aa6dd8dddc36 Mon Sep 17 00:00:00 2001 From: Temigo Date: Fri, 4 Nov 2022 13:02:58 -0700 Subject: [PATCH 36/52] Connect FlashManager with Predictor --- analysis/classes/ui.py | 53 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index 1f0d0158..31f5b57c 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -1,6 +1,7 @@ from typing import Callable, Tuple, List import numpy as np import pandas as pd +import os, sys from 
mlreco.utils.cluster.cluster_graph_constructor import ClusterGraphConstructor from mlreco.utils.ppn import uresnet_ppn_type_point_selector @@ -9,7 +10,7 @@ from scipy.special import softmax from analysis.classes import Particle, ParticleFragment, TruthParticleFragment, \ - TruthParticle, Interaction, TruthInteraction + TruthParticle, Interaction, TruthInteraction, FlashManager from analysis.classes.particle import matrix_counts, matrix_iou, \ match_particles_fn, match_interactions_fn, group_particles_to_interactions_fn from analysis.algorithms.point_matching import * @@ -54,7 +55,8 @@ class FullChainPredictor: 4) Does not support deghosting at the moment. (TODO) ''' - def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False): + def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False, + enable_flash_matching=False, flash_matching_cfg="", opflash_keys=[]): self.module_config = cfg['model']['modules'] self.cfg = cfg @@ -114,11 +116,58 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False): self.vb = None self._num_volumes = 1 + # Prepare flash matching if requested + self.enable_flash_matching = enable_flash_matching + self.fm = None + if enable_flash_matching: + if 'meta' not in self.data_blob: + raise Exception('Meta unspecified in data_blob. Please add it to your I/O schema.') + #if 'FMATCH_BASEDIR' not in os.environ: + # raise Exception('FMATCH_BASEDIR undefined. Please source `OpT0Finder/configure.sh` or define it manually.') + assert os.path.exists(flash_matching_cfg) + assert len(opflash_keys) == self._num_volumes + + self.fm = FlashManager(cfg, flash_matching_cfg, meta=self.data_blob['meta'][0]) + self.opflash_keys = opflash_keys + + self.flash_matches = {} # key is volume, value is tuple (tpc_v, pmt_v, list of matches) + # type is (list of Interaction/TruthInteraction, list of larcv::Flash, list of flashmatch::FlashMatch_t) + + def __repr__(self): msg = "FullChainEvaluator(num_images={})".format(int(self.num_images/self._num_volumes)) return msg + def get_flash_matches(self, entry, use_true_tpc_objects=False, volume=None): + if volume not in self.flash_matches: + self._run_flash_matching(entry, use_true_tpc_objects=use_true_tpc_objects, volume=volume) + + tpc_v, pmt_v, matches = self.flash_matches[volume] + return [(tpc_v[m.tpc_id], pmt_v[m.flash_id], m) for m in matches] + def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None): + if use_true_tpc_objects: + if not hasattr(self, 'get_true_interactions'): + raise Exception('This Predictor does not know about truth info.') + + tpc_v = self.get_true_interactions(entry, drop_nonprimary_particles=False, volume=volume) + else: + tpc_v = self.get_interactions(entry, drop_nonprimary_particles=False, volume=volume) + + input_tpc_v = self.fm.make_qcluster(tpc_v) + + selected_opflash_keys = self.opflash_keys + if volume is not None: + assert isinstance(volume, int) + selected_opflash_keys = [self.opflash_keys[volume]] + pmt_v = [] + for key in selected_opflash_keys: + pmt_v.extend(self.data_blob[key][entry]) + input_pmt_v = self.fm.make_flash([self.data_blob[key][entry] for key in selected_opflash_keys]) + + matches = self.fm.run_flash_matching() + self.flash_matches[volume] = (tpc_v, pmt_v, matches) + def _fit_predict_ppn(self, entry): ''' Method for predicting ppn predictions. 
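
A minimal usage sketch of the predictor-side interface added in this patch -- illustrative only, not part of the diff. The configuration path, the opflash keys and the `data_blob`/`result`/`cfg` objects are assumptions (the path and keys mirror values used later in this series), and the I/O configuration is assumed to define two detector volumes so that one opflash key exists per volume:

    import os
    from analysis.classes.ui import FullChainPredictor

    # data_blob, result and cfg are assumed to come from a full-chain forward pass;
    # FMATCH_BASEDIR must point to a local OpT0Finder installation.
    predictor = FullChainPredictor(
        data_blob, result, cfg,
        enable_flash_matching=True,
        flash_matching_cfg=os.path.join(os.environ['FMATCH_BASEDIR'], 'dat/flashmatch_112022.cfg'),
        opflash_keys=['opflash_cryoE', 'opflash_cryoW'])

    # Matching runs lazily on the first call and is cached in predictor.flash_matches;
    # each returned element is a (Interaction, larcv::Flash, flashmatch::FlashMatch_t) triplet.
    for interaction, flash, match in predictor.get_flash_matches(0, use_true_tpc_objects=False, volume=0):
        print(interaction.id, flash.time(), match.tpc_id, match.flash_id)
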
From ab70713aed6df00dff45408b850ab913165f09ce Mon Sep 17 00:00:00 2001 From: Temigo Date: Fri, 4 Nov 2022 13:36:59 -0700 Subject: [PATCH 37/52] Add some documentation to flash matching interface --- analysis/classes/FlashManager.py | 76 ++++++++++++++++++++++++++++++++ analysis/classes/ui.py | 24 ++++++++++ 2 files changed, 100 insertions(+) diff --git a/analysis/classes/FlashManager.py b/analysis/classes/FlashManager.py index 2ddf6326..4daf4a06 100644 --- a/analysis/classes/FlashManager.py +++ b/analysis/classes/FlashManager.py @@ -2,7 +2,32 @@ class FlashManager: + """ + Meant as an interface to OpT0finder, likelihood-based flash matching. + + See https://github.com/drinkingkazu/OpT0Finder for more details about it. + """ def __init__(self, cfg, cfg_fmatch, meta=None, detector_specs=None): + """ + Expects that the environment variable `FMATCH_BASEDIR` is set. + You can either set it by hand (to the path where one can find + OpT0Finder) or you can source `OpT0Finder/configure.sh` if you + are running code from a command line. + + Parameters + ========== + cfg: dict + The full chain config. + cfg_fmatch: str + Path to config for OpT0Finder. + meta: np.ndarray, optional, default is None + Used to shift coordinates of interactions to "real" detector + coordinates for QCluster_t. + detector_specs: str, optional + Path to `detector_specs.cfg` file which defines some geometry + information about the detector PMT system. By default will look + into `OpT0Finder/dat/detector_specs.cfg`. + """ # Setup OpT0finder basedir = os.getenv('FMATCH_BASEDIR') @@ -80,6 +105,23 @@ def get_qcluster(self, tpc_id, array=False): raise Exception("TPC object %d does not exist in self.tpc_v" % tpc_id) def make_qcluster(self, interactions): + """ + Make flashmatch::QCluster_t objects from list of interactions. + + Note that coordinates of `interactions` are in voxel coordinates, + but inside this function we shift back to real detector coordinates + using meta information. flashmatch::QCluster_t objects are in + real cm coordinates. + + Parameters + ========== + interactions: list of Interaction/TruthInteraction + (Predicted or true) interaction objects. + + Returns + ======= + list of flashmatch::QCluster_t + """ from flashmatch import flashmatch if self.min_x is None: @@ -177,6 +219,17 @@ def run_flash_matching(self, flashes=None, interactions=None): return self.all_matches def get_match(self, idx, matches=None): + """ + Parameters + ========== + idx: int + Index of TPC object for which we want to retrieve a match. + matches: list of flashmatch::FlashMatch_t, optional, default is None + + Returns + ======= + flashmatch::FlashMatch_t + """ if matches is None: if self.all_matches is None: raise Exception("Need to run flash matching first with run_flash_matching.") @@ -189,6 +242,17 @@ def get_match(self, idx, matches=None): return None def get_matched_flash(self, idx, matches=None): + """ + Parameters + ========== + idx: int + Index of TPC object for which we want to retrieve a match. + matches: list of flashmatch::FlashMatch_t, optional, default is None + + Returns + ======= + flashmatch::Flash_t + """ m = self.get_match(idx, matches=matches) if m is None: return None @@ -202,5 +266,17 @@ def get_matched_flash(self, idx, matches=None): def get_t0(self, idx, matches=None): + """ + Parameters + ========== + idx: int + Index of TPC object for which we want to retrieve a match. 
+ matches: list of flashmatch::FlashMatch_t, optional, default is None + + Returns + ======= + float + Time in us with respect to simulation time reference. + """ flash = self.get_matched_flash(idx, matches=matches) return None if flash is None else flash.time diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index 31f5b57c..d181d35a 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -139,6 +139,22 @@ def __repr__(self): return msg def get_flash_matches(self, entry, use_true_tpc_objects=False, volume=None): + """ + If flash matches has not yet been computed for this volume, then it will + be run as part of this function. Otherwise, flash matching results are + cached in `self.flash_matches` per volume. + + Parameters + ========== + entry: int + use_true_tpc_objects: bool, default is False + Whether to use true or predicted interactions. + volume: int, default is None + + Returns + ======= + list of tuple (Interaction, larcv::Flash, flashmatch::FlashMatch_t) + """ if volume not in self.flash_matches: self._run_flash_matching(entry, use_true_tpc_objects=use_true_tpc_objects, volume=volume) @@ -146,6 +162,14 @@ def get_flash_matches(self, entry, use_true_tpc_objects=False, volume=None): return [(tpc_v[m.tpc_id], pmt_v[m.flash_id], m) for m in matches] def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None): + """ + Parameters + ========== + entry: int + use_true_tpc_objects: bool, default is False + Whether to use true or predicted interactions. + volume: int, default is None + """ if use_true_tpc_objects: if not hasattr(self, 'get_true_interactions'): raise Exception('This Predictor does not know about truth info.') From b8c887343de1324342105554361b38d8449ced18 Mon Sep 17 00:00:00 2001 From: Temigo Date: Tue, 8 Nov 2022 10:04:12 -0800 Subject: [PATCH 38/52] Fix bug in get_nu_id + fix bug in get_flash_matches --- analysis/algorithms/selections/example_nue.py | 11 +++++++++-- analysis/classes/ui.py | 6 +++--- mlreco/utils/groups.py | 8 ++++---- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/analysis/algorithms/selections/example_nue.py b/analysis/algorithms/selections/example_nue.py index 3f022776..3e3391b0 100644 --- a/analysis/algorithms/selections/example_nue.py +++ b/analysis/algorithms/selections/example_nue.py @@ -19,7 +19,14 @@ def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): predictor = FullChainEvaluator(data_blob, res, cfg, analysis_cfg) image_idxs = data_blob['index'] + print(data_blob['index'], data_blob['run_info']) for idx, index in enumerate(image_idxs): + index_dict = { + 'Index': index, + 'run': data_blob['run_info'][idx][0], + 'subrun': data_blob['run_info'][idx][1], + 'event': data_blob['run_info'][idx][2] + } # Process Interaction Level Information matches, counts = predictor.match_interactions(idx, @@ -75,7 +82,7 @@ def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): true_int_dict['true_nu_energy'] = nu.energy_init() pred_int_dict['interaction_match_counts'] = counts[i] - interactions_dict = OrderedDict({'Index': index}) + interactions_dict = OrderedDict(index_dict.copy()) interactions_dict.update(true_int_dict) interactions_dict.update(pred_int_dict) interactions.append(interactions_dict) @@ -89,7 +96,7 @@ def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): matched_particles, _, ious = match_particles_fn(true_particles, pred_particles) for i, m in enumerate(matched_particles): - particles_dict = OrderedDict({'Index': index}) + particles_dict = OrderedDict(index_dict.copy()) 
true_p, pred_p = m[0], m[1] pred_particle_dict = get_particle_properties(pred_p, vertex=pred_int.vertex, diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index d181d35a..061c5e29 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -130,7 +130,7 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False, self.fm = FlashManager(cfg, flash_matching_cfg, meta=self.data_blob['meta'][0]) self.opflash_keys = opflash_keys - self.flash_matches = {} # key is volume, value is tuple (tpc_v, pmt_v, list of matches) + self.flash_matches = {} # key is (volume, use_true_tpc_objects), value is tuple (tpc_v, pmt_v, list of matches) # type is (list of Interaction/TruthInteraction, list of larcv::Flash, list of flashmatch::FlashMatch_t) @@ -155,7 +155,7 @@ def get_flash_matches(self, entry, use_true_tpc_objects=False, volume=None): ======= list of tuple (Interaction, larcv::Flash, flashmatch::FlashMatch_t) """ - if volume not in self.flash_matches: + if (volume, use_true_tpc_objects) not in self.flash_matches: self._run_flash_matching(entry, use_true_tpc_objects=use_true_tpc_objects, volume=volume) tpc_v, pmt_v, matches = self.flash_matches[volume] @@ -190,7 +190,7 @@ def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None): input_pmt_v = self.fm.make_flash([self.data_blob[key][entry] for key in selected_opflash_keys]) matches = self.fm.run_flash_matching() - self.flash_matches[volume] = (tpc_v, pmt_v, matches) + self.flash_matches[(volume, use_true_tpc_objects)] = (tpc_v, pmt_v, matches) def _fit_predict_ppn(self, entry): ''' diff --git a/mlreco/utils/groups.py b/mlreco/utils/groups.py index bd998a8a..574b9d4f 100644 --- a/mlreco/utils/groups.py +++ b/mlreco/utils/groups.py @@ -275,19 +275,19 @@ def get_nu_id(cluster_event, particle_v, interaction_ids, particle_mpv=None): # if there is nu interaction if num_primary > 1: nu_id[inds] = 1 - else: + elif len(particle_mpv) > 0: # Find mpv particles is_mpv = np.zeros((len(particle_v),)) # mpv_ids = [p.id() for p in particle_mpv] - mpv_pdg = np.array([p.pdg_code() for p in particle_mpv]) - mpv_energy = np.array([p.energy_init() for p in particle_mpv]) + mpv_pdg = np.array([p.pdg_code() for p in particle_mpv]).reshape((-1,)) + mpv_energy = np.array([p.energy_init() for p in particle_mpv]).reshape((-1,)) for idx, part in enumerate(particle_v): # track_id - 1 in `particle_pcluster_tree` corresponds to id (or track_id) in `particle_mpv_tree` # if (part.track_id()-1) in mpv_ids or (part.ancestor_track_id()-1) in mpv_ids: # FIXME the above was wrong I think. close = np.isclose(part.energy_init()*1e-3, mpv_energy) pdg = part.pdg_code() == mpv_pdg - if close.any() and pdg.any() and (np.where(close)[0] == np.where(pdg)[0]).any(): + if (close & pdg).any(): is_mpv[idx] = 1. 
# else: # print("fake cosmic", part.pdg_code(), part.shape(), part.creation_process(), part.track_id(), part.ancestor_track_id(), mpv_ids) From 8b320a0052c2d570aa94c2fe234a73fd819dc388 Mon Sep 17 00:00:00 2001 From: Temigo Date: Tue, 8 Nov 2022 10:20:26 -0800 Subject: [PATCH 39/52] Forgot one fix --- analysis/classes/ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index 061c5e29..6ab72311 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -158,7 +158,7 @@ def get_flash_matches(self, entry, use_true_tpc_objects=False, volume=None): if (volume, use_true_tpc_objects) not in self.flash_matches: self._run_flash_matching(entry, use_true_tpc_objects=use_true_tpc_objects, volume=volume) - tpc_v, pmt_v, matches = self.flash_matches[volume] + tpc_v, pmt_v, matches = self.flash_matches[(volume, use_true_tpc_objects)] return [(tpc_v[m.tpc_id], pmt_v[m.flash_id], m) for m in matches] def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None): From 2024bd732ab21d24a832afcc296877dc00edcfc9 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 8 Nov 2022 14:11:04 -0800 Subject: [PATCH 40/52] Nomenclature change in GrapPA node kinematics loss output --- mlreco/models/layers/gnn/losses/node_kinematics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index a8f815f2..c26592c4 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -326,9 +326,9 @@ def forward(self, out, types): result.update({ 'vtx_labels': vtx_labels, 'vtx_score_loss': vtx_score_loss/n_clusts_vtx if n_clusts_vtx else 0., - 'vtx_score_acc': vtx_score_acc/n_clusts_vtx if n_clusts_vtx else 1., + 'vtx_score_accuracy': vtx_score_acc/n_clusts_vtx if n_clusts_vtx else 1., 'vtx_position_loss': vtx_position_loss/n_clusts_vtx_pos if n_clusts_vtx_pos else 0., - 'vtx_position_acc': vtx_position_acc/n_clusts_vtx_pos if n_clusts_vtx_pos else 1. + 'vtx_position_accuracy': vtx_position_acc/n_clusts_vtx_pos if n_clusts_vtx_pos else 1. }) if self.use_anchor_points: result['vtx_anchors'] = vtx_anchors @@ -601,8 +601,8 @@ def forward(self, out, types, iteration=None): result.update({ 'vtx_position_loss': 0., 'vtx_score_loss': 0., - 'vtx_position_acc': 0., - 'vtx_score_acc': 0., + 'vtx_position_accurary': 0., + 'vtx_score_accuracy': 0., }) return result @@ -628,9 +628,9 @@ def forward(self, out, types, iteration=None): if compute_vtx: result.update({ 'vtx_score_loss': 0. if not n_clusts_vtx else vtx_score_loss/n_clusts_vtx, - 'vtx_score_acc': 0. if not n_clusts_vtx else vtx_score_acc/n_clusts_vtx, + 'vtx_score_accurary': 0. if not n_clusts_vtx else vtx_score_acc/n_clusts_vtx, 'vtx_position_loss': 0. if not n_clusts_vtx_positives else vtx_position_loss/n_clusts_vtx_positives, - 'vtx_position_acc': 0. if not n_clusts_vtx_positives else vtx_position_acc/n_clusts_vtx_positives, + 'vtx_position_accuray': 0. 
if not n_clusts_vtx_positives else vtx_position_acc/n_clusts_vtx_positives, }) return result From 2f998a3c11e30f6bfa628006e70288bcd9677eae Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 9 Nov 2022 11:42:39 -0800 Subject: [PATCH 41/52] Weird sys.path.insert causing circular imports removed --- mlreco/utils/cluster/dense_cluster.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mlreco/utils/cluster/dense_cluster.py b/mlreco/utils/cluster/dense_cluster.py index bd55fb3b..6e037020 100644 --- a/mlreco/utils/cluster/dense_cluster.py +++ b/mlreco/utils/cluster/dense_cluster.py @@ -1,22 +1,14 @@ import numpy as np import pandas as pd -import sys import os, re import torch import yaml import time -from scipy.spatial.distance import cdist from sklearn.metrics import adjusted_rand_score as ari -import argparse - -current_directory = os.path.dirname(os.path.abspath(__file__)) -current_directory = os.path.dirname(current_directory) -sys.path.insert(0, current_directory) from mlreco.utils.metrics import * from mlreco.trainval import trainval from mlreco.iotools.factories import loader_factory -from sklearn.cluster import DBSCAN from pprint import pprint From f4f99d5668ce5469180638e558def01e2dc7496e Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 9 Nov 2022 17:22:21 -0800 Subject: [PATCH 42/52] Fixed primary particle group labeling, added michel/delta to shower primaries --- mlreco/iotools/parsers/cluster.py | 9 ++-- mlreco/utils/groups.py | 70 ++++++++++++++++++++++++------- 2 files changed, 59 insertions(+), 20 deletions(-) diff --git a/mlreco/iotools/parsers/cluster.py b/mlreco/iotools/parsers/cluster.py index abc44f05..72216e61 100644 --- a/mlreco/iotools/parsers/cluster.py +++ b/mlreco/iotools/parsers/cluster.py @@ -1,7 +1,7 @@ from collections import OrderedDict import numpy as np from larcv import larcv -from mlreco.utils.groups import get_interaction_id, get_nu_id, get_particle_id, get_primary_id +from mlreco.utils.groups import get_interaction_id, get_nu_id, get_particle_id, get_shower_primary_id, get_group_primary_id from mlreco.utils.groups import type_labels as TYPE_LABELS from mlreco.iotools.parsers.sparse import parse_sparse3d from mlreco.iotools.parsers.particles import parse_particles @@ -138,15 +138,16 @@ def parse_cluster3d(cluster_event, labels['inter'] = inter_ids labels['nu'] = nu_ids labels['type'] = get_particle_id(particles_v, nu_ids, include_mpr=type_include_mpr) - labels['primary'] = get_primary_id(cluster_event, particles_v) + labels['primary_shower'] = get_shower_primary_id(cluster_event, particles_v) if add_kinematics_info: + primary_ids = get_group_primary_id(particles_v) particles_v = parse_particles(particle_event, cluster_event) labels['type'] = get_particle_id(particles_v, nu_ids, include_mpr=type_include_mpr) - labels['p'] = np.array([(p.px()**2+p.py()**2+p.pz()**2)/1e3 for p in particles_v]) + labels['p'] = np.array([p.p()/1e3 for p in particles_v]) # In GeV labels['vtx_x'] = np.array([p.ancestor_position().x() for p in particles_v]) labels['vtx_y'] = np.array([p.ancestor_position().y() for p in particles_v]) labels['vtx_z'] = np.array([p.ancestor_position().z() for p in particles_v]) - labels['primary_group'] = np.array([p.group_id()==p.parent_id() for p in particles_v], dtype=np.float32) + labels['primary_group'] = primary_ids labels['sem'] = np.array([p.shape() for p in particles_v]) # Loop over clusters, store info diff --git a/mlreco/utils/groups.py b/mlreco/utils/groups.py index a945fb03..4771bd13 100644 --- 
a/mlreco/utils/groups.py +++ b/mlreco/utils/groups.py @@ -357,9 +357,9 @@ def get_particle_id(particles_v, nu_ids, include_mpr=False): return particle_ids -def get_primary_id(cluster_event, particles_v): +def get_shower_primary_id(cluster_event, particles_v): ''' - Function that assigns valid primary tags. + Function that assigns valid primary tags to shower fragments. This could be handled somewhere else (e.g. SUPERA) Inputs: @@ -368,35 +368,35 @@ def get_primary_id(cluster_event, particles_v): Outputs: - array: (N) list of group ids ''' - # Only shower fragments that come first in time and deposit energy can be primaries - group_ids = np.array([p.group_id() for p in particles_v]) - primary_ids = np.empty(particles_v.size(), dtype=np.int32) + # Loop over the list of particles + group_ids = np.array([p.group_id() for p in particles_v]) + primary_ids = np.empty(particles_v.size(), dtype=np.int32) for i, p in enumerate(particles_v): - # If the particle is LE, not primary - if p.shape() == 4: + # If the particle is a track or a low energy cluster, it is not a primary shower fragment + if p.shape() == 1 or p.shape() == 4: primary_ids[i] = 0 continue - # If the particle is not EM, use default - gid = int(p.group_id()) - if p.shape() != 0: - primary_ids[i] = int(gid == i) + # If a particle is a Delta or a Michel, it is a primary shower fragment + if p.shape() == 2 or p.shape() == 3: + primary_ids[i] = 1 continue - # If the particle is nuclear activity, Delta or Michel, make it non primary + # If the shower fragment originates from nuclear activity, it is not a primary process = p.creation_process() parent_pdg_code = abs(p.parent_pdg_code()) - if 'Inelastic' in process or 'Capture' in process or parent_pdg_code == 13: + if 'Inelastic' in process or 'Capture' in process or parent_pdg_code == 2112: primary_ids[i] = 0 continue - # If a particle's parent fragment has size zero, make it non primary + # If a shower group's parent fragment has size zero, there is no valid primary in the group + gid = int(p.group_id()) parent_size = cluster_event.as_vector()[gid].as_vector().size() if not parent_size: primary_ids[i] = 0 continue - # If a particle's parent is not the first in time, make it non primary + # If a shower group's parent fragment is not the first in time, there is no valid primary in the group idxs = np.where(group_ids == gid)[0] clust_times = np.array([particles_v[int(j)].first_step().t() for j in idxs]) min_id = np.argmin(clust_times) @@ -404,7 +404,45 @@ def get_primary_id(cluster_event, particles_v): primary_ids[i] = 0 continue - # Use default otherwise + # If all conditions are met, label shower fragments which have identical ID and group ID as primary primary_ids[i] = int(gid == i) return primary_ids + + +def get_group_primary_id(particles_v): + ''' + Function that assigns valid primary tags to particle groups. + This could be handled somewhere else (e.g. 
SUPERA) + + Inputs: + - particles_v (array of larcv::Particle) : (N) LArCV Particle objects + Outputs: + - array: (N) list of group ids + ''' + # Loop over the list of particles + primary_ids = np.empty(particles_v.size(), dtype=np.int32) + for i, p in enumerate(particles_v): + # If the particle is not a shower or a track, it is not a primary + if p.shape() != 0 and p.shape() != 1: + primary_ids[i] = 0 + continue + + # If the particle group originates from nuclear activity, it is not a primary + gid = int(p.group_id()) + process = particles_v[gid].creation_process() + parent_pdg_code = abs(particles_v[gid].parent_pdg_code()) + ancestor_pdg_code = abs(particles_v[gid].ancestor_pdg_code()) + if 'Inelastic' in process or 'Capture' in process or parent_pdg_code == 2112 or ancestor_pdg_code == 2112: + primary_ids[i] = 0 + continue + + # If the parent is a pi0, make sure that it is a primary pi0 (pi0s are not stored in particle list) + if parent_pdg_code == 111 and ancestor_pdg_code != 111: + primary_ids[i] = 0 + continue + + # If the parent ID of the primary particle in the group is the same as the group ID, it is a primary + primary_ids[i] = int(particles_v[gid].parent_id() == gid) + + return primary_ids From 90a1b0c4aa4d743b20ac478d6f9e1473c16ea961 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 9 Nov 2022 23:02:50 -0800 Subject: [PATCH 43/52] Bug fix in training curve visualization --- mlreco/visualization/training.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mlreco/visualization/training.py b/mlreco/visualization/training.py index b61b5770..40c7ed87 100644 --- a/mlreco/visualization/training.py +++ b/mlreco/visualization/training.py @@ -79,7 +79,7 @@ def find_key(df, key_list, separator=':'): return key, key_name -def get_training_df(log_dir, prefix='train'): +def get_training_df(log_dir, keys, prefix='train'): """ Finds all training log files inside the specified directory and concatenates them. 
If the range of iterations overlap, keep only @@ -90,6 +90,7 @@ def get_training_df(log_dir, prefix='train'): Args: log_dir (str): Path to the directory that contains the training log files + keys (list) : List of quantities of interest prefix (str) : Prefix shared between training file names (default: `train`) Returns: pandas.DataFrame: Combined training log data @@ -98,7 +99,15 @@ def get_training_df(log_dir, prefix='train'): end_points = np.array([int(f.split('-')[-1].split('.csv')[0]) for f in log_files]) order = np.argsort(end_points) end_points = np.append(end_points[order], 1e12) - return pd.concat([pd.read_csv(f, nrows=end_points[i+1]-end_points[i]) for i, f in enumerate(log_files[order])], sort=True) + log_dfs = [] + for i, f in enumerate(log_files[order]): + df = pd.read_csv(f, nrows=end_points[i+1]-end_points[i]) + for key_list in keys: + key, key_name = find_key(df, key_list) + df[key_name] = df[key] + log_dfs.append(df) + + return pd.concat(log_dfs, sort=True) def get_validation_df(log_dir, keys, prefix='inference'): @@ -228,7 +237,7 @@ def draw_training_curves(log_dir, models, metrics, dfs, val_dfs, colors = {}, {}, {} for i, key in enumerate(models): log_subdir = log_dir+key - dfs[key] = get_training_df(log_subdir, train_prefix) + dfs[key] = get_training_df(log_subdir, metrics, train_prefix) val_dfs[key] = get_validation_df(log_subdir, metrics, val_prefix) colors[key] = plotly_colors[i] From 90828578bfee36cc450afd1887d0bd66fda8be47 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 10 Nov 2022 08:25:33 -0800 Subject: [PATCH 44/52] Relabeling of deghosted points now uses Chebyshev distance 1 in DBSCAN (cheaper, reliable) --- mlreco/utils/deghosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index 5540151b..f02b0e68 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -215,7 +215,7 @@ def adapt_labels_knn(result, label_seg, label_clustering, # for which cluster id and group id columns are 5 and 6 respectively. 
cluster_id_col = 5 track_label = 1 - dbscan = DBSCAN(eps=np.sqrt(3), min_samples=1) + dbscan = DBSCAN(eps=1.1, min_samples=1, metric='chebyshev') track_mask = label_c[:, -1] == track_label for batch_id in unique(coords[:, batch_column]): batch_mask = label_c[:, batch_column] == batch_id From 814c93aaa1dd47e13af9a9b6795db5097bb9fd51 Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 10 Nov 2022 09:21:40 -0800 Subject: [PATCH 45/52] Fix multibatch bug for flash matching + add missing light yield in QCluster_t --- analysis/algorithms/selections/__init__.py | 1 + analysis/classes/FlashManager.py | 10 ++++++---- analysis/classes/TruthInteraction.py | 4 ++++ analysis/classes/ui.py | 21 +++++++++++++-------- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/analysis/algorithms/selections/__init__.py b/analysis/algorithms/selections/__init__.py index 392f6a06..b4887760 100644 --- a/analysis/algorithms/selections/__init__.py +++ b/analysis/algorithms/selections/__init__.py @@ -3,3 +3,4 @@ from .michel_electrons import michel_electrons from .example_nue import debug_pid from .statistics import statistics +from .flash_matching import flash_matching diff --git a/analysis/classes/FlashManager.py b/analysis/classes/FlashManager.py index 4daf4a06..7ebcb73f 100644 --- a/analysis/classes/FlashManager.py +++ b/analysis/classes/FlashManager.py @@ -59,6 +59,8 @@ def __init__(self, cfg, cfg_fmatch, meta=None, detector_specs=None): self.size_voxel_x = meta[6] self.size_voxel_y = meta[7] self.size_voxel_z = meta[8] + #print('Meta min = ', self.min_x, self.min_y, self.min_z) + #print('Meta size = ', self.size_voxel_x, self.size_voxel_y, self.size_voxel_z) # Setup flash matching print('Setting up OpT0Finder for flash matching...') @@ -104,7 +106,7 @@ def get_qcluster(self, tpc_id, array=False): raise Exception("TPC object %d does not exist in self.tpc_v" % tpc_id) - def make_qcluster(self, interactions): + def make_qcluster(self, interactions, use_depositions_MeV=False, ADC_to_MeV=1.): """ Make flashmatch::QCluster_t objects from list of interactions. 
@@ -138,7 +140,7 @@ def make_qcluster(self, interactions): p.points[i, 0] * self.size_voxel_x + self.min_x, p.points[i, 1] * self.size_voxel_y + self.min_y, p.points[i, 2] * self.size_voxel_z + self.min_z, - p.depositions[i]) + p.depositions[i]*ADC_to_MeV*self.det.LightYield() if not use_depositions_MeV else p.depositions_MeV[i]*self.det.LightYield()) # Add it to geoalgo::QCluster_t qcluster.push_back(qpoint) tpc_v.append(qcluster) @@ -188,12 +190,12 @@ def make_flash(self, larcv_flashes): self.pmt_v = pmt_v return pmt_v - def run_flash_matching(self, flashes=None, interactions=None): + def run_flash_matching(self, flashes=None, interactions=None, **kwargs): if self.tpc_v is None: if interactions is None: raise Exception('You need to specify `interactions`, or to run make_qcluster.') if interactions is not None: - self.make_qcluster(interactions) + self.make_qcluster(interactions, **kwargs) if self.pmt_v is None: diff --git a/analysis/classes/TruthInteraction.py b/analysis/classes/TruthInteraction.py index d76f863a..68a23450 100644 --- a/analysis/classes/TruthInteraction.py +++ b/analysis/classes/TruthInteraction.py @@ -12,6 +12,10 @@ def __init__(self, *args, **kwargs): super(TruthInteraction, self).__init__(*args, **kwargs) self.match = [] self._match_counts = {} + self.depositions_MeV = [] + for p in self.particles: + self.depositions_MeV.append(p.depositions_MeV) + self.depositions_MeV = np.hstack(self.depositions_MeV) def check_validity(self): for p in self.particles: diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index 6ab72311..ae3f9bc5 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -130,7 +130,7 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False, self.fm = FlashManager(cfg, flash_matching_cfg, meta=self.data_blob['meta'][0]) self.opflash_keys = opflash_keys - self.flash_matches = {} # key is (volume, use_true_tpc_objects), value is tuple (tpc_v, pmt_v, list of matches) + self.flash_matches = {} # key is (entry, volume, use_true_tpc_objects), value is tuple (tpc_v, pmt_v, list of matches) # type is (list of Interaction/TruthInteraction, list of larcv::Flash, list of flashmatch::FlashMatch_t) @@ -138,7 +138,9 @@ def __repr__(self): msg = "FullChainEvaluator(num_images={})".format(int(self.num_images/self._num_volumes)) return msg - def get_flash_matches(self, entry, use_true_tpc_objects=False, volume=None): + def get_flash_matches(self, entry, use_true_tpc_objects=False, volume=None, + use_depositions_MeV=False, + ADC_to_MeV=1.): """ If flash matches has not yet been computed for this volume, then it will be run as part of this function. 
Otherwise, flash matching results are @@ -155,13 +157,16 @@ def get_flash_matches(self, entry, use_true_tpc_objects=False, volume=None): ======= list of tuple (Interaction, larcv::Flash, flashmatch::FlashMatch_t) """ - if (volume, use_true_tpc_objects) not in self.flash_matches: - self._run_flash_matching(entry, use_true_tpc_objects=use_true_tpc_objects, volume=volume) + if (entry, volume, use_true_tpc_objects) not in self.flash_matches: + self._run_flash_matching(entry, use_true_tpc_objects=use_true_tpc_objects, volume=volume, + use_depositions_MeV=use_depositions_MeV, ADC_to_MeV=ADC_to_MeV) - tpc_v, pmt_v, matches = self.flash_matches[(volume, use_true_tpc_objects)] + tpc_v, pmt_v, matches = self.flash_matches[(entry, volume, use_true_tpc_objects)] return [(tpc_v[m.tpc_id], pmt_v[m.flash_id], m) for m in matches] - def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None): + def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None, + use_depositions_MeV=False, + ADC_to_MeV=1.): """ Parameters ========== @@ -178,7 +183,7 @@ def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None): else: tpc_v = self.get_interactions(entry, drop_nonprimary_particles=False, volume=volume) - input_tpc_v = self.fm.make_qcluster(tpc_v) + input_tpc_v = self.fm.make_qcluster(tpc_v, use_depositions_MeV=use_depositions_MeV, ADC_to_MeV=ADC_to_MeV) selected_opflash_keys = self.opflash_keys if volume is not None: @@ -190,7 +195,7 @@ def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None): input_pmt_v = self.fm.make_flash([self.data_blob[key][entry] for key in selected_opflash_keys]) matches = self.fm.run_flash_matching() - self.flash_matches[(volume, use_true_tpc_objects)] = (tpc_v, pmt_v, matches) + self.flash_matches[(entry, volume, use_true_tpc_objects)] = (tpc_v, pmt_v, matches) def _fit_predict_ppn(self, entry): ''' From 2021ff7120be650689bf1d10298b05379b9369a5 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 10 Nov 2022 11:16:24 -0800 Subject: [PATCH 46/52] Added option to compute local direction and dedx from particle end point in GrapPA. Warning: all *start* parameters now deprecated. --- mlreco/models/grappa.py | 94 ++++++++++++++++++++----------------- mlreco/utils/gnn/cluster.py | 2 +- 2 files changed, 53 insertions(+), 43 deletions(-) diff --git a/mlreco/models/grappa.py b/mlreco/models/grappa.py index 38d8b74f..1631debd 100644 --- a/mlreco/models/grappa.py +++ b/mlreco/models/grappa.py @@ -36,27 +36,27 @@ class GNN(torch.nn.Module): .. code-block:: yaml base: - node_type : - node_min_size : - source_col : - target_col : - use_dbscan : - add_start_point : - add_start_dir : - start_dir_max_dist: - start_dir_opt : - add_start_dedx : - network : - edge_max_dist : - edge_dist_method : - merge_batch : - merge_batch_mode : - merge_batch_size : - shuffle_clusters : - kinematics_mlp : + source_col : + target_col : + node_type : + node_min_size : + add_points : + add_local_dirs : + dir_max_dist : + add_local_dedxs : + dedx_max_dist : + network : + edge_max_dist : + edge_dist_method: + merge_batch : + merge_batch_mode: + merge_batch_size: + shuffle_clusters: dbscan: dict + dictionary of dbscan parameters + node_encoder: dict .. 
code-block:: yaml @@ -128,21 +128,28 @@ def __init__(self, cfg, name='grappa', batch_col=0, coords_col=(1, 4)): # Get the chain input parameters base_config = cfg[name].get('base', {}) self.name = name + self.batch_index = batch_col + self.coords_index = coords_col # Choose what type of node to use - self.node_type = base_config.get('node_type', 0) - self.node_min_size = base_config.get('node_min_size', -1) - self.source_col = base_config.get('source_col', 5) - self.target_col = base_config.get('target_col', 6) - self.add_start_point = base_config.get('add_start_point', False) - self.add_start_dir = base_config.get('add_start_dir', False) - self.start_dir_max_dist = base_config.get('start_dir_max_dist', -1) - self.start_dir_opt = base_config.get('start_dir_opt', False) - self.add_start_dedx = base_config.get('add_start_dedx', False) + self.source_col = base_config.get('source_col', 5) + self.target_col = base_config.get('target_col', 6) + self.node_type = base_config.get('node_type', -1) + self.node_min_size = base_config.get('node_min_size', -1) + self.add_points = base_config.get('add_points', False) + self.add_local_dirs = base_config.get('add_local_dirs', False) + self.dir_max_dist = base_config.get('dir_max_dist', 5) + self.opt_dir_max_dist = self.dir_max_dist == 'optimize' + self.add_local_dedxs = base_config.get('add_local_dedxs', False) + self.dedx_max_dist = base_config.get('dedx_max_dist', 5) self.shuffle_clusters = base_config.get('shuffle_clusters', False) - self.batch_index = batch_col - self.coords_index = coords_col + # *Deprecated* but kept for backward compatibility: + if 'add_start_point' in base_config: self.add_points = base_config['add_start_point'] + if 'add_start_dir' in base_config: self.add_local_dirs = 'start' if base_config['add_start_dir'] else False + if 'add_start_dedx' in base_config: self.add_local_dedxs = 'start' if base_config['add_start_dedx'] else False + if 'start_dir_max_dist' in base_config: self.dir_max_dist = self.dedx_max_dist = base_config['start_dir_max_dist'] + if 'start_dir_opt' in base_config: self.opt_dir_max_dist = base_config['start_dir_opt'] # Interpret node type as list of classes to cluster, -1 means all classes if isinstance(self.node_type, int): self.node_type = [self.node_type] @@ -160,7 +167,6 @@ def __init__(self, cfg, name='grappa', batch_col=0, coords_col=(1, 4)): max_dist_mat[np.triu_indices(mat_size)] = self.edge_max_dist max_dist_mat += max_dist_mat.T - np.diag(np.diag(max_dist_mat)) self.edge_max_dist = max_dist_mat - print('edge_max_dist matrix', self.edge_max_dist) # If requested, merge images together within the batch self.merge_batch = base_config.get('merge_batch', False) @@ -357,21 +363,25 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, if extra_feats is not None: x = torch.cat([x, extra_feats.float()], dim=1) - # Add start point and/or start direction to node features if requested - if self.add_start_point or points is not None: + # Add end points and/or local directions to node features, if requested + if self.add_points or points is not None: if points is None: points = get_cluster_points_label(cluster_data, particles, clusts, coords_index=self.coords_index) x = torch.cat([x, points.float()], dim=1) - if self.add_start_dir: - dirs_start = get_cluster_directions(cluster_data[:, self.coords_index[0]:self.coords_index[1]], points[:,:3], clusts, self.start_dir_max_dist, self.start_dir_opt) - dirs_end = get_cluster_directions(cluster_data[:, self.coords_index[0]:self.coords_index[1]], 
points[:,3:6], clusts, self.start_dir_max_dist, self.start_dir_opt) - #x = torch.cat([x, dirs_start.float(), dirs_end.float()], dim=1) - x = torch.cat([x, dirs_start.float()], dim=1) - if self.add_start_dedx: - dedxs_start = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,:3], clusts, self.start_dir_max_dist) - dedxs_end = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,3:6], clusts, self.start_dir_max_dist) - #x = torch.cat([x, dedxs_start.reshape(-1,1).float(), dedxs_end.reshape(-1,1).float()], dim=1) - x = torch.cat([x, dedxs_start.reshape(-1,1).float()], dim=1) + if self.add_local_dirs: + dirs_start = get_cluster_directions(cluster_data[:, self.coords_index[0]:self.coords_index[1]], points[:,:3], clusts, self.dir_max_dist, self.opt_dir_max_dist) + if self.add_local_dirs != 'start': + dirs_end = get_cluster_directions(cluster_data[:, self.coords_index[0]:self.coords_index[1]], points[:,3:6], clusts, self.dir_max_dist, self.opt_dir_max_dist) + x = torch.cat([x, dirs_start.float(), dirs_end.float()], dim=1) + else: + x = torch.cat([x, dirs_start.float()], dim=1) + if self.add_local_dedxs: + dedxs_start = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,:3], clusts, self.dedx_max_dir) + if self.add_local_dedxs != 'start': + dedxs_end = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,3:6], clusts, self.dedx_max_dir) + x = torch.cat([x, dedxs_start.reshape(-1,1).float(), dedxs_end.reshape(-1,1).float()], dim=1) + else: + x = torch.cat([x, dedxs_start.reshape(-1,1).float()], dim=1) # Bring edge_index and batch_ids to device index = torch.tensor(edge_index, device=cluster_data.device, dtype=torch.long) diff --git a/mlreco/utils/gnn/cluster.py b/mlreco/utils/gnn/cluster.py index d933ce93..e7b67f89 100644 --- a/mlreco/utils/gnn/cluster.py +++ b/mlreco/utils/gnn/cluster.py @@ -575,7 +575,7 @@ def cluster_direction(voxels: nb.float64[:,:], torch.tensor: (3) Orientation """ # If max_dist is set, limit the set of voxels to those within a sphere of radius max_dist - if max_dist > 0 and not optimize: + if not optimize and max_dist > 0: dist_mat = cdist_nb(start.reshape(1,-1), voxels).flatten() voxels = voxels[dist_mat <= max_dist] if len(voxels) < 2: From 7a8f915bb2b529eb85456b1d001d69072abfe106 Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 10 Nov 2022 15:31:26 -0800 Subject: [PATCH 47/52] Fix bug for volume=None --- analysis/classes/FlashManager.py | 8 ++-- analysis/classes/ui.py | 65 +++++++++++++++++++++++++++----- mlreco/iotools/collates.py | 25 ++++++++++++ 3 files changed, 84 insertions(+), 14 deletions(-) diff --git a/analysis/classes/FlashManager.py b/analysis/classes/FlashManager.py index 7ebcb73f..65c3fc81 100644 --- a/analysis/classes/FlashManager.py +++ b/analysis/classes/FlashManager.py @@ -145,8 +145,8 @@ def make_qcluster(self, interactions, use_depositions_MeV=False, ADC_to_MeV=1.): qcluster.push_back(qpoint) tpc_v.append(qcluster) - if self.tpc_v is not None: - print("Warning: overwriting internal list of particles.") + #if self.tpc_v is not None: + # print("Warning: overwriting internal list of particles.") self.tpc_v = tpc_v return tpc_v @@ -185,8 +185,8 @@ def make_flash(self, larcv_flashes): flash.pe_v.push_back(f.PEPerOpDet()[i + offset]) flash.pe_err_v.push_back(0.) 
pmt_v.append(flash) - if self.pmt_v is not None: - print("Warning: overwriting internal list of flashes.") + #if self.pmt_v is not None: + # print("Warning: overwriting internal list of flashes.") self.pmt_v = pmt_v return pmt_v diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index ae3f9bc5..c94d0d05 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -183,8 +183,18 @@ def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None, else: tpc_v = self.get_interactions(entry, drop_nonprimary_particles=False, volume=volume) + # If we are not running flash matching over the entire volume at once, + # then we need to shift the coordinates that will be used for flash matching + # back to the reference of the first volume. + if volume is not None: + for tpc_object in tpc_v: + tpc_object.points = self._untranslate(tpc_object.points, volume) input_tpc_v = self.fm.make_qcluster(tpc_v, use_depositions_MeV=use_depositions_MeV, ADC_to_MeV=ADC_to_MeV) + if volume is not None: + for tpc_object in tpc_v: + tpc_object.points = self._translate(tpc_object.points, volume) + # Now making Flash_t objects selected_opflash_keys = self.opflash_keys if volume is not None: assert isinstance(volume, int) @@ -194,6 +204,7 @@ def _run_flash_matching(self, entry, use_true_tpc_objects=False, volume=None, pmt_v.extend(self.data_blob[key][entry]) input_pmt_v = self.fm.make_flash([self.data_blob[key][entry] for key in selected_opflash_keys]) + # Running flash matching and caching the results matches = self.fm.run_flash_matching() self.flash_matches[(entry, volume, use_true_tpc_objects)] = (tpc_v, pmt_v, matches) @@ -517,11 +528,45 @@ def _check_volume(self, volume): assert isinstance(volume, (int, np.int64, np.int32)) and volume >= 0 def _translate(self, voxels, volume): - if self.vb is None: + """ + Go from 1-volume-only back to full volume coordinates + + Parameters + ========== + voxels: np.ndarray + Shape (N, 3) + volume: int + + Returns + ======= + np.ndarray + Shape (N, 3) + """ + if self.vb is None or volume is None: return voxels else: return self.vb.translate(voxels, volume) + def _untranslate(self, voxels, volume): + """ + Go from full volume to 1-volume-only coordinates + + Parameters + ========== + voxels: np.ndarray + Shape (N, 3) + volume: int + + Returns + ======= + np.ndarray + Shape (N, 3) + """ + if self.vb is None or volume is None: + return voxels + else: + return self.vb.untranslate(voxels, volume) + def get_fragments(self, entry, only_primaries=False, min_particle_voxel_count=-1, attaching_threshold=2, @@ -565,7 +610,7 @@ def get_fragments(self, entry, only_primaries=False, out_fragment_list = [] for entry in entries: - volume = entry % self._num_volumes + volume = entry % self._num_volumes if volume is not None else volume point_cloud = self.data_blob['input_data'][entry][:, 1:4] depositions = self.result['input_rescaled'][entry][:, 4] @@ -714,7 +759,7 @@ def get_particles(self, entry, only_primaries=True, out_particle_list = [] for entry in entries: - volume = entry % self._num_volumes + volume = entry % self._num_volumes if volume is not None else volume point_cloud = self.data_blob['input_data'][entry][:, 1:4] depositions = self.result['input_rescaled'][entry][:, 4] @@ -833,7 +878,7 @@ def get_interactions(self, entry, drop_nonprimary_particles=True, volume=None) - out_interaction_list = [] for e in entries: - volume = e % self._num_volumes + volume = e % self._num_volumes if volume is not None else volume particles = self.get_particles(entry, 
only_primaries=drop_nonprimary_particles, volume=volume) out = group_particles_to_interactions_fn(particles) for ia in out: @@ -1073,7 +1118,7 @@ def get_true_fragments(self, entry, verbose=False, volume=None) -> List[TruthPar out_fragments_list = [] for entry in entries: - volume = entry % self._num_volumes + volume = entry % self._num_volumes if volume is not None else volume # Both are "adapted" labels labels = self.data_blob['cluster_label'][entry] @@ -1185,7 +1230,7 @@ def get_true_particles(self, entry, only_primaries=True, out_particles_list = [] global_entry = entry for entry in entries: - volume = entry % self._num_volumes + volume = entry % self._num_volumes if volume is not None else volume labels = self.data_blob['cluster_label'][entry] if self.deghosting: @@ -1328,7 +1373,7 @@ def get_true_interactions(self, entry, drop_nonprimary_particles=True, entries = self._get_entries(entry, volume) out_interactions_list = [] for e in entries: - volume = e % self._num_volumes + volume = e % self._num_volumes if volume is not None else volume true_particles = self.get_true_particles(entry, only_primaries=drop_nonprimary_particles, volume=volume) out = group_particles_to_interactions_fn(true_particles, get_nu_id=True, mode='truth') @@ -1359,7 +1404,7 @@ def get_true_vertices(self, entry, volume=None): entries = self._get_entries(entry, volume) out = {} for entry in entries: - volume = entry % self._num_volumes + volume = entry % self._num_volumes if volume is not None else volume inter_idxs = np.unique( self.data_blob['cluster_label'][entry][:, 7].astype(int)) for inter_idx in inter_idxs: @@ -1394,7 +1439,7 @@ def match_particles(self, entry, entries = self._get_entries(entry, volume) all_matches = [] for e in entries: - volume = e % self._num_volumes + volume = e % self._num_volumes if volume is not None else volume if mode == 'pred_to_true': # Match each pred to one in true particles_from = self.get_particles(entry, only_primaries=only_primaries, volume=volume) @@ -1440,7 +1485,7 @@ def match_interactions(self, entry, mode='pred_to_true', entries = self._get_entries(entry, volume) all_matches, all_counts = [], [] for e in entries: - volume = e % self._num_volumes + volume = e % self._num_volumes if volume is not None else volume if mode == 'pred_to_true': ints_from = self.get_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles, volume=volume) ints_to = self.get_true_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles, volume=volume) diff --git a/mlreco/iotools/collates.py b/mlreco/iotools/collates.py index ee040b03..9a4770fa 100644 --- a/mlreco/iotools/collates.py +++ b/mlreco/iotools/collates.py @@ -131,6 +131,31 @@ def translate(self, voxels, volume): new_voxels[..., n] += int(self.shifts[n][self.combo[volume][n]]) return new_voxels + def untranslate(self, voxels, volume): + """ + Meant to reverse what the translate method does: for voxels coordinates initially in the range of full detector, + translate to the range of 1 volume for a specific volume given in argument. + + Parameters + ========== + voxels: np.ndarray + Expected shape is (D_0, ..., D_N, self.dim) with N >=0. In other words, voxels can be a list of + coordinate or a single coordinate with shape (d,). + volume: int + + Returns + ======= + np.ndarray + Translated voxels array, using internally computed shifts. 
+ """ + assert volume >= 0 and volume < self.num_volumes() + assert voxels.shape[-1] == self.dim + + new_voxels = voxels.copy() + for n in range(self.dim): + new_voxels[..., n] -= int(self.shifts[n][self.combo[volume][n]]) + return new_voxels + def split(self, voxels): """ Parameters From be9a192fdca56f520aec839bc4f947f137d49fd1 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 15 Nov 2022 10:33:53 -0800 Subject: [PATCH 48/52] Bug fix GNN full chain when using charge rescaling + gSPICE --- mlreco/models/layers/common/gnn_full_chain.py | 2 +- mlreco/visualization/training.py | 53 +++++++++++-------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 26d4a08d..f665b962 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -726,7 +726,7 @@ def forward(self, out, seg_label, ppn_label=None, cluster_label=None, kinematics segmentation_pred = out['segmentation'][0] - if self.enable_ghost and not self.enable_charge_rescaling: + if self.enable_ghost: segmentation_pred = segmentation_pred[deghost] if self._gspice_use_true_labels: gs_seg_label = torch.cat([cluster_label[0][:, :4], segment_label[:, None]], dim=1) diff --git a/mlreco/visualization/training.py b/mlreco/visualization/training.py index 40c7ed87..2e09ac75 100644 --- a/mlreco/visualization/training.py +++ b/mlreco/visualization/training.py @@ -156,7 +156,8 @@ def get_validation_df(log_dir, keys, prefix='inference'): def draw_training_curves(log_dir, models, metrics, limits={}, model_names={}, metric_names={}, - max_iter=-1, step=1, smoothing=1, print_min=False, print_max=False, + max_iter=-1, step=1, smoothing=1, iter_per_epoch=-1, + print_min=False, print_max=False, interactive=True, same_plot=True, paper=False, leg_ncols=1, figure_name='', train_prefix='train', val_prefix='inference'): """ @@ -164,22 +165,23 @@ def draw_training_curves(log_dir, models, metrics, directory and draws an evolution plot of the request quantities. Args: - log_dir (str) : Path to the directory that contains the folder with log files - models (list) : List of model (folder) names under the main directory - metrics (list) : List of quantities to draw - limits (list/dict) : List of y boundaries for the plot (or dictionary of y boundaries, one per metric) - model_names (dict) : Dictionary which maps raw model names to model labels (default: `{}`) - metric_names (dict): Dictionary which maps raw metric names to metric labels (default: `{}`) - max_iter (int) : Maximum number of interation to include in the plot (default: `-1`) - step (int) : Step between two successive iterations that are represented (default: `1`) - smoothing (int) : Number of iteration over which to average the metric value (default: `1`) - interactive (bool) : Use plotly to draw (default: `True`) - same_plot (bool) : Draw all model/metric pairs on a single plot (default: `True`) - paper (bool) : Format plot for paper, using latex (default: `False`) - leg_ncols (int) : Number of columns in the legend (default: `1`) - figure_name (str) : Name of the figure. 
If specified, figure is saved (default: `''`) - train_prefix (str) : Prefix shared between training file names (default: `train`) - val_prefix (str) : Prefix shared between validation file names (default: `inference`) + log_dir (str) : Path to the directory that contains the folder with log files + models (list) : List of model (folder) names under the main directory + metrics (list) : List of quantities to draw + limits (list/dict) : List of y boundaries for the plot (or dictionary of y boundaries, one per metric) + model_names (dict) : Dictionary which maps raw model names to model labels (default: `{}`) + metric_names (dict) : Dictionary which maps raw metric names to metric labels (default: `{}`) + max_iter (int) : Maximum number of interation to include in the plot (default: `-1`) + step (int) : Step between two successive iterations that are represented (default: `1`) + smoothing (int) : Number of iteration over which to average the metric value (default: `1`) + iter_per_epoch (float): Number of iterations to complete an epoch (default: `-1`, figures it out from train log) + interactive (bool) : Use plotly to draw (default: `True`) + same_plot (bool) : Draw all model/metric pairs on a single plot (default: `True`) + paper (bool) : Format plot for paper, using latex (default: `False`) + leg_ncols (int) : Number of columns in the legend (default: `1`) + figure_name (str) : Name of the figure. If specified, figure is saved (default: `''`) + train_prefix (str) : Prefix shared between training file names (default: `train`) + val_prefix (str) : Prefix shared between validation file names (default: `inference`) """ # Set the style plotly_colors = pcolors.convert_colors_to_same_type(pcolors.DEFAULT_PLOTLY_COLORS, 'tuple')[0] @@ -251,11 +253,18 @@ def draw_training_curves(log_dir, models, metrics, metric_train = dfs[key][metric][:max_iter:step] if smoothing == 1 else dfs[key][metric][:max_iter].rolling(smoothing, min_periods=1, center=True).mean()[::step] draw_val = bool(len(val_dfs[key]['iter'])) if draw_val: - mask_val = val_dfs[key]['iter'] < max_iter if max_iter > -1 else val_dfs[key]['iter'] < 1e12 - iter_val = val_dfs[key]['iter'][mask_val] - epoch_val = [float(dfs[key]['epoch'][dfs[key]['iter'] == it]) for it in iter_val] - metricm_val = val_dfs[key][metric_name+'_mean'][mask_val] - metrice_val = val_dfs[key][metric_name+'_err'][mask_val] + mask_val = val_dfs[key]['iter'] < max_iter if max_iter > -1 else val_dfs[key]['iter'] < 1e12 + iter_val = val_dfs[key]['iter'][mask_val] + if iter_per_epoch < 0: + epoch_val = [dfs[key]['epoch'][dfs[key]['iter'] == it] for it in iter_val] + epoch_val = np.array([float(e) if len(e)==1 else -1 for e in epoch_val]) + mask_val &= epoch_val > -1 + iter_val = iter_val[epoch_val > -1] + epoch_val = epoch_val[epoch_val > -1] + else: + epoch_val = iter_val/iter_per_epoch + metricm_val = val_dfs[key][metric_name+'_mean'][mask_val] + metrice_val = val_dfs[key][metric_name+'_err'][mask_val] # Pick a label for this specific model/metric pair if not same_plot: From 354e5ee71b6ee0757c45eb3394722a52a8d0ffc3 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 16 Nov 2022 23:25:14 -0800 Subject: [PATCH 49/52] Add option to exclude MPR particles from the primary target --- mlreco/iotools/parsers/cluster.py | 14 ++++++++++---- mlreco/utils/groups.py | 10 +++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/mlreco/iotools/parsers/cluster.py b/mlreco/iotools/parsers/cluster.py index 72216e61..6659d94f 100644 --- 
a/mlreco/iotools/parsers/cluster.py +++ b/mlreco/iotools/parsers/cluster.py @@ -63,7 +63,8 @@ def parse_cluster3d(cluster_event, add_kinematics_info = False, clean_data = True, precedence = [1,2,0,3,4], - type_include_mpr = False): + type_include_mpr = False, + primary_include_mpr = True): """ a function to retrieve a 3D clusters tensor @@ -82,6 +83,8 @@ def parse_cluster3d(cluster_event, add_kinematics_info: false clean_data: true precedence: [1,2,0,3,4] + type_include_mpr: false + primary_include_mpr: true Configuration ------------- @@ -94,6 +97,8 @@ def parse_cluster3d(cluster_event, add_kinematics_info: bool clean_data: bool precedence: list + type_include_mpr: bool + primary_include_mpr: bool Returns ------- @@ -140,7 +145,7 @@ def parse_cluster3d(cluster_event, labels['type'] = get_particle_id(particles_v, nu_ids, include_mpr=type_include_mpr) labels['primary_shower'] = get_shower_primary_id(cluster_event, particles_v) if add_kinematics_info: - primary_ids = get_group_primary_id(particles_v) + primary_ids = get_group_primary_id(particles_v, nu_ids, include_mpr=primary_include_mpr) particles_v = parse_particles(particle_event, cluster_event) labels['type'] = get_particle_id(particles_v, nu_ids, include_mpr=type_include_mpr) labels['p'] = np.array([p.p()/1e3 for p in particles_v]) # In GeV @@ -202,10 +207,11 @@ def parse_cluster3d_charge_rescaled(cluster_event, add_kinematics_info = False, clean_data = True, precedence = [1,2,0,3,4], - type_include_mpr = False): + type_include_mpr = False, + primary_include_mpr = False): # Produces cluster3d labels with sparse3d_reco_rescaled on the fly on datasets that do not have it np_voxels, np_features = parse_cluster3d(cluster_event, particle_event, particle_mpv_event, sparse_semantics_event, None, - add_particle_info, add_kinematics_info, clean_data, precedence, type_include_mpr) + add_particle_info, add_kinematics_info, clean_data, precedence, type_include_mpr, primary_include_mpr) from .sparse import parse_sparse3d_charge_rescaled _, val_features = parse_sparse3d_charge_rescaled(sparse_value_event_list) diff --git a/mlreco/utils/groups.py b/mlreco/utils/groups.py index 4771bd13..7c948234 100644 --- a/mlreco/utils/groups.py +++ b/mlreco/utils/groups.py @@ -333,6 +333,7 @@ def get_particle_id(particles_v, nu_ids, include_mpr=False): Inputs: - particles_v (array of larcv::Particle) : (N) LArCV Particle objects - nu_ids: a numpy array with shape (n, 1) where 1 is neutrino id (0 if not an MPV) + - include_mpr: include MPR (cosmic-like) particles to PID target Outputs: - array: (N) list of group ids ''' @@ -410,19 +411,26 @@ def get_shower_primary_id(cluster_event, particles_v): return primary_ids -def get_group_primary_id(particles_v): +def get_group_primary_id(particles_v, nu_ids=None, include_mpr=True): ''' Function that assigns valid primary tags to particle groups. This could be handled somewhere else (e.g. 
SUPERA) Inputs: - particles_v (array of larcv::Particle) : (N) LArCV Particle objects + - nu_ids: a numpy array with shape (n, 1) where 1 is neutrino id (0 if not an MPV) + - include_mpr: include MPR (cosmic-like) particles to primary target Outputs: - array: (N) list of group ids ''' # Loop over the list of particles primary_ids = np.empty(particles_v.size(), dtype=np.int32) for i, p in enumerate(particles_v): + # If MPR particles are not included and the nu_id < 1, assign invalid + if not include_mpr and nu_ids[i] < 1: + primary_ids[i] = -1 + continue + # If the particle is not a shower or a track, it is not a primary if p.shape() != 0 and p.shape() != 1: primary_ids[i] = 0 From 6afb611e96db053b50948167d0644f9beaa0659a Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 17 Nov 2022 10:43:13 -0800 Subject: [PATCH 50/52] Volume bug fix + reflash merging option --- analysis/algorithms/selections/example_nue.py | 41 +++- .../algorithms/selections/flash_matching.py | 228 ++++++++++++++++++ analysis/algorithms/selections/statistics.py | 2 +- .../selections/through_going_muons.py | 4 +- analysis/classes/FlashManager.py | 49 +++- analysis/classes/ui.py | 37 +-- 6 files changed, 339 insertions(+), 22 deletions(-) create mode 100644 analysis/algorithms/selections/flash_matching.py diff --git a/analysis/algorithms/selections/example_nue.py b/analysis/algorithms/selections/example_nue.py index 3e3391b0..5474d11f 100644 --- a/analysis/algorithms/selections/example_nue.py +++ b/analysis/algorithms/selections/example_nue.py @@ -9,6 +9,12 @@ import time import numpy as np +# Setup OpT0finder +import os, sys +sys.path.append('/sdf/group/neutrino/ldomine/OpT0Finder/python') +import flashmatch +from flashmatch import flashmatch, geoalgo + @evaluate(['interactions', 'particles'], mode='per_batch') def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): @@ -16,8 +22,19 @@ def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): interactions, particles = [], [] deghosting = analysis_cfg['analysis']['deghosting'] primaries = analysis_cfg['analysis']['match_primaries'] + enable_flash_matching = analysis_cfg['analysis'].get('enable_flash_matching', False) + ADC_to_MeV = analysis_cfg['analysis'].get('ADC_to_MeV', 1./350.) 
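# Editor's illustration (not part of the patch): a minimal sketch of the `analysis` block that
# would satisfy the configuration lookups used above. All key names are taken from the
# analysis_cfg accesses shown in this selection script; the values are hypothetical placeholders
# and would normally live in the analysis configuration file.
analysis_cfg_example = {
    'analysis': {
        'deghosting': True,               # remove ghost points before building reco objects
        'match_primaries': True,          # used when matching reco interactions to truth
        'enable_flash_matching': True,    # turn on the OpT0Finder-based matching set up here
        'ADC_to_MeV': 1. / 350.,          # charge-to-energy scale handed to get_flash_matches
        'processor_cfg': {}               # forwarded to FullChainEvaluator
    }
}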
+ + processor_cfg = analysis_cfg['analysis'].get('processor_cfg', {}) + if enable_flash_matching: + predictor = FullChainEvaluator(data_blob, res, cfg, processor_cfg, + deghosting=deghosting, + enable_flash_matching=True, + flash_matching_cfg=os.path.join(os.environ['FMATCH_BASEDIR'], "dat/flashmatch_112022.cfg"), + opflash_keys=['opflash_cryoE', 'opflash_cryoW']) + else: + predictor = FullChainEvaluator(data_blob, res, cfg, processor_cfg) - predictor = FullChainEvaluator(data_blob, res, cfg, analysis_cfg) image_idxs = data_blob['index'] print(data_blob['index'], data_blob['run_info']) for idx, index in enumerate(image_idxs): @@ -27,6 +44,11 @@ def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): 'subrun': data_blob['run_info'][idx][1], 'event': data_blob['run_info'][idx][2] } + if enable_flash_matching: + flash_matches_cryoE = predictor.get_flash_matches(idx, use_true_tpc_objects=False, volume=0, + use_depositions_MeV=False, ADC_to_MeV=ADC_to_MeV) + flash_matches_cryoW = predictor.get_flash_matches(idx, use_true_tpc_objects=False, volume=1, + use_depositions_MeV=False, ADC_to_MeV=ADC_to_MeV) # Process Interaction Level Information matches, counts = predictor.match_interactions(idx, @@ -82,6 +104,23 @@ def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): true_int_dict['true_nu_energy'] = nu.energy_init() pred_int_dict['interaction_match_counts'] = counts[i] + + if enable_flash_matching: + volume = true_int.volume if true_int is not None else pred_int.volume + flash_matches = flash_matches_cryoW if volume == 1 else flash_matches_cryoE + pred_int_dict['fmatched'] = False + pred_int_dict['fmatch_time'] = None + pred_int_dict['fmatch_total_pe'] = None + pred_int_dict['fmatch_id'] = None + if pred_int is not None: + for interaction, flash, match in flash_matches: + if interaction.id != pred_int.id: continue + pred_int_dict['fmatched'] = True + pred_int_dict['fmatch_time'] = flash.time() + pred_int_dict['fmatch_total_pe'] = flash.TotalPE() + pred_int_dict['fmatch_id'] = flash.id() + break + interactions_dict = OrderedDict(index_dict.copy()) interactions_dict.update(true_int_dict) interactions_dict.update(pred_int_dict) diff --git a/analysis/algorithms/selections/flash_matching.py b/analysis/algorithms/selections/flash_matching.py new file mode 100644 index 00000000..e9d087ce --- /dev/null +++ b/analysis/algorithms/selections/flash_matching.py @@ -0,0 +1,228 @@ +from collections import OrderedDict +from analysis.algorithms.utils import count_primary_particles, get_particle_properties +from analysis.classes.ui import FullChainEvaluator + +from analysis.decorator import evaluate +from analysis.classes.particle import match_particles_fn, matrix_iou + +from pprint import pprint +import time +import numpy as np +import os, sys + +# Setup OpT0finder +sys.path.append('/sdf/group/neutrino/ldomine/OpT0Finder/python') +import flashmatch +from flashmatch import flashmatch, geoalgo + + +def find_true_time(interaction): + """ + Returns + ======= + Time in us + """ + time = None + for p in interaction.particles: + if not p.is_primary: continue + time = 1e-3 * p.asis.ancestor_t() if time is None else min(time, 1e-3 * p.particle_asis.ancestor_t()) + return time + +def find_true_x(interaction): + """ + Returns + ======= + True vertex x in cm (absolute coordinates) + """ + x = [] + for p in interaction.particles: + if not p.is_primary: continue + x.append(p.asis.x()) + if len(x) == 0: + return None + values, counts = np.unique(x, return_counts=True) + if len(values) > 1: + print("Warning found > 1 
true x in interaction", values, counts) + return values[np.argmax(counts)] + + +@evaluate(['interactions', 'flashes', 'matches'], mode='per_batch') +def flash_matching(data_blob, res, data_idx, analysis_cfg, cfg): + + interactions, flashes, matches = [], [], [] + deghosting = analysis_cfg['analysis']['deghosting'] + primaries = analysis_cfg['analysis']['drop_nonprimary_particles'] + use_true_tpc_objects = analysis_cfg['analysis'].get('use_true_tpc_objects', False) + use_depositions_MeV = analysis_cfg['analysis'].get('use_depositions_MeV', False) + ADC_to_MeV = analysis_cfg['analysis'].get('ADC_to_MeV', 1./350.) + + processor_cfg = analysis_cfg['analysis'].get('processor_cfg', {}) + predictor = FullChainEvaluator(data_blob, res, cfg, processor_cfg, + deghosting=deghosting, + enable_flash_matching=True, + flash_matching_cfg=os.path.join(os.environ['FMATCH_BASEDIR'], "dat/flashmatch_112022.cfg"), + opflash_keys=['opflash_cryoE', 'opflash_cryoW']) + + image_idxs = data_blob['index'] + print(data_idx, data_blob['index'], data_blob['run_info']) + for idx, index in enumerate(image_idxs): + index_dict = { + 'Index': index, + 'run': data_blob['run_info'][idx][0], + 'subrun': data_blob['run_info'][idx][1], + 'event': data_blob['run_info'][idx][2] + } + meta = data_blob['meta'][idx] + + all_times_cryoE, all_times_cryoW = [], [] + for flash in data_blob['opflash_cryoE'][idx]: + all_times_cryoE.append(flash.time()) + for flash in data_blob['opflash_cryoW'][idx]: + all_times_cryoW.append(flash.time()) + ordered_flashes_cryoE = np.array(data_blob['opflash_cryoE'][idx])[np.argsort(all_times_cryoE)] + ordered_flashes_cryoW = np.array(data_blob['opflash_cryoW'][idx])[np.argsort(all_times_cryoW)] + + prev_flash_time, next_flash_time = {}, {} + for flash_idx, flash in enumerate(ordered_flashes_cryoE): + if flash_idx > 0: + prev_flash_time[(0, flash.id())] = ordered_flashes_cryoE[flash_idx-1].time() + else: + prev_flash_time[(0, flash.id())] = None + if flash_idx < len(ordered_flashes_cryoE)-1: + next_flash_time[(0, flash.id())] = ordered_flashes_cryoE[flash_idx+1].time() + else: + next_flash_time[(0, flash.id())] = None + for flash_idx, flash in enumerate(ordered_flashes_cryoW): + if flash_idx > 0: + prev_flash_time[(1, flash.id())] = ordered_flashes_cryoW[flash_idx-1].time() + else: + prev_flash_time[(1, flash.id())] = None + if flash_idx < len(ordered_flashes_cryoW)-1: + next_flash_time[(1, flash.id())] = ordered_flashes_cryoW[flash_idx+1].time() + else: + next_flash_time[(1, flash.id())] = None + + flash_matches_cryoE = predictor.get_flash_matches(idx, use_true_tpc_objects=use_true_tpc_objects, volume=0, + use_depositions_MeV=use_depositions_MeV, ADC_to_MeV=ADC_to_MeV) + flash_matches_cryoW = predictor.get_flash_matches(idx, use_true_tpc_objects=use_true_tpc_objects, volume=1, + use_depositions_MeV=use_depositions_MeV, ADC_to_MeV=ADC_to_MeV) + + matched_interactions = None + if not use_true_tpc_objects: + matched_interactions = predictor.match_interactions(idx, + mode='pred_to_true', drop_nonprimary_particles=primaries, match_particles=True) + + interaction_ids, flash_ids = [], [] + for interaction, flash, match in flash_matches_cryoE + flash_matches_cryoW: + interaction_ids.append(interaction.id) + flash_ids.append(flash.id()) + + interaction_dict = OrderedDict(index_dict.copy()) + + interaction_dict['interaction_id'] = interaction.id + interaction_dict['size'] = interaction.size + interaction_dict['num_particles'] = interaction.num_particles + interaction_dict['interaction_min_x'] = interaction.points[:, 
0].min() + interaction_dict['interaction_max_x'] = interaction.points[:, 0].max() + interaction_dict['interaction_min_y'] = interaction.points[:, 1].min() + interaction_dict['interaction_max_y'] = interaction.points[:, 1].max() + interaction_dict['interaction_min_z'] = interaction.points[:, 2].min() + interaction_dict['interaction_max_z'] = interaction.points[:, 2].max() + interaction_dict['interaction_edep'] = interaction.depositions.sum() + interaction_dict['fmatched'] = True + interaction_dict['volume'] = interaction.volume + + if not use_true_tpc_objects: # Using TruthInteraction + for pred_int, true_int in matched_interactions: + if pred_int.id != interaction.id: continue + if true_int is None: + interaction_dict['matched'] = False + interaction_dict['true_time'] = None + interaction_dict['true_x'] = None + else: + interaction_dict['matched'] = True + interaction_dict['true_time'] = find_true_time(true_int) + interaction_dict['true_x'] = find_true_x(true_int) + else: + interaction_dict['true_time'] = find_true_time(interaction) + interaction_dict['true_x'] = find_true_x(interaction) + interaction_dict['interaction_edep_MeV'] = interaction.depositions_MeV.sum() + + flash_dict = OrderedDict(index_dict.copy()) + + flash_dict['flash_id'] = flash.id() + flash_dict['time'] = flash.time() + flash_dict['total_pe'] = flash.TotalPE() + flash_dict['abstime'] = flash.absTime() + flash_dict['time_width'] = flash.timeWidth() + flash_dict['fmatched'] = True + flash_dict['volume'] = interaction.volume + flash_dict['prev_flash_time'] = prev_flash_time[(interaction.volume, flash.id())] + flash_dict['next_flash_time'] = next_flash_time[(interaction.volume, flash.id())] + + interactions.append(interaction_dict) + flashes.append(flash_dict) + match_dict = flash_dict.copy() + match_dict.update(interaction_dict) + match_dict['fmatch_score'] = match.score + # Convert from absolute cm to voxel coordinates + match_dict['fmatch_x'] = (match.tpc_point.x - meta[0]) / meta[6] + match_dict['hypothesis_total_pe'] = np.sum(match.hypothesis) + matches.append(match_dict) + + if use_true_tpc_objects: + all_interactions = predictor.get_true_interactions(idx, drop_nonprimary_particles=primaries) + else: + all_interactions = predictor.get_interactions(idx, drop_nonprimary_particles=primaries) + + for interaction in all_interactions: + if interaction.id in interaction_ids: continue + + interaction_dict = OrderedDict(index_dict.copy()) + interaction_dict['interaction_id'] = interaction.id + interaction_dict['size'] = interaction.size + interaction_dict['num_particles'] = interaction.num_particles + interaction_dict['interaction_min_x'] = interaction.points[:, 0].min() + interaction_dict['interaction_max_x'] = interaction.points[:, 0].max() + interaction_dict['interaction_min_y'] = interaction.points[:, 1].min() + interaction_dict['interaction_max_y'] = interaction.points[:, 1].max() + interaction_dict['interaction_min_z'] = interaction.points[:, 2].min() + interaction_dict['interaction_max_z'] = interaction.points[:, 2].max() + interaction_dict['interaction_edep'] = interaction.depositions.sum() + interaction_dict['fmatched'] = False + + if not use_true_tpc_objects: # Using TruthInteraction + for pred_int, true_int in matched_interactions: + if pred_int.id != interaction.id: continue + if true_int is None: + interaction_dict['matched'] = False + interaction_dict['true_time'] = None + interaction_dict['true_x'] = None + else: + interaction_dict['matched'] = True + interaction_dict['true_time'] = find_true_time(true_int) + 
interaction_dict['true_x'] = find_true_x(true_int) + else: + interaction_dict['true_time'] = find_true_time(interaction) + interaction_dict['true_x'] = find_true_x(interaction) + interaction_dict['interaction_edep_MeV'] = interaction.depositions_MeV.sum() + interactions.append(interaction_dict) + + volume = [0] * len(data_blob['opflash_cryoE'][idx]) + volume += [1] * len(data_blob['opflash_cryoW'][idx]) + for flash_idx, flash in enumerate(data_blob['opflash_cryoE'][idx] + data_blob['opflash_cryoW'][idx]): + if flash.id() in flash_ids: continue + flash_dict = OrderedDict(index_dict.copy()) + + flash_dict['flash_id'] = flash.id() + flash_dict['time'] = flash.time() + flash_dict['total_pe'] = flash.TotalPE() + flash_dict['abstime'] = flash.absTime() + flash_dict['time_width'] = flash.timeWidth() + flash_dict['fmatched'] = False + flash_dict['volume'] = volume[flash_idx] + flash_dict['prev_flash_time'] = prev_flash_time[(volume[flash_idx], flash.id())] + flash_dict['next_flash_time'] = next_flash_time[(volume[flash_idx], flash.id())] + flashes.append(flash_dict) + + return [interactions, flashes, matches] #[interactions, flashes] diff --git a/analysis/algorithms/selections/statistics.py b/analysis/algorithms/selections/statistics.py index c0b5db09..dd1de187 100644 --- a/analysis/algorithms/selections/statistics.py +++ b/analysis/algorithms/selections/statistics.py @@ -28,7 +28,7 @@ def statistics(data_blob, res, data_idx, analysis_cfg, cfg): bin_size = processor_cfg.get('bin_size', 17) # 5cm # Initialize analysis differently depending on data/MC setting - predictor = FullChainPredictor(data_blob, res, cfg, analysis_cfg, deghosting=deghosting) + predictor = FullChainPredictor(data_blob, res, cfg, processor_cfg, deghosting=deghosting) image_idxs = data_blob['index'] pca = PCA(n_components=2) diff --git a/analysis/algorithms/selections/through_going_muons.py b/analysis/algorithms/selections/through_going_muons.py index 589ca9e9..f91ba11f 100644 --- a/analysis/algorithms/selections/through_going_muons.py +++ b/analysis/algorithms/selections/through_going_muons.py @@ -123,9 +123,9 @@ def through_going_muons(data_blob, res, data_idx, analysis_cfg, cfg): # # Initialize analysis differently depending on data/MC setting if not data: - predictor = FullChainEvaluator(data_blob, res, cfg, analysis_cfg, deghosting=deghosting) + predictor = FullChainEvaluator(data_blob, res, cfg, processor_cfg, deghosting=deghosting) else: - predictor = FullChainPredictor(data_blob, res, cfg, analysis_cfg, deghosting=deghosting) + predictor = FullChainPredictor(data_blob, res, cfg, processor_cfg, deghosting=deghosting) image_idxs = data_blob['index'] diff --git a/analysis/classes/FlashManager.py b/analysis/classes/FlashManager.py index 65c3fc81..9c9a0cd8 100644 --- a/analysis/classes/FlashManager.py +++ b/analysis/classes/FlashManager.py @@ -1,4 +1,5 @@ import os, sys +import numpy as np class FlashManager: @@ -7,7 +8,7 @@ class FlashManager: See https://github.com/drinkingkazu/OpT0Finder for more details about it. """ - def __init__(self, cfg, cfg_fmatch, meta=None, detector_specs=None): + def __init__(self, cfg, cfg_fmatch, meta=None, detector_specs=None, reflash_merging_window=None): """ Expects that the environment variable `FMATCH_BASEDIR` is set. 
You can either set it by hand (to the path where one can find @@ -80,6 +81,8 @@ def __init__(self, cfg, cfg_fmatch, meta=None, detector_specs=None): self.all_matches = None self.pmt_v, self.tpc_v = None, None + self.reflash_merging_window = reflash_merging_window + def get_flash(self, flash_id, array=False): from flashmatch import flashmatch @@ -166,12 +169,13 @@ def make_flash(self, larcv_flashes): for branch in larcv_flashes: flashes.extend(branch) - pmt_v = [] + pmt_v, times = [], [] for idx, f in enumerate(flashes): # f is an object of type larcv::Flash flash = flashmatch.Flash_t() flash.idx = f.id() # Assign a unique index flash.time = f.time() # Flash timing, a candidate T0 + times.append(flash.time) # Assign the flash position and error on this position flash.x, flash.y, flash.z = 0, 0, 0 @@ -187,9 +191,50 @@ def make_flash(self, larcv_flashes): pmt_v.append(flash) #if self.pmt_v is not None: # print("Warning: overwriting internal list of flashes.") + if self.reflash_merging_window is not None: + # then proceed to merging close flashes + perm = np.argsort(times) + pmt_v = np.array(pmt_v)[perm] + final_pmt_v = [pmt_v[0]] + is_merging = False + for idx, flash in enumerate(pmt_v[1:]): + if flash.time - final_pmt_v[-1].time < self.reflash_merging_window: + new_flash = self.merge_flashes(flash, final_pmt_v[-1]) + final_pmt_v[-1] = new_flash + else: + final_pmt_v.append(flash) + pmt_v = final_pmt_v + self.pmt_v = pmt_v return pmt_v + def merge_flashes(self, a, b): + """ + Util to merge 2 flashmatch::Flash_t objects on the fly. + + Final time is minimum of both times. Final PE count per + photodetectors is the sum between the 2 flashes. + + Parameters + ========== + a: flashmatch::Flash_t + b: flashmatch::Flash_t + + Returns + ======= + flashmatch::Flash_t + """ + from flashmatch import flashmatch + flash = flashmatch.Flash_t() + flash.idx = min(a.idx, b.idx) + flash.time = min(a.time, b.time) + flash.x, flash.y, flash.z = min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) + flash.x_err, flash.y_err, flash.z_err = min(a.x_err, b.x_err), min(a.y_err, b.y_err), min(a.z_err, b.z_err) + for i in range(180): + flash.pe_v.push_back(a.pe_v[i] + b.pe_v[i]) + flash.pe_err_v.push_back(a.pe_err_v[i] + b.pe_err_v[i]) + return flash + def run_flash_matching(self, flashes=None, interactions=None, **kwargs): if self.tpc_v is None: if interactions is None: diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index c94d0d05..2f16fa34 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -73,7 +73,7 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False, self.num_images = len(data_blob['input_data']) self.index = self.data_blob['index'] - self.spatial_size = predictor_cfg.get('spatial_size', 768) + # self.spatial_size = predictor_cfg.get('spatial_size', 768) # For matching particles and interactions self.min_overlap_count = predictor_cfg.get('min_overlap_count', 0) # Idem, can be 'count' or 'iou' @@ -89,6 +89,8 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False, self.batch_mask = self.data_blob['input_data'] + # This is used to apply fiducial volume cuts. + # Min/max boundaries in each dimension haev to be specified. 
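# Editor's illustration (not part of the patch): one way per-dimension min/max boundaries can be
# turned into a fiducial-volume mask. The [min, max]-per-axis format and the numbers below are
# assumptions made for this sketch only; when nothing is specified, the class falls back to the
# ICARUS Cryo 0 default defined just below.
import numpy as np
boundaries = np.array([[0., 350.], [0., 350.], [0., 750.]])   # hypothetical [min, max] per axis
points = np.array([[10., 20., 30.], [400., 20., 30.]])        # two example 3D points
fiducial_mask = np.all((points >= boundaries[:, 0]) & (points <= boundaries[:, 1]), axis=1)
# fiducial_mask -> array([ True, False]): only the first point lies inside the volume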
self.volume_boundaries = predictor_cfg.get('volume_boundaries', None) if self.volume_boundaries is None: # Using ICARUS Cryo 0 as a default @@ -108,6 +110,8 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False, # Determine whether we need to account for several distinct volumes # split over "virtual" batch ids + # Note this is different from "self.volume_boundaries" above + # FIXME rename one or the other to be clearer boundaries = cfg['iotool'].get('collate', {}).get('boundaries', None) if boundaries is not None: self.vb = VolumeBoundaries(boundaries) @@ -120,6 +124,8 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False, self.enable_flash_matching = enable_flash_matching self.fm = None if enable_flash_matching: + reflash_merging_window = predictor_cfg.get('reflash_merging_window', None) + if 'meta' not in self.data_blob: raise Exception('Meta unspecified in data_blob. Please add it to your I/O schema.') #if 'FMATCH_BASEDIR' not in os.environ: @@ -127,7 +133,7 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False, assert os.path.exists(flash_matching_cfg) assert len(opflash_keys) == self._num_volumes - self.fm = FlashManager(cfg, flash_matching_cfg, meta=self.data_blob['meta'][0]) + self.fm = FlashManager(cfg, flash_matching_cfg, meta=self.data_blob['meta'][0], reflash_merging_window=reflash_merging_window) self.opflash_keys = opflash_keys self.flash_matches = {} # key is (entry, volume, use_true_tpc_objects), value is tuple (tpc_v, pmt_v, list of matches) @@ -610,7 +616,7 @@ def get_fragments(self, entry, only_primaries=False, out_fragment_list = [] for entry in entries: - volume = entry % self._num_volumes if volume is not None else volume + volume = entry % self._num_volumes point_cloud = self.data_blob['input_data'][entry][:, 1:4] depositions = self.result['input_rescaled'][entry][:, 4] @@ -759,7 +765,7 @@ def get_particles(self, entry, only_primaries=True, out_particle_list = [] for entry in entries: - volume = entry % self._num_volumes if volume is not None else volume + volume = entry % self._num_volumes point_cloud = self.data_blob['input_data'][entry][:, 1:4] depositions = self.result['input_rescaled'][entry][:, 4] @@ -878,7 +884,7 @@ def get_interactions(self, entry, drop_nonprimary_particles=True, volume=None) - out_interaction_list = [] for e in entries: - volume = e % self._num_volumes if volume is not None else volume + volume = e % self._num_volumes if self.vb is not None else volume particles = self.get_particles(entry, only_primaries=drop_nonprimary_particles, volume=volume) out = group_particles_to_interactions_fn(particles) for ia in out: @@ -1118,7 +1124,7 @@ def get_true_fragments(self, entry, verbose=False, volume=None) -> List[TruthPar out_fragments_list = [] for entry in entries: - volume = entry % self._num_volumes if volume is not None else volume + volume = entry % self._num_volumes # Both are "adapted" labels labels = self.data_blob['cluster_label'][entry] @@ -1230,7 +1236,7 @@ def get_true_particles(self, entry, only_primaries=True, out_particles_list = [] global_entry = entry for entry in entries: - volume = entry % self._num_volumes if volume is not None else volume + volume = entry % self._num_volumes labels = self.data_blob['cluster_label'][entry] if self.deghosting: @@ -1373,7 +1379,7 @@ def get_true_interactions(self, entry, drop_nonprimary_particles=True, entries = self._get_entries(entry, volume) out_interactions_list = [] for e in entries: - volume = e % 
self._num_volumes if volume is not None else volume + volume = e % self._num_volumes if self.vb is not None else volume true_particles = self.get_true_particles(entry, only_primaries=drop_nonprimary_particles, volume=volume) out = group_particles_to_interactions_fn(true_particles, get_nu_id=True, mode='truth') @@ -1404,7 +1410,7 @@ def get_true_vertices(self, entry, volume=None): entries = self._get_entries(entry, volume) out = {} for entry in entries: - volume = entry % self._num_volumes if volume is not None else volume + volume = entry % self._num_volumes if self.vb is not None else volume inter_idxs = np.unique( self.data_blob['cluster_label'][entry][:, 7].astype(int)) for inter_idx in inter_idxs: @@ -1439,7 +1445,7 @@ def match_particles(self, entry, entries = self._get_entries(entry, volume) all_matches = [] for e in entries: - volume = e % self._num_volumes if volume is not None else volume + volume = e % self._num_volumes if self.vb is not None else volume if mode == 'pred_to_true': # Match each pred to one in true particles_from = self.get_particles(entry, only_primaries=only_primaries, volume=volume) @@ -1451,10 +1457,9 @@ def match_particles(self, entry, else: raise ValueError("Mode {} is not valid. For matching each"\ " prediction to truth, use 'pred_to_true' (and vice versa).".format(mode)) + all_kwargs = {"min_overlap": self.min_overlap_count, "overlap_mode": self.overlap_mode, **kwargs} matched_pairs, _, _ = match_particles_fn(particles_from, particles_to, - min_overlap=self.min_overlap_count, - overlap_mode=self.overlap_mode, - **kwargs) + **all_kwargs) all_matches.extend(matched_pairs) return all_matches @@ -1485,7 +1490,7 @@ def match_interactions(self, entry, mode='pred_to_true', entries = self._get_entries(entry, volume) all_matches, all_counts = [], [] for e in entries: - volume = e % self._num_volumes if volume is not None else volume + volume = e % self._num_volumes if self.vb is not None else volume if mode == 'pred_to_true': ints_from = self.get_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles, volume=volume) ints_to = self.get_true_interactions(entry, drop_nonprimary_particles=drop_nonprimary_particles, volume=volume) @@ -1496,9 +1501,9 @@ def match_interactions(self, entry, mode='pred_to_true', raise ValueError("Mode {} is not valid. 
For matching each"\ " prediction to truth, use 'pred_to_true' (and vice versa).".format(mode)) + all_kwargs = {"min_overlap": self.min_overlap_count, **kwargs} matched_interactions, _, counts = match_interactions_fn(ints_from, ints_to, - min_overlap=self.min_overlap_count, - **kwargs) + **all_kwargs) if match_particles: for interactions in matched_interactions: From 40ba7e0a52df91cb743cbdcda2d849d05d838647 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 17 Nov 2022 15:56:57 -0800 Subject: [PATCH 51/52] Add option in standalone GrapPA to break individual input clusters into fragments --- mlreco/models/grappa.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/mlreco/models/grappa.py b/mlreco/models/grappa.py index 1631debd..97f9c335 100644 --- a/mlreco/models/grappa.py +++ b/mlreco/models/grappa.py @@ -142,6 +142,7 @@ def __init__(self, cfg, name='grappa', batch_col=0, coords_col=(1, 4)): self.opt_dir_max_dist = self.dir_max_dist == 'optimize' self.add_local_dedxs = base_config.get('add_local_dedxs', False) self.dedx_max_dist = base_config.get('dedx_max_dist', 5) + self.break_clusters = base_config.get('break_clusters', False) self.shuffle_clusters = base_config.get('shuffle_clusters', False) # *Deprecated* but kept for backward compatibility: @@ -283,6 +284,15 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, self.node_min_size, self.source_col, cluster_classes=self.node_type) + if self.break_clusters: + from sklearn.cluster import DBSCAN + dbscan = DBSCAN(eps=1.1, min_samples=1, metric='chebyshev') + broken_clusts = [] + for c in clusts: + labels = dbscan.fit(cluster_data[c, self.coords_index[0]:self.coords_index[1]].detach().cpu().numpy()).labels_ + for l in np.unique(labels): + broken_clusts.append(c[labels==l]) + clusts = broken_clusts # If requested, shuffle the order in which the clusters are listed (used for debugging) if self.shuffle_clusters: @@ -311,7 +321,6 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, batch_ids = get_cluster_batch(cluster_data, clusts, batch_index=self.batch_index) clusts_split, cbids = split_clusts(clusts, batch_ids, batches, bcounts) - result['clusts'] = [clusts_split] # If necessary, compute the cluster distance matrix dist_mat = None @@ -348,12 +357,12 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, edge_index = restrict_graph(edge_index, dist_mat, self.edge_max_dist) else: # Here get_cluster_primary_label is used to ensure that Michel/Delta showers are given the appropriate semantic label - classes = extra_feats[:,-1].cpu().numpy().astype(int) if extra_feats is not None else get_cluster_primary_label(cluster_data, clusts, -1).astype(int) + if self.source_col == 5: classes = extra_feats[:,-1].cpu().numpy().astype(int) if extra_feats is not None else get_cluster_label(cluster_data, clusts, -1).astype(int) + if self.source_col == 6: classes = extra_feats[:,-1].cpu().numpy().astype(int) if extra_feats is not None else get_cluster_primary_label(cluster_data, clusts, -1).astype(int) edge_index = restrict_graph(edge_index, dist_mat, self.edge_max_dist, classes) # Update result with a list of edges for each batch id edge_index_split, ebids = split_edge_index(edge_index, batch_ids, batches) - result['edge_index'] = [edge_index_split] # Obtain node and edge features x = self.node_encoder(cluster_data, clusts) @@ -376,9 +385,9 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, 
else: x = torch.cat([x, dirs_start.float()], dim=1) if self.add_local_dedxs: - dedxs_start = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,:3], clusts, self.dedx_max_dir) + dedxs_start = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,:3], clusts, self.dedx_max_dist) if self.add_local_dedxs != 'start': - dedxs_end = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,3:6], clusts, self.dedx_max_dir) + dedxs_end = get_cluster_dedxs(cluster_data[:, self.coords_index[0]:self.coords_index[1]], cluster_data[:,4], points[:,3:6], clusts, self.dedx_max_dist) x = torch.cat([x, dedxs_start.reshape(-1,1).float(), dedxs_end.reshape(-1,1).float()], dim=1) else: x = torch.cat([x, dedxs_start.reshape(-1,1).float()], dim=1) From c13cc8caa0d9ca684c4bb1f35cadd38884f6bbbf Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 17 Nov 2022 16:05:03 -0800 Subject: [PATCH 52/52] Added option to restrict the maximum GrapPA input graph size (to deal with memory). Default: 2e6 edges --- mlreco/models/grappa.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mlreco/models/grappa.py b/mlreco/models/grappa.py index 97f9c335..b8fc5491 100644 --- a/mlreco/models/grappa.py +++ b/mlreco/models/grappa.py @@ -160,6 +160,7 @@ def __init__(self, cfg, name='grappa', batch_col=0, coords_col=(1, 4)): self.edge_max_dist = base_config.get('edge_max_dist', -1) self.edge_dist_metric = base_config.get('edge_dist_metric', 'voxel') self.edge_knn_k = base_config.get('edge_knn_k', 5) + self.edge_max_count = base_config.get('edge_max_count', 2e6) # Turn the edge_max_dist value into a matrix if not isinstance(self.edge_max_dist, list): self.edge_max_dist = [self.edge_max_dist] @@ -321,6 +322,11 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, batch_ids = get_cluster_batch(cluster_data, clusts, batch_index=self.batch_index) clusts_split, cbids = split_clusts(clusts, batch_ids, batches, bcounts) + result['clusts'] = [clusts_split] + if self.edge_max_count > -1: + _, cnts = np.unique(batch_ids, return_counts=True) + if np.sum([c*(c-1) for c in cnts]) > 2*self.edge_max_count: + return result # If necessary, compute the cluster distance matrix dist_mat = None @@ -363,6 +369,9 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, # Update result with a list of edges for each batch id edge_index_split, ebids = split_edge_index(edge_index, batch_ids, batches) + result['edge_index'] = [edge_index_split] + if edge_index.shape[1] > self.edge_max_count: + return result # Obtain node and edge features x = self.node_encoder(cluster_data, clusts)
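# Editor's illustration (not part of the patch): how the edge_max_count guard above estimates the
# size of a complete graph. Each image (batch id) with c clusters contributes c*(c-1) directed
# edges, so the total is the sum of c*(c-1) over batches. The example numbers are hypothetical.
import numpy as np

def dense_edge_count(batch_ids):
    """Number of directed edges if every cluster pair within a batch is connected."""
    _, cnts = np.unique(batch_ids, return_counts=True)
    return int(np.sum([c * (c - 1) for c in cnts]))

batch_ids = np.repeat([0, 1], [1000, 2000])   # two images with 1000 and 2000 clusters
n_edges = dense_edge_count(batch_ids)         # 1000*999 + 2000*1999 = 4,997,000
# The guard in the patch compares this count to 2*edge_max_count, so with the default
# edge_max_count of 2e6 this example (4,997,000 > 4,000,000) would trigger the early return,
# and the same cap is applied again to edge_index.shape[1] after the graph is restricted.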