From 6cb2f33a116aaa7a192315ae5b546d5f33fe04ce Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Mon, 30 May 2022 21:13:43 -0700 Subject: [PATCH 01/30] Modified GrapPA to always include 6 features for end points, regardless of semantic type --- mlreco/models/grappa.py | 2 +- mlreco/utils/gnn/cluster.py | 38 +++++++++++++++---------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/mlreco/models/grappa.py b/mlreco/models/grappa.py index 9efffd90..9c013d8b 100644 --- a/mlreco/models/grappa.py +++ b/mlreco/models/grappa.py @@ -347,7 +347,7 @@ def forward(self, data, clusts=None, groups=None, points=None, extra_feats=None, # Add start point and/or start direction to node features if requested if self.add_start_point or points is not None: if points is None: - points = get_cluster_points_label(cluster_data, particles, clusts, self.source_col==6, coords_index=self.coords_index) + points = get_cluster_points_label(cluster_data, particles, clusts, coords_index=self.coords_index) x = torch.cat([x, points.float()], dim=1) if self.add_start_dir: dirs = get_cluster_directions(cluster_data[:, self.coords_index[0]:self.coords_index[1]], points[:,:3], clusts, self.start_dir_max_dist, self.start_dir_opt) diff --git a/mlreco/utils/gnn/cluster.py b/mlreco/utils/gnn/cluster.py index a1f84280..fc005568 100644 --- a/mlreco/utils/gnn/cluster.py +++ b/mlreco/utils/gnn/cluster.py @@ -361,23 +361,22 @@ def _get_cluster_features_extended(data: nb.float64[:,:], @numba_wrapper(cast_args=['data','particles'], list_args=['clusts','coords_index'], keep_torch=True, ref_arg='data') -def get_cluster_points_label(data, particles, clusts, groupwise, batch_col=0, coords_index=[1, 4]): +def get_cluster_points_label(data, particles, clusts, random_order=True, batch_col=0, coords_index=[1, 4]): """ Function that gets label points for each cluster. - - If fragments (groupwise=False), returns start point only - - If particle instance (groupwise=True), returns start point of primary shower fragment - twice if shower, delta or Michel and end points of tracks if track. + Returns start point of primary shower fragment twice if shower, delta or Michel + and end points of tracks if track. 
Args: data (torch.tensor) : (N,6) Voxel coordinates [x, y, z, batch_id, value, clust_id, group_id] particles (torch.tensor): (N,9) Point coordinates [start_x, start_y, start_z, batch_id, last_x, last_y, last_z, start_t, shape_id] (obtained with parse_particle_coords) clusts ([np.ndarray]) : (C) List of arrays of voxel IDs in each cluster - groupwise (bool) : Whether or not to get a single point per group (merges shower fragments) + random_order (bool) : Whether or not to shuffle the start and end points randomly Returns: np.ndarray: (N,3/6) particle wise start (and end points in RANDOMIZED ORDER) """ - return _get_cluster_points_label(data, particles, clusts, groupwise, + return _get_cluster_points_label(data, particles, clusts, random_order, batch_col=batch_col, coords_index=coords_index) @@ -385,26 +384,19 @@ def get_cluster_points_label(data, particles, clusts, groupwise, batch_col=0, co def _get_cluster_points_label(data: nb.float64[:,:], particles: nb.float64[:,:], clusts: nb.types.List(nb.int64[:]), - groupwise: nb.boolean = False, + random_order: nb.boolean = True, batch_col: nb.int64 = 0, coords_index: nb.types.List(nb.int64[:]) = [1, 4]) -> nb.float64[:,:]: - # Get batch_ids and group_ids + + # Get start and end points (one and the same for all but track class) batch_ids = _get_cluster_batch(data, clusts) - if not groupwise: - points = np.empty((len(clusts), 3), dtype=data.dtype) - clust_ids = _get_cluster_label(data, clusts) - for i, c in enumerate(clusts): - batch_mask = np.where(particles[:,batch_col] == batch_ids[i])[0] - idx = batch_mask[clust_ids[i]] - points[i] = particles[idx, coords_index[0]:coords_index[1]] - else: - points = np.empty((len(clusts), 6), dtype=data.dtype) - for i, g in enumerate(clusts): # Here clusters are groups - batch_mask = np.where(particles[:,batch_col] == batch_ids[i])[0] - clust_ids = np.unique(data[g,5]).astype(np.int64) - minid = np.argmin(particles[batch_mask][clust_ids,-2]) # Pick the first cluster in time - order = np.array([0, 1, 2, 4, 5, 6]) if np.random.choice(2) else np.array([4, 5, 6, 0, 1, 2]) - points[i] = particles[batch_mask][clust_ids[minid]][order] + points = np.empty((len(clusts), 6), dtype=data.dtype) + for i, c in enumerate(clusts): # Here clusters are groups + batch_mask = np.where(particles[:,batch_col] == batch_ids[i])[0] + clust_ids = np.unique(data[c, 5]).astype(np.int64) + minid = np.argmin(particles[batch_mask][clust_ids,-2]) # Pick the first cluster in time + order = np.array([0, 1, 2, 4, 5, 6]) if (np.random.choice(2) or not random_order) else np.array([4, 5, 6, 0, 1, 2]) + points[i] = particles[batch_mask][clust_ids[minid]][order] # Bring the start points to the closest point in the corresponding cluster for i, c in enumerate(clusts): From b520fd2b7531c73d051bd03c6172e5ce5122aceb Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 21 Jun 2022 09:57:05 -0700 Subject: [PATCH 02/30] Ensure that loss is not applied in GraPA to nodes or edges associated with undefined clusters --- mlreco/models/grappa.py | 1 - .../models/layers/gnn/losses/edge_channel.py | 25 +++++++------ .../layers/gnn/losses/node_kinematics.py | 36 +++++++++++-------- .../models/layers/gnn/losses/node_primary.py | 16 +++++---- mlreco/models/layers/gnn/losses/node_type.py | 5 ++- 5 files changed, 48 insertions(+), 35 deletions(-) diff --git a/mlreco/models/grappa.py b/mlreco/models/grappa.py index 9c013d8b..dba0a98c 100644 --- a/mlreco/models/grappa.py +++ b/mlreco/models/grappa.py @@ -1,4 +1,3 @@ -from operator import xor import random import 
torch import numpy as np diff --git a/mlreco/models/layers/gnn/losses/edge_channel.py b/mlreco/models/layers/gnn/losses/edge_channel.py index f2014c58..93dc6ec0 100644 --- a/mlreco/models/layers/gnn/losses/edge_channel.py +++ b/mlreco/models/layers/gnn/losses/edge_channel.py @@ -93,18 +93,23 @@ def forward(self, out, clusters, graph=None): if not edge_pred.shape[0]: continue edge_index = out['edge_index'][i][j] - clusts = out['clusts'][i][j] - group_ids = get_cluster_label(labels, clusts, self.target_col) + clusts = out['clusts'][i][j] + clust_ids = get_cluster_label(labels, clusts, self.source_col) + group_ids = get_cluster_label(labels, clusts, self.target_col) - # If high purity is requested, remove edges in groups without a primary + # If a cluster target is -1, none of its edges contribute to the loss + valid_clust_mask = group_ids > -1 + valid_mask = np.all(valid_clust_mask[edge_index], axis = -1) + + # If high purity is requested, remove edges in groups without a single primary if self.high_purity: - clust_ids = get_cluster_label(labels, clusts, self.source_col) - primary_ids = get_cluster_label(labels, clusts, self.primary_col) - purity_mask = edge_purity_mask(edge_index, clust_ids, group_ids, primary_ids) - if not purity_mask.any(): - continue - edge_index = edge_index[purity_mask] - edge_pred = edge_pred[np.where(purity_mask)[0]] + primary_ids = get_cluster_label(labels, clusts, self.primary_col) + valid_mask &= edge_purity_mask(edge_index, clust_ids, group_ids, primary_ids) + + # Apply valid mask to edges and their predictions + if not valid_mask.any(): continue + edge_index = edge_index[valid_mask] + edge_pred = edge_pred[np.where(valid_mask)[0]] # Use group information or particle tree to determine the true edge assigment if self.target == 'group': diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index b49e130e..a3a391e5 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -197,14 +197,20 @@ def forward(self, out, types): node_pred_p = out['node_pred_p'][i][j] if not node_pred_p.shape[0]: continue - node_assn_p = get_momenta_label(labels, clusts, column=self.momentum_col) - loss = self.reg_lossfn(node_pred_p.squeeze(), node_assn_p.float()) - total_loss += loss - p_loss += float(loss) - # Increment the number of nodes - n_clusts_momentum += len(clusts) + # Do not apply loss to nodes labeled -1 (unknown class) + node_mask = torch.nonzero(node_assn_p > -1, as_tuple=True)[0] + if len(node_mask): + node_pred_p = node_pred_p[node_mask] + node_assn_p = node_assn_p[node_mask] + + loss = self.reg_lossfn(node_pred_p.squeeze(), node_assn_p.float()) + total_loss += loss + p_loss += float(loss) + + # Increment the number of nodes + n_clusts_momentum += len(clusts) if compute_vtx: node_pred_vtx = out['node_pred_vtx'][i][j] @@ -265,7 +271,7 @@ def forward(self, out, types): pos_pred = pos_pred + anchors vertex_labels.append(pos_label) - loss1 = torch.mean(torch.clamp(torch.sum(self.vtx_position_loss(pos_pred, pos_label), dim=1), + loss1 = torch.mean(torch.clamp(torch.sum(self.vtx_position_loss(pos_pred, pos_label), dim=1), max=self.max_vertex_distance**2)) # print(loss1, pos_pred) @@ -273,7 +279,7 @@ def forward(self, out, types): # loss1 = torch.sum(torch.mean(self.vtx_position_loss(pos_pred, pos_label), dim=1)) # print(loss1, loss2) - + total_loss += loss1 + loss2 vtx_position_loss += float(loss1) @@ -674,7 +680,7 @@ def __init__(self, loss_config, 
**kwargs): def compute_type(self, node_pred_type, labels, clusts, iteration=None): ''' - Compute particle classification type loss. + Compute particle classification type loss. ''' # assert False total_loss, n_clusts_type, type_loss = 0, 0, 0 @@ -715,14 +721,14 @@ def compute_type(self, node_pred_type, labels, clusts, iteration=None): return total_loss, type_loss, n_clusts_type, type_acc def compute_vertex(self, node_pred_vtx, labels, clusts): - + node_x_vtx = get_cluster_label(labels, clusts, column=self.vtx_col) node_y_vtx = get_cluster_label(labels, clusts, column=self.vtx_col+1) node_z_vtx = get_cluster_label(labels, clusts, column=self.vtx_col+2) node_assn_vtx = torch.tensor(np.stack([node_x_vtx, node_y_vtx, node_z_vtx], axis=1), dtype=torch.float, device=node_pred_vtx.device, requires_grad=False) - + vtx_position_loss = 0 vtx_score_loss = 0 loss = 0 @@ -732,7 +738,7 @@ def compute_vertex(self, node_pred_vtx, labels, clusts): vtx_score_acc = 0 # Select primaries for vertex regression - select_primaries = get_cluster_label(labels, clusts, + select_primaries = get_cluster_label(labels, clusts, column=self.vtx_positives_col) select_primaries = select_primaries.astype(bool) @@ -780,7 +786,7 @@ def compute_vertex(self, node_pred_vtx, labels, clusts): good_index_vtx = torch.all((0 <= vtx_scatter_label) & (vtx_scatter_label <= 1), dim=1) else: vtx_scatter_label = node_assn_vtx - good_index_vtx = torch.all((0 <= vtx_scatter_label) & (vtx_scatter_label <= self.spatial_size), dim=1) + good_index_vtx = torch.all((0 <= vtx_scatter_label) & (vtx_scatter_label <= self.spatial_size), dim=1) vtx_scatter = nodes_vtx_reg @@ -790,10 +796,10 @@ def compute_vertex(self, node_pred_vtx, labels, clusts): if len(vtx_scatter): # for now only sum losses, they get averaged below in results dictionary - loss2 = self.vtx_score_loss(node_pred_vtx[:, 3:], + loss2 = self.vtx_score_loss(node_pred_vtx[:, 3:], positives) loss1 = torch.sum(torch.mean( - self.vtx_position_loss(vtx_scatter[good_index_vtx, :3], + self.vtx_position_loss(vtx_scatter[good_index_vtx, :3], vtx_scatter_label[good_index_vtx]), dim=1)) # print("Position Loss = ", loss1, vtx_scatter[good_index_vtx].shape) diff --git a/mlreco/models/layers/gnn/losses/node_primary.py b/mlreco/models/layers/gnn/losses/node_primary.py index 34c79b2f..2c69b8b6 100644 --- a/mlreco/models/layers/gnn/losses/node_primary.py +++ b/mlreco/models/layers/gnn/losses/node_primary.py @@ -94,14 +94,18 @@ def forward(self, out, clusters): else: raise ValueError('Group prediction algorithm not recognized: '+self.group_pred_alg) + # If a cluster target is -1, ignore the loss associated with it + valid_mask = primary_ids > -1 + # If requested, remove groups that do not contain exactly one primary from the loss if self.high_purity: - purity_mask = node_purity_mask(clust_ids, group_ids, primary_ids) - if not purity_mask.any(): - continue - clusts = clusts[purity_mask] - primary_ids = primary_ids[purity_mask] - node_pred = node_pred[np.where(purity_mask)[0]] + valid_mask &= node_purity_mask(clust_ids, group_ids, primary_ids) + + # Apply valid mask to nodes and their predictions + if not valid_mask.any(): continue + clusts = clusts[valid_mask] + primary_ids = primary_ids[valid_mask] + node_pred = node_pred[np.where(valid_mask)[0]] # If the majority cluster ID agrees with the majority group ID, assign as primary node_assn = torch.tensor(primary_ids, dtype=torch.long, device=node_pred.device, requires_grad=False) diff --git a/mlreco/models/layers/gnn/losses/node_type.py 
b/mlreco/models/layers/gnn/losses/node_type.py index d6101b19..b3554da5 100644 --- a/mlreco/models/layers/gnn/losses/node_type.py +++ b/mlreco/models/layers/gnn/losses/node_type.py @@ -84,10 +84,9 @@ def forward(self, out, types): node_assn = get_cluster_label(labels, clusts, column=self.target_col) node_assn = torch.tensor(node_assn, dtype=torch.long, device=node_pred.device, requires_grad=False) - # Do not apply loss to nodes labeled -1 (unknown class) + # If a cluster target is -1, ignore the loss associated with it node_mask = torch.nonzero(node_assn > -1, as_tuple=True)[0] - if not len(node_mask): - continue + if not len(node_mask): continue node_pred = node_pred[node_mask] node_assn = node_assn[node_mask] From 6d8675eb159f811cd8c1e104439ca4f303af4036 Mon Sep 17 00:00:00 2001 From: Temigo Date: Tue, 21 Jun 2022 11:05:48 -0700 Subject: [PATCH 03/30] WIP on new adapt_labels --- mlreco/utils/deghosting.py | 180 +++++++++++++++++++++++-------------- 1 file changed, 112 insertions(+), 68 deletions(-) diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index 0455f045..7d346d0c 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -9,7 +9,25 @@ def compute_rescaled_charge(input_data, deghost_mask, last_index = 6, batch_col """ Computes rescaled charge after deghosting - last_index: 4 + deghost_input_features + Note + ---- + This function should work on Numpy arrays or Torch tensors. + + Parameters + ---------- + input_data: np.ndarray or torch.Tensor + Shape (N, 4+num_features) where 4 corresponds to batch_id,x,y,z + deghost_mask: np.ndarray or torch.Tensor + Shape (N,), N_deghost is the predicted deghosted voxel count + last_index: int, default 6 + Indexes where hit-related features start @ 4 + deghost_input_features + batch_col: int, default 0 + + Returns + ------- + np.ndarray or torch.Tensor + Shape (N_deghost,) contains rescaled charge array for input data. + Includes deghosted mask already. """ if torch.is_tensor(input_data): unique = torch.unique @@ -38,12 +56,50 @@ def adapt_labels_knn(result, label_seg, label_clustering, batch_column=0, coords_column_range=(1, 4), true_mask=None): + """ + Returns new cluster labels that have the same size as the input w/ ghost points. + Points predicted as nonghost but that are true ghosts get the cluster label of + the closest cluster. + Points that are true ghosts and predicted as ghosts get "empty" (-1) values. + + Note + ---- + Uses GPU version from `torch_cluster.knn` to speed up + the label adaptation computation. + + Parameters + ---------- + result: dict + label_seg: list of torch.Tensor + label_clustering: list of torch.Tensor + num_classes: int, default 5 + Semantic classes count. + batch_column: int, default 0 + coords_column_range: tuple, default (1, 4) + true_mask: torch.Tensor, default None + True nonghost mask. If None, will use the intersection + of predicted nonghost and true nonghost. This option is + useful to do "cheat ghost predictions" (i.e. mimic deghosting + predictions using true ghost mask, to run later stages + of the chain independently of the deghosting stage). 
+ + Returns + ------- + np.ndarray + shape: (input w/ ghost points, label_clusters_features) + + See Also + -------- + adapt_labels, adapt_labels_numpy + """ complete_label_clustering = [] c1, c2 = coords_column_range if true_mask is not None: assert true_mask.shape[0] == label_seg[0].shape[0] c3 = max(c2, batch_column+1) + import time + start = time.time() for i in range(len(label_seg)): coords = label_seg[i][:, :c3] label_c = [] @@ -59,95 +115,70 @@ def adapt_labels_knn(result, label_seg, label_clustering, batch_clustering.size(1))) if torch.cuda.is_available(): new_label_clustering = new_label_clustering.cuda() - X_train = batch_clustering[:, c1:c2] if true_mask is None: nonghost_mask = (result['ghost'][i][batch_mask].argmax(dim=1) == 0) # Select voxels predicted as nonghost, but true ghosts mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) + mask = torch.where(mask)[0] + # Now we need a special treatment for these, if there are any. if batch_coords[mask].shape[0] > 0: - neighbors = knn(X_train, batch_coords[mask, c1:c2], 1) - _, d = neighbors[0], neighbors[1] - - additional_label_clustering = torch.cat([batch_coords[mask], - batch_clustering[d, c3:]], dim=1).float() - new_label_clustering[mask] = additional_label_clustering + tagged_voxels_count = 1 # to get the loop started + X_true = batch_clustering + X_pred = batch_coords[mask] + while tagged_voxels_count > 0 and X_pred.shape[0] > 0: + #print(batch_id, "while", X_true.shape, X_pred.shape, tagged_voxels_count) + neighbors = knn(X_true[:, c1:c2].float(), X_pred[:, c1:c2].float(), 1) + _, d = neighbors[0], neighbors[1] + + # compute Chebyshev distance between predicted and true + distances = torch.amax(torch.abs(X_true[neighbors[1], c1:c2] - X_pred[neighbors[0], c1:c2]), dim=1) + #print(distances) + select_mask = distances <= 1 + + tagged_voxels_count = select_mask.sum() + if tagged_voxels_count > 0: + # We assign label of closest true nonghost voxel to those within Chebyshev distance 1 + additional_label_clustering = torch.cat([X_pred[select_mask], + X_true[d[select_mask], c3:]], dim=1).float() + + new_label_clustering[mask[select_mask]] = additional_label_clustering + mask = mask[~select_mask] + # Update for next iteration + X_true = additional_label_clustering + X_pred = X_pred[~select_mask] else: nonghost_mask = true_mask[batch_mask] + # Include true nonghost voxels by default new_label_clustering[label_seg[i][batch_mask, -1] < num_classes] = batch_clustering.float() + # Now we save - need only to keep predicted nonghost voxels. label_c.append(new_label_clustering[nonghost_mask]) label_c = torch.cat(label_c, dim=0) complete_label_clustering.append(label_c) + print("time = ", time.time() - start) + #assert False return complete_label_clustering -def adapt_labels(result, label_seg, label_clustering, - num_classes=5, - batch_column=0, - coords_column_range=(1, 4), - true_mask=None): - """ - Returns new cluster labels that have the same size as the input w/ ghost points. - Points predicted as nonghost but that are true ghosts get the cluster label of - the closest cluster. - Points that are true ghosts and predicted as ghosts get "emtpy" (-1) values. - Return shape: (input w/ ghost points, label_clusters_features) +def adapt_labels(*args, **kwargs): """ - complete_label_clustering = [] - c1, c2 = coords_column_range + Kept for backward compatibility, to deprecate soon. 
- if true_mask is not None: - assert true_mask.shape[0] == label_seg[0].shape[0] - - c3 = max(c2, batch_column+1) - for i in range(len(label_seg)): - coords = label_seg[i][:, :c3] - label_c = [] - for batch_id in coords[:, batch_column].int().unique(): - batch_mask = coords[:, batch_column] == batch_id - batch_coords = coords[batch_mask] - batch_clustering = label_clustering[i][label_clustering[i][:, batch_column] == batch_id] - - # Prepare new labels - new_label_clustering = -1. * torch.ones((batch_coords.size(0), - label_clustering[i].size(1))) - new_label_clustering[:, :c3] = batch_coords - - if torch.cuda.is_available(): - new_label_clustering = new_label_clustering.cuda() - - if true_mask is None: - nonghost_mask = (result['ghost'][i][batch_mask].argmax(dim=1) == 0) - # Select voxels predicted as nonghost, but true ghosts - mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) - # Assign them to closest cluster - if len(batch_clustering): - #print(batch_coords.shape, batch_clustering.shape) - d = distances(batch_coords[mask, c1:c2], - batch_clustering[:, c1:c2]).argmin(dim=1) - additional_label_clustering = torch.cat([batch_coords[mask], - batch_clustering[d, c3:]], dim=1).float() - new_label_clustering[mask] = additional_label_clustering - else: - nonghost_mask = true_mask[batch_mask] - - if len(batch_clustering): - new_label_clustering[label_seg[i][batch_mask, -1] < num_classes] = batch_clustering.float() - - label_c.append(new_label_clustering[nonghost_mask]) - label_c = torch.cat(label_c, dim=0) - complete_label_clustering.append(label_c) - return complete_label_clustering + See Also + -------- + adapt_labels_knn, adapt_labels_numpy + """ + return adapt_labels_knn(*args, **kargs) def adapt_labels_numpy(result, label_seg, label_clustering, num_classes=5, batch_col=0, coords_col=(1, 4)): """ - Returns new cluster labels that have the same size as the input w/ ghost points. - Points predicted as nonghost but that are true ghosts get the cluster label of - the closest cluster. - Points that are true ghosts and predicted as ghosts get "emtpy" (-1) values. - Return shape: (input w/ ghost points, label_clusters_features) + Numpy version of `adapt_labels`. + + See Also + -------- + adapt_labels, adapt_labels_knn """ c1, c2 = coords_col complete_label_clustering = [] @@ -189,6 +220,19 @@ def deghost_labels_and_predictions(data_blob, result): ''' Given dictionaries and , apply deghosting to uresnet predictions and labels for use in later reconstruction stages. + + Warning + ------- + Modifies in place the input data and result dictionaries. + + Note + ---- + Used in analysis tools (decorator). 
+ + Parameters + ---------- + data_blob: dict + result: dict ''' result['ghost_mask'] = [ From 0434b3cfc293f23c10c27a534e442c450a01fdf7 Mon Sep 17 00:00:00 2001 From: Temigo Date: Tue, 21 Jun 2022 15:11:54 -0700 Subject: [PATCH 04/30] Modify adapt_labels --- .../models/layers/gnn/losses/edge_channel.py | 3 +- .../layers/gnn/losses/node_kinematics.py | 3 +- .../models/layers/gnn/losses/node_primary.py | 2 + mlreco/utils/deghosting.py | 132 +++++++++++------- 4 files changed, 84 insertions(+), 56 deletions(-) diff --git a/mlreco/models/layers/gnn/losses/edge_channel.py b/mlreco/models/layers/gnn/losses/edge_channel.py index 93dc6ec0..713f0da8 100644 --- a/mlreco/models/layers/gnn/losses/edge_channel.py +++ b/mlreco/models/layers/gnn/losses/edge_channel.py @@ -87,7 +87,8 @@ def forward(self, out, clusters, graph=None): # Narrow down the tensor to the rows in the batch labels = clusters[i][batches == j] - + if not labels.shape[0]: + continue # Get the output of the forward function edge_pred = out['edge_pred'][i][j] if not edge_pred.shape[0]: diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index a3a391e5..436d0498 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -158,7 +158,8 @@ def forward(self, out, types): # Narrow down the tensor to the rows in the batch labels = types[i][batches==j] - + if not labels.shape[0]: + continue clusts = out['clusts'][i][j] # Increment the type loss, balance classes if requested diff --git a/mlreco/models/layers/gnn/losses/node_primary.py b/mlreco/models/layers/gnn/losses/node_primary.py index 2c69b8b6..8629c242 100644 --- a/mlreco/models/layers/gnn/losses/node_primary.py +++ b/mlreco/models/layers/gnn/losses/node_primary.py @@ -77,6 +77,8 @@ def forward(self, out, clusters): # Narrow down the label tensor and other predictions to the batch at hand labels = clusters[i][batches==j] + if not labels.shape[0]: + continue node_pred = out['node_pred'][i][j] if not node_pred.shape[0]: continue diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index 7d346d0c..e7186387 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -55,7 +55,8 @@ def adapt_labels_knn(result, label_seg, label_clustering, num_classes=5, batch_column=0, coords_column_range=(1, 4), - true_mask=None): + true_mask=None, + use_numpy=False): """ Returns new cluster labels that have the same size as the input w/ ghost points. 
Points predicted as nonghost but that are true ghosts get the cluster label of @@ -95,15 +96,37 @@ def adapt_labels_knn(result, label_seg, label_clustering, complete_label_clustering = [] c1, c2 = coords_column_range + if use_numpy: + unique = np.unique + ones = np.ones + argmax = lambda x: np.argmax(x, axis=1) + where = np.where + concatenate0 = lambda x: np.concatenate(x, axis=1) + concatenate1 = lambda x: np.concatenate(x, axis=0) + compute_neighbor = lambda X_true, X_pred: cdist(X_pred[:, c1:c2], X_true[:, c1:c2]).argmin(axis=1) + compute_distances = lambda X_true, X_pred: np.amax(np.abs(X_true[:, c1:c2] - X_pred[:, c1:c2]), axis=1) + make_float = lambda x : x + get_shape = lambda x, y: (x.shape[0], y.shape[1]) + else: + unique = lambda x: x.int().unique() + ones = torch.ones + argmax = lambda x: torch.argmax(x, dim=1) + where = torch.where + concatenate0 = lambda x: torch.cat(x, dim=1).float() + concatenate1 = lambda x: torch.cat(x, dim=0) + compute_neighbor = lambda X_true, X_pred: knn(X_true[:, c1:c2].float(), X_pred[:, c1:c2].float(), 1)[1] + compute_distances = lambda X_true, X_pred: torch.amax(torch.abs(X_true[:, c1:c2] - X_pred[:, c1:c2]), dim=1) + make_float = lambda x: x.float() + get_shape = lambda x, y: (x.size(0), y.size(1)) + if true_mask is not None: assert true_mask.shape[0] == label_seg[0].shape[0] c3 = max(c2, batch_column+1) - import time - start = time.time() + for i in range(len(label_seg)): coords = label_seg[i][:, :c3] label_c = [] - for batch_id in coords[:, batch_column].int().unique(): + for batch_id in unique(coords[:, batch_column]): batch_mask = coords[:, batch_column] == batch_id batch_coords = coords[batch_mask] batch_clustering = label_clustering[i][label_clustering[i][:, batch_column] == batch_id] @@ -111,36 +134,38 @@ def adapt_labels_knn(result, label_seg, label_clustering, continue # Prepare new labels - new_label_clustering = -1. * torch.ones((batch_coords.size(0), - batch_clustering.size(1))) - if torch.cuda.is_available(): + new_label_clustering = -1. * ones(get_shape(batch_coords, batch_clustering)) + if (not use_numpy) and torch.cuda.is_available(): new_label_clustering = new_label_clustering.cuda() + new_label_clustering[:, :c3] = batch_coords if true_mask is None: - nonghost_mask = (result['ghost'][i][batch_mask].argmax(dim=1) == 0) + nonghost_mask = argmax(result['ghost'][i][batch_mask]) == 0 # Select voxels predicted as nonghost, but true ghosts mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) - mask = torch.where(mask)[0] + mask = where(mask)[0] # Now we need a special treatment for these, if there are any. 
if batch_coords[mask].shape[0] > 0: tagged_voxels_count = 1 # to get the loop started X_true = batch_clustering X_pred = batch_coords[mask] while tagged_voxels_count > 0 and X_pred.shape[0] > 0: - #print(batch_id, "while", X_true.shape, X_pred.shape, tagged_voxels_count) - neighbors = knn(X_true[:, c1:c2].float(), X_pred[:, c1:c2].float(), 1) - _, d = neighbors[0], neighbors[1] + # print(batch_id, "while", X_true.shape, X_pred.shape, tagged_voxels_count) + #neighbors = knn(X_true[:, c1:c2].float(), X_pred[:, c1:c2].float(), 1) + #_, d = neighbors[0], neighbors[1] + d = compute_neighbor(X_true, X_pred) # compute Chebyshev distance between predicted and true - distances = torch.amax(torch.abs(X_true[neighbors[1], c1:c2] - X_pred[neighbors[0], c1:c2]), dim=1) + # distances = torch.amax(torch.abs(X_true[neighbors[1], c1:c2] - X_pred[neighbors[0], c1:c2]), dim=1) + distances =compute_distances(X_true[d], X_pred) #print(distances) select_mask = distances <= 1 tagged_voxels_count = select_mask.sum() if tagged_voxels_count > 0: # We assign label of closest true nonghost voxel to those within Chebyshev distance 1 - additional_label_clustering = torch.cat([X_pred[select_mask], - X_true[d[select_mask], c3:]], dim=1).float() + additional_label_clustering = concatenate0([X_pred[select_mask], + X_true[d[select_mask], c3:]]) new_label_clustering[mask[select_mask]] = additional_label_clustering mask = mask[~select_mask] @@ -151,13 +176,12 @@ def adapt_labels_knn(result, label_seg, label_clustering, nonghost_mask = true_mask[batch_mask] # Include true nonghost voxels by default - new_label_clustering[label_seg[i][batch_mask, -1] < num_classes] = batch_clustering.float() + new_label_clustering[label_seg[i][batch_mask, -1] < num_classes] = make_float(batch_clustering) # Now we save - need only to keep predicted nonghost voxels. label_c.append(new_label_clustering[nonghost_mask]) - label_c = torch.cat(label_c, dim=0) + label_c = concatenate1(label_c) complete_label_clustering.append(label_c) - print("time = ", time.time() - start) - #assert False + return complete_label_clustering @@ -172,7 +196,7 @@ def adapt_labels(*args, **kwargs): return adapt_labels_knn(*args, **kargs) -def adapt_labels_numpy(result, label_seg, label_clustering, num_classes=5, batch_col=0, coords_col=(1, 4)): +def adapt_labels_numpy(*args, **kwargs): """ Numpy version of `adapt_labels`. @@ -180,40 +204,40 @@ def adapt_labels_numpy(result, label_seg, label_clustering, num_classes=5, batch -------- adapt_labels, adapt_labels_knn """ - c1, c2 = coords_col - complete_label_clustering = [] - - c3 = max(c2, batch_col+1) - for i in range(len(label_seg)): - coords = label_seg[i][:, :4] - label_c = [] - #print(len(coords[:, -1].unique())) - for batch_id in np.unique(coords[:, batch_col]): - batch_mask = coords[:, batch_col] == batch_id - batch_coords = coords[batch_mask] - batch_clustering = label_clustering[i][label_clustering[i][:, batch_col] == batch_id] - - # Prepare new labels - new_label_clustering = -1. 
* np.ones((batch_coords.shape[0], label_clustering[i].shape[1])) - new_label_clustering[:, :c3] = batch_coords - - nonghost_mask = (result['ghost'][i][batch_mask].argmax(axis=1) == 0) - # Select voxels predicted as nonghost, but true ghosts - mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) - if len(batch_clustering): - # Assign them to closest cluster - d = cdist(batch_coords[mask, c1:c2], batch_clustering[:, c1:c2]).argmin(axis=1) - additional_label_clustering = np.concatenate([batch_coords[mask], batch_clustering[d, 4:]], axis=1) - new_label_clustering[mask] = additional_label_clustering - - #print(new_label_clustering.size(), label_seg[i][batch_mask, -1].size(), batch_clustering.size()) - if len(batch_clustering): - new_label_clustering[label_seg[i][batch_mask, -1] < num_classes] = batch_clustering - - label_c.append(new_label_clustering[nonghost_mask]) - label_c = np.concatenate(label_c, axis=0) - complete_label_clustering.append(label_c) - return complete_label_clustering + return adapt_labels_knn(*args, **kwargs, use_numpy=True) + # c1, c2 = coords_col + # complete_label_clustering = [] + # + # c3 = max(c2, batch_col+1) + # for i in range(len(label_seg)): + # coords = label_seg[i][:, :4] + # label_c = [] + # #print(len(coords[:, -1].unique())) + # for batch_id in np.unique(coords[:, batch_col]): + # batch_mask = coords[:, batch_col] == batch_id + # batch_coords = coords[batch_mask] + # batch_clustering = label_clustering[i][label_clustering[i][:, batch_col] == batch_id] + # + # # Prepare new labels + # new_label_clustering = -1. * np.ones((batch_coords.shape[0], label_clustering[i].shape[1])) + # new_label_clustering[:, :c3] = batch_coords + # + # nonghost_mask = (result['ghost'][i][batch_mask].argmax(axis=1) == 0) + # # Select voxels predicted as nonghost, but true ghosts + # mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) + # if len(batch_clustering): + # # Assign them to closest cluster + # d = cdist(batch_coords[mask, c1:c2], batch_clustering[:, c1:c2]).argmin(axis=1) + # additional_label_clustering = np.concatenate([batch_coords[mask], batch_clustering[d, 4:]], axis=1) + # new_label_clustering[mask] = additional_label_clustering + # + # if len(batch_clustering): + # new_label_clustering[label_seg[i][batch_mask, -1] < num_classes] = batch_clustering + # + # label_c.append(new_label_clustering[nonghost_mask]) + # label_c = np.concatenate(label_c, axis=0) + # complete_label_clustering.append(label_c) + # return complete_label_clustering def deghost_labels_and_predictions(data_blob, result): From 64c3952da5e15df97388d8242825b8efd6d11783 Mon Sep 17 00:00:00 2001 From: Temigo Date: Tue, 21 Jun 2022 16:39:15 -0700 Subject: [PATCH 05/30] Add semantic constraint to adapt_labels --- mlreco/utils/deghosting.py | 82 +++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index e7186387..f47a749a 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -133,47 +133,64 @@ def adapt_labels_knn(result, label_seg, label_clustering, if len(batch_clustering) == 0: continue + if true_mask is None: + nonghost_mask = argmax(result['ghost'][i][batch_mask]) == 0 + else: + nonghost_mask = true_mask[batch_mask] + # Prepare new labels new_label_clustering = -1. 
* ones(get_shape(batch_coords, batch_clustering)) if (not use_numpy) and torch.cuda.is_available(): new_label_clustering = new_label_clustering.cuda() new_label_clustering[:, :c3] = batch_coords - if true_mask is None: - nonghost_mask = argmax(result['ghost'][i][batch_mask]) == 0 + # Loop over predicted semantics + # print(result['segmentation'][i].shape, batch_mask.shape, batch_mask.sum()) + if result['segmentation'][i].shape[0] == batch_mask.shape[0]: + semantic_pred = argmax(result['segmentation'][i][batch_mask]) + else: # adapt_labels was called from analysis tools (see below deghost_labels_and_predictions) + # the problem in this case is that `segmentation` has already been deghosted + semantic_pred = argmax(result['segmentation_noghost'][i][batch_mask]) + + for semantic in unique(semantic_pred): + semantic_mask = semantic_pred == semantic + + if true_mask is not None: + continue # Select voxels predicted as nonghost, but true ghosts - mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) + mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) & semantic_mask mask = where(mask)[0] # Now we need a special treatment for these, if there are any. - if batch_coords[mask].shape[0] > 0: - tagged_voxels_count = 1 # to get the loop started - X_true = batch_clustering - X_pred = batch_coords[mask] - while tagged_voxels_count > 0 and X_pred.shape[0] > 0: - # print(batch_id, "while", X_true.shape, X_pred.shape, tagged_voxels_count) - #neighbors = knn(X_true[:, c1:c2].float(), X_pred[:, c1:c2].float(), 1) - #_, d = neighbors[0], neighbors[1] - d = compute_neighbor(X_true, X_pred) - - # compute Chebyshev distance between predicted and true - # distances = torch.amax(torch.abs(X_true[neighbors[1], c1:c2] - X_pred[neighbors[0], c1:c2]), dim=1) - distances =compute_distances(X_true[d], X_pred) - #print(distances) - select_mask = distances <= 1 - - tagged_voxels_count = select_mask.sum() - if tagged_voxels_count > 0: - # We assign label of closest true nonghost voxel to those within Chebyshev distance 1 - additional_label_clustering = concatenate0([X_pred[select_mask], - X_true[d[select_mask], c3:]]) - - new_label_clustering[mask[select_mask]] = additional_label_clustering - mask = mask[~select_mask] - # Update for next iteration - X_true = additional_label_clustering - X_pred = X_pred[~select_mask] - else: - nonghost_mask = true_mask[batch_mask] + if batch_coords[mask].shape[0] == 0: + continue + tagged_voxels_count = 1 # to get the loop started + X_true = batch_clustering[batch_clustering[:, -1] == semantic] + if X_true.shape[0] == 0: + continue + X_pred = batch_coords[mask] + while tagged_voxels_count > 0 and X_pred.shape[0] > 0: + # print(batch_id, "while", X_true.shape, X_pred.shape, tagged_voxels_count) + #neighbors = knn(X_true[:, c1:c2].float(), X_pred[:, c1:c2].float(), 1) + #_, d = neighbors[0], neighbors[1] + d = compute_neighbor(X_true, X_pred) + + # compute Chebyshev distance between predicted and true + # distances = torch.amax(torch.abs(X_true[neighbors[1], c1:c2] - X_pred[neighbors[0], c1:c2]), dim=1) + distances =compute_distances(X_true[d], X_pred) + #print(distances) + select_mask = distances <= 1 + + tagged_voxels_count = select_mask.sum() + if tagged_voxels_count > 0: + # We assign label of closest true nonghost voxel to those within Chebyshev distance 1 + additional_label_clustering = concatenate0([X_pred[select_mask], + X_true[d[select_mask], c3:]]) + + new_label_clustering[mask[select_mask]] = additional_label_clustering + mask = 
mask[~select_mask] + # Update for next iteration + X_true = additional_label_clustering + X_pred = X_pred[~select_mask] # Include true nonghost voxels by default new_label_clustering[label_seg[i][batch_mask, -1] < num_classes] = make_float(batch_clustering) @@ -295,6 +312,7 @@ def deghost_labels_and_predictions(data_blob, result): if 'segmentation' in result \ and result['segmentation'] is not None: + result['segmentation_noghost'] = result['segmentation'] result['segmentation'] = [ result['segmentation'][i][result['ghost_mask'][i]] \ for i in range(len(result['segmentation']))] From 176cd9682f6ff77ec3b55814691d3daefbd88739 Mon Sep 17 00:00:00 2001 From: Temigo Date: Wed, 22 Jun 2022 09:42:05 -0700 Subject: [PATCH 06/30] Add track cluster id labels modification in `adapt_labels` This is related to issue #111. We should run DBSCAN on true track clusters (per instance) to separate true particles that are broken into several true fragments. GraphSpice should not be expected to learn to put together broken tracks, that is meant for Grappa Track. --- mlreco/utils/deghosting.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index f47a749a..50ca66dd 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -2,6 +2,7 @@ import torch from mlreco.models.layers.common.dbscan import distances from scipy.spatial.distance import cdist +from sklearn.cluster import DBSCAN from torch_cluster import knn @@ -197,6 +198,43 @@ def adapt_labels_knn(result, label_seg, label_clustering, # Now we save - need only to keep predicted nonghost voxels. label_c.append(new_label_clustering[nonghost_mask]) label_c = concatenate1(label_c) + + # Also run DBSCAN on track true clusters + # We want to avoid true track clusters broken in two having the same cluster id label + # Note: we assume we are adapting either cluster or kinematics labels, + # for which cluster id and group id columns are 5 and 6 respectively. 
+ cluster_id_col = 5 + track_label = 1 + dbscan = DBSCAN(eps=np.sqrt(3), min_samples=1) + track_mask = label_c[:, -1] == track_label + for batch_id in unique(coords[:, batch_column]): + batch_mask = label_c[:, batch_column] == batch_id + #print(batch_id, 'before', torch.unique(label_c[track_mask & batch_mask, cluster_id_col], return_counts=True)) + # We want to select cluster ids for track-like particles + batch_clustering = label_clustering[i][(label_clustering[i][:, batch_column] == batch_id) & (label_clustering[i][:, -1] == track_label)] + if len(batch_clustering) == 0: + continue + cluster_count = 0 # Reset counter for each batch entry + for c in unique(batch_clustering[:, cluster_id_col]): + if c < 0: + continue + cluster_mask = label_c[batch_mask, cluster_id_col] == c + if cluster_mask.sum() == 0: + continue + coordinates = label_c[batch_mask][cluster_mask, c1:c2] + if not isinstance(label_c, np.ndarray): + coordinates = coordinates.detach().cpu().numpy() + l = dbscan.fit(coordinates).labels_ + # Unclustered voxels can keep the label -1 + if not isinstance(label_c, np.ndarray): + l = torch.tensor(l, device = label_c.device).float() + l[l > -1] = l[l > -1] + cluster_count + label_c[where(batch_mask)[0][cluster_mask], cluster_id_col] = l + label_c[where(batch_mask)[0][cluster_mask], cluster_id_col+1] = l + cluster_count += l.max() + 1 + #print(batch_id, c, l.unique(), (l == -1).sum()) + #print(batch_id, 'after', torch.unique(label_c[track_mask & batch_mask, cluster_id_col], return_counts=True)) + complete_label_clustering.append(label_c) return complete_label_clustering From e56f7771c294fa98722b1063280d529ed50aa280 Mon Sep 17 00:00:00 2001 From: Temigo Date: Wed, 22 Jun 2022 11:55:21 -0700 Subject: [PATCH 07/30] Finish implementing issue #111 Used event visualization to validate implementation, a few fixes were needed. GraphSpice should now never be expected (in its loss) to put together broken true track clusters. 
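For reference, the core of the relabeling added to mlreco/utils/deghosting.py here is: run DBSCAN over each true track cluster and give every spatially disconnected piece its own fresh cluster id, so the GraphSpice loss never asks the network to merge track fragments that are not actually touching. A minimal single-event numpy sketch of that idea follows; the helper name is an illustrative assumption, and the real code in the patch additionally handles batching, Torch tensors and the group id column.

    import numpy as np
    from sklearn.cluster import DBSCAN

    def split_broken_track_clusters(labels, coords_cols=(1, 4), cluster_id_col=5,
                                    shape_col=-1, track_label=1):
        """Give each spatially connected piece of a true track cluster a new id."""
        labels = labels.copy()
        # eps = sqrt(3), min_samples = 1: voxels sharing a face, edge or corner
        # (26-connectivity) land in the same piece and no voxel is left unlabeled.
        dbscan = DBSCAN(eps=np.sqrt(3), min_samples=1)
        next_id = int(labels[:, cluster_id_col].max()) + 1  # never reuse an existing id
        track_mask = labels[:, shape_col] == track_label
        for c in np.unique(labels[track_mask, cluster_id_col]):
            if c < 0:
                continue
            voxel_mask = track_mask & (labels[:, cluster_id_col] == c)
            pieces = dbscan.fit(labels[voxel_mask, coords_cols[0]:coords_cols[1]]).labels_
            labels[np.where(voxel_mask)[0], cluster_id_col] = pieces + next_id
            next_id += int(pieces.max()) + 1
        return labels
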
--- analysis/classes/particle.py | 12 +++++------ mlreco/utils/deghosting.py | 42 ++++++------------------------------ 2 files changed, 12 insertions(+), 42 deletions(-) diff --git a/analysis/classes/particle.py b/analysis/classes/particle.py index d266e614..1645307f 100644 --- a/analysis/classes/particle.py +++ b/analysis/classes/particle.py @@ -91,8 +91,8 @@ def __repr__(self): fmt = "Particle( Image ID={:<3} | Particle ID={:<3} | Semantic_type: {:<15}"\ " | PID: {:<8} | Primary: {:<2} | Score = {:.2f}% | Interaction ID: {:<2} | Size: {:<5} )" msg = fmt.format(self.image_id, self.id, - self.semantic_keys[self.semantic_type], - self.pid_keys[self.pid], + self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", + self.pid_keys[self.pid] if self.pid in self.pid_keys else "None", self.is_primary, self.pid_conf * 100, self.interaction_id, @@ -147,7 +147,7 @@ def __repr__(self): fmt = "ParticleFragment( Image ID={:<3} | Fragment ID={:<3} | Semantic_type: {:<15}"\ " | Group ID: {:<3} | Primary: {:<2} | Interaction ID: {:<2} | Size: {:<5} )" msg = fmt.format(self.image_id, self.id, - self.semantic_keys[self.semantic_type], + self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", self.group_id, self.is_primary, self.interaction_id, @@ -165,7 +165,7 @@ def __repr__(self): fmt = "TruthParticleFragment( Image ID={:<3} | Fragment ID={:<3} | Semantic_type: {:<15}"\ " | Group ID: {:<3} | Primary: {:<2} | Interaction ID: {:<2} | Size: {:<5} )" msg = fmt.format(self.image_id, self.id, - self.semantic_keys[self.semantic_type], + self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", self.group_id, self.is_primary, self.interaction_id, @@ -209,8 +209,8 @@ def __repr__(self): fmt = "TruthParticle( Image ID={:<3} | Particle ID={:<3} | Semantic_type: {:<15}"\ " | PID: {:<8} | Primary: {:<2} | Interaction ID: {:<2} | Size: {:<5} )" msg = fmt.format(self.image_id, self.id, - self.semantic_keys[self.semantic_type], - self.pid_keys[self.pid], + self.semantic_keys[self.semantic_type] if self.semantic_type in self.semantic_keys else "None", + self.pid_keys[self.pid] if self.pid in self.pid_keys else "None", self.is_primary, self.interaction_id, self.points.shape[0]) diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index 50ca66dd..40d6a823 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -214,7 +214,9 @@ def adapt_labels_knn(result, label_seg, label_clustering, batch_clustering = label_clustering[i][(label_clustering[i][:, batch_column] == batch_id) & (label_clustering[i][:, -1] == track_label)] if len(batch_clustering) == 0: continue - cluster_count = 0 # Reset counter for each batch entry + # Reset counter for each batch entry + # we need to avoid labeling a cluster with an already existing cluster id (maybe not track) + cluster_count = unique(label_clustering[i][(label_clustering[i][:, batch_column] == batch_id)][:, cluster_id_col]).max()+1 for c in unique(batch_clustering[:, cluster_id_col]): if c < 0: continue @@ -230,8 +232,9 @@ def adapt_labels_knn(result, label_seg, label_clustering, l = torch.tensor(l, device = label_c.device).float() l[l > -1] = l[l > -1] + cluster_count label_c[where(batch_mask)[0][cluster_mask], cluster_id_col] = l - label_c[where(batch_mask)[0][cluster_mask], cluster_id_col+1] = l - cluster_count += l.max() + 1 + #label_c[where(batch_mask)[0][cluster_mask], cluster_id_col+1] = l + cluster_count += int(l.max() + 1) + # 
print(batch_id, c, (batch_clustering[:, cluster_id_col] == c).sum(), cluster_mask.sum(), unique(l)) #print(batch_id, c, l.unique(), (l == -1).sum()) #print(batch_id, 'after', torch.unique(label_c[track_mask & batch_mask, cluster_id_col], return_counts=True)) @@ -260,39 +263,6 @@ def adapt_labels_numpy(*args, **kwargs): adapt_labels, adapt_labels_knn """ return adapt_labels_knn(*args, **kwargs, use_numpy=True) - # c1, c2 = coords_col - # complete_label_clustering = [] - # - # c3 = max(c2, batch_col+1) - # for i in range(len(label_seg)): - # coords = label_seg[i][:, :4] - # label_c = [] - # #print(len(coords[:, -1].unique())) - # for batch_id in np.unique(coords[:, batch_col]): - # batch_mask = coords[:, batch_col] == batch_id - # batch_coords = coords[batch_mask] - # batch_clustering = label_clustering[i][label_clustering[i][:, batch_col] == batch_id] - # - # # Prepare new labels - # new_label_clustering = -1. * np.ones((batch_coords.shape[0], label_clustering[i].shape[1])) - # new_label_clustering[:, :c3] = batch_coords - # - # nonghost_mask = (result['ghost'][i][batch_mask].argmax(axis=1) == 0) - # # Select voxels predicted as nonghost, but true ghosts - # mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) - # if len(batch_clustering): - # # Assign them to closest cluster - # d = cdist(batch_coords[mask, c1:c2], batch_clustering[:, c1:c2]).argmin(axis=1) - # additional_label_clustering = np.concatenate([batch_coords[mask], batch_clustering[d, 4:]], axis=1) - # new_label_clustering[mask] = additional_label_clustering - # - # if len(batch_clustering): - # new_label_clustering[label_seg[i][batch_mask, -1] < num_classes] = batch_clustering - # - # label_c.append(new_label_clustering[nonghost_mask]) - # label_c = np.concatenate(label_c, axis=0) - # complete_label_clustering.append(label_c) - # return complete_label_clustering def deghost_labels_and_predictions(data_blob, result): From 9103d9f933f683bbb758d34b1bdeb0ea3a0702e4 Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 23 Jun 2022 10:31:48 -0700 Subject: [PATCH 08/30] Small fix in node_kinematics + statistics script --- analysis/algorithms/selections/__init__.py | 3 +- analysis/algorithms/selections/statistics.py | 118 ++++++++++++++++++ .../layers/gnn/losses/node_kinematics.py | 4 +- 3 files changed, 122 insertions(+), 3 deletions(-) create mode 100644 analysis/algorithms/selections/statistics.py diff --git a/analysis/algorithms/selections/__init__.py b/analysis/algorithms/selections/__init__.py index 9c507c63..392f6a06 100644 --- a/analysis/algorithms/selections/__init__.py +++ b/analysis/algorithms/selections/__init__.py @@ -1,4 +1,5 @@ from .stopping_muons import stopping_muons from .through_going_muons import through_going_muons from .michel_electrons import michel_electrons -from .example_nue import debug_pid \ No newline at end of file +from .example_nue import debug_pid +from .statistics import statistics diff --git a/analysis/algorithms/selections/statistics.py b/analysis/algorithms/selections/statistics.py new file mode 100644 index 00000000..a2d68888 --- /dev/null +++ b/analysis/algorithms/selections/statistics.py @@ -0,0 +1,118 @@ +from collections import OrderedDict +from turtle import update +from sklearn.decomposition import PCA + +from analysis.classes.ui import FullChainEvaluator, FullChainPredictor +from analysis.decorator import evaluate + +import numpy as np + + +@evaluate(['particles', 'interactions'], mode='per_batch') +def statistics(data_blob, res, data_idx, analysis_cfg, cfg): + """ + Collect 
statistics of predicted particles/interactions. + + To be used for data vs MC comparisons at higher level. + """ + particles, interactions = [], [] + + deghosting = analysis_cfg['analysis']['deghosting'] + processor_cfg = analysis_cfg['analysis'].get('processor_cfg', {}) + + start_segment_radius = processor_cfg.get('start_segment_radius', 17) + shower_label = processor_cfg.get('shower_label', 0) + Michel_label = processor_cfg.get('Michel_label', 2) + track_label = processor_cfg.get('track_label', 1) + bin_size = processor_cfg.get('bin_size', 17) # 5cm + + # Initialize analysis differently depending on data/MC setting + predictor = FullChainPredictor(data_blob, res, cfg, analysis_cfg, deghosting=deghosting) + + image_idxs = data_blob['index'] + pca = PCA(n_components=2) + + for i, index in enumerate(image_idxs): + pred_particles = predictor.get_particles(i, only_primaries=False) + + # Loop over predicted particles + for p in pred_particles: + direction = None + if p.semantic_type in [track_label, shower_label] and p.startpoint[0] >= 0.: + d = np.sqrt(((p.points -p.startpoint)**2).sum(axis=1)) + if (d < start_segment_radius).sum() >= 2: + direction = pca.fit(p.points[d < start_segment_radius]).components_[0, :] + if direction is None: + if len(p.points) >= 2: + direction = pca.fit(p.points).components_[0, :] + else: + direction = [-1, -1, -1] + + length = -1 + if p.semantic_type == track_label: + length = 0. + if len(p.points) >= 2: + coords_pca = pca.fit_transform(p.points)[:, 0] + bins = np.arange(coords_pca.min(), coords_pca.max(), bin_size) + # bin_inds takes values in [1, len(bins)] + bin_inds = np.digitize(coords_pca, bins) + for b_i in np.unique(bin_inds): + mask = bin_inds == b_i + if np.count_nonzero(mask) < 2: continue + # Repeat PCA locally for better measurement of dx + pca_axis = pca.fit_transform(p.points[mask]) + dx = pca_axis[:, 0].max() - pca_axis[:, 0].min() + length += dx + update_dict = { + 'index': index, + 'id': p.id, + 'size': p.size, + 'semantic_type': p.semantic_type, + 'pid': p.pid, + 'interaction_id': p.interaction_id, + 'is_primary': p.is_primary, + 'sum_edep': p.depositions.sum(), + 'dir_x': direction[0], + 'dir_y': direction[1], + 'dir_z': direction[2], + 'length': length, + 'start_x': -1, + 'start_y': -1, + 'start_z': -1, + 'end_x': -1, + 'end_y': -1, + 'end_z': -1, + } + if p.semantic_type == track_label: + update_dict.update({ + 'start_x': p.startpoint[0], + 'start_y': p.startpoint[1], + 'start_z': p.startpoint[2], + 'end_x': p.endpoint[0], + 'end_y': p.endpoint[1], + 'end_z': p.endpoint[2], + }) + elif p.semantic_type == shower_label: + update_dict.update({ + 'start_x': p.startpoint[0], + 'start_y': p.startpoint[1], + 'start_z': p.startpoint[2], + }) + particles.append(OrderedDict(update_dict)) + + pred_interactions = predictor.get_interactions(i, drop_nonprimary_particles=False) + + for int in pred_interactions: + update_dict = { + 'index': index, + 'id': int.id, + 'size': int.size, + 'num_particles': int.num_particles, + } + for key in int.particle_counts: + update_dict[key + '_count'] = int.particle_counts[key] + for key in int.primary_particle_counts: + update_dict[key + '_primary_count'] = int.primary_particle_counts[key] + interactions.append(OrderedDict(update_dict)) + + return [particles, interactions] diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index 436d0498..fb217fe8 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ 
b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -254,11 +254,11 @@ def forward(self, out, types): reshaped = out['input_node_features'][i][j][:, 19:25][good_index & positives.bool()].view(-1, 2, 3) # Do not apply loss to nodes labeled -1 (unknown class) - node_mask = good_index & positives.bool() + node_mask = good_index & positives.bool() & (positives >= 0.) # print(node_mask.any()) if node_mask.any(): # for now only sum losses, they get averaged below in results dictionary - loss2 = self.vtx_score_loss(node_pred_vtx[good_index, 3:], positives[good_index]) + loss2 = self.vtx_score_loss(node_pred_vtx[good_index & (positives >= 0.), 3:], positives[good_index & (positives >= 0.)]) pos_pred = node_pred_vtx[node_mask, :3] pos_label = node_assn_vtx[node_mask] From e0e14c14f5632bacf97857a68cab8319f3ddb6a4 Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 23 Jun 2022 14:39:56 -0700 Subject: [PATCH 09/30] Add semantic mistakes to the processing of --- mlreco/utils/deghosting.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index 40d6a823..5cd6c3d5 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -158,8 +158,9 @@ def adapt_labels_knn(result, label_seg, label_clustering, if true_mask is not None: continue - # Select voxels predicted as nonghost, but true ghosts - mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) & semantic_mask + # Select voxels predicted as nonghost, but true ghosts (or true nonghost, but wrong semantic prediction) + # mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) & semantic_mask + mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] != semantic) & semantic_mask mask = where(mask)[0] # Now we need a special treatment for these, if there are any. if batch_coords[mask].shape[0] == 0: From b829a82491af9b1981576cbe97c6d791b84aeb26 Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 23 Jun 2022 15:43:12 -0700 Subject: [PATCH 10/30] Add length to analysis tools calorimetry + fix adapt_labels for semantic mistakes --- analysis/algorithms/calorimetry.py | 32 +++++++++++++++++++- analysis/algorithms/selections/statistics.py | 15 ++------- analysis/algorithms/utils.py | 14 ++++++--- mlreco/utils/deghosting.py | 9 ++++-- 4 files changed, 48 insertions(+), 22 deletions(-) diff --git a/analysis/algorithms/calorimetry.py b/analysis/algorithms/calorimetry.py index 5186a220..fd35ab97 100644 --- a/analysis/algorithms/calorimetry.py +++ b/analysis/algorithms/calorimetry.py @@ -1,6 +1,7 @@ from analysis.classes.particle import Particle import numpy as np import numba as nb +from sklearn.decomposition import PCA def compute_sum_deposited(particle : Particle): @@ -8,6 +9,35 @@ def compute_sum_deposited(particle : Particle): sum_E = particle.deposition.sum() return sum_E + +def compute_track_length(points, bin_size=17): + """ + Compute track length by dividing it into segments + and computing a local PCA axis, then summing the + local lengths of the segments. + + Parameters + ---------- + points: np.ndarray + Shape (N, 3) + bin_size: int, optional + Size (in voxels) of the segments + """ + pca = PCA(n_components=2) + length = 0. 
+ if len(points) >= 2: + coords_pca = pca.fit_transform(points)[:, 0] + bins = np.arange(coords_pca.min(), coords_pca.max(), bin_size) + # bin_inds takes values in [1, len(bins)] + bin_inds = np.digitize(coords_pca, bins) + for b_i in np.unique(bin_inds): + mask = bin_inds == b_i + if np.count_nonzero(mask) < 2: continue + # Repeat PCA locally for better measurement of dx + pca_axis = pca.fit_transform(points[mask]) + dx = pca_axis[:, 0].max() - pca_axis[:, 0].min() + length += dx + return length # TODO: # def proton_energy_tabular(particle: Particle): # assert particle.pid == 4 # Proton @@ -16,4 +46,4 @@ def compute_sum_deposited(particle : Particle): # def multiple_coulomb_scattering(particle: Particle): # assert particle.pid == 2 # Muon -# pass \ No newline at end of file +# pass diff --git a/analysis/algorithms/selections/statistics.py b/analysis/algorithms/selections/statistics.py index a2d68888..b0fd9a98 100644 --- a/analysis/algorithms/selections/statistics.py +++ b/analysis/algorithms/selections/statistics.py @@ -2,6 +2,7 @@ from turtle import update from sklearn.decomposition import PCA +from analysis.algorithms.calorimetry import compute_track_length from analysis.classes.ui import FullChainEvaluator, FullChainPredictor from analysis.decorator import evaluate @@ -50,19 +51,7 @@ def statistics(data_blob, res, data_idx, analysis_cfg, cfg): length = -1 if p.semantic_type == track_label: - length = 0. - if len(p.points) >= 2: - coords_pca = pca.fit_transform(p.points)[:, 0] - bins = np.arange(coords_pca.min(), coords_pca.max(), bin_size) - # bin_inds takes values in [1, len(bins)] - bin_inds = np.digitize(coords_pca, bins) - for b_i in np.unique(bin_inds): - mask = bin_inds == b_i - if np.count_nonzero(mask) < 2: continue - # Repeat PCA locally for better measurement of dx - pca_axis = pca.fit_transform(p.points[mask]) - dx = pca_axis[:, 0].max() - pca_axis[:, 0].min() - length += dx + length = compute_track_length(p.points, bin_size=bin_size) update_dict = { 'index': index, 'id': p.id, diff --git a/analysis/algorithms/utils.py b/analysis/algorithms/utils.py index db5d84aa..e1d94e85 100644 --- a/analysis/algorithms/utils.py +++ b/analysis/algorithms/utils.py @@ -1,6 +1,7 @@ from collections import OrderedDict from turtle import up from analysis.classes.particle import Interaction, Particle, TruthParticle +from analysis.algorithms.calorimetry import compute_track_length from pprint import pprint import numpy as np @@ -57,7 +58,7 @@ def count_primary_particles(interaction: Interaction, prefix=None): return out -def get_particle_properties(particle: Particle, vertex, prefix=None): +def get_particle_properties(particle: Particle, vertex, prefix=None, save_feats=False): update_dict = OrderedDict({ 'particle_id': -1, @@ -68,10 +69,12 @@ def get_particle_properties(particle: Particle, vertex, prefix=None): 'particle_is_primary': False, 'particle_has_startpoint': False, 'particle_has_endpoints': False, + 'particle_length': -1, }) - node_dict = OrderedDict({'node_feat_{}'.format(i) : -1 for i in range(28)}) - update_dict.update(node_dict) + if save_feats: + node_dict = OrderedDict({'node_feat_{}'.format(i) : -1 for i in range(28)}) + update_dict.update(node_dict) if particle is None: out = attach_prefix(update_dict, prefix) @@ -83,7 +86,8 @@ def get_particle_properties(particle: Particle, vertex, prefix=None): update_dict['particle_size'] = particle.size update_dict['particle_E'] = particle.sum_edep update_dict['particle_is_primary'] = particle.is_primary - + if particle.semantic_type == 1: + 
update_dict['particle_length'] = compute_track_length(particle.points) # if not isinstance(particle, TruthParticle): # node_dict = OrderedDict({'node_feat_{}'.format(i) : particle.node_features[i] \ # for i in range(particle.node_features.shape[0])}) @@ -92,4 +96,4 @@ def get_particle_properties(particle: Particle, vertex, prefix=None): out = attach_prefix(update_dict, prefix) - return out \ No newline at end of file + return out diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index 5cd6c3d5..09affd7b 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -153,6 +153,10 @@ def adapt_labels_knn(result, label_seg, label_clustering, # the problem in this case is that `segmentation` has already been deghosted semantic_pred = argmax(result['segmentation_noghost'][i][batch_mask]) + # Include true nonghost voxels by default when they have the right semantic prediction + true_pred = label_seg[i][batch_mask, -1] + new_label_clustering[(true_pred < num_classes) & (semantic_pred == true_pred)] = make_float(batch_clustering) + for semantic in unique(semantic_pred): semantic_mask = semantic_pred == semantic @@ -160,7 +164,7 @@ def adapt_labels_knn(result, label_seg, label_clustering, continue # Select voxels predicted as nonghost, but true ghosts (or true nonghost, but wrong semantic prediction) # mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) & semantic_mask - mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] != semantic) & semantic_mask + mask = nonghost_mask & (true_pred != semantic) & semantic_mask mask = where(mask)[0] # Now we need a special treatment for these, if there are any. if batch_coords[mask].shape[0] == 0: @@ -194,8 +198,7 @@ def adapt_labels_knn(result, label_seg, label_clustering, X_true = additional_label_clustering X_pred = X_pred[~select_mask] - # Include true nonghost voxels by default - new_label_clustering[label_seg[i][batch_mask, -1] < num_classes] = make_float(batch_clustering) + # Now we save - need only to keep predicted nonghost voxels. label_c.append(new_label_clustering[nonghost_mask]) label_c = concatenate1(label_c) From ede637fddb8db7489ad7d3614745275686f7e33a Mon Sep 17 00:00:00 2001 From: Temigo Date: Thu, 23 Jun 2022 16:10:02 -0700 Subject: [PATCH 11/30] Fix crash in adapt_labels --- mlreco/utils/deghosting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index 09affd7b..cac705d5 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -155,7 +155,8 @@ def adapt_labels_knn(result, label_seg, label_clustering, # Include true nonghost voxels by default when they have the right semantic prediction true_pred = label_seg[i][batch_mask, -1] - new_label_clustering[(true_pred < num_classes) & (semantic_pred == true_pred)] = make_float(batch_clustering) + new_label_clustering[(true_pred < num_classes)] = make_float(batch_clustering) + new_label_clustering[(true_pred < num_classes) & (semantic_pred != true_pred)][:, c3:] = -1. 
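Note on the second added line: chained boolean indexing of the form tensor[mask][:, c3:] = -1. assigns into a temporary copy and leaves new_label_clustering untouched; a later patch in this series rewrites it as a single indexing expression. A minimal sketch of the difference, using a hypothetical tensor and c3 = 3:

import torch

x = torch.zeros((4, 5))
mask = torch.tensor([True, False, True, False])

x[mask][:, 3:] = -1.   # no-op: x[mask] is a copy, so the assignment is discarded
print(x.sum())         # tensor(0.)

x[mask, 3:] = -1.      # single advanced-indexing assignment writes through to x
print(x.sum())         # tensor(-4.)
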
for semantic in unique(semantic_pred): semantic_mask = semantic_pred == semantic From 7f8f2781214d2b3e5e713e78b554c43a84e9023f Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 30 Jun 2022 10:00:24 -0700 Subject: [PATCH 12/30] Fixed bug in primary high purity assignment --- mlreco/utils/gnn/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/utils/gnn/evaluation.py b/mlreco/utils/gnn/evaluation.py index bad9acc3..eb0b0346 100644 --- a/mlreco/utils/gnn/evaluation.py +++ b/mlreco/utils/gnn/evaluation.py @@ -324,7 +324,7 @@ def node_purity_mask(clust_ids: nb.int64[:], purity_mask = np.zeros(len(clust_ids), dtype=np.bool_) for g in np.unique(group_ids): group_mask = group_ids == g - if np.sum(group_mask) > 1 and np.sum(primary_ids[group_mask]) == 1: + if np.sum(group_mask) > 1 and np.sum(primary_ids[group_mask] == 1) == 1: purity_mask[group_mask] = np.ones(np.sum(group_mask)) return purity_mask From cd2388447e7946bc952b436a820f38b87c161f4f Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 30 Jun 2022 23:13:38 -0700 Subject: [PATCH 13/30] Added option for high purity particle type classification target --- mlreco/iotools/parsers/cluster.py | 4 +-- .../layers/gnn/losses/node_kinematics.py | 23 +++++++++++----- mlreco/utils/gnn/cluster.py | 26 +++---------------- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/mlreco/iotools/parsers/cluster.py b/mlreco/iotools/parsers/cluster.py index f288ea0d..8309fe17 100644 --- a/mlreco/iotools/parsers/cluster.py +++ b/mlreco/iotools/parsers/cluster.py @@ -285,7 +285,7 @@ def parse_cluster3d_full_extended(data): np_voxels = np.empty(shape=(0, 3), dtype=np.float32) np_features = np.empty(shape=(0, 8), dtype=np.float32) - return np_voxels, np_features + return np_voxels, np_features def parse_cluster3d_types(data): @@ -745,4 +745,4 @@ def parse_cluster3d_scales(data): scale_voxels, unique_indices = np.unique(scale_voxels, axis=0, return_index=True) scale_data = grp_data[unique_indices] scales.append((scale_voxels, scale_data)) - return scales \ No newline at end of file + return scales diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index fb217fe8..7321695c 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -70,6 +70,7 @@ def __init__(self, loss_config, batch_col=0, coords_col=(1, 4)): self.batch_col = batch_col self.coords_col = coords_col + self.group_col = loss_config.get('cluster_col', 6) self.type_col = loss_config.get('type_col', 7) self.momentum_col = loss_config.get('momentum_col', 8) self.vtx_col = loss_config.get('vtx_col', 9) @@ -117,6 +118,7 @@ def __init__(self, loss_config, batch_col=0, coords_col=(1, 4)): self.use_anchor_points = loss_config.get('use_anchor_points', False) self.max_vertex_distance = loss_config.get('max_vertex_distance', 50) self.type_loss_weight = loss_config.get('type_loss_weight', 1.0) + self.type_high_purity = loss_config.get('type_high_purity', False) self.compute_momentum_switch = loss_config.get('compute_momentum', True) @@ -169,13 +171,22 @@ def forward(self, out, types): if not node_pred_type.shape[0]: continue node_assn_type = get_cluster_label(labels, clusts, column=self.type_col) - node_assn_type = torch.tensor(node_assn_type, dtype=torch.long, device=node_pred_type.device, requires_grad=False) # Do not apply loss to nodes labeled -1 (unknown class) - node_mask = torch.nonzero(node_assn_type > -1, 
as_tuple=True)[0] - if len(node_mask): - node_pred_type = node_pred_type[node_mask] - node_assn_type = node_assn_type[node_mask] + valid_mask = node_assn_type > -1 + + # If high purity is requested, do not include broken particle in the loss + if self.type_high_purity: + group_ids = get_cluster_label(labels, clusts, column=self.group_col) + _, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True) + valid_mask &= (cnts[inv] == 1) + valid_mask = np.where(valid_mask, as_tuple=True)[0] + + # Compute loss + if len(valid_mask): + node_assn_type = torch.tensor(node_assn_type, dtype=torch.long, device=node_pred_type.device, requires_grad=False) + node_pred_type = node_pred_type[valid_mask] + node_assn_type = node_assn_type[valid_mask] if self.balance_classes: vals, counts = torch.unique(node_assn_type, return_counts=True) @@ -190,7 +201,7 @@ def forward(self, out, types): type_loss += self.type_loss_weight * float(loss) # Increment the number of nodes - n_clusts_type += len(node_mask) + n_clusts_type += len(valid_mask) # Increment the momentum loss if compute_momentum: diff --git a/mlreco/utils/gnn/cluster.py b/mlreco/utils/gnn/cluster.py index fc005568..46a44a84 100644 --- a/mlreco/utils/gnn/cluster.py +++ b/mlreco/utils/gnn/cluster.py @@ -128,32 +128,14 @@ def get_cluster_label(data, clusts, column=5): Returns: np.ndarray: (C) List of cluster IDs """ - dtype = data[:, column].dtype - if len(clusts) > 0: - if dtype == np.int64: - return _get_cluster_label_int(data, clusts, column) - else: - return _get_cluster_label_float(data, clusts, column) - else: - return np.empty((0,), dtype=np.int32) + return _get_cluster_label(data, clusts, column) @nb.njit(cache=True) -def _get_cluster_label_int(data: nb.float64[:,:], - clusts: nb.types.List(nb.int64[:]), - column: nb.int64 = 5) -> nb.int64[:]: - - labels = np.empty(len(clusts), dtype=np.int64) - for i, c in enumerate(clusts): - v, cts = unique_nb(data[c, column]) - labels[i] = v[np.argmax(np.array(cts))] - return labels - -@nb.njit(cache=True) -def _get_cluster_label_float(data: nb.float64[:,:], +def _get_cluster_label(data: nb.float64[:,:], clusts: nb.types.List(nb.int64[:]), column: nb.int64 = 5) -> nb.float64[:]: - labels = np.empty(len(clusts), dtype=np.float64) + labels = np.empty(len(clusts), dtype=data.dtype) for i, c in enumerate(clusts): v, cts = unique_nb(data[c, column]) labels[i] = v[np.argmax(np.array(cts))] @@ -364,7 +346,7 @@ def _get_cluster_features_extended(data: nb.float64[:,:], def get_cluster_points_label(data, particles, clusts, random_order=True, batch_col=0, coords_index=[1, 4]): """ Function that gets label points for each cluster. - Returns start point of primary shower fragment twice if shower, delta or Michel + Returns start point of primary shower fragment twice if shower, delta or Michel and end points of tracks if track. 
Args: From 77e53b12c4ca5da5532e511cf2f89191f0213a9b Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Mon, 4 Jul 2022 10:45:08 -0700 Subject: [PATCH 14/30] Fixed bug in dataset event list maker --- mlreco/iotools/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlreco/iotools/datasets.py b/mlreco/iotools/datasets.py index 607f9106..65be2ea3 100644 --- a/mlreco/iotools/datasets.py +++ b/mlreco/iotools/datasets.py @@ -1,4 +1,4 @@ -import os, glob +mport os, glob import numpy as np from torch.utils.data import Dataset import mlreco.iotools.parsers @@ -134,7 +134,7 @@ def get_event_list(cfg, key): event_list = None if key in cfg: if os.path.isfile(cfg[key]): - event_list = [int(val) for val in open(cfg[key],'r').read().replace(',',' ').split() if val.digit()] + event_list = [int(val) for val in open(cfg[key],'r').read().replace(',',' ').split() if val.isdigit()] else: try: import ast From 2ea874cae9183c709822f3f1440568978d9aadc8 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 6 Jul 2022 14:15:13 -0700 Subject: [PATCH 15/30] Small bug fix in GrapPA type loss --- mlreco/models/layers/gnn/losses/node_kinematics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index 7321695c..be5b6058 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -180,7 +180,7 @@ def forward(self, out, types): group_ids = get_cluster_label(labels, clusts, column=self.group_col) _, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True) valid_mask &= (cnts[inv] == 1) - valid_mask = np.where(valid_mask, as_tuple=True)[0] + valid_mask = np.where(valid_mask)[0] # Compute loss if len(valid_mask): From 6fe6e44ec0dc989ca8ad64a5bc6450012d8055c0 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 6 Jul 2022 14:17:42 -0700 Subject: [PATCH 16/30] Fixed typo... 
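For reference, the high-purity type-classification target added in PATCH 13 and fixed in PATCH 15 above boils down to keeping only nodes whose group id appears exactly once, so that broken particles never enter the type loss. A standalone numpy sketch with made-up labels:

import numpy as np

node_assn_type = np.array([ 2,  1, -1,  0,  3,  3])   # hypothetical per-node type labels (-1 = unknown)
group_ids      = np.array([10, 11, 11, 12, 13, 13])   # hypothetical per-node group ids

valid_mask = node_assn_type > -1                       # drop nodes with an unknown type
_, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True)
valid_mask &= (cnts[inv] == 1)                         # keep groups represented by a single node
valid_idx = np.where(valid_mask)[0]                    # -> array([0, 3]): unbroken particles only
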
--- mlreco/iotools/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/iotools/datasets.py b/mlreco/iotools/datasets.py index 65be2ea3..dad9eaa5 100644 --- a/mlreco/iotools/datasets.py +++ b/mlreco/iotools/datasets.py @@ -1,4 +1,4 @@ -mport os, glob +import os, glob import numpy as np from torch.utils.data import Dataset import mlreco.iotools.parsers From 040f295f7231d1fbf8ec88ce051da8ba17809cfd Mon Sep 17 00:00:00 2001 From: Temigo Date: Wed, 6 Jul 2022 15:31:55 -0700 Subject: [PATCH 17/30] Update nue selection script + fix bug in adapt_labels --- analysis/algorithms/calorimetry.py | 42 ++++ analysis/algorithms/selections/example_nue.py | 211 ++++++------------ .../algorithms/selections/michel_electrons.py | 6 + analysis/algorithms/selections/statistics.py | 13 +- .../algorithms/selections/stopping_muons.py | 10 +- analysis/algorithms/utils.py | 17 +- analysis/classes/ui.py | 63 +++++- config/chain/metrics.cfg | 208 ++++++++--------- mlreco/post_processing/metrics/__init__.py | 1 + .../metrics/cluster_gnn_metrics.py | 8 +- .../metrics/deghosting_metrics.py | 5 + mlreco/utils/deghosting.py | 22 +- 12 files changed, 317 insertions(+), 289 deletions(-) diff --git a/analysis/algorithms/calorimetry.py b/analysis/algorithms/calorimetry.py index fd35ab97..79c44421 100644 --- a/analysis/algorithms/calorimetry.py +++ b/analysis/algorithms/calorimetry.py @@ -22,6 +22,10 @@ def compute_track_length(points, bin_size=17): Shape (N, 3) bin_size: int, optional Size (in voxels) of the segments + + Returns + ------- + float """ pca = PCA(n_components=2) length = 0. @@ -38,6 +42,44 @@ def compute_track_length(points, bin_size=17): dx = pca_axis[:, 0].max() - pca_axis[:, 0].min() length += dx return length + + +def compute_particle_direction(p: Particle, start_segment_radius=17, vertex=None): + """ + Given a Particle, compute the start direction. Within `start_segment_radius` + of the start point, find PCA axis and measure direction. + + If not start point is found, returns (-1, -1, -1). 
+ + Parameters + ---------- + p: Particle + start_segment_radius: float, optional + + Returns + ------- + np.ndarray + Shape (3,) + """ + pca = PCA(n_components=2) + direction = None + if hasattr(p, "startpoint") and p.startpoint[0] >= 0.: + startpoint = p.startpoint + if hasattr(p, "endpoint") and vertex is not None: # make sure we pick the one closest to vertex + use_end = np.argmin([ + np.sqrt(((vertex-p.startpoint)**2).sum()), + np.sqrt(((vertex-p.endpoint)**2).sum()) + ]) + startpoint = p.endpoint if use_end else p.startpoint + d = np.sqrt(((p.points - startpoint)**2).sum(axis=1)) + if (d < start_segment_radius).sum() >= 2: + direction = pca.fit(p.points[d < start_segment_radius]).components_[0, :] + if direction is None: # we could not find a startpoint + if len(p.points) >= 2: # just all voxels + direction = pca.fit(p.points).components_[0, :] + else: + direction = np.array([-1, -1, -1]) + return direction # TODO: # def proton_energy_tabular(particle: Particle): # assert particle.pid == 4 # Proton diff --git a/analysis/algorithms/selections/example_nue.py b/analysis/algorithms/selections/example_nue.py index a491a974..3f022776 100644 --- a/analysis/algorithms/selections/example_nue.py +++ b/analysis/algorithms/selections/example_nue.py @@ -7,6 +7,7 @@ from pprint import pprint import time +import numpy as np @evaluate(['interactions', 'particles'], mode='per_batch') @@ -18,24 +19,61 @@ def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): predictor = FullChainEvaluator(data_blob, res, cfg, analysis_cfg) image_idxs = data_blob['index'] - for i, index in enumerate(image_idxs): + for idx, index in enumerate(image_idxs): # Process Interaction Level Information - matches, counts = predictor.match_interactions(i, + matches, counts = predictor.match_interactions(idx, mode='true_to_pred', match_particles=True, drop_nonprimary_particles=primaries, return_counts=True) + matched_pred_indices = [] + for i, interaction_pair in enumerate(matches): + true_int, pred_int = interaction_pair[0], interaction_pair[1] + if pred_int is not None: + matched_pred_indices.append(pred_int.id) + + pred_interactions = predictor.get_interactions(idx, drop_nonprimary_particles=primaries) + for int in pred_interactions: + if int.id not in matched_pred_indices: + matches.append((None, int)) + if isinstance(counts, list) or isinstance(counts, np.ndarray): + counts = np.concatenate([counts, [0]]) + else: counts = np.array([counts, 0]) + for i, interaction_pair in enumerate(matches): true_int, pred_int = interaction_pair[0], interaction_pair[1] true_int_dict = count_primary_particles(true_int, prefix='true') pred_int_dict = count_primary_particles(pred_int, prefix='pred') - if pred_int is None: - pred_int_dict['true_interaction_matched'] = False - else: - pred_int_dict['true_interaction_matched'] = True - true_int_dict['true_nu_id'] = true_int.nu_id + pred_int_dict['true_interaction_matched'] = False + if true_int is not None and pred_int is not None: + pred_int_dict['true_interaction_matched'] = True + # Store neutrino information + true_int_dict['true_nu_id'] = -1 + true_int_dict['true_nu_interaction_type'] = -1 + true_int_dict['true_nu_interaction_mode'] = -1 + true_int_dict['true_nu_current_type'] = -1 + true_int_dict['true_nu_energy'] = -1 + if true_int is not None: + true_int_dict['true_nu_id'] = true_int.nu_id + if 'neutrino_asis' in data_blob and true_int is not None and true_int.nu_id > 0: + # assert 'particles_asis' in data_blob + # particles = data_blob['particles_asis'][i] + neutrinos = 
data_blob['neutrino_asis'][idx] + if len(neutrinos) > 1 or len(neutrinos) == 0: continue + nu = neutrinos[0] + # Get larcv::Particle objects for each + # particle of the true interaction + # true_particles = np.array(particles)[np.array([p.id for p in true_int.particles])] + # true_particles_track_ids = [p.track_id() for p in true_particles] + # for nu in neutrinos: + # if nu.mct_index() not in true_particles_track_ids: continue + true_int_dict['true_nu_interaction_type'] = nu.interaction_type() + true_int_dict['true_nu_interaction_mode'] = nu.interaction_mode() + true_int_dict['true_nu_current_type'] = nu.current_type() + true_int_dict['true_nu_energy'] = nu.energy_init() + pred_int_dict['interaction_match_counts'] = counts[i] interactions_dict = OrderedDict({'Index': index}) interactions_dict.update(true_int_dict) @@ -43,9 +81,11 @@ def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): interactions.append(interactions_dict) # Process particle level information - pred_particles, true_particles = [], true_int.particles + pred_particles, true_particles = [], [] if pred_int is not None: pred_particles = pred_int.particles + if true_int is not None: + true_particles = true_int.particles matched_particles, _, ious = match_particles_fn(true_particles, pred_particles) for i, m in enumerate(matched_particles): @@ -57,147 +97,32 @@ def debug_pid(data_blob, res, data_idx, analysis_cfg, cfg): true_particle_dict = get_particle_properties(true_p, vertex=true_int.vertex, prefix='true') - if pred_p is not None: + pred_particle_dict['true_particle_is_matched'] = False + if pred_p is not None and true_p is not None: pred_particle_dict['true_particle_is_matched'] = True - else: - pred_particle_dict['true_particle_is_matched'] = False + pred_particle_dict['particle_match_counts'] = ious[i] + true_particle_dict['true_interaction_id'] = true_int.id + pred_particle_dict['pred_interaction_id'] = pred_int.id + + true_particle_dict['true_particle_energy_init'] = -1 + true_particle_dict['true_particle_energy_deposit'] = -1 + true_particle_dict['true_particle_children_count'] = -1 + if 'particles_asis' in data_blob: + particles_asis = data_blob['particles_asis'][idx] + if len(particles_asis) > true_p.id: + true_part = particles_asis[true_p.id] + true_particle_dict['true_particle_energy_init'] = true_part.energy_init() + true_particle_dict['true_particle_energy_deposit'] = true_part.energy_deposit() + # If no children other than itself: particle is stopping. + children = true_part.children_id() + children = [x for x in children if x != true_part.id()] + true_particle_dict['true_particle_children_count'] = len(children) + particles_dict.update(pred_particle_dict) particles_dict.update(true_particle_dict) particles.append(particles_dict) return [interactions, particles] - - - -@evaluate(['interactions_test', 'node_features', 'edge_features'], mode='per_batch') -def test_selection(data_blob, res, data_idx, analysis_cfg, cfg): - - # Set default fieldnames and values. 
(Needed for logger to work) - interactions_tp = [] - node_df = [] - deghosting = analysis_cfg['analysis']['deghosting'] - primaries = analysis_cfg['analysis']['match_primaries'] - - predictor = FullChainEvaluator(data_blob, res, cfg, analysis_cfg, deghosting=deghosting) - image_idxs = data_blob['index'] - - # Match True interactions to Predicted interactions - - for i, index in enumerate(image_idxs): - - print('-------------------Index: {}---------------------'.format(index)) - - matches = predictor.match_interactions(i, mode='tp', match_particles=True, primaries=primaries) - - for interaction_pair in matches: - true_int, pred_int = interaction_pair[0], interaction_pair[1] - if pred_int is not None: - pred_particles, true_particles = pred_int.particles, true_int.particles - else: - pred_particles, true_particles = [], true_int.particles - - # Match true particles to predicted particles - matched_particles, _, _ = match(true_particles, pred_particles) - - if pred_int is None: - print("No predicted interaction match = ", matched_particles) - true_count_primary_leptons = true_int.primary_particle_counts[1] \ - + true_int.primary_particle_counts[2] - true_count_primary_particles = sum(true_int.primary_particle_counts.values()) - for p in true_int.particles: - update_dict = OrderedDict({ - 'index': index, - 'pred_interaction_id': -1, - 'true_interaction_id': true_int.id, - 'pred_particle_type': -1, - 'pred_particle_size': -1, - 'pred_particle_is_primary': False, - 'pred_particle_is_matched': False, - 'true_particle_type': p.pid, - 'true_particle_size': p.size, - 'true_particle_is_primary': False, - 'true_particle_is_matched': False, - 'pred_count_primary_leptons': 0, - 'pred_count_primary_particles': 0, - 'true_count_primary_leptons': true_count_primary_leptons, - 'true_count_primary_particles': true_count_primary_particles, - 'true_interaction_is_matched': False, - 'pred_particle_E': -1, - 'true_particle_E': p.sum_edep}) - interactions_tp.append(update_dict) - - else: - true_count_primary_leptons = true_int.primary_particle_counts[1] \ - + true_int.primary_particle_counts[2] - true_count_primary_particles = sum(true_int.primary_particle_counts.values()) - - pred_count_primary_leptons = {} - pred_count_primary_particles = {} - - update_dicts = [] - - for p in pred_particles: - if p.is_primary: - # Count matched primaries - pred_count_primary_particles[p.id] = True - if (p.pid == 1 or p.pid == 2): - # Count matched primary leptons - pred_count_primary_leptons[p.id] = True - - for m in matched_particles: - update_dict = OrderedDict({ - 'index': index, - 'pred_interaction_id': -1, - 'true_interaction_id': true_int.id, - 'pred_particle_type': -1, - 'pred_particle_size': -1, - 'pred_particle_is_primary': False, - 'pred_particle_is_matched': False, - 'true_particle_type': -1, - 'true_particle_size': -1, - 'true_particle_is_primary': False, - 'true_particle_is_matched': False, - 'pred_count_primary_leptons': 0, - 'pred_count_primary_particles': 0, - 'true_count_primary_leptons': true_count_primary_leptons, - 'true_count_primary_particles': true_count_primary_particles, - 'true_interaction_is_matched': True, - 'pred_particle_E': -1, - 'true_particle_E': -1}) - - update_dict['pred_interaction_id'] = pred_int.id - update_dict['true_interaction_id'] = true_int.id - - p1, p2 = m - - if p2 is not None: - p2.is_matched = True - update_dict['pred_particle_type'] = p2.pid - update_dict['pred_particle_size'] = p2.size - update_dict['pred_particle_E'] = p2.sum_edep - update_dict['pred_particle_is_primary'] = 
p2.is_primary - update_dict['pred_particle_is_matched'] = True - update_dict['true_particle_is_matched'] = True - - update_dict['true_particle_type'] = p1.pid - update_dict['true_particle_size'] = p1.size - update_dict['true_particle_is_primary'] = p1.is_primary - update_dict['true_particle_E'] = p1.sum_edep - - update_dicts.append(update_dict) - - node_dict = OrderedDict({'node_feat_{}'.format(i) : p2.node_features[i] \ - for i in range(p2.node_features.shape[0])}) - - node_df.append(node_dict) - - - for d in update_dicts: - d['pred_count_primary_leptons'] = sum(pred_count_primary_leptons.values()) - d['pred_count_primary_particles'] = sum(pred_count_primary_particles.values()) - interactions_tp.append(d) - - return [interactions_tp, node_df] diff --git a/analysis/algorithms/selections/michel_electrons.py b/analysis/algorithms/selections/michel_electrons.py index 6dc6fcf5..31fbe131 100644 --- a/analysis/algorithms/selections/michel_electrons.py +++ b/analysis/algorithms/selections/michel_electrons.py @@ -80,6 +80,8 @@ def michel_electrons(data_blob, res, data_idx, analysis_cfg, cfg): ablation_radius = processor_cfg.get('ablation_radius', 15) ablation_min_samples = processor_cfg.get('ablation_min_samples', 5) shower_threshold = processor_cfg.get('shower_threshold', 10) + fiducial_threshold = processor_cfg.get('fiducial_threshold', 30) + muon_min_voxel_count = processor_cfg.get('muon_min_voxel_count', 30) matching_mode = processor_cfg.get('matching_mode', 'true_to_pred') # # ====== Initialization ====== @@ -105,6 +107,7 @@ def michel_electrons(data_blob, res, data_idx, analysis_cfg, cfg): for tp in true_particles: if tp.semantic_type != michel_label: continue + if not predictor.is_contained(tp.points, threshold=fiducial_threshold): continue p = data_blob['particles_asis'][i][tp.id] michel_is_attached_at_edge = False @@ -114,6 +117,7 @@ def michel_electrons(data_blob, res, data_idx, analysis_cfg, cfg): for tp2 in true_particles: if tp2.semantic_type != track_label: continue if tp2.pid != muon_label and tp2.pid != pion_label: continue + if tp2.size < muon_min_voxel_count: continue d = cdist(tp.points, tp2.points).min() attached_at_edge, cluster_count, old_cluster_count = is_attached_at_edge(tp.points, tp2.points, attached_threshold=attached_threshold, @@ -159,11 +163,13 @@ def michel_electrons(data_blob, res, data_idx, analysis_cfg, cfg): # Loop over predicted particles for p in pred_particles: if p.semantic_type != michel_label: continue + if not predictor.is_contained(p.points, threshold=fiducial_threshold): continue # Check whether it is attached to the edge of a track michel_is_attached_at_edge = False for p2 in pred_particles: if p2.semantic_type != track_label: continue + if p2.size < muon_min_voxel_count: continue if not is_attached_at_edge(p.points, p2.points, attached_threshold=attached_threshold, one_pixel=one_pixel, diff --git a/analysis/algorithms/selections/statistics.py b/analysis/algorithms/selections/statistics.py index b0fd9a98..c0b5db09 100644 --- a/analysis/algorithms/selections/statistics.py +++ b/analysis/algorithms/selections/statistics.py @@ -2,7 +2,7 @@ from turtle import update from sklearn.decomposition import PCA -from analysis.algorithms.calorimetry import compute_track_length +from analysis.algorithms.calorimetry import compute_track_length, compute_particle_direction from analysis.classes.ui import FullChainEvaluator, FullChainPredictor from analysis.decorator import evaluate @@ -38,16 +38,7 @@ def statistics(data_blob, res, data_idx, analysis_cfg, cfg): # Loop 
over predicted particles for p in pred_particles: - direction = None - if p.semantic_type in [track_label, shower_label] and p.startpoint[0] >= 0.: - d = np.sqrt(((p.points -p.startpoint)**2).sum(axis=1)) - if (d < start_segment_radius).sum() >= 2: - direction = pca.fit(p.points[d < start_segment_radius]).components_[0, :] - if direction is None: - if len(p.points) >= 2: - direction = pca.fit(p.points).components_[0, :] - else: - direction = [-1, -1, -1] + direction = compute_particle_direction(p, start_segment_radius=start_segment_radius) length = -1 if p.semantic_type == track_label: diff --git a/analysis/algorithms/selections/stopping_muons.py b/analysis/algorithms/selections/stopping_muons.py index 80e05bfc..5ce2e428 100644 --- a/analysis/algorithms/selections/stopping_muons.py +++ b/analysis/algorithms/selections/stopping_muons.py @@ -46,12 +46,14 @@ def stopping_muons(data_blob, res, data_idx, analysis_cfg, cfg): # Avoid hardcoding labels muon_label = processor_cfg.get('muon_label', 2) track_label = processor_cfg.get('track_label', 1) + fiducial_threshold = processor_cfg.get('fiducial_threshold', 30) + #muon_min_voxel_count = processor_cfg.get('muon_min_voxel_count', 30) # Initialize analysis differently depending on data/MC setting if not data: - predictor = FullChainEvaluator(data_blob, res, cfg, analysis_cfg, deghosting=deghosting) + predictor = FullChainEvaluator(data_blob, res, cfg, processor_cfg, deghosting=deghosting) else: - predictor = FullChainPredictor(data_blob, res, cfg, analysis_cfg, deghosting=deghosting) + predictor = FullChainPredictor(data_blob, res, cfg, processor_cfg, deghosting=deghosting) image_idxs = data_blob['index'] @@ -69,7 +71,7 @@ def stopping_muons(data_blob, res, data_idx, analysis_cfg, cfg): true_particles = predictor.get_true_particles(i, only_primaries=False) # Match true particles to predicted particles true_ids = np.array([p.id for p in true_particles]) - matched_particles = predictor.match_particles(i, mode='true_to_pred', min_overlap=0.1) + matched_particles = predictor.match_particles(i, mode='true_to_pred') # Quality Metrics # FIXME: put into Analysis tools UI ? 
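The fiducial_threshold cuts added to these selections rely on a simple box-containment test in voxel units (the is_contained method added to the predictor further down); a minimal standalone sketch with a hypothetical detector box:

import numpy as np

def contained(points, boundaries, threshold=30.):
    # points: (N, 3) voxel coordinates; boundaries: (3, 2) [low, high] per axis
    low  = boundaries[:, 0] + threshold
    high = boundaries[:, 1] - threshold
    return bool(np.all((points >= low) & (points <= high)))

boundaries = np.array([[0., 6144.], [0., 6144.], [0., 6144.]])   # hypothetical volume, in voxels
points = np.array([[100., 200., 300.], [6140., 50., 50.]])
print(contained(points, boundaries))   # False: the second point sits within 30 voxels of an edge
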
@@ -81,6 +83,7 @@ def stopping_muons(data_blob, res, data_idx, analysis_cfg, cfg): # Count true stopping muons in the event for tp in true_particles: if tp.semantic_type != track_label: continue + if not predictor.is_contained(tp.points, threshold=fiducial_threshold): continue num_voxels = tp.size p = data_blob['particles_asis'][i][tp.id] @@ -136,6 +139,7 @@ def stopping_muons(data_blob, res, data_idx, analysis_cfg, cfg): # Loop over predicted particles for p in pred_particles: if p.semantic_type != track_label: continue + if not predictor.is_contained(p.points, threshold=fiducial_threshold): continue coords = p.points # Check for presence of Michel electron diff --git a/analysis/algorithms/utils.py b/analysis/algorithms/utils.py index e1d94e85..04536a9a 100644 --- a/analysis/algorithms/utils.py +++ b/analysis/algorithms/utils.py @@ -1,7 +1,7 @@ from collections import OrderedDict from turtle import up from analysis.classes.particle import Interaction, Particle, TruthParticle -from analysis.algorithms.calorimetry import compute_track_length +from analysis.algorithms.calorimetry import compute_track_length, compute_particle_direction from pprint import pprint import numpy as np @@ -28,7 +28,8 @@ def count_primary_particles(interaction: Interaction, prefix=None): 'vertex_x': -1, 'vertex_y': -1, 'vertex_z': -1, - 'has_vertex': False + 'has_vertex': False, + 'count_primary_protons': -1 }) if interaction is None: @@ -37,16 +38,20 @@ def count_primary_particles(interaction: Interaction, prefix=None): else: count_primary_leptons = {} count_primary_particles = {} + count_primary_protons = {} for p in interaction.particles: if p.is_primary: count_primary_particles[p.id] = True if (p.pid == 1 or p.pid == 2): count_primary_leptons[p.id] = True + elif p.pid == 4: + count_primary_protons[p.id] = True update_dict['interaction_id'] = interaction.id update_dict['count_primary_leptons'] = sum(count_primary_leptons.values()) update_dict['count_primary_particles'] = sum(count_primary_particles.values()) + update_dict['count_primary_protons'] = sum(count_primary_protons.values()) if (np.array(interaction.vertex) > 0).all(): update_dict['has_vertex'] = True update_dict['vertex_x'] = interaction.vertex[0] @@ -70,6 +75,9 @@ def get_particle_properties(particle: Particle, vertex, prefix=None, save_feats= 'particle_has_startpoint': False, 'particle_has_endpoints': False, 'particle_length': -1, + 'particle_dir_x': -1, + 'particle_dir_y': -1, + 'particle_dir_z': -1 }) if save_feats: @@ -88,6 +96,11 @@ def get_particle_properties(particle: Particle, vertex, prefix=None, save_feats= update_dict['particle_is_primary'] = particle.is_primary if particle.semantic_type == 1: update_dict['particle_length'] = compute_track_length(particle.points) + direction = compute_particle_direction(particle, vertex=vertex) + assert len(direction) == 3 + update_dict['particle_dir_x'] = direction[0] + update_dict['particle_dir_y'] = direction[1] + update_dict['particle_dir_z'] = direction[2] # if not isinstance(particle, TruthParticle): # node_dict = OrderedDict({'node_feat_{}'.format(i) : particle.node_features[i] \ # for i in range(particle.node_features.shape[0])}) diff --git a/analysis/classes/ui.py b/analysis/classes/ui.py index f5a80a59..ad821ea4 100644 --- a/analysis/classes/ui.py +++ b/analysis/classes/ui.py @@ -82,6 +82,23 @@ def __init__(self, data_blob, result, cfg, predictor_cfg={}, deghosting=False): self.batch_mask = self.data_blob['input_data'] + self.volume_boundaries = predictor_cfg.get('volume_boundaries', None) + if 
self.volume_boundaries is None: + # Using ICARUS Cryo 0 as a default + pass + else: + self.volume_boundaries = np.array(self.volume_boundaries, dtype=np.float64) + if 'meta' not in self.data_blob: + raise Exception("Cannot use volume boundaries because meta is missing from iotools config.") + else: # convert to voxel units + meta = self.data_blob['meta'][0] + min_x, min_y, min_z = meta[0:3] + size_voxel_x, size_voxel_y, size_voxel_z = meta[6:9] + + self.volume_boundaries[0, :] = (self.volume_boundaries[0, :] - min_x) / size_voxel_x + self.volume_boundaries[1, :] = (self.volume_boundaries[1, :] - min_y) / size_voxel_y + self.volume_boundaries[2, :] = (self.volume_boundaries[2, :] - min_z) / size_voxel_z + def __repr__(self): msg = "FullChainEvaluator(num_images={})".format(self.num_images) return msg @@ -195,6 +212,37 @@ def randomize_labels(labels): return new_labels + def is_contained(self, points, threshold=30): + """ + Parameters + ---------- + points: np.ndarray + Shape (N, 3) + threshold: float or np.ndarray + Distance (in voxels) from boundaries beyond which + an object is contained. Can be an array if different + threshold must be applied in x, y and z (shape (3,)). + + Returns + ------- + bool + """ + if not isinstance(threshold, np.ndarray): + threshold = threshold * np.ones((3,)) + else: + assert threshold.shape[0] == 3 + assert len(threshold.shape) == 1 + + if self.volume_boundaries is None: + raise Exception("Please define volume boundaries before using containment method.") + + x_contained = (self.volume_boundaries[0, 0] + threshold[0] <= points[:, 0]) & (points[:, 0] <= self.volume_boundaries[0, 1] - threshold[0]) + y_contained = (self.volume_boundaries[1, 0] + threshold[1] <= points[:, 1]) & (points[:, 1] <= self.volume_boundaries[1, 1] - threshold[1]) + z_contained = (self.volume_boundaries[2, 0] + threshold[2] <= points[:, 0]) & (points[:, 2] <= self.volume_boundaries[2, 1] - threshold[2]) + + return (x_contained & y_contained & z_contained).all() + + def _fit_predict_fragments(self, entry): ''' Method for obtaining voxel-level fragment labels for full image, @@ -335,7 +383,7 @@ def _fit_predict_vertex_info(self, entry, inter_idx): def get_fragments(self, entry, only_primaries=False, - min_particle_voxel_count=0, + min_particle_voxel_count=-1, attaching_threshold=2, semantic_type=None, verbose=False, true_id=False) -> List[Particle]: ''' @@ -357,6 +405,9 @@ def get_fragments(self, entry, only_primaries=False, Returns: - out: List of instances (see Particle class definition). ''' + if min_particle_voxel_count < 0: + min_particle_voxel_count = self.min_particle_voxel_count + point_cloud = self.data_blob['input_data'][entry][:, 1:4] depositions = self.result['input_rescaled'][entry][:, 4] fragments = self.result['fragments'][entry] @@ -454,7 +505,7 @@ def get_fragments(self, entry, only_primaries=False, def get_particles(self, entry, only_primaries=True, - min_particle_voxel_count=10, + min_particle_voxel_count=-1, attaching_threshold=2) -> List[Particle]: ''' Method for retriving particle list for given batch index. @@ -482,6 +533,9 @@ def get_particles(self, entry, only_primaries=True, Returns: - out: List of instances (see Particle class definition). 
''' + if min_particle_voxel_count < 0: + min_particle_voxel_count = self.min_particle_voxel_count + point_cloud = self.data_blob['input_data'][entry][:, 1:4] depositions = self.result['input_rescaled'][entry][:, 4] particles = self.result['particles'][entry] @@ -992,7 +1046,10 @@ def get_true_particles(self, entry, only_primaries=True, def get_true_interactions(self, entry, drop_nonprimary_particles=True, - min_particle_voxel_count=20) -> List[Interaction]: + min_particle_voxel_count=-1) -> List[Interaction]: + if min_particle_voxel_count < 0: + min_particle_voxel_count = self.min_particle_voxel_count + true_particles = self.get_true_particles(entry, only_primaries=drop_nonprimary_particles) out = group_particles_to_interactions_fn(true_particles, get_nu_id=True, mode='truth') diff --git a/config/chain/metrics.cfg b/config/chain/metrics.cfg index 173783da..3e53ff5e 100644 --- a/config/chain/metrics.cfg +++ b/config/chain/metrics.cfg @@ -1,35 +1,33 @@ iotool: - batch_size: 32 + batch_size: 2 shuffle: False num_workers: 4 collate_fn: CollateSparse sampler: - batch_size: 32 + batch_size: 2 name: SequentialBatchSampler #RandomSequenceSampler dataset: name: LArCVDataset data_keys: - #- /sdf/group/neutrino/ldomine/mpvmpr_062021/mpvmpr_062021_v03.root - #- /sdf/group/neutrino/ldomine/mpvmpr_062021/mpvmpr_062021_v04.root - #- /sdf/group/neutrino/ldomine/mpvmpr_082021/test.root - #- /sdf/group/neutrino/ldomine/mpvmpr_082021/train.root - #- /sdf/group/neutrino/ldomine/larcv_nue_ccqe_v08/nue_ccqe_v08.root - #- /sdf/group/neutrino/ldomine/nue_ccqe_012022_v00/nue_ccqe_012022_v00.root - - /sdf/group/neutrino/ldomine/mpvmpr_012022/test.root - #- /sdf/group/neutrino/ldomine/nue_ccqe_012022_v01/nue_ccqe_012022_v01.root + #- /sdf/group/neutrino/ldomine/mpvmpr_012022/test.root + - /sdf/group/neutrino/ldomine/mpvmpr_052022_v10/test[0-3].root limit_num_files: 10 - #event_list: '[35]' schema: input_data: - parse_sparse3d_scn - sparse3d_reco - sparse3d_reco_chi2 + - sparse3d_reco_hit_charge0 + - sparse3d_reco_hit_charge1 + - sparse3d_reco_hit_charge2 + - sparse3d_reco_hit_key0 + - sparse3d_reco_hit_key1 + - sparse3d_reco_hit_key2 segment_label: - parse_sparse3d_scn - sparse3d_pcluster_semantics_ghost cluster_label: - parse_cluster3d_clean_full - #- parse_cluster3d_full - cluster3d_pcluster - particle_pcluster - particle_mpv @@ -52,10 +50,14 @@ iotool: - parse_particle_asis - particle_pcluster - cluster3d_pcluster + meta: + - parse_meta3d + - sparse3d_reco model: name: full_chain modules: chain: + enable_charge_rescaling: True enable_uresnet: True enable_ppn: True enable_cnn_clust: True @@ -74,10 +76,12 @@ model: # Shower GNN config grappa_shower: - # model_path: '/sdf/group/neutrino/ldomine/chain/weights_shower_clustering0/snapshot-27499.ckpt' + #model_path: /sdf/group/neutrino/drielsma/me/train/icarus/weights/full_chain/grappa_shower_track_transfer/snapshot-999.ckpt + #model_path: '/sdf/group/neutrino/drielsma/me/train/icarus/weights/full_chain/grappa_shower_track_transfer/snapshot-5999.ckpt' + freeze_weights: True base: node_type: 0 - node_min_size: 3 + node_min_size: -1 #3 add_start_point: True add_start_dir: True start_dir_max_dist: 5 @@ -121,7 +125,8 @@ model: # Track GNN config grappa_track: - # model_path: '/sdf/group/neutrino/ldomine/chain/weights_track_clustering0/snapshot-2999.ckpt' + #model_path: /sdf/group/neutrino/drielsma/me/train/icarus/weights/full_chain/grappa_shower_track_transfer/snapshot-999.ckpt + freeze_weights: True base: node_type: 1 node_min_size: 3 @@ -150,13 +155,11 @@ model: # 
Interaction GNN config grappa_inter: - use_shower_primary: False - # model_path: '/sdf/group/neutrino/ldomine/chain/weights_inter_clustering0/snapshot-10999.ckpt' + use_true_particles: False #True + use_shower_primary: True type_net: - # model_path: '/sdf/group/neutrino/ldomine/chain/weights_inter_clustering0/snapshot-10999.ckpt' num_hidden: 32 vertex_net: - # model_path: '/sdf/group/neutrino/ldomine/chain/weights_inter_clustering0/snapshot-10999.ckpt' num_hidden: 32 base: node_type: [0, 1, 2, 3] @@ -189,33 +192,24 @@ model: node_loss: name: kinematics balance_classes: True - spatial_size: 768 + spatial_size: 6144 # CNN Clustering config graph_spice: - #model_path: '/sdf/group/neutrino/koh0207/weights/mink/mink_graph_spice/old_mpvmpr/snapshot-7499.ckpt' - # model_path: '/sdf/group/neutrino/ldomine/chain/me/v04/weights_graph_spice1/snapshot-19999.ckpt' - # #model_name: '' - # graph_spice.embedder.encoder: - # model_path: '/sdf/group/neutrino/ldomine/chain/me/v04/weights_graph_spice1/snapshot-19999.ckpt' - # model_name: 'graph_spice.embedder' - # graph_spice.embedder.decoder: - # model_path: '/sdf/group/neutrino/ldomine/chain/me/v04/weights_graph_spice1/snapshot-19999.ckpt' - # model_name: 'graph_spice.embedder' - # graph_spice.kernel_fn: - # model_path: '/sdf/group/neutrino/ldomine/chain/me/v04/weights_graph_spice1/snapshot-1499.ckpt' - # model_name: 'graph_spice.kernel_fn' + #model_path: /sdf/group/neutrino/drielsma/me/train/icarus/weights/full_chain/graph_spice/snapshot-13749.ckpt + freeze_weights: True skip_classes: [0, 2, 3, 4] min_points: 3 node_dim: 22 use_raw_features: True + use_true_labels: False constructor_cfg: mode: 'knn' seg_col: -1 cluster_col: 5 edge_mode: 'attributes' hyper_dimension: 22 - edge_cut_threshold: 0.1 + edge_cut_threshold: 0.1 #0.9 embedder_cfg: graph_spice_embedder: segmentationLayer: False @@ -227,9 +221,9 @@ model: uresnet: filters: 32 input_kernel: 5 - depth: 5 + depth: 5 #6 reps: 2 - spatial_size: 768 + spatial_size: 6144 num_input: 4 # 1 feature + 3 normalized coords allow_bias: False activation: @@ -250,68 +244,73 @@ model: kernel_lossfn: 'lovasz_hinge' edge_loss_cfg: loss_type: 'LogDice' - eval: True + eval: True #CAREFUL + + # UResNet deghosting for charge rescaling + uresnet_deghost: + #model_path: /sdf/group/neutrino/drielsma/me/train/icarus/weights/deghost/doublets/fc-snapshot-34999.ckpt + freeze_weights: True + uresnet_lonely: + num_input: 2 + num_classes: 2 + filters: 32 + depth: 5 + reps: 2 + spatial_size: 6144 + ghost: False + activation: + name: lrelu + args: + negative_slope: 0.33 + allow_bias: False + #weight_loss: True + norm_layer: + name: batch_norm + args: + eps: 0.0001 + momentum: 0.01 # UResNet + PPN uresnet_ppn: uresnet_lonely: - ghost: True + #model_path: /sdf/group/neutrino/drielsma/me/train/icarus/weights/full_chain/uresnet_ppn/snapshot-29999.ckpt + freeze_weights: True num_input: 2 - #freeze_weight: True - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_uresnet1/snapshot-48499.ckpt' - #model_path: '/sdf/group/neutrino/koh0207/weights/mink/uresnet_ppn/old_mpvmpr/BCE/snapshot-99999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/v04/weights_graph_spice1/snapshot-1499.ckpt' - # model_path: '/sdf/group/neutrino/ldomine/chain/me/new_mpvmpr/weights_uresnet_ppn1/snapshot-39999.ckpt' - # uresnet_lonely.net.encoder: - # model_path: '/sdf/group/neutrino/ldomine/chain/me/new_mpvmpr/weights_uresnet_ppn1/snapshot-39999.ckpt' - # model_name: 'uresnet_lonely.net' - # uresnet_lonely.net.decoder: - # model_path: 
'/sdf/group/neutrino/ldomine/chain/me/new_mpvmpr/weights_uresnet_ppn1/snapshot-39999.ckpt' - # model_name: 'uresnet_lonely.net' - # uresnet_lonely.output: - # model_path: '/sdf/group/neutrino/ldomine/chain/me/new_mpvmpr/weights_uresnet_ppn1/snapshot-39999.ckpt' - # uresnet_lonely.linear_ghost: - # model_path: '/sdf/group/neutrino/ldomine/chain/me/new_mpvmpr/weights_uresnet_ppn1/snapshot-39999.ckpt' - # #model_name: 'backbone' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/v04/weights_uresnet_ppn0/snapshot-19999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/v04/weights_uresnet_ppn1/snapshot-12999.ckpt' num_classes: 5 - filters: 16 - depth: 6 + filters: 32 + depth: 5 reps: 2 - spatial_size: 768 - weight_loss: False + spatial_size: 6144 + #ghost: True activation: name: lrelu args: - negative_slope: 0.0 #0.33 + negative_slope: 0.33 allow_bias: False + #weight_loss: True norm_layer: name: batch_norm args: eps: 0.0001 momentum: 0.01 ppn: - #freeze_weight: True - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_uresnet_ppn0/snapshot-4499.ckpt' - #particles_label_seg_col: -2 - ghost: True - #downsample_ghost: True - #model_path: '/sdf/group/neutrino/ldomine/chain/me/new_mpvmpr/weights_uresnet_ppn1/snapshot-39999.ckpt' + #model_path: /sdf/group/neutrino/drielsma/me/train/icarus/weights/full_chain/uresnet_ppn/snapshot-29999.ckpt + freeze_weights: True ppn_resolution: 1.0 mask_loss_name: 'BCE' - depth: 6 - filters: 16 + depth: 5 + filters: 32 num_classes: 5 ppn_score_threshold: 0.6 - spatial_size: 768 + spatial_size: 6144 classify_endpoints: True particles_label_seg_col: -3 + #ghost: True + #point_classes: [0,1] # Kinematics GNN config grappa_kinematics: use_true_particles: False - #model_name: 'kinematics_edge_predictor' momentum_net: num_hidden: 32 base: @@ -335,7 +334,7 @@ model: pool_mode: 'avg' latent_size: 64 #256 input_kernel: 3 - spatial_size: 768 + spatial_size: 6144 edge_encoder: name: 'mix_debug' normalize: True @@ -347,7 +346,7 @@ model: coordConv: True pool_mode: 'avg' latent_size: 32 - spatial_size: 768 + spatial_size: 6144 gnn_model: name: nnconv_old #modular_nnconv edge_feats: 51 @@ -373,7 +372,7 @@ model: coordConv: True pool_mode: 'avg' latent_size: 2 - spatial_size: 768 + spatial_size: 6144 cosmic_loss: node_loss: name: type @@ -395,13 +394,13 @@ post_processing: cluster_gnn_metrics+inter: ghost: True enable_physics_metrics: False - integrated_metrics: False + integrated_metrics: True store_method: single-file #per-iteration clusts: particles particles: particles_asis edge_pred: inter_edge_pred edge_index: inter_edge_index - node_pred: '' + node_pred: inter_node_pred target_col: 7 source_col: 6 chain: grappa_inter @@ -410,7 +409,7 @@ post_processing: cluster_gnn_metrics+shower: ghost: True enable_physics_metrics: False - integrated_metrics: False + integrated_metrics: True store_method: single-file #per-iteration clusts: shower_fragments particles: particles_asis @@ -425,7 +424,7 @@ post_processing: cluster_gnn_metrics+track: ghost: True enable_physics_metrics: False - integrated_metrics: False + integrated_metrics: True store_method: single-file #per-iteration clusts: track_fragments particles: particles_asis @@ -474,16 +473,18 @@ post_processing: # inter_threshold: 10 # ghost: True # primary_pdgs: [13, -13, -11, 11, 2212, 22, 211, -211] - # vertex_metrics: - # ghost: True - # attaching_threshold: 10 # to associate PPN points to a given primary - # inter_threshold: 20 # PPN candidates need to minimize difference between distance to closest 
primary and distance of voxels to closest primary - # other_primaries_threshold: 10 # Primaries too far from the other primaries will be ignored - # other_primaries_gamma_threshold: 100 # T_B for photon exclusively - # #fraction_bad_primaries: 0.6 - # pca_radius: 28 - # min_track_count: 2 - # min_voxel_count: 10 + vertex_metrics: + #ghost: True + store_method: single-file + spatial_size: 6144 + #attaching_threshold: 10 # to associate PPN points to a given primary + #inter_threshold: 20 # PPN candidates need to minimize difference between distance to closest primary and distance of voxels to closest primary + #other_primaries_threshold: 10 # Primaries too far from the other primaries will be ignored + #other_primaries_gamma_threshold: 100 # T_B for photon exclusively + #fraction_bad_primaries: 0.6 + #pca_radius: 28 + #min_track_count: 2 + #min_voxel_count: 10 # michel_reconstruction: # store_method: per-iteration # dbscan: False @@ -498,36 +499,19 @@ post_processing: # association_threshold: 5 # step: 16 # about 5cm size fragments # recompute_dx: True + analysis_tools_metrics: + ghost: True + store_method: single-file trainval: seed: 123 gpus: '0' - weight_prefix: /sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/weight_trash/snapshot - iterations: 775 #1798 #464 #232 + weight_prefix: /sdf/group/neutrino/ldomine/chain/me/mpvmpr_062022/weight_trash/snapshot + iterations: 606 #775 #1798 #464 #232 report_step: 1 checkpoint_step: 500 - #model_path: '/sdf/group/neutrino/ldomine/chain/me/v04/weights_graph_spice2/snapshot-999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_uresnet1/snapshot-39999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_uresnet_ppn2/snapshot-9499.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_graph_spice2/snapshot-112499.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_graph_spice2/snapshot-39999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_inter_clustering0/snapshot-43499.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_kinematics_clustering0/snapshot-44999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_inter_clustering1/snapshot-41999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_shower_clustering1/snapshot-65499.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_inter_clustering2/snapshot-44499.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_inter_clustering3/snapshot-50499.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_inter_clustering2/snapshot-44499.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_graph_spice4/snapshot-39999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_track_clustering3/snapshot-41499.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_082021/weight_inter_clustering4/snapshot-75999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/weight_graph_spice0/*.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/weight_graph_spice0/snapshot-91999.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/weight_gnn_shower0/*.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/weight_gnn_shower0/*.ckpt' - #model_path: 
'/sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/weight_gnn_track1/snapshot-95499.ckpt' - #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/weight_gnn_interaction0/snapshot-92999.ckpt' - model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/weight_gnn_interaction1/snapshot-98999.ckpt' - log_dir: /sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/log_metrics1 + #model_path: '/sdf/group/neutrino/ldomine/chain/me/mpvmpr_012022/weight_gnn_interaction1/snapshot-98999.ckpt' + model_path: /sdf/group/neutrino/drielsma/me/train/icarus/weights/full_chain/grappa_inter_transfer_primary/snapshot-22499.ckpt + log_dir: /sdf/group/neutrino/ldomine/chain/me/mpvmpr_062022/log_trash #metrics4 train: False debug: False minibatch_size: -1 diff --git a/mlreco/post_processing/metrics/__init__.py b/mlreco/post_processing/metrics/__init__.py index dcf69148..fbd15cec 100644 --- a/mlreco/post_processing/metrics/__init__.py +++ b/mlreco/post_processing/metrics/__init__.py @@ -19,3 +19,4 @@ from .duq_metrics import duq_metrics from .pid_metrics import pid_metrics from .doublet_metrics import doublet_metrics +from .analysis_tools_metrics import analysis_tools_metrics diff --git a/mlreco/post_processing/metrics/cluster_gnn_metrics.py b/mlreco/post_processing/metrics/cluster_gnn_metrics.py index 9d1e62f4..a418d816 100644 --- a/mlreco/post_processing/metrics/cluster_gnn_metrics.py +++ b/mlreco/post_processing/metrics/cluster_gnn_metrics.py @@ -241,15 +241,15 @@ def cluster_gnn_metrics(cfg, module_cfg, data_blob, res, logdir, iteration, ari, ami, sbd, pur, eff = clustering_metrics(clusts[data_idx][purity_mask], group_ids[purity_mask], node_pred[purity_mask]) #print(ari, pur, eff) primary_accuracy = -1. - high_purity = -1 - if node_pred_primary is not None: + high_purity_value = -1 + if high_purity and node_pred_primary is not None: primary_accuracy = np.count_nonzero(node_pred_primary == node_true_primary) / len(node_pred_primary) - high_purity = purity_mask.any() + high_purity_value = purity_mask.any() #print(data_idx, "primary accuracy", primary_accuracy, high_purity) # Store row_names = ('ari', 'ami', 'sbd', 'pur', 'eff', 'num_fragments', 'num_pix', 'num_true_clusts', 'num_pred_clusts', 'primary_accuracy', 'high_purity') row_values = (ari, ami, sbd, pur, eff, - n, num_pix, len(np.unique(group_ids)), len(np.unique(node_pred)), primary_accuracy, high_purity) + n, num_pix, len(np.unique(group_ids)), len(np.unique(node_pred)), primary_accuracy, high_purity_value) return row_names, row_values diff --git a/mlreco/post_processing/metrics/deghosting_metrics.py b/mlreco/post_processing/metrics/deghosting_metrics.py index ed035b70..4b6f4e01 100644 --- a/mlreco/post_processing/metrics/deghosting_metrics.py +++ b/mlreco/post_processing/metrics/deghosting_metrics.py @@ -116,6 +116,7 @@ def deghosting_metrics(cfg, module_cfg, data_blob, res, logdir, iteration, num_pred_pix_true, num_true_pix_pred = [], [] num_true_deghost_pix, num_original_pix = [], [] ghost_false_positives, ghost_true_positives = [], [] + ghost2nonghost = [] for c in range(num_classes): class_mask = label == c class_predictions = predictions[class_mask] @@ -138,6 +139,8 @@ def deghosting_metrics(cfg, module_cfg, data_blob, res, logdir, iteration, ghost_false_positives.append(np.count_nonzero(ghost_predictions[class_mask] == 1)) # Fraction of pixels in this class (correctly) predicted as nonghost ghost_true_positives.append(np.count_nonzero(ghost_predictions[class_mask] == 0)) + # Fraction of true ghost points predicted to belong to this 
class + ghost2nonghost.append(np.count_nonzero((label == 5) & (ghost_predictions == 0) & (predictions == c))) # confusion matrix # pixels predicted as nonghost + should be in class c, but predicted as c2 for c2 in range(num_classes): @@ -162,6 +165,8 @@ def deghosting_metrics(cfg, module_cfg, data_blob, res, logdir, iteration, row_values += ghost_false_positives row_names += ['ghost_true_positives_class%d' % c for c in range(num_classes)] row_values += ghost_true_positives + row_names += ['ghost2nonghost_class%d' % c for c in range(num_classes)] + row_values += ghost2nonghost elif deghosting_type == '6': ghost2ghost = (predictions[label == 5] == 5).sum() / float(num_ghost_points) diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index cac705d5..e2244c3d 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -124,6 +124,9 @@ def adapt_labels_knn(result, label_seg, label_clustering, assert true_mask.shape[0] == label_seg[0].shape[0] c3 = max(c2, batch_column+1) + indices = "2762 2763 2767 2769 4821 4822 4831 4832 4833 4834 4835 4844 4857 6617 12095 12096 12097".split() + indices = np.array([int(i) for i in indices]) + for i in range(len(label_seg)): coords = label_seg[i][:, :c3] label_c = [] @@ -156,8 +159,7 @@ def adapt_labels_knn(result, label_seg, label_clustering, # Include true nonghost voxels by default when they have the right semantic prediction true_pred = label_seg[i][batch_mask, -1] new_label_clustering[(true_pred < num_classes)] = make_float(batch_clustering) - new_label_clustering[(true_pred < num_classes) & (semantic_pred != true_pred)][:, c3:] = -1. - + new_label_clustering[(true_pred < num_classes) & (semantic_pred != true_pred), c3:] = -1. for semantic in unique(semantic_pred): semantic_mask = semantic_pred == semantic @@ -167,6 +169,7 @@ def adapt_labels_knn(result, label_seg, label_clustering, # mask = nonghost_mask & (label_seg[i][:, -1][batch_mask] == num_classes) & semantic_mask mask = nonghost_mask & (true_pred != semantic) & semantic_mask mask = where(mask)[0] + #print(semantic, np.intersect1d(mask.cpu().numpy(), indices)) # Now we need a special treatment for these, if there are any. if batch_coords[mask].shape[0] == 0: continue @@ -183,8 +186,8 @@ def adapt_labels_knn(result, label_seg, label_clustering, # compute Chebyshev distance between predicted and true # distances = torch.amax(torch.abs(X_true[neighbors[1], c1:c2] - X_pred[neighbors[0], c1:c2]), dim=1) - distances =compute_distances(X_true[d], X_pred) - #print(distances) + distances = compute_distances(X_true[d], X_pred) + select_mask = distances <= 1 tagged_voxels_count = select_mask.sum() @@ -202,8 +205,10 @@ def adapt_labels_knn(result, label_seg, label_clustering, # Now we save - need only to keep predicted nonghost voxels. 
label_c.append(new_label_clustering[nonghost_mask]) + #print(new_label_clustering[nonghost_mask][indices]) + #print(new_label_clustering[indices]) label_c = concatenate1(label_c) - + print("ignored 0 ", (label_c[:, -1] == -1).sum()) # Also run DBSCAN on track true clusters # We want to avoid true track clusters broken in two having the same cluster id label # Note: we assume we are adapting either cluster or kinematics labels, @@ -214,7 +219,6 @@ def adapt_labels_knn(result, label_seg, label_clustering, track_mask = label_c[:, -1] == track_label for batch_id in unique(coords[:, batch_column]): batch_mask = label_c[:, batch_column] == batch_id - #print(batch_id, 'before', torch.unique(label_c[track_mask & batch_mask, cluster_id_col], return_counts=True)) # We want to select cluster ids for track-like particles batch_clustering = label_clustering[i][(label_clustering[i][:, batch_column] == batch_id) & (label_clustering[i][:, -1] == track_label)] if len(batch_clustering) == 0: @@ -237,11 +241,7 @@ def adapt_labels_knn(result, label_seg, label_clustering, l = torch.tensor(l, device = label_c.device).float() l[l > -1] = l[l > -1] + cluster_count label_c[where(batch_mask)[0][cluster_mask], cluster_id_col] = l - #label_c[where(batch_mask)[0][cluster_mask], cluster_id_col+1] = l - cluster_count += int(l.max() + 1) - # print(batch_id, c, (batch_clustering[:, cluster_id_col] == c).sum(), cluster_mask.sum(), unique(l)) - #print(batch_id, c, l.unique(), (l == -1).sum()) - #print(batch_id, 'after', torch.unique(label_c[track_mask & batch_mask, cluster_id_col], return_counts=True)) + cluster_count = int(l.max() + 1) complete_label_clustering.append(label_c) From 1bf47ddeb8f40cc042529e6ac6d8fceeecc2ae0a Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 6 Jul 2022 23:15:03 -0700 Subject: [PATCH 18/30] Allows >1 semantic class to be part of GrapPA-shower/track in the full chain --- mlreco/models/layers/common/gnn_full_chain.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 511f1e66..69f9c844 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -33,19 +33,21 @@ def __init__(self, cfg): if self.enable_gnn_shower: self.grappa_shower = GNN(cfg, name='grappa_shower', batch_col=self.batch_col, coords_col=self.coords_col) grappa_shower_cfg = cfg.get('grappa_shower', {}) - self._shower_id = grappa_shower_cfg.get('base', {}).get('node_type', 0) + self._shower_ids = grappa_shower_cfg.get('base', {}).get('node_type', 0) self._shower_use_true_particles = grappa_shower_cfg.get('use_true_particles', False) + if not isinstance(self._shower_ids, list): self._shower_ids = [self._shower_ids] if self.enable_gnn_track: self.grappa_track = GNN(cfg, name='grappa_track', batch_col=self.batch_col, coords_col=self.coords_col) grappa_track_cfg = cfg.get('grappa_track', {}) - self._track_id = grappa_track_cfg.get('base', {}).get('node_type', 1) - self._track_use_true_particles =grappa_track_cfg.get('use_true_particles', False) + self._track_ids = grappa_track_cfg.get('base', {}).get('node_type', 1) + self._track_use_true_particles = grappa_track_cfg.get('use_true_particles', False) + if not isinstance(self._track_ids, list): self._track_ids = [self._track_ids] if self.enable_gnn_particle: self.grappa_particle = GNN(cfg, name='grappa_particle', batch_col=self.batch_col, coords_col=self.coords_col) grappa_particle_cfg = 
cfg.get('grappa_particle', {}) - self._particle_ids = grappa_particle_cfg.get('base', {}).get('node_type', [0,1]) + self._particle_ids = grappa_particle_cfg.get('base', {}).get('node_type', [0,1,2,3]) self._particle_use_true_particles = grappa_particle_cfg.get('use_true_particles', False) if self.enable_gnn_inter: @@ -187,7 +189,7 @@ def run_fragment_gnns(self, result, input): # Run shower GrapPA: merges shower fragments into shower instances em_mask, kwargs = self.get_extra_gnn_features(fragments, frag_seg, - [self._shower_id], + self._shower_ids, input, result, use_ppn=self.use_ppn_in_gnn, @@ -218,7 +220,7 @@ def run_fragment_gnns(self, result, input): # Run track GrapPA: merges tracks fragments into track instances track_mask, kwargs = self.get_extra_gnn_features(fragments, frag_seg, - [self._track_id], + self._track_ids, input, result, use_ppn=self.use_ppn_in_gnn, @@ -308,7 +310,8 @@ def get_all_particles(self, frag_result, result, input): 'shower_group_pred', 'shower_fragments') - mask &= (frag_seg != self._shower_id) + for c in self._shower_ids: + mask &= (frag_seg != c) # Append one particle per track group if self.enable_gnn_track: self.select_particle_in_group(result, counts, b, particles, @@ -317,7 +320,8 @@ def get_all_particles(self, frag_result, result, input): 'track_group_pred', 'track_fragments') - mask &= (frag_seg != self._track_id) + for c in self._track_ids: + mask &= (frag_seg != c) # Append one particle per fragment that is not already accounted for particles.extend(fragments[mask]) From 9f5a57bc292af16ec569466d42034b2151776b15 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 7 Jul 2022 13:25:56 -0700 Subject: [PATCH 19/30] Changed default semantic precedence in voxel duplicate removal from [0,2,1,3,4] to [1,2,0,3,4] --- mlreco/iotools/libparsers/clean_data.py | 2 +- mlreco/iotools/parsers/clean_data.py | 2 +- mlreco/utils/groups.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlreco/iotools/libparsers/clean_data.py b/mlreco/iotools/libparsers/clean_data.py index f7508f67..e654a030 100644 --- a/mlreco/iotools/libparsers/clean_data.py +++ b/mlreco/iotools/libparsers/clean_data.py @@ -36,7 +36,7 @@ def clean_data(grp_voxels, grp_data, img_voxels, img_data, meta): img_data = img_data[perm] # step 2: remove duplicates - sel1 = filter_duplicate_voxels_ref(grp_voxels, grp_data[:,-1],meta, usebatch=True, precedence=[0,2,1,3,4]) + sel1 = filter_duplicate_voxels_ref(grp_voxels, grp_data[:,-1],meta, usebatch=True) inds1 = np.where(sel1)[0] grp_voxels = grp_voxels[inds1,:] grp_data = grp_data[inds1] diff --git a/mlreco/iotools/parsers/clean_data.py b/mlreco/iotools/parsers/clean_data.py index f7508f67..70d5ca26 100644 --- a/mlreco/iotools/parsers/clean_data.py +++ b/mlreco/iotools/parsers/clean_data.py @@ -36,7 +36,7 @@ def clean_data(grp_voxels, grp_data, img_voxels, img_data, meta): img_data = img_data[perm] # step 2: remove duplicates - sel1 = filter_duplicate_voxels_ref(grp_voxels, grp_data[:,-1],meta, usebatch=True, precedence=[0,2,1,3,4]) + sel1 = filter_duplicate_voxels_ref(grp_voxels, grp_data[:,-1], meta, usebatch=True) inds1 = np.where(sel1)[0] grp_voxels = grp_voxels[inds1,:] grp_data = grp_data[inds1] diff --git a/mlreco/utils/groups.py b/mlreco/utils/groups.py index 955cec45..c17693f3 100644 --- a/mlreco/utils/groups.py +++ b/mlreco/utils/groups.py @@ -78,7 +78,7 @@ def filter_duplicate_voxels(data, usebatch=True): return ret -def filter_duplicate_voxels_ref(data, reference, meta, usebatch=True, precedence=[2,1,0,3,4]): +def 
filter_duplicate_voxels_ref(data, reference, meta, usebatch=True, precedence=[1,2,0,3,4]): """ return array that will filter out duplicate voxels Sort with respect to a reference and following the specified precedence order From 9118685f95cd19f0a7c56bc20755b8077eefaf28 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 7 Jul 2022 16:33:04 -0700 Subject: [PATCH 20/30] Added an option (true by default) to enforce PID predictions to match semantics --- mlreco/iotools/libparsers/clean_data.py | 49 ------------------- mlreco/models/layers/common/gnn_full_chain.py | 13 +++++ .../layers/gnn/losses/node_kinematics.py | 3 ++ mlreco/utils/deghosting.py | 2 +- 4 files changed, 17 insertions(+), 50 deletions(-) delete mode 100644 mlreco/iotools/libparsers/clean_data.py diff --git a/mlreco/iotools/libparsers/clean_data.py b/mlreco/iotools/libparsers/clean_data.py deleted file mode 100644 index e654a030..00000000 --- a/mlreco/iotools/libparsers/clean_data.py +++ /dev/null @@ -1,49 +0,0 @@ -import numpy as np -from mlreco.utils.groups import filter_duplicate_voxels, filter_duplicate_voxels_ref, filter_nonimg_voxels - - -def clean_data(grp_voxels, grp_data, img_voxels, img_data, meta): - """ - Helper that factorizes common cleaning operations required - when trying to match true sparse3d and cluster3d data products. - - 1) lexicographically sort group data (images are lexicographically sorted) - - 2) remove voxels from group data that are not in image - - 3) choose only one group per voxel (by lexicographic order) - - Parameters - ---------- - grp_voxels: np.ndarray - grp_data: np.ndarray - img_voxels: np.ndarray - img_data: np.ndarray - meta: larcv::Meta - - Returns - ------- - grp_voxels: np.ndarray - grp_data: np.ndarray - """ - # step 1: lexicographically sort group data - perm = np.lexsort(grp_voxels.T) - grp_voxels = grp_voxels[perm,:] - grp_data = grp_data[perm] - - perm = np.lexsort(img_voxels.T) - img_voxels = img_voxels[perm,:] - img_data = img_data[perm] - - # step 2: remove duplicates - sel1 = filter_duplicate_voxels_ref(grp_voxels, grp_data[:,-1],meta, usebatch=True) - inds1 = np.where(sel1)[0] - grp_voxels = grp_voxels[inds1,:] - grp_data = grp_data[inds1] - - # step 3: remove voxels not in image - sel2 = filter_nonimg_voxels(grp_voxels, img_voxels, usebatch=False) - inds2 = np.where(sel2)[0] - grp_voxels = grp_voxels[inds2,:] - grp_data = grp_data[inds2] - return grp_voxels, grp_data diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 69f9c844..626e6ee0 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -57,6 +57,9 @@ def __init__(self, cfg): self._inter_use_true_particles = grappa_inter_cfg.get('use_true_particles', False) self.inter_source_col = cfg.get('grappa_inter_loss', {}).get('edge_loss', {}).get('source_col', 6) self._inter_use_shower_primary = grappa_inter_cfg.get('use_shower_primary', True) + self._inter_enforce_semantics = grappa_inter_cfg.get('enforce_semantics', True) + self._inter_enforce_semantics_shape = grappa_inter_cfg.get('enforce_semantics_shape', (4,5)) + self._inter_enforce_semantics_map = grappa_inter_cfg.get('enforce_semantics_map', [[0,0,1,1,1,2,3],[0,1,2,3,4,1,1]]) if self.enable_gnn_kinematics: self.grappa_kinematics = GNN(cfg, name='grappa_kinematics', batch_col=self.batch_col, coords_col=self.coords_col) @@ -450,6 +453,16 @@ def run_particle_gnns(self, result, input, frag_result): output_keys, kwargs) + # If requested, enforce that 
particle PID predictions are compatible with semantics, + # i.e. set logits to -inf if they belong to incompatible PIDs + if self._inter_enforce_semantics: + sem_pid_logic = -float('inf')*torch.ones(self._inter_enforce_semantics_shape, dtype=input[0].dtype, device=input[0].device) + sem_pid_logic[self._inter_enforce_semantics_map] = 0. + pid_logits = result['node_pred_type'] + for i in range(len(pid_logits)): + for b in range(len(pid_logits[i])): + pid_logits[i][b] += sem_pid_logic[part_seg[part_batch_ids==b]] + # --- # 4. GNN for particle flow & kinematics # --- diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index be5b6058..1469b182 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -175,6 +175,9 @@ def forward(self, out, types): # Do not apply loss to nodes labeled -1 (unknown class) valid_mask = node_assn_type > -1 + # Do not apply loss if the logit corresponding to the true class is -inf (forbidden prediction) + valid_mask &= node_pred_type[np.arange(len(node_assn_type)),node_assn_type] != -float('inf') + # If high purity is requested, do not include broken particle in the loss if self.type_high_purity: group_ids = get_cluster_label(labels, clusts, column=self.group_col) diff --git a/mlreco/utils/deghosting.py b/mlreco/utils/deghosting.py index e2244c3d..5540151b 100644 --- a/mlreco/utils/deghosting.py +++ b/mlreco/utils/deghosting.py @@ -208,7 +208,7 @@ def adapt_labels_knn(result, label_seg, label_clustering, #print(new_label_clustering[nonghost_mask][indices]) #print(new_label_clustering[indices]) label_c = concatenate1(label_c) - print("ignored 0 ", (label_c[:, -1] == -1).sum()) + #print("ignored 0 ", (label_c[:, -1] == -1).sum()) # Also run DBSCAN on track true clusters # We want to avoid true track clusters broken in two having the same cluster id label # Note: we assume we are adapting either cluster or kinematics labels, From a71a410edee8f3c4e60cb5220d6a84cebf274699 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 7 Jul 2022 16:35:19 -0700 Subject: [PATCH 21/30] Small bug fix in PID prediction loss --- mlreco/models/layers/gnn/losses/node_kinematics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index 1469b182..325c79b8 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -176,7 +176,7 @@ def forward(self, out, types): valid_mask = node_assn_type > -1 # Do not apply loss if the logit corresponding to the true class is -inf (forbidden prediction) - valid_mask &= node_pred_type[np.arange(len(node_assn_type)),node_assn_type] != -float('inf') + valid_mask &= (node_pred_type[np.arange(len(node_assn_type)),node_assn_type] != -float('inf')).detach().cpu().numpy() # If high purity is requested, do not include broken particle in the loss if self.type_high_purity: From 3b3e79ee9d141211a03abcb1c13e157c520f27f0 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 7 Jul 2022 16:59:05 -0700 Subject: [PATCH 22/30] Add option to only apply vertex loss to particles not broken by reco (default to true) --- .../layers/gnn/losses/node_kinematics.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index 
325c79b8..dad0e6d6 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -118,7 +118,8 @@ def __init__(self, loss_config, batch_col=0, coords_col=(1, 4)): self.use_anchor_points = loss_config.get('use_anchor_points', False) self.max_vertex_distance = loss_config.get('max_vertex_distance', 50) self.type_loss_weight = loss_config.get('type_loss_weight', 1.0) - self.type_high_purity = loss_config.get('type_high_purity', False) + self.type_high_purity = loss_config.get('type_high_purity', True) + self.vtx_high_purity = loss_config.get('vtx_high_purity', True) self.compute_momentum_switch = loss_config.get('compute_momentum', True) @@ -231,6 +232,7 @@ def forward(self, out, types): node_pred_vtx = out['node_pred_vtx'][i][j] if not node_pred_vtx.shape[0]: continue + input_node_features = out['input_node_features'][i][j] # Predictions are shifts w.r.t the barycenter of each cluster # anchors = [] @@ -242,10 +244,21 @@ def forward(self, out, types): node_x_vtx = get_cluster_label(labels, clusts, column=self.vtx_col) node_y_vtx = get_cluster_label(labels, clusts, column=self.vtx_col+1) node_z_vtx = get_cluster_label(labels, clusts, column=self.vtx_col+2) + positives = get_cluster_label(labels, clusts, column=self.vtx_positives_col) node_assn_vtx = torch.tensor(np.stack([node_x_vtx, node_y_vtx, node_z_vtx], axis=1), dtype=torch.float, device=node_pred_vtx.device, requires_grad=False) + # If high purity is requested, do not include broken particle in the loss + if self.vtx_high_purity: + group_ids = get_cluster_label(labels, clusts, column=self.group_col) + _, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True) + valid_mask = np.where(cnts[inv] == 1)[0] + node_pred_vtx = node_pred_vtx[valid_mask] + node_assn_vtx = node_assn_vtx[valid_mask] + positives = positives[valid_mask] + input_node_features = input_node_features[valid_mask] + if self.normalize_vtx_label: node_assn_vtx = node_assn_vtx/self.spatial_size @@ -254,7 +267,6 @@ def forward(self, out, types): else: good_index = torch.all( (0 <= node_assn_vtx) & (node_assn_vtx <= self.spatial_size), dim=1) - positives = get_cluster_label(labels, clusts, column=self.vtx_positives_col) # Take the max for each cluster - e.g. for a shower, the primary fragment only # is marked as primary particle, so taking majority count would eliminate the shower # from primary particles for vertex identification purpose. @@ -265,7 +277,7 @@ def forward(self, out, types): positives = torch.tensor(positives, dtype=torch.long, device=node_pred_vtx.device, requires_grad=False) - reshaped = out['input_node_features'][i][j][:, 19:25][good_index & positives.bool()].view(-1, 2, 3) + reshaped = input_node_features[:, 19:25][good_index & positives.bool()].view(-1, 2, 3) # Do not apply loss to nodes labeled -1 (unknown class) node_mask = good_index & positives.bool() & (positives >= 0.) 
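For reference, the high-purity selection introduced in the patch above keeps only the clusters whose group id appears exactly once, i.e. particles that the reconstruction did not break into several pieces. A minimal standalone sketch of that mask, using the same numpy pattern as the diff (the group ids below are made up purely for illustration):

import numpy as np

# Hypothetical group ids for six reconstructed clusters; groups 1 and 3 are each split in two
group_ids = np.array([0, 1, 1, 2, 3, 3])
_, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True)
valid_mask = np.where(cnts[inv] == 1)[0]  # -> array([0, 3]): only unbroken particles are kept

The resulting indices are then used to slice the vertex predictions, vertex labels, primary flags and input node features, so the vertex loss is only evaluated on particles that were reconstructed as a single cluster.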
From cb1a9cecad32486a4e56279c86aeec8d5e5bba58 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Sat, 9 Jul 2022 09:09:12 -0700 Subject: [PATCH 23/30] Fixed GNN node_kinematics loss bugs, added high purity option for momentum regression --- .../layers/gnn/losses/node_kinematics.py | 54 +++++++++++-------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index dad0e6d6..c1487055 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -119,6 +119,7 @@ def __init__(self, loss_config, batch_col=0, coords_col=(1, 4)): self.max_vertex_distance = loss_config.get('max_vertex_distance', 50) self.type_loss_weight = loss_config.get('type_loss_weight', 1.0) self.type_high_purity = loss_config.get('type_high_purity', True) + self.momentum_high_purity = loss_config.get('momentum_high_purity', True) self.vtx_high_purity = loss_config.get('vtx_high_purity', True) self.compute_momentum_switch = loss_config.get('compute_momentum', True) @@ -167,30 +168,30 @@ def forward(self, out, types): # Increment the type loss, balance classes if requested if compute_type: - # Get the class labels and true momenta from the specified columns + # Get the class labels and true type from the specified columns node_pred_type = out['node_pred_type'][i][j] if not node_pred_type.shape[0]: continue node_assn_type = get_cluster_label(labels, clusts, column=self.type_col) # Do not apply loss to nodes labeled -1 (unknown class) - valid_mask = node_assn_type > -1 + valid_mask_type = node_assn_type > -1 # Do not apply loss if the logit corresponding to the true class is -inf (forbidden prediction) - valid_mask &= (node_pred_type[np.arange(len(node_assn_type)),node_assn_type] != -float('inf')).detach().cpu().numpy() + valid_mask_type &= (node_pred_type[np.arange(len(node_assn_type)),node_assn_type] != -float('inf')).detach().cpu().numpy() # If high purity is requested, do not include broken particle in the loss if self.type_high_purity: group_ids = get_cluster_label(labels, clusts, column=self.group_col) _, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True) - valid_mask &= (cnts[inv] == 1) - valid_mask = np.where(valid_mask)[0] + valid_mask_type &= (cnts[inv] == 1) + valid_mask_type = np.where(valid_mask_type)[0] # Compute loss - if len(valid_mask): + if len(valid_mask_type): node_assn_type = torch.tensor(node_assn_type, dtype=torch.long, device=node_pred_type.device, requires_grad=False) - node_pred_type = node_pred_type[valid_mask] - node_assn_type = node_assn_type[valid_mask] + node_pred_type = node_pred_type[valid_mask_type] + node_assn_type = node_assn_type[valid_mask_type] if self.balance_classes: vals, counts = torch.unique(node_assn_type, return_counts=True) @@ -205,7 +206,7 @@ def forward(self, out, types): type_loss += self.type_loss_weight * float(loss) # Increment the number of nodes - n_clusts_type += len(valid_mask) + n_clusts_type += len(valid_mask_type) # Increment the momentum loss if compute_momentum: @@ -216,10 +217,19 @@ def forward(self, out, types): node_assn_p = get_momenta_label(labels, clusts, column=self.momentum_col) # Do not apply loss to nodes labeled -1 (unknown class) - node_mask = torch.nonzero(node_assn_p > -1, as_tuple=True)[0] - if len(node_mask): - node_pred_p = node_pred_p[node_mask] - node_assn_p = node_assn_p[node_mask] + valid_mask_p = node_assn_p.detach().cpu().numpy() > -1 + + # If 
high purity is requested, do not include broken particle in the loss + if self.momentum_high_purity: + group_ids = get_cluster_label(labels, clusts, column=self.group_col) + _, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True) + valid_mask_p &= (cnts[inv] == 1) + valid_mask_p = np.where(valid_mask_p)[0] + + # Do not apply loss to nodes labeled -1 (unknown class) + if len(valid_mask_p): + node_pred_p = node_pred_p[valid_mask_p] + node_assn_p = node_assn_p[valid_mask_p] loss = self.reg_lossfn(node_pred_p.squeeze(), node_assn_p.float()) total_loss += loss @@ -251,13 +261,13 @@ def forward(self, out, types): # If high purity is requested, do not include broken particle in the loss if self.vtx_high_purity: - group_ids = get_cluster_label(labels, clusts, column=self.group_col) - _, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True) - valid_mask = np.where(cnts[inv] == 1)[0] - node_pred_vtx = node_pred_vtx[valid_mask] - node_assn_vtx = node_assn_vtx[valid_mask] - positives = positives[valid_mask] - input_node_features = input_node_features[valid_mask] + group_ids = get_cluster_label(labels, clusts, column=self.group_col) + _, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True) + valid_mask_vtx = np.where(cnts[inv] == 1)[0] + node_pred_vtx = node_pred_vtx[valid_mask_vtx] + node_assn_vtx = node_assn_vtx[valid_mask_vtx] + positives = positives[valid_mask_vtx] + input_node_features = input_node_features[valid_mask_vtx] if self.normalize_vtx_label: node_assn_vtx = node_assn_vtx/self.spatial_size @@ -318,10 +328,10 @@ def forward(self, out, types): # Compute the accuracy of assignment (fraction of correctly assigned nodes) # and the accuracy of momentum estimation (RMS relative residual) - if compute_type: + if compute_type and len(valid_mask_type): type_acc += float(torch.sum(torch.argmax(node_pred_type, dim=1) == node_assn_type)) - if compute_momentum: + if compute_momentum and len(valid_mask_p): p_acc += float(torch.sum(1.- torch.abs(node_pred_p.squeeze()-node_assn_p)/node_assn_p)) # 1-MAPE if compute_vtx and node_pred_vtx[good_index].shape[0]: From 15ea93c9d72bc2deb7ca6d252096a711658e262e Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Fri, 15 Jul 2022 09:17:20 -0700 Subject: [PATCH 24/30] Moved to dictionary-based parser configurations, added support for parser parameters --- mlreco/iotools/datasets.py | 79 +- mlreco/iotools/parsers/__init__.py | 120 ++- mlreco/iotools/parsers/clean_data.py | 13 +- mlreco/iotools/parsers/cluster.py | 769 +++--------------- mlreco/iotools/parsers/misc.py | 148 +--- mlreco/iotools/parsers/particles.py | 418 ++++------ mlreco/iotools/parsers/sparse.py | 283 ++----- mlreco/main_funcs.py | 8 +- mlreco/models/layers/common/gnn_full_chain.py | 4 +- mlreco/post_processing/metrics/__init__.py | 2 +- mlreco/utils/cluster/fragmenter.py | 2 +- mlreco/utils/gnn/data.py | 6 +- mlreco/utils/groups.py | 6 +- mlreco/utils/metrics.py | 2 +- 14 files changed, 511 insertions(+), 1349 deletions(-) diff --git a/mlreco/iotools/datasets.py b/mlreco/iotools/datasets.py index dad9eaa5..72a8be4e 100644 --- a/mlreco/iotools/datasets.py +++ b/mlreco/iotools/datasets.py @@ -1,4 +1,4 @@ -import os, glob +import os, glob, inspect import numpy as np from torch.utils.data import Dataset import mlreco.iotools.parsers @@ -19,14 +19,15 @@ def __init__(self, data_schema, data_keys, limit_num_files=0, limit_num_samples= Parameters ---------- - data_dirs : list - a list of data directories to find files (up to 10 files read from 
each dir) data_schema : dict - a dictionary of string <=> list of strings. The key is a unique name of a data chunk in a batch. - The list must be length >= 2: the first string names the parser function, and the rest of strings - identifies data keys in the input files. + A dictionary of (string, dictionary) pairs. The key is a unique name of + a data chunk in a batch and the associated dictionary must include: + - parser: name of the parser + - args: (key, value) pairs that correspond to parser argument names and their values + The nested dictionaries can be replaced by lists, in which case + they will be considered as parser argument values, in order. data_keys : list - a list of strings that is required to be present in the filename + a list of strings that is required to be present in the file paths limit_num_files : int an integer limiting number of files to be taken per data directory limit_num_samples : int @@ -38,7 +39,6 @@ def __init__(self, data_schema, data_keys, limit_num_files=0, limit_num_samples= """ # Create file list - #self._files = _list_files(data_dirs,data_key,limit_num_files) self._files = [] for key in data_keys: fs = glob.glob(key) @@ -58,17 +58,40 @@ def __init__(self, data_schema, data_keys, limit_num_files=0, limit_num_samples= self._data_parsers = [] self._trees = {} for key, value in data_schema.items(): - if len(value) < 2: - print('iotools.datasets.schema contains a key %s with list length < 2!' % key) - raise ValueError - if not hasattr(mlreco.iotools.parsers,value[0]): - print('The specified parser name %s does not exist!' % value[0]) + # If the schema is a list, make it a dictionary, warn of deprecation + if isinstance(value, list): + from warnings import warn + warn('Deprecated: Using a list to specify a schema is deprecated, move to using dictionaries', DeprecationWarning) + if len(value) < 2: + print(f'iotools.datasets.schema contains a key %s with list length < 2!' % key) + raise ValueError + value = {'parser':value[0], 'args':value[1:]} + + # Identify the parser and its parameter names, convert args list to kwargs, if needed + assert 'parser' in value, 'A parser needs to be specified for %s' % key + if not hasattr(mlreco.iotools.parsers, value['parser']): + print('The specified parser name %s does not exist!' 
% value['parser']) + assert 'args' in value, 'Parser arguments must be provided for %s' % key + fn = getattr(mlreco.iotools.parsers, value['parser']) + keys = list(inspect.signature(fn).parameters.keys()) + if isinstance(value['args'], list): + if len(keys) == 1 and 'event_list' in keys[0]: + value['args'] = {keys[0]: value['args']} # Don't unroll if a list is expected + else: + value['args'] = {keys[i]: value['args'][i] for i in range(len(value['args']))} + assert isinstance(value['args'], dict), 'Parser arguments must be a list or dictionary for %s' % key + for k in value['args'].keys(): + assert k in keys, 'Argument %s does not exist in parser %s' % (k, value['parser']) + + # Append data key and parsers self._data_keys.append(key) - self._data_parsers.append((getattr(mlreco.iotools.parsers,value[0]),value[1:])) - for data_key in value[1:]: - if isinstance(data_key, dict): data_key = list(data_key.values())[0] - if data_key in self._trees: continue - self._trees[data_key] = None + self._data_parsers.append((getattr(mlreco.iotools.parsers,value['parser']), value['args'])) + for arg_name, data_key in value['args'].items(): + if 'event' not in arg_name: continue + if 'event_list' not in arg_name: data_key = [data_key] + for k in data_key: + if k not in self._trees: self._trees[k] = None + self._data_keys.append('index') # Prepare TTrees and load files @@ -99,7 +122,7 @@ def __init__(self, data_schema, data_keys, limit_num_files=0, limit_num_samples= if len(removed): print('WARNING: ignoring some of specified events in event_list as they do not exist in the sample.') print(removed) - self._event_list=event_list[np.where(event_list < self._entries)] + self._event_list = event_list[np.where(event_list < self._entries)] self._entries = len(self._event_list) if skip_event_list is not None: @@ -174,18 +197,24 @@ def __getitem__(self,idx): for f in self._files: chain.AddFile(f) self._trees[key] = chain self._trees_ready=True + # Move the event pointer for tree in self._trees.values(): tree.GetEntry(event_idx) + # Create data chunks result = {} - for index, (parser, datatree_keys) in enumerate(self._data_parsers): - if isinstance(datatree_keys[0], dict): - data = [(getattr(self._trees[list(d.values())[0]], list(d.values())[0] + '_branch'), list(d.keys())[0]) for d in datatree_keys] - else: - data = [getattr(self._trees[key], key + '_branch') for key in datatree_keys] + for index, (parser, args) in enumerate(self._data_parsers): + kwargs = {} + for k, v in args.items(): + if 'event_list' in k: + kwargs[k] = [getattr(self._trees[vi], vi+'_branch') for vi in v] + elif 'event' in k: + kwargs[k] = getattr(self._trees[v], v+'_branch') + else: + kwargs[k] = v name = self._data_keys[index] - result[name] = parser(data) + result[name] = parser(**kwargs) result['index'] = event_idx return result diff --git a/mlreco/iotools/parsers/__init__.py b/mlreco/iotools/parsers/__init__.py index 6ea0fdd3..04e68617 100644 --- a/mlreco/iotools/parsers/__init__.py +++ b/mlreco/iotools/parsers/__init__.py @@ -8,51 +8,36 @@ List of existing parsers ======================== -.. csv-table:: Cluster parsers +.. 
csv-table:: Sparse parsers :header: Parser name, Description - ``parse_cluster2d``,Retrieved 2D cluster tensors with limited information - ``parse_cluster3d``, Retrieve a 3D clusters tensor - ``parse_cluster3d_full``, Retrieve a 3D clusters tensor with full features list - ``parse_cluster3d_types``, Retrieve a 3D clusters tensor and PDG information - ``parse_cluster3d_kinematics``, Retrieve a 3D clusters tensor with kinematics features - ``parse_cluster3d_kinematics_clean``, Similar to parse_cluster3d_kinematics, but removes overlap voxels. - ``parse_cluster3d_clean_full``, - ``parse_cluster3d_scales``, Retrieves clusters tensors at different spatial sizes. + ``parse_sparse2d``, Retrieve sparse tensor input from larcv::EventSparseTensor2D object + ``parse_sparse3d``, Retrieve sparse tensor input from larcv::EventSparseTensor3D object + ``parse_sparse3d_ghost``, Takes semantics tensor and turns its labels into ghost labels -.. csv-table:: Sparse parsers +.. csv-table:: Cluster parsers :header: Parser name, Description - ``parse_sparse2d_scn``, - ``parse_sparse3d_scn``, Retrieve sparse tensor input from larcv::EventSparseTensor3D object - ``parse_sparse3d``, Return it in concatenated form (shape (N, 3+C)) - ``parse_weights``, Generate weights from larcv::EventSparseTensor3D and larcv::Particle list - ``parse_sparse3d_clean``, - ``parse_sparse3d_scn_scales``, Retrieves sparse tensors at different spatial sizes. - + ``parse_cluster2d``, Retrieve list of sparse tensor input from larcv::EventClusterPixel2D + ``parse_cluster3d``, Retrieve list of sparse tensor input from larcv::EventClusterVoxel3D .. csv-table:: Particle parsers :header: Parser name, Description - ``parse_particle_singlep_pdg``, Get each true particle's PDG code. - ``parse_particle_singlep_einit``, Get each true particle's true initial energy. - ``parse_particle_asis``, Copy construct & return an array of larcv::Particle - ``parse_neutrino_asis``, Copy construct & return an array of larcv::Neutrino - ``parse_particle_coords``, Returns particle coordinates (start and end) and start time. - ``parse_particle_points``, Retrieve particles ground truth points tensor - ``parse_particle_points_with_tagging``, Same as `parse_particle_points` including start vs end point tagging. - ``parse_particle_graph``, Parse larcv::EventParticle to construct edges between particles (i.e. clusters) - ``parse_particle_graph_corrected``, Also removes edges to clusters that have a zero pixel count. + ``parse_particle_asis``, Retrieve array of larcv::Particle + ``parse_neutrino_asis``, Retrieve array of larcv::Neutrino + ``parse_particle_points``, Retrieve array of larcv::Particle ground truth points tensor + ``parse_particle_coords``, Retrieve array of larcv::Particle coordinates (start and end) and start time + ``parse_particle_graph``, Construct edges between particles (i.e. clusters) from larcv::EventParticle + ``parse_particle_singlep_pdg``, Get a single larcv::Particle PDG code + ``parse_particle_singlep_einit``, Get a single larcv::Particle initial energy - -.. csv-table:: Misc parsers +.. 
csv-table:: Miscellaneous parsers :header: Parser name, Description ``parse_meta3d``, Get the meta information to translate into real world coordinates (3D) ``parse_meta2d``, Get the meta information to translate into real world coordinates (2D) - ``parse_dbscan``, Create dbscan tensor ``parse_run_info``, Parse run info (run, subrun, event number) - ``parse_tensor3d``, Retrieve larcv::EventSparseTensor3D as a dense numpy array What does a typical parser configuration look like? @@ -63,17 +48,19 @@ schema: input_data: - - parse_sparse3d_scn - - sparse3d_reco - - sparse3d_reco_chi2 + parser: parse_sparse3d + args: + sparse_event_list: + - sparse3d_reco + - sparse3d_reco_chi2 Then `input_data` is an arbitrary name chosen by the user, which will be the key to -access the output of the parser ``parse_sparse3d_scn`` (first element of -the bullet list). The rest of the bullet list are ROOT TTree names that will be -fed to the parser. In this example, the parser will be called with a list of 2 elements: -a ``larcv::EventSparseTensor3D`` coming from the ROOT TTree -``sparse3d_reco``, and another one coming from the TTree -``sparse3d_reco_chi2``. +access the output of the parser ``parse_sparse3d``. The parser arguments can be +ROOT TTree names that will be fed to the parser or parser arguments. The arguments +can either be passed as an ordered list (following the order of the function arguments) or +a dictionary of (argument name, value) pairs. In this example, the parser will be called +with a list of 2 objects: A ``larcv::EventSparseTensor3D`` coming from the ROOT TTree +``sparse3d_reco``, and another one coming from the TTree ``sparse3d_reco_chi2``. How do I know what a parser requires? ===================================== @@ -83,45 +70,36 @@ ========================================= To be completed. 
""" -from mlreco.iotools.parsers.misc import ( - parse_meta2d, - parse_meta3d, - parse_dbscan, - parse_run_info, - parse_tensor3d + +from mlreco.iotools.parsers.sparse import ( + parse_sparse2d, + parse_sparse3d, + parse_sparse3d_ghost, + parse_sparse2d_scn, # Deprecated + parse_sparse3d_scn # Depreacted +) + +from mlreco.iotools.parsers.cluster import ( + parse_cluster2d, + parse_cluster3d, + parse_cluster3d_kinematics_clean, # Deprecated + parse_cluster3d_clean_full # Depreacted ) from mlreco.iotools.parsers.particles import ( - parse_particle_singlep_pdg, - parse_particle_singlep_einit, parse_particle_asis, parse_neutrino_asis, - parse_particle_coords, parse_particle_points, - parse_particle_points_with_tagging, + parse_particle_coords, parse_particle_graph, - parse_particle_graph_corrected -) - -from mlreco.iotools.parsers.sparse import ( - parse_sparse2d_scn, - parse_sparse3d_scn, - parse_sparse3d_ghost, - parse_sparse3d, - parse_sparse3d_scn_scales, - parse_sparse3d_clean, - parse_weights + parse_particle_singlep_pdg, + parse_particle_singlep_einit, + parse_particle_points_with_tagging, # Deprecated + parse_particle_graph_corrected # Deprecated ) -from mlreco.iotools.parsers.cluster import ( - parse_cluster2d, - parse_cluster3d, - parse_cluster3d_full, - parse_cluster3d_types, - parse_cluster3d_kinematics, - parse_cluster3d_kinematics_clean, - parse_cluster3d_clean_full_extended, - parse_cluster3d_full_extended, - parse_cluster3d_clean_full, - parse_cluster3d_scales +from mlreco.iotools.parsers.misc import ( + parse_meta2d, + parse_meta3d, + parse_run_info ) diff --git a/mlreco/iotools/parsers/clean_data.py b/mlreco/iotools/parsers/clean_data.py index 70d5ca26..6bf0cc5f 100644 --- a/mlreco/iotools/parsers/clean_data.py +++ b/mlreco/iotools/parsers/clean_data.py @@ -1,8 +1,8 @@ import numpy as np -from mlreco.utils.groups import filter_duplicate_voxels, filter_duplicate_voxels_ref, filter_nonimg_voxels +from mlreco.utils.groups import filter_duplicate_voxels_ref, filter_nonimg_voxels -def clean_data(grp_voxels, grp_data, img_voxels, img_data, meta): +def clean_sparse_data(grp_voxels, grp_data, img_voxels, img_data, meta, precedence): """ Helper that factorizes common cleaning operations required when trying to match true sparse3d and cluster3d data products. 
@@ -20,13 +20,14 @@ def clean_data(grp_voxels, grp_data, img_voxels, img_data, meta): img_voxels: np.ndarray img_data: np.ndarray meta: larcv::Meta + precedence: list Returns ------- grp_voxels: np.ndarray grp_data: np.ndarray """ - # step 1: lexicographically sort group data + # Step 1: lexicographically sort group data perm = np.lexsort(grp_voxels.T) grp_voxels = grp_voxels[perm,:] grp_data = grp_data[perm] @@ -35,13 +36,13 @@ def clean_data(grp_voxels, grp_data, img_voxels, img_data, meta): img_voxels = img_voxels[perm,:] img_data = img_data[perm] - # step 2: remove duplicates - sel1 = filter_duplicate_voxels_ref(grp_voxels, grp_data[:,-1], meta, usebatch=True) + # Step 2: remove duplicates + sel1 = filter_duplicate_voxels_ref(grp_voxels, grp_data[:,-1], meta, usebatch=True, precedence=precedence) inds1 = np.where(sel1)[0] grp_voxels = grp_voxels[inds1,:] grp_data = grp_data[inds1] - # step 3: remove voxels not in image + # Step 3: remove voxels not in image sel2 = filter_nonimg_voxels(grp_voxels, img_voxels, usebatch=False) inds2 = np.where(sel2)[0] grp_voxels = grp_voxels[inds2,:] diff --git a/mlreco/iotools/parsers/cluster.py b/mlreco/iotools/parsers/cluster.py index 8309fe17..4639a79d 100644 --- a/mlreco/iotools/parsers/cluster.py +++ b/mlreco/iotools/parsers/cluster.py @@ -1,13 +1,14 @@ +from collections import OrderedDict import numpy as np from larcv import larcv -from mlreco.utils.groups import filter_duplicate_voxels, filter_duplicate_voxels_ref, filter_nonimg_voxels -from mlreco.iotools.parsers.sparse import parse_sparse3d_scn -from mlreco.iotools.parsers.particles import parse_particle_asis -from mlreco.iotools.parsers.clean_data import clean_data +from mlreco.utils.groups import get_interaction_id, get_nu_id, get_particle_id, get_primary_id from mlreco.utils.groups import type_labels as TYPE_LABELS +from mlreco.iotools.parsers.sparse import parse_sparse3d +from mlreco.iotools.parsers.particles import parse_particle_asis +from mlreco.iotools.parsers.clean_data import clean_sparse_data -def parse_cluster2d(data): +def parse_cluster2d(cluster_event): """ A function to retrieve a 2D clusters tensor @@ -15,12 +16,13 @@ def parse_cluster2d(data): schema: cluster_label: - - parse_cluster2d - - cluster2d_pcluster + parser: parse_cluster2d + args: + cluster_event: cluster2d_pcluster Configuration ------------- - cluster2d_pcluster: larcv::EventClusterPixel2D + cluster_event: larcv::EventClusterPixel2D Returns ------- @@ -30,7 +32,7 @@ def parse_cluster2d(data): np_features: np.ndarray a numpy array with the shape (N,2) where 2 is pixel value and cluster id, respectively """ - cluster_event = data[0].as_vector().front() + cluster_event = cluster_event.as_vector().front() meta = cluster_event.meta() num_clusters = cluster_event.size() clusters_voxels, clusters_features = [], [] @@ -52,7 +54,15 @@ def parse_cluster2d(data): return np_voxels, np_features -def parse_cluster3d(data): +def parse_cluster3d(cluster_event, + particle_event = None, + particle_mpv_event = None, + sparse_semantics_event = None, + sparse_value_event = None, + add_particle_info = False, + add_kinematics_info = False, + clean_data = True, + precedence = [1,2,0,3,4]): """ a function to retrieve a 3D clusters tensor @@ -60,64 +70,29 @@ def parse_cluster3d(data): schema: cluster_label: - - parse_cluster3d - - cluster3d_pcluster - - Configuration - ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - - Returns - ------- - np_voxels: np.ndarray - a numpy array with the shape (n,3) where 3 represents (x,y,z) - 
coordinate - np_features: np.ndarray - a numpy array with the shape (n,2) where 2 is voxel value and cluster id, respectively - - See Also - -------- - parse_cluster3d_full - """ - cluster_event = data[0] - meta = cluster_event.meta() - num_clusters = cluster_event.as_vector().size() - clusters_voxels, clusters_features = [], [] - for i in range(num_clusters): - cluster = cluster_event.as_vector()[i] - num_points = cluster.as_vector().size() - if num_points > 0: - x = np.empty(shape=(num_points,), dtype=np.int32) - y = np.empty(shape=(num_points,), dtype=np.int32) - z = np.empty(shape=(num_points,), dtype=np.int32) - value = np.empty(shape=(num_points,), dtype=np.float32) - larcv.as_flat_arrays(cluster,meta,x, y, z, value) - cluster_id = np.full(shape=(cluster.as_vector().size()), - fill_value=i, dtype=np.float32) - clusters_voxels.append(np.stack([x, y, z], axis=1)) - clusters_features.append(np.column_stack([value,cluster_id])) - np_voxels = np.concatenate(clusters_voxels, axis=0) - np_features = np.concatenate(clusters_features, axis=0) - return np_voxels, np_features - - -def parse_cluster3d_full(data): - """ - a function to retrieve a 3D clusters tensor with full features list - - .. code-block:: yaml - - schema: - cluster_label: - - parse_cluster3d_full - - cluster3d_pcluster - - particle_mpv + parser: parse_cluster3d + args: + cluster_event: cluster3d_pcluster + particle_event: particle_pcluster + particle_mpv_event: particle_mpv + sparse_semantics_event: sparse3d_semantics + sparse_value_event: sparse3d_pcluster + add_particle_info: true + add_kinematics_info: false + clean_data: true + precedence: [1,2,0,3,4] Configuration ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - particle_mpv: larcv::EventParticle, optional - To determine neutrino vs cosmic labels + cluster_event: larcv::EventClusterVoxel3D + particle_event: larcv::EventParticle + particle_mpv_event: larcv::EventParticle + sparse_semantics_event: larcv::EventSparseTensor3D + sparse_value_event: larcv::EventSparseTensor3D + add_particle_info: bool + add_kinematics_info: bool + clean_data: bool + precedence: list Returns ------- @@ -125,624 +100,110 @@ def parse_cluster3d_full(data): a numpy array with the shape (n,3) where 3 represents (x,y,z) coordinate np_features: np.ndarray - a numpy array with the shape (n,8) where 8 is respectively - + a numpy array with the shape (n,m) where m (2-13) includes: * voxel value, - * cluster id, + * cluster id + if add_particle_info is true, it also includes * group id, * interaction id, * nu id, * particle type, - * primary id, - * semantic type, - """ - cluster_event = data[0] - particles_v = data[1].as_vector() - meta = cluster_event.meta() - num_clusters = cluster_event.as_vector().size() - clusters_voxels, clusters_features = [], [] - particle_mpv = None - if len(data) > 2: - particle_mpv = data[2].as_vector() - - from mlreco.utils.groups import get_interaction_id, get_nu_id, get_particle_id, get_primary_id - group_ids = np.array([p.group_id() for p in particles_v]) - inter_ids = get_interaction_id(particles_v) - nu_ids = get_nu_id(cluster_event, particles_v, inter_ids, particle_mpv=particle_mpv) - pids = get_particle_id(particles_v, nu_ids) - primary_ids = get_primary_id(cluster_event, particles_v) - - for i in range(num_clusters): - cluster = cluster_event.as_vector()[i] - num_points = cluster.as_vector().size() - if num_points > 0: - x = np.empty(shape=(num_points,), dtype=np.int32) - y = np.empty(shape=(num_points,), dtype=np.int32) - z = 
np.empty(shape=(num_points,), dtype=np.int32) - value = np.empty(shape=(num_points,), dtype=np.float32) - larcv.as_flat_arrays(cluster,meta,x, y, z, value) - assert i == particles_v[i].id() - cluster_id = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v[i].id(), dtype=np.float32) - group_id = np.full(shape=(cluster.as_vector().size()), - #fill_value=particles_v[i].group_id(), dtype=np.float32) - fill_value=group_ids[i], dtype=np.float32) - inter_id = np.full(shape=(cluster.as_vector().size()), - fill_value=inter_ids[i], dtype=np.float32) - nu_id = np.full(shape=(cluster.as_vector().size()), - fill_value=nu_ids[i], dtype=np.float32) - pdg = np.full(shape=(cluster.as_vector().size()), - fill_value=pids[i], dtype=np.float32) - primary_id = np.full(shape=(cluster.as_vector().size()), - fill_value=primary_ids[i], dtype=np.float32) - sem_type = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v[i].shape(), dtype=np.float32) - clusters_voxels.append(np.stack([x, y, z], axis=1)) - clusters_features.append(np.column_stack([value,cluster_id,group_id,inter_id,nu_id,pdg,primary_id,sem_type])) - - if len(clusters_voxels): - np_voxels = np.concatenate(clusters_voxels, axis=0) - np_features = np.concatenate(clusters_features, axis=0) - else: - np_voxels = np.empty(shape=(0, 3), dtype=np.float32) - np_features = np.empty(shape=(0, 8), dtype=np.float32) - - return np_voxels, np_features - - - -def parse_cluster3d_full_extended(data): - """ - a function to retrieve a 3D clusters tensor with full features list - - .. code-block:: yaml - - schema: - cluster_label: - - parse_cluster3d_full - - cluster3d_pcluster - - particle_mpv - - Configuration - ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - particle_mpv: larcv::EventParticle, optional - To determine neutrino vs cosmic labels - - Returns - ------- - np_voxels: np.ndarray - a numpy array with the shape (n,3) where 3 represents (x,y,z) - coordinate - np_features: np.ndarray - a numpy array with the shape (n,8) where 8 is respectively - - * voxel value, - * cluster id, + * primary id + if add_kinematics_info is true, it also includes * group id, - * interaction id, - * nu id, * particle type, - * primary id, - * semantic type, + * momentum, + * vtx (x,y,z), + * primary group id + if either add_* is true, it includes last: + * semantic type """ - cluster_event = data[0] - particles_v = data[1].as_vector() - particles_v_asis = parse_particle_asis([data[1], data[0]]) + + # Get the cluster-wise information meta = cluster_event.meta() num_clusters = cluster_event.as_vector().size() + labels = OrderedDict() + labels['cluster'] = np.arange(num_clusters) + if add_particle_info or add_kinematics_info: + assert particle_event is not None, "Must provide particle tree if particle/kinematics information is included" + particles_v = particle_event.as_vector() + particles_mpv_v = particle_mpv_event.as_vector() if particle_mpv_event is not None else None + inter_ids = get_interaction_id(particles_v) + nu_ids = get_nu_id(cluster_event, particles_v, inter_ids, particle_mpv=particles_mpv_v) + + labels['cluster'] = np.array([p.id() for p in particles_v]) + labels['group'] = np.array([p.group_id() for p in particles_v]) + if add_particle_info: + labels['inter'] = inter_ids + labels['nu'] = nu_ids + labels['type'] = get_particle_id(particles_v, nu_ids) + labels['primary'] = get_primary_id(cluster_event, particles_v) + if add_kinematics_info: + particles_asis_v = parse_particle_asis(particle_event, cluster_event) + 
labels['type'] = get_particle_id(particles_v, nu_ids) + labels['p'] = np.array([(p.px()**2+p.py()**2+p.pz()**2)/1e3 for p in particles_v]) + labels['vtx_x'] = np.array([p.ancestor_position().x() for p in particles_asis_v]) + labels['vtx_y'] = np.array([p.ancestor_position().y() for p in particles_asis_v]) + labels['vtx_z'] = np.array([p.ancestor_position().z() for p in particles_asis_v]) + labels['primary_group'] = np.array((nu_ids > 0) & np.array([p.group_id()==p.parent_id() for p in particles_v]), dtype=np.float32) + labels['sem'] = np.array([p.shape() for p in particles_v]) + + # Loop over clusters, store info clusters_voxels, clusters_features = [], [] - particle_mpv = None - if len(data) > 2: - particle_mpv = data[2].as_vector() - - from mlreco.utils.groups import get_interaction_id, get_nu_id, get_particle_id, get_primary_id - group_ids = np.array([p.group_id() for p in particles_v]) - inter_ids = get_interaction_id(particles_v) - nu_ids = get_nu_id(cluster_event, particles_v, inter_ids, particle_mpv=particle_mpv) - pids = get_particle_id(particles_v, nu_ids) - primary_ids = get_primary_id(cluster_event, particles_v) - for i in range(num_clusters): cluster = cluster_event.as_vector()[i] num_points = cluster.as_vector().size() if num_points > 0: + # Get the position and pixel value from EventSparseTensor3D, append positions x = np.empty(shape=(num_points,), dtype=np.int32) y = np.empty(shape=(num_points,), dtype=np.int32) z = np.empty(shape=(num_points,), dtype=np.int32) value = np.empty(shape=(num_points,), dtype=np.float32) - larcv.as_flat_arrays(cluster,meta,x, y, z, value) - assert i == particles_v[i].id() - cluster_id = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v[i].id(), dtype=np.float32) - group_id = np.full(shape=(cluster.as_vector().size()), - #fill_value=particles_v[i].group_id(), dtype=np.float32) - fill_value=group_ids[i], dtype=np.float32) - inter_id = np.full(shape=(cluster.as_vector().size()), - fill_value=inter_ids[i], dtype=np.float32) - nu_id = np.full(shape=(cluster.as_vector().size()), - fill_value=nu_ids[i], dtype=np.float32) - pdg = np.full(shape=(cluster.as_vector().size()), - fill_value=pids[i], dtype=np.float32) - is_primary = np.full(shape=(cluster.as_vector().size()), - fill_value=float((nu_ids[i] > 0) and (particles_v[i].group_id() == particles_v[i].parent_id())), - dtype=np.float32) - vtx_x = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v_asis[i].ancestor_position().x(), dtype=np.float32) - vtx_y = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v_asis[i].ancestor_position().y(), dtype=np.float32) - vtx_z = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v_asis[i].ancestor_position().z(), dtype=np.float32) - sem_type = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v[i].shape(), dtype=np.float32) + larcv.as_flat_arrays(cluster, meta, x, y, z, value) clusters_voxels.append(np.stack([x, y, z], axis=1)) - clusters_features.append(np.column_stack([value, cluster_id, group_id, - inter_id, nu_id, pdg, p, vtx_x, vtx_y, vtx_z, is_primary, sem_type])) - if len(clusters_voxels) > 0: - np_voxels = np.concatenate(clusters_voxels, axis=0) - np_features = np.concatenate(clusters_features, axis=0) - else: - np_voxels = np.empty(shape=(0, 3), dtype=np.float32) - np_features = np.empty(shape=(0, 8), dtype=np.float32) - - return np_voxels, np_features + # Append the cluster-wise information + features = [value] + for k, l in labels.items(): + size = 
cluster.as_vector().size() + features.append(np.full(shape=(size), fill_value=l[i], dtype=np.float32)) + clusters_features.append(np.column_stack(features)) -def parse_cluster3d_types(data): - """ - a function to retrieve a 3D clusters tensor and PDG information - - .. code-block:: yaml - - schema: - cluster_label: - - parse_cluster3d_types - - cluster3d_pcluster - - particle_mpv - - Configuration - ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - particle_mpv: larcv::EventParticle, optional - To determine neutrino vs cosmic labels - - Returns - ------- - np_voxels: np.ndarray - a numpy array with the shape (n,3) where 3 represents (x,y,z) - coordinate - np_features: np.ndarray - a numpy array with the shape (n,4) where 4 is voxel value, - cluster id, group id, pdg, respectively - - See Also - -------- - parse_cluster3d_full - """ - cluster_event = data[0] - particles_v = data[1].as_vector() - # print(cluster_event) - # assert False - meta = cluster_event.meta() - num_clusters = cluster_event.as_vector().size() - clusters_voxels, clusters_features = [], [] - particle_mpv = None - if len(data) > 2: - particle_mpv = data[2].as_vector() - - from mlreco.utils.groups import get_interaction_id, get_nu_id - group_ids = np.array([p.group_id() for p in particles_v]) - inter_ids = get_interaction_id(particles_v) - nu_ids = get_nu_id(cluster_event, particles_v, inter_ids, particle_mpv = particle_mpv) - - for i in range(num_clusters): - cluster = cluster_event.as_vector()[i] - num_points = cluster.as_vector().size() - if num_points > 0: - x = np.empty(shape=(num_points,), dtype=np.int32) - y = np.empty(shape=(num_points,), dtype=np.int32) - z = np.empty(shape=(num_points,), dtype=np.int32) - value = np.empty(shape=(num_points,), dtype=np.float32) - larcv.as_flat_arrays(cluster,meta,x, y, z, value) - assert i == particles_v[i].id() - cluster_id = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v[i].id(), dtype=np.float32) - group_id = np.full(shape=(cluster.as_vector().size()), - #fill_value=particles_v[i].group_id(), dtype=np.float32) - fill_value=group_ids[i], dtype=np.float32) - t = int(particles_v[i].pdg_code()) - if t in TYPE_LABELS.keys(): - pdg = np.full(shape=(cluster.as_vector().size()), - fill_value=TYPE_LABELS[t], dtype=np.float32) - else: - pdg = np.full(shape=(cluster.as_vector().size()), - fill_value=-1, dtype=np.float32) - clusters_voxels.append(np.stack([x, y, z], axis=1)) - clusters_features.append(np.column_stack([value, cluster_id, group_id, pdg])) + # If there are no non-empty clusters, return. 
Concatenate otherwise + if not len(clusters_voxels): + return np.empty(shape=(0, 3), dtype=np.float32), np.empty(shape=(0, len(labels)+1), dtype=np.float32) np_voxels = np.concatenate(clusters_voxels, axis=0) np_features = np.concatenate(clusters_features, axis=0) - # mask = np_features[:, 6] == np.unique(np_features[:, 6])[0] - - # print(np_features[mask][:, [0, 1, 5, 6]]) - return np_voxels, np_features + # If requested, remove duplicate voxels (cluster overlaps) and account for semantics + if clean_data: + assert sparse_semantics_event is not None, 'Need to provide a semantics tensor to clean up output' + sem_voxels, sem_features = parse_sparse3d([sparse_semantics_event]) + np_voxels, np_features = clean_sparse_data(np_voxels, np_features, sem_voxels, sem_features, meta, precedence) + np_features[:,-1] = sem_features[:,-1] # Match semantic column to semantic tensor + np_features[sem_features[:,-1] > 3, 1:-1] = -1 # Set all cluster labels to -1 if semantic class is LE or ghost -def parse_cluster3d_kinematics(data): - """ - a function to retrieve a 3D clusters tensor with kinematics features - (including vertex information and primary particle tagging). + # If a value tree is provided, override value colum + if sparse_value_event: + _, val_features = parse_sparse3d([sparse_value_event]) + np_features[:,0] = val_features[:,-1] - .. code-block:: yaml - - schema: - cluster_label: - - parse_cluster3d_kinematics - - cluster3d_pcluster - - particle_pcluster - - particle_mpv - - Configuration - ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - particle_pcluster: larcv::EventParticle - particle_mpv: larcv::EventParticle, optional - To determine neutrino vs cosmic labels - - Returns - ------- - np_voxels: np.ndarray - a numpy array with the shape (n,3) where 3 represents (x,y,z) - coordinate - np_features: np.ndarray - a numpy array with the shape (n,9) where 9 is respectively - - * voxel value, - * cluster id, - * group id, - * pdg, - * momentum, - * vtx_x, - * vtx_y, - * vtx_z, - * is_primary - - See Also - -------- - parse_cluster3d_full - parse_cluster3d_kinematics_clean - - Note - ---- - Likely to be merged with `parse_cluster3d_full` soon. 
- """ - cluster_event = data[0] - particles_v = data[1].as_vector() - particles_v_asis = parse_particle_asis([data[1], data[0]]) - - meta = cluster_event.meta() - num_clusters = cluster_event.as_vector().size() - clusters_voxels, clusters_features = [], [] - particle_mpv = None - if len(data) > 2: - particle_mpv = data[2].as_vector() - - from mlreco.utils.groups import get_interaction_id, get_nu_id, get_particle_id - group_ids = np.array([p.group_id() for p in particles_v]) - inter_ids = get_interaction_id(particles_v) - nu_ids = get_nu_id(cluster_event, particles_v, inter_ids, particle_mpv = particle_mpv) - pids = get_particle_id(particles_v, nu_ids) - - for i in range(num_clusters): - cluster = cluster_event.as_vector()[i] - num_points = cluster.as_vector().size() - if num_points > 0: - x = np.empty(shape=(num_points,), dtype=np.int32) - y = np.empty(shape=(num_points,), dtype=np.int32) - z = np.empty(shape=(num_points,), dtype=np.int32) - value = np.empty(shape=(num_points,), dtype=np.float32) - larcv.as_flat_arrays(cluster,meta,x, y, z, value) - assert i == particles_v[i].id() - cluster_id = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v[i].id(), dtype=np.float32) - group_id = np.full(shape=(cluster.as_vector().size()), - #fill_value=particles_v[i].group_id(), dtype=np.float32) - fill_value=group_ids[i], dtype=np.float32) - px = particles_v[i].px() - py = particles_v[i].py() - pz = particles_v[i].pz() - p = np.sqrt(px**2 + py**2 + pz**2) / 1000.0 - p = np.full(shape=(cluster.as_vector().size()), - fill_value=p, dtype=np.float32) - pdg = np.full(shape=(cluster.as_vector().size()), - fill_value=pids[i], dtype=np.float32) - vtx_x = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v_asis[i].ancestor_position().x(), dtype=np.float32) - vtx_y = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v_asis[i].ancestor_position().y(), dtype=np.float32) - vtx_z = np.full(shape=(cluster.as_vector().size()), - fill_value=particles_v_asis[i].ancestor_position().z(), dtype=np.float32) - # is_primary = np.full(shape=(cluster.as_vector().size()), - # fill_value=float((nu_ids[i] > 0) and (particles_v[i].parent_id() == particles_v[i].id()) and (particles_v[i].group_id() == particles_v[i].id())), - # dtype=np.float32) - is_primary = np.full(shape=(cluster.as_vector().size()), - fill_value=float((nu_ids[i] > 0) and (particles_v[i].group_id() == particles_v[i].parent_id())), - dtype=np.float32) - clusters_voxels.append(np.stack([x, y, z], axis=1)) - clusters_features.append(np.column_stack([value, cluster_id, group_id, pdg, p, vtx_x, vtx_y, vtx_z, is_primary])) - if len(clusters_voxels) > 0: - np_voxels = np.concatenate(clusters_voxels, axis=0) - np_features = np.concatenate(clusters_features, axis=0) - else: - np_voxels = np.empty((0, 3), dtype=np.int32) - np_features = np.empty((0, 9), dtype=np.float32) - # mask = np_features[:, 6] == np.unique(np_features[:, 6])[0] - - # print(np_features[mask][:, [0, 1, 5, 6]]) return np_voxels, np_features -def parse_cluster3d_kinematics_clean(data): - """ - Similar to parse_cluster3d_kinematics, but removes overlap voxels. - - .. 
code-block:: yaml - - schema: - cluster_label: - - parse_cluster3d_kinematics_clean - - cluster3d_pcluster - - particle_pcluster - - particle_mpv - - sparse3d_pcluster - - Configuration - ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - particle_pcluster: larcv::EventParticle - particle_mpv: larcv::EventParticle, optional - To determine neutrino vs cosmic labels - sparse3d_pcluster: larcv::EventSparseTensor3D - This tensor will help determine overlap voxels and final shape. - - Returns - ------- - np_voxels: np.ndarray - a numpy array with the shape (n,3) where 3 represents (x,y,z) - coordinate - np_features: np.ndarray - a numpy array with the shape (n, 10) where 10 is respectively - - * voxel value, - * cluster id, - * group id, - * pdg, - * momentum, - * vtx_x, - * vtx_y, - * vtx_z, - * is_primary, - * semantic type - - See Also - -------- - parse_cluster3d_full - parse_cluster3d_kinematics - """ - grp_voxels, grp_data = parse_cluster3d_kinematics(data[:-1]) - _, cluster_data = parse_cluster3d_full(data[:-1]) - img_voxels, img_data = parse_sparse3d_scn([data[-1]]) - - grp_data = np.concatenate([grp_data, cluster_data[:, -1][:, None]], axis=1) - grp_voxels, grp_data = clean_data(grp_voxels, grp_data, img_voxels, img_data, data[0].meta()) - return grp_voxels, grp_data#[:, :-1] - - -def parse_cluster3d_kinematics_full_clean(data): - grp_voxels, grp_data = parse_cluster3d_kinematics_full(data[:-1]) - _, cluster_data = parse_cluster3d_full(data[:-1]) - img_voxels, img_data = parse_sparse3d_scn([data[-1]]) - - grp_data = np.concatenate([grp_data, cluster_data[:, -1][:, None]], axis=1) - grp_voxels, grp_data = clean_data(grp_voxels, grp_data, img_voxels, img_data, data[0].meta()) - return grp_voxels, grp_data#[:, :-1] - - -def parse_cluster3d_clean(data): - """ - A function to retrieve a 3D clusters tensor. - - .. code-block:: yaml - - schema: - cluster_label: - - parse_cluster3d_clean - - cluster3d_pcluster - - particle_mpv - - Configuration - ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - particle_mpv: larcv::EventParticle, optional - To determine neutrino vs cosmic labels - - Parameters - ---------- - data: list - length 3 array of larcv::EventClusterVoxel3D, larcv::EventSparseTensor3D - and larcv::EventParticle - - Returns - ------- - np_voxels: np.ndarray - a numpy array with the shape (N,3) where 3 represents (x,y,z) - coordinate - np_features: np.ndarray - a numpy array with the shape (N,2) where 2 represents (value, cluster_id) - - See Also - -------- - parse_cluster3d_full - """ - grp_voxels, grp_data = parse_cluster3d_full(data) - return grp_voxels, grp_data[:,:2] - - -def parse_cluster3d_clean_full(data): - """ - A function to retrieve clusters tensor. Do the following cleaning: - - 1) lexicographically sort group data (images are lexicographically sorted) - - 2) remove voxels from group data that are not in image - - 3) choose only one group per voxel (by lexicographic order) - - 4) override semantic labels with those from sparse3d - and give labels -1 to all voxels of class 4 and above - - .. code-block:: yaml - - schema: - cluster_label: - - parse_cluster3d_clean_full - - cluster3d_pcluster - - particle_mpv - - sparse3d_pcluster - - Configuration - ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - particle_mpv: larcv::EventParticle, optional - To determine neutrino vs cosmic labels - sparse3d_pcluster: larcv::EventSparseTensor3D - Will determine final shape and overlap voxels. 
- - Returns - ------- - grp_voxels: np.ndarray - a numpy array with the shape (N,3) where 3 represents (x,y,z) - coordinate - grp_data: np.ndarray - a numpy array with the shape (n,8) where 8 is respectively - - * voxel value, - * cluster id, - * group id, - * interaction id, - * nu id, - * particle type, - * primary id, - * semantic type - - See Also - -------- - parse_cluster3d_full - """ - grp_voxels, grp_data = parse_cluster3d_full(data[:-1]) - img_voxels, img_data = parse_sparse3d_scn([data[-1]]) - - grp_voxels, grp_data = clean_data(grp_voxels, grp_data, img_voxels, img_data, data[0].meta()) - - # step 4: override semantic labels with those from sparse3d - # and give labels -1 to all voxels of class 4 and above - grp_data[:,-1] = img_data[:,-1] - grp_data[img_data[:,-1] > 3,1:5] = -1 - return grp_voxels, grp_data +def parse_cluster3d_clean_full(cluster_event, particle_event, particle_mpv_event=None, sparse_semantics_event=None): + from warnings import warn + warn("Deprecated: parse_cluster3d_clean_full deprecated, use parse_cluster3d instead", DeprecationWarning) + if sparse_semantics_event is None and isinstance(particle_mpv_event, larcv.EventSparseTensor3D): + sparse_semantics_event = particle_mpv_event + particle_mpv_event = None + return parse_cluster3d(cluster_event, particle_event, particle_mpv_event, sparse_semantics_event, add_particle_info=True) -def parse_cluster3d_clean_full_extended(data): - """ - A function to retrieve clusters tensor. Do the following cleaning: - - 1) lexicographically sort group data (images are lexicographically sorted) - - 2) remove voxels from group data that are not in image - - 3) choose only one group per voxel (by lexicographic order) - - 4) override semantic labels with those from sparse3d - and give labels -1 to all voxels of class 4 and above - - .. code-block:: yaml - - schema: - cluster_label: - - parse_cluster3d_clean_full - - cluster3d_pcluster - - particle_mpv - - sparse3d_pcluster - - Configuration - ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - particle_mpv: larcv::EventParticle, optional - To determine neutrino vs cosmic labels - sparse3d_pcluster: larcv::EventSparseTensor3D - Will determine final shape and overlap voxels. - - Returns - ------- - grp_voxels: np.ndarray - a numpy array with the shape (N,3) where 3 represents (x,y,z) - coordinate - grp_data: np.ndarray - a numpy array with the shape (n,8) where 8 is respectively - - * voxel value, - * cluster id, - * group id, - * interaction id, - * nu id, - * particle type, - * primary id, - * semantic type - - See Also - -------- - parse_cluster3d_full - """ - grp_voxels, grp_data = parse_cluster3d_full_extended(data[:-1]) - img_voxels, img_data = parse_sparse3d_scn([data[-1]]) - - grp_voxels, grp_data = clean_data(grp_voxels, grp_data, img_voxels, img_data, data[0].meta()) - - # step 4: override semantic labels with those from sparse3d - # and give labels -1 to all voxels of class 4 and above - grp_data[:,-1] = img_data[:,-1] - grp_data[img_data[:,-1] > 3,1:5] = -1 - - return grp_voxels, grp_data - - -def parse_cluster3d_scales(data): - """ - Retrieves clusters tensors at different spatial sizes. - - .. code-block:: yaml - - schema: - cluster_label: - - parse_cluster3d_scales - - cluster3d_pcluster - - sparse3d_pcluster - - Configuration - ------------- - cluster3d_pcluster: larcv::EventClusterVoxel3D - sparse3d_pcluster: larcv::EventSparseTensor3D - Will determine final shape and overlap voxels. 
- - Returns - ------- - list of tuples - """ - grp_voxels, grp_data = parse_cluster3d_clean_full(data) - spatial_size = data[0].meta().num_voxel_x() - max_depth = int(np.floor(np.log2(spatial_size))-1) - scales = [] - for d in range(max_depth): - scale_voxels = np.floor(grp_voxels/2**d)#.astype(int) - scale_voxels, unique_indices = np.unique(scale_voxels, axis=0, return_index=True) - scale_data = grp_data[unique_indices] - scales.append((scale_voxels, scale_data)) - return scales +def parse_cluster3d_kinematics_clean(cluster_event, particle_event, particle_mpv_event=None, sparse_semantics_event=None): + from warnings import warn + warn("Deprecated: parse_cluster3d_kinematics_clean deprecated, use parse_cluster3d instead", DeprecationWarning) + if sparse_semantics_event is None and isinstance(particle_mpv_event, larcv.EventSparseTensor3D): + sparse_semantics_event = particle_mpv_event + particle_mpv_event = None + return parse_cluster3d(cluster_event, particle_event, particle_mpv_event, sparse_semantics_event, add_kinematics_info=True) diff --git a/mlreco/iotools/parsers/misc.py b/mlreco/iotools/parsers/misc.py index 91f797fd..4918d138 100644 --- a/mlreco/iotools/parsers/misc.py +++ b/mlreco/iotools/parsers/misc.py @@ -4,9 +4,9 @@ from mlreco.iotools.parsers.sparse import parse_sparse3d_scn -def parse_meta3d(data): +def parse_meta2d(sparse_event, projection_id = 0): """ - Get the meta information to translate into real world coordinates (3D). + Get the meta information to translate into real world coordinates (2D). Each entry in a dataset is a cube, where pixel coordinates typically go from 0 to some integer N in each dimension. If you wish to translate @@ -17,41 +17,46 @@ def parse_meta3d(data): schema: meta: - - parse_meta3d - - sparse3d_pcluster + parser: parse_meta2d + args: + sparse_event: sparse2d_pcluster + projection_id: 0 Configuration ---------- - sparse3d_pcluster : larcv::EventSparseTensor3D or larcv::EventClusterVoxel3D + sparse2d_event : larcv::EventSparseTensor2D or larcv::EventClusterVoxel2D + projection_id : int Returns ------- np.ndarray Contains in order: - * `min_x`, `min_y`, `min_z` (real world coordinates) - * `max_x`, `max_y`, `max_z` (real world coordinates) - * `size_voxel_x`, `size_voxel_y`, `size_voxel_z` the size of each voxel + * `min_x`, `min_y` (real world coordinates) + * `max_x`, `max_y` (real world coordinates) + * `size_voxel_x`, `size_voxel_y` the size of each voxel in real world units + + Note + ---- + TODO document how to specify projection id. """ - event_tensor3d = data[0] - meta = event_tensor3d.meta() + + tensor2d = sparse_event.sparse_tensor_2d(projection_id) + meta = tensor2d.meta() return [ meta.min_x(), meta.min_y(), - meta.min_z(), meta.max_x(), meta.max_y(), - meta.max_z(), - meta.size_voxel_x(), - meta.size_voxel_y(), - meta.size_voxel_z() + meta.pixel_width(), + meta.pixel_height() ] -def parse_meta2d(data): +def parse_meta3d(sparse_event): """ - Get the meta information to translate into real world coordinates (2D). + Get the meta information to translate into real world coordinates (3D). Each entry in a dataset is a cube, where pixel coordinates typically go from 0 to some integer N in each dimension. 
If you wish to translate @@ -62,91 +67,53 @@ def parse_meta2d(data): schema: meta: - - parse_meta2d - - sparse2d_pcluster + parser: parse_meta3d + args: + sparse_event: sparse3d_pcluster Configuration ---------- - sparse2d_pcluster : larcv::EventSparseTensor2D or larcv::EventClusterVoxel2D + sparse_event : larcv::EventSparseTensor3D or larcv::EventClusterVoxel3D Returns ------- np.ndarray Contains in order: - * `min_x`, `min_y` (real world coordinates) - * `max_x`, `max_y` (real world coordinates) - * `size_voxel_x`, `size_voxel_y` the size of each voxel + * `min_x`, `min_y`, `min_z` (real world coordinates) + * `max_x`, `max_y`, `max_z` (real world coordinates) + * `size_voxel_x`, `size_voxel_y`, `size_voxel_z` the size of each voxel in real world units - - Note - ---- - TODO document how to specify projection id. """ - event_tensor2d = data[0] - projection_id = 0 # default - if isinstance(event_tensor2d, tuple): - projection_id = event_tensor2d[1] - event_tensor2d = event_tensor2d[0] - - tensor2d = event_tensor2d.sparse_tensor_2d(projection_id) - meta = tensor2d.meta() + meta = sparse_event.meta() return [ meta.min_x(), meta.min_y(), + meta.min_z(), meta.max_x(), meta.max_y(), - meta.pixel_width(), - meta.pixel_height() + meta.max_z(), + meta.size_voxel_x(), + meta.size_voxel_y(), + meta.size_voxel_z() ] -def parse_dbscan(data): - """ - A function to create dbscan tensor - - .. code-block:: yaml - - schema: - meta: - - parse_dbscan - - sparse3d_pcluster - - Configuration - ---------- - sparse3d_pcluster : larcv::EventSparseTensor3D - - Returns - ------- - voxels: numpy array(int32) with shape (N,3) - Coordinates - data: numpy array(float32) with shape (N,1) - dbscan cluster. -1 if not assigned - """ - np_voxels, np_types = parse_sparse3d_scn(data) - # now run dbscan on data - clusts = dbscan_types(np_voxels, np_types) - # start with no clusters assigned. - np_types.fill(-1) - for i, c in enumerate(clusts): - np_types[c] = i - return np_voxels, np_types - - -def parse_run_info(data): +def parse_run_info(sparse_event): """ Parse run info (run, subrun, event number) .. code-block:: yaml schema: - meta: - - parse_run_info - - sparse3d_pcluster + run_info: + parser: parse_run_info + args: + sparse_event: sparse3d_pcluster Configuration ---------- - sparse3d_pcluster : larcv::EventSparseTensor3D or larcv::EventClusterVoxel3D + sparse_event : larcv::EventSparseTensor3D or larcv::EventClusterVoxel3D data to get run info from Returns @@ -154,35 +121,4 @@ def parse_run_info(data): tuple (run, subrun, event) """ - return data[0].run(), data[0].subrun(), data[0].event() - - -def parse_tensor3d(data): - """ - A function to retrieve larcv::EventSparseTensor3D as a dense numpy array - - .. 
code-block:: yaml - - schema: - meta: - - parse_tensor3d - - sparse3d_pcluster - - Configuration - ---------- - sparse3d_pcluster : larcv::EventSparseTensor3D - - Returns - ------- - np.ndarray - a numpy array of a dense 3d tensor object, last dimension = channels - """ - np_data = [] - meta = None - for event_tensor3d in data: - if meta is None: - meta = event_tensor3d.meta() - else: - assert meta == event_tensor3d.meta() - np_data.append(np.array(larcv.as_ndarray(event_tensor3d))) - return np.stack(np_data, axis=-1) + return sparse_event.run(), sparse_event.subrun(), sparse_event.event() diff --git a/mlreco/iotools/parsers/particles.py b/mlreco/iotools/parsers/particles.py index d0a33c69..32934e0d 100644 --- a/mlreco/iotools/parsers/particles.py +++ b/mlreco/iotools/parsers/particles.py @@ -2,87 +2,25 @@ from larcv import larcv from mlreco.utils.ppn import get_ppn_info from mlreco.utils.groups import type_labels as TYPE_LABELS -# Global type labels for PDG to Particle Type Label (nominal) conversion. -def parse_particle_singlep_pdg(data): - """ - Get each true particle's PDG code. - - .. code-block:: yaml - - schema: - pdg_list: - - parse_particle_singlep_pdg - - particle_pcluster - - Configuration - ---------- - particle_pcluster : larcv::EventParticle - - Returns - ------- - np.ndarray - List of PDG codes for each particle in TTree. - """ - parts = data[0] - pdgs = [] - pdg = -1 - for p in parts.as_vector(): - # print(p.track_id()) - if not p.track_id() == 1: continue - if int(p.pdg_code()) in TYPE_LABELS.keys(): - pdg = TYPE_LABELS[int(p.pdg_code())] - else: pdg = -1 - return np.asarray([pdg]) - - return np.asarray([pdg]) - - -def parse_particle_singlep_einit(data): - """ - Get each true particle's true initial energy. - - .. code-block:: yaml - - schema: - pdg_list: - - parse_particle_singlep_einit - - particle_pcluster - - Configuration - ---------- - particle_pcluster : larcv::EventParticle - - Returns - ------- - np.ndarray - List of true initial energy for each particle in TTree. - """ - parts = data[0] - for p in parts.as_vector(): - is_primary = p.track_id() == p.parent_track_id() - if not p.track_id() == 1: continue - return p.energy_init() - return -1 - - -def parse_particle_asis(data): +def parse_particle_asis(particle_event, cluster_event): """ A function to copy construct & return an array of larcv::Particle .. 
code-block:: yaml schema: - segment_label: - - parse_particle_asis - - particle_pcluster - - cluster3d_pcluster + particle_asis: + parser: parse_particle_asis + args: + particle_event: particle_pcluster + cluster_event: cluster3d_pcluster Configuration ------------- - particle_pcluster: larcv::EventParticle - cluster3d_pcluster: larcv::EventClusterVoxel3D + particle_event: larcv::EventParticle + cluster_event: larcv::EventClusterVoxel3D to translate coordinates Returns @@ -90,16 +28,9 @@ def parse_particle_asis(data): list a python list of larcv::Particle object """ - particles = data[0] - particles = [larcv.Particle(p) for p in data[0].as_vector()] - - clusters = data[1] - #assert data[0].as_vector().size() in [clusters.as_vector().size(),clusters.as_vector().size()-1] - - meta = clusters.meta() - - - funcs = ["first_step","last_step","position","end_position","ancestor_position"] + particles = [larcv.Particle(p) for p in particle_event.as_vector()] + meta = cluster_event.meta() + funcs = ['first_step', 'last_step', 'position', 'end_position', 'ancestor_position'] for p in particles: for f in funcs: pos = getattr(p,f)() @@ -116,17 +47,18 @@ def parse_particle_asis(data): return particles -def parse_neutrino_asis(data): +def parse_neutrino_asis(neutrino_event, cluster_event): """ A function to copy construct & return an array of larcv::Neutrino .. code-block:: yaml schema: - segment_label: - - parse_neutrino_asis - - neutrino_mpv - - cluster3d_pcluster + neutrino_asis: + parser: parse_neutrino_asis + particle_asis: + neutrino_event: neutrino_mpv + cluster_event: cluster3d_pcluster Configuration ------------- @@ -139,17 +71,9 @@ def parse_neutrino_asis(data): list a python list of larcv::Neutrino object """ - neutrinos = data[0] - neutrinos = [larcv.Neutrino(p) for p in data[0].as_vector()] - - clusters = data[1] - #assert data[0].as_vector().size() in [clusters.as_vector().size(),clusters.as_vector().size()-1] - - meta = clusters.meta() - - - #funcs = ["first_step","last_step","position","end_position","ancestor_position"] - funcs = ["position"] + neutrinos = [larcv.Neutrino(p) for p in neutrino_event.as_vector()] + meta = cluster_event.meta() + funcs = ['position'] for p in neutrinos: for f in funcs: pos = getattr(p,f)() @@ -166,64 +90,27 @@ def parse_neutrino_asis(data): return neutrinos -def parse_particle_coords(data): - ''' - Function that returns particle coordinates (start and end) and start time. - - This is used for particle clustering into interactions - - .. 
code-block:: yaml - - schema: - segment_label: - - parse_particle_coords - - particle_pcluster - - cluster3d_pcluster - - Configuration - ------------- - particle_pcluster: larcv::EventParticle - cluster3d_pcluster: larcv::EventClusterVoxel3D - to translate coordinates - - Returns - ------- - numpy.ndarray - Shape (N,8) containing: [first_step_x, first_step_y, first_step_z, - last_step_x, last_step_y, last_step_z, first_step_t, shape_id] - ''' - # Scale particle coordinates to image size - particles = parse_particle_asis(data) - - # Make features - particle_feats = [] - for i, p in enumerate(particles): - start_point = last_point = [p.first_step().x(), p.first_step().y(), p.first_step().z()] - if p.shape() == 1: # End point only meaningful and thought out for tracks - last_point = [p.last_step().x(), p.last_step().y(), p.last_step().z()] - particle_feats.append(np.concatenate((start_point, last_point, [p.first_step().t(), p.shape()]))) - - particle_feats = np.vstack(particle_feats) - return particle_feats[:,:3], particle_feats[:,3:] - - -def parse_particle_points(data, include_point_tagging=False): +def parse_particle_points(sparse_event, particle_event, include_point_tagging=True): """ A function to retrieve particles ground truth points tensor, returns points coordinates, types and particle index. + If include_point_tagging is true, it includes start vs end point tagging. .. code-block:: yaml schema: - segment_label: - - parse_particle_points - - sparse3d_pcluster - - particle_pcluster + points: + parser: parse_particle_points + args: + sprase_event: sparse3d_pcluster + particle_event: particle_pcluster + include_point_tagging: True Configuration ------------- sparse3d_pcluster: larcv::EventSparseTensor3D particle_pcluster: larcv::EventParticle + include_point_tagging: bool Returns ------- @@ -234,11 +121,11 @@ def parse_particle_points(data, include_point_tagging=False): a numpy array with the shape (N, 2) where 2 represents the class of the ground truth point and the particle data index in this order. (optionally: end/start tagging) """ - particles_v = data[1].as_vector() - part_info = get_ppn_info(particles_v, data[0].meta()) + particles_v = particle_event.as_vector() + part_info = get_ppn_info(particles_v, sparse_event.meta()) # For open data - to reproduce - # part_info = get_ppn_info(particles_v, data[0].meta(), min_voxel_count=7, min_energy_deposit=10, use_particle_shape=False) - # part_info = get_ppn_info(particles_v, data[0].meta(), min_voxel_count=5, min_energy_deposit=10, use_particle_shape=False) + # part_info = get_ppn_info(particles_v, sparse_event.meta(), min_voxel_count=7, min_energy_deposit=10, use_particle_shape=False) + # part_info = get_ppn_info(particles_v, sparse_event.meta(), min_voxel_count=5, min_energy_deposit=10, use_particle_shape=False) np_values = np.column_stack([part_info[:, 3], part_info[:, 8]]) if part_info.shape[0] > 0 else np.empty(shape=(0, 2), dtype=np.float32) if include_point_tagging: np_values = np.column_stack([part_info[:, 3], part_info[:, 8], part_info[:, 9]]) if part_info.shape[0] > 0 else np.empty(shape=(0, 3), dtype=np.float32) @@ -251,49 +138,63 @@ def parse_particle_points(data, include_point_tagging=False): return np.empty(shape=(0, 3), dtype=np.int32), np_values -def parse_particle_points_with_tagging(data): - """ - Same as `parse_particle_points` including start vs end point tagging. +def parse_particle_coords(particle_event, cluster_event): + ''' + Function that returns particle coordinates (start and end) and start time. 
+ + This is used for particle clustering into interactions .. code-block:: yaml schema: - segment_label: - - parse_particle_points_with_tagging - - sparse3d_pcluster - - particle_pcluster + coords: + parser: parse_particle_coords + args: + particle_event: particle_pcluster + cluster_event: cluster3d_pcluster Configuration ------------- - sparse3d_pcluster: larcv::EventSparseTensor3D particle_pcluster: larcv::EventParticle + cluster3d_pcluster: larcv::EventClusterVoxel3D + to translate coordinates Returns ------- - np_voxels: np.ndarray - a numpy array with the shape (N,3) where 3 represents (x,y,z) - coordinate - np_values: np.ndarray - a numpy array with the shape (N, 3) where 3 represents the class of the ground truth point, - the particle data index and end/start tagging in this order. + numpy.ndarray + Shape (N,8) containing: [first_step_x, first_step_y, first_step_z, + last_step_x, last_step_y, last_step_z, first_step_t, shape_id] + ''' + # Scale particle coordinates to image size + particles = parse_particle_asis(particle_event, cluster_event) - See Also - --------- - parse_particle_points - """ - return parse_particle_points(data, include_point_tagging=True) + # Make features + particle_feats = [] + for i, p in enumerate(particles): + start_point = last_point = [p.first_step().x(), p.first_step().y(), p.first_step().z()] + if p.shape() == 1: # End point only meaningful and thought out for tracks + last_point = [p.last_step().x(), p.last_step().y(), p.last_step().z()] + particle_feats.append(np.concatenate((start_point, last_point, [p.first_step().t(), p.shape()]))) + particle_feats = np.vstack(particle_feats) + return particle_feats[:,:3], particle_feats[:,3:] -def parse_particle_graph(data): + +def parse_particle_graph(particle_event, cluster_event=None): """ A function to parse larcv::EventParticle to construct edges between particles (i.e. clusters) + If cluster_event is provided, it also removes edges to clusters + that have a zero pixel count and patches subsequently broken parentage. + .. code-block:: yaml schema: - segment_label: - - parse_particle_graph - - particle_pcluster + graph: + parser: parse_particle_graph + args: + particle_event: particle_pcluster + cluster_event: cluster3d_pcluster Configuration ------------- @@ -308,97 +209,126 @@ def parse_particle_graph(data): -------- parse_particle_graph_corrected: in addition, remove empty clusters. 
""" - particles = data[0] + particles_v = particle_event.as_vector() + num_particles = particles_v.size() + if cluster_event is None: + # Fill edges (directed [parent,child] pair) + edges = np.empty((0,2), dtype = np.int32) + for cluster_id in range(num_particles): + p = particles_v[cluster_id] + if p.parent_id() != p.id(): + edges = np.vstack((edges, [int(p.parent_id()), cluster_id])) + if p.parent_id() == p.id() and p.group_id() != p.id(): + edges = np.vstack((edges, [int(p.group_id()), cluster_id])) + else: + # Check that the cluster and particle objects are consistent + num_clusters = cluster_event.size() + assert num_clusters == num_particles + + # Fill edges (directed [parent,child] pair) + zero_nodes, zero_nodes_pid = [], [] + edges = np.empty((0,2), dtype = np.int32) + for cluster_id in range(num_particles): + cluster = cluster_event.as_vector()[cluster_id] + num_points = cluster.as_vector().size() + p = particles_v[cluster_id] + if p.id() != p.group_id(): + continue + if p.parent_id() != p.group_id(): + edges = np.vstack((edges, [int(p.parent_id()),p.group_id()])) + if num_points == 0: + zero_nodes.append(p.group_id()) + zero_nodes_pid.append(cluster_id) + + # Remove zero pixel nodes + for i, zn in enumerate(zero_nodes): + children = np.where(edges[:, 0] == zn)[0] + if len(children) == 0: + edges = edges[edges[:, 0] != zn] + edges = edges[edges[:, 1] != zn] + continue + parent = np.where(edges[:, 1] == zn)[0] + assert len(parent) <= 1 + + # If zero node has a parent, then assign children to that parent + if len(parent) == 1: + parent_id = edges[parent][0][0] + edges[:, 0][children] = parent_id + else: + edges = edges[edges[:, 0] != zn] + edges = edges[edges[:, 1] != zn] - # For convention, construct particle id => cluster id mapping - particle_to_cluster = np.zeros(shape=[particles.as_vector().size()],dtype=np.int32) + return edges - # Fill edges (directed, [parent,child] pair) - edges = np.empty((0,2), dtype = np.int32) - for cluster_id in range(particles.as_vector().size()): - p = particles.as_vector()[cluster_id] - #print(p.id(), p.parent_id(), p.group_id()) - if p.parent_id() != p.id(): - edges = np.vstack((edges, [int(p.parent_id()),cluster_id])) - if p.parent_id() == p.id() and p.group_id() != p.id(): - edges = np.vstack((edges, [int(p.group_id()),cluster_id])) - return edges +def parse_particle_singlep_pdg(particle_event): + """ + Get each true particle's PDG code. + + .. code-block:: yaml + schema: + pdg_list: + parser: parse_particle_singlep_pdg + args: + particle_event: particle_pcluster -def parse_particle_graph_corrected(data): + Configuration + ---------- + particle_event : larcv::EventParticle + + Returns + ------- + np.ndarray + List of PDG codes for each particle in TTree. """ - A function to parse larcv::EventParticle to construct edges between particles (i.e. clusters) + pdgs = [] + pdg = -1 + for p in particle_event.as_vector(): + if not p.track_id() == 1: continue + if int(p.pdg_code()) in TYPE_LABELS.keys(): + pdg = TYPE_LABELS[int(p.pdg_code())] + else: pdg = -1 + return np.asarray([pdg]) + + return np.asarray([pdg]) - Also removes edges to clusters that have a zero pixel count. + +def parse_particle_singlep_einit(particle_event): + """ + Get each true particle's true initial energy. .. 
code-block:: yaml schema: - segment_label: - - parse_particle_graph_corrected - - particle_pcluster - - cluster3d_pcluster + einit_list: + parser: parse_particle_singlep_pdg + args: + particle_event: particle_pcluster Configuration - ------------- - particle_pcluster: larcv::EventParticle - cluster3d_pcluster: larcv::EventClusterVoxel3D + ---------- + particle_event : larcv::EventParticle Returns ------- np.ndarray - a numpy array of directed edges where each edge is (parent,child) batch index ID. - - See Also - -------- - parse_particle_graph: same parser without correcting for empty clusters. + List of true initial energy for each particle in TTree. """ - particles = data[0] - cluster_event = data[1] - - # For convention, construct particle id => cluster id mapping - num_clusters = cluster_event.size() - num_particles = particles.as_vector().size() - assert num_clusters == num_particles - - zero_nodes = [] - zero_nodes_pid = [] - - # Fill edges (directed, [parent,child] pair) - edges = np.empty((0,2), dtype = np.int32) - for cluster_id in range(num_particles): - cluster = cluster_event.as_vector()[cluster_id] - num_points = cluster.as_vector().size() - p = particles.as_vector()[cluster_id] - #print(p.id(), p.parent_id(), p.group_id()) - if p.id() != p.group_id(): - continue - if p.parent_id() != p.group_id(): - edges = np.vstack((edges, [int(p.parent_id()),p.group_id()])) - if num_points == 0: - zero_nodes.append(p.group_id()) - zero_nodes_pid.append(cluster_id) - - # Remove zero pixel nodes: - # print('------------------------------') - # print(edges) - # print(zero_nodes) - for i, zn in enumerate(zero_nodes): - children = np.where(edges[:, 0] == zn)[0] - if len(children) == 0: - edges = edges[edges[:, 0] != zn] - edges = edges[edges[:, 1] != zn] - continue - parent = np.where(edges[:, 1] == zn)[0] - assert len(parent) <= 1 - # If zero node has a parent, then assign children to that parent - if len(parent) == 1: - parent_id = edges[parent][0][0] - edges[:, 0][children] = parent_id - else: - edges = edges[edges[:, 0] != zn] - edges = edges[edges[:, 1] != zn] - # print(edges) + for p in particle_event.as_vector(): + is_primary = p.track_id() == p.parent_track_id() + if not p.track_id() == 1: continue + return p.energy_init() + return -1 - return edges + +def parse_particle_points_with_tagging(sparse_event, particle_event): + from warnings import warn + warn("Deprecated: parse_particle_points_with_tagging deprecated, use parse_particle_points instead", DeprecationWarning) + return parse_particle_points(sparse_event, particle_event, include_point_tagging=True) + + +def parse_particle_graph_corrected(particle_event, cluster_event): + from warnings import warn + warn("Deprecated: parse_particle_graph_corrected deprecated, use parse_particle_graph instead", DeprecationWarning) + return parse_particle_graph(particle_event, cluster_event) diff --git a/mlreco/iotools/parsers/sparse.py b/mlreco/iotools/parsers/sparse.py index ffae3c2a..01a18f9d 100644 --- a/mlreco/iotools/parsers/sparse.py +++ b/mlreco/iotools/parsers/sparse.py @@ -1,9 +1,8 @@ import numpy as np from larcv import larcv -from mlreco.utils.groups import filter_duplicate_voxels, filter_duplicate_voxels_ref, filter_nonimg_voxels -def parse_sparse2d_scn(data): +def parse_sparse2d(sparse_event_list): """ A function to retrieve sparse tensor input from larcv::EventSparseTensor2D object @@ -12,13 +11,17 @@ def parse_sparse2d_scn(data): .. 
code-block:: yaml schema: - segment_label: - - parse_sparse2d_scn - - sparse2d_pcluster + input_data: + parser: parse_sparse2d + args: + sparse_event_list: + - sparse2d_pcluster_0 (, 0) + - sparse2d_pcluster_1 (, 1) + - ... Configuration ------------- - sparse2d_pcluster: larcv::EventSparseTensor2D + sparse_event_list: list of larcv::EventSparseTensor2D Optionally, give an array of (larcv::EventSparseTensor2D, int) for projection id Returns @@ -31,30 +34,30 @@ def parse_sparse2d_scn(data): meta = None output = [] np_voxels = None - for event_tensor2d in data: + for sparse_event in sparse_event_list: projection_id = 0 # default - if isinstance(event_tensor2d, tuple): - projection_id = event_tensor2d[1] - event_tensor2d = event_tensor2d[0] + if isinstance(sparse_event, tuple): + projection_id = sparse_event[1] + sparse_event = sparse_event[0] - tensor2d = event_tensor2d.sparse_tensor_2d(projection_id) - num_point = tensor2d.as_vector().size() + tensor = sparse_event.sparse_tensor_2d(projection_id) + num_point = tensor.as_vector().size() if meta is None: - meta = tensor2d.meta() + meta = tensor.meta() np_voxels = np.empty(shape=(num_point, 2), dtype=np.int32) - larcv.fill_2d_voxels(tensor2d, np_voxels) + larcv.fill_2d_voxels(tensor, np_voxels) # else: - # assert meta == tensor2d.meta() + # assert meta == tensor.meta() np_data = np.empty(shape=(num_point, 1), dtype=np.float32) - larcv.fill_2d_pcloud(tensor2d, np_data) + larcv.fill_2d_pcloud(tensor, np_data) output.append(np_data) return np_voxels, np.concatenate(output, axis=-1) -def parse_sparse3d_scn(data): +def parse_sparse3d(sparse_event_list): """ A function to retrieve sparse tensor input from larcv::EventSparseTensor3D object @@ -63,13 +66,17 @@ def parse_sparse3d_scn(data): .. code-block:: yaml schema: - segment_label: - - parse_sparse3d_scn - - sparse3d_pcluster + input_data: + parser: parse_sparse3d + args: + sparse_event_list: + - sparse3d_pcluster_0 + - sparse3d_pcluster_1 + - ... Configuration ------------- - sparse3d_pcluster: larcv::EventSparseTensor3D + sparse_event_list: list of larcv::EventSparseTensor3D Can be repeated to load more features (one per feature). Returns @@ -82,235 +89,55 @@ def parse_sparse3d_scn(data): meta = None output = [] np_voxels = None - for event_tensor3d in data: - num_point = event_tensor3d.as_vector().size() + for sparse_event in sparse_event_list: + num_point = sparse_event.as_vector().size() if meta is None: - meta = event_tensor3d.meta() + meta = sparse_event.meta() np_voxels = np.empty(shape=(num_point, 3), dtype=np.int32) - larcv.fill_3d_voxels(event_tensor3d, np_voxels) + larcv.fill_3d_voxels(sparse_event, np_voxels) else: - assert meta == event_tensor3d.meta() + assert meta == sparse_event.meta() np_data = np.empty(shape=(num_point, 1), dtype=np.float32) - larcv.fill_3d_pcloud(event_tensor3d, np_data) + larcv.fill_3d_pcloud(sparse_event, np_data) output.append(np_data) return np_voxels, np.concatenate(output, axis=-1) -def parse_sparse3d(data): +def parse_sparse3d_ghost(sparse_event_semantics): """ - A function to retrieve sparse tensor from larcv::EventSparseTensor3D object - and return it in concatenated form (shape (N, 3+C)) instead of voxels and - features arrays separately. - - .. code-block:: yaml - - schema: - segment_label: - - parse_sparse3d - - sparse3d_pcluster - - Configuration - ------------- - sparse3d_pcluster: larcv::EventSparseTensor3D - Can be repeated to load more features (one per feature). 
- - Returns - ------- - np.ndarray - a numpy array with the shape (N,3+C) where 3+C represents - (x,y,z) coordinate and C stored pixel values (channels). - """ - meta = None - output = [] - for event_tensor3d in data: - num_point = event_tensor3d.as_vector().size() - if meta is None: - meta = event_tensor3d.meta() - np_voxels = np.empty(shape=(num_point, 3), dtype=np.int32) - larcv.fill_3d_voxels(event_tensor3d, np_voxels) - output.append(np_voxels) - else: - assert meta == event_tensor3d.meta() - np_values = np.empty(shape=(num_point, 1), dtype=np.float32) - larcv.fill_3d_pcloud(event_tensor3d, np_values) - output.append(np_values) - return np.concatenate(output, axis=-1) - - -def parse_sparse3d_ghost(data): - meta = None - output = [] - np_voxels = None - for event_tensor3d in data: - num_point = event_tensor3d.as_vector().size() - if meta is None: - meta = event_tensor3d.meta() - np_voxels = np.empty(shape=(num_point, 3), dtype=np.int32) - larcv.fill_3d_voxels(event_tensor3d, np_voxels) - else: - assert meta == event_tensor3d.meta() - np_data = np.empty(shape=(num_point, 1), dtype=np.float32) - larcv.fill_3d_pcloud(event_tensor3d, np_data) - output.append(np_data) - - #print('ghost', np_voxels.shape, output[0].shape) - #print('preghost', output[0]) - #print('postghost', (output[0]==5).astype(np.float32)) - return np_voxels, (output[0]==5).astype(np.float32) - - -def parse_weights(data): - """ - A function to generate weights from larcv::EventSparseTensor3D and larcv::Particle list - - For each voxel belonging to a particle :math:`p`, if the particle has :math:`N_p` voxels, - the weight is computed as - - .. math:: - w_p = 1. / (N_p + 1) - - .. code-block:: yaml - - schema: - weights: - - parse_weights - - sparse3d_pcluster - - sparse3d_index - - particle_pcluster - - Configuration - ------------- - sparse3d_pcluster: larcv::EventSparseTensor3D - sparse3d_index: larcv::EventSparseTensor3D - Contains index information (to which particle each voxel belongs) - particle_pcluster: larcv::Particle - - Returns - ------- - np_voxels: np.ndarray - np_values: np.ndarray - Weight values for each voxel - """ - event_tensor3d = data[0] - num_point = event_tensor3d.as_vector().size() - np_voxels = np.empty(shape=(num_point, 3), dtype=np.int32) - larcv.fill_3d_voxels(event_tensor3d, np_voxels) - - event_index = data[1] - assert num_point == event_index.as_vector().size() - np_index = np.empty(shape=(num_point, 1), dtype=np.float32) - larcv.fill_3d_pcloud(event_index, np_index) - - particles = data[2] - num_voxels = np.array([1. / (p.num_voxels()+1) for p in particles.as_vector()]) - - return np_voxels, num_voxels[np_index.astype(int)] - - -def parse_sparse3d_clean(data): - """ - A function to retrieve clusters tensor. Do the following cleaning: - - 1) lexicographically sort coordinates - - 2) choose only one group per voxel (by lexicographic order) + A function to retrieve sparse tensor input from larcv::EventSparseTensor3D object - 3) get labels from the image labels for each voxel in addition to groups + Converts the sematic class to a ghost vs non-ghost label. .. 
code-block:: yaml schema: - segment_label: - - parse_sparse3d_clean - - sparse3d_mcst_reco - - sparse3d_mcst_reco_group - - sparsed_fivetypes_reco + ghost_label: + parser: parse_sparse3d + args: + sparse_event_semantics: sparse3d_semantics Configuration ------------- - sparse3d_mcst_reco: larcv::EventSparseTensor3D - sparse3d_mcst_reco_group: larcv::EventSparseTensor3D - sparse3d_fivetypes_reco: larcv::EventSparseTensor3D + sparse_event_semantics: larcv::EventSparseTensor3D Returns ------- - grp_voxels: np.ndarray - a numpy array with the shape (N,3) where 3 represents (x,y,z) - coordinate - grp_data: np.ndarray - a numpy array with the shape (N,3) where 3 is energy + cluster id + label - - See Also - -------- - parse_sparse3d_scn - """ - img_voxels, img_data = parse_sparse3d_scn([data[0]]) - perm = np.lexsort(img_voxels.T) - img_voxels = img_voxels[perm] - #img_data = img_data[perm] - img_voxels, unique_indices = np.unique(img_voxels, axis=0, return_index=True) - #img_data = img_data[unique_indices] - - grp_voxels, grp_data = parse_sparse3d_scn([data[1]]) - perm = np.lexsort(grp_voxels.T) - grp_voxels = grp_voxels[perm] - grp_data = grp_data[perm] - grp_voxels, unique_indices = np.unique(grp_voxels, axis=0, return_index=True) - grp_data = grp_data[unique_indices] - - label_voxels, label_data = parse_sparse3d_scn([data[2]]) - perm = np.lexsort(label_voxels.T) - label_voxels = label_voxels[perm] - label_data = label_data[perm] - label_voxels, unique_indices = np.unique(label_voxels, axis=0, return_index=True) - label_data = label_data[unique_indices] - - sel2 = filter_nonimg_voxels(grp_voxels, label_voxels[(label_data<5).reshape((-1,)),:], usebatch=False) - inds2 = np.where(sel2)[0] - grp_voxels = grp_voxels[inds2] - grp_data = grp_data[inds2] - - sel2 = filter_nonimg_voxels(img_voxels, label_voxels[(label_data<5).reshape((-1,)),:], usebatch=False) - inds2 = np.where(sel2)[0] - img_voxels = img_voxels[inds2] - img_data = img_data[inds2] - return grp_voxels, np.concatenate([img_data, grp_data, label_data[label_data<5][:, None]], axis=1) - - -def parse_sparse3d_scn_scales(data): + np.ndarray + a numpy array with the shape (N,3+1) where 3+1 represents + (x,y,z) coordinate and 1 stored ghost labels (channels). """ - Retrieves sparse tensors at different spatial sizes. + np_voxels, np_data = parse_sparse3d([sparse_event_semantics]) + return np_voxels, (np_data==5).astype(np.float32) - .. code-block:: yaml - schema: - segment_label: - - parse_sparse3d_scn_scales - - sparse3d_pcluster +def parse_sparse2d_scn(sparse_event_list): + from warnings import warn + warn("Deprecated: parse_sparse2d_scn deprecated, use parse_sparse2d instead", DeprecationWarning) + return parse_sparse2d(sparse_event_list) - Configuration - ------------- - sparse3d_pcluster: larcv::EventSparseTensor3D - Can be repeated to load more features (one per feature). 
- - Returns - ------- - list of tuples - """ - grp_voxels, grp_data = parse_sparse3d_scn(data) - perm = np.lexsort(grp_voxels.T) - grp_voxels = grp_voxels[perm] - grp_data = grp_data[perm] - spatial_size = data[0].meta().num_voxel_x() - max_depth = int(np.floor(np.log2(spatial_size))-1) - scales = [] - for d in range(max_depth): - scale_voxels = np.floor(grp_voxels/2**d)#.astype(int) - scale_voxels, unique_indices = np.unique(scale_voxels, axis=0, return_index=True) - scale_data = grp_data[unique_indices] - # perm = np.lexsort(scale_voxels.T) - # scale_voxels = scale_voxels[perm] - # scale_data = scale_data[perm] - scales.append((scale_voxels, scale_data)) - return scales +def parse_sparse3d_scn(sparse_event_list): + from warnings import warn + warn("Deprecated: parse_sparse3d_scn deprecated, use parse_sparse3d instead", DeprecationWarning) + return parse_sparse3d(sparse_event_list) diff --git a/mlreco/main_funcs.py b/mlreco/main_funcs.py index ab2a1175..5b7ceadd 100644 --- a/mlreco/main_funcs.py +++ b/mlreco/main_funcs.py @@ -1,6 +1,5 @@ -import os, time, datetime, glob, sys +import os, time, datetime, glob, sys, yaml import numpy as np -import pprint try: import MinkowskiEngine as ME except ImportError: @@ -115,8 +114,9 @@ def process_config(cfg, verbose=True): # Report GPUs to be used (if any) # Report configuations if verbose: - pp = pprint.PrettyPrinter(indent=4) - pp.pprint(cfg) + from warnings import filterwarnings + filterwarnings('once', message='Deprecated', category=DeprecationWarning) + print(yaml.dump(cfg, default_flow_style=None)) def make_directories(cfg, loaded_iteration, handlers=None): diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 626e6ee0..08c1e383 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -351,7 +351,7 @@ def get_all_particles(self, frag_result, result, input): for c in particles[b]] ) for b in bcids] parts = [np.array([vids[c].astype(np.int64) for c in particles[b]], - dtype=np.object \ + dtype=object \ if not same_length[idx] \ else np.int64) for idx, b in enumerate(bcids)] @@ -548,7 +548,7 @@ def run_particle_gnns(self, result, input, frag_result): same_length = [np.all([len(c) == len(interactions[b][0]) for c in interactions[b]] ) for b in bcids] interactions_np = [np.array([vids[c].astype(np.int64) for c in interactions[b]], - dtype=np.object if not same_length[idx] else np.int64) \ + dtype=object if not same_length[idx] else np.int64) \ for idx, b in enumerate(bcids)] inter_cosmic_pred_np = [inter_cosmic_pred[b] for idx, b in enumerate(bcids)] diff --git a/mlreco/post_processing/metrics/__init__.py b/mlreco/post_processing/metrics/__init__.py index fbd15cec..513f07cd 100644 --- a/mlreco/post_processing/metrics/__init__.py +++ b/mlreco/post_processing/metrics/__init__.py @@ -19,4 +19,4 @@ from .duq_metrics import duq_metrics from .pid_metrics import pid_metrics from .doublet_metrics import doublet_metrics -from .analysis_tools_metrics import analysis_tools_metrics +#from .analysis_tools_metrics import analysis_tools_metrics diff --git a/mlreco/utils/cluster/fragmenter.py b/mlreco/utils/cluster/fragmenter.py index 7007becd..0fb81c6a 100644 --- a/mlreco/utils/cluster/fragmenter.py +++ b/mlreco/utils/cluster/fragmenter.py @@ -38,7 +38,7 @@ def format_fragments(fragments, frag_batch_ids, frag_seg, batch_column, batch_si for c in fragments_np[b]] ) for b in bcids] frags = [np.array([vids[c].astype(np.int64) for c in fragments_np[b]], 
- dtype=np.object if not same_length[idx] else np.int64) \ + dtype=object if not same_length[idx] else np.int64) \ for idx, b in enumerate(bcids)] frags_seg = [frag_seg_np[b] for idx, b in enumerate(bcids)] diff --git a/mlreco/utils/gnn/data.py b/mlreco/utils/gnn/data.py index 75a6617f..39224503 100644 --- a/mlreco/utils/gnn/data.py +++ b/mlreco/utils/gnn/data.py @@ -191,7 +191,7 @@ def _get_extra_gnn_features(fragments, and `extra_feats` (if `use_supp` is True). """ # Build a mask for the requested classes - mask = np.zeros(len(frag_seg), dtype=np.bool) + mask = np.zeros(len(frag_seg), dtype=bool) for c in classes: mask |= (frag_seg == c) mask = np.where(mask)[0] @@ -264,7 +264,7 @@ def split_clusts(clusts, batch_ids, batches, counts): # Cast the list of clusters to np.array (object type) same_length = [np.all([len(c) == len(bclusts[0]) for c in bclusts]) for bclusts in clusts_split] - return [np.array(clusts_split[b], dtype=np.object if not sl else np.int64) for b, sl in enumerate(same_length)], cbids + return [np.array(clusts_split[b], dtype=object if not sl else np.int64) for b, sl in enumerate(same_length)], cbids @nb.njit(cache=True) def _split_clusts(clusts: nb.types.List(nb.int64[:]), @@ -313,4 +313,4 @@ def split_edge_index(edge_index: nb.int64[:,:], index += n # Split the edge index into a list of edge indices - return [np.ascontiguousarray(np.vstack((ecids[edge_index[0][b]], ecids[edge_index[1][b]])).T) for b in ebids], ebids \ No newline at end of file + return [np.ascontiguousarray(np.vstack((ecids[edge_index[0][b]], ecids[edge_index[1][b]])).T) for b in ebids], ebids diff --git a/mlreco/utils/groups.py b/mlreco/utils/groups.py index c17693f3..3323b5ef 100644 --- a/mlreco/utils/groups.py +++ b/mlreco/utils/groups.py @@ -58,7 +58,7 @@ def filter_duplicate_voxels(data, usebatch=True): else: k = 3 n = data.shape[0] - ret = np.empty(n, dtype=np.bool) + ret = np.empty(n, dtype=bool) for i in range(1,n): if np.all(data[i-1,:k] == data[i,:k]): # duplicate voxel @@ -91,7 +91,7 @@ def filter_duplicate_voxels_ref(data, reference, meta, usebatch=True, precedence else: k = 3 n = data.shape[0] - ret = np.full(n, True, dtype=np.bool) + ret = np.full(n, True, dtype=bool) duplicates = {} for i in range(1,n): if np.all(data[i-1,:k] == data[i,:k]): @@ -125,7 +125,7 @@ def filter_nonimg_voxels(data_grp, data_img, usebatch=True): nimg = data_img.shape[0] igrp = 0 iimg = 0 - ret = np.empty(ngrp, dtype=np.bool) # return array + ret = np.empty(ngrp, dtype=bool) # return array while igrp < ngrp and iimg < nimg: if np.all(data_grp[igrp,:k] == data_img[iimg,:k]): # voxel is in both data diff --git a/mlreco/utils/metrics.py b/mlreco/utils/metrics.py index 72f4e12a..798840f0 100644 --- a/mlreco/utils/metrics.py +++ b/mlreco/utils/metrics.py @@ -105,7 +105,7 @@ def contingency_table(a, b, na=None, nb=None): na = np.max(a) if not nb: nb = np.max(b) - table = np.zeros((na, nb), dtype=np.int) + table = np.zeros((na, nb), dtype=int) for i, j in zip(a,b): table[i,j] += 1 return table From 2aa674419f9478aad80907d37fa1fe8268a699a0 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Mon, 18 Jul 2022 21:28:53 -0700 Subject: [PATCH 25/30] Minor bug fix in GNN node kinematics loss (accuracy could be > 1) --- mlreco/iotools/parsers/__init__.py | 2 +- mlreco/models/layers/gnn/losses/node_kinematics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlreco/iotools/parsers/__init__.py b/mlreco/iotools/parsers/__init__.py index 04e68617..a3ea33a1 100644 --- a/mlreco/iotools/parsers/__init__.py +++ 
b/mlreco/iotools/parsers/__init__.py @@ -35,8 +35,8 @@ .. csv-table:: Miscellaneous parsers :header: Parser name, Description - ``parse_meta3d``, Get the meta information to translate into real world coordinates (3D) ``parse_meta2d``, Get the meta information to translate into real world coordinates (2D) + ``parse_meta3d``, Get the meta information to translate into real world coordinates (3D) ``parse_run_info``, Parse run info (run, subrun, event number) diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index c1487055..fa64f7bc 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -337,7 +337,7 @@ def forward(self, out, types): if compute_vtx and node_pred_vtx[good_index].shape[0]: # print(node_pred_vtx[good_index & positives.bool(), :3], node_assn_vtx[good_index & positives.bool()]) vtx_position_acc += float(torch.sum(1. - torch.abs(node_pred_vtx[good_index & positives.bool(), :3]-node_assn_vtx[good_index & positives.bool()])/(torch.abs(node_assn_vtx[good_index & positives.bool()]) + torch.abs(node_pred_vtx[good_index & positives.bool(), :3]))))/3. - vtx_score_acc += float(torch.sum(torch.argmax(node_pred_vtx[good_index, 3:], dim=1) == positives[good_index])) + vtx_score_acc += float(torch.sum(torch.argmax(node_pred_vtx[good_index & (positives >= 0.), 3:], dim=1) == positives[good_index & (positives >= 0.)])) n_clusts = n_clusts_type + n_clusts_momentum + n_clusts_vtx + n_clusts_vtx_positives From 1819f5f61a16f4dbc6b6d62e289b5b63479f88c2 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Tue, 19 Jul 2022 14:10:18 -0700 Subject: [PATCH 26/30] Bug fix: ensure the are type predictions before enforcing semantic logic --- mlreco/models/layers/common/gnn_full_chain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlreco/models/layers/common/gnn_full_chain.py b/mlreco/models/layers/common/gnn_full_chain.py index 08c1e383..82f05ff1 100644 --- a/mlreco/models/layers/common/gnn_full_chain.py +++ b/mlreco/models/layers/common/gnn_full_chain.py @@ -455,7 +455,7 @@ def run_particle_gnns(self, result, input, frag_result): # If requested, enforce that particle PID predictions are compatible with semantics, # i.e. set logits to -inf if they belong to incompatible PIDs - if self._inter_enforce_semantics: + if self._inter_enforce_semantics and 'node_pred_type' in result: sem_pid_logic = -float('inf')*torch.ones(self._inter_enforce_semantics_shape, dtype=input[0].dtype, device=input[0].device) sem_pid_logic[self._inter_enforce_semantics_map] = 0. 
pid_logits = result['node_pred_type'] From 65b602efad6eeeecf36092cb23f616c20ab71e28 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 20 Jul 2022 16:34:23 -0700 Subject: [PATCH 27/30] Fixed bugs in vtx prediction accuracy evaluation that caused accuracy figures > 1 --- mlreco/main_funcs.py | 1 + .../layers/gnn/losses/node_kinematics.py | 144 +++++++----------- 2 files changed, 56 insertions(+), 89 deletions(-) diff --git a/mlreco/main_funcs.py b/mlreco/main_funcs.py index 5b7ceadd..0f33ff85 100644 --- a/mlreco/main_funcs.py +++ b/mlreco/main_funcs.py @@ -71,6 +71,7 @@ def process_config(cfg, verbose=True): 'particle_edge_pred', 'particle_group_pred', 'particles', 'inter_edge_index', 'inter_node_pred', 'inter_edge_pred', 'inter_group_pred', 'inter_particles', 'node_pred_p', 'node_pred_type', + 'vertex_labels', 'anchors', 'grappa_inter_vertex_labels', 'grappa_inter_anchors', 'kinematics_node_pred_p', 'kinematics_node_pred_type', 'flow_edge_pred', 'kinematics_particles', 'kinematics_edge_index', 'clust_fragments', 'clust_frag_seg', 'interactions', 'inter_cosmic_pred', diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index fa64f7bc..fd961df9 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -138,7 +138,7 @@ def forward(self, out, types): total_loss, total_acc = 0., 0. type_loss, p_loss, type_acc, p_acc = 0., 0., 0., 0. vtx_position_loss, vtx_score_loss, vtx_position_acc, vtx_score_acc = 0., 0., 0., 0. - n_clusts_type, n_clusts_momentum, n_clusts_vtx, n_clusts_vtx_positives = 0, 0, 0, 0 + n_clusts_type, n_clusts_momentum, n_clusts_vtx, n_clusts_vtx_pos = 0, 0, 0, 0 compute_type = 'node_pred_type' in out compute_momentum = 'node_pred_p' in out @@ -168,7 +168,7 @@ def forward(self, out, types): # Increment the type loss, balance classes if requested if compute_type: - # Get the class labels and true type from the specified columns + # Get the type predictions and true types from the specified columns node_pred_type = out['node_pred_type'][i][j] if not node_pred_type.shape[0]: continue @@ -189,9 +189,8 @@ def forward(self, out, types): # Compute loss if len(valid_mask_type): - node_assn_type = torch.tensor(node_assn_type, dtype=torch.long, device=node_pred_type.device, requires_grad=False) node_pred_type = node_pred_type[valid_mask_type] - node_assn_type = node_assn_type[valid_mask_type] + node_assn_type = torch.tensor(node_assn_type[valid_mask_type], dtype=torch.long, device=node_pred_type.device, requires_grad=False) if self.balance_classes: vals, counts = torch.unique(node_assn_type, return_counts=True) @@ -210,7 +209,7 @@ def forward(self, out, types): # Increment the momentum loss if compute_momentum: - # Get the class labels and true momenta from the specified columns + # Get the momentum predictions and true momenta from the specified columns node_pred_p = out['node_pred_p'][i][j] if not node_pred_p.shape[0]: continue @@ -226,7 +225,7 @@ def forward(self, out, types): valid_mask_p &= (cnts[inv] == 1) valid_mask_p = np.where(valid_mask_p)[0] - # Do not apply loss to nodes labeled -1 (unknown class) + # Compute loss if len(valid_mask_p): node_pred_p = node_pred_p[valid_mask_p] node_assn_p = node_assn_p[valid_mask_p] @@ -239,92 +238,60 @@ def forward(self, out, types): n_clusts_momentum += len(clusts) if compute_vtx: + # Get the vertex predictions, node features and true vertices from the specified columns node_pred_vtx = 
out['node_pred_vtx'][i][j] + input_node_features = out['input_node_features'][i][j] if not node_pred_vtx.shape[0]: continue - input_node_features = out['input_node_features'][i][j] + node_assn_vtx = np.stack([get_cluster_label(labels, clusts, column=c) for c in range(self.vtx_col, self.vtx_col+3)], axis=1) + node_assn_vtx_pos = get_cluster_label(labels, clusts, column=self.vtx_positives_col) - # Predictions are shifts w.r.t the barycenter of each cluster - # anchors = [] - # for c in clusts: - # anchors.append(torch.mean(labels[c, :3], dim=0) + 0.5) - # anchors = torch.stack(anchors) - # node_pred_vtx[:, :3] = node_pred_vtx[:, :3] + anchors - - node_x_vtx = get_cluster_label(labels, clusts, column=self.vtx_col) - node_y_vtx = get_cluster_label(labels, clusts, column=self.vtx_col+1) - node_z_vtx = get_cluster_label(labels, clusts, column=self.vtx_col+2) - positives = get_cluster_label(labels, clusts, column=self.vtx_positives_col) - - node_assn_vtx = torch.tensor(np.stack([node_x_vtx, node_y_vtx, node_z_vtx], axis=1), - dtype=torch.float, device=node_pred_vtx.device, requires_grad=False) + # Do not apply loss to nodes labeled -1 or nodes with vertices outside of volume (TODO: this is weak if the volume is not a cube) + valid_mask_vtx = (node_assn_vtx >= 0.).all(axis=1) & (node_assn_vtx <= self.spatial_size).all(axis=1) & (node_assn_vtx_pos > -1) # If high purity is requested, do not include broken particle in the loss if self.vtx_high_purity: - group_ids = get_cluster_label(labels, clusts, column=self.group_col) - _, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True) - valid_mask_vtx = np.where(cnts[inv] == 1)[0] - node_pred_vtx = node_pred_vtx[valid_mask_vtx] - node_assn_vtx = node_assn_vtx[valid_mask_vtx] - positives = positives[valid_mask_vtx] - input_node_features = input_node_features[valid_mask_vtx] - - if self.normalize_vtx_label: - node_assn_vtx = node_assn_vtx/self.spatial_size - - # Exclude vertex that is outside of the volume - good_index = torch.all( (0 <= node_assn_vtx) & (node_assn_vtx <= 1), dim=1) - else: - good_index = torch.all( (0 <= node_assn_vtx) & (node_assn_vtx <= self.spatial_size), dim=1) - - # Take the max for each cluster - e.g. for a shower, the primary fragment only - # is marked as primary particle, so taking majority count would eliminate the shower - # from primary particles for vertex identification purpose. - # positives = [] - # for c in clusts: - # positives.append(labels[c, self.vtx_positives_col].max().item()) - # positives = np.array(positives) - - positives = torch.tensor(positives, dtype=torch.long, device=node_pred_vtx.device, requires_grad=False) - - reshaped = input_node_features[:, 19:25][good_index & positives.bool()].view(-1, 2, 3) - - # Do not apply loss to nodes labeled -1 (unknown class) - node_mask = good_index & positives.bool() & (positives >= 0.) 
- # print(node_mask.any()) - if node_mask.any(): - # for now only sum losses, they get averaged below in results dictionary - loss2 = self.vtx_score_loss(node_pred_vtx[good_index & (positives >= 0.), 3:], positives[good_index & (positives >= 0.)]) - - pos_pred = node_pred_vtx[node_mask, :3] - pos_label = node_assn_vtx[node_mask] - - if self.use_anchor_points: - dist_to_anchor = torch.norm(pos_pred.view(-1, 1, 3) - reshaped, dim=2).view(-1, 2) + group_ids = get_cluster_label(labels, clusts, column=self.group_col) + _, inv, cnts = np.unique(group_ids, return_inverse=True, return_counts=True) + valid_mask_vtx &= cnts[inv] == 1 + valid_mask_vtx = np.where(valid_mask_vtx)[0] + + # Compute the losses only if there is at least > 1 positive node + pos_mask_vtx = np.where(node_assn_vtx_pos[valid_mask_vtx])[0] + if len(pos_mask_vtx): + # Compute the primary score loss on all valid nodes + node_pred_vtx = node_pred_vtx[valid_mask_vtx] + node_assn_vtx_pos = torch.tensor(node_assn_vtx_pos[valid_mask_vtx], dtype=torch.long, device=node_pred_vtx.device) + + loss1 = self.vtx_score_loss(node_pred_vtx[:, 3:], node_assn_vtx_pos) + + # Compute the vertex position loss on positive nodes only + vtx_label = torch.tensor(node_assn_vtx[valid_mask_vtx][pos_mask_vtx], dtype=node_pred_vtx.dtype, device=node_pred_vtx.device) + if self.normalize_vtx_label: # If requested, bring vertex labels in the range [0,1 ] + vtx_label = vtx_label/self.spatial_size + vertex_labels.append(vtx_label) + + vtx_pred = node_pred_vtx[pos_mask_vtx,:3] + if self.use_anchor_points: # If requested, predict positions with respect to anchor points (end points of particles) + end_points = input_node_features[valid_mask_vtx,19:25][pos_mask_vtx].view(-1, 2, 3) + dist_to_anchor = torch.norm(vtx_pred.view(-1, 1, 3) - end_points, dim=2).view(-1, 2) min_dist = torch.argmin(dist_to_anchor, dim=1) - range_index = torch.arange(reshaped.shape[0]).to(device=reshaped.device).long() - anchors = reshaped[range_index, min_dist, :] + range_index = torch.arange(end_points.shape[0]).to(device=end_points.device).long() + anchors = end_points[range_index, min_dist, :] anchors_list.append(anchors) - pos_pred = pos_pred + anchors + vtx_pred = vtx_pred + anchors - vertex_labels.append(pos_label) - loss1 = torch.mean(torch.clamp(torch.sum(self.vtx_position_loss(pos_pred, pos_label), dim=1), + loss2 = torch.mean(torch.clamp(torch.sum(self.vtx_position_loss(vtx_pred, vtx_label), dim=1), max=self.max_vertex_distance**2)) - # print(loss1, pos_pred) - # assert False - - # loss1 = torch.sum(torch.mean(self.vtx_position_loss(pos_pred, pos_label), dim=1)) - # print(loss1, loss2) - + # Combine losses total_loss += loss1 + loss2 + vtx_score_loss += float(loss1) + vtx_position_loss += float(loss2) - vtx_position_loss += float(loss1) - vtx_score_loss += float(loss2) - - n_clusts_vtx += (good_index).sum().item() - n_clusts_vtx_positives += (good_index & positives.bool()).sum().item() - # print("Removing", (~good_index).sum().item(), len(good_index) ) + # Increment the number of nodes + n_clusts_vtx += len(valid_mask_vtx) + n_clusts_vtx_pos += len(pos_mask_vtx) # Compute the accuracy of assignment (fraction of correctly assigned nodes) # and the accuracy of momentum estimation (RMS relative residual) @@ -334,12 +301,11 @@ def forward(self, out, types): if compute_momentum and len(valid_mask_p): p_acc += float(torch.sum(1.- torch.abs(node_pred_p.squeeze()-node_assn_p)/node_assn_p)) # 1-MAPE - if compute_vtx and node_pred_vtx[good_index].shape[0]: - # print(node_pred_vtx[good_index & 
positives.bool(), :3], node_assn_vtx[good_index & positives.bool()]) - vtx_position_acc += float(torch.sum(1. - torch.abs(node_pred_vtx[good_index & positives.bool(), :3]-node_assn_vtx[good_index & positives.bool()])/(torch.abs(node_assn_vtx[good_index & positives.bool()]) + torch.abs(node_pred_vtx[good_index & positives.bool(), :3]))))/3. - vtx_score_acc += float(torch.sum(torch.argmax(node_pred_vtx[good_index & (positives >= 0.), 3:], dim=1) == positives[good_index & (positives >= 0.)])) + if compute_vtx and len(pos_mask_vtx): + vtx_position_acc += float(torch.sum(1. - torch.abs(vtx_pred - vtx_label)/(torch.abs(vtx_pred) + torch.abs(vtx_label))))/3. + vtx_score_acc += float(torch.sum(torch.argmax(node_pred_vtx[:,3:], dim=1) == node_assn_vtx_pos)) - n_clusts = n_clusts_type + n_clusts_momentum + n_clusts_vtx + n_clusts_vtx_positives + n_clusts = n_clusts_type + n_clusts_momentum + n_clusts_vtx + n_clusts_vtx_pos # Handle the case where no cluster/edge were found if not n_clusts: @@ -349,7 +315,7 @@ def forward(self, out, types): 'n_clusts_momentum': n_clusts_momentum, 'n_clusts_type': n_clusts_type, 'n_clusts_vtx': n_clusts_vtx, - 'n_clusts_vtx_positives': n_clusts_vtx_positives + 'n_clusts_vtx_positives': n_clusts_vtx_pos } if compute_type: result.update({ @@ -376,11 +342,11 @@ def forward(self, out, types): 'n_clusts_momentum': n_clusts_momentum, 'n_clusts_type': n_clusts_type, 'n_clusts_vtx': n_clusts_vtx, - 'n_clusts_vtx_positives': n_clusts_vtx_positives + 'n_clusts_vtx_positives': n_clusts_vtx_pos } - result['anchors'] = [anchors_list] - result['vertex_labels'] = [vertex_labels] + result['anchors'] = anchors_list + result['vertex_labels'] = vertex_labels if compute_type: result.update({ @@ -396,8 +362,8 @@ def forward(self, out, types): result.update({ 'vtx_score_loss': 0. if not n_clusts_vtx else vtx_score_loss/n_clusts_vtx, 'vtx_score_acc': 0. if not n_clusts_vtx else vtx_score_acc/n_clusts_vtx, - 'vtx_position_loss': 0. if not n_clusts_vtx_positives else vtx_position_loss/n_clusts_vtx_positives, - 'vtx_position_acc': 0. if not n_clusts_vtx_positives else vtx_position_acc/n_clusts_vtx_positives, + 'vtx_position_loss': 0. if not n_clusts_vtx_pos else vtx_position_loss/n_clusts_vtx_pos, + 'vtx_position_acc': 0. 
if not n_clusts_vtx_pos else vtx_position_acc/n_clusts_vtx_pos, }) return result From 4148a06ce23a2e60691222d8dd6beca27eb88d63 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Wed, 20 Jul 2022 17:12:54 -0700 Subject: [PATCH 28/30] Add option to include MPR particle in the PID target --- mlreco/iotools/parsers/cluster.py | 7 ++++--- mlreco/utils/groups.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/mlreco/iotools/parsers/cluster.py b/mlreco/iotools/parsers/cluster.py index 4639a79d..cb2dab54 100644 --- a/mlreco/iotools/parsers/cluster.py +++ b/mlreco/iotools/parsers/cluster.py @@ -62,7 +62,8 @@ def parse_cluster3d(cluster_event, add_particle_info = False, add_kinematics_info = False, clean_data = True, - precedence = [1,2,0,3,4]): + precedence = [1,2,0,3,4], + type_include_mpr = False): """ a function to retrieve a 3D clusters tensor @@ -136,11 +137,11 @@ def parse_cluster3d(cluster_event, if add_particle_info: labels['inter'] = inter_ids labels['nu'] = nu_ids - labels['type'] = get_particle_id(particles_v, nu_ids) + labels['type'] = get_particle_id(particles_v, nu_ids, include_mpr=type_include_mpr) labels['primary'] = get_primary_id(cluster_event, particles_v) if add_kinematics_info: particles_asis_v = parse_particle_asis(particle_event, cluster_event) - labels['type'] = get_particle_id(particles_v, nu_ids) + labels['type'] = get_particle_id(particles_v, nu_ids, include_mpr=type_include_mpr) labels['p'] = np.array([(p.px()**2+p.py()**2+p.pz()**2)/1e3 for p in particles_v]) labels['vtx_x'] = np.array([p.ancestor_position().x() for p in particles_asis_v]) labels['vtx_y'] = np.array([p.ancestor_position().y() for p in particles_asis_v]) diff --git a/mlreco/utils/groups.py b/mlreco/utils/groups.py index 3323b5ef..a945fb03 100644 --- a/mlreco/utils/groups.py +++ b/mlreco/utils/groups.py @@ -312,19 +312,23 @@ def get_nu_id(cluster_event, particle_v, interaction_ids, particle_mpv=None): } -def get_particle_id(particles_v, nu_ids): +def get_particle_id(particles_v, nu_ids, include_mpr=False): ''' Function that gives one of five labels to particles of particle species predictions. This function ensures: - - Particles that do not originate from an MPV are labeled -1 + - Particles that do not originate from an MPV are labeled -1, + unless the include_mpr flag is set to true - Particles that are not a track or a shower, and their - daughters, are labeled -1 (Michel and Delta) + daughters, are labeled -1 (Michel and Delta), as their + particle type is already constrained (only electron) - Particles that are neutron daughters are labeled -1 - All shower daughters are labeled the same as their primary. This makes sense as otherwise an electron primary gets overruled by its many photon daughters (voxel-wise majority vote). This can lead to problems as, if an electron daughter is not clustered with the primary, it is labeled electron, which is counter-intuitive. + This is handled downstream with the high_purity flag. 
+ - Particles that are not in the list target are labeled -1 Inputs: - particles_v (array of larcv::Particle) : (N) LArCV Particle objects @@ -341,7 +345,7 @@ def get_particle_id(particles_v, nu_ids): process = particles_v[group_id].creation_process() shape = int(particles_v[group_id].shape()) - if nu_ids[i] < 1: t = -1 + if not include_mpr and nu_ids[i] < 1: t = -1 if shape > 1: t = -1 if (t == 22 or t == 2212) and ('Inelastic' in process or 'Capture' in process): t = -1 From a4e9918cb87d38c20200f7f8daf11fece596a070 Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 21 Jul 2022 09:38:51 -0700 Subject: [PATCH 29/30] Implemented a more numerically stable version of softmax in numba, fixed some issues with high scores --- mlreco/utils/numba.py | 45 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/mlreco/utils/numba.py b/mlreco/utils/numba.py index da29ebc7..a4aafede 100644 --- a/mlreco/utils/numba.py +++ b/mlreco/utils/numba.py @@ -154,6 +154,40 @@ def argmax_nb(x: nb.float64[:,:], return argmax +@nb.njit(cache=True) +def min_nb(x: nb.float64[:,:], + axis: nb.int64) -> nb.float64[:]: + """ + Numba implementation of np.max(x, axis) + """ + assert axis == 0 or axis == 1 + xmin = np.empty(x.shape[1-axis], dtype=np.int64) + if axis == 0: + for i in range(len(xmin)): + xmin[i] = np.min(x[:,i]) + else: + for i in range(len(xmax)): + xmin[i] = np.min(x[i]) + return xmin + + +@nb.njit(cache=True) +def max_nb(x: nb.float64[:,:], + axis: nb.int64) -> nb.float64[:]: + """ + Numba implementation of np.max(x, axis) + """ + assert axis == 0 or axis == 1 + xmax = np.empty(x.shape[1-axis], dtype=np.int64) + if axis == 0: + for i in range(len(xmax)): + xmax[i] = np.max(x[:,i]) + else: + for i in range(len(xmax)): + xmax[i] = np.max(x[i]) + return xmax + + @nb.njit(cache=True) def all_nb(x: nb.float64[:,:], axis: nb.int64) -> nb.int64[:]: @@ -175,11 +209,14 @@ def all_nb(x: nb.float64[:,:], def softmax_nb(x: nb.float64[:,:], axis: nb.int64) -> nb.float64[:,:]: assert axis == 0 or axis == 1 - exps = np.exp(x) if axis == 0: - return exps/np.sum(exps,axis=0) + xmax = max_nb(x, axis=0) + logsumexp = np.log(np.sum(np.exp(x-xmax), axis=0)) + xmax + return np.exp(x - logsumexp) else: - return exps/np.sum(exps,axis=1).reshape(-1,1) + xmax = max_nb(x, axis=1).reshape(-1,1) + logsumexp = np.log(np.sum(np.exp(x-xmax), axis=1)).reshape(-1,1) + xmax + return np.exp(x - logsumexp) @nb.njit(cache=True) @@ -187,4 +224,4 @@ def log_loss_nb(x1: nb.boolean[:], x2: nb.float64[:]) -> nb.float64: if len(x1) > 0: return -(np.sum(np.log(x2[x1])) + np.sum(np.log(1.-x2[~x1])))/len(x1) else: - return 0. \ No newline at end of file + return 0. 
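For reference, the max-subtraction (log-sum-exp) trick that PATCH 29/30 introduces can be sketched in plain NumPy as below. This is a minimal, standalone illustration of the technique, not the Numba-compiled helpers in the patch; the function name stable_softmax and the example values are purely illustrative.

    import numpy as np

    def stable_softmax(x, axis):
        # Subtract the per-axis maximum before exponentiating so that the largest
        # exponent is exp(0) = 1, which avoids overflow for large scores.
        xmax = np.max(x, axis=axis, keepdims=True)
        logsumexp = np.log(np.sum(np.exp(x - xmax), axis=axis, keepdims=True)) + xmax
        return np.exp(x - logsumexp)

    # Logits this large overflow a naive exp(x)/sum(exp(x)), but not the stable form
    logits = np.array([[1000., 1001., 1002.]])
    print(stable_softmax(logits, axis=1))  # ~[[0.090, 0.245, 0.665]]

Subtracting the maximum leaves the result mathematically unchanged (softmax is invariant under a constant shift of its inputs) while keeping every intermediate exponent bounded by 1.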
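The effect of the include_mpr option added in PATCH 28/30 above can be illustrated with a small toy version of the labeling rule. This is a simplified sketch, not the repository's get_particle_id (which reads PDG codes and creation processes from larcv::Particle objects); the five-species target mapping is abbreviated and antiparticles are folded in with abs().

    def toy_particle_id(pdg_codes, nu_ids, include_mpr=False):
        # Five-species PID target: photon, electron, muon, pion, proton
        target = {22: 0, 11: 1, 13: 2, 211: 3, 2212: 4}
        labels = []
        for pdg, nu_id in zip(pdg_codes, nu_ids):
            t = target.get(abs(pdg), -1)       # species outside the target list -> -1
            if not include_mpr and nu_id < 1:  # particle not from an MPV: mask unless requested
                t = -1
            labels.append(t)
        return labels

    print(toy_particle_id([13, 2212, 11], [1, 0, 1]))                    # [2, -1, 1]
    print(toy_particle_id([13, 2212, 11], [1, 0, 1], include_mpr=True))  # [2, 4, 1]

With include_mpr=False the proton with nu_id < 1 is excluded from the PID target; setting the flag keeps it with its species label.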
From 20ead7184e23a5415c77d5e3ab0838f634346aff Mon Sep 17 00:00:00 2001 From: Francois Drielsma Date: Thu, 21 Jul 2022 12:52:57 -0700 Subject: [PATCH 30/30] Bug fix in vertex labels and anchor output --- mlreco/models/layers/gnn/losses/node_kinematics.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mlreco/models/layers/gnn/losses/node_kinematics.py b/mlreco/models/layers/gnn/losses/node_kinematics.py index fd961df9..cc8f2e5a 100644 --- a/mlreco/models/layers/gnn/losses/node_kinematics.py +++ b/mlreco/models/layers/gnn/losses/node_kinematics.py @@ -269,7 +269,7 @@ def forward(self, out, types): vtx_label = torch.tensor(node_assn_vtx[valid_mask_vtx][pos_mask_vtx], dtype=node_pred_vtx.dtype, device=node_pred_vtx.device) if self.normalize_vtx_label: # If requested, bring vertex labels in the range [0,1 ] vtx_label = vtx_label/self.spatial_size - vertex_labels.append(vtx_label) + vertex_labels.append(vtx_label.detach().cpu().numpy()) vtx_pred = node_pred_vtx[pos_mask_vtx,:3] if self.use_anchor_points: # If requested, predict positions with respect to anchor points (end points of particles) @@ -278,7 +278,7 @@ def forward(self, out, types): min_dist = torch.argmin(dist_to_anchor, dim=1) range_index = torch.arange(end_points.shape[0]).to(device=end_points.device).long() anchors = end_points[range_index, min_dist, :] - anchors_list.append(anchors) + anchors_list.append(anchors.detach().cpu().numpy()) vtx_pred = vtx_pred + anchors loss2 = torch.mean(torch.clamp(torch.sum(self.vtx_position_loss(vtx_pred, vtx_label), dim=1), @@ -292,6 +292,9 @@ def forward(self, out, types): # Increment the number of nodes n_clusts_vtx += len(valid_mask_vtx) n_clusts_vtx_pos += len(pos_mask_vtx) + else: + vertex_labels.append(np.empty((0,3))) + if self.use_anchor_points: anchors.append(np.empty((0,3))) # Compute the accuracy of assignment (fraction of correctly assigned nodes) # and the accuracy of momentum estimation (RMS relative residual)
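To make the bookkeeping touched by this last patch easier to follow, here is a self-contained sketch of the pattern: graphs with positive nodes store detached CPU copies of their labels and of the anchor (the particle end point closest to each prediction), while graphs with none append empty (0, 3) placeholders so the per-graph lists stay aligned for later concatenation. Shapes and values are toy stand-ins, not the loss module's API.

    import numpy as np
    import torch

    def toy_vertex_outputs(preds, end_points, labels):
        vertex_labels, anchors_out = [], []
        for pred, ends, lab in zip(preds, end_points, labels):
            if len(pred):  # graph has positive nodes
                # Anchor = closer of the two particle end points for each node
                dist = torch.norm(pred.view(-1, 1, 3) - ends, dim=2)              # (N, 2)
                anchor = ends[torch.arange(len(pred)), torch.argmin(dist, dim=1)]  # (N, 3)
                # Store detached CPU copies so the output keeps no autograd graph
                vertex_labels.append(lab.detach().cpu().numpy())
                anchors_out.append(anchor.detach().cpu().numpy())
            else:          # no positive node: empty block keeps the lists aligned
                vertex_labels.append(np.empty((0, 3)))
                anchors_out.append(np.empty((0, 3)))
        return np.concatenate(vertex_labels), np.concatenate(anchors_out)

    preds  = [torch.rand(4, 3), torch.rand(0, 3)]
    ends   = [torch.rand(4, 2, 3), torch.rand(0, 2, 3)]
    labels = [torch.rand(4, 3), torch.rand(0, 3)]
    v, a = toy_vertex_outputs(preds, ends, labels)
    print(v.shape, a.shape)  # (4, 3) (4, 3)

Detaching before storing avoids holding the autograd graph (and GPU memory) alive through the output dictionary, and the empty placeholders mean downstream np.concatenate calls need no special casing.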