
Merge pull request #108 from Temigo/me
Small changes
Temigo authored May 18, 2022
2 parents b7b6a2e + cc944e9 commit cf154b0
Showing 8 changed files with 199 additions and 5 deletions.
90 changes: 90 additions & 0 deletions bin/check_valid_dataset.py
@@ -0,0 +1,90 @@
# Script to mark all bad ROOT files before merging them with hadd
# ================================================================
#
# Usage: python3 bin/check_valid_dataset.py bad_files.txt file1.root file2.root ... fileN.root
#
# Output: writes the list of bad files to bad_files.txt
#         (one per line), which can then be used to move or remove
#         these files before running hadd. For example:
#
#   $ for file in $(cat bad_files.txt); do mv "$file" bad_files/; done
#
# What it does:
#   Loops over all TTrees in each ROOT file and checks that they all
#   have the same number of entries, and that each file contains the
#   same set of TTree keys as the rest of the dataset.
#
from ROOT import TCanvas, TPad, TFile, TPaveLabel, TPaveText, TChain
from ROOT import gROOT
import ROOT
import pandas as pd
import numpy as np
import argparse


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Check validity of dataset")
    parser.add_argument("output_file", type=str, help="output text file to write bad file names")
    parser.add_argument("files", type=str, nargs="+", help="files to check")

    args = parser.parse_args()

    output = open(args.output_file, 'w')
    bad_files = []
    global_keys = []   # TTree key names found in each file
    counts = []        # number of distinct entry counts among each file's TTrees

    def mark_bad_file(file):
        output.write(file + '\n')
        bad_files.append(file)

    for idx, file in enumerate(args.files):
        print(file)
        f = TFile(file)
        keys = [key.GetName() for key in f.GetListOfKeys()]
        global_keys.append(keys)

        # If keys is a subset of global_keys or global_keys is shorter
        # if global_keys is None:
        #     global_keys = keys
        # elif len(np.intersect1d(keys, global_keys)) < len(global_keys):
        #     # keys is a subset of global keys
        #     mark_bad_file(file)
        #     continue
        # elif len(np.intersect1d(keys, global_keys)) < len(keys):
        #     # global_keys is a subset of keys
        #     if args.files[idx-1] not in bad_files:
        #         mark_bad_file(args.files[idx-1])
        #     global_keys = keys
        # Note: that assumes we don't get 2 files in a row with bad keys...

        trees = [f.Get(key) for key in keys]

        nentries = [tree.GetEntries() for tree in trees]
        counts.append(len(np.unique(nentries)))

    all_keys = np.unique(np.hstack(global_keys))

    # Test whether two lists of strings contain exactly the same elements
    def is_equal(a, b):
        c = np.intersect1d(a, b)
        return len(c) == len(a) and len(c) == len(b)

    # A file is bad if its TTrees disagree on the entry count,
    # or if it is missing some of the keys seen across the whole dataset.
    for idx, file in enumerate(args.files):
        if counts[idx] != 1 or not is_equal(np.unique(global_keys[idx]), all_keys):
            mark_bad_file(file)

    print('\nFound bad files: ')
    for f in bad_files:
        print(f)

    output.close()
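As a quick way to see what the flagging logic does, here is a minimal sketch run on made-up key lists and entry counts (no ROOT needed); the file names, keys and counts below are purely illustrative:

import numpy as np

# Hypothetical key lists and per-tree entry counts for three files
files   = ['run0.root', 'run1.root', 'run2.root']
keys    = [['sparse3d_data', 'cluster3d_mcst'], ['sparse3d_data'], ['sparse3d_data', 'cluster3d_mcst']]
entries = [[100, 100], [100], [100, 99]]

all_keys = np.unique(np.hstack(keys))
for f, k, n in zip(files, keys, entries):
    same_entries = len(np.unique(n)) == 1                              # all trees agree on the entry count
    has_all_keys = len(np.intersect1d(k, all_keys)) == len(all_keys)   # no TTree missing
    if not (same_entries and has_all_keys):
        print('bad file:', f)   # flags run1.root (missing key) and run2.root (entry mismatch)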
4 changes: 3 additions & 1 deletion mlreco/iotools/datasets.py
@@ -16,7 +16,7 @@ class LArCVDataset(Dataset):
    def __init__(self, data_schema, data_keys, limit_num_files=0, limit_num_samples=0, event_list=None, skip_event_list=None):
        """
        Instantiates the LArCVDataset.
        Parameters
        ----------
        data_dirs : list
@@ -110,6 +110,8 @@ def __init__(self, data_schema, data_keys, limit_num_files=0, limit_num_samples=
        if limit_num_samples > 0 and self._entries > limit_num_samples:
            self._entries = limit_num_samples

        print('Found %d events in file(s)' % len(self._event_list))

        # Flag to identify if Trees are initialized or not
        self._trees_ready=False

1 change: 1 addition & 0 deletions mlreco/iotools/parsers/__init__.py
@@ -106,6 +106,7 @@
from mlreco.iotools.parsers.sparse import (
    parse_sparse2d_scn,
    parse_sparse3d_scn,
    parse_sparse3d_ghost,
    parse_sparse3d,
    parse_sparse3d_scn_scales,
    parse_sparse3d_clean,
22 changes: 22 additions & 0 deletions mlreco/iotools/parsers/sparse.py
@@ -137,6 +137,28 @@ def parse_sparse3d(data):
    return np.concatenate(output, axis=-1)


def parse_sparse3d_ghost(data):
    # Return the voxel coordinates together with a binary ghost mask
    # (ghost voxels carry semantic label 5, one past the particle classes).
    meta = None
    output = []
    np_voxels = None
    for event_tensor3d in data:
        num_point = event_tensor3d.as_vector().size()
        if meta is None:
            meta = event_tensor3d.meta()
            np_voxels = np.empty(shape=(num_point, 3), dtype=np.int32)
            larcv.fill_3d_voxels(event_tensor3d, np_voxels)
        else:
            assert meta == event_tensor3d.meta()
        np_data = np.empty(shape=(num_point, 1), dtype=np.float32)
        larcv.fill_3d_pcloud(event_tensor3d, np_data)
        output.append(np_data)

    return np_voxels, (output[0] == 5).astype(np.float32)
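The new parser thus returns the voxel coordinates plus a binary ghost mask derived from the stored semantic label, assuming (as in the doublet metrics added below) that ghost voxels carry label 5, one past the five particle classes. A minimal numpy sketch of that last conversion, with made-up label values:

import numpy as np

# Toy per-voxel semantic labels: 0-4 are particle classes, 5 marks ghost voxels (assumed convention)
labels = np.array([[0.], [4.], [5.], [2.], [5.]], dtype=np.float32)

ghost_mask = (labels == 5).astype(np.float32)   # same conversion as the return statement above
print(ghost_mask.ravel())                       # [0. 0. 1. 0. 1.]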


def parse_weights(data):
"""
A function to generate weights from larcv::EventSparseTensor3D and larcv::Particle list
4 changes: 2 additions & 2 deletions mlreco/models/layers/gnn/losses/node_kinematics.py
@@ -274,7 +274,7 @@ def forward(self, out, types):
                # loss1 = torch.sum(torch.mean(self.vtx_position_loss(pos_pred, pos_label), dim=1))
                # print(loss1, loss2)

                total_loss += loss1
                total_loss += loss1 + loss2

                vtx_position_loss += float(loss1)
                vtx_score_loss += float(loss2)
@@ -822,4 +822,4 @@ def compute_vertex(self, node_pred_vtx, labels, clusts):
vtx_position_acc,
vtx_score_acc)

return out
return out
3 changes: 2 additions & 1 deletion mlreco/post_processing/metrics/__init__.py
@@ -17,4 +17,5 @@
from .evidential_gnn import evidential_gnn_metrics
from .evidential_gnn import default_gnn_metrics
from .duq_metrics import duq_metrics
from .pid_metrics import pid_metrics
from .pid_metrics import pid_metrics
from .doublet_metrics import doublet_metrics
78 changes: 78 additions & 0 deletions mlreco/post_processing/metrics/doublet_metrics.py
@@ -0,0 +1,78 @@
import numpy as np
import scipy
from scipy.spatial.distance import cdist
from mlreco.post_processing import post_processing


@post_processing('doublet_metrics',
                 ['input_data', 'nhits', 'seg_label_full'],
                 ['segmentation'])
def doublet_metrics(cfg, module_cfg, data_blob, res, logdir, iteration,
                    data_idx=None, input_data=None,
                    segmentation=None, nhits=None, seg_label_full=None, **kwargs):
    import torch
    row_names, row_values = [], []
    data = input_data[data_idx]
    label = seg_label_full[data_idx][:, -1]
    nhits = nhits[data_idx][:, -1]

    num_classes_ghost = segmentation[data_idx].shape[1]
    num_classes_semantic = module_cfg.get('num_classes_semantic', 5)
    num_ghost_points = np.count_nonzero(label == num_classes_semantic)
    num_nonghost_points = np.count_nonzero(label < num_classes_semantic)

    shower_label = module_cfg.get('shower_label', 0)
    edep_col = module_cfg.get('edep_col', -2)
    assert shower_label >= 0 and shower_label < num_classes_semantic

    row_names += ['num_ghost_points', 'num_nonghost_points']
    row_values += [num_ghost_points, num_nonghost_points]

    # Predicted class per point: 0 = non-ghost, 1 = ghost
    ghost_predictions = np.argmax(res['segmentation'][data_idx], axis=1)
    mask = ghost_predictions == 0

    # Fraction of ghost points predicted as ghost points
    ghost2ghost = (ghost_predictions[label == num_classes_semantic] == 1).sum() / float(num_ghost_points)
    # Fraction of true non-ghost points predicted as non-ghost points
    nonghost2nonghost = (ghost_predictions[label < num_classes_semantic] == 0).sum() / float(num_nonghost_points)
    row_names += ["ghost2ghost", "nonghost2nonghost"]
    row_values += [ghost2ghost, nonghost2nonghost]

    for c in range(num_classes_semantic):
        row_names += ['num_true_pix_class_%d' % c]
        row_values += [np.count_nonzero(label == c)]

        row_names += ['num_pred_pix_class_%d_%d' % (c, x) for x in range(num_classes_ghost)]
        row_values += [np.count_nonzero((label == c) & (ghost_predictions == x)) for x in range(num_classes_ghost)]

        row_names += ['num_pred_pix_doublets_class_%d_%d' % (c, x) for x in range(num_classes_ghost)]
        row_values += [np.count_nonzero((label == c) & (ghost_predictions == x) & (nhits == 2)) for x in range(num_classes_ghost)]

        row_names += ['num_pred_pix_triplets_class_%d_%d' % (c, x) for x in range(num_classes_ghost)]
        row_values += [np.count_nonzero((label == c) & (ghost_predictions == x) & (nhits == 3)) for x in range(num_classes_ghost)]

        row_names += ['num_doublets_class_%d' % c, 'num_triplets_class_%d' % c]
        row_values += [np.count_nonzero((label == c) & (nhits == 2)), np.count_nonzero((label == c) & (nhits == 3))]

    row_names += ['num_doublets_ghost', 'num_triplets_ghost']
    row_values += [np.count_nonzero((label == num_classes_semantic) & (nhits == 2)), np.count_nonzero((label == num_classes_semantic) & (nhits == 3))]

    row_names += ['num_doublets_ghost_%d' % x for x in range(num_classes_ghost)]
    row_values += [np.count_nonzero((label == num_classes_semantic) & (nhits == 2) & (ghost_predictions == x)) for x in range(num_classes_ghost)]

    row_names += ['num_triplets_ghost_%d' % x for x in range(num_classes_ghost)]
    row_values += [np.count_nonzero((label == num_classes_semantic) & (nhits == 3) & (ghost_predictions == x)) for x in range(num_classes_ghost)]

    # Record the shower voxel energy sum in the true mask and in the (true & pred) mask
    # to see if we lose a significant amount of energy
    # (might be offset by true ghost predicted as nonghost)
    row_names += ['shower_true_voxel_sum', 'shower_true_pred_voxel_sum']
    row_values += [data[label == shower_label, edep_col].sum(), data[(label == shower_label) & mask, edep_col].sum()]

    row_names += ['shower_true_voxel_sum_doublets', 'shower_true_pred_voxel_sum_doublets']
    row_values += [data[(label == shower_label) & (nhits == 2), edep_col].sum(), data[(label == shower_label) & mask & (nhits == 2), edep_col].sum()]

    row_names += ['shower_true_voxel_sum_triplets', 'shower_true_pred_voxel_sum_triplets']
    row_values += [data[(label == shower_label) & (nhits == 3), edep_col].sum(), data[(label == shower_label) & mask & (nhits == 3), edep_col].sum()]

    return tuple(row_names), tuple(row_values)
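For intuition, a minimal sketch of the ghost2ghost and nonghost2nonghost fractions computed above, evaluated on toy arrays (assuming 5 semantic classes, ghost label 5, and prediction 1 = ghost, as in the code):

import numpy as np

num_classes_semantic = 5                                # ghost points carry label 5
label             = np.array([0, 1, 5, 5, 3, 5])        # toy true labels
ghost_predictions = np.array([0, 1, 1, 0, 0, 1])        # toy predictions: 1 = ghost, 0 = non-ghost

num_ghost_points    = np.count_nonzero(label == num_classes_semantic)
num_nonghost_points = np.count_nonzero(label < num_classes_semantic)

ghost2ghost       = (ghost_predictions[label == num_classes_semantic] == 1).sum() / float(num_ghost_points)
nonghost2nonghost = (ghost_predictions[label <  num_classes_semantic] == 0).sum() / float(num_nonghost_points)
print(ghost2ghost, nonghost2nonghost)                   # 2/3 and 2/3 for these toy arrays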
2 changes: 1 addition & 1 deletion mlreco/trainval.py
@@ -284,7 +284,7 @@ def _forward(self, train_blob, loss_blob, iteration=None):

        if not len(self._gpus):
            train_blob = train_blob[0]
        print(not self._net.device_ids)
        #print(not self._net.device_ids)
        result = self._net(train_blob)

        if not len(self._gpus):
