
Merge pull request #108 from Temigo/me
Small changes
Temigo authored May 18, 2022
2 parents b7b6a2e + cc944e9 commit cf154b0
Showing 8 changed files with 199 additions and 5 deletions.
90 changes: 90 additions & 0 deletions bin/check_valid_dataset.py
@@ -0,0 +1,90 @@
# Script to mark all bad ROOT files before merging them with hadd
# ================================================================
#
# Usage: python3 bin/check_valid_dataset.py bad_files.txt file1.root file2.root ... fileN.root
#
# Output: writes the list of bad files to bad_files.txt
#         (one per line), which can then be used to move or remove
#         these files before running hadd. For example:
#
#   $ for file in $(cat bad_files.txt); do mv "$file" bad_files/; done
#
# What it does:
#   Loops over all TTrees in each ROOT file and checks that they all
#   have the same number of entries, and that each file contains the
#   same set of TTree keys as the rest of the dataset.
#
from ROOT import TCanvas, TPad, TFile, TPaveLabel, TPaveText, TChain
from ROOT import gROOT
import ROOT
import pandas as pd
import numpy as np
import argparse


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Check validity of dataset")
    parser.add_argument("output_file", type=str, help="output text file to write bad file names")
    parser.add_argument("files", type=str, nargs="+", help="files to check")

    args = parser.parse_args()

    output = open(args.output_file, 'w')
    bad_files = []
    global_keys = []   # TTree key names found in each file
    counts = []        # number of distinct entry counts among each file's TTrees

    def mark_bad_file(file):
        output.write(file + '\n')
        bad_files.append(file)

    for idx, file in enumerate(args.files):
        print(file)
        f = TFile(file)
        keys = [key.GetName() for key in f.GetListOfKeys()]
        global_keys.append(keys)

        # If keys is a subset of global_keys or global_keys is shorter
        # if global_keys is None:
        #     global_keys = keys
        # elif len(np.intersect1d(keys, global_keys)) < len(global_keys):
        #     # keys is a subset of global keys
        #     mark_bad_file(file)
        #     continue
        # elif len(np.intersect1d(keys, global_keys)) < len(keys):
        #     # global_keys is a subset of keys
        #     if args.files[idx-1] not in bad_files:
        #         mark_bad_file(args.files[idx-1])
        #     global_keys = keys
        # Note: that assumes we don't get 2 files in a row with bad keys...

        trees = [f.Get(key) for key in keys]

        nentries = [tree.GetEntries() for tree in trees]
        counts.append(len(np.unique(nentries)))

    all_keys = np.unique(np.hstack(global_keys))

    # Test whether two lists of strings contain exactly the same elements
    def is_equal(a, b):
        c = np.intersect1d(a, b)
        return len(c) == len(a) and len(c) == len(b)

    # A file is bad if its TTrees disagree on the entry count,
    # or if it is missing some of the keys seen across the whole dataset.
    for idx, file in enumerate(args.files):
        if counts[idx] != 1 or not is_equal(np.unique(global_keys[idx]), all_keys):
            mark_bad_file(file)

    print('\nFound bad files: ')
    for f in bad_files:
        print(f)

    output.close()
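As a quick way to see what the flagging logic does, here is a minimal sketch run on made-up key lists and entry counts (no ROOT needed); the file names, keys and counts below are purely illustrative:

import numpy as np

# Hypothetical key lists and per-tree entry counts for three files
files   = ['run0.root', 'run1.root', 'run2.root']
keys    = [['sparse3d_data', 'cluster3d_mcst'], ['sparse3d_data'], ['sparse3d_data', 'cluster3d_mcst']]
entries = [[100, 100], [100], [100, 99]]

all_keys = np.unique(np.hstack(keys))
for f, k, n in zip(files, keys, entries):
    same_entries = len(np.unique(n)) == 1                              # all trees agree on the entry count
    has_all_keys = len(np.intersect1d(k, all_keys)) == len(all_keys)   # no TTree missing
    if not (same_entries and has_all_keys):
        print('bad file:', f)   # flags run1.root (missing key) and run2.root (entry mismatch)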
4 changes: 3 additions & 1 deletion mlreco/iotools/datasets.py
@@ -16,7 +16,7 @@ class LArCVDataset(Dataset):
    def __init__(self, data_schema, data_keys, limit_num_files=0, limit_num_samples=0, event_list=None, skip_event_list=None):
        """
        Instantiates the LArCVDataset.
        Parameters
        ----------
        data_dirs : list
@@ -110,6 +110,8 @@ def __init__(self, data_schema, data_keys, limit_num_files=0, limit_num_samples=
        if limit_num_samples > 0 and self._entries > limit_num_samples:
            self._entries = limit_num_samples

        print('Found %d events in file(s)' % len(self._event_list))

        # Flag to identify if Trees are initialized or not
        self._trees_ready=False

1 change: 1 addition & 0 deletions mlreco/iotools/parsers/__init__.py
@@ -106,6 +106,7 @@
from mlreco.iotools.parsers.sparse import (
    parse_sparse2d_scn,
    parse_sparse3d_scn,
    parse_sparse3d_ghost,
    parse_sparse3d,
    parse_sparse3d_scn_scales,
    parse_sparse3d_clean,
22 changes: 22 additions & 0 deletions mlreco/iotools/parsers/sparse.py
@@ -137,6 +137,28 @@ def parse_sparse3d(data):
    return np.concatenate(output, axis=-1)


def parse_sparse3d_ghost(data):
    # Return the voxel coordinates together with a binary ghost mask
    # (ghost voxels carry semantic label 5, one past the particle classes).
    meta = None
    output = []
    np_voxels = None
    for event_tensor3d in data:
        num_point = event_tensor3d.as_vector().size()
        if meta is None:
            meta = event_tensor3d.meta()
            np_voxels = np.empty(shape=(num_point, 3), dtype=np.int32)
            larcv.fill_3d_voxels(event_tensor3d, np_voxels)
        else:
            assert meta == event_tensor3d.meta()
        np_data = np.empty(shape=(num_point, 1), dtype=np.float32)
        larcv.fill_3d_pcloud(event_tensor3d, np_data)
        output.append(np_data)

    return np_voxels, (output[0] == 5).astype(np.float32)
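The new parser thus returns the voxel coordinates plus a binary ghost mask derived from the stored semantic label, assuming (as in the doublet metrics added below) that ghost voxels carry label 5, one past the five particle classes. A minimal numpy sketch of that last conversion, with made-up label values:

import numpy as np

# Toy per-voxel semantic labels: 0-4 are particle classes, 5 marks ghost voxels (assumed convention)
labels = np.array([[0.], [4.], [5.], [2.], [5.]], dtype=np.float32)

ghost_mask = (labels == 5).astype(np.float32)   # same conversion as the return statement above
print(ghost_mask.ravel())                       # [0. 0. 1. 0. 1.]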


def parse_weights(data):
"""
A function to generate weights from larcv::EventSparseTensor3D and larcv::Particle list
4 changes: 2 additions & 2 deletions mlreco/models/layers/gnn/losses/node_kinematics.py
@@ -274,7 +274,7 @@ def forward(self, out, types):
                # loss1 = torch.sum(torch.mean(self.vtx_position_loss(pos_pred, pos_label), dim=1))
                # print(loss1, loss2)

                total_loss += loss1
                total_loss += loss1 + loss2

                vtx_position_loss += float(loss1)
                vtx_score_loss += float(loss2)
@@ -822,4 +822,4 @@ def compute_vertex(self, node_pred_vtx, labels, clusts):
vtx_position_acc,
vtx_score_acc)

return out
return out
3 changes: 2 additions & 1 deletion mlreco/post_processing/metrics/__init__.py
@@ -17,4 +17,5 @@
from .evidential_gnn import evidential_gnn_metrics
from .evidential_gnn import default_gnn_metrics
from .duq_metrics import duq_metrics
from .pid_metrics import pid_metrics
from .pid_metrics import pid_metrics
from .doublet_metrics import doublet_metrics
78 changes: 78 additions & 0 deletions mlreco/post_processing/metrics/doublet_metrics.py
@@ -0,0 +1,78 @@
import numpy as np
import scipy
from scipy.spatial.distance import cdist
from mlreco.post_processing import post_processing


@post_processing('doublet_metrics',
                 ['input_data', 'nhits', 'seg_label_full'],
                 ['segmentation'])
def doublet_metrics(cfg, module_cfg, data_blob, res, logdir, iteration,
                    data_idx=None, input_data=None,
                    segmentation=None, nhits=None, seg_label_full=None, **kwargs):
    import torch
    row_names, row_values = [], []
    data = input_data[data_idx]
    label = seg_label_full[data_idx][:, -1]
    nhits = nhits[data_idx][:, -1]

    num_classes_ghost = segmentation[data_idx].shape[1]
    num_classes_semantic = module_cfg.get('num_classes_semantic', 5)
    num_ghost_points = np.count_nonzero(label == num_classes_semantic)
    num_nonghost_points = np.count_nonzero(label < num_classes_semantic)

    shower_label = module_cfg.get('shower_label', 0)
    edep_col = module_cfg.get('edep_col', -2)
    assert shower_label >= 0 and shower_label < num_classes_semantic

    row_names += ['num_ghost_points', 'num_nonghost_points']
    row_values += [num_ghost_points, num_nonghost_points]

    # Predicted class per point: 0 = non-ghost, 1 = ghost
    ghost_predictions = np.argmax(res['segmentation'][data_idx], axis=1)
    mask = ghost_predictions == 0

    # Fraction of ghost points predicted as ghost points
    ghost2ghost = (ghost_predictions[label == num_classes_semantic] == 1).sum() / float(num_ghost_points)
    # Fraction of true non-ghost points predicted as non-ghost points
    nonghost2nonghost = (ghost_predictions[label < num_classes_semantic] == 0).sum() / float(num_nonghost_points)
    row_names += ["ghost2ghost", "nonghost2nonghost"]
    row_values += [ghost2ghost, nonghost2nonghost]

    for c in range(num_classes_semantic):
        row_names += ['num_true_pix_class_%d' % c]
        row_values += [np.count_nonzero(label == c)]

        row_names += ['num_pred_pix_class_%d_%d' % (c, x) for x in range(num_classes_ghost)]
        row_values += [np.count_nonzero((label == c) & (ghost_predictions == x)) for x in range(num_classes_ghost)]

        row_names += ['num_pred_pix_doublets_class_%d_%d' % (c, x) for x in range(num_classes_ghost)]
        row_values += [np.count_nonzero((label == c) & (ghost_predictions == x) & (nhits == 2)) for x in range(num_classes_ghost)]

        row_names += ['num_pred_pix_triplets_class_%d_%d' % (c, x) for x in range(num_classes_ghost)]
        row_values += [np.count_nonzero((label == c) & (ghost_predictions == x) & (nhits == 3)) for x in range(num_classes_ghost)]

        row_names += ['num_doublets_class_%d' % c, 'num_triplets_class_%d' % c]
        row_values += [np.count_nonzero((label == c) & (nhits == 2)), np.count_nonzero((label == c) & (nhits == 3))]

    row_names += ['num_doublets_ghost', 'num_triplets_ghost']
    row_values += [np.count_nonzero((label == num_classes_semantic) & (nhits == 2)), np.count_nonzero((label == num_classes_semantic) & (nhits == 3))]

    row_names += ['num_doublets_ghost_%d' % x for x in range(num_classes_ghost)]
    row_values += [np.count_nonzero((label == num_classes_semantic) & (nhits == 2) & (ghost_predictions == x)) for x in range(num_classes_ghost)]

    row_names += ['num_triplets_ghost_%d' % x for x in range(num_classes_ghost)]
    row_values += [np.count_nonzero((label == num_classes_semantic) & (nhits == 3) & (ghost_predictions == x)) for x in range(num_classes_ghost)]

    # Record the shower voxel energy sum in the true mask and in the (true & pred) mask
    # to see if we lose a significant amount of energy
    # (might be offset by true ghost predicted as nonghost)
    row_names += ['shower_true_voxel_sum', 'shower_true_pred_voxel_sum']
    row_values += [data[label == shower_label, edep_col].sum(), data[(label == shower_label) & mask, edep_col].sum()]

    row_names += ['shower_true_voxel_sum_doublets', 'shower_true_pred_voxel_sum_doublets']
    row_values += [data[(label == shower_label) & (nhits == 2), edep_col].sum(), data[(label == shower_label) & mask & (nhits == 2), edep_col].sum()]

    row_names += ['shower_true_voxel_sum_triplets', 'shower_true_pred_voxel_sum_triplets']
    row_values += [data[(label == shower_label) & (nhits == 3), edep_col].sum(), data[(label == shower_label) & mask & (nhits == 3), edep_col].sum()]

    return tuple(row_names), tuple(row_values)
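For intuition, a minimal sketch of the ghost2ghost and nonghost2nonghost fractions computed above, evaluated on toy arrays (assuming 5 semantic classes, ghost label 5, and prediction 1 = ghost, as in the code):

import numpy as np

num_classes_semantic = 5                                # ghost points carry label 5
label             = np.array([0, 1, 5, 5, 3, 5])        # toy true labels
ghost_predictions = np.array([0, 1, 1, 0, 0, 1])        # toy predictions: 1 = ghost, 0 = non-ghost

num_ghost_points    = np.count_nonzero(label == num_classes_semantic)
num_nonghost_points = np.count_nonzero(label < num_classes_semantic)

ghost2ghost       = (ghost_predictions[label == num_classes_semantic] == 1).sum() / float(num_ghost_points)
nonghost2nonghost = (ghost_predictions[label <  num_classes_semantic] == 0).sum() / float(num_nonghost_points)
print(ghost2ghost, nonghost2nonghost)                   # 2/3 and 2/3 for these toy arrays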
2 changes: 1 addition & 1 deletion mlreco/trainval.py
@@ -284,7 +284,7 @@ def _forward(self, train_blob, loss_blob, iteration=None):

        if not len(self._gpus):
            train_blob = train_blob[0]
        print(not self._net.device_ids)
        #print(not self._net.device_ids)
        result = self._net(train_blob)

        if not len(self._gpus):
