-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathduplicate_checker.py
80 lines (59 loc) · 2.38 KB
/
duplicate_checker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from argparse import ArgumentParser as ap
from collections import Counter

from metacat.webapi import MetaCatClient
def get_files_and_dupes(query, client=None):
    """Run a MetaCat file query and report outputs sharing the same id.

    A file's "id" is the triplet of underscore-separated fields 3-5 of its
    name (e.g. ``a_b_c_ID1_ID2_ID3_rest`` -> ``ID1_ID2_ID3``).  Two files
    with the same id are duplicate outputs for the same input.

    Parameters
    ----------
    query : str
        MQL query string passed to ``client.query``.
    client : optional
        MetaCat client object with a ``query`` method.  Defaults to the
        module-level ``mc_client`` created in the ``__main__`` block
        (kept for backward compatibility with existing callers).

    Returns
    -------
    tuple
        ``(files, dupes, dupe_full_files)`` where ``files`` is every file
        dict the query returned, ``dupes`` the ids seen more than once
        (first-occurrence order), and ``dupe_full_files`` the full names
        of all files carrying a duplicated id.
    """
    if client is None:
        # Fall back to the global client set up in the __main__ block.
        client = mc_client
    files = list(client.query(query))
    # Extract the id triplet from each file name.
    ids = ['_'.join(f['name'].split('_')[3:6]) for f in files]
    # Counter is O(n); the previous {i: ids.count(i)} comprehension was O(n^2).
    counts = Counter(ids)
    dupes = [i for i, c in counts.items() if c > 1]
    dupe_set = set(dupes)  # O(1) membership for the scan below
    dupe_full_files = [
        f['name'] for f in files
        if '_'.join(f['name'].split('_')[3:6]) in dupe_set
    ]
    return files, dupes, dupe_full_files
if __name__ == '__main__':
    parser = ap()
    # required=True: without workflows the query string cannot be built,
    # so fail with a clean argparse error instead of a TypeError.
    parser.add_argument('--workflows', type=str, nargs='+', required=True,
                        help='Workflow IDs to scan for duplicated outputs')
    parser.add_argument('--list', action='store_true',
                        help='Print every affected file name')
    parser.add_argument('--save', type=str, default=None,
                        help='Optional path to write the affected file names to')
    args = parser.parse_args()

    mc_client = MetaCatClient()

    def file_id(name):
        # The id is the triplet at underscore-separated fields 3-5 of the name.
        return '_'.join(name.split('_')[3:6])

    base_query = ("files where dune.workflow['workflow_id'] in "
                  f"({','.join(args.workflows)})")
    reco_query = base_query + " and core.data_tier='full-reconstructed'"
    ana_query = base_query + " and core.data_tier='root-tuple-virtual'"

    # Reco files.  (get_files_and_dupes runs the query itself; the old
    # extra pre-query here fetched the same file list twice.)
    files, dupes, dupe_full_files = get_files_and_dupes(reco_query)
    print("Total queried reco files:", len(files))
    print(f"{len(dupes)} inputs with duplicated reco output")
    print(f"{len(dupe_full_files)} duplicate reco outputs")
    print('\n---------------')

    # Ana files.
    ana_files, ana_dupes, ana_dupe_full_files = get_files_and_dupes(ana_query)
    print("Total queried ana files:", len(ana_files))
    print(f"{len(ana_dupes)} inputs with duplicated ana output")
    print(f"{len(ana_dupe_full_files)} duplicate ana outputs")

    # Look both ways -- an output (reco or ana) is "plagued" if its id is
    # duplicated in EITHER tier.  Sets give O(1) membership tests.
    bad_ids = set(dupes) | set(ana_dupes)
    all_bad_files = [
        f['name'] for f in ana_files if file_id(f['name']) in bad_ids
    ] + [
        f['name'] for f in files if file_id(f['name']) in bad_ids
    ]
    print('\n---------------')
    print(f'{len(all_bad_files)} plagued output files')

    # Sort by id so it's easier to check output.
    all_bad_files.sort(key=file_id)

    if args.list:
        for d in all_bad_files:
            print(d)
    if args.save is not None:
        with open(args.save, 'w') as f:
            f.writelines([l + '\n' for l in all_bad_files])