From 960795b978a04afd819673930da492342482c8f7 Mon Sep 17 00:00:00 2001
From: Juan Miguel Carceller <22276694+jmcarcell@users.noreply.github.com>
Date: Wed, 20 Nov 2024 09:33:38 +0100
Subject: [PATCH] Add a tool to merge several podio files into a single one (#681)

* Add a tool to merge several podio files into a single one

* Generate a metadata frame if it doesn't exist

* Add configuration for the metadata parameter name

* Hardcode the metadata parameters

---------

Co-authored-by: jmcarcell
---
Note: the script below differs from upstream commit 960795b. It compares
resolved input paths when looking for duplicates, refuses an output file that
is also an input file, and checks that all input files share one ROOT format.
The blob id on the "index" line of the new file is still the upstream one.

 tools/CMakeLists.txt    |   1 +
 tools/podio-merge-files | 114 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100755 tools/podio-merge-files

diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index c5fa5d4d8..cbacb59a2 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -3,6 +3,7 @@ install(PROGRAMS ${CMAKE_CURRENT_LIST_DIR}/podio-vis DESTINATION ${CMAKE_INSTALL
 if(ENABLE_RNTUPLE)
   install(PROGRAMS ${CMAKE_CURRENT_LIST_DIR}/podio-ttree-to-rntuple DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()
+install(PROGRAMS ${CMAKE_CURRENT_LIST_DIR}/podio-merge-files DESTINATION ${CMAKE_INSTALL_BINDIR})
 
 # Add a very basic test of podio-vis
 if(BUILD_TESTING)
diff --git a/tools/podio-merge-files b/tools/podio-merge-files
new file mode 100755
index 000000000..4b313b78b
--- /dev/null
+++ b/tools/podio-merge-files
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+"""podio-merge-files tool to merge any number of podio files into one"""
+
+import argparse
+import os
+import sys
+import podio
+import podio.root_io
+from podio import reading
+
+
+def _parse_args():
+    """Build the command line parser and return the parsed arguments"""
+    parser = argparse.ArgumentParser(
+        description="Merge any number of podio files into one, can merge TTree and RNTuple files"
+    )
+    parser.add_argument("--output-file", help="name of the output file", required=True)
+    parser.add_argument("files", nargs="+", help="which files to merge")
+    parser.add_argument(
+        "--metadata",
+        choices=["none", "all", "first"],
+        default="first",
+        help="metadata to include in the output file, default: "
+        "only the one from the first file, other options: all files, none",
+    )
+    return parser.parse_args()
+
+
+def _canonical(path):
+    """Return a form of path that can be compared against other paths
+
+    Local paths are resolved so that e.g. 'a.root' and './a.root' compare
+    equal, anything that looks like a URL is returned unchanged
+    """
+    return path if "://" in path else os.path.realpath(path)
+
+
+def _check_files(files, output_file):
+    """Make sure the inputs are unique and do not include the output file
+
+    Raises:
+        ValueError: If a file is passed more than once or if the output file
+            is also one of the input files
+    """
+    seen = set()
+    for fname in files:
+        key = _canonical(fname)
+        if key in seen:
+            raise ValueError(f"File {fname} is present more than once in the input list")
+        seen.add(key)
+    # Reading from and writing to the same file at once cannot work
+    if _canonical(output_file) in seen:
+        raise ValueError(f"Output file {output_file} is also one of the input files")
+
+
+def _determine_format(files):
+    """Return the ROOT format that is shared by all the input files
+
+    Only one reader is created for all inputs, so they all need the same format
+
+    Raises:
+        ValueError: If not all files have the same format as the first one
+    """
+    root_format = reading._determine_root_format(files[0])  # pylint: disable=protected-access
+    for fname in files[1:]:
+        other = reading._determine_root_format(fname)  # pylint: disable=protected-access
+        if other != root_format:
+            raise ValueError(f"Input file {fname} does not have the same format as {files[0]}")
+    return root_format
+
+
+def main():
+    """Merge the input files given on the command line into the output file"""
+    args = _parse_args()
+    _check_files(args.files, args.output_file)
+
+    root_format = _determine_format(args.files)
+    if root_format == reading.RootFileFormat.TTREE:
+        reader = podio.root_io.Reader(args.files)
+        writer = podio.root_io.Writer(args.output_file)
+    elif root_format == reading.RootFileFormat.RNTUPLE:
+        reader = podio.root_io.RNTupleReader(args.files)
+        writer = podio.root_io.RNTupleWriter(args.output_file)
+    else:
+        raise ValueError(f"Input file {args.files[0]} is not a TTree or RNTuple file")
+
+    # All frames will be copied as they are except the metadata ones
+    all_categories = list(reader.categories)
+    is_metadata_available = "metadata" in all_categories
+    for category in all_categories:
+        if category == "metadata":
+            continue
+        for frame in reader.get(category):
+            writer.write_frame(frame, category)
+
+    if args.metadata == "none":
+        return 0
+
+    if not is_metadata_available:
+        print("Warning: metadata category 'metadata' not found in the input files, it will be created")
+        metadata_frames = [podio.Frame()]
+    elif args.metadata == "first":
+        metadata_frames = [reader.get("metadata")[0]]
+    else:
+        metadata_frames = reader.get("metadata")
+    for frame in metadata_frames:
+        # Keep track of which files went into the merged output
+        frame.put_parameter("MergedInputFiles", args.files)
+        writer.write_frame(frame, "metadata")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())