Skip to content

Commit

Permalink
Add selective collection reading to SIO
Browse files Browse the repository at this point in the history
  • Loading branch information
tmadlener committed Jan 6, 2025
1 parent a55cce2 commit 37859c7
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 14 deletions.
10 changes: 8 additions & 2 deletions include/podio/SIOFrameData.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@ class SIOFrameData {
/// tableBuffer containing the necessary information for unpacking the
/// collections. The two size parameters denote the uncompressed size of the
/// respective buffers.
SIOFrameData(sio::buffer&& collBuffers, std::size_t dataSize, sio::buffer&& tableBuffer, std::size_t tableSize) :
SIOFrameData(sio::buffer&& collBuffers, std::size_t dataSize, sio::buffer&& tableBuffer, std::size_t tableSize,
std::vector<std::string> limitColls = {}) :
m_recBuffer(std::move(collBuffers)),
m_tableBuffer(std::move(tableBuffer)),
m_dataSize(dataSize),
m_tableSize(tableSize) {
m_tableSize(tableSize),
m_limitColls(std::move(limitColls)) {
}

std::optional<podio::CollectionReadBuffers> getCollectionBuffers(const std::string& name);
Expand Down Expand Up @@ -79,6 +81,10 @@ class SIOFrameData {
std::vector<short> m_subsetCollectionBits{};

podio::GenericParameters m_parameters{};

/// The collections that should be made available for a Frame constructed from
/// this (if non-empty)
std::vector<std::string> m_limitColls{};
};
} // namespace podio

Expand Down
22 changes: 20 additions & 2 deletions include/podio/SIOReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,21 +37,39 @@ class SIOReader {

/// Read the next data entry for a given category.
///
/// @note Given how the SIO files are currently laid out it is in fact not
/// possible to only read a subset of a Frame. Rather the subset of
/// collections to read will be an artificial limit on the returned
/// SIOFrameData. Limiting the collections to read will not improve I/O
/// performance.
///
/// @param name The category name for which to read the next entry
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns FrameData from which a podio::Frame can be constructed if the
/// category exists and if there are still entries left to read.
/// Otherwise a nullptr
std::unique_ptr<podio::SIOFrameData> readNextEntry(const std::string& name);
std::unique_ptr<podio::SIOFrameData> readNextEntry(const std::string& name,
const std::vector<std::string>& collsToRead = {});

/// Read the desired data entry for a given category.
///
/// @note Given how the SIO files are currently laid out it is in fact not
/// possible to only read a subset of a Frame. Rather the subset of
/// collections to read will be an artificial limit on the returned
/// SIOFrameData. Limiting the collections to read will not improve I/O
/// performance.
///
/// @param name The category name for which to read the desired entry
/// @param entry The entry number to read
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns FrameData from which a podio::Frame can be constructed if the
/// category and the desired entry exist. Otherwise a nullptr
std::unique_ptr<podio::SIOFrameData> readEntry(const std::string& name, const unsigned entry);
std::unique_ptr<podio::SIOFrameData> readEntry(const std::string& name, const unsigned entry,
const std::vector<std::string>& collsToRead = {});

/// Get the number of entries for the given name
///
Expand Down
27 changes: 22 additions & 5 deletions src/SIOFrameData.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ std::optional<podio::CollectionReadBuffers> SIOFrameData::getCollectionBuffers(c
const auto nameIt = std::find(std::begin(names), std::end(names), name);
// collection indices start at 1!
const auto index = std::distance(std::begin(names), nameIt) + 1;
// This collection is not available (artificially!)
if (m_availableBlocks[index] == 0) {
return std::nullopt;
}

// Mark this block as consumed
m_availableBlocks[index] = 0;
Expand All @@ -38,11 +42,8 @@ std::vector<std::string> SIOFrameData::getAvailableCollections() {
std::vector<std::string> collections;
for (size_t i = 1; i < m_blocks.size(); ++i) {
if (m_availableBlocks[i]) {
// We have to get the collID of this collection in the idTable as there is
// no guarantee that it coincides with the index in the blocks.
// Additionally, collection indices start at 1
const auto collID = m_idTable.ids()[i - 1];
collections.push_back(m_idTable.name(collID).value());
const auto name = m_idTable.names()[i - 1];
collections.push_back(name);
}
}

Expand All @@ -67,6 +68,22 @@ void SIOFrameData::unpackBuffers() {
sio::buffer uncBuffer{m_dataSize};
compressor.uncompress(m_recBuffer.span(), uncBuffer);
sio::api::read_blocks(uncBuffer.span(), m_blocks);

if (m_limitColls.empty()) {
return;
}

// In order to save on memory and to not litter the rest of the implementation
// with similar checks, we immediately throw away all collections that should
// not become available
for (size_t i = 1; i < m_blocks.size(); ++i) {
const auto name = m_idTable.names()[i - 1];
if (std::ranges::find(m_limitColls, name) == m_limitColls.end()) {
auto buffers = dynamic_cast<SIOBlock*>(m_blocks[i].get())->getBuffers();
buffers.deleteBuffers(buffers);
m_availableBlocks[i] = 0;
}
}
}

void SIOFrameData::createBlocks() {
Expand Down
10 changes: 6 additions & 4 deletions src/SIOReader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ void SIOReader::openFile(const std::string& filename) {
readEDMDefinitions(); // Potentially could do this lazily
}

std::unique_ptr<SIOFrameData> SIOReader::readNextEntry(const std::string& name) {
std::unique_ptr<SIOFrameData> SIOReader::readNextEntry(const std::string& name,
const std::vector<std::string>& collsToRead) {
// Skip to where the next record of this name starts in the file, based on
// how many times we have already read this name
//
Expand All @@ -44,14 +45,15 @@ std::unique_ptr<SIOFrameData> SIOReader::readNextEntry(const std::string& name)
m_nameCtr[name]++;

return std::make_unique<SIOFrameData>(std::move(dataBuffer), dataInfo._uncompressed_length, std::move(tableBuffer),
tableInfo._uncompressed_length);
tableInfo._uncompressed_length, collsToRead);
}

std::unique_ptr<SIOFrameData> SIOReader::readEntry(const std::string& name, const unsigned entry) {
std::unique_ptr<SIOFrameData> SIOReader::readEntry(const std::string& name, const unsigned entry,
const std::vector<std::string>& collsToRead) {
// NOTE: Will create or overwrite the entry counter
// All checks are done in the following function
m_nameCtr[name] = entry;
return readNextEntry(name);
return readNextEntry(name, collsToRead);
}

std::vector<std::string_view> SIOReader::getAvailableCategories() const {
Expand Down
2 changes: 1 addition & 1 deletion tests/sio_io/read_frame_sio.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ int main(int argc, char* argv[]) {
}

return read_frames<podio::SIOReader>(inputFile, assertBuildVersion) +
test_frame_aux_info<podio::SIOReader>(inputFile);
test_frame_aux_info<podio::SIOReader>(inputFile) + test_read_frame_limited<podio::SIOReader>(inputFile);
}

0 comments on commit 37859c7

Please sign in to comment.