Skip to content

Commit

Permalink
#2281: Typeless Rabensifner and small fixes to StateHolder
Browse files Browse the repository at this point in the history
  • Loading branch information
JacobDomagala committed Sep 12, 2024
1 parent 9cba850 commit 3ef0ab4
Show file tree
Hide file tree
Showing 10 changed files with 528 additions and 543 deletions.
6 changes: 2 additions & 4 deletions src/vt/collective/reduce/allreduce/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,8 @@ struct DataHelper {
template <template <typename Arg> class Op, typename... Args>
static void reduce(
std::vector<Scalar>& dest, Args &&... data) {
auto vector_val = DataHan::toVec(std::forward<Args>(data)...);
for (uint32_t i = 0; i < vector_val.size(); i++) {
Op<Scalar>()(dest[i], vector_val[i]);
}
auto& vector_val = DataHan::toVec(std::forward<Args>(data)...);
Op<std::vector<Scalar>>()(dest, vector_val);
}

static bool empty(const std::vector<Scalar>& payload) {
Expand Down
179 changes: 179 additions & 0 deletions src/vt/collective/reduce/allreduce/rabenseifner.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/*
//@HEADER
// *****************************************************************************
//
// rabenseifner.impl.h
// DARMA/vt => Virtual Transport
//
// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC
// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S.
// Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact [email protected]
//
// *****************************************************************************
//@HEADER
*/

#include "vt/collective/reduce/allreduce/rabenseifner.h"
#include "vt/configs/error/config_assert.h"

namespace vt::collective::reduce::allreduce {

Rabenseifner::Rabenseifner(
detail::StrongVrtProxy proxy, detail::StrongGroup group, size_t num_elems)
: collection_proxy_(proxy.get()),
local_num_elems_(num_elems),
nodes_(theGroup()->GetGroupNodes(group.get())),
num_nodes_(nodes_.size()),
this_node_(theContext()->getNode()),
num_steps_(static_cast<int32_t>(log2(num_nodes_))),
nprocs_pof2_(1 << num_steps_),
nprocs_rem_(num_nodes_ - nprocs_pof2_) {

auto const is_default_group = group.get() == default_group;
if (not is_default_group) {
auto it = std::find(nodes_.begin(), nodes_.end(), theContext()->getNode());
vtAssert(it != nodes_.end(), "This node was not found in group nodes!");

this_node_ = it - nodes_.begin();
}

is_even_ = this_node_ % 2 == 0;
is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_);
if (is_part_of_adjustment_group_) {
if (is_even_) {
vrt_node_ = this_node_ / 2;
} else {
vrt_node_ = -1;
}
} else {
vrt_node_ = this_node_ - nprocs_rem_;
}

vt_debug_print(
terse, allreduce,
"Rabenseifner (this={}): proxy={:x} proxy_={} local_num_elems={}\n",
print_ptr(this), proxy.get(), proxy_.getProxy(), local_num_elems_);
}

Rabenseifner::Rabenseifner(
detail::StrongGroup group)
: group_(group.get()),
local_num_elems_(1),
nodes_(theGroup()->GetGroupNodes(group.get())),
num_nodes_(nodes_.size()),
this_node_(theContext()->getNode()),
num_steps_(static_cast<int32_t>(log2(num_nodes_))),
nprocs_pof2_(1 << num_steps_),
nprocs_rem_(num_nodes_ - nprocs_pof2_) {
std::string nodes_info;
for (auto& node : nodes_) {
nodes_info += fmt::format("{} ", node);
}
auto const is_default_group = group.get() == default_group;
auto const is_part_of_allreduce =
(not is_default_group and theGroup()->inGroup(group.get())) or
is_default_group;

vt_debug_print(
terse, allreduce,
"Rabenseifner: is_default_group={} is_part_of_allreduce={} num_nodes_={} "
"Nodes:[{}]\n",
is_default_group, is_part_of_allreduce, num_nodes_, nodes_info);

if (not is_default_group and theGroup()->inGroup(group.get())) {
auto it = std::find(nodes_.begin(), nodes_.end(), theContext()->getNode());
vtAssert(it != nodes_.end(), "This node was not found in group nodes!");

// index in group list
this_node_ = it - nodes_.begin();
}

// We collectively create this Reducer, so it's possible that not all Nodes are part of it
if (is_part_of_allreduce) {
is_even_ = this_node_ % 2 == 0;
is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_);
if (is_part_of_adjustment_group_) {
if (is_even_) {
vrt_node_ = this_node_ / 2;
} else {
vrt_node_ = -1;
}
} else {
vrt_node_ = this_node_ - nprocs_rem_;
}
}
}

Rabenseifner::Rabenseifner(detail::StrongObjGroup objgroup)
: objgroup_proxy_(objgroup.get()),
local_num_elems_(1),
nodes_(theGroup()->GetGroupNodes(default_group)),
num_nodes_(nodes_.size()),
this_node_(theContext()->getNode()),
num_steps_(static_cast<int32_t>(log2(num_nodes_))),
nprocs_pof2_(1 << num_steps_),
nprocs_rem_(num_nodes_ - nprocs_pof2_) {
std::string nodes_info;
for (auto& node : nodes_) {
nodes_info += fmt::format("{} ", node);
}

vt_debug_print(
terse, allreduce,
"Rabenseifner: is_default_group={} is_part_of_allreduce={} num_nodes_={} "
"Nodes:[{}]\n",
true, true, num_nodes_, nodes_info);

// We collectively create this Reducer, so it's possible that not all Nodes are part of it
is_even_ = this_node_ % 2 == 0;
is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_);
if (is_part_of_adjustment_group_) {
if (is_even_) {
vrt_node_ = this_node_ / 2;
} else {
vrt_node_ = -1;
}
} else {
vrt_node_ = this_node_ - nprocs_rem_;
}
}

Rabenseifner::~Rabenseifner() {
if (collection_proxy_ != u64empty) {
StateHolder::clearAll(detail::StrongVrtProxy{collection_proxy_});
} else if (objgroup_proxy_ != u64empty) {
StateHolder::clearAll(detail::StrongObjGroup{objgroup_proxy_});
} else {
StateHolder::clearAll(detail::StrongGroup{group_});
}
}

} // namespace vt::collective::reduce::allreduce
33 changes: 12 additions & 21 deletions src/vt/collective/reduce/allreduce/rabenseifner.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,27 +76,17 @@ struct ObjgroupAllreduceT {};
* \tparam Op Reduction operation (e.g., sum, max, min).
* \tparam finalHandler Callback handler for the final result.
*/
template <template <typename Arg> class Op>
struct Rabenseifner {
// using Data = DataT;
// using DataType = DataHandler<DataT>;
// using Scalar = typename DataType::Scalar;
// using ReduceOp = Op<Scalar>;
// using DataHelperT = DataHelper<Scalar, DataT>;
// using StateT = State<Scalar, DataT>;

// using Trait = ObjFuncTraits<decltype(f)>;
// using CallbackType =
// typename Trait::template WrapType<pipe::PipeManagerTL::CallbackRetType>;

struct Rabenseifner {
Rabenseifner(detail::StrongVrtProxy proxy, detail::StrongGroup group, size_t num_elems);
Rabenseifner(detail::StrongGroup group);
Rabenseifner(detail::StrongObjGroup objgroup);
~Rabenseifner();

template <typename DataT, typename CallbackType>
void setFinalHandler(const CallbackType& fin, size_t id);

template <typename DataT, typename... Args>
template <typename DataT, template <typename Arg> class Op, typename... Args>
void localReduce(size_t id, Args&&... args);
/**
* \brief Initialize the allreduce algorithm.
Expand All @@ -123,7 +113,7 @@ struct Rabenseifner {
*
* This function starts the allreduce operation, adjusting for non-power-of-two process counts if necessary.
*/
template <typename DataT>
template <typename DataT, template <typename Arg> class Op>
void allreduce(size_t id);

/**
Expand All @@ -132,7 +122,7 @@ struct Rabenseifner {
* This function performs additional steps to handle non-power-of-two process counts, ensuring that the
* main scatter-reduce and gather-allgather phases can proceed with a power-of-two number of processes.
*/
template <typename DataT>
template <typename DataT, template <typename Arg> class Op>
void adjustForPowerOfTwo(size_t id);

/**
Expand All @@ -142,7 +132,7 @@ struct Rabenseifner {
*
* \param msg Message containing the data from the partner process.
*/
template <typename DataT, typename Scalar = typename DataHandler<DataT>::Scalar>
template <typename DataT, typename Scalar, template <typename Arg> class Op>
void adjustForPowerOfTwoRightHalf(RabenseifnerMsg<Scalar, DataT>* msg);

/**
Expand All @@ -152,7 +142,7 @@ struct Rabenseifner {
*
* \param msg Message containing the data from the partner process.
*/
template <typename DataT, typename Scalar = typename DataHandler<DataT>::Scalar>
template <typename DataT, typename Scalar, template <typename Arg> class Op>
void adjustForPowerOfTwoLeftHalf(RabenseifnerMsg<Scalar, DataT>* msg);

/**
Expand All @@ -162,7 +152,7 @@ struct Rabenseifner {
*
* \param msg Message containing the data from the partner process.
*/
template <typename DataT, typename Scalar = typename DataHandler<DataT>::Scalar>
template <typename DataT, typename Scalar, template <typename Arg> class Op>
void adjustForPowerOfTwoFinalPart(RabenseifnerMsg<Scalar, DataT>* msg);

/**
Expand Down Expand Up @@ -194,15 +184,15 @@ struct Rabenseifner {
*
* \param step The current step in the scatter phase.
*/
template <typename DataT>
template <typename DataT, template <typename Arg> class Op>
void scatterTryReduce(size_t id, int32_t step);

/**
* \brief Perform the scatter-reduce iteration.
*
* This function sends data to the appropriate partner process and proceeds to the next step in the scatter phase.
*/
template <typename DataT>
template <typename DataT, template <typename Arg> class Op>
void scatterReduceIter(size_t id);

/**
Expand All @@ -212,7 +202,7 @@ struct Rabenseifner {
*
* \param msg Message containing the data from the partner process.
*/
template <typename DataT, typename Scalar>
template <typename DataT, typename Scalar, template <typename Arg> class Op>
void scatterReduceIterHandler(RabenseifnerMsg<Scalar, DataT>* msg);

/**
Expand Down Expand Up @@ -295,6 +285,7 @@ struct Rabenseifner {

VirtualProxyType collection_proxy_ = u64empty;
ObjGroupProxyType objgroup_proxy_ = u64empty;
GroupType group_ = u64empty;

size_t local_num_elems_ = {};

Expand Down
Loading

0 comments on commit 3ef0ab4

Please sign in to comment.