#2240: Add Rabenseifner and Recursive doubling allreduce algorithms for ObjGroup #2272

Closed

29 commits
7def560
#2240: Initial work for new allreduce
JacobDomagala Mar 24, 2024
01b6afb
#2240: Semi working Rabenseifner
JacobDomagala Mar 27, 2024
5289362
#2240: Working Rabenseifner (non-commutative ops)
JacobDomagala Apr 4, 2024
d52afeb
#2240: Fix non power of 2 for new allreduce
JacobDomagala Apr 7, 2024
5372da5
#2240: Initial work for adding recursive doubling allreduce algorithm
JacobDomagala Apr 10, 2024
90a20e0
#2240: Make sure the order of reduce operations is correct
JacobDomagala Apr 11, 2024
8bf1cc9
#2240: Working Recursive doubling
JacobDomagala Apr 15, 2024
bb1ca10
#2240: Code cleanup and make Rabenseifner work with any Op type
JacobDomagala Apr 16, 2024
166f231
#2240: Improve accuracy of timing allreduce algorithms in allreduce.cc
JacobDomagala Apr 26, 2024
fa16fa1
#2240: Add unit tests for new allreduce and cleanup code
JacobDomagala May 21, 2024
a0fdad8
#2240: DataHandler for Rabenseifner allreduce that provides common AP…
JacobDomagala May 28, 2024
f9a60fa
#2240: Fix warnings
JacobDomagala May 28, 2024
63b39f5
#2240: Update ObjGroup test to use custom DataHandler for Rabenseifne…
JacobDomagala May 30, 2024
a07f6c8
#2240: Add unit test for Rabenseifner with Kokkos::View as DataType a…
JacobDomagala May 31, 2024
316bfb8
#2240: Move function definitions to impl.h file for Rabenseifner
JacobDomagala Jun 3, 2024
b8cd612
#2240: Add allreduce print category and use it in rabenseifner instea…
JacobDomagala Jun 4, 2024
5f40e4b
#2240: Provide documentation for RecursiveDoubling algorithm
JacobDomagala Jun 4, 2024
b200ecd
#2240: Use vt_debug_print for RecursiveDoubling allreduce
JacobDomagala Jun 4, 2024
eb1bc40
#2240: Update allreduce perf tests to use array of payload sizes
JacobDomagala Jun 5, 2024
977e9e3
#2240: Fix runtime failure in allreduce perf test
JacobDomagala Jun 7, 2024
1456dd5
#2240: Working allreduce perf test with Kokkos
JacobDomagala Jun 16, 2024
5803848
#2240: Working RecursiveDoubling with multiple allreduce in flight
JacobDomagala Jun 17, 2024
2015e78
#2240: Update Rabenseifner to use ID for each allreduce and update tests
JacobDomagala Jun 18, 2024
87ad4cf
#2240: Fix failing unit and performance tests for multiple allreduce …
JacobDomagala Jun 25, 2024
ab0357b
#2240: Fix compile issues on some compilers and runtime issue with pa…
JacobDomagala Jul 2, 2024
be3ee2c
#2240: Update logs
JacobDomagala Jul 6, 2024
28139a7
#2240: Fix issues with handlers being executed and payload not being …
JacobDomagala Jul 16, 2024
c5232dc
#2240: Add helpers and use Kokkos::View for internals of Rabenseifner…
JacobDomagala Jul 17, 2024
57b8cab
#2240: Store Reducers by tuple(ProxyType, DataType, OperandType)
JacobDomagala Jul 18, 2024
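
For context on what these commits implement: in a recursive-doubling allreduce, every rank exchanges its partial result at step k with the partner whose rank differs in bit k, so after log2(p) exchanges each of the p ranks holds the fully reduced value. The snippet below is a minimal, self-contained sketch of that pattern as a local simulation; it is not vt's ObjGroup-based implementation, and the rank count, input values, and sum operator are illustrative assumptions.

```cpp
// Illustrative sketch only (not vt's code): simulate the recursive-doubling
// allreduce pattern for a power-of-two number of ranks. At step k, rank r
// combines its partial result with that of partner r XOR 2^k; after log2(p)
// steps every rank holds the reduction over all contributions.
#include <cstdio>
#include <vector>

int main() {
  int const num_ranks = 8;            // assumed power of two
  std::vector<int> value(num_ranks);
  for (int r = 0; r < num_ranks; ++r) {
    value[r] = r + 1;                 // rank r contributes r + 1
  }

  for (int mask = 1; mask < num_ranks; mask <<= 1) {
    std::vector<int> next = value;
    for (int r = 0; r < num_ranks; ++r) {
      int const partner = r ^ mask;          // exchange partner at this step
      next[r] = value[r] + value[partner];   // reduce (here: sum) both parts
    }
    value = next;
  }

  for (int r = 0; r < num_ranks; ++r) {
    std::printf("rank %d holds %d\n", r, value[r]);  // every rank prints 36
  }
  return 0;
}
```

Rabenseifner's algorithm (the other variant added here) trades the full-vector exchanges above for a reduce-scatter followed by an allgather, which moves less data for large payloads; a sketch of its index bookkeeping follows the diff below.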
#2240: Update logs
JacobDomagala committed Jul 18, 2024
commit be3ee2c26702ffefa80bb937c279d89790c8c208
22 changes: 11 additions & 11 deletions src/vt/collective/reduce/allreduce/rabenseifner.impl.h
@@ -193,7 +193,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::adjustForPowerOfTwo(size_t id)
auto const partner = is_even_ ? this_node_ + 1 : this_node_ - 1;

vt_debug_print(
terse, allreduce, "Rabenseifner (Send Part1): To Node {} ID = {}\n", partner, id
terse, allreduce, "Rabenseifner::adjustForPowerOfTwo: To Node {} ID = {}\n", partner, id
);

if (is_even_) {
@@ -231,7 +231,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::adjustForPowerOfTwoRightHalf(
}

vt_debug_print(
terse, allreduce, "Rabenseifner (Recv Part1): From Node {} ID = {}\n",
terse, allreduce, "Rabenseifner::adjustForPowerOfTwoRightHalf: From Node {} ID = {}\n",
theContext()->getFromNodeCurrentTask(), msg->id_
);

@@ -260,7 +260,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::adjustForPowerOfTwoLeftHalf(
}

vt_debug_print(
terse, allreduce, "Rabenseifner (Recv Part1): From Node {} ID = {}\n",
terse, allreduce, "Rabenseifner::adjustForPowerOfTwoLeftHalf: From Node {} ID = {}\n",
theContext()->getFromNodeCurrentTask(), msg->id_
);

@@ -276,7 +276,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::adjustForPowerOfTwoFinalPart(
AllreduceRbnRawMsg<Scalar>* msg) {

vt_debug_print(
terse, allreduce, "Rabenseifner (Recv Part2): From Node {} ID = {}\n",
terse, allreduce, "Rabenseifner::adjustForPowerOfTwoFinalPart: From Node {} ID = {}\n",
theContext()->getFromNodeCurrentTask(), msg->id_
);

@@ -356,7 +356,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::scatterReduceIter(size_t id) {

vt_debug_print(
terse, allreduce,
"Rabenseifner Part2 (Send step {}): To Node {} starting with idx = {} and "
"Rabenseifner Scatter (Send step {}): To Node {} starting with idx = {} and "
"count "
"{} ID = {}\n",
state.scatter_step_, dest, state.s_index_[state.scatter_step_],
@@ -408,7 +408,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::scatterReduceIterHandler(

vt_debug_print(
terse, allreduce,
"Rabenseifner Part2 (Recv step {}): scatter_mask_= {} nprocs_pof2_ = {}: "
"Rabenseifner Scatter (Recv step {}): scatter_mask_= {} nprocs_pof2_ = {}: "
"idx = {} from {} ID = {}\n",
msg->step_, state.scatter_mask_, nprocs_pof2_, state.r_index_[msg->step_],
theContext()->getFromNodeCurrentTask(), msg->id_
@@ -486,7 +486,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::gatherIter(size_t id) {

vt_debug_print(
terse, allreduce,
"Rabenseifner Part3 (step {}): Sending to Node {} starting with idx = {} and "
"Rabenseifner Gather (step {}): Sending to Node {} starting with idx = {} and "
"count "
"{} ID = {}\n",
state.gather_step_, dest, state.r_index_[state.gather_step_],
@@ -516,7 +516,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::gatherIterHandler(
AllreduceRbnRawMsg<Scalar>* msg) {
auto& state = states_.at(msg->id_);
vt_debug_print(
terse, allreduce, "Rabenseifner Part3 (step {}): Received idx = {} from {} ID = {}\n",
terse, allreduce, "Rabenseifner Gather (step {}): Received idx = {} from {} ID = {}\n",
msg->step_, state.s_index_[msg->step_],
theContext()->getFromNodeCurrentTask(), msg->id_
);
@@ -553,7 +553,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::finalPart(size_t id) {

vt_debug_print(
terse, allreduce,
"Rabenseifner Part4: Executing final handler with size {} ID = {}\n", state.val_.size(), id
"Rabenseifner::finalPart(): Executing final handler with size {} ID = {}\n", state.val_.size(), id
);

parent_proxy_[this_node_].template invoke<finalHandler>(
@@ -584,7 +584,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::sendToExcludedNodes(size_t id)
auto& state = states_.at(id);
if (is_part_of_adjustment_group_ and is_even_) {
vt_debug_print(
terse, allreduce, "Rabenseifner Part4: Sending to Node {} ID = {}\n",
terse, allreduce, "Rabenseifner::sendToExcludedNodes(): Sending to Node {} ID = {}\n",
this_node_ + 1, id
);
proxy_[this_node_ + 1]
@@ -600,7 +600,7 @@ void Rabenseifner<DataT, Op, ObjT, finalHandler>::sendToExcludedNodesHandler(
auto& state = states_.at(msg->id_);
vt_debug_print(
terse, allreduce,
"Rabenseifner Part4: Received allreduce result with size {} ID = {}\n", msg->size_, msg->id_
"Rabenseifner::sendToExcludedNodesHandler(): Received allreduce result with size {} ID = {}\n", msg->size_, msg->id_
);

parent_proxy_[this_node_].template invoke<finalHandler>(
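
The renamed debug strings above track the phases of Rabenseifner's allreduce: the power-of-two adjustment (folding the extra ranks into a power-of-two group), the scatter-reduce, the gather (allgather) of the reduced pieces, and the final delivery of the result to the excluded ranks. The sketch below illustrates the kind of per-step partner/range bookkeeping the scatter and gather log lines refer to; it is a generic formulation assuming a power-of-two rank count and a contiguous buffer, not the code in rabenseifner.impl.h, and vt's actual step ordering and s_index_/r_index_ handling may differ.

```cpp
// Generic sketch (not vt's implementation) of Rabenseifner-style scatter-reduce
// bookkeeping: each step pairs rank r with r XOR mask, the currently owned
// range is halved, and each side keeps (and reduces) one half while sending
// the other half to its partner. After log2(p) steps rank r owns one fully
// reduced block; the gather phase retraces the steps in reverse, doubling the
// owned range until every rank holds the whole result.
#include <cstdio>

int main() {
  int const nprocs_pof2 = 8;    // assumed power-of-two number of ranks
  int const rank = 3;           // example rank
  int const total_count = 64;   // elements being reduced

  int count = total_count;
  int offset = 0;
  int step = 0;
  std::printf("scatter-reduce ranges for rank %d:\n", rank);
  for (int mask = nprocs_pof2 >> 1; mask > 0; mask >>= 1, ++step) {
    int const partner = rank ^ mask;
    count /= 2;                 // owned range halves every step
    if (rank & mask) {          // keep the upper half if this bit is set,
      offset += count;          // otherwise keep the lower half
    }
    std::printf(
      "  step %d: exchange with rank %d, keep [%d, %d)\n",
      step, partner, offset, offset + count);
  }
  // With this ordering rank 3 ends up owning block [24, 32), i.e. one fully
  // reduced slice of size total_count / nprocs_pof2.
  return 0;
}
```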