diff --git a/CMakeLists.txt b/CMakeLists.txt index fb792da9f5..758b49302b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,8 +59,6 @@ option(ENABLE_SAMRAI_TESTS "Enable SAMRAI Test Programs" On) option(ENABLE_PERF_TESTS "Enable Performance Tests." Off) set(NUM_PERF_PROCS 8 CACHE INT "Number of processors for performance tests.") option(ENABLE_CHECK_ASSERTIONS "Enable assertion checking." On) -option(ENABLE_CHECK_DEV_ASSERTIONS "Enable SAMRAI developer assertion checking." Off) -option(ENABLE_CHECK_DIM_ASSERTIONS "Enable assertion checking for dimensions." Off) option(ENABLE_BOX_COUNTING "Turns on box telemetry." Off) option(ENABLE_DEPRECATED "Build with deprecated features." On) option(ENABLE_TIMERS "Enable SAMRAI timers." On) @@ -72,6 +70,7 @@ set(CUDA_ARCH "sm_70" CACHE STRING "Compute architecture to pass to CUDA builds" set(CMAKE_CUDA_FLAGS "" CACHE STRING "") set(CMAKE_INSTALL_LIBDIR lib) #set(CMAKE_INSTALL_RPATH_USE_LINK_PATH Off CACHE Bool "Rpath uses Link path") +set(SAMRAI_RAJA_WORKGROUP_THREADS 512 CACHE INT "Number of workgroup threads") include(GNUInstallDirs) diff --git a/config/SAMRAI_config.h.cmake.in b/config/SAMRAI_config.h.cmake.in index 013a8e5609..ad9a805b93 100644 --- a/config/SAMRAI_config.h.cmake.in +++ b/config/SAMRAI_config.h.cmake.in @@ -340,6 +340,8 @@ /* Maximum dimension allowed */ #define SAMRAI_MAXIMUM_DIMENSION @SAMRAI_MAXIMUM_DIMENSION@ +#define SAMRAI_RAJA_WORKGROUP_THREADS @SAMRAI_RAJA_WORKGROUP_THREADS@ + /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS @@ -358,7 +360,9 @@ /* Configure for compiling on BGL family of machines */ #undef __BGL_FAMILY__ - +#ifdef HAVE_RAJA +#define SAMRAI_HAVE_KERNEL_FUSER +#endif namespace SAMRAI { static const unsigned short MAX_DIM_VAL = SAMRAI_MAXIMUM_DIMENSION; diff --git a/source/SAMRAI/hier/CMakeLists.txt b/source/SAMRAI/hier/CMakeLists.txt index 8cdff48343..1342c6251e 100644 --- a/source/SAMRAI/hier/CMakeLists.txt +++ b/source/SAMRAI/hier/CMakeLists.txt @@ -140,6 +140,10 @@ if (ENABLE_MPI) set (hier_depends ${hier_depends} mpi) endif () +if (ENABLE_CUDA) + set (hier_depends ${hier_depends} cuda) +endif () + blt_add_library( NAME SAMRAI_hier SOURCES ${hier_sources} diff --git a/source/SAMRAI/hier/ForAll.h b/source/SAMRAI/hier/ForAll.h index 533702ba31..656b4b2c14 100644 --- a/source/SAMRAI/hier/ForAll.h +++ b/source/SAMRAI/hier/ForAll.h @@ -19,6 +19,7 @@ #include "SAMRAI/hier/Box.h" #include "SAMRAI/hier/Index.h" #include "SAMRAI/tbox/ExecutionPolicy.h" +#include "SAMRAI/tbox/KernelFuser.h" #include #include @@ -145,8 +146,37 @@ struct for_all<1> { RAJA::make_tuple(make_range(ifirst, ilast, 0)), body); } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + if (fuser == nullptr) { + RAJA::kernel::Policy1d>( + RAJA::make_tuple(make_range(ifirst, ilast, 0)), + body); + } else { + fuser->enqueue(ifirst(0), ilast(0), body); + } + } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + if (fuser == nullptr) { + RAJA::kernel( + RAJA::make_tuple(make_range(ifirst, ilast, 0)), + body); + } else { + fuser->enqueue(ifirst(0), ilast(0), body); + } + } }; + +// 2D and 3D don't use the fuser for anything pending support for +// multidimensional loops in KernelFuser. 
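+// A minimal usage sketch of the fused 1D path (illustrative comment only;
+// the names "fuser", "n", "dst", and "src" are hypothetical caller-side
+// names). Passing a non-null KernelFuser enqueues the loop body on the
+// fuser's RAJA WorkPool instead of launching it immediately; the caller
+// later runs everything staged so far as one fused kernel:
+//
+//   tbox::KernelFuser* fuser = ...;  // e.g. from tbox::StagedKernelFusers
+//   hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) {
+//      dst[i] = src[i];
+//   });
+//   fuser->launch();   // instantiate the WorkGroup and run all loops
+//   fuser->cleanup();  // clear WorkPool/WorkGroup/WorkSite for reuse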
template <> struct for_all<2> { template { make_range(ifirst, ilast, 1)), body); } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + NULL_USE(fuser); + RAJA::kernel::Policy2d>( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1)), + body); + } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + NULL_USE(fuser); + RAJA::kernel( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1)), + body); + } }; template <> @@ -193,6 +245,30 @@ struct for_all<3> { make_range(ifirst, ilast, 2)), body); } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + NULL_USE(fuser); + RAJA::kernel::Policy3d>( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1), + make_range(ifirst, ilast, 2)), + body); + } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + NULL_USE(fuser); + RAJA::kernel( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1), + make_range(ifirst, ilast, 2)), + body); + } }; } // namespace detail @@ -205,6 +281,17 @@ inline void for_all(int begin, int end, LoopBody body) RAJA::forall::Policy>(RAJA::RangeSegment(begin, end), body); } +template ::value, int>::type = 0> +inline void for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) +{ + if (fuser == nullptr) { + RAJA::forall::Policy>(RAJA::RangeSegment(begin, end), body); + } else { + fuser->enqueue(begin, end, body); + } +} + template ::value, int>::type = 0> inline void for_all(int begin, int end, LoopBody body) @@ -212,6 +299,17 @@ inline void for_all(int begin, int end, LoopBody body) RAJA::forall(RAJA::RangeSegment(begin, end), body); } +template ::value, int>::type = 0> +inline void for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) +{ + if (fuser == nullptr) { + RAJA::forall(RAJA::RangeSegment(begin, end), body); + } else { + fuser->enqueue(begin, end, body); + } +} + // does NOT include end template inline void parallel_for_all(int begin, int end, LoopBody body) @@ -219,6 +317,16 @@ inline void parallel_for_all(int begin, int end, LoopBody body) for_all(begin, end, body); } +template +inline void parallel_for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) +{ + if (fuser == nullptr) { + for_all(begin, end, body); + } else { + for_all(fuser, begin, end, body); + } +} + template inline void host_parallel_for_all(int begin, int end, LoopBody body) { @@ -231,12 +339,25 @@ inline void for_all(const hier::Box& box, const int dim, LoopBody body) for_all(box.lower()(dim), box.upper()(dim) + 1, body); } + +template +inline void for_all(tbox::KernelFuser* fuser, const hier::Box& box, const int dim, LoopBody body) +{ + for_all(fuser, box.lower()(dim), box.upper()(dim) + 1, body); +} + template inline void parallel_for_all(const hier::Box& box, const int dim, LoopBody body) { for_all(box.lower()(dim), box.upper()(dim) + 1, body); } +template +inline void parallel_for_all(tbox::KernelFuser* fuser, const hier::Box& box, const int dim, LoopBody body) +{ + for_all(fuser, box.lower()(dim), box.upper()(dim) + 1, body); +} + template inline void 
host_parallel_for_all(const hier::Box& box, const int dim, LoopBody body) { @@ -250,12 +371,29 @@ inline void for_all(const hier::Box& box, LoopBody body) detail::for_all::template eval(box.lower(), box.upper(), body); } +template +inline void for_all(tbox::KernelFuser* fuser, const hier::Box& box, LoopBody body) +{ + if (fuser == nullptr) { + for_all(box, body); + } else { + constexpr int arg_count = detail::function_traits::argument_count; + detail::for_all::template eval(fuser, box.lower(), box.upper(), body); + } +} + template inline void parallel_for_all(const hier::Box& box, LoopBody body) { for_all(box, body); } +template +inline void parallel_for_all(tbox::KernelFuser* fuser, const hier::Box& box, LoopBody body) +{ + for_all(fuser, box, body); +} + template inline void host_parallel_for_all(const hier::Box& box, LoopBody body) { diff --git a/source/SAMRAI/hier/PatchData.cpp b/source/SAMRAI/hier/PatchData.cpp index 385b208c8e..2376bf6d44 100644 --- a/source/SAMRAI/hier/PatchData.cpp +++ b/source/SAMRAI/hier/PatchData.cpp @@ -31,6 +31,30 @@ PatchData::~PatchData() { } +void +PatchData::copyFuseable( + const PatchData& src, + const BoxOverlap& overlap) +{ + copy(src, overlap); +} + +void +PatchData::packStreamFuseable( + tbox::MessageStream& stream, + const BoxOverlap& overlap) const +{ + packStream(stream, overlap); +} + +void +PatchData::unpackStreamFuseable( + tbox::MessageStream& stream, + const BoxOverlap& overlap) +{ + unpackStream(stream, overlap); +} + /* ************************************************************************* * diff --git a/source/SAMRAI/hier/PatchData.h b/source/SAMRAI/hier/PatchData.h index a4a4c0e2ca..1d2f320b2e 100644 --- a/source/SAMRAI/hier/PatchData.h +++ b/source/SAMRAI/hier/PatchData.h @@ -20,6 +20,15 @@ #include "SAMRAI/tbox/Utilities.h" namespace SAMRAI { + +/* + * Forward declaration of KernelFuser class - required here because it sucks in + * RAJA and requires CUDA. + */ +//namespace tbox { +//class KernelFuser; +//} + namespace hier { /** @@ -160,6 +169,11 @@ class PatchData const PatchData& src, const BoxOverlap& overlap) = 0; + virtual void + copyFuseable( + const PatchData& src, + const BoxOverlap& overlap); + /** * Copy data from the source into the destination using the designated * overlap descriptor. The overlap description will have been computed @@ -206,6 +220,18 @@ class PatchData tbox::MessageStream& stream, const BoxOverlap& overlap) const = 0; + /** + * Pack data lying on the specified index set into the output stream using + * the given KernelFuser. The default implementation of this method will + * call packStream without the fuser argument. See the abstract stream + * virtual base class for more information about the packing operators + * defined for streams. + */ + virtual void + packStreamFuseable( + tbox::MessageStream& stream, + const BoxOverlap& overlap) const; + /** * Unpack data from the message stream into the specified index set. * See the abstract stream virtual base class for more information about @@ -216,6 +242,18 @@ class PatchData tbox::MessageStream& stream, const BoxOverlap& overlap) = 0; + /** + * Unpack data from the message stream into the specified index set using + * the given KernelFuser. The default implementation of this method will + * call unpackStream without the fuser argument. See the abstract stream + * virtual base class for more information about the packing operators + * defined for streams. 
+ */ + virtual void + unpackStreamFuseable( + tbox::MessageStream& stream, + const BoxOverlap& overlap); + /** * Checks that class version and restart file version are equal. If so, * reads in the data members common to all patch data types from restart diff --git a/source/SAMRAI/hier/PatchLevel.cpp b/source/SAMRAI/hier/PatchLevel.cpp index 30f29c3b66..671ba9f5ac 100644 --- a/source/SAMRAI/hier/PatchLevel.cpp +++ b/source/SAMRAI/hier/PatchLevel.cpp @@ -9,7 +9,9 @@ ************************************************************************/ #include "SAMRAI/hier/PatchLevel.h" +#include "SAMRAI/tbox/Collectives.h" #include "SAMRAI/tbox/MathUtilities.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/TimerManager.h" #include "SAMRAI/hier/BaseGridGeometry.h" #include "SAMRAI/hier/BoxContainer.h" diff --git a/source/SAMRAI/hier/PatchLevel.h b/source/SAMRAI/hier/PatchLevel.h index eba972eb5a..595db02210 100644 --- a/source/SAMRAI/hier/PatchLevel.h +++ b/source/SAMRAI/hier/PatchLevel.h @@ -17,6 +17,7 @@ #include "SAMRAI/hier/BoxLevel.h" #include "SAMRAI/hier/PatchFactory.h" #include "SAMRAI/hier/ProcessorMapping.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/Utilities.h" #include @@ -758,6 +759,11 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->allocatePatchData(id, timestamp); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif + } /*! @@ -775,6 +781,11 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->allocatePatchData(components, timestamp); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif + } /*! @@ -812,6 +823,10 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->deallocatePatchData(id); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif } /*! @@ -829,6 +844,10 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->deallocatePatchData(components); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif } /*! diff --git a/source/SAMRAI/pdat/ArrayData.cpp b/source/SAMRAI/pdat/ArrayData.cpp index f65d23ed7d..33733c826b 100644 --- a/source/SAMRAI/pdat/ArrayData.cpp +++ b/source/SAMRAI/pdat/ArrayData.cpp @@ -11,6 +11,8 @@ #ifndef included_pdat_ArrayData_C #define included_pdat_ArrayData_C +#include "SAMRAI/tbox/KernelFuser.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/MessageStream.h" #include "SAMRAI/tbox/Utilities.h" #include "SAMRAI/tbox/MathUtilities.h" @@ -101,7 +103,8 @@ ArrayData::ArrayData( , d_array(d_depth * d_offset), #endif - d_on_host(true) + d_on_host(true), + d_use_fuser(false) { TBOX_ASSERT(depth > 0); @@ -132,7 +135,8 @@ ArrayData::ArrayData( #else d_array(d_depth * d_offset), #endif - d_on_host(true) + d_on_host(true), + d_use_fuser(false) { #ifndef HAVE_UMPIRE NULL_USE(allocator); @@ -298,12 +302,14 @@ void ArrayData::copy( const TYPE* const src_ptr = &src.d_array[0]; const size_t n = d_offset * d_depth; #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? 
+ tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { copyop(dst_ptr[i], src_ptr[i]); }); } else { - hier::parallel_for_all(0, n, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) { copyop(dst_ptr[i], src_ptr[i]); }); } @@ -492,12 +498,15 @@ void ArrayData::copyDepth( #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; + if (d_on_host) { hier::host_parallel_for_all(0, d_offset, [=] (int i) { copyop(dst_ptr_d[i], src_ptr_d[i]); }); } else { - hier::parallel_for_all(0, d_offset, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, d_offset, [=] SAMRAI_HOST_DEVICE(int i) { copyop(dst_ptr_d[i], src_ptr_d[i]); }); } @@ -1009,12 +1018,15 @@ void ArrayData::fillAll( TYPE* ptr = &d_array[0]; const size_t n = d_depth * d_offset; #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; + if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { ptr[i] = t; }); } else { - hier::parallel_for_all(0, n, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) { ptr[i] = t; }); } @@ -1051,12 +1063,15 @@ void ArrayData::fill( const size_t n = d_offset; if (!d_box.empty()) { #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; + if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { ptr[i] = t; }); } else { - hier::parallel_for_all(0, n, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) { ptr[i] = t; }); } @@ -1082,6 +1097,9 @@ void ArrayData::fill( if (!ispace.empty()) { #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; + switch (ispace.getDim().getValue()) { case 1: { auto data = getView<1>(d); @@ -1090,7 +1108,7 @@ void ArrayData::fill( data(i) = t; }); } else { - hier::parallel_for_all(ispace, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, ispace, [=] SAMRAI_HOST_DEVICE(int i) { data(i) = t; }); } @@ -1103,7 +1121,7 @@ void ArrayData::fill( data(i,j) = t; }); } else { - hier::parallel_for_all(ispace, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, ispace, [=] SAMRAI_HOST_DEVICE(int i, int j) { data(i,j) = t; }); } @@ -1116,7 +1134,7 @@ void ArrayData::fill( data(i,j,k) = t; }); } else { - hier::parallel_for_all(ispace, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, ispace, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { data(i,j,k) = t; }); } diff --git a/source/SAMRAI/pdat/ArrayData.h b/source/SAMRAI/pdat/ArrayData.h index 671de1c850..4b1dbb7ce1 100644 --- a/source/SAMRAI/pdat/ArrayData.h +++ b/source/SAMRAI/pdat/ArrayData.h @@ -653,6 +653,21 @@ class ArrayData return d_on_host; } + void startKernelFuser() + { + d_use_fuser = true; + } + + void stopKernelFuser() + { + d_use_fuser = false; + } + + bool useFuser() const + { + return d_use_fuser; + } + /*! * The array data iterator iterates over the elements of a box * associated with an ArrayData object. 
This typedef is @@ -729,6 +744,7 @@ class ArrayData #endif bool d_on_host; + bool d_use_fuser; }; #if defined(HAVE_RAJA) diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp index 6aee4cbdda..0e8fd6db96 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp @@ -14,10 +14,11 @@ #include "SAMRAI/pdat/ArrayDataOperationUtilities.h" #include "SAMRAI/pdat/ArrayData.h" #include "SAMRAI/hier/ForAll.h" -#include "SAMRAI/tbox/Utilities.h" #include "SAMRAI/pdat/SumOperation.h" #include "SAMRAI/tbox/Collectives.h" #include "SAMRAI/tbox/NVTXUtilities.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" +#include "SAMRAI/tbox/Utilities.h" namespace SAMRAI { @@ -112,6 +113,12 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( bool on_host = (src_on_host && dst_on_host); #endif +#if defined(HAVE_RAJA) + bool use_fuser = dst.useFuser(); + tbox::KernelFuser* fuser = use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; +#endif + /* * Loop over the depth sections of the data arrays. */ @@ -131,7 +138,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( op(dest(i), s2(i)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { op(dest(i), s2(i)); }); } @@ -148,7 +155,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( op(dest(i, j), s2(i, j)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { op(dest(i, j), s2(i, j)); }); } @@ -166,7 +173,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( op(dest(i, j, k), s2(i, j, k)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { op(dest(i, j, k), s2(i, j, k)); }); } @@ -312,6 +319,12 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( bool on_host = arraydata.dataOnHost(); #endif +#if defined(HAVE_RAJA) + bool use_fuser = arraydata.useFuser(); + tbox::KernelFuser* fuser = use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; +#endif + /* * Loop over the depth sections of the data arrays. 
*/ @@ -335,7 +348,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( op(dest(i), source(i)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { op(dest(i), source(i)); }); } @@ -349,7 +362,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( op(dest(i, j), source(i, j)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { op(dest(i, j), source(i, j)); }); } @@ -363,7 +376,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( op(dest(i, j, k), source(i, j, k)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { op(dest(i, j, k), source(i, j, k)); }); } @@ -511,6 +524,12 @@ inline void ArrayDataOperationUtilities >::doArr bool on_host = (src_on_host && dst_on_host); #endif +#if defined(HAVE_RAJA) +// bool use_fuser = dst.useFuser(); +// tbox::KernelFuser* fuser = use_fuser ? +// tbox::KernelFuser::getFuser() : nullptr; +#endif + /* * Loop over the depth sections of the data arrays. */ @@ -535,6 +554,7 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { double &dest_real = reinterpret_cast(dest(i))[0]; double &dest_imag = reinterpret_cast(dest(i))[1]; @@ -562,6 +582,7 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { double &dest_real = reinterpret_cast(dest(i,j))[0]; double &dest_imag = reinterpret_cast(dest(i,j))[1]; @@ -590,6 +611,7 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { double &dest_real = reinterpret_cast(dest(i,j,k))[0]; double &dest_imag = reinterpret_cast(dest(i,j,k))[1]; @@ -745,6 +767,12 @@ inline void ArrayDataOperationUtilities >::doAr bool on_host = arraydata.dataOnHost(); #endif +#if defined(HAVE_RAJA) +// bool use_fuser = arraydata.useFuser(); +// tbox::KernelFuser* fuser = use_fuser ? +// tbox::KernelFuser::getFuser() : nullptr; +#endif + /* * Loop over the depth sections of the data arrays. 
*/ @@ -773,6 +801,7 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { double &dest_real = reinterpret_cast(dest(i))[0]; double &dest_imag = reinterpret_cast(dest(i))[1]; @@ -797,6 +826,7 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { double &dest_real = reinterpret_cast(dest(i,j))[0]; double &dest_imag = reinterpret_cast(dest(i,j))[1]; @@ -821,6 +851,7 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { double &dest_real = reinterpret_cast(dest(i,j,k))[0]; double &dest_imag = reinterpret_cast(dest(i,j,k))[1]; diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.h b/source/SAMRAI/pdat/ArrayDataOperationUtilities.h index 0315f9def3..bca5e2d8fa 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.h +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.h @@ -113,7 +113,6 @@ class ArrayDataOperationUtilities ArrayDataOperationUtilities& operator = ( const ArrayDataOperationUtilities&); - }; } diff --git a/source/SAMRAI/pdat/ArrayView.h b/source/SAMRAI/pdat/ArrayView.h index d649e817d3..8f3c6d2bcf 100644 --- a/source/SAMRAI/pdat/ArrayView.h +++ b/source/SAMRAI/pdat/ArrayView.h @@ -70,7 +70,7 @@ struct ArrayView<1, TYPE> : public RAJA::View{ {box.lower()[0]} }, - std::array{ {box.upper()[0]} }, + std::array{ {box.upper()[0]+1} }, RAJA::as_array::get())){} }; @@ -84,7 +84,7 @@ struct ArrayView<2, TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1]} }, - std::array{ {box.upper()[0], box.upper()[1]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1} }, RAJA::as_array::get())){} }; @@ -98,7 +98,7 @@ struct ArrayView<3, TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1], box.lower()[2]} }, - std::array{ {box.upper()[0], box.upper()[1], box.upper()[2]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1, box.upper()[2]+1} }, RAJA::as_array::get())){}; }; @@ -112,7 +112,7 @@ struct ArrayView<1, const TYPE> : public RAJA::View{ {box.lower()[0]} }, - std::array{ {box.upper()[0]} }, + std::array{ {box.upper()[0]+1} }, RAJA::as_array::get())){} }; @@ -126,7 +126,7 @@ struct ArrayView<2, const TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1]} }, - std::array{ {box.upper()[0], box.upper()[1]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1} }, RAJA::as_array::get())){} }; @@ -141,7 +141,7 @@ struct ArrayView<3, const TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1], box.lower()[2]} }, - std::array{ {box.upper()[0], box.upper()[1], box.upper()[2]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1, box.upper()[2]+1} }, RAJA::as_array::get())){}; }; diff --git a/source/SAMRAI/pdat/CMakeLists.txt b/source/SAMRAI/pdat/CMakeLists.txt index 7162d09099..bae6902eab 100644 --- a/source/SAMRAI/pdat/CMakeLists.txt +++ b/source/SAMRAI/pdat/CMakeLists.txt @@ -339,6 +339,10 @@ target_include_directories( $ $) +blt_print_target_properties(TARGET SAMRAI_pdat) +blt_print_target_properties(TARGET raja) +blt_print_target_properties(TARGET RAJA) + install(TARGETS SAMRAI_pdat EXPORT 
SAMRAITargets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/source/SAMRAI/pdat/CellData.h b/source/SAMRAI/pdat/CellData.h index 99e84adb7d..df90422254 100644 --- a/source/SAMRAI/pdat/CellData.h +++ b/source/SAMRAI/pdat/CellData.h @@ -279,6 +279,14 @@ class CellData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copyFuseable( + const hier::PatchData& src, + const hier::BoxOverlap& overlap) + { + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -356,6 +364,14 @@ class CellData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStreamFuseable( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap) const + { + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object * over the specified box overlap region. The overlap must be a @@ -368,6 +384,15 @@ class CellData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStreamFuseable( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap) + { + unpackStream(stream, overlap); + } + + /*! * @brief Add data from source to destination (i.e., this) * patch data object on the given overlap. diff --git a/source/SAMRAI/pdat/EdgeData.h b/source/SAMRAI/pdat/EdgeData.h index cd9cbaf514..6fc6124934 100644 --- a/source/SAMRAI/pdat/EdgeData.h +++ b/source/SAMRAI/pdat/EdgeData.h @@ -395,6 +395,7 @@ class EdgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/pdat/NodeData.h b/source/SAMRAI/pdat/NodeData.h index 62c665fdce..b645d0a297 100644 --- a/source/SAMRAI/pdat/NodeData.h +++ b/source/SAMRAI/pdat/NodeData.h @@ -284,6 +284,16 @@ class NodeData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copyFuseable( + const hier::PatchData& src, + const hier::BoxOverlap& overlap) + { + d_data->startKernelFuser(); + copy(src, overlap); + d_data->stopKernelFuser(); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -362,6 +372,16 @@ class NodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStreamFuseable( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap) const + { + d_data->startKernelFuser(); + packStream(stream, overlap); + d_data->stopKernelFuser(); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be a @@ -374,6 +394,17 @@ class NodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStreamFuseable( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap) + { + d_data->startKernelFuser(); + unpackStream(stream, overlap); + d_data->stopKernelFuser(); + } + + /*! * @brief Fill all values at depth d with the value t. 
* diff --git a/source/SAMRAI/pdat/OuterfaceData.h b/source/SAMRAI/pdat/OuterfaceData.h index d43d00a798..51d324f8f6 100644 --- a/source/SAMRAI/pdat/OuterfaceData.h +++ b/source/SAMRAI/pdat/OuterfaceData.h @@ -415,6 +415,16 @@ class OuterfaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -427,6 +437,16 @@ class OuterfaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/tbox/AllocatorDatabase.cpp b/source/SAMRAI/tbox/AllocatorDatabase.cpp index db7f6f720b..77dc0ad319 100644 --- a/source/SAMRAI/tbox/AllocatorDatabase.cpp +++ b/source/SAMRAI/tbox/AllocatorDatabase.cpp @@ -100,6 +100,16 @@ AllocatorDatabase::initialize() rm.makeAllocator("samrai::stream_allocator", allocator); } + if (!rm.isAllocator("samrai::fuser_allocator")) { +#if defined(HAVE_CUDA) + auto allocator = rm.getAllocator(umpire::resource::Pinned); +#else + auto allocator = rm.getAllocator(umpire::resource::Host); +#endif + + rm.makeAllocator("samrai::fuser_allocator", allocator); + } + if (!rm.isAllocator("samrai::temporary_data_allocator")) { #if defined(HAVE_CUDA) //auto allocator = rm.getAllocator(umpire::resource::Device); @@ -133,6 +143,13 @@ AllocatorDatabase::getStreamAllocator() return umpire::TypedAllocator(rm.getAllocator("samrai::stream_allocator")); } +umpire::TypedAllocator +AllocatorDatabase::getKernelFuserAllocator() +{ + umpire::ResourceManager& rm = umpire::ResourceManager::getInstance(); + return umpire::TypedAllocator(rm.getAllocator("samrai::fuser_allocator")); +} + umpire::TypedAllocator AllocatorDatabase::getInternalHostAllocator() { diff --git a/source/SAMRAI/tbox/AllocatorDatabase.h b/source/SAMRAI/tbox/AllocatorDatabase.h index 311d9c4abc..51869eca73 100644 --- a/source/SAMRAI/tbox/AllocatorDatabase.h +++ b/source/SAMRAI/tbox/AllocatorDatabase.h @@ -88,6 +88,13 @@ class AllocatorDatabase umpire::TypedAllocator getStreamAllocator(); #endif + /*! + * @brief Get the kernel fuser allocator. + */ +#ifdef HAVE_UMPIRE + umpire::TypedAllocator getKernelFuserAllocator(); +#endif + /*! * @brief Get a host allocator. 
*/ diff --git a/source/SAMRAI/tbox/AsyncCommPeer.cpp b/source/SAMRAI/tbox/AsyncCommPeer.cpp index 4cf86de49a..f9c3611147 100644 --- a/source/SAMRAI/tbox/AsyncCommPeer.cpp +++ b/source/SAMRAI/tbox/AsyncCommPeer.cpp @@ -76,6 +76,7 @@ AsyncCommPeer::AsyncCommPeer(): d_external_buf(0), d_internal_buf_size(0), d_internal_buf(0), + d_count_buf(0), d_mpi(SAMRAI_MPI::getSAMRAIWorld()), d_tag0(-1), d_tag1(-1), @@ -119,6 +120,7 @@ AsyncCommPeer::AsyncCommPeer( d_external_buf(0), d_internal_buf_size(0), d_internal_buf(0), + d_count_buf(0), d_mpi(SAMRAI_MPI::getSAMRAIWorld()), d_tag0(-1), d_tag1(-1), @@ -165,6 +167,16 @@ AsyncCommPeer::~AsyncCommPeer() #endif d_internal_buf = 0; } + if (d_count_buf) { +#ifdef HAVE_UMPIRE + d_allocator.deallocate( + (char*)d_count_buf, 2 * sizeof(FlexData)); +#else + free(d_count_buf); +#endif + d_count_buf = 0; + } + d_first_recv_buf = 0; } @@ -570,7 +582,31 @@ AsyncCommPeer::checkRecv( // Post receive for first (and maybe only) chunk of data. const size_t first_chunk_count = getNumberOfFlexData( d_max_first_data_len); - resizeBuffer(first_chunk_count + 2); + + if (first_chunk_count > 0) { + resizeBuffer(first_chunk_count + 2); + d_first_recv_buf = d_internal_buf; + } else { + // If the size of the first chunk is zero, due to + // d_max_first_data_len being set to zero, then we use + // a small buffer to get only the full count size, deferring + // the receipt of the full data to the second Irecv. + if (d_count_buf) { +#ifdef HAVE_UMPIRE + d_allocator.deallocate( + (char*)d_count_buf, 2 * sizeof(FlexData)); +#else + free(d_count_buf); +#endif + } +#ifdef HAVE_UMPIRE + d_count_buf = + (FlexData *)d_allocator.allocate(2 * sizeof(FlexData)); +#else + d_count_buf = (FlexData *)malloc(2 * sizeof(FlexData)); +#endif + d_first_recv_buf = d_count_buf; + } TBOX_ASSERT(req[0] == MPI_REQUEST_NULL); #ifdef DEBUG_CHECK_ASSERTIONS #endif t_recv_timer->start(); d_mpi_err = d_mpi.Irecv( - d_internal_buf, + d_first_recv_buf, static_cast(sizeof(FlexData) * (first_chunk_count + 2)), MPI_BYTE, d_peer_rank, @@ -647,9 +683,9 @@ AsyncCommPeer::checkRecv( TBOX_ASSERT(mpi_status[0].MPI_SOURCE == d_peer_rank); TBOX_ASSERT(req[0] == MPI_REQUEST_NULL); // Get full count embedded in message. - d_full_count = d_internal_buf[count - 1].d_i; + d_full_count = d_first_recv_buf[count - 1].d_i; - TBOX_ASSERT(d_internal_buf[count - 2].d_i == 0); // Sequence number check. + TBOX_ASSERT(d_first_recv_buf[count - 2].d_i == 0); // Sequence number check. TBOX_ASSERT(getNumberOfFlexData(d_full_count) >= count - 2); if (d_full_count > d_max_first_data_len) { @@ -664,7 +700,18 @@ AsyncCommPeer::checkRecv( const size_t second_chunk_count = getNumberOfFlexData( d_full_count - d_max_first_data_len); - resizeBuffer(d_internal_buf_size + second_chunk_count); + size_t new_internal_buf_size = + d_internal_buf_size + second_chunk_count; + + // If the first Irecv didn't use d_internal_buf, then + // the message in the second Irecv will contain the entire + // buffer of data for this communication instance, and we need + // to add 2 to the buffer size to make room for the trailing + // metadata. 
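+ // Hypothetical example: with d_max_first_data_len == 0, the first
+ // Irecv used only the two-item d_count_buf, so d_internal_buf_size
+ // is still zero here. If second_chunk_count works out to 100
+ // FlexData items, the second message also carries the two trailing
+ // metadata items (sequence number and full count), so we resize to
+ // 102 rather than 100.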
+ if (d_internal_buf_size == 0) { + new_internal_buf_size += 2; + } + resizeBuffer(new_internal_buf_size); TBOX_ASSERT(req[1] == MPI_REQUEST_NULL); req[1] = MPI_REQUEST_NULL; @@ -971,6 +1018,16 @@ AsyncCommPeer::clearRecvData() #endif d_internal_buf = 0; } + if (d_count_buf) { +#ifdef HAVE_UMPIRE + d_allocator.deallocate( + (char*)d_count_buf, 2 * sizeof(FlexData)); +#else + free(d_count_buf); +#endif + d_count_buf = 0; + } + d_first_recv_buf = 0; } /* diff --git a/source/SAMRAI/tbox/AsyncCommPeer.h b/source/SAMRAI/tbox/AsyncCommPeer.h index e3a0b09bcc..e8e60eb9d6 100644 --- a/source/SAMRAI/tbox/AsyncCommPeer.h +++ b/source/SAMRAI/tbox/AsyncCommPeer.h @@ -615,7 +615,10 @@ class AsyncCommPeer:public AsyncCommStage::Member * for overhead data. */ size_t d_internal_buf_size; - FlexData* d_internal_buf; + FlexData* d_internal_buf = nullptr; + + FlexData* d_count_buf = nullptr; + FlexData* d_first_recv_buf = nullptr; /*! * diff --git a/source/SAMRAI/tbox/CMakeLists.txt b/source/SAMRAI/tbox/CMakeLists.txt index 3ce0bccd93..8027d882f5 100644 --- a/source/SAMRAI/tbox/CMakeLists.txt +++ b/source/SAMRAI/tbox/CMakeLists.txt @@ -27,6 +27,7 @@ set ( tbox_headers InputDatabase.h InputManager.h IOStream.h + KernelFuser.h Logger.h MathUtilities.h MathUtilities.cpp @@ -48,9 +49,11 @@ set ( tbox_headers SAMRAI_MPI.h SAMRAIManager.h Schedule.h + ScheduleKernelFuser.h Serializable.h SiloDatabase.h SiloDatabaseFactory.h + StagedKernelFusers.h StartupShutdownManager.h Statistic.h Statistician.h @@ -59,6 +62,7 @@ set ( tbox_headers TimerManager.h Tracer.h Transaction.h + TransactionFuseable.h Utilities.h) set_source_files_properties( @@ -88,6 +92,7 @@ set (tbox_sources HDFDatabaseFactory.cpp IEEE.cpp InputManager.cpp + KernelFuser.cpp Logger.cpp MathUtilitiesSpecial.cpp MemoryDatabase.cpp @@ -106,9 +111,11 @@ set (tbox_sources SAMRAI_MPI.cpp Scanner.cpp Schedule.cpp + ScheduleKernelFuser.cpp Serializable.cpp SiloDatabase.cpp SiloDatabaseFactory.cpp + StagedKernelFusers.cpp StartupShutdownManager.cpp StatTransaction.cpp Statistic.cpp @@ -117,6 +124,7 @@ set (tbox_sources TimerManager.cpp Tracer.cpp Transaction.cpp + TransactionFuseable.cpp Utilities.cpp) if (ENABLE_HDF5) @@ -148,9 +156,11 @@ if (ENABLE_RAJA) endif () if (ENABLE_CUDA) - set(cuda_sources GPUUtilities.cpp Schedule.cpp) + set(cuda_sources GPUUtilities.cpp Schedule.cpp TransactionFuseable.cpp) set_source_files_properties(${cuda_sources} PROPERTIES LANGUAGE CUDA) + set (tbox_depends ${tbox_depends} cuda) + if (ENABLE_NVTX_REGIONS) find_package(CUDA REQUIRED) @@ -178,6 +188,9 @@ target_include_directories( SAMRAI_tbox $ $) +blt_print_target_properties( + TARGET SAMRAI_tbox) + install(TARGETS SAMRAI_tbox EXPORT SAMRAITargets diff --git a/source/SAMRAI/tbox/ExecutionPolicy.h b/source/SAMRAI/tbox/ExecutionPolicy.h index 2fc09b52c4..90c70ccc9b 100644 --- a/source/SAMRAI/tbox/ExecutionPolicy.h +++ b/source/SAMRAI/tbox/ExecutionPolicy.h @@ -11,6 +11,8 @@ #ifndef included_tbox_ExecutionPolicy #define included_tbox_ExecutionPolicy +#include "SAMRAI/SAMRAI_config.h" + #if defined(HAVE_RAJA) #include "RAJA/RAJA.hpp" @@ -112,6 +114,11 @@ struct policy_traits { >; using ReductionPolicy = RAJA::cuda_reduce; + + using WorkGroupPolicy = RAJA::WorkGroupPolicy< + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects>; }; #else @@ -146,6 +153,12 @@ struct policy_traits { >; using ReductionPolicy = RAJA::seq_reduce; + + using WorkGroupPolicy = RAJA::WorkGroupPolicy< + RAJA::loop_work, + 
RAJA::reverse_ordered, + RAJA::ragged_array_of_objects>; + }; #endif // HAVE_CUDA diff --git a/source/SAMRAI/tbox/KernelFuser.cpp b/source/SAMRAI/tbox/KernelFuser.cpp new file mode 100644 index 0000000000..6f77f3639f --- /dev/null +++ b/source/SAMRAI/tbox/KernelFuser.cpp @@ -0,0 +1,30 @@ +/************************************************************************* + * + * This file is part of the SAMRAI distribution. For full copyright + * information, see COPYRIGHT and LICENSE. + * + * Copyright: (c) 1997-2021 Lawrence Livermore National Security, LLC + * Description: Singleton kernel fuser + * + ************************************************************************/ + +#include "SAMRAI/tbox/KernelFuser.h" + + +namespace SAMRAI { +namespace tbox { + + +KernelFuser::~KernelFuser() +{ +} + +void +KernelFuser::initialize() +{ +} + + +} +} + diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h new file mode 100644 index 0000000000..8b187cae6e --- /dev/null +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -0,0 +1,116 @@ +#ifndef included_tbox_KernelFuser +#define included_tbox_KernelFuser + +#include "SAMRAI/SAMRAI_config.h" + +#ifdef HAVE_UMPIRE +#include "umpire/ResourceManager.hpp" +#include "umpire/TypedAllocator.hpp" +#endif + +#include "SAMRAI/tbox/ExecutionPolicy.h" +#include "SAMRAI/tbox/AllocatorDatabase.h" +#include "SAMRAI/tbox/Utilities.h" + +#ifdef HAVE_RAJA +#include "RAJA/RAJA.hpp" +#endif + + +#ifdef HAVE_UMPIRE +#include "umpire/Allocator.hpp" +#include "umpire/TypedAllocator.hpp" +#endif + +namespace SAMRAI { +namespace tbox { + +class KernelFuser +{ +public: + + +#ifdef HAVE_RAJA + template + void enqueue(int begin, int end, Kernel&& kernel) { + if (d_launched) { + TBOX_ERROR("KernelFuser Error: Cannot enqueue until cleanup is called after the previous launch."); + } + + d_workpool.enqueue(RAJA::RangeSegment(begin, end), std::forward(kernel)); + } +#endif + + void launch() + { + if (d_launched) { + TBOX_ERROR("KernelFuser Error: This KernelFuser has already been launched."); + } + +#ifdef HAVE_RAJA + if (d_workpool.num_loops() > 0) { + d_workgroup = d_workpool.instantiate(); + d_worksite = d_workgroup.run(); + } + d_launched = true; +#endif + } + + void cleanup() + { +#ifdef HAVE_RAJA + d_workpool.clear(); + d_workgroup.clear(); + d_worksite.clear(); + d_launched = false; +#endif + } + + bool launched() const + { + return d_launched; + } + + void initialize(); + + KernelFuser() : +#ifdef HAVE_RAJA + d_workpool(AllocatorDatabase::getDatabase()->getKernelFuserAllocator()), + d_workgroup(d_workpool.instantiate()), + d_worksite(d_workgroup.run()), +#endif + d_launched(false) + { + } + + + virtual ~KernelFuser(); + + +private: +#ifdef HAVE_UMPIRE + using Allocator = umpire::TypedAllocator; +#else + using Allocator = ResourceAllocator; +#endif + +#ifdef HAVE_RAJA + using Policy = typename tbox::detail::policy_traits< tbox::policy::parallel >::WorkGroupPolicy; + using WorkPool = RAJA::WorkPool , Allocator>; + using WorkGroup = RAJA::WorkGroup, Allocator>; + using WorkSite = RAJA::WorkSite , Allocator>; +#endif + +#ifdef HAVE_RAJA + WorkPool d_workpool; + WorkGroup d_workgroup; + WorkSite d_worksite; +#endif + + bool d_launched; +}; + +} +} + +#endif diff --git a/source/SAMRAI/tbox/Schedule.cpp b/source/SAMRAI/tbox/Schedule.cpp index 72c3021fb6..0e2cfa61da 100644 --- a/source/SAMRAI/tbox/Schedule.cpp +++ b/source/SAMRAI/tbox/Schedule.cpp @@ -10,6 +10,7 @@ #include "SAMRAI/tbox/Schedule.h" #include "SAMRAI/tbox/AllocatorDatabase.h" #include "SAMRAI/tbox/InputManager.h" 
+#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/PIO.h" #include "SAMRAI/tbox/SAMRAIManager.h" #include "SAMRAI/tbox/SAMRAI_MPI.h" @@ -40,7 +41,11 @@ const int Schedule::s_default_second_tag = 1; * MPI communication. This parameter should be dependent on the MPI * implementation. */ +#if defined(HAVE_CUDA) +const size_t Schedule::s_default_first_message_length = 0; +#else const size_t Schedule::s_default_first_message_length = 1000; +#endif const std::string Schedule::s_default_timer_prefix("tbox::Schedule"); std::map Schedule::s_static_timers; @@ -60,7 +65,6 @@ Schedule::s_initialize_finalize_handler( */ Schedule::Schedule(): - d_coms(0), d_com_stage(), d_mpi(SAMRAI_MPI::getSAMRAIWorld()), d_first_tag(s_default_first_tag), @@ -101,13 +105,40 @@ Schedule::addTransaction( const int src_id = transaction->getSourceProcessor(); const int dst_id = transaction->getDestinationProcessor(); + std::shared_ptr fuseable_transaction{ + std::dynamic_pointer_cast(transaction)}; + if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { - d_local_set.push_front(transaction); + if (fuseable_transaction) { + if (!d_local_fusers) { + d_local_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_local_fusers); + d_local_set_fuseable.push_front(fuseable_transaction); + } else { + d_local_set.push_front(transaction); + } } else { if (d_mpi.getRank() == dst_id) { - d_recv_sets[src_id].push_front(transaction); + if (fuseable_transaction) { + if (!d_recv_fusers) { + d_recv_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_recv_fusers); + d_recv_sets_fuseable[src_id].push_front(fuseable_transaction); + } else { + d_recv_sets[src_id].push_front(transaction); + } } else if (d_mpi.getRank() == src_id) { - d_send_sets[dst_id].push_front(transaction); + if (fuseable_transaction) { + if (!d_send_fusers) { + d_send_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_send_fusers); + d_send_sets_fuseable[dst_id].push_front(fuseable_transaction); + } else { + d_send_sets[dst_id].push_front(transaction); + } } } } @@ -126,13 +157,40 @@ Schedule::appendTransaction( const int src_id = transaction->getSourceProcessor(); const int dst_id = transaction->getDestinationProcessor(); + std::shared_ptr fuseable_transaction{ + std::dynamic_pointer_cast(transaction)}; + if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { - d_local_set.push_back(transaction); + if (fuseable_transaction) { + if (!d_local_fusers) { + d_local_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_local_fusers); + d_local_set_fuseable.push_back(fuseable_transaction); + } else { + d_local_set.push_back(transaction); + } } else { if (d_mpi.getRank() == dst_id) { - d_recv_sets[src_id].push_back(transaction); + if (fuseable_transaction) { + if (!d_recv_fusers) { + d_recv_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_recv_fusers); + d_recv_sets_fuseable[src_id].push_back(fuseable_transaction); + } else { + d_recv_sets[src_id].push_back(transaction); + } } else if (d_mpi.getRank() == src_id) { - d_send_sets[dst_id].push_back(transaction); + if (fuseable_transaction) { + if (!d_send_fusers) { + d_send_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_send_fusers); + d_send_sets_fuseable[dst_id].push_back(transaction); + } else { + d_send_sets[dst_id].push_back(transaction); + } } } } @@ -149,8 +207,13 
@@ Schedule::getNumSendTransactions( int size = 0; TransactionSets::const_iterator mi = d_send_sets.find(rank); if (mi != d_send_sets.end()) { - size = static_cast(mi->second.size()); + size += static_cast(mi->second.size()); + } + mi = d_send_sets_fuseable.find(rank); + if (mi != d_send_sets_fuseable.end()) { + size += static_cast(mi->second.size()); } + return size; } @@ -166,7 +229,11 @@ Schedule::getNumRecvTransactions( int size = 0; TransactionSets::const_iterator mi = d_recv_sets.find(rank); if (mi != d_recv_sets.end()) { - size = static_cast(mi->second.size()); + size += static_cast(mi->second.size()); + } + mi = d_recv_sets_fuseable.find(rank); + if (mi != d_recv_sets_fuseable.end()) { + size += static_cast(mi->second.size()); } return size; } @@ -186,6 +253,7 @@ Schedule::communicate() #endif d_object_timers->t_communicate->start(); + d_completed_transactions = false; beginCommunication(); finalizeCommunication(); d_object_timers->t_communicate->stop(); @@ -226,9 +294,6 @@ Schedule::finalizeCommunication() { d_object_timers->t_finalize_communication->start(); performLocalCopies(); -#if defined(HAVE_RAJA) - parallel_synchronize(); -#endif processCompletedCommunications(); deallocateCommunicationObjects(); d_object_timers->t_finalize_communication->stop(); @@ -245,7 +310,7 @@ Schedule::finalizeCommunication() void Schedule::postReceives() { - if (d_recv_sets.empty()) { + if (d_recv_sets.empty() && d_recv_sets_fuseable.empty()) { /* * Short cut because some looping logic in this method assumes * non-empty d_recv_sets. @@ -253,6 +318,8 @@ Schedule::postReceives() return; } + d_completed_transactions = true; + int rank = d_mpi.getRank(); /* @@ -263,60 +330,87 @@ Schedule::postReceives() * send posted earlier is paired with a receive that is also posted * earlier. */ - AsyncCommPeer* recv_coms = d_coms; + for (CommMap::reverse_iterator comm_peer(d_recv_coms.lower_bound(rank)); + comm_peer != d_recv_coms.rend(); + ++comm_peer) { + const int recv_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; + // Compute incoming message size, if possible. + unsigned int byte_count = 0; + bool can_estimate_incoming_message_size = true; - // Initialize iterators to where we want to start looping. - size_t icom = 0; // Index into recv_coms. - while (icom < d_recv_sets.size() && - recv_coms[icom].getPeerRank() < rank) { - ++icom; - } - icom = icom > 0 ? icom - 1 : d_recv_sets.size() - 1; + for (const auto& t : d_recv_sets[recv_rank] ) { + if (!t->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + break; + } + byte_count += + static_cast(t->computeIncomingMessageSize()); + } - // Map iterator mi corresponds to recv_coms[icom]. - TransactionSets::const_iterator mi = - d_recv_sets.find(recv_coms[icom].getPeerRank()); + for (const auto& t: d_recv_sets_fuseable[recv_rank]) { + if (!t->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + break; + } + byte_count += + static_cast(t->computeIncomingMessageSize()); + } - for (size_t counter = 0; - counter < d_recv_sets.size(); - ++counter, --mi, --icom) { + // Set AsyncCommPeer to receive known message length. + if (can_estimate_incoming_message_size) { + comm->limitFirstDataLength(byte_count); + } - TBOX_ASSERT(mi->first == recv_coms[icom].getPeerRank()); + // Begin non-blocking receive operation. 
+ d_object_timers->t_post_receives->start(); + comm->beginRecv(); + if (comm->isDone()) { + comm->pushToCompletionQueue(); + } + d_object_timers->t_post_receives->stop(); + } + CommMap::reverse_iterator stop(d_recv_coms.lower_bound(rank)); + for (CommMap::reverse_iterator comm_peer = d_recv_coms.rbegin(); comm_peer != stop; ++comm_peer) { + const int recv_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; // Compute incoming message size, if possible. - const std::list >& transactions = - mi->second; unsigned int byte_count = 0; bool can_estimate_incoming_message_size = true; - for (ConstIterator r = transactions.begin(); - r != transactions.end(); ++r) { - if (!(*r)->canEstimateIncomingMessageSize()) { + + for (const auto& t : d_recv_sets[recv_rank] ) { + if (!t->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + break; + } + byte_count += + static_cast(t->computeIncomingMessageSize()); + } + + for (const auto& t: d_recv_sets_fuseable[recv_rank]) { + if (!t->canEstimateIncomingMessageSize()) { can_estimate_incoming_message_size = false; break; } byte_count += - static_cast((*r)->computeIncomingMessageSize()); + static_cast(t->computeIncomingMessageSize()); } // Set AsyncCommPeer to receive known message length. if (can_estimate_incoming_message_size) { - recv_coms[icom].limitFirstDataLength(byte_count); + comm->limitFirstDataLength(byte_count); } // Begin non-blocking receive operation. d_object_timers->t_post_receives->start(); - recv_coms[icom].beginRecv(); - if (recv_coms[icom].isDone()) { - recv_coms[icom].pushToCompletionQueue(); + comm->beginRecv(); + if (comm->isDone()) { + comm->pushToCompletionQueue(); } d_object_timers->t_post_receives->stop(); - - if (mi == d_recv_sets.begin()) { - // Continue loop at the opposite end. - mi = d_recv_sets.end(); - icom = d_recv_sets.size(); - } } + } /* @@ -339,38 +433,100 @@ Schedule::postSends() int rank = d_mpi.getRank(); - AsyncCommPeer* send_coms = d_coms + d_recv_sets.size(); + for (auto comm_peer = d_send_coms.lower_bound(rank); + comm_peer != d_send_coms.end(); + ++comm_peer) { + const int peer_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; - // Initialize iterators to where we want to start looping. - TransactionSets::const_iterator mi = d_send_sets.upper_bound(rank); - size_t icom = 0; // send_coms[icom] corresponds to mi. - while (icom < d_send_sets.size() && - send_coms[icom].getPeerRank() < rank) { - ++icom; - } + size_t byte_count = 0; + bool can_estimate_incoming_message_size = true; + for (const auto& transaction : d_send_sets[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + } + byte_count += transaction->computeOutgoingMessageSize(); + } + + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + } + byte_count += transaction->computeOutgoingMessageSize(); + } + + // Pack outgoing data into a message. + MessageStream outgoing_stream( + byte_count, + MessageStream::Write, + nullptr, + true +#ifdef HAVE_UMPIRE + , AllocatorDatabase::getDatabase()->getStreamAllocator() +#endif + ); + + d_object_timers->t_pack_stream->start(); - for (size_t counter = 0; - counter < d_send_sets.size(); - ++counter, ++mi, ++icom) { + bool have_fuseable = !(d_send_sets_fuseable[peer_rank].empty()); - if (mi == d_send_sets.end()) { - // Continue loop at the opposite end. 
- mi = d_send_sets.begin(); - icom = 0; + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + transaction->packStream(outgoing_stream); } - TBOX_ASSERT(mi->first == send_coms[icom].getPeerRank()); + if (d_send_fusers && have_fuseable) { + d_send_fusers->launch(); + } + + for (const auto& transaction : d_send_sets[peer_rank]) { + transaction->packStream(outgoing_stream); + } + + bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); + + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; +#if defined(HAVE_RAJA) + parallel_synchronize(); + if (d_send_fusers) d_send_fusers->cleanup(); +#endif + } + + d_object_timers->t_pack_stream->stop(); + + if (can_estimate_incoming_message_size) { + // Receiver knows message size so set it exactly. + comm->limitFirstDataLength(byte_count); + } + + // Begin non-blocking send operation. + comm->beginSend( + (const char *)outgoing_stream.getBufferStart(), + static_cast(outgoing_stream.getCurrentSize())); + if (comm->isDone()) { + comm->pushToCompletionQueue(); + } + } + + for (auto comm_peer = d_send_coms.begin(); + comm_peer != d_send_coms.lower_bound(rank); + ++comm_peer) { + const int peer_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; - // Compute message size and whether receiver can estimate it. - const std::list >& transactions = - mi->second; size_t byte_count = 0; bool can_estimate_incoming_message_size = true; - for (ConstIterator pack = transactions.begin(); - pack != transactions.end(); ++pack) { - if (!(*pack)->canEstimateIncomingMessageSize()) { + for (const auto& transaction : d_send_sets[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + } + byte_count += transaction->computeOutgoingMessageSize(); + } + + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { can_estimate_incoming_message_size = false; } - byte_count += (*pack)->computeOutgoingMessageSize(); + byte_count += transaction->computeOutgoingMessageSize(); } // Pack outgoing data into a message. @@ -384,28 +540,42 @@ Schedule::postSends() #endif ); + bool have_fuseable = !(d_send_sets_fuseable[peer_rank].empty()); + d_object_timers->t_pack_stream->start(); - for (ConstIterator pack = transactions.begin(); - pack != transactions.end(); ++pack) { - (*pack)->packStream(outgoing_stream); + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + transaction->packStream(outgoing_stream); } -#if defined(HAVE_RAJA) - parallel_synchronize(); + if (d_send_fusers && have_fuseable) { + d_send_fusers->launch(); + } + + for (const auto& transaction : d_send_sets[peer_rank]) { + transaction->packStream(outgoing_stream); + } + bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); + + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; +#if defined(HAVE_RAJA) + parallel_synchronize(); + if (d_send_fusers) d_send_fusers->cleanup(); #endif + } d_object_timers->t_pack_stream->stop(); if (can_estimate_incoming_message_size) { // Receiver knows message size so set it exactly. - send_coms[icom].limitFirstDataLength(byte_count); + comm->limitFirstDataLength(byte_count); } // Begin non-blocking send operation. 
- send_coms[icom].beginSend( + comm->beginSend( (const char *)outgoing_stream.getBufferStart(), static_cast(outgoing_stream.getCurrentSize())); - if (send_coms[icom].isDone()) { - send_coms[icom].pushToCompletionQueue(); + if (comm->isDone()) { + comm->pushToCompletionQueue(); } } @@ -420,12 +590,30 @@ void Schedule::performLocalCopies() { + bool have_fuseable = !d_local_set_fuseable.empty(); + d_object_timers->t_local_copies->start(); - for (Iterator local = d_local_set.begin(); - local != d_local_set.end(); ++local) { - (*local)->copyLocalData(); + for (const auto& local : d_local_set_fuseable) { + local->copyLocalData(); + } + if (d_local_fusers && have_fuseable) { + d_local_fusers->launch(); + } + + for (const auto& local : d_local_set) { + local->copyLocalData(); } d_object_timers->t_local_copies->stop(); + + bool have_non_fuseable = !d_local_set.empty(); + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; +#if defined(HAVE_RAJA) + parallel_synchronize(); + if (d_local_fusers) d_local_fusers->cleanup(); +#endif + } + } /* @@ -446,39 +634,57 @@ Schedule::processCompletedCommunications() if (d_unpack_in_deterministic_order) { // Unpack in deterministic order. Wait for receive as needed. + // Deterministic order is lowest to highest recv rank int irecv = 0; - for (TransactionSets::iterator recv_itr = d_recv_sets.begin(); - recv_itr != d_recv_sets.end(); ++recv_itr, ++irecv) { - - int sender = recv_itr->first; - AsyncCommPeer& completed_comm = d_coms[irecv]; - TBOX_ASSERT(sender == completed_comm.getPeerRank()); - completed_comm.completeCurrentOperation(); - completed_comm.yankFromCompletionQueue(); + for (auto& comms : d_recv_coms) { + auto& completed_comm = comms.second; + int sender = comms.first; + TBOX_ASSERT(sender == completed_comm->getPeerRank()); + completed_comm->completeCurrentOperation(); + completed_comm->yankFromCompletionQueue(); MessageStream incoming_stream( - static_cast(completed_comm.getRecvSize()) * sizeof(char), + static_cast(completed_comm->getRecvSize()) * sizeof(char), MessageStream::Read, - completed_comm.getRecvData(), + completed_comm->getRecvData(), false /* don't use deep copy */ #ifdef HAVE_UMPIRE , AllocatorDatabase::getDatabase()->getStreamAllocator() #endif ); + + bool have_fuseable = !(d_recv_sets_fuseable[sender].empty()); + d_object_timers->t_unpack_stream->start(); - for (Iterator recv = d_recv_sets[sender].begin(); - recv != d_recv_sets[sender].end(); ++recv) { - (*recv)->unpackStream(incoming_stream); + for (const auto& transaction : d_recv_sets_fuseable[sender]) { + transaction->unpackStream(incoming_stream); + } + if (d_recv_fusers && have_fuseable) { + d_recv_fusers->launch(); } #if defined(HAVE_RAJA) - parallel_synchronize(); + if (have_fuseable) { + parallel_synchronize(); + if (d_recv_fusers) d_recv_fusers->cleanup(); + } +#endif + for (const auto& transaction : d_recv_sets[sender]) { + transaction->unpackStream(incoming_stream); + } + bool have_non_fuseable = !(d_recv_sets[sender].empty()); +#if defined(HAVE_RAJA) + if (have_non_fuseable) { + parallel_synchronize(); + } #endif + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; + } d_object_timers->t_unpack_stream->stop(); - completed_comm.clearRecvData(); - + completed_comm->clearRecvData(); } // Complete sends. 
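For context, the pack, local-copy, and unpack paths above all follow the same staged pattern: fuseable transactions only enqueue their kernels on a StagedKernelFusers instance (handed to them via setKernelFuser() when they were added to the schedule), the fusers are launched once per peer, and cleanup happens after a device synchronize. A condensed sketch of that lifecycle, using only names introduced in this diff (the container name fuseable_transactions is hypothetical):

   // Stage: each fuseable transaction enqueues kernels instead of running them.
   tbox::StagedKernelFusers* fusers = tbox::StagedKernelFusers::getInstance();
   for (const auto& transaction : fuseable_transactions) {
      transaction->copyLocalData();   // or packStream()/unpackStream()
   }
   fusers->launch();         // run all staged loop bodies as fused kernels
   parallel_synchronize();   // wait for device completion before reuse
   fusers->cleanup();        // reset the fusers for the next phase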
@@ -499,7 +705,7 @@ Schedule::processCompletedCommunications()
       TBOX_ASSERT(completed_comm != 0);
       TBOX_ASSERT(completed_comm->isDone());

-      if (static_cast<size_t>(completed_comm - d_coms) < num_senders) {
+      if (!completed_comm->isSender()) {

          const int sender = completed_comm->getPeerRank();

@@ -513,14 +719,34 @@ Schedule::processCompletedCommunications()
 #endif
          );

+         bool have_fuseable = !(d_recv_sets_fuseable[sender].empty());
+
          d_object_timers->t_unpack_stream->start();
-         for (Iterator recv = d_recv_sets[sender].begin();
-              recv != d_recv_sets[sender].end(); ++recv) {
-            (*recv)->unpackStream(incoming_stream);
+         for (const auto& transaction : d_recv_sets_fuseable[sender]) {
+            transaction->unpackStream(incoming_stream);
+         }
+         if (d_recv_fusers && have_fuseable) {
+            d_recv_fusers->launch();
          }
 #if defined(HAVE_RAJA)
-         parallel_synchronize();
+         if (have_fuseable) {
+            parallel_synchronize();
+            if (d_recv_fusers) d_recv_fusers->cleanup();
+         }
+#endif
+         for (const auto& transaction : d_recv_sets[sender]) {
+            transaction->unpackStream(incoming_stream);
+         }
+         bool have_non_fuseable = !(d_recv_sets[sender].empty());
+#if defined(HAVE_RAJA)
+         if (have_non_fuseable) {
+            parallel_synchronize();
+         }
 #endif
+         if (have_fuseable || have_non_fuseable) {
+            d_completed_transactions = true;
+         }
+
          d_object_timers->t_unpack_stream->stop();

          completed_comm->clearRecvData();
       } else {
@@ -542,37 +768,69 @@ Schedule::processCompletedCommunications()
 void
 Schedule::allocateCommunicationObjects()
 {
-   const size_t length = d_recv_sets.size() + d_send_sets.size();
-   if (length > 0) {
-      d_coms = new AsyncCommPeer<char>[length];
+   for (const auto& transaction : d_recv_sets) {
+      int rank = transaction.first;
+
+      auto peer = std::make_shared<AsyncCommPeer<char>>();
+      peer->initialize(&d_com_stage);
+      peer->setPeerRank(rank);
+      peer->setMPITag(d_first_tag, d_second_tag);
+      peer->setMPI(d_mpi);
+      peer->limitFirstDataLength(d_first_message_length);
+#ifdef HAVE_UMPIRE
+      peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
+#endif
+      d_recv_coms[rank] = peer;
    }
-   size_t counter = 0;
-   for (TransactionSets::iterator ti = d_recv_sets.begin();
-        ti != d_recv_sets.end();
-        ++ti) {
-      d_coms[counter].initialize(&d_com_stage);
-      d_coms[counter].setPeerRank(ti->first);
-      d_coms[counter].setMPITag(d_first_tag, d_second_tag);
-      d_coms[counter].setMPI(d_mpi);
-      d_coms[counter].limitFirstDataLength(d_first_message_length);
+   for (const auto& transaction : d_recv_sets_fuseable) {
+      int rank = transaction.first;
+
+      if (d_recv_coms.find(rank) == d_recv_coms.end()) {
+         auto peer = std::make_shared<AsyncCommPeer<char>>();
+         peer->initialize(&d_com_stage);
+         peer->setPeerRank(rank);
+         peer->setMPITag(d_first_tag, d_second_tag);
+         peer->setMPI(d_mpi);
+         peer->limitFirstDataLength(d_first_message_length);
 #ifdef HAVE_UMPIRE
-      d_coms[counter].setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
+         peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
 #endif
-      ++counter;
+         d_recv_coms[rank] = peer;
+      }
    }
-   for (TransactionSets::iterator ti = d_send_sets.begin();
-        ti != d_send_sets.end();
-        ++ti) {
-      d_coms[counter].initialize(&d_com_stage);
-      d_coms[counter].setPeerRank(ti->first);
-      d_coms[counter].setMPITag(d_first_tag, d_second_tag);
-      d_coms[counter].setMPI(d_mpi);
-      d_coms[counter].limitFirstDataLength(d_first_message_length);
+
+   for (const auto& transaction : d_send_sets) {
+      int rank = transaction.first;
+      auto peer = std::make_shared<AsyncCommPeer<char>>();
+
+      peer->initialize(&d_com_stage);
+      peer->setPeerRank(rank);
+      peer->setMPITag(d_first_tag, d_second_tag);
+      peer->setMPI(d_mpi);
+      peer->limitFirstDataLength(d_first_message_length);
 #ifdef HAVE_UMPIRE
-      d_coms[counter].setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
+      peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
 #endif
-      ++counter;
+      d_send_coms[rank] = peer;
+   }
+
+   for (const auto& transaction : d_send_sets_fuseable) {
+      int rank = transaction.first;
+
+      if (d_send_coms.find(rank) == d_send_coms.end()) {
+         auto peer = std::make_shared<AsyncCommPeer<char>>();
+
+         peer->initialize(&d_com_stage);
+         peer->setPeerRank(rank);
+         peer->setMPITag(d_first_tag, d_second_tag);
+         peer->setMPI(d_mpi);
+         peer->limitFirstDataLength(d_first_message_length);
+#ifdef HAVE_UMPIRE
+         peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
+#endif
+         d_send_coms[rank] = peer;
+      }
    }
 }
diff --git a/source/SAMRAI/tbox/Schedule.h b/source/SAMRAI/tbox/Schedule.h
index 058e050ab0..bd436bef5a 100644
--- a/source/SAMRAI/tbox/Schedule.h
+++ b/source/SAMRAI/tbox/Schedule.h
@@ -17,6 +17,9 @@
 #include "SAMRAI/tbox/SAMRAI_MPI.h"
 #include "SAMRAI/tbox/MessageStream.h"
 #include "SAMRAI/tbox/Transaction.h"
+#include "SAMRAI/tbox/TransactionFuseable.h"
+#include "SAMRAI/tbox/KernelFuser.h"
+#include "SAMRAI/tbox/StagedKernelFusers.h"

 #include
 #include
@@ -275,7 +278,7 @@ class Schedule
    bool
    allocatedCommunicationObjects()
    {
-      return d_coms != 0;
+      return (d_recv_coms.size() > 0 || d_send_coms.size() > 0);
    }

    /*!
@@ -287,16 +290,19 @@ class Schedule
       return "Schedule";
    }

+   bool completedTransactions() const
+   {
+      return d_completed_transactions;
+   }
+
 private:
    void
    allocateCommunicationObjects();
    void
    deallocateCommunicationObjects()
    {
-      if (d_coms) {
-         delete[] d_coms;
-      }
-      d_coms = 0;
+      d_send_coms.clear();
+      d_recv_coms.clear();
    }

    void
@@ -344,12 +350,28 @@ class Schedule
    TransactionSets d_send_sets;
    TransactionSets d_recv_sets;

+   TransactionSets d_send_sets_fuseable;
+   TransactionSets d_recv_sets_fuseable;
+
+   StagedKernelFusers* d_send_fusers{nullptr};
+   StagedKernelFusers* d_recv_fusers{nullptr};
+
+   bool d_completed_transactions = false;
+
    /*
    * @brief Transactions where the source and destination are the
    * local process.
    */
   std::list<std::shared_ptr<Transaction> > d_local_set;

+   /*
+   * @brief Fuseable transactions where the source and destination are the
+   * local process.
+   */
+   std::list<std::shared_ptr<Transaction> > d_local_set_fuseable;
+
+   StagedKernelFusers* d_local_fusers{nullptr};
+
   //@{ @name High-level asynchronous messages passing objects

   /*!
@@ -359,7 +381,10 @@ class Schedule
-    * d_coms is typed for byte sending because our data is of
-    * unknown mixed type.
+    * The peers are typed for byte sending because our data is of
+    * unknown mixed type.
     */
-   AsyncCommPeer<char>* d_coms;
+   using CommMap = std::map<int, std::shared_ptr<AsyncCommPeer<char>>>;
+   CommMap d_send_coms;
+   CommMap d_recv_coms;
+
   /*!
    * @brief Stage for advancing communication operations to
    * completion.
diff --git a/source/SAMRAI/tbox/ScheduleKernelFuser.cpp b/source/SAMRAI/tbox/ScheduleKernelFuser.cpp
new file mode 100644
index 0000000000..3b2883c9bb
--- /dev/null
+++ b/source/SAMRAI/tbox/ScheduleKernelFuser.cpp
@@ -0,0 +1,66 @@
+/*************************************************************************
+ *
+ * This file is part of the SAMRAI distribution.  For full copyright
+ * information, see COPYRIGHT and LICENSE.
+ *
+ * Copyright:     (c) 1997-2021 Lawrence Livermore National Security, LLC
+ * Description:   Singleton kernel fuser
+ *
+ ************************************************************************/
+
+#include "SAMRAI/tbox/ScheduleKernelFuser.h"
+
+
+namespace SAMRAI {
+namespace tbox {
+
+ScheduleKernelFuser* ScheduleKernelFuser::s_schedule_kernel_fuser_instance(nullptr);
+
+StartupShutdownManager::Handler
+ScheduleKernelFuser::s_startup_handler(
+   0,
+   ScheduleKernelFuser::startupCallback,
+   0,
+   0,
+   tbox::StartupShutdownManager::priorityArenaManager);
+
+void
+ScheduleKernelFuser::startupCallback()
+{
+   ScheduleKernelFuser::getInstance()->initialize();
+}
+
+void
+ScheduleKernelFuser::shutdownCallback()
+{
+   if (s_schedule_kernel_fuser_instance) {
+      delete s_schedule_kernel_fuser_instance;
+   }
+   s_schedule_kernel_fuser_instance = nullptr;
+}
+
+ScheduleKernelFuser *
+ScheduleKernelFuser::getInstance()
+{
+   if (!s_schedule_kernel_fuser_instance) {
+      s_schedule_kernel_fuser_instance = new ScheduleKernelFuser();
+   }
+   return s_schedule_kernel_fuser_instance;
+}
+
+ScheduleKernelFuser::~ScheduleKernelFuser()
+{
+   delete d_kernel_fuser;
+   d_kernel_fuser = nullptr;
+}
+
+void
+ScheduleKernelFuser::initialize()
+{
+   d_kernel_fuser = new KernelFuser();
+}
+
+
+}
+}
+
diff --git a/source/SAMRAI/tbox/ScheduleKernelFuser.h b/source/SAMRAI/tbox/ScheduleKernelFuser.h
new file mode 100644
index 0000000000..9e07f0cd27
--- /dev/null
+++ b/source/SAMRAI/tbox/ScheduleKernelFuser.h
@@ -0,0 +1,70 @@
+#ifndef included_tbox_ScheduleKernelFuser
+#define included_tbox_ScheduleKernelFuser
+
+#include "SAMRAI/SAMRAI_config.h"
+
+#include "SAMRAI/tbox/KernelFuser.h"
+
+#ifdef HAVE_RAJA
+#include "RAJA/RAJA.hpp"
+#endif
+
+
+namespace SAMRAI {
+namespace tbox {
+
+class ScheduleKernelFuser
+{
+public:
+
+   static ScheduleKernelFuser* getInstance();
+
+#ifdef HAVE_RAJA
+   template <typename Kernel>
+   void enqueue(int stage, int begin, int end, Kernel&& kernel) {
+      d_kernel_fuser->enqueue(begin, end, kernel);
+   }
+#endif
+
+   void launch()
+   {
+      d_kernel_fuser->launch();
+   }
+
+   void cleanup()
+   {
+      d_kernel_fuser->cleanup();
+   }
+
+   KernelFuser* getFuser()
+   {
+      return d_kernel_fuser;
+   }
+
+   void initialize();
+
+protected:
+   ScheduleKernelFuser()
+   {
+   }
+
+   virtual ~ScheduleKernelFuser();
+
+
+private:
+
+   static void startupCallback();
+   static void shutdownCallback();
+
+   static ScheduleKernelFuser* s_schedule_kernel_fuser_instance;
+
+   static StartupShutdownManager::Handler
+      s_startup_handler;
+
+   KernelFuser* d_kernel_fuser = nullptr;
+};
+
+}
+}
+
+#endif
diff --git a/source/SAMRAI/tbox/StagedKernelFusers.cpp b/source/SAMRAI/tbox/StagedKernelFusers.cpp
new file mode 100644
index 0000000000..57d885227c
--- /dev/null
+++ b/source/SAMRAI/tbox/StagedKernelFusers.cpp
@@ -0,0 +1,63 @@
+/*************************************************************************
+ *
+ * This file is part of the SAMRAI distribution.  For full copyright
+ * information, see COPYRIGHT and LICENSE.
+ *
+ * Copyright:     (c) 1997-2021 Lawrence Livermore National Security, LLC
+ * Description:   Singleton container of staged kernel fusers
+ *
+ ************************************************************************/
+
+#include "SAMRAI/tbox/StagedKernelFusers.h"
+
+
+namespace SAMRAI {
+namespace tbox {
+
+StagedKernelFusers* StagedKernelFusers::s_staged_kernel_fusers_instance(nullptr);
+
+StartupShutdownManager::Handler
+StagedKernelFusers::s_startup_handler(
+   0,
+   StagedKernelFusers::startupCallback,
+   0,
+   0,
+   tbox::StartupShutdownManager::priorityArenaManager);
+
+void
+StagedKernelFusers::startupCallback()
+{
+   StagedKernelFusers::getInstance()->initialize();
+}
+
+void
+StagedKernelFusers::shutdownCallback()
+{
+   if (s_staged_kernel_fusers_instance) {
+      delete s_staged_kernel_fusers_instance;
+   }
+   s_staged_kernel_fusers_instance = nullptr;
+}
+
+StagedKernelFusers *
+StagedKernelFusers::getInstance()
+{
+   if (!s_staged_kernel_fusers_instance) {
+      s_staged_kernel_fusers_instance = new StagedKernelFusers();
+   }
+   return s_staged_kernel_fusers_instance;
+}
+
+StagedKernelFusers::~StagedKernelFusers()
+{
+}
+
+void
+StagedKernelFusers::initialize()
+{
+}
+
+
+}
+}
+
diff --git a/source/SAMRAI/tbox/StagedKernelFusers.h b/source/SAMRAI/tbox/StagedKernelFusers.h
new file mode 100644
index 0000000000..919886b33d
--- /dev/null
+++ b/source/SAMRAI/tbox/StagedKernelFusers.h
@@ -0,0 +1,107 @@
+#ifndef included_tbox_StagedKernelFusers
+#define included_tbox_StagedKernelFusers
+
+#include "SAMRAI/SAMRAI_config.h"
+
+#include "SAMRAI/tbox/Collectives.h"
+#include "SAMRAI/tbox/KernelFuser.h"
+
+#include <map>
+
+#ifdef HAVE_RAJA
+#include "RAJA/RAJA.hpp"
+#endif
+
+
+namespace SAMRAI {
+namespace tbox {
+
+class StagedKernelFusers
+{
+public:
+
+   static StagedKernelFusers* getInstance();
+
+#ifdef HAVE_RAJA
+   template <typename Kernel>
+   void enqueue(int stage, int begin, int end, Kernel&& kernel) {
+      d_kernel_fusers[stage].enqueue(begin, end, kernel);
+   }
+#endif
+
+   void launch()
+   {
+      for (auto& fuser : d_kernel_fusers) {
+         fuser.second.launch();
+         d_launched = (d_launched || fuser.second.launched());
+      }
+   }
+
+   void cleanup()
+   {
+      for (auto& fuser : d_kernel_fusers) {
+         fuser.second.cleanup();
+      }
+      d_launched = false;
+   }
+
+   KernelFuser* getFuser(int stage)
+   {
+      return &d_kernel_fusers[stage];
+   }
+
+   void clearKernelFuser(int stage)
+   {
+      d_kernel_fusers.erase(stage);
+   }
+
+   void clearAllFusers()
+   {
+      d_kernel_fusers.clear();
+      d_launched = false;
+   }
+
+   bool launched()
+   {
+      return d_launched;
+   }
+
+   void initialize();
+
+   void launchAndCleanup()
+   {
+      launch();
+#ifdef HAVE_RAJA
+      if (d_launched) {
+         tbox::parallel_synchronize();
+      }
+#endif
+      cleanup();
+   }
+
+protected:
+   StagedKernelFusers()
+   {
+   }
+
+   virtual ~StagedKernelFusers();
+
+
+private:
+
+   static void startupCallback();
+   static void shutdownCallback();
+
+   static StagedKernelFusers* s_staged_kernel_fusers_instance;
+
+   static StartupShutdownManager::Handler
+      s_startup_handler;
+
+   std::map<int, KernelFuser> d_kernel_fusers;
+
+   bool d_launched = false;
+
+};
+
+}
+}
+
+#endif
diff --git a/source/SAMRAI/tbox/TransactionFuseable.cpp b/source/SAMRAI/tbox/TransactionFuseable.cpp
new file mode 100644
index 0000000000..f70d83f42f
--- /dev/null
+++ b/source/SAMRAI/tbox/TransactionFuseable.cpp
@@ -0,0 +1,19 @@
+#include "SAMRAI/tbox/TransactionFuseable.h"
+
+namespace SAMRAI {
+namespace tbox {
+
+void
+TransactionFuseable::setKernelFuser(StagedKernelFusers* fuser)
+{
+   d_fuser = fuser;
+}
+
+StagedKernelFusers*
+TransactionFuseable::getKernelFuser()
+{
+   return d_fuser;
+}
+
+}
+}
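
StagedKernelFusers generalizes the single fuser to one fuser per integer stage. Because std::map iterates its keys in ascending order, launch() flushes stage 0 before stage 1, giving a cheap ordering guarantee between batches. Below is a self-contained model of that container; ToyStagedFusers is a hypothetical name, and the real class holds tbox::KernelFuser values and synchronizes via tbox::parallel_synchronize() rather than running std::function callbacks.

#include <functional>
#include <iostream>
#include <map>
#include <vector>

// Illustrative stand-in for the staged container: one kernel queue per
// stage, flushed in ascending stage order by a single launch() call.
class ToyStagedFusers {
public:
   void enqueue(int stage, std::function<void()> kernel) {
      d_stages[stage].push_back(std::move(kernel));
   }
   void launch() {
      // std::map iterates keys in ascending order, so stage 0 kernels
      // always run before stage 1 kernels, and so on.
      for (auto& stage : d_stages) {
         for (auto& kernel : stage.second) {
            kernel();
         }
         d_launched = d_launched || !stage.second.empty();
      }
   }
   bool launched() const { return d_launched; }
   void cleanup() {
      d_stages.clear();
      d_launched = false;
   }
private:
   std::map<int, std::vector<std::function<void()>>> d_stages;
   bool d_launched = false;
};

int main() {
   ToyStagedFusers fusers;
   fusers.enqueue(1, [] { std::cout << "stage 1 kernel\n"; });
   fusers.enqueue(0, [] { std::cout << "stage 0 kernel\n"; });
   fusers.launch();    // prints the stage 0 kernel first, then stage 1
   if (fusers.launched()) {
      // This is where the real class would parallel_synchronize().
   }
   fusers.cleanup();   // ready for the next communication phase
   return 0;
}
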
diff --git a/source/SAMRAI/tbox/TransactionFuseable.h b/source/SAMRAI/tbox/TransactionFuseable.h
new file mode 100644
index 0000000000..686268680b
--- /dev/null
+++ b/source/SAMRAI/tbox/TransactionFuseable.h
@@ -0,0 +1,38 @@
+/*************************************************************************
+ *
+ * This file is part of the SAMRAI distribution.  For full copyright
+ * information, see COPYRIGHT and LICENSE.
+ *
+ * Copyright:     (c) 1997-2021 Lawrence Livermore National Security, LLC
+ * Description:   Base class for fuseable schedule transactions
+ *
+ ************************************************************************/
+
+#ifndef included_tbox_TransactionFuseable
+#define included_tbox_TransactionFuseable
+
+#include "SAMRAI/SAMRAI_config.h"
+
+#include "SAMRAI/tbox/Transaction.h"
+#include "SAMRAI/tbox/StagedKernelFusers.h"
+
+#include
+
+namespace SAMRAI {
+namespace tbox {
+
+class TransactionFuseable :
+   public Transaction
+{
+public:
+   void setKernelFuser(StagedKernelFusers* fuser);
+   StagedKernelFusers* getKernelFuser();
+
+private:
+   StagedKernelFusers* d_fuser{nullptr};
+};
+
+}
+}
+
+#endif
diff --git a/source/SAMRAI/xfer/CoarsenCopyTransaction.cpp b/source/SAMRAI/xfer/CoarsenCopyTransaction.cpp
index 470f3554bc..f8a287079f 100644
--- a/source/SAMRAI/xfer/CoarsenCopyTransaction.cpp
+++ b/source/SAMRAI/xfer/CoarsenCopyTransaction.cpp
@@ -131,7 +131,7 @@ CoarsenCopyTransaction::packStream(
    tbox::MessageStream& stream)
 {
    d_src_patch->getPatchData(d_coarsen_data[d_item_id]->d_src)
-   ->packStream(stream, *d_overlap);
+   ->packStreamFuseable(stream, *d_overlap);
 }

 void
@@ -139,7 +139,7 @@ CoarsenCopyTransaction::unpackStream(
    tbox::MessageStream& stream)
 {
    d_dst_patch->getPatchData(d_coarsen_data[d_item_id]->d_dst)
-   ->unpackStream(stream, *d_overlap);
+   ->unpackStreamFuseable(stream, *d_overlap);
 }

 void
@@ -151,7 +151,7 @@ CoarsenCopyTransaction::copyLocalData()
    const hier::PatchData& src_data =
       *d_src_patch->getPatchData(d_coarsen_data[d_item_id]->d_src);

-   dst_data.copy(src_data, *d_overlap);
+   dst_data.copyFuseable(src_data, *d_overlap);
 }

 /*
diff --git a/source/SAMRAI/xfer/CoarsenCopyTransaction.h b/source/SAMRAI/xfer/CoarsenCopyTransaction.h
index 2676619f6e..48588106f4 100644
--- a/source/SAMRAI/xfer/CoarsenCopyTransaction.h
+++ b/source/SAMRAI/xfer/CoarsenCopyTransaction.h
@@ -14,7 +14,7 @@

 #include "SAMRAI/SAMRAI_config.h"

-#include "SAMRAI/tbox/Transaction.h"
+#include "SAMRAI/tbox/TransactionFuseable.h"
 #include "SAMRAI/hier/BaseGridGeometry.h"
 #include "SAMRAI/hier/PatchLevel.h"
 #include "SAMRAI/xfer/CoarsenClasses.h"
@@ -40,7 +40,7 @@ namespace xfer {
  * @see tbox::Transaction
  */

-class CoarsenCopyTransaction:public tbox::Transaction
+class CoarsenCopyTransaction:public tbox::TransactionFuseable
 {
 public:
    /*!
diff --git a/source/SAMRAI/xfer/CoarsenPatchStrategy.h b/source/SAMRAI/xfer/CoarsenPatchStrategy.h
index 5a087d6693..b282db4d46 100644
--- a/source/SAMRAI/xfer/CoarsenPatchStrategy.h
+++ b/source/SAMRAI/xfer/CoarsenPatchStrategy.h
@@ -139,6 +139,43 @@ class CoarsenPatchStrategy
      const hier::Box& coarse_box,
      const hier::IntVector& ratio) = 0;

+   virtual void
+   setPostCoarsenSyncFlag()
+   {
+      setNeedCoarsenSynchronize(true);
+   }
+
+   /*!
+    * @brief Check whether host-device synchronization is needed.
+    *
+    * Returns the current value of the flag while setting the flag back to
+    * the default value of true.
+    */
+   bool
+   needSynchronize()
+   {
+      bool flag = d_need_synchronize;
+      d_need_synchronize = true;
+      return flag;
+   }
+
+protected:
+
+   /*!
+    * @brief Set flag indicating whether device synchronization is needed
+    * after a child class operation.
+    *
+    * This allows implementations of methods such as preprocessCoarsen and
+    * postprocessCoarsen to set the flag to false if they have done nothing
+    * that requires host-device synchronization and do not need
+    * CoarsenSchedule to call the synchronize routine.
+    */
+   void
+   setNeedCoarsenSynchronize(bool flag)
+   {
+      d_need_synchronize = flag;
+   }
+
 private:
   /*!
    * @brief Get the set of CoarsenPatchStrategy objects that have been
@@ -163,6 +200,8 @@ class CoarsenPatchStrategy
      current_objects.insert(this);
   }

+   bool d_need_synchronize = true;
+
 };

 }
diff --git a/source/SAMRAI/xfer/CoarsenSchedule.cpp b/source/SAMRAI/xfer/CoarsenSchedule.cpp
index 904715a62d..c7c19d7f0d 100644
--- a/source/SAMRAI/xfer/CoarsenSchedule.cpp
+++ b/source/SAMRAI/xfer/CoarsenSchedule.cpp
@@ -313,7 +315,9 @@ CoarsenSchedule::coarsenData() const
    d_schedule->communicate();

 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// if (d_schedule->completedTransactions()) {
+//    tbox::parallel_synchronize();
+// }
 #endif

    /*
@@ -1023,10 +1025,13 @@ CoarsenSchedule::coarsenSourceData(
       patch_strategy->preprocessCoarsen(*temp_patch, *fine_patch, box, block_ratio);
 #if defined(HAVE_RAJA)
-      tbox::parallel_synchronize();
+      if (patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
 #endif
    }

+   bool need_sync = false;
    for (size_t ici = 0; ici < d_number_coarsen_items; ++ici) {
       const CoarsenClasses::Data * const crs_item = d_coarsen_items[ici];
@@ -1035,22 +1040,39 @@ CoarsenSchedule::coarsenSourceData(
          crs_item->d_opcoarsen->coarsen(*temp_patch, *fine_patch,
            source_id, source_id,
            box, block_ratio);
+         need_sync = true;
       }
    }
+
 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// if (need_sync) {
+//    tbox::parallel_synchronize();
+// }
 #endif
+   NULL_USE(need_sync);

    if (patch_strategy) {
+      d_coarsen_patch_strategy->setPostCoarsenSyncFlag();
+#if defined(HAVE_RAJA)
+      if (d_coarsen_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
+#endif
+
       patch_strategy->postprocessCoarsen(*temp_patch,
         *fine_patch,
         box,
         block_ratio);
-#if defined(HAVE_RAJA)
-      tbox::parallel_synchronize();
-#endif
    }
 }
+
+   tbox::StagedKernelFusers::getInstance()->launch();
+#if defined(HAVE_RAJA)
+   if (!patch_strategy || patch_strategy->needSynchronize() || tbox::StagedKernelFusers::getInstance()->launched()) {
+      tbox::parallel_synchronize();
+   }
+#endif
+   tbox::StagedKernelFusers::getInstance()->cleanup();
+
 }

 /*
diff --git a/source/SAMRAI/xfer/RefineCopyTransaction.cpp b/source/SAMRAI/xfer/RefineCopyTransaction.cpp
index 0684dc21b2..deb939d8d5 100644
--- a/source/SAMRAI/xfer/RefineCopyTransaction.cpp
+++ b/source/SAMRAI/xfer/RefineCopyTransaction.cpp
@@ -131,7 +131,7 @@ RefineCopyTransaction::packStream(
    tbox::MessageStream& stream)
 {
    d_src_patch->getPatchData(d_refine_data[d_item_id]->d_src)
-   ->packStream(stream, *d_overlap);
+   ->packStreamFuseable(stream, *d_overlap);
 }

 void
@@ -139,7 +139,7 @@ RefineCopyTransaction::unpackStream(
    tbox::MessageStream& stream)
 {
    d_dst_patch->getPatchData(d_refine_data[d_item_id]->d_scratch)
-   ->unpackStream(stream, *d_overlap);
+   ->unpackStreamFuseable(stream, *d_overlap);
 }

 void
@@ -151,7 +151,7 @@ RefineCopyTransaction::copyLocalData()
    const hier::PatchData& src_data =
       *d_src_patch->getPatchData(d_refine_data[d_item_id]->d_src);

-   dst_data.copy(src_data, *d_overlap);
+   dst_data.copyFuseable(src_data, *d_overlap);
 }

 /*
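
The needSynchronize() protocol introduced above, and mirrored in RefinePatchStrategy and SingularityPatchStrategy below, is deliberately destructive: reading the flag re-arms it to true, so skipping one sync can never suppress a later, unrelated one. A minimal stand-alone model of the handshake (ToyStrategy, DeviceStrategy, and runScheduleStep are illustrative names only, not SAMRAI classes):

#include <iostream>

// Hypothetical mirror of the strategy/schedule handshake: the schedule
// asks the strategy whether the callback it just ran queued device
// work, and skips the (expensive) sync when it did not.
class ToyStrategy {
public:
   virtual ~ToyStrategy() {}
   // Default callback does nothing on the device, so it opts out.
   virtual void preprocess() { setNeedSynchronize(false); }
   bool needSynchronize() {
      bool flag = d_need_synchronize;
      d_need_synchronize = true;   // re-arm: stay conservative
      return flag;
   }
protected:
   void setNeedSynchronize(bool flag) { d_need_synchronize = flag; }
private:
   bool d_need_synchronize = true;
};

class DeviceStrategy : public ToyStrategy {
   void preprocess() override {
      // Pretend we launched device kernels; leave the flag armed.
   }
};

void runScheduleStep(ToyStrategy& s) {
   s.preprocess();
   if (s.needSynchronize()) {
      std::cout << "parallel_synchronize()\n";
   } else {
      std::cout << "sync skipped\n";
   }
}

int main() {
   ToyStrategy host_only;
   DeviceStrategy device;
   runScheduleStep(host_only);   // prints: sync skipped
   runScheduleStep(device);      // prints: parallel_synchronize()
   return 0;
}
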
diff --git a/source/SAMRAI/xfer/RefineCopyTransaction.h b/source/SAMRAI/xfer/RefineCopyTransaction.h
index c2cce76270..e3a0ba8278 100644
--- a/source/SAMRAI/xfer/RefineCopyTransaction.h
+++ b/source/SAMRAI/xfer/RefineCopyTransaction.h
@@ -14,7 +14,7 @@

 #include "SAMRAI/SAMRAI_config.h"

-#include "SAMRAI/tbox/Transaction.h"
+#include "SAMRAI/tbox/TransactionFuseable.h"
 #include "SAMRAI/hier/BaseGridGeometry.h"
 #include "SAMRAI/hier/PatchLevel.h"
 #include "SAMRAI/xfer/RefineClasses.h"
@@ -40,7 +40,7 @@ namespace xfer {
  * @see tbox::Transaction
  */

-class RefineCopyTransaction:public tbox::Transaction
+class RefineCopyTransaction:public tbox::TransactionFuseable
 {
 public:
    /*!
diff --git a/source/SAMRAI/xfer/RefinePatchStrategy.h b/source/SAMRAI/xfer/RefinePatchStrategy.h
index a5f79e7e43..8ac8069264 100644
--- a/source/SAMRAI/xfer/RefinePatchStrategy.h
+++ b/source/SAMRAI/xfer/RefinePatchStrategy.h
@@ -265,6 +265,7 @@ class RefinePatchStrategy
      NULL_USE(coarse_to_unfilled);
      NULL_USE(overlaps);
      NULL_USE(refine_items);
+      setNeedRefineSynchronize(false);
    }

    /*!
@@ -290,6 +291,44 @@ class RefinePatchStrategy
      NULL_USE(coarse_level);
      NULL_USE(coarse_to_fine);
      NULL_USE(coarse_to_unfilled);
+      setNeedRefineSynchronize(false);
+   }
+
+   virtual void
+   setPostRefineSyncFlag()
+   {
+      setNeedRefineSynchronize(true);
+   }
+
+   /*!
+    * @brief Check whether host-device synchronization is needed.
+    *
+    * Returns the current value of the flag while setting the flag back to
+    * the default value of true.
+    */
+   bool
+   needSynchronize()
+   {
+      bool flag = d_need_synchronize;
+      d_need_synchronize = true;
+      return flag;
+   }
+
+protected:
+
+   /*!
+    * @brief Set flag indicating whether device synchronization is needed
+    * after a child class operation.
+    *
+    * This allows implementations of methods such as preprocessRefine and
+    * postprocessRefine to set the flag to false if they have done nothing
+    * that requires host-device synchronization and do not need
+    * RefineSchedule to call the synchronize routine.
+    */
+   void
+   setNeedRefineSynchronize(bool flag)
+   {
+      d_need_synchronize = flag;
    }

 private:
@@ -327,6 +366,8 @@ class RefinePatchStrategy
      current_objects.erase(this);
    }

+   bool d_need_synchronize = true;
+
 };

 }
diff --git a/source/SAMRAI/xfer/RefineSchedule.cpp b/source/SAMRAI/xfer/RefineSchedule.cpp
index 8423d04b17..023f1b4b3d 100644
--- a/source/SAMRAI/xfer/RefineSchedule.cpp
+++ b/source/SAMRAI/xfer/RefineSchedule.cpp
@@ -29,6 +29,7 @@
 #include "SAMRAI/tbox/MathUtilities.h"
 #include "SAMRAI/tbox/InputManager.h"
 #include "SAMRAI/tbox/OpenMPUtilities.h"
+#include "SAMRAI/tbox/StagedKernelFusers.h"
 #include "SAMRAI/tbox/StartupShutdownManager.h"
 #include "SAMRAI/tbox/TimerManager.h"
 #include "SAMRAI/tbox/Utilities.h"
@@ -2098,9 +2099,11 @@ RefineSchedule::fillData(
     * space.
     */

-   copyScratchToDestination();
+   bool copied = copyScratchToDestination();
 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+   if (copied) {
+      tbox::parallel_synchronize();
+   }
 #endif

    /*
@@ -2149,6 +2152,8 @@ RefineSchedule::recursiveFill(
    double fill_time,
    bool do_physical_boundary_fill) const
 {
    /*
     * Copy data from the source interiors of the source level into the ghost
     * cells and interiors of the scratch space on the destination level
     */
    d_coarse_priority_level_schedule->communicate();
 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// TODO: Be sure that this sync isn't needed.
+// if (d_coarse_priority_level_schedule->completedTransactions()) {
+//    tbox::parallel_synchronize();
+// }
 #endif

    /*
@@ -2213,12 +2221,12 @@ RefineSchedule::recursiveFill(
     * Recursively call the fill routine to fill the required coarse fill
     * boxes on the coarser level.
     */
-
    d_coarse_interp_schedule->recursiveFill(fill_time, do_physical_boundary_fill);

 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// TODO: This sync probably isn't necessary, but keep an eye on it.
+//   tbox::parallel_synchronize();
 #endif

    /*
@@ -2338,7 +2346,10 @@ RefineSchedule::recursiveFill(
     */
    d_fine_priority_level_schedule->communicate();
 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// TODO: Be sure that this sync isn't needed.
+// if (d_fine_priority_level_schedule->completedTransactions()) {
+//    tbox::parallel_synchronize();
+// }
 #endif

    /*
@@ -2373,6 +2384,9 @@ RefineSchedule::fillPhysicalBoundaries(
    d_dst_level->setBoundaryBoxes();

    if (d_refine_patch_strategy) {
+#if defined(HAVE_RAJA)
+      bool bdry_is_filled = false;
+#endif
      for (hier::PatchLevel::iterator p(d_dst_level->begin());
           p != d_dst_level->end(); ++p) {
         const std::shared_ptr<hier::Patch>& patch(*p);
@@ -2381,8 +2395,17 @@
            setPhysicalBoundaryConditions(*patch, fill_time,
              d_boundary_fill_ghost_width);
+#if defined(HAVE_RAJA)
+            bdry_is_filled = true;
+#endif
         }
      }
+#if defined(HAVE_RAJA)
+      if (bdry_is_filled && d_refine_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
+#endif
+
    }
    t_fill_physical_boundaries->stop();
 }
@@ -2416,7 +2439,9 @@ RefineSchedule::fillSingularityBoundaries(
    const hier::IntVector& ratio = d_dst_level->getRatioToLevelZero();

    if (d_singularity_patch_strategy) {
-
+#if defined(HAVE_RAJA)
+      bool sing_is_filled = false;
+#endif
      for (hier::BlockId::block_t bn = 0;
           bn < grid_geometry->getNumberBlocks(); ++bn) {
         hier::BlockId block_id(bn);
@@ -2465,6 +2490,9 @@
                  d_dst_to_encon,
                  fill_box, nboxes[bb],
                  grid_geometry);
+#if defined(HAVE_RAJA)
+               sing_is_filled = true;
+#endif
            }
         }
      }
@@ -2492,6 +2520,9 @@
                  d_dst_to_encon,
                  fill_box, eboxes[bb],
                  grid_geometry);
+#if defined(HAVE_RAJA)
+               sing_is_filled = true;
+#endif
            }
         }
      }
@@ -2500,6 +2531,11 @@
            }
         }
      }
+#if defined(HAVE_RAJA)
+      if (sing_is_filled && d_singularity_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
+#endif
    }
    }
    t_fill_singularity_boundaries->stop();
@@ -2604,10 +2640,11 @@ RefineSchedule::allocateWorkSpace(
 **************************************************************************
 */

-void
+bool
 RefineSchedule::copyScratchToDestination() const
 {
    TBOX_ASSERT(d_dst_level);
+   bool copied = false;

    for (hier::PatchLevel::iterator p(d_dst_level->begin());
         p != d_dst_level->end(); ++p) {
@@ -2621,11 +2658,12 @@
           getPatchData(dst_id)->getTime(),
           patch->getPatchData(src_id)->getTime()));
         patch->getPatchData(dst_id)->copy(*patch->getPatchData(src_id));
+         copied = true;
      }
    }

-}
+   return copied;
+}

 /*
@@ -2647,6 +2685,7 @@ RefineSchedule::refineScratchData(
    overlaps) const
 {
    t_refine_scratch_data->start();

 #ifdef DEBUG_CHECK_ASSERTIONS
    bool is_encon = (fine_level == d_encon_level);
@@ -2661,6 +2700,11 @@
         coarse_to_unfilled,
         overlaps,
         d_refine_items);
+#if defined(HAVE_RAJA)
+      if (d_refine_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
+#endif
    }

    const hier::IntVector ratio(fine_level->getRatioToLevelZero()
@@ -2713,7 +2757,9 @@ RefineSchedule::refineScratchData(
           fill_boxes,
           local_ratio);
 #if defined(HAVE_RAJA)
-         tbox::parallel_synchronize();
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
 #endif
      }

@@ -2729,20 +2775,24 @@ RefineSchedule::refineScratchData(
            ref_item->d_oprefine->refine(*fine_patch, *crse_patch,
              scratch_id,
              scratch_id,
              *refine_overlap, local_ratio);
-         }
      }
-#if defined(HAVE_RAJA)
-      parallel_synchronize();
-#endif

      if (d_refine_patch_strategy) {
+         d_refine_patch_strategy->setPostRefineSyncFlag();
+#if defined(HAVE_RAJA)
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
+#endif
         d_refine_patch_strategy->postprocessRefineBoxes(*fine_patch,
           *crse_patch,
           fill_boxes,
           local_ratio);
 #if defined(HAVE_RAJA)
-         tbox::parallel_synchronize();
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
 #endif
      }

@@ -2773,12 +2823,14 @@ RefineSchedule::refineScratchData(
        d_nbr_blk_fill_level->getPatch(unfilled_id));

      if (d_refine_patch_strategy) {
-            d_refine_patch_strategy->preprocessRefineBoxes(*nbr_fill_patch,
+         d_refine_patch_strategy->preprocessRefineBoxes(*nbr_fill_patch,
           *crse_patch,
           fill_boxes,
           local_ratio);
 #if defined(HAVE_RAJA)
-         tbox::parallel_synchronize();
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
 #endif
      }

@@ -2808,7 +2860,9 @@ RefineSchedule::refineScratchData(
           fill_boxes,
           local_ratio);
 #if defined(HAVE_RAJA)
-         tbox::parallel_synchronize();
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
 #endif
      }

@@ -2837,6 +2891,12 @@ RefineSchedule::refineScratchData(
      }
    }

+   tbox::StagedKernelFusers::getInstance()->launch();
+#if defined(HAVE_RAJA)
+   tbox::parallel_synchronize();
+#endif
+   tbox::StagedKernelFusers::getInstance()->cleanup();
+
    if (d_refine_patch_strategy) {
      d_refine_patch_strategy->postprocessRefineLevel(
        *fine_level,
        coarse_to_fine,
        coarse_to_unfilled);
 #if defined(HAVE_RAJA)
-      tbox::parallel_synchronize();
+      if (d_refine_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
 #endif
    }

diff --git a/source/SAMRAI/xfer/RefineSchedule.h b/source/SAMRAI/xfer/RefineSchedule.h
index 0dd10c8c93..cf1335e865 100644
--- a/source/SAMRAI/xfer/RefineSchedule.h
+++ b/source/SAMRAI/xfer/RefineSchedule.h
@@ -534,9 +534,11 @@ class RefineSchedule
     * If the scratch and destination patch data components are the same,
     * then no copying is performed.
     *
+    * @return True only if copies were performed.
+    *
     * @pre d_dst_level
     */
-   void
+   bool
    copyScratchToDestination() const;

    /*!
diff --git a/source/SAMRAI/xfer/SingularityPatchStrategy.h b/source/SAMRAI/xfer/SingularityPatchStrategy.h
index 144a656b05..0124f258f5 100644
--- a/source/SAMRAI/xfer/SingularityPatchStrategy.h
+++ b/source/SAMRAI/xfer/SingularityPatchStrategy.h
@@ -90,6 +90,27 @@ class SingularityPatchStrategy
      const hier::BoundaryBox& boundary_box,
      const std::shared_ptr<hier::BaseGridGeometry>& grid_geometry) = 0;

+   bool
+   needSynchronize()
+   {
+      bool flag = d_need_synchronize;
+      d_need_synchronize = true;
+      return flag;
+   }
+
+protected:
+
+   void
+   setNeedSingularitySynchronize(bool flag)
+   {
+      d_need_synchronize = flag;
+   }
+
+private:
+
+   bool d_need_synchronize = true;
+
 };

 }
diff --git a/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt b/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt
index 2d329d7d69..a524991ca4 100644
--- a/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt
+++ b/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt
@@ -8,7 +8,8 @@ blt_add_executable(
   SAMRAI_hier
   SAMRAI_geom
   SAMRAI_mesh
-  SAMRAI_tbox)
+  SAMRAI_tbox
+  cuda)

 target_compile_definitions(mblcu PUBLIC TESTING=1)
diff --git a/source/test/MappingConnector/CMakeLists.txt b/source/test/MappingConnector/CMakeLists.txt
index 4359563198..2b7e2386eb 100644
--- a/source/test/MappingConnector/CMakeLists.txt
+++ b/source/test/MappingConnector/CMakeLists.txt
@@ -8,7 +8,8 @@ blt_add_executable(
   SAMRAI_hier
   SAMRAI_geom
   SAMRAI_mesh
-  SAMRAI_tbox)
+  SAMRAI_tbox
+  cuda)

 target_compile_definitions(mapping-connector PUBLIC TESTING=1)
diff --git a/source/test/mblktree/CMakeLists.txt b/source/test/mblktree/CMakeLists.txt
index 4abe346d13..88e6349abe 100644
--- a/source/test/mblktree/CMakeLists.txt
+++ b/source/test/mblktree/CMakeLists.txt
@@ -5,7 +5,8 @@ set (mblktree_depends_on
   SAMRAI_hier
   SAMRAI_geom
   SAMRAI_mesh
-  SAMRAI_tbox)
+  SAMRAI_tbox
+  cuda)

 blt_add_executable(
   NAME mblktree
diff --git a/source/test/sparsedata/CMakeLists.txt b/source/test/sparsedata/CMakeLists.txt
index f14417c13c..f61867c5ab 100644
--- a/source/test/sparsedata/CMakeLists.txt
+++ b/source/test/sparsedata/CMakeLists.txt
@@ -23,7 +23,8 @@ blt_add_executable(
   SAMRAI_tbox
   SAMRAI_hier
   SAMRAI_pdat
-   SAMRAI_geom)
+   SAMRAI_geom
+   cuda)

 target_compile_definitions(sparse PUBLIC TESTING=1)
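
One property worth noting across all of these changes is that fusion is strictly opt-in: a transaction with no fuser installed packs and unpacks exactly as before, while one with a fuser only enqueues its kernels and relies on the schedule to launch the batch afterwards. The sketch below models that nullptr dispatch; ToyTransaction and Fuser are hypothetical stand-ins for TransactionFuseable and the fuser classes, not SAMRAI code.

#include <functional>
#include <iostream>
#include <vector>

// Hypothetical mirror of the fuseable-transaction dispatch: with no
// fuser installed a transaction runs its kernel immediately; with one
// installed it only enqueues, and the batch is launched later.
using Fuser = std::vector<std::function<void()>>;

class ToyTransaction {
public:
   explicit ToyTransaction(int id) : d_id(id) {}
   void setFuser(Fuser* fuser) { d_fuser = fuser; }
   void packStream() {
      auto body = [id = d_id]() { std::cout << "pack " << id << '\n'; };
      if (d_fuser == nullptr) {
         body();                    // unfused: run the kernel right away
      } else {
         d_fuser->push_back(body);  // fused: defer to the batch
      }
   }
private:
   int d_id;
   Fuser* d_fuser = nullptr;
};

int main() {
   Fuser fuser;
   ToyTransaction t0(0), t1(1);
   t0.setFuser(&fuser);
   t1.setFuser(&fuser);
   t0.packStream();
   t1.packStream();   // nothing printed yet: both bodies are queued
   for (auto& k : fuser) {
      k();            // one fused "launch" runs both kernels
   }
   return 0;
}
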