diff --git a/CMakeLists.txt b/CMakeLists.txt index fb792da9f5..758b49302b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,8 +59,6 @@ option(ENABLE_SAMRAI_TESTS "Enable SAMRAI Test Programs" On) option(ENABLE_PERF_TESTS "Enable Performance Tests." Off) set(NUM_PERF_PROCS 8 CACHE INT "Number of processors for performance tests.") option(ENABLE_CHECK_ASSERTIONS "Enable assertion checking." On) -option(ENABLE_CHECK_DEV_ASSERTIONS "Enable SAMRAI developer assertion checking." Off) -option(ENABLE_CHECK_DIM_ASSERTIONS "Enable assertion checking for dimensions." Off) option(ENABLE_BOX_COUNTING "Turns on box telemetry." Off) option(ENABLE_DEPRECATED "Build with deprecated features." On) option(ENABLE_TIMERS "Enable SAMRAI timers." On) @@ -72,6 +70,7 @@ set(CUDA_ARCH "sm_70" CACHE STRING "Compute architecture to pass to CUDA builds" set(CMAKE_CUDA_FLAGS "" CACHE STRING "") set(CMAKE_INSTALL_LIBDIR lib) #set(CMAKE_INSTALL_RPATH_USE_LINK_PATH Off CACHE Bool "Rpath uses Link path") +set(SAMRAI_RAJA_WORKGROUP_THREADS 512 CACHE INT "Number of workgroup threads") include(GNUInstallDirs) diff --git a/config/SAMRAI_config.h.cmake.in b/config/SAMRAI_config.h.cmake.in index 013a8e5609..ad9a805b93 100644 --- a/config/SAMRAI_config.h.cmake.in +++ b/config/SAMRAI_config.h.cmake.in @@ -340,6 +340,8 @@ /* Maximum dimension allowed */ #define SAMRAI_MAXIMUM_DIMENSION @SAMRAI_MAXIMUM_DIMENSION@ +#define SAMRAI_RAJA_WORKGROUP_THREADS @SAMRAI_RAJA_WORKGROUP_THREADS@ + /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS @@ -358,7 +360,9 @@ /* Configure for compiling on BGL family of machines */ #undef __BGL_FAMILY__ - +#ifdef HAVE_RAJA +#define SAMRAI_HAVE_KERNEL_FUSER +#endif namespace SAMRAI { static const unsigned short MAX_DIM_VAL = SAMRAI_MAXIMUM_DIMENSION; diff --git a/source/SAMRAI/hier/CMakeLists.txt b/source/SAMRAI/hier/CMakeLists.txt index 8cdff48343..1342c6251e 100644 --- a/source/SAMRAI/hier/CMakeLists.txt +++ b/source/SAMRAI/hier/CMakeLists.txt @@ -140,6 +140,10 @@ if (ENABLE_MPI) set (hier_depends ${hier_depends} mpi) endif () +if (ENABLE_CUDA) + set (hier_depends ${hier_depends} cuda) +endif () + blt_add_library( NAME SAMRAI_hier SOURCES ${hier_sources} diff --git a/source/SAMRAI/hier/ForAll.h b/source/SAMRAI/hier/ForAll.h index 533702ba31..656b4b2c14 100644 --- a/source/SAMRAI/hier/ForAll.h +++ b/source/SAMRAI/hier/ForAll.h @@ -19,6 +19,7 @@ #include "SAMRAI/hier/Box.h" #include "SAMRAI/hier/Index.h" #include "SAMRAI/tbox/ExecutionPolicy.h" +#include "SAMRAI/tbox/KernelFuser.h" #include #include @@ -145,8 +146,37 @@ struct for_all<1> { RAJA::make_tuple(make_range(ifirst, ilast, 0)), body); } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + if (fuser == nullptr) { + RAJA::kernel::Policy1d>( + RAJA::make_tuple(make_range(ifirst, ilast, 0)), + body); + } else { + fuser->enqueue(ifirst(0), ilast(0), body); + } + } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + if (fuser == nullptr) { + RAJA::kernel( + RAJA::make_tuple(make_range(ifirst, ilast, 0)), + body); + } else { + fuser->enqueue(ifirst(0), ilast(0), body); + } + } }; + +// 2D and 3D don't use the fuser for anything pending support for +// multidimensional loops in KernelFuser. 
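+// A minimal usage sketch of the fused 1D path (illustrative comment only;
+// the names "fuser", "n", "dst", and "src" are hypothetical caller-side
+// names). Passing a non-null KernelFuser enqueues the loop body on the
+// fuser's RAJA WorkPool instead of launching it immediately; the caller
+// later runs everything staged so far as one fused kernel:
+//
+//   tbox::KernelFuser* fuser = ...;  // e.g. from tbox::StagedKernelFusers
+//   hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) {
+//      dst[i] = src[i];
+//   });
+//   fuser->launch();   // instantiate the WorkGroup and run all loops
+//   fuser->cleanup();  // clear WorkPool/WorkGroup/WorkSite for reuse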
template <> struct for_all<2> { template { make_range(ifirst, ilast, 1)), body); } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + NULL_USE(fuser); + RAJA::kernel::Policy2d>( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1)), + body); + } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + NULL_USE(fuser); + RAJA::kernel( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1)), + body); + } }; template <> @@ -193,6 +245,30 @@ struct for_all<3> { make_range(ifirst, ilast, 2)), body); } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + NULL_USE(fuser); + RAJA::kernel::Policy3d>( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1), + make_range(ifirst, ilast, 2)), + body); + } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + NULL_USE(fuser); + RAJA::kernel( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1), + make_range(ifirst, ilast, 2)), + body); + } }; } // namespace detail @@ -205,6 +281,17 @@ inline void for_all(int begin, int end, LoopBody body) RAJA::forall::Policy>(RAJA::RangeSegment(begin, end), body); } +template ::value, int>::type = 0> +inline void for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) +{ + if (fuser == nullptr) { + RAJA::forall::Policy>(RAJA::RangeSegment(begin, end), body); + } else { + fuser->enqueue(begin, end, body); + } +} + template ::value, int>::type = 0> inline void for_all(int begin, int end, LoopBody body) @@ -212,6 +299,17 @@ inline void for_all(int begin, int end, LoopBody body) RAJA::forall(RAJA::RangeSegment(begin, end), body); } +template ::value, int>::type = 0> +inline void for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) +{ + if (fuser == nullptr) { + RAJA::forall(RAJA::RangeSegment(begin, end), body); + } else { + fuser->enqueue(begin, end, body); + } +} + // does NOT include end template inline void parallel_for_all(int begin, int end, LoopBody body) @@ -219,6 +317,16 @@ inline void parallel_for_all(int begin, int end, LoopBody body) for_all(begin, end, body); } +template +inline void parallel_for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) +{ + if (fuser == nullptr) { + for_all(begin, end, body); + } else { + for_all(fuser, begin, end, body); + } +} + template inline void host_parallel_for_all(int begin, int end, LoopBody body) { @@ -231,12 +339,25 @@ inline void for_all(const hier::Box& box, const int dim, LoopBody body) for_all(box.lower()(dim), box.upper()(dim) + 1, body); } + +template +inline void for_all(tbox::KernelFuser* fuser, const hier::Box& box, const int dim, LoopBody body) +{ + for_all(fuser, box.lower()(dim), box.upper()(dim) + 1, body); +} + template inline void parallel_for_all(const hier::Box& box, const int dim, LoopBody body) { for_all(box.lower()(dim), box.upper()(dim) + 1, body); } +template +inline void parallel_for_all(tbox::KernelFuser* fuser, const hier::Box& box, const int dim, LoopBody body) +{ + for_all(fuser, box.lower()(dim), box.upper()(dim) + 1, body); +} + template inline void 
host_parallel_for_all(const hier::Box& box, const int dim, LoopBody body) { @@ -250,12 +371,29 @@ inline void for_all(const hier::Box& box, LoopBody body) detail::for_all::template eval(box.lower(), box.upper(), body); } +template +inline void for_all(tbox::KernelFuser* fuser, const hier::Box& box, LoopBody body) +{ + if (fuser == nullptr) { + for_all(box, body); + } else { + constexpr int arg_count = detail::function_traits::argument_count; + detail::for_all::template eval(fuser, box.lower(), box.upper(), body); + } +} + template inline void parallel_for_all(const hier::Box& box, LoopBody body) { for_all(box, body); } +template +inline void parallel_for_all(tbox::KernelFuser* fuser, const hier::Box& box, LoopBody body) +{ + for_all(fuser, box, body); +} + template inline void host_parallel_for_all(const hier::Box& box, LoopBody body) { diff --git a/source/SAMRAI/hier/PatchData.cpp b/source/SAMRAI/hier/PatchData.cpp index 385b208c8e..2376bf6d44 100644 --- a/source/SAMRAI/hier/PatchData.cpp +++ b/source/SAMRAI/hier/PatchData.cpp @@ -31,6 +31,30 @@ PatchData::~PatchData() { } +void +PatchData::copyFuseable( + const PatchData& src, + const BoxOverlap& overlap) +{ + copy(src, overlap); +} + +void +PatchData::packStreamFuseable( + tbox::MessageStream& stream, + const BoxOverlap& overlap) const +{ + packStream(stream, overlap); +} + +void +PatchData::unpackStreamFuseable( + tbox::MessageStream& stream, + const BoxOverlap& overlap) +{ + unpackStream(stream, overlap); +} + /* ************************************************************************* * diff --git a/source/SAMRAI/hier/PatchData.h b/source/SAMRAI/hier/PatchData.h index a4a4c0e2ca..1d2f320b2e 100644 --- a/source/SAMRAI/hier/PatchData.h +++ b/source/SAMRAI/hier/PatchData.h @@ -20,6 +20,15 @@ #include "SAMRAI/tbox/Utilities.h" namespace SAMRAI { + +/* + * Forward declaration of KernelFuser class - required here because it sucks in + * RAJA and requires CUDA. + */ +//namespace tbox { +//class KernelFuser; +//} + namespace hier { /** @@ -160,6 +169,11 @@ class PatchData const PatchData& src, const BoxOverlap& overlap) = 0; + virtual void + copyFuseable( + const PatchData& src, + const BoxOverlap& overlap); + /** * Copy data from the source into the destination using the designated * overlap descriptor. The overlap description will have been computed @@ -206,6 +220,18 @@ class PatchData tbox::MessageStream& stream, const BoxOverlap& overlap) const = 0; + /** + * Pack data lying on the specified index set into the output stream using + * the given KernelFuser. The default implementation of this method will + * call packStream without the fuser argument. See the abstract stream + * virtual base class for more information about the packing operators + * defined for streams. + */ + virtual void + packStreamFuseable( + tbox::MessageStream& stream, + const BoxOverlap& overlap) const; + /** * Unpack data from the message stream into the specified index set. * See the abstract stream virtual base class for more information about @@ -216,6 +242,18 @@ class PatchData tbox::MessageStream& stream, const BoxOverlap& overlap) = 0; + /** + * Unpack data from the message stream into the specified index set using + * the given KernelFuser. The default implementation of this method will + * call unpackStream without the fuser argument. See the abstract stream + * virtual base class for more information about the packing operators + * defined for streams. 
+ */ + virtual void + unpackStreamFuseable( + tbox::MessageStream& stream, + const BoxOverlap& overlap); + /** * Checks that class version and restart file version are equal. If so, * reads in the data members common to all patch data types from restart diff --git a/source/SAMRAI/hier/PatchLevel.cpp b/source/SAMRAI/hier/PatchLevel.cpp index 30f29c3b66..671ba9f5ac 100644 --- a/source/SAMRAI/hier/PatchLevel.cpp +++ b/source/SAMRAI/hier/PatchLevel.cpp @@ -9,7 +9,9 @@ ************************************************************************/ #include "SAMRAI/hier/PatchLevel.h" +#include "SAMRAI/tbox/Collectives.h" #include "SAMRAI/tbox/MathUtilities.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/TimerManager.h" #include "SAMRAI/hier/BaseGridGeometry.h" #include "SAMRAI/hier/BoxContainer.h" diff --git a/source/SAMRAI/hier/PatchLevel.h b/source/SAMRAI/hier/PatchLevel.h index eba972eb5a..595db02210 100644 --- a/source/SAMRAI/hier/PatchLevel.h +++ b/source/SAMRAI/hier/PatchLevel.h @@ -17,6 +17,7 @@ #include "SAMRAI/hier/BoxLevel.h" #include "SAMRAI/hier/PatchFactory.h" #include "SAMRAI/hier/ProcessorMapping.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/Utilities.h" #include @@ -758,6 +759,11 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->allocatePatchData(id, timestamp); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif + } /*! @@ -775,6 +781,11 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->allocatePatchData(components, timestamp); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif + } /*! @@ -812,6 +823,10 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->deallocatePatchData(id); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif } /*! @@ -829,6 +844,10 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->deallocatePatchData(components); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif } /*! diff --git a/source/SAMRAI/pdat/ArrayData.cpp b/source/SAMRAI/pdat/ArrayData.cpp index f65d23ed7d..33733c826b 100644 --- a/source/SAMRAI/pdat/ArrayData.cpp +++ b/source/SAMRAI/pdat/ArrayData.cpp @@ -11,6 +11,8 @@ #ifndef included_pdat_ArrayData_C #define included_pdat_ArrayData_C +#include "SAMRAI/tbox/KernelFuser.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/MessageStream.h" #include "SAMRAI/tbox/Utilities.h" #include "SAMRAI/tbox/MathUtilities.h" @@ -101,7 +103,8 @@ ArrayData::ArrayData( , d_array(d_depth * d_offset), #endif - d_on_host(true) + d_on_host(true), + d_use_fuser(false) { TBOX_ASSERT(depth > 0); @@ -132,7 +135,8 @@ ArrayData::ArrayData( #else d_array(d_depth * d_offset), #endif - d_on_host(true) + d_on_host(true), + d_use_fuser(false) { #ifndef HAVE_UMPIRE NULL_USE(allocator); @@ -298,12 +302,14 @@ void ArrayData::copy( const TYPE* const src_ptr = &src.d_array[0]; const size_t n = d_offset * d_depth; #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? 
+ tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { copyop(dst_ptr[i], src_ptr[i]); }); } else { - hier::parallel_for_all(0, n, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) { copyop(dst_ptr[i], src_ptr[i]); }); } @@ -492,12 +498,15 @@ void ArrayData::copyDepth( #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; + if (d_on_host) { hier::host_parallel_for_all(0, d_offset, [=] (int i) { copyop(dst_ptr_d[i], src_ptr_d[i]); }); } else { - hier::parallel_for_all(0, d_offset, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, d_offset, [=] SAMRAI_HOST_DEVICE(int i) { copyop(dst_ptr_d[i], src_ptr_d[i]); }); } @@ -1009,12 +1018,15 @@ void ArrayData::fillAll( TYPE* ptr = &d_array[0]; const size_t n = d_depth * d_offset; #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; + if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { ptr[i] = t; }); } else { - hier::parallel_for_all(0, n, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) { ptr[i] = t; }); } @@ -1051,12 +1063,15 @@ void ArrayData::fill( const size_t n = d_offset; if (!d_box.empty()) { #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; + if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { ptr[i] = t; }); } else { - hier::parallel_for_all(0, n, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) { ptr[i] = t; }); } @@ -1082,6 +1097,9 @@ void ArrayData::fill( if (!ispace.empty()) { #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; + switch (ispace.getDim().getValue()) { case 1: { auto data = getView<1>(d); @@ -1090,7 +1108,7 @@ void ArrayData::fill( data(i) = t; }); } else { - hier::parallel_for_all(ispace, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, ispace, [=] SAMRAI_HOST_DEVICE(int i) { data(i) = t; }); } @@ -1103,7 +1121,7 @@ void ArrayData::fill( data(i,j) = t; }); } else { - hier::parallel_for_all(ispace, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, ispace, [=] SAMRAI_HOST_DEVICE(int i, int j) { data(i,j) = t; }); } @@ -1116,7 +1134,7 @@ void ArrayData::fill( data(i,j,k) = t; }); } else { - hier::parallel_for_all(ispace, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, ispace, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { data(i,j,k) = t; }); } diff --git a/source/SAMRAI/pdat/ArrayData.h b/source/SAMRAI/pdat/ArrayData.h index 671de1c850..4b1dbb7ce1 100644 --- a/source/SAMRAI/pdat/ArrayData.h +++ b/source/SAMRAI/pdat/ArrayData.h @@ -653,6 +653,21 @@ class ArrayData return d_on_host; } + void startKernelFuser() + { + d_use_fuser = true; + } + + void stopKernelFuser() + { + d_use_fuser = false; + } + + bool useFuser() const + { + return d_use_fuser; + } + /*! * The array data iterator iterates over the elements of a box * associated with an ArrayData object. 
This typedef is @@ -729,6 +744,7 @@ class ArrayData #endif bool d_on_host; + bool d_use_fuser; }; #if defined(HAVE_RAJA) diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp index 6aee4cbdda..0e8fd6db96 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp @@ -14,10 +14,11 @@ #include "SAMRAI/pdat/ArrayDataOperationUtilities.h" #include "SAMRAI/pdat/ArrayData.h" #include "SAMRAI/hier/ForAll.h" -#include "SAMRAI/tbox/Utilities.h" #include "SAMRAI/pdat/SumOperation.h" #include "SAMRAI/tbox/Collectives.h" #include "SAMRAI/tbox/NVTXUtilities.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" +#include "SAMRAI/tbox/Utilities.h" namespace SAMRAI { @@ -112,6 +113,12 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( bool on_host = (src_on_host && dst_on_host); #endif +#if defined(HAVE_RAJA) + bool use_fuser = dst.useFuser(); + tbox::KernelFuser* fuser = use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; +#endif + /* * Loop over the depth sections of the data arrays. */ @@ -131,7 +138,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( op(dest(i), s2(i)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { op(dest(i), s2(i)); }); } @@ -148,7 +155,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( op(dest(i, j), s2(i, j)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { op(dest(i, j), s2(i, j)); }); } @@ -166,7 +173,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( op(dest(i, j, k), s2(i, j, k)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { op(dest(i, j, k), s2(i, j, k)); }); } @@ -312,6 +319,12 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( bool on_host = arraydata.dataOnHost(); #endif +#if defined(HAVE_RAJA) + bool use_fuser = arraydata.useFuser(); + tbox::KernelFuser* fuser = use_fuser ? + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; +#endif + /* * Loop over the depth sections of the data arrays. 
*/ @@ -335,7 +348,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( op(dest(i), source(i)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { op(dest(i), source(i)); }); } @@ -349,7 +362,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( op(dest(i, j), source(i, j)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { op(dest(i, j), source(i, j)); }); } @@ -363,7 +376,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( op(dest(i, j, k), source(i, j, k)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { op(dest(i, j, k), source(i, j, k)); }); } @@ -511,6 +524,12 @@ inline void ArrayDataOperationUtilities >::doArr bool on_host = (src_on_host && dst_on_host); #endif +#if defined(HAVE_RAJA) +// bool use_fuser = dst.useFuser(); +// tbox::KernelFuser* fuser = use_fuser ? +// tbox::KernelFuser::getFuser() : nullptr; +#endif + /* * Loop over the depth sections of the data arrays. */ @@ -535,6 +554,7 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { double &dest_real = reinterpret_cast(dest(i))[0]; double &dest_imag = reinterpret_cast(dest(i))[1]; @@ -562,6 +582,7 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { double &dest_real = reinterpret_cast(dest(i,j))[0]; double &dest_imag = reinterpret_cast(dest(i,j))[1]; @@ -590,6 +611,7 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { double &dest_real = reinterpret_cast(dest(i,j,k))[0]; double &dest_imag = reinterpret_cast(dest(i,j,k))[1]; @@ -745,6 +767,12 @@ inline void ArrayDataOperationUtilities >::doAr bool on_host = arraydata.dataOnHost(); #endif +#if defined(HAVE_RAJA) +// bool use_fuser = arraydata.useFuser(); +// tbox::KernelFuser* fuser = use_fuser ? +// tbox::KernelFuser::getFuser() : nullptr; +#endif + /* * Loop over the depth sections of the data arrays. 
*/ @@ -773,6 +801,7 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { double &dest_real = reinterpret_cast(dest(i))[0]; double &dest_imag = reinterpret_cast(dest(i))[1]; @@ -797,6 +826,7 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { double &dest_real = reinterpret_cast(dest(i,j))[0]; double &dest_imag = reinterpret_cast(dest(i,j))[1]; @@ -821,6 +851,7 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { double &dest_real = reinterpret_cast(dest(i,j,k))[0]; double &dest_imag = reinterpret_cast(dest(i,j,k))[1]; diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.h b/source/SAMRAI/pdat/ArrayDataOperationUtilities.h index 0315f9def3..bca5e2d8fa 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.h +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.h @@ -113,7 +113,6 @@ class ArrayDataOperationUtilities ArrayDataOperationUtilities& operator = ( const ArrayDataOperationUtilities&); - }; } diff --git a/source/SAMRAI/pdat/ArrayView.h b/source/SAMRAI/pdat/ArrayView.h index d649e817d3..8f3c6d2bcf 100644 --- a/source/SAMRAI/pdat/ArrayView.h +++ b/source/SAMRAI/pdat/ArrayView.h @@ -70,7 +70,7 @@ struct ArrayView<1, TYPE> : public RAJA::View{ {box.lower()[0]} }, - std::array{ {box.upper()[0]} }, + std::array{ {box.upper()[0]+1} }, RAJA::as_array::get())){} }; @@ -84,7 +84,7 @@ struct ArrayView<2, TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1]} }, - std::array{ {box.upper()[0], box.upper()[1]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1} }, RAJA::as_array::get())){} }; @@ -98,7 +98,7 @@ struct ArrayView<3, TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1], box.lower()[2]} }, - std::array{ {box.upper()[0], box.upper()[1], box.upper()[2]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1, box.upper()[2]+1} }, RAJA::as_array::get())){}; }; @@ -112,7 +112,7 @@ struct ArrayView<1, const TYPE> : public RAJA::View{ {box.lower()[0]} }, - std::array{ {box.upper()[0]} }, + std::array{ {box.upper()[0]+1} }, RAJA::as_array::get())){} }; @@ -126,7 +126,7 @@ struct ArrayView<2, const TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1]} }, - std::array{ {box.upper()[0], box.upper()[1]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1} }, RAJA::as_array::get())){} }; @@ -141,7 +141,7 @@ struct ArrayView<3, const TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1], box.lower()[2]} }, - std::array{ {box.upper()[0], box.upper()[1], box.upper()[2]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1, box.upper()[2]+1} }, RAJA::as_array::get())){}; }; diff --git a/source/SAMRAI/pdat/CMakeLists.txt b/source/SAMRAI/pdat/CMakeLists.txt index 7162d09099..bae6902eab 100644 --- a/source/SAMRAI/pdat/CMakeLists.txt +++ b/source/SAMRAI/pdat/CMakeLists.txt @@ -339,6 +339,10 @@ target_include_directories( $ $) +blt_print_target_properties(TARGET SAMRAI_pdat) +blt_print_target_properties(TARGET raja) +blt_print_target_properties(TARGET RAJA) + install(TARGETS SAMRAI_pdat EXPORT 
SAMRAITargets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/source/SAMRAI/pdat/CellData.h b/source/SAMRAI/pdat/CellData.h index 99e84adb7d..df90422254 100644 --- a/source/SAMRAI/pdat/CellData.h +++ b/source/SAMRAI/pdat/CellData.h @@ -279,6 +279,14 @@ class CellData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copyFuseable( + const hier::PatchData& src, + const hier::BoxOverlap& overlap) + { + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -356,6 +364,14 @@ class CellData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStreamFuseable( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap) const + { + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object * over the specified box overlap region. The overlap must be a @@ -368,6 +384,15 @@ class CellData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStreamFuseable( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap) + { + unpackStream(stream, overlap); + } + + /*! * @brief Add data from source to destination (i.e., this) * patch data object on the given overlap. diff --git a/source/SAMRAI/pdat/EdgeData.h b/source/SAMRAI/pdat/EdgeData.h index cd9cbaf514..6fc6124934 100644 --- a/source/SAMRAI/pdat/EdgeData.h +++ b/source/SAMRAI/pdat/EdgeData.h @@ -395,6 +395,7 @@ class EdgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/pdat/NodeData.h b/source/SAMRAI/pdat/NodeData.h index 62c665fdce..b645d0a297 100644 --- a/source/SAMRAI/pdat/NodeData.h +++ b/source/SAMRAI/pdat/NodeData.h @@ -284,6 +284,16 @@ class NodeData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copyFuseable( + const hier::PatchData& src, + const hier::BoxOverlap& overlap) + { + d_data->startKernelFuser(); + copy(src, overlap); + d_data->stopKernelFuser(); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -362,6 +372,16 @@ class NodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStreamFuseable( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap) const + { + d_data->startKernelFuser(); + packStream(stream, overlap); + d_data->stopKernelFuser(); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be a @@ -374,6 +394,17 @@ class NodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStreamFuseable( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap) + { + d_data->startKernelFuser(); + unpackStream(stream, overlap); + d_data->stopKernelFuser(); + } + + /*! * @brief Fill all values at depth d with the value t. 
* diff --git a/source/SAMRAI/pdat/OuterfaceData.h b/source/SAMRAI/pdat/OuterfaceData.h index d43d00a798..51d324f8f6 100644 --- a/source/SAMRAI/pdat/OuterfaceData.h +++ b/source/SAMRAI/pdat/OuterfaceData.h @@ -415,6 +415,16 @@ class OuterfaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -427,6 +437,16 @@ class OuterfaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/tbox/AllocatorDatabase.cpp b/source/SAMRAI/tbox/AllocatorDatabase.cpp index db7f6f720b..77dc0ad319 100644 --- a/source/SAMRAI/tbox/AllocatorDatabase.cpp +++ b/source/SAMRAI/tbox/AllocatorDatabase.cpp @@ -100,6 +100,16 @@ AllocatorDatabase::initialize() rm.makeAllocator("samrai::stream_allocator", allocator); } + if (!rm.isAllocator("samrai::fuser_allocator")) { +#if defined(HAVE_CUDA) + auto allocator = rm.getAllocator(umpire::resource::Pinned); +#else + auto allocator = rm.getAllocator(umpire::resource::Host); +#endif + + rm.makeAllocator("samrai::fuser_allocator", allocator); + } + if (!rm.isAllocator("samrai::temporary_data_allocator")) { #if defined(HAVE_CUDA) //auto allocator = rm.getAllocator(umpire::resource::Device); @@ -133,6 +143,13 @@ AllocatorDatabase::getStreamAllocator() return umpire::TypedAllocator(rm.getAllocator("samrai::stream_allocator")); } +umpire::TypedAllocator +AllocatorDatabase::getKernelFuserAllocator() +{ + umpire::ResourceManager& rm = umpire::ResourceManager::getInstance(); + return umpire::TypedAllocator(rm.getAllocator("samrai::fuser_allocator")); +} + umpire::TypedAllocator AllocatorDatabase::getInternalHostAllocator() { diff --git a/source/SAMRAI/tbox/AllocatorDatabase.h b/source/SAMRAI/tbox/AllocatorDatabase.h index 311d9c4abc..51869eca73 100644 --- a/source/SAMRAI/tbox/AllocatorDatabase.h +++ b/source/SAMRAI/tbox/AllocatorDatabase.h @@ -88,6 +88,13 @@ class AllocatorDatabase umpire::TypedAllocator getStreamAllocator(); #endif + /*! + * @brief Get the kernel fuser allocator. + */ +#ifdef HAVE_UMPIRE + umpire::TypedAllocator getKernelFuserAllocator(); +#endif + /*! * @brief Get a host allocator. 
*/ diff --git a/source/SAMRAI/tbox/AsyncCommPeer.cpp b/source/SAMRAI/tbox/AsyncCommPeer.cpp index 4cf86de49a..f9c3611147 100644 --- a/source/SAMRAI/tbox/AsyncCommPeer.cpp +++ b/source/SAMRAI/tbox/AsyncCommPeer.cpp @@ -76,6 +76,7 @@ AsyncCommPeer::AsyncCommPeer(): d_external_buf(0), d_internal_buf_size(0), d_internal_buf(0), + d_count_buf(0), d_mpi(SAMRAI_MPI::getSAMRAIWorld()), d_tag0(-1), d_tag1(-1), @@ -119,6 +120,7 @@ AsyncCommPeer::AsyncCommPeer( d_external_buf(0), d_internal_buf_size(0), d_internal_buf(0), + d_count_buf(0), d_mpi(SAMRAI_MPI::getSAMRAIWorld()), d_tag0(-1), d_tag1(-1), @@ -165,6 +167,16 @@ AsyncCommPeer::~AsyncCommPeer() #endif d_internal_buf = 0; } + if (d_count_buf) { +#ifdef HAVE_UMPIRE + d_allocator.deallocate( + (char*)d_count_buf, 2 * sizeof(FlexData)); +#else + free(d_count_buf); +#endif + d_count_buf = 0; + } + d_first_recv_buf = 0; } @@ -570,7 +582,31 @@ AsyncCommPeer::checkRecv( // Post receive for first (and maybe only) chunk of data. const size_t first_chunk_count = getNumberOfFlexData( d_max_first_data_len); - resizeBuffer(first_chunk_count + 2); + + if (first_chunk_count > 0) { + resizeBuffer(first_chunk_count + 2); + d_first_recv_buf = d_internal_buf; + } else { + // If the size of the first chunk is zero, due to + // d_max_first_data_len being set to zero, then we use + // a small buffer to get only the full count size, deferring + // the receipt of the full data to the second Irecv. + if (d_count_buf) { +#ifdef HAVE_UMPIRE + d_allocator.deallocate( + (char*)d_count_buf, 2 * sizeof(FlexData)); +#else + free(d_count_buf); +#endif + } +#ifdef HAVE_UMPIRE + d_count_buf = + (FlexData *)d_allocator.allocate(2 * sizeof(FlexData)); +#else + d_count_buf = (FlexData *)malloc(2 * sizeof(FlexData)); +#endif + d_first_recv_buf = d_count_buf; + } TBOX_ASSERT(req[0] == MPI_REQUEST_NULL); #ifdef DEBUG_CHECK_ASSERTIONS #endif t_recv_timer->start(); d_mpi_err = d_mpi.Irecv( - d_internal_buf, + d_first_recv_buf, static_cast(sizeof(FlexData) * (first_chunk_count + 2)), MPI_BYTE, d_peer_rank, @@ -647,9 +683,9 @@ AsyncCommPeer::checkRecv( TBOX_ASSERT(mpi_status[0].MPI_SOURCE == d_peer_rank); TBOX_ASSERT(req[0] == MPI_REQUEST_NULL); // Get full count embedded in message. - d_full_count = d_internal_buf[count - 1].d_i; + d_full_count = d_first_recv_buf[count - 1].d_i; - TBOX_ASSERT(d_internal_buf[count - 2].d_i == 0); // Sequence number check. + TBOX_ASSERT(d_first_recv_buf[count - 2].d_i == 0); // Sequence number check. TBOX_ASSERT(getNumberOfFlexData(d_full_count) >= count - 2); if (d_full_count > d_max_first_data_len) { @@ -664,7 +700,18 @@ AsyncCommPeer::checkRecv( const size_t second_chunk_count = getNumberOfFlexData( d_full_count - d_max_first_data_len); - resizeBuffer(d_internal_buf_size + second_chunk_count); + size_t new_internal_buf_size = + d_internal_buf_size + second_chunk_count; + + // If the first Irecv didn't use d_internal_buf, then + // the message in the second Irecv will contain the entire + // buffer of data for this communication instance, and we need + // to add 2 to the buffer size to make room for the trailing + // metadata. 
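+ // Hypothetical example: with d_max_first_data_len == 0, the first
+ // Irecv used only the two-item d_count_buf, so d_internal_buf_size
+ // is still zero here. If second_chunk_count works out to 100
+ // FlexData items, the second message also carries the two trailing
+ // metadata items (sequence number and full count), so we resize to
+ // 102 rather than 100.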
+ if (d_internal_buf_size == 0) { + new_internal_buf_size += 2; + } + resizeBuffer(new_internal_buf_size); TBOX_ASSERT(req[1] == MPI_REQUEST_NULL); req[1] = MPI_REQUEST_NULL; @@ -971,6 +1018,16 @@ AsyncCommPeer::clearRecvData() #endif d_internal_buf = 0; } + if (d_count_buf) { +#ifdef HAVE_UMPIRE + d_allocator.deallocate( + (char*)d_count_buf, 2 * sizeof(FlexData)); +#else + free(d_count_buf); +#endif + d_count_buf = 0; + } + d_first_recv_buf = 0; } /* diff --git a/source/SAMRAI/tbox/AsyncCommPeer.h b/source/SAMRAI/tbox/AsyncCommPeer.h index e3a0b09bcc..e8e60eb9d6 100644 --- a/source/SAMRAI/tbox/AsyncCommPeer.h +++ b/source/SAMRAI/tbox/AsyncCommPeer.h @@ -615,7 +615,10 @@ class AsyncCommPeer:public AsyncCommStage::Member * for overhead data. */ size_t d_internal_buf_size; - FlexData* d_internal_buf; + FlexData* d_internal_buf = nullptr; + + FlexData* d_count_buf = nullptr; + FlexData* d_first_recv_buf = nullptr; /*! * diff --git a/source/SAMRAI/tbox/CMakeLists.txt b/source/SAMRAI/tbox/CMakeLists.txt index 3ce0bccd93..8027d882f5 100644 --- a/source/SAMRAI/tbox/CMakeLists.txt +++ b/source/SAMRAI/tbox/CMakeLists.txt @@ -27,6 +27,7 @@ set ( tbox_headers InputDatabase.h InputManager.h IOStream.h + KernelFuser.h Logger.h MathUtilities.h MathUtilities.cpp @@ -48,9 +49,11 @@ set ( tbox_headers SAMRAI_MPI.h SAMRAIManager.h Schedule.h + ScheduleKernelFuser.h Serializable.h SiloDatabase.h SiloDatabaseFactory.h + StagedKernelFusers.h StartupShutdownManager.h Statistic.h Statistician.h @@ -59,6 +62,7 @@ set ( tbox_headers TimerManager.h Tracer.h Transaction.h + TransactionFuseable.h Utilities.h) set_source_files_properties( @@ -88,6 +92,7 @@ set (tbox_sources HDFDatabaseFactory.cpp IEEE.cpp InputManager.cpp + KernelFuser.cpp Logger.cpp MathUtilitiesSpecial.cpp MemoryDatabase.cpp @@ -106,9 +111,11 @@ set (tbox_sources SAMRAI_MPI.cpp Scanner.cpp Schedule.cpp + ScheduleKernelFuser.cpp Serializable.cpp SiloDatabase.cpp SiloDatabaseFactory.cpp + StagedKernelFusers.cpp StartupShutdownManager.cpp StatTransaction.cpp Statistic.cpp @@ -117,6 +124,7 @@ set (tbox_sources TimerManager.cpp Tracer.cpp Transaction.cpp + TransactionFuseable.cpp Utilities.cpp) if (ENABLE_HDF5) @@ -148,9 +156,11 @@ if (ENABLE_RAJA) endif () if (ENABLE_CUDA) - set(cuda_sources GPUUtilities.cpp Schedule.cpp) + set(cuda_sources GPUUtilities.cpp Schedule.cpp TransactionFuseable.cpp) set_source_files_properties(${cuda_sources} PROPERTIES LANGUAGE CUDA) + set (tbox_depends ${tbox_depends} cuda) + if (ENABLE_NVTX_REGIONS) find_package(CUDA REQUIRED) @@ -178,6 +188,9 @@ target_include_directories( SAMRAI_tbox $ $) +blt_print_target_properties( + TARGET SAMRAI_tbox) + install(TARGETS SAMRAI_tbox EXPORT SAMRAITargets diff --git a/source/SAMRAI/tbox/ExecutionPolicy.h b/source/SAMRAI/tbox/ExecutionPolicy.h index 2fc09b52c4..90c70ccc9b 100644 --- a/source/SAMRAI/tbox/ExecutionPolicy.h +++ b/source/SAMRAI/tbox/ExecutionPolicy.h @@ -11,6 +11,8 @@ #ifndef included_tbox_ExecutionPolicy #define included_tbox_ExecutionPolicy +#include "SAMRAI/SAMRAI_config.h" + #if defined(HAVE_RAJA) #include "RAJA/RAJA.hpp" @@ -112,6 +114,11 @@ struct policy_traits { >; using ReductionPolicy = RAJA::cuda_reduce; + + using WorkGroupPolicy = RAJA::WorkGroupPolicy< + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects>; }; #else @@ -146,6 +153,12 @@ struct policy_traits { >; using ReductionPolicy = RAJA::seq_reduce; + + using WorkGroupPolicy = RAJA::WorkGroupPolicy< + RAJA::loop_work, + 
RAJA::reverse_ordered, + RAJA::ragged_array_of_objects>; + }; #endif // HAVE_CUDA diff --git a/source/SAMRAI/tbox/KernelFuser.cpp b/source/SAMRAI/tbox/KernelFuser.cpp new file mode 100644 index 0000000000..6f77f3639f --- /dev/null +++ b/source/SAMRAI/tbox/KernelFuser.cpp @@ -0,0 +1,30 @@ +/************************************************************************* + * + * This file is part of the SAMRAI distribution. For full copyright + * information, see COPYRIGHT and LICENSE. + * + * Copyright: (c) 1997-2021 Lawrence Livermore National Security, LLC + * Description: Singleton kernel fuser + * + ************************************************************************/ + +#include "SAMRAI/tbox/KernelFuser.h" + + +namespace SAMRAI { +namespace tbox { + + +KernelFuser::~KernelFuser() +{ +} + +void +KernelFuser::initialize() +{ +} + + +} +} + diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h new file mode 100644 index 0000000000..8b187cae6e --- /dev/null +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -0,0 +1,116 @@ +#ifndef included_tbox_KernelFuser +#define included_tbox_KernelFuser + +#include "SAMRAI/SAMRAI_config.h" + +#ifdef HAVE_UMPIRE +#include "umpire/ResourceManager.hpp" +#include "umpire/TypedAllocator.hpp" +#endif + +#include "SAMRAI/tbox/ExecutionPolicy.h" +#include "SAMRAI/tbox/AllocatorDatabase.h" +#include "SAMRAI/tbox/Utilities.h" + +#ifdef HAVE_RAJA +#include "RAJA/RAJA.hpp" +#endif + + +#ifdef HAVE_UMPIRE +#include "umpire/Allocator.hpp" +#include "umpire/TypedAllocator.hpp" +#endif + +namespace SAMRAI { +namespace tbox { + +class KernelFuser +{ +public: + + +#ifdef HAVE_RAJA + template + void enqueue(int begin, int end, Kernel&& kernel) { + if (d_launched) { + TBOX_ERROR("KernelFuser Error: Cannot enqueue until cleanup is called after the previous launch."); + } + + d_workpool.enqueue(RAJA::RangeSegment(begin, end), std::forward(kernel)); + } +#endif + + void launch() + { + if (d_launched) { + TBOX_ERROR("KernelFuser Error: This KernelFuser has already been launched."); + } + +#ifdef HAVE_RAJA + if (d_workpool.num_loops() > 0) { + d_workgroup = d_workpool.instantiate(); + d_worksite = d_workgroup.run(); + } + d_launched = true; +#endif + } + + void cleanup() + { +#ifdef HAVE_RAJA + d_workpool.clear(); + d_workgroup.clear(); + d_worksite.clear(); + d_launched = false; +#endif + } + + bool launched() const + { + return d_launched; + } + + void initialize(); + + KernelFuser() : +#ifdef HAVE_RAJA + d_workpool(AllocatorDatabase::getDatabase()->getKernelFuserAllocator()), + d_workgroup(d_workpool.instantiate()), + d_worksite(d_workgroup.run()), +#endif + d_launched(false) + { + } + + + virtual ~KernelFuser(); + + +private: +#ifdef HAVE_UMPIRE + using Allocator = umpire::TypedAllocator; +#else + using Allocator = ResourceAllocator; +#endif + +#ifdef HAVE_RAJA + using Policy = typename tbox::detail::policy_traits< tbox::policy::parallel >::WorkGroupPolicy; + using WorkPool = RAJA::WorkPool , Allocator>; + using WorkGroup = RAJA::WorkGroup, Allocator>; + using WorkSite = RAJA::WorkSite , Allocator>; +#endif + +#ifdef HAVE_RAJA + WorkPool d_workpool; + WorkGroup d_workgroup; + WorkSite d_worksite; +#endif + + bool d_launched; +}; + +} +} + +#endif diff --git a/source/SAMRAI/tbox/Schedule.cpp b/source/SAMRAI/tbox/Schedule.cpp index 72c3021fb6..0e2cfa61da 100644 --- a/source/SAMRAI/tbox/Schedule.cpp +++ b/source/SAMRAI/tbox/Schedule.cpp @@ -10,6 +10,7 @@ #include "SAMRAI/tbox/Schedule.h" #include "SAMRAI/tbox/AllocatorDatabase.h" #include "SAMRAI/tbox/InputManager.h" 
+#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/PIO.h" #include "SAMRAI/tbox/SAMRAIManager.h" #include "SAMRAI/tbox/SAMRAI_MPI.h" @@ -40,7 +41,11 @@ const int Schedule::s_default_second_tag = 1; * MPI communication. This parameter should be dependent on the MPI * implementation. */ +#if defined(HAVE_CUDA) +const size_t Schedule::s_default_first_message_length = 0; +#else const size_t Schedule::s_default_first_message_length = 1000; +#endif const std::string Schedule::s_default_timer_prefix("tbox::Schedule"); std::map Schedule::s_static_timers; @@ -60,7 +65,6 @@ Schedule::s_initialize_finalize_handler( */ Schedule::Schedule(): - d_coms(0), d_com_stage(), d_mpi(SAMRAI_MPI::getSAMRAIWorld()), d_first_tag(s_default_first_tag), @@ -101,13 +105,40 @@ Schedule::addTransaction( const int src_id = transaction->getSourceProcessor(); const int dst_id = transaction->getDestinationProcessor(); + std::shared_ptr fuseable_transaction{ + std::dynamic_pointer_cast(transaction)}; + if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { - d_local_set.push_front(transaction); + if (fuseable_transaction) { + if (!d_local_fusers) { + d_local_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_local_fusers); + d_local_set_fuseable.push_front(fuseable_transaction); + } else { + d_local_set.push_front(transaction); + } } else { if (d_mpi.getRank() == dst_id) { - d_recv_sets[src_id].push_front(transaction); + if (fuseable_transaction) { + if (!d_recv_fusers) { + d_recv_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_recv_fusers); + d_recv_sets_fuseable[src_id].push_front(fuseable_transaction); + } else { + d_recv_sets[src_id].push_front(transaction); + } } else if (d_mpi.getRank() == src_id) { - d_send_sets[dst_id].push_front(transaction); + if (fuseable_transaction) { + if (!d_send_fusers) { + d_send_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_send_fusers); + d_send_sets_fuseable[dst_id].push_front(fuseable_transaction); + } else { + d_send_sets[dst_id].push_front(transaction); + } } } } @@ -126,13 +157,40 @@ Schedule::appendTransaction( const int src_id = transaction->getSourceProcessor(); const int dst_id = transaction->getDestinationProcessor(); + std::shared_ptr fuseable_transaction{ + std::dynamic_pointer_cast(transaction)}; + if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { - d_local_set.push_back(transaction); + if (fuseable_transaction) { + if (!d_local_fusers) { + d_local_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_local_fusers); + d_local_set_fuseable.push_back(fuseable_transaction); + } else { + d_local_set.push_back(transaction); + } } else { if (d_mpi.getRank() == dst_id) { - d_recv_sets[src_id].push_back(transaction); + if (fuseable_transaction) { + if (!d_recv_fusers) { + d_recv_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_recv_fusers); + d_recv_sets_fuseable[src_id].push_back(fuseable_transaction); + } else { + d_recv_sets[src_id].push_back(transaction); + } } else if (d_mpi.getRank() == src_id) { - d_send_sets[dst_id].push_back(transaction); + if (fuseable_transaction) { + if (!d_send_fusers) { + d_send_fusers = StagedKernelFusers::getInstance(); + } + fuseable_transaction->setKernelFuser(d_send_fusers); + d_send_sets_fuseable[dst_id].push_back(transaction); + } else { + d_send_sets[dst_id].push_back(transaction); + } } } } @@ -149,8 +207,13 
@@ Schedule::getNumSendTransactions( int size = 0; TransactionSets::const_iterator mi = d_send_sets.find(rank); if (mi != d_send_sets.end()) { - size = static_cast(mi->second.size()); + size += static_cast(mi->second.size()); + } + mi = d_send_sets_fuseable.find(rank); + if (mi != d_send_sets_fuseable.end()) { + size += static_cast(mi->second.size()); } + return size; } @@ -166,7 +229,11 @@ Schedule::getNumRecvTransactions( int size = 0; TransactionSets::const_iterator mi = d_recv_sets.find(rank); if (mi != d_recv_sets.end()) { - size = static_cast(mi->second.size()); + size += static_cast(mi->second.size()); + } + mi = d_recv_sets_fuseable.find(rank); + if (mi != d_recv_sets_fuseable.end()) { + size += static_cast(mi->second.size()); } return size; } @@ -186,6 +253,7 @@ Schedule::communicate() #endif d_object_timers->t_communicate->start(); + d_completed_transactions = false; beginCommunication(); finalizeCommunication(); d_object_timers->t_communicate->stop(); @@ -226,9 +294,6 @@ Schedule::finalizeCommunication() { d_object_timers->t_finalize_communication->start(); performLocalCopies(); -#if defined(HAVE_RAJA) - parallel_synchronize(); -#endif processCompletedCommunications(); deallocateCommunicationObjects(); d_object_timers->t_finalize_communication->stop(); @@ -245,7 +310,7 @@ Schedule::finalizeCommunication() void Schedule::postReceives() { - if (d_recv_sets.empty()) { + if (d_recv_sets.empty() && d_recv_sets_fuseable.empty()) { /* * Short cut because some looping logic in this method assumes * non-empty d_recv_sets. @@ -253,6 +318,8 @@ Schedule::postReceives() return; } + d_completed_transactions = true; + int rank = d_mpi.getRank(); /* @@ -263,60 +330,87 @@ Schedule::postReceives() * send posted earlier is paired with a receive that is also posted * earlier. */ - AsyncCommPeer* recv_coms = d_coms; + for (CommMap::reverse_iterator comm_peer(d_recv_coms.lower_bound(rank)); + comm_peer != d_recv_coms.rend(); + ++comm_peer) { + const int recv_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; + // Compute incoming message size, if possible. + unsigned int byte_count = 0; + bool can_estimate_incoming_message_size = true; - // Initialize iterators to where we want to start looping. - size_t icom = 0; // Index into recv_coms. - while (icom < d_recv_sets.size() && - recv_coms[icom].getPeerRank() < rank) { - ++icom; - } - icom = icom > 0 ? icom - 1 : d_recv_sets.size() - 1; + for (const auto& t : d_recv_sets[recv_rank] ) { + if (!t->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + break; + } + byte_count += + static_cast(t->computeIncomingMessageSize()); + } - // Map iterator mi corresponds to recv_coms[icom]. - TransactionSets::const_iterator mi = - d_recv_sets.find(recv_coms[icom].getPeerRank()); + for (const auto& t: d_recv_sets_fuseable[recv_rank]) { + if (!t->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + break; + } + byte_count += + static_cast(t->computeIncomingMessageSize()); + } - for (size_t counter = 0; - counter < d_recv_sets.size(); - ++counter, --mi, --icom) { + // Set AsyncCommPeer to receive known message length. + if (can_estimate_incoming_message_size) { + comm->limitFirstDataLength(byte_count); + } - TBOX_ASSERT(mi->first == recv_coms[icom].getPeerRank()); + // Begin non-blocking receive operation. 
+ d_object_timers->t_post_receives->start(); + comm->beginRecv(); + if (comm->isDone()) { + comm->pushToCompletionQueue(); + } + d_object_timers->t_post_receives->stop(); + } + CommMap::reverse_iterator stop(d_recv_coms.lower_bound(rank)); + for (CommMap::reverse_iterator comm_peer = d_recv_coms.rbegin(); comm_peer != stop; ++comm_peer) { + const int recv_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; // Compute incoming message size, if possible. - const std::list >& transactions = - mi->second; unsigned int byte_count = 0; bool can_estimate_incoming_message_size = true; - for (ConstIterator r = transactions.begin(); - r != transactions.end(); ++r) { - if (!(*r)->canEstimateIncomingMessageSize()) { + + for (const auto& t : d_recv_sets[recv_rank] ) { + if (!t->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + break; + } + byte_count += + static_cast(t->computeIncomingMessageSize()); + } + + for (const auto& t: d_recv_sets_fuseable[recv_rank]) { + if (!t->canEstimateIncomingMessageSize()) { can_estimate_incoming_message_size = false; break; } byte_count += - static_cast((*r)->computeIncomingMessageSize()); + static_cast(t->computeIncomingMessageSize()); } // Set AsyncCommPeer to receive known message length. if (can_estimate_incoming_message_size) { - recv_coms[icom].limitFirstDataLength(byte_count); + comm->limitFirstDataLength(byte_count); } // Begin non-blocking receive operation. d_object_timers->t_post_receives->start(); - recv_coms[icom].beginRecv(); - if (recv_coms[icom].isDone()) { - recv_coms[icom].pushToCompletionQueue(); + comm->beginRecv(); + if (comm->isDone()) { + comm->pushToCompletionQueue(); } d_object_timers->t_post_receives->stop(); - - if (mi == d_recv_sets.begin()) { - // Continue loop at the opposite end. - mi = d_recv_sets.end(); - icom = d_recv_sets.size(); - } } + } /* @@ -339,38 +433,100 @@ Schedule::postSends() int rank = d_mpi.getRank(); - AsyncCommPeer* send_coms = d_coms + d_recv_sets.size(); + for (auto comm_peer = d_send_coms.lower_bound(rank); + comm_peer != d_send_coms.end(); + ++comm_peer) { + const int peer_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; - // Initialize iterators to where we want to start looping. - TransactionSets::const_iterator mi = d_send_sets.upper_bound(rank); - size_t icom = 0; // send_coms[icom] corresponds to mi. - while (icom < d_send_sets.size() && - send_coms[icom].getPeerRank() < rank) { - ++icom; - } + size_t byte_count = 0; + bool can_estimate_incoming_message_size = true; + for (const auto& transaction : d_send_sets[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + } + byte_count += transaction->computeOutgoingMessageSize(); + } + + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + } + byte_count += transaction->computeOutgoingMessageSize(); + } + + // Pack outgoing data into a message. + MessageStream outgoing_stream( + byte_count, + MessageStream::Write, + nullptr, + true +#ifdef HAVE_UMPIRE + , AllocatorDatabase::getDatabase()->getStreamAllocator() +#endif + ); + + d_object_timers->t_pack_stream->start(); - for (size_t counter = 0; - counter < d_send_sets.size(); - ++counter, ++mi, ++icom) { + bool have_fuseable = !(d_send_sets_fuseable[peer_rank].empty()); - if (mi == d_send_sets.end()) { - // Continue loop at the opposite end. 
- mi = d_send_sets.begin(); - icom = 0; + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + transaction->packStream(outgoing_stream); } - TBOX_ASSERT(mi->first == send_coms[icom].getPeerRank()); + if (d_send_fusers && have_fuseable) { + d_send_fusers->launch(); + } + + for (const auto& transaction : d_send_sets[peer_rank]) { + transaction->packStream(outgoing_stream); + } + + bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); + + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; +#if defined(HAVE_RAJA) + parallel_synchronize(); + if (d_send_fusers) d_send_fusers->cleanup(); +#endif + } + + d_object_timers->t_pack_stream->stop(); + + if (can_estimate_incoming_message_size) { + // Receiver knows message size so set it exactly. + comm->limitFirstDataLength(byte_count); + } + + // Begin non-blocking send operation. + comm->beginSend( + (const char *)outgoing_stream.getBufferStart(), + static_cast(outgoing_stream.getCurrentSize())); + if (comm->isDone()) { + comm->pushToCompletionQueue(); + } + } + + for (auto comm_peer = d_send_coms.begin(); + comm_peer != d_send_coms.lower_bound(rank); + ++comm_peer) { + const int peer_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; - // Compute message size and whether receiver can estimate it. - const std::list >& transactions = - mi->second; size_t byte_count = 0; bool can_estimate_incoming_message_size = true; - for (ConstIterator pack = transactions.begin(); - pack != transactions.end(); ++pack) { - if (!(*pack)->canEstimateIncomingMessageSize()) { + for (const auto& transaction : d_send_sets[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + } + byte_count += transaction->computeOutgoingMessageSize(); + } + + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { can_estimate_incoming_message_size = false; } - byte_count += (*pack)->computeOutgoingMessageSize(); + byte_count += transaction->computeOutgoingMessageSize(); } // Pack outgoing data into a message. @@ -384,28 +540,42 @@ Schedule::postSends() #endif ); + bool have_fuseable = !(d_send_sets_fuseable[peer_rank].empty()); + d_object_timers->t_pack_stream->start(); - for (ConstIterator pack = transactions.begin(); - pack != transactions.end(); ++pack) { - (*pack)->packStream(outgoing_stream); + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + transaction->packStream(outgoing_stream); } -#if defined(HAVE_RAJA) - parallel_synchronize(); + if (d_send_fusers && have_fuseable) { + d_send_fusers->launch(); + } + + for (const auto& transaction : d_send_sets[peer_rank]) { + transaction->packStream(outgoing_stream); + } + bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); + + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; +#if defined(HAVE_RAJA) + parallel_synchronize(); + if (d_send_fusers) d_send_fusers->cleanup(); #endif + } d_object_timers->t_pack_stream->stop(); if (can_estimate_incoming_message_size) { // Receiver knows message size so set it exactly. - send_coms[icom].limitFirstDataLength(byte_count); + comm->limitFirstDataLength(byte_count); } // Begin non-blocking send operation. 
- send_coms[icom].beginSend( + comm->beginSend( (const char *)outgoing_stream.getBufferStart(), static_cast(outgoing_stream.getCurrentSize())); - if (send_coms[icom].isDone()) { - send_coms[icom].pushToCompletionQueue(); + if (comm->isDone()) { + comm->pushToCompletionQueue(); } } @@ -420,12 +590,30 @@ void Schedule::performLocalCopies() { + bool have_fuseable = !d_local_set_fuseable.empty(); + d_object_timers->t_local_copies->start(); - for (Iterator local = d_local_set.begin(); - local != d_local_set.end(); ++local) { - (*local)->copyLocalData(); + for (const auto& local : d_local_set_fuseable) { + local->copyLocalData(); + } + if (d_local_fusers && have_fuseable) { + d_local_fusers->launch(); + } + + for (const auto& local : d_local_set) { + local->copyLocalData(); } d_object_timers->t_local_copies->stop(); + + bool have_non_fuseable = !d_local_set.empty(); + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; +#if defined(HAVE_RAJA) + parallel_synchronize(); + if (d_local_fusers) d_local_fusers->cleanup(); +#endif + } + } /* @@ -446,39 +634,57 @@ Schedule::processCompletedCommunications() if (d_unpack_in_deterministic_order) { // Unpack in deterministic order. Wait for receive as needed. + // Deterministic order is lowest to highest recv rank int irecv = 0; - for (TransactionSets::iterator recv_itr = d_recv_sets.begin(); - recv_itr != d_recv_sets.end(); ++recv_itr, ++irecv) { - - int sender = recv_itr->first; - AsyncCommPeer& completed_comm = d_coms[irecv]; - TBOX_ASSERT(sender == completed_comm.getPeerRank()); - completed_comm.completeCurrentOperation(); - completed_comm.yankFromCompletionQueue(); + for (auto& comms : d_recv_coms) { + auto& completed_comm = comms.second; + int sender = comms.first; + TBOX_ASSERT(sender == completed_comm->getPeerRank()); + completed_comm->completeCurrentOperation(); + completed_comm->yankFromCompletionQueue(); MessageStream incoming_stream( - static_cast(completed_comm.getRecvSize()) * sizeof(char), + static_cast(completed_comm->getRecvSize()) * sizeof(char), MessageStream::Read, - completed_comm.getRecvData(), + completed_comm->getRecvData(), false /* don't use deep copy */ #ifdef HAVE_UMPIRE , AllocatorDatabase::getDatabase()->getStreamAllocator() #endif ); + + bool have_fuseable = !(d_recv_sets_fuseable[sender].empty()); + d_object_timers->t_unpack_stream->start(); - for (Iterator recv = d_recv_sets[sender].begin(); - recv != d_recv_sets[sender].end(); ++recv) { - (*recv)->unpackStream(incoming_stream); + for (const auto& transaction : d_recv_sets_fuseable[sender]) { + transaction->unpackStream(incoming_stream); + } + if (d_recv_fusers && have_fuseable) { + d_recv_fusers->launch(); } #if defined(HAVE_RAJA) - parallel_synchronize(); + if (have_fuseable) { + parallel_synchronize(); + if (d_recv_fusers) d_recv_fusers->cleanup(); + } +#endif + for (const auto& transaction : d_recv_sets[sender]) { + transaction->unpackStream(incoming_stream); + } + bool have_non_fuseable = !(d_recv_sets[sender].empty()); +#if defined(HAVE_RAJA) + if (have_non_fuseable) { + parallel_synchronize(); + } #endif + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; + } d_object_timers->t_unpack_stream->stop(); - completed_comm.clearRecvData(); - + completed_comm->clearRecvData(); } // Complete sends. 
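For context, the pack, local-copy, and unpack paths above all follow the same staged pattern: fuseable transactions only enqueue their kernels on a StagedKernelFusers instance (handed to them via setKernelFuser() when they were added to the schedule), the fusers are launched once per peer, and cleanup happens after a device synchronize. A condensed sketch of that lifecycle, using only names introduced in this diff (the container name fuseable_transactions is hypothetical):

   // Stage: each fuseable transaction enqueues kernels instead of running them.
   tbox::StagedKernelFusers* fusers = tbox::StagedKernelFusers::getInstance();
   for (const auto& transaction : fuseable_transactions) {
      transaction->copyLocalData();   // or packStream()/unpackStream()
   }
   fusers->launch();         // run all staged loop bodies as fused kernels
   parallel_synchronize();   // wait for device completion before reuse
   fusers->cleanup();        // reset the fusers for the next phase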
@@ -499,7 +705,7 @@ Schedule::processCompletedCommunications()
       TBOX_ASSERT(completed_comm != 0);
       TBOX_ASSERT(completed_comm->isDone());

-      if (static_cast<size_t>(completed_comm - d_coms) < num_senders) {
+      if (!completed_comm->isSender()) {

          const int sender = completed_comm->getPeerRank();

@@ -513,14 +719,34 @@ Schedule::processCompletedCommunications()
 #endif
          );

+         bool have_fuseable = !(d_recv_sets_fuseable[sender].empty());
+
          d_object_timers->t_unpack_stream->start();
-         for (Iterator recv = d_recv_sets[sender].begin();
-              recv != d_recv_sets[sender].end(); ++recv) {
-            (*recv)->unpackStream(incoming_stream);
+         for (const auto& transaction : d_recv_sets_fuseable[sender]) {
+            transaction->unpackStream(incoming_stream);
+         }
+         if (d_recv_fusers && have_fuseable) {
+            d_recv_fusers->launch();
          }
 #if defined(HAVE_RAJA)
-         parallel_synchronize();
+         if (have_fuseable) {
+            parallel_synchronize();
+            if (d_recv_fusers) d_recv_fusers->cleanup();
+         }
+#endif
+         for (const auto& transaction : d_recv_sets[sender]) {
+            transaction->unpackStream(incoming_stream);
+         }
+         bool have_non_fuseable = !(d_recv_sets[sender].empty());
+#if defined(HAVE_RAJA)
+         if (have_non_fuseable) {
+            parallel_synchronize();
+         }
 #endif
+         if (have_fuseable || have_non_fuseable) {
+            d_completed_transactions = true;
+         }
+
          d_object_timers->t_unpack_stream->stop();

          completed_comm->clearRecvData();
       } else {
@@ -542,37 +768,69 @@ Schedule::processCompletedCommunications()
 void
 Schedule::allocateCommunicationObjects()
 {
-   const size_t length = d_recv_sets.size() + d_send_sets.size();
-   if (length > 0) {
-      d_coms = new AsyncCommPeer<char>[length];
+   for (const auto& transaction : d_recv_sets) {
+      int rank = transaction.first;
+
+      auto peer = std::make_shared<AsyncCommPeer<char>>();
+      peer->initialize(&d_com_stage);
+      peer->setPeerRank(rank);
+      peer->setMPITag(d_first_tag, d_second_tag);
+      peer->setMPI(d_mpi);
+      peer->limitFirstDataLength(d_first_message_length);
+#ifdef HAVE_UMPIRE
+      peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
+#endif
+      d_recv_coms[rank] = peer;
    }
-   size_t counter = 0;
-   for (TransactionSets::iterator ti = d_recv_sets.begin();
-        ti != d_recv_sets.end();
-        ++ti) {
-      d_coms[counter].initialize(&d_com_stage);
-      d_coms[counter].setPeerRank(ti->first);
-      d_coms[counter].setMPITag(d_first_tag, d_second_tag);
-      d_coms[counter].setMPI(d_mpi);
-      d_coms[counter].limitFirstDataLength(d_first_message_length);
+   for (const auto& transaction : d_recv_sets_fuseable) {
+      int rank = transaction.first;
+
+      if (d_recv_coms.find(rank) == d_recv_coms.end()) {
+         auto peer = std::make_shared<AsyncCommPeer<char>>();
+         peer->initialize(&d_com_stage);
+         peer->setPeerRank(rank);
+         peer->setMPITag(d_first_tag, d_second_tag);
+         peer->setMPI(d_mpi);
+         peer->limitFirstDataLength(d_first_message_length);
 #ifdef HAVE_UMPIRE
-      d_coms[counter].setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
+         peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
 #endif
-      ++counter;
+         d_recv_coms[rank] = peer;
+      }
    }
-   for (TransactionSets::iterator ti = d_send_sets.begin();
-        ti != d_send_sets.end();
-        ++ti) {
-      d_coms[counter].initialize(&d_com_stage);
-      d_coms[counter].setPeerRank(ti->first);
-      d_coms[counter].setMPITag(d_first_tag, d_second_tag);
-      d_coms[counter].setMPI(d_mpi);
-      d_coms[counter].limitFirstDataLength(d_first_message_length);
+
+   for (const auto& transaction : d_send_sets) {
+      int rank = transaction.first;
+      auto peer = std::make_shared<AsyncCommPeer<char>>();
+
+      peer->initialize(&d_com_stage);
+      peer->setPeerRank(rank);
+      peer->setMPITag(d_first_tag, d_second_tag);
+      peer->setMPI(d_mpi);
+      peer->limitFirstDataLength(d_first_message_length);
 #ifdef HAVE_UMPIRE
-      d_coms[counter].setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
+      peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
 #endif
-      ++counter;
+      d_send_coms[rank] = peer;
+   }
+
+   for (const auto& transaction : d_send_sets_fuseable) {
+      int rank = transaction.first;
+
+      if (d_send_coms.find(rank) == d_send_coms.end()) {
+         auto peer = std::make_shared<AsyncCommPeer<char>>();
+
+         peer->initialize(&d_com_stage);
+         peer->setPeerRank(rank);
+         peer->setMPITag(d_first_tag, d_second_tag);
+         peer->setMPI(d_mpi);
+         peer->limitFirstDataLength(d_first_message_length);
+#ifdef HAVE_UMPIRE
+         peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator());
+#endif
+         d_send_coms[rank] = peer;
+      }
    }
 }
diff --git a/source/SAMRAI/tbox/Schedule.h b/source/SAMRAI/tbox/Schedule.h
index 058e050ab0..bd436bef5a 100644
--- a/source/SAMRAI/tbox/Schedule.h
+++ b/source/SAMRAI/tbox/Schedule.h
@@ -17,6 +17,9 @@
 #include "SAMRAI/tbox/SAMRAI_MPI.h"
 #include "SAMRAI/tbox/MessageStream.h"
 #include "SAMRAI/tbox/Transaction.h"
+#include "SAMRAI/tbox/TransactionFuseable.h"
+#include "SAMRAI/tbox/KernelFuser.h"
+#include "SAMRAI/tbox/StagedKernelFusers.h"

 #include
 #include
@@ -275,7 +278,7 @@ class Schedule
    bool
    allocatedCommunicationObjects()
    {
-      return d_coms != 0;
+      return (d_recv_coms.size() > 0 || d_send_coms.size() > 0);
    }

    /*!
@@ -287,16 +290,19 @@ class Schedule
       return "Schedule";
    }

+   bool completedTransactions() const
+   {
+      return d_completed_transactions;
+   }
+
 private:
    void
    allocateCommunicationObjects();
    void
    deallocateCommunicationObjects()
    {
-      if (d_coms) {
-         delete[] d_coms;
-      }
-      d_coms = 0;
+      d_send_coms.clear();
+      d_recv_coms.clear();
    }

    void
@@ -344,12 +350,28 @@ class Schedule
    TransactionSets d_send_sets;
    TransactionSets d_recv_sets;

+   TransactionSets d_send_sets_fuseable;
+   TransactionSets d_recv_sets_fuseable;
+
+   StagedKernelFusers* d_send_fusers{nullptr};
+   StagedKernelFusers* d_recv_fusers{nullptr};
+
+   bool d_completed_transactions = false;
+
    /*
    * @brief Transactions where the source and destination are the
    * local process.
    */
   std::list<std::shared_ptr<Transaction> > d_local_set;

+   /*
+   * @brief Fuseable transactions where the source and destination are the
+   * local process.
+   */
+   std::list<std::shared_ptr<Transaction> > d_local_set_fuseable;
+
+   StagedKernelFusers* d_local_fusers{nullptr};
+
   //@{ @name High-level asynchronous messages passing objects

   /*!
@@ -359,7 +381,10 @@ class Schedule
-    * d_coms is typed for byte sending because our data is of
-    * unknown mixed type.
+    * The peers are typed for byte sending because our data is of
+    * unknown mixed type.
     */
-   AsyncCommPeer<char>* d_coms;
+   using CommMap = std::map<int, std::shared_ptr<AsyncCommPeer<char>>>;
+   CommMap d_send_coms;
+   CommMap d_recv_coms;
+
   /*!
    * @brief Stage for advancing communication operations to
    * completion.
diff --git a/source/SAMRAI/tbox/ScheduleKernelFuser.cpp b/source/SAMRAI/tbox/ScheduleKernelFuser.cpp
new file mode 100644
index 0000000000..3b2883c9bb
--- /dev/null
+++ b/source/SAMRAI/tbox/ScheduleKernelFuser.cpp
@@ -0,0 +1,66 @@
+/*************************************************************************
+ *
+ * This file is part of the SAMRAI distribution.  For full copyright
+ * information, see COPYRIGHT and LICENSE.
+ *
+ * Copyright:     (c) 1997-2021 Lawrence Livermore National Security, LLC
+ * Description:   Singleton kernel fuser
+ *
+ ************************************************************************/
+
+#include "SAMRAI/tbox/ScheduleKernelFuser.h"
+
+
+namespace SAMRAI {
+namespace tbox {
+
+ScheduleKernelFuser* ScheduleKernelFuser::s_schedule_kernel_fuser_instance(nullptr);
+
+StartupShutdownManager::Handler
+ScheduleKernelFuser::s_startup_handler(
+   0,
+   ScheduleKernelFuser::startupCallback,
+   0,
+   0,
+   tbox::StartupShutdownManager::priorityArenaManager);
+
+void
+ScheduleKernelFuser::startupCallback()
+{
+   ScheduleKernelFuser::getInstance()->initialize();
+}
+
+void
+ScheduleKernelFuser::shutdownCallback()
+{
+   if (s_schedule_kernel_fuser_instance) {
+      delete s_schedule_kernel_fuser_instance;
+   }
+   s_schedule_kernel_fuser_instance = nullptr;
+}
+
+ScheduleKernelFuser *
+ScheduleKernelFuser::getInstance()
+{
+   if (!s_schedule_kernel_fuser_instance) {
+      s_schedule_kernel_fuser_instance = new ScheduleKernelFuser();
+   }
+   return s_schedule_kernel_fuser_instance;
+}
+
+ScheduleKernelFuser::~ScheduleKernelFuser()
+{
+   delete d_kernel_fuser;
+   d_kernel_fuser = nullptr;
+}
+
+void
+ScheduleKernelFuser::initialize()
+{
+   d_kernel_fuser = new KernelFuser();
+}
+
+
+}
+}
+
diff --git a/source/SAMRAI/tbox/ScheduleKernelFuser.h b/source/SAMRAI/tbox/ScheduleKernelFuser.h
new file mode 100644
index 0000000000..9e07f0cd27
--- /dev/null
+++ b/source/SAMRAI/tbox/ScheduleKernelFuser.h
@@ -0,0 +1,70 @@
+#ifndef included_tbox_ScheduleKernelFuser
+#define included_tbox_ScheduleKernelFuser
+
+#include "SAMRAI/SAMRAI_config.h"
+
+#include "SAMRAI/tbox/KernelFuser.h"
+
+#ifdef HAVE_RAJA
+#include "RAJA/RAJA.hpp"
+#endif
+
+
+namespace SAMRAI {
+namespace tbox {
+
+class ScheduleKernelFuser
+{
+public:
+
+   static ScheduleKernelFuser* getInstance();
+
+#ifdef HAVE_RAJA
+   template <typename Kernel>
+   void enqueue(int stage, int begin, int end, Kernel&& kernel) {
+      d_kernel_fuser->enqueue(begin, end, kernel);
+   }
+#endif
+
+   void launch()
+   {
+      d_kernel_fuser->launch();
+   }
+
+   void cleanup()
+   {
+      d_kernel_fuser->cleanup();
+   }
+
+   KernelFuser* getFuser()
+   {
+      return d_kernel_fuser;
+   }
+
+   void initialize();
+
+protected:
+   ScheduleKernelFuser()
+   {
+   }
+
+   virtual ~ScheduleKernelFuser();
+
+
+private:
+
+   static void startupCallback();
+   static void shutdownCallback();
+
+   static ScheduleKernelFuser* s_schedule_kernel_fuser_instance;
+
+   static StartupShutdownManager::Handler
+      s_startup_handler;
+
+   KernelFuser* d_kernel_fuser = nullptr;
+};
+
+}
+}
+
+#endif
diff --git a/source/SAMRAI/tbox/StagedKernelFusers.cpp b/source/SAMRAI/tbox/StagedKernelFusers.cpp
new file mode 100644
index 0000000000..57d885227c
--- /dev/null
+++ b/source/SAMRAI/tbox/StagedKernelFusers.cpp
@@ -0,0 +1,63 @@
+/*************************************************************************
+ *
+ * This file is part of the SAMRAI distribution.  For full copyright
+ * information, see COPYRIGHT and LICENSE.
+ *
+ * Copyright:     (c) 1997-2021 Lawrence Livermore National Security, LLC
+ * Description:   Singleton container of staged kernel fusers
+ *
+ ************************************************************************/
+
+#include "SAMRAI/tbox/StagedKernelFusers.h"
+
+
+namespace SAMRAI {
+namespace tbox {
+
+StagedKernelFusers* StagedKernelFusers::s_staged_kernel_fusers_instance(nullptr);
+
+StartupShutdownManager::Handler
+StagedKernelFusers::s_startup_handler(
+   0,
+   StagedKernelFusers::startupCallback,
+   0,
+   0,
+   tbox::StartupShutdownManager::priorityArenaManager);
+
+void
+StagedKernelFusers::startupCallback()
+{
+   StagedKernelFusers::getInstance()->initialize();
+}
+
+void
+StagedKernelFusers::shutdownCallback()
+{
+   if (s_staged_kernel_fusers_instance) {
+      delete s_staged_kernel_fusers_instance;
+   }
+   s_staged_kernel_fusers_instance = nullptr;
+}
+
+StagedKernelFusers *
+StagedKernelFusers::getInstance()
+{
+   if (!s_staged_kernel_fusers_instance) {
+      s_staged_kernel_fusers_instance = new StagedKernelFusers();
+   }
+   return s_staged_kernel_fusers_instance;
+}
+
+StagedKernelFusers::~StagedKernelFusers()
+{
+}
+
+void
+StagedKernelFusers::initialize()
+{
+}
+
+
+}
+}
+
diff --git a/source/SAMRAI/tbox/StagedKernelFusers.h b/source/SAMRAI/tbox/StagedKernelFusers.h
new file mode 100644
index 0000000000..919886b33d
--- /dev/null
+++ b/source/SAMRAI/tbox/StagedKernelFusers.h
@@ -0,0 +1,107 @@
+#ifndef included_tbox_StagedKernelFusers
+#define included_tbox_StagedKernelFusers
+
+#include "SAMRAI/SAMRAI_config.h"
+
+#include "SAMRAI/tbox/Collectives.h"
+#include "SAMRAI/tbox/KernelFuser.h"
+
+#include <map>
+
+#ifdef HAVE_RAJA
+#include "RAJA/RAJA.hpp"
+#endif
+
+
+namespace SAMRAI {
+namespace tbox {
+
+class StagedKernelFusers
+{
+public:
+
+   static StagedKernelFusers* getInstance();
+
+#ifdef HAVE_RAJA
+   template <typename Kernel>
+   void enqueue(int stage, int begin, int end, Kernel&& kernel) {
+      d_kernel_fusers[stage].enqueue(begin, end, kernel);
+   }
+#endif
+
+   void launch()
+   {
+      for (auto& fuser : d_kernel_fusers) {
+         fuser.second.launch();
+         d_launched = (d_launched || fuser.second.launched());
+      }
+   }
+
+   void cleanup()
+   {
+      for (auto& fuser : d_kernel_fusers) {
+         fuser.second.cleanup();
+      }
+      d_launched = false;
+   }
+
+   KernelFuser* getFuser(int stage)
+   {
+      return &d_kernel_fusers[stage];
+   }
+
+   void clearKernelFuser(int stage)
+   {
+      d_kernel_fusers.erase(stage);
+   }
+
+   void clearAllFusers()
+   {
+      d_kernel_fusers.clear();
+      d_launched = false;
+   }
+
+   bool launched()
+   {
+      return d_launched;
+   }
+
+   void initialize();
+
+   void launchAndCleanup()
+   {
+      launch();
+#ifdef HAVE_RAJA
+      if (d_launched) {
+         tbox::parallel_synchronize();
+      }
+#endif
+      cleanup();
+   }
+
+protected:
+   StagedKernelFusers()
+   {
+   }
+
+   virtual ~StagedKernelFusers();
+
+
+private:
+
+   static void startupCallback();
+   static void shutdownCallback();
+
+   static StagedKernelFusers* s_staged_kernel_fusers_instance;
+
+   static StartupShutdownManager::Handler
+      s_startup_handler;
+
+   std::map<int, KernelFuser> d_kernel_fusers;
+
+   bool d_launched = false;
+
+};
+
+}
+}
+
+#endif
diff --git a/source/SAMRAI/tbox/TransactionFuseable.cpp b/source/SAMRAI/tbox/TransactionFuseable.cpp
new file mode 100644
index 0000000000..f70d83f42f
--- /dev/null
+++ b/source/SAMRAI/tbox/TransactionFuseable.cpp
@@ -0,0 +1,19 @@
+#include "SAMRAI/tbox/TransactionFuseable.h"
+
+namespace SAMRAI {
+namespace tbox {
+
+void
+TransactionFuseable::setKernelFuser(StagedKernelFusers* fuser)
+{
+   d_fuser = fuser;
+}
+
+StagedKernelFusers*
+TransactionFuseable::getKernelFuser()
+{
+   return d_fuser;
+}
+
+}
+}
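
StagedKernelFusers generalizes the single fuser to one fuser per integer stage. Because std::map iterates its keys in ascending order, launch() flushes stage 0 before stage 1, giving a cheap ordering guarantee between batches. Below is a self-contained model of that container; ToyStagedFusers is a hypothetical name, and the real class holds tbox::KernelFuser values and synchronizes via tbox::parallel_synchronize() rather than running std::function callbacks.

#include <functional>
#include <iostream>
#include <map>
#include <vector>

// Illustrative stand-in for the staged container: one kernel queue per
// stage, flushed in ascending stage order by a single launch() call.
class ToyStagedFusers {
public:
   void enqueue(int stage, std::function<void()> kernel) {
      d_stages[stage].push_back(std::move(kernel));
   }
   void launch() {
      // std::map iterates keys in ascending order, so stage 0 kernels
      // always run before stage 1 kernels, and so on.
      for (auto& stage : d_stages) {
         for (auto& kernel : stage.second) {
            kernel();
         }
         d_launched = d_launched || !stage.second.empty();
      }
   }
   bool launched() const { return d_launched; }
   void cleanup() {
      d_stages.clear();
      d_launched = false;
   }
private:
   std::map<int, std::vector<std::function<void()>>> d_stages;
   bool d_launched = false;
};

int main() {
   ToyStagedFusers fusers;
   fusers.enqueue(1, [] { std::cout << "stage 1 kernel\n"; });
   fusers.enqueue(0, [] { std::cout << "stage 0 kernel\n"; });
   fusers.launch();    // prints the stage 0 kernel first, then stage 1
   if (fusers.launched()) {
      // This is where the real class would parallel_synchronize().
   }
   fusers.cleanup();   // ready for the next communication phase
   return 0;
}
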
diff --git a/source/SAMRAI/tbox/TransactionFuseable.h b/source/SAMRAI/tbox/TransactionFuseable.h
new file mode 100644
index 0000000000..686268680b
--- /dev/null
+++ b/source/SAMRAI/tbox/TransactionFuseable.h
@@ -0,0 +1,38 @@
+/*************************************************************************
+ *
+ * This file is part of the SAMRAI distribution.  For full copyright
+ * information, see COPYRIGHT and LICENSE.
+ *
+ * Copyright:     (c) 1997-2021 Lawrence Livermore National Security, LLC
+ * Description:   Base class for fuseable schedule transactions
+ *
+ ************************************************************************/
+
+#ifndef included_tbox_TransactionFuseable
+#define included_tbox_TransactionFuseable
+
+#include "SAMRAI/SAMRAI_config.h"
+
+#include "SAMRAI/tbox/Transaction.h"
+#include "SAMRAI/tbox/StagedKernelFusers.h"
+
+#include
+
+namespace SAMRAI {
+namespace tbox {
+
+class TransactionFuseable :
+   public Transaction
+{
+public:
+   void setKernelFuser(StagedKernelFusers* fuser);
+   StagedKernelFusers* getKernelFuser();
+
+private:
+   StagedKernelFusers* d_fuser{nullptr};
+};
+
+}
+}
+
+#endif
diff --git a/source/SAMRAI/xfer/CoarsenCopyTransaction.cpp b/source/SAMRAI/xfer/CoarsenCopyTransaction.cpp
index 470f3554bc..f8a287079f 100644
--- a/source/SAMRAI/xfer/CoarsenCopyTransaction.cpp
+++ b/source/SAMRAI/xfer/CoarsenCopyTransaction.cpp
@@ -131,7 +131,7 @@ CoarsenCopyTransaction::packStream(
    tbox::MessageStream& stream)
 {
    d_src_patch->getPatchData(d_coarsen_data[d_item_id]->d_src)
-   ->packStream(stream, *d_overlap);
+   ->packStreamFuseable(stream, *d_overlap);
 }

 void
@@ -139,7 +139,7 @@ CoarsenCopyTransaction::unpackStream(
    tbox::MessageStream& stream)
 {
    d_dst_patch->getPatchData(d_coarsen_data[d_item_id]->d_dst)
-   ->unpackStream(stream, *d_overlap);
+   ->unpackStreamFuseable(stream, *d_overlap);
 }

 void
@@ -151,7 +151,7 @@ CoarsenCopyTransaction::copyLocalData()
    const hier::PatchData& src_data =
       *d_src_patch->getPatchData(d_coarsen_data[d_item_id]->d_src);

-   dst_data.copy(src_data, *d_overlap);
+   dst_data.copyFuseable(src_data, *d_overlap);
 }

 /*
diff --git a/source/SAMRAI/xfer/CoarsenCopyTransaction.h b/source/SAMRAI/xfer/CoarsenCopyTransaction.h
index 2676619f6e..48588106f4 100644
--- a/source/SAMRAI/xfer/CoarsenCopyTransaction.h
+++ b/source/SAMRAI/xfer/CoarsenCopyTransaction.h
@@ -14,7 +14,7 @@

 #include "SAMRAI/SAMRAI_config.h"

-#include "SAMRAI/tbox/Transaction.h"
+#include "SAMRAI/tbox/TransactionFuseable.h"
 #include "SAMRAI/hier/BaseGridGeometry.h"
 #include "SAMRAI/hier/PatchLevel.h"
 #include "SAMRAI/xfer/CoarsenClasses.h"
@@ -40,7 +40,7 @@ namespace xfer {
  * @see tbox::Transaction
  */

-class CoarsenCopyTransaction:public tbox::Transaction
+class CoarsenCopyTransaction:public tbox::TransactionFuseable
 {
 public:
    /*!
diff --git a/source/SAMRAI/xfer/CoarsenPatchStrategy.h b/source/SAMRAI/xfer/CoarsenPatchStrategy.h
index 5a087d6693..b282db4d46 100644
--- a/source/SAMRAI/xfer/CoarsenPatchStrategy.h
+++ b/source/SAMRAI/xfer/CoarsenPatchStrategy.h
@@ -139,6 +139,43 @@ class CoarsenPatchStrategy
      const hier::Box& coarse_box,
      const hier::IntVector& ratio) = 0;

+   virtual void
+   setPostCoarsenSyncFlag()
+   {
+      setNeedCoarsenSynchronize(true);
+   }
+
+   /*!
+    * @brief Check whether host-device synchronization is needed.
+    *
+    * Returns the current value of the flag while setting the flag back to
+    * the default value of true.
+    */
+   bool
+   needSynchronize()
+   {
+      bool flag = d_need_synchronize;
+      d_need_synchronize = true;
+      return flag;
+   }
+
+protected:
+
+   /*!
+    * @brief Set flag indicating whether device synchronization is needed
+    * after a child class operation.
+    *
+    * This allows implementations of methods such as preprocessCoarsen and
+    * postprocessCoarsen to set the flag to false if they have done nothing
+    * that requires host-device synchronization and do not need
+    * CoarsenSchedule to call the synchronize routine.
+    */
+   void
+   setNeedCoarsenSynchronize(bool flag)
+   {
+      d_need_synchronize = flag;
+   }
+
 private:
   /*!
    * @brief Get the set of CoarsenPatchStrategy objects that have been
@@ -163,6 +200,8 @@ class CoarsenPatchStrategy
      current_objects.insert(this);
   }

+   bool d_need_synchronize = true;
+
 };

 }
diff --git a/source/SAMRAI/xfer/CoarsenSchedule.cpp b/source/SAMRAI/xfer/CoarsenSchedule.cpp
index 904715a62d..c7c19d7f0d 100644
--- a/source/SAMRAI/xfer/CoarsenSchedule.cpp
+++ b/source/SAMRAI/xfer/CoarsenSchedule.cpp
@@ -313,7 +315,9 @@ CoarsenSchedule::coarsenData() const
    d_schedule->communicate();

 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// if (d_schedule->completedTransactions()) {
+//    tbox::parallel_synchronize();
+// }
 #endif

    /*
@@ -1023,10 +1025,13 @@ CoarsenSchedule::coarsenSourceData(
       patch_strategy->preprocessCoarsen(*temp_patch, *fine_patch, box, block_ratio);
 #if defined(HAVE_RAJA)
-      tbox::parallel_synchronize();
+      if (patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
 #endif
    }

+   bool need_sync = false;
    for (size_t ici = 0; ici < d_number_coarsen_items; ++ici) {
       const CoarsenClasses::Data * const crs_item = d_coarsen_items[ici];
@@ -1035,22 +1040,39 @@ CoarsenSchedule::coarsenSourceData(
          crs_item->d_opcoarsen->coarsen(*temp_patch, *fine_patch,
            source_id, source_id,
            box, block_ratio);
+         need_sync = true;
       }
    }
+
 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// if (need_sync) {
+//    tbox::parallel_synchronize();
+// }
 #endif
+   NULL_USE(need_sync);

    if (patch_strategy) {
+      d_coarsen_patch_strategy->setPostCoarsenSyncFlag();
+#if defined(HAVE_RAJA)
+      if (d_coarsen_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
+#endif
+
       patch_strategy->postprocessCoarsen(*temp_patch,
         *fine_patch,
         box,
         block_ratio);
-#if defined(HAVE_RAJA)
-      tbox::parallel_synchronize();
-#endif
    }
 }
+
+   tbox::StagedKernelFusers::getInstance()->launch();
+#if defined(HAVE_RAJA)
+   if (!patch_strategy || patch_strategy->needSynchronize() || tbox::StagedKernelFusers::getInstance()->launched()) {
+      tbox::parallel_synchronize();
+   }
+#endif
+   tbox::StagedKernelFusers::getInstance()->cleanup();
+
 }

 /*
diff --git a/source/SAMRAI/xfer/RefineCopyTransaction.cpp b/source/SAMRAI/xfer/RefineCopyTransaction.cpp
index 0684dc21b2..deb939d8d5 100644
--- a/source/SAMRAI/xfer/RefineCopyTransaction.cpp
+++ b/source/SAMRAI/xfer/RefineCopyTransaction.cpp
@@ -131,7 +131,7 @@ RefineCopyTransaction::packStream(
    tbox::MessageStream& stream)
 {
    d_src_patch->getPatchData(d_refine_data[d_item_id]->d_src)
-   ->packStream(stream, *d_overlap);
+   ->packStreamFuseable(stream, *d_overlap);
 }

 void
@@ -139,7 +139,7 @@ RefineCopyTransaction::unpackStream(
    tbox::MessageStream& stream)
 {
    d_dst_patch->getPatchData(d_refine_data[d_item_id]->d_scratch)
-   ->unpackStream(stream, *d_overlap);
+   ->unpackStreamFuseable(stream, *d_overlap);
 }

 void
@@ -151,7 +151,7 @@ RefineCopyTransaction::copyLocalData()
    const hier::PatchData& src_data =
       *d_src_patch->getPatchData(d_refine_data[d_item_id]->d_src);

-   dst_data.copy(src_data, *d_overlap);
+   dst_data.copyFuseable(src_data, *d_overlap);
 }

 /*
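
The needSynchronize() protocol introduced above, and mirrored in RefinePatchStrategy and SingularityPatchStrategy below, is deliberately destructive: reading the flag re-arms it to true, so skipping one sync can never suppress a later, unrelated one. A minimal stand-alone model of the handshake (ToyStrategy, DeviceStrategy, and runScheduleStep are illustrative names only, not SAMRAI classes):

#include <iostream>

// Hypothetical mirror of the strategy/schedule handshake: the schedule
// asks the strategy whether the callback it just ran queued device
// work, and skips the (expensive) sync when it did not.
class ToyStrategy {
public:
   virtual ~ToyStrategy() {}
   // Default callback does nothing on the device, so it opts out.
   virtual void preprocess() { setNeedSynchronize(false); }
   bool needSynchronize() {
      bool flag = d_need_synchronize;
      d_need_synchronize = true;   // re-arm: stay conservative
      return flag;
   }
protected:
   void setNeedSynchronize(bool flag) { d_need_synchronize = flag; }
private:
   bool d_need_synchronize = true;
};

class DeviceStrategy : public ToyStrategy {
   void preprocess() override {
      // Pretend we launched device kernels; leave the flag armed.
   }
};

void runScheduleStep(ToyStrategy& s) {
   s.preprocess();
   if (s.needSynchronize()) {
      std::cout << "parallel_synchronize()\n";
   } else {
      std::cout << "sync skipped\n";
   }
}

int main() {
   ToyStrategy host_only;
   DeviceStrategy device;
   runScheduleStep(host_only);   // prints: sync skipped
   runScheduleStep(device);      // prints: parallel_synchronize()
   return 0;
}
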
diff --git a/source/SAMRAI/xfer/RefineCopyTransaction.h b/source/SAMRAI/xfer/RefineCopyTransaction.h
index c2cce76270..e3a0ba8278 100644
--- a/source/SAMRAI/xfer/RefineCopyTransaction.h
+++ b/source/SAMRAI/xfer/RefineCopyTransaction.h
@@ -14,7 +14,7 @@

 #include "SAMRAI/SAMRAI_config.h"

-#include "SAMRAI/tbox/Transaction.h"
+#include "SAMRAI/tbox/TransactionFuseable.h"
 #include "SAMRAI/hier/BaseGridGeometry.h"
 #include "SAMRAI/hier/PatchLevel.h"
 #include "SAMRAI/xfer/RefineClasses.h"
@@ -40,7 +40,7 @@ namespace xfer {
  * @see tbox::Transaction
  */

-class RefineCopyTransaction:public tbox::Transaction
+class RefineCopyTransaction:public tbox::TransactionFuseable
 {
 public:
    /*!
diff --git a/source/SAMRAI/xfer/RefinePatchStrategy.h b/source/SAMRAI/xfer/RefinePatchStrategy.h
index a5f79e7e43..8ac8069264 100644
--- a/source/SAMRAI/xfer/RefinePatchStrategy.h
+++ b/source/SAMRAI/xfer/RefinePatchStrategy.h
@@ -265,6 +265,7 @@ class RefinePatchStrategy
      NULL_USE(coarse_to_unfilled);
      NULL_USE(overlaps);
      NULL_USE(refine_items);
+      setNeedRefineSynchronize(false);
    }

    /*!
@@ -290,6 +291,44 @@ class RefinePatchStrategy
      NULL_USE(coarse_level);
      NULL_USE(coarse_to_fine);
      NULL_USE(coarse_to_unfilled);
+      setNeedRefineSynchronize(false);
+   }
+
+   virtual void
+   setPostRefineSyncFlag()
+   {
+      setNeedRefineSynchronize(true);
+   }
+
+   /*!
+    * @brief Check whether host-device synchronization is needed.
+    *
+    * Returns the current value of the flag while setting the flag back to
+    * the default value of true.
+    */
+   bool
+   needSynchronize()
+   {
+      bool flag = d_need_synchronize;
+      d_need_synchronize = true;
+      return flag;
+   }
+
+protected:
+
+   /*!
+    * @brief Set flag indicating whether device synchronization is needed
+    * after a child class operation.
+    *
+    * This allows implementations of methods such as preprocessRefine and
+    * postprocessRefine to set the flag to false if they have done nothing
+    * that requires host-device synchronization and do not need
+    * RefineSchedule to call the synchronize routine.
+    */
+   void
+   setNeedRefineSynchronize(bool flag)
+   {
+      d_need_synchronize = flag;
    }

 private:
@@ -327,6 +366,8 @@ class RefinePatchStrategy
      current_objects.erase(this);
    }

+   bool d_need_synchronize = true;
+
 };

 }
diff --git a/source/SAMRAI/xfer/RefineSchedule.cpp b/source/SAMRAI/xfer/RefineSchedule.cpp
index 8423d04b17..023f1b4b3d 100644
--- a/source/SAMRAI/xfer/RefineSchedule.cpp
+++ b/source/SAMRAI/xfer/RefineSchedule.cpp
@@ -29,6 +29,7 @@
 #include "SAMRAI/tbox/MathUtilities.h"
 #include "SAMRAI/tbox/InputManager.h"
 #include "SAMRAI/tbox/OpenMPUtilities.h"
+#include "SAMRAI/tbox/StagedKernelFusers.h"
 #include "SAMRAI/tbox/StartupShutdownManager.h"
 #include "SAMRAI/tbox/TimerManager.h"
 #include "SAMRAI/tbox/Utilities.h"
@@ -2098,9 +2099,11 @@ RefineSchedule::fillData(
     * space.
     */

-   copyScratchToDestination();
+   bool copied = copyScratchToDestination();
 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+   if (copied) {
+      tbox::parallel_synchronize();
+   }
 #endif

    /*
@@ -2149,6 +2152,8 @@ RefineSchedule::recursiveFill(
    double fill_time,
    bool do_physical_boundary_fill) const
 {
    /*
     * Copy data from the source interiors of the source level into the ghost
     * cells and interiors of the scratch space on the destination level
     */
    d_coarse_priority_level_schedule->communicate();
 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// TODO: Be sure that this sync isn't needed.
+// if (d_coarse_priority_level_schedule->completedTransactions()) {
+//    tbox::parallel_synchronize();
+// }
 #endif

    /*
@@ -2213,12 +2221,12 @@ RefineSchedule::recursiveFill(
     * Recursively call the fill routine to fill the required coarse fill
     * boxes on the coarser level.
     */
-
    d_coarse_interp_schedule->recursiveFill(fill_time, do_physical_boundary_fill);

 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// TODO: This sync probably isn't necessary, but keep an eye on it.
+//   tbox::parallel_synchronize();
 #endif

    /*
@@ -2338,7 +2346,10 @@ RefineSchedule::recursiveFill(
     */
    d_fine_priority_level_schedule->communicate();
 #if defined(HAVE_RAJA)
-   tbox::parallel_synchronize();
+// TODO: Be sure that this sync isn't needed.
+// if (d_fine_priority_level_schedule->completedTransactions()) {
+//    tbox::parallel_synchronize();
+// }
 #endif

    /*
@@ -2373,6 +2384,9 @@ RefineSchedule::fillPhysicalBoundaries(
    d_dst_level->setBoundaryBoxes();

    if (d_refine_patch_strategy) {
+#if defined(HAVE_RAJA)
+      bool bdry_is_filled = false;
+#endif
      for (hier::PatchLevel::iterator p(d_dst_level->begin());
           p != d_dst_level->end(); ++p) {
         const std::shared_ptr<hier::Patch>& patch(*p);
@@ -2381,8 +2395,17 @@
            setPhysicalBoundaryConditions(*patch, fill_time,
              d_boundary_fill_ghost_width);
+#if defined(HAVE_RAJA)
+            bdry_is_filled = true;
+#endif
         }
      }
+#if defined(HAVE_RAJA)
+      if (bdry_is_filled && d_refine_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
+#endif
+
    }
    t_fill_physical_boundaries->stop();
 }
@@ -2416,7 +2439,9 @@ RefineSchedule::fillSingularityBoundaries(
    const hier::IntVector& ratio = d_dst_level->getRatioToLevelZero();

    if (d_singularity_patch_strategy) {
-
+#if defined(HAVE_RAJA)
+      bool sing_is_filled = false;
+#endif
      for (hier::BlockId::block_t bn = 0;
           bn < grid_geometry->getNumberBlocks(); ++bn) {
         hier::BlockId block_id(bn);
@@ -2465,6 +2490,9 @@
                  d_dst_to_encon,
                  fill_box, nboxes[bb],
                  grid_geometry);
+#if defined(HAVE_RAJA)
+               sing_is_filled = true;
+#endif
            }
         }
      }
@@ -2492,6 +2520,9 @@
                  d_dst_to_encon,
                  fill_box, eboxes[bb],
                  grid_geometry);
+#if defined(HAVE_RAJA)
+               sing_is_filled = true;
+#endif
            }
         }
      }
@@ -2500,6 +2531,11 @@
            }
         }
      }
+#if defined(HAVE_RAJA)
+      if (sing_is_filled && d_singularity_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
+#endif
    }
    }
    t_fill_singularity_boundaries->stop();
@@ -2604,10 +2640,11 @@ RefineSchedule::allocateWorkSpace(
 **************************************************************************
 */

-void
+bool
 RefineSchedule::copyScratchToDestination() const
 {
    TBOX_ASSERT(d_dst_level);
+   bool copied = false;

    for (hier::PatchLevel::iterator p(d_dst_level->begin());
         p != d_dst_level->end(); ++p) {
@@ -2621,11 +2658,12 @@
           getPatchData(dst_id)->getTime(),
           patch->getPatchData(src_id)->getTime()));
         patch->getPatchData(dst_id)->copy(*patch->getPatchData(src_id));
+         copied = true;
      }
    }

-}
+   return copied;
+}

 /*
@@ -2647,6 +2685,7 @@ RefineSchedule::refineScratchData(
    overlaps) const
 {
    t_refine_scratch_data->start();

 #ifdef DEBUG_CHECK_ASSERTIONS
    bool is_encon = (fine_level == d_encon_level);
@@ -2661,6 +2700,11 @@
         coarse_to_unfilled,
         overlaps,
         d_refine_items);
+#if defined(HAVE_RAJA)
+      if (d_refine_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
+#endif
    }

    const hier::IntVector ratio(fine_level->getRatioToLevelZero()
@@ -2713,7 +2757,9 @@ RefineSchedule::refineScratchData(
           fill_boxes,
           local_ratio);
 #if defined(HAVE_RAJA)
-         tbox::parallel_synchronize();
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
 #endif
      }

@@ -2729,20 +2775,24 @@ RefineSchedule::refineScratchData(
            ref_item->d_oprefine->refine(*fine_patch, *crse_patch,
              scratch_id,
              scratch_id,
              *refine_overlap, local_ratio);
-         }
      }
-#if defined(HAVE_RAJA)
-      parallel_synchronize();
-#endif

      if (d_refine_patch_strategy) {
+         d_refine_patch_strategy->setPostRefineSyncFlag();
+#if defined(HAVE_RAJA)
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
+#endif
         d_refine_patch_strategy->postprocessRefineBoxes(*fine_patch,
           *crse_patch,
           fill_boxes,
           local_ratio);
 #if defined(HAVE_RAJA)
-         tbox::parallel_synchronize();
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
 #endif
      }

@@ -2773,12 +2823,14 @@ RefineSchedule::refineScratchData(
        d_nbr_blk_fill_level->getPatch(unfilled_id));

      if (d_refine_patch_strategy) {
-            d_refine_patch_strategy->preprocessRefineBoxes(*nbr_fill_patch,
+         d_refine_patch_strategy->preprocessRefineBoxes(*nbr_fill_patch,
           *crse_patch,
           fill_boxes,
           local_ratio);
 #if defined(HAVE_RAJA)
-         tbox::parallel_synchronize();
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
 #endif
      }

@@ -2808,7 +2860,9 @@ RefineSchedule::refineScratchData(
           fill_boxes,
           local_ratio);
 #if defined(HAVE_RAJA)
-         tbox::parallel_synchronize();
+         if (d_refine_patch_strategy->needSynchronize()) {
+            tbox::parallel_synchronize();
+         }
 #endif
      }

@@ -2837,6 +2891,12 @@ RefineSchedule::refineScratchData(
      }
    }

+   tbox::StagedKernelFusers::getInstance()->launch();
+#if defined(HAVE_RAJA)
+   tbox::parallel_synchronize();
+#endif
+   tbox::StagedKernelFusers::getInstance()->cleanup();
+
    if (d_refine_patch_strategy) {
      d_refine_patch_strategy->postprocessRefineLevel(
        *fine_level,
        coarse_to_fine,
        coarse_to_unfilled);
 #if defined(HAVE_RAJA)
-      tbox::parallel_synchronize();
+      if (d_refine_patch_strategy->needSynchronize()) {
+         tbox::parallel_synchronize();
+      }
 #endif
    }

diff --git a/source/SAMRAI/xfer/RefineSchedule.h b/source/SAMRAI/xfer/RefineSchedule.h
index 0dd10c8c93..cf1335e865 100644
--- a/source/SAMRAI/xfer/RefineSchedule.h
+++ b/source/SAMRAI/xfer/RefineSchedule.h
@@ -534,9 +534,11 @@ class RefineSchedule
     * If the scratch and destination patch data components are the same,
     * then no copying is performed.
     *
+    * @return True only if copies were performed.
+    *
     * @pre d_dst_level
     */
-   void
+   bool
    copyScratchToDestination() const;

    /*!
diff --git a/source/SAMRAI/xfer/SingularityPatchStrategy.h b/source/SAMRAI/xfer/SingularityPatchStrategy.h
index 144a656b05..0124f258f5 100644
--- a/source/SAMRAI/xfer/SingularityPatchStrategy.h
+++ b/source/SAMRAI/xfer/SingularityPatchStrategy.h
@@ -90,6 +90,27 @@ class SingularityPatchStrategy
      const hier::BoundaryBox& boundary_box,
      const std::shared_ptr<hier::BaseGridGeometry>& grid_geometry) = 0;

+   bool
+   needSynchronize()
+   {
+      bool flag = d_need_synchronize;
+      d_need_synchronize = true;
+      return flag;
+   }
+
+protected:
+
+   void
+   setNeedSingularitySynchronize(bool flag)
+   {
+      d_need_synchronize = flag;
+   }
+
+private:
+
+   bool d_need_synchronize = true;
+
 };

 }
diff --git a/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt b/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt
index 2d329d7d69..a524991ca4 100644
--- a/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt
+++ b/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt
@@ -8,7 +8,8 @@ blt_add_executable(
   SAMRAI_hier
   SAMRAI_geom
   SAMRAI_mesh
-  SAMRAI_tbox)
+  SAMRAI_tbox
+  cuda)

 target_compile_definitions(mblcu PUBLIC TESTING=1)
diff --git a/source/test/MappingConnector/CMakeLists.txt b/source/test/MappingConnector/CMakeLists.txt
index 4359563198..2b7e2386eb 100644
--- a/source/test/MappingConnector/CMakeLists.txt
+++ b/source/test/MappingConnector/CMakeLists.txt
@@ -8,7 +8,8 @@ blt_add_executable(
   SAMRAI_hier
   SAMRAI_geom
   SAMRAI_mesh
-  SAMRAI_tbox)
+  SAMRAI_tbox
+  cuda)

 target_compile_definitions(mapping-connector PUBLIC TESTING=1)
diff --git a/source/test/mblktree/CMakeLists.txt b/source/test/mblktree/CMakeLists.txt
index 4abe346d13..88e6349abe 100644
--- a/source/test/mblktree/CMakeLists.txt
+++ b/source/test/mblktree/CMakeLists.txt
@@ -5,7 +5,8 @@ set (mblktree_depends_on
   SAMRAI_hier
   SAMRAI_geom
   SAMRAI_mesh
-  SAMRAI_tbox)
+  SAMRAI_tbox
+  cuda)

 blt_add_executable(
   NAME mblktree
diff --git a/source/test/sparsedata/CMakeLists.txt b/source/test/sparsedata/CMakeLists.txt
index f14417c13c..f61867c5ab 100644
--- a/source/test/sparsedata/CMakeLists.txt
+++ b/source/test/sparsedata/CMakeLists.txt
@@ -23,7 +23,8 @@ blt_add_executable(
   SAMRAI_tbox
   SAMRAI_hier
   SAMRAI_pdat
-   SAMRAI_geom)
+   SAMRAI_geom
+   cuda)

 target_compile_definitions(sparse PUBLIC TESTING=1)
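
One property worth noting across all of these changes is that fusion is strictly opt-in: a transaction with no fuser installed packs and unpacks exactly as before, while one with a fuser only enqueues its kernels and relies on the schedule to launch the batch afterwards. The sketch below models that nullptr dispatch; ToyTransaction and Fuser are hypothetical stand-ins for TransactionFuseable and the fuser classes, not SAMRAI code.

#include <functional>
#include <iostream>
#include <vector>

// Hypothetical mirror of the fuseable-transaction dispatch: with no
// fuser installed a transaction runs its kernel immediately; with one
// installed it only enqueues, and the batch is launched later.
using Fuser = std::vector<std::function<void()>>;

class ToyTransaction {
public:
   explicit ToyTransaction(int id) : d_id(id) {}
   void setFuser(Fuser* fuser) { d_fuser = fuser; }
   void packStream() {
      auto body = [id = d_id]() { std::cout << "pack " << id << '\n'; };
      if (d_fuser == nullptr) {
         body();                    // unfused: run the kernel right away
      } else {
         d_fuser->push_back(body);  // fused: defer to the batch
      }
   }
private:
   int d_id;
   Fuser* d_fuser = nullptr;
};

int main() {
   Fuser fuser;
   ToyTransaction t0(0), t1(1);
   t0.setFuser(&fuser);
   t1.setFuser(&fuser);
   t0.packStream();
   t1.packStream();   // nothing printed yet: both bodies are queued
   for (auto& k : fuser) {
      k();            // one fused "launch" runs both kernels
   }
   return 0;
}
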