From 3ad6204d06d50f7d237da33b550f11707a6e55ee Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Thu, 20 Jul 2023 14:47:26 -0600
Subject: [PATCH 01/10] Adds the merge-based spmv behind an opt-in
 `controls.setParameter("algorithm", "merge")`.

This SpMV is up to 200x faster than the existing native implementation for matrices with highly skewed row lengths.

* Adds `KokkosSparse::Impl::diagonal_search` to support the merge-based SpMV
* Moves `KokkosKernels_Iota.hpp` into the `impl` directory
* Removes the static assert for `Kokkos::View` and `Kokkos::Iota` from `lower_bound` and friends, since they can also operate on things that appear like a view but are not present in the common component
* Changes `KokkosSparse::Impl::MergeMatrixDiagonal::MatrixPosition` -> `KokkosSparse::Impl::MergeMatrixPosition`. Various `MergeMatrixDiagonals` have the same `MatrixPosition` type, which were technically different because they were scoped.
---
 common/{src => impl}/KokkosKernels_Iota.hpp   |   0
 common/src/KokkosKernels_LowerBound.hpp       |  22 +-
 .../KokkosSparse_merge_matrix.hpp}            | 132 ++++++-
 sparse/impl/KokkosSparse_spmv_impl.hpp        |  35 +-
 sparse/impl/KokkosSparse_spmv_impl_merge.hpp  | 373 ++++++++++++++++++
 sparse/impl/KokkosSparse_spmv_struct_impl.hpp |  14 +-
 sparse/unit_test/Test_Sparse_MergeMatrix.hpp  |  40 +-
 sparse/unit_test/Test_Sparse_spmv.hpp         |  75 ++--
 8 files changed, 588 insertions(+), 103 deletions(-)
 rename common/{src => impl}/KokkosKernels_Iota.hpp (100%)
 rename sparse/{src/KokkosSparse_MergeMatrix.hpp => impl/KokkosSparse_merge_matrix.hpp} (56%)
 create mode 100644 sparse/impl/KokkosSparse_spmv_impl_merge.hpp
diff --git a/common/src/KokkosKernels_Iota.hpp b/common/impl/KokkosKernels_Iota.hpp
similarity index 100%
rename from common/src/KokkosKernels_Iota.hpp
rename to common/impl/KokkosKernels_Iota.hpp
diff --git a/common/src/KokkosKernels_LowerBound.hpp b/common/src/KokkosKernels_LowerBound.hpp
index 160bd496f3..e091932453 100644
--- a/common/src/KokkosKernels_LowerBound.hpp
+++ b/common/src/KokkosKernels_LowerBound.hpp
@@ -77,8 +77,8 @@ namespace Impl {
 
 /*! \brief Single-thread sequential lower-bound search
 
-    \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota
-    \tparam Pred a binary predicate function
+    \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or
+   KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function
     \param view the view to search
     \param value the value to search for
     \param pred a binary predicate function
@@ -96,9 +96,6 @@ lower_bound_sequential_thread(
   using size_type = typename ViewLike::size_type;
   static_assert(1 == ViewLike::rank,
                 "lower_bound_sequential_thread requires rank-1 views");
-  static_assert(is_iota_v<ViewLike> || Kokkos::is_view<ViewLike>::value,
-                "lower_bound_sequential_thread requires a "
-                "KokkosKernels::Impl::Iota or a Kokkos::View");
 
   size_type i = 0;
   while (i < view.size() && pred(view(i), value)) {
@@ -109,8 +106,8 @@ lower_bound_sequential_thread(
 
 /*! \brief Single-thread binary lower-bound search
 
-    \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota
-    \tparam Pred a binary predicate function
+    \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or
+   KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function
     \param view the view to search
     \param value the value to search for
     \param pred a binary predicate function
@@ -127,9 +124,6 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread(
   using size_type = typename ViewLike::size_type;
   static_assert(1 == ViewLike::rank,
                 "lower_bound_binary_thread requires rank-1 views");
-  static_assert(is_iota_v<ViewLike> || Kokkos::is_view<ViewLike>::value,
-                "lower_bound_binary_thread requires a "
-                "KokkosKernels::Impl::Iota or a Kokkos::View");
 
   size_type lo = 0;
   size_type hi = view.size();
@@ -150,8 +144,8 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread(
 
 /*! \brief single-thread lower-bound search
 
-    \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota
-    \tparam Pred a binary predicate function
+    \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or
+   KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function
     \param view the view to search
     \param value the value to search for
     \param pred a binary predicate function
@@ -168,10 +162,6 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_thread(
     Pred pred = Pred()) {
   static_assert(1 == ViewLike::rank,
                 "lower_bound_thread requires rank-1 views");
-  static_assert(KokkosKernels::Impl::is_iota_v<ViewLike> ||
-                    Kokkos::is_view<ViewLike>::value,
-                "lower_bound_thread requires a "
-                "KokkosKernels::Impl::Iota or a Kokkos::View");
   /*
      sequential search makes on average 0.5 * view.size memory accesses
      binary search makes log2(view.size)+1 accesses
diff --git a/sparse/src/KokkosSparse_MergeMatrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp
similarity index 56%
rename from sparse/src/KokkosSparse_MergeMatrix.hpp
rename to sparse/impl/KokkosSparse_merge_matrix.hpp
index d573a5550f..83dfe42e84 100644
--- a/sparse/src/KokkosSparse_MergeMatrix.hpp
+++ b/sparse/impl/KokkosSparse_merge_matrix.hpp
@@ -20,13 +20,20 @@
 #include <type_traits>
 
 #include "KokkosKernels_Iota.hpp"
+#include "KokkosKernels_LowerBound.hpp"
+#include "KokkosKernels_Predicates.hpp"
 #include "KokkosKernels_SafeCompare.hpp"
 
-/// \file KokkosSparse_MergeMatrix.hpp
+/// \file KokkosSparse_merge_matrix.hpp
 
-namespace KokkosSparse {
-namespace Experimental {
-namespace Impl {
+namespace KokkosSparse::Impl {
+
+// a joint index into a and b
+template <typename AIndex, typename BIndex>
+struct MergeMatrixPosition {
+  AIndex ai;
+  BIndex bi;
+};
 
 /*! \class MergeMatrixDiagonal
     \brief a view into the entries of the Merge Matrix along a diagonal
@@ -88,14 +95,7 @@ class MergeMatrixDiagonal {
   using a_value_type = typename AView::non_const_value_type;
   using b_value_type = typename BViewLike::non_const_value_type;
 
-  /*! \struct MatrixPosition
-   *  \brief indices into the a_ and b_ views.
-   */
-  struct MatrixPosition {
-    a_index_type ai;
-    b_index_type bi;
-  };
-  using position_type = MatrixPosition;
+  using position_type = MergeMatrixPosition<a_index_type, b_index_type>;
 
   // implement bare minimum parts of the view interface
   enum { rank = 1 };
@@ -145,9 +145,9 @@ class MergeMatrixDiagonal {
   KOKKOS_INLINE_FUNCTION
   bool operator()(const size_type di) const {
     position_type pos = diag_to_a_b(di);
-    if (pos.ai >= a_.size()) {
+    if (size_t(pos.ai) >= a_.size()) {
       return true;  // on the +a side out of matrix bounds is 1
-    } else if (pos.bi >= b_.size()) {
+    } else if (size_t(pos.bi) >= b_.size()) {
       return false;  // on the +b side out of matrix bounds is 0
     } else {
       return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi));
@@ -192,8 +192,106 @@ class MergeMatrixDiagonal {
   size_type d_;  ///< diagonal
 };
 
-}  // namespace Impl
-}  // namespace Experimental
-}  // namespace KokkosSparse
+/*! \brief Return the first index on diagonal \code diag
+           in the merge matrix of \code a and \code b that is not 1
+This is effectively a lower-bound search on the merge matrix diagonal
+where the predicate is "equals 1"
+*/
+template <typename AView, typename BViewLike>
+KOKKOS_INLINE_FUNCTION
+    typename MergeMatrixDiagonal<AView, BViewLike>::position_type
+    diagonal_search(
+        const AView &a, const BViewLike &b,
+        typename MergeMatrixDiagonal<AView, BViewLike>::size_type diag) {
+  // unmanaged view types for a and b
+  using um_a_view =
+      Kokkos::View<typename AView::value_type *, typename AView::device_type,
+                   Kokkos::MemoryUnmanaged>;
+  using um_b_view =
+      Kokkos::View<typename BViewLike::value_type *,
+                   typename BViewLike::device_type, Kokkos::MemoryUnmanaged>;
+
+  um_a_view ua(a.data(), a.size());
+
+  // if BViewLike is an Iota, pass it on directly to MMD,
+  // otherwise, create an unmanaged view of B
+  using b_type =
+      typename std::conditional<KokkosKernels::Impl::is_iota<BViewLike>::value,
+                                BViewLike, um_b_view>::type;
+
+  using MMD = MergeMatrixDiagonal<um_a_view, b_type>;
+  MMD mmd;
+  if constexpr (KokkosKernels::Impl::is_iota<BViewLike>::value) {
+    mmd = MMD(ua, b, diag);
+  } else {
+    b_type ub(b.data(), b.size());
+    mmd = MMD(ua, ub, diag);
+  }
+
+  // returns index of the first element that does not satisfy pred(element,
+  // value) our input view is the merge matrix entry along the diagonal, and we
+  // want the first one that is not true. so our predicate just tells us if the
+  // merge matrix diagonal entry is equal to true or not
+  const typename MMD::size_type idx = KokkosKernels::lower_bound_thread(
+      mmd, true, KokkosKernels::Equal<bool>());
+  return mmd.position(idx);
+}
+
+template <typename TeamMember, typename AView, typename BViewLike>
+KOKKOS_INLINE_FUNCTION
+    typename MergeMatrixDiagonal<AView, BViewLike>::position_type
+    diagonal_search(
+        const TeamMember &handle, const AView &a, const BViewLike &b,
+        typename MergeMatrixDiagonal<AView, BViewLike>::size_type diag) {
+  // unmanaged view types for a and b
+  using um_a_view =
+      Kokkos::View<typename AView::value_type *, typename AView::device_type,
+                   Kokkos::MemoryUnmanaged>;
+  using um_b_view =
+      Kokkos::View<typename BViewLike::value_type *,
+                   typename BViewLike::device_type, Kokkos::MemoryUnmanaged>;
+
+  um_a_view ua(a.data(), a.size());
+
+  // if BViewLike is an Iota, pass it on directly to MMD,
+  // otherwise, create an unmanaged view of B
+  using b_type =
+      typename std::conditional<KokkosKernels::Impl::is_iota<BViewLike>::value,
+                                BViewLike, um_b_view>::type;
+
+  using MMD = MergeMatrixDiagonal<um_a_view, b_type>;
+  MMD mmd;
+  if constexpr (KokkosKernels::Impl::is_iota<BViewLike>::value) {
+    mmd = MMD(ua, b, diag);
+  } else {
+    b_type ub(b.data(), b.size());
+    mmd = MMD(ua, ub, diag);
+  }
+
+  // returns index of the first element that does not satisfy pred(element,
+  // value) our input view is the merge matrix entry along the diagonal, and we
+  // want the first one that is not true. so our predicate just tells us if the
+  // merge matrix diagonal entry is equal to true or not
+  const typename MMD::size_type idx = KokkosKernels::lower_bound_team(
+      handle, mmd, true, KokkosKernels::Equal<bool>());
+  return mmd.position(idx);
+}
+
+/*! \brief
+
+    \return A MergeMatrixDiagonal::position_type
+ */
+template <typename View>
+KOKKOS_INLINE_FUNCTION auto diagonal_search(
+    const View &a, typename View::non_const_value_type totalWork,
+    typename View::size_type diag) {
+  using value_type = typename View::non_const_value_type;
+  using size_type  = typename View::size_type;
+
+  KokkosKernels::Impl::Iota<value_type, size_type> iota(totalWork);
+  return diagonal_search(a, iota, diag);
+}
+
+}  // namespace KokkosSparse::Impl
 
 #endif  // KOKKOSSPARSE_MERGEMATRIX_HPP
diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp
index d00808558f..16717c6e62 100644
--- a/sparse/impl/KokkosSparse_spmv_impl.hpp
+++ b/sparse/impl/KokkosSparse_spmv_impl.hpp
@@ -17,17 +17,22 @@
 #ifndef KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_
 #define KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_
 
+#include <sstream>
+
 #include "KokkosKernels_Controls.hpp"
 #include "Kokkos_InnerProductSpaceTraits.hpp"
 #include "KokkosBlas1_scal.hpp"
 #include "KokkosKernels_ExecSpaceUtils.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
 #include "KokkosSparse_spmv_impl_omp.hpp"
+#include "KokkosSparse_spmv_impl_merge.hpp"
 #include "KokkosKernels_Error.hpp"
 
 namespace KokkosSparse {
 namespace Impl {
 
+constexpr const char* KOKKOSSPARSE_ALG_MERGE = "merge";
+
 // This TransposeFunctor is functional, but not necessarily performant.
 template <class execution_space, class AMatrix, class XVector, class YVector,
           bool conjugate>
@@ -629,11 +634,21 @@ static void spmv_beta(const execution_space& exec,
                       typename YVector::const_value_type& beta,
                       const YVector& y) {
   if (mode[0] == NoTranspose[0]) {
-    spmv_beta_no_transpose<execution_space, AMatrix, XVector, YVector, dobeta,
-                           false>(exec, controls, alpha, A, x, beta, y);
+    if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) {
+      SpmvMergeHierarchical<execution_space, AMatrix, XVector, YVector>::spmv(exec, mode, alpha, A, x,
+                                                             beta, y);
+    } else {
+      spmv_beta_no_transpose<execution_space, XVector, YVector, dobeta, false>(
+          exec, controls, alpha, A, x, beta, y);
+    }
   } else if (mode[0] == Conjugate[0]) {
-    spmv_beta_no_transpose<execution_space, AMatrix, XVector, YVector, dobeta,
-                           true>(exec, controls, alpha, A, x, beta, y);
+    if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) {
+      SpmvMergeHierarchical<execution_space, AMatrix, XVector, YVector>::spmv(exec, mode, alpha, A, x,
+                                                             beta, y);
+    } else {
+      spmv_beta_no_transpose<execution_space, AMatrix, XVector, YVector, dobeta, true>(
+          exec, controls, alpha, A, x, beta, y);
+    }
   } else if (mode[0] == Transpose[0]) {
     spmv_beta_transpose<execution_space, AMatrix, XVector, YVector, dobeta,
                         false>(exec, alpha, A, x, beta, y);
@@ -641,8 +656,10 @@ static void spmv_beta(const execution_space& exec,
     spmv_beta_transpose<execution_space, AMatrix, XVector, YVector, dobeta,
                         true>(exec, alpha, A, x, beta, y);
   } else {
-    KokkosKernels::Impl::throw_runtime_exception(
-        "Invalid Transpose Mode for KokkosSparse::spmv()");
+    std::stringstream ss;
+    ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode
+       << " for KokkosSparse::spmv()";
+    KokkosKernels::Impl::throw_runtime_exception(ss.str());
   }
 }
 
@@ -1460,8 +1477,10 @@ static void spmv_alpha_beta_mv(
                                  doalpha, dobeta, true>(exec, alpha, A, x, beta,
                                                         y);
   } else {
-    KokkosKernels::Impl::throw_runtime_exception(
-        "Invalid Transpose Mode for KokkosSparse::spmv()");
+    std::stringstream ss;
+    ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode
+       << " for KokkosSparse::spmv()";
+    KokkosKernels::Impl::throw_runtime_exception(ss.str());
   }
 }
 
diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
new file mode 100644
index 0000000000..7ed09ee477
--- /dev/null
+++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
@@ -0,0 +1,373 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP
+#define KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP
+
+#include <sstream>
+
+#include "KokkosKernels_Iota.hpp"
+#include "KokkosKernels_AlwaysFalse.hpp"
+
+#include "KokkosSparse_merge_matrix.hpp"
+
+namespace KokkosSparse::Impl {
+
+/*! \brief Merge-based SpMV
+  Hierarchical GPU implementation
+  Each team uses MergePath search to find the non-zeros and rows it is
+  responsible for Each thread in the team similarly uses diagonal search within
+  the team to determine which entries it will be responsible for
+  The threads then atomically accumulate partial produces
+*/
+template <class AMatrix, class XVector, class YVector>
+struct SpmvMergeHierarchical {
+  using device_type    = typename YVector::device_type;
+  using exec_space     = typename device_type::execution_space;
+  using y_value_type   = typename YVector::non_const_value_type;
+  using x_value_type   = typename XVector::non_const_value_type;
+  using A_value_type   = typename AMatrix::non_const_value_type;
+  using A_ordinal_type = typename AMatrix::non_const_ordinal_type;
+  using A_size_type    = typename AMatrix::non_const_size_type;
+  using row_map_non_const_value_type =
+      typename AMatrix::row_map_type::non_const_value_type;
+
+  using policy_type = Kokkos::TeamPolicy<exec_space>;
+  using team_member = typename policy_type::member_type;
+
+  using um_row_map_type =
+      Kokkos::View<typename AMatrix::row_map_type::data_type,
+                   typename AMatrix::row_map_type::device_type::memory_space,
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+  using row_map_scratch_type =
+      Kokkos::View<typename AMatrix::row_map_type::non_const_data_type,
+                   typename exec_space::scratch_memory_space,
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+  using iota_type = KokkosKernels::Impl::Iota<A_size_type, A_size_type>;
+
+  using DSR = typename KokkosSparse::Impl::MergeMatrixDiagonal<
+      um_row_map_type, iota_type>::position_type;
+
+  using KAT = Kokkos::ArithTraits<A_value_type>;
+
+  // results of a lower-bound and upper-bound diagonal search
+  struct Chunk {
+    DSR lb;  // lower bound
+    DSR ub;  // upper bound
+  };
+
+  template <bool NONZEROS_USE_SCRATCH, bool ROWENDS_USE_SCRATCH,
+            bool Y_USE_SCRATCH, bool CONJ>
+  struct Functor {
+    Functor(const y_value_type& _alpha, const AMatrix& _A, const XVector& _x,
+            const YVector& _y, const A_size_type pathLengthThreadChunk)
+        : alpha(_alpha),
+          A(_A),
+          x(_x),
+          y(_y),
+          pathLengthThreadChunk_(pathLengthThreadChunk) {}
+
+    y_value_type alpha;
+    AMatrix A;
+    XVector x;
+    YVector y;
+    A_size_type pathLengthThreadChunk_;
+
+    KOKKOS_INLINE_FUNCTION void operator()(const team_member& thread) const {
+      const A_size_type pathLengthTeamChunk =
+          thread.team_size() * pathLengthThreadChunk_;
+
+      const A_size_type pathLength = A.numRows() + A.nnz();
+      const A_size_type teamD =
+          thread.league_rank() * pathLengthTeamChunk;  // diagonal
+      const A_size_type teamDEnd =
+          KOKKOSKERNELS_MACRO_MIN(teamD + pathLengthTeamChunk, pathLength);
+
+      // iota(i) -> i
+      iota_type iota(A.nnz());
+
+      // remove leading 0 from row_map
+      um_row_map_type rowEnds(&A.graph.row_map(1), A.graph.row_map.size() - 1);
+
+      // compiler thinks these are "used" in team_broadcast below, so initialize
+      // them with something to silence the warning
+      DSR lb{};
+      DSR ub{};
+
+      // thread 0 does the lower bound, thread 1 does the upper bound
+      if (0 == thread.team_rank() || 1 == thread.team_rank()) {
+        const A_size_type d = thread.team_rank() ? teamDEnd : teamD;
+        DSR dsr             = diagonal_search(rowEnds, iota, d);
+        if (0 == thread.team_rank()) {
+          lb = dsr;
+        }
+        if (1 == thread.team_rank()) {
+          ub = dsr;
+        }
+      }
+      thread.team_broadcast(lb, 0);
+      thread.team_broadcast(ub, 1);
+      const A_size_type teamNnzBegin =
+          lb.bi;  // the first nnz this team will handle
+      const A_size_type teamNnzEnd =
+          ub.bi;  // one-past the last nnz this team will handle
+      const A_ordinal_type teamRowBegin =
+          lb.ai;  // <= the row than the first nnz is in
+      const A_ordinal_type teamRowEnd =
+          ub.ai;  // >= the row than the last nnz is in
+
+      // team-collaborative copy of matrix data into scratch
+      A_size_type* rowEndsS{nullptr};
+      A_ordinal_type* entriesS{nullptr};
+      A_value_type* valuesS{nullptr};
+      y_value_type* yS{nullptr};
+
+      if constexpr (ROWENDS_USE_SCRATCH) {
+        rowEndsS = (A_size_type*)thread.team_shmem().get_shmem(
+            pathLengthTeamChunk * sizeof(A_size_type));
+
+        // teamRowEnd may be equal to the row the team's last nnz is in
+        // so in most cases we want to read it (teamRowEnd+1). However,
+        // however, guard against reading off the end of the view
+        Kokkos::parallel_for(
+            Kokkos::TeamThreadRange(thread, teamRowBegin, teamRowEnd + 1),
+            [&](const A_ordinal_type& i) {
+              if (i < A.numRows()) {
+                rowEndsS[i - teamRowBegin] = rowEnds(i);
+              } else {
+                rowEndsS[i - teamRowBegin] = A.nnz();
+              }
+            });
+      } else {
+        (void)(rowEndsS == rowEndsS);  // set but unused, expr has no effect
+      }
+
+      if constexpr (NONZEROS_USE_SCRATCH) {
+        valuesS = (A_value_type*)thread.team_shmem().get_shmem(
+            pathLengthTeamChunk * sizeof(A_value_type));
+        entriesS = (A_ordinal_type*)thread.team_shmem().get_shmem(
+            pathLengthTeamChunk * sizeof(A_ordinal_type));
+        Kokkos::parallel_for(
+            Kokkos::TeamThreadRange(thread, teamNnzBegin, teamNnzEnd),
+            [=](const A_ordinal_type& i) {
+              valuesS[i - teamNnzBegin]  = A.values(i);
+              entriesS[i - teamNnzBegin] = A.graph.entries(i);
+            });
+      } else {
+        (void)(entriesS == entriesS);  // set but unused, expr has no effect
+        (void)(valuesS == valuesS);    // set but unused, expr has no effect
+      }
+
+      if constexpr (Y_USE_SCRATCH) {
+        yS = (y_value_type*)thread.team_shmem().get_shmem(pathLengthTeamChunk *
+                                                          sizeof(y_value_type));
+        Kokkos::parallel_for(
+            Kokkos::TeamThreadRange(thread, teamRowBegin, teamRowEnd + 1),
+            [&](const A_ordinal_type& i) {
+              if (i < A.numRows()) {
+                yS[i - teamRowBegin] = 0;
+              }
+            });
+      } else {
+        (void)(yS == yS);  // set but unused, expr has no effect
+      }
+
+      if constexpr (ROWENDS_USE_SCRATCH || NONZEROS_USE_SCRATCH ||
+                    Y_USE_SCRATCH) {
+        thread.team_barrier();
+      }
+
+      // each thread determines its location within the team chunk
+
+      // team's view of row map is either in scratch or global
+      typename std::conditional<ROWENDS_USE_SCRATCH, row_map_scratch_type,
+                                um_row_map_type>::type teamRowEnds;
+      if constexpr (ROWENDS_USE_SCRATCH) {
+        teamRowEnds = row_map_scratch_type(rowEndsS, teamRowEnd - teamRowBegin);
+      } else {
+        teamRowEnds =
+            um_row_map_type(&rowEnds(teamRowBegin), teamRowEnd - teamRowBegin);
+      }
+
+      iota_type teamIota(teamNnzEnd - teamNnzBegin,
+                         teamNnzBegin);  // teamNnzBegin..<teamNnzEnd
+
+      // diagonal is local to the team's path
+      A_size_type threadD = KOKKOSKERNELS_MACRO_MIN(
+          teamD + thread.team_rank() * pathLengthThreadChunk_, pathLength);
+      threadD -= teamD;
+
+      DSR threadLb;
+      threadLb = diagonal_search(teamRowEnds, teamIota, threadD);
+      const A_size_type threadNnzBegin    = threadLb.bi + teamNnzBegin;
+      const A_ordinal_type threadRowBegin = threadLb.ai + teamRowBegin;
+
+      // each thread does some accumulating
+      y_value_type acc      = 0;
+      A_ordinal_type curRow = threadRowBegin;
+      A_size_type curNnz    = threadNnzBegin;
+      for (A_size_type i = 0;
+           i < pathLengthThreadChunk_ /*some threads have less work*/ &&
+           curNnz < A.nnz() + 1 && curRow < A.numRows();
+           ++i) {
+        A_size_type curRowEnd;
+        if constexpr (ROWENDS_USE_SCRATCH) {
+          curRowEnd = rowEndsS[curRow - teamRowBegin];
+        } else {
+          curRowEnd = rowEnds(curRow);
+        }
+
+        if (curNnz < curRowEnd) {
+          typename AMatrix::non_const_ordinal_type col;
+          A_value_type val;
+          if constexpr (NONZEROS_USE_SCRATCH) {
+            col = entriesS[curNnz - teamNnzBegin];
+            val = (CONJ ? KAT::conj(valuesS[curNnz - teamNnzBegin])
+                        : valuesS[curNnz - teamNnzBegin]);
+          } else {
+            col = A.graph.entries(curNnz);
+            val = (CONJ ? KAT::conj(A.values(curNnz)) : A.values(curNnz));
+          }
+
+          acc += val * x(col);
+          ++curNnz;
+        } else {
+          if constexpr (Y_USE_SCRATCH) {
+            Kokkos::atomic_add(&yS[curRow - teamRowBegin], alpha * acc);
+          } else {
+            Kokkos::atomic_add(&y(curRow), alpha * acc);
+          }
+          acc = 0;
+          ++curRow;
+        }
+      }
+      // save the accumulated results of a partial last row.
+      // might be 0 if last row was not partial
+      if (curRow < A.numRows()) {
+        if constexpr (Y_USE_SCRATCH) {
+          Kokkos::atomic_add(&yS[curRow - teamRowBegin], alpha * acc);
+        } else {
+          Kokkos::atomic_add(&y(curRow), alpha * acc);
+        }
+      }
+
+      if constexpr (Y_USE_SCRATCH) {
+        thread.team_barrier();
+
+        Kokkos::parallel_for(
+            Kokkos::TeamThreadRange(thread, teamRowBegin, teamRowEnd + 1),
+            [&](const A_ordinal_type& i) {
+              if (i < A.numRows()) {
+                if (i > teamRowBegin && i < teamRowEnd) {
+                  y(i) += yS[i - teamRowBegin];
+                } else {
+                  Kokkos::atomic_add(&y(i), yS[i - teamRowBegin]);
+                }
+              }
+            });
+      }
+    }
+
+    size_t team_shmem_size(int teamSize) const {
+      const A_size_type pathLengthTeamChunk = pathLengthThreadChunk_ * teamSize;
+      (void)pathLengthTeamChunk;  // silence declared but not referenced
+      size_t val = 0;
+      if constexpr (Y_USE_SCRATCH) {
+        val += sizeof(y_value_type) * pathLengthTeamChunk;
+      }
+      if constexpr (ROWENDS_USE_SCRATCH) {
+        val += sizeof(row_map_non_const_value_type) * pathLengthTeamChunk;
+      }
+      if constexpr (NONZEROS_USE_SCRATCH) {
+        val += sizeof(A_ordinal_type) * pathLengthTeamChunk;
+        val += sizeof(A_value_type) * pathLengthTeamChunk;
+      }
+      return val;
+    }
+  };
+
+  static void spmv(const char mode[], const y_value_type& alpha,
+                   const AMatrix& A, const XVector& x, const y_value_type& beta,
+                   const YVector& y) {
+    static_assert(XVector::rank == 1, "");
+    static_assert(YVector::rank == 1, "");
+
+    KokkosBlas::scal(y, beta, y);
+
+    /* determine launch parameters for different architectures
+       On architectures where there is a natural execution hierarchy with true
+       team scratch, we'll assign each team to use an appropriate amount of the
+       scratch.
+       On other architectures, just have each team do the maximal amount of work
+       to amortize the cost of the diagonal search
+    */
+    const A_size_type pathLength = A.numRows() + A.nnz();
+    A_size_type pathLengthThreadChunk;
+    int teamSize;
+    if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>()) {
+      pathLengthThreadChunk = 4;
+      teamSize              = 128;
+    } else {
+      teamSize              = 1;
+      pathLengthThreadChunk = (pathLength + exec_space::concurrency() - 1) /
+                              exec_space::concurrency();
+    }
+
+    const size_t pathLengthTeamChunk = pathLengthThreadChunk * teamSize;
+    const int leagueSize =
+        (pathLength + pathLengthTeamChunk - 1) / pathLengthTeamChunk;
+
+    policy_type policy(exec_space(), leagueSize, teamSize);
+
+    /* Currently:
+       On GPU, assume atomics are fast, so don't accumuate into scratch.
+       On CPU spaces, there's no real point to using scratch, just rely on the
+       memory hierarchy. Using scratch just increases the number of required
+       atomic operations
+    */
+    if (KokkosSparse::NoTranspose[0] == mode[0]) {
+      constexpr bool CONJ = false;
+      using GpuOp         = Functor<true, true, false, CONJ>;
+      using CpuOp         = Functor<false, false, false, CONJ>;
+      using Op            = typename std::conditional<
+          KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>(), GpuOp,
+          CpuOp>::type;
+      Op op(alpha, A, x, y, pathLengthThreadChunk);
+      Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op);
+    } else if (KokkosSparse::Conjugate[0] == mode[0]) {
+      constexpr bool CONJ = true;
+      using GpuOp         = Functor<true, true, false, CONJ>;
+      using CpuOp         = Functor<false, false, false, CONJ>;
+      using Op            = typename std::conditional<
+          KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>(), GpuOp,
+          CpuOp>::type;
+      Op op(alpha, A, x, y, pathLengthThreadChunk);
+      Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op);
+    } else {
+      std::stringstream ss;
+      ss << __FILE__ << ":" << __LINE__
+         << "SpmvMergeHierarchical::spmv() called with unsupported mode "
+         << mode;
+      throw std::logic_error(ss.str());
+    }
+  }
+};
+
+}  // namespace KokkosSparse::Impl
+
+#endif  // KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP
diff --git a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp
index c18018f54f..a582f18e40 100644
--- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp
+++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp
@@ -17,6 +17,8 @@
 #ifndef KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_
 #define KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_
 
+#include <sstream>
+
 #include "Kokkos_InnerProductSpaceTraits.hpp"
 #include "KokkosKernels_ExecSpaceUtils.hpp"
 #include "KokkosBlas1_scal.hpp"
@@ -923,8 +925,10 @@ static void spmv_struct_beta(
                                dobeta, true>(exec, stencil_type, structure,
                                              alpha, A, x, beta, y);
   } else {
-    KokkosKernels::Impl::throw_runtime_exception(
-        "Invalid Transpose Mode for KokkosSparse::spmv_struct()");
+    std::stringstream ss;
+    ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode
+       << " for KokkosSparse::spmv_struct()";
+    KokkosKernels::Impl::throw_runtime_exception(ss.str());
   }
 }
 
@@ -1454,8 +1458,10 @@ static void spmv_alpha_beta_mv_struct(
                                         YVector, doalpha, dobeta, true>(
         exec, alpha, A, x, beta, y);
   } else {
-    KokkosKernels::Impl::throw_runtime_exception(
-        "Invalid Transpose Mode for KokkosSparse::spmv()");
+    std::stringstream ss;
+    ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode
+       << " for KokkosSparse::spmv_struct()";
+    KokkosKernels::Impl::throw_runtime_exception(ss.str());
   }
 }
 
diff --git a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp
index b6301778a3..85c35c0044 100644
--- a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp
+++ b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp
@@ -26,7 +26,7 @@
 #include <Kokkos_Core.hpp>
 
 #include "KokkosKernels_Iota.hpp"
-#include "KokkosSparse_MergeMatrix.hpp"
+#include "KokkosSparse_merge_matrix.hpp"
 
 namespace Test_Sparse_MergeMatrix {
 
@@ -85,8 +85,7 @@ template <typename AEntry, typename BEntry, typename ExecSpace>
 void view_view_empty_empty() {
   using AView = Kokkos::View<AEntry *, ExecSpace>;
   using BView = Kokkos::View<BEntry *, ExecSpace>;
-  using MMD =
-      KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+  using MMD   = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
 
   AView a("view-view-empty-empty-a", 0);
   BView b("view-view-empty-empty-b", 0);
@@ -102,8 +101,7 @@ template <typename AEntry, typename BEntry, typename ExecSpace>
 void view_view_full_empty() {
   using AView = Kokkos::View<AEntry *, ExecSpace>;
   using BView = Kokkos::View<BEntry *, ExecSpace>;
-  using MMD =
-      KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+  using MMD   = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
 
   size_t aNonzero = 5;
   AView a("view-view-full-empty-a", aNonzero);
@@ -123,8 +121,7 @@ template <typename AEntry, typename BEntry, typename ExecSpace>
 void view_view_empty_full() {
   using AView = Kokkos::View<AEntry *, ExecSpace>;
   using BView = Kokkos::View<BEntry *, ExecSpace>;
-  using MMD =
-      KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+  using MMD   = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
 
   AView a("view-view-empty-full-a", 0);
   BView b = from_std_vec<BView>("view-view-empty-full-b", {0, 1, 2, 3});
@@ -284,10 +281,9 @@ std::tuple<AView, BView> view_view_case_5() {
 */
 template <typename AEntry, typename BEntry, typename ExecSpace>
 void view_view_full_full() {
-  using AView = Kokkos::View<AEntry *, ExecSpace>;
-  using BView = Kokkos::View<BEntry *, ExecSpace>;
-  using MMD =
-      KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+  using AView          = Kokkos::View<AEntry *, ExecSpace>;
+  using BView          = Kokkos::View<BEntry *, ExecSpace>;
+  using MMD            = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
   using mmd_value_type = typename MMD::non_const_value_type;
 
   {
@@ -377,8 +373,7 @@ template <typename AEntry, typename BEntry, typename ExecSpace>
 void view_iota_empty_empty() {
   using AView = Kokkos::View<AEntry *, ExecSpace>;
   using BView = KokkosKernels::Impl::Iota<BEntry>;
-  using MMD =
-      KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+  using MMD   = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
 
   AView a("view-iota-empty-empty-a", 0);
   BView b(0);
@@ -394,8 +389,7 @@ template <typename AEntry, typename BEntry, typename ExecSpace>
 void view_iota_full_empty() {
   using AView = Kokkos::View<AEntry *, ExecSpace>;
   using BView = KokkosKernels::Impl::Iota<BEntry>;
-  using MMD =
-      KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+  using MMD   = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
 
   size_t aNonzero = 5;
   AView a("view-iota-full-empty-a", aNonzero);
@@ -415,8 +409,7 @@ template <typename AEntry, typename BEntry, typename ExecSpace>
 void view_iota_empty_full() {
   using AView = Kokkos::View<AEntry *, ExecSpace>;
   using BView = KokkosKernels::Impl::Iota<BEntry>;
-  using MMD =
-      KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+  using MMD   = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
 
   AView a("view-iota-empty-full-a", 0);
   BView b(4);
@@ -487,10 +480,9 @@ std::tuple<AView, BView> view_iota_case_1() {
 */
 template <typename AEntry, typename BEntry, typename ExecSpace>
 void view_iota_full_full() {
-  using AView = Kokkos::View<AEntry *, ExecSpace>;
-  using BView = KokkosKernels::Impl::Iota<BEntry>;
-  using MMD =
-      KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+  using AView          = Kokkos::View<AEntry *, ExecSpace>;
+  using BView          = KokkosKernels::Impl::Iota<BEntry>;
+  using MMD            = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
   using mmd_value_type = typename MMD::non_const_value_type;
 
   {
@@ -537,8 +529,7 @@ void test_rank() {
   {
     using AView = Kokkos::View<AEntry *, ExecSpace>;
     using BView = Kokkos::View<BEntry *, ExecSpace>;
-    using MMD =
-        KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+    using MMD   = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
     static_assert(MMD::rank == 1,
                   "MergeMatrixDiagonal should look like a rank-1 view");
   }
@@ -546,8 +537,7 @@ void test_rank() {
   {
     using AView = Kokkos::View<AEntry *, ExecSpace>;
     using BView = KokkosKernels::Impl::Iota<BEntry>;
-    using MMD =
-        KokkosSparse::Experimental::Impl::MergeMatrixDiagonal<AView, BView>;
+    using MMD   = KokkosSparse::Impl::MergeMatrixDiagonal<AView, BView>;
     static_assert(MMD::rank == 1,
                   "MergeMatrixDiagonal should look like a rank-1 view");
   }
diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp
index b6a64e4f6d..9f980fedc9 100644
--- a/sparse/unit_test/Test_Sparse_spmv.hpp
+++ b/sparse/unit_test/Test_Sparse_spmv.hpp
@@ -125,7 +125,7 @@ template <typename crsMat_t, typename x_vector_type, typename y_vector_type>
 void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
                      typename y_vector_type::non_const_value_type alpha,
                      typename y_vector_type::non_const_value_type beta,
-                     char mode = 'N') {
+                     const std::string &mode = "N") {
   using graph_t          = typename crsMat_t::StaticCrsGraphType;
   using size_type_view_t = typename graph_t::row_map_type;
   using lno_view_t       = typename graph_t::entries_type;
@@ -136,8 +136,6 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
   using scalar_t  = typename scalar_view_t::non_const_value_type;
   using KAT       = Kokkos::ArithTraits<scalar_t>;
 
-  mode = toupper(mode);
-
   typename scalar_view_t::HostMirror h_values =
       Kokkos::create_mirror_view(input_mat.values);
   Kokkos::deep_copy(h_values, input_mat.values);
@@ -168,13 +166,13 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
     for (size_type j = h_rowmap(row); j < h_rowmap(row + 1); ++j) {
       lno_t col    = h_entries(j);
       scalar_t val = h_values(j);
-      if (mode == 'N')
+      if (mode == "N")
         h_y(row) += alpha * val * h_x(col);
-      else if (mode == 'C')
+      else if (mode == "C")
         h_y(row) += alpha * KAT::conj(val) * h_x(col);
-      else if (mode == 'T')
+      else if (mode == "T")
         h_y(col) += alpha * val * h_x(row);
-      else if (mode == 'H')
+      else if (mode == "H")
         h_y(col) += alpha * KAT::conj(val) * h_x(row);
     }
   }
@@ -184,12 +182,14 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y,
 
 template <typename crsMat_t, typename x_vector_type, typename y_vector_type>
 void check_spmv(
-    const Controls &controls, crsMat_t input_mat, x_vector_type x,
-    y_vector_type y, typename y_vector_type::non_const_value_type alpha,
-    typename y_vector_type::non_const_value_type beta, char mode,
+    const KokkosKernels::Experimental::Controls &controls, crsMat_t input_mat,
+    x_vector_type x, y_vector_type y,
+    typename y_vector_type::non_const_value_type alpha,
+    typename y_vector_type::non_const_value_type beta, const std::string &mode,
     typename Kokkos::ArithTraits<typename crsMat_t::value_type>::mag_type
         max_val) {
-  // typedef typename crsMat_t::StaticCrsGraphType graph_t;
+  EXPECT_TRUE(mode.size() == 1);
+
   using ExecSpace        = typename crsMat_t::execution_space;
   using my_exec_space    = Kokkos::RangePolicy<ExecSpace>;
   using y_value_type     = typename y_vector_type::non_const_value_type;
@@ -198,7 +198,7 @@ void check_spmv(
 
   const y_value_mag_type eps =
       10 * Kokkos::ArithTraits<y_value_mag_type>::eps();
-  bool transposed = (mode == 'T') || (mode == 'H');
+  bool transposed = (mode == "T") || (mode == "H");
   y_vector_type expected_y(
       "expected", transposed ? input_mat.numCols() : input_mat.numRows());
   Kokkos::deep_copy(expected_y, y);
@@ -208,7 +208,7 @@ void check_spmv(
   bool threw = false;
   std::string msg;
   try {
-    KokkosSparse::spmv(controls, &mode, alpha, input_mat, x, beta, y);
+    KokkosSparse::spmv(controls, mode.data(), alpha, input_mat, x, beta, y);
     Kokkos::fence();
   } catch (std::exception &e) {
     threw = true;
@@ -234,9 +234,12 @@ void check_spmv_mv(
     crsMat_t input_mat, x_vector_type x, y_vector_type y,
     y_vector_type expected_y,
     typename y_vector_type::non_const_value_type alpha,
-    typename y_vector_type::non_const_value_type beta, int numMV, char mode,
+    typename y_vector_type::non_const_value_type beta, int numMV,
+    const std::string &mode,
     typename Kokkos::ArithTraits<typename crsMat_t::value_type>::mag_type
         max_val) {
+  EXPECT_TRUE(mode.size() == 1);
+
   using ExecSpace        = typename crsMat_t::execution_space;
   using my_exec_space    = Kokkos::RangePolicy<ExecSpace>;
   using y_value_type     = typename y_vector_type::non_const_value_type;
@@ -256,7 +259,7 @@ void check_spmv_mv(
   bool threw = false;
   std::string msg;
   try {
-    KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y);
+    KokkosSparse::spmv(mode.data(), alpha, input_mat, x, beta, y);
     Kokkos::fence();
   } catch (std::exception &e) {
     threw = true;
@@ -491,12 +494,12 @@ void test_spmv(const Controls &controls, lno_t numRows, size_type nnz,
   Kokkos::fill_random(input_mat.values, rand_pool,
                       randomUpperBound<scalar_t>(max_val));
 
-  std::vector<char> nonTransModes   = {'N'};
-  std::vector<char> transModes      = {'T'};
-  std::vector<double> testAlphaBeta = {0.0, 1.0};
+  std::vector<const char *> nonTransModes = {"N"};
+  std::vector<const char *> transModes    = {"T"};
+  std::vector<double> testAlphaBeta       = {0.0, 1.0};
   if (heavy) {
-    nonTransModes.push_back('C');
-    transModes.push_back('H');
+    nonTransModes.push_back("C");
+    transModes.push_back("H");
     testAlphaBeta.push_back(-1.0);
     testAlphaBeta.push_back(2.5);
   }
@@ -528,17 +531,23 @@ template <typename scalar_t, typename lno_t, typename size_type,
 void test_spmv_algorithms(lno_t numRows, size_type nnz, lno_t bandwidth,
                           lno_t row_size_variance, bool heavy) {
   {
-    Controls controls;
+    KokkosKernels::Experimental::Controls controls;
     test_spmv<scalar_t, lno_t, size_type, Device>(
         controls, numRows, nnz, bandwidth, row_size_variance, heavy);
   }
 
   {
-    Controls controls;
+    KokkosKernels::Experimental::Controls controls;
     controls.setParameter("algorithm", "native");
     test_spmv<scalar_t, lno_t, size_type, Device>(
         controls, numRows, nnz, bandwidth, row_size_variance, heavy);
   }
+  {
+    KokkosKernels::Experimental::Controls controls;
+    controls.setParameter("algorithm", "merge");
+    test_spmv<scalar_t, lno_t, size_type, Device>(
+        controls, numRows, nnz, bandwidth, row_size_variance, heavy);
+  }
 }
 
 template <typename scalar_t, typename lno_t, typename size_type,
@@ -588,12 +597,12 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth,
   Kokkos::deep_copy(b_y_copy, b_y);
   Kokkos::deep_copy(b_yt_copy, b_yt);
 
-  std::vector<char> nonTransModes   = {'N'};
-  std::vector<char> transModes      = {'T'};
-  std::vector<double> testAlphaBeta = {0.0, 1.0};
+  std::vector<const char *> nonTransModes = {"N"};
+  std::vector<const char *> transModes    = {"T"};
+  std::vector<double> testAlphaBeta       = {0.0, 1.0};
   if (heavy) {
-    nonTransModes.push_back('C');
-    transModes.push_back('H');
+    nonTransModes.push_back("C");
+    transModes.push_back("H");
     testAlphaBeta.push_back(-1.0);
     testAlphaBeta.push_back(2.5);
   }
@@ -662,18 +671,18 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth,
 
     Kokkos::deep_copy(b_y_copy, b_y);
 
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N',
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "N",
                         max_nnz_per_row * max_val * max_x);
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N',
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "N",
                         max_y);
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N',
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, "N",
                         max_y + max_nnz_per_row * max_val * max_x);
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T',
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "T",
                         max_nnz_per_row * max_val * max_x);
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T',
+    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "T",
                         max_y);
     // Testing all modes together, since matrix is square
-    std::vector<char> modes           = {'N', 'C', 'T', 'H'};
+    std::vector<const char *> modes   = {"N", "C", "T", "H"};
     std::vector<double> testAlphaBeta = {0.0, 1.0, -1.0, 2.5};
     for (auto mode : modes) {
       for (double alpha : testAlphaBeta) {

From 245216841087370869d6f2d77b5a2f77e21e2dec Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Mon, 24 Jul 2023 10:34:43 -0600
Subject: [PATCH 02/10] KokkosKernels_Controls: add missing <string> include

Also remove pointless warning to stderr
---
 sparse/src/KokkosKernels_Controls.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sparse/src/KokkosKernels_Controls.hpp b/sparse/src/KokkosKernels_Controls.hpp
index 1ee8cd108e..594df031a3 100644
--- a/sparse/src/KokkosKernels_Controls.hpp
+++ b/sparse/src/KokkosKernels_Controls.hpp
@@ -20,6 +20,7 @@
 /// \brief Mechanism to control internal behavior of kernels
 /// \author Luc Berger-Vergiat (lberge@sandia.gov)
 
+#include <string>
 #include <unordered_map>
 #include <string>
 #include "KokkosKernels_config.h"

From b2d22caadfd0746df2f3a15cc1db41e74847c9ee Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Wed, 26 Jul 2023 15:22:43 -0600
Subject: [PATCH 03/10] Test_Sparse_spmv.hpp: more explicit namespacing

---
 sparse/unit_test/Test_Sparse_spmv.hpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp
index 9f980fedc9..8fdb56b5f4 100644
--- a/sparse/unit_test/Test_Sparse_spmv.hpp
+++ b/sparse/unit_test/Test_Sparse_spmv.hpp
@@ -452,8 +452,9 @@ Kokkos::complex<float> randomUpperBound<Kokkos::complex<float>>(int mag) {
 
 template <typename scalar_t, typename lno_t, typename size_type,
           typename Device>
-void test_spmv(const Controls &controls, lno_t numRows, size_type nnz,
-               lno_t bandwidth, lno_t row_size_variance, bool heavy) {
+void test_spmv(const KokkosKernels::Experimental::Controls &controls,
+               lno_t numRows, size_type nnz, lno_t bandwidth,
+               lno_t row_size_variance, bool heavy) {
   using crsMat_t = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device,
                                                     void, size_type>;
   using scalar_view_t = typename crsMat_t::values_type::non_const_type;
@@ -953,7 +954,8 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) {
 template <typename scalar_t, typename lno_t, typename size_type, class Device>
 void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth,
                         lno_t row_size_variance,
-                        const Controls &controls = Controls()) {
+                        const KokkosKernels::Experimental::Controls &controls =
+                            KokkosKernels::Experimental::Controls()) {
   using crsMat_t = typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device,
                                                     void, size_type>;
   using scalar_view_t = typename crsMat_t::values_type::non_const_type;
@@ -996,7 +998,7 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth,
 template <typename scalar_t, typename lno_t, typename size_type, class Device>
 void test_spmv_native(lno_t numRows, size_type nnz, lno_t bandwidth,
                       lno_t row_size_variance) {
-  Controls controls;
+  KokkosKernels::Experimental::Controls controls;
   controls.setParameter("algorithm", "native");
   test_spmv_controls(numRows, nnz, bandwidth, row_size_variance, controls);
 }  // test_spmv_native

From a373631a6792176f38f10b001fa7e8c9f620050f Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Mon, 28 Aug 2023 11:57:13 -0600
Subject: [PATCH 04/10] add Execution Space support for merge-based SpMV

---
 sparse/impl/KokkosSparse_spmv_impl.hpp       | 16 ++++++++--------
 sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 11 ++++++-----
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp
index 16717c6e62..c3df57c65f 100644
--- a/sparse/impl/KokkosSparse_spmv_impl.hpp
+++ b/sparse/impl/KokkosSparse_spmv_impl.hpp
@@ -635,19 +635,19 @@ static void spmv_beta(const execution_space& exec,
                       const YVector& y) {
   if (mode[0] == NoTranspose[0]) {
     if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) {
-      SpmvMergeHierarchical<execution_space, AMatrix, XVector, YVector>::spmv(exec, mode, alpha, A, x,
-                                                             beta, y);
+      SpmvMergeHierarchical<execution_space, AMatrix, XVector, YVector>::spmv(
+          exec, mode, alpha, A, x, beta, y);
     } else {
-      spmv_beta_no_transpose<execution_space, XVector, YVector, dobeta, false>(
-          exec, controls, alpha, A, x, beta, y);
+      spmv_beta_no_transpose<execution_space, AMatrix, XVector, YVector, dobeta,
+                             false>(exec, controls, alpha, A, x, beta, y);
     }
   } else if (mode[0] == Conjugate[0]) {
     if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) {
-      SpmvMergeHierarchical<execution_space, AMatrix, XVector, YVector>::spmv(exec, mode, alpha, A, x,
-                                                             beta, y);
+      SpmvMergeHierarchical<execution_space, AMatrix, XVector, YVector>::spmv(
+          exec, mode, alpha, A, x, beta, y);
     } else {
-      spmv_beta_no_transpose<execution_space, AMatrix, XVector, YVector, dobeta, true>(
-          exec, controls, alpha, A, x, beta, y);
+      spmv_beta_no_transpose<execution_space, AMatrix, XVector, YVector, dobeta,
+                             true>(exec, controls, alpha, A, x, beta, y);
     }
   } else if (mode[0] == Transpose[0]) {
     spmv_beta_transpose<execution_space, AMatrix, XVector, YVector, dobeta,
diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
index 7ed09ee477..07d4aa68e1 100644
--- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
+++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
@@ -33,10 +33,10 @@ namespace KokkosSparse::Impl {
   the team to determine which entries it will be responsible for
   The threads then atomically accumulate partial produces
 */
-template <class AMatrix, class XVector, class YVector>
+template <class ExecutionSpace, class AMatrix, class XVector, class YVector>
 struct SpmvMergeHierarchical {
   using device_type    = typename YVector::device_type;
-  using exec_space     = typename device_type::execution_space;
+  using exec_space     = ExecutionSpace;
   using y_value_type   = typename YVector::non_const_value_type;
   using x_value_type   = typename XVector::non_const_value_type;
   using A_value_type   = typename AMatrix::non_const_value_type;
@@ -301,8 +301,9 @@ struct SpmvMergeHierarchical {
     }
   };
 
-  static void spmv(const char mode[], const y_value_type& alpha,
-                   const AMatrix& A, const XVector& x, const y_value_type& beta,
+  static void spmv(const ExecutionSpace& space, const char mode[],
+                   const y_value_type& alpha, const AMatrix& A,
+                   const XVector& x, const y_value_type& beta,
                    const YVector& y) {
     static_assert(XVector::rank == 1, "");
     static_assert(YVector::rank == 1, "");
@@ -332,7 +333,7 @@ struct SpmvMergeHierarchical {
     const int leagueSize =
         (pathLength + pathLengthTeamChunk - 1) / pathLengthTeamChunk;
 
-    policy_type policy(exec_space(), leagueSize, teamSize);
+    policy_type policy(space, leagueSize, teamSize);
 
     /* Currently:
        On GPU, assume atomics are fast, so don't accumuate into scratch.

From 0c4a2a97e58aa8316277fe415bc424d7fc1d2698 Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Tue, 19 Sep 2023 11:15:03 -0600
Subject: [PATCH 05/10] exec_space::concurrency -> exec_space().concurrency()

---
 sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
index 07d4aa68e1..f1e9df66bc 100644
--- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
+++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
@@ -320,13 +320,13 @@ struct SpmvMergeHierarchical {
     const A_size_type pathLength = A.numRows() + A.nnz();
     A_size_type pathLengthThreadChunk;
     int teamSize;
-    if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>()) {
+    if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space<ExecutionSpace>()) {
       pathLengthThreadChunk = 4;
       teamSize              = 128;
     } else {
       teamSize              = 1;
-      pathLengthThreadChunk = (pathLength + exec_space::concurrency() - 1) /
-                              exec_space::concurrency();
+      pathLengthThreadChunk = (pathLength + exec_space().concurrency() - 1) /
+                              exec_space().concurrency();
     }
 
     const size_t pathLengthTeamChunk = pathLengthThreadChunk * teamSize;
@@ -346,7 +346,7 @@ struct SpmvMergeHierarchical {
       using GpuOp         = Functor<true, true, false, CONJ>;
       using CpuOp         = Functor<false, false, false, CONJ>;
       using Op            = typename std::conditional<
-          KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>(), GpuOp,
+          KokkosKernels::Impl::kk_is_gpu_exec_space<ExecutionSpace>(), GpuOp,
           CpuOp>::type;
       Op op(alpha, A, x, y, pathLengthThreadChunk);
       Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op);
@@ -355,7 +355,7 @@ struct SpmvMergeHierarchical {
       using GpuOp         = Functor<true, true, false, CONJ>;
       using CpuOp         = Functor<false, false, false, CONJ>;
       using Op            = typename std::conditional<
-          KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>(), GpuOp,
+          KokkosKernels::Impl::kk_is_gpu_exec_space<ExecutionSpace>(), GpuOp,
           CpuOp>::type;
       Op op(alpha, A, x, y, pathLengthThreadChunk);
       Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op);

From aeb9bf1654e2b370f5b5e621c41fb769ba8e7934 Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Mon, 9 Oct 2023 09:23:53 -0600
Subject: [PATCH 06/10] KokkosSparse_spmv_impl_merge.hpp: reduce chance of
 identifier collision

---
 sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
index f1e9df66bc..9329b8a097 100644
--- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
+++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp
@@ -73,9 +73,10 @@ struct SpmvMergeHierarchical {
 
   template <bool NONZEROS_USE_SCRATCH, bool ROWENDS_USE_SCRATCH,
             bool Y_USE_SCRATCH, bool CONJ>
-  struct Functor {
-    Functor(const y_value_type& _alpha, const AMatrix& _A, const XVector& _x,
-            const YVector& _y, const A_size_type pathLengthThreadChunk)
+  struct SpmvMergeImplFunctor {
+    SpmvMergeImplFunctor(const y_value_type& _alpha, const AMatrix& _A,
+                         const XVector& _x, const YVector& _y,
+                         const A_size_type pathLengthThreadChunk)
         : alpha(_alpha),
           A(_A),
           x(_x),
@@ -299,7 +300,7 @@ struct SpmvMergeHierarchical {
       }
       return val;
     }
-  };
+  };  // struct SpmvMergeImplFunctor
 
   static void spmv(const ExecutionSpace& space, const char mode[],
                    const y_value_type& alpha, const AMatrix& A,
@@ -343,8 +344,8 @@ struct SpmvMergeHierarchical {
     */
     if (KokkosSparse::NoTranspose[0] == mode[0]) {
       constexpr bool CONJ = false;
-      using GpuOp         = Functor<true, true, false, CONJ>;
-      using CpuOp         = Functor<false, false, false, CONJ>;
+      using GpuOp         = SpmvMergeImplFunctor<true, true, false, CONJ>;
+      using CpuOp         = SpmvMergeImplFunctor<false, false, false, CONJ>;
       using Op            = typename std::conditional<
           KokkosKernels::Impl::kk_is_gpu_exec_space<ExecutionSpace>(), GpuOp,
           CpuOp>::type;
@@ -352,8 +353,8 @@ struct SpmvMergeHierarchical {
       Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op);
     } else if (KokkosSparse::Conjugate[0] == mode[0]) {
       constexpr bool CONJ = true;
-      using GpuOp         = Functor<true, true, false, CONJ>;
-      using CpuOp         = Functor<false, false, false, CONJ>;
+      using GpuOp         = SpmvMergeImplFunctor<true, true, false, CONJ>;
+      using CpuOp         = SpmvMergeImplFunctor<false, false, false, CONJ>;
       using Op            = typename std::conditional<
           KokkosKernels::Impl::kk_is_gpu_exec_space<ExecutionSpace>(), GpuOp,
           CpuOp>::type;

From 2a95f5e030aaeea7a231d09a14647c05a242d13b Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Mon, 9 Oct 2023 09:24:22 -0600
Subject: [PATCH 07/10] KokkosSparse_merge_matrix.hpp: compare with size_type

---
 sparse/impl/KokkosSparse_merge_matrix.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sparse/impl/KokkosSparse_merge_matrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp
index 83dfe42e84..6510975c87 100644
--- a/sparse/impl/KokkosSparse_merge_matrix.hpp
+++ b/sparse/impl/KokkosSparse_merge_matrix.hpp
@@ -145,9 +145,9 @@ class MergeMatrixDiagonal {
   KOKKOS_INLINE_FUNCTION
   bool operator()(const size_type di) const {
     position_type pos = diag_to_a_b(di);
-    if (size_t(pos.ai) >= a_.size()) {
+    if (size_type(pos.ai) >= a_.size()) {
       return true;  // on the +a side out of matrix bounds is 1
-    } else if (size_t(pos.bi) >= b_.size()) {
+    } else if (size_type(pos.bi) >= b_.size()) {
       return false;  // on the +b side out of matrix bounds is 0
     } else {
       return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi));

From 09d442da94859df278618efd9b7168637e5052ad Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Mon, 9 Oct 2023 11:14:11 -0600
Subject: [PATCH 08/10] KokkosSparse_spmv.hpp: 4.0 ::rank handling

---
 sparse/src/KokkosSparse_spmv.hpp | 34 ++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp
index 2aab1cef60..0658adbccf 100644
--- a/sparse/src/KokkosSparse_spmv.hpp
+++ b/sparse/src/KokkosSparse_spmv.hpp
@@ -98,13 +98,23 @@ void spmv(const ExecutionSpace& space,
                                  typename YVector::memory_space>::accessible,
       "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace");
 
-  // Make sure that x and y have the same rank.
-  static_assert(XVector::rank == YVector::rank,
+// Make sure that x and y have the same rank.
+// Make sure that x (and therefore y) is rank 1.
+#if (KOKKOS_VERSION >= 40100)
+  static_assert(XVector::rank() == YVector::rank(),
                 "KokkosSparse::spmv: Vector ranks do not match.");
-  // Make sure that x (and therefore y) is rank 1.
+
+  static_assert(XVector::rank() == 1,
+                "KokkosSparse::spmv: Both Vector inputs must have rank 1 "
+                "in order to call this specialization of spmv.");
+#else
+  static_assert(
+      static_cast<int>(XVector::rank) == static_cast<int>(YVector::rank),
+      "KokkosSparse::spmv: Vector ranks do not match.");
   static_assert(static_cast<int>(XVector::rank) == 1,
                 "KokkosSparse::spmv: Both Vector inputs must have rank 1 "
                 "in order to call this specialization of spmv.");
+#endif
   // Make sure that y is non-const.
   static_assert(std::is_same<typename YVector::value_type,
                              typename YVector::non_const_value_type>::value,
@@ -296,8 +306,14 @@ void spmv(const ExecutionSpace& space,
                                  typename YVector::memory_space>::accessible,
       "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace");
   // Make sure that x and y have the same rank.
-  static_assert(XVector::rank == YVector::rank,
+#if (KOKKOS_VERSION >= 40100)
+  static_assert(XVector::rank() == YVector::rank(),
                 "KokkosSparse::spmv: Vector ranks do not match.");
+#else
+  static_assert(
+      static_cast<int>(XVector::rank) == static_cast<int>(YVector::rank),
+      "KokkosSparse::spmv: Vector ranks do not match.");
+#endif
   // Make sure that x (and therefore y) is rank 1.
   static_assert(static_cast<int>(XVector::rank) == 1,
                 "KokkosSparse::spmv: Both Vector inputs must have rank 1 "
@@ -673,9 +689,15 @@ void spmv(const ExecutionSpace& space,
       Kokkos::SpaceAccessibility<ExecutionSpace,
                                  typename YVector::memory_space>::accessible,
       "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace");
-  // Make sure that x and y have the same rank.
-  static_assert(XVector::rank == YVector::rank,
+// Make sure that x and y have the same rank.
+#if (KOKKOS_VERSION >= 40100)
+  static_assert(XVector::rank() == YVector::rank(),
                 "KokkosSparse::spmv: Vector ranks do not match.");
+#else
+  static_assert(
+      static_cast<int>(XVector::rank) == static_cast<int>(YVector::rank),
+      "KokkosSparse::spmv: Vector ranks do not match.");
+#endif
   // Make sure that x (and therefore y) is rank 2.
   static_assert(static_cast<int>(XVector::rank) == 2,
                 "KokkosSparse::spmv: Both Vector inputs must have rank 2 "

From b8eb1bcb31c1fb73d94fe2269e728cb7a5d94f02 Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Mon, 9 Oct 2023 11:37:00 -0600
Subject: [PATCH 09/10] KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp: used
 provided execution space for TPL

---
 sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
index 8932beb88a..97019e4682 100644
--- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
+++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
@@ -819,6 +819,8 @@ void spmv_block_impl_rocsparse(
                 "A entries must be contiguous");
 
   rocsparse_handle handle = controls.getRocsparseHandle();
+  // resets handle stream to NULL when out of scope
+  KokkosSparse::Impl::TemporarySetRocsparseStream tsrs(handle, exec);
 
   // set the mode
   rocsparse_operation trans;

From 01e97739b682690ef97f68cefd5b64b0b29f70b2 Mon Sep 17 00:00:00 2001
From: Carl Pearson <cwpears@sandia.gov>
Date: Mon, 9 Oct 2023 16:30:59 -0600
Subject: [PATCH 10/10] KokkosSparse_merge_matrix.hpp: fix comparison
 signedness

---
 sparse/impl/KokkosSparse_merge_matrix.hpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/sparse/impl/KokkosSparse_merge_matrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp
index 6510975c87..18c9467a9a 100644
--- a/sparse/impl/KokkosSparse_merge_matrix.hpp
+++ b/sparse/impl/KokkosSparse_merge_matrix.hpp
@@ -31,6 +31,9 @@ namespace KokkosSparse::Impl {
 // a joint index into a and b
 template <typename AIndex, typename BIndex>
 struct MergeMatrixPosition {
+  using a_index_type = AIndex;
+  using b_index_type = BIndex;
+
   AIndex ai;
   BIndex bi;
 };
@@ -145,9 +148,10 @@ class MergeMatrixDiagonal {
   KOKKOS_INLINE_FUNCTION
   bool operator()(const size_type di) const {
     position_type pos = diag_to_a_b(di);
-    if (size_type(pos.ai) >= a_.size()) {
+
+    if (pos.ai >= typename position_type::a_index_type(a_.size())) {
       return true;  // on the +a side out of matrix bounds is 1
-    } else if (size_type(pos.bi) >= b_.size()) {
+    } else if (pos.bi >= typename position_type::b_index_type(b_.size())) {
       return false;  // on the +b side out of matrix bounds is 0
     } else {
       return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi));
@@ -161,9 +165,9 @@ class MergeMatrixDiagonal {
    */
   KOKKOS_INLINE_FUNCTION
   size_type size() const noexcept {
-    if (d_ <= a_.size() && d_ <= b_.size()) {
+    if (d_ <= size_type(a_.size()) && d_ <= size_type(b_.size())) {
       return d_;
-    } else if (d_ > a_.size() && d_ > b_.size()) {
+    } else if (d_ > size_type(a_.size()) && d_ > size_type(b_.size())) {
       // TODO: this returns nonsense if d_ happens to be outside the merge
       // matrix
       return a_.size() + b_.size() - d_;
@@ -182,8 +186,8 @@ class MergeMatrixDiagonal {
   KOKKOS_INLINE_FUNCTION
   position_type diag_to_a_b(const size_type &di) const noexcept {
     position_type res;
-    res.ai = d_ < a_.size() ? (d_ - 1) - di : a_.size() - 1 - di;
-    res.bi = d_ < a_.size() ? di : d_ + di - a_.size();
+    res.ai = d_ < size_type(a_.size()) ? (d_ - 1) - di : a_.size() - 1 - di;
+    res.bi = d_ < size_type(a_.size()) ? di : d_ + di - a_.size();
     return res;
   }