From 3ad6204d06d50f7d237da33b550f11707a6e55ee Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 20 Jul 2023 14:47:26 -0600 Subject: [PATCH 01/10] Adds the merge-based spmv behind an opt-in `controls.setParameter("algorithm", "merge")`. This SpMV is up to 200x faster than the existing native implementation for matrices with highly skewed row lengths. * Adds `KokkosSparse::Impl::diagonal_search` to support the merge-based SpMV * Moves `KokkosKernels_Iota.hpp` into the `impl` directory * Removes the static assert for `Kokkos::View` and `Kokkos::Iota` from `lower_bound` and friends, since they can also operate on things that appear like a view but are not present in the common component * Changes `KokkosSparse::Impl::MergeMatrixDiagonal::MatrixPosition` -> `KokkosSparse::Impl::MergeMatrixPosition`. Various `MergeMatrixDiagonals` have the same `MatrixPosition` type, which were technically different because they were scoped. --- common/{src => impl}/KokkosKernels_Iota.hpp | 0 common/src/KokkosKernels_LowerBound.hpp | 22 +- .../KokkosSparse_merge_matrix.hpp} | 132 ++++++- sparse/impl/KokkosSparse_spmv_impl.hpp | 35 +- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 373 ++++++++++++++++++ sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 14 +- sparse/unit_test/Test_Sparse_MergeMatrix.hpp | 40 +- sparse/unit_test/Test_Sparse_spmv.hpp | 75 ++-- 8 files changed, 588 insertions(+), 103 deletions(-) rename common/{src => impl}/KokkosKernels_Iota.hpp (100%) rename sparse/{src/KokkosSparse_MergeMatrix.hpp => impl/KokkosSparse_merge_matrix.hpp} (56%) create mode 100644 sparse/impl/KokkosSparse_spmv_impl_merge.hpp diff --git a/common/src/KokkosKernels_Iota.hpp b/common/impl/KokkosKernels_Iota.hpp similarity index 100% rename from common/src/KokkosKernels_Iota.hpp rename to common/impl/KokkosKernels_Iota.hpp diff --git a/common/src/KokkosKernels_LowerBound.hpp b/common/src/KokkosKernels_LowerBound.hpp index 160bd496f3..e091932453 100644 --- a/common/src/KokkosKernels_LowerBound.hpp +++ b/common/src/KokkosKernels_LowerBound.hpp @@ -77,8 +77,8 @@ namespace Impl { /*! \brief Single-thread sequential lower-bound search - \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota - \tparam Pred a binary predicate function + \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or + KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function \param view the view to search \param value the value to search for \param pred a binary predicate function @@ -96,9 +96,6 @@ lower_bound_sequential_thread( using size_type = typename ViewLike::size_type; static_assert(1 == ViewLike::rank, "lower_bound_sequential_thread requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, - "lower_bound_sequential_thread requires a " - "KokkosKernels::Impl::Iota or a Kokkos::View"); size_type i = 0; while (i < view.size() && pred(view(i), value)) { @@ -109,8 +106,8 @@ lower_bound_sequential_thread( /*! \brief Single-thread binary lower-bound search - \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota - \tparam Pred a binary predicate function + \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or + KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function \param view the view to search \param value the value to search for \param pred a binary predicate function @@ -127,9 +124,6 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( using size_type = typename ViewLike::size_type; static_assert(1 == ViewLike::rank, "lower_bound_binary_thread requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, - "lower_bound_binary_thread requires a " - "KokkosKernels::Impl::Iota or a Kokkos::View"); size_type lo = 0; size_type hi = view.size(); @@ -150,8 +144,8 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( /*! \brief single-thread lower-bound search - \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota - \tparam Pred a binary predicate function + \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or + KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function \param view the view to search \param value the value to search for \param pred a binary predicate function @@ -168,10 +162,6 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_thread( Pred pred = Pred()) { static_assert(1 == ViewLike::rank, "lower_bound_thread requires rank-1 views"); - static_assert(KokkosKernels::Impl::is_iota_v || - Kokkos::is_view::value, - "lower_bound_thread requires a " - "KokkosKernels::Impl::Iota or a Kokkos::View"); /* sequential search makes on average 0.5 * view.size memory accesses binary search makes log2(view.size)+1 accesses diff --git a/sparse/src/KokkosSparse_MergeMatrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp similarity index 56% rename from sparse/src/KokkosSparse_MergeMatrix.hpp rename to sparse/impl/KokkosSparse_merge_matrix.hpp index d573a5550f..83dfe42e84 100644 --- a/sparse/src/KokkosSparse_MergeMatrix.hpp +++ b/sparse/impl/KokkosSparse_merge_matrix.hpp @@ -20,13 +20,20 @@ #include #include "KokkosKernels_Iota.hpp" +#include "KokkosKernels_LowerBound.hpp" +#include "KokkosKernels_Predicates.hpp" #include "KokkosKernels_SafeCompare.hpp" -/// \file KokkosSparse_MergeMatrix.hpp +/// \file KokkosSparse_merge_matrix.hpp -namespace KokkosSparse { -namespace Experimental { -namespace Impl { +namespace KokkosSparse::Impl { + +// a joint index into a and b +template +struct MergeMatrixPosition { + AIndex ai; + BIndex bi; +}; /*! \class MergeMatrixDiagonal \brief a view into the entries of the Merge Matrix along a diagonal @@ -88,14 +95,7 @@ class MergeMatrixDiagonal { using a_value_type = typename AView::non_const_value_type; using b_value_type = typename BViewLike::non_const_value_type; - /*! \struct MatrixPosition - * \brief indices into the a_ and b_ views. - */ - struct MatrixPosition { - a_index_type ai; - b_index_type bi; - }; - using position_type = MatrixPosition; + using position_type = MergeMatrixPosition; // implement bare minimum parts of the view interface enum { rank = 1 }; @@ -145,9 +145,9 @@ class MergeMatrixDiagonal { KOKKOS_INLINE_FUNCTION bool operator()(const size_type di) const { position_type pos = diag_to_a_b(di); - if (pos.ai >= a_.size()) { + if (size_t(pos.ai) >= a_.size()) { return true; // on the +a side out of matrix bounds is 1 - } else if (pos.bi >= b_.size()) { + } else if (size_t(pos.bi) >= b_.size()) { return false; // on the +b side out of matrix bounds is 0 } else { return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi)); @@ -192,8 +192,106 @@ class MergeMatrixDiagonal { size_type d_; ///< diagonal }; -} // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse +/*! \brief Return the first index on diagonal \code diag + in the merge matrix of \code a and \code b that is not 1 +This is effectively a lower-bound search on the merge matrix diagonal +where the predicate is "equals 1" +*/ +template +KOKKOS_INLINE_FUNCTION + typename MergeMatrixDiagonal::position_type + diagonal_search( + const AView &a, const BViewLike &b, + typename MergeMatrixDiagonal::size_type diag) { + // unmanaged view types for a and b + using um_a_view = + Kokkos::View; + using um_b_view = + Kokkos::View; + + um_a_view ua(a.data(), a.size()); + + // if BViewLike is an Iota, pass it on directly to MMD, + // otherwise, create an unmanaged view of B + using b_type = + typename std::conditional::value, + BViewLike, um_b_view>::type; + + using MMD = MergeMatrixDiagonal; + MMD mmd; + if constexpr (KokkosKernels::Impl::is_iota::value) { + mmd = MMD(ua, b, diag); + } else { + b_type ub(b.data(), b.size()); + mmd = MMD(ua, ub, diag); + } + + // returns index of the first element that does not satisfy pred(element, + // value) our input view is the merge matrix entry along the diagonal, and we + // want the first one that is not true. so our predicate just tells us if the + // merge matrix diagonal entry is equal to true or not + const typename MMD::size_type idx = KokkosKernels::lower_bound_thread( + mmd, true, KokkosKernels::Equal()); + return mmd.position(idx); +} + +template +KOKKOS_INLINE_FUNCTION + typename MergeMatrixDiagonal::position_type + diagonal_search( + const TeamMember &handle, const AView &a, const BViewLike &b, + typename MergeMatrixDiagonal::size_type diag) { + // unmanaged view types for a and b + using um_a_view = + Kokkos::View; + using um_b_view = + Kokkos::View; + + um_a_view ua(a.data(), a.size()); + + // if BViewLike is an Iota, pass it on directly to MMD, + // otherwise, create an unmanaged view of B + using b_type = + typename std::conditional::value, + BViewLike, um_b_view>::type; + + using MMD = MergeMatrixDiagonal; + MMD mmd; + if constexpr (KokkosKernels::Impl::is_iota::value) { + mmd = MMD(ua, b, diag); + } else { + b_type ub(b.data(), b.size()); + mmd = MMD(ua, ub, diag); + } + + // returns index of the first element that does not satisfy pred(element, + // value) our input view is the merge matrix entry along the diagonal, and we + // want the first one that is not true. so our predicate just tells us if the + // merge matrix diagonal entry is equal to true or not + const typename MMD::size_type idx = KokkosKernels::lower_bound_team( + handle, mmd, true, KokkosKernels::Equal()); + return mmd.position(idx); +} + +/*! \brief + + \return A MergeMatrixDiagonal::position_type + */ +template +KOKKOS_INLINE_FUNCTION auto diagonal_search( + const View &a, typename View::non_const_value_type totalWork, + typename View::size_type diag) { + using value_type = typename View::non_const_value_type; + using size_type = typename View::size_type; + + KokkosKernels::Impl::Iota iota(totalWork); + return diagonal_search(a, iota, diag); +} + +} // namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_MERGEMATRIX_HPP diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index d00808558f..16717c6e62 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -17,17 +17,22 @@ #ifndef KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ #define KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ +#include + #include "KokkosKernels_Controls.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosBlas1_scal.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_spmv_impl_omp.hpp" +#include "KokkosSparse_spmv_impl_merge.hpp" #include "KokkosKernels_Error.hpp" namespace KokkosSparse { namespace Impl { +constexpr const char* KOKKOSSPARSE_ALG_MERGE = "merge"; + // This TransposeFunctor is functional, but not necessarily performant. template @@ -629,11 +634,21 @@ static void spmv_beta(const execution_space& exec, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { + SpmvMergeHierarchical::spmv(exec, mode, alpha, A, x, + beta, y); + } else { + spmv_beta_no_transpose( + exec, controls, alpha, A, x, beta, y); + } } else if (mode[0] == Conjugate[0]) { - spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { + SpmvMergeHierarchical::spmv(exec, mode, alpha, A, x, + beta, y); + } else { + spmv_beta_no_transpose( + exec, controls, alpha, A, x, beta, y); + } } else if (mode[0] == Transpose[0]) { spmv_beta_transpose(exec, alpha, A, x, beta, y); @@ -641,8 +656,10 @@ static void spmv_beta(const execution_space& exec, spmv_beta_transpose(exec, alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } @@ -1460,8 +1477,10 @@ static void spmv_alpha_beta_mv( doalpha, dobeta, true>(exec, alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp new file mode 100644 index 0000000000..7ed09ee477 --- /dev/null +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -0,0 +1,373 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP +#define KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP + +#include + +#include "KokkosKernels_Iota.hpp" +#include "KokkosKernels_AlwaysFalse.hpp" + +#include "KokkosSparse_merge_matrix.hpp" + +namespace KokkosSparse::Impl { + +/*! \brief Merge-based SpMV + Hierarchical GPU implementation + Each team uses MergePath search to find the non-zeros and rows it is + responsible for Each thread in the team similarly uses diagonal search within + the team to determine which entries it will be responsible for + The threads then atomically accumulate partial produces +*/ +template +struct SpmvMergeHierarchical { + using device_type = typename YVector::device_type; + using exec_space = typename device_type::execution_space; + using y_value_type = typename YVector::non_const_value_type; + using x_value_type = typename XVector::non_const_value_type; + using A_value_type = typename AMatrix::non_const_value_type; + using A_ordinal_type = typename AMatrix::non_const_ordinal_type; + using A_size_type = typename AMatrix::non_const_size_type; + using row_map_non_const_value_type = + typename AMatrix::row_map_type::non_const_value_type; + + using policy_type = Kokkos::TeamPolicy; + using team_member = typename policy_type::member_type; + + using um_row_map_type = + Kokkos::View>; + + using row_map_scratch_type = + Kokkos::View>; + + using iota_type = KokkosKernels::Impl::Iota; + + using DSR = typename KokkosSparse::Impl::MergeMatrixDiagonal< + um_row_map_type, iota_type>::position_type; + + using KAT = Kokkos::ArithTraits; + + // results of a lower-bound and upper-bound diagonal search + struct Chunk { + DSR lb; // lower bound + DSR ub; // upper bound + }; + + template + struct Functor { + Functor(const y_value_type& _alpha, const AMatrix& _A, const XVector& _x, + const YVector& _y, const A_size_type pathLengthThreadChunk) + : alpha(_alpha), + A(_A), + x(_x), + y(_y), + pathLengthThreadChunk_(pathLengthThreadChunk) {} + + y_value_type alpha; + AMatrix A; + XVector x; + YVector y; + A_size_type pathLengthThreadChunk_; + + KOKKOS_INLINE_FUNCTION void operator()(const team_member& thread) const { + const A_size_type pathLengthTeamChunk = + thread.team_size() * pathLengthThreadChunk_; + + const A_size_type pathLength = A.numRows() + A.nnz(); + const A_size_type teamD = + thread.league_rank() * pathLengthTeamChunk; // diagonal + const A_size_type teamDEnd = + KOKKOSKERNELS_MACRO_MIN(teamD + pathLengthTeamChunk, pathLength); + + // iota(i) -> i + iota_type iota(A.nnz()); + + // remove leading 0 from row_map + um_row_map_type rowEnds(&A.graph.row_map(1), A.graph.row_map.size() - 1); + + // compiler thinks these are "used" in team_broadcast below, so initialize + // them with something to silence the warning + DSR lb{}; + DSR ub{}; + + // thread 0 does the lower bound, thread 1 does the upper bound + if (0 == thread.team_rank() || 1 == thread.team_rank()) { + const A_size_type d = thread.team_rank() ? teamDEnd : teamD; + DSR dsr = diagonal_search(rowEnds, iota, d); + if (0 == thread.team_rank()) { + lb = dsr; + } + if (1 == thread.team_rank()) { + ub = dsr; + } + } + thread.team_broadcast(lb, 0); + thread.team_broadcast(ub, 1); + const A_size_type teamNnzBegin = + lb.bi; // the first nnz this team will handle + const A_size_type teamNnzEnd = + ub.bi; // one-past the last nnz this team will handle + const A_ordinal_type teamRowBegin = + lb.ai; // <= the row than the first nnz is in + const A_ordinal_type teamRowEnd = + ub.ai; // >= the row than the last nnz is in + + // team-collaborative copy of matrix data into scratch + A_size_type* rowEndsS{nullptr}; + A_ordinal_type* entriesS{nullptr}; + A_value_type* valuesS{nullptr}; + y_value_type* yS{nullptr}; + + if constexpr (ROWENDS_USE_SCRATCH) { + rowEndsS = (A_size_type*)thread.team_shmem().get_shmem( + pathLengthTeamChunk * sizeof(A_size_type)); + + // teamRowEnd may be equal to the row the team's last nnz is in + // so in most cases we want to read it (teamRowEnd+1). However, + // however, guard against reading off the end of the view + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, teamRowBegin, teamRowEnd + 1), + [&](const A_ordinal_type& i) { + if (i < A.numRows()) { + rowEndsS[i - teamRowBegin] = rowEnds(i); + } else { + rowEndsS[i - teamRowBegin] = A.nnz(); + } + }); + } else { + (void)(rowEndsS == rowEndsS); // set but unused, expr has no effect + } + + if constexpr (NONZEROS_USE_SCRATCH) { + valuesS = (A_value_type*)thread.team_shmem().get_shmem( + pathLengthTeamChunk * sizeof(A_value_type)); + entriesS = (A_ordinal_type*)thread.team_shmem().get_shmem( + pathLengthTeamChunk * sizeof(A_ordinal_type)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, teamNnzBegin, teamNnzEnd), + [=](const A_ordinal_type& i) { + valuesS[i - teamNnzBegin] = A.values(i); + entriesS[i - teamNnzBegin] = A.graph.entries(i); + }); + } else { + (void)(entriesS == entriesS); // set but unused, expr has no effect + (void)(valuesS == valuesS); // set but unused, expr has no effect + } + + if constexpr (Y_USE_SCRATCH) { + yS = (y_value_type*)thread.team_shmem().get_shmem(pathLengthTeamChunk * + sizeof(y_value_type)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, teamRowBegin, teamRowEnd + 1), + [&](const A_ordinal_type& i) { + if (i < A.numRows()) { + yS[i - teamRowBegin] = 0; + } + }); + } else { + (void)(yS == yS); // set but unused, expr has no effect + } + + if constexpr (ROWENDS_USE_SCRATCH || NONZEROS_USE_SCRATCH || + Y_USE_SCRATCH) { + thread.team_barrier(); + } + + // each thread determines its location within the team chunk + + // team's view of row map is either in scratch or global + typename std::conditional::type teamRowEnds; + if constexpr (ROWENDS_USE_SCRATCH) { + teamRowEnds = row_map_scratch_type(rowEndsS, teamRowEnd - teamRowBegin); + } else { + teamRowEnds = + um_row_map_type(&rowEnds(teamRowBegin), teamRowEnd - teamRowBegin); + } + + iota_type teamIota(teamNnzEnd - teamNnzBegin, + teamNnzBegin); // teamNnzBegin.. teamRowBegin && i < teamRowEnd) { + y(i) += yS[i - teamRowBegin]; + } else { + Kokkos::atomic_add(&y(i), yS[i - teamRowBegin]); + } + } + }); + } + } + + size_t team_shmem_size(int teamSize) const { + const A_size_type pathLengthTeamChunk = pathLengthThreadChunk_ * teamSize; + (void)pathLengthTeamChunk; // silence declared but not referenced + size_t val = 0; + if constexpr (Y_USE_SCRATCH) { + val += sizeof(y_value_type) * pathLengthTeamChunk; + } + if constexpr (ROWENDS_USE_SCRATCH) { + val += sizeof(row_map_non_const_value_type) * pathLengthTeamChunk; + } + if constexpr (NONZEROS_USE_SCRATCH) { + val += sizeof(A_ordinal_type) * pathLengthTeamChunk; + val += sizeof(A_value_type) * pathLengthTeamChunk; + } + return val; + } + }; + + static void spmv(const char mode[], const y_value_type& alpha, + const AMatrix& A, const XVector& x, const y_value_type& beta, + const YVector& y) { + static_assert(XVector::rank == 1, ""); + static_assert(YVector::rank == 1, ""); + + KokkosBlas::scal(y, beta, y); + + /* determine launch parameters for different architectures + On architectures where there is a natural execution hierarchy with true + team scratch, we'll assign each team to use an appropriate amount of the + scratch. + On other architectures, just have each team do the maximal amount of work + to amortize the cost of the diagonal search + */ + const A_size_type pathLength = A.numRows() + A.nnz(); + A_size_type pathLengthThreadChunk; + int teamSize; + if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + pathLengthThreadChunk = 4; + teamSize = 128; + } else { + teamSize = 1; + pathLengthThreadChunk = (pathLength + exec_space::concurrency() - 1) / + exec_space::concurrency(); + } + + const size_t pathLengthTeamChunk = pathLengthThreadChunk * teamSize; + const int leagueSize = + (pathLength + pathLengthTeamChunk - 1) / pathLengthTeamChunk; + + policy_type policy(exec_space(), leagueSize, teamSize); + + /* Currently: + On GPU, assume atomics are fast, so don't accumuate into scratch. + On CPU spaces, there's no real point to using scratch, just rely on the + memory hierarchy. Using scratch just increases the number of required + atomic operations + */ + if (KokkosSparse::NoTranspose[0] == mode[0]) { + constexpr bool CONJ = false; + using GpuOp = Functor; + using CpuOp = Functor; + using Op = typename std::conditional< + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + CpuOp>::type; + Op op(alpha, A, x, y, pathLengthThreadChunk); + Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); + } else if (KokkosSparse::Conjugate[0] == mode[0]) { + constexpr bool CONJ = true; + using GpuOp = Functor; + using CpuOp = Functor; + using Op = typename std::conditional< + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + CpuOp>::type; + Op op(alpha, A, x, y, pathLengthThreadChunk); + Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); + } else { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ + << "SpmvMergeHierarchical::spmv() called with unsupported mode " + << mode; + throw std::logic_error(ss.str()); + } + } +}; + +} // namespace KokkosSparse::Impl + +#endif // KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP diff --git a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index c18018f54f..a582f18e40 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ #define KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ +#include + #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas1_scal.hpp" @@ -923,8 +925,10 @@ static void spmv_struct_beta( dobeta, true>(exec, stencil_type, structure, alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv_struct()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv_struct()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } @@ -1454,8 +1458,10 @@ static void spmv_alpha_beta_mv_struct( YVector, doalpha, dobeta, true>( exec, alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv_struct()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } diff --git a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp index b6301778a3..85c35c0044 100644 --- a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp +++ b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp @@ -26,7 +26,7 @@ #include #include "KokkosKernels_Iota.hpp" -#include "KokkosSparse_MergeMatrix.hpp" +#include "KokkosSparse_merge_matrix.hpp" namespace Test_Sparse_MergeMatrix { @@ -85,8 +85,7 @@ template void view_view_empty_empty() { using AView = Kokkos::View; using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; AView a("view-view-empty-empty-a", 0); BView b("view-view-empty-empty-b", 0); @@ -102,8 +101,7 @@ template void view_view_full_empty() { using AView = Kokkos::View; using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; size_t aNonzero = 5; AView a("view-view-full-empty-a", aNonzero); @@ -123,8 +121,7 @@ template void view_view_empty_full() { using AView = Kokkos::View; using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; AView a("view-view-empty-full-a", 0); BView b = from_std_vec("view-view-empty-full-b", {0, 1, 2, 3}); @@ -284,10 +281,9 @@ std::tuple view_view_case_5() { */ template void view_view_full_full() { - using AView = Kokkos::View; - using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; using mmd_value_type = typename MMD::non_const_value_type; { @@ -377,8 +373,7 @@ template void view_iota_empty_empty() { using AView = Kokkos::View; using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; AView a("view-iota-empty-empty-a", 0); BView b(0); @@ -394,8 +389,7 @@ template void view_iota_full_empty() { using AView = Kokkos::View; using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; size_t aNonzero = 5; AView a("view-iota-full-empty-a", aNonzero); @@ -415,8 +409,7 @@ template void view_iota_empty_full() { using AView = Kokkos::View; using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; AView a("view-iota-empty-full-a", 0); BView b(4); @@ -487,10 +480,9 @@ std::tuple view_iota_case_1() { */ template void view_iota_full_full() { - using AView = Kokkos::View; - using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; using mmd_value_type = typename MMD::non_const_value_type; { @@ -537,8 +529,7 @@ void test_rank() { { using AView = Kokkos::View; using BView = Kokkos::View; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; static_assert(MMD::rank == 1, "MergeMatrixDiagonal should look like a rank-1 view"); } @@ -546,8 +537,7 @@ void test_rank() { { using AView = Kokkos::View; using BView = KokkosKernels::Impl::Iota; - using MMD = - KokkosSparse::Experimental::Impl::MergeMatrixDiagonal; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; static_assert(MMD::rank == 1, "MergeMatrixDiagonal should look like a rank-1 view"); } diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index b6a64e4f6d..9f980fedc9 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -125,7 +125,7 @@ template void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, typename y_vector_type::non_const_value_type beta, - char mode = 'N') { + const std::string &mode = "N") { using graph_t = typename crsMat_t::StaticCrsGraphType; using size_type_view_t = typename graph_t::row_map_type; using lno_view_t = typename graph_t::entries_type; @@ -136,8 +136,6 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, using scalar_t = typename scalar_view_t::non_const_value_type; using KAT = Kokkos::ArithTraits; - mode = toupper(mode); - typename scalar_view_t::HostMirror h_values = Kokkos::create_mirror_view(input_mat.values); Kokkos::deep_copy(h_values, input_mat.values); @@ -168,13 +166,13 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, for (size_type j = h_rowmap(row); j < h_rowmap(row + 1); ++j) { lno_t col = h_entries(j); scalar_t val = h_values(j); - if (mode == 'N') + if (mode == "N") h_y(row) += alpha * val * h_x(col); - else if (mode == 'C') + else if (mode == "C") h_y(row) += alpha * KAT::conj(val) * h_x(col); - else if (mode == 'T') + else if (mode == "T") h_y(col) += alpha * val * h_x(row); - else if (mode == 'H') + else if (mode == "H") h_y(col) += alpha * KAT::conj(val) * h_x(row); } } @@ -184,12 +182,14 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv( - const Controls &controls, crsMat_t input_mat, x_vector_type x, - y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, char mode, + const KokkosKernels::Experimental::Controls &controls, crsMat_t input_mat, + x_vector_type x, y_vector_type y, + typename y_vector_type::non_const_value_type alpha, + typename y_vector_type::non_const_value_type beta, const std::string &mode, typename Kokkos::ArithTraits::mag_type max_val) { - // typedef typename crsMat_t::StaticCrsGraphType graph_t; + EXPECT_TRUE(mode.size() == 1); + using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -198,7 +198,7 @@ void check_spmv( const y_value_mag_type eps = 10 * Kokkos::ArithTraits::eps(); - bool transposed = (mode == 'T') || (mode == 'H'); + bool transposed = (mode == "T") || (mode == "H"); y_vector_type expected_y( "expected", transposed ? input_mat.numCols() : input_mat.numRows()); Kokkos::deep_copy(expected_y, y); @@ -208,7 +208,7 @@ void check_spmv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(controls, &mode, alpha, input_mat, x, beta, y); + KokkosSparse::spmv(controls, mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -234,9 +234,12 @@ void check_spmv_mv( crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV, char mode, + typename y_vector_type::non_const_value_type beta, int numMV, + const std::string &mode, typename Kokkos::ArithTraits::mag_type max_val) { + EXPECT_TRUE(mode.size() == 1); + using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -256,7 +259,7 @@ void check_spmv_mv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + KokkosSparse::spmv(mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -491,12 +494,12 @@ void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, Kokkos::fill_random(input_mat.values, rand_pool, randomUpperBound(max_val)); - std::vector nonTransModes = {'N'}; - std::vector transModes = {'T'}; - std::vector testAlphaBeta = {0.0, 1.0}; + std::vector nonTransModes = {"N"}; + std::vector transModes = {"T"}; + std::vector testAlphaBeta = {0.0, 1.0}; if (heavy) { - nonTransModes.push_back('C'); - transModes.push_back('H'); + nonTransModes.push_back("C"); + transModes.push_back("H"); testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } @@ -528,17 +531,23 @@ template ( controls, numRows, nnz, bandwidth, row_size_variance, heavy); } { - Controls controls; + KokkosKernels::Experimental::Controls controls; controls.setParameter("algorithm", "native"); test_spmv( controls, numRows, nnz, bandwidth, row_size_variance, heavy); } + { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "merge"); + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } } template nonTransModes = {'N'}; - std::vector transModes = {'T'}; - std::vector testAlphaBeta = {0.0, 1.0}; + std::vector nonTransModes = {"N"}; + std::vector transModes = {"T"}; + std::vector testAlphaBeta = {0.0, 1.0}; if (heavy) { - nonTransModes.push_back('C'); - transModes.push_back('H'); + nonTransModes.push_back("C"); + transModes.push_back("H"); testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } @@ -662,18 +671,18 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::deep_copy(b_y_copy, b_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "N", max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "N", max_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, "N", max_y + max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "T", max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "T", max_y); // Testing all modes together, since matrix is square - std::vector modes = {'N', 'C', 'T', 'H'}; + std::vector modes = {"N", "C", "T", "H"}; std::vector testAlphaBeta = {0.0, 1.0, -1.0, 2.5}; for (auto mode : modes) { for (double alpha : testAlphaBeta) { From 245216841087370869d6f2d77b5a2f77e21e2dec Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 24 Jul 2023 10:34:43 -0600 Subject: [PATCH 02/10] KokkosKernels_Controls: add missing include Also remove pointless warning to stderr --- sparse/src/KokkosKernels_Controls.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sparse/src/KokkosKernels_Controls.hpp b/sparse/src/KokkosKernels_Controls.hpp index 1ee8cd108e..594df031a3 100644 --- a/sparse/src/KokkosKernels_Controls.hpp +++ b/sparse/src/KokkosKernels_Controls.hpp @@ -20,6 +20,7 @@ /// \brief Mechanism to control internal behavior of kernels /// \author Luc Berger-Vergiat (lberge@sandia.gov) +#include #include #include #include "KokkosKernels_config.h" From b2d22caadfd0746df2f3a15cc1db41e74847c9ee Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 26 Jul 2023 15:22:43 -0600 Subject: [PATCH 03/10] Test_Sparse_spmv.hpp: more explicit namespacing --- sparse/unit_test/Test_Sparse_spmv.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 9f980fedc9..8fdb56b5f4 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -452,8 +452,9 @@ Kokkos::complex randomUpperBound>(int mag) { template -void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, - lno_t bandwidth, lno_t row_size_variance, bool heavy) { +void test_spmv(const KokkosKernels::Experimental::Controls &controls, + lno_t numRows, size_type nnz, lno_t bandwidth, + lno_t row_size_variance, bool heavy) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; @@ -953,7 +954,8 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { template void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, - const Controls &controls = Controls()) { + const KokkosKernels::Experimental::Controls &controls = + KokkosKernels::Experimental::Controls()) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; @@ -996,7 +998,7 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, template void test_spmv_native(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { - Controls controls; + KokkosKernels::Experimental::Controls controls; controls.setParameter("algorithm", "native"); test_spmv_controls(numRows, nnz, bandwidth, row_size_variance, controls); } // test_spmv_native From a373631a6792176f38f10b001fa7e8c9f620050f Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 28 Aug 2023 11:57:13 -0600 Subject: [PATCH 04/10] add Execution Space support for merge-based SpMV --- sparse/impl/KokkosSparse_spmv_impl.hpp | 16 ++++++++-------- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 11 ++++++----- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index 16717c6e62..c3df57c65f 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -635,19 +635,19 @@ static void spmv_beta(const execution_space& exec, const YVector& y) { if (mode[0] == NoTranspose[0]) { if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { - SpmvMergeHierarchical::spmv(exec, mode, alpha, A, x, - beta, y); + SpmvMergeHierarchical::spmv( + exec, mode, alpha, A, x, beta, y); } else { - spmv_beta_no_transpose( - exec, controls, alpha, A, x, beta, y); + spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); } } else if (mode[0] == Conjugate[0]) { if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_MERGE) { - SpmvMergeHierarchical::spmv(exec, mode, alpha, A, x, - beta, y); + SpmvMergeHierarchical::spmv( + exec, mode, alpha, A, x, beta, y); } else { - spmv_beta_no_transpose( - exec, controls, alpha, A, x, beta, y); + spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); } } else if (mode[0] == Transpose[0]) { spmv_beta_transpose +template struct SpmvMergeHierarchical { using device_type = typename YVector::device_type; - using exec_space = typename device_type::execution_space; + using exec_space = ExecutionSpace; using y_value_type = typename YVector::non_const_value_type; using x_value_type = typename XVector::non_const_value_type; using A_value_type = typename AMatrix::non_const_value_type; @@ -301,8 +301,9 @@ struct SpmvMergeHierarchical { } }; - static void spmv(const char mode[], const y_value_type& alpha, - const AMatrix& A, const XVector& x, const y_value_type& beta, + static void spmv(const ExecutionSpace& space, const char mode[], + const y_value_type& alpha, const AMatrix& A, + const XVector& x, const y_value_type& beta, const YVector& y) { static_assert(XVector::rank == 1, ""); static_assert(YVector::rank == 1, ""); @@ -332,7 +333,7 @@ struct SpmvMergeHierarchical { const int leagueSize = (pathLength + pathLengthTeamChunk - 1) / pathLengthTeamChunk; - policy_type policy(exec_space(), leagueSize, teamSize); + policy_type policy(space, leagueSize, teamSize); /* Currently: On GPU, assume atomics are fast, so don't accumuate into scratch. From 0c4a2a97e58aa8316277fe415bc424d7fc1d2698 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 19 Sep 2023 11:15:03 -0600 Subject: [PATCH 05/10] exec_space::concurrency -> exec_space().concurrency() --- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp index 07d4aa68e1..f1e9df66bc 100644 --- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -320,13 +320,13 @@ struct SpmvMergeHierarchical { const A_size_type pathLength = A.numRows() + A.nnz(); A_size_type pathLengthThreadChunk; int teamSize; - if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { pathLengthThreadChunk = 4; teamSize = 128; } else { teamSize = 1; - pathLengthThreadChunk = (pathLength + exec_space::concurrency() - 1) / - exec_space::concurrency(); + pathLengthThreadChunk = (pathLength + exec_space().concurrency() - 1) / + exec_space().concurrency(); } const size_t pathLengthTeamChunk = pathLengthThreadChunk * teamSize; @@ -346,7 +346,7 @@ struct SpmvMergeHierarchical { using GpuOp = Functor; using CpuOp = Functor; using Op = typename std::conditional< - KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, CpuOp>::type; Op op(alpha, A, x, y, pathLengthThreadChunk); Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); @@ -355,7 +355,7 @@ struct SpmvMergeHierarchical { using GpuOp = Functor; using CpuOp = Functor; using Op = typename std::conditional< - KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, CpuOp>::type; Op op(alpha, A, x, y, pathLengthThreadChunk); Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); From aeb9bf1654e2b370f5b5e621c41fb769ba8e7934 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 9 Oct 2023 09:23:53 -0600 Subject: [PATCH 06/10] KokkosSparse_spmv_impl_merge.hpp: reduce chance of identifier collision --- sparse/impl/KokkosSparse_spmv_impl_merge.hpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp index f1e9df66bc..9329b8a097 100644 --- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -73,9 +73,10 @@ struct SpmvMergeHierarchical { template - struct Functor { - Functor(const y_value_type& _alpha, const AMatrix& _A, const XVector& _x, - const YVector& _y, const A_size_type pathLengthThreadChunk) + struct SpmvMergeImplFunctor { + SpmvMergeImplFunctor(const y_value_type& _alpha, const AMatrix& _A, + const XVector& _x, const YVector& _y, + const A_size_type pathLengthThreadChunk) : alpha(_alpha), A(_A), x(_x), @@ -299,7 +300,7 @@ struct SpmvMergeHierarchical { } return val; } - }; + }; // struct SpmvMergeImplFunctor static void spmv(const ExecutionSpace& space, const char mode[], const y_value_type& alpha, const AMatrix& A, @@ -343,8 +344,8 @@ struct SpmvMergeHierarchical { */ if (KokkosSparse::NoTranspose[0] == mode[0]) { constexpr bool CONJ = false; - using GpuOp = Functor; - using CpuOp = Functor; + using GpuOp = SpmvMergeImplFunctor; + using CpuOp = SpmvMergeImplFunctor; using Op = typename std::conditional< KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, CpuOp>::type; @@ -352,8 +353,8 @@ struct SpmvMergeHierarchical { Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); } else if (KokkosSparse::Conjugate[0] == mode[0]) { constexpr bool CONJ = true; - using GpuOp = Functor; - using CpuOp = Functor; + using GpuOp = SpmvMergeImplFunctor; + using CpuOp = SpmvMergeImplFunctor; using Op = typename std::conditional< KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, CpuOp>::type; From 2a95f5e030aaeea7a231d09a14647c05a242d13b Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 9 Oct 2023 09:24:22 -0600 Subject: [PATCH 07/10] KokkosSparse_merge_matrix.hpp: compare with size_type --- sparse/impl/KokkosSparse_merge_matrix.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_merge_matrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp index 83dfe42e84..6510975c87 100644 --- a/sparse/impl/KokkosSparse_merge_matrix.hpp +++ b/sparse/impl/KokkosSparse_merge_matrix.hpp @@ -145,9 +145,9 @@ class MergeMatrixDiagonal { KOKKOS_INLINE_FUNCTION bool operator()(const size_type di) const { position_type pos = diag_to_a_b(di); - if (size_t(pos.ai) >= a_.size()) { + if (size_type(pos.ai) >= a_.size()) { return true; // on the +a side out of matrix bounds is 1 - } else if (size_t(pos.bi) >= b_.size()) { + } else if (size_type(pos.bi) >= b_.size()) { return false; // on the +b side out of matrix bounds is 0 } else { return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi)); From 09d442da94859df278618efd9b7168637e5052ad Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 9 Oct 2023 11:14:11 -0600 Subject: [PATCH 08/10] KokkosSparse_spmv.hpp: 4.0 ::rank handling --- sparse/src/KokkosSparse_spmv.hpp | 34 ++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 2aab1cef60..0658adbccf 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -98,13 +98,23 @@ void spmv(const ExecutionSpace& space, typename YVector::memory_space>::accessible, "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - // Make sure that x and y have the same rank. - static_assert(XVector::rank == YVector::rank, +// Make sure that x and y have the same rank. +// Make sure that x (and therefore y) is rank 1. +#if (KOKKOS_VERSION >= 40100) + static_assert(XVector::rank() == YVector::rank(), "KokkosSparse::spmv: Vector ranks do not match."); - // Make sure that x (and therefore y) is rank 1. + + static_assert(XVector::rank() == 1, + "KokkosSparse::spmv: Both Vector inputs must have rank 1 " + "in order to call this specialization of spmv."); +#else + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); static_assert(static_cast(XVector::rank) == 1, "KokkosSparse::spmv: Both Vector inputs must have rank 1 " "in order to call this specialization of spmv."); +#endif // Make sure that y is non-const. static_assert(std::is_same::value, @@ -296,8 +306,14 @@ void spmv(const ExecutionSpace& space, typename YVector::memory_space>::accessible, "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); // Make sure that x and y have the same rank. - static_assert(XVector::rank == YVector::rank, +#if (KOKKOS_VERSION >= 40100) + static_assert(XVector::rank() == YVector::rank(), "KokkosSparse::spmv: Vector ranks do not match."); +#else + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); +#endif // Make sure that x (and therefore y) is rank 1. static_assert(static_cast(XVector::rank) == 1, "KokkosSparse::spmv: Both Vector inputs must have rank 1 " @@ -673,9 +689,15 @@ void spmv(const ExecutionSpace& space, Kokkos::SpaceAccessibility::accessible, "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - // Make sure that x and y have the same rank. - static_assert(XVector::rank == YVector::rank, +// Make sure that x and y have the same rank. +#if (KOKKOS_VERSION >= 40100) + static_assert(XVector::rank() == YVector::rank(), "KokkosSparse::spmv: Vector ranks do not match."); +#else + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); +#endif // Make sure that x (and therefore y) is rank 2. static_assert(static_cast(XVector::rank) == 2, "KokkosSparse::spmv: Both Vector inputs must have rank 2 " From b8eb1bcb31c1fb73d94fe2269e728cb7a5d94f02 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 9 Oct 2023 11:37:00 -0600 Subject: [PATCH 09/10] KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp: used provided execution space for TPL --- sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 8932beb88a..97019e4682 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -819,6 +819,8 @@ void spmv_block_impl_rocsparse( "A entries must be contiguous"); rocsparse_handle handle = controls.getRocsparseHandle(); + // resets handle stream to NULL when out of scope + KokkosSparse::Impl::TemporarySetRocsparseStream tsrs(handle, exec); // set the mode rocsparse_operation trans; From 01e97739b682690ef97f68cefd5b64b0b29f70b2 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 9 Oct 2023 16:30:59 -0600 Subject: [PATCH 10/10] KokkosSparse_merge_matrix.hpp: fix comparison signedness --- sparse/impl/KokkosSparse_merge_matrix.hpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sparse/impl/KokkosSparse_merge_matrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp index 6510975c87..18c9467a9a 100644 --- a/sparse/impl/KokkosSparse_merge_matrix.hpp +++ b/sparse/impl/KokkosSparse_merge_matrix.hpp @@ -31,6 +31,9 @@ namespace KokkosSparse::Impl { // a joint index into a and b template struct MergeMatrixPosition { + using a_index_type = AIndex; + using b_index_type = BIndex; + AIndex ai; BIndex bi; }; @@ -145,9 +148,10 @@ class MergeMatrixDiagonal { KOKKOS_INLINE_FUNCTION bool operator()(const size_type di) const { position_type pos = diag_to_a_b(di); - if (size_type(pos.ai) >= a_.size()) { + + if (pos.ai >= typename position_type::a_index_type(a_.size())) { return true; // on the +a side out of matrix bounds is 1 - } else if (size_type(pos.bi) >= b_.size()) { + } else if (pos.bi >= typename position_type::b_index_type(b_.size())) { return false; // on the +b side out of matrix bounds is 0 } else { return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi)); @@ -161,9 +165,9 @@ class MergeMatrixDiagonal { */ KOKKOS_INLINE_FUNCTION size_type size() const noexcept { - if (d_ <= a_.size() && d_ <= b_.size()) { + if (d_ <= size_type(a_.size()) && d_ <= size_type(b_.size())) { return d_; - } else if (d_ > a_.size() && d_ > b_.size()) { + } else if (d_ > size_type(a_.size()) && d_ > size_type(b_.size())) { // TODO: this returns nonsense if d_ happens to be outside the merge // matrix return a_.size() + b_.size() - d_; @@ -182,8 +186,8 @@ class MergeMatrixDiagonal { KOKKOS_INLINE_FUNCTION position_type diag_to_a_b(const size_type &di) const noexcept { position_type res; - res.ai = d_ < a_.size() ? (d_ - 1) - di : a_.size() - 1 - di; - res.bi = d_ < a_.size() ? di : d_ + di - a_.size(); + res.ai = d_ < size_type(a_.size()) ? (d_ - 1) - di : a_.size() - 1 - di; + res.bi = d_ < size_type(a_.size()) ? di : d_ + di - a_.size(); return res; }