From 0ea31c3888088e19c5c2621db6cbf0c04a34dbb3 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 26 Sep 2023 16:51:44 -0600 Subject: [PATCH 01/12] Add sptrsv execution space overloads --- docs/developer/apidocs/sparse.rst | 13 + .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 179 +++++----- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 326 +++++++++++------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 38 +- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 57 +-- .../KokkosSparse_sptrsv_symbolic_spec.hpp | 22 +- sparse/src/KokkosSparse_sptrsv.hpp | 253 ++++++++++++-- 7 files changed, 599 insertions(+), 289 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 415f72eec8..3a55e50c8b 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -94,3 +94,16 @@ par_ilut gmres ----- .. doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) + +sptrsv +------ +.. doxygenfunction:: sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) +.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) +.. doxygenfunction:: sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) +.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handleL, KernelHandle *handleU, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, XType b) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 7605f03fa2..0b9afa7796 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -22,10 +22,11 @@ namespace KokkosSparse { namespace Impl { -template -void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, +void sptrsvcuSPARSE_symbolic(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -61,6 +62,9 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + int64_t nnz = static_cast(entries.extent(0)); size_t pBufferSize; void *rm; @@ -98,13 +102,13 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, CUSPARSE_INDEX_BASE_ZERO, cudaValueType)); // Create dummy dense vector B (RHS) - nnz_scalar_view_t b_dummy("b_dummy", nrows); + nnz_scalar_view_t b_dummy(Kokkos::view_alloc(space, "b_dummy"), nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecBDescr_dummy), static_cast(nrows), b_dummy.data(), cudaValueType)); // Create dummy dense vector X (LHS) - nnz_scalar_view_t x_dummy("x_dummy", nrows); + nnz_scalar_view_t x_dummy(Kokkos::view_alloc(space, "x_dummy"), nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecXDescr_dummy), static_cast(nrows), x_dummy.data(), cudaValueType)); @@ -163,9 +167,12 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, bool is_lower = sptrsv_handle->is_lower_tri(); sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + cusparseStatus_t status; status = cusparseCreateCsrsv2Info(&(h->info)); if (CUSPARSE_STATUS_SUCCESS != status) @@ -178,85 +185,86 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, if (!std::is_same::value) sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr_copy(row_map) - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); if (std::is_same::value) { cusparseDcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (double*)vals, (int*)rm, (int*)ent, h->info, + (double *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseDcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (double*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same::value) { cusparseScsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (float*)vals, (int*)rm, (int*)ent, h->info, + (float *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseScsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (float *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseZcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; - status = cusparseZcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuDoubleComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + status = + cusparseZcsrsv2_analysis(h->handle, h->transpose, nrows, nnz, + h->descr, (cuDoubleComplex *)vals, (int *)rm, + (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseCcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, - &pBufferSize); + (cuComplex *)vals, (int *)rm, (int *)ent, + h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseCcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; @@ -281,10 +289,11 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, } template < - typename KernelHandle, typename ain_row_index_view_type, - typename ain_nonzero_index_view_type, typename ain_values_scalar_view_type, - typename b_values_scalar_view_type, typename x_values_scalar_view_type> -void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, + typename ExecutionSpace, typename KernelHandle, + typename ain_row_index_view_type, typename ain_nonzero_index_view_type, + typename ain_values_scalar_view_type, typename b_values_scalar_view_type, + typename x_values_scalar_view_type> +void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -323,6 +332,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + const scalar_type alpha = scalar_type(1.0); cudaDataType cudaValueType = cuda_data_type_from(); @@ -354,18 +366,21 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, if (std::is_same::value) { cusparseStatus_t status; - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + int nnz = entries.extent_int(0); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr() - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); - const scalar_type* bv = rhs.data(); - scalar_type* xv = lhs.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); + const scalar_type *bv = rhs.data(); + scalar_type *xv = lhs.data(); if (std::is_same::value) { if (h->pBuffer == nullptr) { @@ -373,10 +388,10 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, } const double alpha = double(1); - status = cusparseDcsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (double*)vals, (int*)rm, - (int*)ent, h->info, (double*)bv, - (double*)xv, h->policy, h->pBuffer); + status = cusparseDcsrsv2_solve( + h->handle, h->transpose, nrows, nnz, &alpha, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, (double *)bv, (double *)xv, h->policy, + h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -387,9 +402,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, const float alpha = float(1); status = cusparseScsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, (float*)bv, (float*)xv, - h->policy, h->pBuffer); + &alpha, h->descr, (float *)vals, (int *)rm, + (int *)ent, h->info, (float *)bv, + (float *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -399,8 +414,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseZcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h->info, - (cuDoubleComplex*)bv, (cuDoubleComplex*)xv, h->policy, h->pBuffer); + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, + (cuDoubleComplex *)bv, (cuDoubleComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -410,8 +425,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseCcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, (cuComplex*)bv, - (cuComplex*)xv, h->policy, h->pBuffer); + (cuComplex *)vals, (int *)rm, (int *)ent, h->info, (cuComplex *)bv, + (cuComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -539,13 +554,13 @@ void sptrsvcuSPARSE_solve_streams( "CUSPARSE requires local ordinals to be integer.\n"); } else { const scalar_type alpha = scalar_type(1.0); - std::vector sptrsv_handle_v(nstreams); - std::vector h_v(nstreams); - std::vector rm_v(nstreams); - std::vector ent_v(nstreams); - std::vector vals_v(nstreams); - std::vector bv_v(nstreams); - std::vector xv_v(nstreams); + std::vector sptrsv_handle_v(nstreams); + std::vector h_v(nstreams); + std::vector rm_v(nstreams); + std::vector ent_v(nstreams); + std::vector vals_v(nstreams); + std::vector bv_v(nstreams); + std::vector xv_v(nstreams); for (int i = 0; i < nstreams; i++) { sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); @@ -560,8 +575,8 @@ void sptrsvcuSPARSE_solve_streams( } rm_v[i] = !std::is_same::value ? sptrsv_handle_v[i]->get_int_rowmap_ptr() - : reinterpret_cast(row_map_v[i].data()); - ent_v[i] = reinterpret_cast(entries_v[i].data()); + : reinterpret_cast(row_map_v[i].data()); + ent_v[i] = reinterpret_cast(entries_v[i].data()); vals_v[i] = values_v[i].data(); bv_v[i] = rhs_v[i].data(); xv_v[i] = lhs_v[i].data(); @@ -573,42 +588,42 @@ void sptrsvcuSPARSE_solve_streams( if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else { throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index b14c9be072..1d8613922b 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2891,16 +2891,15 @@ void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, #endif -template -void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { +template +void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -2981,8 +2980,10 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedRPSolverFunctor( @@ -2990,8 +2991,8 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; - int team_size = thandle.get_team_size(); + using team_policy_t = Kokkos::TeamPolicy; + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor; + using team_policy_type = Kokkos::TeamPolicy; using supernode_view_type = Kokkos::View; @@ -3079,9 +3088,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; @@ -3118,7 +3130,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, auto Ljj = Kokkos::subview( viewL, range_type(0, nsrow), Kokkos::ALL()); // s-th supernocal column of L - KokkosBlas::gemv("N", one, Ljj, Xj, zero, Y); + KokkosBlas::gemv(space, "N", one, Ljj, Xj, zero, Y); } else { auto Xj = Kokkos::subview( lhs, @@ -3131,7 +3143,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (invert_diagonal) { auto Y = Kokkos::subview( work, range_type(workoffset, workoffset + nscol)); - KokkosBlas::gemv("N", one, Ljj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ljj, Y, zero, Xj); } else { char unit_diag = (unit_diagonal ? 'U' : 'N'); // NOTE: we currently supports only default_layout = @@ -3139,7 +3151,9 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "N", &unit_diag, one, Ljj, Xjj); + KokkosBlas::trsm(space, "L", "L", "N", &unit_diag, one, Ljj, + Xjj); + // TODO: space.fence(); Kokkos::fence(); } // update off-diagonal blocks @@ -3155,7 +3169,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, viewL, range_type(nscol, nsrow), Kokkos::ALL()); // off-diagonal blocks of s-th supernodal // column of L - KokkosBlas::gemv("N", one, Lij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Lij, Xj, zero, Z); } } } @@ -3165,9 +3179,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3178,9 +3195,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3200,7 +3220,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; // update with spmv (one or two SpMV) bool transpose_spmv = @@ -3210,36 +3230,45 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update off-diagonals (potentiall combined with solve with // diagonals) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3272,16 +3301,16 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } // end lower_tri_solve -template -void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { +template +void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; - + using memory_space = typename ExecutionSpace::memory_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3298,7 +3327,6 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; @@ -3365,14 +3393,16 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), UpperTriLvlSchedRPSolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; + using team_policy_t = Kokkos::TeamPolicy; int team_size = thandle.get_team_size(); @@ -3388,11 +3418,19 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, node_count); #endif if (team_size == -1) - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, Kokkos::AUTO), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); else - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, team_size), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, team_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } // TP2 algorithm has issues with some offset-ordinal combo to be addressed /* @@ -3444,7 +3482,7 @@ tstf); } // end elseif timer.reset(); #endif - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; if (thandle.is_column_major()) { // U stored in CSC if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to gather the input @@ -3457,9 +3495,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; league_rank++) { @@ -3500,7 +3541,7 @@ tstf); } // end elseif workoffset, workoffset + nsrow)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } else { // extract part of the solution, corresponding to the diagonal // block @@ -3517,14 +3558,14 @@ tstf); } // end elseif workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv - KokkosBlas::gemv("N", one, Ujj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); } else { // NOTE: we currently supports only default_layout = // LayoutLeft Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "U", "N", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); } // update off-diagonal blocks if (nsrow2 > 0) { @@ -3538,7 +3579,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } } } @@ -3548,9 +3589,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3562,10 +3606,13 @@ tstf); } // end elseif diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - using policy_type = Kokkos::TeamPolicy; - Kokkos::parallel_for("parfor_usolve_tran_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_tran_supernode", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); } else { // U stored in CSR // launching sparse-triangular solve functor UpperTriSupernodalFunctor; - Kokkos::parallel_for("parfor_usolve_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_supernode", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to gather the input @@ -3634,7 +3684,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("T", -one, Uij, Z, one, Xj); + KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); } // "triangular-solve" to compute Xj @@ -3642,13 +3692,13 @@ tstf); } // end elseif auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); if (invert_diagonal) { - KokkosBlas::gemv("T", one, Ujj, Xj, zero, Y); + KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "T", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); } } if (invert_diagonal) { @@ -3657,9 +3707,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } } @@ -3680,7 +3733,7 @@ tstf); } // end elseif #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; // update with one, or two, spmv bool transpose_spmv = @@ -3691,28 +3744,34 @@ tstf); } // end elseif if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // zero out lhs corresponding to diagonal blocks in lhs, and copy to // work SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update with off-diagonals (potentiall combined with diagonal // solves) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); } else { if (!invert_offdiagonal) { // zero out lhs corresponding to diagonal blocks in lhs, and copy to @@ -3720,17 +3779,20 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); // update with off-diagonals auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); } else { std::cout << " ** invert_offdiag with U in CSR not supported **" << std::endl; @@ -3740,9 +3802,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3765,23 +3830,22 @@ tstf); } // end elseif double sptrsv_time_seconds = sptrsv_timer.seconds(); std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl << std::endl; - std::cout << " + Execution space : " << execution_space::name() + std::cout << " + Execution space : " << ExecutionSpace::name() << std::endl; std::cout << " + Memory space : " << memory_space::name() << std::endl; #endif } // end upper_tri_solve -template -void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs, +template +void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, LHSType &lhs, const bool /*is_lowertri_*/) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3802,9 +3866,9 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, size_type node_count = 0; // REFACTORED to cleanup; next, need debug and timer routines - using policy_type = Kokkos::TeamPolicy; + using policy_type = Kokkos::TeamPolicy; using large_cutoff_policy_type = - Kokkos::TeamPolicy; + Kokkos::TeamPolicy; /* using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = @@ -3865,14 +3929,17 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for("parfor_l_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_l_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3884,7 +3951,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, if (team_size_singleblock <= 0) { team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -3907,7 +3974,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -3925,11 +3995,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? @@ -3955,16 +4029,19 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } // TODO To use cudagraph here, need to know how many non-unit chains // there are, create a graph for each and launch accordingly size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for("parfor_u_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_u_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3980,7 +4057,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), // Kokkos::ParallelForTag()); team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -4003,7 +4080,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -4021,11 +4101,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index e36b9df236..6ad321c286 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -96,9 +96,9 @@ template ::value> struct SPTRSV_SOLVE { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x); + static void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x); static void sptrsv_solve_streams( const std::vector &execspace_v, @@ -117,9 +117,9 @@ template { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x) { + static void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x) { // Call specific algorithm type auto sptrsv_handle = handle->get_sptrsv_handle(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() @@ -127,40 +127,44 @@ struct SPTRSV_SOLVEis_lower_tri()) { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, entries, values, - b, x, true); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, true); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::lower_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, entries, values, - b, x, false); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, false); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::upper_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } @@ -188,7 +192,8 @@ struct SPTRSV_SOLVEis_lower_tri()) { for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + Experimental::lower_tri_symbolic(execspace_v[i], + *(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); } } @@ -198,7 +203,8 @@ struct SPTRSV_SOLVE(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + Experimental::upper_tri_symbolic(execspace_v[i], + *(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); } } diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 3ef3be8780..36ea2d9df8 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -147,9 +147,10 @@ void symbolic_chain_phase(TriSolveHandle& thandle, #endif } // end symbolic_chain_phase -template -void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, - const EntriesType dentries) { +template +void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_lowertri_total; Kokkos::Timer timer; @@ -177,10 +178,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -193,11 +194,12 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type node_count = 0; + space.fence(); // wait for deep copy write to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -227,9 +229,9 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, // Create the chain now if (thandle.algm_requires_symb_chain()) { + // No need to pass in space, chain phase runs on the host symbolic_chain_phase(thandle, nodes_per_level); } - thandle.set_symbolic_complete(); // Output check @@ -257,9 +259,9 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -279,6 +281,7 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", @@ -568,20 +571,21 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, thandle.set_workspace_size(max_lwork); // workspace offset initialized to be zero integer_view_t work_offset = thandle.get_work_offset(); - Kokkos::deep_copy(work_offset, work_offset_host); + Kokkos::deep_copy(space, work_offset, work_offset_host); // kernel types // > off-diagonal integer_view_t dkernel_type_by_level = thandle.get_kernel_type(); - Kokkos::deep_copy(dkernel_type_by_level, kernel_type_by_level); + Kokkos::deep_copy(space, dkernel_type_by_level, kernel_type_by_level); // > diagonal integer_view_t ddiag_kernel_type_by_level = thandle.get_diag_kernel_type(); - Kokkos::deep_copy(ddiag_kernel_type_by_level, diag_kernel_type_by_level); + Kokkos::deep_copy(space, ddiag_kernel_type_by_level, + diag_kernel_type_by_level); // deep copy to device (of scheduling info) - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); #ifdef TRISOLVE_SYMB_TIMERS std::cout << " + workspace time = " << timer.seconds() << std::endl; @@ -598,9 +602,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif } // end lower_tri_symbolic -template -void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, - const EntriesType dentries) { +template +void upper_tri_symbolic(ExecutionSpace& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_uppertri_total; Kokkos::Timer timer; @@ -626,10 +631,10 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -642,11 +647,12 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type node_count = 0; + space.fence(); // Wait for deep copy writes to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -708,9 +714,9 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -730,6 +736,7 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index 73389d10d0..5b9304356d 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -67,33 +67,37 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosSparse::sptrsv_symbolic -template ::value, bool eti_spec_avail = sptrsv_symbolic_eti_spec_avail< KernelHandle, RowMapType, EntriesType>::value> struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, + static void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of sptrsv_symbolic // Unification layer -template -struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, +template +struct SPTRSV_SYMBOLIC { + static void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries) { auto sptrsv_handle = handle->get_sptrsv_handle(); auto nrows = row_map.extent(0) - 1; sptrsv_handle->new_init_handle(nrows); if (sptrsv_handle->is_lower_tri()) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } else { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } } @@ -113,6 +117,7 @@ struct SPTRSV_SYMBOLIC, \ @@ -130,6 +135,7 @@ struct SPTRSV_SYMBOLIC, \ diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 859918c58d..9ab7c9fc6a 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -40,10 +40,23 @@ namespace Experimental { std::is_same::type, \ typename std::remove_const::type>::value -template -void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries) { +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ +template +void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; @@ -94,8 +107,9 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, Entries_Internal entries_i = entries; KokkosSparse::Impl::SPTRSV_SYMBOLIC< - const_handle_type, RowMap_Internal, - Entries_Internal>::sptrsv_symbolic(&tmp_handle, rowmap_i, entries_i); + ExecutionSpace, const_handle_type, RowMap_Internal, + Entries_Internal>::sptrsv_symbolic(space, &tmp_handle, rowmap_i, + entries_i); #ifdef KK_TRISOLVE_TIMERS std::cout << " > sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -103,10 +117,46 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ template + typename lno_nnz_view_t_> void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + lno_nnz_view_t_ entries) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries); +} + +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ +template +void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; @@ -179,11 +229,12 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, auto nrows = sh->get_nrows(); KokkosSparse::Impl::sptrsvcuSPARSE_symbolic< - sptrsvHandleType, RowMap_Internal, Entries_Internal, Values_Internal>( - sh, nrows, rowmap_i, entries_i, values_i, false); + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal>(space, sh, nrows, rowmap_i, entries_i, values_i, + false); } else { - KokkosSparse::Experimental::sptrsv_symbolic(handle, rowmap, entries); + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); } #ifdef KK_TRISOLVE_TIMERS std::cout << " + sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -191,12 +242,52 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ template -void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, - XType x) { + typename lno_nnz_view_t_, typename scalar_nnz_view_t_> +void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries, values); +} + +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param space The execution space instance this kernel will be run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, BType b, XType x) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; @@ -305,25 +396,65 @@ void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, sptrsvHandleType *sh = handle->get_sptrsv_handle(); auto nrows = sh->get_nrows(); - KokkosSparse::Impl::sptrsvcuSPARSE_solve( - sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); + KokkosSparse::Impl::sptrsvcuSPARSE_solve< + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, XType_Internal>( + space, sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); } else { KokkosSparse::Impl::SPTRSV_SOLVE< - typename scalar_nnz_view_t_::execution_space, const_handle_type, - RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, - XType_Internal>::sptrsv_solve(&tmp_handle, rowmap_i, entries_i, + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(space, &tmp_handle, rowmap_i, entries_i, values_i, b_i, x_i); } } // sptrsv_solve -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) -// --------------------------------------------------------------------- -template -void sptrsv_solve(KernelHandle *handle, XType x, XType b) { +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, + XType x) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handle, rowmap, entries, values, b, x); +} + +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) || defined(DOXY) +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, XType x, + XType b) { auto crsmat = handle->get_sptrsv_handle()->get_crsmat(); auto values = crsmat.values; auto graph = crsmat.graph; @@ -341,31 +472,79 @@ void sptrsv_solve(KernelHandle *handle, XType x, XType b) { if (handle->is_sptrsv_lower_tri()) { // apply forward pivoting - Kokkos::deep_copy(x, b); + Kokkos::deep_copy(space, x, b); // the fifth argument (i.e., first x) is not used - sptrsv_solve(handle, row_map, entries, values, x, x); + sptrsv_solve(space, handle, row_map, entries, values, x, x); } else { // the fifth argument (i.e., first x) is not used - sptrsv_solve(handle, row_map, entries, values, b, b); + sptrsv_solve(space, handle, row_map, entries, values, b, b); // apply backward pivoting - Kokkos::deep_copy(x, b); + Kokkos::deep_copy(space, x, b); } } -// --------------------------------------------------------------------- +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param handle KernelHandle instance + * @param x The x vector + * @param b The b vector + */ template -void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, - XType b) { +void sptrsv_solve(KernelHandle *handle, XType x, XType b) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handle, x, b); +} + +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param space The execution space instance this kernel will run on + * @param handleL KernelHandle instance for lower triangular matrix + * @param handleU KernelHandle instance for upper triangular matrix + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handleL, + KernelHandle *handleU, XType x, XType b) { // Lower-triangular solve - sptrsv_solve(handleL, x, b); + sptrsv_solve(space, handleL, x, b); // copy the solution to rhs - Kokkos::deep_copy(b, x); + Kokkos::deep_copy(space, b, x); // uper-triangular solve - sptrsv_solve(handleU, x, b); + sptrsv_solve(space, handleU, x, b); +} + +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param handleL KernelHandle instance for lower triangular matrix + * @param handleU KernelHandle instance for upper triangular matrix + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, + XType b) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handleL, handleU, x, b); } #endif From f648609285ebb44ea2fa864f6878895750ac27fb Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 2 Oct 2023 08:12:35 -0600 Subject: [PATCH 02/12] Address CI build errors --- blas/impl/KokkosBlas2_gemv_impl.hpp | 79 +++++++++---------- blas/impl/KokkosBlas2_gemv_spec.hpp | 6 +- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 12 ++- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 16 ++-- 4 files changed, 58 insertions(+), 55 deletions(-) diff --git a/blas/impl/KokkosBlas2_gemv_impl.hpp b/blas/impl/KokkosBlas2_gemv_impl.hpp index 730f88602a..dc0f531583 100644 --- a/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -199,10 +199,9 @@ struct SingleLevelTransposeGEMV { }; // Single-level parallel version of GEMV. -template -void singleLevelGemv(const typename AViewType::execution_space& space, - const char trans[], +template +void singleLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -222,9 +221,8 @@ void singleLevelGemv(const typename AViewType::execution_space& space, static_assert(std::is_integral::value, "IndexType must be an integer"); - using y_value_type = typename YViewType::non_const_value_type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::RangePolicy; + using y_value_type = typename YViewType::non_const_value_type; + using policy_type = Kokkos::RangePolicy; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; @@ -442,8 +440,8 @@ struct TwoLevelGEMV_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- // Functor for a two-level parallel_reduce version of GEMV (non-transpose), // designed for performance on GPU. Kernel depends on the layout of A. -template +template struct TwoLevelGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -453,9 +451,8 @@ struct TwoLevelGEMV { std::is_same::value, float, y_value_type>::type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; TwoLevelGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, @@ -564,7 +561,8 @@ struct TwoLevelGEMV { // transpose GEMV. The functor uses parallel-for over the columns of the input // matrix A and each team uses parallel-reduce over the row of its column. // The output vector y is the reduction result. -template struct TwoLevelTransposeGEMV { using y_value_type = typename YViewType::non_const_value_type; @@ -575,9 +573,8 @@ struct TwoLevelTransposeGEMV { std::is_same::value, float, y_value_type>::type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; TwoLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, @@ -637,10 +634,9 @@ struct TwoLevelTransposeGEMV { }; // Two-level parallel version of GEMV. -template -void twoLevelGemv(const typename AViewType::execution_space& space, - const char trans[], +template +void twoLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -661,9 +657,8 @@ void twoLevelGemv(const typename AViewType::execution_space& space, "IndexType must be an integer"); using y_value_type = typename YViewType::non_const_value_type; - using execution_space = typename AViewType::execution_space; - using team_policy_type = Kokkos::TeamPolicy; - using range_policy_type = Kokkos::RangePolicy; + using team_policy_type = Kokkos::TeamPolicy; + using range_policy_type = Kokkos::RangePolicy; using Kokkos::ArithTraits; using KAT = ArithTraits; @@ -704,19 +699,19 @@ void twoLevelGemv(const typename AViewType::execution_space& space, using layout_tag = typename std::conditional::type; - using tagged_policy = Kokkos::TeamPolicy; - using functor_type = - TwoLevelGEMV; + using tagged_policy = Kokkos::TeamPolicy; + using functor_type = TwoLevelGEMV; functor_type functor(alpha, A, x, beta, y); tagged_policy team; - if (isLayoutLeft) { + if constexpr (isLayoutLeft) { using AccumScalar = typename std::conditional< std::is_same::value || std::is_same::value, float, y_value_type>::type; size_t sharedPerTeam = 32 * sizeof(AccumScalar); IndexType numTeams = (A.extent(0) + 31) / 32; - tagged_policy temp(1, 1); + tagged_policy temp(space, 1, 1); temp.set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); int teamSize = temp.team_size_recommended(functor, Kokkos::ParallelForTag()); @@ -727,7 +722,7 @@ void twoLevelGemv(const typename AViewType::execution_space& space, // FIXME SYCL: team_size_recommended() returns too big of a team size. // Kernel hangs with 1024 threads on XEHP. #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same::value) { + if (std::is_same::value) { if (teamSize > 256) teamSize = 256; } #endif @@ -749,16 +744,18 @@ void twoLevelGemv(const typename AViewType::execution_space& space, } else if (tr == 'T') { // transpose, and not conj transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = TwoLevelTransposeGEMV; + using functor_type = + TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, functor); } else if (tr == 'C' || tr == 'H') { // conjugate transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = TwoLevelTransposeGEMV; + using functor_type = + TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, functor); @@ -769,11 +766,11 @@ void twoLevelGemv(const typename AViewType::execution_space& space, // generalGemv: use 1 level (Range) or 2 level (Team) implementation, // depending on whether execution space is CPU or GPU. enable_if makes sure // unused kernels are not instantiated. -template ()>::type* = nullptr> -void generalGemvImpl(const typename AViewType::execution_space& space, - const char trans[], + ExecutionSpace>()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -781,11 +778,11 @@ void generalGemvImpl(const typename AViewType::execution_space& space, singleLevelGemv(space, trans, alpha, A, x, beta, y); } -template ()>::type* = nullptr> -void generalGemvImpl(const typename AViewType::execution_space& space, - const char trans[], + ExecutionSpace>()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, diff --git a/blas/impl/KokkosBlas2_gemv_spec.hpp b/blas/impl/KokkosBlas2_gemv_spec.hpp index 08842a61c0..97e6e2717e 100644 --- a/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -104,10 +104,10 @@ struct GEMV { // Prefer int as the index type, but use a larger type if needed. if (numRows < static_cast(INT_MAX) && numCols < static_cast(INT_MAX)) { - generalGemvImpl(space, trans, alpha, - A, x, beta, y); + generalGemvImpl( + space, trans, alpha, A, x, beta, y); } else { - generalGemvImpl( + generalGemvImpl( space, trans, alpha, A, x, beta, y); } Kokkos::Profiling::popRegion(); diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 0b9afa7796..019a63fcd7 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -159,11 +159,11 @@ void sptrsvcuSPARSE_symbolic(ExecutionSpace &space, KernelHandle *sptrsv_handle, std::is_same::value || std::is_same::value; - if (!is_cuda_space) { + if constexpr (!is_cuda_space) { throw std::runtime_error( "KokkosKernels sptrsvcuSPARSE_symbolic: MEMORY IS NOT ALLOCATED IN GPU " "DEVICE for CUSPARSE\n"); - } else if (std::is_same::value) { + } else if constexpr (std::is_same::value) { bool is_lower = sptrsv_handle->is_lower_tri(); sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); @@ -277,6 +277,7 @@ void sptrsvcuSPARSE_symbolic(ExecutionSpace &space, KernelHandle *sptrsv_handle, } #endif #else + (void)space; (void)sptrsv_handle; (void)nrows; (void)row_map; @@ -369,8 +370,10 @@ void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetStream(h->handle, space.cuda_stream())); + if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + } int nnz = entries.extent_int(0); @@ -440,6 +443,7 @@ void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, } #endif #else + (void)space; (void)sptrsv_handle; (void)nrows; (void)row_map; diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 1d8613922b..ecc5d13308 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2913,7 +2913,8 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename TriSolveHandle::memory_space; + using memory_space = typename ExecutionSpace::memory_space; + using device_t = Kokkos::Device; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; @@ -3075,7 +3076,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, // NOTE: we currently supports only default_layout = LayoutLeft using team_policy_type = Kokkos::TeamPolicy; using supernode_view_type = - Kokkos::View; if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to scatter the @@ -3148,7 +3149,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, char unit_diag = (unit_diagonal ? 'U' : 'N'); // NOTE: we currently supports only default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "L", "N", &unit_diag, one, Ljj, @@ -3311,6 +3312,7 @@ void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, cudaProfilerStop(); #endif using memory_space = typename ExecutionSpace::memory_space; + using device_t = Kokkos::Device; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3527,7 +3529,7 @@ tstf); } // end elseif // create a view for the s-th supernocal block column // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, nscol); @@ -3562,7 +3564,7 @@ tstf); } // end elseif } else { // NOTE: we currently supports only default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); @@ -3658,7 +3660,7 @@ tstf); } // end elseif // create a view for the s-th supernocal block column // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, nscol); @@ -3695,7 +3697,7 @@ tstf); } // end elseif KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); From ce0c2a521d634ce4795d071050c28ad4a30ab42d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 26 Sep 2023 08:58:33 -0600 Subject: [PATCH 03/12] Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (Twostage) - Note: everything except for KokkosGraph and sptrsv runs on streams. --- common/src/KokkosKernels_Utils.hpp | 5 +- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 10 +- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 207 +++++++++++------- sparse/src/KokkosKernels_Handle.hpp | 22 +- sparse/src/KokkosSparse_gauss_seidel.hpp | 96 ++++---- .../src/KokkosSparse_gauss_seidel_handle.hpp | 15 +- 6 files changed, 205 insertions(+), 150 deletions(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index e1c15505ff..ba8049cecf 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -890,7 +890,7 @@ void permute_block_vector(typename idx_array_type::value_type num_elements, // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. -template +template void zero_vector(ExecSpaceIn &exec_space_in, typename value_array_type::value_type /* num_elements */, value_array_type &vector) { @@ -906,8 +906,7 @@ void zero_vector(typename value_array_type::value_type /* num_elements */, using ne_tmp_t = typename value_array_type::value_type; ne_tmp_t ne_tmp = ne_tmp_t(0); MyExecSpace my_exec_space; - zero_vector(my_exec_space, ne_tmp, - vector); + zero_vector(my_exec_space, ne_tmp, vector); } template diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 7391e00e3d..12f7dea38a 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1547,9 +1547,8 @@ class PointGaussSeidel { Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector< - MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( - my_exec_space, num_cols * block_size, Permuted_Xvector); + KokkosKernels::Impl::zero_vector(my_exec_space, num_cols * block_size, + Permuted_Xvector); } else { KokkosKernels::Impl::permute_block_vector< x_value_array_type, scalar_persistent_work_view2d_t, @@ -1664,9 +1663,8 @@ class PointGaussSeidel { Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector< - MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( - my_exec_space, num_cols, Permuted_Xvector); + KokkosKernels::Impl::zero_vector(my_exec_space, num_cols, + Permuted_Xvector); } else { KokkosKernels::Impl::permute_vector< x_value_array_type, scalar_persistent_work_view2d_t, diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index f1f7a0e6cd..f4c2ed3662 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -576,10 +576,11 @@ class TwostageGaussSeidel { tic = timer.seconds(); #endif auto *gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); bool two_stage = gsHandle->isTwoStage(); bool compact_form = gsHandle->isCompactForm(); GSDirection direction = gsHandle->getSweepDirection(); - using GS_Functor_t = + using TSGS_Functor_t = TwostageGaussSeidel_functor; // count nnz in local L & U matrices (rowmap_viewL/rowmap_viewU stores @@ -587,28 +588,29 @@ class TwostageGaussSeidel { ordinal_t nnzA = column_view.extent(0); ordinal_t nnzL = 0; // lower-part of diagonal block ordinal_t nnzU = 0; // upper-part of diagonal block - row_map_view_t rowmap_viewL("row_mapL", + row_map_view_t rowmap_viewL(Kokkos::view_alloc(my_exec_space, "row_mapL"), num_rows + 1); // lower-part of diagonal block - row_map_view_t rowmap_viewU("row_mapU", + row_map_view_t rowmap_viewU(Kokkos::view_alloc(my_exec_space, "row_mapU"), num_rows + 1); // upper-part of diagonal block - row_map_view_t rowmap_viewLa("row_mapLa", + row_map_view_t rowmap_viewLa(Kokkos::view_alloc(my_exec_space, "row_mapLa"), num_rows + 1); // complement of U+D - row_map_view_t rowmap_viewUa("row_mapUa", + row_map_view_t rowmap_viewUa(Kokkos::view_alloc(my_exec_space, "row_mapUa"), num_rows + 1); // complement of L+D if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy; Kokkos::parallel_reduce( - "nnzL", range_policy(0, num_rows), - GS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, - column_view, rowmap_viewL, rowmap_viewUa), + "nnzL", range_policy(my_exec_space, 0, num_rows), + TSGS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, + column_view, rowmap_viewL, rowmap_viewUa), nnzL); } + // TODO: continue if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy; Kokkos::parallel_reduce( "nnzU", range_policy(0, num_rows), - GS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, - column_view, rowmap_viewU, rowmap_viewLa), + TSGS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, + column_view, rowmap_viewU, rowmap_viewLa), nnzU); } ordinal_t nnzLa = 0; // complement of U+D @@ -633,19 +635,19 @@ class TwostageGaussSeidel { // shift ptr so that it now contains offsets (combine it with the previous // functor calls?) if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewL); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewL); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewLa); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewLa); } } if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewU); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewU); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewUa); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewUa); } } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS @@ -657,31 +659,48 @@ class TwostageGaussSeidel { #endif // allocate memory to store local D values_view_t viewD( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "diags"), num_rows); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "diags"), + num_rows); // allocate memory to store local L entries_view_t column_viewL( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesL"), nnzL); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesL"), + nnzL); values_view_t values_viewL( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesL"), nnzL); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesL"), + nnzL); // allocate memory to store local U entries_view_t column_viewU( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesU"), nnzU); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesU"), + nnzU); values_view_t values_viewU( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesU"), nnzU); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesU"), + nnzU); // allocate memory to store complement of U+D entries_view_t column_viewLa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesLa"), nnzLa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesLa"), + nnzLa); values_view_t values_viewLa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesLa"), nnzLa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesLa"), + nnzLa); // allocate memory to store complement of L+D entries_view_t column_viewUa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesUa"), nnzUa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesUa"), + nnzUa); values_view_t values_viewUa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesUa"), nnzUa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesUa"), + nnzUa); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds(); @@ -694,8 +713,8 @@ class TwostageGaussSeidel { // extract local L & U structures (for computing (L+D)^{-1} or (D+U)^{-1}) using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "entriesLU", range_policy(0, num_rows), - GS_Functor_t( + "entriesLU", range_policy(my_exec_space, 0, num_rows), + TSGS_Functor_t( two_stage, compact_form, num_rows, rowmap_view, column_view, rowmap_viewL, column_viewL, rowmap_viewU, column_viewU, // @@ -709,6 +728,7 @@ class TwostageGaussSeidel { timer.reset(); #endif + my_exec_space.fence(); // Wait for non-default stream to finish work // construct CrsMat with them graph_t graphL(column_viewL, rowmap_viewL); graph_t graphU(column_viewU, rowmap_viewU); @@ -732,7 +752,9 @@ class TwostageGaussSeidel { gsHandle->setUa(crsmatUa); values_view_t viewDa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "diags"), num_rows); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "diags"), + num_rows); gsHandle->setDa(viewDa); } @@ -748,6 +770,8 @@ class TwostageGaussSeidel { crsmatL.graph.entries); sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, crsmatU.graph.entries); + Kokkos::fence(); // Ensure writes land before launching more work on + // non-default streams, TODO: remove } } } @@ -762,13 +786,14 @@ class TwostageGaussSeidel { Kokkos::fence(); timer.reset(); #endif - using GS_Functor_t = + using TSGS_Functor_t = TwostageGaussSeidel_functor; - auto *gsHandle = get_gs_handle(); - bool two_stage = gsHandle->isTwoStage(); - bool compact_form = gsHandle->isCompactForm(); + auto *gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); + bool two_stage = gsHandle->isTwoStage(); + bool compact_form = gsHandle->isCompactForm(); // load local D from handle auto viewD = gsHandle->getD(); @@ -799,12 +824,12 @@ class TwostageGaussSeidel { // extract local L, D & U matrices using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "valueLU", range_policy(0, num_rows), - GS_Functor_t(two_stage, compact_form, diagos_given, num_rows, - rowmap_view, column_view, values_view, d_invert_view, - rowmap_viewL, values_viewL, viewD, rowmap_viewU, - values_viewU, rowmap_viewLa, values_viewLa, viewDa, - rowmap_viewUa, values_viewUa)); + "valueLU", range_policy(my_exec_space, 0, num_rows), + TSGS_Functor_t(two_stage, compact_form, diagos_given, num_rows, + rowmap_view, column_view, values_view, d_invert_view, + rowmap_viewL, values_viewL, viewD, rowmap_viewU, + values_viewU, rowmap_viewLa, values_viewLa, viewDa, + rowmap_viewUa, values_viewUa)); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds(); @@ -813,6 +838,8 @@ class TwostageGaussSeidel { timer.reset(); #endif + my_exec_space.fence(); // Wait for non-default stream work to finish before + // sptrsv, TODO: remove if (!(gsHandle->isTwoStage())) { using namespace KokkosSparse::Experimental; auto sptrsv_algo = @@ -834,6 +861,8 @@ class TwostageGaussSeidel { crsmatL.graph.entries, values_viewL); sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, crsmatU.graph.entries, values_viewU); + Kokkos::fence(); // Wait for sptrsv to finish before launching work on + // a stream, TODO: remove } } } @@ -857,10 +886,11 @@ class TwostageGaussSeidel { #endif // - auto *gsHandle = get_gs_handle(); - bool two_stage = gsHandle->isTwoStage(); - bool compact_form = gsHandle->isCompactForm(); - scalar_t gamma = gsHandle->getInnerDampFactor(); + auto *gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); + bool two_stage = gsHandle->isTwoStage(); + bool compact_form = gsHandle->isCompactForm(); + scalar_t gamma = gsHandle->getInnerDampFactor(); GSDirection direction = gsHandle->getSweepDirection(); if (apply_forward && apply_backward) { @@ -883,7 +913,7 @@ class TwostageGaussSeidel { auto crsmatUa = gsHandle->getUa(); // complement of U+D (used only for compact form) - // wratp A into crsmat + // wrap A into crsmat input_crsmat_t crsmatA("A", num_rows, num_cols, values_view.extent(0), values_view, rowmap_view, column_view); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS @@ -916,36 +946,36 @@ class TwostageGaussSeidel { NumSweeps *= 2; } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector( - nrhs, localX); + KokkosKernels::Impl::zero_vector(my_exec_space, nrhs, localX); } for (int sweep = 0; sweep < NumSweeps; ++sweep) { bool forward_sweep = (direction == GS_FORWARD || (direction == GS_SYMMETRIC && sweep % 2 == 0)); // compute residual vector - KokkosBlas::scal(localR, one, localB); + KokkosBlas::scal(my_exec_space, localR, one, localB); if (sweep > 0 || !init_zero_x_vector) { if (compact_form) { if (forward_sweep) { // R = B - U*x - KokkosSparse::spmv("N", scalar_t(-one), crsmatUa, localX, one, - localR); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatUa, + localX, one, localR); } else { // R = B - L*x - KokkosSparse::spmv("N", scalar_t(-one), crsmatLa, localX, one, - localR); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatLa, + localX, one, localR); } if (omega != one) { // R = B - (U + (1-1/omega)D)*x scalar_t omega2 = (one / omega - one); auto localY = Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL()); - KokkosBlas::mult(zero, localZ, one, localDa, localY); - KokkosBlas::axpy(omega2, localZ, localR); + KokkosBlas::mult(my_exec_space, zero, localZ, one, localDa, localY); + KokkosBlas::axpy(my_exec_space, omega2, localZ, localR); } } else { // not compact_form // R = B - A*x - KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, localR); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, + localX, one, localR); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS { auto localRj = @@ -981,8 +1011,11 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); + my_exec_space + .fence(); // Wait for writes to R and Z to land, TODO: remove sptrsv_solve(handle->get_gs_sptrsvL_handle(), crsmatL.graph.row_map, crsmatL.graph.entries, crsmatL.values, Rj, Zj); + Kokkos::fence(); // TODO: call sptrsv_solve on stream and remove } } else { using namespace KokkosSparse::Experimental; @@ -995,8 +1028,11 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); + my_exec_space + .fence(); // Wait for writes to R and Z to land, TODO: remove sptrsv_solve(handle->get_gs_sptrsvU_handle(), crsmatU.graph.row_map, crsmatU.graph.entries, crsmatU.values, Rj, Zj); + Kokkos::fence(); // TODO: call sptrsv_solve on stream and remove } } @@ -1005,21 +1041,25 @@ class TwostageGaussSeidel { Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL()); if (compact_form) { // Y = omega * Z - KokkosBlas::scal(localY, one, localZ); + KokkosBlas::scal(my_exec_space, localY, one, localZ); } else { // Y = Y + omega * Z - KokkosBlas::axpy(one, localZ, localY); + KokkosBlas::axpy(my_exec_space, one, localZ, localY); } } else { // ====== inner Jacobi-Richardson ===== #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS // compute initial residual norm // > compute RHS for the inner loop, R = B - A*x - internal_vector_view_t tempR("tempR", num_rows, 1); - KokkosBlas::scal(tempR, one, localB); - KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, tempR); + internal_vector_view_t tempR( + Kokkos::view_alloc(my_exec_space, std::string("tempR")), num_rows, + 1); + KokkosBlas::scal(my_exec_space, tempR, one, localB); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, localX, + one, tempR); // > initial vector for the inner loop is zero - Kokkos::deep_copy(localZ, zero); + Kokkos::deep_copy(my_exec_space, localZ, zero); + my_exec_space.fence(); using Norm_Functor_t = TwostageGaussSeidel_functor; @@ -1027,7 +1067,7 @@ class TwostageGaussSeidel { { mag_t normR = zero; Kokkos::parallel_reduce( - "normR", range_policy(0, num_rows), + "normR", range_policy(my_exec_space, 0, num_rows), Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view, values_view, localD, localZ, tempR), normR); @@ -1042,23 +1082,23 @@ class TwostageGaussSeidel { // row-scale: (D^{-1}*L)*Y = D^{-1}*B // compute Z := D^{-1}*R - KokkosBlas::mult(zero, localZ, one, localD, localR); + KokkosBlas::mult(my_exec_space, zero, localZ, one, localD, localR); // apply inner damping factor, if not one if (gamma != one) { // Z = gamma * Z - KokkosBlas::scal(localZ, gamma, localZ); + KokkosBlas::scal(my_exec_space, localZ, gamma, localZ); } } else { // copy to localT (workspace used to save D^{-1}*R for JR iteration) - KokkosBlas::mult(zero, localT, one, localD, localR); + KokkosBlas::mult(my_exec_space, zero, localT, one, localD, localR); // initialize Jacobi-Richardson (using R as workspace for JR // iteration) - KokkosBlas::scal(localR, one, localT); + KokkosBlas::scal(my_exec_space, localR, one, localT); // apply inner damping factor, if not one if (gamma != one) { // R = gamma * R - KokkosBlas::scal(localR, gamma, localR); + KokkosBlas::scal(my_exec_space, localR, gamma, localR); } } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS @@ -1066,7 +1106,7 @@ class TwostageGaussSeidel { // compute residual norm of the starting vector (D^{-1}R) mag_t normR = zero; Kokkos::parallel_reduce( - "normR", range_policy(0, num_rows), + "normR", range_policy(my_exec_space, 0, num_rows), Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view, values_view, localD, localT, tempR), normR); @@ -1077,34 +1117,34 @@ class TwostageGaussSeidel { for (int ii = 0; ii < NumInnerSweeps; ii++) { // T = D^{-1}*R, and L = D^{-1}*L and U = D^{-1}*U // copy T into Z - KokkosBlas::scal(localZ, one, localT); + KokkosBlas::scal(my_exec_space, localZ, one, localT); if (forward_sweep) { // Z = Z - L*R - KokkosSparse::spmv("N", scalar_t(-omega), crsmatL, localR, one, - localZ); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-omega), crsmatL, + localR, one, localZ); } else { // Z = R - U*T - KokkosSparse::spmv("N", scalar_t(-omega), crsmatU, localR, one, - localZ); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-omega), crsmatU, + localR, one, localZ); } // apply inner damping factor, if not one if (gamma != one) { // Z = gamma * Z - KokkosBlas::scal(localZ, gamma, localZ); + KokkosBlas::scal(my_exec_space, localZ, gamma, localZ); // Z = Z + (one - one/gamma) * R scalar_t gamma2 = one - gamma; - KokkosBlas::axpy(gamma2, localR, localZ); + KokkosBlas::axpy(my_exec_space, gamma2, localR, localZ); } if (ii + 1 < NumInnerSweeps) { // reinitialize (R to be Z) - KokkosBlas::scal(localR, one, localZ); + KokkosBlas::scal(my_exec_space, localR, one, localZ); } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS { // compute residual norm(r - (L+D)*y) mag_t normR = zero; Kokkos::parallel_reduce( - "normR", range_policy(0, num_rows), + "normR", range_policy(my_exec_space, 0, num_rows), Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view, values_view, localD, localZ, tempR), normR); @@ -1119,22 +1159,23 @@ class TwostageGaussSeidel { Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL()); if (compact_form) { // Y := omega * z - KokkosBlas::scal(localY, omega, localZ); + KokkosBlas::scal(my_exec_space, localY, omega, localZ); } else { // Y := X + omega * Z - KokkosBlas::axpy(omega, localZ, localY); + KokkosBlas::axpy(my_exec_space, omega, localZ, localY); } } // end of inner GS sweep } // end of outer GS sweep #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS { // R = B - A*x - KokkosBlas::scal(localR, one, localB); - KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, localR); + KokkosBlas::scal(my_exec_space, localR, one, localB); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, localX, + one, localR); auto localRj = Kokkos::subview(localR, Kokkos::ALL(), range_type(0, 1)); single_vector_view_t Rj(localRj.data(), num_rows); - std::cout << "norm(GS)-" << NumSweeps << " " << KokkosBlas::nrm2(Rj) - << std::endl; + std::cout << "norm(GS)-" << NumSweeps << " " + << KokkosBlas::nrm2(my_exec_space, Rj) << std::endl; } #endif } diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index d500f19d48..03cabdb09e 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -610,14 +610,19 @@ class KokkosKernelsHandle { * @param num_streams The number of streams to allocate memory for. * @param gs_algorithm Specifies which algorithm to use: * - * KokkosSpace::GS_DEFAULT PointGaussSeidel - * KokkosSpace::GS_PERMUTED ?? - * KokkosSpace::GS_TEAM ?? - * KokkosSpace::GS_CLUSTER ?? - * KokkosSpace::GS_TWOSTAGE ?? + * KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type. + * KokkosSpace::GS_PERMUTED Reorders rows/cols into colors to improve locality. Uses RangePolicy over rows. + * KokkosSpace::GS_TEAM Uses TeamPolicy over batches of rows with ThreadVector within rows. + * KokkosSpace::GS_CLUSTER Uses independent clusters of nodes in the graph. Within a cluster, x is updated sequentially. + * For more information, see: https://arxiv.org/pdf/2204.02934.pdf. + * KokkosSpace::GS_TWOSTAGE Uses spmv to parallelize inner sweeps of x. + * For more information, see: https://arxiv.org/pdf/2104.01196.pdf. * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: * - * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_DEFAULT Depends on execution space: + * COLORING_SERIAL on Kokkos::Serial; + * COLORING_EB on GPUs; + * COLORING_VBBIT on Kokkos::Sycl or elsewhere. * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array @@ -754,7 +759,10 @@ class KokkosKernelsHandle { * @param hint_verts_per_cluster Hint how many verticies to use per cluster * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: * - * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_DEFAULT Depends on execution space: + * COLORING_SERIAL on Kokkos::Serial; + * COLORING_EB on GPUs; + * COLORING_VBBIT on Kokkos::Sycl or elsewhere. * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 036fe1b119..4e362f2781 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -29,13 +29,13 @@ namespace Experimental { /// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity /// pattern only) /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param space The execution space instance this kernel will be run on. -/// @param handle KernelHandle instance +/// @param space The execution space instance this kernel will be run on +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -112,7 +112,7 @@ void gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param handle KernelHandle instance +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -141,7 +141,7 @@ void gauss_seidel_symbolic(KernelHandle *handle, /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param handle KernelHandle instance +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -173,15 +173,15 @@ void block_gauss_seidel_symbolic( /// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's /// numeric values) /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param space The execution space instance this kernel will be run on. -/// @param handle KernelHandle instance +/// @param space The execution space instance this kernel will be run on +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -279,8 +279,8 @@ void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided -/// inverse diagonal must share this type. -/// @param handle KernelHandle instance +/// inverse diagonal must share this type +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -313,16 +313,16 @@ void gauss_seidel_numeric(KernelHandle *handle, /// numeric values). This version accepts the matrix's inverse diagonal from the /// user. /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided -/// inverse diagonal must share this type. -/// @param space The execution space instance this kernel will be run on. -/// @param handle KernelHandle instance +/// inverse diagonal must share this type +/// @param space The execution space instance this kernel will be run on +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -427,8 +427,8 @@ void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided -/// inverse diagonal must share this type. -/// @param handle KernelHandle instance +/// inverse diagonal must share this type +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -468,7 +468,7 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param handle handle A KokkosKernelsHandle instance +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -504,7 +504,7 @@ void block_gauss_seidel_numeric( /// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to /// system AX=Y /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -512,12 +512,12 @@ void block_gauss_seidel_numeric( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. +/// rank-1 or rank-2 View /// @param space The execution space instance this kernel will be run -/// on. NOTE: Currently only used for GS_DEFAULT. -/// @param handle handle A KokkosKernelsHandle instance +/// on. NOTE: Currently only used for GS_DEFAULT and GS_TWOSTAGE +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -680,8 +680,8 @@ void symmetric_gauss_seidel_apply( /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle handle A KokkosKernelsHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -727,10 +727,10 @@ void symmetric_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle handle A KokkosKernelsHandle instance. +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -793,12 +793,12 @@ void symmetric_block_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. +/// rank-1 or rank-2 View /// @param space The execution space instance this kernel will be run -/// on. NOTE: Currently only used for GS_DEFAULT. -/// @param handle KernelHandle instance +/// on. NOTE: Currently only used for GS_DEFAULT and GS_TWOSTAGE +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -960,10 +960,10 @@ void forward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -1008,10 +1008,10 @@ void forward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -1067,7 +1067,7 @@ void forward_sweep_block_gauss_seidel_apply( /// /// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -1075,12 +1075,12 @@ void forward_sweep_block_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. +/// rank-1 or rank-2 View /// @param space The execution space instance this kernel will be run -/// on. NOTE: Currently only used for GS_DEFAULT. -/// @param handle KernelHandle instance +/// on. NOTE: Currently only used for GS_DEFAULT and GS_TWOSTAGE +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -1242,10 +1242,10 @@ void backward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -1290,10 +1290,10 @@ void backward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 649229918d..0b47b3e92c 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -758,9 +758,18 @@ class TwoStageGaussSeidelHandle void initVectors(int nrows_, int nrhs_) { if (this->nrows != nrows_ || this->nrhs != nrhs_) { - this->localR = vector_view_t("temp", nrows_, nrhs_); - this->localT = vector_view_t("temp", nrows_, nrhs_); - this->localZ = vector_view_t("temp", nrows_, nrhs_); + vector_view_t r( + Kokkos::view_alloc(this->execution_space, std::string("temp")), + nrows_, nrhs_); + this->localR = r; + vector_view_t t( + Kokkos::view_alloc(this->execution_space, std::string("temp")), + nrows_, nrhs_); + this->localT = t; + vector_view_t z( + Kokkos::view_alloc(this->execution_space, std::string("temp")), + nrows_, nrhs_); + this->localZ = z; this->nrows = nrows_; this->nrhs = nrhs_; } From 13317b8713013bbec190be439b260dcfe7df4555 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 26 Sep 2023 12:07:41 -0600 Subject: [PATCH 04/12] sparse/unit_test: Test twostage with streams --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 73 +++++++++++++++---- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 35fbcb44a4..cf445d6d06 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -753,18 +753,12 @@ void test_gauss_seidel_streams_rank1( #endif // KOKKOS_ENABLE_OPENMP std::vector instances; - if (nstreams == 1) - instances = Kokkos::Experimental::partition_space(execution_space(), 1); - else if (nstreams == 2) - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); - else if (nstreams == 3) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); - - std::vector kh_v(nstreams); + std::vector weights(nstreams); + std::fill(weights.begin(), weights.end(), 1); + instances = Kokkos::Experimental::partition_space(execution_space(), weights); + std::vector kh_psgs_v(nstreams); + std::vector kh_tsgs_v(nstreams); + std::vector kh_tsgs_classic_v(nstreams); std::vector input_mat_v(nstreams); std::vector solution_x_v(nstreams); std::vector x_vector_v(nstreams); @@ -773,6 +767,8 @@ void test_gauss_seidel_streams_rank1( const scalar_t one = Kokkos::ArithTraits::one(); const scalar_t zero = Kokkos::ArithTraits::zero(); + // two-stage with SpTRSV supports only omega = one + const double omega_tsgs_classic = one; for (int i = 0; i < nstreams; i++) { input_mat_v[i] = @@ -801,8 +797,18 @@ void test_gauss_seidel_streams_rank1( Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); x_vector_v[i] = x_vector_tmp; - kh_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. - kh_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); + kh_psgs_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_tsgs_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_tsgs_classic_v[i] = + KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_psgs_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, + coloringAlgo); + kh_tsgs_v[i].create_gs_handle(instances[i], nstreams, GS_TWOSTAGE, + coloringAlgo); + kh_tsgs_v[i].set_gs_twostage(true, input_mat_v[i].numRows()); + kh_tsgs_classic_v[i].create_gs_handle(instances[i], nstreams, GS_TWOSTAGE, + coloringAlgo); + kh_tsgs_classic_v[i].set_gs_twostage(false, input_mat_v[i].numRows()); } int apply_count = 3; // test symmetric, forward, backward @@ -811,7 +817,7 @@ void test_gauss_seidel_streams_rank1( for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(instances[i], x_vector_v[i], zero); - run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, + run_gauss_seidel_streams(instances, kh_psgs_v, input_mat_v, x_vector_v, y_vector_v, symmetric, m_omega, apply_type, nstreams); for (int i = 0; i < nstreams; i++) { @@ -823,7 +829,42 @@ void test_gauss_seidel_streams_rank1( } } - for (int i = 0; i < nstreams; i++) kh_v[i].destroy_gs_handle(); + //*** Two-stage version **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + run_gauss_seidel_streams(instances, kh_tsgs_v, input_mat_v, x_vector_v, + y_vector_v, symmetric, m_omega, apply_type, + nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; + } + } + //*** Two-stage version (classic) **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + run_gauss_seidel_streams(instances, kh_tsgs_classic_v, input_mat_v, + x_vector_v, y_vector_v, symmetric, + omega_tsgs_classic, apply_type, nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; + } + } + + for (int i = 0; i < nstreams; i++) { + kh_psgs_v[i].destroy_gs_handle(); + kh_tsgs_v[i].destroy_gs_handle(); + kh_tsgs_classic_v[i].destroy_gs_handle(); + } } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 14ebc66100104fe33366c2e240ff6461f6403171 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 26 Sep 2023 16:56:00 -0600 Subject: [PATCH 05/12] Update omega_tsgs_classic type --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index cf445d6d06..213ac12303 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -768,7 +768,7 @@ void test_gauss_seidel_streams_rank1( const scalar_t one = Kokkos::ArithTraits::one(); const scalar_t zero = Kokkos::ArithTraits::zero(); // two-stage with SpTRSV supports only omega = one - const double omega_tsgs_classic = one; + auto omega_tsgs_classic = one; for (int i = 0; i < nstreams; i++) { input_mat_v[i] = From 4b2e82c5ecc7c262462d7b2e4d8b6325d7aaa46c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 27 Sep 2023 08:45:14 -0600 Subject: [PATCH 06/12] Move sptrsv to streams --- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 40 +++++++------------ 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index f4c2ed3662..12e0e1d1ee 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -604,7 +604,6 @@ class TwostageGaussSeidel { column_view, rowmap_viewL, rowmap_viewUa), nnzL); } - // TODO: continue if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy; Kokkos::parallel_reduce( @@ -766,12 +765,10 @@ class TwostageGaussSeidel { if (sptrsv_algo != SPTRSVAlgorithm::SPTRSV_CUSPARSE) { // symbolic with CuSparse needs // values - sptrsv_symbolic(handle->get_gs_sptrsvL_handle(), rowmap_viewL, - crsmatL.graph.entries); - sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, - crsmatU.graph.entries); - Kokkos::fence(); // Ensure writes land before launching more work on - // non-default streams, TODO: remove + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvL_handle(), + rowmap_viewL, crsmatL.graph.entries); + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvU_handle(), + rowmap_viewU, crsmatU.graph.entries); } } } @@ -837,9 +834,6 @@ class TwostageGaussSeidel { << "TWO-STAGE GS::NUMERIC::INSERT LU TIME : " << tic << std::endl; timer.reset(); #endif - - my_exec_space.fence(); // Wait for non-default stream work to finish before - // sptrsv, TODO: remove if (!(gsHandle->isTwoStage())) { using namespace KokkosSparse::Experimental; auto sptrsv_algo = @@ -857,12 +851,10 @@ class TwostageGaussSeidel { rowmap_viewU, column_viewU, values_viewU); // now do symbolic - sptrsv_symbolic(handle->get_gs_sptrsvL_handle(), rowmap_viewL, - crsmatL.graph.entries, values_viewL); - sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, - crsmatU.graph.entries, values_viewU); - Kokkos::fence(); // Wait for sptrsv to finish before launching work on - // a stream, TODO: remove + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvL_handle(), + rowmap_viewL, crsmatL.graph.entries, values_viewL); + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvU_handle(), + rowmap_viewU, crsmatU.graph.entries, values_viewU); } } } @@ -1011,11 +1003,9 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); - my_exec_space - .fence(); // Wait for writes to R and Z to land, TODO: remove - sptrsv_solve(handle->get_gs_sptrsvL_handle(), crsmatL.graph.row_map, - crsmatL.graph.entries, crsmatL.values, Rj, Zj); - Kokkos::fence(); // TODO: call sptrsv_solve on stream and remove + sptrsv_solve(my_exec_space, handle->get_gs_sptrsvL_handle(), + crsmatL.graph.row_map, crsmatL.graph.entries, + crsmatL.values, Rj, Zj); } } else { using namespace KokkosSparse::Experimental; @@ -1028,11 +1018,9 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); - my_exec_space - .fence(); // Wait for writes to R and Z to land, TODO: remove - sptrsv_solve(handle->get_gs_sptrsvU_handle(), crsmatU.graph.row_map, - crsmatU.graph.entries, crsmatU.values, Rj, Zj); - Kokkos::fence(); // TODO: call sptrsv_solve on stream and remove + sptrsv_solve(my_exec_space, handle->get_gs_sptrsvU_handle(), + crsmatU.graph.row_map, crsmatU.graph.entries, + crsmatU.values, Rj, Zj); } } From df23b0ed5f2b6727da114be4548d78fad5e2fc28 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 27 Sep 2023 09:59:19 -0600 Subject: [PATCH 07/12] Add lightweighthint --- .../impl/KokkosSparse_twostage_gauss_seidel_impl.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 12e0e1d1ee..05c109b7db 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -712,7 +712,10 @@ class TwostageGaussSeidel { // extract local L & U structures (for computing (L+D)^{-1} or (D+U)^{-1}) using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "entriesLU", range_policy(my_exec_space, 0, num_rows), + "entriesLU", + Kokkos::Experimental::require( + range_policy(my_exec_space, 0, num_rows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), TSGS_Functor_t( two_stage, compact_form, num_rows, rowmap_view, column_view, rowmap_viewL, column_viewL, rowmap_viewU, column_viewU, @@ -821,7 +824,10 @@ class TwostageGaussSeidel { // extract local L, D & U matrices using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "valueLU", range_policy(my_exec_space, 0, num_rows), + "valueLU", + Kokkos::Experimental::require( + range_policy(my_exec_space, 0, num_rows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), TSGS_Functor_t(two_stage, compact_form, diagos_given, num_rows, rowmap_view, column_view, values_view, d_invert_view, rowmap_viewL, values_viewL, viewD, rowmap_viewU, From 77581022c97fef65ec898ab03ed56f889d6d3b64 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 3 Oct 2023 11:13:17 -0600 Subject: [PATCH 08/12] WIP, need twostage changes before proceeding --- .../KokkosSparse_cluster_gauss_seidel_impl.hpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 501e71e3e7..a422c96b5f 100644 --- a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -528,6 +528,7 @@ class ClusterGaussSeidel { // symmetric and non-symmetric input cases. rowmap_t sym_xadj; colinds_t sym_adj; + // TODO: pass my_exec_space into KokkosGraph kernels if (!this->is_symmetric) { KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< in_rowmap_t, in_colinds_t, rowmap_t, colinds_t, MyExecSpace>( @@ -735,7 +736,8 @@ class ClusterGaussSeidel { }; void initialize_numeric() { - auto gsHandle = get_gs_handle(); + auto gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); if (!gsHandle->is_symbolic_called()) { this->initialize_symbolic(); } @@ -753,10 +755,11 @@ class ClusterGaussSeidel { scalar_persistent_work_view_t inverse_diagonal( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Aii^-1"), num_rows); nnz_lno_t rows_per_team = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), num_rows); + suggested_team_size, my_exec_space.concurrency(), num_rows); if (have_diagonal_given) { - Kokkos::deep_copy(inverse_diagonal, this->given_inverse_diagonal); + Kokkos::deep_copy(my_exec_space, inverse_diagonal, + this->given_inverse_diagonal); } else { // extract inverse diagonal from matrix Get_Matrix_Diagonals gmd(this->row_map, this->entries, this->values, @@ -764,7 +767,8 @@ class ClusterGaussSeidel { if (gsHandle->use_teams()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::team_get_matrix_diagonals", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd); } else { @@ -787,7 +791,8 @@ class ClusterGaussSeidel { nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, bool /*update_y_vector*/ = true) { - auto gsHandle = get_gs_handle(); + auto gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); size_type nnz = entries.extent(0); nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); @@ -796,6 +801,7 @@ class ClusterGaussSeidel { color_t numColors = gsHandle->get_num_colors(); + // TODO: continue after twostage merges if (init_zero_x_vector) { KokkosKernels::Impl::zero_vector( num_cols, x_lhs_output_vec); From ec2fa4868fd8c4e4906bf0528fa6d13713a178d2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 9 Oct 2023 09:47:06 -0600 Subject: [PATCH 09/12] Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (Cluster) - Note: everything except for KokkosGraph and graph clustering runs on streams. --- docs/developer/apidocs/sparse.rst | 4 +- ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 95 ++++++++++++------- sparse/src/KokkosKernels_Handle.hpp | 53 ++++++++++- .../src/KokkosSparse_gauss_seidel_handle.hpp | 37 +++++--- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 36 ++++++- 5 files changed, 174 insertions(+), 51 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 3a55e50c8b..c538b75afb 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -60,9 +60,11 @@ block_spgemm gauss_seidel ------------ -.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) .. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) .. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) +.. doxygenfunction:: destroy_gs_handle() .. doxygenfunction:: gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) diff --git a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index a422c96b5f..5291b7b14b 100644 --- a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -89,7 +89,7 @@ class ClusterGaussSeidel { typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t; - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; typedef nnz_lno_t color_t; typedef Kokkos::View color_view_t; typedef Kokkos::Bitset bitset_t; @@ -519,6 +519,7 @@ class ClusterGaussSeidel { using raw_colinds_t = Kokkos::View>; auto gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif @@ -528,7 +529,8 @@ class ClusterGaussSeidel { // symmetric and non-symmetric input cases. rowmap_t sym_xadj; colinds_t sym_adj; - // TODO: pass my_exec_space into KokkosGraph kernels + // TODO: pass my_exec_space into KokkosGraph kernels. Requires + // https://github.com/kokkos/kokkos-kernels/issues/1879. if (!this->is_symmetric) { KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< in_rowmap_t, in_colinds_t, rowmap_t, colinds_t, MyExecSpace>( @@ -650,10 +652,14 @@ class ClusterGaussSeidel { #endif nnz_lno_persistent_work_view_t color_xadj; nnz_lno_persistent_work_view_t color_adj; + + // Wait for coloring to finish on its stream + using ColoringExecSpace = typename HandleType::HandleExecSpace; + ColoringExecSpace().fence(); KokkosKernels::Impl::create_reverse_map< typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - numClusters, numColors, colors, color_xadj, color_adj); + my_exec_space, numClusters, numColors, colors, color_xadj, color_adj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE MyExecSpace().fence(); std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl; @@ -662,6 +668,8 @@ class ClusterGaussSeidel { nnz_lno_persistent_work_host_view_t color_xadj_host( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Color xadj"), color_xadj.extent(0)); + // NOTE: the below deep copy ensures that we don't start running numeric + // on a non-default stream until our symbolic data has landed. Kokkos::deep_copy(color_xadj_host, color_xadj); gsHandle->set_color_xadj(color_xadj_host); gsHandle->set_color_adj(color_adj); @@ -753,7 +761,9 @@ class ClusterGaussSeidel { this->handle->get_suggested_team_size(suggested_vector_size); scalar_persistent_work_view_t inverse_diagonal( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Aii^-1"), num_rows); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "Aii^-1"), + num_rows); nnz_lno_t rows_per_team = this->handle->get_team_work_size( suggested_team_size, my_exec_space.concurrency(), num_rows); @@ -773,7 +783,7 @@ class ClusterGaussSeidel { gmd); } else { Kokkos::parallel_for("KokkosSparse::GaussSeidel::get_matrix_diagonals", - my_exec_space(0, num_rows), gmd); + range_policy_t(my_exec_space, 0, num_rows), gmd); } } gsHandle->set_inverse_diagonal(inverse_diagonal); @@ -801,10 +811,9 @@ class ClusterGaussSeidel { color_t numColors = gsHandle->get_num_colors(); - // TODO: continue after twostage merges if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector( - num_cols, x_lhs_output_vec); + KokkosKernels::Impl::zero_vector(my_exec_space, num_cols, + x_lhs_output_vec); } scalar_persistent_work_view_t inverse_diagonal = @@ -817,7 +826,7 @@ class ClusterGaussSeidel { this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t rows_per_team = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), num_rows); + suggested_team_size, my_exec_space.concurrency(), num_rows); // Get clusters per team. Round down to favor finer granularity, since // this is sensitive to load imbalance nnz_lno_t clusters_per_team = @@ -831,33 +840,34 @@ class ClusterGaussSeidel { color_adj, gsHandle->get_cluster_xadj(), gsHandle->get_cluster_adj(), inverse_diagonal, clusters_per_team, omega); - this->IterativeTeamPSGS(gs, numColors, h_color_xadj, suggested_team_size, - suggested_vector_size, numIter, apply_forward, - apply_backward); + this->IterativeTeamPSGS(my_exec_space, gs, numColors, h_color_xadj, + suggested_team_size, suggested_vector_size, + numIter, apply_forward, apply_backward); } else { PSGS gs( this->row_map, this->entries, this->values, x_lhs_output_vec, y_rhs_input_vec, color_adj, gsHandle->get_cluster_xadj(), gsHandle->get_cluster_adj(), omega, inverse_diagonal); - this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, - apply_backward); + this->IterativePSGS(my_exec_space, gs, numColors, h_color_xadj, numIter, + apply_forward, apply_backward); } } template - void IterativeTeamPSGS(TPSGS& gs, color_t numColors, + void IterativeTeamPSGS(MyExecSpace& my_exec_space, TPSGS& gs, + color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, nnz_lno_t team_size, nnz_lno_t vec_size, int num_iteration, bool apply_forward, bool apply_backward) { for (int i = 0; i < num_iteration; ++i) - this->DoTeamPSGS(gs, numColors, h_color_xadj, team_size, vec_size, - apply_forward, apply_backward); + this->DoTeamPSGS(my_exec_space, gs, numColors, h_color_xadj, team_size, + vec_size, apply_forward, apply_backward); } template - void DoTeamPSGS(TPSGS& gs, color_t numColors, + void DoTeamPSGS(MyExecSpace& my_exec_space, TPSGS& gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, nnz_lno_t team_size, nnz_lno_t vec_size, bool apply_forward, bool apply_backward) { @@ -871,9 +881,12 @@ class ClusterGaussSeidel { gs._color_set_end = color_index_end; Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_PSGS::forward", - team_policy_t((overall_work + gs._clusters_per_team - 1) / - gs._clusters_per_team, - team_size, vec_size), + Kokkos::Experimental::require( + team_policy_t(my_exec_space, + (overall_work + gs._clusters_per_team - 1) / + gs._clusters_per_team, + team_size, vec_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } } @@ -888,10 +901,13 @@ class ClusterGaussSeidel { gs._color_set_begin = color_index_begin; gs._color_set_end = color_index_end; Kokkos::parallel_for( - "KokkosSparse::GaussSeidel::Team_PSGS::forward", - team_policy_t((overall_work + gs._clusters_per_team - 1) / - gs._clusters_per_team, - team_size, vec_size), + "KokkosSparse::GaussSeidel::Team_PSGS::backward", + Kokkos::Experimental::require( + team_policy_t(my_exec_space, + (overall_work + gs._clusters_per_team - 1) / + gs._clusters_per_team, + team_size, vec_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); if (i == 0) { break; @@ -901,17 +917,18 @@ class ClusterGaussSeidel { } template - void IterativePSGS(PSGS& gs, color_t numColors, + void IterativePSGS(MyExecSpace& my_exec_space, PSGS& gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { for (int i = 0; i < num_iteration; ++i) { - this->DoPSGS(gs, numColors, h_color_xadj, apply_forward, apply_backward); + this->DoPSGS(my_exec_space, gs, numColors, h_color_xadj, apply_forward, + apply_backward); } } template - void DoPSGS(PSGS& gs, color_t numColors, + void DoPSGS(MyExecSpace& my_exec_space, PSGS& gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj, bool apply_forward, bool apply_backward) { if (apply_forward) { @@ -920,10 +937,13 @@ class ClusterGaussSeidel { nnz_lno_t color_index_end = h_color_xadj(i + 1); gs._color_set_begin = color_index_begin; gs._color_set_end = color_index_end; - Kokkos::parallel_for("KokkosSparse::GaussSeidel::PSGS::forward", - Kokkos::RangePolicy( - 0, color_index_end - color_index_begin), - gs); + Kokkos::parallel_for( + "KokkosSparse::GaussSeidel::PSGS::forward", + Kokkos::Experimental::require( + Kokkos::RangePolicy( + my_exec_space, 0, color_index_end - color_index_begin), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); } } if (apply_backward && numColors) { @@ -932,10 +952,13 @@ class ClusterGaussSeidel { nnz_lno_t color_index_end = h_color_xadj(i + 1); gs._color_set_begin = color_index_begin; gs._color_set_end = color_index_end; - Kokkos::parallel_for("KokkosSparse::GaussSeidel::PSGS::backward", - Kokkos::RangePolicy( - 0, color_index_end - color_index_begin), - gs); + Kokkos::parallel_for( + "KokkosSparse::GaussSeidel::PSGS::backward", + Kokkos::Experimental::require( + Kokkos::RangePolicy( + my_exec_space, 0, color_index_end - color_index_begin), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); if (i == 0) { break; } diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 03cabdb09e..daf180f418 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -605,6 +605,7 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object + * @note: Caller is responsible for freeing this via destroy_gs_handle() * * @param handle_exec_space The execution space instance to execute kernels on. * @param num_streams The number of streams to allocate memory for. @@ -654,6 +655,7 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object + * @note: Caller is responsible for freeing this via destroy_gs_handle() * * @param gs_algorithm Specifies which algorithm to use: * @@ -749,7 +751,9 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gs handle object + * @note: Caller is responsible for freeing this via destroy_gs_handle() * + * @param handle_exec_space The execution space instance to execute kernels on. * @param clusterAlgo Specifies which clustering algorithm to use: * * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? @@ -774,15 +778,60 @@ class KokkosKernelsHandle { * backwards compatibility for SPGEMM and other use cases) */ // clang-format on - void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, + void create_gs_handle(const HandleExecSpace &handle_exec_space, + KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster, KokkosGraph::ColoringAlgorithm coloring_algorithm = KokkosGraph::COLORING_DEFAULT) { this->destroy_gs_handle(); this->is_owner_of_the_gs_handle = true; this->gsHandle = new ClusterGaussSeidelHandleType( - clusterAlgo, hint_verts_per_cluster, coloring_algorithm); + handle_exec_space, clusterAlgo, hint_verts_per_cluster, + coloring_algorithm); + } + + /** + * @brief Create a gs handle object + * @note: Caller is responsible for freeing this via destroy_gs_handle() + * + * @param clusterAlgo Specifies which clustering algorithm to use: + * + * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? + * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS + * ?? + * @param hint_verts_per_cluster Hint how many verticies to use per cluster + * @param coloring_algorithm Specifies which coloring algorithm to color the + * graph with: + * + * KokkosGraph::COLORING_DEFAULT Depends on + * execution space: COLORING_SERIAL on Kokkos::Serial; COLORING_EB on GPUs; + * COLORING_VBBIT + * on Kokkos::Sycl or elsewhere. KokkosGraph::COLORING_SERIAL Serial Greedy + * Coloring KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based + * Coloring with bit array KokkosGraph::COLORING_VBCS Vertex Based Color + * Set KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based + * Deterministic Coloring with bit array KokkosGraph::COLORING_EB Edge + * Based Coloring KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph + * Coloring (kept here for backwards compatibility for SPGEMM and other use + * cases) + */ + // clang-format on + void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, + nnz_lno_t hint_verts_per_cluster, + KokkosGraph::ColoringAlgorithm coloring_algorithm = + KokkosGraph::COLORING_DEFAULT) { + HandleExecSpace handle_exec_space; + create_gs_handle(handle_exec_space, clusterAlgo, hint_verts_per_cluster, + coloring_algorithm); } + + /** + * Free the dynamically allocated gs handle allocation. + */ void destroy_gs_handle() { if (is_owner_of_the_gs_handle && this->gsHandle != NULL) { delete this->gsHandle; diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 0b47b3e92c..ad7ae87be7 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -95,6 +95,7 @@ class GaussSeidelHandle { bool called_symbolic; bool called_numeric; + bool execution_space_is_set; int suggested_vector_size; int suggested_team_size; @@ -112,6 +113,7 @@ class GaussSeidelHandle { numColors(0), called_symbolic(false), called_numeric(false), + execution_space_is_set(false), suggested_vector_size(0), suggested_team_size(0) {} @@ -125,6 +127,7 @@ class GaussSeidelHandle { numColors(0), called_symbolic(false), called_numeric(false), + execution_space_is_set(false), suggested_vector_size(0), suggested_team_size(0) {} @@ -150,8 +153,7 @@ class GaussSeidelHandle { template void set_execution_space(const ExecSpaceIn exec_space_in) { - static bool is_set = false; - if (!is_set) { + if (!execution_space_is_set) { static_assert(std::is_same::value, "The type of exec_space_in should be the same as " "GaussSeidelHandle::HandleExecSpace"); @@ -163,7 +165,6 @@ class GaussSeidelHandle { "without multiple handles. Please create a new handle via " "create_gs_handle.\n"); } - is_set = true; } void set_algorithm_type(const GSAlgorithm sgs_algo) { @@ -487,7 +488,7 @@ class ClusterGaussSeidelHandle typedef GaussSeidelHandle GSHandle; - typedef ExecutionSpace HandleExecSpace; + using HandleExecSpace = typename GSHandle::HandleExecSpace; typedef TemporaryMemorySpace HandleTempMemorySpace; typedef PersistentMemorySpace HandlePersistentMemorySpace; @@ -544,15 +545,12 @@ class ClusterGaussSeidelHandle nnz_lno_persistent_work_view_t vert_clusters; public: - /** - * \brief Default constructor. - */ - - // Constructor for cluster-coloring based GS and SGS - ClusterGaussSeidelHandle(ClusteringAlgorithm cluster_algo_, + // Constructors for cluster-coloring based GS and SGS + ClusterGaussSeidelHandle(GSHandle gs_handle, + ClusteringAlgorithm cluster_algo_, nnz_lno_t cluster_size_, KokkosGraph::ColoringAlgorithm coloring_algo_) - : GSHandle(GS_CLUSTER), + : GSHandle(gs_handle), cluster_algo(cluster_algo_), cluster_size(cluster_size_), coloring_algo(coloring_algo_), @@ -561,6 +559,23 @@ class ClusterGaussSeidelHandle cluster_adj(), vert_clusters() {} + /** + * \brief Default constructor. + */ + ClusterGaussSeidelHandle(ClusteringAlgorithm cluster_algo_, + nnz_lno_t cluster_size_, + KokkosGraph::ColoringAlgorithm coloring_algo_) + : ClusterGaussSeidelHandle(GSHandle(GS_CLUSTER), cluster_algo_, + cluster_size_, coloring_algo_) {} + + ClusterGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + ClusteringAlgorithm cluster_algo_, + nnz_lno_t cluster_size_, + KokkosGraph::ColoringAlgorithm coloring_algo_) + : ClusterGaussSeidelHandle( + GSHandle(handle_exec_space, n_streams, GS_CLUSTER), cluster_algo_, + cluster_size_, coloring_algo_) {} + void set_cluster_size(nnz_lno_t cs) { this->cluster_size = cs; } nnz_lno_t get_cluster_size() const { return this->cluster_size; } diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 213ac12303..95d71b4458 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -759,6 +759,7 @@ void test_gauss_seidel_streams_rank1( std::vector kh_psgs_v(nstreams); std::vector kh_tsgs_v(nstreams); std::vector kh_tsgs_classic_v(nstreams); + std::vector kh_cluster_v(nstreams); std::vector input_mat_v(nstreams); std::vector solution_x_v(nstreams); std::vector x_vector_v(nstreams); @@ -801,6 +802,7 @@ void test_gauss_seidel_streams_rank1( kh_tsgs_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. kh_tsgs_classic_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_psgs_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); kh_tsgs_v[i].create_gs_handle(instances[i], nstreams, GS_TWOSTAGE, @@ -810,7 +812,6 @@ void test_gauss_seidel_streams_rank1( coloringAlgo); kh_tsgs_classic_v[i].set_gs_twostage(false, input_mat_v[i].numRows()); } - int apply_count = 3; // test symmetric, forward, backward //*** Point-coloring version **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { @@ -829,6 +830,39 @@ void test_gauss_seidel_streams_rank1( } } + //*** Cluster-coloring version **** + int clusterSizes[3] = {2, 5, 34}; + std::vector clusteringAlgos = {CLUSTER_MIS2, + CLUSTER_BALLOON}; + for (int csize = 0; csize < 3; csize++) { + for (auto clusterAlgo : clusteringAlgos) { + for (int i = 0; i < nstreams; i++) { + kh_cluster_v[i] = + KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_cluster_v[i].create_gs_handle(instances[i], clusterAlgo, + clusterSizes[csize], coloringAlgo); + } + + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) { + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + instances[i].fence(); + } + + run_gauss_seidel_streams(instances, kh_cluster_v, input_mat_v, + x_vector_v, y_vector_v, symmetric, m_omega, + apply_type, nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]); + } + } + for (int i = 0; i < nstreams; i++) kh_cluster_v[i].destroy_gs_handle(); + } + } + //*** Two-stage version **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { for (int i = 0; i < nstreams; i++) From a6b148edc41fe452d589321ffea93bfc2c74abf6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 10 Oct 2023 08:02:04 -0600 Subject: [PATCH 10/12] sparse/unit_test: Reduce runtime and add diag output --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 95d71b4458..da9a8fe415 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -856,7 +856,10 @@ void test_gauss_seidel_streams_rank1( KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, x_vector_v[i]); mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); - EXPECT_LT(result_norm_res, initial_norm_res_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "with (clusterSize:" << clusterSizes[csize] + << ", clusterAlgo:" << clusterAlgo + << ",coloringAlgo:" << coloringAlgo << ") on stream_idx: " << i; } } for (int i = 0; i < nstreams; i++) kh_cluster_v[i].destroy_gs_handle(); @@ -914,9 +917,6 @@ void test_gauss_seidel_streams_rank1( test_gauss_seidel_streams_rank1( \ 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ 1); \ - test_gauss_seidel_streams_rank1( \ - 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ - 2); \ test_gauss_seidel_streams_rank1( \ 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ 3); \ From 4209417964d4cd9a66b95a5c78034c75bf90119e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 26 Oct 2023 08:57:09 -0600 Subject: [PATCH 11/12] Fix constructors --- sparse/src/KokkosKernels_Handle.hpp | 6 ++++-- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index daf180f418..eda138cc02 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -754,6 +754,7 @@ class KokkosKernelsHandle { * @note: Caller is responsible for freeing this via destroy_gs_handle() * * @param handle_exec_space The execution space instance to execute kernels on. + * @param num_streams The number of streams to allocate memory for. * @param clusterAlgo Specifies which clustering algorithm to use: * * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? @@ -779,6 +780,7 @@ class KokkosKernelsHandle { */ // clang-format on void create_gs_handle(const HandleExecSpace &handle_exec_space, + int num_streams, KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster, KokkosGraph::ColoringAlgorithm coloring_algorithm = @@ -786,7 +788,7 @@ class KokkosKernelsHandle { this->destroy_gs_handle(); this->is_owner_of_the_gs_handle = true; this->gsHandle = new ClusterGaussSeidelHandleType( - handle_exec_space, clusterAlgo, hint_verts_per_cluster, + handle_exec_space, num_streams, clusterAlgo, hint_verts_per_cluster, coloring_algorithm); } @@ -825,7 +827,7 @@ class KokkosKernelsHandle { KokkosGraph::ColoringAlgorithm coloring_algorithm = KokkosGraph::COLORING_DEFAULT) { HandleExecSpace handle_exec_space; - create_gs_handle(handle_exec_space, clusterAlgo, hint_verts_per_cluster, + create_gs_handle(handle_exec_space, 1, clusterAlgo, hint_verts_per_cluster, coloring_algorithm); } diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index da9a8fe415..0ded238178 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -839,7 +839,7 @@ void test_gauss_seidel_streams_rank1( for (int i = 0; i < nstreams; i++) { kh_cluster_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. - kh_cluster_v[i].create_gs_handle(instances[i], clusterAlgo, + kh_cluster_v[i].create_gs_handle(instances[i], nstreams, clusterAlgo, clusterSizes[csize], coloringAlgo); } From dc39a891aea6b68c115e67bd1e035b9cc529aabb Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 Oct 2023 12:47:28 -0600 Subject: [PATCH 12/12] Fix sphinx build --- docs/developer/apidocs/sparse.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index c538b75afb..9174464473 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -62,7 +62,7 @@ gauss_seidel ------------ .. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) .. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) -.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) +.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) .. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) .. doxygenfunction:: destroy_gs_handle() .. doxygenfunction:: gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric)