From 336ee5fb6c37ea562949df3c17e2df5156df327c Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 12 Sep 2024 10:57:08 -0600 Subject: [PATCH] [4.4.01] Patches to 4.4.01 (#2327) * Restore size_t as default offset, in Tribits builds (#2313) If building KokkosKernels standalone, leave int as the default offset (this was the case since #2140). But if building KokkosKernels as a Trilinos/Tribits package, then make size_t the default offset because this is what the Tpetra stack currently uses. Signed-off-by: Brian Kelley * Improve crs/bsr sorting performance (#2293) * CRS sorting improvements - Wrote bulk sort/permutation based sorting for CRS graph, matrix, and BSR matrix (bulk = one large sort of all the entries, using row-major dense index as keys) - This is more performant for imbalanced entries per row - If matrix dimensions are too large to do bulk sort, fall back to sorting within each row with a thread. * Add perf test for sort_crs_matrix * sort_crs: improve parallel labels * Work around kokkos issue 7036 * sort_crs: replace radix sort lambda with functor (Lambda segfaults with nvcc+openmp) --------- Signed-off-by: Brian Kelley * SpAdd handle: delete sort_option getter/setter (#2296) SpAdd handle was originally a copy-paste of the spgemm handle way back in #122, and included get_sort_option() and set_sort_option() from spgemm. But these try to use the member bool sort_option, which doesn't exist. Somehow these functions never produced compile errors until someone tried to call them. 
* Improve GH action to produce release artifacts (#2312) * coo2csr: add parens to function calls (#2318) * Update changelog * Update master_history.txt * .github/workflows: Group jobs under common github-AT2 name (#2320) * Update master_history.txt --------- Signed-off-by: Brian Kelley Co-authored-by: brian-kelley Co-authored-by: Damien L-G Co-authored-by: Carl Pearson Co-authored-by: Evan Harvey <57234914+e10harvey@users.noreply.github.com> --- .github/workflows/at2.yml | 29 + .github/workflows/bdw.yml | 22 +- .github/workflows/h100.yml | 23 +- .github/workflows/mi210.yml | 24 +- .github/workflows/release.yml | 36 +- .github/workflows/spr.yml | 25 +- CHANGELOG.md | 14 + cmake/kokkoskernels_eti_offsets.cmake | 18 +- common/src/KokkosKernels_SimpleUtils.hpp | 20 +- common/src/KokkosKernels_Utils.hpp | 6 + master_history.txt | 1 + perf_test/sparse/CMakeLists.txt | 9 + perf_test/sparse/KokkosSparse_sort_crs.cpp | 103 +++ sparse/impl/KokkosSparse_sort_crs_impl.hpp | 366 +++++++++++ sparse/src/KokkosSparse_SortCrs.hpp | 727 ++++++++------------- sparse/src/KokkosSparse_Utils.hpp | 13 + sparse/src/KokkosSparse_coo2crs.hpp | 2 +- sparse/src/KokkosSparse_spadd_handle.hpp | 4 - 18 files changed, 848 insertions(+), 594 deletions(-) create mode 100644 .github/workflows/at2.yml create mode 100644 perf_test/sparse/KokkosSparse_sort_crs.cpp create mode 100644 sparse/impl/KokkosSparse_sort_crs_impl.hpp diff --git a/.github/workflows/at2.yml b/.github/workflows/at2.yml new file mode 100644 index 0000000000..042ad27a93 --- /dev/null +++ b/.github/workflows/at2.yml @@ -0,0 +1,29 @@ +name: github-AT2 + +on: + pull_request: + paths-ignore: + - '**/*.rst' + - '**/*.md' + - '**/requirements.txt' + - '**/*.py' + - 'docs/**' + types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + mi210: + uses: 
./.github/workflows/mi210.yml + h100: + uses: ./.github/workflows/h100.yml + bdw: + uses: ./.github/workflows/bdw.yml + #spr: + #uses: ./.github/workflows/spr.yml \ No newline at end of file diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml index e4036d68b9..3db73f06b4 100644 --- a/.github/workflows/bdw.yml +++ b/.github/workflows/bdw.yml @@ -1,25 +1,7 @@ -name: github-BDW +name: Reusable BDW workflow on: - pull_request: - paths-ignore: - - '**/*.rst' - - '**/*.md' - - '**/requirements.txt' - - '**/*.py' - - 'docs/**' - types: [ opened, reopened, synchronize ] - pull_request_review: - types: - - submitted - -permissions: - contents: none - -# Cancels any in progress 'workflow' associated with this PR -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + workflow_call jobs: # PR_BDW_GNU1020_OPENMP_LEFT_REL_NOETI: diff --git a/.github/workflows/h100.yml b/.github/workflows/h100.yml index 3473f8edd7..15665b673d 100644 --- a/.github/workflows/h100.yml +++ b/.github/workflows/h100.yml @@ -1,26 +1,7 @@ -name: github-H100 +name: Reusable H100 workflow -# Only allow manual runs until at2 runners are available. 
on: - pull_request: - paths-ignore: - - '**/*.rst' - - '**/*.md' - - '**/requirements.txt' - - '**/*.py' - - 'docs/**' - types: [ opened, reopened, synchronize ] - pull_request_review: - types: - - submitted - -permissions: - contents: none - -# Cancels any in progress 'workflow' associated with this PR -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + workflow_call jobs: PR_HOPPER90_CUDA1180_CUDA_LEFT_RIGHT_REL: diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index 70b91a908c..c9fc4a6aed 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -1,25 +1,7 @@ -name: github-MI210 +name: Reusable MI210 workflow -on: - pull_request: - paths-ignore: - - '**/*.rst' - - '**/*.md' - - '**/requirements.txt' - - '**/*.py' - - 'docs/**' - types: [ opened, reopened, synchronize ] - pull_request_review: - types: - - submitted - -permissions: - contents: none - -# Cancels any in progress 'workflow' associated with this PR -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true +on: + workflow_call jobs: # PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_REL: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a7faca45fe..f21120e376 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -13,11 +13,11 @@ jobs: hashes: ${{ steps.hash.outputs.hashes }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Build artifacts run: | - git archive -o kokkos-kernels-${{ github.ref_name }}.zip HEAD - git archive -o kokkos-kernels-${{ github.ref_name }}.tar.gz HEAD + git archive --prefix=kokkos-kernels-${{ github.ref_name }}/ -o kokkos-kernels-${{ github.ref_name }}.zip HEAD + git archive --prefix=kokkos-kernels-${{ github.ref_name }}/ -o kokkos-kernels-${{ github.ref_name }}.tar.gz HEAD - name: Generate hashes shell: bash @@ -25,21 +25,14 
@@ jobs: run: | # sha256sum generates sha256 hash for all artifacts. # base64 -w0 encodes to base64 and outputs on a single line. - echo "hashes=$(sha256sum kokkos-kernels-${{ github.ref_name }}.zip kokkos-kernels-${{ github.ref_name }}.tar.gz | base64 -w0)" >> "$GITHUB_OUTPUT" + sha256sum kokkos-kernels-${{ github.ref_name }}.zip kokkos-kernels-${{ github.ref_name }}.tar.gz > kokkos-kernels-${{ github.ref_name }}-SHA-256.txt + echo "hashes=$(base64 -w0 kokkos-kernels-${{ github.ref_name }}-SHA-256.txt)" >> "$GITHUB_OUTPUT" - - name: Upload source code (zip) - uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5 + - name: Upload artifacts + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: - name: kokkos-kernels-${{ github.ref_name }}.zip - path: kokkos-kernels-${{ github.ref_name }}.zip - if-no-files-found: error - retention-days: 5 - - - name: Upload source code (tar.gz) - uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5 - with: - name: kokkos-kernels-${{ github.ref_name }}.tar.gz - path: kokkos-kernels-${{ github.ref_name }}.tar.gz + name: release-artifacts + path: kokkos-kernels-${{ github.ref_name }}* if-no-files-found: error retention-days: 5 @@ -65,19 +58,14 @@ jobs: runs-on: ubuntu-latest if: startsWith(github.ref, 'refs/tags/') steps: - - name: Download kokkos-kernels-${{ github.ref_name }}.zip + - name: Download artifacts uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: - name: kokkos-kernels-${{ github.ref_name }}.zip - - - name: Download kokkos-kernels-${{ github.ref_name }}.tar.gz - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 - with: - name: kokkos-kernels-${{ github.ref_name }}.tar.gz - + name: release-artifacts - name: Upload assets uses: softprops/action-gh-release@c062e08bd532815e2082a85e87e3ef29c3e6d191 # v2.0.8 with: files: | kokkos-kernels-${{ github.ref_name }}.zip 
kokkos-kernels-${{ github.ref_name }}.tar.gz + kokkos-kernels-${{ github.ref_name }}-SHA-256.txt diff --git a/.github/workflows/spr.yml b/.github/workflows/spr.yml index 8b2d504926..7f9136a699 100644 --- a/.github/workflows/spr.yml +++ b/.github/workflows/spr.yml @@ -1,26 +1,7 @@ -name: github-SPR +name: Reusable SPR workflow -# Only allow manual runs until at2 runners are available. -on: workflow_dispatch - #pull_request: - # paths-ignore: - # - '**/*.rst' - # - '**/*.md' - # - '**/requirements.txt' - # - '**/*.py' - # - 'docs/**' - # types: [ opened, reopened, synchronize ] - #pull_request_review: - # types: - # - submitted - -permissions: - contents: none - -# Cancels any in progress 'workflow' associated with this PR -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true +on: + workflow_call jobs: PR_SPR_ONEAPI202310_OPENMP_LEFT_MKLBLAS_MKLLAPACK_REL: diff --git a/CHANGELOG.md b/CHANGELOG.md index cefc116c83..343f815ed7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Change Log +## [4.4.01](https://github.com/kokkos/kokkos-kernels/tree/4.4.01) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.4.00...4.4.01) + +### Build System: +- Restore size_t as default offset, in Tribits builds [\#2313](https://github.com/kokkos/kokkos-kernels/pull/2313) + +### Enhancements: +- Improve crs/bsr sorting performance [\#2293](https://github.com/kokkos/kokkos-kernels/pull/2293) + +### Bug Fixes: +- SpAdd handle: delete sort_option getter/setter [\#2296](https://github.com/kokkos/kokkos-kernels/pull/2296) +- Improve GH action to produce release artifacts [\#2312](https://github.com/kokkos/kokkos-kernels/pull/2312) +- coo2csr: add parens to function calls [\#2318](https://github.com/kokkos/kokkos-kernels/pull/2318) + ## [4.4.00](https://github.com/kokkos/kokkos-kernels/tree/4.4.00) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.3.01...4.4.00) diff --git 
a/cmake/kokkoskernels_eti_offsets.cmake b/cmake/kokkoskernels_eti_offsets.cmake index 1cf02f1327..39531cabf3 100644 --- a/cmake/kokkoskernels_eti_offsets.cmake +++ b/cmake/kokkoskernels_eti_offsets.cmake @@ -1,5 +1,15 @@ -SET(KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT OFF) -SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI}) +IF(KOKKOSKERNELS_HAS_TRILINOS) + # In a Trilinos build, size_t is the default offset because this is what Tpetra uses + # TODO: update this when Tpetra can use different offsets + SET(KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI}) + SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT OFF) +ELSE() + # But in a standalone KokkosKernels build, int is the default offset type + # This provides the maximum TPL compatibility + SET(KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT OFF) + SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI}) +ENDIF() + SET(OFFSETS OFFSET_INT OFFSET_SIZE_T @@ -12,14 +22,14 @@ KOKKOSKERNELS_ADD_OPTION( INST_OFFSET_INT ${KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT} BOOL - "Whether to pre instantiate kernels for the offset type int. This option is KokkosKernels_INST_OFFSET_INT=OFF by default. Default: OFF" + "Whether to pre instantiate kernels for the offset type int. Default: ${KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT}" ) KOKKOSKERNELS_ADD_OPTION( INST_OFFSET_SIZE_T ${KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT} BOOL - "Whether to pre instantiate kernels for the offset type size_t. This option is KokkosKernels_INST_OFFSET_SIZE_T=ON by default. Default: ON" + "Whether to pre instantiate kernels for the offset type size_t. 
Default: ${KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT}" ) IF (KOKKOSKERNELS_INST_OFFSET_INT) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index 0ae29a2f50..51ff697bde 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -358,13 +358,19 @@ struct ReduceMaxFunctor { }; template -void kk_view_reduce_max(size_t num_elements, view_type view_to_reduce, +void kk_view_reduce_max(const MyExecSpace &exec, size_t num_elements, view_type view_to_reduce, typename view_type::non_const_value_type &max_reduction) { - typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce("KokkosKernels::Common::ReduceMax", my_exec_space(0, num_elements), + typedef Kokkos::RangePolicy policy_t; + Kokkos::parallel_reduce("KokkosKernels::Common::ReduceMax", policy_t(exec, 0, num_elements), ReduceMaxFunctor(view_to_reduce), max_reduction); } +template +void kk_view_reduce_max(size_t num_elements, view_type view_to_reduce, + typename view_type::non_const_value_type &max_reduction) { + kk_view_reduce_max(MyExecSpace(), num_elements, view_to_reduce, max_reduction); +} + // xorshift hash/pseudorandom function (supported for 32- and 64-bit integer // types only) template @@ -429,10 +435,14 @@ struct SequentialFillFunctor { val_type start; }; +template +void sequential_fill(const ExecSpace &exec, const V &v, typename V::non_const_value_type start = 0) { + Kokkos::parallel_for(Kokkos::RangePolicy(exec, 0, v.extent(0)), SequentialFillFunctor(v, start)); +} + template void sequential_fill(const V &v, typename V::non_const_value_type start = 0) { - Kokkos::parallel_for(Kokkos::RangePolicy(0, v.extent(0)), - SequentialFillFunctor(v, start)); + sequential_fill(typename V::execution_space(), v, start); } } // namespace Impl diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index a087002d31..f0add80c50 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ 
b/common/src/KokkosKernels_Utils.hpp @@ -1076,6 +1076,12 @@ void view_reduce_max(size_t num_elements, view_type view_to_reduce, kk_view_reduce_max(num_elements, view_to_reduce, max_reduction); } +template +void view_reduce_max(const MyExecSpace &exec, size_t num_elements, view_type view_to_reduce, + typename view_type::non_const_value_type &max_reduction) { + kk_view_reduce_max(exec, num_elements, view_to_reduce, max_reduction); +} + template struct ReduceRowSizeFunctor { const size_type *rowmap_view_begins; diff --git a/master_history.txt b/master_history.txt index 6a546fb885..438dd05803 100644 --- a/master_history.txt +++ b/master_history.txt @@ -27,3 +27,4 @@ tag: 4.2.01 date: 01/30/2024 master: f429f6ec release: bcf9854b tag: 4.3.00 date: 04/03/2024 master: afd65f03 release: ebbf4b78 tag: 4.3.01 date: 05/07/2024 master: 1b0a15f5 release: 58785c1b tag: 4.4.00 date: 08/08/2024 master: d1a91b8a release: 1145f529 +tag: 4.4.01 date: 09/05/2024 master: 0608a337 release: a360d003 diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index ef0bf7d995..514ef0ed82 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -116,6 +116,15 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosSparse_mdf.cpp ) +# For the sake of build times, don't build this CRS sorting perf test by default. +# It can be enabled if needed by setting -DKokkosKernels_ENABLE_SORT_CRS_PERFTEST=ON. +if (KokkosKernels_ENABLE_SORT_CRS_PERFTEST) + KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_sort_crs + SOURCES KokkosSparse_sort_crs.cpp +) +endif () + if (KokkosKernels_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( sparse_par_ilut diff --git a/perf_test/sparse/KokkosSparse_sort_crs.cpp b/perf_test/sparse/KokkosSparse_sort_crs.cpp new file mode 100644 index 0000000000..cd3ed91521 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_sort_crs.cpp @@ -0,0 +1,103 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include "KokkosKernels_config.h" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_SortCrs.hpp" + +using perf_test::CommonInputParams; + +struct LocalParams { + std::string mtxFile; +}; + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Required] --mtx :: matrix to sort\n"; + std::cerr << "\t[Optional] --repeat :: how many times to repeat sorting\n"; +} + +int parse_inputs(LocalParams& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_str(i, argc, argv, "--mtx", params.mtxFile)) { + ++i; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} + +template +void run_experiment(int argc, char** argv, const CommonInputParams& common_params) { + using namespace KokkosSparse; + + using mem_space = typename exec_space::memory_space; + using device_t = typename Kokkos::Device; + using size_type = default_size_type; + using lno_t = default_lno_t; + using scalar_t = default_scalar; + using crsMat_t = KokkosSparse::CrsMatrix; + + using graph_t = typename crsMat_t::StaticCrsGraphType; + + LocalParams params; + if (parse_inputs(params, argc, argv)) return; + + crsMat_t A = KokkosSparse::Impl::read_kokkos_crst_matrix(params.mtxFile.c_str()); + std::cout << "Loaded matrix: " << A.numRows() << "x" << 
A.numCols() << " with " << A.nnz() << " entries.\n"; + // This first sort call serves as a warm-up + KokkosSparse::sort_crs_matrix(A); + lno_t m = A.numRows(); + lno_t n = A.numCols(); + auto rowmapHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + typename crsMat_t::index_type shuffledEntries("shuffled entries", A.nnz()); + // Randomly shuffle the entries within each row, so that the rows aren't + // already sorted. Leave the values alone; this changes the matrix numerically + // but this doesn't affect sorting. + for (lno_t i = 0; i < m; i++) { + std::random_shuffle(entriesHost.data() + rowmapHost(i), entriesHost.data() + rowmapHost(i + 1)); + } + Kokkos::deep_copy(shuffledEntries, entriesHost); + exec_space exec; + Kokkos::Timer timer; + double totalTime = 0; + for (int rep = 0; rep < common_params.repeat; rep++) { + Kokkos::deep_copy(exec, A.graph.entries, shuffledEntries); + exec.fence(); + timer.reset(); + KokkosSparse::sort_crs_matrix(exec, A); + exec.fence(); + totalTime += timer.seconds(); + } + std::cout << "Mean sort_crs_matrix time over " << common_params.repeat << " trials: "; + std::cout << totalTime / common_params.repeat << "\n"; +} + +#define KOKKOSKERNELS_PERF_TEST_NAME run_experiment +#include "KokkosKernels_perf_test_instantiation.hpp" +int main(int argc, char** argv) { return main_instantiation(argc, argv); } // main diff --git a/sparse/impl/KokkosSparse_sort_crs_impl.hpp b/sparse/impl/KokkosSparse_sort_crs_impl.hpp new file mode 100644 index 0000000000..5e18c3fd5c --- /dev/null +++ b/sparse/impl/KokkosSparse_sort_crs_impl.hpp @@ -0,0 +1,366 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef _KOKKOSSPARSE_SORTCRS_IMPL_HPP +#define _KOKKOSSPARSE_SORTCRS_IMPL_HPP + +#include "Kokkos_Core.hpp" +#include "Kokkos_Sort.hpp" +#include "KokkosKernels_Sorting.hpp" + +// Workaround for issue with Kokkos::Experimental::sort_by_key, with nvcc and OpenMP enabled +// (Kokkos issue #7036, fixed in 4.4 release) +// Once support for Kokkos < 4.4 is dropped, +// all code inside "ifdef KK_DISABLE_BULK_SORT_BY_KEY" can be deleted. +#if (KOKKOS_VERSION < 40400) && defined(KOKKOS_ENABLE_CUDA) +#define KK_DISABLE_BULK_SORT_BY_KEY +#endif + +namespace KokkosSparse { +namespace Impl { + +template +struct MatrixRadixSortFunctor { + using Offset = typename rowmap_t::non_const_value_type; + using Ordinal = typename entries_t::non_const_value_type; + using UnsignedOrdinal = typename std::make_unsigned::type; + using Scalar = typename values_t::non_const_value_type; + // The functor owns memory for entriesAux, so it can't have + // MemoryTraits + using entries_managed_t = Kokkos::View; + using values_managed_t = Kokkos::View; + + MatrixRadixSortFunctor(const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_) + : rowmap(rowmap_), entries(entries_), values(values_) { + entriesAux = entries_managed_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), entries.extent(0)); + valuesAux = values_managed_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"), values.extent(0)); + } + + KOKKOS_INLINE_FUNCTION void operator()(Ordinal i) const { + Offset rowStart = rowmap(i); + Offset rowEnd = rowmap(i + 1); + Ordinal rowNum = rowEnd - rowStart; + // Radix sort requires unsigned keys for comparison + KokkosKernels::SerialRadixSort2( + (UnsignedOrdinal*)entries.data() + rowStart, 
(UnsignedOrdinal*)entriesAux.data() + rowStart, + values.data() + rowStart, valuesAux.data() + rowStart, rowNum); + } + + rowmap_t rowmap; + entries_t entries; + entries_managed_t entriesAux; + values_t values; + values_managed_t valuesAux; +}; + +template +struct MatrixThreadSortFunctor { + using Offset = typename rowmap_t::non_const_value_type; + + MatrixThreadSortFunctor(Ordinal numRows_, const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_) + : numRows(numRows_), rowmap(rowmap_), entries(entries_), values(values_) {} + + KOKKOS_INLINE_FUNCTION void operator()(const typename Policy::member_type& t) const { + Ordinal i = t.league_rank() * t.team_size() + t.team_rank(); + if (i >= numRows) return; + Offset rowStart = rowmap(i); + Offset rowEnd = rowmap(i + 1); + auto rowEntries = Kokkos::subview(entries, Kokkos::make_pair(rowStart, rowEnd)); + auto rowValues = Kokkos::subview(values, Kokkos::make_pair(rowStart, rowEnd)); + Kokkos::Experimental::sort_by_key_thread(t, rowEntries, rowValues); + } + + Ordinal numRows; + rowmap_t rowmap; + entries_t entries; + values_t values; +}; + +template +struct GraphRadixSortFunctor { + using Offset = typename rowmap_t::non_const_value_type; + using Ordinal = typename entries_t::non_const_value_type; + using UnsignedOrdinal = typename std::make_unsigned::type; + // The functor owns memory for entriesAux, so it can't have + // MemoryTraits + using entries_managed_t = Kokkos::View; + + GraphRadixSortFunctor(const rowmap_t& rowmap_, const entries_t& entries_) : rowmap(rowmap_), entries(entries_) { + entriesAux = entries_managed_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), entries.extent(0)); + } + + KOKKOS_INLINE_FUNCTION void operator()(Ordinal i) const { + Offset rowStart = rowmap(i); + Offset rowEnd = rowmap(i + 1); + Ordinal rowNum = rowEnd - rowStart; + // Radix sort requires unsigned keys for comparison + KokkosKernels::SerialRadixSort((UnsignedOrdinal*)entries.data() + rowStart, + 
(UnsignedOrdinal*)entriesAux.data() + rowStart, rowNum); + } + + rowmap_t rowmap; + entries_t entries; + entries_managed_t entriesAux; +}; + +template +struct GraphThreadSortFunctor { + using Offset = typename rowmap_t::non_const_value_type; + + GraphThreadSortFunctor(Ordinal numRows_, const rowmap_t& rowmap_, const entries_t& entries_) + : numRows(numRows_), rowmap(rowmap_), entries(entries_) {} + + KOKKOS_INLINE_FUNCTION void operator()(const typename Policy::member_type& t) const { + Ordinal i = t.league_rank() * t.team_size() + t.team_rank(); + if (i >= numRows) return; + Offset rowStart = rowmap(i); + Offset rowEnd = rowmap(i + 1); + auto rowEntries = Kokkos::subview(entries, Kokkos::make_pair(rowStart, rowEnd)); + Kokkos::Experimental::sort_thread(t, rowEntries); + } + + Ordinal numRows; + rowmap_t rowmap; + entries_t entries; +}; + +template +struct MergedRowmapFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using c_rowmap_t = typename rowmap_t::const_type; + + // Precondition: entries are sorted within each row + MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, const entries_t& entries_) + : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if (rowEnd == rowBegin) { + // Row was empty to begin with + mergedCounts(row) = 0; + return; + } + // Otherwise, the first entry in the row exists + lno_t uniqueEntries = 1; + for (size_type j = rowBegin + 1; j < rowEnd; j++) { + if (entries(j - 1) != entries(j)) uniqueEntries++; + } + mergedCounts(row) = uniqueEntries; + lnewNNZ += uniqueEntries; + if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0; + } + + rowmap_t mergedCounts; + c_rowmap_t rowmap; + entries_t entries; +}; + +template +struct 
MatrixMergedEntriesFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using scalar_t = typename values_t::non_const_value_type; + + // Precondition: entries are sorted within each row + MatrixMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, const entries_t& entries_, + const values_t& values_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_, + const values_t& mergedValues_) + : rowmap(rowmap_), + entries(entries_), + values(values_), + mergedRowmap(mergedRowmap_), + mergedEntries(mergedEntries_), + mergedValues(mergedValues_) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if (rowEnd == rowBegin) { + // Row was empty to begin with, nothing to do + return; + } + // Otherwise, accumulate the value for each column + scalar_t accumVal = values(rowBegin); + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for (size_type j = rowBegin + 1; j < rowEnd; j++) { + if (accumCol == entries(j)) { + // accumulate + accumVal += values(j); + } else { + // write out and reset + mergedValues(insertPos) = accumVal; + mergedEntries(insertPos) = accumCol; + insertPos++; + accumVal = values(j); + accumCol = entries(j); + } + } + // always left with the last unique entry + mergedValues(insertPos) = accumVal; + mergedEntries(insertPos) = accumCol; + } + + typename rowmap_t::const_type rowmap; + entries_t entries; + values_t values; + rowmap_t mergedRowmap; + entries_t mergedEntries; + values_t mergedValues; +}; + +template +struct GraphMergedEntriesFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + + // Precondition: entries are sorted within each row + GraphMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, const entries_t& entries_, + const rowmap_t& 
mergedRowmap_, const entries_t& mergedEntries_) + : rowmap(rowmap_), entries(entries_), mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if (rowEnd == rowBegin) { + // Row was empty to begin with, nothing to do + return; + } + // Otherwise, accumulate the value for each column + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for (size_type j = rowBegin + 1; j < rowEnd; j++) { + if (accumCol != entries(j)) { + // write out and reset + mergedEntries(insertPos) = accumCol; + insertPos++; + accumCol = entries(j); + } + } + // always left with the last unique entry + mergedEntries(insertPos) = accumCol; + } + + typename rowmap_t::const_type rowmap; + entries_t entries; + rowmap_t mergedRowmap; + entries_t mergedEntries; +}; + +template +struct MaxScanFunctor { + using value_type = uint64_t; + + MaxScanFunctor(uint64_t ncols_, const Keys& keys_, const Entries& entries_) + : ncols(ncols_), keys(keys_), entries(entries_) {} + + KOKKOS_INLINE_FUNCTION + void init(uint64_t& update) const { update = 0; } + + KOKKOS_INLINE_FUNCTION + void join(uint64_t& update, const uint64_t& input) const { update = Kokkos::max(update, input); } + + KOKKOS_INLINE_FUNCTION + void operator()(Offset i, uint64_t& lmax, bool finalPass) const { + lmax = Kokkos::max(lmax, keys(i)); + if (finalPass) { + // lmax is the row containing entry i. + // The key is equivalent to the entry's linear + // index if the matrix were dense and row-major. 
+ keys(i) = lmax * ncols + entries(i); + } + } + + uint64_t ncols; + Keys keys; + Entries entries; +}; + +template +Kokkos::View generateBulkCrsKeys(const ExecSpace& exec, const Rowmap& rowmap, + const Entries& entries, + typename Entries::non_const_value_type ncols) { + using Offset = typename Rowmap::non_const_value_type; + using Ordinal = typename Entries::non_const_value_type; + Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + Kokkos::View keys("keys", entries.extent(0)); + Kokkos::parallel_for( + "CRS bulk sorting: mark row begins", Kokkos::RangePolicy(exec, 0, numRows), KOKKOS_LAMBDA(Ordinal i) { + Offset rowBegin = rowmap(i); + // Only mark the beginnings of non-empty rows. + // Otherwise multiple rows could try to update the same key. + if (rowmap(i + 1) != rowBegin) { + keys(rowBegin) = uint64_t(i); + } + }); + Kokkos::fence(); + Kokkos::parallel_scan("CRS bulk sorting: compute keys", Kokkos::RangePolicy(exec, 0, entries.extent(0)), + MaxScanFunctor(ncols, keys, entries)); + Kokkos::fence(); + return keys; +} + +#ifndef KK_DISABLE_BULK_SORT_BY_KEY +template +Kokkos::View computeEntryPermutation( + const ExecSpace& exec, const Rowmap& rowmap, const Entries& entries, typename Entries::non_const_value_type ncols) { + using Offset = typename Rowmap::non_const_value_type; + auto keys = generateBulkCrsKeys(exec, rowmap, entries, ncols); + Kokkos::View permutation(Kokkos::view_alloc(Kokkos::WithoutInitializing, "permutation"), + entries.extent(0)); + // This initializes permutation as the identity + KokkosKernels::Impl::sequential_fill(exec, permutation); + Kokkos::Experimental::sort_by_key(exec, keys, permutation); + return permutation; +} + +// Heuristic for choosing bulk sorting algorithm +template +bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) { + // Use bulk sort if matrix is highly imbalanced, + // OR the longest rows have many entries. 
+ return (maxDeg / 10 > avgDeg) || (maxDeg > 1024); +} +#endif + +template +void applyPermutation(const ExecSpace& exec, const Permutation& permutation, const InView& in, const OutView& out) { + Kokkos::parallel_for( + "CRS bulk sorting: permute", Kokkos::RangePolicy(exec, 0, in.extent(0)), + KOKKOS_LAMBDA(size_t i) { out(i) = in(permutation(i)); }); +} + +template +void applyPermutationBlockValues(const ExecSpace& exec, const Permutation& permutation, const InView& in, + const OutView& out, Ordinal blockSize) { + uint64_t scalarsPerBlock = (uint64_t)blockSize * blockSize; + if (in.extent(0) % scalarsPerBlock) + throw std::invalid_argument( + "sort_bsr_matrix: matrix values extent not divisible by graph entries " + "extent"); + Kokkos::parallel_for( + "BSR bulk sorting: permute", Kokkos::RangePolicy(exec, 0, in.extent(0)), KOKKOS_LAMBDA(size_t i) { + uint64_t blockIndex = i / scalarsPerBlock; + uint64_t offsetInBlock = i % scalarsPerBlock; + out(i) = in(permutation(blockIndex) * scalarsPerBlock + offsetInBlock); + }); +} + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index 455068b56f..1203cd244b 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -16,38 +16,11 @@ #ifndef _KOKKOSSPARSE_SORTCRS_HPP #define _KOKKOSSPARSE_SORTCRS_HPP -#include "Kokkos_Core.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_sort_crs_impl.hpp" +#include "KokkosSparse_Utils.hpp" namespace KokkosSparse { -// ---------------------------------- -// BSR matrix/graph sorting utilities -// ---------------------------------- - -// Sort a BRS matrix: within each row, sort entries ascending by column and -// permute the values accordingly. 
-template -void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, const entries_t& entries, const values_t& values); - -// Sort a BRS matrix on the given execution space instance: within each row, -// sort entries ascending by column and permute the values accordingly. -template -void sort_bsr_matrix(const execution_space& exec, const lno_t blockdim, const rowmap_t& rowmap, - const entries_t& entries, const values_t& values); - -// Sort a BRS matrix: within each row, sort entries ascending by column and -// permute the values accordingly. -template -void sort_bsr_matrix(const bsrMat_t& A); - -// Sort a BRS matrix on the given execution space instance: within each row, -// sort entries ascending by column and permute the values accordingly. -template -void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, const bsrMat_t& A); - // ---------------------------------- // CRS matrix/graph sorting utilities // ---------------------------------- @@ -63,269 +36,13 @@ void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, const bsrMa // duplicated entries in A, A is sorted and returned (instead of a newly // allocated matrix). 
-namespace Impl { - -template -struct SortCrsMatrixFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using scalar_t = typename values_t::non_const_value_type; - using team_mem = typename Kokkos::TeamPolicy::member_type; - // The functor owns memory for entriesAux, so it can't have - // MemoryTraits - using entries_managed_t = Kokkos::View; - using values_managed_t = Kokkos::View; - - SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_) - : rowmap(rowmap_), entries(entries_), values(values_) { - if (usingRangePol) { - entriesAux = entries_managed_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), entries.extent(0)); - valuesAux = values_managed_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"), values.extent(0)); - } - // otherwise, aux arrays won't be allocated (sorting in place) - } - - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - // Radix sort requires unsigned keys for comparison - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::SerialRadixSort2( - (unsigned_lno_t*)entries.data() + rowStart, (unsigned_lno_t*)entriesAux.data() + rowStart, - values.data() + rowStart, valuesAux.data() + rowStart, rowNum); - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { - size_type i = t.league_rank(); - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - KokkosKernels::TeamBitonicSort2(entries.data() + rowStart, - values.data() + rowStart, rowNum, t); - } - - rowmap_t rowmap; - entries_t entries; - entries_managed_t entriesAux; - values_t values; - values_managed_t valuesAux; -}; - -template -struct SortCrsGraphFunctor { - using size_type = typename rowmap_t::non_const_value_type; - 
using lno_t = typename entries_t::non_const_value_type; - using team_mem = typename Kokkos::TeamPolicy::member_type; - // The functor owns memory for entriesAux, so it can't have - // MemoryTraits - using entries_managed_t = Kokkos::View; - - SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_, const entries_t& entries_) - : rowmap(rowmap_), entries(entries_) { - if (usingRangePol) { - entriesAux = entries_managed_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), entries.extent(0)); - } - // otherwise, aux arrays won't be allocated (sorting in place) - } - - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - // Radix sort requires unsigned keys for comparison - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::SerialRadixSort((unsigned_lno_t*)entries.data() + rowStart, - (unsigned_lno_t*)entriesAux.data() + rowStart, rowNum); - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { - size_type i = t.league_rank(); - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - KokkosKernels::TeamBitonicSort(entries.data() + rowStart, rowNum, t); - } - - rowmap_t rowmap; - entries_t entries; - entries_managed_t entriesAux; -}; - -template -struct MergedRowmapFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using c_rowmap_t = typename rowmap_t::const_type; - - // Precondition: entries are sorted within each row - MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, const entries_t& entries_) - : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if (rowEnd == 
rowBegin) { - // Row was empty to begin with - mergedCounts(row) = 0; - return; - } - // Otherwise, the first entry in the row exists - lno_t uniqueEntries = 1; - for (size_type j = rowBegin + 1; j < rowEnd; j++) { - if (entries(j - 1) != entries(j)) uniqueEntries++; - } - mergedCounts(row) = uniqueEntries; - lnewNNZ += uniqueEntries; - if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0; - } - - rowmap_t mergedCounts; - c_rowmap_t rowmap; - entries_t entries; -}; - -template -struct MatrixMergedEntriesFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using scalar_t = typename values_t::non_const_value_type; - - // Precondition: entries are sorted within each row - MatrixMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, const entries_t& entries_, - const values_t& values_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_, - const values_t& mergedValues_) - : rowmap(rowmap_), - entries(entries_), - values(values_), - mergedRowmap(mergedRowmap_), - mergedEntries(mergedEntries_), - mergedValues(mergedValues_) {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if (rowEnd == rowBegin) { - // Row was empty to begin with, nothing to do - return; - } - // Otherwise, accumulate the value for each column - scalar_t accumVal = values(rowBegin); - lno_t accumCol = entries(rowBegin); - size_type insertPos = mergedRowmap(row); - for (size_type j = rowBegin + 1; j < rowEnd; j++) { - if (accumCol == entries(j)) { - // accumulate - accumVal += values(j); - } else { - // write out and reset - mergedValues(insertPos) = accumVal; - mergedEntries(insertPos) = accumCol; - insertPos++; - accumVal = values(j); - accumCol = entries(j); - } - } - // always left with the last unique entry - mergedValues(insertPos) = accumVal; - mergedEntries(insertPos) = accumCol; - } - - 
typename rowmap_t::const_type rowmap; - entries_t entries; - values_t values; - rowmap_t mergedRowmap; - entries_t mergedEntries; - values_t mergedValues; -}; - -template -struct GraphMergedEntriesFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - - // Precondition: entries are sorted within each row - GraphMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, const entries_t& entries_, - const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_) - : rowmap(rowmap_), entries(entries_), mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_) {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if (rowEnd == rowBegin) { - // Row was empty to begin with, nothing to do - return; - } - // Otherwise, accumulate the value for each column - lno_t accumCol = entries(rowBegin); - size_type insertPos = mergedRowmap(row); - for (size_type j = rowBegin + 1; j < rowEnd; j++) { - if (accumCol != entries(j)) { - // write out and reset - mergedEntries(insertPos) = accumCol; - insertPos++; - accumCol = entries(j); - } - } - // always left with the last unique entry - mergedEntries(insertPos) = accumCol; - } - - typename rowmap_t::const_type rowmap; - entries_t entries; - rowmap_t mergedRowmap; - entries_t mergedEntries; -}; - -template -KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) { - T t = a; - a = b; - b = t; -} - -template -struct sort_bsr_functor { - using lno_t = typename entries_type::non_const_value_type; - - row_map_type rowmap; - entries_type entries; - values_type values; - const lno_t blocksize; - - sort_bsr_functor(row_map_type rowmap_, entries_type entries_, values_type values_, const lno_t blocksize_) - : rowmap(rowmap_), entries(entries_), values(values_), blocksize(blocksize_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - const lno_t rowStart = 
rowmap(i); - const lno_t rowSize = rowmap(i + 1) - rowStart; - auto* e = entries.data() + rowStart; - auto* v = values.data() + rowStart * blocksize; - bool done = false; - while (!done) { - done = true; - for (lno_t j = 1; j < rowSize; ++j) { - const lno_t jp = j - 1; - if (e[jp] <= e[j]) continue; - Impl::kk_swap(e[jp], e[j]); - auto const vb = v + j * blocksize; - auto const vbp = v + jp * blocksize; - for (lno_t k = 0; k < blocksize; ++k) // std::swap_ranges(vb, vb + blocksize, vbp); - Impl::kk_swap(vb[k], vbp[k]); - done = false; - } - } - } -}; - -} // namespace Impl - // Sort a CRS matrix: within each row, sort entries ascending by column. // At the same time, permute the values. template void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const entries_t& entries, - const values_t& values) { + const values_t& values, + typename entries_t::non_const_value_type numCols = + Kokkos::ArithTraits::max()) { static_assert(Kokkos::SpaceAccessibility::accessible, "sort_crs_matrix: rowmap_t is not accessible from the given execution " "space"); @@ -338,71 +55,156 @@ void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const static_assert(!std::is_const_v, "sort_crs_matrix: entries_t must not be const-valued"); static_assert(!std::is_const_v, "sort_crs_matrix: value_t must not be const-valued"); - using lno_t = typename entries_t::non_const_value_type; - using team_pol = Kokkos::TeamPolicy; - bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if (numRows == 0) return; - Impl::SortCrsMatrixFunctor funct(useRadix, rowmap, entries, values); - if (useRadix) { - Kokkos::parallel_for("sort_crs_matrix", Kokkos::RangePolicy(exec, 0, numRows), funct); + using Ordinal = typename entries_t::non_const_value_type; + // This early return condition covers having 0 or 1 entries, + // which is also implied by having 0 rows or 0 columns. 
+ // If only 1 entry, the matrix is already sorted. + if (entries.extent(0) <= size_t(1)) { + return; + } + Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + // On CPUs, use a sequential radix sort within each row. + Kokkos::parallel_for("sort_crs_matrix[CPU,radix]", + Kokkos::RangePolicy>(exec, 0, numRows), + Impl::MatrixRadixSortFunctor(rowmap, entries, values)); } else { - // Try to get teamsize to be largest power of 2 not greater than avg entries - // per row - // TODO (probably important for performnce): add thread-level sort also, and - // use that for small avg degree. But this works for now. - lno_t idealTeamSize = 1; - lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while (idealTeamSize < avgDeg / 2) { - idealTeamSize *= 2; + // On GPUs: + // If the matrix is highly imbalanced, or has long rows AND the dimensions + // are not too large to do one large bulk sort, do that. Otherwise, sort + // using one Kokkos thread per row. 
+ Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; +#ifndef KK_DISABLE_BULK_SORT_BY_KEY + Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap); + bool useBulkSort = false; + if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { + // Calculate the true number of columns if user didn't pass it in + if (numCols == Kokkos::ArithTraits::max()) { + KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, numCols); + numCols++; + } + uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols; + useBulkSort = maxBulkKey / numRows == (uint64_t)numCols; + } + if (useBulkSort) { + auto permutation = KokkosSparse::Impl::computeEntryPermutation(exec, rowmap, entries, numCols); + // Permutations cannot be done in-place + Kokkos::View origValues( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"), values.extent(0)); + Kokkos::View origEntries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "origEntries"), entries.extent(0)); + Kokkos::deep_copy(exec, origValues, values); + Kokkos::deep_copy(exec, origEntries, entries); + KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries, entries); + KokkosSparse::Impl::applyPermutation(exec, permutation, origValues, values); + } else +#else + (void)numCols; +#endif + { + using TeamPol = Kokkos::TeamPolicy; + // Can't use bulk sort approach as matrix dimensions are too large. + // Fall back to parallel thread-level sort within each row. 
+ Ordinal vectorLength = 1; + while (vectorLength < avgDeg / 2) { + vectorLength *= 2; + } + if (vectorLength > TeamPol ::vector_length_max()) vectorLength = TeamPol ::vector_length_max(); + Impl::MatrixThreadSortFunctor funct(numRows, rowmap, entries, + values); + Ordinal teamSize = TeamPol(exec, 1, 1, vectorLength).team_size_recommended(funct, Kokkos::ParallelForTag()); + Kokkos::parallel_for("sort_crs_matrix[GPU,bitonic]", + TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct); } - team_pol temp(exec, numRows, 1); - lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); - lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_matrix", team_pol(exec, numRows, teamSize), funct); } } template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) { - sort_crs_matrix(execution_space(), rowmap, entries, values); +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values, + typename entries_t::const_value_type numCols = + Kokkos::ArithTraits::max()) { + sort_crs_matrix(execution_space(), rowmap, entries, values, numCols); } template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) { - sort_crs_matrix(typename entries_t::execution_space(), rowmap, entries, values); +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values, + typename entries_t::const_value_type numCols = + Kokkos::ArithTraits::max()) { + sort_crs_matrix(typename entries_t::execution_space(), rowmap, entries, values, numCols); } template void sort_crs_matrix(const typename crsMat_t::execution_space& exec, const crsMat_t& A) { - sort_crs_matrix(exec, A.graph.row_map, A.graph.entries, A.values); + sort_crs_matrix(exec, A.graph.row_map, A.graph.entries, A.values, A.numCols()); } template void sort_crs_matrix(const crsMat_t& A) { - sort_crs_matrix(typename 
crsMat_t::execution_space(), A.graph.row_map, A.graph.entries, A.values); + sort_crs_matrix(typename crsMat_t::execution_space(), A.graph.row_map, A.graph.entries, A.values, A.numCols()); } // Sort a BRS matrix: within each row, sort entries ascending by column and // permute the values accordingly. -template -void sort_bsr_matrix(const execution_space& exec, const lno_t blockdim, const rowmap_t& rowmap, - const entries_t& entries, const values_t& values) { - // TODO: this is O(N^2) mock for debugging - do regular implementation based - // on Radix/Bitonic sort (like CSR) IDEA: maybe we need only one general - // Radix2/Bitonic2 and CSR sorting may call it with blockSize=1 ? - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if (numRows == 0) return; - const lno_t blocksize = blockdim * blockdim; - - assert(values.extent(0) == entries.extent(0) * blocksize); - Impl::sort_bsr_functor bsr_sorter(rowmap, entries, values, blocksize); - Kokkos::parallel_for("sort_bsr_matrix", Kokkos::RangePolicy(exec, 0, numRows), bsr_sorter); -} - -template -void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, const entries_t& entries, const values_t& values) { - sort_bsr_matrix(execution_space(), blockdim, rowmap, entries, values); +template +void sort_bsr_matrix(const execution_space& exec, Ordinal blockSize, const rowmap_t& rowmap, const entries_t& entries, + const values_t& values, + typename entries_t::non_const_value_type numCols = + Kokkos::ArithTraits::max()) { + static_assert(std::is_same_v, + "sort_bsr_matrix: Ordinal type must match nonconst value type of " + "entries_t (default template parameter)"); + if (entries.extent(0) <= size_t(1)) { + return; + } + Ordinal numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; + if (numCols == Kokkos::ArithTraits::max()) { + KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, numCols); + numCols++; + } + uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols; + if (maxBulkKey / numRows != (uint64_t)numCols) + throw std::invalid_argument( + "sort_bsr_matrix: implementation requires that numRows * numCols is " + "representable in uint64_t"); +#ifdef KK_DISABLE_BULK_SORT_BY_KEY + using TeamPol = Kokkos::TeamPolicy; + using Offset = typename rowmap_t::non_const_value_type; + // Temporary workaround: do not use Kokkos::Experimental::sort_by_key, instead + // sort bulk keys one row at a time + auto keys = Impl::generateBulkCrsKeys(exec, rowmap, entries, numCols); + Kokkos::View permutation(Kokkos::view_alloc(Kokkos::WithoutInitializing, "permutation"), + entries.extent(0)); + KokkosKernels::Impl::sequential_fill(exec, permutation); + Ordinal vectorLength = 1; + Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while (vectorLength < avgDeg / 2) { + vectorLength *= 2; + } + if (vectorLength > TeamPol ::vector_length_max()) vectorLength = TeamPol ::vector_length_max(); + Impl::MatrixThreadSortFunctor funct( + numRows, rowmap, keys, permutation); + Ordinal teamSize = TeamPol(exec, 1, 1, vectorLength).team_size_recommended(funct, Kokkos::ParallelForTag()); + Kokkos::parallel_for("sort_bulk_keys_by_row[GPU,bitonic]", + TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct); +#else + auto permutation = KokkosSparse::Impl::computeEntryPermutation(exec, rowmap, entries, numCols); +#endif + // Permutations cannot be done in-place + Kokkos::View origValues( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"), values.extent(0)); + Kokkos::View origEntries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "origEntries"), entries.extent(0)); + Kokkos::deep_copy(exec, origValues, values); + Kokkos::deep_copy(exec, origEntries, entries); + 
KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries, entries); + KokkosSparse::Impl::applyPermutationBlockValues(exec, permutation, origValues, values, blockSize); +} + +template +void sort_bsr_matrix(Ordinal blockdim, const rowmap_t& rowmap, const entries_t& entries, const values_t& values, + Ordinal numCols = Kokkos::ArithTraits::max()) { + sort_bsr_matrix(execution_space(), blockdim, rowmap, entries, values, numCols); } // Sort a BSR matrix (like CRS but single values are replaced with contignous @@ -413,7 +215,7 @@ void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, const bsrMa // directly sort_bsr_matrix( - exec, A.blockDim(), A.graph.row_map, A.graph.entries, A.values); + exec, A.blockDim(), A.graph.row_map, A.graph.entries, A.values, A.numCols()); } template @@ -423,9 +225,10 @@ void sort_bsr_matrix(const bsrMat_t& A) { // Sort a CRS graph: within each row, sort entries ascending by column. template -void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const entries_t& entries) { - using lno_t = typename entries_t::non_const_value_type; - using team_pol = Kokkos::TeamPolicy; +void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const entries_t& entries, + typename entries_t::non_const_value_type numCols = + Kokkos::ArithTraits::max()) { + using Ordinal = typename entries_t::non_const_value_type; static_assert(Kokkos::SpaceAccessibility::accessible, "sort_crs_graph: rowmap_t is not accessible from the given execution " "space"); @@ -433,27 +236,55 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const e "sort_crs_graph: entries_t is not accessible from the given execution " "space"); static_assert(!std::is_const_v, "sort_crs_graph: entries_t must not be const-valued"); - bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; - if (numRows == 0) return; - Impl::SortCrsGraphFunctor funct(useRadix, rowmap, entries); - if (useRadix) { - Kokkos::parallel_for("sort_crs_graph", Kokkos::RangePolicy(exec, 0, numRows), funct); + Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if (entries.extent(0) <= size_t(1)) { + return; + } + if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + // If on CPU, sort each row independently. Don't need to know numCols for + // this. + Kokkos::parallel_for("sort_crs_graph[CPU,radix]", + Kokkos::RangePolicy>(exec, 0, numRows), + Impl::GraphRadixSortFunctor(rowmap, entries)); } else { - // Try to get teamsize to be largest power of 2 less than or equal to - // half the entries per row. 0.5 * #entries is bitonic's parallelism within - // a row. - // TODO (probably important for performnce): add thread-level sort also, and - // use that for small avg degree. But this works for now. - lno_t idealTeamSize = 1; - lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while (idealTeamSize < avgDeg / 2) { - idealTeamSize *= 2; + // On GPUs: + // If the graph is highly imbalanced AND the dimensions are not too large + // to do one large bulk sort, do that. Otherwise, sort using one Kokkos + // thread per row. 
+ Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; +#ifndef KK_DISABLE_BULK_SORT_BY_KEY + Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap); + bool useBulkSort = false; + if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { + // Calculate the true number of columns if user didn't pass it in + if (numCols == Kokkos::ArithTraits::max()) { + KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, numCols); + numCols++; + } + uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols; + useBulkSort = maxBulkKey / numRows == (uint64_t)numCols; + } + if (useBulkSort) { + auto keys = KokkosSparse::Impl::generateBulkCrsKeys(exec, rowmap, entries, numCols); + Kokkos::Experimental::sort_by_key(exec, keys, entries); + } else +#else + (void)numCols; +#endif + { + using TeamPol = Kokkos::TeamPolicy; + // Fall back to thread-level sort within each row + Ordinal vectorLength = 1; + while (vectorLength < avgDeg / 2) { + vectorLength *= 2; + } + if (vectorLength > TeamPol ::vector_length_max()) vectorLength = TeamPol ::vector_length_max(); + + Impl::GraphThreadSortFunctor funct(numRows, rowmap, entries); + Ordinal teamSize = TeamPol(exec, 1, 1, vectorLength).team_size_recommended(funct, Kokkos::ParallelForTag()); + Kokkos::parallel_for("sort_crs_graph[GPU,bitonic]", + TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct); } - team_pol temp(exec, numRows, 1); - lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); - lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_graph", team_pol(exec, numRows, teamSize), funct); } } @@ -462,36 +293,38 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { sort_crs_graph(execution_space(), rowmap, entries); } -// This overload covers 2 cases, while allowing all template args to be deduced: -// - sort_crs_graph(exec, G) -// - sort_crs_graph(rowmap, entries) -template -void 
sort_crs_graph(const Arg1& a1, const Arg2& a2) { - if constexpr (Kokkos::is_execution_space_v) { - // a1 is an exec instance, a2 is a graph - sort_crs_graph(a1, a2.row_map, a2.entries); - } else if constexpr (Kokkos::is_view_v) { - // a1 is rowmap, a2 is entries - sort_crs_graph(typename Arg2::execution_space(), a1, a2); - } else { - static_assert(Arg1::doesnthavethisthing, - "sort_crs_graph(arg1, arg2): expect either (exec, G) or " - "(rowmap, entries)"); - } +template +typename std::enable_if_t> sort_crs_graph( + const rowmap_t& rowmap, const entries_t& entries, + typename entries_t::const_value_type& numCols = + Kokkos::ArithTraits::max()) { + sort_crs_graph(typename entries_t::execution_space(), rowmap, entries, numCols); +} + +template +typename std::enable_if_t> sort_crs_graph( + const execution_space& exec, const crsGraph_t& G, + typename crsGraph_t::entries_type::const_value_type& numCols = + Kokkos::ArithTraits::max()) { + sort_crs_graph(exec, G.row_map, G.entries, numCols); } template -void sort_crs_graph(const crsGraph_t& G) { - sort_crs_graph(typename crsGraph_t::execution_space(), G); +void sort_crs_graph(const crsGraph_t& G, + typename crsGraph_t::entries_type::const_value_type& numCols = + Kokkos::ArithTraits::max()) { + sort_crs_graph(typename crsGraph_t::execution_space(), G, numCols); } template void sort_and_merge_matrix(const exec_space& exec, const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, const values_t& values_in, rowmap_t& rowmap_out, - entries_t& entries_out, values_t& values_out) { + entries_t& entries_out, values_t& values_out, + typename entries_t::const_value_type& numCols = + Kokkos::ArithTraits::max()) { using nc_rowmap_t = typename rowmap_t::non_const_type; - using size_type = typename nc_rowmap_t::value_type; - using ordinal_t = typename entries_t::value_type; + using Offset = typename nc_rowmap_t::value_type; + using Ordinal = typename entries_t::value_type; using range_t = Kokkos::RangePolicy; 
static_assert(Kokkos::SpaceAccessibility::accessible, "sort_and_merge_matrix: rowmap_t is not accessible from the given " @@ -507,8 +340,8 @@ void sort_and_merge_matrix(const exec_space& exec, const typename rowmap_t::cons static_assert(!std::is_const_v, "sort_and_merge_matrix: value_t must not be const-valued"); - ordinal_t numRows = rowmap_in.extent(0) ? ordinal_t(rowmap_in.extent(0) - 1) : ordinal_t(0); - size_type nnz = entries_in.extent(0); + Ordinal numRows = rowmap_in.extent(0) ? Ordinal(rowmap_in.extent(0) - 1) : Ordinal(0); + Offset nnz = entries_in.extent(0); if (numRows == 0) { rowmap_out = typename rowmap_t::non_const_type("SortedMerged rowmap", rowmap_in.extent(0)); @@ -517,13 +350,13 @@ void sort_and_merge_matrix(const exec_space& exec, const typename rowmap_t::cons return; } - sort_crs_matrix(exec, rowmap_in, entries_in, values_in); + sort_crs_matrix(exec, rowmap_in, entries_in, values_in, numCols); // Count entries per row into a new rowmap, in terms of merges that can be // done nc_rowmap_t nc_rowmap_out(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "SortedMerged rowmap"), numRows + 1); - size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(exec, 0, numRows), + Offset numCompressedEntries = 0; + Kokkos::parallel_reduce("KokkosSparse::Impl::MergedRowmapFunctor", range_t(exec, 0, numRows), Impl::MergedRowmapFunctor(nc_rowmap_out, rowmap_in, entries_in), numCompressedEntries); if (nnz == numCompressedEntries) { @@ -555,7 +388,7 @@ void sort_and_merge_matrix(const exec_space& exec, const typename rowmap_t::cons values_out = values_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "SortedMerged values"), numCompressedEntries); // Compute merged entries and values - Kokkos::parallel_for(range_t(exec, 0, numRows), + Kokkos::parallel_for("KokkosSparse::Impl::MatrixMergedEntriesFunctor", range_t(exec, 0, numRows), Impl::MatrixMergedEntriesFunctor( rowmap_orig, entries_orig, values_orig, rowmap_out, entries_out, values_out)); } 
@@ -571,7 +404,8 @@ crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, c entries_t entries_out; values_t values_out; - sort_and_merge_matrix(exec, A.graph.row_map, A.graph.entries, A.values, rowmap_out, entries_out, values_out); + sort_and_merge_matrix(exec, A.graph.row_map, A.graph.entries, A.values, rowmap_out, entries_out, values_out, + A.numCols()); return crsMat_t("SortedMerged", A.numRows(), A.numCols(), values_out.extent(0), values_out, rowmap_out, entries_out); } @@ -584,23 +418,29 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) { template void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, const values_t& values_in, rowmap_t& rowmap_out, entries_t& entries_out, - values_t& values_out) { - sort_and_merge_matrix(exec_space(), rowmap_in, entries_in, values_in, rowmap_out, entries_out, values_out); + values_t& values_out, + typename entries_t::const_value_type& numCols = + Kokkos::ArithTraits::max()) { + sort_and_merge_matrix(exec_space(), rowmap_in, entries_in, values_in, rowmap_out, entries_out, values_out, numCols); } template void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, const values_t& values_in, rowmap_t& rowmap_out, entries_t& entries_out, - values_t& values_out) { + values_t& values_out, + typename entries_t::const_value_type& numCols = + Kokkos::ArithTraits::max()) { sort_and_merge_matrix(typename entries_t::execution_space(), rowmap_in, entries_in, values_in, rowmap_out, - entries_out, values_out); + entries_out, values_out, numCols); } template void sort_and_merge_graph(const exec_space& exec, const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, rowmap_t& rowmap_out, entries_t& entries_out) { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::value_type; + const entries_t& entries_in, rowmap_t& rowmap_out, entries_t& entries_out, + 
typename entries_t::const_value_type& numCols = + Kokkos::ArithTraits::max()) { + using Offset = typename rowmap_t::non_const_value_type; + using Ordinal = typename entries_t::value_type; using range_t = Kokkos::RangePolicy; using nc_rowmap_t = typename rowmap_t::non_const_type; static_assert(Kokkos::SpaceAccessibility::accessible, @@ -612,19 +452,19 @@ void sort_and_merge_graph(const exec_space& exec, const typename rowmap_t::const static_assert(!std::is_const_v, "sort_and_merge_graph: entries_t must not be const-valued"); - lno_t numRows = rowmap_in.extent(0) ? rowmap_in.extent(0) - 1 : 0; + Ordinal numRows = rowmap_in.extent(0) ? rowmap_in.extent(0) - 1 : 0; if (numRows == 0) { rowmap_out = typename rowmap_t::non_const_type("SortedMerged rowmap", rowmap_in.extent(0)); entries_out = entries_t(); return; } // Sort in place - sort_crs_graph(exec, rowmap_in, entries_in); + sort_crs_graph(exec, rowmap_in, entries_in, numCols); // Count entries per row into a new rowmap, in terms of merges that can be // done nc_rowmap_t nc_rowmap_out(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "SortedMerged rowmap"), numRows + 1); - size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(exec, 0, numRows), + Offset numCompressedEntries = 0; + Kokkos::parallel_reduce("KokkosSparse::Impl::MergedRowmapFunctor", range_t(exec, 0, numRows), Impl::MergedRowmapFunctor(nc_rowmap_out, rowmap_in, entries_in), numCompressedEntries); if (entries_in.extent(0) == size_t(numCompressedEntries)) { @@ -655,107 +495,50 @@ void sort_and_merge_graph(const exec_space& exec, const typename rowmap_t::const entries_out = entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "SortedMerged entries"), numCompressedEntries); // Compute merged entries and values - Kokkos::parallel_for(range_t(exec, 0, numRows), Impl::GraphMergedEntriesFunctor( - rowmap_orig, entries_orig, rowmap_out, entries_out)); + Kokkos::parallel_for( + "KokkosSparse::Impl::GraphMergedEntriesFunctor", 
range_t(exec, 0, numRows), + Impl::GraphMergedEntriesFunctor(rowmap_orig, entries_orig, rowmap_out, entries_out)); } template void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, - rowmap_t& rowmap_out, entries_t& entries_out) { - return sort_and_merge_graph(exec_space(), rowmap_in, entries_in, rowmap_out, entries_out); + rowmap_t& rowmap_out, entries_t& entries_out, + typename entries_t::const_value_type& numCols = + Kokkos::ArithTraits::max()) { + return sort_and_merge_graph(exec_space(), rowmap_in, entries_in, rowmap_out, entries_out, numCols); } template void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, - rowmap_t& rowmap_out, entries_t& entries_out) { - return sort_and_merge_graph(typename entries_t::execution_space(), rowmap_in, entries_in, rowmap_out, entries_out); + rowmap_t& rowmap_out, entries_t& entries_out, + typename entries_t::const_value_type& numCols = + Kokkos::ArithTraits::max()) { + return sort_and_merge_graph(typename entries_t::execution_space(), rowmap_in, entries_in, rowmap_out, entries_out, + numCols); } template -crsGraph_t sort_and_merge_graph(const typename crsGraph_t::execution_space& exec, const crsGraph_t& G) { +crsGraph_t sort_and_merge_graph( + const typename crsGraph_t::execution_space& exec, const crsGraph_t& G, + typename crsGraph_t::entries_type::const_value_type& numCols = + Kokkos::ArithTraits::max()) { using rowmap_t = typename crsGraph_t::row_map_type::non_const_type; using entries_t = typename crsGraph_t::entries_type; static_assert(!std::is_const::value, "sort_and_merge_graph requires StaticCrsGraph entries to be non-const."); rowmap_t mergedRowmap; entries_t mergedEntries; - sort_and_merge_graph(exec, G.row_map, G.entries, mergedRowmap, mergedEntries); + sort_and_merge_graph(exec, G.row_map, G.entries, mergedRowmap, mergedEntries, numCols); return crsGraph_t(mergedEntries, mergedRowmap); } template -crsGraph_t 
sort_and_merge_graph(const crsGraph_t& G) { - return sort_and_merge_graph(typename crsGraph_t::execution_space(), G); +crsGraph_t sort_and_merge_graph( + const crsGraph_t& G, typename crsGraph_t::entries_type::const_value_type& numCols = + Kokkos::ArithTraits::max()) { + return sort_and_merge_graph(typename crsGraph_t::execution_space(), G, numCols); } } // namespace KokkosSparse -namespace KokkosKernels { - -// ---------------------------------- -// BSR matrix/graph sorting utilities -// ---------------------------------- - -// Sort a BRS matrix: within each row, sort entries ascending by column and -// permute the values accordingly. -template -[[deprecated]] void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, const entries_t& entries, - const values_t& values) { - KokkosSparse::sort_bsr_matrix(blockdim, rowmap, entries, values); -} - -template -[[deprecated]] void sort_bsr_matrix(const bsrMat_t& A) { - KokkosSparse::sort_bsr_matrix(A); -} - -// ---------------------------------- -// CRS matrix/graph sorting utilities -// ---------------------------------- - -// The sort_crs* functions sort the adjacent column list for each row into -// ascending order. - -template -[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) { - KokkosSparse::sort_crs_matrix(rowmap, entries, values); -} - -template -[[deprecated]] void sort_crs_matrix(const crsMat_t& A) { - KokkosSparse::sort_crs_matrix(A); -} - -template -[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { - KokkosSparse::sort_crs_graph(rowmap, entries); -} - -template -[[deprecated]] void sort_crs_graph(const crsGraph_t& G) { - KokkosSparse::sort_crs_graph(G); -} - -// sort_and_merge_matrix produces a new matrix which is equivalent to A but is -// sorted and has no duplicated entries: each (i, j) is unique. Values for -// duplicated entries are summed. 
-template -[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - KokkosSparse::sort_and_merge_matrix(A); -} - -template -[[deprecated]] crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { - KokkosSparse::sort_and_merge_graph(G); -} - -template -[[deprecated]] void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, - rowmap_t& rowmap_out, entries_t& entries_out) { - KokkosSparse::sort_and_merge_graph(rowmap_in, entries_in, rowmap_out, entries_out); -} - -} // namespace KokkosKernels - #endif // _KOKKOSSPARSE_SORTCRS_HPP diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 781857ef55..d73787481e 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -848,6 +848,19 @@ ordinal_t graph_max_degree(const rowmap_t &rowmap) { return val; } +template +typename rowmap_t::non_const_value_type graph_max_degree(const execution_space &exec, const rowmap_t &rowmap) { + using Offset = typename rowmap_t::non_const_value_type; + using Reducer = Kokkos::Max; + Offset nrows = rowmap.extent(0); + if (nrows) nrows--; + if (nrows == 0) return 0; + Offset val; + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, nrows), + MaxDegreeFunctor(rowmap), Reducer(val)); + return val; +} + template void graph_min_max_degree(const rowmap_t &rowmap, ordinal_t &min_degree, ordinal_t &max_degree) { using Reducer = Kokkos::MinMax; diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index d10ef9974c..d9964e18b7 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -79,7 +79,7 @@ auto coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, DataViewTyp // clang-format on template auto coo2crs(KokkosSparse::CooMatrix &cooMatrix) { - return coo2crs(cooMatrix.numRows(), cooMatrix.numCols(), cooMatrix.row, cooMatrix.col, cooMatrix.data); + return coo2crs(cooMatrix.numRows(), 
cooMatrix.numCols(), cooMatrix.row(), cooMatrix.col(), cooMatrix.data()); } } // namespace KokkosSparse #endif // _KOKKOSSPARSE_COO2CRS_HPP diff --git a/sparse/src/KokkosSparse_spadd_handle.hpp b/sparse/src/KokkosSparse_spadd_handle.hpp index ea9594ca3e..8d28309585 100644 --- a/sparse/src/KokkosSparse_spadd_handle.hpp +++ b/sparse/src/KokkosSparse_spadd_handle.hpp @@ -102,10 +102,6 @@ class SPADDHandle { */ size_type get_c_nnz() { return this->result_nnz_size; } - void set_sort_option(int option) { this->sort_option = option; } - - int get_sort_option() { return this->sort_option; } - #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE SpaddCusparseData cusparseData; #endif