diff --git a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml index 87c21d3a6e7..0260cb5894a 100644 --- a/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -36,7 +36,7 @@ jobs: -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DCMAKE_CXX_FLAGS="-Werror -m32 -DKOKKOS_IMPL_32BIT" \ + -DCMAKE_CXX_FLAGS="-Werror -m32" \ -DCMAKE_CXX_COMPILER=g++ \ -DCMAKE_BUILD_TYPE=RelWithDebInfo - name: Build diff --git a/.jenkins_nightly b/.jenkins_nightly index a8facd365c2..b723f12c0fc 100644 --- a/.jenkins_nightly +++ b/.jenkins_nightly @@ -95,7 +95,8 @@ pipeline { -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_SERIAL=ON \ .. && \ make -j8 && ctest --verbose @@ -123,7 +124,7 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_HIP=ON \ diff --git a/CHANGELOG.md b/CHANGELOG.md index f7b8af7695c..f8d288db5da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # CHANGELOG +## [4.3.01](https://github.com/kokkos/kokkos/tree/4.3.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.00...4.3.01) + +### Backend and Architecture Enhancements: + +#### HIP: +* MI300 support unified memory [\#6877](https://github.com/kokkos/kokkos/pull/6877) + +### Bug Fixes +* Serial: Use the provided execution space instance in TeamPolicy [\#6951](https://github.com/kokkos/kokkos/pull/6951) +* `nvcc_wrapper`: bring back support for `--fmad` option [\#6931](https://github.com/kokkos/kokkos/pull/6931) +* Fix CUDA reduction overflow for `RangePolicy` [\#6578](https://github.com/kokkos/kokkos/pull/6578) + ## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) @@ -39,7 +52,7 @@ * Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) #### Threads: -* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6446) +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) #### OpenMP: * Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000000..28c674c451b --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,65 @@ +cff-version: 1.2.0 +title: Kokkos +message: >- + If you use this software, please cite the overview paper +type: software +authors: + - name: The Kokkos authors + website: https://kokkos.org/community/team/ +identifiers: + - type: url + website: https://kokkos.org/kokkos-core-wiki/citation.html +repository-code: 'https://github.com/kokkos/kokkos' +url: 'https://kokkos.org/' +license: Apache-2.0 +preferred-citation: + type: article + authors: + - given-names: Christian R. 
+ family-names: Trott + - given-names: Damien + family-names: Lebrun-Grandié + - given-names: Daniel + family-names: Arndt + - family-names: Ciesko + given-names: Jan + - given-names: Vinh + family-names: Dang + - family-names: Ellingwood + given-names: Nathan + - given-names: Rahulkumar + family-names: Gayatri + - given-names: Evan + family-names: Harvey + - given-names: Daisy S. + family-names: Hollman + - given-names: Dan + family-names: Ibanez + - given-names: Nevin + family-names: Liber + - given-names: Jonathan + family-names: Madsen + - given-names: Jeff + family-names: Miles + - given-names: David + family-names: Poliakoff + - given-names: Amy + family-names: Powell + - given-names: Sivasankaran + family-names: Rajamanickam + - given-names: Mikael + family-names: Simberg + - given-names: Dan + family-names: Sunderland + - given-names: Bruno + family-names: Turcksin + - given-names: Jeremiah + family-names: Wilke + doi: 10.1109/TPDS.2021.3097283 + journal: IEEE Transactions on Parallel and Distributed Systems + start: 805 + end: 817 + title: "Kokkos 3: Programming Model Extensions for the Exascale Era" + volume: 33 + issue: 4 + year: 2022 diff --git a/Copyright.txt b/Copyright.txt index 5e2f8d8647b..cbba3efc7bc 100644 --- a/Copyright.txt +++ b/Copyright.txt @@ -1,41 +1,8 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER +************************************************************************ + + Kokkos v. 4.0 + Copyright (2022) National Technology & Engineering + Solutions of Sandia, LLC (NTESS). + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. 
Government retains certain rights in this software.
diff --git a/LICENSE b/LICENSE
index 6572cc2db05..4d9d69d7c44 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,13 +1,3 @@
- ************************************************************************
-
-  Kokkos v. 4.0
-  Copyright (2022) National Technology & Engineering
-  Solutions of Sandia, LLC (NTESS).
-
- Under the terms of Contract DE-NA0003525 with NTESS,
- the U.S. Government retains certain rights in this software.
-
-
 ==============================================================================
 Kokkos is under the Apache License v2.0 with LLVM Exceptions:
 ==============================================================================
diff --git a/README.md b/README.md
index f4252437111..7d9d70fac5c 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,8 @@ backends in development.
 
 **Kokkos Core is part of the [Kokkos C++ Performance Portability Programming Ecosystem](https://kokkos.org/about/abstract/).**
 
+Kokkos is a [Linux Foundation](https://linuxfoundation.org) project.
+
 ## Learning about Kokkos
 
 To start learning about Kokkos:
@@ -28,12 +30,12 @@ To start learning about Kokkos:
 
 The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest).
 
-The current release is [4.2.01](https://github.com/kokkos/kokkos/releases/tag/4.2.01).
+The current release is [4.3.00](https://github.com/kokkos/kokkos/releases/tag/4.3.00).
 
 ```bash
-curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz
+curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.00.tar.gz
 # Or with wget
-wget https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz
+wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.00.tar.gz
 ```
 
 To clone the latest development version of Kokkos from GitHub:
@@ -44,7 +46,7 @@ git clone -b develop https://github.com/kokkos/kokkos.git
 
 ### Building Kokkos
 
-To build Kokkos, you will need to have a C++ compiler that supports C++14 or later.
+To build Kokkos, you will need to have a C++ compiler that supports C++17 or later.
 All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.org/kokkos-core-wiki/requirements.html).
 
 Building and installation instructions are described [here](https://kokkos.org/kokkos-core-wiki/building.html).
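In concrete terms, the C++17 requirement above means any translation unit that includes `<Kokkos_Core.hpp>` can launch kernels once the library is installed. The program below is an illustrative sketch, not part of this patch; it assumes only the public `Kokkos::initialize`/`Kokkos::finalize` and `Kokkos::parallel_reduce` APIs and whatever default execution space was enabled at configure time.

```cpp
#include <Kokkos_Core.hpp>

#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);  // set up the default execution space
  {
    constexpr int n = 100;
    long sum        = 0;
    // Reduce 0 + 1 + ... + (n-1) in parallel on the default execution space.
    Kokkos::parallel_reduce(
        "iota_sum", n,
        KOKKOS_LAMBDA(const int i, long& partial) { partial += i; }, sum);
    std::printf("sum = %ld\n", sum);  // n*(n-1)/2 = 4950
  }
  Kokkos::finalize();  // after all Kokkos objects have been destroyed
}
```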
diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp
index a8171fa068d..9f7fcf94fe0 100644
--- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp
+++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp
@@ -82,6 +82,11 @@ OutputIteratorType adjacent_difference_exespace_impl(
     return first_dest;
   }
 
+#ifdef KOKKOS_ENABLE_DEBUG
+  // check for overlapping iterators
+  Impl::expect_no_overlap(first_from, last_from, first_dest);
+#endif
+
   // run
   const auto num_elements =
       Kokkos::Experimental::distance(first_from, last_from);
@@ -114,6 +119,11 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl(
     return first_dest;
   }
 
+#ifdef KOKKOS_ENABLE_DEBUG
+  // check for overlapping iterators
+  Impl::expect_no_overlap(first_from, last_from, first_dest);
+#endif
+
   // run
   const auto num_elements =
       Kokkos::Experimental::distance(first_from, last_from);
diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp
index 27ce5a6fad6..54bb13e25b9 100644
--- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp
+++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp
@@ -24,18 +24,21 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
+template <class ViewType>
+class RandomAccessIterator;
+
 template <class T, class enable = void>
 struct is_admissible_to_kokkos_std_algorithms : std::false_type {};
 
 template <class T>
 struct is_admissible_to_kokkos_std_algorithms<
-    T, std::enable_if_t< ::Kokkos::is_view<T>::value && T::rank() == 1 &&
-                         (std::is_same<typename T::traits::array_layout,
-                                       Kokkos::LayoutLeft>::value ||
-                          std::is_same<typename T::traits::array_layout,
-                                       Kokkos::LayoutRight>::value ||
-                          std::is_same<typename T::traits::array_layout,
-                                       Kokkos::LayoutStride>::value)> >
+    T, std::enable_if_t<::Kokkos::is_view<T>::value && T::rank() == 1 &&
+                        (std::is_same<typename T::traits::array_layout,
+                                      Kokkos::LayoutLeft>::value ||
+                         std::is_same<typename T::traits::array_layout,
+                                      Kokkos::LayoutRight>::value ||
+                         std::is_same<typename T::traits::array_layout,
+                                      Kokkos::LayoutStride>::value)>>
     : std::true_type {};
 
 template <class T>
@@ -58,6 +61,18 @@ using is_iterator = Kokkos::is_detected<iterator_category_t, T>;
 template <class T>
 inline constexpr bool is_iterator_v = is_iterator<T>::value;
 
+template <class T>
+struct is_kokkos_iterator : std::false_type {};
+
+template <class ViewType>
+struct is_kokkos_iterator<RandomAccessIterator<ViewType>> {
+  static constexpr bool value =
+      is_admissible_to_kokkos_std_algorithms<ViewType>::value;
+};
+
+template <class T>
+inline constexpr bool is_kokkos_iterator_v = is_kokkos_iterator<T>::value;
+
 //
 // are_iterators
 //
@@ -215,6 +230,38 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first,
   (void)last;
 }
 
+//
+// Check if kokkos iterators are overlapping
+//
+template <class IteratorType1, class IteratorType2>
+KOKKOS_INLINE_FUNCTION void expect_no_overlap(
+    [[maybe_unused]] IteratorType1 first, [[maybe_unused]] IteratorType1 last,
+    [[maybe_unused]] IteratorType2 s_first) {
+  if constexpr (is_kokkos_iterator_v<IteratorType1> &&
+                is_kokkos_iterator_v<IteratorType2>) {
+    auto const view1 = first.view();
+    auto const view2 = s_first.view();
+
+    std::size_t stride1 = view1.stride(0);
+    std::size_t stride2 = view2.stride(0);
+    ptrdiff_t first_diff = view1.data() - view2.data();
+
+    // FIXME If the strides are not identical, the overlap check cannot be
+    // done at O(1) cost, so currently the check runs only when the strides
+    // are identical. If first_diff == 0, the ranges already overlap.
+    if (stride1 == stride2 || first_diff == 0) {
+      [[maybe_unused]] bool is_no_overlap = (first_diff % stride1);
+      auto* first_pointer1 = view1.data();
+      auto* first_pointer2 = view2.data();
+      [[maybe_unused]] auto* last_pointer1 = first_pointer1 + (last - first);
+      [[maybe_unused]] auto* last_pointer2 = first_pointer2 + (last - first);
+      KOKKOS_EXPECTS(first_pointer1 >= last_pointer2 ||
+                     last_pointer1 <= first_pointer2 || is_no_overlap);
+    }
+  }
+}
+
 } // namespace Impl
 } // namespace Experimental
 } // namespace Kokkos
diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp
index 5c9854b87d7..ff74a32275d 100644
--- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp
+++ b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp
@@ -59,6 +59,38 @@ class RandomAccessIterator< ::Kokkos::View<DataType, Args...> > {
       ptrdiff_t current_index)
       : m_view(view), m_current_index(current_index) {}
 
+// FIXME The C++20 requires expression is not supported with Clang 9 and GCC 9.
+// The following guard is insufficient until we raise our minimum C++20
+// compiler requirements.
+// #ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond
+// We replace the Kokkos guards with standard C++ feature testing in the
+// meantime.
+#if (defined(__cpp_concepts) && (__cpp_concepts >= 201907L)) && \
+    (defined(__cpp_conditional_explicit) && \
+     (__cpp_conditional_explicit >= 201806L))
+  template <class OtherViewType>
+  requires(std::is_constructible_v<view_type, OtherViewType>) KOKKOS_FUNCTION
+      explicit(!std::is_convertible_v<OtherViewType, view_type>)
+          RandomAccessIterator(const RandomAccessIterator<OtherViewType>& other)
+      : m_view(other.m_view), m_current_index(other.m_current_index) {}
+#else
+  template <
+      class OtherViewType,
+      std::enable_if_t<std::is_constructible_v<view_type, OtherViewType> &&
+                           !std::is_convertible_v<OtherViewType, view_type>,
+                       int> = 0>
+  KOKKOS_FUNCTION explicit RandomAccessIterator(
+      const RandomAccessIterator<OtherViewType>& other)
+      : m_view(other.m_view), m_current_index(other.m_current_index) {}
+
+  template <class OtherViewType,
+            std::enable_if_t<std::is_convertible_v<OtherViewType, view_type>,
+                             int> = 0>
+  KOKKOS_FUNCTION RandomAccessIterator(
+      const RandomAccessIterator<OtherViewType>& other)
+      : m_view(other.m_view), m_current_index(other.m_current_index) {}
+#endif
+
   KOKKOS_FUNCTION
   iterator_type& operator++() {
     ++m_current_index;
@@ -152,9 +184,16 @@ class RandomAccessIterator< ::Kokkos::View<DataType, Args...> > {
   KOKKOS_FUNCTION
   reference operator*() const { return m_view(m_current_index); }
 
+  KOKKOS_FUNCTION
+  view_type view() const { return m_view; }
+
  private:
   view_type m_view;
   ptrdiff_t m_current_index = 0;
+
+  // Needed for the converting constructor accepting another iterator
+  template <class OtherViewType>
+  friend class RandomAccessIterator;
 };
 
 } // namespace Impl
diff --git a/algorithms/unit_tests/TestRandomAccessIterator.cpp b/algorithms/unit_tests/TestRandomAccessIterator.cpp
index 282d85548c5..7d484136b6d 100644
--- a/algorithms/unit_tests/TestRandomAccessIterator.cpp
+++ b/algorithms/unit_tests/TestRandomAccessIterator.cpp
@@ -46,6 +46,44 @@ TEST_F(random_access_iterator_test, constructor) {
   EXPECT_TRUE(true);
 }
 
+TEST_F(random_access_iterator_test, constructibility) {
+  auto first_d  = KE::begin(m_dynamic_view);
+  auto cfirst_d = KE::cbegin(m_dynamic_view);
+
+  static_assert(std::is_constructible_v<decltype(cfirst_d), decltype(first_d)>);
+  static_assert(
+      !std::is_constructible_v<decltype(first_d), decltype(cfirst_d)>);
+  [[maybe_unused]] decltype(cfirst_d) tmp_cfirst_d(first_d);
+
+  auto first_s  = KE::begin(m_static_view);
+  auto cfirst_s = KE::cbegin(m_static_view);
+
+  static_assert(std::is_constructible_v<decltype(cfirst_s), decltype(first_s)>);
+  static_assert(
+      !std::is_constructible_v<decltype(first_s), decltype(cfirst_s)>);
+  [[maybe_unused]] decltype(cfirst_s) tmp_cfirst_s(first_s);
+
+  auto first_st  = KE::begin(m_strided_view);
+  auto cfirst_st = KE::cbegin(m_strided_view);
+
+  static_assert(
+      std::is_constructible_v<decltype(cfirst_st), decltype(first_st)>);
+  static_assert(
+      !std::is_constructible_v<decltype(first_st), decltype(cfirst_st)>);
+  [[maybe_unused]] decltype(cfirst_st) tmp_cfirst_st(first_st);
+
+  // [FIXME] Better to have tests for the explicit specifier with an expression.
+  // As soon as View converting constructors are re-implemented with a
+  // conditional explicit, we may add those tests.
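The conditional-explicit idiom guarded in the header above is dense enough to merit a standalone sketch before the remaining constructibility checks below. The `Wrapper` type here is hypothetical, not Kokkos code: under C++20 a single converting constructor carries `explicit(condition)`, while the C++17 fallback splits it into an explicit overload and an implicit one selected by SFINAE, exactly as in the iterator above.

```cpp
#include <type_traits>

template <class T>
struct Wrapper {
  T value{};
  Wrapper() = default;

  // C++20: one converting constructor that is explicit exactly when
  // U does not implicitly convert to T.
  template <class U>
    requires(std::is_constructible_v<T, U>)
  explicit(!std::is_convertible_v<U, T>)
      Wrapper(const Wrapper<U>& other) : value(other.value) {}
};

// Implicit conversion allowed: int converts to long implicitly.
static_assert(std::is_convertible_v<Wrapper<int>, Wrapper<long>>);
// No conversion at all: an int cannot construct a char*.
static_assert(!std::is_constructible_v<Wrapper<char*>, Wrapper<int>>);
```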
+ static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + EXPECT_TRUE(true); +} + template void test_random_access_it_verify(IteratorType it, ValueType gold_value) { using view_t = Kokkos::View; diff --git a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 386d533f7a8..2a4525a8c33 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -81,5 +81,114 @@ TEST(std_algorithms, is_admissible_to_std_algorithms) { strided_view_3d_t>::value); } +TEST(std_algorithms, expect_no_overlap) { + namespace KE = Kokkos::Experimental; + using value_type = double; + + static constexpr size_t extent0 = 13; + + //------------- + // 1d views + //------------- + using static_view_1d_t = Kokkos::View; + [[maybe_unused]] static_view_1d_t static_view_1d{ + "std-algo-test-1d-contiguous-view-static"}; + + using dyn_view_1d_t = Kokkos::View; + [[maybe_unused]] dyn_view_1d_t dynamic_view_1d{ + "std-algo-test-1d-contiguous-view-dynamic", extent0}; + + using strided_view_1d_t = Kokkos::View; + Kokkos::LayoutStride layout1d{extent0, 2}; + strided_view_1d_t strided_view_1d{"std-algo-test-1d-strided-view", layout1d}; + +// Overlapping because iterators are identical +#if defined(KOKKOS_ENABLE_DEBUG) + auto first_s = KE::begin(static_view_1d); + auto last_s = first_s + extent0; + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, + "Kokkos contract violation:.*"); + + auto first_d = KE::begin(dynamic_view_1d); + auto last_d = first_d + extent0; + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_d, last_d, first_d); }, + "Kokkos contract violation:.*"); + + auto first_st = KE::begin(strided_view_1d); + auto last_st = first_st + extent0; + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_st, last_st, first_st); }, + "Kokkos contract violation:.*"); +#endif + + // Ranges are overlapped + static constexpr size_t sub_extent0 = 6, offset0 = 3; + std::pair range0(0, sub_extent0), + range1(offset0, offset0 + sub_extent0); +#if defined(KOKKOS_ENABLE_DEBUG) + auto static_view_1d_0 = Kokkos::subview(static_view_1d, range0); + auto static_view_1d_1 = Kokkos::subview(static_view_1d, range1); + auto first_s0 = KE::begin(static_view_1d_0); // [0, 6) + auto last_s0 = first_s0 + static_view_1d_0.extent(0); + auto first_s1 = KE::begin(static_view_1d_1); // [3, 9) + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s0, last_s0, first_s1); }, + "Kokkos contract violation:.*"); + + auto dynamic_view_1d_0 = Kokkos::subview(dynamic_view_1d, range0); + auto dynamic_view_1d_1 = Kokkos::subview(dynamic_view_1d, range1); + auto first_d0 = KE::begin(dynamic_view_1d_0); // [0, 6) + auto last_d0 = first_d0 + dynamic_view_1d_0.extent(0); + auto first_d1 = KE::begin(dynamic_view_1d_1); // [3, 9) + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_d0, last_d0, first_d1); }, + "Kokkos contract violation:.*"); +#endif + + auto strided_view_1d_0 = Kokkos::subview(strided_view_1d, range0); + auto strided_view_1d_1 = Kokkos::subview(strided_view_1d, range1); + auto first_st0 = KE::begin(strided_view_1d_0); // [0, 12) + auto last_st0 = first_st0 + strided_view_1d_0.extent(0); + auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15) + // Does not overlap since offset (=3) is not divisible by stride (=2) + 
EXPECT_NO_THROW( + { KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); }); + + // Iterating over the same range without overlapping + Kokkos::View static_view_2d{ + "std-algo-test-2d-contiguous-view-static"}; + auto sub_static_view_1d_0 = Kokkos::subview(static_view_2d, 0, Kokkos::ALL); + auto sub_static_view_1d_1 = Kokkos::subview(static_view_2d, 1, Kokkos::ALL); + auto sub_first_s0 = KE::begin(sub_static_view_1d_0); // 0, 2, 4, ... + auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); + auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... + + EXPECT_NO_THROW({ + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); + }); + + Kokkos::View dynamic_view_2d{ + "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; + auto sub_dynamic_view_1d_0 = Kokkos::subview(dynamic_view_2d, 0, Kokkos::ALL); + auto sub_dynamic_view_1d_1 = Kokkos::subview(dynamic_view_2d, 1, Kokkos::ALL); + auto sub_first_d0 = KE::begin(sub_dynamic_view_1d_0); // 0, 2, 4, ... + auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); + auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... + + EXPECT_NO_THROW({ + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); + }); + + Kokkos::LayoutStride layout2d{2, 3, extent0, 2 * 3}; + Kokkos::View strided_view_2d{ + "std-algo-test-2d-contiguous-view-strided", layout2d}; + auto sub_strided_view_1d_0 = Kokkos::subview(strided_view_2d, 0, Kokkos::ALL); + auto sub_strided_view_1d_1 = Kokkos::subview(strided_view_2d, 1, Kokkos::ALL); + auto sub_first_st0 = KE::begin(sub_strided_view_1d_0); // 0, 6, 12, ... + auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); + auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... + + EXPECT_NO_THROW({ + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); + }); +} + } // namespace stdalgos } // namespace Test diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 611c089b2e3..fb1e73b5799 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - TEST_OPTIONAL_TPLS CUSPARSE ) TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) diff --git a/cmake/deps/CUDA.cmake b/cmake/deps/CUDA.cmake index 68bf5b3d579..5b6afd61512 100644 --- a/cmake/deps/CUDA.cmake +++ b/cmake/deps/CUDA.cmake @@ -35,7 +35,6 @@ IF(NOT _CUDA_FAILURE) GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) ELSE() SET(TPL_ENABLE_CUDA OFF) ENDIF() diff --git a/cmake/deps/CUSPARSE.cmake b/cmake/deps/CUSPARSE.cmake deleted file mode 100644 index b016971ab91..00000000000 --- a/cmake/deps/CUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ************************************************************************ -# @HEADER - -#include(${TRIBITS_DEPS_DIR}/CUDA.cmake) - -#IF (TPL_ENABLE_CUDA) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) -# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -# KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) -#ENDIF() - diff --git a/cmake/kokkos_functions.cmake b/cmake/kokkos_functions.cmake index 9dab1ca00ea..d1f1e0d7a78 100644 --- a/cmake/kokkos_functions.cmake +++ b/cmake/kokkos_functions.cmake @@ -709,7 +709,12 @@ MACRO(kokkos_find_imported NAME) ENDIF() IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib lib64) + SET(TPL_LIBRARY_SUFFIXES lib) + IF(KOKKOS_IMPL_32BIT) + LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) + ELSE() + LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) + ENDIF() ENDIF() SET(${NAME}_INCLUDE_DIRS) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index 6ef3b79bde2..cda9e0d6004 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -103,13 +103,19 @@ if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) endif() IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED) + find_package(OpenMP REQUIRED COMPONENTS CXX) # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency # so we just append the flags here instead of linking with the OpenMP target. IF(KOKKOS_HAS_TRILINOS) COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED) + KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) + ENDIF() + IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + ENDIF() + IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) ENDIF() ENDIF() diff --git a/cmake/tpls/FindTPLCUSPARSE.cmake b/cmake/tpls/FindTPLCUSPARSE.cmake deleted file mode 100644 index 4709f8002b1..00000000000 --- a/cmake/tpls/FindTPLCUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#@HEADER - -# Check for CUDA support - -IF (NOT TPL_ENABLE_CUDA) - MESSAGE(FATAL_ERROR "\nCUSPARSE requires CUDA") -ELSE() - GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) - GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -ENDIF() - diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index e821570a8d5..1fb174943fe 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -944,13 +944,13 @@ class DualView : public ViewTraits { if (sizeMismatch) { ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { h_view = create_mirror_view(typename t_host::memory_space(), d_view); } else { h_view = create_mirror_view(Kokkos::WithoutInitializing, typename t_host::memory_space(), d_view); } - } else if (alloc_prop_input::initialize) { + } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -1038,7 +1038,7 @@ class DualView : public ViewTraits { /* Resize on Device */ if (sizeMismatch) { ::Kokkos::resize(properties, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { h_view = create_mirror_view(typename t_host::memory_space(), d_view); } else { h_view = create_mirror_view(Kokkos::WithoutInitializing, @@ -1054,7 +1054,7 @@ class DualView : public ViewTraits { /* Resize on Host */ if (sizeMismatch) { ::Kokkos::resize(properties, h_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { d_view = create_mirror_view(typename t_dev::memory_space(), h_view); } else { diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 3989911aca4..0af479590e7 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1932,65 +1932,35 @@ struct MirrorDRVType { } // namespace Impl namespace Impl { -template -inline typename DynRankView::HostMirror create_mirror( - const DynRankView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using src_type = DynRankView; - using dst_type = typename src_type::HostMirror; - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc +template +inline auto create_mirror(const DynRankView& src, + const Impl::ViewCtorProp& arg_prop) { + check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); -} - -template -inline auto create_mirror( - const DynRankView& src, - const Impl::ViewCtorProp& arg_prop, - 
std::enable_if_t::has_memory_space>* = - nullptr) { - using dst_type = typename Impl::MirrorDRVType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using dst_type = typename Impl::MirrorDRVType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); + } else { + using src_type = DynRankView; + using dst_type = typename src_type::HostMirror; - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl @@ -2057,71 +2027,42 @@ inline auto create_mirror( } namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value, - typename DynRankView::HostMirror> -create_mirror_view(const DynRankView& src, - const typename Impl::ViewCtorProp&) { - return src; -} +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value), - typename DynRankView::HostMirror> -create_mirror_view( +inline auto create_mirror_view( const DynRankView& src, - const typename Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -template ::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename Impl::ViewCtorProp::memory_space>::value && - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type> -create_mirror_view(const Kokkos::DynRankView& src, - const typename Impl::ViewCtorProp&) { - return src; + [[maybe_unused]] const typename Impl::ViewCtorProp& + arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename DynRankView< + T, P...>::HostMirror::data_type>::value) { + return typename DynRankView::HostMirror(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorDRViewType::memory_space, + T, P...>::is_same_memspace) { + return 
typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } -template ::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename Impl::ViewCtorProp::memory_space>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type> -create_mirror_view( - const Kokkos::DynRankView& src, - const typename Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl // Create a mirror view in host space @@ -2194,75 +2135,47 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, arg_prop); } -template +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, + const Kokkos::DynRankView& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; -} -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, - const Kokkos::DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { - using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorDRViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type{ - arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; - if 
constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorDRViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{ + arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } } template diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index 12885edbae9..8e29042ace2 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -590,71 +590,42 @@ struct MirrorDynamicViewType { } // namespace Impl namespace Impl { + +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline auto create_mirror( - const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { +inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, + const Impl::ViewCtorProp& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( - prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - - ret.resize_serial(src.extent(0)); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using MemorySpace = typename alloc_prop_input::memory_space; - return ret; -} - -template -inline auto create_mirror( - const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using alloc_prop_input = Impl::ViewCtorProp; + auto ret = typename Kokkos::Impl::MirrorDynamicViewType< + MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), + src.chunk_max() * src.chunk_size()); - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to 
Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); - - using MemorySpace = typename alloc_prop_input::memory_space; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + ret.resize_serial(src.extent(0)); - auto ret = typename Kokkos::Impl::MirrorDynamicViewType< - MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), - src.chunk_max() * src.chunk_size()); + return ret; + } else { + auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( + prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - ret.resize_serial(src.extent(0)); + ret.resize_serial(src.extent(0)); - return ret; + return ret; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } + } // namespace Impl // Create a mirror in host space @@ -696,67 +667,44 @@ inline auto create_mirror( namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp&) { - return src; -} - -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp&) { - return src; +inline auto create_mirror_view( + const Kokkos::Experimental::DynamicView& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>::value) { + return + typename Kokkos::Experimental::DynamicView::HostMirror(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp< + ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, T, + 
P...>::view_type(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl // Create a mirror view in host space @@ -985,80 +933,53 @@ struct ViewCopy, } // namespace Impl -template +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::Experimental::DynamicView& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, + const Kokkos::Experimental::DynamicView& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; -} -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, - const Kokkos::Experimental::DynamicView& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { - using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = - typename Impl::MirrorDynamicViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type( - arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - mirror.resize_serial(src.extent(0)); - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + 
Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = + typename Impl::MirrorDynamicViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type( + arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); + mirror.resize_serial(src.extent(0)); + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } } -template +template ::value>> auto create_mirror_view_and_copy( const Space&, const Kokkos::Experimental::DynamicView& src, std::string const& name = "") { diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 91a7e4a9273..720e71b8c16 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1841,45 +1841,34 @@ struct MirrorOffsetType { } // namespace Impl namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space, - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return typename Kokkos::Experimental::OffsetView::HostMirror( - Kokkos::create_mirror(arg_prop, src.view()), src.begins()); -} -template ::has_memory_space>> +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc +template inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { - using alloc_prop_input = Impl::ViewCtorProp; - using Space = typename Impl::ViewCtorProp::memory_space; + check_view_ctor_args_create_mirror(); - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using Space = typename Impl::ViewCtorProp::memory_space; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + auto prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string(src.label()).append("_mirror")); - return typename Kokkos::Impl::MirrorOffsetType::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + return typename Kokkos::Impl::MirrorOffsetType::view_type( + prop_copy, src.layout(), + {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), + src.begin(5), src.begin(6), src.begin(7)}); + } else { + 
return typename Kokkos::Experimental::OffsetView::HostMirror( + Kokkos::create_mirror(arg_prop, src.view()), src.begins()); + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } + } // namespace Impl // Create a mirror in host space @@ -1921,67 +1910,44 @@ inline auto create_mirror( } namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp&) { - return src; -} +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp&) { - return src; +inline auto create_mirror_view( + const Kokkos::Experimental::OffsetView& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>::value) { + return + typename Kokkos::Experimental::OffsetView::HostMirror(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorOffsetViewType::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl // Create a mirror view in host space diff --git a/containers/unit_tests/TestDualView.hpp b/containers/unit_tests/TestDualView.hpp index a15e5fa2997..2512cb5c491 100644 --- a/containers/unit_tests/TestDualView.hpp +++ b/containers/unit_tests/TestDualView.hpp @@ 
-55,8 +55,8 @@ struct test_dualview_alloc { bool result = false; test_dualview_alloc(unsigned int size) { - result = run_me >( - size, 3); + result = + run_me>(size, 3); } }; @@ -154,7 +154,7 @@ struct test_dualview_combinations { } test_dualview_combinations(unsigned int size, bool with_init) { - result = run_me >( + result = run_me>( size, 3, with_init); } }; @@ -253,21 +253,18 @@ struct test_dual_view_deep_copy { } // end run_me test_dual_view_deep_copy() { - run_me >(10, 5, - true); - run_me >(10, 5, - false); + run_me>(10, 5, true); + run_me>(10, 5, + false); // Test zero length but allocated (a.d_view.data!=nullptr but // a.d_view.span()==0) - run_me >(0, 5, true); - run_me >(0, 5, - false); + run_me>(0, 5, true); + run_me>(0, 5, false); // Test default constructed view - run_me >(-1, 5, - true); - run_me >(-1, 5, - false); + run_me>(-1, 5, true); + run_me>(-1, 5, + false); } }; @@ -282,15 +279,20 @@ struct test_dualview_resize { const unsigned int m = 5; const unsigned int factor = 2; - ViewType a("A", n, m); + ViewType a; + if constexpr (Initialize) + a = ViewType("A", n, m); + else + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::deep_copy(a.d_view, 1); /* Covers case "Resize on Device" */ a.modify_device(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); - else + if constexpr (Initialize) Kokkos::resize(a, factor * n, factor * m); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); ASSERT_EQ(a.extent(0), n * factor); ASSERT_EQ(a.extent(1), m * factor); @@ -298,33 +300,38 @@ struct test_dualview_resize { a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::View errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, a_d_sum); - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); /* Covers case "Resize on Host" */ a.modify_host(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); - else + if constexpr (Initialize) Kokkos::resize(a, n / factor, m / factor); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); ASSERT_EQ(a.extent(0), n / factor); ASSERT_EQ(a.extent(1), m / factor); @@ -332,30 +339,33 @@ struct test_dualview_resize { a.sync_device(Kokkos::DefaultExecutionSpace{}); // Check device view is initialized as expected - a_d_sum = 0; + Kokkos::deep_copy(errors_d, 0); // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, 
a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - a_h_sum = 0; + errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_resize() { - run_me >(); + run_me>(); } }; @@ -369,40 +379,51 @@ struct test_dualview_realloc { const unsigned int n = 10; const unsigned int m = 5; - ViewType a("A", n, m); - if (Initialize) - Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); - else + ViewType a; + if constexpr (Initialize) { + a = ViewType("A", n, m); Kokkos::realloc(a, n, m); + } else { + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); + } + ASSERT_EQ(a.extent(0), n); + ASSERT_EQ(a.extent(1), m); Kokkos::deep_copy(a.d_view, 1); + a.modify_device(); a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::View errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_realloc() { - run_me >(); + run_me>(); } }; @@ -463,12 +484,23 @@ TEST(TEST_CATEGORY, dualview_deep_copy) { test_dualview_deep_copy(); } +struct NoDefaultConstructor { + NoDefaultConstructor(int i_) : i(i_) {} + KOKKOS_FUNCTION operator int() const { return i; } + + int i; +}; + TEST(TEST_CATEGORY, dualview_realloc) { test_dualview_realloc(); + Impl::test_dualview_realloc(); } TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize(); + Impl::test_dualview_resize(); } namespace { diff --git a/containers/unit_tests/TestVector.hpp b/containers/unit_tests/TestVector.hpp index a7d341b789d..19901a52ad5 100644 --- a/containers/unit_tests/TestVector.hpp +++ b/containers/unit_tests/TestVector.hpp @@ -21,6 +21,8 @@ #include #include #include +#include +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #include namespace Test { diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 24f4af31019..25aa6502152 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ 
b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -421,23 +421,6 @@ class CudaInternal { return cudaStreamSynchronize(stream); } - // The following are only available for cuda 11.2 and greater -#if (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - template - cudaError_t cuda_malloc_async_wrapper(void** devPtr, size_t size, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaMallocAsync(devPtr, size, get_input_stream(hStream)); - } - - template - cudaError_t cuda_free_async_wrapper(void* devPtr, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaFreeAsync(devPtr, get_input_stream(hStream)); - } -#endif - // C++ API routines template cudaError_t cuda_func_get_attributes_wrapper(cudaFuncAttributes* attr, diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index 6d541a64148..1f3d0783449 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -153,7 +153,7 @@ void HPX::impl_instance_fence_locked(const std::string &name) const { auto &s = impl_get_sender(); hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } @@ -184,7 +184,7 @@ void HPX::impl_static_fence(const std::string &name) { } hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index ba1626bb72e..26e8a12be11 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -29,7 +29,6 @@ #include #include #include -#include #include namespace Kokkos { @@ -80,7 +79,11 @@ struct ArrayBoundsCheck { /**\brief Derived from the C++17 'std::array'. * Dropping the iterator interface. */ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template +#else +template +#endif struct Array { public: /** @@ -129,10 +132,26 @@ struct Array { KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { return &m_internal_implementation_private_member_data[0]; } + + private: + template + friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< + Impl::is_swappable::value> + kokkos_swap(Array& a, + Array& b) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } + } }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template struct Array { +#else +template +struct Array { +#endif public: using reference = T&; using const_reference = std::add_const_t&; @@ -176,16 +195,27 @@ struct Array { // for default move constructor and move assignment operator. 
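The kokkos_swap overloads added to Kokkos::Array above use the hidden-friend idiom: the function is only found through argument-dependent lookup on the array type, so it does not pollute the enclosing namespace. A minimal standalone sketch of the idiom, assuming C++17; FixedArray and swap_elements are illustrative names, not Kokkos APIs:

#include <cstddef>
#include <type_traits>
#include <utility>

template <class T, std::size_t N>
struct FixedArray {
  T elems[N];

  // Defined as a friend inside the class: callable only via ADL on
  // FixedArray, mirroring the kokkos_swap friend above.
  friend void swap_elements(FixedArray& a, FixedArray& b) noexcept(
      std::is_nothrow_swappable_v<T>) {
    for (std::size_t i = 0; i < N; ++i) {
      using std::swap;
      swap(a.elems[i], b.elems[i]);  // element-wise, like kokkos_swap
    }
  }
};

// Usage: an unqualified call, found by ADL.
//   FixedArray<int, 3> x{{1, 2, 3}}, y{{4, 5, 6}};
//   swap_elements(x, y);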
// Array( Array && ) = default ; // Array & operator = ( Array && ) = default ; + + private: + friend KOKKOS_INLINE_FUNCTION constexpr void kokkos_swap( + Array&, Array&) noexcept {} }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +struct KokkosArrayContiguous {}; +struct KokkosArrayStrided {}; +} // namespace Impl + template <> -struct Array { - struct contiguous {}; - struct strided {}; +struct KOKKOS_DEPRECATED Array { + using contiguous = Impl::KokkosArrayContiguous; + using strided = Impl::KokkosArrayStrided; }; template -struct Array::contiguous> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; @@ -253,7 +283,8 @@ struct Array::contiguous> { }; template -struct Array::strided> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; @@ -320,10 +351,37 @@ struct Array::strided> { size_type arg_stride) : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +#endif template Array(T, Us...)->Array; +namespace Impl { + +template +KOKKOS_FUNCTION constexpr Array, N> to_array_impl( + T (&a)[N], std::index_sequence) { + return {{a[I]...}}; +} + +template +KOKKOS_FUNCTION constexpr Array, N> to_array_impl( + T(&&a)[N], std::index_sequence) { + return {{std::move(a[I])...}}; +} + +} // namespace Impl + +template +KOKKOS_FUNCTION constexpr auto to_array(T (&a)[N]) { + return Impl::to_array_impl(a, std::make_index_sequence{}); +} + +template +KOKKOS_FUNCTION constexpr auto to_array(T(&&a)[N]) { + return Impl::to_array_impl(std::move(a), std::make_index_sequence{}); +} + } // namespace Kokkos // diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index ee8d1e09d3a..fbd6668a611 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3235,7 +3235,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, v = view_type(); // Best effort to deallocate in case no other view refers // to the shared allocation v = view_type(arg_prop_copy, n0, n1, n2, n3, n4, n5, n6, n7); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -3330,7 +3333,10 @@ impl_realloc(Kokkos::View& v, if (v.layout() != layout) { v = view_type(); // Deallocate first, if the only view to allocation v = view_type(arg_prop, layout); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -3450,6 +3456,7 @@ struct MirrorType { using view_type = Kokkos::View; }; +// collection of static asserts for create_mirror and create_mirror_view template void check_view_ctor_args_create_mirror() { using alloc_prop_input = Impl::ViewCtorProp; @@ -3468,36 +3475,29 @@ void check_view_ctor_args_create_mirror() { "not explicitly allow padding!"); } +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t::has_memory_space, - typename Kokkos::View::HostMirror> -create_mirror(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - using src_type = View; - using dst_type = typename src_type::HostMirror; - +inline auto create_mirror(const Kokkos::View& src, + const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, 
std::string(src.label()).append("_mirror")); - return dst_type(prop_copy, src.layout()); -} - -// Create a mirror in a new space (specialization for different space) -template ::has_memory_space>> -auto create_mirror(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - check_view_ctor_args_create_mirror(); - - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); - using alloc_prop = decltype(prop_copy); - - return typename Impl::MirrorType::view_type(prop_copy, src.layout()); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using memory_space = typename decltype(prop_copy)::memory_space; + using dst_type = + typename Impl::MirrorType::view_type; + return dst_type(prop_copy, src.layout()); + } else { + using dst_type = typename View::HostMirror; + return dst_type(prop_copy, src.layout()); + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl @@ -3555,66 +3555,40 @@ create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::View::memory_space, - typename Kokkos::View::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp&) { - check_view_ctor_args_create_mirror(); - return src; -} - -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -// Create a mirror view in a new space (specialization for same space) -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp&) { - check_view_ctor_args_create_mirror(); - return src; -} - -// Create a mirror view in a new space (specialization for different space) -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); +inline auto create_mirror_view( + const Kokkos::View& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::View< + T, P...>::HostMirror::data_type>::value) { + check_view_ctor_args_create_mirror(); + return typename Kokkos::View::HostMirror(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr 
(Impl::MirrorViewType::memory_space, + T, P...>::is_same_memspace) { + check_view_ctor_args_create_mirror(); + return typename Impl::MirrorViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl @@ -3685,16 +3659,13 @@ auto create_mirror_view(const Impl::ViewCtorProp& arg_prop, return Impl::create_mirror_view(src, arg_prop); } -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::View& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { +namespace Impl { + +// collection of static asserts for create_mirror_view_and_copy +template +void check_view_ctor_args_create_mirror_view_and_copy() { using alloc_prop_input = Impl::ViewCtorProp; + static_assert( alloc_prop_input::has_memory_space, "The view constructor arguments passed to " @@ -3707,52 +3678,49 @@ auto create_mirror_view_and_copy( "The view constructor arguments passed to " "Kokkos::create_mirror_view_and_copy must " "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; } -template +} // namespace Impl + +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, - const Kokkos::View& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, + const Kokkos::View& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + + Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorViewType::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename 
alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } } // Previously when using auto here, the intel compiler 19.3 would diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index 5f251eeb26a..3a04101aad1 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -714,6 +714,58 @@ class TeamPolicy } }; +// Execution space not provided deduces to TeamPolicy<> + +TeamPolicy()->TeamPolicy<>; + +TeamPolicy(int, int)->TeamPolicy<>; +TeamPolicy(int, int, int)->TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, int)->TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, int, Kokkos::AUTO_t const&)->TeamPolicy<>; + +// DefaultExecutionSpace deduces to TeamPolicy<> + +TeamPolicy(DefaultExecutionSpace const&, int, int)->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, int)->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&) + ->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int) + ->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, + Kokkos::AUTO_t const&) + ->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&) + ->TeamPolicy<>; + +// ES != DefaultExecutionSpace deduces to TeamPolicy + +template >> +TeamPolicy(ES const&, int, int)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, int, int)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) + ->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&)->TeamPolicy; + namespace Impl { template diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index b255d2a5195..ceca2130e75 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -562,6 +562,36 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_WARNING(desc) KOKKOS_IMPL_DO_PRAGMA(message(#desc)) #endif +// clang-format off +#if defined(__NVCOMPILER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("diag_suppress 1216") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("diag_default 1216") +#elif defined(__EDG__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning push") \ + _Pragma("warning disable 1478") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning pop") +#elif defined(__GNUC__) || defined(__clang__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("GCC diagnostic pop") +#elif 
defined(_MSC_VER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning(push)") \ + _Pragma("warning(disable: 4996)") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning(pop)") +#else + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif +// clang-format on + #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index 3fead8dd293..19967782e5e 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -277,12 +277,20 @@ KOKKOS_INLINE_FUNCTION long long abs(long long n) { #endif } KOKKOS_INLINE_FUNCTION float abs(float x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } KOKKOS_INLINE_FUNCTION double abs(double x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } inline long double abs(long double x) { using std::abs; diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 9be8d8d7aa1..2b7f275d06d 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -413,12 +413,13 @@ KOKKOS_FORCEINLINE_FUNCTION pair tie(T1& x, T2& y) { return (pair(x, y)); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 // // Specialization of Kokkos::pair for a \c void second argument. This // is not actually a "pair"; it only contains one element, the first. // template -struct pair { +struct KOKKOS_DEPRECATED pair { using first_type = T1; using second_type = void; @@ -449,40 +450,41 @@ struct pair { // template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( const pair& lhs, const pair& rhs) { return lhs.first == rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( const pair& lhs, const pair& rhs) { return !(lhs == rhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( const pair& lhs, const pair& rhs) { return lhs.first < rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( const pair& lhs, const pair& rhs) { return !(rhs < lhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( const pair& lhs, const pair& rhs) { return rhs < lhs; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } +#endif namespace Impl { template diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 09c6e780ef5..a6c6c955b87 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -38,6 +38,7 @@ static_assert(false, #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include +#include #endif #include @@ -372,6 +373,32 @@ struct ViewTraits { //------------------------------------ }; +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace 
Impl { +struct UnsupportedKokkosArrayLayout; + +template +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. +template +struct MDSpanViewTraits::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType::type; + using mdspan_layout_type = + typename Impl::LayoutFromArrayLayout::type; + using mdspan_type = + mdspan; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + /** \class View * \brief View to an array of data. * @@ -1722,6 +1749,76 @@ class View : public ViewTraits { "Layout is not constructible from extent arguments. Use " "overload taking a layout object instead."); } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template ::mdspan_type> + KOKKOS_INLINE_FUNCTION +#if defined(__cpp_conditional_explicit) && \ + (__cpp_conditional_explicit >= 201806L) + // FIXME C++20 reevaluate after determining minimum compiler versions + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits::mdspan_type>( + mds.mapping())) { + } + + template + KOKKOS_INLINE_FUNCTION +#if defined(__cpp_conditional_explicit) && \ + (__cpp_conditional_explicit >= 201806L) + // FIXME C++20 reevaluate after determining minimum compiler versions + explicit(!std::is_convertible_v< + Kokkos::mdspan, + typename Impl::MDSpanViewTraits::mdspan_type>) +#endif + View(const Kokkos::mdspan& mds) + : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template , + typename Impl::MDSpanViewTraits::mdspan_type>>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map)}; + } + + template , + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = OtherAccessorType()) { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + using ret_mdspan_type = + mdspan; + return ret_mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN }; template diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp index 4fce680aef0..2b98018e3bb 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp @@ -44,10 +44,12 @@ class Kokkos::Impl::ParallelFor, auto team_size = m_policy.team_size(); auto vector_length = m_policy.impl_vector_length(); + int const async_arg = m_policy.space().acc_async_queue(); + auto const a_functor(m_functor); #pragma acc parallel loop gang vector num_gangs(league_size) \ - vector_length(team_size* vector_length) copyin(a_functor) + vector_length(team_size* vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size * team_size * vector_length; i++) { int league_id = i / (team_size * vector_length); typename Policy::member_type team(league_id, league_size, team_size, @@ -145,10 +147,12 @@ class 
Kokkos::Impl::ParallelFor, auto team_size = m_policy.team_size(); auto vector_length = m_policy.impl_vector_length(); + int const async_arg = m_policy.space().acc_async_queue(); + auto const a_functor(m_functor); #pragma acc parallel loop gang num_gangs(league_size) num_workers(team_size) \ - vector_length(vector_length) copyin(a_functor) + vector_length(vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size; i++) { int league_id = i; typename Policy::member_type team(league_id, league_size, team_size, diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index ea4e7f6baba..84c7b85f11d 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -146,7 +146,8 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { /*--------------------------------------------------------------------------*/ #include -#include +#include +#include #include /*--------------------------------------------------------------------------*/ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp index d718f56d38b..e353676b617 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp @@ -22,6 +22,10 @@ namespace Kokkos { namespace Impl { +using OpenMPTargetIterateLeft = std::integral_constant; +using OpenMPTargetIterateRight = + std::integral_constant; + template struct ThreadAndVectorNestLevel +#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::MDRangePolicy; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + const FunctorType m_functor; + const Policy m_policy; + + public: + inline void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + FunctorType functor(m_functor); + Policy policy = m_policy; + + typename Policy::point_type unused; + static_assert(1 < Policy::rank && Policy::rank < 7); + static_assert(Policy::inner_direction == Iterate::Left || + Policy::inner_direction == Iterate::Right); + + execute_tile( + unused, functor, policy, + std::integral_constant()); + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, 
OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; 
++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i5 = begin_5; i5 < end_5; ++i5) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i1 = begin_1; i1 < end_1; ++i1) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; 
++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + // TODO DZP: based on a conversation with Christian, we're using 256 as a + // heuristic here. We need something better once we can query these kinds of + // properties + template + static int max_tile_size_product(const Policy&, const Functor&) { + return 256; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP */ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp similarity index 61% rename from core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp rename to core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index 6878531730d..e86a1219749 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -14,128 +14,120 @@ // //@HEADER -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP #include #include -#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" #include -// WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly, -// this was tracked down to a bug in clang with regards of mapping structs -// with arrays of long in it. Arrays of int might be fine though ... 
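The rank-specific execute_tile overloads above are selected by tag dispatch on the iteration direction: OpenMPTargetIterateLeft/Right are integral_constant aliases, and the overload chosen decides which index becomes the innermost loop under the collapse clause. A minimal standalone sketch of the dispatch pattern, using plain host loops and illustrative names (Direction, run_tile) rather than the Kokkos types:

#include <type_traits>

enum class Direction { Left, Right };
using LeftTag = std::integral_constant<Direction, Direction::Left>;
using RightTag = std::integral_constant<Direction, Direction::Right>;

// Iterate::Left: the leftmost index i0 is the fastest-varying (innermost)
// loop, as in the OpenMPTargetIterateLeft overloads above.
template <class Functor>
void run_tile(int n0, int n1, const Functor& f, LeftTag) {
  for (int i1 = 0; i1 < n1; ++i1)
    for (int i0 = 0; i0 < n0; ++i0) f(i0, i1);
}

// Iterate::Right: the rightmost index i1 is the fastest-varying loop.
template <class Functor>
void run_tile(int n0, int n1, const Functor& f, RightTag) {
  for (int i0 = 0; i0 < n0; ++i0)
    for (int i1 = 0; i1 < n1; ++i1) f(i0, i1);
}

// The caller selects the overload at compile time, e.g.
//   run_tile(n0, n1, f, LeftTag{});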
-#define KOKKOS_IMPL_MDRANGE_USE_NO_TILES // undef EOF - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { +template +class ParallelReduce, + Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using Index = typename Policy::index_type; - const FunctorType m_functor; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + + static constexpr bool UseReducer = + !std::is_same_v; + + const pointer_type m_result_ptr; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; + using ParReduceCopy = ParallelReduceCopy; + + bool m_result_ptr_on_device; + public: inline void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); - Policy policy = m_policy; - -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - typename Policy::point_type unused; - - execute_tile(unused, functor, policy); -#else - const int64_t begin = 0; - const int64_t end = m_policy.m_num_tiles; - -#pragma omp target teams distribute map(to : functor) num_teams(end - begin) - { - for (ptrdiff_t tile_idx = begin; tile_idx < end; ++tile_idx) { - -#pragma omp parallel - { - typename Policy::point_type offset; - if (Policy::outer_direction == Policy::Left) { - for (int i = 0; i < Policy::rank; ++i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } else { - for (int i = Policy::rank - 1; i >= 0; --i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } - execute_tile(offset, functor, policy); - } - } - } -#endif + // Only let one ParallelReduce instance at a time use the scratch memory. 
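The UseReducer branches below rely on OpenMP user-defined reductions: a join function combines two partial results and an init function seeds each thread-private copy, which is the role OpenMPTargetReducerWrapper::join/init play in the pragmas. A minimal standalone sketch of the mechanism, assuming a host-side parallel for and an illustrative MinLoc reducer rather than the Kokkos wrappers:

#include <cfloat>

struct MinLoc {
  double val;
  int loc;
};

// 'join' merges a partial result into the running one; 'init' provides the
// identity value for each thread-private copy (omp_priv).
inline void join(MinLoc& out, const MinLoc& in) {
  if (in.val < out.val) out = in;
}
inline void init(MinLoc& v) { v = {DBL_MAX, -1}; }

#pragma omp declare reduction(minloc : MinLoc : join(omp_out, omp_in)) \
    initializer(init(omp_priv))

// Find the smallest element and its index; the custom reduction combines
// the per-thread partial results on exit from the parallel loop.
inline MinLoc min_location(const double* x, int n) {
  MinLoc r;
  init(r);
#pragma omp parallel for reduction(minloc : r)
  for (int i = 0; i < n; ++i) {
    if (x[i] < r.val) r = {x[i], i};
  }
  return r;
}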
+ std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + execute_tile( + m_functor_reducer.get_functor(), m_policy, m_result_ptr, + std::integral_constant()); } - template + template + inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + Policy arg_policy, const ViewType& arg_result_view) + : m_result_ptr(arg_result_view.data()), + m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr_on_device( + MemorySpaceAccess::accessible) {} + + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to \ + : functor) \ + reduction(custom \ + : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? 
end_1 : policy.m_upper[1]; - -#pragma omp for collapse(2) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + } else { +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ +reduction(+:result) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } -#endif + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -144,107 +136,119 @@ class ParallelFor, const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join( \ + omp_out, omp_in)) \ + initializer( \ + OpenMPTargetReducerWrapper ::init( \ + omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? 
end_2 : policy.m_upper[2]; - -#pragma omp for collapse(3) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + } else { +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ +reduction(+:result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } -#endif + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; + const Index begin_2 = policy.m_lower[3]; + const Index begin_3 = policy.m_lower[2]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; const Index end_3 = policy.m_upper[3]; -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? 
end_3 : policy.m_upper[3]; - -#pragma omp for collapse(4) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + } else { +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ +reduction(+:result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } -#endif + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -257,64 +261,65 @@ class ParallelFor, const Index end_3 = policy.m_upper[3]; const Index end_4 = policy.m_upper[4]; -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? 
end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; - -#pragma omp for collapse(5) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } else { +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ +reduction(+:result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } -#endif + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -329,140 +334,69 @@ class ParallelFor, const Index end_4 = policy.m_upper[4]; const Index end_5 = policy.m_upper[5]; -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); + functor(i0, i1, i2, i3, i4, i5, result); else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); } } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? 
end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; - - const ptrdiff_t begin_5 = offset[5]; - ptrdiff_t end_5 = begin_5 + policy.m_tile[5]; - end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5]; - -#pragma omp for collapse(6) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) - for (ptrdiff_t i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5); + } else { +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ +reduction(+:result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); + } } -#endif - } - - inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - // TODO DZP: based on a conversation with Christian, we're using 256 as a - // heuristic here. We need something better once we can query these kinds of - // properties - template - static int max_tile_size_product(const Policy&, const Functor&) { - return 256; - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -class ParallelReduce, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::MDRangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - - static constexpr bool UseReducer = - !std::is_same_v; - - const pointer_type m_result_ptr; - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - - using ParReduceCopy = ParallelReduceCopy; - - bool m_result_ptr_on_device; - - // Only let one ParallelReduce instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
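//----------------------------------------------------------------------------
// A minimal, self-contained sketch of the `declare reduction` pattern used in
// the OpenMPTarget hunks above. This is not Kokkos code: `MinLoc` and `join`
// are hypothetical stand-ins for the value type and for
// OpenMPTargetReducerWrapper<ReducerType>::join/init. It illustrates the
// FIXME_OPENMPTARGET note: the reduction identifier must appear on the
// combined construct itself, so the `custom` and `+` cases cannot share one
// loop nest.

#include <cstdio>

struct MinLoc {
  double val;
  int loc;
};

#pragma omp declare target
// Combiner: keep the smaller value (and its location).
inline void join(MinLoc& out, const MinLoc& in) {
  if (in.val < out.val) out = in;
}
#pragma omp end declare target

#pragma omp declare reduction(custom : MinLoc : join(omp_out, omp_in)) \
    initializer(omp_priv = MinLoc{1.0e300, -1})

int main() {
  MinLoc result{1.0e300, -1};
#pragma omp target teams distribute parallel for collapse(2) \
    reduction(custom : result)
  for (int i1 = 0; i1 < 100; ++i1)
    for (int i0 = 0; i0 < 100; ++i0) {
      // Each thread combines into its private copy of `result`.
      join(result, MinLoc{1.0 / (1 + i0 + i1), i1 * 100 + i0});
    }
  std::printf("min %g at %d\n", result.val, result.loc);
}
//----------------------------------------------------------------------------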
- std::scoped_lock m_scratch_memory_lock; + } + } + } + } + } - public: - inline void execute() const { - execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr); + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template - inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - Policy arg_policy, const ViewType& arg_result_view) - : m_result_ptr(arg_result_view.data()), - m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} - template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -509,9 +443,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -567,9 +501,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[3]; @@ -630,9 +564,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -701,9 +635,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -788,5 +722,4 @@ reduction(+:result) //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#undef KOKKOS_IMPL_MDRANGE_USE_NO_TILES -#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ +#endif /* KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP */ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index caa568a8925..4a112ed11d0 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -55,13 +55,13 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const 
int m_result_ptr_num_elems; - // Only let one ParallelReduce instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); const FunctorType& functor = m_functor_reducer.get_functor(); if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. @@ -108,8 +108,7 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + m_result_ptr_num_elems(arg_result_view.size()) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 8abffa47a43..16c0eedb818 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,12 +470,11 @@ class ParallelReduce m_scratch_memory_lock; - public: void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); const FunctorType& functor = m_functor_reducer.get_functor(); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -521,8 +520,7 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index c1f7851f413..b0d69328024 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -177,6 +177,10 @@ class ParallelScan, const idx_type chunk_size = 128; const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View @@ -225,6 +229,10 @@ class ParallelScanWithTotal, const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; if (N > 0) { + // Only let one ParallelReduce instance at a time use the scratch memory. 
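//----------------------------------------------------------------------------
// Sketch of the locking change made throughout these hunks: the scratch
// mutex moves from a std::scoped_lock *member* (pinned from construction to
// destruction) to a block-scoped lock inside execute(). Names here are
// assumptions for illustration -- g_scratch_mutex and g_scratch stand in for
// OpenMPTargetExec::m_mutex_scratch_ptr and the shared scratch arena.

#include <cstddef>
#include <mutex>
#include <vector>

std::mutex g_scratch_mutex;              // guards the shared scratch arena
std::vector<std::byte> g_scratch(1024);  // the arena itself

struct Reduction {
  void execute() const {
    // Hold the mutex only while the kernel actually uses the arena; a
    // member lock would serialize all concurrently *constructed* instances
    // even before any of them launched.
    std::scoped_lock lock(g_scratch_mutex);
    use_scratch(g_scratch.data(), g_scratch.size());
  }
  static void use_scratch(std::byte*, std::size_t) { /* launch kernel */ }
};

int main() { Reduction{}.execute(); }
//----------------------------------------------------------------------------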
+ std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 9a246f7642f..de5ddf405d4 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -110,6 +110,26 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif +#ifdef SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST + if (sycl_queue() + .has_property< + sycl::ext::intel::property::queue::immediate_command_list>()) + os << "Immediate command lists enforced\n"; + else if (sycl_queue() + .has_property()) + os << "Standard command queue enforced\n"; + else +#endif + { + os << "Immediate command lists and standard command queue allowed.\n"; + if (const char* environment_setting = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS")) + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=" + << environment_setting << " takes precedence.\n"; + else + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS not defined.\n"; + } int counter = 0; int active_device = Kokkos::device_id(); diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 4a1c910c73d..5843dca8123 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,7 +166,7 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -sycl::device_ptr SYCLInternal::resize_team_scratch_space( +Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. 
We use a pool to avoid any race @@ -251,7 +251,8 @@ void SYCLInternal::finalize() { m_queue.reset(); } -sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( + const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); @@ -271,7 +272,8 @@ sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { return m_scratchSpace; } -sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { +Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( + const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); @@ -291,7 +293,8 @@ sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { return m_scratchHost; } -sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( + const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index ab7e8ce71e0..2d784ef8a5f 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -43,13 +43,12 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - sycl::device_ptr scratch_space(const std::size_t size); - sycl::device_ptr scratch_flags(const std::size_t size); - sycl::host_ptr scratch_host(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); + Kokkos::Impl::sycl_host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, - std::int64_t bytes, - bool force_shrink = false); + Kokkos::Impl::sycl_device_ptr resize_team_scratch_space( + int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); uint32_t impl_get_instance_id() const; @@ -59,21 +58,22 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - sycl::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - sycl::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - sycl::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + Kokkos::Impl::sycl_host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space - static constexpr int m_n_team_scratch = 10; - mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable sycl::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; - mutable int m_current_team_scratch = 0; - mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; + static constexpr int m_n_team_scratch = 10; + mutable int64_t 
m_team_scratch_current_size[m_n_team_scratch] = {}; + mutable Kokkos::Impl::sycl_device_ptr + m_team_scratch_ptr[m_n_team_scratch] = {}; + mutable int m_current_team_scratch = 0; + mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index 7fbf5420f83..b58885192b9 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -181,12 +181,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy), diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index b4de7eb89ff..30c1ce41db4 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -43,8 +43,8 @@ template struct FunctorWrapperRangePolicyParallelForCustom { using WorkTag = typename Policy::work_tag; - void operator()(sycl::item<1> item) const { - const typename Policy::index_type id = item.get_linear_id(); + void operator()(sycl::nd_item<1> item) const { + const typename Policy::index_type id = item.get_global_linear_id(); if (id < m_work_size) { const auto shifted_id = id + m_begin; if constexpr (std::is_void_v) @@ -137,12 +137,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy) {} }; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index ecb4a863da2..57ff97e7f31 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -28,7 +28,7 @@ template class Kokkos::Impl::ParallelFor, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using functor_type = FunctorType; using size_type = ::Kokkos::Experimental::SYCL::size_type; @@ -44,19 +44,14 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor instance at a time use the team scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
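//----------------------------------------------------------------------------
// Context for the sycl::device_ptr -> Kokkos::Impl::sycl_device_ptr renames
// in the hunks above: the alias is introduced in the Kokkos_Setup_SYCL.hpp
// hunk near the end of this patch. A condensed view of that indirection (the
// real header #errors when the extension macro is missing; this sketch just
// falls back to the older sycl:: spellings seen on the '-' lines):

#include <sycl/sycl.hpp>

namespace Kokkos::Impl {
#if defined(SYCL_EXT_INTEL_USM_ADDRESS_SPACES) && \
    SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2
template <class T>
using sycl_device_ptr = sycl::ext::intel::device_ptr<T>;
template <class T>
using sycl_host_ptr = sycl::ext::intel::host_ptr<T>;
#else
template <class T>
using sycl_device_ptr = sycl::device_ptr<T>;
template <class T>
using sycl_host_ptr = sycl::host_ptr<T>;
#endif
}  // namespace Kokkos::Impl
//----------------------------------------------------------------------------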
- std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; template - sycl::event sycl_direct_launch(const Policy& policy, + sycl::event sycl_direct_launch(const sycl_device_ptr global_scratch_ptr, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); + const Kokkos::Experimental::SYCL& space = m_policy.space(); sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -72,7 +67,6 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -125,17 +119,31 @@ class Kokkos::Impl::ParallelFor, inline void execute() const { if (m_league_size == 0) return; - auto& space = *m_policy.space().impl_internal_space_instance(); + auto& instance = *m_policy.space().impl_internal_space_instance(); + + // Only let one instance at a time resize the instance's scratch memory + // allocations. + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = space.get_indirect_kernel_mem(); + indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( m_functor, indirectKernelMem); - sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, + sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); - space.register_team_scratch_event(m_scratch_pool_id, event); + instance.register_team_scratch_event(scratch_pool_id, event); } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -143,10 +151,7 @@ class Kokkos::Impl::ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = @@ -159,22 +164,14 @@ class Kokkos::Impl::ParallelFor, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. 
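//----------------------------------------------------------------------------
// Usage-level view of the scratch sizes plumbed through execute() above:
// m_scratch_size[0] covers the shared-memory (level-0) request while
// m_scratch_size[1] is the per-team level-1 request, so the pool grown by
// resize_team_scratch_space() must hold m_scratch_size[1] * m_league_size
// bytes. A hypothetical launch that exercises the level-1 path:

#include <Kokkos_Core.hpp>

void scratch_demo(int league_size, int team_size, int per_team_bytes) {
  using policy_t = Kokkos::TeamPolicy<Kokkos::DefaultExecutionSpace>;
  policy_t policy(league_size, team_size);
  // Request level-1 per-team scratch; the backend must reserve
  // per_team_bytes for each of the league_size teams.
  policy.set_scratch_size(1, Kokkos::PerTeam(per_team_bytes));
  Kokkos::parallel_for(
      "scratch_demo", policy,
      KOKKOS_LAMBDA(const policy_t::member_type& member) {
        // Each team may carve per_team_bytes out of its level-1 allocation.
        void* p = member.team_scratch(1).get_shmem(per_team_bytes);
        (void)p;
      });
}
//----------------------------------------------------------------------------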
- auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const auto& instance = *m_policy.space().impl_internal_space_instance(); + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! " "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index f55280e22e3..79f8afd4a3d 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -77,9 +77,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( - m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -94,10 +92,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + sycl_device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,7 +112,7 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible @@ -155,13 +153,13 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto scratch_flags = static_cast>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { @@ -330,6 +328,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -349,10 +353,6 @@ class Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 5333e3c8a83..2bad7749759 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -50,9 +50,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( - p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -69,10 +67,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + sycl_device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,7 +86,7 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { @@ -125,13 +123,13 @@ class Kokkos::Impl::ParallelReduce>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::device_ptr results_ptr, int values_per_thread) { + sycl_device_ptr results_ptr, int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -302,7 +300,7 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( + static_cast>(instance.scratch_space( sizeof(value_type) * value_count * n_wgroups)); sycl::local_accessor local_mem( @@ -347,6 +345,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -366,10 +370,6 @@ class Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 27165c59e3a..43c6ca44019 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -30,7 +30,7 @@ class Kokkos::Impl::ParallelReduce, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; @@ -54,24 +54,18 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; const size_type m_vector_size; - // Only let one ParallelReduce instance at a time use the team scratch memory - // and the host scratch memory. The constructor acquires the mutex which is - // released in the destructor. - std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; - template + template sycl::event sycl_direct_launch( - const PolicyType& policy, + const sycl_device_ptr global_scratch_ptr, const CombinedFunctorReducerWrapper& functor_reducer_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); + const Kokkos::Experimental::SYCL& space = m_policy.space(); Kokkos::Experimental::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -82,7 +76,7 @@ class Kokkos::Impl::ParallelReduce>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -95,7 +89,7 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( + static_cast>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible @@ -113,7 +107,6 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -156,7 +149,7 @@ class Kokkos::Impl::ParallelReduce>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -170,12 +163,11 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - sycl::device_ptr results_ptr) { + sycl_device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -331,7 +323,7 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); results_ptr = - static_cast>(instance.scratch_space( + static_cast>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u) * init_size)); size_t max_work_groups = @@ -386,6 +378,22 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -395,14 +403,24 @@ class Kokkos::Impl::ParallelReduce + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = m_policy.team_size_recommended( @@ -423,22 +441,15 @@ class Kokkos::Impl::ParallelReduce>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const Kokkos::Experimental::Impl::SYCLInternal& instance = + *m_policy.space().impl_internal_space_instance(); + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! 
" "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } @@ -448,25 +459,6 @@ class Kokkos::Impl::ParallelReduce requested too large team size."); } - - public: - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { - initialize(); - } }; #endif diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 58cfea6a97a..b3d3e9e35ce 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,14 +146,10 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - sycl::host_ptr m_scratch_host = nullptr; + sycl_host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one ParallelScan instance at a time use the host scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_buffers_lock; - private: template sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, @@ -166,95 +162,93 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = static_cast>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); // Initialize global memory - auto scan_lambda_factory = - [&](sycl::local_accessor local_mem, - sycl::local_accessor num_teams_done, - sycl::device_ptr global_mem_, - sycl::device_ptr group_results_) { - auto lambda = [=](sycl::nd_item<1> item) { - auto global_mem = global_mem_; - auto group_results = group_results_; - - const CombinedFunctorReducer< - FunctorType, typename Analysis::Reducer>& functor_reducer = - functor_wrapper.get_functor(); - const FunctorType& functor = functor_reducer.get_functor(); - const typename Analysis::Reducer& reducer = - functor_reducer.get_reducer(); - - const auto n_wgroups = item.get_group_range()[0]; - const int wgroup_size = item.get_local_range()[0]; - - const int local_id = item.get_local_linear_id(); - const index_type global_id = item.get_global_linear_id(); - - // Initialize local memory - value_type local_value; - reducer.init(&local_value); - if (global_id < size) { - if constexpr (std::is_void::value) - functor(global_id + begin, local_value, false); - else - functor(WorkTag(), global_id + begin, local_value, false); - } + auto scan_lambda_factory = [&](sycl::local_accessor local_mem, + sycl::local_accessor + num_teams_done, + sycl_device_ptr global_mem_, + sycl_device_ptr group_results_) { + auto lambda = [=](sycl::nd_item<1> item) { + auto global_mem = global_mem_; + auto group_results = group_results_; + + const CombinedFunctorReducer& + functor_reducer = functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const typename Analysis::Reducer& reducer = + 
functor_reducer.get_reducer(); + + const auto n_wgroups = item.get_group_range()[0]; + const int wgroup_size = item.get_local_range()[0]; + + const int local_id = item.get_local_linear_id(); + const index_type global_id = item.get_global_linear_id(); + + // Initialize local memory + value_type local_value; + reducer.init(&local_value); + if (global_id < size) { + if constexpr (std::is_void::value) + functor(global_id + begin, local_value, false); + else + functor(WorkTag(), global_id + begin, local_value, false); + } - workgroup_scan<>(item, reducer, local_mem, local_value, - wgroup_size); + workgroup_scan<>(item, reducer, local_mem, local_value, wgroup_size); - // Write results to global memory - if (global_id < size) global_mem[global_id] = local_value; + // Write results to global memory + if (global_id < size) global_mem[global_id] = local_value; - if (local_id == wgroup_size - 1) { - group_results[item.get_group_linear_id()] = - local_mem[item.get_sub_group().get_group_range()[0] - 1]; + if (local_id == wgroup_size - 1) { + group_results[item.get_group_linear_id()] = + local_mem[item.get_sub_group().get_group_range()[0] - 1]; - sycl::atomic_ref - scratch_flags_ref(*scratch_flags); - num_teams_done[0] = ++scratch_flags_ref; - } - item.barrier(sycl::access::fence_space::global_space); - if (num_teams_done[0] == n_wgroups) { - if (local_id == 0) *scratch_flags = 0; - value_type total; - reducer.init(&total); - - for (unsigned int offset = 0; offset < n_wgroups; - offset += wgroup_size) { - index_type id = local_id + offset; - if (id < static_cast(n_wgroups)) - local_value = group_results[id]; - else - reducer.init(&local_value); - workgroup_scan<>( - item, reducer, local_mem, local_value, - std::min(n_wgroups - offset, wgroup_size)); - if (id < static_cast(n_wgroups)) { - reducer.join(&local_value, &total); - group_results[id] = local_value; - } - reducer.join( - &total, - &local_mem[item.get_sub_group().get_group_range()[0] - 1]); - if (offset + wgroup_size < n_wgroups) - item.barrier(sycl::access::fence_space::global_space); - } + sycl::atomic_ref + scratch_flags_ref(*scratch_flags); + num_teams_done[0] = ++scratch_flags_ref; + } + item.barrier(sycl::access::fence_space::global_space); + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; + value_type total; + reducer.init(&total); + + for (unsigned int offset = 0; offset < n_wgroups; + offset += wgroup_size) { + index_type id = local_id + offset; + if (id < static_cast(n_wgroups)) + local_value = group_results[id]; + else + reducer.init(&local_value); + workgroup_scan<>( + item, reducer, local_mem, local_value, + std::min(n_wgroups - offset, wgroup_size)); + if (id < static_cast(n_wgroups)) { + reducer.join(&local_value, &total); + group_results[id] = local_value; } - }; - return lambda; - }; + reducer.join( + &total, + &local_mem[item.get_sub_group().get_group_range()[0] - 1]); + if (offset + wgroup_size < n_wgroups) + item.barrier(sycl::access::fence_space::global_space); + } + } + }; + return lambda; + }; size_t wgroup_size; size_t n_wgroups; - sycl::device_ptr global_mem; - sycl::device_ptr group_results; + sycl_device_ptr global_mem; + sycl_device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -289,9 +283,9 @@ class ParallelScanSYCLBase { // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass global_mem = - static_cast>(instance.scratch_space( + static_cast>(instance.scratch_space( n_wgroups * (wgroup_size + 1) * 
sizeof(value_type))); - m_scratch_host = static_cast>( + m_scratch_host = static_cast>( instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; @@ -369,6 +363,11 @@ class ParallelScanSYCLBase { auto& instance = *m_policy.space().impl_internal_space_instance(); + // Only let one instance at a time resize the instance's scratch memory + // allocations. + std::scoped_lock scratch_buffers_lock( + instance.m_mutexScratchSpace); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -387,10 +386,7 @@ class ParallelScanSYCLBase { : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_scratch_buffers_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} }; } // namespace Kokkos::Impl diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 2b4c2be5227..910e3602714 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl::device_ptr scratch_level_1_ptr, + sycl_device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 7069805a5b5..c838a1abc58 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -21,14 +21,17 @@ namespace Kokkos::Impl::SYCLReduction { +// FIXME_SYCL For some types, shuffle reductions are competitive with local +// memory reductions but they are significantly slower for the value type used +// in combined reductions with multiple double arguments. template -inline constexpr bool use_shuffle_based_algorithm = - std::is_reference_v; +inline constexpr bool use_shuffle_based_algorithm = false; +// std::is_reference_v; template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - sycl::device_ptr results_ptr, + sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,7 +103,7 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, sycl::device_ptr results_ptr, + ValueType local_value, sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index f34a7daaca0..a25b51496ef 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -37,6 +37,8 @@ class TeamPolicyInternal int m_league_size; int m_chunk_size; + Kokkos::Serial m_space; + public: //! 
Tag this class as a kokkos execution policy using execution_policy = TeamPolicyInternal; @@ -46,10 +48,7 @@ class TeamPolicyInternal //! Execution space of this execution policy: using execution_space = Kokkos::Serial; - const typename traits::execution_space& space() const { - static typename traits::execution_space m_space; - return m_space; - } + const typename traits::execution_space& space() const { return m_space; } template friend class TeamPolicyInternal; @@ -116,12 +115,13 @@ class TeamPolicyInternal return (level == 0 ? 1024 * 32 : 20 * 1024 * 1024); } /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space&, int league_size_request, + TeamPolicyInternal(const execution_space& space, int league_size_request, int team_size_request, int /* vector_length_request */ = 1) : m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, m_league_size(league_size_request), - m_chunk_size(32) { + m_chunk_size(32), + m_space(space) { if (team_size_request > 1) Kokkos::abort("Kokkos::abort: Requested Team Size is too large!"); } diff --git a/core/src/Threads/Kokkos_Threads_Team.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp index fd0f221365b..a3501a437d2 100644 --- a/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -188,8 +188,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return value; - if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -229,8 +227,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return; - type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution @@ -285,8 +281,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return type(0); - volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -358,6 +352,7 @@ class ThreadsExecTeamMember { m_chunk_size(team.chunk_size()), m_league_chunk_end(0), m_team_alloc(team.team_alloc()) { + KOKKOS_ASSERT(m_instance != nullptr); if (team.league_size()) { // Execution is using device-team interface: diff --git a/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp b/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp index 3846b52d239..29d1e00adfc 100644 --- a/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp +++ b/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp @@ -37,9 +37,6 @@ struct ViewDimension; template struct ViewDataType; -} // namespace Kokkos::Impl - -namespace Kokkos::Experimental::Impl { // A few things to note -- // - mdspan allows for 0-rank extents similarly to View, so we don't need @@ -106,6 +103,20 @@ struct DataTypeFromExtents { // Will cause a compile error if it is malformed (i.e. 
dynamic after static) using type = typename ::Kokkos::Impl::ViewDataType::type; }; -} // namespace Kokkos::Experimental::Impl + +template +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping_impl( + const VM &view_mapping, std::index_sequence) { + return Extents{view_mapping.extent(Indices)...}; +} + +template +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping( + const VM &view_mapping) { + static_assert(Extents::rank() == VM::Rank); + return extents_from_view_mapping_impl( + view_mapping, std::make_index_sequence{}); +} +} // namespace Kokkos::Impl #endif // KOKKOS_EXPERIMENTAL_MDSPAN_EXTENTS_HPP diff --git a/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp new file mode 100644 index 00000000000..8073dee1eed --- /dev/null +++ b/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -0,0 +1,148 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP +#define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP + +#include "Kokkos_MDSpan_Extents.hpp" +#include + +namespace Kokkos::Impl { + +template +struct LayoutFromArrayLayout; + +template <> +struct LayoutFromArrayLayout { + using type = Experimental::layout_left_padded; +}; + +template <> +struct LayoutFromArrayLayout { + using type = Experimental::layout_right_padded; +}; + +template <> +struct LayoutFromArrayLayout { + using type = layout_stride; +}; + +template +KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( + const typename MDSpanType::mapping_type &mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + constexpr auto rank = extents_type::rank(); + const auto &ext = mapping.extents(); + + static_assert(rank <= ARRAY_LAYOUT_MAX_RANK, + "Unsupported rank for mdspan (must be <= 8)"); + + if constexpr (std::is_same_v) { + return Kokkos::LayoutStride{ + rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 0 ? mapping.stride(0) : 0, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? mapping.stride(1) : 0, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? mapping.stride(2) : 0, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? mapping.stride(3) : 0, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? mapping.stride(4) : 0, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? mapping.stride(5) : 0, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? mapping.stride(6) : 0, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? mapping.stride(7) : 0, + }; + } else { + // FIXME: Kokkos Layouts don't store stride (it's in the mapping) + // We could conceivably fix this by adding an extra ViewCtorProp for + // an abritrary padding. For now we will check for this. 
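//----------------------------------------------------------------------------
// Concrete rank-2 instance of the array_layout_from_mapping() conversion
// above, assuming a layout_stride mapping with extents {M, N} and strides
// {sM, sN}: the Kokkos::LayoutStride constructor takes alternating
// extent/stride pairs, with unused trailing ranks left at their defaults
// (KOKKOS_IMPL_CTOR_DEFAULT_ARG / 0 in the generic code above).

#include <Kokkos_Core.hpp>
#include <cstddef>

Kokkos::LayoutStride layout_from(std::size_t M, std::size_t sM,
                                 std::size_t N, std::size_t sN) {
  // Dimension 0 has extent M and stride sM; dimension 1 has extent N and
  // stride sN. A View built with this layout addresses element (i, j) at
  // offset i * sM + j * sN.
  return Kokkos::LayoutStride{M, sM, N, sN};
}
//----------------------------------------------------------------------------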
+ if constexpr (rank > 1 && + (std::is_same_v< + typename mapping_type::layout_type, + Experimental::layout_left_padded> || + std::is_same_v< + typename mapping_type::layout_type, + Experimental::layout_right_padded>)) { + [[maybe_unused]] constexpr size_t strided_index = + std::is_same_v> + ? 1 + : rank - 2; + [[maybe_unused]] constexpr size_t extent_index = + std::is_same_v> + ? 0 + : rank - 1; + KOKKOS_ASSERT(mapping.stride(strided_index) == ext.extent(extent_index)); + } + + return ArrayLayout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + } +} + +template +KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + // std::span is not available in C++17 (our current requirements), + // so we need to use the std::array constructor for layout mappings. + // FIXME When C++20 is available, we can use std::span here instead + std::size_t strides[VM::Rank]; + view_mapping.stride_fill(&strides[0]); + if constexpr (std::is_same_v) { + return mapping_type(Kokkos::mdspan_non_standard, + extents_from_view_mapping(view_mapping), + strides); + } else if constexpr (VM::Rank > 1 && + std::is_same_v>) { + return mapping_type(extents_from_view_mapping(view_mapping), + strides[1]); + } else if constexpr (VM::Rank > 1 && + std::is_same_v>) { + return mapping_type(extents_from_view_mapping(view_mapping), + strides[VM::Rank - 2]); + } else { + return mapping_type(extents_from_view_mapping(view_mapping)); + } +} + +} // namespace Kokkos::Impl + +#endif // KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 3217c76e380..c1f4c0290c1 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -17,6 +17,7 @@ #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP #define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP +#include #include #include @@ -647,34 +648,60 @@ struct ViewOffset< m_dim.N5 * m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. 
Preconditions: s must be an array of + // dimension_type::rank elements + // FIXME: The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_dim.N0; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_dim.N0; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements Stride with [ rank ] value is + // the total length + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -935,34 +962,59 @@ struct ViewOffset< m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. 
Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails on maybe_unused + // clang-format off template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_stride; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_stride; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1286,42 +1338,58 @@ struct ViewOffset< m_dim.N1; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails on maybe_unused + // clang-format off template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; n *= m_dim.N1; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = n; } - s[dimension_type::rank] = n * m_dim.N0; + return n * m_dim.N0; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1573,41 +1641,57 @@ struct ViewOffset< return m_stride; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails on maybe_unused + // clang-format off template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride; } - s[dimension_type::rank] = m_stride * m_dim.N0; + return m_stride * m_dim.N0; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2133,34 +2217,50 @@ struct ViewOffset { return m_stride.S7; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails on maybe_unused + // clang-format off template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - if (0 < dimension_type::rank) { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride.S0; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = m_stride.S1; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = m_stride.S2; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = m_stride.S3; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = m_stride.S4; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = m_stride.S5; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = m_stride.S6; } - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = m_stride.S7; } - s[dimension_type::rank] = span(); + return span(); + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2430,19 +2530,9 @@ namespace Kokkos { namespace Impl { template <typename T> -inline bool is_zero_byte(const T& t) { - using comparison_type = std::conditional_t< - sizeof(T) % sizeof(long long int) == 0, long long int, - std::conditional_t< - sizeof(T) % sizeof(long int) == 0, long int, - std::conditional_t< - sizeof(T) % sizeof(int) == 0, int, - std::conditional_t>>>; - const auto* const ptr = reinterpret_cast(&t); - for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i) - if (ptr[i] != 0) return false; - return true; +bool is_zero_byte(const T& x) { + constexpr std::byte all_zeroes[sizeof(T)] = {}; + return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; } //---------------------------------------------------------------------------- @@ -2814,11 +2904,24 @@ class ViewMapping< return m_impl_offset.stride_7(); } + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements template <typename iType> KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { m_impl_offset.stride(s); } + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + template <typename iType> + KOKKOS_INLINE_FUNCTION iType stride_fill(iType* const s) const { + return m_impl_offset.stride_fill(s); + } + + //---------------------------------------- // Range span diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 30f6fa2ad23..b117d75acb9 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -45,4 +45,21 @@ #define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() #endif +// FIXME_SYCL Use type directly once it has stabilized in SYCL. +namespace Kokkos::Impl { +#ifndef SYCL_EXT_INTEL_USM_ADDRESS_SPACES +#error SYCL_EXT_INTEL_USM_ADDRESS_SPACES undefined! +#elif SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2 +template <typename T> +using sycl_device_ptr = sycl::ext::intel::device_ptr<T>; +template <typename T> +using sycl_host_ptr = sycl::ext::intel::host_ptr<T>; +#else +template <typename T> +using sycl_device_ptr = sycl::device_ptr<T>; +template <typename T> +using sycl_host_ptr = sycl::host_ptr<T>; +#endif +} // namespace Kokkos::Impl + #endif diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 3b14bec03a2..2a56e46a943 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -93,6 +93,7 @@ SET(COMPILE_ONLY_SOURCES TestViewTypeTraits.cpp TestTypeList.cpp TestMDRangePolicyCTAD.cpp + TestTeamPolicyCTAD.cpp view/TestExtentsDatatypeConversion.cpp ) @@ -148,8 +149,10 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Crs DeepCopyAlignment ExecSpacePartitioning + ExecSpaceThreadSafety ExecutionSpace FunctorAnalysis + Graph HostSharedPtr HostSharedPtrAccessOnDevice Init @@ -173,7 +176,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) endforeach() set(${Tag}_SOURCES1B) - foreach(Name + set(${Tag}_TESTNAMES1B MDRange_a MDRange_b MDRange_c @@ -184,6 +187,7 @@ MDRangePolicyConstructors MDRangeReduce MDSpan + MDSpanConversion MinMaxClamp NumericTraits OccupancyControlTrait @@ -205,6 +209,10 @@ SharedAlloc Swap ) + IF (NOT Kokkos_ENABLE_IMPL_MDSPAN) + LIST(REMOVE_ITEM ${Tag}_TESTNAMES1B MDSpanConversion) + ENDIF() + + foreach(Name IN LISTS ${Tag}_TESTNAMES1B) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
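The stride()/stride_fill() split applied above follows one contract in every ViewOffset specialization and in ViewMapping: stride_fill() writes one stride per rank into s and returns the total spanned size, while stride() forwards to stride_fill() and additionally stores that size at index rank. A minimal standalone sketch of the same contract for a rank-N layout-left mapping (SimpleLayoutLeft and its members are hypothetical illustrations, not Kokkos API):

#include <cstddef>

// Hypothetical simplified analogue of the ViewOffset stride()/stride_fill() pair.
template <std::size_t Rank>
struct SimpleLayoutLeft {
  std::size_t extent[Rank];

  // Writes one stride per rank into s (Rank elements) and returns the span.
  std::size_t stride_fill(std::size_t* s) const {
    std::size_t n = 1;
    for (std::size_t r = 0; r < Rank; ++r) {
      s[r] = n;       // stride of rank r is the product of lower extents
      n *= extent[r];
    }
    return n;         // total spanned size
  }

  // Writes the strides and stores the total span at s[Rank];
  // s must have Rank + 1 elements.
  void stride(std::size_t* s) const { s[Rank] = stride_fill(s); }
};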
@@ -637,6 +645,8 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) endif() if(Kokkos_ENABLE_SERIAL) + list(REMOVE_ITEM Serial_SOURCES1 + ${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_ExecSpaceThreadSafety.cpp) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_Serial1 SOURCES @@ -650,12 +660,6 @@ if(Kokkos_ENABLE_SERIAL) UnitTestMainInit.cpp ${Serial_SOURCES2} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SerialGraph - SOURCES - UnitTestMainInit.cpp - serial/TestSerial_Graph.cpp - ) endif() if(Kokkos_ENABLE_THREADS) @@ -667,6 +671,9 @@ if(Kokkos_ENABLE_THREADS) endif() if (Kokkos_ENABLE_OPENMP) + list(REMOVE_ITEM OpenMP_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/openmp/TestOpenMP_ExecSpaceThreadSafety.cpp) + set(OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp ) @@ -683,12 +690,6 @@ if (Kokkos_ENABLE_OPENMP) UnitTestMain.cpp openmp/TestOpenMP_InterOp.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_OpenMPGraph - SOURCES - UnitTestMainInit.cpp - openmp/TestOpenMP_Graph.cpp - ) endif() if(Kokkos_ENABLE_HPX) @@ -796,12 +797,6 @@ if(Kokkos_ENABLE_CUDA) UnitTestMainInit.cpp cuda/TestCuda_InterOp_StreamsMultiGPU.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CudaGraph - SOURCES - UnitTestMainInit.cpp - cuda/TestCuda_Graph.cpp - ) endif() if(Kokkos_ENABLE_HIP) @@ -829,12 +824,6 @@ if(Kokkos_ENABLE_HIP) UnitTestMain.cpp hip/TestHIP_InterOp_Streams.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HIPGraph - SOURCES - UnitTestMainInit.cpp - hip/TestHIP_Graph.cpp - ) endif() if(Kokkos_ENABLE_SYCL) diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index e138a64d6db..fb6334322b7 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -18,6 +18,11 @@ namespace { +// nvcc errors on variables only used in static_asserts +// Passing those variables to this function should eliminate the warning +template +KOKKOS_FUNCTION constexpr void maybe_unused(Ts&&...) 
{} + KOKKOS_FUNCTION constexpr bool test_array() { constexpr Kokkos::Array a{{1, 2}}; @@ -120,4 +125,69 @@ static_assert(test_array_aggregate_initialization()); } } +constexpr bool test_array_const_qualified_element_type() { + Kokkos::Array a{255}; + return a[0] == 255; +} + +static_assert(test_array_const_qualified_element_type()); + +// User-defined type providing a specialization of kokkos_swap +struct MyInt { + int i; + + private: + friend constexpr void kokkos_swap(MyInt& lhs, MyInt& rhs) noexcept { + lhs.i = 255; + rhs.i = 127; + } +}; + +constexpr bool test_array_specialization_kokkos_swap() { + Kokkos::Array<MyInt, 2> a{MyInt{1}, MyInt{2}}; + Kokkos::Array<MyInt, 2> b{MyInt{11}, MyInt{22}}; + + // sanity check + if (a[0].i != 1 || a[1].i != 2 || b[0].i != 11 || b[1].i != 22) { + return false; + } + + using Kokkos::kokkos_swap; + kokkos_swap(a, b); + + // check that the user-defined kokkos_swap(MyInt) overload was called + if (a[0].i != 255 || a[1].i != 255 || b[0].i != 127 || b[1].i != 127) { + return false; + } + + return true; +} + +static_assert(test_array_specialization_kokkos_swap()); + +constexpr bool test_to_array() { + // copies a string literal + [[maybe_unused]] auto a1 = Kokkos::to_array("foo"); + static_assert(a1.size() == 4); + maybe_unused(a1); + + // deduces both element type and length + [[maybe_unused]] auto a2 = Kokkos::to_array({0, 2, 1, 3}); + static_assert(std::is_same_v<decltype(a2), Kokkos::Array<int, 4>>); + maybe_unused(a2); + +// gcc8 doesn't support the implicit conversion +#if !defined(KOKKOS_COMPILER_GNU) || (KOKKOS_COMPILER_GNU >= 910) + // deduces length with element type specified + // implicit conversion happens + [[maybe_unused]] auto a3 = Kokkos::to_array({0, 1, 3}); + static_assert(std::is_same_v>); + maybe_unused(a3); +#endif + + return true; +} + +static_assert(test_to_array()); + } // namespace diff --git a/core/unit_test/TestArrayOps.hpp b/core/unit_test/TestArrayOps.hpp index 06528572714..6b8e0f3aca3 100644 --- a/core/unit_test/TestArrayOps.hpp +++ b/core/unit_test/TestArrayOps.hpp @@ -111,6 +111,8 @@ TEST(TEST_CATEGORY, array_zero_data_nullptr) { ASSERT_EQ(ce.data(), nullptr); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() TEST(TEST_CATEGORY, array_contiguous_capacity) { using A = Kokkos::Array::contiguous>; @@ -389,5 +391,7 @@ TEST(TEST_CATEGORY, array_strided_assignment) { ASSERT_EQ(e.max_size(), std::size(ee) / eStride); ASSERT_EQ(e[0], ee[0]); } +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif } // namespace diff --git a/core/unit_test/TestExecSpacePartitioning.hpp b/core/unit_test/TestExecSpacePartitioning.hpp index 65314d6be7c..f8b570ab64d 100644 --- a/core/unit_test/TestExecSpacePartitioning.hpp +++ b/core/unit_test/TestExecSpacePartitioning.hpp @@ -28,6 +28,17 @@ struct SumFunctor { void operator()(int i, int& lsum) const { lsum += i; } }; +template <class ExecSpace> +void check_space_member_for_policies(const ExecSpace& exec) { + Kokkos::RangePolicy<ExecSpace> range_policy(exec, 0, 1); + ASSERT_EQ(range_policy.space(), exec); + Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>> mdrange_policy(exec, {0, 0}, + {1, 1}); + ASSERT_EQ(mdrange_policy.space(), exec); + Kokkos::TeamPolicy<ExecSpace> team_policy(exec, 1, Kokkos::AUTO); + ASSERT_EQ(team_policy.space(), exec); +} + template <class ExecSpace> void check_distinctive([[maybe_unused]] ExecSpace exec1, [[maybe_unused]] ExecSpace exec2) { @@ -89,6 +100,9 @@ void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { void test_partitioning(std::vector<ExecSpace>& instances) { check_distinctive(instances[0], instances[1]); + check_space_member_for_policies(instances[0]); + 
check_space_member_for_policies(instances[1]); + int sum1, sum2; int N = 3910; run_threaded_test( diff --git a/core/unit_test/TestExecSpaceThreadSafety.hpp b/core/unit_test/TestExecSpaceThreadSafety.hpp new file mode 100644 index 00000000000..20b802babe0 --- /dev/null +++ b/core/unit_test/TestExecSpaceThreadSafety.hpp @@ -0,0 +1,319 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +#ifdef KOKKOS_ENABLE_OPENMP +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { +#pragma omp parallel num_threads(2) + { + if (omp_get_thread_num() == 0) l1(); + if (omp_get_thread_num() == 1) l2(); + } +} +// We cannot run the multithreaded test when threads or HPX is enabled because +// we cannot launch a thread from inside another thread +#elif !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_HPX) +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + std::thread t1(l1); + std::thread t2(l2); + t1.join(); + t2.join(); +} +#else +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + l1(); + l2(); +} +#endif + +// The idea for all of these tests is to access a View from kernels submitted by +// two different threads to the same execution space instance. If the kernels +// are executed concurrently, we expect to count too many increments. 
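The run_threaded_test() helper above picks a dispatch mechanism per backend, and every test that follows applies the recipe described in the comment. The same recipe can be reproduced without Kokkos: two plain threads share one "instance" (here a mutex standing in for the execution space instance), and each repetition resets, increments, and checks a counter; removing the lock lets the kernels interleave and makes the check fail, which is exactly the signal the tests look for. A standalone sketch under those assumptions (all names hypothetical):

#include <cassert>
#include <mutex>
#include <thread>

int main() {
  int counter = 0;
  bool error = false;
  std::mutex instance;  // stands in for the shared execution space instance

  auto work = [&] {
    for (int j = 0; j < 10; ++j) {
      std::lock_guard<std::mutex> lock(instance);  // kernels serialized per instance
      counter = 0;
      for (int i = 0; i < 100000; ++i) ++counter;
      if (counter != 100000) error = true;  // only possible with concurrent access
    }
  };

  std::thread t1(work), t2(work);
  t1.join();
  t2.join();
  assert(!error);
  return 0;
}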
+void run_exec_space_thread_safety_range() { + constexpr int N = 10000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::RangePolicy(exec, 0, 1), KOKKOS_LAMBDA(int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_range(); +} + +void run_exec_space_thread_safety_mdrange() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_mdrange(); +} + +void run_exec_space_thread_safety_team_policy() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member) { + Kokkos::single(Kokkos::PerTeam(team_member), [=]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + run_exec_space_thread_safety_team_policy(); +} + +void run_exec_space_thread_safety_range_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + 
run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_reduce) { + run_exec_space_thread_safety_range_reduce(); +} + +void run_exec_space_thread_safety_mdrange_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange_reduce) { +// FIXME_INTEL +#ifdef KOKKOS_COMPILER_INTEL + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMP using the " + "legacy Intel compiler"; +#endif + run_exec_space_thread_safety_mdrange_reduce(); +} + +void run_exec_space_thread_safety_team_policy_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member, + int &update) { + Kokkos::single(Kokkos::PerTeam(team_member), [=, &update]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }); + }, + error); + } + }; + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy_reduce) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + // FIXME_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with SYCL+Cuda"; +#endif + run_exec_space_thread_safety_team_policy_reduce(); +} + +void run_exec_space_thread_safety_range_scan() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_scan( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &, const bool final) { + if (final) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + } + }); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_scan) { + run_exec_space_thread_safety_range_scan(); +} + +} // namespace diff --git 
a/core/unit_test/TestGraph.hpp b/core/unit_test/TestGraph.hpp index 9a36d08f445..735114d4c25 100644 --- a/core/unit_test/TestGraph.hpp +++ b/core/unit_test/TestGraph.hpp @@ -66,7 +66,7 @@ struct SetResultToViewFunctor { } }; -struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { +struct TEST_CATEGORY_FIXTURE(graph) : public ::testing::Test { public: using count_functor = CountTestFunctor; using set_functor = SetViewToValueFunctor; @@ -88,7 +88,7 @@ struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { } }; -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one) { auto graph = Kokkos::Experimental::create_graph([&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); @@ -101,7 +101,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one_rvalue) { Kokkos::Experimental::create_graph(ex, [&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); }).submit(); @@ -112,7 +112,16 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_six) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET team_size incompatible + if (std::is_same_v) + GTEST_SKIP() << "skipping since OpenMPTarget can't use team_size 1"; +#endif +#if defined(KOKKOS_ENABLE_SYCL) // FIXME_SYCL + if (std::is_same_v) + GTEST_SKIP() << "skipping since test case is known to fail with SYCL"; +#endif + auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0}); auto f_setup_bugs = root.then_parallel_for(1, set_functor{bugs, 0}); @@ -145,7 +154,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), when_all_cycle) { view_type reduction_out{"reduction_out"}; view_host reduction_host{"reduction_host"}; Kokkos::Experimental::create_graph(ex, [&](auto root) { @@ -172,7 +181,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { // This test is disabled because we don't currently support copying to host, // even asynchronously. We _may_ want to do that eventually? -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), DISABLED_repeat_chain) { auto graph = Kokkos::Experimental::create_graph( ex, [&, count_host = count_host](auto root) { //---------------------------------------- @@ -198,7 +207,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { //---------------------------------------- } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), zero_work_reduce) { auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { root.then_parallel_reduce(0, set_result_functor{bugs}, count); }); @@ -214,9 +223,13 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { // UVM works on pre pascal cards. 
#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \ (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL)) - Kokkos::fence(); + if constexpr (std::is_same_v) Kokkos::fence(); +#endif +#ifdef KOKKOS_ENABLE_HPX // FIXME_HPX graph.submit() isn't properly enqueued + if constexpr (std::is_same_v) + Kokkos::fence(); #endif - graph.submit(); // should reset to 0, but doesn't + graph.submit(); Kokkos::deep_copy(ex, count_host, count); ex.fence(); ASSERT_EQ(count_host(), 0); diff --git a/core/unit_test/TestMDSpanConversion.hpp b/core/unit_test/TestMDSpanConversion.hpp new file mode 100644 index 00000000000..6519a7c277d --- /dev/null +++ b/core/unit_test/TestMDSpanConversion.hpp @@ -0,0 +1,504 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include +#include "experimental/__p0009_bits/layout_stride.hpp" + +namespace { + +template +struct TestViewMDSpanConversion { + using value_type = T; + + template + using layout_left_padded = Kokkos::Experimental::layout_left_padded; + + template + using layout_right_padded = + Kokkos::Experimental::layout_right_padded; + + struct TestAccessor { + using offset_policy = TestAccessor; + using element_type = value_type; + using reference = element_type &; + using data_handle_type = element_type *; + + constexpr TestAccessor() noexcept = default; + constexpr reference access(data_handle_type p, std::size_t i) noexcept { + return p[i]; + } + constexpr data_handle_type offset(data_handle_type p, + std::size_t i) noexcept { + return p + i; + } + }; + + template + static void test_conversion_from_mdspan( + Kokkos::View ref, + const MDSpanLayoutMapping &mapping) { + using unmanaged_view_type = + Kokkos::View>; + using natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename unmanaged_view_type::traits>::mdspan_type; + using mapping_type = MDSpanLayoutMapping; + using mdspan_layout_type = typename MDSpanLayoutMapping::layout_type; + using extents_type = typename mapping_type::extents_type; + using mdspan_type = + Kokkos::mdspan; + + static_assert(std::is_constructible_v); + static_assert(std::is_convertible_v == + std::is_convertible_v); + // Manually create an mdspan from ref so we have a valid pointer to play + // with + const auto &exts = mapping.extents(); + auto mds = mdspan_type{ref.data(), mapping}; + + auto test_view = unmanaged_view_type(mds); + + ASSERT_EQ(test_view.data(), ref.data()); + ASSERT_EQ(test_view.data(), mds.data_handle()); + ASSERT_EQ(test_view.layout(), ref.layout()); + for (std::size_t r = 0; r < mdspan_type::rank(); ++r) { + ASSERT_EQ(test_view.extent(r), ref.extent(r)); + ASSERT_EQ(test_view.extent(r), exts.extent(r)); + } + } + + template + static void test_conversion_to_mdspan( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v) { + using view_type = ViewType; + using natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename view_type::traits>::mdspan_type; + + static_assert(natural_mdspan_type::rank() == view_type::rank); + static_assert(std::is_same_v); + 
constexpr bool is_strided_layout = + std::is_same_v; + if constexpr (!is_strided_layout) { + static_assert(natural_mdspan_type::mapping_type::padding_value == + Kokkos::dynamic_extent); + } + // test conversion operator to natural mdspan + { + natural_mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + + if constexpr (!is_strided_layout && natural_mdspan_type::rank() > 1) { + ASSERT_EQ(cvt.mapping().stride(1), ref_layout_mapping.stride(1)); + } + } + // test to_mdspan() returning natural mdspan + { + auto cvt = v.to_mdspan(); + static_assert(std::is_same_v); + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + // test conversion operator to different mdspan type + { + using mdspan_type = Kokkos::mdspan< + const typename natural_mdspan_type::element_type, + Kokkos::dextents, + typename natural_mdspan_type::layout_type, + typename natural_mdspan_type::accessor_type>; + mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + } + + template + static void test_conversion_to_mdspan_with_accessor( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v, + const AccessorType &a) { + auto cvt = v.to_mdspan(a); + static_assert(decltype(cvt)::rank() == ViewType::rank); + static_assert(std::is_same_v); + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + + template + using natural_mdspan_type_for_view = typename Kokkos::Impl::MDSpanViewTraits< + typename ViewType::traits>::mdspan_type; + + static void run_test() { + // Verify we can only convert to compatible mdspans + static_assert(std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + static_assert( + std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Do not cast const away + static_assert(!std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Mismatched dim + static_assert(!std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Mismatched layouts + static_assert( + !std::is_convertible_v, + natural_mdspan_type_for_view>>); + static_assert( + !std::is_convertible_v, + natural_mdspan_type_for_view>>); + // nvcc doesn't do CTAD properly here, making this way more verbose.. 
+ // LayoutLeft + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + test_conversion_from_mdspan( + Kokkos::View("ref", + 7, 3), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7, 3)}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7, 3)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7, 3), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + // LayoutRight + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + test_conversion_from_mdspan( + Kokkos::View("ref", + 3, 7), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(3, 7)}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(3, 7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 3, 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + // LayoutStride + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::dextents{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, {}, strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::dextents{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 
2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents(7, 3), strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents(7, 3), strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + + // Conversion to mdspan + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4)); + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7)); + + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", + 4)); + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7)); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5})); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9})); + } + + // Aligned types (for padded layouts) + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 128), + Kokkos::View( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 127, 7)); + + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 128), + Kokkos::View( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 7, 127)); + + // Conversion with standard default_accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + Kokkos::default_accessor{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7), + Kokkos::default_accessor{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + Kokkos::default_accessor{}); + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7), + Kokkos::default_accessor{}); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5}), + Kokkos::default_accessor{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + Kokkos::default_accessor{}); + } + + // Conversion with a test accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + 
Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7), + TestAccessor{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7), + TestAccessor{}); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5}), + TestAccessor{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + TestAccessor{}); + } + } +}; + +TEST(TEST_CATEGORY, view_mdspan_conversion) { + TestViewMDSpanConversion::run_test(); + TestViewMDSpanConversion::run_test(); + TestViewMDSpanConversion::run_test(); +} + +} // namespace diff --git a/core/unit_test/TestRealloc.hpp b/core/unit_test/TestRealloc.hpp index 2c9dc5ee473..f30c9e15e1c 100644 --- a/core/unit_test/TestRealloc.hpp +++ b/core/unit_test/TestRealloc.hpp @@ -144,6 +144,11 @@ void impl_testRealloc() { EXPECT_EQ(oldPointer, newPointer); } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; template void testRealloc() { @@ -154,6 +159,14 @@ void testRealloc() { impl_testRealloc(); // without data initialization } + // Check #6992 fix (no default initialization in realloc without initializing) + { + using view_type = Kokkos::View; + view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + realloc_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewRealloc diff --git a/core/unit_test/TestResize.hpp b/core/unit_test/TestResize.hpp index 13d7e16d589..3102d2b9a16 100644 --- a/core/unit_test/TestResize.hpp +++ b/core/unit_test/TestResize.hpp @@ -358,6 +358,12 @@ void impl_testResize() { } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; + template void testResize() { { @@ -367,6 +373,13 @@ void testResize() { impl_testResize(); // without data initialization } + { + using view_type = Kokkos::View; + view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + resize_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewResize diff --git a/core/unit_test/TestTeamPolicyCTAD.cpp b/core/unit_test/TestTeamPolicyCTAD.cpp new file mode 100644 index 00000000000..07aaeae819e --- /dev/null +++ b/core/unit_test/TestTeamPolicyCTAD.cpp @@ -0,0 +1,135 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestTeamPolicyCTAD { + template + static void maybe_unused(Ts&&...) 
{} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v<SomeExecutionSpace>); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int i; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for a HIP-ROCm-5.2 warning about variables that are declared but never referenced + TestTeamPolicyCTAD() { maybe_unused(des, notEs, ses, i, notEsToDes); } + + // Default construction deduces to TeamPolicy<> + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, decltype(Kokkos::TeamPolicy{})>); + + // Execution space not provided deduces to TeamPolicy<> + + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, decltype(Kokkos::TeamPolicy(i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, i, Kokkos::AUTO))>); + + // DefaultExecutionSpace deduces to TeamPolicy<> + + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, i, i))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, i, Kokkos::AUTO))>); + + // Convertible to DefaultExecutionSpace deduces to TeamPolicy<> + + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, i, i))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy( + notEs, i, Kokkos::AUTO, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, i, Kokkos::AUTO))>); + + // SES != DefaultExecutionSpace deduces to TeamPolicy<SomeExecutionSpace> + + static_assert(std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, i, i))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, i, Kokkos::AUTO))>); +}; + +} // namespace diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 5e16539d652..e278789992f 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -1067,6 +1067,10 @@ TEST(TEST_CATEGORY, parallel_scan_with_reducers) { } #endif +#ifdef KOKKOS_IMPL_32BIT + GTEST_SKIP() << "Failing KOKKOS_IMPL_32BIT"; // FIXME_32BIT +#endif + checkScan>() .run(); diff --git a/core/unit_test/category_files/TestHPX_Category.hpp b/core/unit_test/category_files/TestHPX_Category.hpp index d3a7cdbea53..c6a2aa9f201 100644 --- a/core/unit_test/category_files/TestHPX_Category.hpp +++ b/core/unit_test/category_files/TestHPX_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 3 #define TEST_CATEGORY_DEATH hpx_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::HPX +#define TEST_CATEGORY_FIXTURE(name) hpx_##name #endif diff --git a/core/unit_test/category_files/TestOpenACC_Category.hpp b/core/unit_test/category_files/TestOpenACC_Category.hpp index 0c4e4b7e119..6105eadf14f 100644 --- a/core/unit_test/category_files/TestOpenACC_Category.hpp +++ b/core/unit_test/category_files/TestOpenACC_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 8 #define TEST_CATEGORY_DEATH openacc_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::OpenACC +#define TEST_CATEGORY_FIXTURE(name) openacc_##name #endif diff --git a/core/unit_test/category_files/TestOpenMPTarget_Category.hpp b/core/unit_test/category_files/TestOpenMPTarget_Category.hpp index 235b34ffab7..921cff78902 100644 --- a/core/unit_test/category_files/TestOpenMPTarget_Category.hpp +++ b/core/unit_test/category_files/TestOpenMPTarget_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 4 #define TEST_CATEGORY_DEATH openmptarget_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::OpenMPTarget +#define TEST_CATEGORY_FIXTURE(name) openmptarget_##name #endif diff --git a/core/unit_test/category_files/TestSYCL_Category.hpp b/core/unit_test/category_files/TestSYCL_Category.hpp index 8e1b18c9acd..59e72c72c77 100644 --- a/core/unit_test/category_files/TestSYCL_Category.hpp +++ b/core/unit_test/category_files/TestSYCL_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 7 #define TEST_CATEGORY_DEATH sycl_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::SYCL +#define TEST_CATEGORY_FIXTURE(name) sycl_##name #endif diff --git a/core/unit_test/category_files/TestThreads_Category.hpp b/core/unit_test/category_files/TestThreads_Category.hpp index 13b0b653f21..ae8ac608339 100644 --- a/core/unit_test/category_files/TestThreads_Category.hpp +++ b/core/unit_test/category_files/TestThreads_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 1 #define TEST_CATEGORY_DEATH threads_DeathTest #define TEST_EXECSPACE Kokkos::Threads +#define TEST_CATEGORY_FIXTURE(name) threads_##name #endif diff --git a/core/unit_test/cuda/TestCuda_Graph.cpp b/core/unit_test/cuda/TestCuda_Graph.cpp deleted file mode 100644 index 27203639690..00000000000 --- a/core/unit_test/cuda/TestCuda_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/headers_self_contained/CMakeLists.txt b/core/unit_test/headers_self_contained/CMakeLists.txt index f792b03ed88..4c364ceee75 100644 --- a/core/unit_test/headers_self_contained/CMakeLists.txt +++ b/core/unit_test/headers_self_contained/CMakeLists.txt @@ -10,7 +10,8 @@ file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE ${BASE_DIR}/algorithms/src ${BASE_DIR}/algorithms/src/*.hpp) -if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) +# erroring out when deprecated code is disabled and raising warnings that are treated as errors in the CI otherwise +if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 OR Kokkos_ENABLE_DEPRECATION_WARNINGS) list(REMOVE_ITEM KOKKOS_CONTAINERS_HEADERS "Kokkos_Vector.hpp") endif() diff --git a/core/unit_test/hip/TestHIP_Graph.cpp b/core/unit_test/hip/TestHIP_Graph.cpp deleted file mode 100644 index 405cb76c643..00000000000 --- a/core/unit_test/hip/TestHIP_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/incremental/Test01_execspace.hpp b/core/unit_test/incremental/Test01_execspace.hpp index d7b2a57b442..a7fa26c7282 100644 --- a/core/unit_test/incremental/Test01_execspace.hpp +++ b/core/unit_test/incremental/Test01_execspace.hpp @@ -63,7 +63,9 @@ struct TestIncrExecSpace { ASSERT_GT(concurrency, 0); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() int in_parallel = ExecSpace::in_parallel(); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() ASSERT_FALSE(in_parallel); #endif diff --git a/core/unit_test/openmp/TestOpenMP_Graph.cpp b/core/unit_test/openmp/TestOpenMP_Graph.cpp deleted file mode 100644 index 22c8ab1bf8f..00000000000 --- a/core/unit_test/openmp/TestOpenMP_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/serial/TestSerial_Graph.cpp b/core/unit_test/serial/TestSerial_Graph.cpp deleted file mode 100644 index bff64d83e27..00000000000 --- a/core/unit_test/serial/TestSerial_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/view/TestExtentsDatatypeConversion.cpp b/core/unit_test/view/TestExtentsDatatypeConversion.cpp index b95890614e0..1b9b2a36819 100644 --- a/core/unit_test/view/TestExtentsDatatypeConversion.cpp +++ b/core/unit_test/view/TestExtentsDatatypeConversion.cpp @@ -23,15 +23,14 @@ namespace { // Helper to make static tests more succinct template -constexpr bool datatype_matches_extent = - std::is_same_v::type, - Extent>; +constexpr bool datatype_matches_extent = std::is_same_v< + typename Kokkos::Impl::ExtentsFromDataType::type, + Extent>; template constexpr bool extent_matches_datatype = - std::is_same_v::type>; + std::is_same_v::type>; // Conversion from DataType to extents // 0-rank view diff --git a/example/README b/example/README index 66860512448..2fe87276484 100644 --- a/example/README +++ b/example/README @@ -1,7 +1,7 @@ This directory contains example application proxies that use different parts of Kokkos. If you are looking for the FENL ("finite element -nonlinear" solve) example, it has moved into the LinAlg subpackage of -Tpetra. +nonlinear" solve) example, it has moved into the TrilinosCouplings +package in Trilinos. MANIFEST: diff --git a/example/build_cmake_installed/CMakeLists.txt b/example/build_cmake_installed/CMakeLists.txt index aaf745b418d..c025f1d7d28 100644 --- a/example/build_cmake_installed/CMakeLists.txt +++ b/example/build_cmake_installed/CMakeLists.txt @@ -12,6 +12,7 @@ find_package(Kokkos REQUIRED) add_executable(example cmake_example.cpp foo.f) if(CMAKE_Fortran_COMPILER_ID STREQUAL LLVMFlang) set_target_properties(example PROPERTIES LINKER_LANGUAGE Fortran) + target_link_options(example PRIVATE -fno-fortran-main) endif() # This is the only thing required to set up compiler/linker flags diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index c587ccf3046..74141f25316 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -81,7 +81,9 @@ class absolutes { auto on_host(T const& a) const { if constexpr (std::is_signed_v) { #if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() return Kokkos::Experimental::abs(a); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #else return Kokkos::abs(a); #endif diff --git a/tpls/desul/include/desul/atomics/Adapt_HIP.hpp b/tpls/desul/include/desul/atomics/Adapt_HIP.hpp new file mode 100644 index 00000000000..0eab27fe989 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Adapt_HIP.hpp @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_ADAPT_HIP_HPP_ +#define DESUL_ATOMICS_ADAPT_HIP_HPP_ + +#include + +namespace desul { +namespace Impl { + +// FIXME same code as GCCMemoryOrder +template +struct HIPMemoryOrder; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELAXED; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQUIRE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELEASE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQ_REL; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_SEQ_CST; +}; + +// __HIP_MEMORY_SCOPE_SYSTEM +// __HIP_MEMORY_SCOPE_AGENT +// __HIP_MEMORY_SCOPE_WORKGROUP +// __HIP_MEMORY_SCOPE_WAVEFRONT +// __HIP_MEMORY_SCOPE_SINGLETHREAD +template +struct HIPMemoryScope; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_WORKGROUP; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_AGENT; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp index 8c909bacdf4..0ade34f25df 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp @@ -9,6 +9,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ #define DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ +#include #include #include #include @@ -17,130 +18,40 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { namespace Impl { -template -__device__ std::enable_if_t device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned int) == 4, - "this function assumes an unsigned int is 32-bit"); - unsigned int return_val = atomicCAS(reinterpret_cast(dest), - reinterpret_cast(compare), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} -template -__device__ std::enable_if_t device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned long long int) == 8, - "this function assumes an unsigned long long is 64-bit"); - unsigned long long int return_val = - atomicCAS(reinterpret_cast(dest), - reinterpret_cast(compare), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} +template +struct atomic_exchange_available_hip { + constexpr static bool value = + ((sizeof(T) == 1 && alignof(T) == 1) || (sizeof(T) == 4 && alignof(T) == 4) || + (sizeof(T) == 8 && alignof(T) == 8)) && + std::is_trivially_copyable::value; +}; -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) { - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return return_val; -} - -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) { - 
diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp
index 8c909bacdf4..0ade34f25df 100644
--- a/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp
+++ b/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp
@@ -9,6 +9,7 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_
 #define DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_
 
+#include <desul/atomics/Adapt_HIP.hpp>
 #include <desul/atomics/Common.hpp>
 #include <desul/atomics/Lock_Array_HIP.hpp>
 #include <desul/atomics/Thread_Fence_HIP.hpp>
@@ -17,130 +18,40 @@ SPDX-License-Identifier: (BSD-3-Clause)
 namespace desul {
 namespace Impl {
 
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4, T> device_atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
-  static_assert(sizeof(unsigned int) == 4,
-                "this function assumes an unsigned int is 32-bit");
-  unsigned int return_val = atomicCAS(reinterpret_cast<unsigned int*>(dest),
-                                      reinterpret_cast<unsigned int&>(compare),
-                                      reinterpret_cast<unsigned int&>(value));
-  return reinterpret_cast<T&>(return_val);
-}
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 8, T> device_atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
-  static_assert(sizeof(unsigned long long int) == 8,
-                "this function assumes an unsigned long long is 64-bit");
-  unsigned long long int return_val =
-      atomicCAS(reinterpret_cast<unsigned long long int*>(dest),
-                reinterpret_cast<unsigned long long int&>(compare),
-                reinterpret_cast<unsigned long long int&>(value));
-  return reinterpret_cast<T&>(return_val);
-}
+template <class T>
+struct atomic_exchange_available_hip {
+  constexpr static bool value =
+      ((sizeof(T) == 1 && alignof(T) == 1) || (sizeof(T) == 4 && alignof(T) == 4) ||
+       (sizeof(T) == 8 && alignof(T) == 8)) &&
+      std::is_trivially_copyable<T>::value;
+};
 
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T>
-device_atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
-  T return_val = atomic_compare_exchange(
-      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
-  return return_val;
-}
-
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T>
-device_atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
-  T return_val = atomic_compare_exchange(
-      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
-  return return_val;
-}
-
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T>
+template <class T, class MemoryOrder, class MemoryScope>
+__device__ std::enable_if_t<atomic_exchange_available_hip<T>::value, T>
 device_atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
-  T return_val = atomic_compare_exchange(
-      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
-  return return_val;
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
+  (void)__hip_atomic_compare_exchange_strong(
+      dest,
+      &compare,
+      value,
+      HIPMemoryOrder<MemoryOrder>::value,
+      HIPMemoryOrder<cmpexch_failure_memory_order<MemoryOrder>>::value,
+      HIPMemoryScope<MemoryScope>::value);
+  return compare;
 }
 
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4, T> device_atomic_exchange(
-    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
-  static_assert(sizeof(unsigned int) == 4,
-                "this function assumes an unsigned int is 32-bit");
-  unsigned int return_val = atomicExch(reinterpret_cast<unsigned int*>(dest),
-                                       reinterpret_cast<unsigned int&>(value));
-  return reinterpret_cast<T&>(return_val);
-}
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 8, T> device_atomic_exchange(
-    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
-  static_assert(sizeof(unsigned long long int) == 8,
-                "this function assumes an unsigned long long is 64-bit");
-  unsigned long long int return_val =
-      atomicExch(reinterpret_cast<unsigned long long int*>(dest),
-                 reinterpret_cast<unsigned long long int&>(value));
-  return reinterpret_cast<T&>(return_val);
-}
-
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> device_atomic_exchange(
-    T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
-  T return_val = device_atomic_compare_exchange(
-      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
-  device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
-  return reinterpret_cast<T&>(return_val);
-}
-
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> device_atomic_exchange(
-    T* const dest, T /*compare*/, T value, MemoryOrderAcquire, MemoryScope) {
-  device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
-  T return_val =
-      device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
-  return reinterpret_cast<T&>(return_val);
-}
-
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> device_atomic_exchange(
-    T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
-  device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
-  T return_val =
-      device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
-  device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
-  return reinterpret_cast<T&>(return_val);
-}
-
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> device_atomic_exchange(
-    T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
-  device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
-  T return_val =
-      device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
-  device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
-  return reinterpret_cast<T&>(return_val);
-}
-
-template <class T, class MemoryScope>
-__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T>
-device_atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
-  device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
-  T return_val = device_atomic_compare_exchange(
-      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
-  device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+template <class T, class MemoryOrder, class MemoryScope>
+__device__ std::enable_if_t<atomic_exchange_available_hip<T>::value, T>
+device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope) {
+  T return_val = __hip_atomic_exchange(dest,
+                                       value,
+                                       HIPMemoryOrder<MemoryOrder>::value,
+                                       HIPMemoryScope<MemoryScope>::value);
   return return_val;
 }
 
 template <class T, class MemoryOrder, class MemoryScope>
-__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T>
+__device__ std::enable_if_t<!atomic_exchange_available_hip<T>::value, T>
 device_atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
   // This is a way to avoid deadlock in a warp or wave front
@@ -169,7 +80,7 @@ device_atomic_compare_exchange(
 }
 
 template <class T, class MemoryOrder, class MemoryScope>
-__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T>
+__device__ std::enable_if_t<!atomic_exchange_available_hip<T>::value, T>
 device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) {
   // This is a way to avoid deadlock in a warp or wave front
   T return_val;
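[Editor's note] The `atomic_exchange_available_hip` trait above is what decides which types take the `__hip_atomic_*` fast path and which fall through to the lock-based loop retained at the end of the file. A few illustrative evaluations (the `three_bytes` type is hypothetical, not from the patch):

// 1-, 4-, and 8-byte trivially copyable types use the builtins directly.
static_assert(desul::Impl::atomic_exchange_available_hip<char>::value);
static_assert(desul::Impl::atomic_exchange_available_hip<int>::value);
static_assert(desul::Impl::atomic_exchange_available_hip<double>::value);

// Anything else, e.g. a 3-byte struct, is handled by the lock-based
// compare-exchange loop that avoids warp/wavefront deadlock.
struct three_bytes {
  char c[3];
};
static_assert(!desul::Impl::atomic_exchange_available_hip<three_bytes>::value);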
diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp
index e9c749809de..920722084d1 100644
--- a/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp
+++ b/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp
@@ -9,99 +9,106 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #ifndef DESUL_ATOMICS_FECH_OP_HIP_HPP_
 #define DESUL_ATOMICS_FECH_OP_HIP_HPP_
 
+#include <desul/atomics/Adapt_HIP.hpp>
+
 namespace desul {
 namespace Impl {
 
-// clang-format off
-inline __device__ int device_atomic_fetch_add( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); }
-inline __device__ unsigned int device_atomic_fetch_add( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); }
-inline __device__ unsigned long long device_atomic_fetch_add(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); }
-inline __device__ float device_atomic_fetch_add( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); }
-inline __device__ double device_atomic_fetch_add( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); }
-
-inline __device__ int device_atomic_fetch_sub( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); }
-inline __device__ unsigned int device_atomic_fetch_sub( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); }
-inline __device__ unsigned long long device_atomic_fetch_sub(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); }
-inline __device__ float device_atomic_fetch_sub( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); }
-inline __device__ double device_atomic_fetch_sub( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); }
-
-inline __device__ int device_atomic_fetch_min( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); }
-inline __device__ unsigned int device_atomic_fetch_min( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); }
-inline __device__ unsigned long long device_atomic_fetch_min(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); }
-
-inline __device__ int device_atomic_fetch_max( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); }
-inline __device__ unsigned int device_atomic_fetch_max( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); }
-inline __device__ unsigned long long device_atomic_fetch_max(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); }
-
-inline __device__ int device_atomic_fetch_and( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); }
-inline __device__ unsigned int device_atomic_fetch_and( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); }
-inline __device__ unsigned long long device_atomic_fetch_and(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); }
-
-inline __device__ int device_atomic_fetch_or ( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); }
-inline __device__ unsigned int device_atomic_fetch_or ( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); }
-inline __device__ unsigned long long device_atomic_fetch_or (unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); }
+#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, T)                           \
+  template <class MemoryOrder, class MemoryScope>                       \
+  __device__ inline T device_atomic_fetch_##OP(                         \
+      T* ptr, T val, MemoryOrder, MemoryScope) {                        \
+    return __hip_atomic_fetch_##OP(ptr,                                 \
+                                   val,                                 \
+                                   HIPMemoryOrder<MemoryOrder>::value,  \
+                                   HIPMemoryScope<MemoryScope>::value); \
+  }
 
-inline __device__ int device_atomic_fetch_xor( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); }
-inline __device__ unsigned int device_atomic_fetch_xor( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); }
-inline __device__ unsigned long long device_atomic_fetch_xor(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); }
+#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(OP)       \
+  DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, int)                 \
+  DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, long long)           \
+  DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned int)        \
+  DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned long long)
+
+#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \
+  DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, float)               \
+  DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, double)
+
+DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(add)
+DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(min)
+DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(max)
+DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(and)
+DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(or)
+DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(xor)
+DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(add)
+// atomic min/max gives the wrong results (tested with ROCm 6.0 on Frontier)
+// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(min)
+// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(max)
+
+#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT
+#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL
+#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP
+
+#define DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(T)                              \
+  template <class MemoryOrder, class MemoryScope>                       \
+  __device__ inline T device_atomic_fetch_sub(                          \
+      T* ptr, T val, MemoryOrder, MemoryScope) {                        \
+    return __hip_atomic_fetch_add(ptr,                                  \
+                                  -val,                                 \
+                                  HIPMemoryOrder<MemoryOrder>::value,   \
+                                  HIPMemoryScope<MemoryScope>::value);  \
+  }
 
-inline __device__ int device_atomic_fetch_inc( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1 ); }
-inline __device__ unsigned int device_atomic_fetch_inc( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1u ); }
-inline __device__ unsigned long long device_atomic_fetch_inc(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1ull); }
+DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(int)
+DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(long long)
+DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned int)
+DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned long long)
+DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(float)
+DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(double)
+
+#undef DESUL_IMPL_HIP_ATOMIC_FETCH_SUB
+
+#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC(T)                                           \
+  template <class MemoryOrder, class MemoryScope>                                    \
+  __device__ inline T device_atomic_fetch_inc( T* ptr, MemoryOrder, MemoryScope) {  \
+    return __hip_atomic_fetch_add(ptr,                                               \
+                                  1,                                                 \
+                                  HIPMemoryOrder<MemoryOrder>::value,                \
+                                  HIPMemoryScope<MemoryScope>::value);               \
+  }                                                                                  \
+  template <class MemoryOrder, class MemoryScope>                                    \
+  __device__ inline T device_atomic_fetch_dec( T* ptr, MemoryOrder, MemoryScope) {  \
+    return __hip_atomic_fetch_add(ptr,                                               \
+                                  -1,                                                \
+                                  HIPMemoryOrder<MemoryOrder>::value,                \
+                                  HIPMemoryScope<MemoryScope>::value);               \
+  }
 
-inline __device__ int device_atomic_fetch_dec( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1 ); }
-inline __device__ unsigned int device_atomic_fetch_dec( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1u ); }
-inline __device__ unsigned long long device_atomic_fetch_dec(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -1 ); }
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC(int)
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC(long long)
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned int)
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned long long)
 
-inline __device__ unsigned int device_atomic_fetch_inc_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicInc(ptr, val); }
-inline __device__ unsigned int device_atomic_fetch_dec_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr, val); }
-// clang-format on
+#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC
 
-#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, TYPE)                                \
+#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MEMORY_SCOPE, MEMORY_SCOPE_STRING_LITERAL) \
   template <class MemoryOrder>                                                         \
-  __device__ TYPE device_atomic_fetch_##OP(                                            \
-      TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeDevice) {                           \
-    __threadfence();                                                                   \
-    TYPE return_val =                                                                  \
-        device_atomic_fetch_##OP(ptr, val, MemoryOrderRelaxed(), MemoryScopeDevice()); \
-    __threadfence();                                                                   \
-    return return_val;                                                                 \
+  __device__ inline unsigned int device_atomic_fetch_inc_mod(unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \
+    return __builtin_amdgcn_atomic_inc32(                                              \
+        ptr, val, HIPMemoryOrder<MemoryOrder>::value, MEMORY_SCOPE_STRING_LITERAL);    \
   }                                                                                    \
   template <class MemoryOrder>                                                         \
-  __device__ TYPE device_atomic_fetch_##OP(                                            \
-      TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeCore) {                             \
-    return device_atomic_fetch_##OP(ptr, val, MemoryOrder(), MemoryScopeDevice());     \
+  __device__ inline unsigned int device_atomic_fetch_dec_mod(unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \
+    return __builtin_amdgcn_atomic_dec32(                                              \
+        ptr, val, HIPMemoryOrder<MemoryOrder>::value, MEMORY_SCOPE_STRING_LITERAL);    \
   }
 
-#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(OP) \
-  DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, int)           \
-  DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned int)  \
-  DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned long long)
-
-#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \
-  DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, float)               \
-  DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, double)
-
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(min)
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(max)
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(and)
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(or)
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(xor)
-
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(add)
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(add)
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(sub)
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(sub)
-
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(inc)
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(dec)
-
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(inc_mod, unsigned int)
-DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(dec_mod, unsigned int)
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeCore, "workgroup")
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeDevice, "agent")
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeNode, "")
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeSystem, "")
 
-#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT
-#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL
-#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP
+#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD
 
 } // namespace Impl
 } // namespace desul
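[Editor's note] For readers tracing the macros above: this is, modulo whitespace and inside namespace desul::Impl, what the single instantiation DESUL_IMPL_HIP_ATOMIC_FETCH_OP(add, int) expands to.

// Fetch-add through the clang builtin; order and scope integers are
// supplied by the Adapt_HIP.hpp traits shown earlier in this diff.
template <class MemoryOrder, class MemoryScope>
__device__ inline int device_atomic_fetch_add(int* ptr, int val, MemoryOrder, MemoryScope) {
  return __hip_atomic_fetch_add(ptr,
                                val,
                                HIPMemoryOrder<MemoryOrder>::value,
                                HIPMemoryScope<MemoryScope>::value);
}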
diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp
index 15ad577d149..05fce8ba44c 100644
--- a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp
+++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp
@@ -199,6 +199,12 @@ struct layout_stride {
       return __strides_storage_t{static_cast<index_type>(s[Idxs])...};
     }
 
+    template<class IntegralType>
+    MDSPAN_INLINE_FUNCTION
+    static constexpr const __strides_storage_t fill_strides(mdspan_non_standard_tag, const IntegralType (&s)[extents_type::rank()]) {
+      return __strides_storage_t{static_cast<index_type>(s[Idxs])...};
+    }
+
 #ifdef __cpp_lib_span
     template<class IntegralType>
     MDSPAN_INLINE_FUNCTION
@@ -309,6 +315,44 @@ struct layout_stride {
       */
     }
 
+    MDSPAN_TEMPLATE_REQUIRES(
+      class IntegralTypes,
+      /* requires */ (
+        // MSVC 19.32 does not like using index_type here, requires the typename Extents::index_type
+        // error C2641: cannot deduce template arguments for 'MDSPAN_IMPL_STANDARD_NAMESPACE::layout_stride::mapping'
+        _MDSPAN_TRAIT(std::is_convertible, const std::remove_const_t<IntegralTypes>&, typename Extents::index_type) &&
+        _MDSPAN_TRAIT(std::is_nothrow_constructible, typename Extents::index_type, const std::remove_const_t<IntegralTypes>&)
+      )
+    )
+    MDSPAN_INLINE_FUNCTION
+    constexpr
+    mapping(
+      mdspan_non_standard_tag,
+      extents_type const& e,
+      IntegralTypes (&s)[extents_type::rank()]
+    ) noexcept
+#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
+      : __members{
+#else
+      : __base_t(__base_t{__member_pair_t(
+#endif
+          e, __strides_storage_t(__impl::fill_strides(mdspan_non_standard, s))
+#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
+        }
+#else
+        )})
+#endif
+    {
+      /*
+       * TODO: check preconditions
+       * - s[i] > 0 is true for all i in the range [0, rank_ ).
+       * - REQUIRED-SPAN-SIZE(e, s) is a representable value of type index_type ([basic.fundamental]).
+       * - If rank_ is greater than 0, then there exists a permutation P of the integers in the
+       *   range [0, rank_), such that s[ pi ] >= s[ pi − 1 ] * e.extent( pi − 1 ) is true for
+       *   all i in the range [1, rank_ ), where pi is the ith element of P.
+       */
+    }
+
 #ifdef __cpp_lib_span
     MDSPAN_TEMPLATE_REQUIRES(
       class IntegralTypes,
diff --git a/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp b/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp
index 3eeb39755c8..523bca4e11d 100644
--- a/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp
+++ b/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp
@@ -629,3 +629,8 @@ struct __bools;
 // end Pre-C++14 constexpr }}}1
 //==============================================================================
+
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+constexpr struct mdspan_non_standard_tag {
+} mdspan_non_standard;
+} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE
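[Editor's note] A short usage sketch for the tag-dispatched `layout_stride::mapping` constructor added above. The header name and the `stdex` alias are assumptions; the real namespace is whatever `MDSPAN_IMPL_STANDARD_NAMESPACE` expands to in a given configuration.

#include <experimental/mdspan>  // header name assumed; depends on the install

namespace stdex = std::experimental;  // assumed namespace spelling

void illustrate_non_standard_ctor() {
  // Strides live in a plain C array; the mdspan_non_standard tag selects
  // the new overload instead of the std::array / std::span constructors.
  int strides[2] = {1, 4};  // column-major strides for a 4 x 3 extent
  stdex::extents<int, 4, 3> exts;
  stdex::layout_stride::mapping<stdex::extents<int, 4, 3>> map{
      stdex::mdspan_non_standard, exts, strides};
  (void)map;  // map.stride(0) == 1 and map.stride(1) == 4
}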
diff --git a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp
index a8014867923..1f5ad70a6cf 100644
--- a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp
+++ b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp
@@ -158,19 +158,21 @@ class layout_left_padded<padding_value>::mapping {
   typename padded_stride_type::static_array_type padded_stride = {};
   extents_type exts = {};
 
-  constexpr index_type compute_offset(std::index_sequence<>) const {
+  MDSPAN_INLINE_FUNCTION constexpr index_type
+  compute_offset(std::index_sequence<>) const {
     return 0;
   }
 
   template <size_t Rank, class IndexOffset>
-  constexpr index_type compute_offset(std::index_sequence<Rank>,
-                                      IndexOffset index_offset) const {
+  MDSPAN_INLINE_FUNCTION constexpr index_type
+  compute_offset(std::index_sequence<Rank>, IndexOffset index_offset) const {
     return index_offset;
   }
 
   template <size_t... Ranks, class... IndexOffsets>
-  constexpr index_type compute_offset(std::index_sequence<Ranks...>,
-                                      IndexOffsets... index_offsets) const {
+  MDSPAN_INLINE_FUNCTION constexpr index_type
+  compute_offset(std::index_sequence<Ranks...>,
+                 IndexOffsets... index_offsets) const {
     index_type indices[] = {static_cast<index_type>(index_offsets)...};
     // self-recursive fold trick from
     // https://github.com/llvm/llvm-project/blob/96e1914aa2e6d8966acbfbe2f4d184201f1aa318/libcxx/include/mdspan/layout_left.h#L144
@@ -241,62 +243,71 @@ class layout_left_padded<padding_value>::mapping {
   /**
    * Converting constructor from `layout_left::mapping`.
    *
-   * This overload participates in overload resolution only if `is_constructible_v<extents_type, OtherExtents>` is true.
-   * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`;
-   * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)`
+   * This overload participates in overload resolution only if
+   * `is_constructible_v<extents_type, OtherExtents>` is true. If
+   * `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`,
+   * or `OtherExtents::static_extent(0)` must be `dynamic_extent`; otherwise,
+   * `OtherExtents::static_extent(0)` must be equal to the least multiple of
+   * `padding_value` greater than or equal to `extents_type::static_extent(0)`
    */
   MDSPAN_TEMPLATE_REQUIRES(
-      class _OtherExtents,
-      /* requires */ (
-        std::is_constructible_v<extents_type, _OtherExtents>
-      )
-  )
-  MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>))
+      class _OtherExtents,
+      /* requires */ (std::is_constructible_v<extents_type, _OtherExtents>))
+  MDSPAN_CONDITIONAL_EXPLICIT(
+      (!std::is_convertible_v<_OtherExtents, extents_type>))
+  MDSPAN_INLINE_FUNCTION
   constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping)
-      : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant<size_t, padding_value>{})),
-        exts(other_mapping.extents())
-  {
-    static_assert((_OtherExtents::rank() > 1) || (static_padding_stride != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent)
-                  || (static_padding_stride == _OtherExtents::static_extent(extent_to_pad_idx)));
+      : padded_stride(padded_stride_type::init_padding(
+            other_mapping,
+            std::integral_constant<size_t, padding_value>{})),
+        exts(other_mapping.extents()) {
+    static_assert(
+        (_OtherExtents::rank() > 1) ||
+        (static_padding_stride != dynamic_extent) ||
+        (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) ||
+        (static_padding_stride ==
+         _OtherExtents::static_extent(extent_to_pad_idx)));
   }
 
   /**
    * Converting constructor from `layout_stride::mapping`.
   *
-   * This overload participates in overload resolution only if `is_constructible_v<extents_type, OtherExtents>` is true
+   * This overload participates in overload resolution only if
+   * `is_constructible_v<extents_type, OtherExtents>` is true
   */
  MDSPAN_TEMPLATE_REQUIRES(
-      class _OtherExtents,
-      /* requires */ (
-        std::is_constructible_v<extents_type, _OtherExtents>
-      )
-  )
+      class _OtherExtents,
+      /* requires */ (std::is_constructible_v<extents_type, _OtherExtents>))
   MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0))
+  MDSPAN_INLINE_FUNCTION
   constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping)
-      : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant<size_t, padding_value>{})),
-        exts(other_mapping.extents())
-  {
-  }
+      : padded_stride(padded_stride_type::init_padding(
+            other_mapping,
+            std::integral_constant<size_t, padding_value>{})),
+        exts(other_mapping.extents()) {}
 
   /**
    * Converting constructor from `layout_left_padded::mapping`.
   *
-   * This overload participates in overload resolution only if `is_constructible_v<extents_type, OtherExtents>` is true.
-   * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`.
+   * This overload participates in overload resolution only if
+   * `is_constructible_v<extents_type, OtherExtents>` is true. Either
+   * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or
+   * `padding_value == OtherPaddingStride`.
   */
  MDSPAN_TEMPLATE_REQUIRES(
-      class _Mapping,
-      /* requires */ (
-        detail::is_layout_left_padded_mapping<_Mapping>::value
-        && std::is_constructible_v<extents_type, typename _Mapping::extents_type>
-      )
-  )
-  MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent)))
-  constexpr
-  mapping(const _Mapping &other_mapping)
-      : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant<size_t, padding_value>{})),
-        exts(other_mapping.extents())
-  {
+      class _Mapping,
+      /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value
+                          &&std::is_constructible_v<
+                              extents_type, typename _Mapping::extents_type>))
+  MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 &&
+                               (padding_value == dynamic_extent ||
+                                _Mapping::padding_value == dynamic_extent)))
+  MDSPAN_INLINE_FUNCTION
+  constexpr mapping(const _Mapping &other_mapping)
+      : padded_stride(padded_stride_type::init_padding(
+            other_mapping,
+            std::integral_constant<size_t, padding_value>{})),
+        exts(other_mapping.extents()) {
     static_assert(padding_value == dynamic_extent ||
                   _Mapping::padding_value == dynamic_extent ||
                   padding_value == _Mapping::padding_value);
@@ -305,42 +316,43 @@ class layout_left_padded<padding_value>::mapping {
   /**
    * Converting constructor from `layout_right_padded::mapping`.
   *
-   * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v<extents_type, OtherExtents>` is `true`.
+   * This overload participates in overload resolution only if
+   * `extents_type::rank()` is 0 or 1 and `is_constructible_v<extents_type,
+   * OtherExtents>` is `true`.
   */
  MDSPAN_TEMPLATE_REQUIRES(
-      class _Mapping,
-      /* requires */ (
-        detail::is_layout_right_padded_mapping<_Mapping>::value
-        && extents_type::rank() <= 1
-        && std::is_constructible_v<extents_type, typename _Mapping::extents_type>
-      )
-  )
-  MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<typename _Mapping::extents_type, extents_type>))
-  constexpr
-  mapping(const _Mapping &other_mapping) noexcept
-      : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))),
-        exts(other_mapping.extents())
-  {}
+      class _Mapping,
+      /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value
+                          &&extents_type::rank() <= 1 &&
+                      std::is_constructible_v<extents_type,
+                                              typename _Mapping::extents_type>))
+  MDSPAN_CONDITIONAL_EXPLICIT(
+      (!std::is_convertible_v<typename _Mapping::extents_type, extents_type>))
+  MDSPAN_INLINE_FUNCTION
+  constexpr mapping(const _Mapping &other_mapping) noexcept
+      : padded_stride(padded_stride_type::init_padding(
+            other_mapping.extents(),
+            other_mapping.extents().extent(extent_to_pad_idx))),
+        exts(other_mapping.extents()) {}
 
-  constexpr const extents_type &extents() const noexcept
-  {
+  MDSPAN_INLINE_FUNCTION constexpr const extents_type &
+  extents() const noexcept {
     return exts;
   }
 
-  constexpr std::array<index_type, extents_type::rank()>
-  strides() const noexcept
-  {
-    if constexpr ( extents_type::rank() == 0 ) {
+  MDSPAN_INLINE_FUNCTION constexpr std::array<index_type, extents_type::rank()>
+  strides() const noexcept {
+    if constexpr (extents_type::rank() == 0) {
       return {};
-    } else if constexpr ( extents_type::rank() == 1 ) {
+    } else if constexpr (extents_type::rank() == 1) {
       return {1};
     } else {
       index_type value = 1;
       std::array<index_type, extents_type::rank()> s{};
       s[extent_to_pad_idx] = value;
       value *= padded_stride.value(0);
-      for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; ++r)
-      {
+      for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1;
+           ++r) {
         s[r] = value;
         value *= exts.extent(r);
       }
@@ -349,12 +361,11 @@ class layout_left_padded<padding_value>::mapping {
     }
   }
 
-  constexpr index_type
-  required_span_size() const noexcept
-  {
-    if constexpr ( extents_type::rank() == 0 ) {
+  MDSPAN_INLINE_FUNCTION constexpr index_type
+  required_span_size() const noexcept {
+    if constexpr (extents_type::rank() == 0) {
       return 1;
-    } else if constexpr ( extents_type::rank() == 1 ) {
+    } else if constexpr (extents_type::rank() == 1) {
       return exts.extent(0);
     } else {
       index_type value = padded_stride.value(0);
@@ -375,40 +386,47 @@ class layout_left_padded<padding_value>::mapping {
   */
  MDSPAN_TEMPLATE_REQUIRES(
      class... _Indices,
-      /* requires */ (
-          sizeof...(_Indices) == extents_type::rank() &&
-          (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices<index_type, _Indices...>())
-      )
-  )
-  constexpr size_t operator()(_Indices... idxs) const noexcept
-  {
+      /* requires */ (sizeof...(_Indices) == extents_type::rank() &&
+                      (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::
+                           are_valid_indices<index_type, _Indices...>())))
+  MDSPAN_INLINE_FUNCTION constexpr size_t
+  operator()(_Indices... idxs) const noexcept {
     return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...);
   }
 
-  static constexpr bool is_always_unique() noexcept { return true; }
-  static constexpr bool is_always_exhaustive() noexcept
-  {
-    return (extents_type::rank() <= rank_type(1))
-           || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent
-               && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value());
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept {
+    return true;
+  }
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept {
+    return (extents_type::rank() <= rank_type(1)) ||
+           (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent &&
+            extents_type::static_extent(extent_to_pad_idx) ==
+                padded_stride_type::static_value());
+  }
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept {
+    return true;
   }
-  static constexpr bool is_always_strided() noexcept { return true; }
 
-  static constexpr bool is_unique() noexcept { return true; }
-  constexpr bool is_exhaustive() const noexcept
-  {
-    return (extents_type::rank() < 2)
-           || (exts.extent(extent_to_pad_idx) == padded_stride.value(0));
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept {
+    return true;
+  }
+  MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept {
+    return (extents_type::rank() < 2) ||
+           (exts.extent(extent_to_pad_idx) == padded_stride.value(0));
+  }
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept {
+    return true;
   }
-  static constexpr bool is_strided() noexcept { return true; }
 
-  constexpr index_type stride(rank_type r) const noexcept
-  {
+  MDSPAN_INLINE_FUNCTION
+  constexpr index_type stride(rank_type r) const noexcept {
     assert(r < extents_type::rank());
-    if(r == 0) return index_type(1);
+    if (r == 0)
+      return index_type(1);
 
     index_type value = padded_stride.value(0);
-    for (rank_type k = 1; k < r; k++) value *= exts.extent(k);
+    for (rank_type k = 1; k < r; k++)
+      value *= exts.extent(k);
 
     return value;
   }
@@ -416,26 +434,26 @@ class layout_left_padded<padding_value>::mapping {
   /**
    * Equality operator between `layout_left_padded`s
   *
-   * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`.
+   * This overload only participates in overload resolution if
+   * `OtherExtents::rank() == extents_type::rank()`.
   *
-   * \note There is currently a difference from p2642r2, where this function is specified as taking
-   * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible.
+   * \note There is currently a difference from p2642r2, where this function is
+   * specified as taking `layout_left_padded< padding_value >::mapping<
+   * Extents>`. However, this makes `padding_value` non-deducible.
   */
  MDSPAN_TEMPLATE_REQUIRES(
-      class _Mapping,
-      /* requires */ (
-        detail::is_layout_left_padded_mapping<_Mapping>::value
-        && (_Mapping::extents_type::rank() == extents_type::rank())
-      )
-  )
-  friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept
-  {
-    // Workaround for some compilers not short-circuiting properly with compile-time checks
-    // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping
+      class _Mapping,
+      /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value &&
+                      (_Mapping::extents_type::rank() == extents_type::rank())))
+  MDSPAN_INLINE_FUNCTION friend constexpr bool
+  operator==(const mapping &left, const _Mapping &right) noexcept {
+    // Workaround for some compilers not short-circuiting properly with
+    // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a
+    // rank 0 mapping
     bool strides_equal = true;
-    if constexpr (extents_type::rank() > rank_type(1))
-    {
-      strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx);
+    if constexpr (extents_type::rank() > rank_type(1)) {
+      strides_equal =
+          left.stride(padded_stride_idx) == right.stride(padded_stride_idx);
     }
     return (left.extents() == right.extents()) && strides_equal;
   }
@@ -444,17 +462,15 @@ class layout_left_padded<padding_value>::mapping {
   /**
    * Inequality operator between `layout_left_padded`s
   *
-   * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`.
+   * This overload only participates in overload resolution if
+   * `OtherExtents::rank() == extents_type::rank()`.
   */
  MDSPAN_TEMPLATE_REQUIRES(
-      class _Mapping,
-      /* requires */ (
-        detail::is_layout_left_padded_mapping<_Mapping>::value
-        && (_Mapping::extents_type::rank() == extents_type::rank())
-      )
-  )
-  friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept
-  {
+      class _Mapping,
+      /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value &&
+                      (_Mapping::extents_type::rank() == extents_type::rank())))
+  MDSPAN_INLINE_FUNCTION friend constexpr bool
+  operator!=(const mapping &left, const _Mapping &right) noexcept {
    return !(left == right);
  }
 #endif
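[Editor's note] The heterogeneous `_Mapping` parameter in the comparison operators above is what lets mappings with different static padding values be compared. A hypothetical sketch; the extents-plus-padding constructor form is assumed from P2642 and the namespace spelling is an assumption:

#include <cassert>
#include <experimental/mdspan>  // header name assumed

namespace illustrative {
namespace stdex = std::experimental;  // assumed namespace spelling

void padded_equality() {
  using E = stdex::extents<int, 3, 5>;
  // Static padding of 4 versus the same value supplied at run time.
  stdex::layout_left_padded<4>::mapping<E> a{E{}};
  stdex::layout_left_padded<stdex::dynamic_extent>::mapping<E> b{E{}, 4};
  // Same extents and same padded stride, so the mixed-type operator==
  // defined above compares them equal despite the different mapping types.
  assert(a == b);
}
}  // namespace illustrative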
@@ -490,25 +506,27 @@ class layout_right_padded<padding_value>::mapping {
   typename padded_stride_type::static_array_type padded_stride = {};
   extents_type exts = {};
 
-  constexpr index_type compute_offset(std::index_sequence<>) const {
+  MDSPAN_INLINE_FUNCTION constexpr index_type
+  compute_offset(std::index_sequence<>) const {
     return 0;
   }
 
   template <size_t Rank, class IndexOffset>
-  constexpr index_type compute_offset(std::index_sequence<Rank>,
-                                      IndexOffset index_offset) const {
+  MDSPAN_INLINE_FUNCTION constexpr index_type
+  compute_offset(std::index_sequence<Rank>, IndexOffset index_offset) const {
     return index_offset;
   }
 
   template <size_t... Ranks, class... IndexOffsets>
-  constexpr index_type compute_offset(std::index_sequence<Ranks...>,
-                                      IndexOffsets... index_offsets) const {
+  MDSPAN_INLINE_FUNCTION constexpr index_type
+  compute_offset(std::index_sequence<Ranks...>,
+                 IndexOffsets... index_offsets) const {
     // self-recursive fold trick from
     // https://github.com/llvm/llvm-project/blob/4d9771741d40cc9cfcccb6b033f43689d36b705a/libcxx/include/mdspan/layout_right.h#L141
     index_type res = 0;
     ((res = static_cast<index_type>(index_offsets) +
             (Ranks == extent_to_pad_idx ? padded_stride.value(0)
-                                        : exts.extent(Ranks)) *
+                                         : exts.extent(Ranks)) *
                 res),
      ...);
     return res;
@@ -577,56 +595,62 @@ class layout_right_padded<padding_value>::mapping {
   */
  MDSPAN_TEMPLATE_REQUIRES(
      class _OtherExtents,
-      /* requires */ (
-        std::is_constructible_v<extents_type, _OtherExtents>
-      )
-  )
-  MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>))
+      /* requires */ (std::is_constructible_v<extents_type, _OtherExtents>))
+  MDSPAN_CONDITIONAL_EXPLICIT(
+      (!std::is_convertible_v<_OtherExtents, extents_type>))
+  MDSPAN_INLINE_FUNCTION
   constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping)
-      : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant<size_t, padding_value>{})),
-        exts(other_mapping.extents())
-  {
-    static_assert((_OtherExtents::rank() > 1) || (padded_stride_type::static_value() != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent)
-                  || (padded_stride_type::static_value() == _OtherExtents::static_extent(extent_to_pad_idx)));
+      : padded_stride(padded_stride_type::init_padding(
+            other_mapping,
+            std::integral_constant<size_t, padding_value>{})),
+        exts(other_mapping.extents()) {
+    static_assert(
+        (_OtherExtents::rank() > 1) ||
+        (padded_stride_type::static_value() != dynamic_extent) ||
+        (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) ||
+        (padded_stride_type::static_value() ==
+         _OtherExtents::static_extent(extent_to_pad_idx)));
   }
 
   /**
    * Converting constructor from `layout_stride::mapping`.
   *
-   * This overload participates in overload resolution only if `is_constructible_v<extents_type, OtherExtents>` is true
+   * This overload participates in overload resolution only if
+   * `is_constructible_v<extents_type, OtherExtents>` is true
   */
  MDSPAN_TEMPLATE_REQUIRES(
      class _OtherExtents,
-      /* requires */ (
-        std::is_constructible_v<extents_type, _OtherExtents>
-      )
-  )
+      /* requires */ (std::is_constructible_v<extents_type, _OtherExtents>))
   MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0))
+  MDSPAN_INLINE_FUNCTION
   constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping)
-      : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant<size_t, padding_value>{})),
-        exts(other_mapping.extents())
-  {}
+      : padded_stride(padded_stride_type::init_padding(
+            other_mapping,
+            std::integral_constant<size_t, padding_value>{})),
+        exts(other_mapping.extents()) {}
 
   /**
    * Converting constructor from `layout_right_padded::mapping`.
   *
-   * This overload participates in overload resolution only if `is_constructible_v<extents_type, OtherExtents>` is true.
-   * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`.
+   * This overload participates in overload resolution only if
+   * `is_constructible_v<extents_type, OtherExtents>` is true. Either
+   * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or
+   * `padding_value == OtherPaddingStride`.
   */
  MDSPAN_TEMPLATE_REQUIRES(
      class _Mapping,
-      /* requires */ (
-        detail::is_layout_right_padded_mapping<_Mapping>::value
-        && std::is_constructible_v<extents_type, typename _Mapping::extents_type>
-      )
-  )
+      /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value
+                          &&std::is_constructible_v<
+                              extents_type, typename _Mapping::extents_type>))
   MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 &&
                                (padding_value == dynamic_extent ||
                                 _Mapping::padding_value == dynamic_extent)))
+  MDSPAN_INLINE_FUNCTION
   constexpr mapping(const _Mapping &other_mapping)
-      : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant<size_t, padding_value>{})),
-        exts(other_mapping.extents())
-  {
+      : padded_stride(padded_stride_type::init_padding(
+            other_mapping,
+            std::integral_constant<size_t, padding_value>{})),
+        exts(other_mapping.extents()) {
     static_assert(padding_value == dynamic_extent ||
                   _Mapping::padding_value == dynamic_extent ||
                   padding_value == _Mapping::padding_value);
@@ -635,41 +659,42 @@ class layout_right_padded<padding_value>::mapping {
   /**
    * Converting constructor from `layout_left_padded::mapping`.
   *
-   * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v<extents_type, OtherExtents>` is `true`.
+   * This overload participates in overload resolution only if
+   * `extents_type::rank()` is 0 or 1 and `is_constructible_v<extents_type,
+   * OtherExtents>` is `true`.
   */
  MDSPAN_TEMPLATE_REQUIRES(
      class _Mapping,
-      /* requires */ (
-        detail::is_layout_left_padded_mapping<_Mapping>::value
-        && extents_type::rank() <= 1
-        && std::is_constructible_v<extents_type, typename _Mapping::extents_type>
-      )
-  )
-  MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<typename _Mapping::extents_type, extents_type>))
+      /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value
+                          &&extents_type::rank() <= 1 &&
+                      std::is_constructible_v<extents_type,
+                                              typename _Mapping::extents_type>))
+  MDSPAN_CONDITIONAL_EXPLICIT(
+      (!std::is_convertible_v<typename _Mapping::extents_type, extents_type>))
+  MDSPAN_INLINE_FUNCTION
   constexpr mapping(const _Mapping &other_mapping) noexcept
-      : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))),
-        exts(other_mapping.extents())
-  {}
+      : padded_stride(padded_stride_type::init_padding(
+            other_mapping.extents(),
+            other_mapping.extents().extent(extent_to_pad_idx))),
+        exts(other_mapping.extents()) {}
 
-  constexpr const extents_type &extents() const noexcept
-  {
+  MDSPAN_INLINE_FUNCTION constexpr const extents_type &
+  extents() const noexcept {
     return exts;
   }
 
-  constexpr std::array<index_type, extents_type::rank()>
-  strides() const noexcept
-  {
-    if constexpr ( extents_type::rank() == 0 ) {
+  MDSPAN_INLINE_FUNCTION constexpr std::array<index_type, extents_type::rank()>
+  strides() const noexcept {
+    if constexpr (extents_type::rank() == 0) {
      return {};
-    } else if constexpr ( extents_type::rank() == 1 ) {
+    } else if constexpr (extents_type::rank() == 1) {
      return {1};
    } else {
      index_type value = 1;
      std::array<index_type, extents_type::rank()> s{};
      s[extent_to_pad_idx] = value;
      value *= padded_stride.value(0);
-      for (rank_type r = extent_to_pad_idx - 1; r > 0; --r)
-      {
+      for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) {
        s[r] = value;
        value *= exts.extent(r);
      }
@@ -678,17 +703,15 @@ class layout_right_padded<padding_value>::mapping {
    }
  }
 
-  constexpr index_type
-  required_span_size() const noexcept
-  {
-    if constexpr ( extents_type::rank() == 0 ) {
+  MDSPAN_INLINE_FUNCTION constexpr index_type
+  required_span_size() const noexcept {
+    if constexpr (extents_type::rank() == 0) {
      return 1;
-    } else if constexpr ( extents_type::rank() == 1 ) {
+    } else if constexpr (extents_type::rank() == 1) {
      return exts.extent(0);
    } else {
      index_type value = 1;
-      for (rank_type r = 0; r < extent_to_pad_idx; ++r)
-      {
+      for (rank_type r = 0; r < extent_to_pad_idx; ++r) {
        value *= exts.extent(r);
      }
      return value * padded_stride.value(0);
@@ -705,40 +728,47 @@ class layout_right_padded<padding_value>::mapping {
   */
  MDSPAN_TEMPLATE_REQUIRES(
      class... _Indices,
-      /* requires */ (
-          sizeof...(_Indices) == extents_type::rank() &&
-          (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices<index_type, _Indices...>())
-      )
-  )
-  constexpr size_t operator()(_Indices... idxs) const noexcept
-  {
+      /* requires */ (sizeof...(_Indices) == extents_type::rank() &&
+                      (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::
+                           are_valid_indices<index_type, _Indices...>())))
+  MDSPAN_INLINE_FUNCTION constexpr size_t
+  operator()(_Indices... idxs) const noexcept {
    return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...);
  }
 
-  static constexpr bool is_always_unique() noexcept { return true; }
-  static constexpr bool is_always_exhaustive() noexcept
-  {
-    return (extents_type::rank() <= rank_type(1))
-           || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent
-               && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value());
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept {
+    return true;
+  }
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept {
+    return (extents_type::rank() <= rank_type(1)) ||
+           (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent &&
+            extents_type::static_extent(extent_to_pad_idx) ==
+                padded_stride_type::static_value());
+  }
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept {
+    return true;
  }
-  static constexpr bool is_always_strided() noexcept { return true; }
 
-  static constexpr bool is_unique() noexcept { return true; }
-  constexpr bool is_exhaustive() const noexcept
-  {
-    return (extents_type::rank() < 2)
-           || (exts.extent(extent_to_pad_idx) == padded_stride.value(0));
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept {
+    return true;
+  }
+  MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept {
+    return (extents_type::rank() < 2) ||
+           (exts.extent(extent_to_pad_idx) == padded_stride.value(0));
+  }
+  MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept {
+    return true;
  }
-  static constexpr bool is_strided() noexcept { return true; }
 
-  constexpr index_type stride(rank_type r) const noexcept
-  {
+  MDSPAN_INLINE_FUNCTION constexpr index_type
+  stride(rank_type r) const noexcept {
    assert(r < extents_type::rank());
-    if(r == extents_type::rank() - 1) return index_type(1);
+    if (r == extents_type::rank() - 1)
+      return index_type(1);
 
    index_type value = padded_stride.value(0);
-    for (rank_type k = extents_type::rank() - 2; k > r; k--) value *= exts.extent(k);
+    for (rank_type k = extents_type::rank() - 2; k > r; k--)
+      value *= exts.extent(k);
 
    return value;
  }
@@ -746,26 +776,26 @@ class layout_right_padded<padding_value>::mapping {
   /**
    * Equality operator between `layout_right_padded`s
   *
-   * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`.
+   * This overload only participates in overload resolution if
+   * `OtherExtents::rank() == extents_type::rank()`.
   *
-   * \note There is currently a difference from p2642r2, where this function is specified as taking
-   * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible.
+   * \note There is currently a difference from p2642r2, where this function is
+   * specified as taking `layout_right_padded< padding_value >::mapping<
+   * Extents>`. However, this makes `padding_value` non-deducible.
   */
  MDSPAN_TEMPLATE_REQUIRES(
      class _Mapping,
-      /* requires */ (
-        detail::is_layout_right_padded_mapping<_Mapping>::value
-        && (_Mapping::extents_type::rank() == extents_type::rank())
-      )
-  )
-  friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept
-  {
-    // Workaround for some compilers not short-circuiting properly with compile-time checks
-    // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping
+      /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value &&
+                      (_Mapping::extents_type::rank() == extents_type::rank())))
+  MDSPAN_INLINE_FUNCTION friend constexpr bool
+  operator==(const mapping &left, const _Mapping &right) noexcept {
+    // Workaround for some compilers not short-circuiting properly with
+    // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a
+    // rank 0 mapping
    bool strides_equal = true;
-    if constexpr (extents_type::rank() > rank_type(1))
-    {
-      strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx);
+    if constexpr (extents_type::rank() > rank_type(1)) {
+      strides_equal =
+          left.stride(padded_stride_idx) == right.stride(padded_stride_idx);
    }
    return (left.extents() == right.extents()) && strides_equal;
  }
@@ -774,17 +804,15 @@ class layout_right_padded<padding_value>::mapping {
   /**
    * Inequality operator between `layout_right_padded`s
   *
-   * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`.
+   * This overload only participates in overload resolution if
+   * `OtherExtents::rank() == extents_type::rank()`.
   */
  MDSPAN_TEMPLATE_REQUIRES(
      class _Mapping,
-      /* requires */ (
-        detail::is_layout_right_padded_mapping<_Mapping>::value
-        && (_Mapping::extents_type::rank() == extents_type::rank())
-      )
-  )
-  friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept
-  {
+      /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value &&
+                      (_Mapping::extents_type::rank() == extents_type::rank())))
+  MDSPAN_INLINE_FUNCTION friend constexpr bool
+  operator!=(const mapping &left, const _Mapping &right) noexcept {
    return !(left == right);
  }
 #endif
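[Editor's note] Taken together, `strides()` and `required_span_size()` above implement ordinary padded-stride arithmetic. A worked example under the same namespace assumption as the earlier sketches; the constexpr extents constructor is assumed from P2642.

#include <experimental/mdspan>  // header name assumed

namespace illustrative {
namespace stdex = std::experimental;  // assumed namespace spelling

// layout_left_padded<4> over 3 x 5 extents rounds the leftmost extent up to
// the next multiple of 4, so the padded stride is 4:
//   strides()            == {1, 4}
//   required_span_size() == 4 * 5 == 20   (versus 15 for plain layout_left)
using extents_t = stdex::extents<int, 3, 5>;
using mapping_t = stdex::layout_left_padded<4>::mapping<extents_t>;
static_assert(mapping_t{extents_t{}}.stride(1) == 4);
static_assert(mapping_t{extents_t{}}.required_span_size() == 20);
}  // namespace illustrative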