From 46354d25d4befe2f9f2fc4b46ea30e3121b3d64e Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 18 Mar 2024 10:32:55 -0400 Subject: [PATCH 001/103] Use builtin for atomic_fetch in the HIP backend Co-authored-by: Damien L-G --- .../desul/include/desul/atomics/Adapt_HIP.hpp | 77 ++++++++ .../include/desul/atomics/Fetch_Op_HIP.hpp | 165 +++++++++--------- 2 files changed, 163 insertions(+), 79 deletions(-) create mode 100644 tpls/desul/include/desul/atomics/Adapt_HIP.hpp diff --git a/tpls/desul/include/desul/atomics/Adapt_HIP.hpp b/tpls/desul/include/desul/atomics/Adapt_HIP.hpp new file mode 100644 index 00000000000..0eab27fe989 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Adapt_HIP.hpp @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_ADAPT_HIP_HPP_ +#define DESUL_ATOMICS_ADAPT_HIP_HPP_ + +#include + +namespace desul { +namespace Impl { + +// FIXME same code as GCCMemoryOrder +template +struct HIPMemoryOrder; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELAXED; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQUIRE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELEASE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQ_REL; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_SEQ_CST; +}; + +// __HIP_MEMORY_SCOPE_SYSTEM +// __HIP_MEMORY_SCOPE_AGENT +// __HIP_MEMORY_SCOPE_WORKGROUP +// __HIP_MEMORY_SCOPE_WAVEFRONT +// __HIP_MEMORY_SCOPE_SINGLETHREAD +template +struct HIPMemoryScope; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_WORKGROUP; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = 
__HIP_MEMORY_SCOPE_AGENT; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp index e9c749809de..920722084d1 100644 --- a/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp @@ -9,99 +9,106 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_FECH_OP_HIP_HPP_ #define DESUL_ATOMICS_FECH_OP_HIP_HPP_ +#include + namespace desul { namespace Impl { -// clang-format off -inline __device__ int device_atomic_fetch_add( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_add( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_add(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ float device_atomic_fetch_add( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ double device_atomic_fetch_add( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } - -inline __device__ int device_atomic_fetch_sub( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_sub( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_sub(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return 
atomicAdd(ptr, -val); } -inline __device__ float device_atomic_fetch_sub( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } -inline __device__ double device_atomic_fetch_sub( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } - -inline __device__ int device_atomic_fetch_min( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_min( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_min(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } - -inline __device__ int device_atomic_fetch_max( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_max( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_max(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } - -inline __device__ int device_atomic_fetch_and( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_and( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_and(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } - -inline __device__ int device_atomic_fetch_or ( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned int device_atomic_fetch_or ( unsigned int* ptr, 
unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_or (unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, T) \ + template \ + __device__ inline T device_atomic_fetch_##OP( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_##OP(ptr, \ + val, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_xor( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_xor( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_xor(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(OP) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, long long) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned long long) + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, float) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, double) + +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(add) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(min) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(max) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(and) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(or) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(xor) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(add) +// atomic min/max gives the wrong results (tested with ROCm 6.0 on Frontier) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(min) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(max) + +#undef 
DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(T) \ + template \ + __device__ inline T device_atomic_fetch_sub( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -val, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_inc( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_inc( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_inc(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1ull); } +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(float) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(double) + +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_SUB + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC(T) \ + template \ + __device__ inline T device_atomic_fetch_inc( T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + 1, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } \ + template \ + __device__ inline T device_atomic_fetch_dec( T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -1, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_dec( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_dec( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_dec(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { 
return atomicAdd(ptr, -1 ); } +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned long long) -inline __device__ unsigned int device_atomic_fetch_inc_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicInc(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_dec_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr, val); } -// clang-format on +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, TYPE) \ +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MEMORY_SCOPE, MEMORY_SCOPE_STRING_LITERAL) \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeDevice) { \ - __threadfence(); \ - TYPE return_val = \ - device_atomic_fetch_##OP(ptr, val, MemoryOrderRelaxed(), MemoryScopeDevice()); \ - __threadfence(); \ - return return_val; \ + __device__ inline unsigned int device_atomic_fetch_inc_mod(unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_inc32( \ + ptr, val, HIPMemoryOrder::value, MEMORY_SCOPE_STRING_LITERAL); \ } \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeCore) { \ - return device_atomic_fetch_##OP(ptr, val, MemoryOrder(), MemoryScopeDevice()); \ + __device__ inline unsigned int device_atomic_fetch_dec_mod(unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_dec32( \ + ptr, val, HIPMemoryOrder::value, MEMORY_SCOPE_STRING_LITERAL); \ } -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned long long) - -#define 
DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, float) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, double) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(min) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(max) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(and) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(or) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(xor) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(sub) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(sub) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(inc) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(dec) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(inc_mod, unsigned int) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(dec_mod, unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeCore, "workgroup") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeDevice, "agent") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeNode, "") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeSystem, "") -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD } // namespace Impl } // namespace desul From a2af4e0d4137fab559a6ea1698af0050a0cbdf03 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 11 Apr 2024 17:41:27 -0400 Subject: [PATCH 002/103] Deprecate trailing Proxy template argument in Kokkos::Array --- core/src/Kokkos_Array.hpp | 17 ++++++++++++++--- core/unit_test/TestArrayOps.hpp | 2 ++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index ba1626bb72e..461b98f6a72 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -80,7 +80,11 @@ struct ArrayBoundsCheck { /**\brief Derived from the C++17 
'std::array'. * Dropping the iterator interface. */ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template +#else +template +#endif struct Array { public: /** @@ -131,8 +135,13 @@ struct Array { } }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template struct Array { +#else +template +struct Array { +#endif public: using reference = T&; using const_reference = std::add_const_t&; @@ -178,14 +187,15 @@ struct Array { // Array & operator = ( Array && ) = default ; }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template <> -struct Array { +struct KOKKOS_DEPRECATED Array { struct contiguous {}; struct strided {}; }; template -struct Array::contiguous> { +struct KOKKOS_DEPRECATED Array::contiguous> { private: T* m_elem; size_t m_size; @@ -253,7 +263,7 @@ struct Array::contiguous> { }; template -struct Array::strided> { +struct KOKKOS_DEPRECATED Array::strided> { private: T* m_elem; size_t m_size; @@ -320,6 +330,7 @@ struct Array::strided> { size_type arg_stride) : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +#endif template Array(T, Us...)->Array; diff --git a/core/unit_test/TestArrayOps.hpp b/core/unit_test/TestArrayOps.hpp index 06528572714..387589fbe88 100644 --- a/core/unit_test/TestArrayOps.hpp +++ b/core/unit_test/TestArrayOps.hpp @@ -111,6 +111,7 @@ TEST(TEST_CATEGORY, array_zero_data_nullptr) { ASSERT_EQ(ce.data(), nullptr); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 TEST(TEST_CATEGORY, array_contiguous_capacity) { using A = Kokkos::Array::contiguous>; @@ -389,5 +390,6 @@ TEST(TEST_CATEGORY, array_strided_assignment) { ASSERT_EQ(e.max_size(), std::size(ee) / eStride); ASSERT_EQ(e[0], ee[0]); } +#endif } // namespace From f2d37801dc88ac476cbc47054e726c6c433de2cc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 15 Apr 2024 17:26:15 -0400 Subject: [PATCH 003/103] Remove unnecessary header include Reported in https://github.com/kokkos/kokkos/pull/6934#pullrequestreview-2001860702 Co-authored-by: Nevin Liber --- core/src/Kokkos_Array.hpp | 1 - 1 file changed, 1 
deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 461b98f6a72..3d71d09fde1 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -29,7 +29,6 @@ #include #include #include -#include #include namespace Kokkos { From a8115e5df7d2fd68c215758abaf73bbe8598e0d2 Mon Sep 17 00:00:00 2001 From: yasahi-hpc <57478230+yasahi-hpc@users.noreply.github.com> Date: Tue, 16 Apr 2024 03:42:26 +0200 Subject: [PATCH 004/103] Adding converting constructor in Kokkos::RandomAccessIterator (#6929) * Adding converting constructor in Kokkos::RandomAccessIterator * fix constructible tests for Kokkos::RandomAccessIterator * fix converting constructor in Kokkos::RandomAccessIterator * Add comments to explain friend class of RandomAccessIterator is needed for converting constructor * Introduce KOKKOS_IMPL_CONDITIONAL_EXPLICIT macro from #6830 * Adding a conditional explicit in converting constructor of RandomAccessIterator * Rename ViewType to OtherViewType in converting constructor for readability * Replace tests with static_assert if they rely on compile time behaviour only * fix a condition for conditional explicit * Revert "Introduce KOKKOS_IMPL_CONDITIONAL_EXPLICIT macro from #6830" This reverts commit ee42c6d62e9b8373bd3494c79c97a8845593b325. * On second thought `KOKKOS_IMPL_CONDITIONAL_EXPLICIT` is not such a good idea because it let user write code that would compile with C++17 but not with later standards. 
--------- Co-authored-by: Yuuichi Asahi --- .../impl/Kokkos_RandomAccessIterator.hpp | 28 ++++++++++++++ .../unit_tests/TestRandomAccessIterator.cpp | 38 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 5c9854b87d7..7bcc16a9b55 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -59,6 +59,30 @@ class RandomAccessIterator< ::Kokkos::View > { ptrdiff_t current_index) : m_view(view), m_current_index(current_index) {} +#ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond + template + requires(std::is_constructible_v) KOKKOS_FUNCTION + explicit(!std::is_convertible_v) + RandomAccessIterator(const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#else + template < + class OtherViewType, + std::enable_if_t && + !std::is_convertible_v, + int> = 0> + KOKKOS_FUNCTION explicit RandomAccessIterator( + const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} + + template , + int> = 0> + KOKKOS_FUNCTION RandomAccessIterator( + const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#endif + KOKKOS_FUNCTION iterator_type& operator++() { ++m_current_index; @@ -155,6 +179,10 @@ class RandomAccessIterator< ::Kokkos::View > { private: view_type m_view; ptrdiff_t m_current_index = 0; + + // Needed for the converting constructor accepting another iterator + template + friend class RandomAccessIterator; }; } // namespace Impl diff --git a/algorithms/unit_tests/TestRandomAccessIterator.cpp b/algorithms/unit_tests/TestRandomAccessIterator.cpp index 282d85548c5..7d484136b6d 100644 --- a/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ 
-46,6 +46,44 @@ TEST_F(random_access_iterator_test, constructor) { EXPECT_TRUE(true); } +TEST_F(random_access_iterator_test, constructiblity) { + auto first_d = KE::begin(m_dynamic_view); + auto cfirst_d = KE::cbegin(m_dynamic_view); + + static_assert(std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_d) tmp_cfirst_d(first_d); + + auto first_s = KE::begin(m_static_view); + auto cfirst_s = KE::cbegin(m_static_view); + + static_assert(std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_s) tmp_cfirst_s(first_s); + + auto first_st = KE::begin(m_strided_view); + auto cfirst_st = KE::cbegin(m_strided_view); + + static_assert( + std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_st) tmp_cfirst_st(first_st); + + // [FIXME] Better to have tests for the explicit specifier with an expression. + // As soon as View converting constructors are re-implemented with a + // conditional explicit, we may add those tests. + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + EXPECT_TRUE(true); +} + template void test_random_access_it_verify(IteratorType it, ValueType gold_value) { using view_t = Kokkos::View; From f94e8d34de523813f5e23e5622615566c80de8fc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 16 Apr 2024 14:46:49 -0400 Subject: [PATCH 005/103] Prefer standard C++ feature testing to guard the C++20 requires expression Temporary fix for our nightly builds so we can make decision on minimum CXX20 compiler requirements when we see fit. 
--- .../impl/Kokkos_RandomAccessIterator.hpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 7bcc16a9b55..ba0cdc91eea 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -59,7 +59,15 @@ class RandomAccessIterator< ::Kokkos::View > { ptrdiff_t current_index) : m_view(view), m_current_index(current_index) {} -#ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond +// FIXME The C++20 requires expression is not supported with Clang 9 and GCC 9 +// The following guard is insufficient until we increase our minimum CXX20 +// compiler requirements. +// #ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond +// We replace the Kokkos guards with standard C++ feature testing in the +// meantime. +#if (defined(__cpp_concepts) && (__cpp_concepts >= 201907L)) && \ + (defined(__cpp_conditional_explicit) && \ + (__cpp_conditional_explicit >= 201806L)) template requires(std::is_constructible_v) KOKKOS_FUNCTION explicit(!std::is_convertible_v) From c9e21ce2ab8e03710494ed53a12c255df56fd3b1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 16 Apr 2024 12:21:57 -0400 Subject: [PATCH 006/103] Add `kokkos_swap(Array)` specialization Specializing the swap algorithm for Kokkos arrays was initially proposed in #6697 but we dropped it to focus on the Kokkos swap ADL ordeal. Somehow we overlooked a stray header include in the Kokkos::Array header file. This PR reintroduces a `Kokkos::kokkos_swap(Kokkos::Array)` specialization, following closely what the standard library does for `std::swap(std::array)`.
--- core/src/Kokkos_Array.hpp | 15 +++++++++++++++ core/unit_test/TestArray.cpp | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 461b98f6a72..7fd81030ecd 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -133,6 +133,17 @@ struct Array { KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { return &m_internal_implementation_private_member_data[0]; } + + private: + template + friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< + Impl::is_swappable::value> + kokkos_swap(Array& a, + Array& b) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } + } }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -185,6 +196,10 @@ struct Array { // for default move constructor and move assignment operator. // Array( Array && ) = default ; // Array & operator = ( Array && ) = default ; + + private: + friend KOKKOS_INLINE_FUNCTION constexpr void kokkos_swap( + Array&, Array&) noexcept(Impl::is_nothrow_swappable_v) {} }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index e138a64d6db..e691d83ebe2 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -120,4 +120,37 @@ static_assert(test_array_aggregate_initialization()); } } +// User-defined type providing a specialization of kokkos_swap +struct MyInt { + int i; + + private: + friend constexpr void kokkos_swap(MyInt& lhs, MyInt& rhs) noexcept { + lhs.i = 255; + rhs.i = 127; + } +}; + +constexpr bool test_array_specialization_kokkos_swap() { + Kokkos::Array a{MyInt{1}, MyInt{2}}; + Kokkos::Array b{MyInt{11}, MyInt{22}}; + + // sanity check + if (a[0].i != 1 || a[1].i != 2 || b[0].i != 11 || b[1].i != 22) { + return false; + } + + using Kokkos::kokkos_swap; + kokkos_swap(a, b); + + // check that the user-defined kokkos_swap(MyInt) overload was called + if (a[0].i
!= 255 || a[1].i != 255 || b[0].i != 127 || b[1].i != 127) { + return false; + } + + return true; +} + +static_assert(test_array_specialization_kokkos_swap()); + } // namespace From 730d8d828f9d43b0cf3a1010b06c0008a7be128a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 17 Apr 2024 08:58:22 -0400 Subject: [PATCH 007/103] Deprecate specialization of Kokkos::pair for a single element This specialization is not documented, does not follow the standard library, it is not tested and has no known usage in Trilinos. `Kokkos::pair`, as we generally describe it, was intended as a drop-in replacement for `std::pair`. Hence, obscure departures from the standard implementation do not look like a good idea. This PR suggests deprecating that `T2=void` specialization for degenerate pairs that only hold one element. --- core/src/Kokkos_Pair.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 9be8d8d7aa1..d1bd11f7162 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -413,12 +413,13 @@ KOKKOS_FORCEINLINE_FUNCTION pair tie(T1& x, T2& y) { return (pair(x, y)); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 // // Specialization of Kokkos::pair for a \c void second argument. This // is not actually a "pair"; it only contains one element, the first.
// template -struct pair { +struct KOKKOS_DEPRECATED pair { using first_type = T1; using second_type = void; @@ -483,6 +484,7 @@ KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } +#endif namespace Impl { template From d914fe316ba64c6755abfd8e68cd7d1b872e04f7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 17 Apr 2024 10:05:24 -0400 Subject: [PATCH 008/103] Fix deprecated warning from `Kokkos::Array` specialization (#6945) * Fix deprecated warning from Kokkos::Array specialization The warnings come from the template arguments in deprecated specialization `Kokkos::Array<>::{contiguous,strided}` which refer to `Kokkos::Array<>` that is marked as deprecated. Minimal reproducer [here](https://godbolt.org/z/s18Txa5P6). GCC9 eats it but GCC10 onwards raise a warning. I propose the easy way out, that is we drop the `[[deprecated]]` attribute on `Kokkos::Array<>`. Let me know if you have a better idea. Sample warning from ArborX nightlies for completeness: ``` In file included from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/KokkosExp_MDRangePolicy.hpp:29, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Tuners.hpp:28, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/impl/Kokkos_Tools_Generic.hpp:26, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Parallel.hpp:34, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_MemoryPool.hpp:26, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_TaskScheduler.hpp:34, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Serial/Kokkos_Serial.hpp:37, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp:21, from /var/jenkins/workspace/ArborX_nightly/build-kokkos/KokkosCore_Config_DeclareBackend.hpp:22, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Core.hpp:45, from 
/var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/impl/Kokkos_Core.cpp:21: /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:197:66: warning: 'Array' is deprecated [-Wdeprecated-declarations] 197 | struct KOKKOS_DEPRECATED Array::contiguous> { | ^~~~~~~~~~ /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:191:26: note: declared here 191 | struct KOKKOS_DEPRECATED Array { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:265:66: warning: 'Array' is deprecated [-Wdeprecated-declarations] 265 | struct KOKKOS_DEPRECATED Array::strided> { | ^~~~~~~ /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:191:26: note: declared here 191 | struct KOKKOS_DEPRECATED Array { | | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``` * Revert "Fix deprecated warning from Kokkos::Array specialization" This reverts commit 38db1cab74df5fc547e779d6b9e3e65ebcb89a14. * Let Array<>::{contiguous,strided} be aliases to Impl:: tag classes Better approach to suppress the GCC deprecation warning suggested by Thomas on Slack. 
Co-Authored-By: Thomas Padioleau --------- Co-authored-by: Thomas Padioleau --- core/src/Kokkos_Array.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 3d71d09fde1..fed18d73fef 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -187,14 +187,20 @@ struct Array { }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +struct KokkosArrayContiguous {}; +struct KokkosArrayStrided {}; +} // namespace Impl + template <> struct KOKKOS_DEPRECATED Array { - struct contiguous {}; - struct strided {}; + using contiguous = Impl::KokkosArrayContiguous; + using strided = Impl::KokkosArrayStrided; }; template -struct KOKKOS_DEPRECATED Array::contiguous> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; @@ -262,7 +268,8 @@ struct KOKKOS_DEPRECATED Array::contiguous> { }; template -struct KOKKOS_DEPRECATED Array::strided> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; From 69c527a4245f495ae7d03c2bf4fcd7dd4364d0a7 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 17 Apr 2024 13:25:04 -0400 Subject: [PATCH 009/103] [ci skip] Enable deprecated code and deprecated warnings in nightly CI --- .jenkins_nightly | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.jenkins_nightly b/.jenkins_nightly index a8facd365c2..b723f12c0fc 100644 --- a/.jenkins_nightly +++ b/.jenkins_nightly @@ -95,7 +95,8 @@ pipeline { -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_SERIAL=ON \ .. 
&& \ make -j8 && ctest --verbose @@ -123,7 +124,7 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_HIP=ON \ From e7b486ff614abb6454e1172098888f8de15f7b65 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 17 Apr 2024 18:10:26 -0400 Subject: [PATCH 010/103] Serial: Use the provided execution space instance in TeamPolicy --- core/src/Serial/Kokkos_Serial_Parallel_Team.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index f34a7daaca0..a25b51496ef 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -37,6 +37,8 @@ class TeamPolicyInternal int m_league_size; int m_chunk_size; + Kokkos::Serial m_space; + public: //! Tag this class as a kokkos execution policy using execution_policy = TeamPolicyInternal; @@ -46,10 +48,7 @@ class TeamPolicyInternal //! Execution space of this execution policy: using execution_space = Kokkos::Serial; - const typename traits::execution_space& space() const { - static typename traits::execution_space m_space; - return m_space; - } + const typename traits::execution_space& space() const { return m_space; } template friend class TeamPolicyInternal; @@ -116,12 +115,13 @@ class TeamPolicyInternal return (level == 0 ? 
1024 * 32 : 20 * 1024 * 1024); } /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space&, int league_size_request, + TeamPolicyInternal(const execution_space& space, int league_size_request, int team_size_request, int /* vector_length_request */ = 1) : m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, m_league_size(league_size_request), - m_chunk_size(32) { + m_chunk_size(32), + m_space(space) { if (team_size_request > 1) Kokkos::abort("Kokkos::abort: Requested Team Size is too large!"); } From 0859ab0af9b44315832cd27353bd3acf188853a3 Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Wed, 17 Apr 2024 17:31:45 -0500 Subject: [PATCH 011/103] Fixed the link for P6601 (Threads backend change) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7b8af7695c..c70ee5505f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,7 @@ * Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) #### Threads: -* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6446) +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) #### OpenMP: * Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) From 34d0db2f41dc11c9d30c3ff3449cddfc366c7e3d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 18 Apr 2024 07:54:59 -0400 Subject: [PATCH 012/103] Add test --- core/unit_test/TestExecSpacePartitioning.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/core/unit_test/TestExecSpacePartitioning.hpp b/core/unit_test/TestExecSpacePartitioning.hpp index 65314d6be7c..8703cc3b273 100644 --- a/core/unit_test/TestExecSpacePartitioning.hpp +++ b/core/unit_test/TestExecSpacePartitioning.hpp @@ -28,6 +28,17 @@ struct SumFunctor { void operator()(int i, int& 
lsum) const { lsum += i; } }; +template +void check_space_member_for_policies(const ExecSpace& exec) { + Kokkos::RangePolicy range_policy(exec, 0, 1); + ASSERT_EQ(range_policy.space(), exec); + Kokkos::MDRangePolicy> mdrange_policy(exec, {0, 0}, + {1, 1}); + ASSERT_EQ(mdrange_policy.space(), exec); + Kokkos::TeamPolicy team_policy(exec, 1, 1); + ASSERT_EQ(team_policy.space(), exec); +} + template void check_distinctive([[maybe_unused]] ExecSpace exec1, [[maybe_unused]] ExecSpace exec2) { @@ -89,6 +100,9 @@ void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { void test_partitioning(std::vector& instances) { check_distinctive(instances[0], instances[1]); + check_space_member_for_policies(instances[0]); + check_space_member_for_policies(instances[1]); + int sum1, sum2; int N = 3910; run_threaded_test( From 44fde213fb9515adfeb6645fbdbca5a4deeca633 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 18 Apr 2024 10:34:25 -0400 Subject: [PATCH 013/103] Use Kokkos::AUTO for OpenMPTarget --- core/unit_test/TestExecSpacePartitioning.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestExecSpacePartitioning.hpp b/core/unit_test/TestExecSpacePartitioning.hpp index 8703cc3b273..f8b570ab64d 100644 --- a/core/unit_test/TestExecSpacePartitioning.hpp +++ b/core/unit_test/TestExecSpacePartitioning.hpp @@ -35,7 +35,7 @@ void check_space_member_for_policies(const ExecSpace& exec) { Kokkos::MDRangePolicy> mdrange_policy(exec, {0, 0}, {1, 1}); ASSERT_EQ(mdrange_policy.space(), exec); - Kokkos::TeamPolicy team_policy(exec, 1, 1); + Kokkos::TeamPolicy team_policy(exec, 1, Kokkos::AUTO); ASSERT_EQ(team_policy.space(), exec); } From 8706b68d5bcb66473f180e131696e3d520bd34a7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 Apr 2024 13:56:19 -0500 Subject: [PATCH 014/103] kokkos_swap(Array) member friend should not be templated on some other type U Co-Authored-By: Maarten Arnst --- core/src/Kokkos_Array.hpp | 1 - 1 file changed, 1 
deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 7fd81030ecd..d1132bdbb4e 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -135,7 +135,6 @@ struct Array { } private: - template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> kokkos_swap(Array& a, From 86f5988b3128cd751da53d0b0c1af87d4ff7324a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 Apr 2024 13:57:54 -0500 Subject: [PATCH 015/103] Fix noexcept specification for kokkos_swap on zero-sized arrays Co-authored-by: Nevin Liber --- core/src/Kokkos_Array.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index d1132bdbb4e..09681c18842 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -198,7 +198,7 @@ struct Array { private: friend KOKKOS_INLINE_FUNCTION constexpr void kokkos_swap( - Array&, Array&) noexcept(Impl::is_nothrow_swappable_v) {} + Array&, Array&) noexcept {} }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 From 205fd156d990138dd6b6b400fb44d4aa9b196aa0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 22 Apr 2024 15:07:03 +0000 Subject: [PATCH 016/103] Replace deprecated sycl::device_ptr/sycl::host_ptr --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 11 ++++-- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 34 ++++++++--------- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 13 ++++--- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 13 ++++--- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 20 +++++----- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 37 ++++++++++--------- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 23 ++++++------ core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 4 +- 9 files changed, 84 insertions(+), 73 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 4a1c910c73d..d2112e3e4f2 100644 --- 
a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,7 +166,7 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -sycl::device_ptr SYCLInternal::resize_team_scratch_space( +sycl::ext::intel::device_ptr SYCLInternal::resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race @@ -251,7 +251,8 @@ void SYCLInternal::finalize() { m_queue.reset(); } -sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { +sycl::ext::intel::device_ptr SYCLInternal::scratch_space( + const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); @@ -271,7 +272,8 @@ sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { return m_scratchSpace; } -sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { +sycl::ext::intel::host_ptr SYCLInternal::scratch_host( + const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); @@ -291,7 +293,8 @@ sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { return m_scratchHost; } -sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { +sycl::ext::intel::device_ptr SYCLInternal::scratch_flags( + const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index ab7e8ce71e0..0666e1bd626 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp 
@@ -43,13 +43,12 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - sycl::device_ptr scratch_space(const std::size_t size); - sycl::device_ptr scratch_flags(const std::size_t size); - sycl::host_ptr scratch_host(const std::size_t size); + sycl::ext::intel::device_ptr scratch_space(const std::size_t size); + sycl::ext::intel::device_ptr scratch_flags(const std::size_t size); + sycl::ext::intel::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, - std::int64_t bytes, - bool force_shrink = false); + sycl::ext::intel::device_ptr resize_team_scratch_space( + int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); uint32_t impl_get_instance_id() const; @@ -59,21 +58,22 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - sycl::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - sycl::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - sycl::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + sycl::ext::intel::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + sycl::ext::intel::host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + sycl::ext::intel::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space - static constexpr int m_n_team_scratch = 10; - mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable sycl::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; - mutable int m_current_team_scratch = 0; - mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; + static constexpr int m_n_team_scratch = 10; + mutable 
int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; + mutable sycl::ext::intel::device_ptr + m_team_scratch_ptr[m_n_team_scratch] = {}; + mutable int m_current_team_scratch = 0; + mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index ecb4a863da2..b1d32172d82 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -44,7 +44,7 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl::device_ptr m_global_scratch_ptr; + sycl::ext::intel::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; // Only let one ParallelFor instance at a time use the team scratch memory. // The constructor acquires the mutex which is released in the destructor. @@ -72,7 +72,8 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::device_ptr const global_scratch_ptr = m_global_scratch_ptr; + sycl::ext::intel::device_ptr const global_scratch_ptr = + m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -161,10 +162,10 @@ class Kokkos::Impl::ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. 
- auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast>(space.resize_team_scratch_space( + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = static_cast>( + space.resize_team_scratch_space( m_scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index f55280e22e3..8fec299c5d5 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -94,10 +94,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + sycl::ext::intel::device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,7 +114,7 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible @@ -155,14 +155,15 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto scratch_flags = static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { sycl::local_accessor local_mem( diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 5333e3c8a83..7feb2110068 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -69,10 +69,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + sycl::ext::intel::device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,7 +88,7 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { @@ -125,13 +125,15 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::device_ptr results_ptr, int values_per_thread) { + sycl::ext::intel::device_ptr results_ptr, + int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -301,9 +303,9 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( - sizeof(value_type) * value_count * n_wgroups)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * + n_wgroups)); sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); diff --git 
a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 27165c59e3a..edb1b54b827 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; + sycl::ext::intel::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; @@ -82,7 +82,7 @@ class Kokkos::Impl::ParallelReduce>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -94,9 +94,9 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( - sizeof(value_type) * std::max(value_count, 1u))); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -113,7 +113,8 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::ext::intel::device_ptr const global_scratch_ptr = + m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -156,8 +157,9 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues @@ -170,12 +172,13 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::ext::intel::device_ptr const global_scratch_ptr = + m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - sycl::device_ptr results_ptr) { + sycl::ext::intel::device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -330,9 +333,9 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); - results_ptr = - static_cast>(instance.scratch_space( - sizeof(value_type) * std::max(value_count, 1u) * init_size)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u) * init_size)); size_t max_work_groups = 2 * @@ -425,10 +428,10 @@ class Kokkos::Impl::ParallelReduce>(space.resize_team_scratch_space( + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = static_cast>( + space.resize_team_scratch_space( m_scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 58cfea6a97a..b773af6cda7 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,7 +146,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - sycl::host_ptr m_scratch_host = nullptr; + sycl::ext::intel::host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -166,8 +166,9 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); @@ -175,8 +176,8 @@ class ParallelScanSYCLBase { auto scan_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::device_ptr global_mem_, - sycl::device_ptr group_results_) { + sycl::ext::intel::device_ptr global_mem_, + sycl::ext::intel::device_ptr group_results_) { auto lambda = [=](sycl::nd_item<1> item) { auto global_mem = global_mem_; 
auto group_results = group_results_; @@ -253,8 +254,8 @@ class ParallelScanSYCLBase { size_t wgroup_size; size_t n_wgroups; - sycl::device_ptr global_mem; - sycl::device_ptr group_results; + sycl::ext::intel::device_ptr global_mem; + sycl::ext::intel::device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -288,10 +289,10 @@ class ParallelScanSYCLBase { // that will contain the sum of the previous workgroups totals. // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass - global_mem = - static_cast>(instance.scratch_space( - n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_host = static_cast>( + global_mem = static_cast>( + instance.scratch_space(n_wgroups * (wgroup_size + 1) * + sizeof(value_type))); + m_scratch_host = static_cast>( instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 2b4c2be5227..715d65a98f2 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl::device_ptr scratch_level_1_ptr, + sycl::ext::intel::device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 7069805a5b5..3b818490901 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -28,7 +28,7 @@ inline constexpr bool use_shuffle_based_algorithm = template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - sycl::device_ptr 
results_ptr, + sycl::ext::intel::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,7 +100,7 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, sycl::device_ptr results_ptr, + ValueType local_value, sycl::ext::intel::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); From 5932685c939a08b1b29b9c56bcb264728c1b16ba Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 22 Apr 2024 17:12:16 +0000 Subject: [PATCH 017/103] Introduce alias based on feature macro --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 11 +++--- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 24 ++++++------ .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 17 ++++---- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 17 ++++---- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 20 +++++----- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 39 ++++++++++--------- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 19 ++++----- core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 5 ++- core/src/setup/Kokkos_Setup_SYCL.hpp | 17 ++++++++ 10 files changed, 101 insertions(+), 70 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index d2112e3e4f2..adfd4c10b04 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,8 +166,9 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -sycl::ext::intel::device_ptr SYCLInternal::resize_team_scratch_space( - int scratch_pool_id, std::int64_t bytes, bool force_shrink) { +Kokkos::Impl::SYCLTypes::device_ptr 
+SYCLInternal::resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, + bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. @@ -251,7 +252,7 @@ void SYCLInternal::finalize() { m_queue.reset(); } -sycl::ext::intel::device_ptr SYCLInternal::scratch_space( +Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { @@ -272,7 +273,7 @@ sycl::ext::intel::device_ptr SYCLInternal::scratch_space( return m_scratchSpace; } -sycl::ext::intel::host_ptr SYCLInternal::scratch_host( +Kokkos::Impl::SYCLTypes::host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { @@ -293,7 +294,7 @@ sycl::ext::intel::host_ptr SYCLInternal::scratch_host( return m_scratchHost; } -sycl::ext::intel::device_ptr SYCLInternal::scratch_flags( +Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 0666e1bd626..de77b8efdeb 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -43,11 +43,13 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - sycl::ext::intel::device_ptr scratch_space(const std::size_t size); - sycl::ext::intel::device_ptr scratch_flags(const std::size_t size); - sycl::ext::intel::host_ptr scratch_host(const std::size_t size); + Kokkos::Impl::SYCLTypes::device_ptr scratch_space( + const std::size_t size); + Kokkos::Impl::SYCLTypes::device_ptr scratch_flags( + const std::size_t size); + 
Kokkos::Impl::SYCLTypes::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - sycl::ext::intel::device_ptr resize_team_scratch_space( + Kokkos::Impl::SYCLTypes::device_ptr resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); @@ -58,19 +60,19 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - sycl::ext::intel::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - sycl::ext::intel::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - sycl::ext::intel::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + Kokkos::Impl::SYCLTypes::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + Kokkos::Impl::SYCLTypes::host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + Kokkos::Impl::SYCLTypes::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space static constexpr int m_n_team_scratch = 10; mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable sycl::ext::intel::device_ptr + mutable Kokkos::Impl::SYCLTypes::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; mutable int m_current_team_scratch = 0; mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index b1d32172d82..7f258ecccae 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -44,7 +44,7 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl::ext::intel::device_ptr m_global_scratch_ptr; + Kokkos::Impl::SYCLTypes::device_ptr m_global_scratch_ptr; 
size_t m_scratch_size[2]; // Only let one ParallelFor instance at a time use the team scratch memory. // The constructor acquires the mutex which is released in the destructor. @@ -72,7 +72,7 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::ext::intel::device_ptr const global_scratch_ptr = + Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { @@ -162,12 +162,13 @@ class Kokkos::Impl::ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. - auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = static_cast>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = + static_cast>( + space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 8fec299c5d5..155f4b00821 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -94,10 +94,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + Kokkos::Impl::SYCLTypes::device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,8 +114,9 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -155,14 +156,16 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * + n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) : static_cast>(host_result_ptr); auto scratch_flags = - static_cast>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 7feb2110068..c00f9bb6232 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -69,10 +69,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + Kokkos::Impl::SYCLTypes::device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,8 +88,9 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { const auto begin = policy.begin(); @@ -126,13 +127,13 @@ class Kokkos::Impl::ParallelReduce>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::ext::intel::device_ptr results_ptr, + Kokkos::Impl::SYCLTypes::device_ptr results_ptr, int values_per_thread) { const auto begin = policy.begin(); @@ -303,9 +304,10 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * - n_wgroups)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * + n_wgroups)); sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index edb1b54b827..b9be1148832 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; + Kokkos::Impl::SYCLTypes::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; @@ -82,7 +82,7 @@ class Kokkos::Impl::ParallelReduce>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -94,9 +94,10 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u))); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -113,7 +114,7 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = + Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -158,7 +159,7 @@ class Kokkos::Impl::ParallelReduce>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -172,13 +173,13 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = + Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - sycl::ext::intel::device_ptr results_ptr) { + Kokkos::Impl::SYCLTypes::device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -333,9 +334,10 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); - results_ptr = static_cast>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u) * init_size)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u) * init_size)); size_t max_work_groups = 2 * @@ -428,12 +430,13 @@ class Kokkos::Impl::ParallelReduce>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = + static_cast>( + space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index b773af6cda7..a3efe56b99c 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,7 +146,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - sycl::ext::intel::host_ptr m_scratch_host = nullptr; + Kokkos::Impl::SYCLTypes::host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -167,7 +167,7 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); auto scratch_flags = - static_cast>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); @@ -176,8 +176,8 @@ class ParallelScanSYCLBase { auto scan_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::ext::intel::device_ptr global_mem_, - sycl::ext::intel::device_ptr group_results_) { + Kokkos::Impl::SYCLTypes::device_ptr 
global_mem_, + Kokkos::Impl::SYCLTypes::device_ptr group_results_) { auto lambda = [=](sycl::nd_item<1> item) { auto global_mem = global_mem_; auto group_results = group_results_; @@ -254,8 +254,8 @@ class ParallelScanSYCLBase { size_t wgroup_size; size_t n_wgroups; - sycl::ext::intel::device_ptr global_mem; - sycl::ext::intel::device_ptr group_results; + Kokkos::Impl::SYCLTypes::device_ptr global_mem; + Kokkos::Impl::SYCLTypes::device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -289,11 +289,12 @@ class ParallelScanSYCLBase { // that will contain the sum of the previous workgroups totals. // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass - global_mem = static_cast>( + global_mem = static_cast>( instance.scratch_space(n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_host = static_cast>( - instance.scratch_host(sizeof(value_type))); + m_scratch_host = + static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 715d65a98f2..d838dc94c67 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl::ext::intel::device_ptr scratch_level_1_ptr, + Kokkos::Impl::SYCLTypes::device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 3b818490901..f25dec76777 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -28,7 +28,7 @@ inline constexpr 
bool use_shuffle_based_algorithm = template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - sycl::ext::intel::device_ptr results_ptr, + Kokkos::Impl::SYCLTypes::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,7 +100,8 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, sycl::ext::intel::device_ptr results_ptr, + ValueType local_value, + Kokkos::Impl::SYCLTypes::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 30f6fa2ad23..7fb10bb39a2 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -45,4 +45,21 @@ #define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() #endif +// FIXME_SYCL Use type directly once it has stabilized in SYCL. +namespace Kokkos::Impl::SYCLTypes { +#ifndef SYCL_EXT_INTEL_USM_ADDRESS_SPACES +#error SYCL_EXT_INTEL_USM_ADDRESS_SPACES undefined! 
+#elif SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2 +template +using device_ptr = sycl::ext::intel::device_ptr; +template +using host_ptr = sycl::ext::intel::host_ptr; +#else +template +using device_ptr = sycl::device_ptr; +template +using host_ptr = sycl::host_ptr; +#endif +} // namespace Kokkos::Impl::SYCLTypes + #endif From a7827731cf8256a9387a8786555aa5f97dccd17a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 22 Apr 2024 19:07:13 -0400 Subject: [PATCH 018/103] Kokkos::Impl::SYCLTypes:: -> Kokkos::Impl::sycl_ --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 11 +- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 24 ++- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 12 +- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 20 +-- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 22 +-- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 34 ++-- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 166 +++++++++--------- core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 5 +- core/src/setup/Kokkos_Setup_SYCL.hpp | 12 +- 10 files changed, 142 insertions(+), 166 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index adfd4c10b04..5843dca8123 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,9 +166,8 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -Kokkos::Impl::SYCLTypes::device_ptr -SYCLInternal::resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, - bool force_shrink) { +Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( + int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
@@ -252,7 +251,7 @@ void SYCLInternal::finalize() { m_queue.reset(); } -Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_space( +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { @@ -273,7 +272,7 @@ Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_space( return m_scratchSpace; } -Kokkos::Impl::SYCLTypes::host_ptr SYCLInternal::scratch_host( +Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { @@ -294,7 +293,7 @@ Kokkos::Impl::SYCLTypes::host_ptr SYCLInternal::scratch_host( return m_scratchHost; } -Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_flags( +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index de77b8efdeb..2d784ef8a5f 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -43,13 +43,11 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - Kokkos::Impl::SYCLTypes::device_ptr scratch_space( - const std::size_t size); - Kokkos::Impl::SYCLTypes::device_ptr scratch_flags( - const std::size_t size); - Kokkos::Impl::SYCLTypes::host_ptr scratch_host(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); + Kokkos::Impl::sycl_host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - Kokkos::Impl::SYCLTypes::device_ptr resize_team_scratch_space( + Kokkos::Impl::sycl_device_ptr resize_team_scratch_space( int scratch_pool_id, std::int64_t 
bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); @@ -60,19 +58,19 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - Kokkos::Impl::SYCLTypes::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - Kokkos::Impl::SYCLTypes::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - Kokkos::Impl::SYCLTypes::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + Kokkos::Impl::sycl_host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space static constexpr int m_n_team_scratch = 10; mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable Kokkos::Impl::SYCLTypes::device_ptr + mutable Kokkos::Impl::sycl_device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; mutable int m_current_team_scratch = 0; mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 7f258ecccae..d98f4837315 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -44,7 +44,7 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - Kokkos::Impl::SYCLTypes::device_ptr m_global_scratch_ptr; + sycl_device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; // Only let one ParallelFor instance at a time use the team scratch memory. // The constructor acquires the mutex which is released in the destructor. 
@@ -72,8 +72,7 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = - m_global_scratch_ptr; + sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -165,10 +164,9 @@ class Kokkos::Impl::ParallelFor, auto& space = *m_policy.space().impl_internal_space_instance(); m_scratch_pool_id = space.acquire_team_scratch_space(); m_global_scratch_ptr = - static_cast>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + static_cast>(space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 155f4b00821..c1414ee0581 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -94,10 +94,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + sycl_device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,9 +114,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -156,17 +155,14 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * - n_wgroups)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto scratch_flags = - static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { sycl::local_accessor local_mem( diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index c00f9bb6232..dbe2366b8bc 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -69,10 +69,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + sycl_device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,9 +88,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { const auto begin = policy.begin(); @@ -126,15 +125,13 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr, - int values_per_thread) { + sycl_device_ptr results_ptr, int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -305,9 +302,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * - n_wgroups)); + static_cast>(instance.scratch_space( + sizeof(value_type) * value_count * n_wgroups)); sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index b9be1148832..1332fafde94 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; + sycl_device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; @@ -82,7 +82,7 @@ class Kokkos::Impl::ParallelReduce>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -95,9 +95,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u))); + static_cast>(instance.scratch_space( + sizeof(value_type) * std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -114,8 +113,7 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = - m_global_scratch_ptr; + sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -158,9 +156,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues @@ -173,13 +170,12 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = - m_global_scratch_ptr; + sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr) { + sycl_device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -335,9 +331,8 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); results_ptr = - static_cast>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u) * init_size)); + static_cast>(instance.scratch_space( + sizeof(value_type) * std::max(value_count, 1u) * init_size)); size_t max_work_groups = 2 * @@ -433,10 +428,9 @@ class Kokkos::Impl::ParallelReduce>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + static_cast>(space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index a3efe56b99c..bfc3fba7412 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,7 +146,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - Kokkos::Impl::SYCLTypes::host_ptr m_scratch_host = nullptr; + sycl_host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -166,96 +166,93 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = - static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); // Initialize global memory - auto scan_lambda_factory = - [&](sycl::local_accessor local_mem, - sycl::local_accessor num_teams_done, - Kokkos::Impl::SYCLTypes::device_ptr global_mem_, - Kokkos::Impl::SYCLTypes::device_ptr group_results_) { - auto lambda = [=](sycl::nd_item<1> item) { - auto global_mem = global_mem_; - auto group_results = group_results_; - - const 
CombinedFunctorReducer< - FunctorType, typename Analysis::Reducer>& functor_reducer = - functor_wrapper.get_functor(); - const FunctorType& functor = functor_reducer.get_functor(); - const typename Analysis::Reducer& reducer = - functor_reducer.get_reducer(); - - const auto n_wgroups = item.get_group_range()[0]; - const int wgroup_size = item.get_local_range()[0]; - - const int local_id = item.get_local_linear_id(); - const index_type global_id = item.get_global_linear_id(); - - // Initialize local memory - value_type local_value; - reducer.init(&local_value); - if (global_id < size) { - if constexpr (std::is_void::value) - functor(global_id + begin, local_value, false); - else - functor(WorkTag(), global_id + begin, local_value, false); - } + auto scan_lambda_factory = [&](sycl::local_accessor local_mem, + sycl::local_accessor + num_teams_done, + sycl_device_ptr global_mem_, + sycl_device_ptr group_results_) { + auto lambda = [=](sycl::nd_item<1> item) { + auto global_mem = global_mem_; + auto group_results = group_results_; + + const CombinedFunctorReducer& + functor_reducer = functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); + + const auto n_wgroups = item.get_group_range()[0]; + const int wgroup_size = item.get_local_range()[0]; + + const int local_id = item.get_local_linear_id(); + const index_type global_id = item.get_global_linear_id(); + + // Initialize local memory + value_type local_value; + reducer.init(&local_value); + if (global_id < size) { + if constexpr (std::is_void::value) + functor(global_id + begin, local_value, false); + else + functor(WorkTag(), global_id + begin, local_value, false); + } - workgroup_scan<>(item, reducer, local_mem, local_value, - wgroup_size); + workgroup_scan<>(item, reducer, local_mem, local_value, wgroup_size); - // Write results to global memory - if (global_id < size) global_mem[global_id] = 
local_value; + // Write results to global memory + if (global_id < size) global_mem[global_id] = local_value; - if (local_id == wgroup_size - 1) { - group_results[item.get_group_linear_id()] = - local_mem[item.get_sub_group().get_group_range()[0] - 1]; + if (local_id == wgroup_size - 1) { + group_results[item.get_group_linear_id()] = + local_mem[item.get_sub_group().get_group_range()[0] - 1]; - sycl::atomic_ref - scratch_flags_ref(*scratch_flags); - num_teams_done[0] = ++scratch_flags_ref; - } - item.barrier(sycl::access::fence_space::global_space); - if (num_teams_done[0] == n_wgroups) { - if (local_id == 0) *scratch_flags = 0; - value_type total; - reducer.init(&total); - - for (unsigned int offset = 0; offset < n_wgroups; - offset += wgroup_size) { - index_type id = local_id + offset; - if (id < static_cast(n_wgroups)) - local_value = group_results[id]; - else - reducer.init(&local_value); - workgroup_scan<>( - item, reducer, local_mem, local_value, - std::min(n_wgroups - offset, wgroup_size)); - if (id < static_cast(n_wgroups)) { - reducer.join(&local_value, &total); - group_results[id] = local_value; - } - reducer.join( - &total, - &local_mem[item.get_sub_group().get_group_range()[0] - 1]); - if (offset + wgroup_size < n_wgroups) - item.barrier(sycl::access::fence_space::global_space); - } + sycl::atomic_ref + scratch_flags_ref(*scratch_flags); + num_teams_done[0] = ++scratch_flags_ref; + } + item.barrier(sycl::access::fence_space::global_space); + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; + value_type total; + reducer.init(&total); + + for (unsigned int offset = 0; offset < n_wgroups; + offset += wgroup_size) { + index_type id = local_id + offset; + if (id < static_cast(n_wgroups)) + local_value = group_results[id]; + else + reducer.init(&local_value); + workgroup_scan<>( + item, reducer, local_mem, local_value, + std::min(n_wgroups - offset, wgroup_size)); + if (id < static_cast(n_wgroups)) { + reducer.join(&local_value, 
&total); + group_results[id] = local_value; } - }; - return lambda; - }; + reducer.join( + &total, + &local_mem[item.get_sub_group().get_group_range()[0] - 1]); + if (offset + wgroup_size < n_wgroups) + item.barrier(sycl::access::fence_space::global_space); + } + } + }; + return lambda; + }; size_t wgroup_size; size_t n_wgroups; - Kokkos::Impl::SYCLTypes::device_ptr global_mem; - Kokkos::Impl::SYCLTypes::device_ptr group_results; + sycl_device_ptr global_mem; + sycl_device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -289,12 +286,11 @@ class ParallelScanSYCLBase { // that will contain the sum of the previous workgroups totals. // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass - global_mem = static_cast>( - instance.scratch_space(n_wgroups * (wgroup_size + 1) * - sizeof(value_type))); - m_scratch_host = - static_cast>( - instance.scratch_host(sizeof(value_type))); + global_mem = + static_cast>(instance.scratch_space( + n_wgroups * (wgroup_size + 1) * sizeof(value_type))); + m_scratch_host = static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index d838dc94c67..910e3602714 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - Kokkos::Impl::SYCLTypes::device_ptr scratch_level_1_ptr, + sycl_device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index f25dec76777..06be143ecca 100644 --- 
a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -28,7 +28,7 @@ inline constexpr bool use_shuffle_based_algorithm = template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr, + sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,8 +100,7 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr, + ValueType local_value, sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 7fb10bb39a2..b117d75acb9 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -46,20 +46,20 @@ #endif // FIXME_SYCL Use type directly once it has stabilized in SYCL. -namespace Kokkos::Impl::SYCLTypes { +namespace Kokkos::Impl { #ifndef SYCL_EXT_INTEL_USM_ADDRESS_SPACES #error SYCL_EXT_INTEL_USM_ADDRESS_SPACES undefined! 
#elif SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2 template -using device_ptr = sycl::ext::intel::device_ptr; +using sycl_device_ptr = sycl::ext::intel::device_ptr; template -using host_ptr = sycl::ext::intel::host_ptr; +using sycl_host_ptr = sycl::ext::intel::host_ptr; #else template -using device_ptr = sycl::device_ptr; +using sycl_device_ptr = sycl::device_ptr; template -using host_ptr = sycl::host_ptr; +using sycl_host_ptr = sycl::host_ptr; #endif -} // namespace Kokkos::Impl::SYCLTypes +} // namespace Kokkos::Impl #endif From ab3cae4865aec2114a7fe21288fdab2916b92188 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 24 Apr 2024 10:14:56 +0200 Subject: [PATCH 019/103] Fix wrong macro guards for deprecated Kokkos::pair specialization Co-Authored-By: Nicolas Morales --- core/src/Kokkos_Pair.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index d1bd11f7162..9c3516eb222 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -413,7 +413,7 @@ KOKKOS_FORCEINLINE_FUNCTION pair tie(T1& x, T2& y) { return (pair(x, y)); } -#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 // // Specialization of Kokkos::pair for a \c void second argument. This // is not actually a "pair"; it only contains one element, the first. 
From fafe861d0683cdde279a44dc8dc10b71d9866c30 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 24 Apr 2024 14:55:20 +0200 Subject: [PATCH 020/103] Fix support for Kokkos::Array of const-qualified element type --- core/src/Kokkos_Array.hpp | 3 ++- core/unit_test/TestArray.cpp | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index dcba8a42484..29e5edd9baa 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -134,8 +134,9 @@ struct Array { } private: + template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - Impl::is_swappable::value> + Impl::is_swappable::value> kokkos_swap(Array& a, Array& b) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index e691d83ebe2..d20d355b792 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -120,6 +120,13 @@ static_assert(test_array_aggregate_initialization()); } } +constexpr bool test_array_const_qualified_element_type() { + Kokkos::Array a{255}; + return a[0] == 255; +} + +static_assert(test_array_const_qualified_element_type()); + // User-defined type providing a sepcialization of kokkos_swap struct MyInt { int i; From 63eef4623a84634f97b2761c354792e5c0613cd9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 24 Apr 2024 23:35:59 +0200 Subject: [PATCH 021/103] Try to fix the CUDA 11.0 build --- core/src/Kokkos_Array.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 29e5edd9baa..6ff27db061b 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -137,8 +137,13 @@ struct Array { template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> - kokkos_swap(Array& a, - Array& b) noexcept(Impl::is_nothrow_swappable_v) { + kokkos_swap( +#ifdef 
KOKKOS_ENABLE_DEPRECATED_CODE_4 + Array& a, Array& b +#else + Array& a, Array& b +#endif + ) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { kokkos_swap(a[i], b[i]); } From ebb1cb308a956a4b98f2a5eb26660ed8ca3fe6ad Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 07:43:16 +0200 Subject: [PATCH 022/103] Revert "Try to fix the CUDA 11.0 build" This reverts commit 63eef4623a84634f97b2761c354792e5c0613cd9. --- core/src/Kokkos_Array.hpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 6ff27db061b..29e5edd9baa 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -137,13 +137,8 @@ struct Array { template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> - kokkos_swap( -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - Array& a, Array& b -#else - Array& a, Array& b -#endif - ) noexcept(Impl::is_nothrow_swappable_v) { + kokkos_swap(Array& a, + Array& b) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { kokkos_swap(a[i], b[i]); } From 031f6d94a4294c767c4e049f9aa5fadee47c9ef3 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 08:11:46 +0200 Subject: [PATCH 023/103] Alternate definition of Impl::is_nothrow_swappable_v for NVCC version less than 11.4 --- core/src/Kokkos_Swap.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp index 2f849a13ab6..fd69a8e6266 100644 --- a/core/src/Kokkos_Swap.hpp +++ b/core/src/Kokkos_Swap.hpp @@ -37,6 +37,26 @@ kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& namespace Impl { +// Workaround for the definition of is_nothrow_swappable_v +#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) +template +struct is_swappable { + template + static decltype(kokkos_swap(std::declval(), std::declval())) + test_swap(int) 
noexcept(noexcept(kokkos_swap(std::declval(), + std::declval()))); + struct Nope {}; // test_swap must return a complete type for the definition + // of nothrow below + template + static Nope test_swap(long); + static constexpr bool value = + !std::is_same_v(0)), Nope>; + static constexpr bool nothrow = noexcept(test_swap(0)); +}; + +template +inline constexpr bool is_nothrow_swappable_v = is_swappable::nothrow; +#else template struct is_swappable { template @@ -52,6 +72,7 @@ struct is_swappable { template inline constexpr bool is_nothrow_swappable_v = noexcept(kokkos_swap(std::declval(), std::declval())); +#endif } // namespace Impl From 2391f1765318725042dcdad6581eca6c03cb5adc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 17:23:11 +0200 Subject: [PATCH 024/103] Avoid introducing a 2nd definition of the Impl::swappable trait Co-Authored-By: Daniel Arndt --- core/src/Kokkos_Swap.hpp | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp index fd69a8e6266..907f8607a7e 100644 --- a/core/src/Kokkos_Swap.hpp +++ b/core/src/Kokkos_Swap.hpp @@ -37,26 +37,6 @@ kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& namespace Impl { -// Workaround for the definition of is_nothrow_swappable_v -#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) -template -struct is_swappable { - template - static decltype(kokkos_swap(std::declval(), std::declval())) - test_swap(int) noexcept(noexcept(kokkos_swap(std::declval(), - std::declval()))); - struct Nope {}; // test_swap must return a complete type for the definition - // of nothrow below - template - static Nope test_swap(long); - static constexpr bool value = - !std::is_same_v(0)), Nope>; - static constexpr bool nothrow = noexcept(test_swap(0)); -}; - -template -inline constexpr bool is_nothrow_swappable_v = is_swappable::nothrow; -#else template struct is_swappable { template @@ -69,6 
+49,13 @@ struct is_swappable { !std::is_same_v(0)), Nope>; }; +#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) +template +inline constexpr bool is_nothrow_swappable_v = + is_swappable::value&& noexcept( + kokkos_swap(std::declval&>(), + std::declval&>())); +#else template inline constexpr bool is_nothrow_swappable_v = noexcept(kokkos_swap(std::declval(), std::declval())); From d434f87e91069bc0d0af020053f4ca7c3f3b80c4 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 20:51:34 +0200 Subject: [PATCH 025/103] Do not require OpenMP support for languages other than CXX Specify CXX component when searching for OpenMP so that OpenMP support is not required for other languages with CMake. One caveat is that finding the OpenMP dependency downstream will require CMake minimum version of 3.10 https://cmake.org/cmake/help/latest/module/FindOpenMP.html Co-Authored-By: Luca Bertagna --- cmake/kokkos_tpls.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index 6ef3b79bde2..df01f200d13 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -103,13 +103,13 @@ if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) endif() IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED) + find_package(OpenMP REQUIRED COMPONENTS CXX) # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency # so we just append the flags here instead of linking with the OpenMP target. 
IF(KOKKOS_HAS_TRILINOS) COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED) + KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) ENDIF() ENDIF() From 19ca9ce97a80bbf9f43353b22c09c437f1389384 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 26 Apr 2024 09:47:04 -0600 Subject: [PATCH 026/103] Update version --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f4252437111..d0cf6696f89 100644 --- a/README.md +++ b/README.md @@ -28,12 +28,12 @@ To start learning about Kokkos: The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -The current release is [4.2.01](https://github.com/kokkos/kokkos/releases/tag/4.2.01). +The current release is [4.3.00](https://github.com/kokkos/kokkos/releases/tag/4.3.00). ```bash -curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.00.tar.gz # Or with wget -wget https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.00.tar.gz ``` To clone the latest development version of Kokkos from GitHub: From 9686392118b72205251a0c9511c5fceacc1b6db8 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 26 Apr 2024 09:52:03 -0600 Subject: [PATCH 027/103] Add Linux Foundation notice and fix C++ standard --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d0cf6696f89..7d9d70fac5c 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ backends in development. **Kokkos Core is part of the [Kokkos C++ Performance Portability Programming Ecosystem](https://kokkos.org/about/abstract/).** +Kokkos is a [Linux Foundation](https://linuxfoundation.org) project. 
+ ## Learning about Kokkos To start learning about Kokkos: @@ -44,7 +46,7 @@ git clone -b develop https://github.com/kokkos/kokkos.git ### Building Kokkos -To build Kokkos, you will need to have a C++ compiler that supports C++14 or later. +To build Kokkos, you will need to have a C++ compiler that supports C++17 or later. All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.org/kokkos-core-wiki/requirements.html). Building and installation instructions are described [here](https://kokkos.org/kokkos-core-wiki/building.html). From 7e7709fdb8029e6e97f5a5f8549c3a42a4ddbdbd Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Sat, 27 Apr 2024 12:32:21 -0400 Subject: [PATCH 028/103] SYCL: Avoid deprecated floating-point number abs overloads (#6959) * Avoid deprecated floating-point number abs overloads * Add a comment --- core/src/Kokkos_MathematicalFunctions.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index 3fead8dd293..19967782e5e 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -277,12 +277,20 @@ KOKKOS_INLINE_FUNCTION long long abs(long long n) { #endif } KOKKOS_INLINE_FUNCTION float abs(float x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } KOKKOS_INLINE_FUNCTION double abs(double x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } inline long double abs(long double x) { using std::abs; From 4ec82963fbcd174aab79a86875c85e09a4fcc170 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Sun, 28 Apr 2024 08:52:18 -0700 Subject: [PATCH 029/103] OpenMPTarget: Update loop order in MDRange 
(#6925) * OpenMPTarget: Reverse loop order in MDRange. * OpenMPTarget: Honor user request for iteration in MDRange. * OpenMPTarget: clang-format * OpenMPTarget: Remove unnecessary includes. --------- Co-authored-by: Rahulkumar Gayatri --- core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 3 +- .../Kokkos_OpenMPTarget_MDRangePolicy.hpp | 5 + ...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 383 +++++++++++ ...s_OpenMPTarget_ParallelReduce_MDRange.hpp} | 633 ++++++++---------- 4 files changed, 674 insertions(+), 350 deletions(-) create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp rename core/src/OpenMPTarget/{Kokkos_OpenMPTarget_Parallel_MDRange.hpp => Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp} (62%) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index ea4e7f6baba..84c7b85f11d 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -146,7 +146,8 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { /*--------------------------------------------------------------------------*/ #include -#include +#include +#include #include /*--------------------------------------------------------------------------*/ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp index d718f56d38b..e353676b617 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp @@ -22,6 +22,10 @@ namespace Kokkos { namespace Impl { +using OpenMPTargetIterateLeft = std::integral_constant; +using OpenMPTargetIterateRight = + std::integral_constant; + template struct ThreadAndVectorNestLevel +#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" + +//----------------------------------------------------------------------------
+//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::MDRangePolicy; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + const FunctorType m_functor; + const Policy m_policy; + + public: + inline void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + FunctorType functor(m_functor); + Policy policy = m_policy; + + typename Policy::point_type unused; + static_assert(1 < Policy::rank && Policy::rank < 7); + static_assert(Policy::inner_direction == Iterate::Left || + Policy::inner_direction == Iterate::Right); + + execute_tile( + unused, functor, policy, + std::integral_constant()); + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const 
Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const 
Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i5 = begin_5; i5 < end_5; ++i5) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 
= policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i1 = begin_1; i1 < end_1; ++i1) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if 
constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : 
functor) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + // TODO DZP: based on a conversation with Christian, we're using 256 as a + // heuristic here. We need something better once we can query these kinds of + // properties + template + static int max_tile_size_product(const Policy&, const Functor&) { + return 256; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP */ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp similarity index 62% rename from core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp rename to core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index 6878531730d..0782a79302a 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -14,128 +14,122 @@ // //@HEADER -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP #include #include -#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" #include -// WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly, -// this was tracked down to a bug in clang with regards of mapping structs -// with arrays of long in it. 
Arrays of int might be fine though ... -#define KOKKOS_IMPL_MDRANGE_USE_NO_TILES // undef EOF - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { +template +class ParallelReduce, + Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using Index = typename Policy::index_type; - const FunctorType m_functor; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + + static constexpr bool UseReducer = + !std::is_same_v; + + const pointer_type m_result_ptr; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; + using ParReduceCopy = ParallelReduceCopy; + + bool m_result_ptr_on_device; + + // Only let one ParallelReduce instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_memory_lock; + public: inline void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); - Policy policy = m_policy; - -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - typename Policy::point_type unused; - - execute_tile(unused, functor, policy); -#else - const int64_t begin = 0; - const int64_t end = m_policy.m_num_tiles; - -#pragma omp target teams distribute map(to : functor) num_teams(end - begin) - { - for (ptrdiff_t tile_idx = begin; tile_idx < end; ++tile_idx) { - -#pragma omp parallel - { - typename Policy::point_type offset; - if (Policy::outer_direction == Policy::Left) { - for (int i = 0; i < Policy::rank; ++i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } else { - for (int i = Policy::rank - 1; i >= 0; --i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } - execute_tile(offset, functor, policy); - } - } - } -#endif + execute_tile( + m_functor_reducer.get_functor(), m_policy, m_result_ptr, + std::integral_constant()); } - template + template + inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + Policy arg_policy, const ViewType& arg_result_view) + : m_result_ptr(arg_result_view.data()), + m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr_on_device( + MemorySpaceAccess::accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + 
OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to \ + : functor) \ + reduction(custom \ + : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? 
end_1 : policy.m_upper[1]; - -#pragma omp for collapse(2) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + } else { +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ +reduction(+:result) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } -#endif + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -144,107 +138,119 @@ class ParallelFor, const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join( \ + omp_out, omp_in)) \ + initializer( \ + OpenMPTargetReducerWrapper ::init( \ + omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? 
end_2 : policy.m_upper[2]; - -#pragma omp for collapse(3) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + } else { +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ +reduction(+:result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } -#endif + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; + const Index begin_2 = policy.m_lower[3]; + const Index begin_3 = policy.m_lower[2]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; const Index end_3 = policy.m_upper[3]; -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? 
end_3 : policy.m_upper[3]; - -#pragma omp for collapse(4) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + } else { +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ +reduction(+:result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } -#endif + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -257,64 +263,65 @@ class ParallelFor, const Index end_3 = policy.m_upper[3]; const Index end_4 = policy.m_upper[4]; -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + ValueType result = ValueType(); + + // 
FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? 
end_4 : policy.m_upper[4]; - -#pragma omp for collapse(5) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } else { +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ +reduction(+:result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } -#endif + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -329,140 +336,69 @@ class ParallelFor, const Index end_4 = policy.m_upper[4]; const Index end_5 = policy.m_upper[5]; -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication 
for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); + functor(i0, i1, i2, i3, i4, i5, result); else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); } } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; - - const ptrdiff_t begin_5 = offset[5]; - ptrdiff_t end_5 = begin_5 + policy.m_tile[5]; - end_5 = end_5 < policy.m_upper[5] ? 
end_5 : policy.m_upper[5]; - -#pragma omp for collapse(6) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) - for (ptrdiff_t i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5); + } else { +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ +reduction(+:result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); + } } -#endif - } - - inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - // TODO DZP: based on a conversation with Christian, we're using 256 as a - // heuristic here. 
We need something better once we can query these kinds of - // properties - template - static int max_tile_size_product(const Policy&, const Functor&) { - return 256; - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -class ParallelReduce, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::MDRangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - - static constexpr bool UseReducer = - !std::is_same_v; - - const pointer_type m_result_ptr; - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - - using ParReduceCopy = ParallelReduceCopy; - - bool m_result_ptr_on_device; - - // Only let one ParallelReduce instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock m_scratch_memory_lock; + } + } + } + } + } - public: - inline void execute() const { - execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr); + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template - inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - Policy arg_policy, const ViewType& arg_result_view) - : m_result_ptr(arg_result_view.data()), - m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} - template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -509,9 +445,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -567,9 +503,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[3]; @@ -630,9 +566,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& 
functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -701,9 +637,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -788,5 +724,4 @@ reduction(+:result) //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#undef KOKKOS_IMPL_MDRANGE_USE_NO_TILES -#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ +#endif /* KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP */ From 77ea52f97685908e62e11f10fe263f51f4fc0c46 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 30 Apr 2024 20:49:51 -0400 Subject: [PATCH 030/103] Threads: Don't silently allow m_instance to be a nullptr (#6969) * Threads: Don't silently allod m_instance to be a nullptr * Assert that m_instance is not nullptr --- core/src/Threads/Kokkos_Threads_Team.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Team.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp index fd0f221365b..a3501a437d2 100644 --- a/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -188,8 +188,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return value; - if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -229,8 
+227,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return; - type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution @@ -285,8 +281,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return type(0); - volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -358,6 +352,7 @@ class ThreadsExecTeamMember { m_chunk_size(team.chunk_size()), m_league_chunk_end(0), m_team_alloc(team.team_alloc()) { + KOKKOS_ASSERT(m_instance != nullptr); if (team.league_size()) { // Execution is using device-team interface: From f699a2c7a2668832e74747ed4816bef683937ba7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 30 Apr 2024 21:44:09 -0400 Subject: [PATCH 031/103] Fix enabling OpenMP with HIP and "compile as CMake language" --- cmake/kokkos_tpls.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index df01f200d13..c7c352ae35f 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -111,6 +111,9 @@ IF (Kokkos_ENABLE_OPENMP) ELSE() KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) ENDIF() + IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + ENDIF() ENDIF() #Convert list to newlines (which CMake doesn't always like in cache variables) From 2574b802922d2d13cdd80b2b171ff5f3cd5ef15b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 30 Apr 2024 21:49:46 -0400 Subject: [PATCH 032/103] Fix OpenMP+CUDA when `Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE` is `ON` Co-Authored-By: Daniel Arndt --- cmake/kokkos_tpls.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index c7c352ae35f..cda9e0d6004 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -114,6 +114,9 @@ IF (Kokkos_ENABLE_OPENMP) 
IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) ENDIF() + IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + ENDIF() ENDIF() #Convert list to newlines (which CMake doesn't always like in cache variables) From ccd0126b88a08b27c86f11ecd0447454b96b0b52 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 1 May 2024 12:03:36 -0400 Subject: [PATCH 033/103] Fix fedora CI builds with flang-new --- example/build_cmake_installed/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/example/build_cmake_installed/CMakeLists.txt b/example/build_cmake_installed/CMakeLists.txt index aaf745b418d..c025f1d7d28 100644 --- a/example/build_cmake_installed/CMakeLists.txt +++ b/example/build_cmake_installed/CMakeLists.txt @@ -12,6 +12,7 @@ find_package(Kokkos REQUIRED) add_executable(example cmake_example.cpp foo.f) if(CMAKE_Fortran_COMPILER_ID STREQUAL LLVMFlang) set_target_properties(example PROPERTIES LINKER_LANGUAGE Fortran) + target_link_options(example PRIVATE -fno-fortran-main) endif() # This is the only thing required to set up compiler/linker flags From 45a14049163732fbc5eb249282d15424ebb91d55 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 2 May 2024 09:22:10 -0600 Subject: [PATCH 034/103] Fix Copyright file --- Copyright.txt | 49 ++++++++----------------------------------------- LICENSE | 10 ---------- 2 files changed, 8 insertions(+), 51 deletions(-) diff --git a/Copyright.txt b/Copyright.txt index 5e2f8d8647b..cbba3efc7bc 100644 --- a/Copyright.txt +++ b/Copyright.txt @@ -1,41 +1,8 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER +************************************************************************ + + Kokkos v. 4.0 + Copyright (2022) National Technology & Engineering + Solutions of Sandia, LLC (NTESS). + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. Government retains certain rights in this software. 
diff --git a/LICENSE b/LICENSE index 6572cc2db05..4d9d69d7c44 100644 --- a/LICENSE +++ b/LICENSE @@ -1,13 +1,3 @@ - ************************************************************************ - - Kokkos v. 4.0 - Copyright (2022) National Technology & Engineering - Solutions of Sandia, LLC (NTESS). - - Under the terms of Contract DE-NA0003525 with NTESS, - the U.S. Government retains certain rights in this software. - - ============================================================================== Kokkos is under the Apache License v2.0 with LLVM Exceptions: ============================================================================== From c6d86474a83c460e9de37bba938b63f5d9580070 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 2 May 2024 16:10:07 -0400 Subject: [PATCH 035/103] Also use is_nothrow_swappable workaround for Intel Classic Compilers (#6983) * Also use is_nothrow_swappable workaround for Intel Classic Compilers * Use template parameter U directly in kokkos_swap overload --- core/src/Kokkos_Array.hpp | 2 +- core/src/Kokkos_Swap.hpp | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 29e5edd9baa..0a1ced93c8f 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -138,7 +138,7 @@ struct Array { friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> kokkos_swap(Array& a, - Array& b) noexcept(Impl::is_nothrow_swappable_v) { + Array& b) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { kokkos_swap(a[i], b[i]); } diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp index 907f8607a7e..2f849a13ab6 100644 --- a/core/src/Kokkos_Swap.hpp +++ b/core/src/Kokkos_Swap.hpp @@ -49,17 +49,9 @@ struct is_swappable { !std::is_same_v(0)), Nope>; }; -#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) -template -inline constexpr bool is_nothrow_swappable_v = - is_swappable::value&& 
noexcept( - kokkos_swap(std::declval&>(), - std::declval&>())); -#else template inline constexpr bool is_nothrow_swappable_v = noexcept(kokkos_swap(std::declval(), std::declval())); -#endif } // namespace Impl From 69567f3051dbdc51e65221c8b68ee41849749a4f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 3 May 2024 08:57:09 -0400 Subject: [PATCH 036/103] Add thread-safety tests (#6938) * Add thread-safety tests * Disable thread-safety tests for Serial and OpenMP for now * Cleanup include and namespace * Skip tests for OpenACC in CMakeLists.txt * Avoid std::move * Comment on tests * Use more atomics * Simplify test --- core/unit_test/CMakeLists.txt | 7 + core/unit_test/TestExecSpaceThreadSafety.hpp | 319 +++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 core/unit_test/TestExecSpaceThreadSafety.hpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 3b14bec03a2..4d0ce3b22e3 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -148,6 +148,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Crs DeepCopyAlignment ExecSpacePartitioning + ExecSpaceThreadSafety ExecutionSpace FunctorAnalysis HostSharedPtr @@ -426,6 +427,7 @@ if(Kokkos_ENABLE_OPENACC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ExecSpaceThreadSafety.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp @@ -637,6 +639,8 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) endif() if(Kokkos_ENABLE_SERIAL) + list(REMOVE_ITEM Serial_SOURCES1 + 
${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_ExecSpaceThreadSafety.cpp) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_Serial1 SOURCES @@ -667,6 +671,9 @@ if(Kokkos_ENABLE_THREADS) endif() if (Kokkos_ENABLE_OPENMP) + list(REMOVE_ITEM OpenMP_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/openmp/TestOpenMP_ExecSpaceThreadSafety.cpp) + set(OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp ) diff --git a/core/unit_test/TestExecSpaceThreadSafety.hpp b/core/unit_test/TestExecSpaceThreadSafety.hpp new file mode 100644 index 00000000000..20b802babe0 --- /dev/null +++ b/core/unit_test/TestExecSpaceThreadSafety.hpp @@ -0,0 +1,319 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +#ifdef KOKKOS_ENABLE_OPENMP +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { +#pragma omp parallel num_threads(2) + { + if (omp_get_thread_num() == 0) l1(); + if (omp_get_thread_num() == 1) l2(); + } +} +// We cannot run the multithreaded test when threads or HPX is enabled because +// we cannot launch a thread from inside another thread +#elif !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_HPX) +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + std::thread t1(l1); + std::thread t2(l2); + t1.join(); + t2.join(); +} +#else +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + l1(); + l2(); +} +#endif + +// The idea for all of these tests is to access a View from kernels submitted by +// two different threads to the same execution space instance. If the kernels +// are executed concurrently, we expect to count too many increments. 
+void run_exec_space_thread_safety_range() { + constexpr int N = 10000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::RangePolicy(exec, 0, 1), KOKKOS_LAMBDA(int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_range(); +} + +void run_exec_space_thread_safety_mdrange() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_mdrange(); +} + +void run_exec_space_thread_safety_team_policy() { + constexpr int N = 1000000; + constexpr int M = 10; + + 
Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member) { + Kokkos::single(Kokkos::PerTeam(team_member), [=]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + run_exec_space_thread_safety_team_policy(); +} + +void run_exec_space_thread_safety_range_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_reduce) { + run_exec_space_thread_safety_range_reduce(); +} + +void run_exec_space_thread_safety_mdrange_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + 
+ auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange_reduce) { +// FIXME_INTEL +#ifdef KOKKOS_COMPILER_INTEL + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMP using the " + "legacy Intel compiler"; +#endif + run_exec_space_thread_safety_mdrange_reduce(); +} + +void run_exec_space_thread_safety_team_policy_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member, + int &update) { + Kokkos::single(Kokkos::PerTeam(team_member), [=, &update]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }); + }, + error); + } + }; + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy_reduce) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + // FIXME_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && 
defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is know to fail with SYCL+Cuda"; +#endif + run_exec_space_thread_safety_team_policy_reduce(); +} + +void run_exec_space_thread_safety_range_scan() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_scan( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &, const bool final) { + if (final) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + } + }); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_scan) { + run_exec_space_thread_safety_range_scan(); +} + +} // namespace From 9c7920291d7fc100ed94133da0dbe2412c8d2f05 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 May 2024 18:30:04 -0400 Subject: [PATCH 037/103] Fix deprecation warnings with GCC for pair comparison operators Co-Authored-By: Andrey Prokopenko --- core/src/Kokkos_Pair.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 9c3516eb222..2b7f275d06d 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -450,37 +450,37 @@ struct KOKKOS_DEPRECATED pair { // template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( const pair& lhs, const pair& rhs) { return lhs.first == rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( const 
pair& lhs, const pair& rhs) { return !(lhs == rhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( const pair& lhs, const pair& rhs) { return lhs.first < rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( const pair& lhs, const pair& rhs) { return !(rhs < lhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( const pair& lhs, const pair& rhs) { return rhs < lhs; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } From 7b8e3a68fcbf3a8deef67dbf5287c5331c73df3a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 May 2024 17:23:34 +0000 Subject: [PATCH 038/103] Fix TPL_LIBRARY_SUFFIXES for 32-bit build --- cmake/kokkos_functions.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/kokkos_functions.cmake b/cmake/kokkos_functions.cmake index 9dab1ca00ea..d1f1e0d7a78 100644 --- a/cmake/kokkos_functions.cmake +++ b/cmake/kokkos_functions.cmake @@ -709,7 +709,12 @@ MACRO(kokkos_find_imported NAME) ENDIF() IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib lib64) + SET(TPL_LIBRARY_SUFFIXES lib) + IF(KOKKOS_IMPL_32BIT) + LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) + ELSE() + LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) + ENDIF() ENDIF() SET(${NAME}_INCLUDE_DIRS) From 28260178f4d68fb2bfb7ccfc8d7239e264b1d166 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 May 2024 13:34:15 -0400 Subject: [PATCH 039/103] Avoid duplicated definition of KOKKOS_IMPL_32BIT --- .github/workflows/continuous-integration-workflow-32bit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml index 87c21d3a6e7..0260cb5894a 100644 --- a/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -36,7 +36,7 @@ jobs: -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DCMAKE_CXX_FLAGS="-Werror -m32 -DKOKKOS_IMPL_32BIT" \ + -DCMAKE_CXX_FLAGS="-Werror -m32" \ -DCMAKE_CXX_COMPILER=g++ \ -DCMAKE_BUILD_TYPE=RelWithDebInfo - name: Build From ccadc7d9ba2be086be38a8f1731c5df9339fbe06 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 May 2024 14:15:50 -0400 Subject: [PATCH 040/103] Disable failing parallel_scan_with_reducers test --- core/unit_test/TestTeamVector.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 5e16539d652..e278789992f 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -1067,6 +1067,10 @@ TEST(TEST_CATEGORY, parallel_scan_with_reducers) { } #endif +#ifdef KOKKOS_IMPL_32BIT + GTEST_SKIP() << "Failing KOKKOS_IMPL_32BIT"; // FIXME_32BIT +#endif + checkScan>() .run(); From d61d75aceceacf4c3b5a6463626f58826ce47849 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Wed, 8 May 2024 07:41:50 -0400 Subject: [PATCH 041/103] Fix a bug when using realloc on views of non-default constructible element types (#6993) * Add few missing constexpr for alloc_prop_input Co-authored-by: Daniel Arndt * Update tests * Fix DualView * Address review comments * Add missing decorators * Move NoDefaultConstructor out of function --------- Co-authored-by: Daniel Arndt --- containers/src/Kokkos_DualView.hpp | 8 ++++---- core/src/Kokkos_CopyViews.hpp | 10 ++++++++-- core/unit_test/TestRealloc.hpp | 13 +++++++++++++ core/unit_test/TestResize.hpp | 13 +++++++++++++ 4 files changed, 38 
insertions(+), 6 deletions(-) diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index e821570a8d5..1fb174943fe 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -944,13 +944,13 @@ class DualView : public ViewTraits { if (sizeMismatch) { ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { h_view = create_mirror_view(typename t_host::memory_space(), d_view); } else { h_view = create_mirror_view(Kokkos::WithoutInitializing, typename t_host::memory_space(), d_view); } - } else if (alloc_prop_input::initialize) { + } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -1038,7 +1038,7 @@ class DualView : public ViewTraits { /* Resize on Device */ if (sizeMismatch) { ::Kokkos::resize(properties, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { h_view = create_mirror_view(typename t_host::memory_space(), d_view); } else { h_view = create_mirror_view(Kokkos::WithoutInitializing, @@ -1054,7 +1054,7 @@ class DualView : public ViewTraits { /* Resize on Host */ if (sizeMismatch) { ::Kokkos::resize(properties, h_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { d_view = create_mirror_view(typename t_dev::memory_space(), h_view); } else { diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index ee8d1e09d3a..40fdd590f6f 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3235,7 +3235,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, v = view_type(); // Best effort to deallocate in case no other view refers // to the shared allocation v = view_type(arg_prop_copy, n0, n1, 
n2, n3, n4, n5, n6, n7); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -3330,7 +3333,10 @@ impl_realloc(Kokkos::View& v, if (v.layout() != layout) { v = view_type(); // Deallocate first, if the only view to allocation v = view_type(arg_prop, layout); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); diff --git a/core/unit_test/TestRealloc.hpp b/core/unit_test/TestRealloc.hpp index 2c9dc5ee473..f30c9e15e1c 100644 --- a/core/unit_test/TestRealloc.hpp +++ b/core/unit_test/TestRealloc.hpp @@ -144,6 +144,11 @@ void impl_testRealloc() { EXPECT_EQ(oldPointer, newPointer); } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; template void testRealloc() { @@ -154,6 +159,14 @@ void testRealloc() { impl_testRealloc(); // without data initialization } + // Check #6992 fix (no default initialization in realloc without initializing) + { + using view_type = Kokkos::View; + view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + realloc_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewRealloc diff --git a/core/unit_test/TestResize.hpp b/core/unit_test/TestResize.hpp index 13d7e16d589..3102d2b9a16 100644 --- a/core/unit_test/TestResize.hpp +++ b/core/unit_test/TestResize.hpp @@ -358,6 +358,12 @@ void impl_testResize() { } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; + template void testResize() { { @@ -367,6 +373,13 @@ void testResize() { impl_testResize(); // without data initialization } + { + using view_type = Kokkos::View; + 
view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + resize_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewResize From 50a862cf63d532d3de6d7dc2767279d0725cd05d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 May 2024 12:05:05 -0400 Subject: [PATCH 042/103] SYCL: Prepare Parallel* for Graphs (#6988) * SYCL: Make Parallel* copyable * Address review comments * Refactor Team policies further * Fix alias for SYCL TeamPolicy ParallelReduce * Improve const-correctness in Kokkos_SYCL_ParallelReduce_Team * Fix up Kokkos_SYCL_ParallelReduce_Team.hpp --- .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 6 -- .../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 6 -- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 53 ++++++------ .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 14 ++-- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 14 ++-- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 84 +++++++++---------- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 14 ++-- 7 files changed, 83 insertions(+), 108 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index 7fbf5420f83..b58885192b9 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -181,12 +181,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy), diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index b4de7eb89ff..2f8db922d3d 100644 --- 
a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -137,12 +137,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy) {} }; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index d98f4837315..57ff97e7f31 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -28,7 +28,7 @@ template class Kokkos::Impl::ParallelFor, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using functor_type = FunctorType; using size_type = ::Kokkos::Experimental::SYCL::size_type; @@ -44,19 +44,14 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl_device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor instance at a time use the team scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; template - sycl::event sycl_direct_launch(const Policy& policy, + sycl::event sycl_direct_launch(const sycl_device_ptr global_scratch_ptr, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); + const Kokkos::Experimental::SYCL& space = m_policy.space(); sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -72,7 +67,6 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -125,17 +119,31 @@ class Kokkos::Impl::ParallelFor, inline void execute() const { if (m_league_size == 0) return; - auto& space = *m_policy.space().impl_internal_space_instance(); + auto& instance = *m_policy.space().impl_internal_space_instance(); + + // Only let one instance at a time resize the instance's scratch memory + // allocations. + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = space.get_indirect_kernel_mem(); + indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( m_functor, indirectKernelMem); - sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, + sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); - space.register_team_scratch_event(m_scratch_pool_id, event); + instance.register_team_scratch_event(scratch_pool_id, event); } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -143,10 +151,7 @@ class Kokkos::Impl::ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = @@ -159,22 +164,14 @@ class Kokkos::Impl::ParallelFor, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. 
- auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const auto& instance = *m_policy.space().impl_internal_space_instance(); + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! " "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index c1414ee0581..79f8afd4a3d 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -77,9 +77,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( - m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -330,6 +328,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -349,10 +353,6 @@ class Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index dbe2366b8bc..2bad7749759 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -50,9 +50,7 @@ class 
Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( - p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -347,6 +345,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -366,10 +370,6 @@ class Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 1332fafde94..43c6ca44019 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -30,7 +30,7 @@ class Kokkos::Impl::ParallelReduce, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; @@ -54,24 +54,18 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; const size_type m_vector_size; - // Only let one ParallelReduce instance at a time use the team scratch memory - // and the host scratch memory. The constructor acquires the mutex which is - // released in the destructor. 
- std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; - template + template sycl::event sycl_direct_launch( - const PolicyType& policy, + const sycl_device_ptr global_scratch_ptr, const CombinedFunctorReducerWrapper& functor_reducer_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); + const Kokkos::Experimental::SYCL& space = m_policy.space(); Kokkos::Experimental::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -113,7 +107,6 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -170,7 +163,6 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = @@ -386,6 +378,22 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -395,14 +403,24 @@ class Kokkos::Impl::ParallelReduce + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = m_policy.team_size_recommended( @@ -423,22 +441,15 @@ class Kokkos::Impl::ParallelReduce>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const Kokkos::Experimental::Impl::SYCLInternal& instance = + *m_policy.space().impl_internal_space_instance(); + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! 
" "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } @@ -448,25 +459,6 @@ class Kokkos::Impl::ParallelReduce requested too large team size."); } - - public: - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { - initialize(); - } }; #endif diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index bfc3fba7412..b3d3e9e35ce 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -150,10 +150,6 @@ class ParallelScanSYCLBase { pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one ParallelScan instance at a time use the host scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_buffers_lock; - private: template sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, @@ -367,6 +363,11 @@ class ParallelScanSYCLBase { auto& instance = *m_policy.space().impl_internal_space_instance(); + // Only let one instance at a time resize the instance's scratch memory + // allocations. 
+ std::scoped_lock scratch_buffers_lock( + instance.m_mutexScratchSpace); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -385,10 +386,7 @@ class ParallelScanSYCLBase { : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_scratch_buffers_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} }; } // namespace Kokkos::Impl From f5b34222c166c71e86ea44fd7867d443ba25856e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 May 2024 13:27:28 -0400 Subject: [PATCH 043/103] SYCL: Fix deprecation in custom parallel_for RangePolicy implementation --- core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index b4de7eb89ff..341c6c335d4 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -43,8 +43,8 @@ template struct FunctorWrapperRangePolicyParallelForCustom { using WorkTag = typename Policy::work_tag; - void operator()(sycl::item<1> item) const { - const typename Policy::index_type id = item.get_linear_id(); + void operator()(sycl::nd_item<1> item) const { + const typename Policy::index_type id = item.get_global_linear_id(); if (id < m_work_size) { const auto shifted_id = id + m_begin; if constexpr (std::is_void_v) From 37986fde4cee878aa4d9f60ae11f7ea6c80976ff Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 8 May 2024 14:18:41 -0600 Subject: [PATCH 044/103] [ci skip] update changelog for 4.3.1 (#6995) * [ci skip] update changelog for 4.3.1 * changelog: fixup --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 
insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c70ee5505f8..f8d288db5da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # CHANGELOG +## [4.3.01](https://github.com/kokkos/kokkos/tree/4.3.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.00...4.3.01) + +### Backend and Architecture Enhancements: + +#### HIP: +* MI300 support unified memory [\#6877](https://github.com/kokkos/kokkos/pull/6877) + +### Bug Fixes +* Serial: Use the provided execution space instance in TeamPolicy [\#6951](https://github.com/kokkos/kokkos/pull/6951) +* `nvcc_wrapper`: bring back support for `--fmad` option [\#6931](https://github.com/kokkos/kokkos/pull/6931) +* Fix CUDA reduction overflow for `RangePolicy` [\#6578](https://github.com/kokkos/kokkos/pull/6578) + ## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) From 7cad3e7c3b66bf3e6f4de75bd9043abde9f8194a Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Wed, 8 May 2024 14:01:34 -0700 Subject: [PATCH 045/103] OpenMPTarget: Use mutex lock for parallel scan. --- .../Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index c1f7851f413..c886c397966 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -48,6 +48,10 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; + // Only let one ParallelScan instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_memory_lock; + template std::enable_if_t::value> call_with_tag( const FunctorType& f, const idx_type& idx, value_type& val, @@ -197,7 +201,8 @@ class ParallelScan, : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} //---------------------------------------- }; From 00170ae80cb54b39dd11f77dcef4318c754afd5a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 9 May 2024 11:33:01 -0400 Subject: [PATCH 046/103] Remove cuSPARSE TPL It looks like an oversight. It is unused. Example code that referred to it was removed in #2688 because it was just sitting there, i.e. not built nor tested. KokkosKernels has its own CMake logic to find it and link against it. --- cmake/Dependencies.cmake | 1 - cmake/deps/CUDA.cmake | 1 - cmake/deps/CUSPARSE.cmake | 26 -------------------------- cmake/tpls/FindTPLCUSPARSE.cmake | 26 -------------------------- 4 files changed, 54 deletions(-) delete mode 100644 cmake/deps/CUSPARSE.cmake delete mode 100644 cmake/tpls/FindTPLCUSPARSE.cmake diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 611c089b2e3..fb1e73b5799 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - TEST_OPTIONAL_TPLS CUSPARSE ) TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) diff --git a/cmake/deps/CUDA.cmake b/cmake/deps/CUDA.cmake index 68bf5b3d579..5b6afd61512 100644 --- a/cmake/deps/CUDA.cmake +++ b/cmake/deps/CUDA.cmake @@ -35,7 +35,6 @@ IF(NOT _CUDA_FAILURE) GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) 
- KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) ELSE() SET(TPL_ENABLE_CUDA OFF) ENDIF() diff --git a/cmake/deps/CUSPARSE.cmake b/cmake/deps/CUSPARSE.cmake deleted file mode 100644 index b016971ab91..00000000000 --- a/cmake/deps/CUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ************************************************************************ -# @HEADER - -#include(${TRIBITS_DEPS_DIR}/CUDA.cmake) - -#IF (TPL_ENABLE_CUDA) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) -# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -# KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) -#ENDIF() - diff --git a/cmake/tpls/FindTPLCUSPARSE.cmake b/cmake/tpls/FindTPLCUSPARSE.cmake deleted file mode 100644 index 4709f8002b1..00000000000 --- a/cmake/tpls/FindTPLCUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#@HEADER - -# Check for CUDA support - -IF (NOT TPL_ENABLE_CUDA) - MESSAGE(FATAL_ERROR "\nCUSPARSE requires CUDA") -ELSE() - GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) - GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -ENDIF() - From 1d9d0df2eecfce635fe5c77559eb17adfa128d04 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 10 May 2024 11:49:19 -0400 Subject: [PATCH 047/103] SYCL: Print submission command queue property (#7004) * SYCL: Print submission command queue property * Also print SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS if set * Reword printout for environment variable Co-authored-by: Damien L-G --------- Co-authored-by: Damien L-G --- core/src/SYCL/Kokkos_SYCL.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 9a246f7642f..de5ddf405d4 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -110,6 +110,26 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif +#ifdef SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST + if (sycl_queue() + .has_property< + sycl::ext::intel::property::queue::immediate_command_list>()) + os << "Immediate command lists enforced\n"; + else if (sycl_queue() + .has_property()) + os << "Standard command queue enforced\n"; + else +#endif + { + os << "Immediate command lists and standard command queue allowed.\n"; + if (const char* environment_setting = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS")) + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=" + << environment_setting << " takes precedence.\n"; + else + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS not defined.\n"; + } int counter = 0; int active_device = Kokkos::device_id(); From 
cadab6c1ed26cfef885fdb29c6d3eace98862f3e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 7 May 2024 17:35:21 -0400 Subject: [PATCH 048/103] Test DualView resize/realloc for types without default constructor --- containers/unit_tests/TestDualView.hpp | 61 ++++++++++++++++++++------ 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/containers/unit_tests/TestDualView.hpp b/containers/unit_tests/TestDualView.hpp index a15e5fa2997..ecb06d1c652 100644 --- a/containers/unit_tests/TestDualView.hpp +++ b/containers/unit_tests/TestDualView.hpp @@ -282,15 +282,20 @@ struct test_dualview_resize { const unsigned int m = 5; const unsigned int factor = 2; - ViewType a("A", n, m); + ViewType a; + if constexpr (Initialize) + a = ViewType("A", n, m); + else + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::deep_copy(a.d_view, 1); /* Covers case "Resize on Device" */ a.modify_device(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); - else + if constexpr (Initialize) Kokkos::resize(a, factor * n, factor * m); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); ASSERT_EQ(a.extent(0), n * factor); ASSERT_EQ(a.extent(1), m * factor); @@ -305,7 +310,7 @@ struct test_dualview_resize { Kokkos::parallel_reduce( Kokkos::RangePolicy(0, a.d_view.extent(0)), SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::Sum(a_d_sum)); // Check host view is synced as expected scalar_type a_h_sum = 0; @@ -321,10 +326,10 @@ struct test_dualview_resize { /* Covers case "Resize on Host" */ a.modify_host(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); - else + if constexpr (Initialize) Kokkos::resize(a, n / factor, m / factor); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); ASSERT_EQ(a.extent(0), n / factor); ASSERT_EQ(a.extent(1), m / factor); @@ -339,7 +344,7 @@ struct test_dualview_resize { 
Kokkos::parallel_reduce( Kokkos::RangePolicy(0, a.d_view.extent(0)), SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::Sum(a_d_sum)); // Check host view is synced as expected a_h_sum = 0; @@ -369,13 +374,17 @@ struct test_dualview_realloc { const unsigned int n = 10; const unsigned int m = 5; - ViewType a("A", n, m); - if (Initialize) - Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); - else + ViewType a; + if constexpr (Initialize) { + a = ViewType("A", n, m); Kokkos::realloc(a, n, m); + } else { + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); + } Kokkos::deep_copy(a.d_view, 1); + a.modify_device(); a.sync_host(); @@ -387,7 +396,7 @@ struct test_dualview_realloc { Kokkos::parallel_reduce( Kokkos::RangePolicy(0, a.d_view.extent(0)), SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::Sum(a_d_sum)); // Check host view is synced as expected scalar_type a_h_sum = 0; @@ -463,12 +472,36 @@ TEST(TEST_CATEGORY, dualview_deep_copy) { test_dualview_deep_copy(); } +struct NoDefaultConstructor { + NoDefaultConstructor(int i_) : i(i_) {} + + operator int() const { return i; } + NoDefaultConstructor& operator+=(const NoDefaultConstructor& other) { + i += other.i; + return *this; + } + + int i; +}; +} // namespace Test + +template <> +struct Kokkos::reduction_identity { + static Test::NoDefaultConstructor sum() { return {0}; } +}; + +namespace Test { + TEST(TEST_CATEGORY, dualview_realloc) { test_dualview_realloc(); + Impl::test_dualview_realloc(); } TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize(); + Impl::test_dualview_resize(); } namespace { From df018d97f52a7bfa29cf7d29fbda42f244001f40 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 13 May 2024 22:59:25 +0200 Subject: [PATCH 049/103] Suppress deprecated warnings via pragma push/pop in the tests (#6999) * Introduce `KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_{PUSH,POP} macros to suppress diagnostics when 
appropriate * Suppress all deprecated warnings I can see in tests * Update EDG diag suppress to fix the Intel Compiler Classic and provide a fallback empty definition for the macros --- containers/unit_tests/TestVector.hpp | 2 ++ core/src/Kokkos_Macros.hpp | 25 +++++++++++++++++++ core/unit_test/TestArrayOps.hpp | 2 ++ .../incremental/Test01_execspace.hpp | 2 ++ simd/unit_tests/include/SIMDTesting_Ops.hpp | 2 ++ 5 files changed, 33 insertions(+) diff --git a/containers/unit_tests/TestVector.hpp b/containers/unit_tests/TestVector.hpp index a7d341b789d..19901a52ad5 100644 --- a/containers/unit_tests/TestVector.hpp +++ b/containers/unit_tests/TestVector.hpp @@ -21,6 +21,8 @@ #include #include #include +#include +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #include namespace Test { diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index b255d2a5195..27b32b15214 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -562,6 +562,31 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_WARNING(desc) KOKKOS_IMPL_DO_PRAGMA(message(#desc)) #endif +// clang-format off +#if defined(__EDG__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning push") \ + _Pragma("warning disable 1478") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning pop") +#elif defined(__GNUC__) || defined(__clang__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("GCC diagnostic pop") +#elif defined(_MSC_VER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning(push)") \ + _Pragma("warning(disable: 4996)") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning(pop)") +#else + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #define 
KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif +// clang-format on + #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ diff --git a/core/unit_test/TestArrayOps.hpp b/core/unit_test/TestArrayOps.hpp index 387589fbe88..6b8e0f3aca3 100644 --- a/core/unit_test/TestArrayOps.hpp +++ b/core/unit_test/TestArrayOps.hpp @@ -112,6 +112,7 @@ TEST(TEST_CATEGORY, array_zero_data_nullptr) { } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() TEST(TEST_CATEGORY, array_contiguous_capacity) { using A = Kokkos::Array::contiguous>; @@ -390,6 +391,7 @@ TEST(TEST_CATEGORY, array_strided_assignment) { ASSERT_EQ(e.max_size(), std::size(ee) / eStride); ASSERT_EQ(e[0], ee[0]); } +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif } // namespace diff --git a/core/unit_test/incremental/Test01_execspace.hpp b/core/unit_test/incremental/Test01_execspace.hpp index d7b2a57b442..a7fa26c7282 100644 --- a/core/unit_test/incremental/Test01_execspace.hpp +++ b/core/unit_test/incremental/Test01_execspace.hpp @@ -63,7 +63,9 @@ struct TestIncrExecSpace { ASSERT_GT(concurrency, 0); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() int in_parallel = ExecSpace::in_parallel(); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() ASSERT_FALSE(in_parallel); #endif diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index c587ccf3046..74141f25316 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -81,7 +81,9 @@ class absolutes { auto on_host(T const& a) const { if constexpr (std::is_signed_v) { #if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() return Kokkos::Experimental::abs(a); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #else return Kokkos::abs(a); #endif From da8be22574e62efc901bc9da540a863d98b8492a Mon 
Sep 17 00:00:00 2001 From: Seyong Lee Date: Wed, 15 May 2024 15:21:52 -0400 Subject: [PATCH 050/103] This PR changes the default execution behavior of the parallel_for(team-policy) constructs in the OpenACC backend. - This PR handles a missing case not covered by the previous PR #6772 This PR also fixes the OpenACC backend error in the thread-safety test in PR #6938. --- core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp | 8 ++++++-- core/unit_test/CMakeLists.txt | 1 - 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp index 4fce680aef0..2b98018e3bb 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp @@ -44,10 +44,12 @@ class Kokkos::Impl::ParallelFor, auto team_size = m_policy.team_size(); auto vector_length = m_policy.impl_vector_length(); + int const async_arg = m_policy.space().acc_async_queue(); + auto const a_functor(m_functor); #pragma acc parallel loop gang vector num_gangs(league_size) \ - vector_length(team_size* vector_length) copyin(a_functor) + vector_length(team_size* vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size * team_size * vector_length; i++) { int league_id = i / (team_size * vector_length); typename Policy::member_type team(league_id, league_size, team_size, @@ -145,10 +147,12 @@ class Kokkos::Impl::ParallelFor, auto team_size = m_policy.team_size(); auto vector_length = m_policy.impl_vector_length(); + int const async_arg = m_policy.space().acc_async_queue(); + auto const a_functor(m_functor); #pragma acc parallel loop gang num_gangs(league_size) num_workers(team_size) \ - vector_length(vector_length) copyin(a_functor) + vector_length(vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size; i++) { int league_id = i; typename Policy::member_type team(league_id, league_size, team_size, diff 
--git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 4d0ce3b22e3..4344b74e5e7 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -427,7 +427,6 @@ if(Kokkos_ENABLE_OPENACC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ExecSpaceThreadSafety.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp From 2b7b98a1a6e9138389813c6e4115672459f02195 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 16 May 2024 13:30:18 +0000 Subject: [PATCH 051/103] Use parallel_for instead of parallel_reduce for check --- containers/unit_tests/TestDualView.hpp | 113 ++++++++++++------------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/containers/unit_tests/TestDualView.hpp b/containers/unit_tests/TestDualView.hpp index ecb06d1c652..2512cb5c491 100644 --- a/containers/unit_tests/TestDualView.hpp +++ b/containers/unit_tests/TestDualView.hpp @@ -55,8 +55,8 @@ struct test_dualview_alloc { bool result = false; test_dualview_alloc(unsigned int size) { - result = run_me >( - size, 3); + result = + run_me>(size, 3); } }; @@ -154,7 +154,7 @@ struct test_dualview_combinations { } test_dualview_combinations(unsigned int size, bool with_init) { - result = run_me >( + result = run_me>( size, 3, with_init); } }; @@ -253,21 +253,18 @@ struct test_dual_view_deep_copy { } // end run_me test_dual_view_deep_copy() { - run_me >(10, 5, - true); - run_me >(10, 5, - false); + run_me>(10, 5, true); + run_me>(10, 5, + false); // Test zero length but allocated (a.d_view.data!=nullptr but // a.d_view.span()==0) - run_me >(0, 5, true); - run_me >(0, 5, - false); + 
run_me>(0, 5, true); + run_me>(0, 5, false); // Test default constructed view - run_me >(-1, 5, - true); - run_me >(-1, 5, - false); + run_me>(-1, 5, true); + run_me>(-1, 5, + false); } }; @@ -303,25 +300,30 @@ struct test_dualview_resize { a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - Kokkos::Sum(a_d_sum)); + Kokkos::View errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, a_d_sum); - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); /* Covers case "Resize on Host" */ a.modify_host(); @@ -337,30 +339,33 @@ struct test_dualview_resize { a.sync_device(Kokkos::DefaultExecutionSpace{}); // Check device view is initialized as expected - a_d_sum = 0; + Kokkos::deep_copy(errors_d, 0); // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - Kokkos::Sum(a_d_sum)); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), 
a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - a_h_sum = 0; + errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_resize() { - run_me >(); + run_me>(); } }; @@ -382,6 +387,8 @@ struct test_dualview_realloc { a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); } + ASSERT_EQ(a.extent(0), n); + ASSERT_EQ(a.extent(1), m); Kokkos::deep_copy(a.d_view, 1); @@ -389,29 +396,34 @@ struct test_dualview_realloc { a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - Kokkos::Sum(a_d_sum)); + Kokkos::View errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, 
scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_realloc() { - run_me >(); + run_me>(); } }; @@ -474,23 +486,10 @@ TEST(TEST_CATEGORY, dualview_deep_copy) { struct NoDefaultConstructor { NoDefaultConstructor(int i_) : i(i_) {} - - operator int() const { return i; } - NoDefaultConstructor& operator+=(const NoDefaultConstructor& other) { - i += other.i; - return *this; - } + KOKKOS_FUNCTION operator int() const { return i; } int i; }; -} // namespace Test - -template <> -struct Kokkos::reduction_identity { - static Test::NoDefaultConstructor sum() { return {0}; } -}; - -namespace Test { TEST(TEST_CATEGORY, dualview_realloc) { test_dualview_realloc(); From fc4383ab6f7a200cd2557f68c042bf2b59e8fa97 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Sun, 19 May 2024 09:26:31 -0500 Subject: [PATCH 052/103] Fix unique_any_senders nvcc template deduction --- core/src/HPX/Kokkos_HPX.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index 6d541a64148..1f3d0783449 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -153,7 +153,7 @@ void HPX::impl_instance_fence_locked(const std::string &name) const { auto &s = impl_get_sender(); hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } @@ -184,7 +184,7 @@ void HPX::impl_static_fence(const std::string &name) { } hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } From 226aecfb8c161042d88421ac0176aa3c6d697fb6 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 20 May 2024 13:01:30 -0400 
Subject: [PATCH 053/103] Properly guard deprecated `Kokkos_Vector.hpp` header self contained test (#7016) * Properly guard deprecated header self contained test registration * Unconditionally remove the Kokkos_Vector.hpp header self contained test * On second thought prefer guards * Fix typo disa[b]led Co-authored-by: Daniel Arndt --------- Co-authored-by: Daniel Arndt --- core/unit_test/headers_self_contained/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/unit_test/headers_self_contained/CMakeLists.txt b/core/unit_test/headers_self_contained/CMakeLists.txt index f792b03ed88..4c364ceee75 100644 --- a/core/unit_test/headers_self_contained/CMakeLists.txt +++ b/core/unit_test/headers_self_contained/CMakeLists.txt @@ -10,7 +10,8 @@ file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE ${BASE_DIR}/algorithms/src ${BASE_DIR}/algorithms/src/*.hpp) -if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) +# erroring out when deprecated code is disabled and raising warnings that are treated as errors in the CI otherwise +if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 OR Kokkos_ENABLE_DEPRECATION_WARNINGS) list(REMOVE_ITEM KOKKOS_CONTAINERS_HEADERS "Kokkos_Vector.hpp") endif() From 81b63c5c5e6bb1b076f64bf6f9c975bba2aeddc6 Mon Sep 17 00:00:00 2001 From: Nicolas Morales Date: Mon, 20 May 2024 14:36:59 -0700 Subject: [PATCH 054/103] mdspan converting constructors (#6830) This PR adds conversions between mdspan and View - a (for now internal) "natural" mdspan type is introduced for View - constructors which take that "natural" mdspan type are added - to_mdspan and operator mdspan are introduced to get an mdspan from View - both leverage the "natural" mdspan type - all of them are restricted to the layout types we actually can convert - some error checks happen at runtime regarding padded Kokkos::View - includes some necessary updates for mdspan ppl - note we expect to do a clean mdspan merge before 
release --- core/src/Kokkos_View.hpp | 97 ++++ .../src/View/MDSpan/Kokkos_MDSpan_Extents.hpp | 19 +- core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp | 148 +++++ core/src/impl/Kokkos_ViewMapping.hpp | 254 ++++++--- core/unit_test/CMakeLists.txt | 7 +- core/unit_test/TestMDSpanConversion.hpp | 504 ++++++++++++++++++ .../view/TestExtentsDatatypeConversion.cpp | 11 +- .../__p0009_bits/layout_stride.hpp | 44 ++ .../experimental/__p0009_bits/macros.hpp | 5 + .../__p2642_bits/layout_padded.hpp | 484 +++++++++-------- 10 files changed, 1263 insertions(+), 310 deletions(-) create mode 100644 core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp create mode 100644 core/unit_test/TestMDSpanConversion.hpp diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 09c6e780ef5..a6c6c955b87 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -38,6 +38,7 @@ static_assert(false, #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include +#include #endif #include @@ -372,6 +373,32 @@ struct ViewTraits { //------------------------------------ }; +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace Impl { +struct UnsupportedKokkosArrayLayout; + +template +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. +template +struct MDSpanViewTraits::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType::type; + using mdspan_layout_type = + typename Impl::LayoutFromArrayLayout::type; + using mdspan_type = + mdspan; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + /** \class View * \brief View to an array of data. * @@ -1722,6 +1749,76 @@ class View : public ViewTraits { "Layout is not constructible from extent arguments. 
Use " "overload taking a layout object instead."); } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template ::mdspan_type> + KOKKOS_INLINE_FUNCTION +#if defined(__cpp_conditional_explicit) && \ + (__cpp_conditional_explicit >= 201806L) + // FIXME C++20 reevaluate after determining minimum compiler versions + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits::mdspan_type>( + mds.mapping())) { + } + + template + KOKKOS_INLINE_FUNCTION +#if defined(__cpp_conditional_explicit) && \ + (__cpp_conditional_explicit >= 201806L) + // FIXME C++20 reevaluate after determining minimum compiler versions + explicit(!std::is_convertible_v< + Kokkos::mdspan, + typename Impl::MDSpanViewTraits::mdspan_type>) +#endif + View(const Kokkos::mdspan& mds) + : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template , + typename Impl::MDSpanViewTraits::mdspan_type>>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map)}; + } + + template , + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = OtherAccessorType()) { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + using ret_mdspan_type = + mdspan; + return ret_mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN }; template diff --git 
a/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp
b/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp index 3846b52d239..29d1e00adfc 100644 --- a/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp +++ b/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp @@ -37,9 +37,6 @@ struct ViewDimension; template struct ViewDataType; -} // namespace Kokkos::Impl - -namespace Kokkos::Experimental::Impl { // A few things to note -- // - mdspan allows for 0-rank extents similarly to View, so we don't need @@ -106,6 +103,20 @@ struct DataTypeFromExtents { // Will cause a compile error if it is malformed (i.e. dynamic after static) using type = typename ::Kokkos::Impl::ViewDataType::type; }; -} // namespace Kokkos::Experimental::Impl + +template +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping_impl( + const VM &view_mapping, std::index_sequence) { + return Extents{view_mapping.extent(Indices)...}; +} + +template +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping( + const VM &view_mapping) { + static_assert(Extents::rank() == VM::Rank); + return extents_from_view_mapping_impl( + view_mapping, std::make_index_sequence{}); +} +} // namespace Kokkos::Impl #endif // KOKKOS_EXPERIMENTAL_MDSPAN_EXTENTS_HPP diff --git a/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp new file mode 100644 index 00000000000..8073dee1eed --- /dev/null +++ b/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -0,0 +1,148 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP +#define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP + +#include "Kokkos_MDSpan_Extents.hpp" +#include + +namespace Kokkos::Impl { + +template +struct LayoutFromArrayLayout; + +template <> +struct LayoutFromArrayLayout { + using type = Experimental::layout_left_padded; +}; + +template <> +struct LayoutFromArrayLayout { + using type = Experimental::layout_right_padded; +}; + +template <> +struct LayoutFromArrayLayout { + using type = layout_stride; +}; + +template +KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( + const typename MDSpanType::mapping_type &mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + constexpr auto rank = extents_type::rank(); + const auto &ext = mapping.extents(); + + static_assert(rank <= ARRAY_LAYOUT_MAX_RANK, + "Unsupported rank for mdspan (must be <= 8)"); + + if constexpr (std::is_same_v) { + return Kokkos::LayoutStride{ + rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 0 ? mapping.stride(0) : 0, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? mapping.stride(1) : 0, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? mapping.stride(2) : 0, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? mapping.stride(3) : 0, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? mapping.stride(4) : 0, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? mapping.stride(5) : 0, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? mapping.stride(6) : 0, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? 
mapping.stride(7) : 0, + }; + } else { + // FIXME: Kokkos Layouts don't store stride (it's in the mapping) + // We could conceivably fix this by adding an extra ViewCtorProp for + // an arbitrary padding. For now we will check for this. + if constexpr (rank > 1 && + (std::is_same_v< + typename mapping_type::layout_type, + Experimental::layout_left_padded> || + std::is_same_v< + typename mapping_type::layout_type, + Experimental::layout_right_padded>)) { + [[maybe_unused]] constexpr size_t strided_index = + std::is_same_v> + ? 1 + : rank - 2; + [[maybe_unused]] constexpr size_t extent_index = + std::is_same_v> + ? 0 + : rank - 1; + KOKKOS_ASSERT(mapping.stride(strided_index) == ext.extent(extent_index)); + } + + return ArrayLayout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + } +} + +template +KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + // std::span is not available in C++17 (our current requirements), + // so we need to use the std::array constructor for layout mappings.
+ // FIXME When C++20 is available, we can use std::span here instead + std::size_t strides[VM::Rank]; + view_mapping.stride_fill(&strides[0]); + if constexpr (std::is_same_v) { + return mapping_type(Kokkos::mdspan_non_standard, + extents_from_view_mapping(view_mapping), + strides); + } else if constexpr (VM::Rank > 1 && + std::is_same_v>) { + return mapping_type(extents_from_view_mapping(view_mapping), + strides[1]); + } else if constexpr (VM::Rank > 1 && + std::is_same_v>) { + return mapping_type(extents_from_view_mapping(view_mapping), + strides[VM::Rank - 2]); + } else { + return mapping_type(extents_from_view_mapping(view_mapping)); + } +} + +} // namespace Kokkos::Impl + +#endif // KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 3217c76e380..c37112be896 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -647,34 +647,60 @@ struct ViewOffset< m_dim.N5 * m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. 
Preconditions: s must be an array of + // dimension_type::rank elements + // FIXME: The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_dim.N0; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_dim.N0; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. 
Preconditions: s must be an + // array of dimension_type::rank + 1 elements Stride with [ rank ] value is + // the total length + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -935,34 +961,59 @@ struct ViewOffset< m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_stride; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_stride; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + 
// clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1286,42 +1337,58 @@ struct ViewOffset< m_dim.N1; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; n *= m_dim.N1; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = n; } - s[dimension_type::rank] = n * m_dim.N0; + return n * m_dim.N0; + } + // clang-format on + + // 
Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1573,41 +1640,57 @@ struct ViewOffset< return m_stride; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride; } - s[dimension_type::rank] = m_stride * m_dim.N0; + return m_stride * m_dim.N0; + } + // clang-format on + + // Fill the target 
unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2133,34 +2216,50 @@ struct ViewOffset { return m_stride.S7; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - if (0 < dimension_type::rank) { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride.S0; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = m_stride.S1; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = m_stride.S2; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = m_stride.S3; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = m_stride.S4; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = m_stride.S5; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = m_stride.S6; } - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = m_stride.S7; } - s[dimension_type::rank] = span(); + return span(); + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. 
This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2814,11 +2913,24 @@ class ViewMapping< return m_impl_offset.stride_7(); } + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements template KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { m_impl_offset.stride(s); } + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + template + KOKKOS_INLINE_FUNCTION iType stride_fill(iType* const s) const { + return m_impl_offset.stride_fill(s); + } + //---------------------------------------- // Range span diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 4344b74e5e7..5df8d1e2cf8 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -174,7 +174,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) endforeach() set(${Tag}_SOURCES1B) - foreach(Name + set(${Tag}_TESTNAMES1B MDRange_a MDRange_b MDRange_c @@ -185,6 +185,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDRangePolicyConstructors MDRangeReduce MDSpan + MDSpanConversion MinMaxClamp NumericTraits OccupancyControlTrait @@ -206,6 +207,10 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) SharedAlloc Swap ) + IF (NOT Kokkos_ENABLE_IMPL_MDSPAN) + LIST(REMOVE_ITEM 
${Tag}_TESTNAMES1B MDSpanConversion) + ENDIF() + foreach(Name IN LISTS ${Tag}_TESTNAMES1B) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. diff --git a/core/unit_test/TestMDSpanConversion.hpp b/core/unit_test/TestMDSpanConversion.hpp new file mode 100644 index 00000000000..6519a7c277d --- /dev/null +++ b/core/unit_test/TestMDSpanConversion.hpp @@ -0,0 +1,504 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include +#include "experimental/__p0009_bits/layout_stride.hpp" + +namespace { + +template +struct TestViewMDSpanConversion { + using value_type = T; + + template + using layout_left_padded = Kokkos::Experimental::layout_left_padded; + + template + using layout_right_padded = + Kokkos::Experimental::layout_right_padded; + + struct TestAccessor { + using offset_policy = TestAccessor; + using element_type = value_type; + using reference = element_type &; + using data_handle_type = element_type *; + + constexpr TestAccessor() noexcept = default; + constexpr reference access(data_handle_type p, std::size_t i) noexcept { + return p[i]; + } + constexpr data_handle_type offset(data_handle_type p, + std::size_t i) noexcept { + return p + i; + } + }; + + template + static void test_conversion_from_mdspan( + Kokkos::View ref, + const MDSpanLayoutMapping &mapping) { + using unmanaged_view_type = + Kokkos::View>; + using 
natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename unmanaged_view_type::traits>::mdspan_type; + using mapping_type = MDSpanLayoutMapping; + using mdspan_layout_type = typename MDSpanLayoutMapping::layout_type; + using extents_type = typename mapping_type::extents_type; + using mdspan_type = + Kokkos::mdspan; + + static_assert(std::is_constructible_v); + static_assert(std::is_convertible_v == + std::is_convertible_v); + // Manually create an mdspan from ref so we have a valid pointer to play + // with + const auto &exts = mapping.extents(); + auto mds = mdspan_type{ref.data(), mapping}; + + auto test_view = unmanaged_view_type(mds); + + ASSERT_EQ(test_view.data(), ref.data()); + ASSERT_EQ(test_view.data(), mds.data_handle()); + ASSERT_EQ(test_view.layout(), ref.layout()); + for (std::size_t r = 0; r < mdspan_type::rank(); ++r) { + ASSERT_EQ(test_view.extent(r), ref.extent(r)); + ASSERT_EQ(test_view.extent(r), exts.extent(r)); + } + } + + template + static void test_conversion_to_mdspan( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v) { + using view_type = ViewType; + using natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename view_type::traits>::mdspan_type; + + static_assert(natural_mdspan_type::rank() == view_type::rank); + static_assert(std::is_same_v); + constexpr bool is_strided_layout = + std::is_same_v; + if constexpr (!is_strided_layout) { + static_assert(natural_mdspan_type::mapping_type::padding_value == + Kokkos::dynamic_extent); + } + // test conversion operator to natural mdspan + { + natural_mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + + if constexpr (!is_strided_layout && natural_mdspan_type::rank() > 1) { + ASSERT_EQ(cvt.mapping().stride(1), ref_layout_mapping.stride(1)); + } + } + // test to_mdspan() returning natural mdspan + { + auto cvt = v.to_mdspan(); + static_assert(std::is_same_v); + ASSERT_EQ(cvt.data_handle(), 
v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + // test conversion operator to different mdspan type + { + using mdspan_type = Kokkos::mdspan< + const typename natural_mdspan_type::element_type, + Kokkos::dextents, + typename natural_mdspan_type::layout_type, + typename natural_mdspan_type::accessor_type>; + mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + } + + template + static void test_conversion_to_mdspan_with_accessor( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v, + const AccessorType &a) { + auto cvt = v.to_mdspan(a); + static_assert(decltype(cvt)::rank() == ViewType::rank); + static_assert(std::is_same_v); + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + + template + using natural_mdspan_type_for_view = typename Kokkos::Impl::MDSpanViewTraits< + typename ViewType::traits>::mdspan_type; + + static void run_test() { + // Verify we can only convert to compatible mdspans + static_assert(std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + static_assert( + std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Do not cast const away + static_assert(!std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Mismatched dim + static_assert(!std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Mismatched layouts + static_assert( + !std::is_convertible_v, + natural_mdspan_type_for_view>>); + static_assert( + !std::is_convertible_v, + natural_mdspan_type_for_view>>); + // nvcc doesn't do CTAD properly here, making this way more verbose.. 
+ // LayoutLeft + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + test_conversion_from_mdspan( + Kokkos::View("ref", + 7, 3), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7, 3)}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7, 3)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7, 3), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + // LayoutRight + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + test_conversion_from_mdspan( + Kokkos::View("ref", + 3, 7), + typename 
layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(3, 7)}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(3, 7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 3, 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + // LayoutStride + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::dextents{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, {}, strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::dextents{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents(7, 3), strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + { + const size_t strides[] = {2, 4}; + 
test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents(7, 3), strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + + // Conversion to mdspan + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4)); + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7)); + + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", + 4)); + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7)); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5})); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9})); + } + + // Aligned types (for padded layouts) + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 128), + Kokkos::View( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 127, 7)); + + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 128), + Kokkos::View( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 7, 127)); + + // Conversion with standard default_accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + Kokkos::default_accessor{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + 
Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7), + Kokkos::default_accessor{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + Kokkos::default_accessor{}); + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7), + Kokkos::default_accessor{}); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5}), + Kokkos::default_accessor{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + Kokkos::default_accessor{}); + } + + // Conversion with a test accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7), + TestAccessor{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7), + TestAccessor{}); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5}), + TestAccessor{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + 
TestAccessor{}); + } + } +}; + +TEST(TEST_CATEGORY, view_mdspan_conversion) { + TestViewMDSpanConversion::run_test(); + TestViewMDSpanConversion::run_test(); + TestViewMDSpanConversion::run_test(); +} + +} // namespace diff --git a/core/unit_test/view/TestExtentsDatatypeConversion.cpp b/core/unit_test/view/TestExtentsDatatypeConversion.cpp index b95890614e0..1b9b2a36819 100644 --- a/core/unit_test/view/TestExtentsDatatypeConversion.cpp +++ b/core/unit_test/view/TestExtentsDatatypeConversion.cpp @@ -23,15 +23,14 @@ namespace { // Helper to make static tests more succinct template -constexpr bool datatype_matches_extent = - std::is_same_v::type, - Extent>; +constexpr bool datatype_matches_extent = std::is_same_v< + typename Kokkos::Impl::ExtentsFromDataType::type, + Extent>; template constexpr bool extent_matches_datatype = - std::is_same_v::type>; + std::is_same_v::type>; // Conversion from DataType to extents // 0-rank view diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp index 15ad577d149..05fce8ba44c 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp @@ -199,6 +199,12 @@ struct layout_stride { return __strides_storage_t{static_cast(s[Idxs])...}; } + template + MDSPAN_INLINE_FUNCTION + static constexpr const __strides_storage_t fill_strides(mdspan_non_standard_tag, const IntegralType (&s)[extents_type::rank()]) { + return __strides_storage_t{static_cast(s[Idxs])...}; + } + #ifdef __cpp_lib_span template MDSPAN_INLINE_FUNCTION @@ -309,6 +315,44 @@ struct layout_stride { */ } + MDSPAN_TEMPLATE_REQUIRES( + class IntegralTypes, + /* requires */ ( + // MSVC 19.32 does not like using index_type here, requires the typename Extents::index_type + // error C2641: cannot deduce template arguments for 'MDSPAN_IMPL_STANDARD_NAMESPACE::layout_stride::mapping' + 
_MDSPAN_TRAIT(std::is_convertible, const std::remove_const_t&, typename Extents::index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, typename Extents::index_type, const std::remove_const_t&) + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr + mapping( + mdspan_non_standard_tag, + extents_type const& e, + IntegralTypes (&s)[extents_type::rank()] + ) noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else + : __base_t(__base_t{__member_pair_t( +#endif + e, __strides_storage_t(__impl::fill_strides(mdspan_non_standard, s)) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + { + /* + * TODO: check preconditions + * - s[i] > 0 is true for all i in the range [0, rank_ ). + * - REQUIRED-SPAN-SIZE(e, s) is a representable value of type index_type ([basic.fundamental]). + * - If rank_ is greater than 0, then there exists a permutation P of the integers in the + * range [0, rank_), such that s[ pi ] >= s[ pi − 1 ] * e.extent( pi − 1 ) is true for + * all i in the range [1, rank_ ), where pi is the ith element of P. 
+ */ + } + #ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES( class IntegralTypes, diff --git a/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp b/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp index 3eeb39755c8..523bca4e11d 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp @@ -629,3 +629,8 @@ struct __bools; // end Pre-C++14 constexpr }}}1 //============================================================================== + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +constexpr struct mdspan_non_standard_tag { +} mdspan_non_standard; +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp index a8014867923..1f5ad70a6cf 100644 --- a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp +++ b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp @@ -158,19 +158,21 @@ class layout_left_padded::mapping { typename padded_stride_type::static_array_type padded_stride = {}; extents_type exts = {}; - constexpr index_type compute_offset(std::index_sequence<>) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<>) const { return 0; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffset index_offset) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, IndexOffset index_offset) const { return index_offset; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffsets... index_offsets) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, + IndexOffsets... 
index_offsets) const { index_type indices[] = {static_cast(index_offsets)...}; // self-recursive fold trick from // https://github.com/llvm/llvm-project/blob/96e1914aa2e6d8966acbfbe2f4d184201f1aa318/libcxx/include/mdspan/layout_left.h#L144 @@ -241,62 +243,71 @@ class layout_left_padded::mapping { /** * Converting constructor from `layout_left::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true. - * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; - * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + * This overload participates in overload resolution only if + * `is_constructible_v` is true. If + * `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, + * or `OtherExtents::static_extent(0)` must be `dynamic_extent`; otherwise, + * `OtherExtents::static_extent(0)` must be equal to the least multiple of + * `padding_value` greater than or equal to `extents_type::static_extent(0)` */ MDSPAN_TEMPLATE_REQUIRES( - class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + class _OtherExtents, + /* requires */ (std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<_OtherExtents, extents_type>)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { - static_assert((_OtherExtents::rank() > 1) || (static_padding_stride != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) - || (static_padding_stride == _OtherExtents::static_extent(extent_to_pad_idx))); + : 
padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { + static_assert( + (_OtherExtents::rank() > 1) || + (static_padding_stride != dynamic_extent) || + (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) || + (static_padding_stride == + _OtherExtents::static_extent(extent_to_pad_idx))); } /** * Converting constructor from `layout_stride::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true + * This overload participates in overload resolution only if + * `is_constructible_v` is true */ MDSPAN_TEMPLATE_REQUIRES( - class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) + class _OtherExtents, + /* requires */ (std::is_constructible_v)) MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { - } + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) {} /** * Converting constructor from `layout_left_padded::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true. - * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + * This overload participates in overload resolution only if + * `is_constructible_v` is true. Either + * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or + * `padding_value == OtherPaddingStride`. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) - constexpr - mapping(const _Mapping &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value + &&std::is_constructible_v< + extents_type, typename _Mapping::extents_type>)) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && + (padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent))) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { static_assert(padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent || padding_value == _Mapping::padding_value); @@ -305,42 +316,43 @@ class layout_left_padded::mapping { /** * Converting constructor from `layout_right_padded::mapping`. * - * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + * This overload participates in overload resolution only if + * `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && extents_type::rank() <= 1 - && std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) - constexpr - mapping(const _Mapping &other_mapping) noexcept - : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), - exts(other_mapping.extents()) - {} + class _Mapping, + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value + &&extents_type::rank() <= 1 && + std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v)) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding( + other_mapping.extents(), + other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) {} - constexpr const extents_type &extents() const noexcept - { + MDSPAN_INLINE_FUNCTION constexpr const extents_type & + extents() const noexcept { return exts; } - constexpr std::array - strides() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr std::array + strides() const noexcept { + if constexpr (extents_type::rank() == 0) { return {}; - } else if constexpr ( extents_type::rank() == 1 ) { + } else if constexpr (extents_type::rank() == 1) { return {1}; } else { index_type value = 1; std::array s{}; s[extent_to_pad_idx] = value; value *= padded_stride.value(0); - for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; ++r) - { + for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; + ++r) { s[r] = value; value *= exts.extent(r); } @@ -349,12 +361,11 @@ class layout_left_padded::mapping { } } - constexpr index_type - required_span_size() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr index_type + 
required_span_size() const noexcept { + if constexpr (extents_type::rank() == 0) { return 1; - } else if constexpr ( extents_type::rank() == 1 ) { + } else if constexpr (extents_type::rank() == 1) { return exts.extent(0); } else { index_type value = padded_stride.value(0); @@ -375,40 +386,47 @@ class layout_left_padded::mapping { */ MDSPAN_TEMPLATE_REQUIRES( class... _Indices, - /* requires */ ( - sizeof...(_Indices) == extents_type::rank() && - (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) - ) - ) - constexpr size_t operator()(_Indices... idxs) const noexcept - { + /* requires */ (sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail:: + are_valid_indices()))) + MDSPAN_INLINE_FUNCTION constexpr size_t + operator()(_Indices... idxs) const noexcept { return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); } - static constexpr bool is_always_unique() noexcept { return true; } - static constexpr bool is_always_exhaustive() noexcept - { - return (extents_type::rank() <= rank_type(1)) - || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent - && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return (extents_type::rank() <= rank_type(1)) || + (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent && + extents_type::static_extent(extent_to_pad_idx) == + padded_stride_type::static_value()); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { + return true; } - static constexpr bool is_always_strided() noexcept { return true; } - static constexpr bool is_unique() noexcept { return true; } - constexpr bool is_exhaustive() const noexcept - { - return (extents_type::rank() < 2) - || (exts.extent(extent_to_pad_idx) == 
padded_stride.value(0)); + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { + return (extents_type::rank() < 2) || + (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { + return true; } - static constexpr bool is_strided() noexcept { return true; } - constexpr index_type stride(rank_type r) const noexcept - { + MDSPAN_INLINE_FUNCTION + constexpr index_type stride(rank_type r) const noexcept { assert(r < extents_type::rank()); - if(r == 0) return index_type(1); + if (r == 0) + return index_type(1); index_type value = padded_stride.value(0); - for (rank_type k = 1; k < r; k++) value *= exts.extent(k); + for (rank_type k = 1; k < r; k++) + value *= exts.extent(k); return value; } @@ -416,26 +434,26 @@ class layout_left_padded::mapping { /** * Equality operator between `layout_left_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. * - * \note There is currently a difference from p2642r2, where this function is specified as taking - * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + * \note There is currently a difference from p2642r2, where this function is + * specified as taking `layout_left_padded< padding_value >::mapping< + * Extents>`. However, this makes `padding_value` non-deducible. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept - { - // Workaround for some compilers not short-circuiting properly with compile-time checks - // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(const mapping &left, const _Mapping &right) noexcept { + // Workaround for some compilers not short-circuiting properly with + // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a + // rank 0 mapping bool strides_equal = true; - if constexpr (extents_type::rank() > rank_type(1)) - { - strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + if constexpr (extents_type::rank() > rank_type(1)) { + strides_equal = + left.stride(padded_stride_idx) == right.stride(padded_stride_idx); } return (left.extents() == right.extents()) && strides_equal; } @@ -444,17 +462,15 @@ class layout_left_padded::mapping { /** * Inequality operator between `layout_left_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept - { + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping &left, const _Mapping &right) noexcept { return !(left == right); } #endif @@ -490,25 +506,27 @@ class layout_right_padded::mapping { typename padded_stride_type::static_array_type padded_stride = {}; extents_type exts = {}; - constexpr index_type compute_offset(std::index_sequence<>) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<>) const { return 0; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffset index_offset) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, IndexOffset index_offset) const { return index_offset; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffsets... index_offsets) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { // self-recursive fold trick from // https://github.com/llvm/llvm-project/blob/4d9771741d40cc9cfcccb6b033f43689d36b705a/libcxx/include/mdspan/layout_right.h#L141 index_type res = 0; ((res = static_cast(index_offsets) + (Ranks == extent_to_pad_idx ? 
padded_stride.value(0) - : exts.extent(Ranks)) * + : exts.extent(Ranks)) * res), ...); return res; @@ -577,56 +595,62 @@ class layout_right_padded::mapping { */ MDSPAN_TEMPLATE_REQUIRES( class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + /* requires */ (std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<_OtherExtents, extents_type>)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { - static_assert((_OtherExtents::rank() > 1) || (padded_stride_type::static_value() != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) - || (padded_stride_type::static_value() == _OtherExtents::static_extent(extent_to_pad_idx))); + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { + static_assert( + (_OtherExtents::rank() > 1) || + (padded_stride_type::static_value() != dynamic_extent) || + (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) || + (padded_stride_type::static_value() == + _OtherExtents::static_extent(extent_to_pad_idx))); } /** * Converting constructor from `layout_stride::mapping`. 
* - * This overload participates in overload resolution only if `is_constructible_v` is true + * This overload participates in overload resolution only if + * `is_constructible_v` is true */ MDSPAN_TEMPLATE_REQUIRES( class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) + /* requires */ (std::is_constructible_v)) MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - {} + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) {} /** * Converting constructor from `layout_right_padded::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true. - * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + * This overload participates in overload resolution only if + * `is_constructible_v` is true. Either + * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or + * `padding_value == OtherPaddingStride`. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && std::is_constructible_v - ) - ) + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value + &&std::is_constructible_v< + extents_type, typename _Mapping::extents_type>)) MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) + MDSPAN_INLINE_FUNCTION constexpr mapping(const _Mapping &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { static_assert(padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent || padding_value == _Mapping::padding_value); @@ -635,41 +659,42 @@ class layout_right_padded::mapping { /** * Converting constructor from `layout_left_padded::mapping`. * - * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + * This overload participates in overload resolution only if + * `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && extents_type::rank() <= 1 - && std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value + &&extents_type::rank() <= 1 && + std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const _Mapping &other_mapping) noexcept - : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), - exts(other_mapping.extents()) - {} + : padded_stride(padded_stride_type::init_padding( + other_mapping.extents(), + other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) {} - constexpr const extents_type &extents() const noexcept - { + MDSPAN_INLINE_FUNCTION constexpr const extents_type & + extents() const noexcept { return exts; } - constexpr std::array - strides() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr std::array + strides() const noexcept { + if constexpr (extents_type::rank() == 0) { return {}; - } else if constexpr ( extents_type::rank() == 1 ) { + } else if constexpr (extents_type::rank() == 1) { return {1}; } else { index_type value = 1; std::array s{}; s[extent_to_pad_idx] = value; value *= padded_stride.value(0); - for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) - { + for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) { s[r] = value; value *= exts.extent(r); } @@ -678,17 +703,15 @@ class layout_right_padded::mapping { } } - constexpr index_type - required_span_size() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr index_type + required_span_size() const noexcept { + if constexpr (extents_type::rank() == 0) { return 1; - } else if constexpr ( extents_type::rank() == 
1 ) { + } else if constexpr (extents_type::rank() == 1) { return exts.extent(0); } else { index_type value = 1; - for (rank_type r = 0; r < extent_to_pad_idx; ++r) - { + for (rank_type r = 0; r < extent_to_pad_idx; ++r) { value *= exts.extent(r); } return value * padded_stride.value(0); @@ -705,40 +728,47 @@ class layout_right_padded::mapping { */ MDSPAN_TEMPLATE_REQUIRES( class... _Indices, - /* requires */ ( - sizeof...(_Indices) == extents_type::rank() && - (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) - ) - ) - constexpr size_t operator()(_Indices... idxs) const noexcept - { + /* requires */ (sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail:: + are_valid_indices()))) + MDSPAN_INLINE_FUNCTION constexpr size_t + operator()(_Indices... idxs) const noexcept { return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); } - static constexpr bool is_always_unique() noexcept { return true; } - static constexpr bool is_always_exhaustive() noexcept - { - return (extents_type::rank() <= rank_type(1)) - || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent - && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return (extents_type::rank() <= rank_type(1)) || + (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent && + extents_type::static_extent(extent_to_pad_idx) == + padded_stride_type::static_value()); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { + return true; } - static constexpr bool is_always_strided() noexcept { return true; } - static constexpr bool is_unique() noexcept { return true; } - constexpr bool is_exhaustive() const noexcept - { - return (extents_type::rank() < 2) - || (exts.extent(extent_to_pad_idx) == 
padded_stride.value(0)); + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { + return (extents_type::rank() < 2) || + (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { + return true; } - static constexpr bool is_strided() noexcept { return true; } - constexpr index_type stride(rank_type r) const noexcept - { + MDSPAN_INLINE_FUNCTION constexpr index_type + stride(rank_type r) const noexcept { assert(r < extents_type::rank()); - if(r == extents_type::rank() - 1) return index_type(1); + if (r == extents_type::rank() - 1) + return index_type(1); index_type value = padded_stride.value(0); - for (rank_type k = extents_type::rank() - 2; k > r; k--) value *= exts.extent(k); + for (rank_type k = extents_type::rank() - 2; k > r; k--) + value *= exts.extent(k); return value; } @@ -746,26 +776,26 @@ class layout_right_padded::mapping { /** * Equality operator between `layout_right_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. * - * \note There is currently a difference from p2642r2, where this function is specified as taking - * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + * \note There is currently a difference from p2642r2, where this function is + * specified as taking `layout_right_padded< padding_value >::mapping< + * Extents>`. However, this makes `padding_value` non-deducible. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept - { - // Workaround for some compilers not short-circuiting properly with compile-time checks - // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(const mapping &left, const _Mapping &right) noexcept { + // Workaround for some compilers not short-circuiting properly with + // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a + // rank 0 mapping bool strides_equal = true; - if constexpr (extents_type::rank() > rank_type(1)) - { - strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + if constexpr (extents_type::rank() > rank_type(1)) { + strides_equal = + left.stride(padded_stride_idx) == right.stride(padded_stride_idx); } return (left.extents() == right.extents()) && strides_equal; } @@ -774,17 +804,15 @@ class layout_right_padded::mapping { /** * Inequality operator between `layout_right_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept - { + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping &left, const _Mapping &right) noexcept { return !(left == right); } #endif From 64fe756374d8ffe947b524905c8e20ea7ef7dbb0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 20 May 2024 17:47:49 -0400 Subject: [PATCH 055/103] SYCL: Don't use shuffles for top-level reductions (#7009) SYCL: Don't use shuffles for top-level reductions --- core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 06be143ecca..c838a1abc58 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -21,9 +21,12 @@ namespace Kokkos::Impl::SYCLReduction { +// FIXME_SYCL For some types, shuffle reductions are competitive with local +// memory reductions but they are significantly slower for the value type used +// in combined reductions with multiple double arguments. 
template -inline constexpr bool use_shuffle_based_algorithm = - std::is_reference_v; +inline constexpr bool use_shuffle_based_algorithm = false; +// std::is_reference_v; template std::enable_if_t> workgroup_reduction( From 6aa2ad7da687095ded07167ca74f50d138625493 Mon Sep 17 00:00:00 2001 From: Alex Dutka <97711898+dutkalex@users.noreply.github.com> Date: Mon, 20 May 2024 23:52:21 +0200 Subject: [PATCH 056/103] Add a CITATION.cff file (#7008) * Create CITATION.cff * Update CITATION.cff * Update CITATION.cff Co-authored-by: Daniel Arndt * Update CITATION.cff Co-authored-by: Daniel Arndt * Add issue number [ci skip] --------- Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G --- CITATION.cff | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000000..28c674c451b --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,65 @@ +cff-version: 1.2.0 +title: Kokkos +message: >- + If you use this software, please cite the overview paper +type: software +authors: + - name: The Kokkos authors + website: https://kokkos.org/community/team/ +identifiers: + - type: url + website: https://kokkos.org/kokkos-core-wiki/citation.html +repository-code: 'https://github.com/kokkos/kokkos' +url: 'https://kokkos.org/' +license: Apache-2.0 +preferred-citation: + type: article + authors: + - given-names: Christian R. + family-names: Trott + - given-names: Damien + family-names: Lebrun-Grandié + - given-names: Daniel + family-names: Arndt + - family-names: Ciesko + given-names: Jan + - given-names: Vinh + family-names: Dang + - family-names: Ellingwood + given-names: Nathan + - given-names: Rahulkumar + family-names: Gayatri + - given-names: Evan + family-names: Harvey + - given-names: Daisy S. 
+ family-names: Hollman + - given-names: Dan + family-names: Ibanez + - given-names: Nevin + family-names: Liber + - given-names: Jonathan + family-names: Madsen + - given-names: Jeff + family-names: Miles + - given-names: David + family-names: Poliakoff + - given-names: Amy + family-names: Powell + - given-names: Sivasankaran + family-names: Rajamanickam + - given-names: Mikael + family-names: Simberg + - given-names: Dan + family-names: Sunderland + - given-names: Bruno + family-names: Turcksin + - given-names: Jeremiah + family-names: Wilke + doi: 10.1109/TPDS.2021.3097283 + journal: IEEE Transactions on Parallel and Distributed Systems + start: 805 + end: 817 + title: "Kokkos 3: Programming Model Extensions for the Exascale Era" + volume: 33 + issue: 4 + year: 2022 From f8f0cc473a53ad559326b38f05e560233c642239 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 20 May 2024 17:53:29 -0400 Subject: [PATCH 057/103] Always run Graph tests (#7011) * Always run Graph tests * Workaround for HPX * Move comment --- core/unit_test/CMakeLists.txt | 25 +------------------ core/unit_test/TestGraph.hpp | 22 +++++++++------- .../category_files/TestHPX_Category.hpp | 1 + .../category_files/TestOpenACC_Category.hpp | 1 + .../TestOpenMPTarget_Category.hpp | 1 + .../category_files/TestSYCL_Category.hpp | 1 + .../category_files/TestThreads_Category.hpp | 1 + core/unit_test/cuda/TestCuda_Graph.cpp | 18 ------------- core/unit_test/hip/TestHIP_Graph.cpp | 18 ------------- core/unit_test/openmp/TestOpenMP_Graph.cpp | 18 ------------- core/unit_test/serial/TestSerial_Graph.cpp | 18 ------------- 11 files changed, 19 insertions(+), 105 deletions(-) delete mode 100644 core/unit_test/cuda/TestCuda_Graph.cpp delete mode 100644 core/unit_test/hip/TestHIP_Graph.cpp delete mode 100644 core/unit_test/openmp/TestOpenMP_Graph.cpp delete mode 100644 core/unit_test/serial/TestSerial_Graph.cpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 
5df8d1e2cf8..5f325ed4c12 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -151,6 +151,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ExecSpaceThreadSafety ExecutionSpace FunctorAnalysis + Graph HostSharedPtr HostSharedPtrAccessOnDevice Init @@ -658,12 +659,6 @@ if(Kokkos_ENABLE_SERIAL) UnitTestMainInit.cpp ${Serial_SOURCES2} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SerialGraph - SOURCES - UnitTestMainInit.cpp - serial/TestSerial_Graph.cpp - ) endif() if(Kokkos_ENABLE_THREADS) @@ -694,12 +689,6 @@ if (Kokkos_ENABLE_OPENMP) UnitTestMain.cpp openmp/TestOpenMP_InterOp.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_OpenMPGraph - SOURCES - UnitTestMainInit.cpp - openmp/TestOpenMP_Graph.cpp - ) endif() if(Kokkos_ENABLE_HPX) @@ -807,12 +796,6 @@ if(Kokkos_ENABLE_CUDA) UnitTestMainInit.cpp cuda/TestCuda_InterOp_StreamsMultiGPU.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CudaGraph - SOURCES - UnitTestMainInit.cpp - cuda/TestCuda_Graph.cpp - ) endif() if(Kokkos_ENABLE_HIP) @@ -840,12 +823,6 @@ if(Kokkos_ENABLE_HIP) UnitTestMain.cpp hip/TestHIP_InterOp_Streams.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HIPGraph - SOURCES - UnitTestMainInit.cpp - hip/TestHIP_Graph.cpp - ) endif() if(Kokkos_ENABLE_SYCL) diff --git a/core/unit_test/TestGraph.hpp b/core/unit_test/TestGraph.hpp index 9a36d08f445..45c86e50d39 100644 --- a/core/unit_test/TestGraph.hpp +++ b/core/unit_test/TestGraph.hpp @@ -66,7 +66,7 @@ struct SetResultToViewFunctor { } }; -struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { +struct TEST_CATEGORY_FIXTURE(graph) : public ::testing::Test { public: using count_functor = CountTestFunctor; using set_functor = SetViewToValueFunctor; @@ -88,7 +88,7 @@ struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { } }; -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one) { auto graph = 
Kokkos::Experimental::create_graph([&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); @@ -101,7 +101,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one_rvalue) { Kokkos::Experimental::create_graph(ex, [&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); }).submit(); @@ -112,7 +112,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_six) { auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0}); auto f_setup_bugs = root.then_parallel_for(1, set_functor{bugs, 0}); @@ -145,7 +145,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), when_all_cycle) { view_type reduction_out{"reduction_out"}; view_host reduction_host{"reduction_host"}; Kokkos::Experimental::create_graph(ex, [&](auto root) { @@ -172,7 +172,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { // This test is disabled because we don't currently support copying to host, // even asynchronously. We _may_ want to do that eventually? 
-TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), DISABLED_repeat_chain) { auto graph = Kokkos::Experimental::create_graph( ex, [&, count_host = count_host](auto root) { //---------------------------------------- @@ -198,7 +198,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { //---------------------------------------- } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), zero_work_reduce) { auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { root.then_parallel_reduce(0, set_result_functor{bugs}, count); }); @@ -214,9 +214,13 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { // UVM works on pre pascal cards. #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \ (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL)) - Kokkos::fence(); + if constexpr (std::is_same_v) Kokkos::fence(); +#endif +#ifdef KOKKOS_ENABLE_HPX // FIXME_HPX graph.submit() isn't properly enqueued + if constexpr (std::is_same_v) + Kokkos::fence(); #endif - graph.submit(); // should reset to 0, but doesn't + graph.submit(); Kokkos::deep_copy(ex, count_host, count); ex.fence(); ASSERT_EQ(count_host(), 0); diff --git a/core/unit_test/category_files/TestHPX_Category.hpp b/core/unit_test/category_files/TestHPX_Category.hpp index d3a7cdbea53..c6a2aa9f201 100644 --- a/core/unit_test/category_files/TestHPX_Category.hpp +++ b/core/unit_test/category_files/TestHPX_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 3 #define TEST_CATEGORY_DEATH hpx_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::HPX +#define TEST_CATEGORY_FIXTURE(name) hpx_##name #endif diff --git a/core/unit_test/category_files/TestOpenACC_Category.hpp b/core/unit_test/category_files/TestOpenACC_Category.hpp index 0c4e4b7e119..6105eadf14f 100644 --- a/core/unit_test/category_files/TestOpenACC_Category.hpp +++ 
b/core/unit_test/category_files/TestOpenACC_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 8 #define TEST_CATEGORY_DEATH openacc_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::OpenACC +#define TEST_CATEGORY_FIXTURE(name) openacc_##name #endif diff --git a/core/unit_test/category_files/TestOpenMPTarget_Category.hpp b/core/unit_test/category_files/TestOpenMPTarget_Category.hpp index 235b34ffab7..921cff78902 100644 --- a/core/unit_test/category_files/TestOpenMPTarget_Category.hpp +++ b/core/unit_test/category_files/TestOpenMPTarget_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 4 #define TEST_CATEGORY_DEATH openmptarget_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::OpenMPTarget +#define TEST_CATEGORY_FIXTURE(name) openmptarget_##name #endif diff --git a/core/unit_test/category_files/TestSYCL_Category.hpp b/core/unit_test/category_files/TestSYCL_Category.hpp index 8e1b18c9acd..59e72c72c77 100644 --- a/core/unit_test/category_files/TestSYCL_Category.hpp +++ b/core/unit_test/category_files/TestSYCL_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 7 #define TEST_CATEGORY_DEATH sycl_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::SYCL +#define TEST_CATEGORY_FIXTURE(name) sycl_##name #endif diff --git a/core/unit_test/category_files/TestThreads_Category.hpp b/core/unit_test/category_files/TestThreads_Category.hpp index 13b0b653f21..ae8ac608339 100644 --- a/core/unit_test/category_files/TestThreads_Category.hpp +++ b/core/unit_test/category_files/TestThreads_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 1 #define TEST_CATEGORY_DEATH threads_DeathTest #define TEST_EXECSPACE Kokkos::Threads +#define TEST_CATEGORY_FIXTURE(name) threads_##name #endif diff --git a/core/unit_test/cuda/TestCuda_Graph.cpp b/core/unit_test/cuda/TestCuda_Graph.cpp deleted file mode 100644 index 27203639690..00000000000 --- a/core/unit_test/cuda/TestCuda_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// 
************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/hip/TestHIP_Graph.cpp b/core/unit_test/hip/TestHIP_Graph.cpp deleted file mode 100644 index 405cb76c643..00000000000 --- a/core/unit_test/hip/TestHIP_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/openmp/TestOpenMP_Graph.cpp b/core/unit_test/openmp/TestOpenMP_Graph.cpp deleted file mode 100644 index 22c8ab1bf8f..00000000000 --- a/core/unit_test/openmp/TestOpenMP_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/serial/TestSerial_Graph.cpp b/core/unit_test/serial/TestSerial_Graph.cpp deleted file mode 100644 index bff64d83e27..00000000000 --- a/core/unit_test/serial/TestSerial_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include From ce0915b5eeb0a78dae3e9fa743416f711ad798bb Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 20 May 2024 22:22:24 -0400 Subject: [PATCH 058/103] Fix undefined behavior in is_zero_byte (#7014) * Fix undefined behavior in is_zero_byte * Remove include file comments --- core/src/impl/Kokkos_ViewMapping.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index c37112be896..a0fe5c1a5d7 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -20,6 +20,8 @@ #include #include +#include +#include #include #include #include @@ -2538,9 +2540,10 @@ inline bool is_zero_byte(const T& t) { sizeof(T) % sizeof(int) == 0, int, std::conditional_t>>>; - const auto* const ptr = reinterpret_cast(&t); + auto bit_values = Kokkos::bit_cast< + Kokkos::Array>(t); for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i) - if (ptr[i] != 0) return false; + if (bit_values[i] != 0) return false; 
return true; } From fa8b501028b52b0141d4065a7197320bae207795 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 21 May 2024 06:52:56 -0400 Subject: [PATCH 059/103] Disable OpenMPTarget Kokkos::Graph test (does not compile) --- core/unit_test/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 5f325ed4c12..413d4ef1c58 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -383,6 +383,7 @@ endforeach() # Disable non-compiling tests based on clang version. if(Kokkos_ENABLE_OPENMPTARGET) list(REMOVE_ITEM OpenMPTarget_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Graph.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp From bfe9aa2f1630f155d3b62d9873dcb7b993be9e29 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 21 May 2024 07:14:06 -0400 Subject: [PATCH 060/103] Fixup for disabling deprecation warnings with NVC++ Fixup for #6999 Deprecation warnings are still showing in the OpenACC CI build ``` "/var/jenkins/workspace/Kokkos_PR-7017/core/unit_test/TestArrayOps.hpp", line 136: warning: class "Kokkos::Array" was declared deprecated [deprecated_entity] using A = ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/src/Kokkos_Array.hpp", line 217: note: because of a "deprecated" attribute struct KOKKOS_DEPRECATED ^ Remark: individual warnings can be suppressed with "--diag_suppress " "/var/jenkins/workspace/Kokkos_PR-7017/core/unit_test/TestArrayOps.hpp", line 197: warning: class "Kokkos::Array" was declared deprecated [deprecated_entity] using A = ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/src/Kokkos_Array.hpp", line 217: note: because of a "deprecated" attribute struct KOKKOS_DEPRECATED ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/unit_test/TestArrayOps.hpp", 
line 274: warning: class "Kokkos::Array" was declared deprecated [deprecated_entity] using A = Kokkos::Array::strided>; ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/src/Kokkos_Array.hpp", line 286: note: because of a "deprecated" attribute struct KOKKOS_DEPRECATED ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/unit_test/TestArrayOps.hpp", line 338: warning: class "Kokkos::Array" was declared deprecated [deprecated_entity] using A = Kokkos::Array::strided>; ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/src/Kokkos_Array.hpp", line 286: note: because of a "deprecated" attribute struct KOKKOS_DEPRECATED ^ ``` The generic EDG warning disable did not work so we handle NVC++ separately and use diagnostic pragmas. In case anyone wants to try something else https://godbolt.org/z/nxWbPMT95 --- core/src/Kokkos_Macros.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 27b32b15214..ceca2130e75 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -563,7 +563,12 @@ static constexpr bool kokkos_omp_on_host() { return false; } #endif // clang-format off -#if defined(__EDG__) +#if defined(__NVCOMPILER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("diag_suppress 1216") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("diag_default 1216") +#elif defined(__EDG__) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ _Pragma("warning push") \ _Pragma("warning disable 1478") From f3bd253d3d09d0dfd6ab76125e8b56b840db542f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 21 May 2024 11:38:36 -0400 Subject: [PATCH 061/103] Remove unused CudaInternal::cuda_{malloc,free}_async_wrapper --- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 24f4af31019..25aa6502152 100644 --- 
a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -421,23 +421,6 @@ class CudaInternal { return cudaStreamSynchronize(stream); } - // The following are only available for cuda 11.2 and greater -#if (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - template - cudaError_t cuda_malloc_async_wrapper(void** devPtr, size_t size, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaMallocAsync(devPtr, size, get_input_stream(hStream)); - } - - template - cudaError_t cuda_free_async_wrapper(void* devPtr, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaFreeAsync(devPtr, get_input_stream(hStream)); - } -#endif - // C++ API routines template cudaError_t cuda_func_get_attributes_wrapper(cudaFuncAttributes* attr, From 083fb014cc3abff24cbd042b86fbbbf77817bb25 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 21 May 2024 22:09:26 -0400 Subject: [PATCH 062/103] Improve `Impl::is_zero_byte()` (#7017) * Improve Impl::is_zero_byte() by implementing it in terms of std::memcmp * Prefer function scope for the all zeroes buffer Co-authored-by: Daniel Arndt --------- Co-authored-by: Daniel Arndt --- core/src/impl/Kokkos_ViewMapping.hpp | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index a0fe5c1a5d7..c1f4c0290c1 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -17,11 +17,10 @@ #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP #define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP +#include #include #include -#include -#include #include #include #include @@ -2531,20 +2530,9 @@ namespace Kokkos { namespace Impl { template -inline bool is_zero_byte(const T& t) { - using comparison_type = std::conditional_t< - sizeof(T) % sizeof(long long int) == 0, long long int, - 
std::conditional_t< - sizeof(T) % sizeof(long int) == 0, long int, - std::conditional_t< - sizeof(T) % sizeof(int) == 0, int, - std::conditional_t>>>; - auto bit_values = Kokkos::bit_cast< - Kokkos::Array>(t); - for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i) - if (bit_values[i] != 0) return false; - return true; +bool is_zero_byte(const T& x) { + constexpr std::byte all_zeroes[sizeof(T)] = {}; + return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; } //---------------------------------------------------------------------------- From 6f176cde00c35970153aed8ec64b57b0f7163b90 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 22 May 2024 07:50:46 -0400 Subject: [PATCH 063/103] OpenMPTarget: Fix compiling Graph tests (#7020) * OpenMPTarget: Fix compiling Graph tests * Use team_size 32 if compiling with OpenMPTarget support * Skip launch_six for OpenMPTarget * Reenable TestOpenMPTarget_Graph --- ...Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp | 10 ++++------ .../Kokkos_OpenMPTarget_ParallelReduce_Range.hpp | 9 ++++----- .../Kokkos_OpenMPTarget_ParallelReduce_Team.hpp | 10 ++++------ .../Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 15 +++++++++------ core/unit_test/CMakeLists.txt | 1 - core/unit_test/TestGraph.hpp | 4 ++++ 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index 0782a79302a..e86a1219749 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -55,12 +55,11 @@ class ParallelReduce m_scratch_memory_lock; - public: inline void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. 
+ std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); execute_tile( m_functor_reducer.get_functor(), m_policy, m_result_ptr, std::integral_constant()); @@ -74,8 +73,7 @@ class ParallelReduce::accessible), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + typename ViewType::memory_space>::accessible) {} template inline std::enable_if_t execute_tile( diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index caa568a8925..4a112ed11d0 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -55,13 +55,13 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - // Only let one ParallelReduce instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); const FunctorType& functor = m_functor_reducer.get_functor(); if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. 
@@ -108,8 +108,7 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + m_result_ptr_num_elems(arg_result_view.size()) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 8abffa47a43..16c0eedb818 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,12 +470,11 @@ class ParallelReduce m_scratch_memory_lock; - public: void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); const FunctorType& functor = m_functor_reducer.get_functor(); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -521,8 +520,7 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index c886c397966..b0d69328024 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -48,10 +48,6 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one ParallelScan instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock m_scratch_memory_lock; - template std::enable_if_t::value> call_with_tag( const FunctorType& f, const idx_type& idx, value_type& val, @@ -181,6 +177,10 @@ class ParallelScan, const idx_type chunk_size = 128; const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View @@ -201,8 +201,7 @@ class ParallelScan, : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} //---------------------------------------- }; @@ -230,6 +229,10 @@ class ParallelScanWithTotal, const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; if (N > 0) { + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 413d4ef1c58..5f325ed4c12 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -383,7 +383,6 @@ endforeach() # Disable non-compiling tests based on clang version. 
if(Kokkos_ENABLE_OPENMPTARGET) list(REMOVE_ITEM OpenMPTarget_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Graph.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp diff --git a/core/unit_test/TestGraph.hpp b/core/unit_test/TestGraph.hpp index 45c86e50d39..cefcda8e061 100644 --- a/core/unit_test/TestGraph.hpp +++ b/core/unit_test/TestGraph.hpp @@ -113,6 +113,10 @@ TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one_rvalue) { } TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_six) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET team_size incompatible + if (std::is_same_v) + GTEST_SKIP() << "skipping since OpenMPTarget can't use team_size 1"; +#endif auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0}); auto f_setup_bugs = root.then_parallel_for(1, set_functor{bugs, 0}); From cb27c99414fa604bd04d5efd8e684a3b4149d89c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 22 May 2024 08:04:18 -0400 Subject: [PATCH 064/103] SYCL: Skip launch_six Graph test --- core/unit_test/TestGraph.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/unit_test/TestGraph.hpp b/core/unit_test/TestGraph.hpp index cefcda8e061..735114d4c25 100644 --- a/core/unit_test/TestGraph.hpp +++ b/core/unit_test/TestGraph.hpp @@ -117,6 +117,11 @@ TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_six) { if (std::is_same_v) GTEST_SKIP() << "skipping since OpenMPTarget can't use team_size 1"; #endif +#if defined(KOKKOS_ENABLE_SYCL) // FIXME_SYCL + if (std::is_same_v) + GTEST_SKIP() << "skipping since test case is known to fail with SYCL"; +#endif + auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0}); auto f_setup_bugs = 
root.then_parallel_for(1, set_functor{bugs, 0}); From c8e0a95cbbe961f95befbda8d41d30bc6fff6a40 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 22 May 2024 08:31:30 -0400 Subject: [PATCH 065/103] HIP: Use builtin atomic for compare_exchange (#7000) * Use builtin atomic for compare_exchange * Add generic implementation of atomic_exchange * Remove device_atomic_exchange function that takes a compare operator --- .../desul/atomics/Compare_Exchange_HIP.hpp | 145 ++++-------------- 1 file changed, 28 insertions(+), 117 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp index 8c909bacdf4..0ade34f25df 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp @@ -9,6 +9,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ #define DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ +#include #include #include #include @@ -17,130 +18,40 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { namespace Impl { -template -__device__ std::enable_if_t device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned int) == 4, - "this function assumes an unsigned int is 32-bit"); - unsigned int return_val = atomicCAS(reinterpret_cast(dest), - reinterpret_cast(compare), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} -template -__device__ std::enable_if_t device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned long long int) == 8, - "this function assumes an unsigned long long is 64-bit"); - unsigned long long int return_val = - atomicCAS(reinterpret_cast(dest), - reinterpret_cast(compare), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} +template +struct 
atomic_exchange_available_hip { + constexpr static bool value = + ((sizeof(T) == 1 && alignof(T) == 1) || (sizeof(T) == 4 && alignof(T) == 4) || + (sizeof(T) == 8 && alignof(T) == 8)) && + std::is_trivially_copyable::value; +}; -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) { - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return return_val; -} - -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) { - atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - return return_val; -} - -template -__device__ std::enable_if_t +template +__device__ std::enable_if_t::value, T> device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) { - atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return return_val; + T* const dest, T compare, T value, MemoryOrder, MemoryScope) { + (void)__hip_atomic_compare_exchange_strong( + dest, + &compare, + value, + HIPMemoryOrder::value, + HIPMemoryOrder>::value, + HIPMemoryScope::value); + return compare; } -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned int) == 4, - "this function assumes an unsigned int is 32-bit"); - unsigned int return_val = atomicExch(reinterpret_cast(dest), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} -template -__device__ std::enable_if_t device_atomic_exchange( - T* const 
dest, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned long long int) == 8, - "this function assumes an unsigned long long is 64-bit"); - unsigned long long int return_val = - atomicExch(reinterpret_cast(dest), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) { - T return_val = device_atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T /*compare*/, T value, MemoryOrderAcquire, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderAcqRel, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderSeqCst, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), 
MemoryScope()); - T return_val = device_atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); +template +__device__ std::enable_if_t::value, T> +device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope) { + T return_val = __hip_atomic_exchange(dest, + value, + HIPMemoryOrder::value, + HIPMemoryScope::value); return return_val; } template -__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> +__device__ std::enable_if_t::value, T> device_atomic_compare_exchange( T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) { // This is a way to avoid deadlock in a warp or wave front @@ -169,7 +80,7 @@ device_atomic_compare_exchange( } template -__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> +__device__ std::enable_if_t::value, T> device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) { // This is a way to avoid deadlock in a warp or wave front T return_val; From a5bb0d41bb2af2597533ad0aa8994bdaf770bef6 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 21 Feb 2024 14:17:15 -0700 Subject: [PATCH 066/103] Fix Kokkos README's FENL link --- example/README | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/README b/example/README index 66860512448..2fe87276484 100644 --- a/example/README +++ b/example/README @@ -1,7 +1,7 @@ This directory contains example application proxies that use different parts of Kokkos. If you are looking for the FENL ("finite element -nonlinear" solve) example, it has moved into the LinAlg subpackage of -Tpetra. +nonlinear" solve) example, it has moved into the TrilinosCouplings +package in Trilinos. 
MANIFEST: From a78d4ddb243af5cb60015031c06c54aa03ac86a2 Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Thu, 23 May 2024 13:30:55 -0500 Subject: [PATCH 067/103] Copied the deduction guides and test cases over from branch nliber/ctad-teampolicy-crtp This compiles under llvm12, llvm18, gcc11 and gcc14 --- core/src/Kokkos_ExecPolicy.hpp | 52 ++++++++++ core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestTeamPolicyCTAD.cpp | 135 ++++++++++++++++++++++++++ 3 files changed, 188 insertions(+) create mode 100644 core/unit_test/TestTeamPolicyCTAD.cpp diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index 5f251eeb26a..3a04101aad1 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -714,6 +714,58 @@ class TeamPolicy } }; +// Execution space not provided deduces to TeamPolicy<> + +TeamPolicy()->TeamPolicy<>; + +TeamPolicy(int, int)->TeamPolicy<>; +TeamPolicy(int, int, int)->TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, int)->TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, int, Kokkos::AUTO_t const&)->TeamPolicy<>; + +// DefaultExecutionSpace deduces to TeamPolicy<> + +TeamPolicy(DefaultExecutionSpace const&, int, int)->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, int)->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&) + ->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int) + ->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, + Kokkos::AUTO_t const&) + ->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&) + ->TeamPolicy<>; + +// ES != DefaultExecutionSpace deduces to TeamPolicy + +template >> +TeamPolicy(ES const&, int, int)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, int, int)->TeamPolicy; + +template >> 
+TeamPolicy(ES const&, int, Kokkos::AUTO_t const&)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int)->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) + ->TeamPolicy; + +template >> +TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&)->TeamPolicy; + namespace Impl { template diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 5f325ed4c12..2a56e46a943 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -93,6 +93,7 @@ SET(COMPILE_ONLY_SOURCES TestViewTypeTraits.cpp TestTypeList.cpp TestMDRangePolicyCTAD.cpp + TestTeamPolicyCTAD.cpp view/TestExtentsDatatypeConversion.cpp ) diff --git a/core/unit_test/TestTeamPolicyCTAD.cpp b/core/unit_test/TestTeamPolicyCTAD.cpp new file mode 100644 index 00000000000..07aaeae819e --- /dev/null +++ b/core/unit_test/TestTeamPolicyCTAD.cpp @@ -0,0 +1,135 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestTeamPolicyCTAD { + template + static void maybe_unused(Ts&&...) 
{} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int i; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for HIP-ROCm-5.2 warning about was declared but never referenced + TestTeamPolicyCTAD() { maybe_unused(des, notEs, ses, i, notEsToDes); } + + // Default construction deduces to TeamPolicy<> + static_assert( + std::is_same_v, decltype(Kokkos::TeamPolicy{})>); + + // Execution space not provided deduces to TeamPolicy<> + + static_assert( + std::is_same_v, decltype(Kokkos::TeamPolicy(i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(i, i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(i, i, Kokkos::AUTO))>); + + // DefaultExecutionSpace deduces to TeamPolicy<> + + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(des, 
i, i, i))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(des, i, i, Kokkos::AUTO))>); + + // Convertible to DefaultExecutionSpace deduces to TeamPolicy<> + + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, i, i))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy( + notEs, i, Kokkos::AUTO, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(notEs, i, i, Kokkos::AUTO))>); + + // SES != DefaultExecutionSpace deduces to TeamPolicy + + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, i, i))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v, + decltype(Kokkos::TeamPolicy(ses, i, i, Kokkos::AUTO))>); +}; + +} // namespace From 0410363d77b78d32937a8b77e01c8373cf90230d Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Thu, 23 May 2024 23:57:03 +0200 Subject: [PATCH 068/103] Refactor: Replace SFINAE by `if constexpr` for `create_mirror*` functions (#6955) * Uniformize view name for create_mirror * Uniformize view name for create_mirror_view * Combine the two Impl::create_mirror 
functions into one with constexpr * Format create_mirror * Combine the four Impl::create_mirror_view in one using if constexpr * Combine the two create_mirror_view_and_copy functions into one with constexpr * Fix formatting * Format with clang * Use if constexpr for offset view create_mirror* * Use if constexpr for dynamic view create_mirror* * Use if constexpr for dynamic rank view create_mirror* * Add comments * Restore inline specifiers * Add maybe_unused * Mutualize check functions * Simplify code * Fix missing maybe_unused * Restore previous types when create_mirror_view returns the source view * Fix linting * Remove unused namespaces --------- Co-authored-by: thierry antoun --- containers/src/Kokkos_DynRankView.hpp | 263 ++++++++---------------- containers/src/Kokkos_DynamicView.hpp | 275 +++++++++----------------- containers/src/Kokkos_OffsetView.hpp | 138 +++++-------- core/src/Kokkos_CopyViews.hpp | 224 +++++++++------------ 4 files changed, 319 insertions(+), 581 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 3989911aca4..dadd4535953 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1932,65 +1932,32 @@ struct MirrorDRVType { } // namespace Impl namespace Impl { -template -inline typename DynRankView::HostMirror create_mirror( - const DynRankView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using src_type = DynRankView; - using dst_type = typename src_type::HostMirror; - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor 
arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc +template +inline auto create_mirror(const DynRankView& src, + const Impl::ViewCtorProp& arg_prop) { + check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); -} - -template -inline auto create_mirror( - const DynRankView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using dst_type = typename Impl::MirrorDRVType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using dst_type = typename Impl::MirrorDRVType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); + } else { + using src_type = DynRankView; + using dst_type = typename src_type::HostMirror; - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); + } } } // namespace Impl @@ -2057,71 +2024,39 @@ inline 
auto create_mirror( } namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value, - typename DynRankView::HostMirror> -create_mirror_view(const DynRankView& src, - const typename Impl::ViewCtorProp&) { - return src; -} +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value), - typename DynRankView::HostMirror> -create_mirror_view( +inline auto create_mirror_view( const DynRankView& src, - const typename Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -template ::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename Impl::ViewCtorProp::memory_space>::value && - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type> -create_mirror_view(const Kokkos::DynRankView& src, - const typename Impl::ViewCtorProp&) { - return src; + [[maybe_unused]] const typename Impl::ViewCtorProp& + arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename DynRankView< + T, P...>::HostMirror::data_type>::value) { + return typename DynRankView::HostMirror(src); + } else { + return Kokkos::Impl::create_mirror(src, 
arg_prop); + } + } else { + if constexpr (Impl::MirrorDRViewType::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } } -template ::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename Impl::ViewCtorProp::memory_space>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type> -create_mirror_view( - const Kokkos::DynRankView& src, - const typename Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl // Create a mirror view in host space @@ -2194,75 +2129,47 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, arg_prop); } -template +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, + const Kokkos::DynRankView& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - 
"Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; -} -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, - const Kokkos::DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { - using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorDRViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type{ - arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::is_same_memspace) { + // same behavior as deep_copy(src, 
src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorDRViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{ + arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } } template diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index 12885edbae9..ff4b5dc171d 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -590,71 +590,39 @@ struct MirrorDynamicViewType { } // namespace Impl namespace Impl { + +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline auto create_mirror( - const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { +inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, + const Impl::ViewCtorProp& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed 
to Kokkos::create_mirror must " - "not explicitly allow padding!"); + check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( - prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using MemorySpace = typename alloc_prop_input::memory_space; - ret.resize_serial(src.extent(0)); + auto ret = typename Kokkos::Impl::MirrorDynamicViewType< + MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), + src.chunk_max() * src.chunk_size()); - return ret; -} + ret.resize_serial(src.extent(0)); -template -inline auto create_mirror( - const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); - - using MemorySpace = typename alloc_prop_input::memory_space; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); - - auto ret = typename Kokkos::Impl::MirrorDynamicViewType< - MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), - src.chunk_max() * src.chunk_size()); + return ret; + } else { + auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( + prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - ret.resize_serial(src.extent(0)); + ret.resize_serial(src.extent(0)); - return ret; + 
return ret; + } } + } // namespace Impl // Create a mirror in host space @@ -696,67 +664,41 @@ inline auto create_mirror( namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp&) { - return src; -} - -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp&) { - return src; +inline auto create_mirror_view( + const Kokkos::Experimental::DynamicView& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr 
(std::is_same::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>::value) { + return + typename Kokkos::Experimental::DynamicView::HostMirror(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp< + ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } } -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl // Create a mirror view in host space @@ -985,80 +927,53 @@ struct ViewCopy, } // namespace Impl -template +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::Experimental::DynamicView& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, + const Kokkos::Experimental::DynamicView& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - 
static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; -} -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, - const Kokkos::Experimental::DynamicView& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { - using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = - typename Impl::MirrorDynamicViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type( - arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - mirror.resize_serial(src.extent(0)); - if constexpr (alloc_prop_input::has_execution_space) { 
- deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = + typename Impl::MirrorDynamicViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type( + arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); + mirror.resize_serial(src.extent(0)); + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } } -template +template ::value>> auto create_mirror_view_and_copy( const Space&, const Kokkos::Experimental::DynamicView& src, std::string const& name = "") { diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 91a7e4a9273..b2db436b4a9 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1841,45 +1841,31 @@ struct MirrorOffsetType { } // namespace Impl namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space, - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return typename Kokkos::Experimental::OffsetView::HostMirror( - 
Kokkos::create_mirror(arg_prop, src.view()), src.begins()); -} -template ::has_memory_space>> +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc +template inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { - using alloc_prop_input = Impl::ViewCtorProp; - using Space = typename Impl::ViewCtorProp::memory_space; + check_view_ctor_args_create_mirror(); - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using Space = typename Impl::ViewCtorProp::memory_space; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + auto prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string(src.label()).append("_mirror")); - return typename Kokkos::Impl::MirrorOffsetType::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + return typename Kokkos::Impl::MirrorOffsetType::view_type( + prop_copy, src.layout(), + {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), + src.begin(5), src.begin(6), src.begin(7)}); + } else { + return typename Kokkos::Experimental::OffsetView::HostMirror( + Kokkos::create_mirror(arg_prop, src.view()), src.begins()); + } } + } // namespace Impl // Create a mirror in host space @@ -1921,67 +1907,41 @@ inline auto create_mirror( } namespace Impl { -template -inline std::enable_if_t< - 
!Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp&) { - return src; -} +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp&) { - return src; +inline auto create_mirror_view( + const Kokkos::Experimental::OffsetView& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::Experimental::OffsetView< + 
T, P...>::HostMirror::data_type>::value) { + return + typename Kokkos::Experimental::OffsetView::HostMirror(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorOffsetViewType::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } } -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl // Create a mirror view in host space diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 40fdd590f6f..7dce03c7124 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3456,6 +3456,7 @@ struct MirrorType { using view_type = Kokkos::View; }; +// collection of static asserts for create_mirror and create_mirror_view template void check_view_ctor_args_create_mirror() { using alloc_prop_input = Impl::ViewCtorProp; @@ -3474,36 +3475,26 @@ void check_view_ctor_args_create_mirror() { "not explicitly allow padding!"); } +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t::has_memory_space, - typename Kokkos::View::HostMirror> -create_mirror(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - using src_type = View; - using dst_type = typename src_type::HostMirror; - - check_view_ctor_args_create_mirror(); - - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); - - return dst_type(prop_copy, src.layout()); -} - -// 
Create a mirror in a new space (specialization for different space) -template ::has_memory_space>> -auto create_mirror(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { +inline auto create_mirror(const Kokkos::View& src, + const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - using alloc_prop = decltype(prop_copy); - return typename Impl::MirrorType::view_type(prop_copy, src.layout()); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using memory_space = typename decltype(prop_copy)::memory_space; + using dst_type = + typename Impl::MirrorType::view_type; + return dst_type(prop_copy, src.layout()); + } else { + using dst_type = typename View::HostMirror; + return dst_type(prop_copy, src.layout()); + } } } // namespace Impl @@ -3561,66 +3552,37 @@ create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::View::memory_space, - typename Kokkos::View::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp&) { - check_view_ctor_args_create_mirror(); - return src; -} - -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp& 
arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -// Create a mirror view in a new space (specialization for same space) -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp&) { - check_view_ctor_args_create_mirror(); - return src; -} - -// Create a mirror view in a new space (specialization for different space) -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); +inline auto create_mirror_view( + const Kokkos::View& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::View< + T, P...>::HostMirror::data_type>::value) { + check_view_ctor_args_create_mirror(); + return typename Kokkos::View::HostMirror(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorViewType::memory_space, + T, P...>::is_same_memspace) { + check_view_ctor_args_create_mirror(); + return typename Impl::MirrorViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } } } // namespace Impl @@ -3691,16 +3653,13 @@ auto create_mirror_view(const Impl::ViewCtorProp& arg_prop, return Impl::create_mirror_view(src, arg_prop); } -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::View& 
src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { +namespace Impl { + +// collection of static asserts for create_mirror_view_and_copy +template +void check_view_ctor_args_create_mirror_view_and_copy() { using alloc_prop_input = Impl::ViewCtorProp; + static_assert( alloc_prop_input::has_memory_space, "The view constructor arguments passed to " @@ -3713,52 +3672,49 @@ auto create_mirror_view_and_copy( "The view constructor arguments passed to " "Kokkos::create_mirror_view_and_copy must " "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; } -template +} // namespace Impl + +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, - const Kokkos::View& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, + const Kokkos::View& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename 
alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + + Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorViewType::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } } // Previously when using auto here, the intel compiler 19.3 would From 7c67b020c661955e38499f53dfd436642e2ead9e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 24 May 2024 15:17:44 -0600 Subject: [PATCH 069/103] Workaround icpc warnings Workaround "missing return statement at end of non-void function" warnings triggering -Werror with intel classic compilers Address issue #7031 Co-authored-by: Damien L-G --- containers/src/Kokkos_DynRankView.hpp | 
6 ++++++ containers/src/Kokkos_DynamicView.hpp | 6 ++++++ containers/src/Kokkos_OffsetView.hpp | 6 ++++++ core/src/Kokkos_CopyViews.hpp | 6 ++++++ 4 files changed, 24 insertions(+) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index dadd4535953..0af479590e7 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1958,6 +1958,9 @@ inline auto create_mirror(const DynRankView& src, return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl @@ -2055,6 +2058,9 @@ inline auto create_mirror_view( return Kokkos::Impl::create_mirror(src, arg_prop); } } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index ff4b5dc171d..8e29042ace2 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -621,6 +621,9 @@ inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, return ret; } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl @@ -697,6 +700,9 @@ inline auto create_mirror_view( return Kokkos::Impl::create_mirror(src, arg_prop); } } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index b2db436b4a9..720e71b8c16 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1864,6 +1864,9 @@ inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, return typename Kokkos::Experimental::OffsetView::HostMirror( Kokkos::create_mirror(arg_prop, src.view()), src.begins()); } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl @@ -1940,6 +1943,9 
@@ inline auto create_mirror_view( return Kokkos::Impl::create_mirror(src, arg_prop); } } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 7dce03c7124..fbd6668a611 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3495,6 +3495,9 @@ inline auto create_mirror(const Kokkos::View& src, using dst_type = typename View::HostMirror; return dst_type(prop_copy, src.layout()); } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl @@ -3583,6 +3586,9 @@ inline auto create_mirror_view( return Kokkos::Impl::create_mirror(src, arg_prop); } } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } } // namespace Impl From cf791bc2eb814c232ca62f0a7302d7947466c463 Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Fri, 24 May 2024 17:06:36 -0500 Subject: [PATCH 070/103] Adding `Kokkos::to_array` (#6375) * Added Kokkos::to_Array for Kokkos::Array (analogous to std::to_array for std::array) * Changed KOKKOS_INLINE_FUNCTION into KOKKOS_FUNCTION (for inline template functions) * Added back to_Array tests * Fixed superfluous KOKKOS_INLINE_FUNCTION * Fix for "set but not used" error for variables in a static_assert under nvcc * Guard against Kokkos::to_Array({0, 1, 3}), as that is not supported under gcc8 * Renamed to_Array to to_array and to_Array_impl to to_array_impl --- core/src/Kokkos_Array.hpp | 26 ++++++++++++++++++++++++++ core/unit_test/TestArray.cpp | 30 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 0a1ced93c8f..26e8a12be11 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -356,6 +356,32 @@ struct KOKKOS_DEPRECATED template Array(T, Us...)->Array; +namespace Impl { + +template +KOKKOS_FUNCTION constexpr Array, N> to_array_impl( + T (&a)[N],
std::index_sequence) { + return {{a[I]...}}; +} + +template +KOKKOS_FUNCTION constexpr Array, N> to_array_impl( + T(&&a)[N], std::index_sequence) { + return {{std::move(a[I])...}}; +} + +} // namespace Impl + +template +KOKKOS_FUNCTION constexpr auto to_array(T (&a)[N]) { + return Impl::to_array_impl(a, std::make_index_sequence{}); +} + +template +KOKKOS_FUNCTION constexpr auto to_array(T(&&a)[N]) { + return Impl::to_array_impl(std::move(a), std::make_index_sequence{}); +} + } // namespace Kokkos // diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index d20d355b792..fb6334322b7 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -18,6 +18,11 @@ namespace { +// nvcc errors on variables only used in static_asserts +// Passing those variables to this function should eliminate the warning +template +KOKKOS_FUNCTION constexpr void maybe_unused(Ts&&...) {} + KOKKOS_FUNCTION constexpr bool test_array() { constexpr Kokkos::Array a{{1, 2}}; @@ -160,4 +165,29 @@ constexpr bool test_array_specialization_kokkos_swap() { static_assert(test_array_specialization_kokkos_swap()); +constexpr bool test_to_array() { + // copies a string literal + [[maybe_unused]] auto a1 = Kokkos::to_array("foo"); + static_assert(a1.size() == 4); + maybe_unused(a1); + + // deduces both element type and length + [[maybe_unused]] auto a2 = Kokkos::to_array({0, 2, 1, 3}); + static_assert(std::is_same_v>); + maybe_unused(a2); + +// gcc8 doesn't support the implicit conversion +#if !defined(KOKKOS_COMPILER_GNU) || (KOKKOS_COMPILER_GNU >= 910) + // deduces length with element type specified + // implicit conversion happens + [[maybe_unused]] auto a3 = Kokkos::to_array({0, 1, 3}); + static_assert(std::is_same_v>); + maybe_unused(a3); +#endif + + return true; +} + +static_assert(test_to_array()); + } // namespace From bd107d83a01af03d7abc202195104f598553130f Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Wed, 27 Mar 2024 15:56:29 +0100 Subject: [PATCH 
071/103] Add assertion in adjacent_difference with overlapping source and destination --- .../Kokkos_AdjacentDifference.hpp | 4 +++ .../impl/Kokkos_Constraints.hpp | 16 ++++++++++++ .../TestStdAlgorithmsAdjacentDifference.cpp | 26 +++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp index f254686dbaf..ff736de52aa 100644 --- a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp @@ -109,6 +109,7 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + Impl::expect_not_identical(view_from, view_dest); using view_type1 = ::Kokkos::View; using view_type2 = ::Kokkos::View; @@ -134,6 +135,7 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + Impl::expect_not_identical(view_from, view_dest); return Impl::adjacent_difference_exespace_impl( "Kokkos::adjacent_difference_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bin_op); @@ -150,6 +152,7 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + Impl::expect_not_identical(view_from, view_dest); using view_type1 = ::Kokkos::View; using view_type2 = ::Kokkos::View; @@ -176,6 +179,7 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + 
Impl::expect_not_identical(view_from, view_dest); return Impl::adjacent_difference_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bin_op); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 27ce5a6fad6..b44b18365d9 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -215,6 +215,22 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, (void)last; } +// +// views are not the same +// +template < + typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2> +KOKKOS_INLINE_FUNCTION void expect_not_identical( + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest) { + + // this is a no-op for release + KOKKOS_EXPECTS( view_dest != view_from ); + // avoid compiler complaining when KOKKOS_EXPECTS is no-op + (void)view_from; + (void)view_dest; +} } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 75ad533f6ee..19c7487bfa2 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace Test { namespace stdalgos { @@ -230,6 +231,31 @@ void run_single_scenario(const InfoType& scenario_info, } Kokkos::fence(); + + #if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ + defined(KOKKOS_ENABLE_DEBUG) + { + auto view_dest = view_from; + EXPECT_DEATH( + {KE::adjacent_difference(exespace(), view_from, + view_dest, args...); + Kokkos::fence(); + }, + "Kokkos contract violation:.*"); + } + + { + auto view_dest = view_from; + EXPECT_DEATH( + {KE::adjacent_difference("label", exespace(), view_from, + 
view_dest, args...); + Kokkos::fence(); + }, + "Kokkos contract violation:.*"); + } + #endif + + Kokkos::fence(); } template From 82511c455cb1ce563e8981afe7d592f930bc2a9c Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Wed, 3 Apr 2024 15:30:36 +0200 Subject: [PATCH 072/103] Disallow overlap not identical inputs --- .../Kokkos_AdjacentDifference.hpp | 4 --- .../impl/Kokkos_AdjacentDifference.hpp | 24 ++++++++++++++--- .../impl/Kokkos_Constraints.hpp | 16 ------------ .../TestStdAlgorithmsAdjacentDifference.cpp | 26 ------------------- 4 files changed, 20 insertions(+), 50 deletions(-) diff --git a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp index ff736de52aa..f254686dbaf 100644 --- a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp @@ -109,7 +109,6 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - Impl::expect_not_identical(view_from, view_dest); using view_type1 = ::Kokkos::View; using view_type2 = ::Kokkos::View; @@ -135,7 +134,6 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - Impl::expect_not_identical(view_from, view_dest); return Impl::adjacent_difference_exespace_impl( "Kokkos::adjacent_difference_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bin_op); @@ -152,7 +150,6 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - Impl::expect_not_identical(view_from, view_dest); using view_type1 = 
::Kokkos::View; using view_type2 = ::Kokkos::View; @@ -179,7 +176,6 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - Impl::expect_not_identical(view_from, view_dest); return Impl::adjacent_difference_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bin_op); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index a8171fa068d..95b8d7ff9fe 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -21,6 +21,7 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include +#include #include namespace Kokkos { @@ -78,13 +79,21 @@ OutputIteratorType adjacent_difference_exespace_impl( first_dest); Impl::expect_valid_range(first_from, last_from); + // ranges shall not overlap + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); +#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ + defined(KOKKOS_ENABLE_DEBUG) + auto last_dest = first_dest + num_elements; + auto found_first = Kokkos::Experimental::find_first_of(ex, first_from, last_from, first_dest, last_dest); + KOKKOS_EXPECTS(found_first == last_from); +#endif + if (first_from == last_from) { return first_dest; } // run - const auto num_elements = - Kokkos::Experimental::distance(first_from, last_from); ::Kokkos::parallel_for( label, RangePolicy(ex, 0, num_elements), StdAdjacentDiffFunctor(first_from, first_dest, bin_op)); @@ -109,14 +118,21 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( Impl::static_assert_iterators_have_matching_difference_type(first_from, first_dest); Impl::expect_valid_range(first_from, last_from); + // ranges shall not overlap 
+ const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + auto last_dest = first_dest + num_elements; +#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ + defined(KOKKOS_ENABLE_DEBUG) + auto found_first = Kokkos::Experimental::find_first_of(teamHandle, first_from, last_from, first_dest, last_dest); + KOKKOS_EXPECTS(found_first == last_from); +#endif if (first_from == last_from) { return first_dest; } // run - const auto num_elements = - Kokkos::Experimental::distance(first_from, last_from); ::Kokkos::parallel_for( TeamThreadRange(teamHandle, 0, num_elements), StdAdjacentDiffFunctor(first_from, first_dest, bin_op)); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index b44b18365d9..27ce5a6fad6 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -215,22 +215,6 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, (void)last; } -// -// views are not the same -// -template < - typename DataType1, typename... Properties1, - typename DataType2, typename... 
Properties2> -KOKKOS_INLINE_FUNCTION void expect_not_identical( - const ::Kokkos::View& view_from, - const ::Kokkos::View& view_dest) { - - // this is a no-op for release - KOKKOS_EXPECTS( view_dest != view_from ); - // avoid compiler complaining when KOKKOS_EXPECTS is no-op - (void)view_from; - (void)view_dest; -} } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 19c7487bfa2..75ad533f6ee 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace Test { namespace stdalgos { @@ -231,31 +230,6 @@ void run_single_scenario(const InfoType& scenario_info, } Kokkos::fence(); - - #if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ - defined(KOKKOS_ENABLE_DEBUG) - { - auto view_dest = view_from; - EXPECT_DEATH( - {KE::adjacent_difference(exespace(), view_from, - view_dest, args...); - Kokkos::fence(); - }, - "Kokkos contract violation:.*"); - } - - { - auto view_dest = view_from; - EXPECT_DEATH( - {KE::adjacent_difference("label", exespace(), view_from, - view_dest, args...); - Kokkos::fence(); - }, - "Kokkos contract violation:.*"); - } - #endif - - Kokkos::fence(); } template From 2b66300658f4c4da15c32c5806937cecb9862fce Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Wed, 3 Apr 2024 15:51:39 +0200 Subject: [PATCH 073/103] check equality of first and last in advance --- .../impl/Kokkos_AdjacentDifference.hpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index 95b8d7ff9fe..acd4e00da33 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ 
b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -79,6 +79,10 @@ OutputIteratorType adjacent_difference_exespace_impl( first_dest); Impl::expect_valid_range(first_from, last_from); + if (first_from == last_from) { + return first_dest; + } + // ranges shall not overlap const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); @@ -89,10 +93,6 @@ OutputIteratorType adjacent_difference_exespace_impl( KOKKOS_EXPECTS(found_first == last_from); #endif - if (first_from == last_from) { - return first_dest; - } - // run ::Kokkos::parallel_for( label, RangePolicy(ex, 0, num_elements), @@ -118,6 +118,11 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( Impl::static_assert_iterators_have_matching_difference_type(first_from, first_dest); Impl::expect_valid_range(first_from, last_from); + + if (first_from == last_from) { + return first_dest; + } + // ranges shall not overlap const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); @@ -128,10 +133,6 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( KOKKOS_EXPECTS(found_first == last_from); #endif - if (first_from == last_from) { - return first_dest; - } - // run ::Kokkos::parallel_for( TeamThreadRange(teamHandle, 0, num_elements), From f9939f66f79540cc1f2a55ab49cb2ab8846fc1f7 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Fri, 5 Apr 2024 10:17:57 +0200 Subject: [PATCH 074/103] Test overlap only if iterators are convertible --- .../impl/Kokkos_AdjacentDifference.hpp | 13 ++----------- .../src/std_algorithms/impl/Kokkos_Constraints.hpp | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index acd4e00da33..3f9b4a43c50 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp 
@@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -86,12 +85,8 @@ OutputIteratorType adjacent_difference_exespace_impl( // ranges shall not overlap const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); -#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ - defined(KOKKOS_ENABLE_DEBUG) auto last_dest = first_dest + num_elements; - auto found_first = Kokkos::Experimental::find_first_of(ex, first_from, last_from, first_dest, last_dest); - KOKKOS_EXPECTS(found_first == last_from); -#endif + Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); // run ::Kokkos::parallel_for( @@ -127,11 +122,7 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); auto last_dest = first_dest + num_elements; -#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ - defined(KOKKOS_ENABLE_DEBUG) - auto found_first = Kokkos::Experimental::find_first_of(teamHandle, first_from, last_from, first_dest, last_dest); - KOKKOS_EXPECTS(found_first == last_from); -#endif + Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); // run ::Kokkos::parallel_for( diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 27ce5a6fad6..321b59ce691 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -215,6 +215,20 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, (void)last; } +// +// Check if iterators are overlapped +// +template +KOKKOS_INLINE_FUNCTION void expect_no_overlap(IteratorType1 first, + IteratorType1 last, + IteratorType2 s_first, + IteratorType2 s_last) { + if constexpr( std::is_constructible_v ) { + IteratorType1 s_first1(s_first), s_last1(s_last); + 
KOKKOS_EXPECTS(first < s_first1 && s_last1 < last); + } +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos From 2acd84fc9a6ecbac2d115ff0266417f933e59170 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Fri, 5 Apr 2024 18:18:52 +0200 Subject: [PATCH 075/103] add else if for the opposite case --- algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 321b59ce691..0b54fcf9fe9 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -220,12 +220,15 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, // template KOKKOS_INLINE_FUNCTION void expect_no_overlap(IteratorType1 first, - IteratorType1 last, + IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last) { - if constexpr( std::is_constructible_v ) { + if constexpr (std::is_constructible_v) { IteratorType1 s_first1(s_first), s_last1(s_last); KOKKOS_EXPECTS(first < s_first1 && s_last1 < last); + } else if constexpr (std::is_constructible_v) { + IteratorType2 first2(first), last2(last); + KOKKOS_EXPECTS(first2 < s_first && s_last < last2); } } From 66c5df1bfeb26e773b83dbb4da4b8abc66d7b846 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Mon, 8 Apr 2024 09:11:40 +0200 Subject: [PATCH 076/103] fix check for overlapping iterators --- algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 0b54fcf9fe9..00bc8267ad7 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -225,10 +225,10 @@ KOKKOS_INLINE_FUNCTION void 
expect_no_overlap(IteratorType1 first, IteratorType2 s_last) { if constexpr (std::is_constructible_v) { IteratorType1 s_first1(s_first), s_last1(s_last); - KOKKOS_EXPECTS(first < s_first1 && s_last1 < last); + KOKKOS_EXPECTS(first > s_last1 || last < s_last1); } else if constexpr (std::is_constructible_v) { IteratorType2 first2(first), last2(last); - KOKKOS_EXPECTS(first2 < s_first && s_last < last2); + KOKKOS_EXPECTS(first2 > s_last || last2 < s_last); } } From a54a3fdc9f87915782e220dc5a9a2c9f6a32e6ac Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Mon, 8 Apr 2024 10:45:13 +0200 Subject: [PATCH 077/103] avoid no-op in no-overlap check --- algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 00bc8267ad7..d15d73206bf 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -230,6 +230,11 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap(IteratorType1 first, IteratorType2 first2(first), last2(last); KOKKOS_EXPECTS(first2 > s_last || last2 < s_last); } + // avoid compiler complaining when KOKKOS_EXPECTS is no-op + (void)first; + (void)last; + (void)s_first; + (void)s_last; } } // namespace Impl From 417b1271594df5993fbbdf1ada0788be16a8693d Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Mon, 8 Apr 2024 15:20:45 +0200 Subject: [PATCH 078/103] fix check conditions for overlapping iterators --- .../impl/Kokkos_Constraints.hpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index d15d73206bf..6547f11c531 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -216,25 
+216,20 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, } // -// Check if iterators are overlapped +// Check if iterators are overlapping // template -KOKKOS_INLINE_FUNCTION void expect_no_overlap(IteratorType1 first, - IteratorType1 last, - IteratorType2 s_first, - IteratorType2 s_last) { +KOKKOS_INLINE_FUNCTION void expect_no_overlap([[maybe_unused]] IteratorType1 first, + [[maybe_unused]] IteratorType1 last, + [[maybe_unused]] IteratorType2 s_first, + [[maybe_unused]] IteratorType2 s_last) { if constexpr (std::is_constructible_v) { IteratorType1 s_first1(s_first), s_last1(s_last); - KOKKOS_EXPECTS(first > s_last1 || last < s_last1); + KOKKOS_EXPECTS(first > s_last1 || last < s_first1); } else if constexpr (std::is_constructible_v) { IteratorType2 first2(first), last2(last); - KOKKOS_EXPECTS(first2 > s_last || last2 < s_last); + KOKKOS_EXPECTS(first2 > s_last || last2 < s_first); } - // avoid compiler complaining when KOKKOS_EXPECTS is no-op - (void)first; - (void)last; - (void)s_first; - (void)s_last; } } // namespace Impl From 629b0ee363c60a8a7cf826fd9bdf11a0cd13bdf4 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Mon, 8 Apr 2024 15:26:38 +0200 Subject: [PATCH 079/103] Formatting expect_no_overlap --- algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 6547f11c531..aa2e0da52d2 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -219,10 +219,10 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, // Check if iterators are overlapping // template -KOKKOS_INLINE_FUNCTION void expect_no_overlap([[maybe_unused]] IteratorType1 first, - [[maybe_unused]] IteratorType1 last, - [[maybe_unused]] IteratorType2 s_first, - [[maybe_unused]] 
IteratorType2 s_last) { +KOKKOS_INLINE_FUNCTION void expect_no_overlap( + [[maybe_unused]] IteratorType1 first, [[maybe_unused]] IteratorType1 last, + [[maybe_unused]] IteratorType2 s_first, + [[maybe_unused]] IteratorType2 s_last) { if constexpr (std::is_constructible_v) { IteratorType1 s_first1(s_first), s_last1(s_last); KOKKOS_EXPECTS(first > s_last1 || last < s_first1); From 2d111444c3ee945127312e9105bfce9d4e5b37c0 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 12:03:39 +0200 Subject: [PATCH 080/103] fix check conditions for overlapping iterators --- .../impl/Kokkos_Constraints.hpp | 43 ++++++++++++++++--- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index aa2e0da52d2..e1dbff15cb9 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -24,6 +24,9 @@ namespace Kokkos { namespace Experimental { namespace Impl { +template +class RandomAccessIterator; + template struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; @@ -58,6 +61,18 @@ using is_iterator = Kokkos::is_detected; template inline constexpr bool is_iterator_v = is_iterator::value; +template +struct is_kokkos_iterator : std::false_type {}; + +template +struct is_kokkos_iterator> { + static constexpr bool value = + is_admissible_to_kokkos_std_algorithms::value; +}; + +template +inline constexpr bool is_kokkos_iterator_v = is_kokkos_iterator::value; + // // are_iterators // @@ -216,19 +231,33 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, } // -// Check if iterators are overlapping +// Check if kokkos iterators are overlapping // template KOKKOS_INLINE_FUNCTION void expect_no_overlap( [[maybe_unused]] IteratorType1 first, [[maybe_unused]] IteratorType1 last, [[maybe_unused]] IteratorType2 s_first, [[maybe_unused]] IteratorType2 s_last) { 
- if constexpr (std::is_constructible_v) { - IteratorType1 s_first1(s_first), s_last1(s_last); - KOKKOS_EXPECTS(first > s_last1 || last < s_first1); - } else if constexpr (std::is_constructible_v) { - IteratorType2 first2(first), last2(last); - KOKKOS_EXPECTS(first2 > s_last || last2 < s_first); + if constexpr (is_kokkos_iterator_v && + is_kokkos_iterator_v) { + if constexpr (std::is_constructible_v) { + IteratorType2 first2(first), last2(last); + IteratorType2 next_first2 = first2; + ptrdiff_t stride = &*(++next_first2) - &*first2; + ptrdiff_t first_diff = &*first2 - &*s_first; + bool is_no_overlap = first_diff % stride; + KOKKOS_EXPECTS((&*first2 >= &*s_last || &*last2 <= &*s_first) || + is_no_overlap); + } else if constexpr (std::is_constructible_v) { + IteratorType1 s_first1(s_first), s_last1(s_last); + IteratorType1 next_first = first; + ptrdiff_t stride = &*(++next_first) - &*first; + ptrdiff_t first_diff = &*first - &*s_first1; + bool is_no_overlap = first_diff % stride; + KOKKOS_EXPECTS((&*first >= &*s_last1 || &*last <= &*s_first1) || + is_no_overlap); + } } } From c1cc41a5772f5f7fea4b543db2bfd202832095b9 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 15:12:23 +0200 Subject: [PATCH 081/103] fix conflicts --- .../Kokkos_AdjacentDifference.hpp | 4 +++ .../TestStdAlgorithmsAdjacentDifference.cpp | 26 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp index f254686dbaf..ff736de52aa 100644 --- a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp @@ -109,6 +109,7 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + Impl::expect_not_identical(view_from, view_dest); using 
view_type1 = ::Kokkos::View; using view_type2 = ::Kokkos::View; @@ -134,6 +135,7 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + Impl::expect_not_identical(view_from, view_dest); return Impl::adjacent_difference_exespace_impl( "Kokkos::adjacent_difference_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bin_op); @@ -150,6 +152,7 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + Impl::expect_not_identical(view_from, view_dest); using view_type1 = ::Kokkos::View; using view_type2 = ::Kokkos::View; @@ -176,6 +179,7 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + Impl::expect_not_identical(view_from, view_dest); return Impl::adjacent_difference_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bin_op); diff --git a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 75ad533f6ee..19c7487bfa2 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace Test { namespace stdalgos { @@ -230,6 +231,31 @@ void run_single_scenario(const InfoType& scenario_info, } Kokkos::fence(); + + #if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ + defined(KOKKOS_ENABLE_DEBUG) + { + auto view_dest = view_from; + EXPECT_DEATH( + {KE::adjacent_difference(exespace(), view_from, + view_dest, args...); + 
Kokkos::fence(); + }, + "Kokkos contract violation:.*"); + } + + { + auto view_dest = view_from; + EXPECT_DEATH( + {KE::adjacent_difference("label", exespace(), view_from, + view_dest, args...); + Kokkos::fence(); + }, + "Kokkos contract violation:.*"); + } + #endif + + Kokkos::fence(); } template From 1126fedbf4a53d34df722225ad8981fad21223d3 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 15:20:24 +0200 Subject: [PATCH 082/103] fix conflicts --- .../Kokkos_AdjacentDifference.hpp | 4 --- .../impl/Kokkos_AdjacentDifference.hpp | 15 ++++++----- .../TestStdAlgorithmsAdjacentDifference.cpp | 26 ------------------- 3 files changed, 8 insertions(+), 37 deletions(-) diff --git a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp index ff736de52aa..f254686dbaf 100644 --- a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp @@ -109,7 +109,6 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - Impl::expect_not_identical(view_from, view_dest); using view_type1 = ::Kokkos::View; using view_type2 = ::Kokkos::View; @@ -135,7 +134,6 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - Impl::expect_not_identical(view_from, view_dest); return Impl::adjacent_difference_exespace_impl( "Kokkos::adjacent_difference_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bin_op); @@ -152,7 +150,6 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - Impl::expect_not_identical(view_from, view_dest); using view_type1 = ::Kokkos::View; using view_type2 = ::Kokkos::View; @@ -179,7 +176,6 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - Impl::expect_not_identical(view_from, view_dest); return Impl::adjacent_difference_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bin_op); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index 3f9b4a43c50..dd772052f90 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -21,6 +21,7 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include +#include #include namespace Kokkos { @@ -78,9 +79,9 @@ OutputIteratorType adjacent_difference_exespace_impl( first_dest); Impl::expect_valid_range(first_from, last_from); - if (first_from == last_from) { - return first_dest; - } + // ranges shall not overlap + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); // ranges shall not overlap const auto num_elements = @@ -113,10 +114,10 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( Impl::static_assert_iterators_have_matching_difference_type(first_from, first_dest); Impl::expect_valid_range(first_from, last_from); - - if (first_from == last_from) { - return first_dest; - } + // ranges shall not overlap + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + auto last_dest = first_dest + num_elements; // ranges shall not overlap const auto num_elements = diff --git 
a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 19c7487bfa2..75ad533f6ee 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace Test { namespace stdalgos { @@ -231,31 +230,6 @@ void run_single_scenario(const InfoType& scenario_info, } Kokkos::fence(); - - #if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ - defined(KOKKOS_ENABLE_DEBUG) - { - auto view_dest = view_from; - EXPECT_DEATH( - {KE::adjacent_difference(exespace(), view_from, - view_dest, args...); - Kokkos::fence(); - }, - "Kokkos contract violation:.*"); - } - - { - auto view_dest = view_from; - EXPECT_DEATH( - {KE::adjacent_difference("label", exespace(), view_from, - view_dest, args...); - Kokkos::fence(); - }, - "Kokkos contract violation:.*"); - } - #endif - - Kokkos::fence(); } template From 70cbe747ed1143e04d7d9f92909680e0a0dbe926 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 15:23:02 +0200 Subject: [PATCH 083/103] fix conflicts --- .../impl/Kokkos_AdjacentDifference.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index dd772052f90..38b101723b8 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -79,9 +79,9 @@ OutputIteratorType adjacent_difference_exespace_impl( first_dest); Impl::expect_valid_range(first_from, last_from); - // ranges shall not overlap - const auto num_elements = - Kokkos::Experimental::distance(first_from, last_from); + if (first_from == last_from) { + return first_dest; + } // ranges shall not overlap const auto num_elements = @@ -114,10 +114,10 @@ 
KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( Impl::static_assert_iterators_have_matching_difference_type(first_from, first_dest); Impl::expect_valid_range(first_from, last_from); - // ranges shall not overlap - const auto num_elements = - Kokkos::Experimental::distance(first_from, last_from); - auto last_dest = first_dest + num_elements; + + if (first_from == last_from) { + return first_dest; + } // ranges shall not overlap const auto num_elements = From cddbec55263406fac3fb0bb3a5aea3f570a33722 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 15:24:43 +0200 Subject: [PATCH 084/103] fix conflicts --- algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index 38b101723b8..3f9b4a43c50 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { From e7da00edb65b622c320e8288c3442fb024273abe Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Mon, 8 Apr 2024 10:45:13 +0200 Subject: [PATCH 085/103] avoid no-op in no-overlap check --- algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index e1dbff15cb9..6940e5854d6 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -259,6 +259,11 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( is_no_overlap); } } + // avoid compiler complaining when KOKKOS_EXPECTS is no-op + (void)first; + (void)last; + (void)s_first; + 
(void)s_last; } } // namespace Impl From 62a06ec24b80b4361e13fc6d77c6cd61f5eaac2a Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 13:30:20 +0200 Subject: [PATCH 086/103] Add maybe unused --- .../impl/Kokkos_Constraints.hpp | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 6940e5854d6..a1277e45a71 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -32,13 +32,13 @@ struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template struct is_admissible_to_kokkos_std_algorithms< - T, std::enable_if_t< ::Kokkos::is_view::value && T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value)> > + T, std::enable_if_t<::Kokkos::is_view::value && T::rank() == 1 && + (std::is_same::value || + std::is_same::value || + std::is_same::value)>> : std::true_type {}; template @@ -242,19 +242,19 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( is_kokkos_iterator_v) { if constexpr (std::is_constructible_v) { IteratorType2 first2(first), last2(last); - IteratorType2 next_first2 = first2; - ptrdiff_t stride = &*(++next_first2) - &*first2; - ptrdiff_t first_diff = &*first2 - &*s_first; - bool is_no_overlap = first_diff % stride; + IteratorType2 next_first2 = first2; + ptrdiff_t stride = &*(++next_first2) - &*first2; + ptrdiff_t first_diff = &*first2 - &*s_first; + [[maybe_unused]] bool is_no_overlap = first_diff % stride; KOKKOS_EXPECTS((&*first2 >= &*s_last || &*last2 <= &*s_first) || is_no_overlap); } else if constexpr (std::is_constructible_v) { IteratorType1 s_first1(s_first), s_last1(s_last); - IteratorType1 next_first = first; - ptrdiff_t stride = &*(++next_first) - &*first; - ptrdiff_t first_diff = &*first - &*s_first1; - bool is_no_overlap = first_diff % stride; + 
IteratorType1 next_first = first; + ptrdiff_t stride = &*(++next_first) - &*first; + ptrdiff_t first_diff = &*first - &*s_first1; + [[maybe_unused]] bool is_no_overlap = first_diff % stride; KOKKOS_EXPECTS((&*first >= &*s_last1 || &*last <= &*s_first1) || is_no_overlap); } From da24fdd41aa6863c8036c24cb5f2af5d11c89634 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 13:50:48 +0200 Subject: [PATCH 087/103] Check must be made on exec space --- .../src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index 3f9b4a43c50..a72b63f47e7 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -86,7 +86,11 @@ OutputIteratorType adjacent_difference_exespace_impl( const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); auto last_dest = first_dest + num_elements; - Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); + // Iterators must be accessed from Execution space + ::Kokkos::parallel_for( + "Check", RangePolicy(ex, 0, 1), KOKKOS_LAMBDA(int) { + Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); + }); // run ::Kokkos::parallel_for( From 07b871fc34317ba2e8e0aac7cd7db419739e64b3 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 15:33:12 +0200 Subject: [PATCH 088/103] remove unused lines --- algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index a1277e45a71..41bce871a97 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ 
-259,11 +259,6 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( is_no_overlap); } } - // avoid compiler complaining when KOKKOS_EXPECTS is no-op - (void)first; - (void)last; - (void)s_first; - (void)s_last; } } // namespace Impl From 366ea745143f346ef25055eaf9b9e437cb580925 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 19:01:35 +0200 Subject: [PATCH 089/103] Enable checks for debug mode only --- .../std_algorithms/impl/Kokkos_AdjacentDifference.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index a72b63f47e7..4414694ba73 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -85,12 +85,16 @@ OutputIteratorType adjacent_difference_exespace_impl( // ranges shall not overlap const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); + +#ifdef KOKKOS_ENABLE_DEBUG auto last_dest = first_dest + num_elements; // Iterators must be accessed from Execution space ::Kokkos::parallel_for( - "Check", RangePolicy(ex, 0, 1), KOKKOS_LAMBDA(int) { + "Kokkos::adjacent_difference::check_no_overlap", + RangePolicy(ex, 0, 1), KOKKOS_LAMBDA(int) { Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); }); +#endif // run ::Kokkos::parallel_for( @@ -125,8 +129,11 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( // ranges shall not overlap const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); + +#ifdef KOKKOS_ENABLE_DEBUG auto last_dest = first_dest + num_elements; Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); +#endif // run ::Kokkos::parallel_for( From 0b3358caaa583437836a00b887e9b1a94469f20d Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 19:02:45 +0200 Subject: [PATCH 090/103] 
suppress unnecessary iterator conversions --- .../impl/Kokkos_Constraints.hpp | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 41bce871a97..693a981f609 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -240,24 +240,12 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( [[maybe_unused]] IteratorType2 s_last) { if constexpr (is_kokkos_iterator_v && is_kokkos_iterator_v) { - if constexpr (std::is_constructible_v) { - IteratorType2 first2(first), last2(last); - IteratorType2 next_first2 = first2; - ptrdiff_t stride = &*(++next_first2) - &*first2; - ptrdiff_t first_diff = &*first2 - &*s_first; - [[maybe_unused]] bool is_no_overlap = first_diff % stride; - KOKKOS_EXPECTS((&*first2 >= &*s_last || &*last2 <= &*s_first) || - is_no_overlap); - } else if constexpr (std::is_constructible_v) { - IteratorType1 s_first1(s_first), s_last1(s_last); - IteratorType1 next_first = first; - ptrdiff_t stride = &*(++next_first) - &*first; - ptrdiff_t first_diff = &*first - &*s_first1; - [[maybe_unused]] bool is_no_overlap = first_diff % stride; - KOKKOS_EXPECTS((&*first >= &*s_last1 || &*last <= &*s_first1) || - is_no_overlap); - } + IteratorType1 next_first = first; + ptrdiff_t stride = &*(++next_first) - &*first; + ptrdiff_t first_diff = &*first - &*s_first; + bool is_no_overlap = first_diff % stride; + KOKKOS_EXPECTS((&*first >= &*s_last || &*last <= &*s_first) || + is_no_overlap); } } From 28d45d862261da744dd42d220451c8fbddf7d902 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 19:43:03 +0200 Subject: [PATCH 091/103] improve comments for checks --- .../src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index 4414694ba73..ff4347e5548 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -82,11 +82,11 @@ OutputIteratorType adjacent_difference_exespace_impl( return first_dest; } - // ranges shall not overlap const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); #ifdef KOKKOS_ENABLE_DEBUG + // check for overlapping iterators auto last_dest = first_dest + num_elements; // Iterators must be accessed from Execution space ::Kokkos::parallel_for( @@ -126,11 +126,11 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( return first_dest; } - // ranges shall not overlap const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); #ifdef KOKKOS_ENABLE_DEBUG + // check for overlapping iterators auto last_dest = first_dest + num_elements; Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); #endif From 167de91b46e75113b0bca673eda3fc4bd084e6f2 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Thu, 16 May 2024 19:52:18 +0200 Subject: [PATCH 092/103] fix iterator overlapping check --- .../src/std_algorithms/impl/Kokkos_Constraints.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 693a981f609..890103030c9 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -240,10 +240,13 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( [[maybe_unused]] IteratorType2 s_last) { if constexpr (is_kokkos_iterator_v && is_kokkos_iterator_v) { - IteratorType1 next_first = first; - ptrdiff_t stride = &*(++next_first) - &*first; - ptrdiff_t first_diff = 
&*first - &*s_first; - bool is_no_overlap = first_diff % stride; + IteratorType1 next_first = first; + IteratorType2 next_s_first = s_first; + ptrdiff_t stride1 = &*(++next_first) - &*first; + ptrdiff_t stride2 = &*(++next_s_first) - &*s_first; + ptrdiff_t first_diff = &*first - &*s_first; + [[maybe_unused]] bool is_no_overlap = + (first_diff % stride1) + (first_diff % stride2); KOKKOS_EXPECTS((&*first >= &*s_last || &*last <= &*s_first) || is_no_overlap); } From 027dc0df5c513d2c28bf0ae5aecbb31440be86d8 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Fri, 17 May 2024 09:10:00 +0200 Subject: [PATCH 093/103] fix iterator overlapping check --- .../src/std_algorithms/impl/Kokkos_Constraints.hpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 890103030c9..f8a0df2de9c 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -245,10 +245,15 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( ptrdiff_t stride1 = &*(++next_first) - &*first; ptrdiff_t stride2 = &*(++next_s_first) - &*s_first; ptrdiff_t first_diff = &*first - &*s_first; - [[maybe_unused]] bool is_no_overlap = - (first_diff % stride1) + (first_diff % stride2); - KOKKOS_EXPECTS((&*first >= &*s_last || &*last <= &*s_first) || - is_no_overlap); + // FIXME If strides are not identical, checks may not be made + // with the cost of O(1) + // Currently, checks are made only if strides are identical + // If first_diff == 0, there is already an overlap + if (stride1 == stride2 || first_diff == 0) { + [[maybe_unused]] bool is_no_overlap = (first_diff % stride1); + KOKKOS_EXPECTS((&*first >= &*s_last || &*last <= &*s_first) || + is_no_overlap); + } } } From bf6301a8b9f4d69a52ce2b7b96217b41a422129b Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Fri, 17 May 2024 09:49:20 +0200 Subject: 
[PATCH 094/103] fix iterator overlapping check --- algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index f8a0df2de9c..1506983b4e4 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -249,10 +249,13 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( // with the cost of O(1) // Currently, checks are made only if strides are identical // If first_diff == 0, there is already an overlap + // If one of strides is 1, first_diff is always divisible if (stride1 == stride2 || first_diff == 0) { [[maybe_unused]] bool is_no_overlap = (first_diff % stride1); KOKKOS_EXPECTS((&*first >= &*s_last || &*last <= &*s_first) || is_no_overlap); + } else if (stride1 == 1 || stride2 == 1) { + KOKKOS_EXPECTS(&*first >= &*s_last || &*last <= &*s_first); } } } From 7c38c17cbcd8f252b3ec8b3a3c146ecd62c7cd11 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Fri, 17 May 2024 11:25:24 +0200 Subject: [PATCH 095/103] Revert "fix iterator overlapping check" This reverts commit 7d4833a4e87320ff8e8189afaeb5a12ddf53c061. 
--- algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 1506983b4e4..f8a0df2de9c 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -249,13 +249,10 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( // with the cost of O(1) // Currently, checks are made only if strides are identical // If first_diff == 0, there is already an overlap - // If one of strides is 1, first_diff is always divisible if (stride1 == stride2 || first_diff == 0) { [[maybe_unused]] bool is_no_overlap = (first_diff % stride1); KOKKOS_EXPECTS((&*first >= &*s_last || &*last <= &*s_first) || is_no_overlap); - } else if (stride1 == 1 || stride2 == 1) { - KOKKOS_EXPECTS(&*first >= &*s_last || &*last <= &*s_first); } } } From 594180fef6a8101efa64cb550588b7d02d8bb526 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Tue, 21 May 2024 19:06:03 +0200 Subject: [PATCH 096/103] Use internval views to check iterator overlaps --- .../impl/Kokkos_AdjacentDifference.hpp | 7 +------ .../std_algorithms/impl/Kokkos_Constraints.hpp | 15 +++++++++------ .../impl/Kokkos_RandomAccessIterator.hpp | 7 +++++++ 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index ff4347e5548..2d443230b76 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -88,12 +88,7 @@ OutputIteratorType adjacent_difference_exespace_impl( #ifdef KOKKOS_ENABLE_DEBUG // check for overlapping iterators auto last_dest = first_dest + num_elements; - // Iterators must be accessed from Execution space - ::Kokkos::parallel_for( - 
"Kokkos::adjacent_difference::check_no_overlap", - RangePolicy(ex, 0, 1), KOKKOS_LAMBDA(int) { - Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); - }); + Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); #endif // run diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index f8a0df2de9c..5b5f26a24f1 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -240,18 +240,21 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( [[maybe_unused]] IteratorType2 s_last) { if constexpr (is_kokkos_iterator_v && is_kokkos_iterator_v) { - IteratorType1 next_first = first; - IteratorType2 next_s_first = s_first; - ptrdiff_t stride1 = &*(++next_first) - &*first; - ptrdiff_t stride2 = &*(++next_s_first) - &*s_first; - ptrdiff_t first_diff = &*first - &*s_first; + auto const view = first.view(); + auto const s_view = s_first.view(); + + std::size_t stride1 = view.stride(0); + std::size_t stride2 = s_view.stride(0); + ptrdiff_t first_diff = view.data() - s_view.data(); + // FIXME If strides are not identical, checks may not be made // with the cost of O(1) // Currently, checks are made only if strides are identical // If first_diff == 0, there is already an overlap if (stride1 == stride2 || first_diff == 0) { [[maybe_unused]] bool is_no_overlap = (first_diff % stride1); - KOKKOS_EXPECTS((&*first >= &*s_last || &*last <= &*s_first) || + KOKKOS_EXPECTS((first.view().data() >= s_last.view().data() || + last.view().data() <= s_first.view().data()) || is_no_overlap); } } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index ba0cdc91eea..acf335db84d 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ 
b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -184,6 +184,13 @@ class RandomAccessIterator< ::Kokkos::View > { KOKKOS_FUNCTION reference operator*() const { return m_view(m_current_index); } + KOKKOS_FUNCTION + auto view() const { + Kokkos::pair offset(m_current_index, + m_current_index + m_view.size()); + return Kokkos::subview(m_view, offset); + } + private: view_type m_view; ptrdiff_t m_current_index = 0; From c626131a78b86c6ada2d66998c9fcff1a254ca2e Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Fri, 24 May 2024 10:26:06 +0200 Subject: [PATCH 097/103] fix view() method in RandomAccessIterator --- .../impl/Kokkos_Constraints.hpp | 22 +++++++++++-------- .../impl/Kokkos_RandomAccessIterator.hpp | 6 +---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 5b5f26a24f1..d4220b4db71 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -240,22 +240,26 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( [[maybe_unused]] IteratorType2 s_last) { if constexpr (is_kokkos_iterator_v && is_kokkos_iterator_v) { - auto const view = first.view(); - auto const s_view = s_first.view(); + auto const view1 = first.view(); + auto const view2 = s_first.view(); - std::size_t stride1 = view.stride(0); - std::size_t stride2 = s_view.stride(0); - ptrdiff_t first_diff = view.data() - s_view.data(); + std::size_t stride1 = view1.stride(0); + std::size_t stride2 = view2.stride(0); + ptrdiff_t first_diff = view1.data() - view2.data(); // FIXME If strides are not identical, checks may not be made // with the cost of O(1) // Currently, checks are made only if strides are identical // If first_diff == 0, there is already an overlap if (stride1 == stride2 || first_diff == 0) { - [[maybe_unused]] bool is_no_overlap = (first_diff % stride1); - 
KOKKOS_EXPECTS((first.view().data() >= s_last.view().data() || - last.view().data() <= s_first.view().data()) || - is_no_overlap); + [[maybe_unused]] bool is_no_overlap = (first_diff % stride1); + auto* first_pointer1 = view1.data(); + auto* first_pointer2 = view2.data(); + [[maybe_unused]] auto* last_pointer1 = first_pointer1 + (last - first); + [[maybe_unused]] auto* last_pointer2 = + first_pointer2 + (s_last - s_first); + KOKKOS_EXPECTS(first_pointer1 >= last_pointer2 || + last_pointer1 <= first_pointer2 || is_no_overlap); } } } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index acf335db84d..f673c7ff7f3 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -185,11 +185,7 @@ class RandomAccessIterator< ::Kokkos::View > { reference operator*() const { return m_view(m_current_index); } KOKKOS_FUNCTION - auto view() const { - Kokkos::pair offset(m_current_index, - m_current_index + m_view.size()); - return Kokkos::subview(m_view, offset); - } + auto view() const { return m_view; } private: view_type m_view; From 55298035bc1f0f7c034136eddcf07e1b0d25ce19 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Fri, 24 May 2024 10:28:55 +0200 Subject: [PATCH 098/103] unit-test for expect_no_overlap --- .../TestStdAlgorithmsConstraints.cpp | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 386d533f7a8..35ec0a9dd2a 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -81,5 +81,122 @@ TEST(std_algorithms, is_admissible_to_std_algorithms) { strided_view_3d_t>::value); } +TEST(std_algorithms, expect_no_overlap) { + namespace KE = Kokkos::Experimental; + using 
value_type = double; + + static constexpr size_t extent0 = 13; + + //------------- + // 1d views + //------------- + using static_view_1d_t = Kokkos::View; + [[maybe_unused]] static_view_1d_t static_view_1d{ + "std-algo-test-1d-contiguous-view-static"}; + + using dyn_view_1d_t = Kokkos::View; + [[maybe_unused]] dyn_view_1d_t dynamic_view_1d{ + "std-algo-test-1d-contiguous-view-dynamic", extent0}; + + using strided_view_1d_t = Kokkos::View; + Kokkos::LayoutStride layout1d{extent0, 2}; + [[maybe_unused]] strided_view_1d_t strided_view_1d{ + "std-algo-test-1d-strided-view", layout1d}; + +#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ + defined(KOKKOS_ENABLE_DEBUG) + // Overlapping because iterators are identical + auto first_s = KE::begin(static_view_1d); + auto last_s = first_s + extent0; + EXPECT_DEATH( + { KE::Impl::expect_no_overlap(first_s, last_s, first_s, last_s); }, + "Kokkos contract violation:.*"); + + auto first_d = KE::begin(dynamic_view_1d); + auto last_d = first_d + extent0; + EXPECT_DEATH( + { KE::Impl::expect_no_overlap(first_d, last_d, first_d, last_d); }, + "Kokkos contract violation:.*"); + + auto first_st = KE::begin(strided_view_1d); + auto last_st = first_st + extent0; + EXPECT_DEATH( + { KE::Impl::expect_no_overlap(first_st, last_st, first_st, last_st); }, + "Kokkos contract violation:.*"); + + // Ranges are overlapped + static constexpr size_t sub_extent0 = 6, offset0 = 3; + std::pair range0(0, sub_extent0), + range1(offset0, offset0 + sub_extent0); + auto static_view_1d_0 = Kokkos::subview(static_view_1d, range0); + auto static_view_1d_1 = Kokkos::subview(static_view_1d, range1); + auto first_s0 = KE::begin(static_view_1d_0); // [0, 6] + auto last_s0 = first_s0 + static_view_1d_0.extent(0); + auto first_s1 = KE::begin(static_view_1d_1); // [3, 9] + auto last_s1 = first_s1 + static_view_1d_1.extent(1); + EXPECT_DEATH( + { KE::Impl::expect_no_overlap(first_s0, last_s0, first_s1, last_s1); }, + "Kokkos contract violation:.*"); + + 
auto dynamic_view_1d_0 = Kokkos::subview(dynamic_view_1d, range0); + auto dynamic_view_1d_1 = Kokkos::subview(dynamic_view_1d, range1); + auto first_d0 = KE::begin(dynamic_view_1d_0); // [0, 6] + auto last_d0 = first_d0 + dynamic_view_1d_0.extent(0); + auto first_d1 = KE::begin(dynamic_view_1d_1); // [3, 9] + auto last_d1 = first_d1 + dynamic_view_1d_1.extent(1); + EXPECT_DEATH( + { KE::Impl::expect_no_overlap(first_d0, last_d0, first_d1, last_d1); }, + "Kokkos contract violation:.*"); + + auto strided_view_1d_0 = Kokkos::subview(strided_view_1d, range0); + auto strided_view_1d_1 = Kokkos::subview(strided_view_1d, range1); + auto first_st0 = KE::begin(strided_view_1d_0); // [0, 12] + auto last_st0 = first_st0 + strided_view_1d_0.extent(0); + auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15] + auto last_st1 = first_st1 + strided_view_1d_1.extent(1); + // Does not overlap since offset (=3) is not divisible by stride (=2) + KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1, last_st1); + + // Iterating over the same range without overlapping + Kokkos::View static_view_2d{ + "std-algo-test-2d-contiguous-view-static", extent0}; + auto sub_static_view_1d_0 = Kokkos::subview(static_view_2d, 0, Kokkos::ALL); + auto sub_static_view_1d_1 = Kokkos::subview(static_view_2d, 1, Kokkos::ALL); + auto sub_first_s0 = KE::begin(sub_static_view_1d_0); // 0, 2, 4, ... + auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); + auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... 
+ auto sub_last_s1 = sub_first_s1 + sub_static_view_1d_1.extent(0); + + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1, + sub_last_s1); + + Kokkos::View dynamic_view_2d{ + "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; + auto sub_dynamic_view_1d_0 = Kokkos::subview(dynamic_view_2d, 0, Kokkos::ALL); + auto sub_dynamic_view_1d_1 = Kokkos::subview(dynamic_view_2d, 1, Kokkos::ALL); + auto sub_first_d0 = KE::begin(sub_dynamic_view_1d_0); // 0, 2, 4, ... + auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); + auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... + auto sub_last_d1 = sub_first_d1 + sub_dynamic_view_1d_1.extent(0); + + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1, + sub_last_d1); + + Kokkos::LayoutStride layout2d{2, 3, extent0, 2}; + Kokkos::View strided_view_2d{ + "std-algo-test-2d-contiguous-view-strided", layout2d}; + auto sub_strided_view_1d_0 = Kokkos::subview(strided_view_2d, 0, Kokkos::ALL); + auto sub_strided_view_1d_1 = Kokkos::subview(strided_view_2d, 1, Kokkos::ALL); + auto sub_first_st0 = KE::begin(sub_strided_view_1d_0); // 0, 6, 12, ... + auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); + auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... 
+ auto sub_last_st1 = sub_first_st1 + sub_strided_view_1d_1.extent(0); + + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1, + sub_last_st1); +#endif + EXPECT_TRUE(true); +} + } // namespace stdalgos } // namespace Test From 0173e7e228156f4224769b17b87c89d3d5dcb0f1 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Fri, 24 May 2024 14:36:00 +0200 Subject: [PATCH 099/103] fix arguments for static 2D View constructor --- algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 35ec0a9dd2a..0384981f5d0 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -159,7 +159,7 @@ TEST(std_algorithms, expect_no_overlap) { // Iterating over the same range without overlapping Kokkos::View static_view_2d{ - "std-algo-test-2d-contiguous-view-static", extent0}; + "std-algo-test-2d-contiguous-view-static"}; auto sub_static_view_1d_0 = Kokkos::subview(static_view_2d, 0, Kokkos::ALL); auto sub_static_view_1d_1 = Kokkos::subview(static_view_2d, 1, Kokkos::ALL); auto sub_first_s0 = KE::begin(sub_static_view_1d_0); // 0, 2, 4, ... 
From 52a814e4456fa91774b5391092d44ea67edfbce6 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Tue, 28 May 2024 16:22:36 +0200 Subject: [PATCH 100/103] remove unused s_last from expect_no_overlap --- .../impl/Kokkos_AdjacentDifference.hpp | 16 ++++++---------- .../std_algorithms/impl/Kokkos_Constraints.hpp | 6 ++---- .../impl/Kokkos_RandomAccessIterator.hpp | 2 +- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index 2d443230b76..9f7fcf94fe0 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -82,16 +82,14 @@ OutputIteratorType adjacent_difference_exespace_impl( return first_dest; } - const auto num_elements = - Kokkos::Experimental::distance(first_from, last_from); - #ifdef KOKKOS_ENABLE_DEBUG // check for overlapping iterators - auto last_dest = first_dest + num_elements; - Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); + Impl::expect_no_overlap(first_from, last_from, first_dest); #endif // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); ::Kokkos::parallel_for( label, RangePolicy(ex, 0, num_elements), StdAdjacentDiffFunctor(first_from, first_dest, bin_op)); @@ -121,16 +119,14 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( return first_dest; } - const auto num_elements = - Kokkos::Experimental::distance(first_from, last_from); - #ifdef KOKKOS_ENABLE_DEBUG // check for overlapping iterators - auto last_dest = first_dest + num_elements; - Impl::expect_no_overlap(first_from, last_from, first_dest, last_dest); + Impl::expect_no_overlap(first_from, last_from, first_dest); #endif // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); ::Kokkos::parallel_for( TeamThreadRange(teamHandle, 0, num_elements), 
StdAdjacentDiffFunctor(first_from, first_dest, bin_op)); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index d4220b4db71..54bb13e25b9 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -236,8 +236,7 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, template KOKKOS_INLINE_FUNCTION void expect_no_overlap( [[maybe_unused]] IteratorType1 first, [[maybe_unused]] IteratorType1 last, - [[maybe_unused]] IteratorType2 s_first, - [[maybe_unused]] IteratorType2 s_last) { + [[maybe_unused]] IteratorType2 s_first) { if constexpr (is_kokkos_iterator_v && is_kokkos_iterator_v) { auto const view1 = first.view(); @@ -256,8 +255,7 @@ KOKKOS_INLINE_FUNCTION void expect_no_overlap( auto* first_pointer1 = view1.data(); auto* first_pointer2 = view2.data(); [[maybe_unused]] auto* last_pointer1 = first_pointer1 + (last - first); - [[maybe_unused]] auto* last_pointer2 = - first_pointer2 + (s_last - s_first); + [[maybe_unused]] auto* last_pointer2 = first_pointer2 + (last - first); KOKKOS_EXPECTS(first_pointer1 >= last_pointer2 || last_pointer1 <= first_pointer2 || is_no_overlap); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index f673c7ff7f3..ff74a32275d 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -185,7 +185,7 @@ class RandomAccessIterator< ::Kokkos::View > { reference operator*() const { return m_view(m_current_index); } KOKKOS_FUNCTION - auto view() const { return m_view; } + view_type view() const { return m_view; } private: view_type m_view; From 70893d4f6a00bb5093c47c6a19b34cb11d8f354b Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Tue, 28 May 2024 16:23:30 +0200 Subject: 
[PATCH 101/103] wrapping with EXPECT_NO_THROW for no overlapping cases --- .../TestStdAlgorithmsConstraints.cpp | 64 ++++++++----------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 0384981f5d0..4555dc3194b 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -100,62 +100,56 @@ TEST(std_algorithms, expect_no_overlap) { using strided_view_1d_t = Kokkos::View; Kokkos::LayoutStride layout1d{extent0, 2}; - [[maybe_unused]] strided_view_1d_t strided_view_1d{ - "std-algo-test-1d-strided-view", layout1d}; + strided_view_1d_t strided_view_1d{"std-algo-test-1d-strided-view", layout1d}; -#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ - defined(KOKKOS_ENABLE_DEBUG) - // Overlapping because iterators are identical +// Overlapping because iterators are identical +#if defined(KOKKOS_ENABLE_DEBUG) && !defined(NDEBUG) auto first_s = KE::begin(static_view_1d); auto last_s = first_s + extent0; - EXPECT_DEATH( - { KE::Impl::expect_no_overlap(first_s, last_s, first_s, last_s); }, - "Kokkos contract violation:.*"); + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, + "Kokkos contract violation:.*"); auto first_d = KE::begin(dynamic_view_1d); auto last_d = first_d + extent0; - EXPECT_DEATH( - { KE::Impl::expect_no_overlap(first_d, last_d, first_d, last_d); }, - "Kokkos contract violation:.*"); + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_d, last_d, first_d); }, + "Kokkos contract violation:.*"); auto first_st = KE::begin(strided_view_1d); auto last_st = first_st + extent0; - EXPECT_DEATH( - { KE::Impl::expect_no_overlap(first_st, last_st, first_st, last_st); }, - "Kokkos contract violation:.*"); + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_st, last_st, first_st); }, + "Kokkos contract violation:.*"); +#endif // Ranges are 
overlapped static constexpr size_t sub_extent0 = 6, offset0 = 3; std::pair range0(0, sub_extent0), range1(offset0, offset0 + sub_extent0); +#if defined(KOKKOS_ENABLE_DEBUG) && !defined(NDEBUG) auto static_view_1d_0 = Kokkos::subview(static_view_1d, range0); auto static_view_1d_1 = Kokkos::subview(static_view_1d, range1); auto first_s0 = KE::begin(static_view_1d_0); // [0, 6] auto last_s0 = first_s0 + static_view_1d_0.extent(0); auto first_s1 = KE::begin(static_view_1d_1); // [3, 9] - auto last_s1 = first_s1 + static_view_1d_1.extent(1); - EXPECT_DEATH( - { KE::Impl::expect_no_overlap(first_s0, last_s0, first_s1, last_s1); }, - "Kokkos contract violation:.*"); + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s0, last_s0, first_s1); }, + "Kokkos contract violation:.*"); auto dynamic_view_1d_0 = Kokkos::subview(dynamic_view_1d, range0); auto dynamic_view_1d_1 = Kokkos::subview(dynamic_view_1d, range1); auto first_d0 = KE::begin(dynamic_view_1d_0); // [0, 6] auto last_d0 = first_d0 + dynamic_view_1d_0.extent(0); auto first_d1 = KE::begin(dynamic_view_1d_1); // [3, 9] - auto last_d1 = first_d1 + dynamic_view_1d_1.extent(1); - EXPECT_DEATH( - { KE::Impl::expect_no_overlap(first_d0, last_d0, first_d1, last_d1); }, - "Kokkos contract violation:.*"); + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_d0, last_d0, first_d1); }, + "Kokkos contract violation:.*"); +#endif auto strided_view_1d_0 = Kokkos::subview(strided_view_1d, range0); auto strided_view_1d_1 = Kokkos::subview(strided_view_1d, range1); auto first_st0 = KE::begin(strided_view_1d_0); // [0, 12] auto last_st0 = first_st0 + strided_view_1d_0.extent(0); auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15] - auto last_st1 = first_st1 + strided_view_1d_1.extent(1); // Does not overlap since offset (=3) is not divisible by stride (=2) - KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1, last_st1); + EXPECT_NO_THROW( + { KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); }); // Iterating over 
the same range without overlapping Kokkos::View static_view_2d{ @@ -165,10 +159,10 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_first_s0 = KE::begin(sub_static_view_1d_0); // 0, 2, 4, ... auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... - auto sub_last_s1 = sub_first_s1 + sub_static_view_1d_1.extent(0); - KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1, - sub_last_s1); + EXPECT_NO_THROW({ + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); + }); Kokkos::View dynamic_view_2d{ "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; @@ -177,12 +171,12 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_first_d0 = KE::begin(sub_dynamic_view_1d_0); // 0, 2, 4, ... auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... - auto sub_last_d1 = sub_first_d1 + sub_dynamic_view_1d_1.extent(0); - KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1, - sub_last_d1); + EXPECT_NO_THROW({ + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); + }); - Kokkos::LayoutStride layout2d{2, 3, extent0, 2}; + Kokkos::LayoutStride layout2d{2, 3, extent0, 2 * 3}; Kokkos::View strided_view_2d{ "std-algo-test-2d-contiguous-view-strided", layout2d}; auto sub_strided_view_1d_0 = Kokkos::subview(strided_view_2d, 0, Kokkos::ALL); @@ -190,12 +184,10 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_first_st0 = KE::begin(sub_strided_view_1d_0); // 0, 6, 12, ... auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... 
- auto sub_last_st1 = sub_first_st1 + sub_strided_view_1d_1.extent(0); - KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1, - sub_last_st1); -#endif - EXPECT_TRUE(true); + EXPECT_NO_THROW({ + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); + }); } } // namespace stdalgos From 9da24c5c6cade2f52759dc76b3860957502cfb31 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Tue, 28 May 2024 19:00:30 +0200 Subject: [PATCH 102/103] remove NDEBUG check --- algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 4555dc3194b..fab64ebd3e0 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -103,7 +103,7 @@ TEST(std_algorithms, expect_no_overlap) { strided_view_1d_t strided_view_1d{"std-algo-test-1d-strided-view", layout1d}; // Overlapping because iterators are identical -#if defined(KOKKOS_ENABLE_DEBUG) && !defined(NDEBUG) +#if defined(KOKKOS_ENABLE_DEBUG) auto first_s = KE::begin(static_view_1d); auto last_s = first_s + extent0; EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, @@ -124,7 +124,7 @@ TEST(std_algorithms, expect_no_overlap) { static constexpr size_t sub_extent0 = 6, offset0 = 3; std::pair range0(0, sub_extent0), range1(offset0, offset0 + sub_extent0); -#if defined(KOKKOS_ENABLE_DEBUG) && !defined(NDEBUG) +#if defined(KOKKOS_ENABLE_DEBUG) auto static_view_1d_0 = Kokkos::subview(static_view_1d, range0); auto static_view_1d_1 = Kokkos::subview(static_view_1d, range1); auto first_s0 = KE::begin(static_view_1d_0); // [0, 6] From 4a10a40183876b74dfbf7999830a62675d5b31b6 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Tue, 28 May 2024 19:04:40 +0200 Subject: [PATCH 103/103] fix comment: last element is exclusive --- 
.../unit_tests/TestStdAlgorithmsConstraints.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index fab64ebd3e0..2a4525a8c33 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -127,26 +127,26 @@ TEST(std_algorithms, expect_no_overlap) { #if defined(KOKKOS_ENABLE_DEBUG) auto static_view_1d_0 = Kokkos::subview(static_view_1d, range0); auto static_view_1d_1 = Kokkos::subview(static_view_1d, range1); - auto first_s0 = KE::begin(static_view_1d_0); // [0, 6] + auto first_s0 = KE::begin(static_view_1d_0); // [0, 6) auto last_s0 = first_s0 + static_view_1d_0.extent(0); - auto first_s1 = KE::begin(static_view_1d_1); // [3, 9] + auto first_s1 = KE::begin(static_view_1d_1); // [3, 9) EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s0, last_s0, first_s1); }, "Kokkos contract violation:.*"); auto dynamic_view_1d_0 = Kokkos::subview(dynamic_view_1d, range0); auto dynamic_view_1d_1 = Kokkos::subview(dynamic_view_1d, range1); - auto first_d0 = KE::begin(dynamic_view_1d_0); // [0, 6] + auto first_d0 = KE::begin(dynamic_view_1d_0); // [0, 6) auto last_d0 = first_d0 + dynamic_view_1d_0.extent(0); - auto first_d1 = KE::begin(dynamic_view_1d_1); // [3, 9] + auto first_d1 = KE::begin(dynamic_view_1d_1); // [3, 9) EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_d0, last_d0, first_d1); }, "Kokkos contract violation:.*"); #endif auto strided_view_1d_0 = Kokkos::subview(strided_view_1d, range0); auto strided_view_1d_1 = Kokkos::subview(strided_view_1d, range1); - auto first_st0 = KE::begin(strided_view_1d_0); // [0, 12] + auto first_st0 = KE::begin(strided_view_1d_0); // [0, 12) auto last_st0 = first_st0 + strided_view_1d_0.extent(0); - auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15] + auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15) // Does not 
overlap since offset (=3) is not divisible by stride (=2) EXPECT_NO_THROW( { KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); });