From b4bc4061426e57f2cae17a932464ceb5203e6fc2 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Fri, 2 Feb 2024 14:42:03 -0500 Subject: [PATCH 01/95] Reenable TestHIP_Memory_Requirements --- core/unit_test/hip/TestHIP_Memory_Requirements.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp index a213453ea18..8c72e9f2972 100644 --- a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp +++ b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp @@ -48,9 +48,6 @@ TEST(hip, memory_requirements) { // we want all user-facing memory in hip to be coarse grained. As of // today(07.01.22) the documentation is not reliable/correct, we test the // memory on the device and host - // FIXME_HIP - GTEST_SKIP() << "skipping the test because the CI on MI100 returns: error( " - "hipErrorInvalidValue)"; KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPHostPinnedSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPManagedSpace, int, 10); From c1a800650e83da2eb515599f495f9bce2aaba80f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 13 Mar 2024 12:17:03 -0400 Subject: [PATCH 02/95] Don't use Fedora development version in GitHub CI --- .github/workflows/continuous-integration-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 6425cc2668e..5c7067c6686 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -20,7 +20,7 @@ jobs: continue-on-error: true strategy: matrix: - distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest'] + distro: ['fedora:latest', 'ubuntu:latest'] cxx: ['g++', 'clang++'] cxx_extra_flags: [''] cmake_build_type: ['Release', 'Debug'] From 841b3a9f96eecb28bcc637ea6e5fbe63c9e3e92d Mon Sep 17 00:00:00 2001 From: Cedric Chevalier Date: Thu, 15 Feb 2024 18:35:34 +0100 Subject: [PATCH 03/95] Fix deep copy when filling Rank-7 views Typo in loop indices, from wrong copy-paste. 
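Background, with a minimal sketch (not the Kokkos implementation): MDRangePolicy only goes up to rank 6, so the rank-7 ViewFill uses a 6D policy over dimensions 0, 1, 3, 4, 5, 6 and iterates extent(2) inside the functor. The upper bounds passed to that policy were taken from extents 0, 1, 2, 3, 5, 6 instead of 0, 1, 3, 4, 5, 6, so the dimensions the policy actually iterates (3 and 4) got the wrong bounds whenever those extents differ. The helper below is hypothetical and only illustrates the corrected index mapping; fill_rank7 is not part of this patch.

    #include <Kokkos_Core.hpp>

    // Sketch only: fill a rank-7 view with a 6D MDRangePolicy plus an inner
    // loop over extent(2), using the same bounds as the fixed code.
    template <class View7D>
    void fill_rank7(const View7D& a, typename View7D::value_type val) {
      Kokkos::MDRangePolicy<Kokkos::Rank<6>> policy(
          {0, 0, 0, 0, 0, 0},
          {a.extent(0), a.extent(1), a.extent(3), a.extent(4), a.extent(5),
           a.extent(6)});
      Kokkos::parallel_for(
          "fill_rank7", policy,
          KOKKOS_LAMBDA(int i0, int i1, int i3, int i4, int i5, int i6) {
            for (size_t i2 = 0; i2 < a.extent(2); ++i2)
              a(i0, i1, i2, i3, i4, i5, i6) = val;
          });
    }
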
--- core/src/Kokkos_CopyViews.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 08f6ba8d696..cab14b84025 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -223,8 +223,8 @@ struct ViewFill { : a(a_), val(val_) { Kokkos::parallel_for("Kokkos::ViewFill-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, - {a.extent(0), a.extent(1), a.extent(2), - a.extent(3), a.extent(5), a.extent(6)}), + {a.extent(0), a.extent(1), a.extent(3), + a.extent(4), a.extent(5), a.extent(6)}), *this); } From a2f2ba4049c89ee0085712ee15ba22160ae3baf3 Mon Sep 17 00:00:00 2001 From: Cedric Chevalier Date: Fri, 16 Feb 2024 11:58:34 +0100 Subject: [PATCH 04/95] TestViewCopy_c.hpp: add new unit test for deep copy (ViewFill) --- core/src/Kokkos_CopyViews.hpp | 8 + core/unit_test/CMakeLists.txt | 2 + core/unit_test/Makefile | 22 +- core/unit_test/TestViewCopy_c.hpp | 435 ++++++++++++++++++++++++++++++ 4 files changed, 456 insertions(+), 11 deletions(-) create mode 100644 core/unit_test/TestViewCopy_c.hpp diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index cab14b84025..bcb7b61d2cb 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -221,6 +221,8 @@ struct ViewFill { ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, const ExecSpace& space) : a(a_), val(val_) { + // MDRangePolicy is not supported for 7D views + // Iterate separately over extent(2) Kokkos::parallel_for("Kokkos::ViewFill-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -249,6 +251,8 @@ struct ViewFill { ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, const ExecSpace& space) : a(a_), val(val_) { + // MDRangePolicy is not supported for 8D views + // Iterate separately over extent(2) and extent(4) Kokkos::parallel_for("Kokkos::ViewFill-8D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -461,6 +465,8 @@ struct ViewCopy { ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) : a(a_), b(b_) { + // MDRangePolicy is not supported for 7D views + // Iterate separately over extent(2) Kokkos::parallel_for("Kokkos::ViewCopy-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -494,6 +500,8 @@ struct ViewCopy { ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) : a(a_), b(b_) { + // MDRangePolicy is not supported for 8D views + // Iterate separately over extent(2) and extent(4) Kokkos::parallel_for("Kokkos::ViewCopy-8D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 6dfb7505c5d..dd3f51af004 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -236,6 +236,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewAPI_e ViewCopy_a ViewCopy_b + ViewCopy_c ViewCtorDimMatch ViewEmptyRuntimeUnmanaged ViewHooks @@ -353,6 +354,7 @@ foreach(PairDeviceSpace HIP-HostPinned;HIP-Managed;Cuda-HostPinned;Cuda-UVM;SYCL ViewAPI_e ViewCopy_a ViewCopy_b + ViewCopy_c ViewMapping_a ViewMapping_b ViewMapping_subview diff --git a/core/unit_test/Makefile b/core/unit_test/Makefile index 202809d3fc9..a4d65687e54 100644 --- a/core/unit_test/Makefile +++ b/core/unit_test/Makefile @@ -62,7 +62,7 @@ else STACK_TRACE_TERMINATE_FILTER := endif -TESTS = 
AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize +TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewCopy_c ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ @@ -73,7 +73,7 @@ tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ ) \ ) -GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewMapping_a ViewMapping_b ViewMapping_subview +GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewCopy_c ViewMapping_a ViewMapping_b ViewMapping_subview SUBVIEW_TESTS = SubView_a SubView_b SubView_c01 SubView_c02 SubView_c03 SubView_c04 SubView_c05 SubView_c06 SubView_c07 SubView_c08 SubView_c09 SubView_c10 SubView_c11 SubView_c12 SubView_c13 @@ -110,14 +110,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_CUDA += TestCuda_Init.o OBJ_CUDA += TestCuda_SharedAlloc.o TestCudaUVM_SharedAlloc.o TestCudaHostPinned_SharedAlloc.o OBJ_CUDA += TestCuda_RangePolicy.o TestCuda_RangePolicyRequire.o - OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o TestCuda_ViewAPI_e.o TestCuda_ViewCopy_a.o TestCuda_ViewCopy_b.o + OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o TestCuda_ViewAPI_e.o TestCuda_ViewCopy_a.o TestCuda_ViewCopy_b.o TestCuda_ViewCopy_c.o OBJ_CUDA += TestCuda_DeepCopyAlignment.o OBJ_CUDA += TestCuda_ViewMapping_a.o TestCuda_ViewMapping_b.o TestCuda_ViewMapping_subview.o TestCuda_ViewResize.o TestCuda_ViewLayoutStrideAssignment.o OBJ_CUDA += TestCudaUVM_ViewAPI_a.o TestCudaUVM_ViewAPI_b.o TestCudaUVM_ViewAPI_c.o TestCudaUVM_ViewAPI_d.o TestCudaUVM_ViewAPI_e.o - OBJ_CUDA += TestCudaUVM_ViewCopy_a.o TestCudaUVM_ViewCopy_b.o + OBJ_CUDA += TestCudaUVM_ViewCopy_a.o TestCudaUVM_ViewCopy_b.o TestCudaUVM_ViewCopy_c.o OBJ_CUDA += TestCudaUVM_ViewMapping_a.o TestCudaUVM_ViewMapping_b.o 
TestCudaUVM_ViewMapping_subview.o OBJ_CUDA += TestCudaHostPinned_ViewAPI_a.o TestCudaHostPinned_ViewAPI_b.o TestCudaHostPinned_ViewAPI_c.o TestCudaHostPinned_ViewAPI_d.o TestCudaHostPinned_ViewAPI_e.o - OBJ_CUDA += TestCudaHostPinned_ViewCopy_a.o TestCudaHostPinned_ViewCopy_b.o + OBJ_CUDA += TestCudaHostPinned_ViewCopy_a.o TestCudaHostPinned_ViewCopy_b.o TestCudaHostPinned_ViewCopy_c.o OBJ_CUDA += TestCudaHostPinned_ViewMapping_a.o TestCudaHostPinned_ViewMapping_b.o TestCudaHostPinned_ViewMapping_subview.o OBJ_CUDA += TestCuda_View_64bit.o OBJ_CUDA += TestCuda_ViewOfClass.o @@ -162,7 +162,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) OBJ_THREADS += TestThreads_RangePolicy.o TestThreads_RangePolicyRequire.o OBJ_THREADS += TestThreads_View_64bit.o OBJ_THREADS += TestThreads_ViewAPI_a.o TestThreads_ViewAPI_b.o TestThreads_ViewAPI_c.o TestThreads_ViewAPI_d.o TestThreads_ViewAPI_e.o - OBJ_THREADS += TestThreads_ViewCopy_a.o TestThreads_ViewCopy_b.o + OBJ_THREADS += TestThreads_ViewCopy_a.o TestThreads_ViewCopy_b.o TestThreads_ViewCopy_c.o OBJ_THREADS += TestThreads_DeepCopyAlignment.o OBJ_THREADS += TestThreads_ViewMapping_a.o TestThreads_ViewMapping_b.o TestThreads_ViewMapping_subview.o TestThreads_ViewResize.o TestThreads_ViewLayoutStrideAssignment.o OBJ_THREADS += TestThreads_ViewOfClass.o @@ -198,7 +198,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) OBJ_OPENMP += TestOpenMP_RangePolicy.o TestOpenMP_RangePolicyRequire.o OBJ_OPENMP += TestOpenMP_View_64bit.o OBJ_OPENMP += TestOpenMP_ViewAPI_a.o TestOpenMP_ViewAPI_b.o TestOpenMP_ViewAPI_c.o TestOpenMP_ViewAPI_d.o TestOpenMP_ViewAPI_e.o - OBJ_OPENMP += TestOpenMP_DeepCopyAlignment.o TestOpenMP_ViewCopy_a.o TestOpenMP_ViewCopy_b.o + OBJ_OPENMP += TestOpenMP_DeepCopyAlignment.o TestOpenMP_ViewCopy_a.o TestOpenMP_ViewCopy_b.o TestOpenMP_ViewCopy_c.o OBJ_OPENMP += TestOpenMP_ViewMapping_a.o TestOpenMP_ViewMapping_b.o TestOpenMP_ViewMapping_subview.o TestOpenMP_ViewResize.o TestOpenMP_ViewLayoutStrideAssignment.o OBJ_OPENMP += TestOpenMP_ViewOfClass.o OBJ_OPENMP += TestOpenMP_SubView_a.o TestOpenMP_SubView_b.o @@ -237,7 +237,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) #OBJ_OPENMPTARGET += TestOpenMPTarget_SharedAlloc.o OBJ_OPENMPTARGET += TestOpenMPTarget_RangePolicy.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_a.o TestOpenMPTarget_ViewAPI_b.o TestOpenMPTarget_ViewAPI_c.o TestOpenMPTarget_ViewAPI_d.o #Some commented out code - #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_e.o TestOpenMPTarget_ViewCopy_a.o TestOpenMPTarget_ViewCopy_b.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_e.o TestOpenMPTarget_ViewCopy_a.o TestOpenMPTarget_ViewCopy_b.o TestOpenMPTarget_ViewCopy_c.o OBJ_OPENMPTARGET += TestOpenMPTarget_DeepCopyAlignment.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_a.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_b.o @@ -292,7 +292,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) OBJ_HIP += TestHIP_Memory_Requirements.o OBJ_HIP += TestHIP_ParallelScanRangePolicy.o OBJ_HIP += TestHIPHostPinned_ViewAPI_a.o TestHIPHostPinned_ViewAPI_b.o TestHIPHostPinned_ViewAPI_c.o TestHIPHostPinned_ViewAPI_d.o TestHIPHostPinned_ViewAPI_e.o - OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o + OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o TestHIPHostPinned_ViewCopy_c.o OBJ_HIP += TestHIPHostPinned_ViewMapping_a.o TestHIPHostPinned_ViewMapping_b.o TestHIPHostPinned_ViewMapping_subview.o TARGETS += KokkosCore_UnitTest_HIP @@ -307,7 +307,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) OBJ_HPX += 
TestHPX_RangePolicy.o TestHPX_RangePolicyRequire.o OBJ_HPX += TestHPX_View_64bit.o OBJ_HPX += TestHPX_ViewAPI_a.o TestHPX_ViewAPI_b.o TestHPX_ViewAPI_c.o TestHPX_ViewAPI_d.o TestHPX_ViewAPI_e.o - OBJ_HPX += TestHPX_ViewCopy_a.o TestHPX_ViewCopy_b.o + OBJ_HPX += TestHPX_ViewCopy_a.o TestHPX_ViewCopy_b.o TestHPX_ViewCopy_c.o OBJ_HPX += TestHPX_ViewMapping_a.o TestHPX_ViewMapping_b.o TestHPX_ViewMapping_subview.o TestHPX_ViewResize.o OBJ_HPX += TestHPX_ViewOfClass.o OBJ_HPX += TestHPX_SubView_a.o TestHPX_SubView_b.o @@ -347,7 +347,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) OBJ_SERIAL += TestSerial_RangePolicy.o TestSerial_RangePolicyRequire.o OBJ_SERIAL += TestSerial_View_64bit.o OBJ_SERIAL += TestSerial_ViewAPI_a.o TestSerial_ViewAPI_b.o TestSerial_ViewAPI_c.o TestSerial_ViewAPI_d.o TestSerial_ViewAPI_e.o - OBJ_SERIAL += TestSerial_DeepCopyAlignment.o TestSerial_ViewCopy_a.o TestSerial_ViewCopy_b.o + OBJ_SERIAL += TestSerial_DeepCopyAlignment.o TestSerial_ViewCopy_a.o TestSerial_ViewCopy_b.o TestSerial_ViewCopy_c.o OBJ_SERIAL += TestSerial_ViewMapping_a.o TestSerial_ViewMapping_b.o TestSerial_ViewMapping_subview.o TestSerial_ViewResize.o TestSerial_ViewLayoutStrideAssignment.o OBJ_SERIAL += TestSerial_ViewOfClass.o OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o diff --git a/core/unit_test/TestViewCopy_c.hpp b/core/unit_test/TestViewCopy_c.hpp new file mode 100644 index 00000000000..2f34b35fb07 --- /dev/null +++ b/core/unit_test/TestViewCopy_c.hpp @@ -0,0 +1,435 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { +// Do not rely on deep_copy(0) as we want to test it! 
+template +void reset_view(const ExecSpace& space, ViewType& a, int magic) { + auto policy = Kokkos::RangePolicy(space, 0, a.span()); + + assert(a.span_is_contiguous()); + + Kokkos::parallel_for( + "TestViewCopy::ResetView", policy, + KOKKOS_LAMBDA(int i) { a.data()[i] = magic; }); +} + +template +size_t compute_overall_sum(const ExecSpace& space, ViewType& a) { + auto policy = Kokkos::RangePolicy(space, 0, a.span()); + + assert(a.span_is_contiguous()); + + typename ViewType::value_type sum = 0; + Kokkos::parallel_reduce( + "TestViewCopy::ComputeSum", policy, + KOKKOS_LAMBDA(int i, int& lcl_sum) { lcl_sum += a.data()[i]; }, sum); + + return static_cast(sum); +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 0>* = nullptr) { + auto policy = Kokkos::RangePolicy(space, 0, 1); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank0", policy, + KOKKOS_LAMBDA(int, bool& local_check) { local_check &= (a() == magic); }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 1>* = nullptr) { + auto policy = Kokkos::RangePolicy(space, 0, a.extent(0)); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank1", policy, + KOKKOS_LAMBDA(int i, bool& local_check) { + local_check &= (a(i) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 2>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0}, {a.extent(0), a.extent(1)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank2", policy, + KOKKOS_LAMBDA(int i0, int i1, bool& local_check) { + local_check &= (a(i0, i1) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 3>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0}, {a.extent(0), a.extent(1), a.extent(2)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank3", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, bool& local_check) { + local_check &= (a(i0, i1, i2) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 4>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank4", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, bool& local_check) { + local_check &= (a(i0, i1, i2, i3) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + 
std::enable_if_t::rank == 5>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank5", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 6>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank6", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 7>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set = true; + + for (size_t outer = 0; outer < a.extent(6); ++outer) { + bool all_local_elements_are_set; // Uninitialized, set by parallel_reduce + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank7", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5, outer) == magic); + }, + Kokkos::LAnd(all_local_elements_are_set)); + + all_elements_are_set = all_elements_are_set && all_local_elements_are_set; + } + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 8>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set = true; + + for (size_t outer = 0; outer < a.extent(7); ++outer) { + for (size_t inner = 0; inner < a.extent(6); ++inner) { + bool all_local_elements_are_set; // Uninitialized, set by parallel_reduce + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank8", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5, inner, outer) == magic); + }, + Kokkos::LAnd(all_local_elements_are_set)); + + all_elements_are_set = all_elements_are_set && all_local_elements_are_set; + } + } + return all_elements_are_set; +} + +template +bool view_fill_test(const ExecSpace& space, ViewType& a, int magic) { + Kokkos::deep_copy(space, a, magic); +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + if constexpr (std::is_same_v) { + return true; + } +#endif // KOKKOS_ENABLE_OPENMPTARGET + return check_magic_value(space, a, magic); +} + +template +void run_test() { + int magic = 19; + + using ViewType = Kokkos::View; + // Create views with different lengths for each dimension + // We want to test if all loops are over the correct dimensions + // We 
use prime numbers to make sure that the strides are different + ViewType a_decreasing("a", 23, 19, 17, 13, 11, 7, 5, 3); + // We also test with increasing strides to catch more "out-of-bounds" errors + // within subviews. + ViewType a_increasing("a", 3, 5, 7, 11, 13, 17, 19, 23); + + using exec_space = typename Space::execution_space; + auto space = exec_space(); + + // Use subviews in the tests to have cases with different ranks and + // non-contiguous memory + // Tests have two parts: + // 1. Fill the subview with a magic value and check that all elements are set + // 2. Check if only the subview is set by summing all elements in the view and + // comparing to the subview size times the magic value + + // Rank 0 + { + auto sub_dec = Kokkos::subview(a_decreasing, 0, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), + static_cast(magic)); + + auto sub_inc = Kokkos::subview(a_increasing, 0, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), + static_cast(magic)); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + + // Rank 1 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + + // Rank 2 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, 0, 0, + 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, 0, 0, + 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 3 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ( + compute_overall_sum(space, a_decreasing), + sub_dec.extent(0) * sub_dec.extent(1) * sub_dec.extent(2) * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 4 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), + sub_dec.extent(0) * sub_dec.extent(1) * sub_dec.extent(2) * + sub_dec.extent(3) * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + 
reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 5 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 6 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 7 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 8 + { + auto sub_dec = Kokkos::subview( + a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, std::make_pair(0, 2)); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = Kokkos::subview( + a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, std::make_pair(0, 2)); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } +} + +TEST(TEST_CATEGORY, view_fill_tests_layout_right) { + using Space = TEST_EXECSPACE; + using Layout = Kokkos::LayoutRight; + run_test(); +} + +TEST(TEST_CATEGORY, view_fill_tests_layout_left) { + using Space = TEST_EXECSPACE; + using Layout = Kokkos::LayoutLeft; + run_test(); +} + +} // namespace From ae4d0013dd0a7ae4b3cd036a3ae9f0eb9002bea7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Mon, 11 Mar 2024 14:12:15 +0100 Subject: [PATCH 05/95] TestViewCopy_c.hpp: better handling for OpenMPTarget Disabling call to value checker as it is not supported by the backend. 
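The checker uses the Kokkos::LAnd reducer, which does not work with the OpenMPTarget backend, so the verification step is compiled out there and only the deep_copy itself is exercised. One possible alternative, shown only as a hedged sketch for a rank-1 view and not used in this patch, would be to count mismatching elements with a plain sum reduction instead; check_magic_value_via_count is a hypothetical helper.

    // Sketch: same check without the LAnd reducer, counting mismatching
    // elements with a default (sum) reduction over a rank-1 view.
    template <class ExecSpace, class ViewType>
    bool check_magic_value_via_count(const ExecSpace& space, const ViewType& a,
                                     int magic) {
      long mismatches = 0;
      Kokkos::parallel_reduce(
          "TestViewCopy::CountMismatches",
          Kokkos::RangePolicy<ExecSpace>(space, 0, a.extent(0)),
          KOKKOS_LAMBDA(int i, long& local) {
            local += (a(i) != magic) ? 1 : 0;
          },
          mismatches);
      return mismatches == 0;
    }
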
--- core/unit_test/TestViewCopy_c.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/core/unit_test/TestViewCopy_c.hpp b/core/unit_test/TestViewCopy_c.hpp index 2f34b35fb07..758af13c7df 100644 --- a/core/unit_test/TestViewCopy_c.hpp +++ b/core/unit_test/TestViewCopy_c.hpp @@ -237,12 +237,11 @@ template bool view_fill_test(const ExecSpace& space, ViewType& a, int magic) { Kokkos::deep_copy(space, a, magic); #if defined(KOKKOS_ENABLE_OPENMPTARGET) - if constexpr (std::is_same_v) { - return true; - } -#endif // KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET Does not work with Land reducer + return true; +#else // KOKKOS_ENABLE_OPENMPTARGET return check_magic_value(space, a, magic); +#endif // KOKKOS_ENABLE_OPENMPTARGET } template From 46354d25d4befe2f9f2fc4b46ea30e3121b3d64e Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 18 Mar 2024 10:32:55 -0400 Subject: [PATCH 06/95] Use builtin for atomic_fetch in the HIP backend Co-authored-by: Damien L-G --- .../desul/include/desul/atomics/Adapt_HIP.hpp | 77 ++++++++ .../include/desul/atomics/Fetch_Op_HIP.hpp | 165 +++++++++--------- 2 files changed, 163 insertions(+), 79 deletions(-) create mode 100644 tpls/desul/include/desul/atomics/Adapt_HIP.hpp diff --git a/tpls/desul/include/desul/atomics/Adapt_HIP.hpp b/tpls/desul/include/desul/atomics/Adapt_HIP.hpp new file mode 100644 index 00000000000..0eab27fe989 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Adapt_HIP.hpp @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_ADAPT_HIP_HPP_ +#define DESUL_ATOMICS_ADAPT_HIP_HPP_ + +#include + +namespace desul { +namespace Impl { + +// FIXME same code as GCCMemoryOrder +template +struct HIPMemoryOrder; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELAXED; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQUIRE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELEASE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQ_REL; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_SEQ_CST; +}; + +// __HIP_MEMORY_SCOPE_SYSTEM +// __HIP_MEMORY_SCOPE_AGENT +// __HIP_MEMORY_SCOPE_WORKGROUP +// __HIP_MEMORY_SCOPE_WAVEFRONT +// __HIP_MEMORY_SCOPE_SINGLETHREAD +template +struct HIPMemoryScope; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_WORKGROUP; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_AGENT; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp index e9c749809de..920722084d1 100644 --- a/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp @@ -9,99 +9,106 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_FECH_OP_HIP_HPP_ #define DESUL_ATOMICS_FECH_OP_HIP_HPP_ +#include + namespace desul { namespace Impl { -// clang-format off -inline __device__ int 
device_atomic_fetch_add( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_add( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_add(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ float device_atomic_fetch_add( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ double device_atomic_fetch_add( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } - -inline __device__ int device_atomic_fetch_sub( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_sub( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_sub(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } -inline __device__ float device_atomic_fetch_sub( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } -inline __device__ double device_atomic_fetch_sub( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } - -inline __device__ int device_atomic_fetch_min( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_min( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_min(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } - -inline __device__ int device_atomic_fetch_max( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_max( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_max(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } - -inline __device__ int device_atomic_fetch_and( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_and( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_and(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } - -inline __device__ int device_atomic_fetch_or ( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned int device_atomic_fetch_or ( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_or (unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, T) \ + template \ + 
__device__ inline T device_atomic_fetch_##OP( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_##OP(ptr, \ + val, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_xor( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_xor( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_xor(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(OP) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, long long) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned long long) + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, float) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, double) + +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(add) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(min) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(max) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(and) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(or) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(xor) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(add) +// atomic min/max gives the wrong results (tested with ROCm 6.0 on Frontier) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(min) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(max) + +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(T) \ + template \ + __device__ inline T device_atomic_fetch_sub( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -val, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_inc( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_inc( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_inc(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1ull); } +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(float) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(double) + +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_SUB + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC(T) \ + template \ + __device__ inline T device_atomic_fetch_inc( T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + 1, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } \ + template \ + __device__ inline T device_atomic_fetch_dec( T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -1, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_dec( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_dec( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1u ); } -inline __device__ unsigned long long 
device_atomic_fetch_dec(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -1 ); } +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned long long) -inline __device__ unsigned int device_atomic_fetch_inc_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicInc(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_dec_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr, val); } -// clang-format on +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, TYPE) \ +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MEMORY_SCOPE, MEMORY_SCOPE_STRING_LITERAL) \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeDevice) { \ - __threadfence(); \ - TYPE return_val = \ - device_atomic_fetch_##OP(ptr, val, MemoryOrderRelaxed(), MemoryScopeDevice()); \ - __threadfence(); \ - return return_val; \ + __device__ inline unsigned int device_atomic_fetch_inc_mod(unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_inc32( \ + ptr, val, HIPMemoryOrder::value, MEMORY_SCOPE_STRING_LITERAL); \ } \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeCore) { \ - return device_atomic_fetch_##OP(ptr, val, MemoryOrder(), MemoryScopeDevice()); \ + __device__ inline unsigned int device_atomic_fetch_dec_mod(unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_dec32( \ + ptr, val, HIPMemoryOrder::value, MEMORY_SCOPE_STRING_LITERAL); \ } -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned long long) - -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, float) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, double) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(min) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(max) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(and) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(or) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(xor) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(sub) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(sub) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(inc) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(dec) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(inc_mod, unsigned int) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(dec_mod, unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeCore, "workgroup") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeDevice, "agent") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeNode, "") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeSystem, "") -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD } // namespace Impl } // namespace desul From 70603df3dd1d1d4485ffda0ccb01e72ff8ba64a1 Mon Sep 17 00:00:00 2001 From: thierry antoun Date: Thu, 21 
Mar 2024 10:39:13 +0100 Subject: [PATCH 07/95] Combine the two Impl::create_mirror functions into one with constexpr --- core/src/Kokkos_CopyViews.hpp | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 88f5c4e707b..d84e84445c1 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3471,35 +3471,23 @@ void check_view_ctor_args_create_mirror() { } template -inline std::enable_if_t::has_memory_space, - typename Kokkos::View::HostMirror> -create_mirror(const Kokkos::View& src, +auto create_mirror(const Kokkos::View& src, const Impl::ViewCtorProp& arg_prop) { - using src_type = View; - using dst_type = typename src_type::HostMirror; - - check_view_ctor_args_create_mirror(); - - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); - - return dst_type(prop_copy, src.layout()); -} -// Create a mirror in a new space (specialization for different space) -template ::has_memory_space>> -auto create_mirror(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - using alloc_prop = decltype(prop_copy); - return typename Impl::MirrorType::view_type(prop_copy, src.layout()); + if constexpr (Impl::ViewCtorProp::has_memory_space){ + using memory_space = typename decltype(prop_copy)::memory_space; + using dst_type = typename Impl::MirrorType::view_type; + return dst_type(prop_copy, src.layout()); + } else { + using dst_type = typename View::HostMirror; + return dst_type(prop_copy, src.layout()); + } } } // namespace Impl From 6524e2b41ab0f38e106ab7b99864a7ce9a98cddb Mon Sep 17 00:00:00 2001 From: thierry antoun Date: Thu, 21 Mar 2024 11:54:42 +0100 Subject: [PATCH 08/95] Format create_mirror --- core/src/Kokkos_CopyViews.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index d84e84445c1..70fd9c2ccf4 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3472,17 +3472,16 @@ void check_view_ctor_args_create_mirror() { template auto create_mirror(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - + const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - if constexpr (Impl::ViewCtorProp::has_memory_space){ + if constexpr (Impl::ViewCtorProp::has_memory_space) { using memory_space = typename decltype(prop_copy)::memory_space; - using dst_type = typename Impl::MirrorType::view_type; + using dst_type = + typename Impl::MirrorType::view_type; return dst_type(prop_copy, src.layout()); } else { using dst_type = typename View::HostMirror; @@ -3539,7 +3538,8 @@ std::enable_if_t::specialize>::value, typename Impl::MirrorType::view_type> create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, Kokkos::View const& src) { - return Impl::create_mirror(src, view_alloc(typename Space::memory_space{}, wi)); + return Impl::create_mirror(src, + view_alloc(typename Space::memory_space{}, wi)); } namespace Impl { From 1635ff81f73cc82a9794fdd3809c9a5bc9bc92d4 Mon Sep 17 00:00:00 2001 From: thierry antoun Date: Thu, 21 Mar 2024 11:56:25 +0100 Subject: [PATCH 09/95] Combine the four Impl::create_mirror_view in one 
using if constexpr --- core/src/Kokkos_CopyViews.hpp | 83 ++++++++++------------------------- 1 file changed, 24 insertions(+), 59 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 70fd9c2ccf4..7ae8c605277 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3545,65 +3545,30 @@ create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, namespace Impl { template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< - typename Kokkos::View::memory_space, - typename Kokkos::View::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp&) { - check_view_ctor_args_create_mirror(); - return src; -} - -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), - typename Kokkos::View::HostMirror> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -// Create a mirror view in a new space (specialization for same space) -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp&) { - check_view_ctor_args_create_mirror(); - return src; -} - -// Create a mirror view in a new space (specialization for different space) -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); +auto create_mirror_view(const Kokkos::View& src, + const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr (std::is_same::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::View< + T, P...>::HostMirror::data_type>::value) { + check_view_ctor_args_create_mirror(); + return src; + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorViewType::memory_space, + T, P...>::is_same_memspace) { + check_view_ctor_args_create_mirror(); + return src; + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } } } // namespace Impl From d3bb03a4d71ca3f7915a435f6f0079e221fd3cf4 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Thu, 21 Mar 2024 18:20:18 +0100 Subject: [PATCH 10/95] Combine the two create_mirror_view_and_copy functions into one with constexpr --- core/src/Kokkos_CopyViews.hpp | 87 +++++++++++++++-------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 7ae8c605277..04708f73e77 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3639,16 +3639,12 @@ auto create_mirror_view(const Impl::ViewCtorProp& arg_prop, return 
Impl::create_mirror_view(src, arg_prop); } -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::View& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { +namespace Impl { + +template +void check_view_ctor_args_create_mirror_view_and_copy() { using alloc_prop_input = Impl::ViewCtorProp; + static_assert( alloc_prop_input::has_memory_space, "The view constructor arguments passed to " @@ -3661,52 +3657,43 @@ auto create_mirror_view_and_copy( "The view constructor arguments passed to " "Kokkos::create_mirror_view_and_copy must " "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; } -template +} // namespace Impl + +template ::specialize>::value>> auto create_mirror_view_and_copy( const Impl::ViewCtorProp& arg_prop, - const Kokkos::View& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + const Kokkos::View& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + + Impl::check_view_ctor_args_create_mirror_view_and_copy(); + + if constexpr (Impl::MirrorViewType::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } } // Previously when using auto here, the intel compiler 19.3 would From 872dc422ff8b77616ac5884f26b83ced61328ee0 Mon Sep 17 00:00:00 2001 From: Daniel 
Arndt Date: Fri, 22 Mar 2024 16:03:24 -0400 Subject: [PATCH 11/95] Fix Makefile.kokkos for Threads --- Makefile.targets | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.targets b/Makefile.targets index e6900a822a8..e8e429e0275 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -81,7 +81,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) From 2035e313d7a54f9e1572eb5f315249ea841fb258 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 25 Mar 2024 19:54:46 -0400 Subject: [PATCH 12/95] Fix a bug in Makefile when using AMD GPU architectures (#6892) * Fix bug in Makefile when using AMD GPU architectures * Fix indentation * Fix documentation of the architecture to match the code --- Makefile.kokkos | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index b146b4a296d..6b7fff0dd6e 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -13,7 +13,7 @@ KOKKOS_DEVICES ?= "Threads" # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: Power8,Power9 -# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC KOKKOS_ARCH ?= "" @@ -400,13 +400,29 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) endif endif -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)) + +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A) +endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call 
kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) +endif # Any AVX? KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) From 8d734b0267284c3232e715355eb2c690d305ae3f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 27 Mar 2024 23:08:05 -0400 Subject: [PATCH 13/95] Cuda: Fix configuring with CMake 3.28.4 (#6898) * Cuda: Fix configuring with CMake 3.29.0 * CMake 3.28.4 is also affected --- cmake/Modules/FindTPLCUDA.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/FindTPLCUDA.cmake b/cmake/Modules/FindTPLCUDA.cmake index 792c92c07e9..5a62c530fce 100644 --- a/cmake/Modules/FindTPLCUDA.cmake +++ b/cmake/Modules/FindTPLCUDA.cmake @@ -7,7 +7,8 @@ IF (NOT CUDAToolkit_ROOT) ENDIF() ENDIF() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") +# FIXME CMake 3.28.4 creates more targets than we export +IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0" AND CMAKE_VERSION VERSION_LESS "3.28.4") find_package(CUDAToolkit) ELSE() include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) From 68c66846945c28e7b95ba099f2b4fbbe2b72fd63 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 28 Mar 2024 17:13:00 -0400 Subject: [PATCH 14/95] Update Intel GPU architectures in Makefile (#6895) * Update Intel GPU architectures in Makefile * Add some comments --- Makefile.kokkos | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 6b7fff0dd6e..73decf8b47a 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -15,7 +15,7 @@ KOKKOS_DEVICES ?= "Threads" # IBM: Power8,Power9 # AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 -# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC +# Intel-GPUs: Intel_Gen,Intel_Gen9,Intel_Gen11,Intel_Gen12LP,Intel_DG1,Intel_XeHP,Intel_PVC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" @@ -318,12 +318,43 @@ KOKKOS_INTERNAL_USE_ARCH_ICL := $(call kokkos_has_string,$(KOKKOS_ARCH),ICL) KOKKOS_INTERNAL_USE_ARCH_ICX := $(call kokkos_has_string,$(KOKKOS_ARCH),ICX) KOKKOS_INTERNAL_USE_ARCH_SPR := $(call kokkos_has_string,$(KOKKOS_ARCH),SPR) -KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) +# Traditionally, we supported, e.g., IntelGen9 instead of Intel_Gen9. The latter +# matches the CMake option but we also accept the former for backward-compatibility. 
KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen11) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen12LP) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9) \ + + $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11) \ + + $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP)) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen) + endif +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_DG1) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_XeHP) +endif +# Traditionally the architecture was called PVC instead of Intel_PVC. This +# version makes us accept IntelPVC and Intel_PVC as well. KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC := $(call kokkos_has_string,$(KOKKOS_ARCH),PVC) # NVIDIA based. 
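The AMD patch above replaces each $(or ...) combination with a sequential fallback, and the Intel patch applies the same two-step idiom so that both the legacy IntelGen*/PVC spellings and the new Intel_* spellings select the architecture: check the current spelling first, and only when that check yields 0 try the older name. A minimal, self-contained GNU Make sketch of that idiom follows; the names has_string, ARCH_INPUT, USE_GFX90A and the check target are illustrative stand-ins, not identifiers from Kokkos' build system.

# Sketch of the "new spelling first, legacy spelling as fallback" detection idiom.
# has_string yields 1 when the second argument occurs in the first, 0 otherwise,
# matching how the results are compared against 0 in the hunks above.
has_string = $(if $(findstring $(2),$(1)),1,0)

ARCH_INPUT ?= AMD_GFX90A

USE_GFX90A := $(call has_string,$(ARCH_INPUT),AMD_GFX90A)
ifeq ($(USE_GFX90A), 0)
  # Legacy architecture name, still accepted for backward compatibility.
  USE_GFX90A := $(call has_string,$(ARCH_INPUT),VEGA90A)
endif

check: ; @echo "GFX90A selected: $(USE_GFX90A)"

Saved as arch_sketch.mk, running make -f arch_sketch.mk check prints 1 for either ARCH_INPUT=AMD_GFX90A or ARCH_INPUT=VEGA90A and 0 for anything else, which is the behavior the sequential ifeq fallbacks in the patches above are meant to guarantee.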
From e2cfdec5434e1b0e8ff9a47c081fcc49ad3f8550 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 28 Mar 2024 23:25:46 -0400 Subject: [PATCH 15/95] Drop Experimental::LayoutTiled class template --- core/src/Kokkos_Layout.hpp | 110 +- core/src/impl/Kokkos_ViewLayoutTiled.hpp | 1425 ------------------ core/unit_test/TestOther.hpp | 4 - core/unit_test/TestViewLayoutTiled.hpp | 1756 ---------------------- 4 files changed, 1 insertion(+), 3294 deletions(-) delete mode 100644 core/src/impl/Kokkos_ViewLayoutTiled.hpp delete mode 100644 core/unit_test/TestViewLayoutTiled.hpp diff --git a/core/src/Kokkos_Layout.hpp b/core/src/Kokkos_Layout.hpp index ca4d956784c..6c0a0c9cf1d 100644 --- a/core/src/Kokkos_Layout.hpp +++ b/core/src/Kokkos_Layout.hpp @@ -217,81 +217,9 @@ enum class Iterate { Right // Right indices stride fastest }; -// To check for LayoutTiled -// This is to hide extra compile-time 'identifier' info within the LayoutTiled -// class by not relying on template specialization to include the ArgN*'s -template +template struct is_layouttiled : std::false_type {}; -template -struct is_layouttiled> - : std::true_type {}; - -namespace Experimental { - -/// LayoutTiled -// Must have Rank >= 2 -template < - Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2 = 0, unsigned ArgN3 = 0, unsigned ArgN4 = 0, - unsigned ArgN5 = 0, unsigned ArgN6 = 0, unsigned ArgN7 = 0, - bool IsPowerOfTwo = - (Kokkos::Impl::is_integral_power_of_two(ArgN0) && - Kokkos::Impl::is_integral_power_of_two(ArgN1) && - (Kokkos::Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0)))> -struct LayoutTiled { - static_assert(IsPowerOfTwo, - "LayoutTiled must be given power-of-two tile dimensions"); - - using array_layout = LayoutTiled; - static constexpr Iterate outer_pattern = OuterP; - static constexpr Iterate inner_pattern = InnerP; - - enum { N0 = ArgN0 }; - enum { N1 = ArgN1 }; - enum { N2 = ArgN2 }; - enum { N3 = ArgN3 }; - enum { N4 = ArgN4 }; - enum { N5 = ArgN5 }; - enum { N6 = ArgN6 }; - enum { N7 = ArgN7 }; - - size_t dimension[ARRAY_LAYOUT_MAX_RANK]; - - enum : bool { is_extent_constructible = true }; - - LayoutTiled(LayoutTiled const&) = default; - LayoutTiled(LayoutTiled&&) = default; - LayoutTiled& operator=(LayoutTiled const&) = default; - LayoutTiled& operator=(LayoutTiled&&) = default; - - KOKKOS_INLINE_FUNCTION - explicit constexpr LayoutTiled(size_t argN0 = 0, size_t argN1 = 0, - size_t argN2 = 0, size_t argN3 = 0, - size_t argN4 = 0, size_t argN5 = 0, - size_t argN6 = 0, size_t argN7 = 0) - : dimension{argN0, argN1, argN2, argN3, argN4, argN5, argN6, argN7} {} - - friend bool operator==(const LayoutTiled& left, const LayoutTiled& right) { - for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) - if (left.dimension[rank] != right.dimension[rank]) return false; - return true; - } - - friend bool operator!=(const LayoutTiled& left, const LayoutTiled& right) { - return !(left == right); - } -}; - -} // namespace Experimental - // For use with view_copy template struct layout_iterate_type_selector { @@ -321,42 +249,6 @@ struct layout_iterate_type_selector { Kokkos::Iterate::Default; }; -template -struct 
layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; -}; - -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; -}; - -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; -}; - -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; -}; - } // namespace Kokkos #endif // #ifndef KOKKOS_LAYOUT_HPP diff --git a/core/src/impl/Kokkos_ViewLayoutTiled.hpp b/core/src/impl/Kokkos_ViewLayoutTiled.hpp deleted file mode 100644 index 957717f973d..00000000000 --- a/core/src/impl/Kokkos_ViewLayoutTiled.hpp +++ /dev/null @@ -1,1425 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP -#define KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// View offset and mapping for tiled view's - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout< - Kokkos::Experimental::LayoutTiled> - : public std::true_type {}; - -template -struct is_array_layout_tiled : public std::false_type {}; - -template -struct is_array_layout_tiled> : public std::true_type { -}; // Last template parameter "true" meaning this currently only supports - // powers-of-two - -namespace Impl { - -template -struct ViewOffset< - Dimension, Layout, - std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) && - is_array_layout::value && - is_array_layout_tiled::value)>> { - public: - static constexpr Kokkos::Iterate outer_pattern = Layout::outer_pattern; - static constexpr Kokkos::Iterate inner_pattern = Layout::inner_pattern; - - static constexpr int VORank = Dimension::rank; - - static constexpr unsigned SHIFT_0 = - Kokkos::Impl::integral_power_of_two(Layout::N0); - static constexpr unsigned SHIFT_1 = - Kokkos::Impl::integral_power_of_two(Layout::N1); - static constexpr unsigned SHIFT_2 = - Kokkos::Impl::integral_power_of_two(Layout::N2); - static constexpr unsigned SHIFT_3 = - Kokkos::Impl::integral_power_of_two(Layout::N3); - 
static constexpr unsigned SHIFT_4 = - Kokkos::Impl::integral_power_of_two(Layout::N4); - static constexpr unsigned SHIFT_5 = - Kokkos::Impl::integral_power_of_two(Layout::N5); - static constexpr unsigned SHIFT_6 = - Kokkos::Impl::integral_power_of_two(Layout::N6); - static constexpr unsigned SHIFT_7 = - Kokkos::Impl::integral_power_of_two(Layout::N7); - static constexpr int MASK_0 = Layout::N0 - 1; - static constexpr int MASK_1 = Layout::N1 - 1; - static constexpr int MASK_2 = Layout::N2 - 1; - static constexpr int MASK_3 = Layout::N3 - 1; - static constexpr int MASK_4 = Layout::N4 - 1; - static constexpr int MASK_5 = Layout::N5 - 1; - static constexpr int MASK_6 = Layout::N6 - 1; - static constexpr int MASK_7 = Layout::N7 - 1; - - static constexpr unsigned SHIFT_2T = SHIFT_0 + SHIFT_1; - static constexpr unsigned SHIFT_3T = SHIFT_0 + SHIFT_1 + SHIFT_2; - static constexpr unsigned SHIFT_4T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3; - static constexpr unsigned SHIFT_5T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4; - static constexpr unsigned SHIFT_6T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5; - static constexpr unsigned SHIFT_7T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5 + SHIFT_6; - static constexpr unsigned SHIFT_8T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5 + SHIFT_6 + SHIFT_7; - - // Is an irregular layout that does not have uniform striding for each index. - using is_mapping_plugin = std::true_type; - using is_regular = std::false_type; - - using size_type = size_t; - using dimension_type = Dimension; - using array_layout = Layout; - - dimension_type m_dim; - size_type m_tile_N0; // Num tiles dim 0 - size_type m_tile_N1; - size_type m_tile_N2; - size_type m_tile_N3; - size_type m_tile_N4; - size_type m_tile_N5; - size_type m_tile_N6; - size_type m_tile_N7; - - //---------------------------------------- - -#define KOKKOS_IMPL_DEBUG_OUTPUT_CHECK 0 - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, - I1 const& i1) const { - auto tile_offset = - (outer_pattern == (Kokkos::Iterate::Left)) - ? (((i0 >> SHIFT_0) + m_tile_N0 * ((i1 >> SHIFT_1))) << SHIFT_2T) - : (((m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1))) << SHIFT_2T); - // ( num_tiles[1] * ti0 + ti1 ) * FTD - - auto local_offset = (inner_pattern == (Kokkos::Iterate::Left)) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0)) - : (((i0 & MASK_0) << SHIFT_1) + (i1 & MASK_1)); - // ( tile_dim[1] * li0 + li1 ) - -#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK - std::cout << "Am I Outer Left? " - << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "Am I Inner Left? " - << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "i0 = " << i0 << " i1 = " << i1 - << "\ntilei0 = " << (i0 >> SHIFT_0) - << " tilei1 = " << (i1 >> SHIFT_1) - << "locali0 = " << (i0 & MASK_0) - << "\nlocali1 = " << (i1 & MASK_1) << std::endl; -#endif - - return tile_offset + local_offset; - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * ((i1 >> SHIFT_1) + m_tile_N1 * (i2 >> SHIFT_2))) - << SHIFT_3T) - : ((m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) - << SHIFT_3T); - - auto local_offset = (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1))) - : (((i0 & MASK_0) << (SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_2)) + (i2 & MASK_2)); - -#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK - std::cout << "Am I Outer Left? " - << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "Am I Inner Left? " - << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "i0 = " << i0 << " i1 = " << i1 << " i2 = " << i2 - << "\ntilei0 = " << (i0 >> SHIFT_0) - << " tilei1 = " << (i1 >> SHIFT_1) - << " tilei2 = " << (i2 >> SHIFT_2) - << "\nlocali0 = " << (i0 & MASK_0) - << "locali1 = " << (i1 & MASK_1) << "locali2 = " << (i2 & MASK_2) - << std::endl; -#endif - - return tile_offset + local_offset; - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, - I3 const& i3) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * ((i1 >> SHIFT_1) + - m_tile_N1 * ((i2 >> SHIFT_2) + - m_tile_N2 * (i3 >> SHIFT_3)))) - << SHIFT_4T) - : ((m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) - << SHIFT_4T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2))) - : (((i0 & MASK_0) << (SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_3)) + (i3 & MASK_3)); - - return tile_offset + local_offset; - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * ((i2 >> SHIFT_2) + - m_tile_N2 * ((i3 >> SHIFT_3) + - m_tile_N3 * (i4 >> SHIFT_4))))) - << SHIFT_5T) - : ((m_tile_N4 * - (m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) - << SHIFT_5T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3))) - : (((i0 & MASK_0) << (SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_4)) + (i4 & MASK_4)); - - return tile_offset + local_offset; - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, - I5 const& i5) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * ((i4 >> SHIFT_4) + - m_tile_N4 * (i5 >> SHIFT_5)))))) - << SHIFT_6T) - : ((m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) - << SHIFT_6T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4))) - : (((i0 & MASK_0) - << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_5)) + (i5 & MASK_5)); - - return tile_offset + local_offset; - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, I5 const& i5, - I6 const& i6) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * - ((i4 >> SHIFT_4) + - m_tile_N4 * - ((i5 >> SHIFT_5) + - m_tile_N5 * (i6 >> SHIFT_6))))))) - << SHIFT_7T) - : ((m_tile_N6 * - (m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) + - (i6 >> SHIFT_6)) - << SHIFT_7T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + - ((i6 & MASK_6) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5))) - : (((i0 & MASK_0) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + - SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) - << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_6 + SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_6 + SHIFT_5)) + - ((i5 & MASK_5) << (SHIFT_6)) + (i6 & MASK_6)); - - return tile_offset + local_offset; - } - - // Rank 8 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, I5 const& i5, - I6 const& i6, - I7 const& i7) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * - ((i4 >> SHIFT_4) + - m_tile_N4 * - ((i5 >> SHIFT_5) + - m_tile_N5 * - ((i6 >> SHIFT_6) + - m_tile_N6 * (i7 >> SHIFT_7)))))))) - << SHIFT_8T) - : ((m_tile_N7 * - (m_tile_N6 * - (m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * - (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) + - (i6 >> SHIFT_6)) + - (i7 >> SHIFT_7)) - << SHIFT_8T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + - ((i6 & MASK_6) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5)) + - ((i7 & MASK_7) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5 + SHIFT_6))) - : (((i0 & MASK_0) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + - SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + - SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) - << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_7 + SHIFT_6 + SHIFT_5)) + - ((i5 & MASK_5) << (SHIFT_7 + SHIFT_6)) + - ((i6 & MASK_6) << (SHIFT_7)) + (i7 & MASK_7)); - - return tile_offset + local_offset; - } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { - return array_layout((VORank > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (VORank > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (VORank > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (VORank > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (VORank > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (VORank > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (VORank > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (VORank > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); - } - - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { - return m_dim.N0; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { - return m_dim.N1; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { - return m_dim.N2; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { - return m_dim.N3; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { - return m_dim.N4; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { - return m_dim.N5; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { - return m_dim.N6; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { - return m_dim.N7; - } - - KOKKOS_INLINE_FUNCTION constexpr size_type size() const { - return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * - m_dim.N6 * m_dim.N7; - } - - // Strides are meaningless due to irregularity - KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 0; } - - // Stride with [ rank ] value is the total length - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 0; - if (0 < dimension_type::rank) { - s[1] = 0; - } - if (1 < dimension_type::rank) { - s[2] = 0; - } - if (2 < dimension_type::rank) { - s[3] = 0; - } - if (3 < dimension_type::rank) { - s[4] = 0; - } - if (4 < dimension_type::rank) { - s[5] = 0; - } - if (5 < dimension_type::rank) { - s[6] = 0; - } - if (6 < dimension_type::rank) { - s[7] = 0; - } - if (7 < 
dimension_type::rank) { - s[8] = 0; - } - } - - KOKKOS_INLINE_FUNCTION constexpr size_type span() const { - // Rank2: ( NumTile0 * ( NumTile1 ) ) * TileSize, etc - return (VORank == 2) - ? (m_tile_N0 * m_tile_N1) << SHIFT_2T - : (VORank == 3) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2) << SHIFT_3T - : (VORank == 4) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * m_tile_N3) - << SHIFT_4T - : (VORank == 5) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * - m_tile_N3 * m_tile_N4) - << SHIFT_5T - : (VORank == 6) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * - m_tile_N3 * m_tile_N4 * m_tile_N5) - << SHIFT_6T - : (VORank == 7) - ? (m_tile_N0 * m_tile_N1 * - m_tile_N2 * m_tile_N3 * - m_tile_N4 * m_tile_N5 * - m_tile_N6) - << SHIFT_7T - : (m_tile_N0 * m_tile_N1 * - m_tile_N2 * m_tile_N3 * - m_tile_N4 * m_tile_N5 * - m_tile_N6 * m_tile_N7) - << SHIFT_8T; - } - - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return true; - } - - //---------------------------------------- -#ifdef KOKKOS_IMPL_WINDOWS_CUDA - KOKKOS_FUNCTION ViewOffset() {} - KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { - m_dim = src.m_dim; - m_tile_N0 = src.m_tile_N0; - m_tile_N1 = src.m_tile_N1; - m_tile_N2 = src.m_tile_N2; - m_tile_N3 = src.m_tile_N3; - m_tile_N4 = src.m_tile_N4; - m_tile_N5 = src.m_tile_N5; - m_tile_N6 = src.m_tile_N6; - m_tile_N7 = src.m_tile_N7; - } - KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { - m_dim = src.m_dim; - m_tile_N0 = src.m_tile_N0; - m_tile_N1 = src.m_tile_N1; - m_tile_N2 = src.m_tile_N2; - m_tile_N3 = src.m_tile_N3; - m_tile_N4 = src.m_tile_N4; - m_tile_N5 = src.m_tile_N5; - m_tile_N6 = src.m_tile_N6; - m_tile_N7 = src.m_tile_N7; - return *this; - } -#else - KOKKOS_DEFAULTED_FUNCTION ~ViewOffset() = default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset() = default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset(const ViewOffset&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset& operator=(const ViewOffset&) = default; -#endif - - template - KOKKOS_INLINE_FUNCTION constexpr ViewOffset( - std::integral_constant const&, - array_layout const arg_layout) - : m_dim(arg_layout.dimension[0], arg_layout.dimension[1], - arg_layout.dimension[2], arg_layout.dimension[3], - arg_layout.dimension[4], arg_layout.dimension[5], - arg_layout.dimension[6], arg_layout.dimension[7]), - m_tile_N0((arg_layout.dimension[0] + MASK_0) >> - SHIFT_0 /* number of tiles in first dimension */), - m_tile_N1((arg_layout.dimension[1] + MASK_1) >> SHIFT_1), - m_tile_N2((VORank > 2) ? (arg_layout.dimension[2] + MASK_2) >> SHIFT_2 - : 0), - m_tile_N3((VORank > 3) ? (arg_layout.dimension[3] + MASK_3) >> SHIFT_3 - : 0), - m_tile_N4((VORank > 4) ? (arg_layout.dimension[4] + MASK_4) >> SHIFT_4 - : 0), - m_tile_N5((VORank > 5) ? (arg_layout.dimension[5] + MASK_5) >> SHIFT_5 - : 0), - m_tile_N6((VORank > 6) ? (arg_layout.dimension[6] + MASK_6) >> SHIFT_6 - : 0), - m_tile_N7((VORank > 7) ? 
(arg_layout.dimension[7] + MASK_7) >> SHIFT_7 - : 0) {} -}; - -// FIXME Remove the out-of-class definitions when we require C++17 -#define KOKKOS_ITERATE_VIEW_OFFSET_ENABLE \ - std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) && \ - is_array_layout::value && \ - is_array_layout_tiled::value)> -template -constexpr Kokkos::Iterate ViewOffset< - Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::outer_pattern; -template -constexpr Kokkos::Iterate ViewOffset< - Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::inner_pattern; -template -constexpr int - ViewOffset::VORank; -template -constexpr unsigned - ViewOffset::SHIFT_0; -template -constexpr unsigned - ViewOffset::SHIFT_1; -template -constexpr unsigned - ViewOffset::SHIFT_2; -template -constexpr unsigned - ViewOffset::SHIFT_3; -template -constexpr unsigned - ViewOffset::SHIFT_4; -template -constexpr unsigned - ViewOffset::SHIFT_5; -template -constexpr unsigned - ViewOffset::SHIFT_6; -template -constexpr unsigned - ViewOffset::SHIFT_7; -template -constexpr int - ViewOffset::MASK_0; -template -constexpr int - ViewOffset::MASK_1; -template -constexpr int - ViewOffset::MASK_2; -template -constexpr int - ViewOffset::MASK_3; -template -constexpr int - ViewOffset::MASK_4; -template -constexpr int - ViewOffset::MASK_5; -template -constexpr int - ViewOffset::MASK_6; -template -constexpr int - ViewOffset::MASK_7; -template -constexpr unsigned - ViewOffset::SHIFT_2T; -template -constexpr unsigned - ViewOffset::SHIFT_3T; -template -constexpr unsigned - ViewOffset::SHIFT_4T; -template -constexpr unsigned - ViewOffset::SHIFT_5T; -template -constexpr unsigned - ViewOffset::SHIFT_6T; -template -constexpr unsigned - ViewOffset::SHIFT_7T; -template -constexpr unsigned - ViewOffset::SHIFT_8T; -#undef KOKKOS_ITERATE_VIEW_OFFSET_ENABLE - -//---------------------------------------- - -// ViewMapping assign method needed in order to return a 'subview' tile as a -// proper View The outer iteration pattern determines the mapping of the pointer -// offset to the beginning of requested tile The inner iteration pattern is -// needed for the layout of the tile's View to be returned Rank 2 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T**, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left ? 
((i_tile0 + src.m_impl_offset.m_tile_N0 * i_tile1) - << src_offset_type::SHIFT_2T) - : ((src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1) - << src_offset_type::SHIFT_2T)) // offset to start - // of the tile - ), - dst_offset_type()); - } -}; - -// Rank 3 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T***, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? ((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + src.m_impl_offset.m_tile_N1 * i_tile2)) - << src_offset_type::SHIFT_3T) - : ((src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1) + - i_tile2) - << src_offset_type::SHIFT_3T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 4 -template -class ViewMapping< - std::enable_if_t<(N4 == 0 && N5 == 0 && N6 == 0 && N7 == 0)> // void - , - Kokkos::ViewTraits< - T****, - Kokkos::Experimental::LayoutTiled, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + src.m_impl_offset.m_tile_N1 * - (i_tile2 + src.m_impl_offset.m_tile_N2 * - i_tile3))) - << src_offset_type::SHIFT_4T) - : ((src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) - << src_offset_type::SHIFT_4T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 5 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T*****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? ((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * i_tile4)))) - << src_offset_type::SHIFT_5T) - : ((src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) - << src_offset_type::SHIFT_5T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 6 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4, iType5> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = - Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + src.m_impl_offset.m_tile_N4 * - i_tile5))))) - << src_offset_type::SHIFT_6T) - : ((src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) - << src_offset_type::SHIFT_6T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 7 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T*******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4, iType5, iType6> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = - Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5, const iType6 i_tile6) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + - src.m_impl_offset.m_tile_N4 * - (i_tile5 + - src.m_impl_offset.m_tile_N5 * - i_tile6)))))) - << src_offset_type::SHIFT_7T) - : ((src.m_impl_offset.m_tile_N6 * - (src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * - i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) + - i_tile6) - << src_offset_type::SHIFT_7T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 8 -template -class ViewMapping< - std::enable_if_t<(N0 != 0 && N1 != 0 && N2 != 0 && N3 != 0 && N4 != 0 && - N5 != 0 && N6 != 0 && N7 != 0)> // void - , - Kokkos::ViewTraits< - T********, - Kokkos::Experimental::LayoutTiled, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = - Kokkos::ViewTraits; - using type = - Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5, const iType6 i_tile6, const iType7 i_tile7) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? ((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + - src.m_impl_offset.m_tile_N4 * - (i_tile5 + - src.m_impl_offset.m_tile_N5 * - (i_tile6 + - src.m_impl_offset.m_tile_N6 * - i_tile7))))))) - << src_offset_type::SHIFT_8T) - : ((src.m_impl_offset.m_tile_N7 * - (src.m_impl_offset.m_tile_N6 * - (src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * - i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) + - i_tile6) + - i_tile7) - << src_offset_type::SHIFT_8T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------- - -namespace Kokkos { - -// Rank 2 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T**, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View(src, SrcLayout(), i_tile0, - i_tile1); -} - -// Rank 3 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T***, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2); -} - -// Rank 4 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3); -} - -// Rank 5 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T*****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4); -} - -// Rank 6 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5); -} - -// Rank 7 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T*******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5, - const size_t i_tile6) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, - i_tile6); -} - -// Rank 8 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T********, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5, - const size_t i_tile6, const size_t i_tile7) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, - i_tile6, i_tile7); -} - -} /* namespace Kokkos */ -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_EXPERIENTAL_VIEWLAYOUTTILE_HPP */ diff --git a/core/unit_test/TestOther.hpp b/core/unit_test/TestOther.hpp index fcf0353a88c..935ae2b5f58 100644 --- a/core/unit_test/TestOther.hpp +++ b/core/unit_test/TestOther.hpp @@ -21,8 +21,4 @@ #include #include -// with VS 16.11.3 and CUDA 11.4.2 getting cudafe stackoverflow crash -#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) -#include -#endif #endif diff --git a/core/unit_test/TestViewLayoutTiled.hpp b/core/unit_test/TestViewLayoutTiled.hpp deleted file mode 100644 index 67308212ee0..00000000000 --- a/core/unit_test/TestViewLayoutTiled.hpp +++ /dev/null @@ -1,1756 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include - -#include -#include - -#include -#include - -namespace Test { - -namespace { - -template -struct TestViewLayoutTiled { - using Scalar = double; - - static constexpr int T0 = 2; - static constexpr int T1 = 4; - static constexpr int T2 = 4; - static constexpr int T3 = 2; - static constexpr int T4 = 2; - static constexpr int T5 = 2; - static constexpr int T6 = 2; - static constexpr int T7 = 2; - - // Rank 2 - using LayoutLL_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRL_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutLR_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRR_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - - // Rank 3 - using LayoutLL_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRL_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutLR_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRR_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - - // Rank 4 - using LayoutLL_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - using LayoutRL_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - using LayoutLR_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - using LayoutRR_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_2d(const int, const int) { -#else - static void test_view_layout_tiled_2d(const int N0, const int N1) { - const int FT = T0 * T1; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - // Test create_mirror_view, deep_copy - // Create LL View - { - using ViewType = - typename Kokkos::View; - ViewType v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - hv(ti * T0 + i, tj * T1 + j) = - (ti + tj * NT0) * FT + (i + j * T0); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti + tj * NT0) * FT + (i + j * T0) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } - - // Create RL View - { - using ViewType = - typename Kokkos::View; - Kokkos::View v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < 
T0; ++i) { - hv(ti * T0 + i, tj * T1 + j) = - (ti * NT1 + tj) * FT + (i + j * T0); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti * NT1 + tj) * FT + (i + j * T0) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = - typename Kokkos::View; - Kokkos::View v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - hv(ti * T0 + i, tj * T1 + j) = - (ti + tj * NT0) * FT + (i * T1 + j); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti + tj * NT0) * FT + (i * T1 + j) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = - typename Kokkos::View; - Kokkos::View v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - hv(ti * T0 + i, tj * T1 + j) = - (ti * NT1 + tj) * FT + (i * T1 + j); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { 
- v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti * NT1 + tj) * FT + (i * T1 + j) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_2d - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_3d(const int, const int, const int) { -#else - static void test_view_layout_tiled_3d(const int N0, const int N1, - const int N2) { - const int FT = T0 * T1 * T2; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - - // Create LL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const 
int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 RR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, 
ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_3d - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_4d(const int, const int, const int, - const int){ -#else - static void test_view_layout_tiled_4d(const int N0, const int N1, - const int N2, const int N3) { - const int FT = T0 * T1 * T2 * T3; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - const int NT3 = int(std::ceil(N3 / T3)); - - // Create LL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - 
} - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + - tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; 
tl < NT3; ++tl) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 RR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_4d - - static void test_view_layout_tiled_subtile_2d(const int N0, const int N1) { - const int FT = T0 * T1; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - - // Counter to check for errors at the end - long counter[4] = {0}; - - // Create LL View - { - Kokkos::View v("v", N0, N1); - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j) = (ti + tj * NT0) * FT + (i + j * T0); - } - } - } - } - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti + tj * NT0) * FT + (i + j * T0) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View v("v", N0, N1); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i + j * T0); - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " 
<< ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti * NT1 + tj) * FT + (i + j * T0) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View v("v", N0, N1); - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - v(ti * T0 + i, tj * T1 + j) = (ti + tj * NT0) * FT + (i * T1 + j); - } - } - } - } - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti + tj * NT0) * FT + (i * T1 + j) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View v("v", N0, N1); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i * T1 + j); - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti * NT1 + tj) * FT + (i * T1 + j) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; - std::cout << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - } // end test_view_layout_tiled_subtile_2d - - static void test_view_layout_tiled_subtile_3d(const int N0, const int N1, - const int N2) { - const int FT = T0 * T1 * T2; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - - // Counter to check for errors at the end - long counter[4] = {0}; - // Create LL View - { - Kokkos::View v("v", N0, - N1, N2); - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1); - } - 
} - } - } - } - } - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View v("v", N0, - N1, N2); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View v("v", N0, - N1, N2); - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k) - 
<< std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View v("v", N0, - N1, N2); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - - } // end test_view_layout_tiled_subtile_3d - - static void test_view_layout_tiled_subtile_4d(const int N0, const int N1, - const int N2, const int N3) { - const int FT = T0 * T1 * T2 * T3; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - const int NT3 = int(std::ceil(N3 / T3)); - - // Counter to check for errors at the end - long counter[4] = {0}; - // Create LL View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - 
<< "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti + tj * NT0 + tk * N0 * N1 + - tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + - tl) * FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << 
"idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - - } // end test_view_layout_tiled_subtile_4d - -}; // end TestViewLayoutTiled struct - -} // namespace - -TEST(TEST_CATEGORY, view_layouttiled) { - // These two examples are iterating by tile, then within a tile - not by - // extents If N# is not a power of two, but want to iterate by tile then - // within a tile, need to check that mapped index is within extent - TestViewLayoutTiled::test_view_layout_tiled_2d(4, 12); - TestViewLayoutTiled::test_view_layout_tiled_3d(4, 12, 16); - TestViewLayoutTiled::test_view_layout_tiled_4d(4, 12, 16, 12); -} -TEST(TEST_CATEGORY, view_layouttiled_subtile) { - // These two examples are iterating by tile, then within a tile - not by 
- // extents If N# is not a power of two, but want to iterate by tile then - // within a tile, need to check that mapped index is within extent - TestViewLayoutTiled::test_view_layout_tiled_subtile_2d(4, 12); - TestViewLayoutTiled::test_view_layout_tiled_subtile_3d(4, 12, - 16); - TestViewLayoutTiled::test_view_layout_tiled_subtile_4d( - 4, 12, 16, 12); -} -} // namespace Test - -#undef KOKKOS_IMPL_PUBLIC_INCLUDE From 51b98e1d76b198478b3975ff738a99272450c67e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 28 Mar 2024 23:26:50 -0400 Subject: [PATCH 16/95] Get rid of now unnecessary use of is_layouttiled trait --- containers/src/Kokkos_DynRankView.hpp | 3 +-- core/src/Kokkos_CopyViews.hpp | 26 ++++++++------------------ core/src/Kokkos_View.hpp | 6 ++---- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 5fa59f1b7cd..3989911aca4 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1657,8 +1657,7 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v || - is_layouttiled::value) { + std::is_same_v) { for (int i = N; i < 7; ++i) layout.dimension[i] = KOKKOS_IMPL_CTOR_DEFAULT_ARG; } diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 08f6ba8d696..bb44d33ba21 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -539,11 +539,8 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (Kokkos::is_layouttiled::value) { - iterate = Kokkos::layout_iterate_type_selector< - typename DstType::array_layout>::outer_iteration_pattern; - } else if (std::is_same::value) { + if (std::is_same::value) { iterate = Kokkos::Iterate::Right; } else if (std::is_same::value) { @@ -630,11 +627,8 @@ void view_copy(const DstType& dst, const SrcType& src) { int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (Kokkos::is_layouttiled::value) { - iterate = Kokkos::layout_iterate_type_selector< - typename DstType::array_layout>::outer_iteration_pattern; - } else if (std::is_same::value) { + if (std::is_same::value) { iterate = Kokkos::Iterate::Right; } else if (std::is_same::value) { @@ -3092,8 +3086,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value> + Kokkos::LayoutStride>::value> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3139,8 +3132,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value)> + Kokkos::LayoutStride>::value)> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3308,8 +3300,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value> + Kokkos::LayoutStride>::value> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3351,8 +3342,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value 
|| std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value)> + Kokkos::LayoutStride>::value)> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 484a0e6f62e..0124d31107d 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1442,8 +1442,7 @@ class View : public ViewTraits { std::is_same_v || std::is_same_v || - is_layouttiled::value) { + Kokkos::LayoutStride>) { size_t i0 = arg_layout.dimension[0]; size_t i1 = arg_layout.dimension[1]; size_t i2 = arg_layout.dimension[2]; @@ -1495,8 +1494,7 @@ class View : public ViewTraits { std::is_same_v || std::is_same_v || - is_layouttiled::value) { + Kokkos::LayoutStride>) { size_t i0 = arg_layout.dimension[0]; size_t i1 = arg_layout.dimension[1]; size_t i2 = arg_layout.dimension[2]; From 1efeb5d76d677e68701cd9592d475ee56f5a1dbc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 28 Mar 2024 23:27:18 -0400 Subject: [PATCH 17/95] Deprecate is_layouttiled trait --- core/src/Kokkos_Layout.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Layout.hpp b/core/src/Kokkos_Layout.hpp index 6c0a0c9cf1d..92719b3ec39 100644 --- a/core/src/Kokkos_Layout.hpp +++ b/core/src/Kokkos_Layout.hpp @@ -217,8 +217,10 @@ enum class Iterate { Right // Right indices stride fastest }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template -struct is_layouttiled : std::false_type {}; +struct KOKKOS_DEPRECATED is_layouttiled : std::false_type {}; +#endif // For use with view_copy template From 635551058d436885bf3757eb0bd1ea17bbf0abd4 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 1 Apr 2024 18:52:03 -0400 Subject: [PATCH 18/95] Move `Kokkos::Array` tests to a more suitable place (#6905) * Move Kokkos::Array tests to a more suitable place * Workaround bogous(?) compile error with Array::operator[] not being constexpr --- core/unit_test/TestAggregate.hpp | 36 ------------------------ core/unit_test/TestArray.cpp | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/core/unit_test/TestAggregate.hpp b/core/unit_test/TestAggregate.hpp index f1316a7426a..b4d924793c1 100644 --- a/core/unit_test/TestAggregate.hpp +++ b/core/unit_test/TestAggregate.hpp @@ -63,42 +63,6 @@ void TestViewAggregate() { ASSERT_EQ(y.extent(0), 4u); ASSERT_EQ(y.extent(1), 5u); ASSERT_EQ(y.extent(2), 32u); - - // Initialize arrays from brace-init-list as for std::array. - // - // Comment: Clang will issue the following warning if we don't use double - // braces here (one for initializing the Kokkos::Array and one for - // initializing the sub-aggreagate C-array data member), - // - // warning: suggest braces around initialization of subobject - // - // but single brace syntax would be valid as well. - Kokkos::Array aggregate_initialization_syntax_1 = {{1.41, 3.14}}; - ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[0], 1.41); - ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[1], 3.14); - - Kokkos::Array aggregate_initialization_syntax_2{ - {0, 1, 2}}; // since C++11 - for (int i = 0; i < 3; ++i) { - ASSERT_EQ(aggregate_initialization_syntax_2[i], i); - } - - // Note that this is a valid initialization. 
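// Editorial aside, not part of the removed hunk: the initializer that is left
// out is value-initialized, so the third element is zero; the relocated
// constexpr test added further down in this patch asserts exactly that.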
- Kokkos::Array initialized_with_one_argument_missing = {{255, 255}}; - for (int i = 0; i < 2; ++i) { - ASSERT_DOUBLE_EQ(initialized_with_one_argument_missing[i], 255); - } - // But the following line would not compile - // Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } }; - - // The code below must compile for zero-sized arrays. - using T = float; - - constexpr int N = 0; - Kokkos::Array a; - for (int i = 0; i < N; ++i) { - a[i] = T(); - } } TEST(TEST_CATEGORY, view_aggregate) { TestViewAggregate(); } diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index 673d0036b71..e138a64d6db 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -73,4 +73,51 @@ KOKKOS_FUNCTION constexpr bool test_array_ctad() { static_assert(test_array_ctad()); #endif +KOKKOS_FUNCTION constexpr bool test_array_aggregate_initialization() { + // Initialize arrays from brace-init-list as for std::array. + + Kokkos::Array aggregate_initialization_syntax_1 = {1.41f, 3.14f}; + if ((aggregate_initialization_syntax_1[0] != 1.41f) || + (aggregate_initialization_syntax_1[1] != 3.14f)) + return false; + + Kokkos::Array aggregate_initialization_syntax_2{ + {0, 1, 2}}; // since C++11 + if ((aggregate_initialization_syntax_2[0] != 0) || + (aggregate_initialization_syntax_2[1] != 1) || + (aggregate_initialization_syntax_2[2] != 2)) + return false; + + // Note that this is a valid initialization. + Kokkos::Array initialized_with_one_argument_missing = {{255, 255}}; + if ((initialized_with_one_argument_missing[0] != 255) || + (initialized_with_one_argument_missing[1] != 255) || + (initialized_with_one_argument_missing[2] != 0)) + return false; + + // But the following line would not compile + // Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } }; + + return true; +} + +static_assert(test_array_aggregate_initialization()); + +// A few compilers, such as GCC 8.4, were erroring out when the function below +// appeared in a constant expression because +// Kokkos::Array::operator[] is non-constexpr. The issue +// disappears with GCC 9.1 (https://godbolt.org/z/TG4TEef1b). As a workaround, +// the static_assert was dropped and the [[maybe_unused]] is used as an attempt +// to silent warnings that the function is never used. +[[maybe_unused]] KOKKOS_FUNCTION void test_array_zero_sized() { + using T = float; + + // The code below must compile for zero-sized arrays. 
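// Editorial aside, not part of the added hunk: this presumably relies on
// Kokkos::Array having a dedicated zero-length case, so indexing still
// compiles while the loop below has a trivially false condition and only
// ever needs to compile, never to run.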
+ constexpr int N = 0; + Kokkos::Array a; + for (int i = 0; i < N; ++i) { + a[i] = T(); + } +} + } // namespace From 391e0408bed63b0ca0758da5dacb75a70677d0f7 Mon Sep 17 00:00:00 2001 From: Thomas Padioleau Date: Tue, 2 Apr 2024 11:19:17 +0200 Subject: [PATCH 19/95] Do not return a copy of the input functor for Kokkos::Experimental::for_each --- .../src/std_algorithms/Kokkos_ForEach.hpp | 56 +++++++++---------- .../impl/Kokkos_ForEachForEachN.hpp | 20 +++---- 2 files changed, 34 insertions(+), 42 deletions(-) diff --git a/algorithms/src/std_algorithms/Kokkos_ForEach.hpp b/algorithms/src/std_algorithms/Kokkos_ForEach.hpp index 6215b325afc..05969be463a 100644 --- a/algorithms/src/std_algorithms/Kokkos_ForEach.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ForEach.hpp @@ -29,49 +29,46 @@ namespace Experimental { template < class ExecutionSpace, class IteratorType, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { - return Impl::for_each_exespace_impl(label, ex, first, last, - std::move(functor)); +void for_each(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, UnaryFunctorType functor) { + Impl::for_each_exespace_impl(label, ex, first, last, std::move(functor)); } template < class ExecutionSpace, class IteratorType, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const ExecutionSpace& ex, IteratorType first, - IteratorType last, UnaryFunctorType functor) { - return Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default", - ex, first, last, std::move(functor)); +void for_each(const ExecutionSpace& ex, IteratorType first, IteratorType last, + UnaryFunctorType functor) { + Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default", ex, + first, last, std::move(functor)); } template < class ExecutionSpace, class DataType, class... Properties, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +void for_each(const std::string& label, const ExecutionSpace& ex, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v), + std::move(functor)); } template < class ExecutionSpace, class DataType, class... 
Properties, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const ExecutionSpace& ex, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +void for_each(const ExecutionSpace& ex, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex, - KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex, + KE::begin(v), KE::end(v), std::move(functor)); } // @@ -82,24 +79,23 @@ UnaryFunctorType for_each(const ExecutionSpace& ex, template , int> = 0> -KOKKOS_FUNCTION UnaryFunctorType for_each(const TeamHandleType& teamHandle, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { - return Impl::for_each_team_impl(teamHandle, first, last, std::move(functor)); +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { + Impl::for_each_team_impl(teamHandle, first, last, std::move(functor)); } template , int> = 0> -KOKKOS_FUNCTION UnaryFunctorType -for_each(const TeamHandleType& teamHandle, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v), + std::move(functor)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp index d3be3b7f667..99cc4a1cf3a 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp @@ -42,10 +42,9 @@ struct StdForEachFunctor { }; template -UnaryFunctorType for_each_exespace_impl(const std::string& label, - const HandleType& handle, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { +void for_each_exespace_impl(const std::string& label, const HandleType& handle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { // checks Impl::static_assert_random_access_and_accessible(handle, first); Impl::expect_valid_range(first, last); @@ -56,8 +55,6 @@ UnaryFunctorType for_each_exespace_impl(const std::string& label, label, RangePolicy(handle, 0, num_elements), StdForEachFunctor(first, functor)); handle.fence("Kokkos::for_each: fence after operation"); - - return functor; } template -KOKKOS_FUNCTION UnaryFunctorType -for_each_team_impl(const TeamHandleType& teamHandle, IteratorType first, - IteratorType last, UnaryFunctorType functor) { +KOKKOS_FUNCTION void for_each_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { // checks Impl::static_assert_random_access_and_accessible(teamHandle, first); Impl::expect_valid_range(first, last); @@ -96,7 +93,6 @@ for_each_team_impl(const TeamHandleType& teamHandle, IteratorType first, TeamThreadRange(teamHandle, 0, num_elements), StdForEachFunctor(first, functor)); teamHandle.team_barrier(); - return functor; } template Date: Thu, 28 Mar 2024 22:37:37 -0400 Subject: [PATCH 20/95] Drop specialization of 
ViewMapping for Kokkos::Array --- core/src/Kokkos_View.hpp | 1 - core/src/impl/Kokkos_ViewArray.hpp | 622 ----------------------------- core/unit_test/TestAggregate.hpp | 72 ---- core/unit_test/TestOther.hpp | 1 - 4 files changed, 696 deletions(-) delete mode 100644 core/src/impl/Kokkos_ViewArray.hpp delete mode 100644 core/unit_test/TestAggregate.hpp diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 484a0e6f62e..d8bf5fc0a88 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -522,7 +522,6 @@ constexpr bool is_assignable(const Kokkos::View& dst, //---------------------------------------------------------------------------- #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/impl/Kokkos_ViewArray.hpp b/core/src/impl/Kokkos_ViewArray.hpp deleted file mode 100644 index fe43b630184..00000000000 --- a/core/src/impl/Kokkos_ViewArray.hpp +++ /dev/null @@ -1,622 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP -#define KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP - -#include - -namespace Kokkos { -namespace Impl { - -template -struct ViewDataAnalysis> { - private: - using array_analysis = ViewArrayAnalysis; - - static_assert(std::is_void

::value); - static_assert(std::is_same>::value); - static_assert(std::is_scalar::value, - "View of Array type must be of a scalar type"); - - public: - using specialize = Kokkos::Array<>; - - using dimension = typename array_analysis::dimension; - - private: - enum { - is_const = std::is_same::value - }; - - using array_scalar_dimension = typename dimension::template append::type; - - using scalar_type = std::conditional_t; - using non_const_scalar_type = V; - using const_scalar_type = const V; - - public: - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - using type = typename ViewDataType::type; - using const_type = typename ViewDataType::type; - using non_const_type = - typename ViewDataType::type; - - using scalar_array_type = - typename ViewDataType::type; - using const_scalar_array_type = - typename ViewDataType::type; - using non_const_scalar_array_type = - typename ViewDataType::type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief View mapping for non-specialized data type and standard layout */ -template -class ViewMapping> { - private: - template - friend class ViewMapping; - template - friend class Kokkos::View; - - using offset_type = ViewOffset; - - using handle_type = typename Traits::value_type::pointer; - - handle_type m_impl_handle; - offset_type m_impl_offset; - size_t m_stride = 0; - - using scalar_type = typename Traits::value_type::value_type; - - using contiguous_reference = Kokkos::Array::contiguous>; - using strided_reference = - Kokkos::Array::strided>; - - enum { - is_contiguous_reference = - (Traits::rank == 0) || (std::is_same::value) - }; - - enum { Array_N = Traits::value_type::size() }; - enum { Array_S = is_contiguous_reference ? Array_N : 1 }; - - KOKKOS_INLINE_FUNCTION - ViewMapping(const handle_type &arg_handle, const offset_type &arg_offset) - : m_impl_handle(arg_handle), - m_impl_offset(arg_offset), - m_stride(is_contiguous_reference ? 
0 : arg_offset.span()) {} - - public: - //---------------------------------------- - // Domain dimensions - - static constexpr unsigned Rank = Traits::dimension::rank; - - template - KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType &r) const { - return m_impl_offset.m_dim.extent(r); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - using dim_type = typename offset_type::dimension_type; - return dim_type::static_extent(r); - } - - KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() - const { - return m_impl_offset.layout(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { - return m_impl_offset.dimension_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { - return m_impl_offset.dimension_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { - return m_impl_offset.dimension_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { - return m_impl_offset.dimension_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { - return m_impl_offset.dimension_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { - return m_impl_offset.dimension_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { - return m_impl_offset.dimension_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { - return m_impl_offset.dimension_7(); - } - - // Is a regular layout with uniform striding for each index. - using is_regular = typename offset_type::is_regular; - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_impl_offset.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_impl_offset.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_impl_offset.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_impl_offset.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_impl_offset.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_impl_offset.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_impl_offset.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_impl_offset.stride_7(); - } - - //---------------------------------------- - // Range span - - /** \brief Span of the mapped range */ - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { - return m_impl_offset.span() * Array_N; - } - - /** \brief Is the mapped range span contiguous */ - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_impl_offset.span_is_contiguous(); - } - - using reference_type = - std::conditional_t; - - using pointer_type = handle_type; - - /** \brief If data references are lvalue_reference than can query pointer to - * memory */ - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_impl_handle; - } - - //---------------------------------------- - // The View class performs all rank and bounds checking before - // calling these element reference methods. 
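// Editorial sketch of what this now-removed mapping provided; the declaration
// below is illustrative only (Device, a, n and i are placeholders, not taken
// from the patch):
//
//   Kokkos::View<Kokkos::Array<double, 3>*, Device> a("a", n);
//   auto e = a(i);  // an Array proxy over three contiguous or strided doubles
//
// Element access on a View whose value type is a Kokkos::Array returned one of
// the contiguous/strided proxy references defined above, which is the code
// path this patch drops.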
- - KOKKOS_FORCEINLINE_FUNCTION - reference_type reference() const { - return reference_type(m_impl_handle + 0, Array_N, 0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0) const { - return reference_type(m_impl_handle + m_impl_offset(i0) * Array_S, Array_N, - m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1) const { - return reference_type(m_impl_handle + m_impl_offset(i0, i1) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1, - const I2 &i2) const { - return reference_type(m_impl_handle + m_impl_offset(i0, i1, i2) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3) * Array_S, Array_N, - m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1, - const I2 &i2, - const I3 &i3, - const I4 &i4) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4) * Array_S, Array_N, - m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, const I7 &i7) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7) * Array_S, - Array_N, m_stride); - } - - //---------------------------------------- - - private: - enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ }; - enum { MemorySpanSize = sizeof(scalar_type) }; - - public: - /** \brief Span, in bytes, of the referenced memory */ - KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const { - return (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) & - ~size_t(MemorySpanMask); - } - - //---------------------------------------- - - KOKKOS_DEFAULTED_FUNCTION ViewMapping() = default; - - //---------------------------------------- - - template - KOKKOS_INLINE_FUNCTION ViewMapping(pointer_type ptr, Args... 
args) - : m_impl_handle(ptr), - m_impl_offset(std::integral_constant(), args...), - m_stride(m_impl_offset.span()) {} - - //---------------------------------------- - - template - Kokkos::Impl::SharedAllocationRecord<> *allocate_shared( - Kokkos::Impl::ViewCtorProp const &arg_prop, - typename Traits::array_layout const &arg_layout, - bool execution_space_specified) { - using alloc_prop = Kokkos::Impl::ViewCtorProp; - - using execution_space = typename alloc_prop::execution_space; - using memory_space = typename Traits::memory_space; - static_assert( - SpaceAccessibility::accessible); - using functor_type = - ViewValueFunctor; - using record_type = - Kokkos::Impl::SharedAllocationRecord; - - // Query the mapping for byte-size of allocation. - using padding = std::integral_constant< - unsigned int, alloc_prop::allow_padding ? sizeof(scalar_type) : 0>; - - m_impl_offset = offset_type(padding(), arg_layout); - - const size_t alloc_size = - (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) & - ~size_t(MemorySpanMask); - const auto &alloc_name = Impl::get_property(arg_prop); - const execution_space &exec_space = - Impl::get_property(arg_prop); - const memory_space &mem_space = - Impl::get_property(arg_prop); - - // Allocate memory from the memory space and create tracking record. - record_type *const record = - execution_space_specified - ? record_type::allocate(exec_space, mem_space, alloc_name, - alloc_size) - : record_type::allocate(mem_space, alloc_name, alloc_size); - - m_impl_handle = handle_type(reinterpret_cast(record->data())); - - functor_type functor = - execution_space_specified - ? functor_type(exec_space, (pointer_type)m_impl_handle, - m_impl_offset.span() * Array_N, alloc_name) - : functor_type((pointer_type)m_impl_handle, - m_impl_offset.span() * Array_N, alloc_name); - -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) - if (false) { - // Make sure the destroy functor gets instantiated. - // This avoids "cudaErrorInvalidDeviceFunction"-type errors. - functor.destroy_shared_allocation(); - } -#endif - - // Only initialize if the allocation is non-zero. - // May be zero if one of the dimensions is zero. - if constexpr (alloc_prop::initialize) - if (alloc_size) { - // Assume destruction is only required when construction is requested. - // The ViewValueFunctor has both value construction and destruction - // operators. 
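
At the user level, the branches in this allocation path correspond to the optional properties that can be passed through `Kokkos::view_alloc` (initialization on/off, padding, an explicit execution space instance). A minimal, generic sketch of those options follows; the labels and sizes are illustrative only and are not tied to the Array specialization being touched here.

```c++
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);
  Kokkos::DefaultExecutionSpace exec;

  // Default: allocate and value-initialize on the default execution space.
  Kokkos::View<double*> a(Kokkos::view_alloc("a"), 1000);

  // Skip initialization entirely (the alloc_prop::initialize == false case).
  Kokkos::View<double*> b(Kokkos::view_alloc("b", Kokkos::WithoutInitializing),
                          1000);

  // Allow the mapping to pad extents for alignment (alloc_prop::allow_padding).
  Kokkos::View<double*> c(Kokkos::view_alloc("c", Kokkos::AllowPadding), 1000);

  // Pass an execution space instance so allocation/initialization is tied to
  // it (the execution_space_specified branch above).
  Kokkos::View<double*> d(Kokkos::view_alloc(exec, "d"), 1000);
}
```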
- record->m_destroy = std::move(functor); - - // Construct values - record->m_destroy.construct_shared_allocation(); - } - - return record; - } -}; - -/** \brief Assign Array to non-Array */ - -template -class ViewMapping< - DstTraits, SrcTraits, - std::enable_if_t<( - std::is_same::value && - std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same>::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>> { - public: - // Can only convert to View::array_type - - enum { - is_assignable_data_type = - std::is_same::value && - (DstTraits::rank == SrcTraits::rank + 1) - }; - enum { - is_assignable = - std::is_same::value && - std::is_same::value - }; - - using TrackType = Kokkos::Impl::SharedAllocationTracker; - using DstType = ViewMapping; - using SrcType = ViewMapping>; - - KOKKOS_INLINE_FUNCTION - static void assign(DstType &dst, const SrcType &src, - const TrackType & /*src_track*/) { - static_assert(is_assignable, "Can only convert to array_type"); - - using dst_offset_type = typename DstType::offset_type; - - // Array dimension becomes the last dimension. - // Arguments beyond the destination rank are ignored. - if (src.span_is_contiguous()) { // not padded - dst.m_impl_offset = dst_offset_type( - std::integral_constant(), - typename DstTraits::array_layout( - (0 < SrcType::Rank ? src.dimension_0() - : SrcTraits::value_type::size()), - (1 < SrcType::Rank ? src.dimension_1() - : SrcTraits::value_type::size()), - (2 < SrcType::Rank ? src.dimension_2() - : SrcTraits::value_type::size()), - (3 < SrcType::Rank ? src.dimension_3() - : SrcTraits::value_type::size()), - (4 < SrcType::Rank ? src.dimension_4() - : SrcTraits::value_type::size()), - (5 < SrcType::Rank ? src.dimension_5() - : SrcTraits::value_type::size()), - (6 < SrcType::Rank ? src.dimension_6() - : SrcTraits::value_type::size()), - (7 < SrcType::Rank ? src.dimension_7() - : SrcTraits::value_type::size()))); - } else { // is padded - using padded = std::integral_constant< - unsigned int, sizeof(typename SrcTraits::value_type::value_type)>; - - dst.m_impl_offset = dst_offset_type( - padded(), typename DstTraits::array_layout( - (0 < SrcType::Rank ? src.dimension_0() - : SrcTraits::value_type::size()), - (1 < SrcType::Rank ? src.dimension_1() - : SrcTraits::value_type::size()), - (2 < SrcType::Rank ? src.dimension_2() - : SrcTraits::value_type::size()), - (3 < SrcType::Rank ? src.dimension_3() - : SrcTraits::value_type::size()), - (4 < SrcType::Rank ? src.dimension_4() - : SrcTraits::value_type::size()), - (5 < SrcType::Rank ? src.dimension_5() - : SrcTraits::value_type::size()), - (6 < SrcType::Rank ? src.dimension_6() - : SrcTraits::value_type::size()), - (7 < SrcType::Rank ? 
src.dimension_7() - : SrcTraits::value_type::size()))); - } - - dst.m_impl_handle = src.m_impl_handle; - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ViewMapping< - std::enable_if_t<( - std::is_same>::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>, - SrcTraits, Args...> { - private: - static_assert(SrcTraits::rank == sizeof...(Args)); - - enum : bool { - R0 = is_integral_extent<0, Args...>::value, - R1 = is_integral_extent<1, Args...>::value, - R2 = is_integral_extent<2, Args...>::value, - R3 = is_integral_extent<3, Args...>::value, - R4 = is_integral_extent<4, Args...>::value, - R5 = is_integral_extent<5, Args...>::value, - R6 = is_integral_extent<6, Args...>::value, - R7 = is_integral_extent<7, Args...>::value - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) - }; - - // Whether right-most rank is a range. - enum { - R0_rev = - 0 == SrcTraits::rank - ? false - : (1 == SrcTraits::rank - ? R0 - : (2 == SrcTraits::rank - ? R1 - : (3 == SrcTraits::rank - ? R2 - : (4 == SrcTraits::rank - ? R3 - : (5 == SrcTraits::rank - ? R4 - : (6 == SrcTraits::rank - ? R5 - : (7 == SrcTraits::rank - ? R6 - : R7))))))) - }; - - // Subview's layout - using array_layout = - std::conditional_t<((rank == 0) || - (rank <= 2 && R0 && - std::is_same::value) || - (rank <= 2 && R0_rev && - std::is_same::value)), - typename SrcTraits::array_layout, - Kokkos::LayoutStride>; - - using value_type = typename SrcTraits::value_type; - - using data_type = std::conditional_t< - rank == 0, value_type, - std::conditional_t< - rank == 1, value_type *, - std::conditional_t< - rank == 2, value_type **, - std::conditional_t< - rank == 3, value_type ***, - std::conditional_t< - rank == 4, value_type ****, - std::conditional_t< - rank == 5, value_type *****, - std::conditional_t< - rank == 6, value_type ******, - std::conditional_t>>>>>>>; - - public: - using traits_type = Kokkos::ViewTraits; - - using type = - Kokkos::View; - - KOKKOS_INLINE_FUNCTION - static void assign(ViewMapping &dst, - ViewMapping const &src, Args... args) { - using DstType = ViewMapping; - - using dst_offset_type = typename DstType::offset_type; - using dst_handle_type = typename DstType::handle_type; - - const SubviewExtents extents(src.m_impl_offset.m_dim, - args...); - - dst.m_impl_offset = dst_offset_type(src.m_impl_offset, extents); - dst.m_impl_handle = dst_handle_type( - src.m_impl_handle + - src.m_impl_offset(extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - extents.domain_offset(4), extents.domain_offset(5), - extents.domain_offset(6), extents.domain_offset(7))); - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP */ diff --git a/core/unit_test/TestAggregate.hpp b/core/unit_test/TestAggregate.hpp deleted file mode 100644 index b4d924793c1..00000000000 --- a/core/unit_test/TestAggregate.hpp +++ /dev/null @@ -1,72 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef TEST_AGGREGATE_HPP -#define TEST_AGGREGATE_HPP - -#include - -namespace Test { - -template -void TestViewAggregate() { - using value_type = Kokkos::Array; - using analysis_1d = - Kokkos::Impl::ViewDataAnalysis; - - static_assert( - std::is_same >::value); - - using a32_traits = Kokkos::ViewTraits; - using flat_traits = - Kokkos::ViewTraits; - - static_assert( - std::is_same >::value); - static_assert( - std::is_same::value); - static_assert(a32_traits::rank == 2); - static_assert(a32_traits::rank_dynamic == 2); - - static_assert(std::is_void::value); - static_assert(flat_traits::rank == 3); - static_assert(flat_traits::rank_dynamic == 2); - static_assert(flat_traits::dimension::N2 == 32); - - using a32_type = Kokkos::View **, DeviceType>; - using a32_flat_type = typename a32_type::array_type; - - static_assert(std::is_same::value); - static_assert(std::is_same::value); - static_assert(a32_type::rank == 2); - static_assert(a32_flat_type::rank == 3); - - a32_type x("test", 4, 5); - a32_flat_type y(x); - - ASSERT_EQ(x.extent(0), 4u); - ASSERT_EQ(x.extent(1), 5u); - ASSERT_EQ(y.extent(0), 4u); - ASSERT_EQ(y.extent(1), 5u); - ASSERT_EQ(y.extent(2), 32u); -} - -TEST(TEST_CATEGORY, view_aggregate) { TestViewAggregate(); } - -} // namespace Test - -#endif /* #ifndef TEST_AGGREGATE_HPP */ diff --git a/core/unit_test/TestOther.hpp b/core/unit_test/TestOther.hpp index fcf0353a88c..2a5bf4a7166 100644 --- a/core/unit_test/TestOther.hpp +++ b/core/unit_test/TestOther.hpp @@ -16,7 +16,6 @@ #ifndef KOKKOS_TEST_OTHER_HPP #define KOKKOS_TEST_OTHER_HPP -#include #include #include From 059cd15c0b2c23443d2e870bcfe767611386157f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 2 Apr 2024 16:12:26 -0400 Subject: [PATCH 21/95] Accommodate users that depend on a code that define silly macros (#6909) * Accommodate users that depend on a code that define silly macros Reported on Slack, user was getting when including some other dependency ``` /include/Kokkos_Tuners.hpp(259): error: expected a "," or ">" template ^ ``` It turns out that dependency defines macros such as "X", "Y", "Z", or even "DIM" which is really asking for trouble. See here https://github.com/fluiddynsci/EngSketchPad/blob/1fe3fc4c68a759e0832c02f8d3c2bd8722f183a8/include/libCart3D/c3d_global.h#L24-L28 Since it does not take much change to accommodate that user, we avoid named parameters that collide with these silly macro defines. We are not promising that we support it but this should help. 
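
A self-contained illustration of the failure mode; the header, macro, and type names below are hypothetical stand-ins for the reported user code, not the actual dependency.

```c++
#include <cstddef>

// Imagine a third-party header that leaks single-letter macros into every
// translation unit that includes it:
#define X 1
#define DIM 3

// A declaration like the following would no longer compile, because the
// preprocessor rewrites "X" to "1" and "DIM" to "3" before the compiler
// ever sees the template parameter names:
//   template <typename X, std::size_t DIM> struct Grid;
//
// Renaming the parameters, as this patch does in Kokkos_Tuners.hpp, sidesteps
// the collision without requiring the user to clean up their macros:
template <typename XCoord, std::size_t ArraySize>
struct Grid {
  XCoord data[ArraySize];
};

int main() {
  Grid<double, 4> g{};
  (void)g;
}
```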
* Per review rename SZ -> ArraySize --- core/src/Kokkos_Tuners.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/core/src/Kokkos_Tuners.hpp b/core/src/Kokkos_Tuners.hpp index 618401654e7..f5ffc66af5b 100644 --- a/core/src/Kokkos_Tuners.hpp +++ b/core/src/Kokkos_Tuners.hpp @@ -256,13 +256,14 @@ auto get_point_helper(const PointType& in, const ArrayType& indices, template struct GetPoint; -template -struct GetPoint> { +template +struct GetPoint< + PointType, + std::array> { using index_set_type = - std::array; + std::array; static auto build(const PointType& in, const index_set_type& indices) { - return get_point_helper(in, indices, std::make_index_sequence{}); + return get_point_helper(in, indices, std::make_index_sequence{}); } }; From 2aecb1d2412eca0729f3bc3da9e10c215c17da78 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 2 Apr 2024 21:57:09 -0400 Subject: [PATCH 22/95] SYCL: Fix multi-GPU support and add test (#6887) * SYCL: Fix multi-GPU support and add test * Combine Cuda and SYCL test * Drop exception_handler --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 23 +-- core/unit_test/CMakeLists.txt | 10 +- core/unit_test/TestMultiGPU.hpp | 184 ++++++++++++++++++ .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 162 +-------------- .../sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp | 64 ++++++ 5 files changed, 269 insertions(+), 174 deletions(-) create mode 100644 core/unit_test/TestMultiGPU.hpp create mode 100644 core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 0e67adb5787..4a1c910c73d 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -171,21 +171,22 @@ sycl::device_ptr SYCLInternal::resize_team_scratch_space( // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
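
The hunk below replaces the free-function `kokkos_malloc`/`kokkos_realloc`/`kokkos_free` calls with `allocate`/`deallocate` on a `SYCLDeviceUSMSpace` constructed from the instance's own queue, so the scratch memory ends up on the intended device. A user-level sketch contrasting the two allocation idioms, using `HostSpace` for brevity rather than the SYCL space used in the actual change:

```c++
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);
  constexpr size_t n = 128;

  // Free-function style: the memory space is a template parameter and a
  // default-constructed space instance is used internally.
  void* p1 = Kokkos::kokkos_malloc<Kokkos::HostSpace>("p1", n * sizeof(int));
  Kokkos::kokkos_free<Kokkos::HostSpace>(p1);

  // Instance style: allocate/deallocate through a particular memory space
  // object, which is what this patch switches to so the allocation is bound
  // to the execution space instance that requested it.
  Kokkos::HostSpace space;
  void* p2 = space.allocate("p2", n * sizeof(int));
  space.deallocate(p2, n * sizeof(int));
}
```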
- if (m_team_scratch_current_size[scratch_pool_id] == 0) { + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc( - "Kokkos::Experimental::SYCLDeviceUSMSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -234,8 +235,8 @@ void SYCLInternal::finalize() { for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) { - Kokkos::kokkos_free( - m_team_scratch_ptr[i]); + device_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; } diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 6dfb7505c5d..a2350235318 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -902,15 +902,21 @@ if(Kokkos_ENABLE_SYCL) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_SYCLInterOpInit_Context SOURCES - UnitTestMainInit.cpp + UnitTestMainInit.cpp sycl/TestSYCL_InterOp_Init_Context.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_SYCLInterOpStreams SOURCES - UnitTestMain.cpp + UnitTestMain.cpp sycl/TestSYCL_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_SYCLInterOpStreamsMultiGPU + SOURCES + UnitTestMainInit.cpp + sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp + ) endif() SET(DEFAULT_DEVICE_SOURCES diff --git a/core/unit_test/TestMultiGPU.hpp b/core/unit_test/TestMultiGPU.hpp new file mode 100644 index 00000000000..aad2fa45f49 --- /dev/null +++ b/core/unit_test/TestMultiGPU.hpp @@ -0,0 +1,184 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, + TEST_EXECSPACE exec, Kokkos::View v) { + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + exec.fence(); + exec0.fence(); + + Kokkos::deep_copy(exec, v, 5); + Kokkos::deep_copy(exec0, v0, 5); + + Kokkos::deep_copy(v, v0); + + int sum; + int sum0; + + Kokkos::parallel_for("Test::Range_0", + Kokkos::RangePolicy(exec0, 0, 100), + Test::FunctorRange(v0)); + Kokkos::parallel_for("Test::Range", + Kokkos::RangePolicy(exec, 0, 100), + Test::FunctorRange(v)); + exec.fence(); + exec0.fence(); + Kokkos::parallel_reduce( + "Test::RangeReduce_0", + Kokkos::RangePolicy>(exec0, + 0, 100), + Test::FunctorRangeReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::RangeReduce", + Kokkos::RangePolicy>(exec, 0, + 100), + Test::FunctorRangeReduce(v), sum); + ASSERT_EQ(600, sum0); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::MDRange_0", + Kokkos::MDRangePolicy>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRange(v0)); + Kokkos::parallel_for("Test::MDRange", + Kokkos::MDRangePolicy>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRange(v)); + Kokkos::parallel_reduce("Test::MDRangeReduce_0", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v0), sum0); + Kokkos::parallel_reduce("Test::MDRangeReduce", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v), sum); + ASSERT_EQ(700, sum0); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::Team_0", + Kokkos::TeamPolicy(exec0, 10, 10), + Test::FunctorTeam(v0)); + Kokkos::parallel_for("Test::Team", + Kokkos::TeamPolicy(exec, 10, 10), + Test::FunctorTeam(v)); + Kokkos::parallel_reduce( + "Test::Team_0", + Kokkos::TeamPolicy>(exec0, + 10, 10), + Test::FunctorTeamReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::Team", + Kokkos::TeamPolicy>(exec, 10, + 10), + Test::FunctorTeamReduce(v), sum); + ASSERT_EQ(800, sum0); + ASSERT_EQ(800, sum); +} + +struct ScratchFunctor { + int scratch_size; + int R; + + ScratchFunctor(int scratch_size_, int R_) + : scratch_size(scratch_size_), R(R_) {} + + KOKKOS_FUNCTION + void operator()(const Kokkos::TeamPolicy::member_type &team, + int &error_accum) const { + Kokkos::View scratch_mem( + team.team_scratch(1), scratch_size); + + // Initialize scratch memory + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) = 0; }); + team.team_barrier(); + + // Increment each entry in scratch memory R times + for (int r = 0; r < R; ++r) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) += 1; }); + } + team.team_barrier(); + + // Check that each scratch entry has been incremented exactly R times + int team_error_accum; + auto R_loc = R; // avoid implicit capture of this + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i, int &tsum) { + if (scratch_mem(i) != R_loc) { + tsum += 1; + } + }, + team_error_accum); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { error_accum += team_error_accum; }); + } +}; + +void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { + constexpr int N = 10; + constexpr int R = 1000; + constexpr int scratch_size = 100; + using ScratchType = Kokkos::View; + + // Test allocating and using scratch space + ScratchFunctor f(scratch_size, R); + + auto policy0 = + 
Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + auto policy1 = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + + int error0, error1; + + Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); + Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); + + // Request larger scratch size to trigger a realloc and test + const auto new_scratch_size = scratch_size + 10; + ScratchFunctor f_more_scratch(new_scratch_size, R); + + auto policy0_more_scratch = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + auto policy1_more_scratch = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + + Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, + f_more_scratch, error0); + Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, + f_more_scratch, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); +} +} // namespace diff --git a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp index d94735ceb23..40955e9c7ca 100644 --- a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp +++ b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -15,7 +15,7 @@ //@HEADER #include -#include +#include namespace { @@ -57,79 +57,6 @@ std::array get_execution_spaces( return {exec0, exec1}; } -// Test Interoperability with Cuda Streams -void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, - TEST_EXECSPACE exec, Kokkos::View v) { - using MemorySpace = typename TEST_EXECSPACE::memory_space; - - Kokkos::deep_copy(exec, v, 5); - Kokkos::deep_copy(exec0, v0, 5); - - Kokkos::deep_copy(v, v0); - - int sum; - int sum0; - - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0", - Kokkos::RangePolicy(exec0, 0, 100), - Test::FunctorRange(v0)); - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range", - Kokkos::RangePolicy(exec, 0, 100), - Test::FunctorRange(v)); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::RangeReduce_0", - Kokkos::RangePolicy>(exec0, - 0, 100), - Test::FunctorRangeReduce(v0), sum0); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::RangeReduce", - Kokkos::RangePolicy>(exec, 0, - 100), - Test::FunctorRangeReduce(v), sum); - ASSERT_EQ(600, sum0); - ASSERT_EQ(600, sum); - - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0", - Kokkos::MDRangePolicy>( - exec0, {0, 0}, {10, 10}), - Test::FunctorMDRange(v0)); - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange", - Kokkos::MDRangePolicy>( - exec, {0, 0}, {10, 10}), - Test::FunctorMDRange(v)); - Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0", - Kokkos::MDRangePolicy, - Kokkos::LaunchBounds<128, 2>>( - exec0, {0, 0}, {10, 10}), - Test::FunctorMDRangeReduce(v0), sum0); - Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce", - Kokkos::MDRangePolicy, - Kokkos::LaunchBounds<128, 2>>( - exec, {0, 0}, {10, 10}), - Test::FunctorMDRangeReduce(v), sum); - ASSERT_EQ(700, sum0); - ASSERT_EQ(700, sum); - - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0", - Kokkos::TeamPolicy(exec0, 10, 10), - Test::FunctorTeam(v0)); - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team", - 
Kokkos::TeamPolicy(exec, 10, 10), - Test::FunctorTeam(v)); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::Team_0", - Kokkos::TeamPolicy>(exec0, - 10, 10), - Test::FunctorTeamReduce(v0), sum0); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::Team", - Kokkos::TeamPolicy>(exec, 10, - 10), - Test::FunctorTeamReduce(v), sum); - ASSERT_EQ(800, sum0); - ASSERT_EQ(800, sum); -} - TEST(cuda_multi_gpu, managed_views) { StreamsAndDevices streams_and_devices; { @@ -169,93 +96,6 @@ TEST(cuda_multi_gpu, unmanaged_views) { } } -struct ScratchFunctor { - int scratch_size; - int R; - - ScratchFunctor(int scratch_size_, int R_) - : scratch_size(scratch_size_), R(R_) {} - - KOKKOS_FUNCTION - void operator()(const Kokkos::TeamPolicy::member_type &team, - int &error_accum) const { - Kokkos::View scratch_mem( - team.team_scratch(1), scratch_size); - - // Initialize scratch memory - Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), - [&](int i) { scratch_mem(i) = 0; }); - team.team_barrier(); - - // Increment each entry in scratch memory R times - for (int r = 0; r < R; ++r) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), - [&](int i) { scratch_mem(i) += 1; }); - } - team.team_barrier(); - - // Check that each scratch entry has been incremented exactly R times - int team_error_accum; - auto R_loc = R; // avoid implicit capture of this - Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(team, 0, scratch_size), - [&](int i, int &tsum) { - if (scratch_mem(i) != R_loc) { - tsum += 1; - } - }, - team_error_accum); - Kokkos::single(Kokkos::PerTeam(team), - [&]() { error_accum += team_error_accum; }); - } -}; - -void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { - constexpr int N = 10; - constexpr int R = 1000; - constexpr int scratch_size = 100; - using ScratchType = Kokkos::View; - - // Test allocating and using scratch space - ScratchFunctor f(scratch_size, R); - - auto policy0 = - Kokkos::TeamPolicy(exec0, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); - auto policy1 = - Kokkos::TeamPolicy(exec1, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); - - int error0, error1; - - Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); - Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); - ASSERT_EQ(error0, 0); - ASSERT_EQ(error1, 0); - - // Request larger scratch size to trigger a realloc and test - const auto new_scratch_size = scratch_size + 10; - ScratchFunctor f_more_scratch(new_scratch_size, R); - - auto policy0_more_scratch = - Kokkos::TeamPolicy(exec0, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); - auto policy1_more_scratch = - Kokkos::TeamPolicy(exec1, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); - - Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, - f_more_scratch, error0); - Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, - f_more_scratch, error1); - ASSERT_EQ(error0, 0); - ASSERT_EQ(error1, 0); -} - TEST(cuda_multi_gpu, scratch_space) { StreamsAndDevices streams_and_devices; { diff --git a/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp b/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 00000000000..d3906e409f5 --- /dev/null +++ b/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,64 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +std::array get_execution_spaces() { + std::vector gpu_devices = + sycl::device::get_devices(sycl::info::device_type::gpu); + + TEST_EXECSPACE exec0( + sycl::queue{gpu_devices.front(), sycl::property::queue::in_order()}); + TEST_EXECSPACE exec1( + sycl::queue{gpu_devices.back(), sycl::property::queue::in_order()}); + + return {exec0, exec1}; +} + +TEST(sycl_multi_gpu, managed_views) { + std::array execs = get_execution_spaces(); + + Kokkos::View view0(Kokkos::view_alloc("v0", execs[0]), + 100); + Kokkos::View view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); +} + +TEST(sycl_multi_gpu, unmanaged_views) { + std::array execs = get_execution_spaces(); + + int *p0 = sycl::malloc_device(100, execs[0].sycl_queue()); + Kokkos::View view0(p0, 100); + + int *p1 = sycl::malloc_device(100, execs[1].sycl_queue()); + Kokkos::View view1(p1, 100); + + test_policies(execs[0], view0, execs[1], view1); + sycl::free(p0, execs[0].sycl_queue()); + sycl::free(p1, execs[1].sycl_queue()); +} + +TEST(sycl_multi_gpu, scratch_space) { + std::array execs = get_execution_spaces(); + + test_scratch(execs[0], execs[1]); +} +} // namespace From caa139c9b9307c88599bbe9e5e180b6db44692e4 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 3 Apr 2024 18:35:21 -0400 Subject: [PATCH 23/95] SYCL: Unroll shuffle loops for top-level parallel_reduce and parallel_scan (#6750) * SYCL: Unroll shuffle loops for top-level parallel_reduce and parallel_scan * Rename second lambda * Assert upper bounds and remove unattainable cases * Fix sign comparison warnings --- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 35 ++++++++++++++ .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 48 +++++++++++++++---- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 977b69bc9eb..3306efa957e 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -36,10 +36,25 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, const int sg_group_id = sg.get_group_id()[0]; const int id_in_sg = sg.get_local_id()[0]; +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int stride) { + if (stride < global_range) { + auto tmp = sg.shuffle_up(local_value, stride); + if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(global_range <= 32); +#else for (int stride = 1; stride < global_range; stride <<= 1) { auto tmp = sg.shuffle_up(local_value, stride); if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); } +#endif const int max_subgroup_size = sg.get_max_local_range()[0]; const int n_active_subgroups = @@ -61,6 +76,25 @@ void workgroup_scan(sycl::nd_item item, const 
FunctorType& final_reducer, const auto upper_bound = std::min(local_range, n_active_subgroups - round * local_range); auto local_sg_value = local_mem[idx < n_active_subgroups ? idx : 0]; +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine_sg = [&](int stride) { + if (stride < upper_bound) { + auto tmp = sg.shuffle_up(local_sg_value, stride); + if (id_in_sg >= stride) { + if (idx < n_active_subgroups) + final_reducer.join(&local_sg_value, &tmp); + else + local_sg_value = tmp; + } + } + }; + shuffle_combine_sg(1); + shuffle_combine_sg(2); + shuffle_combine_sg(4); + shuffle_combine_sg(8); + shuffle_combine_sg(16); + KOKKOS_ASSERT(upper_bound <= 32); +#else for (int stride = 1; stride < upper_bound; stride <<= 1) { auto tmp = sg.shuffle_up(local_sg_value, stride); if (id_in_sg >= stride) { @@ -70,6 +104,7 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, local_sg_value = tmp; } } +#endif if (idx < n_active_subgroups) { local_mem[idx] = local_sg_value; if (round > 0) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index c308384af09..7069805a5b5 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -21,11 +21,9 @@ namespace Kokkos::Impl::SYCLReduction { -// FIXME_SYCL It appears that using shuffles is slower than going through local -// memory. template -inline constexpr bool use_shuffle_based_algorithm = false; -// std::is_reference_v; +inline constexpr bool use_shuffle_based_algorithm = + std::is_reference_v; template std::enable_if_t> workgroup_reduction( @@ -109,17 +107,31 @@ std::enable_if_t> workgroup_reduction( // Perform the actual workgroup reduction in each subgroup // separately. - auto sg = item.get_sub_group(); - const int id_in_sg = sg.get_local_id()[0]; - const auto local_range = - std::min(sg.get_local_range()[0], max_size); + auto sg = item.get_sub_group(); + const int id_in_sg = sg.get_local_id()[0]; + const int local_range = std::min(sg.get_local_range()[0], max_size); const auto upper_stride_bound = - std::min(local_range - id_in_sg, max_size - local_id); + std::min(local_range - id_in_sg, max_size - local_id); +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int stride) { + if (stride < local_range) { + auto tmp = sg.shuffle_down(local_value, stride); + if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(local_range <= 32); +#else for (unsigned int stride = 1; stride < local_range; stride <<= 1) { auto tmp = sg.shuffle_down(local_value, stride); if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp); } +#endif // Copy the subgroup results into the first positions of the // reduction array. @@ -140,7 +152,7 @@ std::enable_if_t> workgroup_reduction( // the first subgroup, we first combine the items with a higher // index. if (n_active_subgroups > local_range) { - for (unsigned int offset = local_range; offset < n_active_subgroups; + for (int offset = local_range; offset < n_active_subgroups; offset += local_range) if (id_in_sg + offset < n_active_subgroups) { final_reducer.join(&sg_value, &local_mem[(id_in_sg + offset)]); @@ -149,11 +161,27 @@ std::enable_if_t> workgroup_reduction( } // Then, we proceed as before. 
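
The unrolling idea can be modelled on the host without SYCL. The sketch below replaces sub-group shuffles with reads from a copy of the previous round's values, only to show why the fixed strides 1, 2, 4, 8, 16 suffice once the sub-group size is capped at 32 (the condition the added `KOKKOS_ASSERT` checks); it is not the device implementation.

```c++
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

// Host-only model: "shuffle down by stride" is played by reading lane
// (id + stride) from a snapshot of the previous round, then joining into
// lane id, exactly like join(&local_value, &tmp) in the device code.
int tree_reduce(std::vector<int> lanes) {
  assert(lanes.size() <= 32);  // mirrors KOKKOS_ASSERT(range <= 32)
  auto round = [&](std::size_t stride) {
    std::vector<int> prev = lanes;  // values as seen before this round
    for (std::size_t id = 0; id + stride < prev.size(); ++id)
      lanes[id] += prev[id + stride];
  };
  // After stride s, lane i holds the sum of lanes i .. i+2s-1 (clamped),
  // so strides 1,2,4,8,16 cover any width up to 32.
  round(1);
  round(2);
  round(4);
  round(8);
  round(16);
  return lanes[0];
}

int main() {
  std::vector<int> v(23);
  std::iota(v.begin(), v.end(), 1);  // 1 + 2 + ... + 23
  std::printf("%d == %d\n", tree_reduce(v), 23 * 24 / 2);
}
```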
+#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine_sg = [&](int stride) { + if (stride < local_range) { + auto tmp = sg.shuffle_down(sg_value, stride); + if (id_in_sg + stride < n_active_subgroups) + final_reducer.join(&sg_value, &tmp); + } + }; + shuffle_combine_sg(1); + shuffle_combine_sg(2); + shuffle_combine_sg(4); + shuffle_combine_sg(8); + shuffle_combine_sg(16); + KOKKOS_ASSERT(local_range <= 32); +#else for (unsigned int stride = 1; stride < local_range; stride <<= 1) { auto tmp = sg.shuffle_down(sg_value, stride); if (id_in_sg + stride < n_active_subgroups) final_reducer.join(&sg_value, &tmp); } +#endif // Finally, we copy the workgroup results back to global memory // to be used in the next iteration. If this is the last From a833fb00ba79b36354fd45e3fbae1e1621e187d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Thu, 4 Apr 2024 13:53:12 +0200 Subject: [PATCH 24/95] Preparing readme for develop as the default branch (#6796) * Add 'Quick Start' section to README.md Added an extensive 'Quick Start' guide to the README.md document, which includes instructions for obtaining the latest release of Kokkos, building Kokkos for different platforms such as CPU on Linux or macOS, CUDA on Linux, and HIP on Linux, and using Kokkos in a CMake project. * Update hyperlinks in README.md The previous links pointing to kokkos.github.io have been changed to point correctly to kokkos.org. * Refine instructions for getting and building Kokkos in README.md The README.md file has been updated with simplified, clearer instructions. Changes include refined steps to obtain the latest Kokkos release using curl and to clone the development version of Kokkos from GitHub. * Add Kokkos installation guide using Spack in README.md * Reorder Readme and add links * Readme: Fix typo in cmake option * Fix typo in Readme Co-authored-by: Daniel Arndt * Readme: update version and hip instructions. * README: Remove building examples Adding one liner to get latest release * Fix a typo * dalg24 suggestions Co-authored-by: Damien L-G --------- Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G --- README.md | 61 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 19793bb82d9..f4252437111 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![Kokkos](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4) +[![Kokkos](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4)](https://kokkos.org) # Kokkos: Core Libraries @@ -10,43 +10,64 @@ hierarchies and multiple types of execution resources. It currently can use CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other backends in development. -**Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem.** +**Kokkos Core is part of the [Kokkos C++ Performance Portability Programming Ecosystem](https://kokkos.org/about/abstract/).** -For the complete documentation, click below: +## Learning about Kokkos -# [kokkos.github.io/kokkos-core-wiki](https://kokkos.github.io/kokkos-core-wiki) +To start learning about Kokkos: -# Learning about Kokkos +- [Kokkos Lectures](https://kokkos.org/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important capabilities. 
-To start learning about Kokkos: +- [Programming guide](https://kokkos.org/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. -- [Kokkos Lectures](https://kokkos.github.io/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem capabilities. +- [API reference](https://kokkos.org/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.org/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.org/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.org/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.org/kokkos-core-wiki/API/alphabetical.html). -- [Programming guide](https://kokkos.github.io/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. +- [Use cases and Examples](https://kokkos.org/kokkos-core-wiki/usecases.html): a serie of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -- [API reference](https://kokkos.github.io/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.github.io/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.github.io/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.github.io/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.github.io/kokkos-core-wiki/API/alphabetical.html). +## Obtaining Kokkos -- [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. +The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. +The current release is [4.2.01](https://github.com/kokkos/kokkos/releases/tag/4.2.01). -For non-public questions send an email to: *crtrott(at)sandia.gov* +```bash +curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +# Or with wget +wget https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +``` + +To clone the latest development version of Kokkos from GitHub: + +```bash +git clone -b develop https://github.com/kokkos/kokkos.git +``` + +### Building Kokkos -# Contributing to Kokkos +To build Kokkos, you will need to have a C++ compiler that supports C++14 or later. +All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.org/kokkos-core-wiki/requirements.html). -Please see [this page](https://kokkos.github.io/kokkos-core-wiki/contributing.html) for details on how to contribute. +Building and installation instructions are described [here](https://kokkos.org/kokkos-core-wiki/building.html). -# Requirements, Building and Installing +You can also install Kokkos using [Spack](https://spack.io/): `spack install kokkos`. [Available configuration options](https://packages.spack.io/package.html?name=kokkos) can be displayed using `spack info kokkos`. 
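
A minimal program for sanity-checking an installed Kokkos build might look like the following sketch; it fills a view, reduces on the default execution space, and inspects the data through a host mirror.

```c++
#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);

  const int n = 1000000;
  Kokkos::View<double*> x("x", n);

  // Fill on the default execution space (a GPU if one was enabled).
  Kokkos::parallel_for(
      "fill", n, KOKKOS_LAMBDA(const int i) { x(i) = 1.0 / (i + 1); });

  // Reduce on the device ...
  double sum = 0.0;
  Kokkos::parallel_reduce(
      "sum", n,
      KOKKOS_LAMBDA(const int i, double& partial) { partial += x(i); }, sum);

  // ... and copy data back through a host mirror when it needs inspecting.
  auto x_host = Kokkos::create_mirror_view(x);
  Kokkos::deep_copy(x_host, x);

  std::printf("sum = %f, x_host(0) = %f\n", sum, x_host(0));
  return 0;
}
```

Built against an installed Kokkos (for example via `find_package(Kokkos)` and linking `Kokkos::kokkos` in CMake), the same source runs on whichever backend was enabled at configure time.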
+ +## For the complete documentation: [kokkos.org/kokkos-core-wiki/](https://kokkos.org/kokkos-core-wiki/) + +## Support + +For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. + +For non-public questions send an email to: *crtrott(at)sandia.gov* -All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.github.io/kokkos-core-wiki/requirements.html). +## Contributing -Building and installation instructions are described [here](https://kokkos.github.io/kokkos-core-wiki/building.html). +Please see [this page](https://kokkos.org/kokkos-core-wiki/contributing.html) for details on how to contribute. -# Citing Kokkos +## Citing Kokkos -Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citation.html). +Please see the [following page](https://kokkos.org/kokkos-core-wiki/citation.html). -# License +## License [![License](https://img.shields.io/badge/License-Apache--2.0_WITH_LLVM--exception-blue)](https://spdx.org/licenses/LLVM-exception.html) From 682291f52cf0a4d0a837b717caa4fd41df9c6272 Mon Sep 17 00:00:00 2001 From: thierry antoun Date: Thu, 4 Apr 2024 14:49:38 +0200 Subject: [PATCH 25/95] Format with clang --- core/src/Kokkos_CopyViews.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 04708f73e77..6348c3d308f 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3659,10 +3659,11 @@ void check_view_ctor_args_create_mirror_view_and_copy() { "not explicitly allow padding!"); } -} // namespace Impl +} // namespace Impl template ::specialize>::value>> + class = std::enable_if< + std::is_void::specialize>::value>> auto create_mirror_view_and_copy( const Impl::ViewCtorProp& arg_prop, const Kokkos::View& src) { @@ -3670,11 +3671,13 @@ auto create_mirror_view_and_copy( Impl::check_view_ctor_args_create_mirror_view_and_copy(); - if constexpr (Impl::MirrorViewType::is_same_memspace) { + if constexpr (Impl::MirrorViewType::is_same_memspace) { // same behavior as deep_copy(src, src) if constexpr (!alloc_prop_input::has_execution_space) fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); return src; } else { using Space = typename alloc_prop_input::memory_space; From 497b438f1f16d63faa55240772b1f0d8681a7a89 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 4 Apr 2024 14:18:13 -0600 Subject: [PATCH 26/95] CHANGELOG.md: 4.3.00 update --- CHANGELOG.md | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1d06958295..f7b8af7695c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,105 @@ # CHANGELOG +## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) + +### Features: +* Add `Experimental::sort_by_key(exec, keys, values)` algorithm [\#6801](https://github.com/kokkos/kokkos/pull/6801) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental multi-GPU support (from the same process) [\#6782](https://github.com/kokkos/kokkos/pull/6782) +* Link against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE [\#6701](https://github.com/kokkos/kokkos/pull/6701) +* Don't use the compiler launcher script if the CMake compile language is CUDA. 
[\#6704](https://github.com/kokkos/kokkos/pull/6704) +* nvcc(wrapper): adding "long" and "short" versions for all flags [\#6615](https://github.com/kokkos/kokkos/pull/6615) + +#### HIP: + * Fix compilation when using amdclang (with ROCm >= 5.7) and RDC [\#6857](https://github.com/kokkos/kokkos/pull/6857) + * Use rocthrust for sorting, when available [\#6793](https://github.com/kokkos/kokkos/pull/6793) + +#### SYCL: +* We only support OneAPI SYCL implementation: add check during initialization + * Error out on initialization if the backend is different from `ext_oneapi_*` [\#6784](https://github.com/kokkos/kokkos/pull/6784) + * Filter GPU devices for `ext_onapi_*` GPU devices [\#6758](https://github.com/kokkos/kokkos/pull/6784) +* Performance Improvements + * Avoid unnecessary zero-memset of the scratch flags in SYCL [\#6739](https://github.com/kokkos/kokkos/pull/6739) + * Use host-pinned memory to copy reduction/scan result [\#6500](https://github.com/kokkos/kokkos/pull/6500) +* Address deprecations after oneAPI 2023.2.0 [\#6577](https://github.com/kokkos/kokkos/pull/6739) +* Make sure to call find_dependency for oneDPL if necessary [\#6870](https://github.com/kokkos/kokkos/pull/6870) + +#### OpenMPTarget: +* Use LLVM extensions for dynamic shared memory [\#6380](https://github.com/kokkos/kokkos/pull/6380) +* Guard scratch memory usage in ParallelReduce [\#6585 ](https://github.com/kokkos/kokkos/pull/6585) +* Update linker flags for Intel GPUs update [\#6735](https://github.com/kokkos/kokkos/pull/6735) +* Improve handling of printf on Intel GPUs [\#6652](https://github.com/kokkos/kokkos/pull/6652) + +#### OpenACC: +* Add atomics support [\#6446](https://github.com/kokkos/kokkos/pull/6446) +* Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) + +#### Threads: +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6446) + +#### OpenMP: +* Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) + +### General Enhancements + +* Improve performance of random number generation when using a normal distribution on GPUs [\#6556](https://github.com/kokkos/kokkos/pull/6556) +* Allocate temporary view with the user-provided execution space instance and do not initialize in `unique` algorithm [\#6598](https://github.com/kokkos/kokkos/pull/6598) +* Add deduction guide for `Kokkos::Array` [\#6373](https://github.com/kokkos/kokkos/pull/6373) +* Provide new public headers `` and `` [\#6687](https://github.com/kokkos/kokkos/pull/6687) +* Fix/improvement to `remove_if` parallel algorithm: use the provided execution space instance for temporary allocations and drop unnecessaryinitialization + avoid evaluating twice the predicate during final pass [\#6747](https://github.com/kokkos/kokkos/pull/6747) +* Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` [\#6713](https://github.com/kokkos/kokkos/pull/6713) +* simd: support `vector_aligned_tag` [\#6243](https://github.com/kokkos/kokkos/pull/6243) +* Avoid unnecessary allocation when default constructing Bitset [\#6524](https://github.com/kokkos/kokkos/pull/6524) +* Fix constness for views in std algorithms [\#6813](https://github.com/kokkos/kokkos/pull/6813) +* Improve error message on unsafe implicit conversion in MDRangePolicy [\#6855](https://github.com/kokkos/kokkos/pull/6855) +* CTAD (deduction guides) for RangePolicy 
[\#6850](https://github.com/kokkos/kokkos/pull/6850) +* CTAD (deduction guides) for MDRangePolicy [\#5516](https://github.com/kokkos/kokkos/pull/5516) + +### Build System Changes +* Require `Kokkos_ENABLE_ATOMICS_BYPASS` option to bypass atomic operation for Serial backend only builds [\#6692](https://github.com/kokkos/kokkos/pull/6692) +* Add support for RISCV and the Milk-V's Pioneer [\#6773](https://github.com/kokkos/kokkos/pull/6773) +* Add C++26 standard to CMake setup [\#6733](https://github.com/kokkos/kokkos/pull/6733) +* Fix Makefile when using gnu_generate_makefile.sh and make >= 4.3 [\#6606](https://github.com/kokkos/kokkos/pull/6606) +* Cuda: Fix configuring with CMake >= 3.28.4 - temporary fallback to internal CudaToolkit.cmake [\#6898](https://github.com/kokkos/kokkos/pull/6898) + +### Incompatibilities (i.e. breaking changes) +* Remove all `DEPRECATED_CODE_3` option and all code that was guarded by it [\#6523](https://github.com/kokkos/kokkos/pull/6523) +* Drop guards to accommodate external code defining `KOKKOS_ASSERT` [\#6665](https://github.com/kokkos/kokkos/pull/6665) +* `Profiling::ProfilingSection(std::string)` constructor marked explicit and nodiscard [\#6690](https://github.com/kokkos/kokkos/pull/6690) +* Add bound check preconditions for `RangePolicy` and `MDRangePolicy` [\#6617](https://github.com/kokkos/kokkos/pull/6617) [\#6726](https://github.com/kokkos/kokkos/pull/6726) +* Add checks for unsafe implicit conversions in RangePolicy [\#6754](https://github.com/kokkos/kokkos/pull/6754) +* Remove Kokkos::[b]half_t volatile overloads [\#6579](https://github.com/kokkos/kokkos/pull/6579) +* Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF [\#6593](https://github.com/kokkos/kokkos/pull/6593) +* Check matching static extents in View constructor [\#5190 ](https://github.com/kokkos/kokkos/pull/5190) +* Tools(profiling): fix typo Kokkos_Tools_Optim[i]zationGoal [\#6642](https://github.com/kokkos/kokkos/pull/6642) +* Remove variadic range policy constructor (disallow passing multiple trailing chunk size arguments) [\#6845](https://github.com/kokkos/kokkos/pull/6845) +* Improve message on view out of bounds access and always abort [\#6861](https://github.com/kokkos/kokkos/pull/6861) +* Drop `KOKKOS_ENABLE_INTEL_MM_ALLOC` macro [\#6797](https://github.com/kokkos/kokkos/pull/6797) +* Remove `Kokkos::Experimental::LogicalMemorySpace` (without going through deprecation) [\#6557](https://github.com/kokkos/kokkos/pull/6557) +* Remove `Experimental::HBWSpace` and support for linking against memkind [\#6791](https://github.com/kokkos/kokkos/pull/6791) +* Drop librt TPL and associated `KOKKOS_ENABLE_LIBRT` macro [\#6798](https://github.com/kokkos/kokkos/pull/6798) +* Drop support for old CPU architectures (`ARCH_BGQ`, `ARCH_POWER7`, `ARCH_WSM` and associated `ARCH_SSE4` macro) [\#6806](https://github.com/kokkos/kokkos/pull/6806) +* Drop support for deprecated command-line arguments and environment variables [\#6744](https://github.com/kokkos/kokkos/pull/6744) + +### Deprecations +* Provide kokkos_swap as part of Core and deprecate Experimental::swap in Algorithms [\#6697](https://github.com/kokkos/kokkos/pull/6697) +* Deprecate {Cuda,HIP}::detect_device_count() and Cuda::[detect_]device_arch() [\#6710](https://github.com/kokkos/kokkos/pull/6710) +* Deprecate `ExecutionSpace::in_parallel()` [\#6582](https://github.com/kokkos/kokkos/pull/6582) + +### Bug Fixes +* Fix team-level MDRange reductions: [\#6511](https://github.com/kokkos/kokkos/pull/6511) +* Fix CUDA and SYCL small value type (16-bit) 
team reductions [\#5334](https://github.com/kokkos/kokkos/pull/5334) +* Enable `{transform_}exclusive_scan` in place [\#6667](https://github.com/kokkos/kokkos/pull/6667) +* `fill_random` overload that do not take an execution space instance argument should fence [\#6658](https://github.com/kokkos/kokkos/pull/6658) +* HIP,Cuda,OpenMPTarget: Fixup use provided execution space when copying host inaccessible reduction result [\#6777](https://github.com/kokkos/kokkos/pull/6777) +* Fix typo in `cuda_func_set_attribute[s]_wrapper` preventing proper setting of desired occupancy [\#6786](https://github.com/kokkos/kokkos/pull/6786) +* Avoid undefined behavior due to conversion between signed and unsigned integers in shift_{right, left}_team_impl [\#6821](https://github.com/kokkos/kokkos/pull/6821) +* Fix a bug in Makefile.kokkos when using AMD GPU architectures as `AMD_GFXYYY` [\#6892](https://github.com/kokkos/kokkos/pull/6892) + ## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) From 4b90930999610dbc60bfdaddfe7ff1385a4fb997 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Fri, 5 Apr 2024 05:34:08 +0200 Subject: [PATCH 27/95] Refactor: Uniformize `create_mirror*` parameter name for views (#6917) * Uniformize view name for create_mirror * Uniformize view name for create_mirror_view * Fix formatting --- core/src/Kokkos_CopyViews.hpp | 37 ++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 08f6ba8d696..90a438a9df8 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3506,24 +3506,24 @@ auto create_mirror(const Kokkos::View& src, template std::enable_if_t::specialize>::value, typename Kokkos::View::HostMirror> -create_mirror(Kokkos::View const& v) { - return Impl::create_mirror(v, Impl::ViewCtorProp<>{}); +create_mirror(Kokkos::View const& src) { + return Impl::create_mirror(src, Impl::ViewCtorProp<>{}); } template std::enable_if_t::specialize>::value, typename Kokkos::View::HostMirror> create_mirror(Kokkos::Impl::WithoutInitializing_t wi, - Kokkos::View const& v) { - return Impl::create_mirror(v, view_alloc(wi)); + Kokkos::View const& src) { + return Impl::create_mirror(src, view_alloc(wi)); } template ::value>> std::enable_if_t::specialize>::value, typename Impl::MirrorType::view_type> -create_mirror(Space const&, Kokkos::View const& v) { - return Impl::create_mirror(v, view_alloc(typename Space::memory_space{})); +create_mirror(Space const&, Kokkos::View const& src) { + return Impl::create_mirror(src, view_alloc(typename Space::memory_space{})); } template ::specialize>::value && Impl::ViewCtorProp::has_memory_space>> auto create_mirror(Impl::ViewCtorProp const& arg_prop, - Kokkos::View const& v) { - return Impl::create_mirror(v, arg_prop); + Kokkos::View const& src) { + return Impl::create_mirror(src, arg_prop); } template @@ -3541,8 +3541,8 @@ std::enable_if_t< !Impl::ViewCtorProp::has_memory_space, typename Kokkos::View::HostMirror> create_mirror(Impl::ViewCtorProp const& arg_prop, - Kokkos::View const& v) { - return Impl::create_mirror(v, arg_prop); + Kokkos::View const& src) { + return Impl::create_mirror(src, arg_prop); } template ::specialize>::value, typename Impl::MirrorType::view_type> create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, - Kokkos::View const& v) { - return Impl::create_mirror(v, view_alloc(typename Space::memory_space{}, 
wi)); + Kokkos::View const& src) { + return Impl::create_mirror(src, + view_alloc(typename Space::memory_space{}, wi)); } namespace Impl { @@ -3647,8 +3648,8 @@ create_mirror_view(const Kokkos::View& src) { template typename Kokkos::View::HostMirror create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, Kokkos::View const& v) { - return Impl::create_mirror_view(v, view_alloc(wi)); + Kokkos::Impl::WithoutInitializing_t wi, Kokkos::View const& src) { + return Impl::create_mirror_view(src, view_alloc(wi)); } // FIXME_C++17 Improve SFINAE here. @@ -3675,15 +3676,15 @@ template ::value>> typename Impl::MirrorViewType::view_type create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, Space const&, - Kokkos::View const& v) { + Kokkos::View const& src) { return Impl::create_mirror_view( - v, view_alloc(typename Space::memory_space{}, wi)); + src, view_alloc(typename Space::memory_space{}, wi)); } template auto create_mirror_view(const Impl::ViewCtorProp& arg_prop, - const Kokkos::View& v) { - return Impl::create_mirror_view(v, arg_prop); + const Kokkos::View& src) { + return Impl::create_mirror_view(src, arg_prop); } template From 98b1a38e5391c57b866c4b16bf8108fce56d827a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 5 Apr 2024 16:06:48 -0400 Subject: [PATCH 28/95] SYCL: Improve team_reduce implementation (#6562) * SYCL: Improve team_reduce implementation * Manually unroll loop to propagate shuffle destinations at compile-time * Update step_width * Fix sign comparison warnings * Comment and choosing step_width=16 * KOKKOS_ASSERT that the subgroup range doesn't exceed 32 * Remove unattainable cases * Add a barrier guarding the reduction array upon exiting team_reduce * Update comment to better reflect intent of barrier --------- Co-authored-by: Christian Trott --- core/src/SYCL/Kokkos_SYCL_Team.hpp | 77 ++++++++++++++---------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index dbba3827581..2b4c2be5227 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -133,72 +133,69 @@ class SYCLTeamMember { const unsigned int team_rank_ = team_rank(); // First combine the values in the same subgroup +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int shift) { + if (vector_range * shift < sub_group_range) { + const value_type tmp = sg.shuffle_down(value, vector_range * shift); + if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(sub_group_range <= 32); +#else for (unsigned int shift = 1; vector_range * shift < sub_group_range; shift <<= 1) { const value_type tmp = sg.shuffle_down(value, vector_range * shift); if (team_rank_ + shift < team_size_) reducer.join(value, tmp); } +#endif value = sg.shuffle(value, 0); - const auto n_subgroups = sg.get_group_range()[0]; + const int n_subgroups = sg.get_group_range()[0]; if (n_subgroups == 1) { reducer.reference() = value; return; } - // We need to chunk up the whole reduction because we might not have - // allocated enough memory. - const unsigned int maximum_work_range = - std::min(m_team_reduce_size / sizeof(value_type), n_subgroups); + // It was found experimentally that 16 is a good value for Intel PVC. 
+ // Since there is a maximum number of 1024 threads with subgroup size 16, + // we have a maximum of 64 subgroups per workgroup which means 64/16=4 + // rounds for loading values into the reduction_array, and 16 redundant + // reduction steps executed by every thread. + constexpr int step_width = 16; + auto tmp_alloc = sycl::ext::oneapi::group_local_memory_for_overwrite< + value_type[step_width]>(m_item.get_group()); + auto& reduction_array = *tmp_alloc; const auto id_in_sg = sg.get_local_id()[0]; - auto reduction_array = - static_cast>(m_team_reduce); - // Load values into the first maximum_work_range values of the reduction + // Load values into the first step_width values of the reduction // array in chunks. This means that only sub groups with an id in the // corresponding chunk load values. - const auto group_id = sg.get_group_id()[0]; - if (id_in_sg == 0 && group_id < maximum_work_range) + const int group_id = sg.get_group_id()[0]; + if (id_in_sg == 0 && group_id < step_width) reduction_array[group_id] = value; sycl::group_barrier(m_item.get_group()); - for (unsigned int start = maximum_work_range; start < n_subgroups; - start += maximum_work_range) { + for (int start = step_width; start < n_subgroups; start += step_width) { if (id_in_sg == 0 && group_id >= start && - group_id < - std::min(start + maximum_work_range, n_subgroups)) + group_id < std::min(start + step_width, n_subgroups)) reducer.join(reduction_array[group_id - start], value); sycl::group_barrier(m_item.get_group()); } - // Let the first subgroup do the final reduction - if (group_id == 0) { - const auto local_range = sg.get_local_range()[0]; - auto result = - reduction_array[id_in_sg < maximum_work_range ? id_in_sg : 0]; - // In case the maximum_work_range is larger than the range of the first - // subgroup, we first combine the items with a higher index. - for (unsigned int offset = local_range; offset < maximum_work_range; - offset += local_range) - if (id_in_sg + offset < maximum_work_range) - reducer.join(result, reduction_array[id_in_sg + offset]); - sycl::group_barrier(sg); - - // Now do the actual subgroup reduction. - const auto min_range = - std::min(maximum_work_range, local_range); - for (unsigned int stride = 1; stride < min_range; stride <<= 1) { - const auto tmp = sg.shuffle_down(result, stride); - if (id_in_sg + stride < min_range) reducer.join(result, tmp); - } - if (id_in_sg == 0) reduction_array[0] = result; - } - sycl::group_barrier(m_item.get_group()); + // Do the final reduction for all threads redundantly + value = reduction_array[0]; + for (int i = 1; i < std::min(step_width, n_subgroups); ++i) + reducer.join(value, reduction_array[i]); - reducer.reference() = reduction_array[0]; - // Make sure that the reduction array hasn't been modified in the meantime. - m_item.barrier(sycl::access::fence_space::local_space); + reducer.reference() = value; + // Make sure that every thread is done using the reduction array. + sycl::group_barrier(m_item.get_group()); } //-------------------------------------------------------------------------- From 55c5757502bdc5d5f6b9658a91114dcf56df6a4b Mon Sep 17 00:00:00 2001 From: Thomas Conrad Clevenger Date: Mon, 8 Apr 2024 11:04:49 -0600 Subject: [PATCH 29/95] Use recommended/max team size functions in Cuda ParallelFor and Reduce constructors (#6891) * Use team_size_recommended in cuda ParallelFor constructor Previous computation was intended to match team_size_recommended, but missing extra scratch space allocation. 
* Same change for cuda ParallelReduce() * Remove unused attr variable * Use team_size_max() in pfor constructor instead of recomputing --- core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 38 +++++---------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 9f7be45c839..71e77518210 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -539,17 +539,9 @@ class ParallelFor, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes( - internal_space_instance->m_cudaDev); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Impl::cuda_get_opt_block_size( - internal_space_instance, attr, m_functor, m_vector_size, - m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -585,13 +577,7 @@ class ParallelFor, "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } - if (int(m_team_size) > - int(Kokkos::Impl::cuda_get_max_block_size( - internal_space_instance, attr, arg_functor, - arg_policy.impl_vector_length(), - arg_policy.team_scratch_size(0), - arg_policy.thread_scratch_size(0)) / - arg_policy.impl_vector_length())) { + if (m_team_size > arg_policy.team_size_max(arg_functor, ParallelForTag())) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); } @@ -909,17 +895,11 @@ class ParallelReduce:: - get_cuda_func_attributes(internal_space_instance->m_cudaDev); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Impl::cuda_get_opt_block_size( - internal_space_instance, attr, - m_functor_reducer.get_functor(), m_vector_size, - m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? 
m_team_size + : arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), + arg_functor_reducer.get_reducer(), + ParallelReduceTag()); m_team_begin = UseShflReduction From 8cf841076625376f34c516e557c1c66bcda97122 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 8 Apr 2024 14:16:12 -0400 Subject: [PATCH 30/95] SYCL: Fix range in subgroup scan for workgroup_scan --- core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 3306efa957e..58cfea6a97a 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -35,10 +35,11 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, auto sg = item.get_sub_group(); const int sg_group_id = sg.get_group_id()[0]; const int id_in_sg = sg.get_local_id()[0]; + const int local_range = std::min(sg.get_local_range()[0], global_range); #if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) auto shuffle_combine = [&](int stride) { - if (stride < global_range) { + if (stride < local_range) { auto tmp = sg.shuffle_up(local_value, stride); if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); } @@ -48,9 +49,9 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, shuffle_combine(4); shuffle_combine(8); shuffle_combine(16); - KOKKOS_ASSERT(global_range <= 32); + KOKKOS_ASSERT(local_range <= 32); #else - for (int stride = 1; stride < global_range; stride <<= 1) { + for (int stride = 1; stride < local_range; stride <<= 1) { auto tmp = sg.shuffle_up(local_value, stride); if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); } @@ -60,7 +61,6 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, const int n_active_subgroups = (global_range + max_subgroup_size - 1) / max_subgroup_size; - const int local_range = sg.get_local_range()[0]; if (id_in_sg == local_range - 1 && sg_group_id < n_active_subgroups) local_mem[sg_group_id] = local_value; local_value = sg.shuffle_up(local_value, 1); From 3a27cdbc276fd74f1e1860c939b6862936eb3bcb Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 10 Apr 2024 09:48:48 -0400 Subject: [PATCH 31/95] Add ROCm 6.0 in the nightly CI --- .jenkins_nightly | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/.jenkins_nightly b/.jenkins_nightly index 5d585817891..b213ba3061a 100644 --- a/.jenkins_nightly +++ b/.jenkins_nightly @@ -102,6 +102,39 @@ pipeline { ''' } } + stage('HIP-ROCM-6.0') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:6.0.2-complete' + label 'rocm-docker && AMD_Radeon_Instinct_MI210' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ + -DCMAKE_CXX_STANDARD=20 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + 
-DKokkos_ENABLE_HIP=ON \ + .. && \ + make -j8 && ctest --verbose''' + } + post { + always { + sh 'ccache --show-stats' + } + } } } } From 164519d7d94cf9d260dd7834cef9fae979f93358 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 10 Apr 2024 14:20:00 -0600 Subject: [PATCH 32/95] MI300 support unified memory support (#6877) * Make Host backends be included before device backends There was an inclusion order issue when trying to enable unified memory, which was due to incomplete types. Effectively the Host backends must be defined before the device memory spaces for them to be marked accessible. We may have the same issue if we try to mark HostSpace accessible by device memory spaces - but I am not sure. * Add unified memory arch support for AMD This is intended for MI300A, but for now you have to set -DKokkos_ENABLE_IMPL_HIP_UNIFIED_MEMORY=ON This marks HIPSpace as host accessible, and thus will make create_mirror_view a no-op. * Fix two tests for AMD unified memory archs * Address review comments --- Makefile.kokkos | 32 ++++++++-------- .../src/sorting/impl/Kokkos_SortImpl.hpp | 5 +++ cmake/KokkosCore_config.h.in | 1 + cmake/kokkos_enable_devices.cmake | 37 ++++++++++--------- cmake/kokkos_enable_options.cmake | 1 + core/src/HIP/Kokkos_HIP.cpp | 4 ++ .../HIP/Kokkos_HIP_SharedAllocationRecord.cpp | 4 ++ .../HIP/Kokkos_HIP_SharedAllocationRecord.hpp | 4 ++ core/src/HIP/Kokkos_HIP_Space.hpp | 19 +++++++++- core/unit_test/hip/TestHIP_Spaces.cpp | 16 ++++++++ 10 files changed, 89 insertions(+), 34 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 73decf8b47a..f6028cb1418 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1253,6 +1253,22 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") + ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") @@ -1272,26 +1288,10 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") tmp := $(call 
kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) diff --git a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index 4c174b5fda9..08946228919 100644 --- a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -399,9 +399,14 @@ sort_device_view_with_comparator( using ViewType = Kokkos::View; using MemSpace = typename ViewType::memory_space; +// Note with HIP unified memory this code path is still the right thing to do +// if we end up here when RocThrust is not enabled. +// The create_mirror_view_and_copy will do the right thing (no copy). +#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY static_assert(!SpaceAccessibility::accessible, "Impl::sort_device_view_with_comparator: should not be called " "on a view that is already accessible on the host"); +#endif copy_to_host_run_stdsort_copy_back(exec, view, comparator); } diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 2df0f6c5205..3ab39cd6abf 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -39,6 +39,7 @@ #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS +#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK diff --git a/cmake/kokkos_enable_devices.cmake b/cmake/kokkos_enable_devices.cmake index 9a977520a3a..c7d189285c5 100644 --- a/cmake/kokkos_enable_devices.cmake +++ b/cmake/kokkos_enable_devices.cmake @@ -40,6 +40,26 @@ ELSE() ENDIF() KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") + +# We want this to default to OFF for cache reasons, but if no +# host space is given, then activate serial +IF (KOKKOS_HAS_TRILINOS) + #However, Trilinos always wants Serial ON + SET(SERIAL_DEFAULT ON) +ELSEIF (KOKKOS_HAS_HOST) + SET(SERIAL_DEFAULT OFF) +ELSE() + SET(SERIAL_DEFAULT ON) + IF (NOT DEFINED Kokkos_ENABLE_SERIAL) + MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. 
To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") + ENDIF() +ENDIF() +KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") + +KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") + +# Device backends have to come after host backends for header include order reasons +# Without this we can't make e.g. CudaSpace accessible by HostSpace KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend") IF (KOKKOS_ENABLE_OPENACC) COMPILER_SPECIFIC_FLAGS( @@ -90,23 +110,6 @@ IF (KOKKOS_ENABLE_CUDA) LIST(APPEND DEVICE_SETUP_LIST Cuda) ENDIF() -# We want this to default to OFF for cache reasons, but if no -# host space is given, then activate serial -IF (KOKKOS_HAS_TRILINOS) - #However, Trilinos always wants Serial ON - SET(SERIAL_DEFAULT ON) -ELSEIF (KOKKOS_HAS_HOST) - SET(SERIAL_DEFAULT OFF) -ELSE() - SET(SERIAL_DEFAULT ON) - IF (NOT DEFINED Kokkos_ENABLE_SERIAL) - MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") - ENDIF() -ENDIF() -KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") - -KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") - KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") ## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index a437f6132aa..32788e7aa0f 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -70,6 +70,7 @@ KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tu KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") +KOKKOS_ENABLE_OPTION(IMPL_HIP_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for HIP") # This option will go away eventually, but allows fallback to old implementation when needed. 
KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") diff --git a/core/src/HIP/Kokkos_HIP.cpp b/core/src/HIP/Kokkos_HIP.cpp index 309e07fb3fb..aced2083ffb 100644 --- a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -146,6 +146,10 @@ void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const { #else os << "no\n"; #endif +#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY + os << " KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY: "; + os << "yes\n"; +#endif os << "\nRuntime Configuration:\n"; diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index ab24004f5fc..83f829fddae 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -23,8 +23,12 @@ #include #include +#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::HIPSpace); +#else +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::HIPSpace); +#endif KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::HIPHostPinnedSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index fbae5188344..1ca7bd5cd0e 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -20,8 +20,12 @@ #include #include +#if defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPSpace); +#else KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( Kokkos::HIPSpace); +#endif KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPHostPinnedSpace); KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPManagedSpace); diff --git a/core/src/HIP/Kokkos_HIP_Space.hpp b/core/src/HIP/Kokkos_HIP_Space.hpp index 7f2004e5cbc..e1b4768b877 100644 --- a/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/core/src/HIP/Kokkos_HIP_Space.hpp @@ -65,6 +65,18 @@ class HIPSpace { ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ +#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } +#else // FIXME_HIP Use execution space instance void* allocate(const HIP&, const size_t arg_alloc_size) const { return allocate(arg_alloc_size); @@ -74,6 +86,7 @@ class HIPSpace { const size_t arg_logical_size = 0) const { return allocate(arg_label, arg_alloc_size, arg_logical_size); } +#endif void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -267,7 +280,11 @@ static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); template <> struct MemorySpaceAccess { enum : bool { assignable = false }; - enum : bool { accessible = false }; +#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) + enum : bool{accessible = false}; +#else + enum : bool { accessible = true }; +#endif enum : bool { deepcopy = true }; }; diff --git a/core/unit_test/hip/TestHIP_Spaces.cpp b/core/unit_test/hip/TestHIP_Spaces.cpp index 
8f7499c244b..673c0f0fff3 100644 --- a/core/unit_test/hip/TestHIP_Spaces.cpp +++ b/core/unit_test/hip/TestHIP_Spaces.cpp @@ -38,8 +38,13 @@ TEST(hip, space_access) { static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable); +#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible); +#else + static_assert(Kokkos::Impl::MemorySpaceAccess::accessible); +#endif static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible); +#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) static_assert(!Kokkos::SpaceAccessibility::accessible); +#else + static_assert(Kokkos::SpaceAccessibility::accessible); +#endif static_assert( Kokkos::SpaceAccessibility::accessible); +#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) static_assert(std::is_same::Space, Kokkos::HostSpace>::value); +#else + static_assert(std::is_same::Space, + Kokkos::Device>::value); +#endif static_assert( std::is_same::Space, From 6ea7be76ef65b8477bce566e715afb0453e61a6c Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Wed, 10 Apr 2024 23:19:08 +0200 Subject: [PATCH 33/95] cuda: reduction with `RangePolicy`: fix grid dimensions to work for large values and avoid overflow (#6578) Fixes issue #6578 and adds a test based on the bug report --- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 5 ++-- core/unit_test/TestReduce.hpp | 26 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 0f052be3c30..334834938a1 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -312,8 +312,9 @@ class ParallelReduce, // REQUIRED ( 1 , N , 1 ) dim3 block(1, block_size, 1); // Required grid.x <= block.y - dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1, - 1); + dim3 grid(std::min(index_type(block.y), + index_type((nwork + block.y - 1) / block.y)), + 1, 1); // TODO @graph We need to effectively insert this in to the graph const int shmem = diff --git a/core/unit_test/TestReduce.hpp b/core/unit_test/TestReduce.hpp index e1aa851f102..61b2bfb1505 100644 --- a/core/unit_test/TestReduce.hpp +++ b/core/unit_test/TestReduce.hpp @@ -625,4 +625,30 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) { } #endif #endif + +#if defined(NDEBUG) +// the following test was made for: +// https://github.com/kokkos/kokkos/issues/6517 + +struct FunctorReductionWithLargeIterationCount { + KOKKOS_FUNCTION void operator()(const int64_t /*i*/, double& update) const { + update += 1.0; + } +}; + +TEST(TEST_CATEGORY, reduction_with_large_iteration_count) { + if constexpr (std::is_same_v) { + GTEST_SKIP() << "Disabling for host backends"; + } + + const int64_t N = pow(2LL, 39LL) - pow(2LL, 8LL) + 1; + Kokkos::RangePolicy> p(0, N); + double nu = 0; + EXPECT_NO_THROW(Kokkos::parallel_reduce( + "sample reduction", p, FunctorReductionWithLargeIterationCount(), nu)); + ASSERT_DOUBLE_EQ(nu, double(N)); +} +#endif + } // namespace Test From 0099c10bebf3448d309a0a623a303307d94f6e02 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 11 Apr 2024 10:00:33 -0400 Subject: [PATCH 34/95] Fix nightly CI --- .jenkins_nightly | 1 + 1 file changed, 1 insertion(+) diff --git a/.jenkins_nightly b/.jenkins_nightly index b213ba3061a..a8facd365c2 100644 --- a/.jenkins_nightly +++ b/.jenkins_nightly @@ -134,6 +134,7 @@ pipeline { always { sh 'ccache --show-stats' } + } } } } From a2af4e0d4137fab559a6ea1698af0050a0cbdf03 Mon Sep 17 00:00:00 2001 
From: Damien L-G Date: Thu, 11 Apr 2024 17:41:27 -0400 Subject: [PATCH 35/95] Deprecate trailing Proxy template argument in Kokkos::Array --- core/src/Kokkos_Array.hpp | 17 ++++++++++++++--- core/unit_test/TestArrayOps.hpp | 2 ++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index ba1626bb72e..461b98f6a72 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -80,7 +80,11 @@ struct ArrayBoundsCheck { /**\brief Derived from the C++17 'std::array'. * Dropping the iterator interface. */ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template +#else +template +#endif struct Array { public: /** @@ -131,8 +135,13 @@ struct Array { } }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template struct Array { +#else +template +struct Array { +#endif public: using reference = T&; using const_reference = std::add_const_t&; @@ -178,14 +187,15 @@ struct Array { // Array & operator = ( Array && ) = default ; }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template <> -struct Array { +struct KOKKOS_DEPRECATED Array { struct contiguous {}; struct strided {}; }; template -struct Array::contiguous> { +struct KOKKOS_DEPRECATED Array::contiguous> { private: T* m_elem; size_t m_size; @@ -253,7 +263,7 @@ struct Array::contiguous> { }; template -struct Array::strided> { +struct KOKKOS_DEPRECATED Array::strided> { private: T* m_elem; size_t m_size; @@ -320,6 +330,7 @@ struct Array::strided> { size_type arg_stride) : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +#endif template Array(T, Us...)->Array; diff --git a/core/unit_test/TestArrayOps.hpp b/core/unit_test/TestArrayOps.hpp index 06528572714..387589fbe88 100644 --- a/core/unit_test/TestArrayOps.hpp +++ b/core/unit_test/TestArrayOps.hpp @@ -111,6 +111,7 @@ TEST(TEST_CATEGORY, array_zero_data_nullptr) { ASSERT_EQ(ce.data(), nullptr); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 TEST(TEST_CATEGORY, array_contiguous_capacity) { using A = Kokkos::Array::contiguous>; @@ -389,5 +390,6 @@ TEST(TEST_CATEGORY, array_strided_assignment) { ASSERT_EQ(e.max_size(), std::size(ee) / eStride); ASSERT_EQ(e[0], ee[0]); } +#endif } // namespace From 92e02b50c428c8f62af56b9446f00e9cede758de Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 11 Apr 2024 18:03:11 -0700 Subject: [PATCH 36/95] CUDA: Update nvcc_wrapper --- bin/nvcc_wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index 9b935835d5f..9460ec60a84 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ -233,7 +233,7 @@ do cuda_args="$cuda_args $1" ;; #Handle more known nvcc args - --extended-lambda|--expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler) + --extended-lambda|--expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler|--disable-warnings) cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument From d88e2a5b0e7948c5315a605f5bc1ead00fc39fc6 Mon Sep 17 00:00:00 2001 From: Geoffroy Lesur Date: Fri, 12 Apr 2024 20:56:29 +0200 Subject: [PATCH 37/95] bring back --fmad option to nvcc_wrapper (#6931) * bring back --fmad option to nvcc_wrapper * Preserve support for flag with single leading dash --------- Co-authored-by: Damien L-G --- bin/nvcc_wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/nvcc_wrapper 
b/bin/nvcc_wrapper index 9460ec60a84..d58645f98ad 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ -229,7 +229,7 @@ do fi ;; #Handle known nvcc args - --dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|--fmad=*|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args From f2d37801dc88ac476cbc47054e726c6c433de2cc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 15 Apr 2024 17:26:15 -0400 Subject: [PATCH 38/95] Remove unnecessary header include Reported in https://github.com/kokkos/kokkos/pull/6934#pullrequestreview-2001860702 Co-authored-by: Nevin Liber --- core/src/Kokkos_Array.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 461b98f6a72..3d71d09fde1 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -29,7 +29,6 @@ #include #include #include -#include #include namespace Kokkos { From a8115e5df7d2fd68c215758abaf73bbe8598e0d2 Mon Sep 17 00:00:00 2001 From: yasahi-hpc <57478230+yasahi-hpc@users.noreply.github.com> Date: Tue, 16 Apr 2024 03:42:26 +0200 Subject: [PATCH 39/95] Adding converting constructor in Kokkos::RandomAccessIterator (#6929) * Adding converting constructor in Kokkos::RandomAccessIterator * fix constructible tests for Kokkos::RandomAccessIterator * fix converting constructor in Kokkos::RandomAccessIterator * Add comments to explain friend class of RandomAccessIterator is needed for converting constructor * Introduce KOKKOS_IMPL_CONDITIONAL_EXPLICIT macro from #6830 * Adding a conditional explicit in converting constructor of RandomAccessIterator * Rename ViewType to OtherViewType in converting constructor for readability * Replace tests with static_assert if they rely on compile time behaviour only * fix a condition for conditional explicit * Revert "Introduce KOKKOS_IMPL_CONDITIONAL_EXPLICIT macro from #6830" This reverts commit ee42c6d62e9b8373bd3494c79c97a8845593b325. * On second thought `KOKKOS_IMPL_CONDITIONAL_EXPLICIT` is not such a good idea because it let user write code that would compile with C++17 but not with later standards. 
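
A minimal illustration of what the converting constructor described above allows, based on the constructibility checks added in TestRandomAccessIterator.cpp. This is an editorial sketch rather than part of the patch: the function name iterator_conversion_sketch, the view label, and its size are placeholders, and it assumes an initialized Kokkos runtime with Kokkos_StdAlgorithms.hpp available.

#include <Kokkos_Core.hpp>
#include <Kokkos_StdAlgorithms.hpp>

// Sketch: a mutable iterator converts to its const counterpart, much like
// std::vector<T>::iterator converts to std::vector<T>::const_iterator.
void iterator_conversion_sketch() {
  namespace KE = Kokkos::Experimental;
  Kokkos::View<double*> v("v", 10);

  auto first  = KE::begin(v);   // iterator over View<double*>
  auto cfirst = KE::cbegin(v);  // iterator over View<const double*>

  // Allowed, and implicit, because View<double*> is convertible to
  // View<const double*>; the conditional explicit only applies when the
  // underlying View types are constructible but not convertible.
  decltype(cfirst) converted(first);
  (void)converted;

  // The reverse direction (const -> non-const) is not constructible,
  // matching the static_asserts in the unit test below.
}
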
--------- Co-authored-by: Yuuichi Asahi --- .../impl/Kokkos_RandomAccessIterator.hpp | 28 ++++++++++++++ .../unit_tests/TestRandomAccessIterator.cpp | 38 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 5c9854b87d7..7bcc16a9b55 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -59,6 +59,30 @@ class RandomAccessIterator< ::Kokkos::View > { ptrdiff_t current_index) : m_view(view), m_current_index(current_index) {} +#ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond + template + requires(std::is_constructible_v) KOKKOS_FUNCTION + explicit(!std::is_convertible_v) + RandomAccessIterator(const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#else + template < + class OtherViewType, + std::enable_if_t && + !std::is_convertible_v, + int> = 0> + KOKKOS_FUNCTION explicit RandomAccessIterator( + const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} + + template , + int> = 0> + KOKKOS_FUNCTION RandomAccessIterator( + const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#endif + KOKKOS_FUNCTION iterator_type& operator++() { ++m_current_index; @@ -155,6 +179,10 @@ class RandomAccessIterator< ::Kokkos::View > { private: view_type m_view; ptrdiff_t m_current_index = 0; + + // Needed for the converting constructor accepting another iterator + template + friend class RandomAccessIterator; }; } // namespace Impl diff --git a/algorithms/unit_tests/TestRandomAccessIterator.cpp b/algorithms/unit_tests/TestRandomAccessIterator.cpp index 282d85548c5..7d484136b6d 100644 --- a/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ -46,6 +46,44 @@ TEST_F(random_access_iterator_test, constructor) { EXPECT_TRUE(true); } +TEST_F(random_access_iterator_test, constructiblity) { + auto first_d = KE::begin(m_dynamic_view); + auto cfirst_d = KE::cbegin(m_dynamic_view); + + static_assert(std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_d) tmp_cfirst_d(first_d); + + auto first_s = KE::begin(m_static_view); + auto cfirst_s = KE::cbegin(m_static_view); + + static_assert(std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_s) tmp_cfirst_s(first_s); + + auto first_st = KE::begin(m_strided_view); + auto cfirst_st = KE::cbegin(m_strided_view); + + static_assert( + std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_st) tmp_cfirst_st(first_st); + + // [FIXME] Better to have tests for the explicit specifier with an expression. + // As soon as View converting constructors are re-implemented with a + // conditional explicit, we may add those tests. 
+ static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + EXPECT_TRUE(true); +} + template void test_random_access_it_verify(IteratorType it, ValueType gold_value) { using view_t = Kokkos::View; From 0e3a67343f4169c33ac33f8d35f2e634be97cbed Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Tue, 16 Apr 2024 15:39:37 +0200 Subject: [PATCH 40/95] Use if constexpr for offset view create_mirror* --- containers/src/Kokkos_OffsetView.hpp | 113 ++++++++++----------------- 1 file changed, 41 insertions(+), 72 deletions(-) diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 91a7e4a9273..d465e72a97e 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1841,23 +1841,10 @@ struct MirrorOffsetType { } // namespace Impl namespace Impl { -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space, - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return typename Kokkos::Experimental::OffsetView::HostMirror( - Kokkos::create_mirror(arg_prop, src.view()), src.begins()); -} -template ::has_memory_space>> -inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { +template +void check_view_ctor_args_create_mirror_offset() { using alloc_prop_input = Impl::ViewCtorProp; - using Space = typename Impl::ViewCtorProp::memory_space; static_assert( !alloc_prop_input::has_label, @@ -1871,15 +1858,29 @@ inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, !alloc_prop_input::allow_padding, "The view constructor arguments passed to Kokkos::create_mirror must " "not explicitly allow padding!"); +} + +template +auto create_mirror(const Kokkos::Experimental::OffsetView& src, + const Impl::ViewCtorProp& arg_prop) { + check_view_ctor_args_create_mirror_offset(); - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using Space = typename Impl::ViewCtorProp::memory_space; - return typename Kokkos::Impl::MirrorOffsetType::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + auto prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string(src.label()).append("_mirror")); + + return typename Kokkos::Impl::MirrorOffsetType::view_type( + prop_copy, src.layout(), + {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), + src.begin(5), src.begin(6), src.begin(7)}); + } else { + return typename Kokkos::Experimental::OffsetView::HostMirror( + Kokkos::create_mirror(arg_prop, src.view()), src.begins()); + } } + } // namespace Impl // Create a mirror in host space @@ -1921,67 +1922,35 @@ inline auto create_mirror( } namespace Impl { + template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< +auto create_mirror_view(const Kokkos::Experimental::OffsetView& src, + const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr ( + std::is_same< typename Kokkos::Experimental::OffsetView::memory_space, typename Kokkos::Experimental::OffsetView< T, 
P...>::HostMirror::memory_space>::value && std::is_same< typename Kokkos::Experimental::OffsetView::data_type, typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp&) { - return src; -} - -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< + T, P...>::HostMirror::data_type>::value) { + return src; + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorOffsetViewType< typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp&) { - return src; + T, P...>::is_same_memspace) { + return src; + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } } -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl // Create a mirror view in host space From 856bcc2c4a745553433288763a0fe838d311ce5c Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Tue, 16 Apr 2024 17:32:45 +0200 Subject: [PATCH 41/95] Use if constexpr for dynamic view create_mirror* --- containers/src/Kokkos_DynamicView.hpp | 238 ++++++++++---------------- 1 file changed, 91 insertions(+), 147 deletions(-) diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index 12885edbae9..de98e6e75be 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -590,12 +590,9 @@ struct MirrorDynamicViewType { } // namespace Impl namespace Impl { -template -inline auto create_mirror( - const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { + +template +void check_view_ctor_args_create_mirror_dynamic() { using alloc_prop_input = Impl::ViewCtorProp; static_assert( @@ -610,51 +607,39 @@ inline auto create_mirror( !alloc_prop_input::allow_padding, "The view constructor arguments passed to Kokkos::create_mirror must " "not explicitly allow padding!"); - - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); - - auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( - prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - - ret.resize_serial(src.extent(0)); - - return ret; } template -inline auto create_mirror( - const Kokkos::Experimental::DynamicView& src, - const 
Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { +auto create_mirror(const Kokkos::Experimental::DynamicView& src, + const Impl::ViewCtorProp& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp; + check_view_ctor_args_create_mirror_dynamic(); - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using MemorySpace = typename alloc_prop_input::memory_space; + auto prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string(src.label()).append("_mirror")); - using MemorySpace = typename alloc_prop_input::memory_space; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + auto ret = typename Kokkos::Impl::MirrorDynamicViewType< + MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), + src.chunk_max() * src.chunk_size()); - auto ret = typename Kokkos::Impl::MirrorDynamicViewType< - MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), - src.chunk_max() * src.chunk_size()); + ret.resize_serial(src.extent(0)); - ret.resize_serial(src.extent(0)); + return ret; + } else { + auto prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string(src.label()).append("_mirror")); + + auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( + prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - return ret; + ret.resize_serial(src.extent(0)); + + return ret; + } } + } // namespace Impl // Create a mirror in host space @@ -697,66 +682,34 @@ inline auto create_mirror( namespace Impl { template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - (std::is_same< +auto create_mirror_view(const Kokkos::Experimental::DynamicView& src, + const Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr ( + std::is_same< typename Kokkos::Experimental::DynamicView::memory_space, typename Kokkos::Experimental::DynamicView< T, P...>::HostMirror::memory_space>::value && - std::is_same< + std::is_same< typename Kokkos::Experimental::DynamicView::data_type, typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp&) { - return src; -} - -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename 
Impl::MirrorDynamicViewType< + T, P...>::HostMirror::data_type>::value + ) { + return src; + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorDynamicViewType< typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp&) { - return src; + T, P...>::is_same_memspace) { + return src; + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } } -template ::has_memory_space>> -std::enable_if_t::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl // Create a mirror view in host space @@ -985,16 +938,12 @@ struct ViewCopy, } // namespace Impl -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::Experimental::DynamicView& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { +namespace Impl { + +template +void check_view_ctor_args_create_mirror_view_and_copy_dynamic() { using alloc_prop_input = Impl::ViewCtorProp; + static_assert( alloc_prop_input::has_memory_space, "The view constructor arguments passed to " @@ -1007,58 +956,53 @@ auto create_mirror_view_and_copy( "The view constructor arguments passed to " "Kokkos::create_mirror_view_and_copy must " "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; } -template +} // namespace Impl + // +template ::specialize>::value>> auto create_mirror_view_and_copy( const Impl::ViewCtorProp& arg_prop, - const Kokkos::Experimental::DynamicView& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + const Kokkos::Experimental::DynamicView& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = - typename Impl::MirrorDynamicViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type( - arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - mirror.resize_serial(src.extent(0)); - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - 
deep_copy(mirror, src); - return mirror; + + Impl::check_view_ctor_args_create_mirror_view_and_copy_dynamic(); + + if constexpr (Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = + typename Impl::MirrorDynamicViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type( + arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); + mirror.resize_serial(src.extent(0)); + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } } -template +template ::value>> auto create_mirror_view_and_copy( const Space&, const Kokkos::Experimental::DynamicView& src, std::string const& name = "") { From f1f4741d95fe921fa41586183a1367d953478fb0 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Tue, 16 Apr 2024 17:33:15 +0200 Subject: [PATCH 42/95] Use if constexpr for dynamic rank view create_mirror* --- containers/src/Kokkos_DynRankView.hpp | 219 +++++++++----------------- 1 file changed, 78 insertions(+), 141 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 5fa59f1b7cd..b9d40de9064 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1933,15 +1933,9 @@ struct MirrorDRVType { } // namespace Impl namespace Impl { -template -inline typename DynRankView::HostMirror create_mirror( - const DynRankView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using src_type = DynRankView; - using dst_type = typename src_type::HostMirror; +template +void check_view_ctor_args_create_mirror_drv() { using alloc_prop_input = Impl::ViewCtorProp; static_assert( @@ -1956,42 +1950,30 @@ inline typename DynRankView::HostMirror create_mirror( !alloc_prop_input::allow_padding, "The view constructor arguments passed to Kokkos::create_mirror must " "not explicitly allow padding!"); - - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); - - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); } template -inline auto create_mirror( +auto create_mirror( const DynRankView& src, - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_memory_space>* = - nullptr) { - using dst_type = typename Impl::MirrorDRVType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - - using alloc_prop_input = Impl::ViewCtorProp; + const Impl::ViewCtorProp& arg_prop) { - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not 
explicitly allow padding!"); + check_view_ctor_args_create_mirror_drv(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); + if constexpr (Impl::ViewCtorProp::has_memory_space) { + using dst_type = typename Impl::MirrorDRVType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type; + + return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); + } else { + using src_type = DynRankView; + using dst_type = typename src_type::HostMirror; + + return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); + } } } // namespace Impl @@ -2058,71 +2040,36 @@ inline auto create_mirror( } namespace Impl { + template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && +auto create_mirror_view(const DynRankView& src, + const typename Impl::ViewCtorProp& arg_prop) { + if constexpr (!Impl::ViewCtorProp::has_memory_space) { + if constexpr ( std::is_same< typename DynRankView::memory_space, typename DynRankView::HostMirror::memory_space>::value && std::is_same< typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value, - typename DynRankView::HostMirror> -create_mirror_view(const DynRankView& src, - const typename Impl::ViewCtorProp&) { - return src; -} - -template -inline std::enable_if_t< - !Impl::ViewCtorProp::has_memory_space && - !(std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value), - typename DynRankView::HostMirror> -create_mirror_view( - const DynRankView& src, - const typename Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -template ::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename Impl::ViewCtorProp::memory_space>::value && + typename DynRankView::HostMirror::data_type>::value + ) { + return src; + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } else { + if constexpr ( Impl::MirrorDRViewType< typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type> -create_mirror_view(const Kokkos::DynRankView& src, - const typename Impl::ViewCtorProp&) { - return src; + P...>::is_same_memspace + ) { + return src; + } else { + return Kokkos::Impl::create_mirror(src, arg_prop); + } + } } -template ::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename Impl::ViewCtorProp::memory_space>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type> -create_mirror_view( - const Kokkos::DynRankView& src, - const typename Impl::ViewCtorProp& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl // Create a mirror view in host space @@ -2195,16 +2142,12 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, arg_prop); } -template -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp&, - const Kokkos::DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { 
+namespace Impl { + +template +void check_view_ctor_args_create_mirror_view_and_copy_drv() { using alloc_prop_input = Impl::ViewCtorProp; + static_assert( alloc_prop_input::has_memory_space, "The view constructor arguments passed to " @@ -2217,53 +2160,47 @@ auto create_mirror_view_and_copy( "The view constructor arguments passed to " "Kokkos::create_mirror_view_and_copy must " "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; } -template +} // namespace Impl + +template ::specialize>::value>> auto create_mirror_view_and_copy( const Impl::ViewCtorProp& arg_prop, - const Kokkos::DynRankView& src, - std::enable_if_t< - std::is_void::specialize>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + const Kokkos::DynRankView& src) { using alloc_prop_input = Impl::ViewCtorProp; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorDRViewType::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type{ - arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + + Impl::check_view_ctor_args_create_mirror_view_and_copy_drv(); + + if constexpr (Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorDRViewType::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{ + arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } } template From f94e8d34de523813f5e23e5622615566c80de8fc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 16 Apr 2024 14:46:49 -0400 Subject: [PATCH 43/95] Prefer standard C++ feature testing to guard the C++20 requires expression Temporary fix for our nightly builds so we can make decision on 
minimum CXX20 compiler requirements when we see fit. --- .../impl/Kokkos_RandomAccessIterator.hpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 7bcc16a9b55..ba0cdc91eea 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -59,7 +59,15 @@ class RandomAccessIterator< ::Kokkos::View > { ptrdiff_t current_index) : m_view(view), m_current_index(current_index) {} -#ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond +// FIXME The C++20 requires expression is not supported with Clang 9 and GCC 9 +// The following guards is unsufficient until we increase our minimum CXX20 +// compiler requirements. +// #ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond +// We replace the Kokkos guards with standard C++ feature testing in the +// meantime. +#if (defined(__cpp_concepts) && (__cpp_concepts >= 201907L)) && \ + (defined(__cpp_conditional_explicit) && \ + (__cpp_conditional_explicit >= 201806L)) template requires(std::is_constructible_v) KOKKOS_FUNCTION explicit(!std::is_convertible_v) From c9e21ce2ab8e03710494ed53a12c255df56fd3b1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 16 Apr 2024 12:21:57 -0400 Subject: [PATCH 44/95] Add `kokkos_swap(Array)` sepcialization Specializing the swap algorithm for Kokkos arrays was initially proposed in #6697 but we dropped it to focus on the Kokkos swap ADL ordeal. Somehow we overlooked a stray header include in the Kokkos::Array header file. This PR reintroduce a `Kokkos::kokkos_swap(Kokkos::Array)` specialization, following closely what the standard library does for `std::swap(std::array)`. --- core/src/Kokkos_Array.hpp | 15 +++++++++++++++ core/unit_test/TestArray.cpp | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 461b98f6a72..7fd81030ecd 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -133,6 +133,17 @@ struct Array { KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { return &m_internal_implementation_private_member_data[0]; } + + private: + template + friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< + Impl::is_swappable::value> + kokkos_swap(Array& a, + Array& b) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } + } }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -185,6 +196,10 @@ struct Array { // for default move constructor and move assignment operator. 
// Array( Array && ) = default ; // Array & operator = ( Array && ) = default ; + + private: + friend KOKKOS_INLINE_FUNCTION constexpr void kokkos_swap( + Array&, Array&) noexcept(Impl::is_nothrow_swappable_v) {} }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index e138a64d6db..e691d83ebe2 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -120,4 +120,37 @@ static_assert(test_array_aggregate_initialization()); } } +// User-defined type providing a sepcialization of kokkos_swap +struct MyInt { + int i; + + private: + friend constexpr void kokkos_swap(MyInt& lhs, MyInt& rhs) noexcept { + lhs.i = 255; + rhs.i = 127; + } +}; + +constexpr bool test_array_specialization_kokkos_swap() { + Kokkos::Array a{MyInt{1}, MyInt{2}}; + Kokkos::Array b{MyInt{11}, MyInt{22}}; + + // sanity check + if (a[0].i != 1 || a[1].i != 2 || b[0].i != 11 || b[1].i != 22) { + return false; + } + + using Kokkos::kokkos_swap; + kokkos_swap(a, b); + + // check that the user-definied kokkos_swap(MyInt) overload was called + if (a[0].i != 255 || a[1].i != 255 || b[0].i != 127 || b[1].i != 127) { + return false; + } + + return true; +} + +static_assert(test_array_specialization_kokkos_swap()); + } // namespace From 730d8d828f9d43b0cf3a1010b06c0008a7be128a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 17 Apr 2024 08:58:22 -0400 Subject: [PATCH 45/95] Deprecate specialization of Kokkos::pair for a single element This specialization is not documented, does not follow the standard library, it is not tested and has no known usage in Trilinos. `Kokkos::pair`, as we generally describe it, was intended as a drop-in replacement for `std::pair`. Hence, obscure departure from the standard implementation do not look like a good idea. This PR suggest to deprecate that `T2=void` specialization for degenerate pair that only hold one element. --- core/src/Kokkos_Pair.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 9be8d8d7aa1..d1bd11f7162 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -413,12 +413,13 @@ KOKKOS_FORCEINLINE_FUNCTION pair tie(T1& x, T2& y) { return (pair(x, y)); } +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 // // Specialization of Kokkos::pair for a \c void second argument. This // is not actually a "pair"; it only contains one element, the first. // template -struct pair { +struct KOKKOS_DEPRECATED pair { using first_type = T1; using second_type = void; @@ -483,6 +484,7 @@ KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } +#endif namespace Impl { template From d914fe316ba64c6755abfd8e68cd7d1b872e04f7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 17 Apr 2024 10:05:24 -0400 Subject: [PATCH 46/95] Fix deprecated warning from `Kokkos::Array` specialization (#6945) * Fix deprecated warning from Kokkos::Array specialization The warnings come from the template arguments in deprecated specialization `Kokkos::Array<>::{contiguous,strided}` which refer to `Kokkos::Array<>` that is marked as deprecated. Minimal reproducer [here](https://godbolt.org/z/s18Txa5P6). GCC9 eats it but GCC10 onwards raise a warning. I propose the easy way out, that is we drop the `[[deprecated]]` attribute on `Kokkos::Array<>`. Let me know if you have a better idea. 
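For reference, a minimal sketch of the offending pattern (illustrative names
only, not the exact reproducer behind the godbolt link above): a deprecated
specialization whose nested tag types are then named in the template-argument
list of other specializations.

```
// Stand-in for the Kokkos::Array<> pattern; every name here is hypothetical.
template <class T = void, int N = 0, class Proxy = void>
struct A {};

template <>
struct [[deprecated]] A<void> {
  struct contiguous {};  // nested tag of the deprecated specialization
};

// Naming A<>::contiguous in the argument list refers to the deprecated
// A<void>, so GCC 10+ emits -Wdeprecated-declarations at this declaration
// even though the specialization is itself marked deprecated; GCC 9 stays
// silent, matching the behavior described above.
template <class T>
struct [[deprecated]] A<T, -1, A<>::contiguous> {};
```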
Sample warning from ArborX nightlies for completeness: ``` In file included from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/KokkosExp_MDRangePolicy.hpp:29, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Tuners.hpp:28, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/impl/Kokkos_Tools_Generic.hpp:26, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Parallel.hpp:34, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_MemoryPool.hpp:26, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_TaskScheduler.hpp:34, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Serial/Kokkos_Serial.hpp:37, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp:21, from /var/jenkins/workspace/ArborX_nightly/build-kokkos/KokkosCore_Config_DeclareBackend.hpp:22, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Core.hpp:45, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/impl/Kokkos_Core.cpp:21: /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:197:66: warning: 'Array' is deprecated [-Wdeprecated-declarations] 197 | struct KOKKOS_DEPRECATED Array::contiguous> { | ^~~~~~~~~~ /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:191:26: note: declared here 191 | struct KOKKOS_DEPRECATED Array { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:265:66: warning: 'Array' is deprecated [-Wdeprecated-declarations] 265 | struct KOKKOS_DEPRECATED Array::strided> { | ^~~~~~~ /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:191:26: note: declared here 191 | struct KOKKOS_DEPRECATED Array { | | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``` * Revert "Fix deprecated warning from Kokkos::Array specialization" This reverts commit 38db1cab74df5fc547e779d6b9e3e65ebcb89a14. * Let Array<>::{contiguous,strided} be aliases to Impl:: tag classes Better approach to suppress the GCC deprecation warning suggested by Thomas on Slack. 
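In terms of the sketch above, the alias approach keeps the tag types in a
non-deprecated namespace, so the other specializations never have to name the
deprecated `A<>` at all (again illustrative; the real identifiers are the
`Impl::KokkosArrayContiguous`/`Impl::KokkosArrayStrided` tags introduced in
the diff below).

```
namespace Impl {
struct contiguous_tag {};  // hypothetical stand-in for Impl::KokkosArrayContiguous
}  // namespace Impl

template <class T = void, int N = 0, class Proxy = void>
struct A {};

template <>
struct [[deprecated]] A<void> {
  using contiguous = Impl::contiguous_tag;  // alias instead of a nested struct
};

// No deprecation warning: the argument list only mentions the Impl:: tag.
template <class T>
struct [[deprecated]] A<T, -1, Impl::contiguous_tag> {};
```

In this sketch, code that still spells `A<>::contiguous` keeps compiling
through the alias (and keeps getting the deprecation warning it asks for),
while the header's own specializations no longer trigger it.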
Co-Authored-By: Thomas Padioleau --------- Co-authored-by: Thomas Padioleau --- core/src/Kokkos_Array.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 3d71d09fde1..fed18d73fef 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -187,14 +187,20 @@ struct Array { }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +struct KokkosArrayContiguous {}; +struct KokkosArrayStrided {}; +} // namespace Impl + template <> struct KOKKOS_DEPRECATED Array { - struct contiguous {}; - struct strided {}; + using contiguous = Impl::KokkosArrayContiguous; + using strided = Impl::KokkosArrayStrided; }; template -struct KOKKOS_DEPRECATED Array::contiguous> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; @@ -262,7 +268,8 @@ struct KOKKOS_DEPRECATED Array::contiguous> { }; template -struct KOKKOS_DEPRECATED Array::strided> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; From e1b8afaab8a237221c2dded5ef42a95fa432dab2 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Wed, 17 Apr 2024 17:34:20 +0200 Subject: [PATCH 47/95] Add comments --- containers/src/Kokkos_DynRankView.hpp | 11 +++++++++++ containers/src/Kokkos_DynamicView.hpp | 12 +++++++++++- containers/src/Kokkos_OffsetView.hpp | 7 +++++++ core/src/Kokkos_CopyViews.hpp | 11 +++++++++++ 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index b9d40de9064..03f7b66970a 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1934,6 +1934,7 @@ struct MirrorDRVType { namespace Impl { +// collection of static asserts for create_mirror template void check_view_ctor_args_create_mirror_drv() { using alloc_prop_input = Impl::ViewCtorProp; @@ -1952,6 +1953,9 @@ void check_view_ctor_args_create_mirror_drv() { "not explicitly allow padding!"); } +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template auto create_mirror( const DynRankView& src, @@ -2041,6 +2045,9 @@ inline auto create_mirror( namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template auto create_mirror_view(const DynRankView& src, const typename Impl::ViewCtorProp& arg_prop) { @@ -2144,6 +2151,7 @@ inline auto create_mirror_view( namespace Impl { +// collection of static asserts for create_mirror_view_and_copy template void check_view_ctor_args_create_mirror_view_and_copy_drv() { using alloc_prop_input = Impl::ViewCtorProp; @@ -2164,6 +2172,9 @@ void check_view_ctor_args_create_mirror_view_and_copy_drv() { } // namespace Impl +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template ::specialize>::value>> diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index de98e6e75be..7c3e0b176c2 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -591,6 +591,7 @@ struct MirrorDynamicViewType { namespace Impl { +// collection of static asserts for create_mirror template void check_view_ctor_args_create_mirror_dynamic() { using alloc_prop_input = Impl::ViewCtorProp; @@ -609,6 +610,9 @@ void check_view_ctor_args_create_mirror_dynamic() { "not explicitly allow padding!"); } +// create a mirror +// 
private interface that accepts arbitrary view constructor args passed by a +// view_alloc template auto create_mirror(const Kokkos::Experimental::DynamicView& src, const Impl::ViewCtorProp& arg_prop) { @@ -681,6 +685,9 @@ inline auto create_mirror( namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template auto create_mirror_view(const Kokkos::Experimental::DynamicView& src, const Impl::ViewCtorProp& arg_prop) { @@ -959,7 +966,10 @@ void check_view_ctor_args_create_mirror_view_and_copy_dynamic() { } } // namespace Impl - // + +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template ::specialize>::value>> diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index d465e72a97e..142864e91d8 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1842,6 +1842,7 @@ struct MirrorOffsetType { namespace Impl { +// collection of static asserts for create_mirror template void check_view_ctor_args_create_mirror_offset() { using alloc_prop_input = Impl::ViewCtorProp; @@ -1860,6 +1861,9 @@ void check_view_ctor_args_create_mirror_offset() { "not explicitly allow padding!"); } +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template auto create_mirror(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { @@ -1923,6 +1927,9 @@ inline auto create_mirror( namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template auto create_mirror_view(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 6348c3d308f..50213a14b0a 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3452,6 +3452,7 @@ struct MirrorType { using view_type = Kokkos::View; }; +// collection of static asserts for create_mirror and create_mirror_view template void check_view_ctor_args_create_mirror() { using alloc_prop_input = Impl::ViewCtorProp; @@ -3470,6 +3471,9 @@ void check_view_ctor_args_create_mirror() { "not explicitly allow padding!"); } +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template auto create_mirror(const Kokkos::View& src, const Impl::ViewCtorProp& arg_prop) { @@ -3544,6 +3548,9 @@ create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template auto create_mirror_view(const Kokkos::View& src, const Impl::ViewCtorProp& arg_prop) { @@ -3641,6 +3648,7 @@ auto create_mirror_view(const Impl::ViewCtorProp& arg_prop, namespace Impl { +// collection of static asserts for create_mirror_view_and_copy template void check_view_ctor_args_create_mirror_view_and_copy() { using alloc_prop_input = Impl::ViewCtorProp; @@ -3661,6 +3669,9 @@ void check_view_ctor_args_create_mirror_view_and_copy() { } // namespace Impl +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template ::specialize>::value>> From 1287f7fd54e6ba058686fe6c7a372d23f4e4225f Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: 
Wed, 17 Apr 2024 17:50:45 +0200 Subject: [PATCH 48/95] Restore inline specifiers --- containers/src/Kokkos_DynRankView.hpp | 4 ++-- containers/src/Kokkos_DynamicView.hpp | 4 ++-- containers/src/Kokkos_OffsetView.hpp | 4 ++-- core/src/Kokkos_CopyViews.hpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 03f7b66970a..9eb6ea4018d 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1957,7 +1957,7 @@ void check_view_ctor_args_create_mirror_drv() { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -auto create_mirror( +inline auto create_mirror( const DynRankView& src, const Impl::ViewCtorProp& arg_prop) { @@ -2049,7 +2049,7 @@ namespace Impl { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -auto create_mirror_view(const DynRankView& src, +inline auto create_mirror_view(const DynRankView& src, const typename Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { if constexpr ( diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index 7c3e0b176c2..dce716f74ef 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -614,7 +614,7 @@ void check_view_ctor_args_create_mirror_dynamic() { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -auto create_mirror(const Kokkos::Experimental::DynamicView& src, +inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, const Impl::ViewCtorProp& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp; check_view_ctor_args_create_mirror_dynamic(); @@ -689,7 +689,7 @@ namespace Impl { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -auto create_mirror_view(const Kokkos::Experimental::DynamicView& src, +inline auto create_mirror_view(const Kokkos::Experimental::DynamicView& src, const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { if constexpr ( diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 142864e91d8..936262c0a32 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1865,7 +1865,7 @@ void check_view_ctor_args_create_mirror_offset() { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -auto create_mirror(const Kokkos::Experimental::OffsetView& src, +inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror_offset(); @@ -1931,7 +1931,7 @@ namespace Impl { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -auto create_mirror_view(const Kokkos::Experimental::OffsetView& src, +inline auto create_mirror_view(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { if constexpr ( diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 50213a14b0a..2d9ca6d1bba 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3475,7 +3475,7 @@ void check_view_ctor_args_create_mirror() { // private interface that accepts arbitrary view constructor args passed 
by a // view_alloc template -auto create_mirror(const Kokkos::View& src, +inline auto create_mirror(const Kokkos::View& src, const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror(); @@ -3552,7 +3552,7 @@ namespace Impl { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -auto create_mirror_view(const Kokkos::View& src, +inline auto create_mirror_view(const Kokkos::View& src, const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { if constexpr (std::is_same::memory_space, From 69c527a4245f495ae7d03c2bf4fcd7dd4364d0a7 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 17 Apr 2024 13:25:04 -0400 Subject: [PATCH 49/95] [ci skip] Enable deprecated code and deprecated warnings in nightly CI --- .jenkins_nightly | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.jenkins_nightly b/.jenkins_nightly index a8facd365c2..b723f12c0fc 100644 --- a/.jenkins_nightly +++ b/.jenkins_nightly @@ -95,7 +95,8 @@ pipeline { -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_SERIAL=ON \ .. && \ make -j8 && ctest --verbose @@ -123,7 +124,7 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_HIP=ON \ From e7b486ff614abb6454e1172098888f8de15f7b65 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 17 Apr 2024 18:10:26 -0400 Subject: [PATCH 50/95] Serial: Use the provided execution space instance in TeamPolicy --- core/src/Serial/Kokkos_Serial_Parallel_Team.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index f34a7daaca0..a25b51496ef 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -37,6 +37,8 @@ class TeamPolicyInternal int m_league_size; int m_chunk_size; + Kokkos::Serial m_space; + public: //! Tag this class as a kokkos execution policy using execution_policy = TeamPolicyInternal; @@ -46,10 +48,7 @@ class TeamPolicyInternal //! Execution space of this execution policy: using execution_space = Kokkos::Serial; - const typename traits::execution_space& space() const { - static typename traits::execution_space m_space; - return m_space; - } + const typename traits::execution_space& space() const { return m_space; } template friend class TeamPolicyInternal; @@ -116,12 +115,13 @@ class TeamPolicyInternal return (level == 0 ? 
1024 * 32 : 20 * 1024 * 1024); } /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space&, int league_size_request, + TeamPolicyInternal(const execution_space& space, int league_size_request, int team_size_request, int /* vector_length_request */ = 1) : m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, m_league_size(league_size_request), - m_chunk_size(32) { + m_chunk_size(32), + m_space(space) { if (team_size_request > 1) Kokkos::abort("Kokkos::abort: Requested Team Size is too large!"); } From 0859ab0af9b44315832cd27353bd3acf188853a3 Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Wed, 17 Apr 2024 17:31:45 -0500 Subject: [PATCH 51/95] Fixed the link for P6601 (Threads backend change) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7b8af7695c..c70ee5505f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,7 @@ * Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) #### Threads: -* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6446) +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) #### OpenMP: * Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) From 34d0db2f41dc11c9d30c3ff3449cddfc366c7e3d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 18 Apr 2024 07:54:59 -0400 Subject: [PATCH 52/95] Add test --- core/unit_test/TestExecSpacePartitioning.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/core/unit_test/TestExecSpacePartitioning.hpp b/core/unit_test/TestExecSpacePartitioning.hpp index 65314d6be7c..8703cc3b273 100644 --- a/core/unit_test/TestExecSpacePartitioning.hpp +++ b/core/unit_test/TestExecSpacePartitioning.hpp @@ -28,6 +28,17 @@ struct SumFunctor { void operator()(int i, int& lsum) const { lsum += i; } }; +template +void check_space_member_for_policies(const ExecSpace& exec) { + Kokkos::RangePolicy range_policy(exec, 0, 1); + ASSERT_EQ(range_policy.space(), exec); + Kokkos::MDRangePolicy> mdrange_policy(exec, {0, 0}, + {1, 1}); + ASSERT_EQ(mdrange_policy.space(), exec); + Kokkos::TeamPolicy team_policy(exec, 1, 1); + ASSERT_EQ(team_policy.space(), exec); +} + template void check_distinctive([[maybe_unused]] ExecSpace exec1, [[maybe_unused]] ExecSpace exec2) { @@ -89,6 +100,9 @@ void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { void test_partitioning(std::vector& instances) { check_distinctive(instances[0], instances[1]); + check_space_member_for_policies(instances[0]); + check_space_member_for_policies(instances[1]); + int sum1, sum2; int N = 3910; run_threaded_test( From 44fde213fb9515adfeb6645fbdbca5a4deeca633 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 18 Apr 2024 10:34:25 -0400 Subject: [PATCH 53/95] Use Kokkos::AUTO for OpenMPTarget --- core/unit_test/TestExecSpacePartitioning.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestExecSpacePartitioning.hpp b/core/unit_test/TestExecSpacePartitioning.hpp index 8703cc3b273..f8b570ab64d 100644 --- a/core/unit_test/TestExecSpacePartitioning.hpp +++ b/core/unit_test/TestExecSpacePartitioning.hpp @@ -35,7 +35,7 @@ void check_space_member_for_policies(const ExecSpace& exec) { Kokkos::MDRangePolicy> mdrange_policy(exec, {0, 0}, {1, 1}); ASSERT_EQ(mdrange_policy.space(), exec); - Kokkos::TeamPolicy 
team_policy(exec, 1, 1); + Kokkos::TeamPolicy team_policy(exec, 1, Kokkos::AUTO); ASSERT_EQ(team_policy.space(), exec); } From 8706b68d5bcb66473f180e131696e3d520bd34a7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 Apr 2024 13:56:19 -0500 Subject: [PATCH 54/95] kokkos_swap(Array) member friend should not be templated on some other type U Co-Authored-By: Maarten Arnst --- core/src/Kokkos_Array.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 7fd81030ecd..d1132bdbb4e 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -135,7 +135,6 @@ struct Array { } private: - template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> kokkos_swap(Array& a, From 86f5988b3128cd751da53d0b0c1af87d4ff7324a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 Apr 2024 13:57:54 -0500 Subject: [PATCH 55/95] Fix noexcept specification for kokkos_swap on zero-sized arrays Co-authored-by: Nevin Liber --- core/src/Kokkos_Array.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index d1132bdbb4e..09681c18842 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -198,7 +198,7 @@ struct Array { private: friend KOKKOS_INLINE_FUNCTION constexpr void kokkos_swap( - Array&, Array&) noexcept(Impl::is_nothrow_swappable_v) {} + Array&, Array&) noexcept {} }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 From 3dc6f5594bf57396bb6f8005e0c6fd3f23a0175d Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Mon, 22 Apr 2024 10:39:46 +0200 Subject: [PATCH 56/95] Add maybe_unused --- containers/src/Kokkos_DynRankView.hpp | 4 ++-- containers/src/Kokkos_DynamicView.hpp | 2 +- containers/src/Kokkos_OffsetView.hpp | 2 +- core/src/Kokkos_CopyViews.hpp | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 9eb6ea4018d..5b0db37f789 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -2050,7 +2050,7 @@ namespace Impl { // view_alloc template inline auto create_mirror_view(const DynRankView& src, - const typename Impl::ViewCtorProp& arg_prop) { + [[maybe_unused]] const typename Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { if constexpr ( std::is_same< @@ -2179,7 +2179,7 @@ template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::DynRankView& src) { using alloc_prop_input = Impl::ViewCtorProp; diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index dce716f74ef..3a2fe240083 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -690,7 +690,7 @@ namespace Impl { // view_alloc template inline auto create_mirror_view(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { if constexpr ( std::is_same< diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 936262c0a32..8399c934f06 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1932,7 +1932,7 @@ namespace Impl { // view_alloc template inline auto create_mirror_view(const 
Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { if constexpr ( std::is_same< diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 2d9ca6d1bba..c39dfd108cb 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3553,7 +3553,7 @@ namespace Impl { // view_alloc template inline auto create_mirror_view(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { if constexpr (std::is_same::memory_space, typename Kokkos::View< @@ -3676,7 +3676,7 @@ template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::View& src) { using alloc_prop_input = Impl::ViewCtorProp; From 2b54e2c40ced28530db273ff9726d02412703242 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Mon, 22 Apr 2024 15:31:57 +0200 Subject: [PATCH 57/95] Mutualize check functions --- containers/src/Kokkos_DynRankView.hpp | 42 ++------------------------- containers/src/Kokkos_DynamicView.hpp | 41 ++------------------------ containers/src/Kokkos_OffsetView.hpp | 21 +------------- 3 files changed, 5 insertions(+), 99 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 5b0db37f789..a055f22ada0 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1934,25 +1934,6 @@ struct MirrorDRVType { namespace Impl { -// collection of static asserts for create_mirror -template -void check_view_ctor_args_create_mirror_drv() { - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); -} - // create a mirror // private interface that accepts arbitrary view constructor args passed by a // view_alloc @@ -1961,7 +1942,7 @@ inline auto create_mirror( const DynRankView& src, const Impl::ViewCtorProp& arg_prop) { - check_view_ctor_args_create_mirror_drv(); + check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); @@ -2151,25 +2132,6 @@ inline auto create_mirror_view( namespace Impl { -// collection of static asserts for create_mirror_view_and_copy -template -void check_view_ctor_args_create_mirror_view_and_copy_drv() { - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); -} - } // namespace Impl // create a 
mirror view and deep copy it @@ -2183,7 +2145,7 @@ auto create_mirror_view_and_copy( const Kokkos::DynRankView& src) { using alloc_prop_input = Impl::ViewCtorProp; - Impl::check_view_ctor_args_create_mirror_view_and_copy_drv(); + Impl::check_view_ctor_args_create_mirror_view_and_copy(); if constexpr (Impl::MirrorDRViewType< typename Impl::ViewCtorProp::memory_space, T, diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index 3a2fe240083..b2b73589e3a 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -591,25 +591,6 @@ struct MirrorDynamicViewType { namespace Impl { -// collection of static asserts for create_mirror -template -void check_view_ctor_args_create_mirror_dynamic() { - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); -} - // create a mirror // private interface that accepts arbitrary view constructor args passed by a // view_alloc @@ -617,7 +598,7 @@ template inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, const Impl::ViewCtorProp& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp; - check_view_ctor_args_create_mirror_dynamic(); + check_view_ctor_args_create_mirror(); if constexpr (Impl::ViewCtorProp::has_memory_space) { using MemorySpace = typename alloc_prop_input::memory_space; @@ -947,24 +928,6 @@ struct ViewCopy, namespace Impl { -template -void check_view_ctor_args_create_mirror_view_and_copy_dynamic() { - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); -} - } // namespace Impl // create a mirror view and deep copy it @@ -978,7 +941,7 @@ auto create_mirror_view_and_copy( const Kokkos::Experimental::DynamicView& src) { using alloc_prop_input = Impl::ViewCtorProp; - Impl::check_view_ctor_args_create_mirror_view_and_copy_dynamic(); + Impl::check_view_ctor_args_create_mirror_view_and_copy(); if constexpr (Impl::MirrorDynamicViewType< typename Impl::ViewCtorProp::memory_space, T, diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 8399c934f06..bdfd44f3234 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1842,32 +1842,13 @@ struct MirrorOffsetType { namespace Impl { -// collection of static asserts for create_mirror -template -void check_view_ctor_args_create_mirror_offset() { - using alloc_prop_input = Impl::ViewCtorProp; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - 
!alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); -} - // create a mirror // private interface that accepts arbitrary view constructor args passed by a // view_alloc template inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { - check_view_ctor_args_create_mirror_offset(); + check_view_ctor_args_create_mirror(); if constexpr (Impl::ViewCtorProp::has_memory_space) { using Space = typename Impl::ViewCtorProp::memory_space; From 7b6edf7f6c35eaf9ec7071220d23e4c7a4bb50d1 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Mon, 22 Apr 2024 15:54:08 +0200 Subject: [PATCH 58/95] Simplify code --- containers/src/Kokkos_DynamicView.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index b2b73589e3a..87e296516b2 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -600,10 +600,11 @@ inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, using alloc_prop_input = Impl::ViewCtorProp; check_view_ctor_args_create_mirror(); + auto prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string(src.label()).append("_mirror")); + if constexpr (Impl::ViewCtorProp::has_memory_space) { using MemorySpace = typename alloc_prop_input::memory_space; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); auto ret = typename Kokkos::Impl::MirrorDynamicViewType< MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), @@ -613,8 +614,6 @@ inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, return ret; } else { - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); From 1ee800ad78b3f1d32540d0301a7ae45e325d296b Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Mon, 22 Apr 2024 16:47:17 +0200 Subject: [PATCH 59/95] Fix missing maybe_unused --- containers/src/Kokkos_DynamicView.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index 87e296516b2..5f2330a5c05 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -936,7 +936,7 @@ template ::specialize>::value>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp& arg_prop, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::Experimental::DynamicView& src) { using alloc_prop_input = Impl::ViewCtorProp; From 205fd156d990138dd6b6b400fb44d4aa9b196aa0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 22 Apr 2024 15:07:03 +0000 Subject: [PATCH 60/95] Replace deprecated sycl::device_ptr/sycl::host_ptr --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 11 ++++-- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 34 ++++++++--------- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 13 ++++--- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 13 ++++--- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 20 +++++----- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 37 ++++++++++--------- 
.../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 23 ++++++------ core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 4 +- 9 files changed, 84 insertions(+), 73 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 4a1c910c73d..d2112e3e4f2 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,7 +166,7 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -sycl::device_ptr SYCLInternal::resize_team_scratch_space( +sycl::ext::intel::device_ptr SYCLInternal::resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race @@ -251,7 +251,8 @@ void SYCLInternal::finalize() { m_queue.reset(); } -sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { +sycl::ext::intel::device_ptr SYCLInternal::scratch_space( + const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); @@ -271,7 +272,8 @@ sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { return m_scratchSpace; } -sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { +sycl::ext::intel::host_ptr SYCLInternal::scratch_host( + const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); @@ -291,7 +293,8 @@ sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { return m_scratchHost; } -sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { +sycl::ext::intel::device_ptr SYCLInternal::scratch_flags( + const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index ab7e8ce71e0..0666e1bd626 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -43,13 +43,12 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - sycl::device_ptr scratch_space(const std::size_t size); - sycl::device_ptr scratch_flags(const std::size_t size); - sycl::host_ptr scratch_host(const std::size_t size); + sycl::ext::intel::device_ptr scratch_space(const std::size_t size); + sycl::ext::intel::device_ptr scratch_flags(const std::size_t size); + sycl::ext::intel::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, - std::int64_t bytes, - bool force_shrink = false); + sycl::ext::intel::device_ptr resize_team_scratch_space( + int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); uint32_t impl_get_instance_id() const; @@ -59,21 +58,22 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - sycl::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - sycl::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - 
sycl::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + sycl::ext::intel::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + sycl::ext::intel::host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + sycl::ext::intel::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space - static constexpr int m_n_team_scratch = 10; - mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable sycl::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; - mutable int m_current_team_scratch = 0; - mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; + static constexpr int m_n_team_scratch = 10; + mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; + mutable sycl::ext::intel::device_ptr + m_team_scratch_ptr[m_n_team_scratch] = {}; + mutable int m_current_team_scratch = 0; + mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index ecb4a863da2..b1d32172d82 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -44,7 +44,7 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl::device_ptr m_global_scratch_ptr; + sycl::ext::intel::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; // Only let one ParallelFor instance at a time use the team scratch memory. // The constructor acquires the mutex which is released in the destructor. @@ -72,7 +72,8 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::device_ptr const global_scratch_ptr = m_global_scratch_ptr; + sycl::ext::intel::device_ptr const global_scratch_ptr = + m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -161,10 +162,10 @@ class Kokkos::Impl::ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. - auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast>(space.resize_team_scratch_space( + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = static_cast>( + space.resize_team_scratch_space( m_scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index f55280e22e3..8fec299c5d5 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -94,10 +94,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + sycl::ext::intel::device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,7 +114,7 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible @@ -155,14 +155,15 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto scratch_flags = static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { sycl::local_accessor local_mem( diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 5333e3c8a83..7feb2110068 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -69,10 +69,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + sycl::ext::intel::device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,7 +88,7 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { @@ -125,13 +125,15 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::device_ptr results_ptr, int values_per_thread) { + sycl::ext::intel::device_ptr results_ptr, + int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -301,9 +303,9 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( - sizeof(value_type) * value_count * n_wgroups)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * + n_wgroups)); sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 27165c59e3a..edb1b54b827 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; + sycl::ext::intel::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; @@ -82,7 +82,7 @@ class Kokkos::Impl::ParallelReduce>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -94,9 +94,9 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( - sizeof(value_type) * std::max(value_count, 1u))); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -113,7 +113,8 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::ext::intel::device_ptr const global_scratch_ptr = + m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -156,8 +157,9 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues @@ -170,12 +172,13 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::ext::intel::device_ptr const global_scratch_ptr = + m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - sycl::device_ptr results_ptr) { + sycl::ext::intel::device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -330,9 +333,9 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); - results_ptr = - static_cast>(instance.scratch_space( - sizeof(value_type) * std::max(value_count, 1u) * init_size)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u) * init_size)); size_t max_work_groups = 2 * @@ -425,10 +428,10 @@ class Kokkos::Impl::ParallelReduce>(space.resize_team_scratch_space( + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = static_cast>( + space.resize_team_scratch_space( m_scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 58cfea6a97a..b773af6cda7 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,7 +146,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - sycl::host_ptr m_scratch_host = nullptr; + sycl::ext::intel::host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -166,8 +166,9 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); @@ -175,8 +176,8 @@ class ParallelScanSYCLBase { auto scan_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::device_ptr global_mem_, - sycl::device_ptr group_results_) { + sycl::ext::intel::device_ptr global_mem_, + sycl::ext::intel::device_ptr group_results_) { auto lambda = [=](sycl::nd_item<1> item) { auto global_mem = global_mem_; auto group_results = group_results_; @@ -253,8 +254,8 @@ class ParallelScanSYCLBase { size_t wgroup_size; size_t n_wgroups; - sycl::device_ptr global_mem; - sycl::device_ptr group_results; + sycl::ext::intel::device_ptr global_mem; + sycl::ext::intel::device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -288,10 +289,10 @@ class ParallelScanSYCLBase { // that will contain the sum of the previous workgroups totals. 
// FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass - global_mem = - static_cast>(instance.scratch_space( - n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_host = static_cast>( + global_mem = static_cast>( + instance.scratch_space(n_wgroups * (wgroup_size + 1) * + sizeof(value_type))); + m_scratch_host = static_cast>( instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 2b4c2be5227..715d65a98f2 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl::device_ptr scratch_level_1_ptr, + sycl::ext::intel::device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 7069805a5b5..3b818490901 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -28,7 +28,7 @@ inline constexpr bool use_shuffle_based_algorithm = template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - sycl::device_ptr results_ptr, + sycl::ext::intel::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,7 +100,7 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, sycl::device_ptr results_ptr, + ValueType local_value, sycl::ext::intel::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); From 5932685c939a08b1b29b9c56bcb264728c1b16ba Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 22 Apr 2024 17:12:16 +0000 Subject: [PATCH 61/95] Introduce alias based on feature macro --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 11 +++--- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 24 ++++++------ .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 17 ++++---- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 17 ++++---- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 20 +++++----- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 39 ++++++++++--------- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 19 ++++----- core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 5 ++- core/src/setup/Kokkos_Setup_SYCL.hpp | 17 ++++++++ 10 files changed, 101 insertions(+), 70 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index d2112e3e4f2..adfd4c10b04 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,8 +166,9 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -sycl::ext::intel::device_ptr SYCLInternal::resize_team_scratch_space( - int scratch_pool_id, std::int64_t bytes, bool force_shrink) { +Kokkos::Impl::SYCLTypes::device_ptr +SYCLInternal::resize_team_scratch_space(int 
scratch_pool_id, std::int64_t bytes, + bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. @@ -251,7 +252,7 @@ void SYCLInternal::finalize() { m_queue.reset(); } -sycl::ext::intel::device_ptr SYCLInternal::scratch_space( +Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { @@ -272,7 +273,7 @@ sycl::ext::intel::device_ptr SYCLInternal::scratch_space( return m_scratchSpace; } -sycl::ext::intel::host_ptr SYCLInternal::scratch_host( +Kokkos::Impl::SYCLTypes::host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { @@ -293,7 +294,7 @@ sycl::ext::intel::host_ptr SYCLInternal::scratch_host( return m_scratchHost; } -sycl::ext::intel::device_ptr SYCLInternal::scratch_flags( +Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 0666e1bd626..de77b8efdeb 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -43,11 +43,13 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - sycl::ext::intel::device_ptr scratch_space(const std::size_t size); - sycl::ext::intel::device_ptr scratch_flags(const std::size_t size); - sycl::ext::intel::host_ptr scratch_host(const std::size_t size); + Kokkos::Impl::SYCLTypes::device_ptr scratch_space( + const std::size_t size); + Kokkos::Impl::SYCLTypes::device_ptr scratch_flags( + const std::size_t size); + Kokkos::Impl::SYCLTypes::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - sycl::ext::intel::device_ptr resize_team_scratch_space( + Kokkos::Impl::SYCLTypes::device_ptr resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); @@ -58,19 +60,19 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - sycl::ext::intel::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - sycl::ext::intel::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - sycl::ext::intel::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + Kokkos::Impl::SYCLTypes::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + Kokkos::Impl::SYCLTypes::host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + Kokkos::Impl::SYCLTypes::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space static constexpr int m_n_team_scratch = 10; mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable sycl::ext::intel::device_ptr + mutable Kokkos::Impl::SYCLTypes::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; mutable int m_current_team_scratch = 0; mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp 
b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index b1d32172d82..7f258ecccae 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -44,7 +44,7 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl::ext::intel::device_ptr m_global_scratch_ptr; + Kokkos::Impl::SYCLTypes::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; // Only let one ParallelFor instance at a time use the team scratch memory. // The constructor acquires the mutex which is released in the destructor. @@ -72,7 +72,7 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::ext::intel::device_ptr const global_scratch_ptr = + Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { @@ -162,12 +162,13 @@ class Kokkos::Impl::ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. - auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = static_cast>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = + static_cast>( + space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 8fec299c5d5..155f4b00821 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -94,10 +94,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + Kokkos::Impl::SYCLTypes::device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,8 +114,9 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -155,14 +156,16 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * + n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) : static_cast>(host_result_ptr); auto scratch_flags = - static_cast>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 7feb2110068..c00f9bb6232 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -69,10 +69,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + Kokkos::Impl::SYCLTypes::device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,8 +88,9 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { const auto begin = policy.begin(); @@ -126,13 +127,13 @@ class Kokkos::Impl::ParallelReduce>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::ext::intel::device_ptr results_ptr, + Kokkos::Impl::SYCLTypes::device_ptr results_ptr, int values_per_thread) { const auto begin = policy.begin(); @@ -303,9 +304,10 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * - n_wgroups)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * + n_wgroups)); sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index edb1b54b827..b9be1148832 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; + Kokkos::Impl::SYCLTypes::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; @@ -82,7 +82,7 @@ class Kokkos::Impl::ParallelReduce>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -94,9 +94,10 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u))); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -113,7 +114,7 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = + Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -158,7 +159,7 @@ class Kokkos::Impl::ParallelReduce>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -172,13 +173,13 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = + Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - sycl::ext::intel::device_ptr results_ptr) { + Kokkos::Impl::SYCLTypes::device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -333,9 +334,10 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); - results_ptr = static_cast>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u) * init_size)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u) * init_size)); size_t max_work_groups = 2 * @@ -428,12 +430,13 @@ class Kokkos::Impl::ParallelReduce>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = + static_cast>( + space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index b773af6cda7..a3efe56b99c 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,7 +146,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - sycl::ext::intel::host_ptr m_scratch_host = nullptr; + Kokkos::Impl::SYCLTypes::host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -167,7 +167,7 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); auto scratch_flags = - static_cast>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); @@ -176,8 +176,8 @@ class ParallelScanSYCLBase { auto scan_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::ext::intel::device_ptr global_mem_, - sycl::ext::intel::device_ptr group_results_) { + Kokkos::Impl::SYCLTypes::device_ptr global_mem_, + Kokkos::Impl::SYCLTypes::device_ptr group_results_) { auto lambda = [=](sycl::nd_item<1> item) { auto global_mem = global_mem_; auto group_results = group_results_; @@ -254,8 +254,8 @@ class ParallelScanSYCLBase { size_t wgroup_size; size_t n_wgroups; - sycl::ext::intel::device_ptr global_mem; - sycl::ext::intel::device_ptr group_results; + Kokkos::Impl::SYCLTypes::device_ptr global_mem; + Kokkos::Impl::SYCLTypes::device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -289,11 +289,12 @@ class ParallelScanSYCLBase { // that will contain the sum of the previous workgroups totals. 
// FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass - global_mem = static_cast>( + global_mem = static_cast>( instance.scratch_space(n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_host = static_cast>( - instance.scratch_host(sizeof(value_type))); + m_scratch_host = + static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 715d65a98f2..d838dc94c67 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl::ext::intel::device_ptr scratch_level_1_ptr, + Kokkos::Impl::SYCLTypes::device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 3b818490901..f25dec76777 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -28,7 +28,7 @@ inline constexpr bool use_shuffle_based_algorithm = template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - sycl::ext::intel::device_ptr results_ptr, + Kokkos::Impl::SYCLTypes::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,7 +100,8 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, sycl::ext::intel::device_ptr results_ptr, + ValueType local_value, + Kokkos::Impl::SYCLTypes::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 30f6fa2ad23..7fb10bb39a2 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -45,4 +45,21 @@ #define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() #endif +// FIXME_SYCL Use type directly once it has stabilized in SYCL. +namespace Kokkos::Impl::SYCLTypes { +#ifndef SYCL_EXT_INTEL_USM_ADDRESS_SPACES +#error SYCL_EXT_INTEL_USM_ADDRESS_SPACES undefined! 
+#elif SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2 +template +using device_ptr = sycl::ext::intel::device_ptr; +template +using host_ptr = sycl::ext::intel::host_ptr; +#else +template +using device_ptr = sycl::device_ptr; +template +using host_ptr = sycl::host_ptr; +#endif +} // namespace Kokkos::Impl::SYCLTypes + #endif From a7827731cf8256a9387a8786555aa5f97dccd17a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 22 Apr 2024 19:07:13 -0400 Subject: [PATCH 62/95] Kokkos::Impl::SYCLTypes:: -> Kokkos::Impl::sycl_ --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 11 +- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 24 ++- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 12 +- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 20 +-- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 22 +-- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 34 ++-- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 166 +++++++++--------- core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 5 +- core/src/setup/Kokkos_Setup_SYCL.hpp | 12 +- 10 files changed, 142 insertions(+), 166 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index adfd4c10b04..5843dca8123 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,9 +166,8 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -Kokkos::Impl::SYCLTypes::device_ptr -SYCLInternal::resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, - bool force_shrink) { +Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( + int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
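The Kokkos_Setup_SYCL.hpp hunk introduced just above selects the pointer alias from the SYCL_EXT_INTEL_USM_ADDRESS_SPACES feature macro, and the surrounding patches then switch the backend over to that alias. As a reduced, standalone sketch of the selection logic (illustrative only, not part of PATCH 61 or 62; the demo namespace and the defined() guard are mine, and it needs a SYCL-capable compiler):

#include <sycl/sycl.hpp>

namespace demo {
#if defined(SYCL_EXT_INTEL_USM_ADDRESS_SPACES) && \
    SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2
// The extension macro advertises the decorated Intel USM pointer types.
template <class T>
using sycl_device_ptr = sycl::ext::intel::device_ptr<T>;
template <class T>
using sycl_host_ptr = sycl::ext::intel::host_ptr<T>;
#else
// Fall back to the standard sycl::device_ptr / sycl::host_ptr aliases.
template <class T>
using sycl_device_ptr = sycl::device_ptr<T>;
template <class T>
using sycl_host_ptr = sycl::host_ptr<T>;
#endif
}  // namespace demo

The point of routing every use through one alias, as the patches do for the scratch-space pointers, is that the backend code no longer has to repeat this macro check at each call site.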
@@ -252,7 +251,7 @@ void SYCLInternal::finalize() { m_queue.reset(); } -Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_space( +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { @@ -273,7 +272,7 @@ Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_space( return m_scratchSpace; } -Kokkos::Impl::SYCLTypes::host_ptr SYCLInternal::scratch_host( +Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { @@ -294,7 +293,7 @@ Kokkos::Impl::SYCLTypes::host_ptr SYCLInternal::scratch_host( return m_scratchHost; } -Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_flags( +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index de77b8efdeb..2d784ef8a5f 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -43,13 +43,11 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - Kokkos::Impl::SYCLTypes::device_ptr scratch_space( - const std::size_t size); - Kokkos::Impl::SYCLTypes::device_ptr scratch_flags( - const std::size_t size); - Kokkos::Impl::SYCLTypes::host_ptr scratch_host(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); + Kokkos::Impl::sycl_host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - Kokkos::Impl::SYCLTypes::device_ptr resize_team_scratch_space( + Kokkos::Impl::sycl_device_ptr resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); @@ -60,19 +58,19 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - Kokkos::Impl::SYCLTypes::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - Kokkos::Impl::SYCLTypes::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - Kokkos::Impl::SYCLTypes::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + Kokkos::Impl::sycl_host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space static constexpr int m_n_team_scratch = 10; mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable Kokkos::Impl::SYCLTypes::device_ptr + mutable Kokkos::Impl::sycl_device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; mutable int m_current_team_scratch = 0; mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 7f258ecccae..d98f4837315 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -44,7 +44,7 @@ class 
Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - Kokkos::Impl::SYCLTypes::device_ptr m_global_scratch_ptr; + sycl_device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; // Only let one ParallelFor instance at a time use the team scratch memory. // The constructor acquires the mutex which is released in the destructor. @@ -72,8 +72,7 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = - m_global_scratch_ptr; + sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -165,10 +164,9 @@ class Kokkos::Impl::ParallelFor, auto& space = *m_policy.space().impl_internal_space_instance(); m_scratch_pool_id = space.acquire_team_scratch_space(); m_global_scratch_ptr = - static_cast>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + static_cast>(space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 155f4b00821..c1414ee0581 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -94,10 +94,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + sycl_device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,9 +114,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -156,17 +155,14 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * - n_wgroups)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto scratch_flags = - static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { sycl::local_accessor local_mem( diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index c00f9bb6232..dbe2366b8bc 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -69,10 +69,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + sycl_device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,9 +88,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { const auto begin = policy.begin(); @@ -126,15 +125,13 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr, - int values_per_thread) { + sycl_device_ptr results_ptr, int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -305,9 +302,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * - n_wgroups)); + static_cast>(instance.scratch_space( + sizeof(value_type) * value_count * n_wgroups)); sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index b9be1148832..1332fafde94 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; + sycl_device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; @@ -82,7 +82,7 @@ class Kokkos::Impl::ParallelReduce>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -95,9 +95,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u))); + static_cast>(instance.scratch_space( + sizeof(value_type) * std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -114,8 +113,7 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = - m_global_scratch_ptr; + sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -158,9 +156,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues @@ -173,13 +170,12 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = - m_global_scratch_ptr; + sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr) { + sycl_device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -335,9 +331,8 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); results_ptr = - static_cast>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u) * init_size)); + static_cast>(instance.scratch_space( + sizeof(value_type) * std::max(value_count, 1u) * init_size)); size_t max_work_groups = 2 * @@ -433,10 +428,9 @@ class Kokkos::Impl::ParallelReduce>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + static_cast>(space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index a3efe56b99c..bfc3fba7412 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,7 +146,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - Kokkos::Impl::SYCLTypes::host_ptr m_scratch_host = nullptr; + sycl_host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -166,96 +166,93 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = - static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); // Initialize global memory - auto scan_lambda_factory = - [&](sycl::local_accessor local_mem, - sycl::local_accessor num_teams_done, - Kokkos::Impl::SYCLTypes::device_ptr global_mem_, - Kokkos::Impl::SYCLTypes::device_ptr group_results_) { - auto lambda = [=](sycl::nd_item<1> item) { - auto global_mem = global_mem_; - auto group_results = group_results_; - - const CombinedFunctorReducer< - FunctorType, typename Analysis::Reducer>& functor_reducer = - functor_wrapper.get_functor(); - const FunctorType& functor = functor_reducer.get_functor(); - const typename Analysis::Reducer& reducer = - functor_reducer.get_reducer(); - - const auto n_wgroups = item.get_group_range()[0]; - const int wgroup_size = item.get_local_range()[0]; - - const int local_id = item.get_local_linear_id(); - const index_type global_id = item.get_global_linear_id(); - - // Initialize local memory - value_type local_value; - reducer.init(&local_value); - if (global_id < size) { - if constexpr (std::is_void::value) - functor(global_id + begin, local_value, false); - else - functor(WorkTag(), global_id + begin, local_value, false); - } + auto scan_lambda_factory = [&](sycl::local_accessor local_mem, + sycl::local_accessor + num_teams_done, + sycl_device_ptr global_mem_, + sycl_device_ptr group_results_) { + auto lambda = [=](sycl::nd_item<1> item) { + auto global_mem = global_mem_; + auto group_results = group_results_; + + const CombinedFunctorReducer& + functor_reducer = functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); + + const auto n_wgroups = item.get_group_range()[0]; + const int wgroup_size = item.get_local_range()[0]; + + const int local_id = item.get_local_linear_id(); + const index_type global_id = item.get_global_linear_id(); + + // Initialize local memory + value_type local_value; + reducer.init(&local_value); + 
if (global_id < size) { + if constexpr (std::is_void::value) + functor(global_id + begin, local_value, false); + else + functor(WorkTag(), global_id + begin, local_value, false); + } - workgroup_scan<>(item, reducer, local_mem, local_value, - wgroup_size); + workgroup_scan<>(item, reducer, local_mem, local_value, wgroup_size); - // Write results to global memory - if (global_id < size) global_mem[global_id] = local_value; + // Write results to global memory + if (global_id < size) global_mem[global_id] = local_value; - if (local_id == wgroup_size - 1) { - group_results[item.get_group_linear_id()] = - local_mem[item.get_sub_group().get_group_range()[0] - 1]; + if (local_id == wgroup_size - 1) { + group_results[item.get_group_linear_id()] = + local_mem[item.get_sub_group().get_group_range()[0] - 1]; - sycl::atomic_ref - scratch_flags_ref(*scratch_flags); - num_teams_done[0] = ++scratch_flags_ref; - } - item.barrier(sycl::access::fence_space::global_space); - if (num_teams_done[0] == n_wgroups) { - if (local_id == 0) *scratch_flags = 0; - value_type total; - reducer.init(&total); - - for (unsigned int offset = 0; offset < n_wgroups; - offset += wgroup_size) { - index_type id = local_id + offset; - if (id < static_cast(n_wgroups)) - local_value = group_results[id]; - else - reducer.init(&local_value); - workgroup_scan<>( - item, reducer, local_mem, local_value, - std::min(n_wgroups - offset, wgroup_size)); - if (id < static_cast(n_wgroups)) { - reducer.join(&local_value, &total); - group_results[id] = local_value; - } - reducer.join( - &total, - &local_mem[item.get_sub_group().get_group_range()[0] - 1]); - if (offset + wgroup_size < n_wgroups) - item.barrier(sycl::access::fence_space::global_space); - } + sycl::atomic_ref + scratch_flags_ref(*scratch_flags); + num_teams_done[0] = ++scratch_flags_ref; + } + item.barrier(sycl::access::fence_space::global_space); + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; + value_type total; + reducer.init(&total); + + for (unsigned int offset = 0; offset < n_wgroups; + offset += wgroup_size) { + index_type id = local_id + offset; + if (id < static_cast(n_wgroups)) + local_value = group_results[id]; + else + reducer.init(&local_value); + workgroup_scan<>( + item, reducer, local_mem, local_value, + std::min(n_wgroups - offset, wgroup_size)); + if (id < static_cast(n_wgroups)) { + reducer.join(&local_value, &total); + group_results[id] = local_value; } - }; - return lambda; - }; + reducer.join( + &total, + &local_mem[item.get_sub_group().get_group_range()[0] - 1]); + if (offset + wgroup_size < n_wgroups) + item.barrier(sycl::access::fence_space::global_space); + } + } + }; + return lambda; + }; size_t wgroup_size; size_t n_wgroups; - Kokkos::Impl::SYCLTypes::device_ptr global_mem; - Kokkos::Impl::SYCLTypes::device_ptr group_results; + sycl_device_ptr global_mem; + sycl_device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -289,12 +286,11 @@ class ParallelScanSYCLBase { // that will contain the sum of the previous workgroups totals. 
// FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass - global_mem = static_cast>( - instance.scratch_space(n_wgroups * (wgroup_size + 1) * - sizeof(value_type))); - m_scratch_host = - static_cast>( - instance.scratch_host(sizeof(value_type))); + global_mem = + static_cast>(instance.scratch_space( + n_wgroups * (wgroup_size + 1) * sizeof(value_type))); + m_scratch_host = static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index d838dc94c67..910e3602714 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - Kokkos::Impl::SYCLTypes::device_ptr scratch_level_1_ptr, + sycl_device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index f25dec76777..06be143ecca 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -28,7 +28,7 @@ inline constexpr bool use_shuffle_based_algorithm = template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr, + sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,8 +100,7 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr, + ValueType local_value, sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 7fb10bb39a2..b117d75acb9 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -46,20 +46,20 @@ #endif // FIXME_SYCL Use type directly once it has stabilized in SYCL. -namespace Kokkos::Impl::SYCLTypes { +namespace Kokkos::Impl { #ifndef SYCL_EXT_INTEL_USM_ADDRESS_SPACES #error SYCL_EXT_INTEL_USM_ADDRESS_SPACES undefined! 
#elif SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2 template -using device_ptr = sycl::ext::intel::device_ptr; +using sycl_device_ptr = sycl::ext::intel::device_ptr; template -using host_ptr = sycl::ext::intel::host_ptr; +using sycl_host_ptr = sycl::ext::intel::host_ptr; #else template -using device_ptr = sycl::device_ptr; +using sycl_device_ptr = sycl::device_ptr; template -using host_ptr = sycl::host_ptr; +using sycl_host_ptr = sycl::host_ptr; #endif -} // namespace Kokkos::Impl::SYCLTypes +} // namespace Kokkos::Impl #endif From ab3cae4865aec2114a7fe21288fdab2916b92188 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 24 Apr 2024 10:14:56 +0200 Subject: [PATCH 63/95] Fix wrong macro guards for deprecated Kokkos::pair specialization Co-Authored-By: Nicolas Morales --- core/src/Kokkos_Pair.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index d1bd11f7162..9c3516eb222 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -413,7 +413,7 @@ KOKKOS_FORCEINLINE_FUNCTION pair tie(T1& x, T2& y) { return (pair(x, y)); } -#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 // // Specialization of Kokkos::pair for a \c void second argument. This // is not actually a "pair"; it only contains one element, the first. From fafe861d0683cdde279a44dc8dc10b71d9866c30 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 24 Apr 2024 14:55:20 +0200 Subject: [PATCH 64/95] Fix support for Kokkos::Array of const-qualified element type --- core/src/Kokkos_Array.hpp | 3 ++- core/unit_test/TestArray.cpp | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index dcba8a42484..29e5edd9baa 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -134,8 +134,9 @@ struct Array { } private: + template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - Impl::is_swappable::value> + Impl::is_swappable::value> kokkos_swap(Array& a, Array& b) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index e691d83ebe2..d20d355b792 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -120,6 +120,13 @@ static_assert(test_array_aggregate_initialization()); } } +constexpr bool test_array_const_qualified_element_type() { + Kokkos::Array a{255}; + return a[0] == 255; +} + +static_assert(test_array_const_qualified_element_type()); + // User-defined type providing a sepcialization of kokkos_swap struct MyInt { int i; From 63eef4623a84634f97b2761c354792e5c0613cd9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 24 Apr 2024 23:35:59 +0200 Subject: [PATCH 65/95] Try to fix the CUDA 11.0 build --- core/src/Kokkos_Array.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 29e5edd9baa..6ff27db061b 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -137,8 +137,13 @@ struct Array { template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> - kokkos_swap(Array& a, - Array& b) noexcept(Impl::is_nothrow_swappable_v) { + kokkos_swap( +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Array& a, Array& b +#else + Array& a, Array& b +#endif + ) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { kokkos_swap(a[i], b[i]); } From 
ebb1cb308a956a4b98f2a5eb26660ed8ca3fe6ad Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 07:43:16 +0200 Subject: [PATCH 66/95] Revert "Try to fix the CUDA 11.0 build" This reverts commit 63eef4623a84634f97b2761c354792e5c0613cd9. --- core/src/Kokkos_Array.hpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 6ff27db061b..29e5edd9baa 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -137,13 +137,8 @@ struct Array { template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> - kokkos_swap( -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - Array& a, Array& b -#else - Array& a, Array& b -#endif - ) noexcept(Impl::is_nothrow_swappable_v) { + kokkos_swap(Array& a, + Array& b) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { kokkos_swap(a[i], b[i]); } From 031f6d94a4294c767c4e049f9aa5fadee47c9ef3 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 08:11:46 +0200 Subject: [PATCH 67/95] Alternate definition of Impl::is_nothrow_swappable_v for NVCC version less than 11.4 --- core/src/Kokkos_Swap.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp index 2f849a13ab6..fd69a8e6266 100644 --- a/core/src/Kokkos_Swap.hpp +++ b/core/src/Kokkos_Swap.hpp @@ -37,6 +37,26 @@ kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& namespace Impl { +// Workaround for the definition of is_nothrow_swappable_v +#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) +template +struct is_swappable { + template + static decltype(kokkos_swap(std::declval(), std::declval())) + test_swap(int) noexcept(noexcept(kokkos_swap(std::declval(), + std::declval()))); + struct Nope {}; // test_swap must return a complete type for the definition + // of nothrow below + template + static Nope test_swap(long); + static constexpr bool value = + !std::is_same_v(0)), Nope>; + static constexpr bool nothrow = noexcept(test_swap(0)); +}; + +template +inline constexpr bool is_nothrow_swappable_v = is_swappable::nothrow; +#else template struct is_swappable { template @@ -52,6 +72,7 @@ struct is_swappable { template inline constexpr bool is_nothrow_swappable_v = noexcept(kokkos_swap(std::declval(), std::declval())); +#endif } // namespace Impl From 2391f1765318725042dcdad6581eca6c03cb5adc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 17:23:11 +0200 Subject: [PATCH 68/95] Avoid introducing a 2nd definition of the Impl::swappable trait Co-Authored-By: Daniel Arndt --- core/src/Kokkos_Swap.hpp | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp index fd69a8e6266..907f8607a7e 100644 --- a/core/src/Kokkos_Swap.hpp +++ b/core/src/Kokkos_Swap.hpp @@ -37,26 +37,6 @@ kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& namespace Impl { -// Workaround for the definition of is_nothrow_swappable_v -#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) -template -struct is_swappable { - template - static decltype(kokkos_swap(std::declval(), std::declval())) - test_swap(int) noexcept(noexcept(kokkos_swap(std::declval(), - std::declval()))); - struct Nope {}; // test_swap must return a complete type for the definition - // of nothrow below - template - static Nope test_swap(long); - static constexpr bool value = - 
!std::is_same_v(0)), Nope>; - static constexpr bool nothrow = noexcept(test_swap(0)); -}; - -template -inline constexpr bool is_nothrow_swappable_v = is_swappable::nothrow; -#else template struct is_swappable { template @@ -69,6 +49,13 @@ struct is_swappable { !std::is_same_v(0)), Nope>; }; +#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) +template +inline constexpr bool is_nothrow_swappable_v = + is_swappable::value&& noexcept( + kokkos_swap(std::declval&>(), + std::declval&>())); +#else template inline constexpr bool is_nothrow_swappable_v = noexcept(kokkos_swap(std::declval(), std::declval())); From d434f87e91069bc0d0af020053f4ca7c3f3b80c4 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 20:51:34 +0200 Subject: [PATCH 69/95] Do not require OpenMP support for languages other than CXX Specify CXX component when searching for OpenMP so that OpenMP support is not required for other languages with CMake. One caveat is that finding the OpenMP dependency downstream will require CMake minimum version of 3.10 https://cmake.org/cmake/help/latest/module/FindOpenMP.html Co-Authored-By: Luca Bertagna --- cmake/kokkos_tpls.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index 6ef3b79bde2..df01f200d13 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -103,13 +103,13 @@ if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) endif() IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED) + find_package(OpenMP REQUIRED COMPONENTS CXX) # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency # so we just append the flags here instead of linking with the OpenMP target. IF(KOKKOS_HAS_TRILINOS) COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED) + KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) ENDIF() ENDIF() From 19ca9ce97a80bbf9f43353b22c09c437f1389384 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 26 Apr 2024 09:47:04 -0600 Subject: [PATCH 70/95] Update version --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f4252437111..d0cf6696f89 100644 --- a/README.md +++ b/README.md @@ -28,12 +28,12 @@ To start learning about Kokkos: The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -The current release is [4.2.01](https://github.com/kokkos/kokkos/releases/tag/4.2.01). +The current release is [4.3.00](https://github.com/kokkos/kokkos/releases/tag/4.3.00). ```bash -curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.00.tar.gz # Or with wget -wget https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.00.tar.gz ``` To clone the latest development version of Kokkos from GitHub: From 9686392118b72205251a0c9511c5fceacc1b6db8 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 26 Apr 2024 09:52:03 -0600 Subject: [PATCH 71/95] Add Linux Foundation notice and fix C++ standard --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d0cf6696f89..7d9d70fac5c 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ backends in development. 
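A note for readers tracking the kokkos_swap changes in PATCH 65 through PATCH 68 above: the NVCC < 11.4 workaround only reformulates how the noexcept query is spelled; the detection idiom itself is unchanged. A standalone sketch of that idiom (illustrative only; it uses std::swap instead of kokkos_swap so it compiles without Kokkos, and the demo namespace is mine):

#include <type_traits>
#include <utility>

namespace demo {
using std::swap;  // stand-in for Kokkos::kokkos_swap in this sketch

template <class T>
struct is_swappable {
  // Selected only if swap(U&, U&) is a valid expression (SFINAE).
  template <class U>
  static decltype(swap(std::declval<U&>(), std::declval<U&>())) test_swap(int);
  struct Nope {};  // complete sentinel type for the failure case
  template <class U>
  static Nope test_swap(long);
  static constexpr bool value =
      !std::is_same_v<decltype(test_swap<T>(0)), Nope>;
};

// The noexcept query that the NVCC workaround respells in terms of
// is_swappable<T>::value plus an explicit noexcept(...) expression.
template <class T>
inline constexpr bool is_nothrow_swappable_v =
    noexcept(swap(std::declval<T&>(), std::declval<T&>()));
}  // namespace demo

static_assert(demo::is_swappable<int>::value);
static_assert(demo::is_nothrow_swappable_v<int>);

int main() {}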
**Kokkos Core is part of the [Kokkos C++ Performance Portability Programming Ecosystem](https://kokkos.org/about/abstract/).**
+Kokkos is a [Linux Foundation](https://linuxfoundation.org) project.
+
## Learning about Kokkos
To start learning about Kokkos:
@@ -44,7 +46,7 @@ git clone -b develop https://github.com/kokkos/kokkos.git
### Building Kokkos
-To build Kokkos, you will need to have a C++ compiler that supports C++14 or later.
+To build Kokkos, you will need to have a C++ compiler that supports C++17 or later.
All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.org/kokkos-core-wiki/requirements.html).
Building and installation instructions are described [here](https://kokkos.org/kokkos-core-wiki/building.html).

From 7e7709fdb8029e6e97f5a5f8549c3a42a4ddbdbd Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Sat, 27 Apr 2024 12:32:21 -0400
Subject: [PATCH 72/95] SYCL: Avoid deprecated floating-point number abs overloads (#6959)

* Avoid deprecated floating-point number abs overloads
* Add a comment

---
 core/src/Kokkos_MathematicalFunctions.hpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp
index 3fead8dd293..19967782e5e 100644
--- a/core/src/Kokkos_MathematicalFunctions.hpp
+++ b/core/src/Kokkos_MathematicalFunctions.hpp
@@ -277,12 +277,20 @@ KOKKOS_INLINE_FUNCTION long long abs(long long n) {
#endif
}
KOKKOS_INLINE_FUNCTION float abs(float x) {
+#ifdef KOKKOS_ENABLE_SYCL
+  return sycl::fabs(x); // sycl::abs is only provided for integral types
+#else
  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
  return abs(x);
+#endif
}
KOKKOS_INLINE_FUNCTION double abs(double x) {
+#ifdef KOKKOS_ENABLE_SYCL
+  return sycl::fabs(x); // sycl::abs is only provided for integral types
+#else
  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
  return abs(x);
+#endif
}
inline long double abs(long double x) {
  using std::abs;

From 4ec82963fbcd174aab79a86875c85e09a4fcc170 Mon Sep 17 00:00:00 2001
From: Rahulkumar Gayatri
Date: Sun, 28 Apr 2024 08:52:18 -0700
Subject: [PATCH 73/95] OpenMPTarget: Update loop order in MDRange (#6925)

* OpenMPTarget: Reverse loop order in MDRange.
* OpenMPTarget: Honor user request for iteration in MDRange.
* OpenMPTarget: clang-format
* OpenMPTarget: Remove unnecessary includes.
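The diff that follows (Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp) emits a differently ordered collapse loop nest depending on the requested iteration direction. A reduced, standalone sketch of the two orderings for a rank-2 range (plain C++, no OpenMP offload; the function names are mine):

#include <cstdio>

template <class Functor>
void run_right(int n0, int n1, const Functor& f) {
  for (int i0 = 0; i0 < n0; ++i0)    // slow index
    for (int i1 = 0; i1 < n1; ++i1)  // fast index
      f(i0, i1);
}

template <class Functor>
void run_left(int n0, int n1, const Functor& f) {
  for (int i1 = 0; i1 < n1; ++i1)    // slow index
    for (int i0 = 0; i0 < n0; ++i0)  // fast index
      f(i0, i1);
}

int main() {
  auto print = [](int i0, int i1) { std::printf("(%d,%d) ", i0, i1); };
  run_right(2, 3, print);  // (0,0) (0,1) (0,2) (1,0) ...
  std::printf("\n");
  run_left(2, 3, print);   // (0,0) (1,0) (0,1) (1,1) ...
  std::printf("\n");
}

In the patch, the OpenMPTargetIterateRight overloads correspond to run_right (last index varies fastest) and the OpenMPTargetIterateLeft overloads to run_left (first index varies fastest); the functor always receives the indices in declaration order.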
--------- Co-authored-by: Rahulkumar Gayatri --- core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 3 +- .../Kokkos_OpenMPTarget_MDRangePolicy.hpp | 5 + ...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 383 +++++++++++ ...s_OpenMPTarget_ParallelReduce_MDRange.hpp} | 633 ++++++++---------- 4 files changed, 674 insertions(+), 350 deletions(-) create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp rename core/src/OpenMPTarget/{Kokkos_OpenMPTarget_Parallel_MDRange.hpp => Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp} (62%) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index ea4e7f6baba..84c7b85f11d 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -146,7 +146,8 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { /*--------------------------------------------------------------------------*/ #include -#include +#include +#include #include /*--------------------------------------------------------------------------*/ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp index d718f56d38b..e353676b617 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp @@ -22,6 +22,10 @@ namespace Kokkos { namespace Impl { +using OpenMPTargetIterateLeft = std::integral_constant; +using OpenMPTargetIterateRight = + std::integral_constant; + template struct ThreadAndVectorNestLevel +#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::MDRangePolicy; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + const FunctorType m_functor; + const Policy m_policy; + + public: + inline void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + FunctorType functor(m_functor); + Policy policy = m_policy; + + typename Policy::point_type unused; + static_assert(1 < Policy::rank && Policy::rank < 7); + static_assert(Policy::inner_direction == Iterate::Left || + Policy::inner_direction == Iterate::Right); + + execute_tile( + unused, functor, policy, + std::integral_constant()); + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& 
policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < 
end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i5 = begin_5; i5 < end_5; ++i5) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i1 = begin_1; i1 < end_1; ++i1) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < 
end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + // TODO DZP: based on a conversation with Christian, we're using 256 as a + // heuristic here. We need something better once we can query these kinds of + // properties + template + static int max_tile_size_product(const Policy&, const Functor&) { + return 256; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP */ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp similarity index 62% rename from core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp rename to core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index 6878531730d..0782a79302a 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -14,128 +14,122 @@ // //@HEADER -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP #include #include -#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" #include -// WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly, -// this was tracked down to a bug in clang with regards of mapping structs -// with arrays of long in it. Arrays of int might be fine though ... 
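
For context on the comment above (removed here together with the no-tiles macro): the failure mode it describes, tile sizes not arriving on the device because the policy struct carries arrays of long, can be reproduced outside of Kokkos. The sketch below is illustrative only; "TileDims" is a made-up stand-in for the policy internals, and whether the values arrive intact depends on the compiler defect the comment refers to.

    // Illustrative stand-alone reproduction of the mapping pattern described above;
    // "TileDims" is a hypothetical stand-in, not Kokkos code.
    #include <cstdio>

    struct TileDims {
      long m_tile[6];  // struct member that is an array of long, per the comment
    };

    int main() {
      TileDims t{{2, 4, 8, 16, 32, 64}};
      long sum = 0;
    #pragma omp target map(to : t) map(tofrom : sum)
      {
        for (int i = 0; i < 6; ++i) sum += t.m_tile[i];
      }
      // 126 means the struct was mapped intact; anything else reproduces the issue.
      std::printf("device saw %ld (expected 126)\n", sum);
      return 0;
    }
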
-#define KOKKOS_IMPL_MDRANGE_USE_NO_TILES // undef EOF - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { +template +class ParallelReduce, + Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using Index = typename Policy::index_type; - const FunctorType m_functor; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + + static constexpr bool UseReducer = + !std::is_same_v; + + const pointer_type m_result_ptr; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; + using ParReduceCopy = ParallelReduceCopy; + + bool m_result_ptr_on_device; + + // Only let one ParallelReduce instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_memory_lock; + public: inline void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); - Policy policy = m_policy; - -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - typename Policy::point_type unused; - - execute_tile(unused, functor, policy); -#else - const int64_t begin = 0; - const int64_t end = m_policy.m_num_tiles; - -#pragma omp target teams distribute map(to : functor) num_teams(end - begin) - { - for (ptrdiff_t tile_idx = begin; tile_idx < end; ++tile_idx) { - -#pragma omp parallel - { - typename Policy::point_type offset; - if (Policy::outer_direction == Policy::Left) { - for (int i = 0; i < Policy::rank; ++i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } else { - for (int i = Policy::rank - 1; i >= 0; --i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } - execute_tile(offset, functor, policy); - } - } - } -#endif + execute_tile( + m_functor_reducer.get_functor(), m_policy, m_result_ptr, + std::integral_constant()); } - template + template + inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + Policy arg_policy, const ViewType& arg_result_view) + : m_result_ptr(arg_result_view.data()), + m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr_on_device( + MemorySpaceAccess::accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; -#pragma omp target teams distribute parallel 
for collapse(2) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to \ + : functor) \ + reduction(custom \ + : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - -#pragma omp for collapse(2) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + } else { +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ +reduction(+:result) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } -#endif + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -144,107 +138,119 @@ class ParallelFor, const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
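
The UseReducer branches in these overloads all rely on a user-defined OpenMP reduction declared just before the target construct. A stand-alone sketch of that pattern follows; "MaxWrapper" is a hypothetical stand-in with the same static join/init interface as OpenMPTargetReducerWrapper, not the real class.

    // Sketch of the declare-reduction pattern used in the UseReducer branches.
    #include <cstdio>

    #pragma omp declare target
    struct MaxWrapper {
      static void join(double& dst, const double& src) {
        if (src > dst) dst = src;
      }
      static void init(double& val) { val = -1.0e300; }
    };
    #pragma omp end declare target

    int main() {
      double result;
      MaxWrapper::init(result);
    #pragma omp declare reduction(custom : double : MaxWrapper::join(omp_out, omp_in)) \
        initializer(MaxWrapper::init(omp_priv))
      // Same shape as above, minus collapse: the custom reduction combines the
      // per-thread partials produced across teams.
    #pragma omp target teams distribute parallel for reduction(custom : result)
      for (int i = 0; i < 1000; ++i) MaxWrapper::join(result, (i % 97) * 1.5);
      std::printf("max = %g\n", result);  // expect 144
      return 0;
    }
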
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join( \ + omp_out, omp_in)) \ + initializer( \ + OpenMPTargetReducerWrapper ::init( \ + omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - -#pragma omp for collapse(3) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + } else { +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ +reduction(+:result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } -#endif + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; + const Index begin_2 = policy.m_lower[3]; + const Index begin_3 = policy.m_lower[2]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; const Index end_3 = policy.m_upper[3]; -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
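
Every loop body in these overloads dispatches on the policy's work tag before invoking the functor. Written out as a stand-alone sketch ("MyTag", "Callable" and "invoke" are illustrative names, not Kokkos types):

    #include <cstdio>
    #include <type_traits>

    struct MyTag {};

    struct Callable {
      void operator()(int i, int j) const { std::printf("plain  %d %d\n", i, j); }
      void operator()(MyTag, int i, int j) const { std::printf("tagged %d %d\n", i, j); }
    };

    template <class WorkTag, class Functor>
    void invoke(const Functor& f, int i, int j) {
      if constexpr (std::is_void<WorkTag>::value)
        f(i, j);             // untagged policy: call the plain overload
      else
        f(WorkTag(), i, j);  // tagged policy: prepend a default-constructed tag object
    }

    int main() {
      Callable c;
      invoke<void>(c, 0, 1);   // prints "plain  0 1"
      invoke<MyTag>(c, 2, 3);  // prints "tagged 2 3"
      return 0;
    }
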
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - -#pragma omp for collapse(4) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + } else { +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ +reduction(+:result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } -#endif + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -257,64 +263,65 @@ class ParallelFor, const Index end_3 = policy.m_upper[3]; const Index end_4 = policy.m_upper[4]; -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different 
reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; - -#pragma omp for collapse(5) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } else { +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ +reduction(+:result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } -#endif + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -329,140 +336,69 @@ class ParallelFor, const Index end_4 = policy.m_upper[4]; const Index end_5 = policy.m_upper[5]; -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
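
ParReduceCopy::memcpy_result, which closes each of these overloads, is not defined in this patch; the sketch below is only a plausible reading of the hand-off it performs (copy the reduced scalar into host- or device-resident result storage), not Kokkos's actual implementation.

    #include <cstddef>
    #include <cstring>
    #include <omp.h>

    template <class ValueType>
    void copy_result(ValueType* dst, ValueType* src, std::size_t nbytes,
                     bool dst_is_on_device) {
      if (!dst_is_on_device) {
        std::memcpy(dst, src, nbytes);  // result view is host-accessible
      } else {
        // result view lives on the device: stage the host scalar across
        omp_target_memcpy(dst, src, nbytes, /*dst_offset=*/0, /*src_offset=*/0,
                          omp_get_default_device(), omp_get_initial_device());
      }
    }
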
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); + functor(i0, i1, i2, i3, i4, i5, result); else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); } } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; - - const ptrdiff_t begin_5 = offset[5]; - ptrdiff_t end_5 = begin_5 + policy.m_tile[5]; - end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5]; - -#pragma omp for collapse(6) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) - for (ptrdiff_t i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5); + } else { +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ +reduction(+:result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); + } } -#endif - } - - inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - // TODO DZP: based on a conversation with Christian, we're using 256 as a - // heuristic here. 
We need something better once we can query these kinds of - // properties - template - static int max_tile_size_product(const Policy&, const Functor&) { - return 256; - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -class ParallelReduce, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::MDRangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - - static constexpr bool UseReducer = - !std::is_same_v; - - const pointer_type m_result_ptr; - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - - using ParReduceCopy = ParallelReduceCopy; - - bool m_result_ptr_on_device; - - // Only let one ParallelReduce instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_memory_lock; + } + } + } + } + } - public: - inline void execute() const { - execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr); + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template - inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - Policy arg_policy, const ViewType& arg_result_view) - : m_result_ptr(arg_result_view.data()), - m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} - template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -509,9 +445,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -567,9 +503,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[3]; @@ -630,9 +566,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + 
OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -701,9 +637,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -788,5 +724,4 @@ reduction(+:result) //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#undef KOKKOS_IMPL_MDRANGE_USE_NO_TILES -#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ +#endif /* KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP */ From 4fd18647bff37fb7e8b99bb87797bf21b8603905 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Mon, 29 Apr 2024 12:20:40 +0200 Subject: [PATCH 74/95] Restore previous types when create_mirror_view returns the source view --- containers/src/Kokkos_DynRankView.hpp | 6 ++++-- containers/src/Kokkos_DynamicView.hpp | 6 ++++-- containers/src/Kokkos_OffsetView.hpp | 6 ++++-- core/src/Kokkos_CopyViews.hpp | 6 ++++-- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index a055f22ada0..50a49f3e4fa 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -2041,7 +2041,7 @@ inline auto create_mirror_view(const DynRankView& src, typename DynRankView::data_type, typename DynRankView::HostMirror::data_type>::value ) { - return src; + return typename DynRankView::HostMirror(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } @@ -2051,7 +2051,9 @@ inline auto create_mirror_view(const DynRankView& src, typename Impl::ViewCtorProp::memory_space, T, P...>::is_same_memspace ) { - return src; + return typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index 5f2330a5c05..aab608794c8 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -682,7 +682,7 @@ inline auto create_mirror_view(const Kokkos::Experimental::DynamicView& typename Kokkos::Experimental::DynamicView< T, P...>::HostMirror::data_type>::value ) { - return src; + return typename Kokkos::Experimental::DynamicView::HostMirror(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } @@ -690,7 +690,9 @@ inline auto create_mirror_view(const Kokkos::Experimental::DynamicView& if constexpr (Impl::MirrorDynamicViewType< typename Impl::ViewCtorProp::memory_space, T, P...>::is_same_memspace) { - return src; + return typename Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::view_type(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index bdfd44f3234..9048c549f26 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1924,7 +1924,7 @@ inline auto create_mirror_view(const Kokkos::Experimental::OffsetView& typename 
Kokkos::Experimental::OffsetView::data_type, typename Kokkos::Experimental::OffsetView< T, P...>::HostMirror::data_type>::value) { - return src; + return typename Kokkos::Experimental::OffsetView::HostMirror(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } @@ -1932,7 +1932,9 @@ inline auto create_mirror_view(const Kokkos::Experimental::OffsetView& if constexpr (Impl::MirrorOffsetViewType< typename Impl::ViewCtorProp::memory_space, T, P...>::is_same_memspace) { - return src; + return typename Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::view_type(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index c39dfd108cb..f2c5961eedc 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3562,7 +3562,7 @@ inline auto create_mirror_view(const Kokkos::View& src, typename Kokkos::View< T, P...>::HostMirror::data_type>::value) { check_view_ctor_args_create_mirror(); - return src; + return typename Kokkos::View::HostMirror(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } @@ -3571,7 +3571,9 @@ inline auto create_mirror_view(const Kokkos::View& src, ViewCtorArgs...>::memory_space, T, P...>::is_same_memspace) { check_view_ctor_args_create_mirror(); - return src; + return typename Impl::MirrorViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::view_type(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } From 0140760b74b574e0b609a9adff22eefb3cc6e7ae Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Mon, 29 Apr 2024 14:48:38 +0200 Subject: [PATCH 75/95] Fix linting --- containers/src/Kokkos_DynRankView.hpp | 55 +++++++++++++-------------- containers/src/Kokkos_DynamicView.hpp | 49 ++++++++++++------------ containers/src/Kokkos_OffsetView.hpp | 37 +++++++++--------- core/src/Kokkos_CopyViews.hpp | 11 +++--- 4 files changed, 75 insertions(+), 77 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 50a49f3e4fa..52f46a57708 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1938,10 +1938,8 @@ namespace Impl { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -inline auto create_mirror( - const DynRankView& src, - const Impl::ViewCtorProp& arg_prop) { - +inline auto create_mirror(const DynRankView& src, + const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( @@ -1952,12 +1950,14 @@ inline auto create_mirror( typename Impl::ViewCtorProp::memory_space, T, P...>::view_type; - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); } else { using src_type = DynRankView; using dst_type = typename src_type::HostMirror; - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); } } @@ -2030,30 +2030,28 @@ namespace Impl { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -inline auto create_mirror_view(const DynRankView& src, - [[maybe_unused]] const typename Impl::ViewCtorProp& arg_prop) { +inline auto create_mirror_view( + const DynRankView& src, + [[maybe_unused]] const typename Impl::ViewCtorProp& + arg_prop) { 
if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr ( - std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value - ) { + if constexpr (std::is_same::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename DynRankView< + T, P...>::HostMirror::data_type>::value) { return typename DynRankView::HostMirror(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } } else { - if constexpr ( - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace - ) { + if constexpr (Impl::MirrorDRViewType::memory_space, + T, P...>::is_same_memspace) { return typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type(src); + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } @@ -2132,9 +2130,7 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, arg_prop); } -namespace Impl { - -} // namespace Impl +namespace Impl {} // namespace Impl // create a mirror view and deep copy it // public interface that accepts arbitrary view constructor args passed by a @@ -2150,12 +2146,13 @@ auto create_mirror_view_and_copy( Impl::check_view_ctor_args_create_mirror_view_and_copy(); if constexpr (Impl::MirrorDRViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace) { + typename Impl::ViewCtorProp::memory_space, + T, P...>::is_same_memspace) { // same behavior as deep_copy(src, src) if constexpr (!alloc_prop_input::has_execution_space) fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); return src; } else { using Space = typename alloc_prop_input::memory_space; diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index aab608794c8..4cfba5723fd 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -596,7 +596,7 @@ namespace Impl { // view_alloc template inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp& arg_prop) { + const Impl::ViewCtorProp& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp; check_view_ctor_args_create_mirror(); @@ -614,7 +614,6 @@ inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, return ret; } else { - auto ret = typename Kokkos::Experimental::DynamicView::HostMirror( prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); @@ -669,30 +668,31 @@ namespace Impl { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -inline auto create_mirror_view(const Kokkos::Experimental::DynamicView& src, - [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { +inline auto create_mirror_view( + const Kokkos::Experimental::DynamicView& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr ( - std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename 
Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value - ) { - return typename Kokkos::Experimental::DynamicView::HostMirror(src); + if constexpr (std::is_same::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>::value) { + return + typename Kokkos::Experimental::DynamicView::HostMirror(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } } else { if constexpr (Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::is_same_memspace) { + typename Impl::ViewCtorProp< + ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { return typename Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type(src); + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } @@ -927,9 +927,7 @@ struct ViewCopy, } // namespace Impl -namespace Impl { - -} // namespace Impl +namespace Impl {} // namespace Impl // create a mirror view and deep copy it // public interface that accepts arbitrary view constructor args passed by a @@ -945,12 +943,13 @@ auto create_mirror_view_and_copy( Impl::check_view_ctor_args_create_mirror_view_and_copy(); if constexpr (Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp::memory_space, T, - P...>::is_same_memspace) { + typename Impl::ViewCtorProp::memory_space, + T, P...>::is_same_memspace) { // same behavior as deep_copy(src, src) if constexpr (!alloc_prop_input::has_execution_space) fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); return src; } else { using Space = typename alloc_prop_input::memory_space; diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 9048c549f26..b2db436b4a9 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -1847,7 +1847,7 @@ namespace Impl { // view_alloc template inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { + const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror(); if constexpr (Impl::ViewCtorProp::has_memory_space) { @@ -1912,29 +1912,30 @@ namespace Impl { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -inline auto create_mirror_view(const Kokkos::Experimental::OffsetView& src, - [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { +inline auto create_mirror_view( + const Kokkos::Experimental::OffsetView& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr ( - std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value) { - return typename Kokkos::Experimental::OffsetView::HostMirror(src); + if constexpr (std::is_same::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same::data_type, + typename Kokkos::Experimental::OffsetView< + T, 
P...>::HostMirror::data_type>::value) { + return + typename Kokkos::Experimental::OffsetView::HostMirror(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } } else { - if constexpr (Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::is_same_memspace) { + if constexpr (Impl::MirrorOffsetViewType::memory_space, + T, P...>::is_same_memspace) { return typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type(src); + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index f2c5961eedc..1c0edbf3cf3 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3476,7 +3476,7 @@ void check_view_ctor_args_create_mirror() { // view_alloc template inline auto create_mirror(const Kokkos::View& src, - const Impl::ViewCtorProp& arg_prop) { + const Impl::ViewCtorProp& arg_prop) { check_view_ctor_args_create_mirror(); auto prop_copy = Impl::with_properties_if_unset( @@ -3552,8 +3552,9 @@ namespace Impl { // private interface that accepts arbitrary view constructor args passed by a // view_alloc template -inline auto create_mirror_view(const Kokkos::View& src, - [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { +inline auto create_mirror_view( + const Kokkos::View& src, + [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { if constexpr (std::is_same::memory_space, typename Kokkos::View< @@ -3572,8 +3573,8 @@ inline auto create_mirror_view(const Kokkos::View& src, T, P...>::is_same_memspace) { check_view_ctor_args_create_mirror(); return typename Impl::MirrorViewType< - typename Impl::ViewCtorProp::memory_space, - T, P...>::view_type(src); + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type(src); } else { return Kokkos::Impl::create_mirror(src, arg_prop); } From 5ad1ff43ee8468209cf08179860c8b61d6bb4c68 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Mon, 29 Apr 2024 14:51:42 +0200 Subject: [PATCH 76/95] Remove unused namespaces --- containers/src/Kokkos_DynRankView.hpp | 2 -- containers/src/Kokkos_DynamicView.hpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 52f46a57708..b5af4689866 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -2130,8 +2130,6 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, arg_prop); } -namespace Impl {} // namespace Impl - // create a mirror view and deep copy it // public interface that accepts arbitrary view constructor args passed by a // view_alloc diff --git a/containers/src/Kokkos_DynamicView.hpp b/containers/src/Kokkos_DynamicView.hpp index 4cfba5723fd..ff4b5dc171d 100644 --- a/containers/src/Kokkos_DynamicView.hpp +++ b/containers/src/Kokkos_DynamicView.hpp @@ -927,8 +927,6 @@ struct ViewCopy, } // namespace Impl -namespace Impl {} // namespace Impl - // create a mirror view and deep copy it // public interface that accepts arbitrary view constructor args passed by a // view_alloc From 77ea52f97685908e62e11f10fe263f51f4fc0c46 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 30 Apr 2024 20:49:51 -0400 Subject: [PATCH 77/95] Threads: Don't silently allow m_instance to be a nullptr (#6969) * Threads: Don't silently allod m_instance to be a nullptr * Assert that 
m_instance is not nullptr --- core/src/Threads/Kokkos_Threads_Team.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Team.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp index fd0f221365b..a3501a437d2 100644 --- a/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -188,8 +188,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return value; - if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -229,8 +227,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return; - type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution @@ -285,8 +281,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return type(0); - volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -358,6 +352,7 @@ class ThreadsExecTeamMember { m_chunk_size(team.chunk_size()), m_league_chunk_end(0), m_team_alloc(team.team_alloc()) { + KOKKOS_ASSERT(m_instance != nullptr); if (team.league_size()) { // Execution is using device-team interface: From f699a2c7a2668832e74747ed4816bef683937ba7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 30 Apr 2024 21:44:09 -0400 Subject: [PATCH 78/95] Fix enabling OpenMP with HIP and "compile as CMake language" --- cmake/kokkos_tpls.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index df01f200d13..c7c352ae35f 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -111,6 +111,9 @@ IF (Kokkos_ENABLE_OPENMP) ELSE() KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) ENDIF() + IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + ENDIF() ENDIF() #Convert list to newlines (which CMake doesn't always like in cache variables) From 2574b802922d2d13cdd80b2b171ff5f3cd5ef15b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 30 Apr 2024 21:49:46 -0400 Subject: [PATCH 79/95] Fix OpenMP+CUDA when `Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE` is `ON` Co-Authored-By: Daniel Arndt --- cmake/kokkos_tpls.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index c7c352ae35f..cda9e0d6004 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -114,6 +114,9 @@ IF (Kokkos_ENABLE_OPENMP) IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) ENDIF() + IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + ENDIF() ENDIF() #Convert list to newlines (which CMake doesn't always like in cache variables) From ccd0126b88a08b27c86f11ecd0447454b96b0b52 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 1 May 2024 12:03:36 -0400 Subject: [PATCH 80/95] Fix fedora CI builds with flang-new --- example/build_cmake_installed/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/example/build_cmake_installed/CMakeLists.txt b/example/build_cmake_installed/CMakeLists.txt index aaf745b418d..c025f1d7d28 100644 --- a/example/build_cmake_installed/CMakeLists.txt +++ b/example/build_cmake_installed/CMakeLists.txt @@ -12,6 +12,7 @@ find_package(Kokkos REQUIRED) add_executable(example cmake_example.cpp foo.f) 
if(CMAKE_Fortran_COMPILER_ID STREQUAL LLVMFlang) set_target_properties(example PROPERTIES LINKER_LANGUAGE Fortran) + target_link_options(example PRIVATE -fno-fortran-main) endif() # This is the only thing required to set up compiler/linker flags From 45a14049163732fbc5eb249282d15424ebb91d55 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 2 May 2024 09:22:10 -0600 Subject: [PATCH 81/95] Fix Copyright file --- Copyright.txt | 49 ++++++++----------------------------------------- LICENSE | 10 ---------- 2 files changed, 8 insertions(+), 51 deletions(-) diff --git a/Copyright.txt b/Copyright.txt index 5e2f8d8647b..cbba3efc7bc 100644 --- a/Copyright.txt +++ b/Copyright.txt @@ -1,41 +1,8 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER +************************************************************************ + + Kokkos v. 4.0 + Copyright (2022) National Technology & Engineering + Solutions of Sandia, LLC (NTESS). + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. Government retains certain rights in this software. diff --git a/LICENSE b/LICENSE index 6572cc2db05..4d9d69d7c44 100644 --- a/LICENSE +++ b/LICENSE @@ -1,13 +1,3 @@ - ************************************************************************ - - Kokkos v. 4.0 - Copyright (2022) National Technology & Engineering - Solutions of Sandia, LLC (NTESS). - - Under the terms of Contract DE-NA0003525 with NTESS, - the U.S. Government retains certain rights in this software. 
- - ============================================================================== Kokkos is under the Apache License v2.0 with LLVM Exceptions: ============================================================================== From c6d86474a83c460e9de37bba938b63f5d9580070 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 2 May 2024 16:10:07 -0400 Subject: [PATCH 82/95] Also use is_nothrow_swappable workaround for Intel Classic Compilers (#6983) * Also use is_nothrow_swappable workaround for Intel Classic Compilers * Use template parameter U directly in kokkos_swap overload --- core/src/Kokkos_Array.hpp | 2 +- core/src/Kokkos_Swap.hpp | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 29e5edd9baa..0a1ced93c8f 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -138,7 +138,7 @@ struct Array { friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> kokkos_swap(Array& a, - Array& b) noexcept(Impl::is_nothrow_swappable_v) { + Array& b) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { kokkos_swap(a[i], b[i]); } diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp index 907f8607a7e..2f849a13ab6 100644 --- a/core/src/Kokkos_Swap.hpp +++ b/core/src/Kokkos_Swap.hpp @@ -49,17 +49,9 @@ struct is_swappable { !std::is_same_v(0)), Nope>; }; -#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) -template -inline constexpr bool is_nothrow_swappable_v = - is_swappable::value&& noexcept( - kokkos_swap(std::declval&>(), - std::declval&>())); -#else template inline constexpr bool is_nothrow_swappable_v = noexcept(kokkos_swap(std::declval(), std::declval())); -#endif } // namespace Impl From 69567f3051dbdc51e65221c8b68ee41849749a4f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 3 May 2024 08:57:09 -0400 Subject: [PATCH 83/95] Add thread-safety tests (#6938) * Add thread-safety tests * Disable thread-safety tests for Serial and OpenMP for now * Cleanup include and namespace * Skip tests for OpenACC in CMakeLists.txt * Avoid std::move * Comment on tests * Use more atomics * Simplify test --- core/unit_test/CMakeLists.txt | 7 + core/unit_test/TestExecSpaceThreadSafety.hpp | 319 +++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 core/unit_test/TestExecSpaceThreadSafety.hpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 3b14bec03a2..4d0ce3b22e3 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -148,6 +148,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Crs DeepCopyAlignment ExecSpacePartitioning + ExecSpaceThreadSafety ExecutionSpace FunctorAnalysis HostSharedPtr @@ -426,6 +427,7 @@ if(Kokkos_ENABLE_OPENACC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ExecSpaceThreadSafety.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp @@ -637,6 +639,8 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) endif() if(Kokkos_ENABLE_SERIAL) + list(REMOVE_ITEM Serial_SOURCES1 + 
${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_ExecSpaceThreadSafety.cpp) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_Serial1 SOURCES @@ -667,6 +671,9 @@ if(Kokkos_ENABLE_THREADS) endif() if (Kokkos_ENABLE_OPENMP) + list(REMOVE_ITEM OpenMP_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/openmp/TestOpenMP_ExecSpaceThreadSafety.cpp) + set(OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp ) diff --git a/core/unit_test/TestExecSpaceThreadSafety.hpp b/core/unit_test/TestExecSpaceThreadSafety.hpp new file mode 100644 index 00000000000..20b802babe0 --- /dev/null +++ b/core/unit_test/TestExecSpaceThreadSafety.hpp @@ -0,0 +1,319 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +#ifdef KOKKOS_ENABLE_OPENMP +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { +#pragma omp parallel num_threads(2) + { + if (omp_get_thread_num() == 0) l1(); + if (omp_get_thread_num() == 1) l2(); + } +} +// We cannot run the multithreaded test when threads or HPX is enabled because +// we cannot launch a thread from inside another thread +#elif !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_HPX) +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + std::thread t1(l1); + std::thread t2(l2); + t1.join(); + t2.join(); +} +#else +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + l1(); + l2(); +} +#endif + +// The idea for all of these tests is to access a View from kernels submitted by +// two different threads to the same execution space instance. If the kernels +// are executed concurrently, we expect to count too many increments. 
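
A stand-alone distillation of the tests that follow (it uses the default execution space instead of TEST_EXECSPACE and drops GoogleTest): two host threads share one execution space instance, and each kernel checks that no kernel submitted by the other thread interleaved with it.

    #include <Kokkos_Core.hpp>
    #include <cstdio>
    #include <thread>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        constexpr int N = 1000000;
        Kokkos::DefaultExecutionSpace exec;  // the shared instance
        Kokkos::View<int, Kokkos::DefaultExecutionSpace> count("count");
        Kokkos::View<int, Kokkos::DefaultExecutionSpace> error("error");

        auto submit = [=]() {
          Kokkos::parallel_for(
              Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(exec, 0, 1),
              KOKKOS_LAMBDA(int) {
                Kokkos::atomic_store(count.data(), 0);
                for (int i = 0; i < N; ++i) Kokkos::atomic_inc(count.data());
                if (Kokkos::atomic_load(count.data()) != N)
                  Kokkos::atomic_store(error.data(), 1);  // another kernel interleaved
              });
        };

        std::thread t1(submit), t2(submit);
        t1.join();
        t2.join();
        exec.fence();

        auto h_error =
            Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error);
        std::printf("interleaving detected: %s\n", h_error() ? "yes" : "no");
      }
      Kokkos::finalize();
      return 0;
    }
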
+void run_exec_space_thread_safety_range() { + constexpr int N = 10000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::RangePolicy(exec, 0, 1), KOKKOS_LAMBDA(int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_range(); +} + +void run_exec_space_thread_safety_mdrange() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_mdrange(); +} + +void run_exec_space_thread_safety_team_policy() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member) { + Kokkos::single(Kokkos::PerTeam(team_member), [=]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + run_exec_space_thread_safety_team_policy(); +} + +void run_exec_space_thread_safety_range_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + 
run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_reduce) { + run_exec_space_thread_safety_range_reduce(); +} + +void run_exec_space_thread_safety_mdrange_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange_reduce) { +// FIXME_INTEL +#ifdef KOKKOS_COMPILER_INTEL + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMP using the " + "legacy Intel compiler"; +#endif + run_exec_space_thread_safety_mdrange_reduce(); +} + +void run_exec_space_thread_safety_team_policy_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member, + int &update) { + Kokkos::single(Kokkos::PerTeam(team_member), [=, &update]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }); + }, + error); + } + }; + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy_reduce) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + // FIXME_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is know to fail with SYCL+Cuda"; +#endif + run_exec_space_thread_safety_team_policy_reduce(); +} + +void run_exec_space_thread_safety_range_scan() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_scan( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &, const bool final) { + if (final) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + } + }); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_scan) { + run_exec_space_thread_safety_range_scan(); +} + +} // namespace From 
9c7920291d7fc100ed94133da0dbe2412c8d2f05 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 May 2024 18:30:04 -0400 Subject: [PATCH 84/95] Fix deprecation warnings with GCC for pair comparison operators Co-Authored-By: Andrey Prokopenko --- core/src/Kokkos_Pair.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 9c3516eb222..2b7f275d06d 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -450,37 +450,37 @@ struct KOKKOS_DEPRECATED pair { // template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( const pair& lhs, const pair& rhs) { return lhs.first == rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( const pair& lhs, const pair& rhs) { return !(lhs == rhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( const pair& lhs, const pair& rhs) { return lhs.first < rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( const pair& lhs, const pair& rhs) { return !(rhs < lhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( const pair& lhs, const pair& rhs) { return rhs < lhs; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } From 7b8e3a68fcbf3a8deef67dbf5287c5331c73df3a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 May 2024 17:23:34 +0000 Subject: [PATCH 85/95] Fix TPL_LIBRARY_SUFFIXES for 32-bit build --- cmake/kokkos_functions.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/kokkos_functions.cmake b/cmake/kokkos_functions.cmake index 9dab1ca00ea..d1f1e0d7a78 100644 --- a/cmake/kokkos_functions.cmake +++ b/cmake/kokkos_functions.cmake @@ -709,7 +709,12 @@ MACRO(kokkos_find_imported NAME) ENDIF() IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib lib64) + SET(TPL_LIBRARY_SUFFIXES lib) + IF(KOKKOS_IMPL_32BIT) + LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) + ELSE() + LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) + ENDIF() ENDIF() SET(${NAME}_INCLUDE_DIRS) From 28260178f4d68fb2bfb7ccfc8d7239e264b1d166 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 May 2024 13:34:15 -0400 Subject: [PATCH 86/95] Avoid duplicated definition of KOKKOS_IMPL_32BIT --- .github/workflows/continuous-integration-workflow-32bit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml index 87c21d3a6e7..0260cb5894a 100644 --- a/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -36,7 +36,7 @@ jobs: -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DCMAKE_CXX_FLAGS="-Werror -m32 -DKOKKOS_IMPL_32BIT" \ + -DCMAKE_CXX_FLAGS="-Werror -m32" \ -DCMAKE_CXX_COMPILER=g++ \ -DCMAKE_BUILD_TYPE=RelWithDebInfo - name: Build From ccadc7d9ba2be086be38a8f1731c5df9339fbe06 Mon Sep 17 00:00:00 2001 
From: Daniel Arndt Date: Mon, 6 May 2024 14:15:50 -0400 Subject: [PATCH 87/95] Disable failing parallel_scan_with_reducers test --- core/unit_test/TestTeamVector.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 5e16539d652..e278789992f 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -1067,6 +1067,10 @@ TEST(TEST_CATEGORY, parallel_scan_with_reducers) { } #endif +#ifdef KOKKOS_IMPL_32BIT + GTEST_SKIP() << "Failing KOKKOS_IMPL_32BIT"; // FIXME_32BIT +#endif + checkScan>() .run(); From d61d75aceceacf4c3b5a6463626f58826ce47849 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Wed, 8 May 2024 07:41:50 -0400 Subject: [PATCH 88/95] Fix a bug when using realloc on views of non-default constructible element types (#6993) * Add few missing constexpr for alloc_prop_input Co-authored-by: Daniel Arndt * Update tests * Fix DualView * Address review comments * Add missing decorators * Move NoDefaultConstructor out of function --------- Co-authored-by: Daniel Arndt --- containers/src/Kokkos_DualView.hpp | 8 ++++---- core/src/Kokkos_CopyViews.hpp | 10 ++++++++-- core/unit_test/TestRealloc.hpp | 13 +++++++++++++ core/unit_test/TestResize.hpp | 13 +++++++++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index e821570a8d5..1fb174943fe 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -944,13 +944,13 @@ class DualView : public ViewTraits { if (sizeMismatch) { ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { h_view = create_mirror_view(typename t_host::memory_space(), d_view); } else { h_view = create_mirror_view(Kokkos::WithoutInitializing, typename t_host::memory_space(), d_view); } - } else if (alloc_prop_input::initialize) { + } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -1038,7 +1038,7 @@ class DualView : public ViewTraits { /* Resize on Device */ if (sizeMismatch) { ::Kokkos::resize(properties, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { h_view = create_mirror_view(typename t_host::memory_space(), d_view); } else { h_view = create_mirror_view(Kokkos::WithoutInitializing, @@ -1054,7 +1054,7 @@ class DualView : public ViewTraits { /* Resize on Host */ if (sizeMismatch) { ::Kokkos::resize(properties, h_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { d_view = create_mirror_view(typename t_dev::memory_space(), h_view); } else { diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index ee8d1e09d3a..40fdd590f6f 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3235,7 +3235,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, v = view_type(); // Best effort to deallocate in case no other view refers // to the shared allocation v = view_type(arg_prop_copy, n0, n1, n2, n3, n4, n5, n6, n7); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = 
Impl::get_property(arg_prop); @@ -3330,7 +3333,10 @@ impl_realloc(Kokkos::View& v, if (v.layout() != layout) { v = view_type(); // Deallocate first, if the only view to allocation v = view_type(arg_prop, layout); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); diff --git a/core/unit_test/TestRealloc.hpp b/core/unit_test/TestRealloc.hpp index 2c9dc5ee473..f30c9e15e1c 100644 --- a/core/unit_test/TestRealloc.hpp +++ b/core/unit_test/TestRealloc.hpp @@ -144,6 +144,11 @@ void impl_testRealloc() { EXPECT_EQ(oldPointer, newPointer); } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; template void testRealloc() { @@ -154,6 +159,14 @@ void testRealloc() { impl_testRealloc(); // without data initialization } + // Check #6992 fix (no default initialization in realloc without initializing) + { + using view_type = Kokkos::View; + view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + realloc_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewRealloc diff --git a/core/unit_test/TestResize.hpp b/core/unit_test/TestResize.hpp index 13d7e16d589..3102d2b9a16 100644 --- a/core/unit_test/TestResize.hpp +++ b/core/unit_test/TestResize.hpp @@ -358,6 +358,12 @@ void impl_testResize() { } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; + template void testResize() { { @@ -367,6 +373,13 @@ void testResize() { impl_testResize(); // without data initialization } + { + using view_type = Kokkos::View; + view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + resize_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewResize From 50a862cf63d532d3de6d7dc2767279d0725cd05d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 May 2024 12:05:05 -0400 Subject: [PATCH 89/95] SYCL: Prepare Parallel* for Graphs (#6988) * SYCL: Make Parallel* copyable * Address review comments * Refactor Team policies further * Fix alias for SYCL TeamPolicy ParallelReduce * Improve const-correctness in Kokkos_SYCL_ParallelReduce_Team * Fix up Kokkos_SYCL_ParallelReduce_Team.hpp --- .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 6 -- .../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 6 -- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 53 ++++++------ .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 14 ++-- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 14 ++-- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 84 +++++++++---------- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 14 ++-- 7 files changed, 83 insertions(+), 108 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index 7fbf5420f83..b58885192b9 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -181,12 +181,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : 
m_functor(arg_functor), m_policy(arg_policy), diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index b4de7eb89ff..2f8db922d3d 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -137,12 +137,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy) {} }; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index d98f4837315..57ff97e7f31 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -28,7 +28,7 @@ template class Kokkos::Impl::ParallelFor, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using functor_type = FunctorType; using size_type = ::Kokkos::Experimental::SYCL::size_type; @@ -44,19 +44,14 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl_device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor instance at a time use the team scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; template - sycl::event sycl_direct_launch(const Policy& policy, + sycl::event sycl_direct_launch(const sycl_device_ptr global_scratch_ptr, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); + const Kokkos::Experimental::SYCL& space = m_policy.space(); sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -72,7 +67,6 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -125,17 +119,31 @@ class Kokkos::Impl::ParallelFor, inline void execute() const { if (m_league_size == 0) return; - auto& space = *m_policy.space().impl_internal_space_instance(); + auto& instance = *m_policy.space().impl_internal_space_instance(); + + // Only let one instance at a time resize the instance's scratch memory + // allocations. + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = space.get_indirect_kernel_mem(); + indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( m_functor, indirectKernelMem); - sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, + sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); - space.register_team_scratch_event(m_scratch_pool_id, event); + instance.register_team_scratch_event(scratch_pool_id, event); } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -143,10 +151,7 @@ class Kokkos::Impl::ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = @@ -159,22 +164,14 @@ class Kokkos::Impl::ParallelFor, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const auto& instance = *m_policy.space().impl_internal_space_instance(); + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! 
" "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index c1414ee0581..79f8afd4a3d 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -77,9 +77,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( - m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -330,6 +328,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -349,10 +353,6 @@ class Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index dbe2366b8bc..2bad7749759 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -50,9 +50,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( - p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -347,6 +345,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -366,10 +370,6 @@ class Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 1332fafde94..43c6ca44019 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -30,7 +30,7 @@ class Kokkos::Impl::ParallelReduce, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; @@ -54,24 +54,18 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; const size_type m_vector_size; - // Only let one ParallelReduce instance at a time use the team scratch memory - // and the host scratch memory. The constructor acquires the mutex which is - // released in the destructor. 
- std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; - template + template sycl::event sycl_direct_launch( - const PolicyType& policy, + const sycl_device_ptr global_scratch_ptr, const CombinedFunctorReducerWrapper& functor_reducer_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); + const Kokkos::Experimental::SYCL& space = m_policy.space(); Kokkos::Experimental::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -113,7 +107,6 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -170,7 +163,6 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = @@ -386,6 +378,22 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -395,14 +403,24 @@ class Kokkos::Impl::ParallelReduce + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = m_policy.team_size_recommended( @@ -423,22 +441,15 @@ class Kokkos::Impl::ParallelReduce>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const Kokkos::Experimental::Impl::SYCLInternal& instance = + *m_policy.space().impl_internal_space_instance(); + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! 
" "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } @@ -448,25 +459,6 @@ class Kokkos::Impl::ParallelReduce requested too large team size."); } - - public: - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { - initialize(); - } }; #endif diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index bfc3fba7412..b3d3e9e35ce 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -150,10 +150,6 @@ class ParallelScanSYCLBase { pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one ParallelScan instance at a time use the host scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_buffers_lock; - private: template sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, @@ -367,6 +363,11 @@ class ParallelScanSYCLBase { auto& instance = *m_policy.space().impl_internal_space_instance(); + // Only let one instance at a time resize the instance's scratch memory + // allocations. + std::scoped_lock scratch_buffers_lock( + instance.m_mutexScratchSpace); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -385,10 +386,7 @@ class ParallelScanSYCLBase { : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_scratch_buffers_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} }; } // namespace Kokkos::Impl From f5b34222c166c71e86ea44fd7867d443ba25856e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 May 2024 13:27:28 -0400 Subject: [PATCH 90/95] SYCL: Fix deprecation in custom parallel_for RangePolicy implementation --- core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index b4de7eb89ff..341c6c335d4 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -43,8 +43,8 @@ template struct FunctorWrapperRangePolicyParallelForCustom { using WorkTag = typename Policy::work_tag; - void operator()(sycl::item<1> item) const { - const typename Policy::index_type id = item.get_linear_id(); + void operator()(sycl::nd_item<1> item) const { + const typename Policy::index_type id = item.get_global_linear_id(); if (id < m_work_size) { const auto shifted_id = id + m_begin; if constexpr (std::is_void_v) From 37986fde4cee878aa4d9f60ae11f7ea6c80976ff Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 8 May 
2024 14:18:41 -0600 Subject: [PATCH 91/95] [ci skip] update changelog for 4.3.1 (#6995) * [ci skip] update changelog for 4.3.1 * changelog: fixup --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c70ee5505f8..f8d288db5da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # CHANGELOG +## [4.3.01](https://github.com/kokkos/kokkos/tree/4.3.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.00...4.3.01) + +### Backend and Architecture Enhancements: + +#### HIP: +* MI300 support unified memory [\#6877](https://github.com/kokkos/kokkos/pull/6877) + +### Bug Fixes +* Serial: Use the provided execution space instance in TeamPolicy [\#6951](https://github.com/kokkos/kokkos/pull/6951) +* `nvcc_wrapper`: bring back support for `--fmad` option [\#6931](https://github.com/kokkos/kokkos/pull/6931) +* Fix CUDA reduction overflow for `RangePolicy` [\#6578](https://github.com/kokkos/kokkos/pull/6578) + ## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) From 7cad3e7c3b66bf3e6f4de75bd9043abde9f8194a Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Wed, 8 May 2024 14:01:34 -0700 Subject: [PATCH 92/95] OpenMPTarget: Use mutex lock for parallel scan. --- .../Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index c1f7851f413..c886c397966 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -48,6 +48,10 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; + // Only let one ParallelScan instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_memory_lock; + template std::enable_if_t::value> call_with_tag( const FunctorType& f, const idx_type& idx, value_type& val, @@ -197,7 +201,8 @@ class ParallelScan, : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} //---------------------------------------- }; From 00170ae80cb54b39dd11f77dcef4318c754afd5a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 9 May 2024 11:33:01 -0400 Subject: [PATCH 93/95] Remove cuSPARSE TPL It looks like an oversight. It is unused. Example code that referred to it was removed in #2688 because it was just sitting there, i.e. not built nor tested. KokkosKernels has its own CMake logic to find it and link against it. 
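For readers who do need cuSPARSE in a downstream project, a minimal CMake sketch of finding and linking it without any Kokkos TPL machinery (assumes CMake 3.17+ for FindCUDAToolkit; the my_app target is hypothetical, and this is not necessarily the exact logic KokkosKernels uses):

# Illustrative only: rely on CMake's own CUDA toolkit discovery rather than a
# Kokkos-provided TPL definition.
find_package(CUDAToolkit REQUIRED)

add_executable(my_app main.cpp)  # hypothetical downstream target
target_link_libraries(my_app PRIVATE CUDA::cusparse)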
--- cmake/Dependencies.cmake | 1 - cmake/deps/CUDA.cmake | 1 - cmake/deps/CUSPARSE.cmake | 26 -------------------------- cmake/tpls/FindTPLCUSPARSE.cmake | 26 -------------------------- 4 files changed, 54 deletions(-) delete mode 100644 cmake/deps/CUSPARSE.cmake delete mode 100644 cmake/tpls/FindTPLCUSPARSE.cmake diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 611c089b2e3..fb1e73b5799 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - TEST_OPTIONAL_TPLS CUSPARSE ) TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) diff --git a/cmake/deps/CUDA.cmake b/cmake/deps/CUDA.cmake index 68bf5b3d579..5b6afd61512 100644 --- a/cmake/deps/CUDA.cmake +++ b/cmake/deps/CUDA.cmake @@ -35,7 +35,6 @@ IF(NOT _CUDA_FAILURE) GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) ELSE() SET(TPL_ENABLE_CUDA OFF) ENDIF() diff --git a/cmake/deps/CUSPARSE.cmake b/cmake/deps/CUSPARSE.cmake deleted file mode 100644 index b016971ab91..00000000000 --- a/cmake/deps/CUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ************************************************************************ -# @HEADER - -#include(${TRIBITS_DEPS_DIR}/CUDA.cmake) - -#IF (TPL_ENABLE_CUDA) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) -# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -# KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) -#ENDIF() - diff --git a/cmake/tpls/FindTPLCUSPARSE.cmake b/cmake/tpls/FindTPLCUSPARSE.cmake deleted file mode 100644 index 4709f8002b1..00000000000 --- a/cmake/tpls/FindTPLCUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#@HEADER - -# Check for CUDA support - -IF (NOT TPL_ENABLE_CUDA) - MESSAGE(FATAL_ERROR "\nCUSPARSE requires CUDA") -ELSE() - GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) - GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -ENDIF() - From 1d9d0df2eecfce635fe5c77559eb17adfa128d04 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 10 May 2024 11:49:19 -0400 Subject: [PATCH 94/95] SYCL: Print submission command queue property (#7004) * SYCL: Print submission command queue property * Also print SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS if set * Reword printout for environment variable Co-authored-by: Damien L-G --------- Co-authored-by: Damien L-G --- core/src/SYCL/Kokkos_SYCL.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 9a246f7642f..de5ddf405d4 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -110,6 +110,26 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif +#ifdef SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST + if (sycl_queue() + .has_property< + sycl::ext::intel::property::queue::immediate_command_list>()) + os << "Immediate command lists enforced\n"; + else if (sycl_queue() + .has_property()) + os << "Standard command queue enforced\n"; + else +#endif + { + os << "Immediate command lists and standard command queue allowed.\n"; + if (const char* environment_setting = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS")) + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=" + << environment_setting << " takes precedence.\n"; + else + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS not defined.\n"; + } int counter = 0; int active_device = Kokkos::device_id(); From df018d97f52a7bfa29cf7d29fbda42f244001f40 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 13 May 2024 22:59:25 +0200 Subject: [PATCH 95/95] Suppress deprecated warnings via pragma push/pop in the tests (#6999) * Introduce `KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_{PUSH,POP} macros to suppress diagnostics when appropriate * Suppress all deprecated warnings I can see in tests * Update EDG diag suppress to fix the Intel Compiler Classic and provide a fallback empty definition for the macros --- containers/unit_tests/TestVector.hpp | 2 ++ core/src/Kokkos_Macros.hpp | 25 +++++++++++++++++++ core/unit_test/TestArrayOps.hpp | 2 ++ .../incremental/Test01_execspace.hpp | 2 ++ simd/unit_tests/include/SIMDTesting_Ops.hpp | 2 ++ 5 files changed, 33 insertions(+) diff --git a/containers/unit_tests/TestVector.hpp b/containers/unit_tests/TestVector.hpp index a7d341b789d..19901a52ad5 100644 --- a/containers/unit_tests/TestVector.hpp +++ b/containers/unit_tests/TestVector.hpp @@ -21,6 +21,8 @@ #include #include #include +#include +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #include namespace Test { diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index b255d2a5195..27b32b15214 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -562,6 +562,31 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_WARNING(desc) KOKKOS_IMPL_DO_PRAGMA(message(#desc)) #endif +// clang-format off +#if defined(__EDG__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning push") \ + 
_Pragma("warning disable 1478") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning pop") +#elif defined(__GNUC__) || defined(__clang__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("GCC diagnostic pop") +#elif defined(_MSC_VER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning(push)") \ + _Pragma("warning(disable: 4996)") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning(pop)") +#else + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif +// clang-format on + #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ diff --git a/core/unit_test/TestArrayOps.hpp b/core/unit_test/TestArrayOps.hpp index 387589fbe88..6b8e0f3aca3 100644 --- a/core/unit_test/TestArrayOps.hpp +++ b/core/unit_test/TestArrayOps.hpp @@ -112,6 +112,7 @@ TEST(TEST_CATEGORY, array_zero_data_nullptr) { } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() TEST(TEST_CATEGORY, array_contiguous_capacity) { using A = Kokkos::Array::contiguous>; @@ -390,6 +391,7 @@ TEST(TEST_CATEGORY, array_strided_assignment) { ASSERT_EQ(e.max_size(), std::size(ee) / eStride); ASSERT_EQ(e[0], ee[0]); } +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif } // namespace diff --git a/core/unit_test/incremental/Test01_execspace.hpp b/core/unit_test/incremental/Test01_execspace.hpp index d7b2a57b442..a7fa26c7282 100644 --- a/core/unit_test/incremental/Test01_execspace.hpp +++ b/core/unit_test/incremental/Test01_execspace.hpp @@ -63,7 +63,9 @@ struct TestIncrExecSpace { ASSERT_GT(concurrency, 0); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() int in_parallel = ExecSpace::in_parallel(); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() ASSERT_FALSE(in_parallel); #endif diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index c587ccf3046..74141f25316 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -81,7 +81,9 @@ class absolutes { auto on_host(T const& a) const { if constexpr (std::is_signed_v) { #if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() return Kokkos::Experimental::abs(a); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #else return Kokkos::abs(a); #endif