From a35c7b4bea7d5cb81e172abb2c4c988f4aac392e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 16 Feb 2022 14:33:17 +0000 Subject: [PATCH 01/89] Fix CUDA memory check for large array sizes Closes #123 --- src/cuda/CUDAStream.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index b467d00f..778a0445 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -51,7 +51,7 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) // Check buffers fit on the device cudaDeviceProp props; cudaGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers From 5645b0290d13d401016386d8d9f01ea9567286a9 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 16 Feb 2022 14:36:20 +0000 Subject: [PATCH 02/89] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 903cb02a..29587937 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to this project will be documented in this file. ## Unreleased -- None +- Fix CUDA memory limit check. 
## [v4.0] - 2021-12-22 From e77a34158ce3ee00c11c66d0bf0a0e05c0c3ea7b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 16 Feb 2022 14:37:58 +0000 Subject: [PATCH 03/89] fix memory limit check for HIP --- src/hip/HIPStream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index fbc3b712..6aed1ee1 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -54,7 +54,7 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers From 7b2bd5427c7fbbe56620eb97d87c1c5f6f047ea0 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 31 Mar 2022 14:50:10 +0100 Subject: [PATCH 04/89] Fix missing counting iterator operators for stdpar --- src/std-indices/STDIndicesStream.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index bc068aa9..26c7cb0d 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -34,6 +34,7 @@ class ranged { iterator& operator++() { num++; return *this; } iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } iterator operator+(const value_type v) const { return iterator(num + v); } + iterator operator+=(int x) { iterator retval = *this; this->num+=x; return retval; } bool operator==(iterator other) const { return num == other.num; } bool operator!=(iterator other) const { return *this != other; } From 6185d3aca6e89b064b599fa2f83e19272a7a0e13 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 1 Apr 2022 10:51:24 +0100 Subject: [PATCH 05/89] Use long double for check solution in case of very large problem sizes --- CHANGELOG.md | 1 
+ src/main.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 29587937..cc135f97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file. ## Unreleased - Fix CUDA memory limit check. +- Use long double for `check_solution` in case of large problem size. ## [v4.0] - 2021-12-22 diff --git a/src/main.cpp b/src/main.cpp index 3035da0c..c9d76942 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -487,15 +487,15 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldSum = goldA * goldB * ARRAY_SIZE; // Calculate the average error - double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); + long double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); errA /= a.size(); - double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); + long double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); errB /= b.size(); - double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); + long double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); errC /= c.size(); - double errSum = fabs((sum - goldSum)/goldSum); + long double errSum = fabs((sum - goldSum)/goldSum); - double epsi = std::numeric_limits::epsilon() * 100.0; + long double epsi = std::numeric_limits::epsilon() * 100.0; if (errA > epsi) std::cerr From fdb0ef8af846af647018c480e5e413f330532b63 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 7 Apr 2022 23:22:54 +0100 Subject: [PATCH 06/89] Bump CI NVHPC version --- src/ci-prepare-bionic.sh | 19 ++++++++++++------- 1 file changed, 12 
insertions(+), 7 deletions(-) diff --git a/src/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh index 656d3384..0684f353 100755 --- a/src/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -133,21 +133,26 @@ setup_aocc() { } setup_nvhpc() { - echo "Preparing Nvidia HPC SDK" - local tarball="nvhpc.tar.gz" -# local url="http://localhost:8000/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" - local url="https://developer.download.nvidia.com/hpc-sdk/21.9/nvhpc_2021_219_Linux_x86_64_cuda_11.4.tar.gz" + echo "Preparing Nvidia HPC SDK" + local nvhpc_ver="22.3" + local nvhpc_release="2022_223" + local cuda_ver="11.6" + + local tarball="nvhpc_$nvhpc_ver.tar.gz" + + local url="https://developer.download.nvidia.com/hpc-sdk/$nvhpc_ver/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver.tar.gz" get_and_untar "$tarball" "$url" - local sdk_dir="$PWD/nvhpc_2021_219_Linux_x86_64_cuda_11.4/install_components/Linux_x86_64/21.9" + local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver" local bin_dir="$sdk_dir/compilers/bin" "$bin_dir/makelocalrc" "$bin_dir" -x export_var NVHPC_SDK_DIR "$sdk_dir" - export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/11.4" + export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver" export_var NVHPC_NVCXX "$bin_dir/nvc++" - export_var NVHPC_NVCC "$sdk_dir/cuda/11.4/bin/nvcc" + export_var NVHPC_NVCC "$bin_dir/nvcc" +# export_var NVHPC_NVCC "$sdk_dir/cuda/$cuda_ver/bin/nvcc" echo "Installed CUDA versions:" ls "$sdk_dir/cuda" From b27def135e9a1eb46e53c9d45396dea9077be204 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 8 Apr 2022 05:34:15 +0100 Subject: [PATCH 07/89] Sync CUDA version with CI runner --- src/ci-prepare-bionic.sh | 1 + src/ci-test-compile.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh index 0684f353..78bbd330 100755 --- a/src/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -152,6 +152,7 @@ setup_nvhpc() { export_var 
NVHPC_NVCXX "$bin_dir/nvc++" export_var NVHPC_NVCC "$bin_dir/nvcc" + export_var NVHPC_CUDA_VER "$cuda_ver" # export_var NVHPC_NVCC "$sdk_dir/cuda/$cuda_ver/bin/nvcc" echo "Installed CUDA versions:" diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 93886438..7e17379c 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -122,7 +122,7 @@ run_build() { AMD_ARCH="gfx_903" NV_ARCH="sm_70" -NV_ARCH_CCXY="cuda11.4,cc80" +NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80" build_gcc() { local name="gcc_build" From 0f264081d75ddf315ecff461af17c8db6e3d4b78 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 8 Apr 2022 19:43:15 +0100 Subject: [PATCH 08/89] Fix Thrust/CUB path --- src/ci-test-compile.sh | 8 ++++---- src/thrust/model.cmake | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 7e17379c..a7c5bab5 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -175,9 +175,9 @@ build_gcc() { local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) local required="3.15.0" if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=OMP" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=CPP" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH 
-DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP" # FIXME CUDA Thrust + TBB throws the following error: # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined @@ -187,7 +187,7 @@ build_gcc() { # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1365): error: identifier "__builtin_ia32_fpclassss" is undefined # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512dqintrin.h(1372): error: identifier "__builtin_ia32_fpclasssd" is undefined - # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/include -DTHRUST_IMPL=CUDA -DBACKEND=TBB" + # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB" else echo "CMake version ${current} < ${required}, skipping Thrust models" fi diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake index 0c286c2d..2d687c72 100644 --- a/src/thrust/model.cmake +++ b/src/thrust/model.cmake @@ -53,6 +53,9 @@ macro(setup) message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") + # XXX NVHPC <= 21.9 has cub-config in `Linux_x86_64/21.9/cuda/11.4/include/cub/cmake` + # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/` + # same thing for thrust if (SDK_DIR) find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub) find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust) From 1d9cde42b00a42428b7e3b1043c0c1acc0af2b22 Mon Sep 17 00:00:00 2001 From: NoseKnowsAll Date: Wed, 20 Jul 2022 18:10:15 -0500 Subject: [PATCH 09/89] Reflect updated model options in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/README.md b/README.md index df95582c..7be3550d 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ The source for each model's implementations are located in `./src/`. Currently available models are: ``` -omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust ``` #### Overriding default flags From 5f6e714bdd8d34c305876f392df9cc569df33e5c Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Jul 2022 21:17:17 +0100 Subject: [PATCH 10/89] Add options for std::vector or raw pointers for TBB/STD --- src/std-data/STDDataStream.cpp | 55 ++++++++++++++++++--------- src/std-data/STDDataStream.h | 8 ++-- src/std-data/model.cmake | 8 +++- src/std-indices/STDIndicesStream.cpp | 53 +++++++++++++++++--------- src/std-indices/STDIndicesStream.h | 13 +++++-- src/std-indices/model.cmake | 8 +++- src/std-ranges/STDRangesStream.cpp | 56 +++++++++++++++++++--------- src/std-ranges/STDRangesStream.hpp | 8 ++-- src/std-ranges/model.cmake | 7 ++++ src/tbb/TBBStream.cpp | 32 ++++++++++++++-- src/tbb/TBBStream.hpp | 13 +++++-- src/tbb/model.cmake | 11 +++++- 12 files changed, 200 insertions(+), 72 deletions(-) diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 343e2470..2dead3b4 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -10,60 +10,79 @@ #include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif + // There are three execution policies: // auto exe_policy = std::execution::seq; // auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; +constexpr auto exe_policy = std::execution::par_unseq; template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) - noexcept : 
array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) -{ -} + noexcept : array_size{ARRAY_SIZE}, +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else +array_size(ARRAY_SIZE), + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif +{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } template void STDDataStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, a.begin(), a.end(), initA); - std::fill(exe_policy, b.begin(), b.end(), initB); - std::fill(exe_policy, c.begin(), c.end(), initC); + std::fill(exe_policy, BEGIN(a), END(a), initA); + std::fill(exe_policy, BEGIN(b), END(b), initB); + std::fill(exe_policy, BEGIN(c), END(c), initC); } template void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template void STDDataStream::copy() { // c[i] = a[i] - std::copy(exe_policy, a.begin(), a.end(), c.begin()); + std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); } template void STDDataStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; }); + std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); } template void STDDataStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus()); + std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); } template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); + 
std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); } template @@ -73,8 +92,8 @@ void STDDataStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); + std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); } @@ -82,7 +101,7 @@ template T STDDataStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); + return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); } void listDevices(void) @@ -102,3 +121,5 @@ std::string getDeviceDriver(const int) template class STDDataStream; template class STDDataStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 741fd6ce..84b4dcfb 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -21,9 +21,11 @@ class STDDataStream : public Stream int array_size; // Device side pointers - std::vector a; - std::vector b; - std::vector c; +#ifdef USE_VECTOR + std::vector a, b, c; +#else + T *a, *b, *c; +#endif public: diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index ef69f304..6f87bc94 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. 
C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -28,6 +32,8 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 2221f903..8c0958c3 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -10,46 +10,63 @@ #include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif + // There are three execution policies: // auto exe_policy = std::execution::seq; // auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; - +constexpr auto exe_policy = std::execution::par_unseq; template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) -{ -} + noexcept : array_size{ARRAY_SIZE}, range(0, array_size), +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif +{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } template void STDIndicesStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, a.begin(), a.end(), initA); - std::fill(exe_policy, b.begin(), b.end(), initB); - std::fill(exe_policy, 
c.begin(), c.end(), initC); + std::fill(exe_policy, BEGIN(a), END(a), initA); + std::fill(exe_policy, BEGIN(b), END(b), initB); + std::fill(exe_policy, BEGIN(c), END(c), initC); } template void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template void STDIndicesStream::copy() { // c[i] = a[i] - std::copy(exe_policy, a.begin(), a.end(), c.begin()); + std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); } template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [&, scalar = startScalar](int i) { return scalar * c[i]; }); } @@ -58,7 +75,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [&](int i) { return a[i] + b[i]; }); } @@ -67,7 +84,7 @@ template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -79,7 +96,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. 
// 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } @@ -89,7 +106,7 @@ template T STDIndicesStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); + return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); } void listDevices(void) @@ -109,3 +126,5 @@ std::string getDeviceDriver(const int) template class STDIndicesStream; template class STDIndicesStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 26c7cb0d..6810888a 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -10,6 +10,11 @@ #include #include "Stream.h" +#ifdef USE_SPAN +#include +#endif + + #define IMPLEMENTATION_STRING "STD (index-oriented)" @@ -60,9 +65,11 @@ class STDIndicesStream : public Stream ranged range; // Device side pointers - std::vector a; - std::vector b; - std::vector c; +#ifdef USE_VECTOR + std::vector a, b, c; +#else + T *a, *b, *c; +#endif public: diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index ef69f304..6f87bc94 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. 
The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -28,6 +32,8 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index de615289..fc71fee4 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -10,20 +10,40 @@ #include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif + +// There are three execution policies: +// auto exe_policy = std::execution::seq; +// auto exe_policy = std::execution::par; +constexpr auto exe_policy = std::execution::par_unseq; + template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) - : array_size{ARRAY_SIZE} -{ - a = std::vector(array_size); - b = std::vector(array_size); - c = std::vector(array_size); -} + : array_size{ARRAY_SIZE}, +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif +{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } template void STDRangesStream::init_arrays(T initA, T initB, T initC) { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, // loop range [&] (int i) { a[i] = initA; @@ -37,16 +57,16 @@ template void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. 
- h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template void STDRangesStream::copy() { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { c[i] = a[i]; @@ -60,7 +80,7 @@ void STDRangesStream::mul() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { b[i] = scalar * c[i]; @@ -72,7 +92,7 @@ template void STDRangesStream::add() { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { c[i] = a[i] + b[i]; @@ -86,7 +106,7 @@ void STDRangesStream::triad() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { a[i] = b[i] + scalar * c[i]; @@ -100,7 +120,7 @@ void STDRangesStream::nstream() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { a[i] += b[i] + scalar * c[i]; @@ -114,8 +134,8 @@ T STDRangesStream::dot() // sum += a[i] * b[i]; return std::transform_reduce( - std::execution::par_unseq, - a.begin(), a.end(), b.begin(), 0.0); + exe_policy, + BEGIN(a), END(a), BEGIN(b), 0.0); } void listDevices(void) @@ -136,3 +156,5 @@ std::string getDeviceDriver(const int) template class STDRangesStream; template class STDRangesStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 890e893f..33bc77bd 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -21,9 +21,11 @@ class STDRangesStream : public Stream int array_size; // Device side pointers - std::vector a; - std::vector b; - std::vector c; +#ifdef USE_VECTOR + std::vector a, b, c; 
+#else + T *a, *b, *c; +#endif public: STDRangesStream(const int, int); diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index fd07387d..ac56962b 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" "c++") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + macro(setup) # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here @@ -13,4 +17,7 @@ macro(setup) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: register_append_cxx_flags(ANY -std=c++2a) + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp index 9c34a506..bd94443b 100644 --- a/src/tbb/TBBStream.cpp +++ b/src/tbb/TBBStream.cpp @@ -5,15 +5,37 @@ // source code #include "TBBStream.hpp" +#include + +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif template TBBStream::TBBStream(const int ARRAY_SIZE, int device) - : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + : partitioner(), range(0, ARRAY_SIZE), +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + array_size(ARRAY_SIZE), + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif { if(device != 0){ throw std::runtime_error("Device != 0 is not supported by TBB"); } std::cout << "Using TBB 
partitioner: " PARTITIONER_NAME << std::endl; + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; } @@ -35,9 +57,9 @@ template void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template @@ -132,3 +154,5 @@ std::string getDeviceDriver(const int) template class TBBStream; template class TBBStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp index 90763a9c..2744afc2 100644 --- a/src/tbb/TBBStream.hpp +++ b/src/tbb/TBBStream.hpp @@ -40,10 +40,15 @@ class TBBStream : public Stream tbb_partitioner partitioner; tbb::blocked_range range; // Device side pointers - std::vector a; - std::vector b; - std::vector c; - +#ifdef USE_VECTOR + std::vector a, b, c; +#else + size_t array_size; + T *a, *b, *c; +#endif + + + public: TBBStream(const int, int); ~TBBStream() = default; diff --git a/src/tbb/model.cmake b/src/tbb/model.cmake index e4d6bac3..c1ff9aac 100644 --- a/src/tbb/model.cmake +++ b/src/tbb/model.cmake @@ -1,7 +1,7 @@ register_flag_optional(ONE_TBB_DIR "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/. - If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." + If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." "") @@ -15,15 +15,22 @@ register_flag_optional(PARTITIONER See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." "AUTO") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." 
+ "OFF") + macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 # docs on Intel's website refers to TBB_DIR which is not correct endif() - + # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages find_package(TBB REQUIRED) register_link_library(TBB::tbb) register_definitions(PARTITIONER_${PARTITIONER}) + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() From a299d613bb0f848b7b931b27bfbc128140349a50 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Jul 2022 21:27:34 +0100 Subject: [PATCH 11/89] Add CI tests with and without vectors Remove duplicate CI tests from bad merge Fix extra array_size init for std-data --- src/ci-test-compile.sh | 21 +++++++++++++-------- src/std-data/STDDataStream.cpp | 1 - 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index a7c5bab5..e443ccab 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -140,8 +140,15 @@ build_gcc() { run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + # std again but with vectors + run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + + run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB + run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then run_build "amd_$name" 
"${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" @@ -207,14 +214,6 @@ build_clang() { run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi - run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" - run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" - run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" - run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" @@ -225,8 +224,14 @@ build_clang() { run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + # std again but with vectors + run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" # not yet supported + run_build $name "${CLANG_CXX:?}" tbb 
"$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB + run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 2dead3b4..9eb12914 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -34,7 +34,6 @@ STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) #ifdef USE_VECTOR a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) #else -array_size(ARRAY_SIZE), a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) From 37dcdc224ce595d89661aae3e225a439559a2e8f Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Jul 2022 21:43:09 +0100 Subject: [PATCH 12/89] nvc++: "last line of file ends without a newline" Add CI vector tests for NVHPC --- src/ci-test-compile.sh | 5 +++++ src/std-data/STDDataStream.cpp | 2 +- src/std-indices/STDIndicesStream.cpp | 2 +- src/std-ranges/STDRangesStream.cpp | 2 +- src/tbb/TBBStream.cpp | 2 +- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index e443ccab..c5ba953d 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -242,6 +242,11 @@ build_nvhpc() { local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + + # std again but with vectors + run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON" + run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON" + run_build $name 
"${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" } diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 9eb12914..34059f5c 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -121,4 +121,4 @@ template class STDDataStream; template class STDDataStream; #undef BEGIN -#undef END \ No newline at end of file +#undef END diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 8c0958c3..d3537774 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -127,4 +127,4 @@ template class STDIndicesStream; template class STDIndicesStream; #undef BEGIN -#undef END \ No newline at end of file +#undef END diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index fc71fee4..356e6dc5 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -157,4 +157,4 @@ template class STDRangesStream; template class STDRangesStream; #undef BEGIN -#undef END \ No newline at end of file +#undef END diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp index bd94443b..c5e9d905 100644 --- a/src/tbb/TBBStream.cpp +++ b/src/tbb/TBBStream.cpp @@ -155,4 +155,4 @@ template class TBBStream; template class TBBStream; #undef BEGIN -#undef END \ No newline at end of file +#undef END From 193eaa7fe2b690035a85b365a6ff263659a43b86 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Jul 2022 23:30:24 +0100 Subject: [PATCH 13/89] Fix index iterator on large problem sizes --- src/std-indices/STDIndicesStream.h | 66 ++++++++++++++++++------------ 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 6810888a..3fd88f35 100644 --- a/src/std-indices/STDIndicesStream.h +++ 
b/src/std-indices/STDIndicesStream.h @@ -23,35 +23,47 @@ // implementation doesn't target template class ranged { - N from, to; public: - ranged(N from, N to ): from(from), to(to) {} - class iterator { - N num; + class iterator { + friend class ranged; public: - using difference_type = N; - using value_type = N; - using pointer = const N*; - using reference = const N&; - using iterator_category = std::random_access_iterator_tag; - explicit iterator(N _num = 0) : num(_num) {} - - iterator& operator++() { num++; return *this; } - iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } - iterator operator+(const value_type v) const { return iterator(num + v); } - iterator operator+=(int x) { iterator retval = *this; this->num+=x; return retval; } - - bool operator==(iterator other) const { return num == other.num; } - bool operator!=(iterator other) const { return *this != other; } - bool operator<(iterator other) const { return num < other.num; } - - reference operator*() const { return num;} - difference_type operator-(const iterator &it) const { return num - it.num; } - value_type operator[](const difference_type &i) const { return num + i; } - - }; - iterator begin() { return iterator(from); } - iterator end() { return iterator(to >= from? 
to+1 : to-1); } + using difference_type = N; + using value_type = N; + using pointer = const N*; + using reference = const N&; + using iterator_category = std::random_access_iterator_tag; + + reference operator *() const { return i_; } + const iterator &operator ++() { ++i_; return *this; } + iterator operator ++(int) { iterator copy(*this); ++i_; return copy; } + + const iterator &operator --() { --i_; return *this; } + iterator operator --(int) { iterator copy(*this); --i_; return copy; } + + const iterator &operator +=(N by) { i_+=by; return *this; } + + value_type operator[](const difference_type &i) const { return i_ + i; } + + difference_type operator-(const iterator &it) const { return i_ - it.i_; } + iterator operator+(const value_type v) const { return iterator(i_ + v); } + + bool operator ==(const iterator &other) const { return i_ == other.i_; } + bool operator !=(const iterator &other) const { return i_ != other.i_; } + bool operator < (const iterator &other) const { return i_ < other.i_; } + + protected: + explicit iterator(N start) : i_ (start) {} + + private: + N i_; + }; + + [[nodiscard]] iterator begin() const { return begin_; } + [[nodiscard]] iterator end() const { return end_; } + ranged(N begin, N end) : begin_(begin), end_(end) {} +private: + iterator begin_; + iterator end_; }; template From f5513cd69ec6a3c1c0802963135f081df49f3d27 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Tue, 26 Jul 2022 23:51:21 +0100 Subject: [PATCH 14/89] Add in-tree oneTBB build --- CMakeLists.txt | 19 +++++++++++++++++++ src/std-data/model.cmake | 8 +++++++- src/std-indices/model.cmake | 9 ++++++++- src/std-ranges/model.cmake | 7 +++++++ src/tbb/model.cmake | 7 ++++++- 5 files changed, 47 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6769952d..14bd39ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,25 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS " # Honor user's CXX_EXTRA_LINK_FLAGS set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} 
${CXX_EXTRA_LINK_FLAGS}) +option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on models that + don't explicitly link against TBB is a no-op, see description of your selected + model on how this is used." OFF) + +if (USE_TBB) + include(FetchContent) + FetchContent_Declare( + TBB + GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git + GIT_TAG faaf43c4ab22cb4b4267d65d5e218fa58800eea8 + ) + # Not using FetchContent_MakeAvailable because we need EXCLUDE_FROM_ALL + FetchContent_GetProperties(TBB) + if (NOT TBB_POPULATED) + FetchContent_Populate(TBB) + add_subdirectory(${tbb_SOURCE_DIR} ${tbb_BINARY_DIR} EXCLUDE_FROM_ALL) + endif () +endif () + # include our macros include(cmake/register_models.cmake) diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index 6f87bc94..3f79f13a 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -23,6 +23,10 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + macro(setup) set(CMAKE_CXX_STANDARD 17) @@ -35,5 +39,7 @@ macro(setup) if(USE_VECTOR) register_definitions(USE_VECTOR) endif() - + if (USE_TBB) + register_link_library(TBB::tbb) + endif () endmacro() diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index 6f87bc94..7dc22b9d 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -23,6 +23,11 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") +register_flag_optional(USE_TBB + "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." 
+ "OFF") + + macro(setup) set(CMAKE_CXX_STANDARD 17) @@ -35,5 +40,7 @@ macro(setup) if(USE_VECTOR) register_definitions(USE_VECTOR) endif() - + if (USE_TBB) + register_link_library(TBB::tbb) + endif () endmacro() diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index ac56962b..65e54894 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -7,6 +7,10 @@ register_flag_optional(USE_VECTOR "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." "OFF") +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + macro(setup) # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here @@ -20,4 +24,7 @@ macro(setup) if(USE_VECTOR) register_definitions(USE_VECTOR) endif() + if (USE_TBB) + register_link_library(TBB::tbb) + endif () endmacro() diff --git a/src/tbb/model.cmake b/src/tbb/model.cmake index c1ff9aac..eeb16377 100644 --- a/src/tbb/model.cmake +++ b/src/tbb/model.cmake @@ -19,15 +19,20 @@ register_flag_optional(USE_VECTOR "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." "OFF") +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." 
+ "OFF") + macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 # docs on Intel's website refers to TBB_DIR which is not correct + find_package(TBB REQUIRED) endif() + # No need to handle USE_TBB as both ONE_TBB_DIR and USE_TBB will create the TBB::tbb target # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages - find_package(TBB REQUIRED) register_link_library(TBB::tbb) register_definitions(PARTITIONER_${PARTITIONER}) if(USE_VECTOR) From 5197a4e5618292ea812f10d7cbd22a4ef75b36a0 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 27 Jul 2022 00:16:29 +0100 Subject: [PATCH 15/89] Find TBB if USE_TBB is not set --- src/tbb/model.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tbb/model.cmake b/src/tbb/model.cmake index eeb16377..1cbd7fba 100644 --- a/src/tbb/model.cmake +++ b/src/tbb/model.cmake @@ -27,11 +27,12 @@ macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 # docs on Intel's website refers to TBB_DIR which is not correct + endif() + if (NOT USE_TBB) + # Only find TBB when we're not building in-tree find_package(TBB REQUIRED) endif() - # No need to handle USE_TBB as both ONE_TBB_DIR and USE_TBB will create the TBB::tbb target - # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages register_link_library(TBB::tbb) register_definitions(PARTITIONER_${PARTITIONER}) From dfb4eb06b24245727c37e1356a3f0e73f2b83db0 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 15:03:26 +0100 Subject: [PATCH 16/89] Add oneDPL for std models --- CMakeLists.txt | 26 +++++++++- cmake/register_models.cmake | 4 ++ cmake/shim_onedpl.cmake | 27 ++++++++++ src/dpl_shim.h | 75 ++++++++++++++++++++++++++++ 
src/std-data/STDDataStream.cpp | 46 ++++++++++------- src/std-data/STDDataStream.h | 5 +- src/std-data/model.cmake | 18 ++++++- src/std-indices/STDIndicesStream.cpp | 39 +++++++++++---- src/std-indices/STDIndicesStream.h | 11 ++-- src/std-indices/model.cmake | 17 +++++-- src/std-ranges/STDRangesStream.cpp | 39 +++++++++++---- src/std-ranges/STDRangesStream.hpp | 10 ++-- src/std-ranges/model.cmake | 17 ++++++- 13 files changed, 269 insertions(+), 65 deletions(-) create mode 100644 cmake/shim_onedpl.cmake create mode 100644 src/dpl_shim.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 14bd39ec..263555a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,13 +76,15 @@ option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on m model on how this is used." OFF) if (USE_TBB) - include(FetchContent) FetchContent_Declare( TBB GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git GIT_TAG faaf43c4ab22cb4b4267d65d5e218fa58800eea8 ) - # Not using FetchContent_MakeAvailable because we need EXCLUDE_FROM_ALL + # Don't fail builds on warning (TBB has -Wall while not being free of warnings from unused symbols...) + set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + set(TBB_STRICT OFF) + # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL FetchContent_GetProperties(TBB) if (NOT TBB_POPULATED) FetchContent_Populate(TBB) @@ -90,6 +92,25 @@ if (USE_TBB) endif () endif () +option(USE_TBB "Enable oneDPL library for *supported* models. Enabling this on models that + don't explicitly link against DPL is a no-op, see description of your selected + model on how this is used." 
OFF) + +if (USE_ONEDPL) + FetchContent_Declare( + oneDPL + GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git + GIT_TAG oneDPL-2021.7.0-release + ) + # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL + FetchContent_GetProperties(oneDPL) + if (NOT oneDPL_POPULATED) + FetchContent_Populate(oneDPL) + add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL) + endif () +endif() + + # include our macros include(cmake/register_models.cmake) @@ -170,6 +191,7 @@ include_directories(src) add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp) target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES}) target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS}) +target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES}) if (CXX_EXTRA_LIBRARIES) target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES}) diff --git a/cmake/register_models.cmake b/cmake/register_models.cmake index f180c03b..9432313e 100644 --- a/cmake/register_models.cmake +++ b/cmake/register_models.cmake @@ -71,6 +71,10 @@ macro(register_definitions) list(APPEND IMPL_DEFINITIONS ${ARGN}) endmacro() +macro(register_directories) + list(APPEND IMPL_DIRECTORIES ${ARGN}) +endmacro() + macro(register_flag_required NAME DESCRIPTION) list(APPEND CUSTOM_FLAGS_TRIPLE "${NAME}" "${DESCRIPTION}" ON "") endmacro() diff --git a/cmake/shim_onedpl.cmake b/cmake/shim_onedpl.cmake new file mode 100644 index 00000000..861d0697 --- /dev/null +++ b/cmake/shim_onedpl.cmake @@ -0,0 +1,27 @@ + + +if (USE_ONEDPL) + # # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html + # # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation + # register_definitions( + # PSTL_USE_PARALLEL_POLICIES=0 + # _GLIBCXX_USE_TBB_PAR_BACKEND=0 + # ) + register_definitions(USE_ONEDPL) + if (USE_ONEDPL STREQUAL 
"TBB") + register_definitions(ONEDPL_USE_TBB_BACKEND=1) + # TBB will either be linked later (USE_TBB==ON) or via extra libraries, don't do anything here + elseif (USE_ONEDPL STREQUAL "OPENMP") + register_definitions(ONEDPL_USE_OPENMP_BACKEND=1) + # Link OpenMP via CMAKE + find_package(OpenMP REQUIRED) + register_link_library(OpenMP::OpenMP_CXX) + elseif (USE_ONEDPL STREQUAL "SYCL") + register_definitions(ONEDPL_USE_DPCPP_BACKEND=1) + # This needs a SYCL compiler, will fail if CXX doesn't SYCL2020 + register_append_cxx_flags(ANY -fsycl-unnamed-lambda -fsycl) + else () + message(FATAL_ERROR "Unsupported USE_ONEDPL backend: ${USE_ONEDPL}, see USE_ONEDPL flag description for available values.") + endif () + register_directories(ANY ${onedpl_SOURCE_DIR}/include) +endif () \ No newline at end of file diff --git a/src/dpl_shim.h b/src/dpl_shim.h new file mode 100644 index 00000000..89012a35 --- /dev/null +++ b/src/dpl_shim.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include + +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_ONEDPL + +// oneDPL C++17 PSTL + +#include +#include +#include + +#ifdef ONEDPL_USE_DPCPP_BACKEND + +#include + +const static auto exe_policy = oneapi::dpl::execution::device_policy<>{ + oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{}) +}; + +template using Allocator = sycl::usm_allocator; + +template +constexpr Allocator alloc_vec() { return {exe_policy.queue()}; }; + +template +T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue()); } + +template +void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } + +#else + +// auto exe_policy = dpl::execution::seq; +// auto exe_policy = dpl::execution::par; +static constexpr auto exe_policy = dpl::execution::par_unseq; +#define USE_STD_PTR_ALLOC_DEALLOC + +#endif + +#else + +// Normal C++17 PSTL + +#include +#include +#include + +// auto exe_policy = std::execution::seq; +// auto exe_policy = std::execution::par; 
+static constexpr auto exe_policy = std::execution::par_unseq; +#define USE_STD_PTR_ALLOC_DEALLOC + + +#endif + +#ifdef USE_STD_PTR_ALLOC_DEALLOC + +template using Allocator = std::allocator; + +template +constexpr Allocator alloc_vec() { return {}; }; + +template +T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); } + +template +void dealloc_raw(T *ptr) { free(ptr); } + +#endif diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 34059f5c..2bb6a33a 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -6,14 +6,6 @@ #include "STDDataStream.h" -#include -#include -#include - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - #ifdef USE_VECTOR #define BEGIN(x) (x).begin() #define END(x) (x).end() @@ -22,23 +14,39 @@ #define END(x) ((x) + array_size) #endif -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -constexpr auto exe_policy = std::execution::par_unseq; - - template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, #ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) #else - a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) + a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) #endif -{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#if USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if defined(ONEDPL_USE_DPCPP_BACKEND) + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif defined(ONEDPL_USE_TBB_BACKEND) + std::cout << "TBB " 
TBB_VERSION_STRING; +#elif defined(ONEDPL_USE_OPENMP_BACKEND) + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; +#endif +} + +template +STDDataStream::~STDDataStream() { +#ifndef USE_VECTOR + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +#endif +} template void STDDataStream::init_arrays(T initA, T initB, T initC) diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 84b4dcfb..e50c95d8 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -5,6 +5,7 @@ // source code #pragma once +#include "dpl_shim.h" #include #include @@ -22,7 +23,7 @@ class STDDataStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector a, b, c; + std::vector> a, b, c; #else T *a, *b, *c; #endif @@ -30,7 +31,7 @@ class STDDataStream : public Stream public: STDDataStream(const int, int) noexcept; - ~STDDataStream() = default; + ~STDDataStream(); virtual void copy() override; virtual void add() override; diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index 3f79f13a..f2fecba8 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -27,18 +27,32 @@ register_flag_optional(USE_TBB "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." "OFF") +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. + TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + SYCL - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
+ "OFF") + macro(setup) set(CMAKE_CXX_STANDARD 17) + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) + if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - if(USE_VECTOR) + if (USE_VECTOR) register_definitions(USE_VECTOR) - endif() + endif () if (USE_TBB) register_link_library(TBB::tbb) endif () diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index d3537774..4ec9977d 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -22,22 +22,39 @@ #define END(x) ((x) + array_size) #endif -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -constexpr auto exe_policy = std::execution::par_unseq; - template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, range(0, array_size), +noexcept : array_size{ARRAY_SIZE}, range(0, array_size), #ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) #else - a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) + a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +#endif +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#if USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if defined(ONEDPL_USE_DPCPP_BACKEND) + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif defined(ONEDPL_USE_TBB_BACKEND) + std::cout << "TBB " TBB_VERSION_STRING; +#elif defined(ONEDPL_USE_OPENMP_BACKEND) + std::cout << "OpenMP"; +#else + std::cout << 
"Default"; +#endif + std::cout << std::endl; #endif -{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } +} + +template +STDIndicesStream::~STDIndicesStream() { +#ifndef USE_VECTOR + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +#endif +} template void STDIndicesStream::init_arrays(T initA, T initB, T initC) diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 3fd88f35..63254cdf 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -5,19 +5,14 @@ // source code #pragma once +#include "dpl_shim.h" #include #include #include "Stream.h" -#ifdef USE_SPAN -#include -#endif - - #define IMPLEMENTATION_STRING "STD (index-oriented)" - // A lightweight counting iterator which will be used by the STL algorithms // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this // implementation doesn't target @@ -78,7 +73,7 @@ class STDIndicesStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector a, b, c; + std::vector> a, b, c; #else T *a, *b, *c; #endif @@ -86,7 +81,7 @@ class STDIndicesStream : public Stream public: STDIndicesStream(const int, int) noexcept; - ~STDIndicesStream() = default; + ~STDIndicesStream(); virtual void copy() override; virtual void add() override; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index 7dc22b9d..36e2ed82 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -27,19 +27,30 @@ register_flag_optional(USE_TBB "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." "OFF") +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. 
+ TBB - Implements policies using TBB. + TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + SYCL - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." + "OFF") macro(setup) set(CMAKE_CXX_STANDARD 17) - + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - if(USE_VECTOR) + if (USE_VECTOR) register_definitions(USE_VECTOR) - endif() + endif () if (USE_TBB) register_link_library(TBB::tbb) endif () diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 356e6dc5..29993bc6 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -22,22 +22,39 @@ #define END(x) ((x) + array_size) #endif -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -constexpr auto exe_policy = std::execution::par_unseq; - template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) - : array_size{ARRAY_SIZE}, +noexcept : array_size{ARRAY_SIZE}, #ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) #else - a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) + a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +#endif +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#if USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if defined(ONEDPL_USE_DPCPP_BACKEND) + std::cout << "SYCL USM (device=" << 
exe_policy.queue().get_device().get_info() << ")"; +#elif defined(ONEDPL_USE_TBB_BACKEND) + std::cout << "TBB " TBB_VERSION_STRING; +#elif defined(ONEDPL_USE_OPENMP_BACKEND) + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; #endif -{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } +} + +template +STDRangesStream::~STDRangesStream() { +#ifndef USE_VECTOR + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +#endif +} template void STDRangesStream::init_arrays(T initA, T initB, T initC) diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 33bc77bd..21902c6c 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -5,10 +5,10 @@ // source code #pragma once +#include "dpl_shim.h" #include -#include - +#include #include "Stream.h" #define IMPLEMENTATION_STRING "STD C++ ranges" @@ -22,14 +22,14 @@ class STDRangesStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector a, b, c; + std::vector> a, b, c; #else T *a, *b, *c; #endif public: - STDRangesStream(const int, int); - ~STDRangesStream() = default; + STDRangesStream(const int, int) noexcept; + ~STDRangesStream(); virtual void copy() override; virtual void add() override; diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index 65e54894..2d90afc4 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -11,6 +11,18 @@ register_flag_optional(USE_TBB "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." "OFF") +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. 
+ TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + SYCL - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." + "OFF") + macro(setup) # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here @@ -21,9 +33,10 @@ macro(setup) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: register_append_cxx_flags(ANY -std=c++2a) - if(USE_VECTOR) + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) + if (USE_VECTOR) register_definitions(USE_VECTOR) - endif() + endif () if (USE_TBB) register_link_library(TBB::tbb) endif () From f77e43c6d533aea926aec8b39aea072d319a8611 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 16:23:07 +0100 Subject: [PATCH 17/89] Don't capture `this` implicitly Relax const constraints on the range iterator --- src/std-indices/STDIndicesStream.cpp | 8 ++++---- src/std-indices/STDIndicesStream.h | 13 +++++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 4ec9977d..7cacde3f 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -83,7 +83,7 @@ template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [this, scalar = startScalar](int i) { return scalar * c[i]; }); } @@ -92,7 +92,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [&](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [this](int i) { return a[i] + b[i]; }); } @@ -101,7 +101,7 @@ template void STDIndicesStream::triad() { // 
a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -113,7 +113,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 63254cdf..a955374f 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -25,17 +25,22 @@ class ranged { using difference_type = N; using value_type = N; using pointer = const N*; - using reference = const N&; + using reference = N; using iterator_category = std::random_access_iterator_tag; + // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled. + // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device. + // This type is unused for any nother STL impl. 
+ using is_passed_directly = std::true_type; + reference operator *() const { return i_; } - const iterator &operator ++() { ++i_; return *this; } + iterator &operator ++() { ++i_; return *this; } iterator operator ++(int) { iterator copy(*this); ++i_; return copy; } - const iterator &operator --() { --i_; return *this; } + iterator &operator --() { --i_; return *this; } iterator operator --(int) { iterator copy(*this); --i_; return copy; } - const iterator &operator +=(N by) { i_+=by; return *this; } + iterator &operator +=(N by) { i_+=by; return *this; } value_type operator[](const difference_type &i) const { return i_ + i; } From 0e8b3b4bcec68e2a6ffce9bac65308d7337d3342 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 16:36:39 +0100 Subject: [PATCH 18/89] Add CI for dpl --- src/ci-test-compile.sh | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index c5ba953d..cccbd2d7 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -135,16 +135,14 @@ build_gcc() { "./$BUILD_DIR/omp_$name/omp-stream" -s 1048576 -n 10 fi - # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - - # std again but with vectors - run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - + for use_onedpl in OFF OPENMP TBB; do + for use_vector in OFF ON; do + # some distributions like 
Ubuntu bionic implements std par with TBB, so conditionally link it here + run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + done + done run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB @@ -220,14 +218,14 @@ build_clang() { run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported - # std again but with vectors - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" # not yet supported + for use_onedpl in OFF OPENMP TBB; do + for use_vector in OFF ON; do + run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${CLANG_CXX:?}" std-indices "$cxx 
-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported + done + done run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB From 379bc2032c34edb91345082b2cd1a5554f5880b6 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 16:49:16 +0100 Subject: [PATCH 19/89] Add CI for dpl (again) --- src/ci-test-compile.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index cccbd2d7..041c9916 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -136,11 +136,15 @@ build_gcc() { fi for use_onedpl in OFF OPENMP TBB; do + case "$use_onedpl" in + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON" ;; + esac for use_vector in OFF ON; do # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl 
-DUSE_VECTOR=$use_vector" done done @@ -221,9 +225,13 @@ build_clang() { for use_onedpl in OFF OPENMP TBB; do for use_vector in OFF ON; do - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported + case "$use_onedpl" in + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON" ;; + esac + run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " + run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported done done From 14844ceb5615d33ddff573197f48fb996fab4bb8 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 16:53:57 +0100 Subject: [PATCH 20/89] Fix CMakeLists.txt typo on USE_ONEDPL --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 263555a0..f2af1e31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,9 +92,9 @@ if (USE_TBB) endif () endif () -option(USE_TBB "Enable oneDPL library for *supported* models. Enabling this on models that - don't explicitly link against DPL is a no-op, see description of your selected - model on how this is used." OFF) +option(USE_ONEDPL "Enable oneDPL library for *supported* models. 
Enabling this on models that + don't explicitly link against DPL is a no-op, see description of your selected + model on how this is used." OFF) if (USE_ONEDPL) FetchContent_Declare( From 5a496a91b293f7cd79059afe5452d1b7856753ea Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 19:29:46 +0100 Subject: [PATCH 21/89] Fixup oneDPL inclusion --- CMakeLists.txt | 14 ++++++++++++++ cmake/register_models.cmake | 4 ---- cmake/shim_onedpl.cmake | 27 --------------------------- src/dpl_shim.h | 2 +- src/std-data/STDDataStream.cpp | 8 ++++---- src/std-data/model.cmake | 7 ++++--- src/std-indices/STDIndicesStream.cpp | 8 ++++---- src/std-indices/model.cmake | 5 ++++- src/std-ranges/STDRangesStream.cpp | 8 ++++---- src/std-ranges/model.cmake | 5 ++++- 10 files changed, 39 insertions(+), 49 deletions(-) delete mode 100644 cmake/shim_onedpl.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index f2af1e31..eb9e57b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,10 +102,24 @@ if (USE_ONEDPL) GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git GIT_TAG oneDPL-2021.7.0-release ) + string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND) + # XXX oneDPL looks for omp instead of openmp, which mismatches(!) 
with ONEDPL_PAR_BACKEND if using find_package + if (ONEDPL_BACKEND STREQUAL "openmp") + set(ONEDPL_BACKEND omp) + endif () # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL FetchContent_GetProperties(oneDPL) if (NOT oneDPL_POPULATED) FetchContent_Populate(oneDPL) + if (USE_TBB) + macro(find_package NAME) + if ("${NAME}" STREQUAL "TBB") + message(STATUS "Discarding oneDPL's call to find_package(${NAME} ${ARGN})") + else () + _find_package(${NAME} ${ARGN}) + endif () + endmacro() + endif () add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL) endif () endif() diff --git a/cmake/register_models.cmake b/cmake/register_models.cmake index 9432313e..f180c03b 100644 --- a/cmake/register_models.cmake +++ b/cmake/register_models.cmake @@ -71,10 +71,6 @@ macro(register_definitions) list(APPEND IMPL_DEFINITIONS ${ARGN}) endmacro() -macro(register_directories) - list(APPEND IMPL_DIRECTORIES ${ARGN}) -endmacro() - macro(register_flag_required NAME DESCRIPTION) list(APPEND CUSTOM_FLAGS_TRIPLE "${NAME}" "${DESCRIPTION}" ON "") endmacro() diff --git a/cmake/shim_onedpl.cmake b/cmake/shim_onedpl.cmake deleted file mode 100644 index 861d0697..00000000 --- a/cmake/shim_onedpl.cmake +++ /dev/null @@ -1,27 +0,0 @@ - - -if (USE_ONEDPL) - # # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html - # # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation - # register_definitions( - # PSTL_USE_PARALLEL_POLICIES=0 - # _GLIBCXX_USE_TBB_PAR_BACKEND=0 - # ) - register_definitions(USE_ONEDPL) - if (USE_ONEDPL STREQUAL "TBB") - register_definitions(ONEDPL_USE_TBB_BACKEND=1) - # TBB will either be linked later (USE_TBB==ON) or via extra libraries, don't do anything here - elseif (USE_ONEDPL STREQUAL "OPENMP") - register_definitions(ONEDPL_USE_OPENMP_BACKEND=1) - # Link OpenMP 
via CMAKE - find_package(OpenMP REQUIRED) - register_link_library(OpenMP::OpenMP_CXX) - elseif (USE_ONEDPL STREQUAL "SYCL") - register_definitions(ONEDPL_USE_DPCPP_BACKEND=1) - # This needs a SYCL compiler, will fail if CXX doesn't SYCL2020 - register_append_cxx_flags(ANY -fsycl-unnamed-lambda -fsycl) - else () - message(FATAL_ERROR "Unsupported USE_ONEDPL backend: ${USE_ONEDPL}, see USE_ONEDPL flag description for available values.") - endif () - register_directories(ANY ${onedpl_SOURCE_DIR}/include) -endif () \ No newline at end of file diff --git a/src/dpl_shim.h b/src/dpl_shim.h index 89012a35..e47ae99b 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -15,7 +15,7 @@ #include #include -#ifdef ONEDPL_USE_DPCPP_BACKEND +#if ONEDPL_USE_DPCPP_BACKEND #include diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 2bb6a33a..b6641dee 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -24,13 +24,13 @@ STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) #endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#if USE_ONEDPL +#ifdef USE_ONEDPL std::cout << "Using oneDPL backend: "; -#if defined(ONEDPL_USE_DPCPP_BACKEND) +#if ONEDPL_USE_DPCPP_BACKEND std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif defined(ONEDPL_USE_TBB_BACKEND) +#elif ONEDPL_USE_TBB_BACKEND std::cout << "TBB " TBB_VERSION_STRING; -#elif defined(ONEDPL_USE_OPENMP_BACKEND) +#elif ONEDPL_USE_OPENMP_BACKEND std::cout << "OpenMP"; #else std::cout << "Default"; diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index f2fecba8..3d2399d6 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -41,9 +41,6 @@ register_flag_optional(USE_ONEDPL macro(setup) set(CMAKE_CXX_STANDARD 17) - - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) - if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker 
so that it links with the gpu stuff as well @@ -56,4 +53,8 @@ macro(setup) if (USE_TBB) register_link_library(TBB::tbb) endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 7cacde3f..9d98a1b0 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -32,13 +32,13 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size), #endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#if USE_ONEDPL +#ifdef USE_ONEDPL std::cout << "Using oneDPL backend: "; -#if defined(ONEDPL_USE_DPCPP_BACKEND) +#if ONEDPL_USE_DPCPP_BACKEND std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif defined(ONEDPL_USE_TBB_BACKEND) +#elif ONEDPL_USE_TBB_BACKEND std::cout << "TBB " TBB_VERSION_STRING; -#elif defined(ONEDPL_USE_OPENMP_BACKEND) +#elif ONEDPL_USE_OPENMP_BACKEND std::cout << "OpenMP"; #else std::cout << "Default"; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index 36e2ed82..befa9335 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -41,7 +41,6 @@ register_flag_optional(USE_ONEDPL macro(setup) set(CMAKE_CXX_STANDARD 17) - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well @@ -54,4 +53,8 @@ macro(setup) if (USE_TBB) register_link_library(TBB::tbb) endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 29993bc6..3ea32e41 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -32,13 +32,13 @@ noexcept : array_size{ARRAY_SIZE}, #endif { std::cout 
<< "Backing storage typeid: " << typeid(a).name() << std::endl; -#if USE_ONEDPL +#ifdef USE_ONEDPL std::cout << "Using oneDPL backend: "; -#if defined(ONEDPL_USE_DPCPP_BACKEND) +#if ONEDPL_USE_DPCPP_BACKEND std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif defined(ONEDPL_USE_TBB_BACKEND) +#elif ONEDPL_USE_TBB_BACKEND std::cout << "TBB " TBB_VERSION_STRING; -#elif defined(ONEDPL_USE_OPENMP_BACKEND) +#elif ONEDPL_USE_OPENMP_BACKEND std::cout << "OpenMP"; #else std::cout << "Default"; diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index 2d90afc4..268cc149 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -33,11 +33,14 @@ macro(setup) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: register_append_cxx_flags(ANY -std=c++2a) - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) if (USE_VECTOR) register_definitions(USE_VECTOR) endif () if (USE_TBB) register_link_library(TBB::tbb) endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () endmacro() From ecb0464f6c7b35f94c8ea2abc1bcd50b036ecb97 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 19:48:42 +0100 Subject: [PATCH 22/89] Fixup oneDPL and oneTBB in CI (gcc-10) --- src/ci-test-compile.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 041c9916..2a101910 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -137,8 +137,8 @@ build_gcc() { for use_onedpl in OFF OPENMP TBB; do case "$use_onedpl" in - OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON" ;; + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DPSTL_USE_PARALLEL_POLICIES=0" ;; esac for use_vector in OFF ON; do # some distributions 
like Ubuntu bionic implements std par with TBB, so conditionally link it here @@ -227,7 +227,7 @@ build_clang() { for use_vector in OFF ON; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DPSTL_USE_PARALLEL_POLICIES=0" ;; esac run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" From 1f4bc3fffc64b7232d47a2772792f226ddb10aaf Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 20:04:18 +0100 Subject: [PATCH 23/89] Fixup oneDPL and oneTBB in CI (gcc-10) take 2 --- src/ci-test-compile.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 2a101910..a2952bc5 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -138,7 +138,7 @@ build_gcc() { for use_onedpl in OFF OPENMP TBB; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DPSTL_USE_PARALLEL_POLICIES=0" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-DPSTL_USE_PARALLEL_POLICIES=0" ;; esac for use_vector in OFF ON; do # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here @@ -227,7 +227,7 @@ build_clang() { for use_vector in OFF ON; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DPSTL_USE_PARALLEL_POLICIES=0" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-DPSTL_USE_PARALLEL_POLICIES=0" ;; esac run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " run_build $name 
"${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" From d56dc956e09925c386c1cabb97931791e4688d3c Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 20:30:27 +0100 Subject: [PATCH 24/89] Fixup oneDPL and oneTBB in CI (gcc-10) take 3 --- src/ci-test-compile.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index a2952bc5..d3fc5b71 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -138,7 +138,7 @@ build_gcc() { for use_onedpl in OFF OPENMP TBB; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-DPSTL_USE_PARALLEL_POLICIES=0" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac for use_vector in OFF ON; do # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here @@ -227,7 +227,7 @@ build_clang() { for use_vector in OFF ON; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-DPSTL_USE_PARALLEL_POLICIES=0" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" From aa82e57ba08ae59971b880e645f64d5583e15b7d Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 22:02:48 +0100 Subject: [PATCH 25/89] Fixup oneDPL dpcpp configuration Add conditional sync after each kernel. 
--- CMakeLists.txt | 9 ++++++++- src/dpl_shim.h | 4 ++++ src/std-data/STDDataStream.cpp | 5 +++++ src/std-data/model.cmake | 2 +- src/std-indices/STDIndicesStream.cpp | 5 +++++ src/std-indices/model.cmake | 2 +- src/std-ranges/STDRangesStream.cpp | 5 +++++ src/std-ranges/model.cmake | 2 +- 8 files changed, 30 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eb9e57b2..7c137461 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,8 +121,15 @@ if (USE_ONEDPL) endmacro() endif () add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL) + + # Fixup oneDPL's omission on setting DPCPP definitions. + # We do this after the creation of the oneDPL target. + if (ONEDPL_BACKEND MATCHES "^(dpcpp|dpcpp_only)$") + target_compile_definitions(oneDPL INTERFACE ONEDPL_USE_DPCPP_BACKEND=1) + endif () + endif () -endif() +endif () # include our macros diff --git a/src/dpl_shim.h b/src/dpl_shim.h index e47ae99b..d341a591 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -34,6 +34,8 @@ T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue template void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } +static void sync_device(){exe_policy.queue().wait_and_throw(); } + #else // auto exe_policy = dpl::execution::seq; @@ -72,4 +74,6 @@ T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * si template void dealloc_raw(T *ptr) { free(ptr); } +static void sync_device(){ /*no-op*/ } + #endif diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index b6641dee..d4dc17f6 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -69,6 +69,7 @@ void STDDataStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); + sync_device(); } template @@ -76,6 +77,7 @@ void STDDataStream::mul() { // b[i] = scalar * c[i]; std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; 
}); + sync_device(); } template @@ -83,6 +85,7 @@ void STDDataStream::add() { // c[i] = a[i] + b[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); + sync_device(); } template @@ -90,6 +93,7 @@ void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); + sync_device(); } template @@ -101,6 +105,7 @@ void STDDataStream::nstream() // 2: a[i] += scalar * c[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); + sync_device(); } diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index 3d2399d6..e1697b6d 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -35,7 +35,7 @@ register_flag_optional(USE_ONEDPL CMake will handle any flags needed to enable OpenMP if the compiler supports it. TBB - Implements policies using TBB. TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - SYCL - Implements policies through SYCL2020. + DPCPP - Implements policies through SYCL2020. This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
"OFF") diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 9d98a1b0..04b78296 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -77,6 +77,7 @@ void STDIndicesStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); + sync_device(); } template @@ -86,6 +87,7 @@ void STDIndicesStream::mul() std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [this, scalar = startScalar](int i) { return scalar * c[i]; }); + sync_device(); } template @@ -95,6 +97,7 @@ void STDIndicesStream::add() std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [this](int i) { return a[i] + b[i]; }); + sync_device(); } template @@ -104,6 +107,7 @@ void STDIndicesStream::triad() std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); + sync_device(); } template @@ -116,6 +120,7 @@ void STDIndicesStream::nstream() std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); + sync_device(); } diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index befa9335..c2fef288 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -35,7 +35,7 @@ register_flag_optional(USE_ONEDPL CMake will handle any flags needed to enable OpenMP if the compiler supports it. TBB - Implements policies using TBB. TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - SYCL - Implements policies through SYCL2020. + DPCPP - Implements policies through SYCL2020. This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
"OFF") diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 3ea32e41..8a77a682 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -89,6 +89,7 @@ void STDRangesStream::copy() c[i] = a[i]; } ); + sync_device(); } template @@ -103,6 +104,7 @@ void STDRangesStream::mul() b[i] = scalar * c[i]; } ); + sync_device(); } template @@ -115,6 +117,7 @@ void STDRangesStream::add() c[i] = a[i] + b[i]; } ); + sync_device(); } template @@ -129,6 +132,7 @@ void STDRangesStream::triad() a[i] = b[i] + scalar * c[i]; } ); + sync_device(); } template @@ -143,6 +147,7 @@ void STDRangesStream::nstream() a[i] += b[i] + scalar * c[i]; } ); + sync_device(); } template diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index 268cc149..35554c77 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -19,7 +19,7 @@ register_flag_optional(USE_ONEDPL CMake will handle any flags needed to enable OpenMP if the compiler supports it. TBB - Implements policies using TBB. TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - SYCL - Implements policies through SYCL2020. + DPCPP - Implements policies through SYCL2020. This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
"OFF") From ed6206b54398f785ce3d7f2dfe048a98fd3d7a21 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 23:45:43 +0100 Subject: [PATCH 26/89] Remove conditional sync after each kernel Don't capture `this`, capture each member instead --- src/dpl_shim.h | 4 ---- src/std-data/STDDataStream.cpp | 15 +++++---------- src/std-indices/STDIndicesStream.cpp | 23 +++++++++-------------- src/std-ranges/STDRangesStream.cpp | 15 +++++---------- 4 files changed, 19 insertions(+), 38 deletions(-) diff --git a/src/dpl_shim.h b/src/dpl_shim.h index d341a591..e47ae99b 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -34,8 +34,6 @@ T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue template void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } -static void sync_device(){exe_policy.queue().wait_and_throw(); } - #else // auto exe_policy = dpl::execution::seq; @@ -74,6 +72,4 @@ T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * si template void dealloc_raw(T *ptr) { free(ptr); } -static void sync_device(){ /*no-op*/ } - #endif diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index d4dc17f6..7c71163e 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -69,32 +69,28 @@ void STDDataStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); - sync_device(); -} + } template void STDDataStream::mul() { // b[i] = scalar * c[i]; std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); - sync_device(); -} + } template void STDDataStream::add() { // c[i] = a[i] + b[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); - sync_device(); -} + } template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); - sync_device(); -} + } 
template void STDDataStream::nstream() @@ -105,8 +101,7 @@ void STDDataStream::nstream() // 2: a[i] += scalar * c[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); - sync_device(); -} + } template diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 04b78296..f9397fab 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -77,38 +77,34 @@ void STDIndicesStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); - sync_device(); -} + } template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [this, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [c = this->c, scalar = startScalar](int i) { return scalar * c[i]; }); - sync_device(); -} + } template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [this](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [a = this->a, b = this->b](int i) { return a[i] + b[i]; }); - sync_device(); -} + } template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [b = this->b, c = this->c, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); - sync_device(); -} + } template void STDIndicesStream::nstream() @@ -117,11 +113,10 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. 
// 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); - sync_device(); -} + } template diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 8a77a682..9063ff20 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -89,8 +89,7 @@ void STDRangesStream::copy() c[i] = a[i]; } ); - sync_device(); -} + } template void STDRangesStream::mul() @@ -104,8 +103,7 @@ void STDRangesStream::mul() b[i] = scalar * c[i]; } ); - sync_device(); -} + } template void STDRangesStream::add() @@ -117,8 +115,7 @@ void STDRangesStream::add() c[i] = a[i] + b[i]; } ); - sync_device(); -} + } template void STDRangesStream::triad() @@ -132,8 +129,7 @@ void STDRangesStream::triad() a[i] = b[i] + scalar * c[i]; } ); - sync_device(); -} + } template void STDRangesStream::nstream() @@ -147,8 +143,7 @@ void STDRangesStream::nstream() a[i] += b[i] + scalar * c[i]; } ); - sync_device(); -} + } template T STDRangesStream::dot() From 72335f320e1976876d57c381abf731db19cb548e Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 29 Jul 2022 00:17:36 +0100 Subject: [PATCH 27/89] Revert to normal vector without allocators Prohibit vector type in indices --- src/dpl_shim.h | 10 ---------- src/std-data/STDDataStream.cpp | 12 ++++++------ src/std-data/STDDataStream.h | 2 +- src/std-indices/STDIndicesStream.cpp | 22 ++++++++++++++++------ src/std-indices/STDIndicesStream.h | 2 +- src/std-ranges/STDRangesStream.cpp | 12 ++++++------ src/std-ranges/STDRangesStream.hpp | 2 +- 7 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/dpl_shim.h b/src/dpl_shim.h index e47ae99b..226693bd 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -23,11 +23,6 @@ const static 
auto exe_policy = oneapi::dpl::execution::device_policy<>{ oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{}) }; -template using Allocator = sycl::usm_allocator; - -template -constexpr Allocator alloc_vec() { return {exe_policy.queue()}; }; - template T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue()); } @@ -61,11 +56,6 @@ static constexpr auto exe_policy = std::execution::par_unseq; #ifdef USE_STD_PTR_ALLOC_DEALLOC -template using Allocator = std::allocator; - -template -constexpr Allocator alloc_vec() { return {}; }; - template T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); } diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 7c71163e..3d7ef18a 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -18,7 +18,7 @@ template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, #ifdef USE_VECTOR - a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) #else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) #endif @@ -69,28 +69,28 @@ void STDDataStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); - } +} template void STDDataStream::mul() { // b[i] = scalar * c[i]; std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); - } +} template void STDDataStream::add() { // c[i] = a[i] + b[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); - } +} template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); - } +} template void STDDataStream::nstream() @@ -101,7 +101,7 @@ void STDDataStream::nstream() // 2: a[i] += scalar * c[i]; 
std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); - } +} template diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index e50c95d8..911a621b 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -23,7 +23,7 @@ class STDDataStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector> a, b, c; + std::vector a, b, c; #else T *a, *b, *c; #endif diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index f9397fab..6ea3362e 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -22,11 +22,21 @@ #define END(x) ((x) + array_size) #endif +#ifdef USE_VECTOR +#if (defined(__NVCOMPILER) || defined(__NVCOMPILER_LLVM__)) +#error "std::vector *is* supported in NVHPC if we capture `this`, however, oneDPL (via SYCL2020) only works correctly with explicit *value* captures." 
+#endif + +#if defined(USE_ONEDPL) +#error "std::vector is unspported: oneDPL (via SYCL2020) only works correctly with explicit *value* captures" +#endif +#endif + template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, range(0, array_size), #ifdef USE_VECTOR - a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) #else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) #endif @@ -77,7 +87,7 @@ void STDIndicesStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); - } +} template void STDIndicesStream::mul() @@ -86,7 +96,7 @@ void STDIndicesStream::mul() std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [c = this->c, scalar = startScalar](int i) { return scalar * c[i]; }); - } +} template void STDIndicesStream::add() @@ -95,7 +105,7 @@ void STDIndicesStream::add() std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [a = this->a, b = this->b](int i) { return a[i] + b[i]; }); - } +} template void STDIndicesStream::triad() @@ -104,7 +114,7 @@ void STDIndicesStream::triad() std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [b = this->b, c = this->c, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); - } +} template void STDIndicesStream::nstream() @@ -116,7 +126,7 @@ void STDIndicesStream::nstream() std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); - } +} template diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index a955374f..0916ef22 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -78,7 +78,7 @@ class STDIndicesStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector> a, b, c; + std::vector a, b, c; #else 
T *a, *b, *c; #endif diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 9063ff20..a8a13490 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -26,7 +26,7 @@ template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, #ifdef USE_VECTOR - a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) #else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) #endif @@ -89,7 +89,7 @@ void STDRangesStream::copy() c[i] = a[i]; } ); - } +} template void STDRangesStream::mul() @@ -103,7 +103,7 @@ void STDRangesStream::mul() b[i] = scalar * c[i]; } ); - } +} template void STDRangesStream::add() @@ -115,7 +115,7 @@ void STDRangesStream::add() c[i] = a[i] + b[i]; } ); - } +} template void STDRangesStream::triad() @@ -129,7 +129,7 @@ void STDRangesStream::triad() a[i] = b[i] + scalar * c[i]; } ); - } +} template void STDRangesStream::nstream() @@ -143,7 +143,7 @@ void STDRangesStream::nstream() a[i] += b[i] + scalar * c[i]; } ); - } +} template T STDRangesStream::dot() diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 21902c6c..9d36d46b 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -22,7 +22,7 @@ class STDRangesStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector> a, b, c; + std::vector a, b, c; #else T *a, *b, *c; #endif From 80853e66e07faa97779495a49f4f8f1fec5433f4 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 30 Jul 2022 08:04:03 +0100 Subject: [PATCH 28/89] Don't include C++17 execution headers directly --- src/std-indices/STDIndicesStream.cpp | 4 ---- src/std-ranges/STDRangesStream.cpp | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 6ea3362e..6e135976 
100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -6,10 +6,6 @@ #include "STDIndicesStream.h" -#include -#include -#include - #ifndef ALIGNMENT #define ALIGNMENT (2*1024*1024) // 2MB #endif diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index a8a13490..e05a7d1c 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -6,10 +6,6 @@ #include "STDRangesStream.hpp" -#include -#include -#include - #ifndef ALIGNMENT #define ALIGNMENT (2*1024*1024) // 2MB #endif From 370d378fbc3d8581482f69b221f8febd97da61ff Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 17 Aug 2022 15:09:00 +0100 Subject: [PATCH 29/89] Don't use Kokkos internal headers Don't initialise kokkos view to zero in ctor Upgrade std to 17 for Kokkos (<17 is warning in 3.6, error is develop) --- src/kokkos/KokkosStream.cpp | 6 +++--- src/kokkos/KokkosStream.hpp | 2 -- src/kokkos/model.cmake | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index 00efe92c..04e0dafd 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -14,9 +14,9 @@ KokkosStream::KokkosStream( { Kokkos::initialize(); - d_a = new Kokkos::View("d_a", ARRAY_SIZE); - d_b = new Kokkos::View("d_b", ARRAY_SIZE); - d_c = new Kokkos::View("d_c", ARRAY_SIZE); + d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE); + d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE); + d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE); hm_a = new typename Kokkos::View::HostMirror(); hm_b = new typename Kokkos::View::HostMirror(); hm_c = new typename Kokkos::View::HostMirror(); diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 3aa7cf5f..d7333a71 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -10,8 +10,6 @@ 
#include #include -#include -#include #include "Stream.h" diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 445991d4..46c773d9 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -17,7 +17,7 @@ set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) macro(setup) - set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD 17) cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") From 1d8e383a29f7a3fc98cdfd5b8082a3366c62675c Mon Sep 17 00:00:00 2001 From: Rob Jones <62852815+robj0nes@users.noreply.github.com> Date: Mon, 12 Sep 2022 10:58:47 +0100 Subject: [PATCH 30/89] In-package Kokkos builds Updating kokkos/model.cmake to allow for in-package builds (eg. Spack) --- src/kokkos/model.cmake | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 445991d4..a95fdba6 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -1,16 +1,17 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and RAJA. See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" "c++") -register_flag_required(KOKKOS_IN_TREE +register_flag_optional(KOKKOS_IN_TREE "Absolute path to the *source* distribution directory of Kokkos. Remember to append Kokkos specific flags as well, for example: - -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... 
+ See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") - See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options") +register_flag_optional(KOKKOS_IN_PACKAGE + "Use if Kokkos is part of a package dependency: + Path to package R-Path containing Kokkos libs" "") # compiler vendor and arch specific flags set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) @@ -20,13 +21,18 @@ macro(setup) set(CMAKE_CXX_STANDARD 14) cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md - message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") if (EXISTS "${KOKKOS_IN_TREE}") + message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) register_link_library(Kokkos::kokkos) - else () - message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist") + elseif (EXISTS "${KOKKOS_IN_PACKAGE}") + message(STATUS "Building using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") + set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos") + find_package(Kokkos REQUIRED) + register_link_library(Kokkos::kokkos) + else() + message(FATAL_ERROR "Neither `${KOKKOS_IN_TREE}`, or `${KOKKOS_IN_PACKAGE}` exists") endif () register_append_compiler_and_arch_specific_cxx_flags( @@ -36,5 +42,3 @@ macro(setup) ) endmacro() - - From 407d6701dfb01d8ac3262cac2cc4dbcf4f2b590e Mon Sep 17 00:00:00 2001 From: Rob Jones <62852815+robj0nes@users.noreply.github.com> Date: Thu, 15 Sep 2022 11:32:23 +0100 Subject: [PATCH 31/89] In-package linking to RAJA Introduced RAJA_IN_PACKAGE to allow for linking to an in-package dependency of RAJA (eg. 
Spack) --- src/raja/model.cmake | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/raja/model.cmake b/src/raja/model.cmake index 4da4af6b..b1e7750d 100644 --- a/src/raja/model.cmake +++ b/src/raja/model.cmake @@ -1,18 +1,19 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and RAJA. See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install" "c++") -register_flag_required(RAJA_IN_TREE +register_flag_optional(RAJA_IN_TREE "Absolute path to the *source* distribution directory of RAJA. Make sure to use the release version of RAJA or clone RAJA recursively with submodules. Remember to append RAJA specific flags as well, for example: - -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ... - See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options -") +" "") + +register_flag_optional(RAJA_IN_PACKAGE + "Use if Raja is part of a package dependency: + Path to installation" "") register_flag_optional(TARGET "Target offload device, implemented values are CPU, NVIDIA" @@ -76,8 +77,14 @@ macro(setup) register_link_library(RAJA) # RAJA's cmake screws with where the binary will end up, resetting it here: set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + + elseif (EXISTS "${RAJA_IN_PACKAGE}") + message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`") + find_package(RAJA REQUIRED) + register_link_library(RAJA) + else () - message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist") + message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists") endif () From 1c46f8efd9f2bc4a31cca7c72902740d4af178e9 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 1 Oct 2022 04:56:03 +0800 Subject: [PATCH 32/89] Bump rust-stream dependencies --- src/rust/rust-stream/Cargo.lock | 662 +++++++++++++++++++++++++++----- src/rust/rust-stream/Cargo.toml | 18 +- 2 files changed, 577 
insertions(+), 103 deletions(-) diff --git a/src/rust/rust-stream/Cargo.lock b/src/rust/rust-stream/Cargo.lock index 5f225f03..723849ad 100644 --- a/src/rust/rust-stream/Cargo.lock +++ b/src/rust/rust-stream/Cargo.lock @@ -4,13 +4,131 @@ version = 3 [[package]] name = "ansi_term" -version = "0.11.0" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "async-attributes" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "async-channel" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14485364214912d3b19cc3435dde4df66065127f05fa0d75c712f36f12c2f28" +dependencies = [ + "concurrent-queue", + "event-listener", + "futures-core", +] + +[[package]] +name = "async-executor" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +checksum = "871f9bb5e0a22eeb7e8cf16641feb87c9dc67032ccf8ff49e772eb9941d3a965" dependencies = [ + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "once_cell", + "slab", +] + +[[package]] +name = "async-global-executor" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0da5b41ee986eed3f524c380e6d64965aea573882a8907682ad100f7859305ca" +dependencies = [ + "async-channel", + "async-executor", + "async-io", + "async-lock", + "blocking", + "futures-lite", + "once_cell", +] + +[[package]] +name = "async-io" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e21f3a490c72b3b0cf44962180e60045de2925d8dff97918f7ee43c8f637c7" +dependencies = [ + 
"autocfg", + "concurrent-queue", + "futures-lite", + "libc", + "log", + "once_cell", + "parking", + "polling", + "slab", + "socket2", + "waker-fn", "winapi 0.3.9", ] +[[package]] +name = "async-lock" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e97a171d191782fba31bb902b14ad94e24a68145032b7eedf871ab0bc0d077b6" +dependencies = [ + "event-listener", +] + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-attributes", + "async-channel", + "async-global-executor", + "async-io", + "async-lock", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + "futures-lite", + "gloo-timers", + "kv-log-macro", + "log", + "memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + +[[package]] +name = "async-task" +version = "4.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a40729d2133846d9ed0ea60a8b9541bccddab49cd30f0715a1da672fe9a2524" + +[[package]] +name = "atomic-waker" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "065374052e7df7ee4047b1160cca5e1467a12351a40b3da123c870ba0b8eda2a" + [[package]] name = "atty" version = "0.2.14" @@ -24,9 +142,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bitflags" @@ -34,6 +152,38 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "blocking" +version = "1.2.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6ccb65d468978a086b69884437ded69a90faab3bbe6e67f242173ea728acccc" +dependencies = [ + "async-channel", + "async-task", + "atomic-waker", + "fastrand", + "futures-lite", + "once_cell", +] + +[[package]] +name = "bumpalo" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" + +[[package]] +name = "cache-padded" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + [[package]] name = "cfg-if" version = "1.0.0" @@ -42,9 +192,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "2.33.3" +version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ "ansi_term", "atty", @@ -64,6 +214,15 @@ dependencies = [ "crossterm", ] +[[package]] +name = "concurrent-queue" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af4780a44ab5696ea9e28294517f1fffb421a83a25af521333c838635509db9c" +dependencies = [ + "cache-padded", +] + [[package]] name = "core_affinity" version = "0.5.10" @@ -78,9 +237,9 @@ dependencies = [ [[package]] name = "crossbeam" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845" +checksum = 
"2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" dependencies = [ "cfg-if", "crossbeam-channel", @@ -92,9 +251,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.1" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if", "crossbeam-utils", @@ -102,9 +261,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -113,22 +272,22 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.5" +version = "0.9.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348" dependencies = [ + "autocfg", "cfg-if", "crossbeam-utils", - "lazy_static", "memoffset", "scopeguard", ] [[package]] name = "crossbeam-queue" -version = "0.3.2" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b10ddc024425c88c2ad148c1b0fd53f4c6d38db9697c9f1588381212fa657c9" +checksum = "1cd42583b04998a5363558e5f9291ee5a5ff6b49944332103f251e7479a82aa7" dependencies = [ "cfg-if", "crossbeam-utils", @@ -136,12 +295,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.5" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +checksum = 
"edbafec5fa1f196ca66527c1b12c2ec4745ca14b50f1ad8f9f6f720b55d11fac" dependencies = [ "cfg-if", - "lazy_static", ] [[package]] @@ -169,11 +327,158 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "ctor" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdffe87e1d521a10f9696f833fe502293ea446d7f256c06128293a4119bdf4cb" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "either" -version = "1.6.1" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "fastrand" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +dependencies = [ + "instant", +] + +[[package]] +name = "futures" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" + +[[package]] +name = "futures-executor" +version = "0.3.24" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" + +[[package]] +name = "futures-lite" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694489acd39452c77daa48516b894c153f192c3578d5a839b62c58099fcbf48" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-macro" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" + +[[package]] +name = "futures-task" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" + +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + +[[package]] +name = "futures-util" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + 
"futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gloo-timers" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fb7d06c1c8cc2a29bee7ec961009a0b2caa0793ee4900c2ffb348734ba1c8f9" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] [[package]] name = "heck" @@ -202,6 +507,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "js-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +dependencies = [ + "wasm-bindgen", +] + [[package]] name = "kernel32-sys" version = "0.2.2" @@ -212,6 +526,15 @@ dependencies = [ "winapi-build", ] +[[package]] +name = "kv-log-macro" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" +dependencies = [ + "log", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -220,33 +543,41 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.108" +version = "0.2.134" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119" +checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" [[package]] name = "lock_api" -version = "0.4.5" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" dependencies = [ + "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.14" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", + "value-bag", ] +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + [[package]] name = "memoffset" -version = "0.6.4" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" dependencies = [ "autocfg", ] @@ -275,32 +606,44 @@ dependencies = [ [[package]] name = "ntapi" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f" dependencies = [ "winapi 0.3.9", ] [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", ] [[package]] name = "num_cpus" -version = "1.13.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi", "libc", ] +[[package]] +name = "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "parking" +version = "2.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" + [[package]] name = "parking_lot" version = "0.11.2" @@ -327,12 +670,29 @@ dependencies = [ ] [[package]] -name = "pest" -version = "2.1.3" +name = "pin-project-lite" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "polling" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +checksum = "899b00b9c8ab553c743b3e11e87c5c7d423b2a2de229ba95b24a756344748011" dependencies = [ - "ucd-trie", + "autocfg", + "cfg-if", + "libc", + "log", + "wepoll-ffi", + "winapi 0.3.9", ] [[package]] @@ -361,27 +721,27 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.32" +version = "1.0.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43" +checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] name = "quote" -version = "1.0.10" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.5.1" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +checksum = 
"bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ "autocfg", "crossbeam-deque", @@ -391,31 +751,43 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.1" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "lazy_static", "num_cpus", ] [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] [[package]] name = "rstest" -version = "0.10.0" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b939295f93cb1d12bc1a83cf9ee963199b133fb8a79832dd51b68bb9f59a04dc" +dependencies = [ + "async-std", + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", +] + +[[package]] +name = "rstest_macros" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af" +checksum = "f78aba848123782ba59340928ec7d876ebe745aa0365d6af8a630f19a5c16116" dependencies = [ "cfg-if", "proc-macro2", @@ -426,7 +798,7 @@ dependencies = [ [[package]] name = "rust-stream" -version = "3.4.0" +version = "4.0.0" dependencies = [ "colour", "core_affinity", @@ -443,18 +815,18 @@ dependencies = [ [[package]] name = "rustc_version" -version = "0.3.3" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" +checksum = 
"bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ "semver", ] [[package]] name = "rustversion" -version = "1.0.5" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" +checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" [[package]] name = "scopeguard" @@ -464,21 +836,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "semver" -version = "0.11.0" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", -] - -[[package]] -name = "semver-parser" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" -dependencies = [ - "pest", -] +checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" [[package]] name = "signal-hook" @@ -500,11 +860,30 @@ dependencies = [ "libc", ] +[[package]] +name = "slab" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +dependencies = [ + "autocfg", +] + [[package]] name = "smallvec" -version = "1.7.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" +checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" + +[[package]] +name = "socket2" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" +dependencies = [ + "libc", + "winapi 0.3.9", +] [[package]] name = "strsim" @@ -514,9 +893,9 @@ checksum = 
"8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "structopt" -version = "0.3.25" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b9788f4202aa75c240ecc9c15c65185e6a39ccdeb0fd5d008b98825464c87c" +checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" dependencies = [ "clap", "lazy_static", @@ -538,20 +917,20 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.82" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" +checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] [[package]] name = "tabular" -version = "0.1.4" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7e35bee02dcefe64a74065b6b869d241eab1a02fea0d65e6074ce4e51894c3b" +checksum = "d9a2882c514780a1973df90de9d68adcd8871bacc9a6331c3f28e6d2ff91a3d1" dependencies = [ "unicode-width", ] @@ -566,28 +945,32 @@ dependencies = [ ] [[package]] -name = "ucd-trie" -version = "0.1.3" +name = "unicode-ident" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" +checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" [[package]] name = "unicode-segmentation" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" +checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a" [[package]] name = "unicode-width" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] -name = "unicode-xid" -version = "0.2.2" +name = "value-bag" +version = "1.0.0-alpha.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55" +dependencies = [ + "ctor", + "version_check", +] [[package]] name = "vec_map" @@ -597,9 +980,100 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" [[package]] name = "version_check" -version = "0.9.3" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "waker-fn" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" + +[[package]] +name = "wasm-bindgen" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = 
"0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" +checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" + +[[package]] +name = "web-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "wepoll-ffi" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d743fdedc5c64377b5fc2bc036b01c7fd642205a0d96356034ae3404d49eb7fb" +dependencies = [ + "cc", +] [[package]] name = "winapi" diff --git a/src/rust/rust-stream/Cargo.toml b/src/rust/rust-stream/Cargo.toml index 8ac456f2..d93a84f5 100644 --- a/src/rust/rust-stream/Cargo.toml +++ b/src/rust/rust-stream/Cargo.toml @@ -7,19 +7,19 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -num-traits = "0.2.14" -structopt = "0.3.13" -tabular = "0.1.4" -rayon = "1.5.1" -crossbeam = "0.8.1" -num_cpus = "1.13.0" -rustversion = "1.0" -libc = "0.2.97" +num-traits = "0.2.15" +structopt = "0.3.26" +tabular = "0.2.0" +rayon = "1.5.3" +crossbeam = "0.8.2" +num_cpus = "1.13.1" +rustversion = "1.0.9" +libc = "0.2.134" 
core_affinity = "0.5.10" colour = "0.6.0" [dev-dependencies] -rstest = "0.10.0" +rstest = "0.13.0" [build-dependencies] rustversion = "1.0" From a075455ad45f00be73b10cc54a8369397221c691 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Sat, 30 Apr 2022 21:59:45 -0500 Subject: [PATCH 33/89] Add tuned benchmark kernels Co-authored-by: Nick Curtis --- src/hip/HIPStream.cpp | 210 +++++++++++++++++++++++++++++++++--------- src/hip/HIPStream.h | 33 +++++++ src/hip/model.cmake | 15 ++- 3 files changed, 212 insertions(+), 46 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 6aed1ee1..dcf634e5 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -9,7 +9,32 @@ #include "hip/hip_runtime.h" #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 + +#ifdef NONTEMPORAL +template +__device__ __forceinline__ T load(const T& ref) +{ + return __builtin_nontemporal_load(&ref); +} + +template +__device__ __forceinline__ void store(const T& value, T& ref) +{ + __builtin_nontemporal_store(value, &ref); +} +#else +template +__device__ __forceinline__ T load(const T& ref) +{ + return ref; +} + +template +__device__ __forceinline__ void store(const T& value, T& ref) +{ + ref = value; +} +#endif void check_error(void) { @@ -23,15 +48,27 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) + : array_size{ARRAY_SIZE}, + block_count(array_size / (TBSIZE * elements_per_lane * chunks_per_block)) { - // The array size must be divisible by TBSIZE for kernel launches - if (ARRAY_SIZE % TBSIZE != 0) + std::cerr << "Elements per lane: " << elements_per_lane << std::endl; + std::cerr << "Chunks per block: " << chunks_per_block << std::endl; + // The array size must be divisible by total number of elements + // moved per block for kernel launches + if (ARRAY_SIZE % (TBSIZE * elements_per_lane * chunks_per_block) != 0) { std::stringstream ss; - ss << "Array size must be a 
multiple of " << TBSIZE; + ss << "Array size must be a multiple of elements operated on per block (" + << TBSIZE * elements_per_lane * chunks_per_block + << ")."; throw std::runtime_error(ss.str()); } + std::cerr << "block count " << block_count << std::endl; + +#ifdef NONTEMPORAL + std::cerr << "Using non-temporal memory operations." << std::endl; +#endif // Set device int count; @@ -49,7 +86,7 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + sums = (T*)malloc(block_count*sizeof(T)); // Check buffers fit on the device hipDeviceProp_t props; @@ -64,7 +101,7 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) check_error(); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); - hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + hipMalloc(&d_sum, block_count*sizeof(T)); check_error(); } @@ -115,68 +152,115 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } - -template -__global__ void copy_kernel(const T * a, T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void copy_kernel(const T * __restrict a, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__global__ void mul_kernel(T 
* b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void mul_kernel(T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - b[i] = scalar * c[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(scalar * load(c[gidx + i * dx + j]), b[gidx + i * dx + j]); + } + } } template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__global__ void add_kernel(const T * a, const T * b, T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i] + b[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(load(a[gidx + i * dx + j]) + load(b[gidx + i * dx + j]), c[gidx + i * dx + j]); + } + } } template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__global__ void triad_kernel(T * a, const T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ +void triad_kernel(T * __restrict a, 
const T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - a[i] = b[i] + scalar * c[i]; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t i = 0; i != chunks_per_block; ++i) + { + for (size_t j = 0; j != elements_per_lane; ++j) + { + store(load(b[gidx + i * dx + j]) + scalar * load(c[gidx + i * dx + j]), a[gidx + i * dx + j]); + } + } } template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -199,42 +283,78 @@ void HIPStream::nstream() check_error(); } -template -__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) +template +struct Reducer { - __shared__ T tb_sum[TBSIZE]; - - int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - const size_t local_i = hipThreadIdx_x; + template + __device__ + static + void reduce(I it) noexcept + { + if (n == 1) return; - tb_sum[local_i] = 0.0; - for (; i < array_size; i += hipBlockDim_x*hipGridDim_x) - tb_sum[local_i] += a[i] * b[i]; +#if defined(__HIP_PLATFORM_NVCC__) + constexpr unsigned int warpSize = 32; +#endif + constexpr bool is_same_warp{n <= warpSize * 2}; + if (static_cast(threadIdx.x) < n/2) + { + it[threadIdx.x] += it[threadIdx.x + n/2]; + } + is_same_warp ? 
__threadfence_block() : __syncthreads(); + Reducer::reduce(it); + } +}; + +template<> +struct Reducer<1u> { + template + __device__ + static + void reduce(I) noexcept + {} +}; + +template +__launch_bounds__(TBSIZE) +__global__ +__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum) +{ + __shared__ T tb_sum[TBSIZE]; + const size_t tidx = threadIdx.x; + const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; + const size_t gidx = (tidx + blockIdx.x * blockDim.x) * elements_per_lane; - for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2) + T tmp{0}; + for (size_t i = 0; i != chunks_per_block; ++i) { - __syncthreads(); - if (local_i < offset) + for (size_t j = 0; j != elements_per_lane; ++j) { - tb_sum[local_i] += tb_sum[local_i+offset]; + tmp += load(a[gidx + i * dx + j]) * load(b[gidx + i * dx + j]); } } + tb_sum[tidx] = tmp; + __syncthreads(); - if (local_i == 0) - sum[hipBlockIdx_x] = tb_sum[local_i]; + Reducer<>::reduce(tb_sum); + if (tidx) return; + store(tb_sum[0], sum[blockIdx.x]); } template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size); + hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_sum); check_error(); - hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost); + hipMemcpy(sums, d_sum, block_count*sizeof(T), hipMemcpyDeviceToHost); check_error(); T sum = 0.0; - for (int i = 0; i < DOT_NUM_BLOCKS; i++) + for (int i = 0; i < block_count; i++) sum += sums[i]; return sum; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 44a2893d..ecdf929f 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -18,9 +18,42 @@ template class HIPStream : public Stream { +#ifdef __HIP_PLATFORM_NVCC__ + #ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 1 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 8 + #endif +#else + 
#ifndef DWORDS_PER_LANE + #define DWORDS_PER_LANE 4 + #endif + #ifndef CHUNKS_PER_BLOCK + #define CHUNKS_PER_BLOCK 1 + #endif +#endif + // Make sure that either: + // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // or + // DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DWORDS_PER_LANE not divisible by sizeof(element_type)"); + + static constexpr unsigned int chunks_per_block{CHUNKS_PER_BLOCK}; + static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE}; + // Take into account the datatype size + // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements + // and 4 FP32 elements + static constexpr unsigned int elements_per_lane{ + (DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( + DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; + protected: // Size of arrays int array_size; + int block_count; // Host array for partial sums for dot kernel T *sums; diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 78150c4b..3ffaf7a5 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,6 +2,19 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") +register_flag_optional(USE_NONTEMPORAL_MEM + "Flag indicating to use non-temporal memory accesses to bypass cache." + "OFF") + +# TODO: Better flag descriptions +register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of double data types per wavefront lane." 4) +register_flag_optional(CHUNKS_PER_BLOCK "Flag indicating the chunks per block." 1) + macro(setup) - # nothing to do here as hipcc does everything correctly, what a surprise! 
+ # Ensure we set the proper preprocessor directives + if (USE_NONTEMPORAL_MEM) + add_definitions(-DNONTEMPORAL) + endif () + register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) + register_definitions(CHUNKS_PER_BLOCK=${CHUNKS_PER_BLOCK}) endmacro() \ No newline at end of file From bcf8708f2c294187390e69d9b825b2e7dc709001 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Tue, 31 May 2022 11:29:42 -0500 Subject: [PATCH 34/89] Clean up kernels and drop unneeded modifications --- src/hip/HIPStream.cpp | 187 ++++++++++++------------------------------ src/hip/HIPStream.h | 7 -- src/hip/model.cmake | 12 +-- 3 files changed, 53 insertions(+), 153 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index dcf634e5..eac77b45 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -10,31 +10,6 @@ #define TBSIZE 1024 -#ifdef NONTEMPORAL -template -__device__ __forceinline__ T load(const T& ref) -{ - return __builtin_nontemporal_load(&ref); -} - -template -__device__ __forceinline__ void store(const T& value, T& ref) -{ - __builtin_nontemporal_store(value, &ref); -} -#else -template -__device__ __forceinline__ T load(const T& ref) -{ - return ref; -} - -template -__device__ __forceinline__ void store(const T& value, T& ref) -{ - ref = value; -} -#endif void check_error(void) { @@ -49,27 +24,23 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) : array_size{ARRAY_SIZE}, - block_count(array_size / (TBSIZE * elements_per_lane * chunks_per_block)) + block_count(array_size / (TBSIZE * elements_per_lane)) { std::cerr << "Elements per lane: " << elements_per_lane << std::endl; std::cerr << "Chunks per block: " << chunks_per_block << std::endl; // The array size must be divisible by total number of elements // moved per block for kernel launches - if (ARRAY_SIZE % (TBSIZE * elements_per_lane * chunks_per_block) != 0) + if (ARRAY_SIZE % (TBSIZE * 
elements_per_lane) != 0) { std::stringstream ss; ss << "Array size must be a multiple of elements operated on per block (" - << TBSIZE * elements_per_lane * chunks_per_block + << TBSIZE * elements_per_lane << ")."; throw std::runtime_error(ss.str()); } std::cerr << "block count " << block_count << std::endl; -#ifdef NONTEMPORAL - std::cerr << "Using non-temporal memory operations." << std::endl; -#endif - // Set device int count; hipGetDeviceCount(&count); @@ -86,7 +57,8 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(block_count*sizeof(T)); + hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); + check_error(); // Check buffers fit on the device hipDeviceProp_t props; @@ -101,15 +73,14 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) check_error(); hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); - hipMalloc(&d_sum, block_count*sizeof(T)); - check_error(); } template HIPStream::~HIPStream() { - free(sums); + hipHostFree(sums); + check_error(); hipFree(d_a); check_error(); @@ -117,15 +88,13 @@ HIPStream::~HIPStream() check_error(); hipFree(d_c); check_error(); - hipFree(d_sum); - check_error(); } template __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) { - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + const size_t i = blockDim.x * blockIdx.x + threadIdx.x; a[i] = initA; b[i] = initB; c[i] = initC; @@ -152,26 +121,20 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void copy_kernel(const T * __restrict a, T * __restrict c) { - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != 
elements_per_lane; ++j) - { - store(load(a[gidx + i * dx + j]), c[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_a, d_c); @@ -180,27 +143,21 @@ void HIPStream::copy() check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void mul_kernel(T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(scalar * load(c[gidx + i * dx + j]), b[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_b, d_c); @@ -209,26 +166,20 @@ void HIPStream::mul() check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c) { - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(load(a[gidx + i * dx + j]) + load(b[gidx + i * dx + j]), c[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_a, d_b, 
d_c); @@ -237,27 +188,21 @@ void HIPStream::add() check_error(); } -template +template __launch_bounds__(TBSIZE) __global__ void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t i = 0; i != chunks_per_block; ++i) - { - for (size_t j = 0; j != elements_per_lane; ++j) - { - store(load(b[gidx + i * dx + j]) + scalar * load(c[gidx + i * dx + j]), a[gidx + i * dx + j]); - } - } + for (size_t j = 0; j < elements_per_lane; ++j) + a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), dim3(block_count), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); @@ -266,91 +211,63 @@ void HIPStream::triad() check_error(); } -template -__global__ void nstream_kernel(T * a, const T * b, const T * c) +template +__launch_bounds__(TBSIZE) +__global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) { const T scalar = startScalar; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - a[i] += b[i] + scalar * c[i]; + const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + for (size_t j = 0; j < elements_per_lane; ++j) + a[gidx + j] += b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::nstream() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c); + hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), + dim3(block_count), + dim3(TBSIZE), + 0, 0, d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -struct Reducer -{ - template - __device__ - static - void reduce(I it) noexcept - { - if (n == 1) return; - -#if defined(__HIP_PLATFORM_NVCC__) - constexpr unsigned int warpSize = 
32; -#endif - constexpr bool is_same_warp{n <= warpSize * 2}; - if (static_cast(threadIdx.x) < n/2) - { - it[threadIdx.x] += it[threadIdx.x + n/2]; - } - is_same_warp ? __threadfence_block() : __syncthreads(); - Reducer::reduce(it); - } -}; - -template<> -struct Reducer<1u> { - template - __device__ - static - void reduce(I) noexcept - {} -}; - -template +template __launch_bounds__(TBSIZE) -__global__ -__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum) +__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum, int array_size) { __shared__ T tb_sum[TBSIZE]; - const size_t tidx = threadIdx.x; - const size_t dx = (blockDim.x * gridDim.x) * elements_per_lane; - const size_t gidx = (tidx + blockIdx.x * blockDim.x) * elements_per_lane; - T tmp{0}; - for (size_t i = 0; i != chunks_per_block; ++i) + const size_t local_i = threadIdx.x; + size_t i = blockDim.x * blockIdx.x + local_i; + + tb_sum[local_i] = 0.0; + for (size_t j = 0; j < elements_per_lane && i < array_size; ++j, i += blockDim.x*gridDim.x) + tb_sum[local_i] += a[i] * b[i]; + + for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2) { - for (size_t j = 0; j != elements_per_lane; ++j) + __syncthreads(); + if (local_i < offset) { - tmp += load(a[gidx + i * dx + j]) * load(b[gidx + i * dx + j]); + tb_sum[local_i] += tb_sum[local_i+offset]; } } - tb_sum[tidx] = tmp; - __syncthreads(); - Reducer<>::reduce(tb_sum); - if (tidx) return; - store(tb_sum[0], sum[blockIdx.x]); + if (local_i == 0) + sum[blockIdx.x] = tb_sum[local_i]; } template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), dim3(block_count), dim3(TBSIZE), - 0, 0, d_a, d_b, d_sum); + 0, 0, d_a, d_b, sums, array_size); check_error(); - - hipMemcpy(sums, d_sum, block_count*sizeof(T), hipMemcpyDeviceToHost); + hipDeviceSynchronize(); check_error(); T sum = 0.0; diff --git a/src/hip/HIPStream.h 
b/src/hip/HIPStream.h index ecdf929f..7bce0b54 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -22,16 +22,10 @@ class HIPStream : public Stream #ifndef DWORDS_PER_LANE #define DWORDS_PER_LANE 1 #endif - #ifndef CHUNKS_PER_BLOCK - #define CHUNKS_PER_BLOCK 8 - #endif #else #ifndef DWORDS_PER_LANE #define DWORDS_PER_LANE 4 #endif - #ifndef CHUNKS_PER_BLOCK - #define CHUNKS_PER_BLOCK 1 - #endif #endif // Make sure that either: // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element @@ -41,7 +35,6 @@ class HIPStream : public Stream (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), "DWORDS_PER_LANE not divisible by sizeof(element_type)"); - static constexpr unsigned int chunks_per_block{CHUNKS_PER_BLOCK}; static constexpr unsigned int dwords_per_lane{DWORDS_PER_LANE}; // Take into account the datatype size // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 3ffaf7a5..2f7d69e2 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,19 +2,9 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") -register_flag_optional(USE_NONTEMPORAL_MEM - "Flag indicating to use non-temporal memory accesses to bypass cache." - "OFF") - -# TODO: Better flag descriptions -register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of double data types per wavefront lane." 4) -register_flag_optional(CHUNKS_PER_BLOCK "Flag indicating the chunks per block." 1) +register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 
4) macro(setup) # Ensure we set the proper preprocessor directives - if (USE_NONTEMPORAL_MEM) - add_definitions(-DNONTEMPORAL) - endif () register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) - register_definitions(CHUNKS_PER_BLOCK=${CHUNKS_PER_BLOCK}) endmacro() \ No newline at end of file From f98aedf64d0f62764550d93ee0f1458be2146efd Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 11 Aug 2022 10:09:57 -0500 Subject: [PATCH 35/89] Use triple-chevron syntax for hip kernel launching --- src/hip/HIPStream.cpp | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index eac77b45..ce691726 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -103,7 +103,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) template void HIPStream::init_arrays(T initA, T initB, T initC) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); check_error(); hipDeviceSynchronize(); check_error(); @@ -134,10 +134,7 @@ void copy_kernel(const T * __restrict a, T * __restrict c) template void HIPStream::copy() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -157,10 +154,7 @@ void mul_kernel(T * __restrict b, const T * __restrict c) template void HIPStream::mul() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -179,10 +173,7 @@ void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c template void HIPStream::add() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), 
- dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -202,10 +193,7 @@ void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict template void HIPStream::triad() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -224,10 +212,7 @@ __global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T template void HIPStream::nstream() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, d_c); + nstream_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -262,10 +247,7 @@ __global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * _ template T HIPStream::dot() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel), - dim3(block_count), - dim3(TBSIZE), - 0, 0, d_a, d_b, sums, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); check_error(); hipDeviceSynchronize(); check_error(); From de93c06e78a7051cfed4a44626ac6fc599f5c24d Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 11 Aug 2022 10:32:20 -0500 Subject: [PATCH 36/89] Add clarifying comment and further clean-up --- src/hip/HIPStream.cpp | 8 ++++---- src/hip/HIPStream.h | 1 - src/hip/model.cmake | 1 - 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index ce691726..37fce3b5 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -27,8 +27,6 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) block_count(array_size / (TBSIZE * elements_per_lane)) { - std::cerr << "Elements per lane: " << elements_per_lane << std::endl; - std::cerr << "Chunks per block: " << chunks_per_block << 
std::endl; // The array size must be divisible by total number of elements // moved per block for kernel launches if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0) @@ -39,7 +37,6 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) << ")."; throw std::runtime_error(ss.str()); } - std::cerr << "block count " << block_count << std::endl; // Set device int count; @@ -56,7 +53,10 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; - // Allocate the host array for partial sums for dot kernels + // Allocate the host array for partial sums for dot kernels using hipHostMalloc. + // This creates an array on the host which is visible to the device. However, it requires + // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host + // after it has been passed through to a kernel. hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); check_error(); diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 7bce0b54..305e9376 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -55,7 +55,6 @@ class HIPStream : public Stream T *d_a; T *d_b; T *d_c; - T *d_sum; public: diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 2f7d69e2..19e6fd09 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -5,6 +5,5 @@ register_flag_required(CMAKE_CXX_COMPILER register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 
4) macro(setup) - # Ensure we set the proper preprocessor directives register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) endmacro() \ No newline at end of file From f44cd6fdd2bf434b91e40e0b117af0e6f05b578a Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Mon, 5 Sep 2022 15:43:37 -0700 Subject: [PATCH 37/89] Roll back modifications for copy, mul, add, and triad --- src/hip/HIPStream.cpp | 50 +++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 37fce3b5..cc1d21f8 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -124,17 +124,19 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector template __launch_bounds__(TBSIZE) __global__ -void copy_kernel(const T * __restrict a, T * __restrict c) +void copy_kernel(const T * a, T * c) { - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - c[gidx + j] = a[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + c[i] = a[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -143,18 +145,20 @@ void HIPStream::copy() template __launch_bounds__(TBSIZE) __global__ -void mul_kernel(T * __restrict b, const T * __restrict c) +void mul_kernel(T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - b[gidx + j] = scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + b[i] = scalar * c[i]; + // const size_t gidx = (threadIdx.x + 
blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -163,17 +167,19 @@ void HIPStream::mul() template __launch_bounds__(TBSIZE) __global__ -void add_kernel(const T * __restrict a, const T * __restrict b, T * __restrict c) +void add_kernel(const T * a, const T * b, T * c) { - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - c[gidx + j] = a[gidx + j] + b[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + c[i] = a[i] + b[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -182,18 +188,20 @@ void HIPStream::add() template __launch_bounds__(TBSIZE) __global__ -void triad_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) +void triad_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + a[i] = b[i] + scalar * c[i]; + // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; + // for (size_t j = 0; j < elements_per_lane; ++j) + // a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); @@ -220,7 
+228,7 @@ void HIPStream::nstream() template __launch_bounds__(TBSIZE) -__global__ void dot_kernel(const T * __restrict a, const T * __restrict b, T * __restrict sum, int array_size) +__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { __shared__ T tb_sum[TBSIZE]; From 85d80915f60272b08bd41d5baae561d392003ed3 Mon Sep 17 00:00:00 2001 From: Thomas Gibson <14180421+thomasgibson@users.noreply.github.com> Date: Thu, 8 Sep 2022 11:44:37 -0500 Subject: [PATCH 38/89] Simplify/roll back unneeded modifications --- src/hip/HIPStream.cpp | 86 ++++++++++++++++++------------------------- src/hip/HIPStream.h | 32 ++++++---------- src/hip/model.cmake | 4 +- 3 files changed, 48 insertions(+), 74 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index cc1d21f8..7fc732de 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -23,17 +23,23 @@ void check_error(void) template HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) - : array_size{ARRAY_SIZE}, - block_count(array_size / (TBSIZE * elements_per_lane)) { + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + // The array size must be divisible by total number of elements - // moved per block for kernel launches - if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0) + // moved per block for the dot kernel + if (ARRAY_SIZE % (TBSIZE * dot_elements_per_lane) != 0) { std::stringstream ss; - ss << "Array size must be a multiple of elements operated on per block (" - << TBSIZE * elements_per_lane + ss << "Array size for the dot kernel must be a multiple of elements operated on per block (" + << TBSIZE * dot_elements_per_lane << ")."; throw std::runtime_error(ss.str()); } @@ -52,12 +58,13 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) std::cout << "Driver: " << 
getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; + dot_num_blocks = array_size / (TBSIZE * dot_elements_per_lane); // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host // after it has been passed through to a kernel. - hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); + hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent); check_error(); // Check buffers fit on the device @@ -121,113 +128,90 @@ void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void copy_kernel(const T * a, T * c) +template +__global__ void copy_kernel(const T * a, T * c) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // c[gidx + j] = a[gidx + j]; } template void HIPStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void mul_kernel(T * b, const T * c) +template +__global__ void mul_kernel(T * b, const T * c) { const T scalar = startScalar; const size_t i = threadIdx.x + blockIdx.x * blockDim.x; b[i] = scalar * c[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // b[gidx + j] = scalar * c[gidx + j]; } template void HIPStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void add_kernel(const T * a, const T * b, T * c) +template +__global__ 
void add_kernel(const T * a, const T * b, T * c) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; c[i] = a[i] + b[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // c[gidx + j] = a[gidx + j] + b[gidx + j]; } template void HIPStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ -void triad_kernel(T * a, const T * b, const T * c) +template +__global__ void triad_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; const size_t i = threadIdx.x + blockIdx.x * blockDim.x; a[i] = b[i] + scalar * c[i]; - // const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - // for (size_t j = 0; j < elements_per_lane; ++j) - // a[gidx + j] = b[gidx + j] + scalar * c[gidx + j]; } template void HIPStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) -__global__ void nstream_kernel(T * __restrict a, const T * __restrict b, const T * __restrict c) +template +__global__ void nstream_kernel(T * a, const T * b, const T * c) { const T scalar = startScalar; - const size_t gidx = (threadIdx.x + blockIdx.x * blockDim.x) * elements_per_lane; - for (size_t j = 0; j < elements_per_lane; ++j) - a[gidx + j] += b[gidx + j] + scalar * c[gidx + j]; + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + a[i] += b[i] + scalar * c[i]; } template void HIPStream::nstream() { - nstream_kernel<<>>(d_a, d_b, d_c); + nstream_kernel<<>>(d_a, d_b, d_c); check_error(); hipDeviceSynchronize(); check_error(); } -template -__launch_bounds__(TBSIZE) +template __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) { __shared__ T tb_sum[TBSIZE]; @@ -236,7 +220,7 @@ __global__ void 
dot_kernel(const T * a, const T * b, T * sum, int array_size) size_t i = blockDim.x * blockIdx.x + local_i; tb_sum[local_i] = 0.0; - for (size_t j = 0; j < elements_per_lane && i < array_size; ++j, i += blockDim.x*gridDim.x) + for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2) @@ -255,13 +239,13 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) template T HIPStream::dot() { - dot_kernel<<>>(d_a, d_b, sums, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); check_error(); hipDeviceSynchronize(); check_error(); T sum = 0.0; - for (int i = 0; i < block_count; i++) + for (int i = 0; i < dot_num_blocks; i++) sum += sums[i]; return sum; diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 305e9376..3c603e0b 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -14,39 +14,31 @@ #include "Stream.h" #define IMPLEMENTATION_STRING "HIP" +#define DOT_READ_DWORDS_PER_LANE 4 + template class HIPStream : public Stream { -#ifdef __HIP_PLATFORM_NVCC__ - #ifndef DWORDS_PER_LANE - #define DWORDS_PER_LANE 1 - #endif -#else - #ifndef DWORDS_PER_LANE - #define DWORDS_PER_LANE 4 - #endif -#endif // Make sure that either: - // DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element + // DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element // or - // DWORDS_PER_LANE is divisible by sizeof(T) - static_assert((DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || - (DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), - "DWORDS_PER_LANE not divisible by sizeof(element_type)"); + // DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T) + static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), + "DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)"); - static constexpr unsigned int 
dwords_per_lane{DWORDS_PER_LANE}; // Take into account the datatype size - // That is, if we specify 4 DWORDS_PER_LANE, this is 2 FP64 elements + // That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements // and 4 FP32 elements - static constexpr unsigned int elements_per_lane{ - (DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( - DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; + static constexpr unsigned int dot_elements_per_lane{ + (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( + DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; protected: // Size of arrays int array_size; - int block_count; + int dot_num_blocks; // Host array for partial sums for dot kernel T *sums; diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 19e6fd09..78150c4b 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,8 +2,6 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") -register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 4) - macro(setup) - register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) + # nothing to do here as hipcc does everything correctly, what a surprise! endmacro() \ No newline at end of file From 66491909e45d61c52cde88456681e6f6dcc9c0a3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Nov 2022 06:48:39 +0200 Subject: [PATCH 39/89] BabelStream Fortran This is a new implementation of BabelStream using Fortran. The code uses a Fortran driver that is largely equivalent to the C++ one, with a few exceptions. First, it does not use a C++ class for the stream object, since that doesn't seem like a useful way to do things in Fortran. Instead, I use a module that contains the same methods, and which has alloc and dealloc that act like CTOR and DTOR. 
The current implementations are: - DO CONCURRENT - Fortran array notation - Sequential DO loops - OpenACC parallel loop - OpenACC kernels on Fortran array notation - OpenMP parallel do - OpenMP taskloop - OpenMP target teams distribute parallel do simd - OpenMP target teams loop - CUDA Fortran (handwritten CUDA Fortran kernels, except DOT) - CUDA Fortran kernels (!$cuf kernel do <<<*,*>>>) I have tested with GCC, Intel (ifort and ifx), and NVHPC compilers on AArch64, x86_64 and NVIDIA GPU targets, although not exhaustively. Cray and Fujitsu have been tested as well. The only untested compiler of significance is IBM XLF. The current build system is GNU Make, and requires the user to manually specify the compiler and implementation. CSV printing is supported. Squashed commit of the following: commit 15f13ef9d326102cc003b2fdfe1b31c4aea55373 Author: Jeff Hammond Date: Tue Nov 15 06:42:46 2022 +0200 8 cores unless user changes commit 62ca680546ff89a1987b6fb797273038f767bf7b Author: Jeff Hammond Date: Tue Nov 15 06:42:09 2022 +0200 hoist and disable orin flags commit 76495509abcdb0686f293a72f7ded7c8ed7bb882 Author: Jeff Hammond Date: Tue Nov 15 06:40:13 2022 +0200 cleanup scripts commit 5b45df87954282cbb6b0f7eb2dcb3570d08bb5c2 Author: Jeff Hammond Date: Tue Nov 15 06:39:31 2022 +0200 add autopar flag for GCC commit 87eb07e4a8c3e8d6247ab5f72e14bf90002733ce Merge: a732e7c 270644e Author: Jeff Hammond Date: Wed Nov 9 15:53:41 2022 +0200 Merge remote-tracking branch 'origin/fortran_compiler_details' into fortran-ports commit a732e7c49e12ce8aff15e9d4bcbd215fa4a05d82 Merge: cfafd99 5697d94 Author: Jeff Hammond Date: Wed Nov 9 15:53:36 2022 +0200 Merge remote-tracking branch 'origin/fortran_int32_option' into fortran-ports commit cfafd993b646d5f5a90eb6d37d347cc545ab36d4 Merge: de5ff67 26a9707 Author: Jeff Hammond Date: Wed Nov 9 15:53:25 2022 +0200 Merge remote-tracking branch 'origin/fortran_csv' into fortran-ports commit de5ff6772b2036ad259a6a9c331ff5408146b54c Merge: 3109653 
1d0755f Author: Jeff Hammond Date: Wed Nov 9 15:51:40 2022 +0200 Merge branch 'UoB-HPC:main' into fortran-ports commit 310965399a9b518122ff610b61419cdaab75ecd0 Author: Jeff Hammond Date: Mon Sep 26 03:39:01 2022 -0700 because gomp so conflict commit 270644e6fb89e8f3c3bfe4d73c9896fc3094d761 Author: Jeff Hammond Date: Fri Sep 16 11:46:49 2022 +0300 add compiler info flag commit 5697d94a9ce5162de9445f5fde76f8020eae8b83 Author: Jeff Hammond Date: Sun Sep 4 13:59:57 2022 +0300 implement INT32 indexing commit 830ad58dd2c985b9a2425093c0eed9ec1c7887dd Author: Jeff Hammond Date: Sun Sep 4 13:49:17 2022 +0300 remove swear words from debugging commit 26a9707a1f09249d04206adf647587e42cf5fab5 Author: Jeff Hammond Date: Sun Sep 4 13:47:18 2022 +0300 add an option for giga/gibi-bytes commit 4f6d693c03ca1b092d3bf003cdfcc367b8ad86ac Author: Jeff Hammond Date: Sun Sep 4 13:41:32 2022 +0300 CSV output seems done Signed-off-by: Jeff Hammond commit 94e62be05c11b9ef208f7ad09402ddf26e4586ae Merge: ad52adc 772c183 Author: Jeff Hammond Date: Sun Sep 4 12:59:01 2022 +0300 Merge branch 'fortran_nan_check' into fortran_csv commit 772c183de2fb1a8ea72ae7ef3c45c17895c4fdc9 Author: Jeff Hammond Date: Sun Sep 4 10:44:26 2022 +0300 fixed NaN check commit ad52adc9ba6eb702c0fefdf1d9a8d1830b74830b Author: Jeff Hammond Date: Sun Sep 4 10:28:00 2022 +0300 CSV WIP commit 6f7cefc42ca286ae3b698d827fd7c9ee14984ecb Author: Jeff Hammond Date: Sun Sep 4 10:08:14 2022 +0300 update help output commit 208207597d150fafa059ca593ac30bc9a2e6d1a7 Author: Jeff Hammond Date: Sun Sep 4 10:02:24 2022 +0300 add option for cpu_time intrinsic timer also adjust use statements and rename macro for OpenMP timer Signed-off-by: Jeff Hammond commit 78fa2fcb1087f00efd94dd911000dc0d485da406 Author: Jeff Hammond Date: Tue Aug 30 17:19:36 2022 +0300 add check for normal (not NaN, not Inf, not denormal) the previous error check failed to detect garbage results because comparisons against NaN always return true.
i flipped the logical comparison and added a check for IEEE normal to prevent this. it works on the case that was missed previously. Signed-off-by: Jeff Hammond commit 22fc9fe918a378f47c88dbad3ce91a4a6688789b Author: Jeff Hammond Date: Tue Aug 30 17:19:30 2022 +0300 move commit d2d8c8555d2665fc553f9263a6767843ec14def8 Author: Jeff Hammond Date: Tue Aug 30 16:29:15 2022 +0300 so far so good commit ffe181536b78ef845f861a09ca0dc72d4fffcbe8 Author: Jeff Hammond Date: Tue Aug 30 16:29:09 2022 +0300 so far so good commit aa72b46a8187792ca819f9720c032e802525413a Author: Jeff Hammond Date: Tue Aug 30 16:28:52 2022 +0300 GPU on by default commit 0fc9e4acdd0fbb5b6d9399962fc6a1daaa4a84da Author: Jeff Hammond Date: Thu Aug 25 16:38:08 2022 +0300 better commit b1cbd6f5b6a7534502d29e14d1c09fa6be378dd8 Merge: bf14601 5fe03c6 Author: Jeff Hammond Date: Thu Aug 25 16:35:22 2022 +0300 Merge branch 'fortran-ports' of https://github.com/jeffhammond/BabelStream into fortran-ports commit bf146011d6ee1ac9dd0cb6d43bb4e60b8cc37acf Author: Jeff Hammond Date: Thu Aug 25 16:35:07 2022 +0300 autodetect GPU arch in build (who needs CMake?) 
commit 5fe03c664e318a33bd0d383fddf8e76a2266a4e0 Author: Jeff Hammond Date: Thu Aug 25 15:57:41 2022 +0300 be smarter and check for compilers in path commit a187612a68447302fbd036d717df53b2780df3b4 Author: Jeff Hammond Date: Thu Aug 25 15:35:58 2022 +0300 remove samsung paths commit 82af886943a67980dda1724edae7686c6d280e1e Merge: a46bf6b 0f59b50 Author: Jeff Hammond Date: Wed Aug 24 13:22:13 2022 +0300 merge fix plus build updates commit 0f59b5014477c9a3da5eeb97328e6c55554a8c24 Author: Jeff Hammond Date: Wed Aug 24 08:43:19 2022 +0000 typo in USE_OPENMP_TIMERS commit 4a9a0019585b0f03c42f151042ad592cba03d8b3 Author: Jeff Hammond Date: Wed Aug 24 08:42:59 2022 +0000 logic fix commit 74d8123864fdb603b409112f5b9c0e92c2a93071 Author: Jeff Hammond Date: Wed Aug 24 03:05:58 2022 -0500 no-gpu option commit dc1e39ff34e384ae66f50ab787e9ca8c92701c3b Author: Jeff Hammond Date: Wed Aug 24 03:05:17 2022 -0500 fix default case commit 0b2b0e0bb754b0ac86dd16eeb30db092a3b3e658 Author: Jeff Hammond Date: Wed Aug 24 02:57:02 2022 -0500 fix tp for aarch64 commit 1e213bec76d2e7f5f161a18eb365f2948563c925 Author: Jeff Hammond Date: Wed Aug 24 07:46:41 2022 +0000 fix MARCH and build.sh elif commit a46bf6b48eb730a2fa08ccd8dddd04725fe25371 Author: Jeff Hammond Date: Tue Aug 23 16:43:22 2022 +0300 orin updates commit a9fe9c028c08b9f0d468ee56f24970817087099d Merge: 2ab14de 9f4bee4 Author: Jeff Hammond Date: Tue Aug 23 06:32:01 2022 -0700 more CPU specialization fixes commit 2ab14de1535f71fd1b548a10585b035ed88daa26 Author: Jeff Hammond Date: Tue Aug 23 06:30:37 2022 -0700 more CPU specialization fixes commit 9f4bee439c36b592321f4af38235450cfb23cdf2 Author: Jeff Hammond Date: Tue Aug 23 16:12:13 2022 +0300 build and run updates commit aeff0854478e5f16536b11034f459ea387a222a2 Author: Jeff Hammond Date: Tue Aug 23 15:56:25 2022 +0300 aesthetics commit 89b1ab01369cd71d5bbb837474799c75eabd64b5 Author: Jeff Hammond Date: Tue Aug 23 15:56:08 2022 +0300 handle march flag better commit 
a284bfa6da9bbb1aa9de5e8d40b74c316e90f3c6 Author: Jeff Hammond Date: Tue Aug 23 15:56:04 2022 +0300 handle march flag better commit c18c3945eb053581f2cdf528961f158c4aa66271 Author: Jeff Hammond Date: Tue Aug 23 15:53:11 2022 +0300 handle march flag better commit a3a8ccf453a2ff7cc99a774b5a6262648690f7c8 Author: Jeff Hammond Date: Tue Aug 23 05:29:41 2022 -0700 brewster updates commit 1364c4100f4bb6241e2db5805a64625a66c9d2fa Author: Tom Deakin Date: Sun Aug 21 17:16:20 2022 +0100 Add Fujitsu compiler flags commit b82fe2cb38cab940d0bebf613e22ea9685a21d06 Author: Jeff Hammond Date: Sun Aug 21 15:40:28 2022 +0300 FJ timer workaround commit c1b2fa81155c4d6a3717793c5670b1b0d4cf6101 Author: Jeff Hammond Date: Sun Aug 21 15:29:13 2022 +0300 intel update/fix commit 063ef879d9c3a3010a0be3b9baad7600f62e52bf Author: Jeff Hammond Date: Sun Aug 21 04:43:29 2022 -0700 NERSC AMD compiler commit 2c68292667b62f3428fc8cf4dfa874a5b44e625d Merge: 2bdbbe8 ca98948 Author: Jeff Hammond Date: Sun Aug 21 02:12:12 2022 -0700 Merge branch 'fortran-ports' of https://github.com/jeffhammond/BabelStream into fortran-ports commit 2bdbbe81d782268fd7f48889fd6eeea32d5f1f58 Author: Jeff Hammond Date: Sun Aug 21 02:11:27 2022 -0700 AMD ROCM build commit ca9894801fdcca705e5d06c703af3a0f4e888c01 Author: Jeff Hammond Date: Sun Aug 21 09:10:16 2022 +0000 AWS stuff commit 4c539efda9522810dadc64c65339ce22ea6822b4 Author: Jeff Hammond Date: Sun Aug 21 09:09:59 2022 +0000 merge commit c3830658f8d403f602f3270b8f34b6ebd405c3e3 Author: Jeff Hammond Date: Sun Aug 21 02:08:46 2022 -0700 NERSC stuff commit 7d7f746206e1ace8753778fcd2416d5ae30b7470 Merge: 1fefb8e d929852 Author: Jeff Hammond Date: Sat Aug 20 20:56:09 2022 -0700 Merge branch 'fortran-ports' of https://github.com/jeffhammond/BabelStream into fortran-ports commit 1fefb8e657764b43cbcaf63278e051ead53bd29a Author: Jeff Hammond Date: Sat Aug 20 20:55:16 2022 -0700 Cray temp stuff commit d92985239b31e16d478ca3a8a740baba2c35c164 Author: Jeff Hammond Date: Fri
Aug 19 02:11:07 2022 -0700 Xeon stuff commit 3f19e451bbc856ed6aa221077e51bd0578e48426 Merge: 38f28e1 c8dd609 Author: Jeff Hammond Date: Thu Aug 18 13:56:37 2022 +0000 Merge branch 'fortran-ports' of https://github.com/jeffhammond/BabelStream into fortran-ports commit 38f28e193c76970e5b6f641b437c6faefb9c608b Author: Jeff Hammond Date: Thu Aug 18 13:54:12 2022 +0000 TARGET for cpu too commit 6be181a07a93281a51cb897edf703404ead2c83e Author: Jeff Hammond Date: Thu Aug 18 13:52:58 2022 +0000 AWS flags commit e88479e09176510f707e410a4e69ea5290b2619e Author: Jeff Hammond Date: Thu Aug 18 13:52:42 2022 +0000 ARM stuff for AWS commit 1ee26cb3675b5e2739ddc21f56a1a864ff681950 Author: Jeff Hammond Date: Thu Aug 18 13:52:24 2022 +0000 disable shared for portability commit c8dd6099d95792b17abbcb025f771c3ae0ed773e Merge: 8bda56d 1b67999 Author: Jeff Hammond Date: Thu Aug 18 15:23:16 2022 +0300 Merge branch 'UoB-HPC:main' into fortran-ports commit 8bda56dd9053fdacc77aac572401bc4c7806efa0 Author: Jeff Hammond Date: Wed Aug 17 03:07:13 2022 -0700 add Cray compiler to build system - ignore temp files generated by Cray Fortran - workaround Cray not having reduce commit 3a0fec620d7ce5317a3260826087a26e0faee36c Author: Jeff Hammond Date: Wed Aug 17 02:09:19 2022 -0700 remove LOCAL, which causes problems commit e5a70ddbd995567c28a4c74373481c01a7489c88 Author: Jeff Hammond Date: Wed Aug 10 22:26:50 2022 +0300 add a way to use managed/device for everything DC uses managed by default. no way to not use it and be strictly standard right now. managed affects performance in some cases, so we want to compare apples-to-apples. thanks to Jeff Larkin for helping with this. 
Signed-off-by: Jeff Hammond commit 8fe956ab62737aecdec1ce7785a659587d814653 Author: Jeff Hammond Date: Wed Aug 10 22:26:41 2022 +0300 only do GPU flag for IFX commit de49723a7ae864847a2136353a45a49502291373 Author: Jeff Hammond Date: Wed Aug 10 22:26:23 2022 +0300 helper scripts commit e0971aa15d6fac2bc1de6e5080b53f7288975fe9 Author: Jeff Hammond Date: Wed Aug 10 22:26:21 2022 +0300 helper scripts commit a7ba50a60d321cab8e0f63d841b893c01a7df6b6 Author: Jeff Hammond Date: Wed Aug 10 12:29:28 2022 +0300 remove all the compiled intermediates with wildcard commit 31a594e82ec7b75d626639948eba532d503c4d81 Author: Jeff Hammond Date: Fri Aug 5 03:31:32 2022 -0700 build stuff commit 2cd3acd0f3cee82b60e5b05ac8dc01da3452bd1f Author: Jeff Hammond Date: Fri Aug 5 02:09:17 2022 -0700 build all with unique names commit ac230d127e15bdc9e56450862f9627d55da37f59 Author: Jeff Hammond Date: Fri Aug 5 09:28:03 2022 +0300 fix make clean commit bd0ef7736a43e26864167eb61345704731acbefa Author: Jeff Hammond Date: Fri Aug 5 09:24:12 2022 +0300 build check update commit 662520c4e443b841a88f1a4fe833bdb77b7cfd45 Author: Jeff Hammond Date: Fri Aug 5 09:21:48 2022 +0300 CUDA kernel version commit 25c321987b349f85a13f0140ae316382aa71e601 Author: Jeff Hammond Date: Fri Aug 5 09:15:32 2022 +0300 fixed CUDA Fortran dot commit 64612d2604401c2f200a3689b4248ecf7c93adaf Author: Jeff Hammond Date: Fri Aug 5 09:10:49 2022 +0300 CUDA Fortran working except DOT commit 4d35fe51a22978cc77bdd6311b7d15654856c564 Author: Jeff Hammond Date: Fri Aug 5 08:48:17 2022 +0300 CUDA Fortran is not compiling yet commit 0967c36695518a0c7bf7ee4c62a412f51338708e Author: Jeff Hammond Date: Fri Aug 5 07:50:40 2022 +0300 workshare commit 3ed69ea9ea655c364181144f21f0bfc0d3afa13c Author: Jeff Hammond Date: Fri Aug 5 07:42:49 2022 +0300 target loop commit 30dfb574c0c4435f09fc5a6e53644f9ab7fd95f3 Author: Jeff Hammond Date: Fri Aug 5 07:31:41 2022 +0300 OpenMP target commit a5306ce5c1144f38223074b786240db07a66b6bf Author: Jeff 
Hammond Date: Fri Aug 5 07:17:58 2022 +0300 makefile errors on non support commit 854c8135f5d80d5cecce22042d761a3f75a5ee13 Author: Jeff Hammond Date: Fri Aug 5 07:15:12 2022 +0300 fix taskloop commit f2894c583346410e14461988d68012d8469e583c Author: Jeff Hammond Date: Fri Aug 5 07:11:26 2022 +0300 add taskloop part 1 commit b7c0a43e9b49eed7ee54a4b4a8470118c092a922 Author: Jeff Hammond Date: Fri Aug 5 07:07:54 2022 +0300 add OpenMP traditional commit 7dafcc385f547738b9972f98b9e93f87e22d468c Author: Jeff Hammond Date: Fri Aug 5 07:02:36 2022 +0300 add OpenACC kernels + Array implementation commit 096e7d281015b09e5a099e5a1eb8b9b3e46cea5f Author: Jeff Hammond Date: Fri Aug 5 06:53:13 2022 +0300 formatting commit 284b62b47e508799dc49c85ea3d7a8d1f34f87a9 Author: Jeff Hammond Date: Thu Aug 4 19:41:27 2022 +0300 add placeholder for CSV commit 516bdd5929a13c17348040b031931485ca32e40e Author: Jeff Hammond Date: Thu Aug 4 19:14:00 2022 +0300 add --float commit d4e0ccaf6c00e6109e6130b3fd7c604df6feaa28 Author: Jeff Hammond Date: Thu Aug 4 19:13:23 2022 +0300 default message updates commit e8452f1c2e30fb84533b75a43ac9f5f265c96f60 Author: Jeff Hammond Date: Thu Aug 4 17:58:48 2022 +0300 list devices etc commit a80e82c323a5b0d1bffc524a8219de51cbdba8d2 Author: Jeff Hammond Date: Thu Aug 4 14:07:02 2022 +0300 better build system commit c3b090cf1f28641a9e34e331ab37cb055e82eec4 Author: Jeff Hammond Date: Thu Aug 4 14:03:27 2022 +0300 refactor build system commit 096cd43b7bc49751c17d686519620a7a4b1e5677 Author: Jeff Hammond Date: Thu Aug 4 13:43:17 2022 +0300 cleanup the rest commit 1e4fb8125e0729b32e8ec6d87f30d935310f55ca Author: Jeff Hammond Date: Thu Aug 4 13:40:38 2022 +0300 add Intel build and fix syntax issues commit db3a9307b57bbc82456f9d52a6ff20d6e37b4083 Author: Jeff Hammond Date: Thu Aug 4 13:34:43 2022 +0300 use modern character syntax commit b66bd707d64a1823a3a7bd8a2c6acf30ce9043be Author: Jeff Hammond Date: Thu Aug 4 12:10:59 2022 +0300 printing commit
ff842f62b952b5a61decfac80fd9b51dc56546d3 Author: Jeff Hammond Date: Thu Aug 4 11:06:43 2022 +0300 build stuff commit 05791085dd4cdde8b07f5b33a78ce051f4c8dd1d Author: Jeff Hammond Date: Wed Aug 3 20:10:33 2022 +0300 add OpenACC commit bb76b757a2765640b4a7bfb8d2d4850f96c478f7 Author: Jeff Hammond Date: Wed Aug 3 20:04:12 2022 +0300 better clean commit 2f53530d0f7f3d0cb4e138e2d76c325d81bbab8d Author: Jeff Hammond Date: Wed Aug 3 20:03:04 2022 +0300 Sequential loop Stream commit f5c0eaee60b04dfeabd96750c8b34694d2757f54 Author: Jeff Hammond Date: Wed Aug 3 19:56:54 2022 +0300 add array notation commit 76f836b1836b83006285ef69b4457abea39b400d Author: Jeff Hammond Date: Wed Aug 3 10:05:46 2022 +0300 implement BabelStream in Fortran 1. only DO CONCURRENT is supported right now. 2. the structure mostly matches C++ except we do not make a stream class. 3. there is no option for float versus double right now. it will be a compile-time choice later. Signed-off-by: Jeff Hammond --- .gitignore | 6 + src/.gitignore | 4 +- src/fortran/ArrayStream.F90 | 105 ++++ src/fortran/BabelStreamTypes.F90 | 21 + src/fortran/CUDAKernelStream.F90 | 230 +++++++++ src/fortran/CUDAStream.F90 | 309 ++++++++++++ src/fortran/DoConcurrentStream.F90 | 139 ++++++ src/fortran/Makefile | 109 ++++ src/fortran/OpenACCArrayStream.F90 | 144 ++++++ src/fortran/OpenACCStream.F90 | 161 ++++++ src/fortran/OpenMPStream.F90 | 137 +++++ src/fortran/OpenMPTargetLoopStream.F90 | 162 ++++++ src/fortran/OpenMPTargetStream.F90 | 163 ++++++ src/fortran/OpenMPTaskloopStream.F90 | 169 +++++++ src/fortran/OpenMPWorkshareStream.F90 | 120 +++++ src/fortran/SequentialStream.F90 | 130 +++++ src/fortran/build.sh | 54 ++ src/fortran/main.F90 | 666 +++++++++++++++++++++++++ src/fortran/make.inc.amd | 25 + src/fortran/make.inc.arm | 39 ++ src/fortran/make.inc.cray | 18 + src/fortran/make.inc.fj | 21 + src/fortran/make.inc.gcc | 33 ++ src/fortran/make.inc.nvhpc | 70 +++ src/fortran/make.inc.oneapi | 32 ++ src/fortran/run.sh | 35 ++ 
26 files changed, 3101 insertions(+), 1 deletion(-) create mode 100644 src/fortran/ArrayStream.F90 create mode 100644 src/fortran/BabelStreamTypes.F90 create mode 100644 src/fortran/CUDAKernelStream.F90 create mode 100644 src/fortran/CUDAStream.F90 create mode 100644 src/fortran/DoConcurrentStream.F90 create mode 100644 src/fortran/Makefile create mode 100644 src/fortran/OpenACCArrayStream.F90 create mode 100644 src/fortran/OpenACCStream.F90 create mode 100644 src/fortran/OpenMPStream.F90 create mode 100644 src/fortran/OpenMPTargetLoopStream.F90 create mode 100644 src/fortran/OpenMPTargetStream.F90 create mode 100644 src/fortran/OpenMPTaskloopStream.F90 create mode 100644 src/fortran/OpenMPWorkshareStream.F90 create mode 100644 src/fortran/SequentialStream.F90 create mode 100755 src/fortran/build.sh create mode 100644 src/fortran/main.F90 create mode 100644 src/fortran/make.inc.amd create mode 100644 src/fortran/make.inc.arm create mode 100644 src/fortran/make.inc.cray create mode 100644 src/fortran/make.inc.fj create mode 100644 src/fortran/make.inc.gcc create mode 100644 src/fortran/make.inc.nvhpc create mode 100644 src/fortran/make.inc.oneapi create mode 100755 src/fortran/run.sh diff --git a/.gitignore b/.gitignore index 012d0e8b..59ea5dbb 100644 --- a/.gitignore +++ b/.gitignore @@ -10,12 +10,18 @@ sycl-stream hip-stream tbb-stream +src/fortran/BabelStream +src/fortran/BabelStream.* + *.o *.bc *.sycl *.tar *.gz *.a +*.mod +*.cub +*.ptx KokkosCore_config.* diff --git a/src/.gitignore b/src/.gitignore index 568a9534..9d8b17b3 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -16,6 +16,8 @@ **/*.gz **/*.a +**/*.swp + **/KokkosCore_Config_* **/.DS_Store @@ -26,4 +28,4 @@ cmake-build-*/ CMakeFiles/ .idea/ .vscode/ -.directory \ No newline at end of file +.directory diff --git a/src/fortran/ArrayStream.F90 b/src/fortran/ArrayStream.F90 new file mode 100644 index 00000000..5a8d5bc4 --- /dev/null +++ b/src/fortran/ArrayStream.F90 @@ -0,0 +1,105 @@ +module 
ArrayStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=5), parameter :: implementation_name = "Array" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a5)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a5)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + A = initA + B = initB + C = initC + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + h_A = A + h_B = B + h_C = C + end subroutine read_arrays + + subroutine copy() + implicit none + C = A + end subroutine copy + + subroutine add() + implicit none + C = A + B + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + B = scalar * C + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = B + scalar * C + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + A = A + B + scalar * C + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + s = dot_product(A,B) + end function dot + +end module ArrayStream diff --git a/src/fortran/BabelStreamTypes.F90 b/src/fortran/BabelStreamTypes.F90 new file mode 100644 index 00000000..dd01d35a --- /dev/null +++ b/src/fortran/BabelStreamTypes.F90 @@ -0,0 +1,21 @@ +module BabelStreamTypes + use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32 + + implicit none + +#ifdef USE_FLOAT + integer, parameter :: StreamRealKind = REAL32 + character(len=6) :: StreamRealName = "REAL32" +#else + integer, parameter :: StreamRealKind = REAL64 + character(len=6) :: StreamRealName = "REAL64" +#endif + +#ifdef USE_INT32 +#warning There is no checking for overflowing INT32, so be careful. 
+ integer, parameter :: StreamIntKind = INT32 +#else + integer, parameter :: StreamIntKind = INT64 +#endif + +end module BabelStreamTypes diff --git a/src/fortran/CUDAKernelStream.F90 b/src/fortran/CUDAKernelStream.F90 new file mode 100644 index 00000000..01668ead --- /dev/null +++ b/src/fortran/CUDAKernelStream.F90 @@ -0,0 +1,230 @@ +module CUDAKernelStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=10), parameter :: implementation_name = "CUDAKernel" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." + stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer(kind=StreamIntKind) :: i + integer :: err + !$cuf kernel do <<< *, * >>> + do i=1,N + C(i) = A(i) + B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end 
subroutine add + + subroutine mul(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + B(i) = scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + integer :: err + scalar = startScalar + !$cuf kernel do <<< *, * >>> + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() result(r) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer(kind=StreamIntKind) :: i + integer :: err + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') 
cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAKernelStream diff --git a/src/fortran/CUDAStream.F90 b/src/fortran/CUDAStream.F90 new file mode 100644 index 00000000..208f1aa3 --- /dev/null +++ b/src/fortran/CUDAStream.F90 @@ -0,0 +1,309 @@ +module CUDAFortranKernels + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + contains + + attributes(global) subroutine do_copy(n,A,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + endif + end subroutine do_copy + + attributes(global) subroutine do_add(n,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + C(i) = A(i) + B(i) + endif + end subroutine do_add + + attributes(global) subroutine do_mul(n,scalar,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: B(n) + real(kind=REAL64), intent(in) :: C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + B(i) = scalar * C(i) + endif + end subroutine do_mul + + attributes(global) subroutine do_triad(n,scalar,A,B,C) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(out) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = B(i) + scalar * C(i) + endif + end subroutine do_triad + + attributes(global) subroutine do_nstream(n,scalar,A,B,C) + 
implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in), value :: scalar + real(kind=REAL64), intent(inout) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + integer(kind=StreamIntKind) :: i + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= N) then + A(i) = A(i) + B(i) + scalar * C(i) + endif + end subroutine do_nstream + +#if 0 + attributes(global) subroutine do_dot(n,A,B,r) + implicit none + integer(kind=StreamIntKind), intent(in), value :: n + real(kind=REAL64), intent(in) :: A(n), B(n) + real(kind=REAL64), intent(out) :: r + integer(kind=StreamIntKind) :: i + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + end subroutine do_dot +#endif + +end module CUDAFortranKernels + +module CUDAStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + use cudafor, only: dim3 + + implicit none + + character(len=4), parameter :: implementation_name = "CUDA" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_MANAGED + real(kind=REAL64), allocatable, managed :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#endif + + type(dim3) :: grid, tblock + + contains + + subroutine list_devices() + use cudafor + implicit none + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use cudafor + implicit none + integer, intent(in) :: dev + integer :: num, err + err = cudaGetDeviceCount(num) + if (err.ne.0) then + write(*,'(a)') "cudaGetDeviceCount failed" + write(*,'(a)') cudaGetErrorString(err) + stop + else if (num.eq.0) then + write(*,'(a17)') "No devices found." 
+ stop + else if (dev.ge.num) then + write(*,'(a21)') "Invalid device index." + stop + else + err = cudaSetDevice(dev) + if (err.ne.0) then + write(*,'(a)') "cudaSetDevice failed" + write(*,'(a)') cudaGetErrorString(err) + stop + end if + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + ! move to separate subroutine later + tblock = dim3(128,1,1) + grid = dim3(ceiling(real(N)/tblock%x),1,1) + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + integer :: err + A = initA + B = initB + C = initC + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + integer :: err + h_A = A + h_B = B + h_C = C + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine read_arrays + + subroutine copy() + use CUDAFortranKernels, only: do_copy + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_copy<<>>(N, A, C) + err = cudaDeviceSynchronize() + if 
(err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine copy + + subroutine add() + use CUDAFortranKernels, only: do_add + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + integer :: err + call do_add<<>>(N, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine add + + subroutine mul(startScalar) + use CUDAFortranKernels, only: do_mul + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_mul<<>>(N, scalar, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine mul + + subroutine triad(startScalar) + use CUDAFortranKernels, only: do_triad + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_triad<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine triad + + subroutine nstream(startScalar) + use CUDAFortranKernels, only: do_nstream + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer :: err + scalar = startScalar + call do_nstream<<>>(N, scalar, A, B, C) + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end subroutine nstream + + function dot() 
result(r) + !use CUDAFortranKernels, only: do_dot + use cudafor, only: cudaDeviceSynchronize, cudaGetErrorString + implicit none + real(kind=REAL64) :: r + integer :: err + integer(kind=StreamIntKind) :: i + !call do_dot<<>>(N, B, C, r) + r = real(0,kind=REAL64) + !$cuf kernel do <<< *, * >>> + do i=1,N + r = r + A(i) * B(i) + end do + err = cudaDeviceSynchronize() + if (err.ne.0) then + write(*,'(a)') "cudaDeviceSynchronize failed" + write(*,'(a)') cudaGetErrorString(err) + stop + endif + end function dot + +end module CUDAStream diff --git a/src/fortran/DoConcurrentStream.F90 b/src/fortran/DoConcurrentStream.F90 new file mode 100644 index 00000000..781210d3 --- /dev/null +++ b/src/fortran/DoConcurrentStream.F90 @@ -0,0 +1,139 @@ +module DoConcurrentStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "DoConcurrent" + + integer(kind=StreamIntKind) :: N + +#ifdef USE_DEVICE + real(kind=REAL64), allocatable, device :: A(:), B(:), C(:) +#else + real(kind=REAL64), allocatable :: A(:), B(:), C(:) +#endif + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,B,C) + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,C) + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + do concurrent (i=1:N) !shared(A,B,C) + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(B,C) + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(A,B,C) + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do concurrent (i=1:N) !shared(A,B,C) + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + ! 
reduction omitted because NVF infers it and other compilers do not support + s = real(0,kind=REAL64) +#ifdef CRAY_THREAD_DOCONCURRENT + do i=1,N +#else + do concurrent (i=1:N) !shared(A,B) +#endif + s = s + A(i) * B(i) + end do + end function dot + +end module DoConcurrentStream diff --git a/src/fortran/Makefile b/src/fortran/Makefile new file mode 100644 index 00000000..18685d46 --- /dev/null +++ b/src/fortran/Makefile @@ -0,0 +1,109 @@ +ifeq ($(COMPILER),nvhpc) + include make.inc.nvhpc +else ifeq ($(COMPILER),oneapi) + include make.inc.oneapi +else ifeq ($(COMPILER),gcc) + include make.inc.gcc +else ifeq ($(COMPILER),amd) + include make.inc.amd +else ifeq ($(COMPILER),arm) + include make.inc.arm +else ifeq ($(COMPILER),cray) + include make.inc.cray +else ifeq ($(COMPILER),fj) + include make.inc.fj +else + $(info Set COMPILER={nvhpc,oneapi,amd,arm,cray,fj,gcc}. Default is gcc.) + include make.inc.gcc + COMPILER=gcc +endif + +FCFLAGS += -DVERSION_STRING="4.0" +#FCFLAGS += -DUSE_INT32 + +ifeq ($(IMPLEMENTATION),DoConcurrent) + FCFLAGS += -DUSE_DOCONCURRENT $(DOCONCURRENT_FLAG) + IMPLEMENTATION_OBJECT = DoConcurrentStream.o + +else ifeq ($(IMPLEMENTATION),Array) + FCFLAGS += -DUSE_ARRAY $(ARRAY_FLAG) + IMPLEMENTATION_OBJECT = ArrayStream.o + +else ifeq ($(IMPLEMENTATION),OpenMP) + FCFLAGS += -DUSE_OPENMP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPWorkshare) + FCFLAGS += -DUSE_OPENMPWORKSHARE $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPWorkshareStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTarget) + FCFLAGS += -DUSE_OPENMPTARGET $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTargetStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTargetLoop) + FCFLAGS += -DUSE_OPENMPTARGETLOOP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTargetLoopStream.o + +else ifeq ($(IMPLEMENTATION),OpenMPTaskloop) + FCFLAGS += -DUSE_OPENMPTASKLOOP $(OPENMP_FLAG) + IMPLEMENTATION_OBJECT = OpenMPTaskloopStream.o + +else ifeq 
($(IMPLEMENTATION),OpenACC) + FCFLAGS += -DUSE_OPENACC $(OPENACC_FLAG) + IMPLEMENTATION_OBJECT = OpenACCStream.o + +else ifeq ($(IMPLEMENTATION),OpenACCArray) + FCFLAGS += -DUSE_OPENACCARRAY $(OPENACC_FLAG) + IMPLEMENTATION_OBJECT = OpenACCArrayStream.o + +else ifeq ($(IMPLEMENTATION),CUDA) + FCFLAGS += -DUSE_CUDA $(CUDA_FLAG) + IMPLEMENTATION_OBJECT = CUDAStream.o + +else ifeq ($(IMPLEMENTATION),CUDAKernel) + FCFLAGS += -DUSE_CUDAKERNEL $(CUDA_FLAG) + IMPLEMENTATION_OBJECT = CUDAKernelStream.o + +else ifeq ($(IMPLEMENTATION),Sequential) + FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG) + IMPLEMENTATION_OBJECT = SequentialStream.o + +else + $(info Set IMPLEMENTATION={DoConcurrent,Array,OpenMP,OpenMPWorkshare,OpenMPTarget,OpenMPTargetLoop,OpenMPTaskloop,OpenACC,OpenACCArray,CUDA,CUDAKernel}.) + FCFLAGS += -DUSE_SEQUENTIAL $(SEQUENTIAL_FLAG) + IMPLEMENTATION=Sequential + IMPLEMENTATION_OBJECT = SequentialStream.o + +endif + +all: BabelStream.$(COMPILER).$(IMPLEMENTATION) + +BabelStream.$(COMPILER).$(IMPLEMENTATION): main.F90 $(IMPLEMENTATION_OBJECT) + $(FC) $(FCFLAGS) $^ BabelStreamTypes.o -o $@ + +BabelStreamTypes.o BabelStreamTypes.mod: BabelStreamTypes.F90 + $(FC) $(FCFLAGS) -c $< + +%.o: %.F90 BabelStreamTypes.mod + $(FC) $(FCFLAGS) -c $< + +clean: + -rm -f main.o BabelStreamUtil.mod babelstreamutil.mod + -rm -f BabelStreamTypes.o BabelStreamTypes.mod babelstreamtypes.mod + -rm -f DoConcurrentStream.o DoConcurrentStream.mod doconcurrentstream.mod + -rm -f ArrayStream.o ArrayStream.mod arraystream.mod + -rm -f SequentialStream.o SequentialStream.mod sequentialstream.mod + -rm -f OpenMPStream.o OpenMPStream.mod openmpstream.mod + -rm -f OpenMPWorkshareStream.o OpenMPWorkshareStream.mod openmpworksharestream.mod + -rm -f OpenMPTaskloopStream.o OpenMPTaskloopStream.mod openmptaskloopstream.mod + -rm -f OpenMPTargetStream.o OpenMPTargetStream.mod openmptargetstream.mod + -rm -f OpenMPTargetLoopStream.o OpenMPTargetLoopStream.mod openmptargetloopstream.mod + -rm -f 
OpenACCStream.o OpenACCStream.mod openaccstream.mod + -rm -f OpenACCArrayStream.o OpenACCArrayStream.mod openaccarraystream.mod + -rm -f CUDAStream.o CUDAStream.mod cudastream.mod CUDAFortranKernels.mod cudafortrankernels.mod + -rm -f CUDAKernelStream.o CUDAKernelStream.mod cudakernelstream.mod + -rm -f *.modmic *.mod *.o *.cub *.ptx + +realclean: clean + -rm -f BabelStream.* diff --git a/src/fortran/OpenACCArrayStream.F90 b/src/fortran/OpenACCArrayStream.F90 new file mode 100644 index 00000000..9225fe70 --- /dev/null +++ b/src/fortran/OpenACCArrayStream.F90 @@ -0,0 +1,144 @@ +module OpenACCArrayStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "OpenACCArray" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use openacc + implicit none + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use openacc + implicit none + integer, intent(in) :: dev + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call acc_set_device_num(dev, acc_get_device_type()) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$acc enter data create(A,B,C) +#endif + if (err .ne. 
0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$acc exit data delete(A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + !$acc kernels + A = initA + B = initB + C = initC + !$acc end kernels + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + !$acc kernels + h_A = A + h_B = B + h_C = C + !$acc end kernels + end subroutine read_arrays + + subroutine copy() + implicit none + !$acc kernels + C = A + !$acc end kernels + end subroutine copy + + subroutine add() + implicit none + !$acc kernels + C = A + B + !$acc end kernels + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + B = scalar * C + !$acc end kernels + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + A = B + scalar * C + !$acc end kernels + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$acc kernels + A = A + B + scalar * C + !$acc end kernels + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + !$acc kernels + s = dot_product(A,B) + !$acc end kernels + end function dot + +end module OpenACCArrayStream diff --git a/src/fortran/OpenACCStream.F90 b/src/fortran/OpenACCStream.F90 new file mode 100644 index 00000000..7326f383 
--- /dev/null +++ b/src/fortran/OpenACCStream.F90 @@ -0,0 +1,161 @@ +module OpenACCStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=7), parameter :: implementation_name = "OpenACC" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use openacc + implicit none + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use openacc + implicit none + integer, intent(in) :: dev + integer :: num + num = acc_get_num_devices(acc_get_device_type()) + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call acc_set_device_num(dev, acc_get_device_type()) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$acc enter data create(A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$acc exit data delete(A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$acc parallel loop + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$acc parallel loop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$acc parallel loop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do 
+ end function dot + +end module OpenACCStream diff --git a/src/fortran/OpenMPStream.F90 b/src/fortran/OpenMPStream.F90 new file mode 100644 index 00000000..7316d5b8 --- /dev/null +++ b/src/fortran/OpenMPStream.F90 @@ -0,0 +1,137 @@ +module OpenMPStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=6), parameter :: implementation_name = "OpenMP" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel do simd + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel do simd + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp parallel do simd reduction(+:s) + do i=1,N + s = s 
+ A(i) * B(i) + end do + end function dot + +end module OpenMPStream diff --git a/src/fortran/OpenMPTargetLoopStream.F90 b/src/fortran/OpenMPTargetLoopStream.F90 new file mode 100644 index 00000000..9684cedc --- /dev/null +++ b/src/fortran/OpenMPTargetLoopStream.F90 @@ -0,0 +1,162 @@ +module OpenMPTargetLoopStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=16), parameter :: implementation_name = "OpenMPTargetLoop" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use omp_lib + implicit none + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use omp_lib + implicit none + integer, intent(in) :: dev + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call omp_set_default_device(dev) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$omp target enter data map(alloc: A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$omp target exit data map(delete: A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + ! this might need to use a copy API instead... + !$omp target teams loop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams loop + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams loop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + 
!$omp target teams loop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPTargetLoopStream diff --git a/src/fortran/OpenMPTargetStream.F90 b/src/fortran/OpenMPTargetStream.F90 new file mode 100644 index 00000000..0206d78b --- /dev/null +++ b/src/fortran/OpenMPTargetStream.F90 @@ -0,0 +1,163 @@ +module OpenMPTargetStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=12), parameter :: implementation_name = "OpenMPTarget" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + use omp_lib + implicit none + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + else + write(*,'(a10,i1,a8)') "There are ",num," devices." + end if + end subroutine list_devices + + subroutine set_device(dev) + use omp_lib + implicit none + integer, intent(in) :: dev + integer :: num + num = omp_get_num_devices() + if (num.eq.0) then + write(*,'(a17)') "No devices found." + stop + else if (dev.gt.num) then + write(*,'(a21)') "Invalid device index." + stop + else + call omp_set_default_device(dev) + end if + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) +#ifndef USE_MANAGED + !$omp target enter data map(alloc: A,B,C) +#endif + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err +#ifndef USE_MANAGED + !$omp target exit data map(delete: A,B,C) +#endif + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + ! this might need to use a copy API instead... + !$omp target teams distribute parallel do simd + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + C(i) = A(i) + end do + !$omp barrier + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp target teams distribute parallel do simd + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp target teams distribute parallel do simd + do i=1,N + A(i) = A(i) + B(i) + 
scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp target teams distribute parallel do simd reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module OpenMPTargetStream diff --git a/src/fortran/OpenMPTaskloopStream.F90 b/src/fortran/OpenMPTaskloopStream.F90 new file mode 100644 index 00000000..579a7616 --- /dev/null +++ b/src/fortran/OpenMPTaskloopStream.F90 @@ -0,0 +1,169 @@ +module OpenMPTaskloopStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=14), parameter :: implementation_name = "OpenMPTaskloop" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + !$omp end master + !$omp end parallel + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + !$omp end master + !$omp end parallel + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + C(i) = A(i) + end do + !$omp end master + !$omp end parallel + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + C(i) = A(i) + B(i) + end do + !$omp end master + !$omp end parallel + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + B(i) = scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar 
+ real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + !$omp parallel + !$omp master + !$omp taskloop + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + !$omp end master + !$omp end parallel + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + !$omp parallel + !$omp master + !$omp taskloop reduction(+:s) + do i=1,N + s = s + A(i) * B(i) + end do + !$omp end master + !$omp end parallel + end function dot + +end module OpenMPTaskloopStream diff --git a/src/fortran/OpenMPWorkshareStream.F90 b/src/fortran/OpenMPWorkshareStream.F90 new file mode 100644 index 00000000..fd50f86b --- /dev/null +++ b/src/fortran/OpenMPWorkshareStream.F90 @@ -0,0 +1,120 @@ +module OpenMPWorkshareStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=15), parameter :: implementation_name = "OpenMPWorkshare" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + write(*,'(a36,a12)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a12)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + !$omp parallel workshare + A = initA + B = initB + C = initC + !$omp end parallel workshare + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + !$omp parallel workshare + h_A = A + h_B = B + h_C = C + !$omp end parallel workshare + end subroutine read_arrays + + subroutine copy() + implicit none + !$omp parallel workshare + C = A + !$omp end parallel workshare + end subroutine copy + + subroutine add() + implicit none + !$omp parallel workshare + C = A + B + !$omp end parallel workshare + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + B = scalar * C + !$omp end parallel workshare + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + A = B + scalar * C + !$omp end parallel workshare + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + scalar = startScalar + !$omp parallel workshare + A = A + B + scalar * C + !$omp end parallel workshare + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + !$omp parallel workshare + s = dot_product(A,B) + !$omp end parallel workshare + end function dot + +end module OpenMPWorkshareStream diff --git a/src/fortran/SequentialStream.F90 b/src/fortran/SequentialStream.F90 new file mode 100644 index 00000000..a8f69172 --- /dev/null +++ b/src/fortran/SequentialStream.F90 @@ -0,0 +1,130 @@ +module 
SequentialStream + use, intrinsic :: ISO_Fortran_env + use BabelStreamTypes + + implicit none + + character(len=10), parameter :: implementation_name = "Sequential" + + integer(kind=StreamIntKind) :: N + + real(kind=REAL64), allocatable :: A(:), B(:), C(:) + + contains + + subroutine list_devices() + implicit none + integer :: num + write(*,'(a36,a10)') "Listing devices is not supported by ", implementation_name + end subroutine list_devices + + subroutine set_device(dev) + implicit none + integer, intent(in) :: dev + write(*,'(a32,a10)') "Device != 0 is not supported by ", implementation_name + end subroutine set_device + + subroutine alloc(array_size) + implicit none + integer(kind=StreamIntKind) :: array_size + integer :: err + N = array_size + allocate( A(1:N), B(1:N), C(1:N), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + end subroutine alloc + + subroutine dealloc() + implicit none + integer :: err + deallocate( A, B, C, stat=err) + if (err .ne. 
0) then + write(*,'(a20,i3)') 'deallocate returned ',err + stop 1 + endif + end subroutine dealloc + + subroutine init_arrays(initA, initB, initC) + implicit none + real(kind=REAL64), intent(in) :: initA, initB, initC + integer(kind=StreamIntKind) :: i + do i=1,N + A(i) = initA + B(i) = initB + C(i) = initC + end do + end subroutine init_arrays + + subroutine read_arrays(h_A, h_B, h_C) + implicit none + real(kind=REAL64), intent(inout) :: h_A(:), h_B(:), h_C(:) + integer(kind=StreamIntKind) :: i + do i=1,N + h_A(i) = A(i) + h_B(i) = B(i) + h_C(i) = C(i) + end do + end subroutine read_arrays + + subroutine copy() + implicit none + integer(kind=StreamIntKind) :: i + do i=1,N + C(i) = A(i) + end do + end subroutine copy + + subroutine add() + implicit none + integer(kind=StreamIntKind) :: i + do i=1,N + C(i) = A(i) + B(i) + end do + end subroutine add + + subroutine mul(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + B(i) = scalar * C(i) + end do + end subroutine mul + + subroutine triad(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + A(i) = B(i) + scalar * C(i) + end do + end subroutine triad + + subroutine nstream(startScalar) + implicit none + real(kind=REAL64), intent(in) :: startScalar + real(kind=REAL64) :: scalar + integer(kind=StreamIntKind) :: i + scalar = startScalar + do i=1,N + A(i) = A(i) + B(i) + scalar * C(i) + end do + end subroutine nstream + + function dot() result(s) + implicit none + real(kind=REAL64) :: s + integer(kind=StreamIntKind) :: i + s = real(0,kind=REAL64) + do i=1,N + s = s + A(i) * B(i) + end do + end function dot + +end module SequentialStream diff --git a/src/fortran/build.sh b/src/fortran/build.sh new file mode 100755 index 00000000..93433547 --- /dev/null +++ 
b/src/fortran/build.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# uncomment to disable GPU targets +#HAS_GPU=0 + +# Orin +#if [ "x${compiler}" == "xgcc" ] ; then +# export MCPU=cortex-a78ae +#fi +#if [ "x${compiler}" == "xarm" ] ; then +# export MCPU=cortex-a78 +#fi + +COMPILERS="gcc" +if [ $(which nvfortran) ] ; then + COMPILERS="${COMPILERS} nvhpc" +fi +if [ $(which crayftn) ] ; then + COMPILERS="${COMPILERS} cray" +fi +if [ $(uname -m) == "aarch64" ] ; then + if [ $(which armflang) ] ; then + COMPILERS="${COMPILERS} arm" + fi + if [ $(which frt) ] ; then + COMPILERS="${COMPILERS} fj" + fi +elif [ $(uname -m) == "x86_64" ] ; then + if [ $(which lscpu >& /dev/null && lscpu | grep GenuineIntel | awk '{print $3}') == "GenuineIntel" ] ; then + COMPILERS="${COMPILERS} oneapi" + if [ -f /opt/intel/oneapi/setvars.sh ] ; then + . /opt/intel/oneapi/setvars.sh >& /dev/null + fi + else + # ^ this detection can be improved + COMPILERS="${COMPILERS} amd" + fi +fi + +for compiler in ${COMPILERS} ; do + TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare" + if [ "${HAS_GPU}" != "0" ] ; then + TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop" + if [ "x${compiler}" == "xnvhpc" ] ; then + TARGETS="${TARGETS} CUDA CUDAKernel" + fi + fi + if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then + TARGETS="${TARGETS} OpenACC OpenACCArray" + fi + for implementation in ${TARGETS} ; do + make COMPILER=${compiler} IMPLEMENTATION=${implementation} + done +done diff --git a/src/fortran/main.F90 b/src/fortran/main.F90 new file mode 100644 index 00000000..d86e8d4a --- /dev/null +++ b/src/fortran/main.F90 @@ -0,0 +1,666 @@ +module BabelStreamUtil + use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64 + use BabelStreamTypes + + implicit none + + integer(kind=StreamIntKind) :: array_size = 33554432 + integer(kind=StreamIntKind) :: num_times = 100 + logical :: mibibytes = .false. + logical :: use_gigs = .false. + logical :: csv = .false. 
+ character(len=1), parameter :: csv_sep = "," + + ! 1 = All + ! 2 = Triad + ! 3 = Nstream + integer :: selection = 1 + + real(kind=REAL64), parameter :: startA = real(0.1d0,kind=REAL64) + real(kind=REAL64), parameter :: startB = real(0.2d0,kind=REAL64) + real(kind=REAL64), parameter :: startC = real(0.0d0,kind=REAL64) + real(kind=REAL64), parameter :: startScalar = real(0.4d0,kind=REAL64) + + contains + + function get_wtime() result(t) +#if defined(USE_OMP_GET_WTIME) + use omp_lib + implicit none + real(kind=REAL64) :: t + t = omp_get_wtime() +#elif defined(USE_CPU_TIME) + implicit none + real(kind=REAL64) :: t + real :: r + call cpu_time(r) + t = r +#else + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +#endif + end function get_wtime + + subroutine parseArguments() + use, intrinsic :: ISO_Fortran_env, only: compiler_version, compiler_options +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream, only: list_devices, set_device +#elif defined(USE_ARRAY) + use ArrayStream, only: list_devices, set_device +#elif defined(USE_OPENMP) + use OpenMPStream, only: list_devices, set_device +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream, only: list_devices, set_device +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream, only: list_devices, set_device +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream, only: list_devices, set_device +#elif defined(USE_OPENACC) + use OpenACCStream, only: list_devices, set_device +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream, only: list_devices, set_device +#elif defined(USE_CUDA) + use CUDAStream, only: list_devices, set_device +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream, only: list_devices, set_device +#elif defined(USE_SEQUENTIAL) + use SequentialStream, only: list_devices, 
set_device +#endif + implicit none + integer :: i, argc + integer :: arglen,err,pos(2) + character(len=64) :: argtmp + argc = command_argument_count() + do i=1,argc + call get_command_argument(i,argtmp,arglen,err) + if (err.eq.0) then + ! + ! list devices + ! + pos(1) = index(argtmp,"--list") + if (pos(1).eq.1) then + call list_devices() + stop + endif + ! + ! set device number + ! + pos(1) = index(argtmp,"--device") + if (pos(1).eq.1) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + stop + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer :: dev + read(argtmp,'(i15)') dev + call set_device(dev) + end block + endif + cycle + endif + ! + ! array size + ! + pos(1) = index(argtmp,"--arraysize") + pos(2) = index(argtmp,"-s") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + block + integer(kind=INT64) :: big_size + read(argtmp,'(i15)') big_size + if (big_size .gt. HUGE(array_size)) then + print*,'Array size does not fit into integer:' + print*,big_size,'>',HUGE(array_size) + print*,'Stop using USE_INT32' + stop + else + array_size = INT(big_size,kind=StreamIntKind) + endif + end block + endif + cycle + endif + ! + ! number of iterations + ! + pos(1) = index(argtmp,"--numtimes") + pos(2) = index(argtmp,"-n") + if (any(pos(:).eq.1) ) then + if (i+1.gt.argc) then + print*,'You failed to provide a value for ',argtmp + else + call get_command_argument(i+1,argtmp,arglen,err) + read(argtmp,'(i15)') num_times + if (num_times.lt.2) then + write(*,'(a)') "Number of times must be 2 or more" + stop + end if + endif + cycle + endif + ! + ! precision + ! + pos(1) = index(argtmp,"--float") + if (pos(1).eq.1) then + write(*,'(a46,a39)') "Sorry, you have to recompile with -DUSE_FLOAT ", & + "to run BabelStream in single precision." + stop + endif + ! + ! selection (All, Triad, Nstream) + ! 
+ pos(1) = index(argtmp,"--triad-only") + if (pos(1).eq.1) then + selection = 2 + cycle + endif + pos(1) = index(argtmp,"--nstream-only") + if (pos(1).eq.1) then + selection = 3 + cycle + endif + ! + ! CSV + ! + pos(1) = index(argtmp,"--csv") + if (pos(1).eq.1) then + csv = .true. + !write(*,'(a39)') "Sorry, CSV support isn't available yet." + !stop + endif + ! + ! units + ! + pos(1) = index(argtmp,"--mibibytes") + if (pos(1).eq.1) then + mibibytes = .true. + cycle + endif + ! + ! giga/gibi instead of mega/mebi + ! + pos(1) = index(argtmp,"--gigs") + if (pos(1).eq.1) then + use_gigs = .true. + cycle + endif + ! + ! + ! + pos(1) = index(argtmp,"--compiler-info") + if (pos(1).eq.1) then + write(*,'(a)') 'Compiler version: ',compiler_version() + write(*,'(a)') 'Compiler options: ',compiler_options() + stop + endif + ! + ! help + ! + pos(1) = index(argtmp,"--help") + pos(2) = index(argtmp,"-h") + if (any(pos(:).eq.1) ) then + call get_command_argument(0,argtmp,arglen,err) + write(*,'(a7,a,a10)') "Usage: ", trim(argtmp), " [OPTIONS]" + write(*,'(a)') "Options:" + write(*,'(a)') " -h --help Print the message" + write(*,'(a)') " --list List available devices" + write(*,'(a)') " --device INDEX Select device at INDEX" + write(*,'(a)') " -s --arraysize SIZE Use SIZE elements in the array" + write(*,'(a)') " -n --numtimes NUM Run the test NUM times (NUM >= 2)" + !write(*,'(a)') " --float Use floats (rather than doubles)" + write(*,'(a)') " --triad-only Only run triad" + write(*,'(a)') " --nstream-only Only run nstream" + write(*,'(a)') " --csv Output as csv table" + write(*,'(a)') " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" + write(*,'(a)') " --gigs Use GiB=2^30 or GB=10^9 instead of MiB/MB" + write(*,'(a)') " --compiler-info Print information about compiler and flags, then exit." 
+ stop + endif + end if + end do + end subroutine parseArguments + + subroutine run_all(timings, summ) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64), intent(out) :: summ + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call copy() + t2 = get_wtime() + timings(1,i) = t2-t1 + + t1 = get_wtime() + call mul(startScalar) + t2 = get_wtime() + timings(2,i) = t2-t1 + + t1 = get_wtime() + call add() + t2 = get_wtime() + timings(3,i) = t2-t1 + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(4,i) = t2-t1 + + t1 = get_wtime() + summ = dot() + t2 = get_wtime() + timings(5,i) = t2-t1 + + end do + + end subroutine run_all + + subroutine run_triad(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif 
defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call triad(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_triad + + subroutine run_nstream(timings) +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + real(kind=REAL64), intent(inout) :: timings(:,:) + real(kind=REAL64) :: t1, t2 + integer(kind=StreamIntKind) :: i + + do i=1,num_times + + t1 = get_wtime() + call nstream(startScalar) + t2 = get_wtime() + timings(1,i) = t2-t1 + + end do + + end subroutine run_nstream + + subroutine check_solution(A, B, C, summ) + use, intrinsic :: IEEE_Arithmetic, only: IEEE_Is_Normal + implicit none + real(kind=REAL64), intent(in) :: A(:), B(:), C(:) + real(kind=REAL64), intent(in) :: summ + + integer(kind=StreamIntKind) :: i + real(kind=REAL64) :: goldA, goldB, goldC, goldSum + real(kind=REAL64) :: scalar + + ! 
always use double because of accumulation error + real(kind=REAL64) :: errA, errB, errC, errSum, epsi + logical :: cleanA, cleanB, cleanC, cleanSum + + goldA = startA + goldB = startB + goldC = startC + goldSum = 0.0d0 + + scalar = startScalar + + do i=1,num_times + + if (selection.eq.1) then + goldC = goldA + goldB = scalar * goldC + goldC = goldA + goldB + goldA = goldB + scalar * goldC + else if (selection.eq.2) then + goldA = goldB + scalar * goldC + else if (selection.eq.3) then + goldA = goldA + goldB + scalar * goldC; + endif + + end do + + goldSum = goldA * goldB * array_size + + cleanA = ALL(IEEE_Is_Normal(A)) + cleanB = ALL(IEEE_Is_Normal(B)) + cleanC = ALL(IEEE_Is_Normal(C)) + cleanSum = IEEE_Is_Normal(summ) + + if (.not. cleanA) then + write(*,'(a51)') "Validation failed on A. Contains NaA/Inf/Subnormal." + end if + if (.not. cleanB) then + write(*,'(a51)') "Validation failed on B. Contains NaA/Inf/Subnormal." + end if + if (.not. cleanC) then + write(*,'(a51)') "Validation failed on C. Contains NaA/Inf/Subnormal." + end if + if (.not. cleanSum) then + write(*,'(a54,e20.12)') "Validation failed on Sum. Contains NaA/Inf/Subnormal: ",summ + end if + + errA = SUM( ABS( A - goldA ) ) / array_size + errB = SUM( ABS( B - goldB ) ) / array_size + errC = SUM( ABS( C - goldC ) ) / array_size + errSum = ABS( (summ - goldSum) / goldSum) + + epsi = epsilon(real(0,kind=StreamRealKind)) * 100.0d0 + + if (errA .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on A. Average error ", errA + end if + if (errB .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on B. Average error ", errB + end if + if (errC .gt. epsi) then + write(*,'(a38,e20.12)') "Validation failed on C. Average error ", errC + end if + + if (selection.eq.1) then + if (errSum .gt. 1.0e-8) then + write(*,'(a38,e20.12)') "Validation failed on Sum. 
Error ", errSum + write(*,'(a8,e20.12,a15,e20.12)') "Sum was ",summ, " but should be ", errSum + end if + endif + + end subroutine check_solution + +end module BabelStreamUtil + +program BabelStream + use BabelStreamUtil +#if defined(USE_DOCONCURRENT) + use DoConcurrentStream +#elif defined(USE_ARRAY) + use ArrayStream +#elif defined(USE_OPENMP) + use OpenMPStream +#elif defined(USE_OPENMPWORKSHARE) + use OpenMPWorkshareStream +#elif defined(USE_OPENMPTARGET) + use OpenMPTargetStream +#elif defined(USE_OPENMPTARGETLOOP) + use OpenMPTargetLoopStream +#elif defined(USE_OPENMPTASKLOOP) + use OpenMPTaskloopStream +#elif defined(USE_OPENACC) + use OpenACCStream +#elif defined(USE_OPENACCARRAY) + use OpenACCArrayStream +#elif defined(USE_CUDA) + use CUDAStream +#elif defined(USE_CUDAKERNEL) + use CUDAKernelStream +#elif defined(USE_SEQUENTIAL) + use SequentialStream +#endif + implicit none + integer :: element_size, err + real(kind=REAL64) :: scaling + character(len=3) :: label + real(kind=REAL64), allocatable :: timings(:,:) + real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:) + real(kind=REAL64) :: summ + + call parseArguments() + + element_size = storage_size(real(0,kind=StreamRealKind)) / 8 + + if (mibibytes) then + if (use_gigs) then + scaling = 2.0d0**(-30) + label = "GiB" + else + scaling = 2.0d0**(-20) + label = "MiB" + endif + else + if (use_gigs) then + scaling = 1.0d-9 + label = "GB" + else + scaling = 1.0d-6 + label = "MB" + endif + endif + + if (.not.csv) then + + write(*,'(a)') "BabelStream Fortran" + write(*,'(a9,f4.1)') "Version: ", VERSION_STRING + write(*,'(a16,a)') "Implementation: ", implementation_name + + block + character(len=32) :: printout + write(printout,'(i9,1x,a5)') num_times,'times' + write(*,'(a16,a)') 'Running kernels ',ADJUSTL(printout) + end block + write(*,'(a11,a6)') 'Precision: ',ADJUSTL(StreamRealName) + + write(*,'(a12,f9.1,a3)') 'Array size: ',1.0d0 * element_size * (array_size * scaling), label + write(*,'(a12,f9.1,a3)') 
'Total size: ',3.0d0 * element_size * (array_size * scaling), label + + endif ! csv + + allocate( timings(5,num_times) ) + + call alloc(array_size) + + call init_arrays(startA, startB, startC) + summ = 0.0d0 + + timings = -1.0d0 + if (selection.eq.1) then + call run_all(timings, summ) + else if (selection.eq.2) then + call run_triad(timings) + else if (selection.eq.3) then + call run_nstream(timings) + endif + + allocate( h_A(1:array_size), h_B(1:array_size), h_C(1:array_size), stat=err) + if (err .ne. 0) then + write(*,'(a20,i3)') 'allocate returned ',err + stop 1 + endif + + call read_arrays(h_A, h_B, h_C) + call check_solution(h_A, h_B, h_C, summ) + + block + character(len=20) :: printout(8) + real(kind=REAL64) :: tmin,tmax,tavg,nbytes + + if (csv) then + write(*,'(a,a1)',advance='no') 'function', csv_sep + write(*,'(a,a1)',advance='no') 'num_times', csv_sep + write(*,'(a,a1)',advance='no') 'n_elements',csv_sep + write(*,'(a,a1)',advance='no') 'sizeof', csv_sep + if (mibibytes) then + write(*,'(a,a1)',advance='no') 'max_mibytes_per_sec',csv_sep + else + write(*,'(a,a1)',advance='no') 'max_mbytes_per_sec', csv_sep + endif + write(*,'(a,a1)',advance='no') 'min_runtime',csv_sep + write(*,'(a,a1)',advance='no') 'max_runtime',csv_sep + write(*,'(a,a1)',advance='yes') 'avg_runtime' + else + write(printout(1),'(a8)') 'Function' + write(printout(2),'(a3,a8)') TRIM(label),'ytes/sec' + write(printout(3),'(a9)') 'Min (sec)' + write(printout(4),'(a3)') 'Max' + write(printout(5),'(a7)') 'Average' + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif ! 
csv + + if (selection.eq.1) then + block + integer, parameter :: sizes(5) = [2,2,3,3,2] + character(len=5), parameter :: labels(5) = ["Copy ", "Mul ", "Add ", "Triad", "Dot "] + integer :: i + do i=1,5 + tmin = MINVAL(timings(i,2:num_times)) + tmax = MAXVAL(timings(i,2:num_times)) + tavg = SUM(timings(i,2:num_times)) / (num_times-1) + nbytes = element_size * REAL(array_size,kind=REAL64) * sizes(i) + write(printout(1),'(a)') labels(i) + if (csv) then + write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + enddo + end block + else if ((selection.eq.2).or.(selection.eq.3)) then + tmin = MINVAL(timings(1,2:num_times)) + tmax = MAXVAL(timings(1,2:num_times)) + tavg = SUM(timings(1,2:num_times)) / (num_times-1) + if (selection.eq.2) then + nbytes = element_size * REAL(array_size,kind=REAL64) * 3 + write(printout(1),'(a12)') "Triad" + else if (selection.eq.3) then + nbytes = element_size * REAL(array_size,kind=REAL64) * 4 + write(printout(1),'(a12)') "Nstream" + endif + if (csv) then + 
write(printout(2),'(i20)') num_times + write(printout(3),'(i20)') array_size + write(printout(4),'(i20)') element_size + write(printout(5),'(i20)') INT(scaling*nbytes/tmin) + write(printout(6),'(f20.8)') tmin + write(printout(7),'(f20.8)') tmax + write(printout(8),'(f20.8)') tavg + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(1))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(2))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(3))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(4))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(5))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(6))),csv_sep + write(*,'(a,a1)',advance='no') TRIM(ADJUSTL(printout(7))),csv_sep + write(*,'(a,a1)',advance='yes') TRIM(ADJUSTL(printout(8))) + else + write(printout(2),'(f12.3)') scaling*nbytes/tmin + write(printout(3),'(f12.5)') tmin + write(printout(4),'(f12.5)') tmax + write(printout(5),'(f12.5)') tavg + write(*,'(5a12)') ADJUSTL(printout(1:5)) + endif + endif + end block + + call dealloc() + +end program BabelStream diff --git a/src/fortran/make.inc.amd b/src/fortran/make.inc.amd new file mode 100644 index 00000000..a863de8a --- /dev/null +++ b/src/fortran/make.inc.amd @@ -0,0 +1,25 @@ +FC := /opt/rocm/llvm/bin/flang +FC := /global/u1/j/jhammond/AMD/aocc-compiler-3.2.0/bin/flang +FCFLAGS := -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +ifdef MARCH +FCFLAGS += -march=$(MARCH) +else +FCFLAGS += -march=native +endif + +DOCONCURRENT_FLAG = -fopenmp # libomp.so required +ARRAY_FLAG = -fopenmp # libomp.so required +OPENMP_FLAG = -fopenmp +#OPENMP_FLAG += -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.arm b/src/fortran/make.inc.arm new file mode 100644 index 00000000..a3e2a67a --- /dev/null +++ b/src/fortran/make.inc.arm @@ -0,0 +1,39 @@ +FC = armflang +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78 +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -fopenmp +ARRAY_FLAG = -fopenmp +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.cray b/src/fortran/make.inc.cray new file mode 100644 index 00000000..dae4e759 --- /dev/null +++ b/src/fortran/make.inc.cray @@ -0,0 +1,18 @@ +FC := ftn +FCFLAGS = -e F -O3 + +DOCONCURRENT_FLAG = -h thread_do_concurrent -DCRAY_THREAD_DOCONCURRENT +ARRAY_FLAG = -h autothread +OPENMP_FLAG = -h omp +OPENACC_FLAG = -h acc +# CPU only +OPENACC_FLAG += -h omp +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif diff --git a/src/fortran/make.inc.fj b/src/fortran/make.inc.fj new file mode 100644 index 00000000..b4761e5f --- /dev/null +++ b/src/fortran/make.inc.fj @@ -0,0 +1,21 @@ +FC := frt +FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1 # FJ Fortran system_clock is low resolution + +DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED +ARRAY_FLAG = -Kparallel,reduction +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = +# CPU only +OPENACC_FLAG += +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OPENACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.gcc b/src/fortran/make.inc.gcc new file mode 100644 index 00000000..f59c8bb8 --- /dev/null +++ b/src/fortran/make.inc.gcc @@ -0,0 +1,33 @@ +FC = gfortran +FCFLAGS = -std=f2018 -O3 +FCFLAGS += -Wall -Wno-unused-dummy-argument -Wno-unused-variable + +# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78ae +ARCH=$(shell uname -m) +ifeq ($(ARCH),aarch64) + ifdef MCPU + FCFLAGS += -mcpu=$(MCPU) + else + FCFLAGS += -mcpu=native + endif +else + ifdef MARCH + FCFLAGS += -march=$(MARCH) + else + FCFLAGS += -march=native + endif +endif + +DOCONCURRENT_FLAG = -ftree-parallelize-loops=4 +ARRAY_FLAG = +OPENMP_FLAG = -fopenmp +OPENACC_FLAG = -fopenacc +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif diff --git a/src/fortran/make.inc.nvhpc b/src/fortran/make.inc.nvhpc new file mode 100644 index 00000000..dd4c442f --- /dev/null +++ b/src/fortran/make.inc.nvhpc @@ -0,0 +1,70 @@ +FC := nvfortran +#FCFLAGS := -O3 -Minform=inform -Minfo=all +FCFLAGS := -O3 -Minform=warn + +#TARGET=gpu +TARGET=multicore + +NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture") +ifeq ($(findstring Ampere,$(NVARCH)),Ampere) + $(info Ampere detected) + GPU = cc80 +endif +ifeq ($(findstring Turing,$(NVARCH)),Turing) + $(info Turing detected) + GPU = cc75 +endif +ifeq ($(findstring Volta,$(NVARCH)),Volta) + $(info Volta detected) + GPU = cc70 +endif +ifeq ($(findstring Pascal,$(NVARCH)),Pascal) + $(info Pascal detected) + GPU = cc60,cc61 +endif +ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1) + $(info Jetson AGX Orin detected) + GPU = ccn87,cc86 + # figure out Xavier later + #GPU = cc72 +endif +ifeq ($(GPU),) + $(error Your GPU architecture could not be detected. Set it manually.) +endif +GPUFLAG = -gpu=$(GPU) + +# MARCH=neoverse-v1,neoverse-n1,zen3 +ARCH=$(shell uname -m) +ifdef MARCH + ifeq ($(ARCH),aarch64) + ifeq ($(MARCH),neoverse-n1) + FCFLAGS += -tp=$(MARCH) + else + ifeq ($(MARCH),neoverse-v1) + FCFLAGS += -tp=$(MARCH) + else + FCFLAGS += -tp=native + endif + endif + else + FCFLAGS += -tp=$(MARCH) + endif +else + FCFLAGS += -tp=native +endif + +# this is to allow apples-to-apples comparison with DC in non-DC GPU impls +# set exactly one of these! 
+#MANAGED = -DUSE_MANAGED -gpu=managed +#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged + +DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE) +ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED) +OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED) +OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED) +CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED) +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenMPTaskloop) + $(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.) +endif diff --git a/src/fortran/make.inc.oneapi b/src/fortran/make.inc.oneapi new file mode 100644 index 00000000..b7e003c2 --- /dev/null +++ b/src/fortran/make.inc.oneapi @@ -0,0 +1,32 @@ +FC := ifx +FCFLAGS = -std18 +FCFLAGS += -Ofast -xHOST +FCFLAGS += -qopt-zmm-usage=low + +ifeq ($(FC),ifort) + FCFLAGS += -qopt-streaming-stores=always + PARALLEL = -parallel +endif + +DOCONCURRENT_FLAG = -qopenmp $(PARALLEL) +ARRAY_FLAG = -qopenmp $(PARALLEL) +OPENMP_FLAG = -qopenmp +ifeq ($(FC),ifx) + OPENMP_FLAG += -fopenmp-targets=spir64 -DUSE_FLOAT=1 +endif +OPENACC_FLAG = +CUDA_FLAG = +SEQUENTIAL_FLAG = + +ifeq ($(IMPLEMENTATION),OpenACC) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),OpenACCArray) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDA) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) +endif +ifeq ($(IMPLEMENTATION),CUDAKernels) + $(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.) 
+endif diff --git a/src/fortran/run.sh b/src/fortran/run.sh new file mode 100755 index 00000000..2b41babf --- /dev/null +++ b/src/fortran/run.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +cat ./run.sh + +if [ `uname -s` == Darwin ] ; then + NUM_HWTHREADS=`sysctl -n hw.ncpu` + MEMORY_BYTES=`sysctl -n hw.memsize` +else + NUM_HWTHREADS=`nproc` + MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'` +fi + +M=128 + +export OMP_NUM_THREADS=8 +export OMP_PROC_BIND=close +export OMP_PLACES=threads + +export ACC_NUM_CORES=${OMP_NUM_THREADS} + +AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`" + +for compiler in gcc nvhpc cray oneapi arm amd fj ; do + #if [ "x$compiler" == "xgcc" ] ; then + # export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so + #fi + for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do + if [ -f BabelStream.${compiler}.${implementation} ] ; then + echo "BabelStream.${compiler}.${implementation}" + ldd BabelStream.${compiler}.${implementation} + time $AFFCONTROL \ + ./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M})) + fi + done +done From 7e94495da6240bdda21b805002705d360da55c99 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Fri, 27 Jan 2023 14:28:13 +0000 Subject: [PATCH 40/89] Added ICPX support for the OneAPI2023 and later versions since DPCPP is deprecated --- src/sycl2020/model.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/sycl2020/model.cmake b/src/sycl2020/model.cmake index e7b5a1c5..c60b9c91 100644 --- a/src/sycl2020/model.cmake +++ b/src/sycl2020/model.cmake @@ -7,6 +7,7 @@ register_flag_required(SYCL_COMPILER "Compile using the specified SYCL compiler implementation Supported values are ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) + ONEAPI-ICPX - icpx as a standalone compiler DPCPP - dpc++ as a 
standalone compiler (https://github.com/intel/llvm) HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") @@ -14,6 +15,7 @@ register_flag_required(SYCL_COMPILER register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") @@ -65,6 +67,12 @@ macro(setup) elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") set(CMAKE_CXX_COMPILER dpcpp) register_definitions(CL_TARGET_OPENCL_VERSION=220) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") + set(CMAKE_CXX_COMPILER icpx) + include_directories(${SYCL_COMPILER_DIR}/include/sycl) + register_definitions(CL_TARGET_OPENCL_VERSION=220) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) else () message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") endif () From 6a1122e5a34c0ff858f9666333939ecbca779c9d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 30 Jan 2023 12:14:17 +0000 Subject: [PATCH 41/89] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40e2c6d7..54725bb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. - RAJA CUDA CMake build issues resolved. - Fix CUDA memory limit check. - Use long double for `check_solution` in case of large problem size. 
+- OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. ## [v4.0] - 2021-12-22 From 696ff6a8179b4fbb070070709c595f6e1a2f02a4 Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Mon, 13 Mar 2023 10:47:37 -0500 Subject: [PATCH 42/89] Round up dot_num_blocks and remove extra check --- src/hip/HIPStream.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 7fc732de..0db84851 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -33,17 +33,6 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) throw std::runtime_error(ss.str()); } - // The array size must be divisible by total number of elements - // moved per block for the dot kernel - if (ARRAY_SIZE % (TBSIZE * dot_elements_per_lane) != 0) - { - std::stringstream ss; - ss << "Array size for the dot kernel must be a multiple of elements operated on per block (" - << TBSIZE * dot_elements_per_lane - << ")."; - throw std::runtime_error(ss.str()); - } - // Set device int count; hipGetDeviceCount(&count); @@ -58,7 +47,8 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; - dot_num_blocks = array_size / (TBSIZE * dot_elements_per_lane); + // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) + dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. 
However, it requires From 8b862f09b3a9818bd3ae14353b0c036b714ebdba Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 2 May 2023 15:18:42 +0100 Subject: [PATCH 43/89] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54725bb1..4efcbcc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. - Fix CUDA memory limit check. - Use long double for `check_solution` in case of large problem size. - OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. +- Updates to the HIP kernels and API usage. ## [v4.0] - 2021-12-22 From 893af9f5d024a881ab5876bcdf567cd51a0ae478 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 10 Jun 2023 21:08:29 +0100 Subject: [PATCH 44/89] Fix compatibility with Kokkos 4+ --- src/kokkos/KokkosStream.cpp | 2 +- src/kokkos/KokkosStream.hpp | 3 --- src/kokkos/model.cmake | 12 ++++++------ 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index 00efe92c..9cf32eb6 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, Wei-Chen (Tom) Lin // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 3aa7cf5f..a410a868 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -10,9 +10,6 @@ #include #include -#include -#include - #include "Stream.h" #define IMPLEMENTATION_STRING "Kokkos" diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index a95fdba6..927bc682 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -10,29 +10,29 @@ register_flag_optional(KOKKOS_IN_TREE See 
https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") register_flag_optional(KOKKOS_IN_PACKAGE - "Use if Kokkos is part of a package dependency: - Path to package R-Path containing Kokkos libs" "") + "Absolute path to package R-Path containing Kokkos libs. + Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "") # compiler vendor and arch specific flags set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) macro(setup) - set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md if (EXISTS "${KOKKOS_IN_TREE}") - message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") + message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) register_link_library(Kokkos::kokkos) elseif (EXISTS "${KOKKOS_IN_PACKAGE}") - message(STATUS "Building using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") + message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos") find_package(Kokkos REQUIRED) register_link_library(Kokkos::kokkos) else() - message(FATAL_ERROR "Neither `${KOKKOS_IN_TREE}`, or `${KOKKOS_IN_PACKAGE}` exists") + message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!") endif () register_append_compiler_and_arch_specific_cxx_flags( From 092ee677647a35e3967849635ebf9fb8f888d3b4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 12 Jun 2023 15:49:59 +0100 Subject: [PATCH 45/89] Change CUDA DOT thread-blocks to 1024 This improves the performance on Ampere (A100) GPUs. Fixes #137. 
--- src/cuda/CUDAStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index 83b8c665..bb3f8665 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -22,7 +22,7 @@ #endif #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 256 +#define DOT_NUM_BLOCKS 1024 template class CUDAStream : public Stream From 7643de8d0964bbd084d660bbe8d6a8e900fd5726 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 12 Jun 2023 16:38:31 +0100 Subject: [PATCH 46/89] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4efcbcc0..3b8aa1b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file. - Use long double for `check_solution` in case of large problem size. - OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. - Updates to the HIP kernels and API usage. +- Number of thread-blocks in CUDA dot kernel implementation changed to 1024. ## [v4.0] - 2021-12-22 From e81f6c28895b590ba807d4c28d6d5e5c22702c19 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Tue, 13 Jun 2023 22:18:48 +0100 Subject: [PATCH 47/89] Fix RAJA > v0.14.1 compatibility --- src/raja/model.cmake | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/raja/model.cmake b/src/raja/model.cmake index b1e7750d..eb4788cd 100644 --- a/src/raja/model.cmake +++ b/src/raja/model.cmake @@ -8,6 +8,8 @@ register_flag_optional(RAJA_IN_TREE Make sure to use the release version of RAJA or clone RAJA recursively with submodules. Remember to append RAJA specific flags as well, for example: -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ... + For RAJA >= v2022.03.0, remember to use the RAJA prefixed CMake options: + -DRAJA_IN_TREE=... -DRAJA_ENABLE_OPENMP=ON -DRAJA_ENABLE_CUDA=ON ... 
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options " "") @@ -20,7 +22,7 @@ register_flag_optional(TARGET CPU) register_flag_optional(CUDA_TOOLKIT_ROOT_DIR - "[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "") + "[TARGET==NVIDIA only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the RAJA_ENABLE_CUDA or ENABLE_CUDA flag is specified for RAJA" "") # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes register_flag_optional(CUDA_ARCH @@ -58,7 +60,20 @@ macro(setup) set(ENABLE_BENCHMARKS OFF CACHE BOOL "") set(ENABLE_CUDA ${ENABLE_CUDA} CACHE BOOL "" FORCE) - if (ENABLE_CUDA) + # RAJA >= v2022.03.0 switched to prefixed variables, we keep the legacy ones for backwards compatibility + set(RAJA_ENABLE_TESTS OFF CACHE BOOL "") + set(RAJA_ENABLE_EXAMPLES OFF CACHE BOOL "") + set(RAJA_ENABLE_REPRODUCERS OFF CACHE BOOL "") + set(RAJA_ENABLE_EXERCISES OFF CACHE BOOL "") + set(RAJA_ENABLE_DOCUMENTATION OFF CACHE BOOL "") + set(RAJA_ENABLE_BENCHMARKS OFF CACHE BOOL "") + set(RAJA_ENABLE_CUDA ${RAJA_ENABLE_CUDA} CACHE BOOL "" FORCE) + + if (ENABLE_CUDA OR RAJA_ENABLE_CUDA) + + # RAJA still needs ENABLE_CUDA for internal use, so if either is on, assert both.
+ set(RAJA_ENABLE_CUDA ON) + set(ENABLE_CUDA ON) # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes if(POLICY CMP0104) @@ -70,6 +85,10 @@ macro(setup) set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) list(APPEND CMAKE_CUDA_FLAGS) + # See https://github.com/LLNL/RAJA/pull/1302 + # And https://github.com/LLNL/RAJA/pull/1339 + set(RAJA_ENABLE_VECTORIZATION OFF CACHE BOOL "") + message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}") endif () From c3346318b970ae4e6c91c800478aaeebd0efe448 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 2 Jul 2023 02:31:47 +0100 Subject: [PATCH 48/89] Update SYCL options for --include-intel-llvm --- src/sycl/model.cmake | 23 ++++++++++++++++------- src/sycl2020/model.cmake | 21 +++++++++++---------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/sycl/model.cmake b/src/sycl/model.cmake index e7b5a1c5..6a517c1c 100644 --- a/src/sycl/model.cmake +++ b/src/sycl/model.cmake @@ -6,14 +6,16 @@ register_flag_optional(CMAKE_CXX_COMPILER register_flag_required(SYCL_COMPILER "Compile using the specified SYCL compiler implementation Supported values are - ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) + ONEAPI-ICPX - icpx as a standalone compiler + ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: - ONEAPI-DPCPP - not required but `dpcpp` 
must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") @@ -47,7 +49,8 @@ macro(setup) list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) - setup_opencl_header_includes() + # don't point to the CL dir as the imports already have the CL prefix + set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) # ComputeCpp needs OpenCL @@ -59,12 +62,18 @@ macro(setup) elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) - register_definitions(CL_TARGET_OPENCL_VERSION=220) register_append_cxx_flags(ANY -fsycl) register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") - set(CMAKE_CXX_COMPILER dpcpp) - register_definitions(CL_TARGET_OPENCL_VERSION=220) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") + set(CMAKE_CXX_COMPILER icpx) + set(CMAKE_C_COMPILER icx) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_C_COMPILER clang) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) else () message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") endif () diff --git a/src/sycl2020/model.cmake b/src/sycl2020/model.cmake index c60b9c91..6a517c1c 100644 --- a/src/sycl2020/model.cmake +++ b/src/sycl2020/model.cmake @@ -6,16 +6,16 @@ register_flag_optional(CMAKE_CXX_COMPILER register_flag_required(SYCL_COMPILER 
"Compile using the specified SYCL compiler implementation Supported values are - ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) - ONEAPI-ICPX - icpx as a standalone compiler + ONEAPI-ICPX - icpx as a standalone compiler + ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: - ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 
HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") @@ -49,7 +49,8 @@ macro(setup) list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) - setup_opencl_header_includes() + # don't point to the CL dir as the imports already have the CL prefix + set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) # ComputeCpp needs OpenCL @@ -61,16 +62,16 @@ macro(setup) elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) - register_definitions(CL_TARGET_OPENCL_VERSION=220) register_append_cxx_flags(ANY -fsycl) register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") - set(CMAKE_CXX_COMPILER dpcpp) - register_definitions(CL_TARGET_OPENCL_VERSION=220) elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") set(CMAKE_CXX_COMPILER icpx) - include_directories(${SYCL_COMPILER_DIR}/include/sycl) - register_definitions(CL_TARGET_OPENCL_VERSION=220) + set(CMAKE_C_COMPILER icx) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_C_COMPILER clang) register_append_cxx_flags(ANY -fsycl) register_append_link_flags(-fsycl) else () From 180bd95ba3eebcd94bbf34a695c0dd3b42b9a19e Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 2 Jul 2023 04:33:07 +0100 Subject: [PATCH 49/89] Bump oneTBB to v2021.9.0 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4321e864..fe281795 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,7 +79,7 @@ if (USE_TBB) FetchContent_Declare( TBB GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git - GIT_TAG faaf43c4ab22cb4b4267d65d5e218fa58800eea8 + GIT_TAG 
v2021.9.0 ) # Don't fail builds on waring (TBB has -Wall while not being free of warnings from unused symbols...) set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) From 6e47d341fa29c35a94b516054f2de82125678ece Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 2 Jul 2023 05:27:13 +0100 Subject: [PATCH 50/89] Bump oneDPL to oneDPL-2022.1.0 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fe281795..476ea560 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,7 @@ if (USE_ONEDPL) FetchContent_Declare( oneDPL GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git - GIT_TAG oneDPL-2021.7.0-release + GIT_TAG oneDPL-2022.1.0-rc3 ) string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND) # XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package From 1d43fcb3e7d2f4d43b3d985194de0a6b438069b8 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 6 Jul 2023 22:38:50 +0200 Subject: [PATCH 51/89] std-indices: Fix infinite recursion in ranged::operator!= --- src/std-indices/STDIndicesStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index bc068aa9..c2eec0e8 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -36,7 +36,7 @@ class ranged { iterator operator+(const value_type v) const { return iterator(num + v); } bool operator==(iterator other) const { return num == other.num; } - bool operator!=(iterator other) const { return *this != other; } + bool operator!=(iterator other) const { return num != other.num; } bool operator<(iterator other) const { return num < other.num; } reference operator*() const { return num;} From 288d0cb18958783f6a3149bd5057e7a0857c2c32 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 6 Jul 2023 23:57:46 +0200 Subject: [PATCH 52/89] std-indices: Use forward iterator tag to align with the implemented 
operators --- src/std-indices/STDIndicesStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index bc068aa9..c1388b49 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -28,7 +28,7 @@ class ranged { using value_type = N; using pointer = const N*; using reference = const N&; - using iterator_category = std::random_access_iterator_tag; + using iterator_category = std::forward_iterator_tag; explicit iterator(N _num = 0) : num(_num) {} iterator& operator++() { num++; return *this; } From 6d11c723826ae94b3de1fb346ee787c56f6fba4b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 27 Jul 2023 13:46:23 +0100 Subject: [PATCH 53/89] Add support for Thrust managed memory Closes #143 --- CHANGELOG.md | 1 + src/thrust/ThrustStream.h | 10 ++++++++++ src/thrust/model.cmake | 8 +++++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b8aa1b6..371e2419 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file. ## Unreleased ### Added - Ability to build Kokkos and RAJA versions against existing packages. +- Thrust managed memory. ### Changed - RAJA CUDA CMake build issues resolved. 
diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h index f87ace73..a2a4b72f 100644 --- a/src/thrust/ThrustStream.h +++ b/src/thrust/ThrustStream.h @@ -8,7 +8,11 @@ #include #include +#if defined(MANAGED) +#include +#else #include +#endif #include "Stream.h" @@ -21,9 +25,15 @@ class ThrustStream : public Stream // Size of arrays int array_size; + #if defined(MANAGED) + thrust::universtal_vector a; + thrust::universtal_vector b; + thrust::universtal_vector c; + #else thrust::device_vector a; thrust::device_vector b; thrust::device_vector c; + #endif public: ThrustStream(const int, int); diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake index 2d687c72..91821ef1 100644 --- a/src/thrust/model.cmake +++ b/src/thrust/model.cmake @@ -18,6 +18,9 @@ register_flag_optional(BACKEND " "CUDA") + register_flag_optional(MANAGED "Enabled managed memory mode." + "OFF") + register_flag_optional(CMAKE_CUDA_COMPILER "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler" "") @@ -34,6 +37,9 @@ register_flag_optional(CUDA_EXTRA_FLAGS macro(setup) set(CMAKE_CXX_STANDARD 14) + if (MANAGED) + register_definitions(MANAGED) + endif () if (${THRUST_IMPL} STREQUAL "CUDA") @@ -91,4 +97,4 @@ macro(setup) endmacro() - \ No newline at end of file + From 09ad102966d5c244566a5f503c83c7fb01cea6aa Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 13 Aug 2023 23:41:57 +0100 Subject: [PATCH 54/89] Add hipSYCL workaround for std-indices --- src/dpl_shim.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/dpl_shim.h b/src/dpl_shim.h index 226693bd..b9540196 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -56,10 +56,21 @@ static constexpr auto exe_policy = std::execution::par_unseq; #ifdef USE_STD_PTR_ALLOC_DEALLOC +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) +#include + +// TODO We temporarily use malloc_shared/free here for hipSYCL stdpar because there's a linking issue if we let it hijack new/delete +// for this to work, we compile with 
--hipsycl-stdpar-system-usm so that hijacking is disabled +static cl::sycl::queue queue{cl::sycl::default_selector_v}; +template T *alloc_raw(size_t size) { return cl::sycl::malloc_shared(size, queue); } +template void dealloc_raw(T *ptr) { cl::sycl::free(ptr, queue); } + +#else template T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); } template void dealloc_raw(T *ptr) { free(ptr); } +#endif #endif From eef3221df6bc9929c527b1bf728a50ecca90efcf Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 31 Aug 2023 17:53:50 +0100 Subject: [PATCH 55/89] Qualify all math calls --- src/main.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index c9d76942..d7208da8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -222,10 +222,10 @@ void run() { // MiB = 2^20 std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB" + << " (=" << ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -20.0) << " MiB" + << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*std::pow(2.0, -30.0) << " GiB)" << std::endl; } else { @@ -393,7 +393,7 @@ void run() << num_times << csv_separator << ARRAY_SIZE << csv_separator << sizeof(T) << csv_separator - << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator + << ((mibibytes) ? 
std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator << *minmax.first << csv_separator << *minmax.second << csv_separator << average @@ -404,7 +404,7 @@ void run() std::cout << std::left << std::setw(12) << labels[i] << std::left << std::setw(12) << std::setprecision(3) << - ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) + ((mibibytes) ? std::pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << std::left << std::setw(12) << std::setprecision(5) << *minmax.first << std::left << std::setw(12) << std::setprecision(5) << *minmax.second << std::left << std::setw(12) << std::setprecision(5) << average @@ -415,7 +415,7 @@ void run() { // Display timing results double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times; - double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]); + double bandwidth = ((mibibytes) ? std::pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]); if (output_as_csv) { @@ -487,13 +487,13 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldSum = goldA * goldB * ARRAY_SIZE; // Calculate the average error - long double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); + long double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + std::fabs(val - goldA); }); errA /= a.size(); - long double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); + long double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + std::fabs(val - goldB); }); errB /= b.size(); - long double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); + long double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + std::fabs(val - goldC); }); errC /= c.size(); - 
long double errSum = fabs((sum - goldSum)/goldSum); + long double errSum = std::fabs((sum - goldSum)/goldSum); long double epsi = std::numeric_limits::epsilon() * 100.0; From 28dcf6f9629326c46913cf3cf659c5a8fdf40c68 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 1 Sep 2023 03:48:19 +0100 Subject: [PATCH 56/89] Bump oneDPL version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 476ea560..89b3a78f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,7 @@ if (USE_ONEDPL) FetchContent_Declare( oneDPL GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git - GIT_TAG oneDPL-2022.1.0-rc3 + GIT_TAG oneDPL-2022.2.0-rc1 ) string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND) # XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package From 3f7bb631e18dd80b7798c7bb8218b041197b1975 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Tue, 5 Sep 2023 03:29:16 +0100 Subject: [PATCH 57/89] Initial SYCL2020 USM implementation --- CHANGELOG.md | 3 + CMakeLists.txt | 9 +- .../SYCLStream2020.cpp | 9 +- .../SYCLStream2020.h | 2 +- src/sycl2020-acc/model.cmake | 91 ++++++ src/sycl2020-usm/SYCLStream2020.cpp | 269 ++++++++++++++++++ src/sycl2020-usm/SYCLStream2020.h | 54 ++++ src/{sycl2020 => sycl2020-usm}/model.cmake | 3 - 8 files changed, 433 insertions(+), 7 deletions(-) rename src/{sycl2020 => sycl2020-acc}/SYCLStream2020.cpp (94%) rename src/{sycl2020 => sycl2020-acc}/SYCLStream2020.h (95%) create mode 100644 src/sycl2020-acc/model.cmake create mode 100644 src/sycl2020-usm/SYCLStream2020.cpp create mode 100644 src/sycl2020-usm/SYCLStream2020.h rename src/{sycl2020 => sycl2020-usm}/model.cmake (96%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 371e2419..6bf53a3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. 
### Added - Ability to build Kokkos and RAJA versions against existing packages. - Thrust managed memory. +- New implementation using SYCL2020 USM (sycl2020-usm) and renamed original `sycl2020` to `sycl2020-acc`. ### Changed - RAJA CUDA CMake build issues resolved. @@ -13,6 +14,8 @@ All notable changes to this project will be documented in this file. - OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. - Updates to the HIP kernels and API usage. - Number of thread-blocks in CUDA dot kernel implementation changed to 1024. +- Fix compatibility of `sycl2020` (now `sycl2020-acc`) with hipSYCL. + ## [v4.0] - 2021-12-22 diff --git a/CMakeLists.txt b/CMakeLists.txt index 89b3a78f..da112a44 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,7 +145,8 @@ register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) register_model(kokkos KOKKOS KokkosStream.cpp) register_model(sycl SYCL SYCLStream.cpp) -register_model(sycl2020 SYCL2020 SYCLStream2020.cpp) +register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp) +register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp) register_model(acc ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(raja USE_RAJA RAJAStream.cpp) @@ -162,6 +163,12 @@ else () message(STATUS "Selected model : ${MODEL}") endif () +if (MODEL STREQUAL "sycl2020") + message(FATAL_ERROR " + Model sycl2020 has been renamed to sycl2020-acc, and a new sycl2020-usm model is now available.
+ Please use sycl2020-acc for SYCL2020 style accessors and sycl2020-usm for USM") +endif () + # load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL load_model(${MODEL}) diff --git a/src/sycl2020/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp similarity index 94% rename from src/sycl2020/SYCLStream2020.cpp rename to src/sycl2020-acc/SYCLStream2020.cpp index 17a5ab55..f88cbbbe 100644 --- a/src/sycl2020/SYCLStream2020.cpp +++ b/src/sycl2020-acc/SYCLStream2020.cpp @@ -164,8 +164,13 @@ T SYCLStream::dot() sycl::accessor kb {d_b, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity{}), + // Reduction object, to perform summation - initialises the result to zero + // hipSYCL doesn't sypport the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + sycl::reduction(d_sum. 
template get_access(cgh), sycl::plus()), +#else + sycl::reduction(d_sum, cgh sycl::plus()), +#endif [=](sycl::id<1> idx, auto& sum) { sum += ka[idx] * kb[idx]; diff --git a/src/sycl2020/SYCLStream2020.h b/src/sycl2020-acc/SYCLStream2020.h similarity index 95% rename from src/sycl2020/SYCLStream2020.h rename to src/sycl2020-acc/SYCLStream2020.h index 7481d160..caaeae9e 100644 --- a/src/sycl2020/SYCLStream2020.h +++ b/src/sycl2020-acc/SYCLStream2020.h @@ -14,7 +14,7 @@ #include -#define IMPLEMENTATION_STRING "SYCL 2020" +#define IMPLEMENTATION_STRING "SYCL2020 accessors" template class SYCLStream : public Stream diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake new file mode 100644 index 00000000..0cd8c92a --- /dev/null +++ b/src/sycl2020-acc/model.cmake @@ -0,0 +1,91 @@ + +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" + "c++") + +register_flag_required(SYCL_COMPILER + "Compile using the specified SYCL compiler implementation + Supported values are + ONEAPI-ICPX - icpx as a standalone compiler + ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) + DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") + +register_flag_optional(SYCL_COMPILER_DIR + "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: + ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) + ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 
+ HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + "") + +macro(setup) + set(CMAKE_CXX_STANDARD 17) + + + if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + + + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) + + if (NOT EXISTS "${hipSYCL_DIR}") + message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS "${hipSYCL_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") + endif () + + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(hipSYCL CONFIG REQUIRED) + message(STATUS "ok") + + elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") + + list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) + set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) + + # don't point to the CL dir as the imports already have the CL prefix + set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") + + register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) + # ComputeCpp needs OpenCL + find_package(ComputeCpp REQUIRED) + + # this must come after FindComputeCpp (!) 
+ set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) + + elseif (${SYCL_COMPILER} STREQUAL "DPCPP") + set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) + include_directories(${SYCL_COMPILER_DIR}/include/sycl) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") + set(CMAKE_CXX_COMPILER icpx) + set(CMAKE_C_COMPILER icx) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_C_COMPILER clang) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + else () + message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") + endif () + +endmacro() + + +macro(setup_target NAME) + if ( + (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR + (${SYCL_COMPILER} STREQUAL "HIPSYCL")) + # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + endif () +endmacro() diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020-usm/SYCLStream2020.cpp new file mode 100644 index 00000000..21a8a47b --- /dev/null +++ b/src/sycl2020-usm/SYCLStream2020.cpp @@ -0,0 +1,269 @@ + +// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, and Tom Lin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream2020.h" + +#include + +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + +template +SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) +: array_size {ARRAY_SIZE} +{ + if (!cached) + getDeviceList(); + + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + + sycl::device dev = 
devices[device_index]; + + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (!dev.has(sycl::aspect::fp64)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + } + + queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) + { + bool error = false; + for(auto e: l) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception e) + { + std::cout << e.what(); + error = true; + } + } + if(error) + { + throw std::runtime_error("SYCL errors detected"); + } + }}); + + a = sycl::malloc_shared(array_size, *queue); + b = sycl::malloc_shared(array_size, *queue); + c = sycl::malloc_shared(array_size, *queue); + sum = sycl::malloc_shared(1, *queue); + + // No longer need list of devices + devices.clear(); + cached = true; + + +} + +template +SYCLStream::~SYCLStream() { + sycl::free(a, *queue); + sycl::free(b, *queue); + sycl::free(c, *queue); + sycl::free(sum, *queue); +} + +template +void SYCLStream::copy() +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a](sycl::id<1> idx) + { + c[idx] = a[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, b = this->b, c = this->c](sycl::id<1> idx) + { + b[idx] = scalar * c[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::add() +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a, b = this->b](sycl::id<1> idx) + { + c[idx] = a[idx] + b[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar 
= startScalar; + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) + { + a[idx] = b[idx] + scalar * c[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::nstream() +{ + const T scalar = startScalar; + + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) + { + a[idx] += b[idx] + scalar * c[idx]; + }); + }); + queue->wait(); +} + +template +T SYCLStream::dot() +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, + // Reduction object, to perform summation - initialises the result to zero + // hipSYCL doesn't sypport the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + sycl::reduction(sum, sycl::plus()), +#else + sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), +#endif + [a = this->a, b = this->b](sycl::id<1> idx, auto& sum) + { + sum += a[idx] * b[idx]; + }); + + }); + queue->wait(); + return *sum; +} + +template +void SYCLStream::init_arrays(T initA, T initB, T initC) +{ + queue->submit([&](sycl::handler &cgh) + { + cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) + { + a[idx] = initA; + b[idx] = initB; + c[idx] = initC; + }); + }); + + queue->wait(); +} + +template +void SYCLStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +} + +void getDeviceList(void) +{ + // Ask SYCL runtime for all devices in system + devices = sycl::device::get_devices(); + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." 
<< std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + + if (device < devices.size()) + { + name = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + driver = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + +template class SYCLStream; +template class SYCLStream; diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h new file mode 100644 index 00000000..0b2dc0db --- /dev/null +++ b/src/sycl2020-usm/SYCLStream2020.h @@ -0,0 +1,54 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "SYCL2020 USM" + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + size_t array_size; + + // SYCL objects + // Queue is a pointer because we allow device selection + std::unique_ptr queue; + + // Buffers + T *a{}; + T *b{}; + T *c{}; + T *sum{}; + + public: + + SYCLStream(const size_t, const int); + ~SYCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; 
+ virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); diff --git a/src/sycl2020/model.cmake b/src/sycl2020-usm/model.cmake similarity index 96% rename from src/sycl2020/model.cmake rename to src/sycl2020-usm/model.cmake index 6a517c1c..81ad9d72 100644 --- a/src/sycl2020/model.cmake +++ b/src/sycl2020-usm/model.cmake @@ -19,9 +19,6 @@ register_flag_optional(SYCL_COMPILER_DIR HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") -register_flag_optional(OpenCL_LIBRARY - "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so" - "${OpenCL_LIBRARY}") macro(setup) set(CMAKE_CXX_STANDARD 17) From 87a38e949df2894a7d25ef8782dd96e3978f31ff Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Tue, 5 Sep 2023 03:35:05 +0100 Subject: [PATCH 58/89] Fix SYCL2020 accessors typo --- src/sycl2020-acc/SYCLStream2020.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sycl2020-acc/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp index f88cbbbe..0de24bbb 100644 --- a/src/sycl2020-acc/SYCLStream2020.cpp +++ b/src/sycl2020-acc/SYCLStream2020.cpp @@ -169,7 +169,7 @@ T SYCLStream::dot() #if defined(__HIPSYCL__) || defined(__OPENSYCL__) sycl::reduction(d_sum. 
template get_access(cgh), sycl::plus()), #else - sycl::reduction(d_sum, cgh sycl::plus()), + sycl::reduction(d_sum, cgh, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), #endif [=](sycl::id<1> idx, auto& sum) { From 3dcafd1af13ee24ca49cb42c26aa262944436e32 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 22 Sep 2023 02:31:14 +0100 Subject: [PATCH 59/89] Fix max element guard overflow for CUDA, resolves #136 --- src/cuda/CUDAStream.cu | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index 778a0445..e1abe343 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -48,33 +48,37 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) // Allocate the host array for partial sums for dot kernels sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + size_t array_bytes = sizeof(T); + array_bytes *= ARRAY_SIZE; + size_t total_bytes = array_bytes * 3; + // Check buffers fit on the device cudaDeviceProp props; cudaGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < total_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers #if defined(MANAGED) - cudaMallocManaged(&d_a, ARRAY_SIZE*sizeof(T)); + cudaMallocManaged(&d_a, array_bytes); check_error(); - cudaMallocManaged(&d_b, ARRAY_SIZE*sizeof(T)); + cudaMallocManaged(&d_b, array_bytes); check_error(); - cudaMallocManaged(&d_c, ARRAY_SIZE*sizeof(T)); + cudaMallocManaged(&d_c, array_bytes); check_error(); cudaMallocManaged(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); check_error(); #elif defined(PAGEFAULT) - d_a = (T*)malloc(sizeof(T)*ARRAY_SIZE); - d_b = (T*)malloc(sizeof(T)*ARRAY_SIZE); - d_c = (T*)malloc(sizeof(T)*ARRAY_SIZE); + d_a = (T*)malloc(array_bytes); + d_b = (T*)malloc(array_bytes); + d_c = (T*)malloc(array_bytes); d_sum = (T*)malloc(sizeof(T)*DOT_NUM_BLOCKS); 
#else - cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + cudaMalloc(&d_a, array_bytes); check_error(); - cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + cudaMalloc(&d_b, array_bytes); check_error(); - cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + cudaMalloc(&d_c, array_bytes); check_error(); cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); check_error(); From 72be9f698035b542bc1734dcbbd14543db55e244 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 21:11:35 +0100 Subject: [PATCH 60/89] Fix up CI, resolves #145, supersedes #154 Drop vector for std-* models --- .github/workflows/main.yaml | 84 +++++++++++++++-- CMakeLists.txt | 24 +++-- src/ci-prepare-bionic.sh | 92 +++++++++++-------- src/ci-test-compile.sh | 132 ++++++++++++++++----------- src/raja/model.cmake | 2 - src/std-data/STDDataStream.cpp | 49 ++++------ src/std-data/STDDataStream.h | 5 - src/std-data/model.cmake | 7 -- src/std-indices/STDIndicesStream.cpp | 57 +++--------- src/std-indices/STDIndicesStream.h | 5 - src/std-indices/model.cmake | 7 -- src/std-ranges/STDRangesStream.cpp | 32 ++----- src/std-ranges/STDRangesStream.hpp | 4 - src/std-ranges/model.cmake | 16 ++-- src/thrust/model.cmake | 12 ++- 15 files changed, 278 insertions(+), 250 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 08eed2d5..8dc6905a 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -12,7 +12,7 @@ on: jobs: test-rust: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 defaults: run: working-directory: ./src/rust/rust-stream @@ -28,7 +28,7 @@ jobs: run: ./target/release/rust-stream --arraysize 2048 test-java: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 defaults: run: working-directory: ./src/java/java-stream @@ -41,7 +41,7 @@ jobs: run: java -jar target/java-stream.jar --arraysize 2048 test-julia: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 defaults: run: working-directory: ./src/julia/JuliaStream.jl @@ -69,8 +69,24 @@ jobs: run: julia --project src/AMDGPUStream.jl 
--list + setup-cpp: + runs-on: ubuntu-22.04 + steps: + - name: Cache compiler + # if: ${{ !env.ACT }} + id: prepare-compilers + uses: actions/cache@v2 + with: + path: ./compilers + key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }} + + - name: Prepare compilers + if: steps.prepare-compilers.outputs.cache-hit != 'true' + run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true + test-cpp: - runs-on: ubuntu-18.04 + needs: setup-cpp + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -84,15 +100,15 @@ jobs: - name: Prepare compilers if: steps.prepare-compilers.outputs.cache-hit != 'true' - run: source ./src/ci-prepare-bionic.sh ./compilers SETUP true || true + run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true - name: Setup test environment run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true # Enable tmate debugging of manually-triggered workflows if the input option was provided - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} + # - name: Setup tmate session + # uses: mxschmitt/action-tmate@v3 + # if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} - name: Test compile gcc @ CMake 3.13 if: ${{ ! cancelled() }} @@ -167,4 +183,54 @@ jobs: run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_18_BIN }} - name: Test compile hipsycl @ CMake 3.18 if: ${{ ! cancelled() }} - run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }} \ No newline at end of file + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_18_BIN }} + + - name: Test compile gcc @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile clang @ CMake 3.20 + if: ${{ ! 
cancelled() }} + run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile nvhpc @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile aocc @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile aomp @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile hip @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile dpcpp @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_20_BIN }} + - name: Test compile hipsycl @ CMake 3.20 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_20_BIN }} + + - name: Test compile gcc @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build gcc all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile clang @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build clang all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile nvhpc @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build nvhpc all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile aocc @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aocc all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile aomp @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build aomp all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile hip @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hip all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile dpcpp @ CMake 3.24 + if: ${{ ! 
cancelled() }} + run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }} + - name: Test compile hipsycl @ CMake 3.24 + if: ${{ ! cancelled() }} + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index da112a44..879e4633 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,9 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) +if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") + cmake_policy(SET CMP0135 NEW) +endif () + project(BabelStream VERSION 4.0 LANGUAGES CXX) # uncomment for debugging build issues: @@ -71,15 +75,19 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS " # Honor user's CXX_EXTRA_LINK_FLAGS set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS}) -option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on models that +option(USE_TBB "Enable the oneTBB library for *supported* models. Enabling this on models that don't explicitly link against TBB is a no-op, see description of your selected model on how this is used." OFF) -if (USE_TBB) +option(FETCH_TBB "Fetch (download) the oneTBB library for *supported* models. This uses CMake's + FetchContent feature. Specify version by setting FETCH_TBB_VERSION" OFF) +set(FETCH_TBB_VERSION "v2021.10.0" CACHE STRING "Specify version of oneTBB to use if FETCH_TBB is ON") + +if (FETCH_TBB) FetchContent_Declare( TBB GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git - GIT_TAG v2021.9.0 + GIT_TAG "${FETCH_TBB_VERSION}" ) # Don't fail builds on waring (TBB has -Wall while not being free of warnings from unused symbols...) set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) @@ -92,15 +100,19 @@ if (USE_TBB) endif () endif () -option(USE_ONEDPL "Enable oneDPL library for *supported* models. Enabling this on models that +option(USE_ONEDPL "Enable the oneDPL library for *supported* models. 
Enabling this on models that don't explicitly link against DPL is a no-op, see description of your selected model on how this is used." OFF) -if (USE_ONEDPL) +option(FETCH_ONEDPL "Fetch (download) the oneDPL library for *supported* models. This uses CMake's + FetchContent feature. Specify version by setting FETCH_ONEDPL_VERSION" OFF) +set(FETCH_ONEDPL_VERSION "oneDPL-2022.2.0-rc1" CACHE STRING "Specify version of oneTBB to use if FETCH_ONEDPL is ON") + +if (FETCH_ONEDPL) FetchContent_Declare( oneDPL GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git - GIT_TAG oneDPL-2022.2.0-rc1 + GIT_TAG "${FETCH_ONEDPL_VERSION}" ) string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND) # XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package diff --git a/src/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh index 78bbd330..6a1a9595 100755 --- a/src/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -83,6 +83,8 @@ get() { if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then echo "$name not found, downloading..." wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name" + else + echo "$name found, skipping download..." fi fi } @@ -92,13 +94,15 @@ get_and_untar() { local pkg_url="$2" if [ "$SETUP" = true ]; then if [ ! -f "$name" ] || [ "$FORCE_DOWNLOAD" = true ]; then - echo "$name not found, downloading..." + echo "$name not found, downloading ($pkg_url)..." wget -q --show-progress --progress=bar:force:noscroll "$pkg_url" -O "$name" fi echo "Preparing to extract $name ..." tar -xf "$name" echo "$name extracted, deleting archive ..." rm -f "$name" # delete for space + else + echo "Skipping setup for $name ($pkg_url)..." 
fi } @@ -119,10 +123,10 @@ verify_dir_exists() { setup_aocc() { echo "Preparing AOCC" - local aocc_ver="2.3.0" + local aocc_ver="4.0.0" local tarball="aocc-$aocc_ver.tar.xz" # XXX it's actually XZ compressed, so it should be tar.xz - local AOCC_URL="http://developer.amd.com/wordpress/media/files/aocc-compiler-2.3.0.tar" + local AOCC_URL="https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${aocc_ver}.tar" # local AOCC_URL="http://localhost:8000/aocc-compiler-2.3.0.tar" get_and_untar "$tarball" "$AOCC_URL" @@ -133,10 +137,10 @@ setup_aocc() { } setup_nvhpc() { - echo "Preparing Nvidia HPC SDK" - local nvhpc_ver="22.3" - local nvhpc_release="2022_223" - local cuda_ver="11.6" + echo "Preparing Nvidia HPC SDK" + local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A + local nvhpc_release="2023_231" + local cuda_ver="12.0" local tarball="nvhpc_$nvhpc_ver.tar.gz" @@ -145,7 +149,7 @@ setup_nvhpc() { local sdk_dir="$PWD/nvhpc_${nvhpc_release}_Linux_x86_64_cuda_$cuda_ver/install_components/Linux_x86_64/$nvhpc_ver" local bin_dir="$sdk_dir/compilers/bin" - "$bin_dir/makelocalrc" "$bin_dir" -x + "$bin_dir/makelocalrc" -d "$bin_dir" -x -gpp g++-12 -gcc gcc-12 -g77 gfortran-12 export_var NVHPC_SDK_DIR "$sdk_dir" export_var NVHPC_CUDA_DIR "$sdk_dir/cuda/$cuda_ver" @@ -166,7 +170,8 @@ setup_nvhpc() { setup_aomp() { echo "Preparing AOMP" - local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_11.12-0/aomp_Ubuntu1804_11.12-0_amd64.deb" + local aomp_ver="18.0-0" + local AOMP_URL="https://github.com/ROCm-Developer-Tools/aomp/releases/download/rel_${aomp_ver}/aomp_Ubuntu2204_${aomp_ver}_amd64.deb" # local AOMP_URL="http://0.0.0.0:8000/aomp_Ubuntu1804_11.12-0_amd64.deb" get_and_install_deb "aomp" "aomp" "$AOMP_URL" @@ -189,9 +194,10 @@ setup_oclcpu() { setup_kokkos() { echo "Preparing Kokkos" - local kokkos_ver="3.3.01" + local kokkos_ver="4.1.00" local tarball="kokkos-$kokkos_ver.tar.gz" + local 
url="https://github.com/kokkos/kokkos/archive/$kokkos_ver.tar.gz" # local url="http://localhost:8000/$kokkos_ver.tar.gz" @@ -203,10 +209,10 @@ setup_kokkos() { setup_raja() { echo "Preparing RAJA" - local raja_ver="0.13.0" + local raja_ver="2023.06.1" local tarball="raja-$raja_ver.tar.gz" - local url="https://github.com/LLNL/RAJA/releases/download/v0.13.0/RAJA-v$raja_ver.tar.gz" + local url="https://github.com/LLNL/RAJA/releases/download/v$raja_ver/RAJA-v$raja_ver.tar.gz" # local url="http://localhost:8000/RAJA-v$raja_ver.tar.gz" get_and_untar "$tarball" "$url" @@ -217,7 +223,7 @@ setup_raja() { setup_tbb() { echo "Preparing TBB" - local tbb_ver="2021.2.0" + local tbb_ver="2021.9.0" local tarball="oneapi-tbb-$tbb_ver-lin.tgz" local url="https://github.com/oneapi-src/oneTBB/releases/download/v$tbb_ver/oneapi-tbb-$tbb_ver-lin.tgz" @@ -231,9 +237,9 @@ setup_tbb() { setup_clang_gcc() { - sudo apt-get install -y -qq gcc-10-offload-nvptx gcc-10-offload-amdgcn libtbb2 libtbb-dev g++-10 clang libomp-dev + sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6 - export_var GCC_CXX "$(which g++-10)" + export_var GCC_CXX "$(which g++-12)" verify_bin_exists "$GCC_CXX" "$GCC_CXX" --version @@ -254,7 +260,7 @@ setup_clang_gcc() { } setup_rocm() { - sudo apt-get install -y -qq rocm-dev rocthrust-dev + sudo apt-get install -y rocm-dev rocthrust-dev export_var ROCM_PATH "/opt/rocm" export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work export_var HIP_CXX "$ROCM_PATH/bin/hipcc" @@ -265,7 +271,7 @@ setup_rocm() { setup_dpcpp() { - local nightly="20210106" + local nightly="20230615" local tarball="dpcpp-$nightly.tar.gz" local url="https://github.com/intel/llvm/releases/download/sycl-nightly/$nightly/dpcpp-compiler.tar.gz" @@ -282,22 +288,22 @@ setup_dpcpp() { setup_hipsycl() { sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev - local hipsycl_ver="0.9.0" 
+ local hipsycl_ver="0.9.1" local tarball="v$hipsycl_ver.tar.gz" local install_dir="$PWD/hipsycl_dist_$hipsycl_ver" - local url="https://github.com/illuhad/hipSYCL/archive/v$hipsycl_ver.tar.gz" - # local url="http://localhost:8000/hipSYCL-$hipsycl_ver.tar.gz" + local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz" + # local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz" get_and_untar "$tarball" "$url" if [ "$SETUP" = true ]; then - local src="$PWD/hipSYCL-$hipsycl_ver" + local src="$PWD/AdaptiveCpp-$hipsycl_ver" rm -rf "$src/build" rm -rf "$install_dir" cmake "-B$src/build" "-H$src" \ - -DCMAKE_C_COMPILER="$(which gcc-10)" \ - -DCMAKE_CXX_COMPILER="$(which g++-10)" \ + -DCMAKE_C_COMPILER="$(which gcc-12)" \ + -DCMAKE_CXX_COMPILER="$(which g++-12)" \ -DCMAKE_INSTALL_PREFIX="$install_dir" \ -DWITH_ROCM_BACKEND=OFF \ -DWITH_CUDA_BACKEND=OFF \ @@ -312,25 +318,20 @@ setup_hipsycl() { check_size } -setup_computecpp() { - echo "TODO ComputeCpp requires registration+login to download" -} - if [ "${GITHUB_ACTIONS:-false}" = true ]; then echo "Running in GitHub Actions, defaulting to special export" TERM=xterm export TERM=xterm - # drop the lock in case we got one from a failed run - rm /var/lib/dpkg/lock-frontend || true - rm /var/cache/apt/archives/lock || true - - wget -q -O - "https://repo.radeon.com/rocm/rocm.gpg.key" | sudo apt-key add - - echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee -a /etc/apt/sources.list - echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/4.5 ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list + rm -rf /var/lib/dpkg/lock-frontend || true + rm -rf /var/cache/apt/archives/lock || true + mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7 jammy main' | sudo 
tee /etc/apt/sources.list.d/rocm.list + echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 sudo apt-get update -qq - sudo apt-get install -y -qq cmake + sudo apt-get install cmake gcc g++ libelf-dev libdrm-amdgpu1 libnuma-dev if [ "$SETUP" = true ]; then echo "Deleting extra packages for space in 2 seconds..." @@ -340,6 +341,7 @@ if [ "${GITHUB_ACTIONS:-false}" = true ]; then sudo apt-get autoremove -y check_size fi + sudo apt-get upgrade -qq else echo "Running locally, defaulting to standard export" fi @@ -368,6 +370,18 @@ setup_cmake() { verify_bin_exists "$CMAKE_3_18_BIN" "$CMAKE_3_18_BIN" --version + get "cmake-3.20.sh" "$cmake_release/v3.20.4/cmake-3.20.4-linux-x86_64.sh" + chmod +x "./cmake-3.20.sh" && "./cmake-3.20.sh" --skip-license --include-subdir + export_var CMAKE_3_20_BIN "$PWD/cmake-3.20.4-linux-x86_64/bin/cmake" + verify_bin_exists "$CMAKE_3_20_BIN" + "$CMAKE_3_20_BIN" --version + + get "cmake-3.24.sh" "$cmake_release/v3.24.4/cmake-3.24.4-linux-x86_64.sh" + chmod +x "./cmake-3.24.sh" && "./cmake-3.24.sh" --skip-license --include-subdir + export_var CMAKE_3_24_BIN "$PWD/cmake-3.24.4-linux-x86_64/bin/cmake" + verify_bin_exists "$CMAKE_3_24_BIN" + "$CMAKE_3_24_BIN" --version + check_size } @@ -385,6 +399,10 @@ if [ "$PARALLEL" = true ]; then setup_tbb & wait else + # these need apt + setup_clang_gcc + setup_rocm + setup_hipsycl setup_cmake setup_aocc setup_oclcpu @@ -394,10 +412,6 @@ else setup_kokkos setup_raja setup_tbb - # these need apt - setup_clang_gcc - setup_rocm - setup_hipsycl fi echo "Done!" 
diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index d3fc5b71..610c3f07 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -120,10 +120,21 @@ run_build() { # CLANG_OMP_OFFLOAD_NVIDIA=false ### +NV_ARCH_CC="70" AMD_ARCH="gfx_903" -NV_ARCH="sm_70" +NV_ARCH="sm_${NV_ARCH_CC}" NV_ARCH_CCXY="cuda${NVHPC_CUDA_VER:?},cc80" +check_cmake_ver(){ + local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) + local required=$1 + if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then + return 0 + else + return 1 + fi +} + build_gcc() { local name="gcc_build" local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}" @@ -138,14 +149,12 @@ build_gcc() { for use_onedpl in OFF OPENMP TBB; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; + *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac - for use_vector in OFF ON; do - # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - done + # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here + run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" done run_build 
$name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -153,40 +162,45 @@ build_gcc() { run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then - run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" + run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa;-fno-stack-protector;-fcf-protection=none" run_build "amd_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=AMD:$AMD_ARCH" fi if [ "${GCC_OMP_OFFLOAD_NVIDIA:-false}" != "false" ]; then - run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none" + run_build "nvidia_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=nvptx-none;-fno-stack-protector;-fcf-protection=none" run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" run_build $name "${GCC_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - # run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON" - run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + if check_cmake_ver "3.16.0"; then + # run_build $name "${CC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_CUDA=ON" + run_build "cuda_$name" "${GCC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + else + echo "Skipping Kokkos models due to CMake version requirement" + fi run_build $name "${GCC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" - -# FIXME fails due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100102 
-# FIXME we also got https://github.com/NVIDIA/nccl/issues/494 - -# run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ -# -DENABLE_CUDA=ON \ -# -DTARGET=NVIDIA \ -# -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ -# -DCUDA_ARCH=$NV_ARCH" + if check_cmake_ver "3.20.0"; then + run_build $name "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi + if check_cmake_ver "3.20.0"; then + run_build "cuda_$name" "${GCC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} \ + -DENABLE_CUDA=ON \ + -DTARGET=NVIDIA \ + -DCUDA_TOOLKIT_ROOT_DIR=${NVHPC_CUDA_DIR:?} \ + -DCUDA_ARCH=$NV_ARCH" + else + echo "Skipping RAJA models due to CMake version requirement" + fi - # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements - local current=$("$CMAKE_BIN" --version | head -n 1 | cut -d ' ' -f3) - local required="3.15.0" - if [ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]; then - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=OMP" - run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP" + if check_cmake_ver "3.18.0"; then # CMake >= 3.15 only due to Nvidia's Thrust CMake requirements + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CUDA" +# run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake 
-DTHRUST_IMPL=CUDA -DBACKEND=OMP" # FIXME + run_build $name "${GCC_CXX:?}" thrust "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH_CC -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=CPP" # FIXME CUDA Thrust + TBB throws the following error: # /usr/lib/gcc/x86_64-linux-gnu/9/include/avx512fintrin.h(9146): error: identifier "__builtin_ia32_rndscaless_round" is undefined @@ -198,7 +212,7 @@ build_gcc() { # run_build $name "${GCC_CXX:?}" THRUST "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DSDK_DIR=$NVHPC_CUDA_DIR/lib64/cmake -DTHRUST_IMPL=CUDA -DBACKEND=TBB" else - echo "CMake version ${current} < ${required}, skipping Thrust models" + echo "Skipping Thrust models due to CMake version requirement" fi } @@ -216,30 +230,39 @@ build_clang() { run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi - run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + if check_cmake_ver "3.20.0"; then + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + if check_cmake_ver "3.16.0"; then + run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + else + echo "Skipping Kokkos models due to CMake version requirement" + fi run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" for use_onedpl in OFF OPENMP TBB; do - for use_vector in OFF ON; do - case "$use_onedpl" 
in - OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; - esac - run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " - run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported - done + case "$use_onedpl" in + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; + esac + run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported done run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors - - run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" + if check_cmake_ver "3.20.0"; then + run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi # no clang /w RAJA+cuda because it needs nvcc which needs gcc } @@ -249,10 +272,6 @@ build_nvhpc() { run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" - # std again but with vectors - run_build $name 
"${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON" - run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON" - run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" } @@ -291,15 +310,18 @@ build_icpc() { local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}" run_build $name "${ICPC_CXX:?}" omp "$cxx" run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" - run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" -} + if check_cmake_ver "3.20.0"; then + run_build $name "${ICPC_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?} -DENABLE_OPENMP=ON" + else + echo "Skipping RAJA models due to CMake version requirement" + fi + + if check_cmake_ver "3.16.0"; then + run_build $name "${ICPC_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" + else + echo "Skipping Kokkos models due to CMake version requirement" + fi -build_computecpp() { - run_build computecpp_build "compute++" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} \ - -DSYCL_COMPILER=COMPUTECPP \ - -DSYCL_COMPILER_DIR=${COMPUTECPP_DIR:?} \ - -DOpenCL_LIBRARY=${OCL_LIB:?}" } build_dpcpp() { diff --git a/src/raja/model.cmake b/src/raja/model.cmake index eb4788cd..bf306313 100644 --- a/src/raja/model.cmake +++ b/src/raja/model.cmake @@ -8,8 +8,6 @@ register_flag_optional(RAJA_IN_TREE Make sure to use the release version of RAJA or clone RAJA recursively with submodules. Remember to append RAJA specific flags as well, for example: -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ... - For RAJA >= v2022.03.0, remember to use the RAJA prefixed CMake options: - -DRAJA_IN_TREE=... -DRAJA_ENABLE_OPENMP=ON -DRAJA_ENABLE_CUDA=ON ... 
See https://github.com/LLNL/RAJA/blob/08cbbafd2d21589ebf341f7275c229412d0fe903/CMakeLists.txt#L44 for all available options " "") diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 3d7ef18a..e426835d 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -6,22 +6,10 @@ #include "STDDataStream.h" -#ifdef USE_VECTOR -#define BEGIN(x) (x).begin() -#define END(x) (x).end() -#else -#define BEGIN(x) (x) -#define END(x) ((x) + array_size) -#endif - template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, -#ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) -#else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -#endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -41,55 +29,53 @@ STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) template STDDataStream::~STDDataStream() { -#ifndef USE_VECTOR - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -#endif + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); } template void STDDataStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, BEGIN(a), END(a), initA); - std::fill(exe_policy, BEGIN(b), END(b), initB); - std::fill(exe_policy, BEGIN(c), END(c), initC); + std::fill(exe_policy, a, a + array_size, initA); + std::fill(exe_policy, b, b + array_size, initB); + std::fill(exe_policy, c, c + array_size, initC); } template void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - std::copy(BEGIN(a), END(a), h_a.begin()); - std::copy(BEGIN(b), END(b), h_b.begin()); - std::copy(BEGIN(c), END(c), h_c.begin()); + std::copy(a, a + array_size, h_a.begin()); + std::copy(b, b + array_size, h_b.begin()); + std::copy(c, c + array_size, h_c.begin()); } template void STDDataStream::copy() { // c[i] = a[i] - std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); + std::copy(exe_policy, 
a, a + array_size, c); } template void STDDataStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); + std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; }); } template void STDDataStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); + std::transform(exe_policy, a, a + array_size, b, c, std::plus()); } template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); + std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); } template @@ -99,8 +85,8 @@ void STDDataStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); + std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); } @@ -108,7 +94,7 @@ template T STDDataStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); + return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0); } void listDevices(void) @@ -127,6 +113,3 @@ std::string getDeviceDriver(const int) } template class STDDataStream; template class STDDataStream; - -#undef BEGIN -#undef END diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 911a621b..65e1acee 100644 --- a/src/std-data/STDDataStream.h +++ 
b/src/std-data/STDDataStream.h @@ -22,12 +22,7 @@ class STDDataStream : public Stream int array_size; // Device side pointers -#ifdef USE_VECTOR - std::vector a, b, c; -#else T *a, *b, *c; -#endif - public: STDDataStream(const int, int) noexcept; diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index e1697b6d..e9e70998 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") -register_flag_optional(USE_VECTOR - "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." - "OFF") - register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -47,9 +43,6 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - if (USE_VECTOR) - register_definitions(USE_VECTOR) - endif () if (USE_TBB) register_link_library(TBB::tbb) endif () diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 6e135976..1cf1cccb 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -10,32 +10,10 @@ #define ALIGNMENT (2*1024*1024) // 2MB #endif -#ifdef USE_VECTOR -#define BEGIN(x) (x).begin() -#define END(x) (x).end() -#else -#define BEGIN(x) (x) -#define END(x) ((x) + array_size) -#endif - -#ifdef USE_VECTOR -#if (defined(__NVCOMPILER) || defined(__NVCOMPILER_LLVM__)) -#error "std::vector *is* supported in NVHPC if we capture `this`, however, oneDPL (via SYCL2020) only works correctly with explicit *value* captures." 
-#endif - -#if defined(USE_ONEDPL) -#error "std::vector is unspported: oneDPL (via SYCL2020) only works correctly with explicit *value* captures" -#endif -#endif - template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, range(0, array_size), -#ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) -#else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -#endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -55,41 +33,39 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size), template STDIndicesStream::~STDIndicesStream() { -#ifndef USE_VECTOR - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -#endif + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); } template void STDIndicesStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, BEGIN(a), END(a), initA); - std::fill(exe_policy, BEGIN(b), END(b), initB); - std::fill(exe_policy, BEGIN(c), END(c), initC); + std::fill(exe_policy, a, a + array_size, initA); + std::fill(exe_policy, b, b + array_size, initB); + std::fill(exe_policy, c, c + array_size, initC); } template void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - std::copy(BEGIN(a), END(a), h_a.begin()); - std::copy(BEGIN(b), END(b), h_b.begin()); - std::copy(BEGIN(c), END(c), h_c.begin()); + std::copy(a, a + array_size, h_a.begin()); + std::copy(b, b + array_size, h_b.begin()); + std::copy(c, c + array_size, h_c.begin()); } template void STDIndicesStream::copy() { // c[i] = a[i] - std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); + std::copy(exe_policy, a, a + array_size, c); } template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [c = this->c, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) 
{ return scalar * c[i]; }); } @@ -98,7 +74,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [a = this->a, b = this->b](int i) { + std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) { return a[i] + b[i]; }); } @@ -107,7 +83,7 @@ template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [b = this->b, c = this->c, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -119,7 +95,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } @@ -129,7 +105,7 @@ template T STDIndicesStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); + return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0); } void listDevices(void) @@ -148,6 +124,3 @@ std::string getDeviceDriver(const int) } template class STDIndicesStream; template class STDIndicesStream; - -#undef BEGIN -#undef END diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 0916ef22..ffab9103 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -77,12 +77,7 @@ class STDIndicesStream : public Stream ranged range; // Device side pointers -#ifdef USE_VECTOR - std::vector a, b, c; -#else T *a, *b, *c; -#endif - public: STDIndicesStream(const int, int) 
noexcept; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index c2fef288..60ef575f 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") -register_flag_optional(USE_VECTOR - "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." - "OFF") - register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -47,9 +43,6 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - if (USE_VECTOR) - register_definitions(USE_VECTOR) - endif () if (USE_TBB) register_link_library(TBB::tbb) endif () diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index e05a7d1c..d4976918 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -5,27 +5,16 @@ // source code #include "STDRangesStream.hpp" +#include #ifndef ALIGNMENT #define ALIGNMENT (2*1024*1024) // 2MB #endif -#ifdef USE_VECTOR -#define BEGIN(x) (x).begin() -#define END(x) (x).end() -#else -#define BEGIN(x) (x) -#define END(x) ((x) + array_size) -#endif - template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, -#ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) -#else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -#endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -45,11 +34,9 @@ noexcept : array_size{ARRAY_SIZE}, template STDRangesStream::~STDRangesStream() { -#ifndef USE_VECTOR - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -#endif 
+ dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); } template @@ -70,9 +57,9 @@ template void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. - std::copy(BEGIN(a), END(a), h_a.begin()); - std::copy(BEGIN(b), END(b), h_b.begin()); - std::copy(BEGIN(c), END(c), h_c.begin()); + std::copy(a, a + array_size, h_a.begin()); + std::copy(b, b + array_size, h_b.begin()); + std::copy(c, c + array_size, h_c.begin()); } template @@ -148,7 +135,7 @@ T STDRangesStream::dot() return std::transform_reduce( exe_policy, - BEGIN(a), END(a), BEGIN(b), 0.0); + a, a + array_size, b, 0.0); } void listDevices(void) @@ -168,6 +155,3 @@ std::string getDeviceDriver(const int) template class STDRangesStream; template class STDRangesStream; - -#undef BEGIN -#undef END diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 9d36d46b..6e7c29c6 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -21,11 +21,7 @@ class STDRangesStream : public Stream int array_size; // Device side pointers -#ifdef USE_VECTOR - std::vector a, b, c; -#else T *a, *b, *c; -#endif public: STDRangesStream(const int, int) noexcept; diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index 35554c77..8f735010 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -3,10 +3,6 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" "c++") -register_flag_optional(USE_VECTOR - "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." - "OFF") - register_flag_optional(USE_TBB "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." 
"OFF") @@ -32,10 +28,7 @@ macro(setup) set(CMAKE_CXX_STANDARD_REQUIRED OFF) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: - register_append_cxx_flags(ANY -std=c++2a) - if (USE_VECTOR) - register_definitions(USE_VECTOR) - endif () + register_append_cxx_flags(ANY -std=c++20) if (USE_TBB) register_link_library(TBB::tbb) endif () @@ -44,3 +37,10 @@ macro(setup) register_link_library(oneDPL) endif () endmacro() + +macro(setup_target NAME) + if (USE_ONEDPL) + target_compile_features(${NAME} INTERFACE cxx_std_20) + target_compile_features(oneDPL INTERFACE cxx_std_20) + endif () +endmacro() diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake index 91821ef1..6b82ef59 100644 --- a/src/thrust/model.cmake +++ b/src/thrust/model.cmake @@ -46,11 +46,12 @@ macro(setup) # see CUDA.cmake, we're only adding a few Thrust related libraries here if (POLICY CMP0104) - cmake_policy(SET CMP0104 OLD) + cmake_policy(SET CMP0104 NEW) endif () + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) # add -forward-unknown-to-host-compiler for compatibility reasons - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda -forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS}) enable_language(CUDA) # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # appended later @@ -63,6 +64,7 @@ macro(setup) # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/` # same thing for thrust if (SDK_DIR) + list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR}) find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub) find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust) else () @@ -73,9 +75,11 @@ macro(setup) message(STATUS "Using Thrust backend: ${BACKEND}") # this creates the interface that we can link to - thrust_create_target(Thrust HOST CPP DEVICE ${BACKEND}) + 
thrust_create_target(Thrust${BACKEND} + HOST CPP + DEVICE ${BACKEND}) - register_link_library(Thrust) + register_link_library(Thrust${BACKEND}) elseif (${THRUST_IMPL} STREQUAL "ROCM") if (SDK_DIR) find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim) From a542f3bf67f08501a17b1432e3ac6daf8a8ef442 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 21:41:31 +0100 Subject: [PATCH 61/89] Add experimental CI action for more disk space Don't use CI action dependencies Bump CI checkout/cache versions --- .github/workflows/main.yaml | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 8dc6905a..4f9b833c 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -17,7 +17,7 @@ jobs: run: working-directory: ./src/rust/rust-stream steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Setup project run: rustup install nightly - name: Compile project @@ -33,7 +33,7 @@ jobs: run: working-directory: ./src/java/java-stream steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Test build project run: ./mvnw clean package - name: Test run @@ -46,7 +46,7 @@ jobs: run: working-directory: ./src/julia/JuliaStream.jl steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Setup project run: julia --project -e 'import Pkg; Pkg.instantiate()' - name: Test run PlainStream.jl @@ -69,38 +69,30 @@ jobs: run: julia --project src/AMDGPUStream.jl --list - setup-cpp: + test-cpp: runs-on: ubuntu-22.04 steps: - - name: Cache compiler - # if: ${{ !env.ACT }} - id: prepare-compilers - uses: actions/cache@v2 + - name: Maximize build space + uses: easimon/maximize-build-space@v8 with: - path: ./compilers - key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }} + root-reserve-mb: 512 + swap-size-mb: 1024 + remove-android: 'true' + remove-codeql: 'true' - - name: Prepare compilers - 
if: steps.prepare-compilers.outputs.cache-hit != 'true' - run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true - - test-cpp: - needs: setup-cpp - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Cache compiler if: ${{ !env.ACT }} id: prepare-compilers - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: ./compilers key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }} - name: Prepare compilers if: steps.prepare-compilers.outputs.cache-hit != 'true' - run: source ./src/ci-prepare-bionic.sh ./compilers SETUP false || true + run: source ./src/ci-prepare-bionic.sh ./compilers SETUP true || true - name: Setup test environment run: source ./src/ci-prepare-bionic.sh ./compilers VARS false || true From aea4e7d2a4d463672b91b269f3e0b40d53291fc2 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 21:45:47 +0100 Subject: [PATCH 62/89] Fix CI cache version --- .github/workflows/main.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 4f9b833c..b96e8801 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -85,7 +85,7 @@ jobs: - name: Cache compiler if: ${{ !env.ACT }} id: prepare-compilers - uses: actions/cache@v4 + uses: actions/cache@v3 with: path: ./compilers key: ${{ runner.os }}-${{ hashFiles('./src/ci-prepare-bionic.sh') }} From 154ad9f29765f33ad2003198671bf576591b9839 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 21:51:26 +0100 Subject: [PATCH 63/89] Bump rust lockfiles --- src/rust/rust-stream/Cargo.lock | 461 ++++++++++++++++++++------------ 1 file changed, 284 insertions(+), 177 deletions(-) diff --git a/src/rust/rust-stream/Cargo.lock b/src/rust/rust-stream/Cargo.lock index 723849ad..20be2876 100644 --- a/src/rust/rust-stream/Cargo.lock +++ b/src/rust/rust-stream/Cargo.lock @@ -18,14 +18,14 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5" dependencies = [ "quote", - "syn", + "syn 1.0.109", ] [[package]] name = "async-channel" -version = "1.7.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14485364214912d3b19cc3435dde4df66065127f05fa0d75c712f36f12c2f28" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" dependencies = [ "concurrent-queue", "event-listener", @@ -34,23 +34,23 @@ dependencies = [ [[package]] name = "async-executor" -version = "1.4.1" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "871f9bb5e0a22eeb7e8cf16641feb87c9dc67032ccf8ff49e772eb9941d3a965" +checksum = "6fa3dc5f2a8564f07759c008b9109dc0d39de92a88d5588b8a5036d286383afb" dependencies = [ + "async-lock", "async-task", "concurrent-queue", "fastrand", "futures-lite", - "once_cell", "slab", ] [[package]] name = "async-global-executor" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0da5b41ee986eed3f524c380e6d64965aea573882a8907682ad100f7859305ca" +checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776" dependencies = [ "async-channel", "async-executor", @@ -63,29 +63,29 @@ dependencies = [ [[package]] name = "async-io" -version = "1.9.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83e21f3a490c72b3b0cf44962180e60045de2925d8dff97918f7ee43c8f637c7" +checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" dependencies = [ + "async-lock", "autocfg", + "cfg-if", "concurrent-queue", "futures-lite", - "libc", "log", - "once_cell", "parking", "polling", + "rustix", "slab", "socket2", "waker-fn", - "winapi 0.3.9", ] [[package]] name = "async-lock" -version = "2.5.0" +version = "2.8.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e97a171d191782fba31bb902b14ad94e24a68145032b7eedf871ab0bc0d077b6" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" dependencies = [ "event-listener", ] @@ -119,15 +119,15 @@ dependencies = [ [[package]] name = "async-task" -version = "4.3.0" +version = "4.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a40729d2133846d9ed0ea60a8b9541bccddab49cd30f0715a1da672fe9a2524" +checksum = "ecc7ab41815b3c653ccd2978ec3255c81349336702dfdf62ee6f7069b12a3aae" [[package]] name = "atomic-waker" -version = "1.0.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "065374052e7df7ee4047b1160cca5e1467a12351a40b3da123c870ba0b8eda2a" +checksum = "1181e1e0d1fce796a03db1ae795d67167da795f9cf4a39c37589e85ef57f26d3" [[package]] name = "atty" @@ -135,7 +135,7 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", "winapi 0.3.9", ] @@ -154,35 +154,33 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "blocking" -version = "1.2.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6ccb65d468978a086b69884437ded69a90faab3bbe6e67f242173ea728acccc" +checksum = "77231a1c8f801696fc0123ec6150ce92cffb8e164a02afb9c8ddee0e9b65ad65" dependencies = [ "async-channel", + "async-lock", "async-task", "atomic-waker", "fastrand", "futures-lite", - "once_cell", + "log", ] [[package]] name = "bumpalo" -version = "3.11.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" - -[[package]] -name = "cache-padded" -version = "1.2.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "cc" -version = "1.0.73" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] [[package]] name = "cfg-if" @@ -216,11 +214,11 @@ dependencies = [ [[package]] name = "concurrent-queue" -version = "1.2.4" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af4780a44ab5696ea9e28294517f1fffb421a83a25af521333c838635509db9c" +checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c" dependencies = [ - "cache-padded", + "crossbeam-utils", ] [[package]] @@ -251,9 +249,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.6" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ "cfg-if", "crossbeam-utils", @@ -261,9 +259,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -272,9 +270,9 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.11" +version = "0.9.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348" +checksum 
= "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" dependencies = [ "autocfg", "cfg-if", @@ -285,9 +283,9 @@ dependencies = [ [[package]] name = "crossbeam-queue" -version = "0.3.6" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd42583b04998a5363558e5f9291ee5a5ff6b49944332103f251e7479a82aa7" +checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" dependencies = [ "cfg-if", "crossbeam-utils", @@ -295,9 +293,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.12" +version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edbafec5fa1f196ca66527c1b12c2ec4745ca14b50f1ad8f9f6f720b55d11fac" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" dependencies = [ "cfg-if", ] @@ -328,20 +326,31 @@ dependencies = [ ] [[package]] -name = "ctor" -version = "0.1.23" +name = "either" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdffe87e1d521a10f9696f833fe502293ea446d7f256c06128293a4119bdf4cb" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "errno" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" dependencies = [ - "quote", - "syn", + "errno-dragonfly", + "libc", + "windows-sys", ] [[package]] -name = "either" -version = "1.8.0" +name = "errno-dragonfly" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] [[package]] name = "event-listener" @@ -351,18 +360,18 @@ checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" [[package]] name = 
"fastrand" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" dependencies = [ "instant", ] [[package]] name = "futures" -version = "0.3.24" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" dependencies = [ "futures-channel", "futures-core", @@ -375,9 +384,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.24" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", "futures-sink", @@ -385,15 +394,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.24" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" [[package]] name = "futures-executor" -version = "0.3.24" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" dependencies = [ "futures-core", "futures-task", @@ -402,15 +411,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.24" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" +checksum = 
"4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" [[package]] name = "futures-lite" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7694489acd39452c77daa48516b894c153f192c3578d5a839b62c58099fcbf48" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" dependencies = [ "fastrand", "futures-core", @@ -423,26 +432,26 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.24" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.37", ] [[package]] name = "futures-sink" -version = "0.3.24" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" [[package]] name = "futures-task" -version = "0.3.24" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" [[package]] name = "futures-timer" @@ -452,9 +461,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.24" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ "futures-channel", "futures-core", @@ -470,9 +479,9 @@ dependencies = [ [[package]] name = "gloo-timers" -version = "0.2.4" +version = "0.2.6" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fb7d06c1c8cc2a29bee7ec961009a0b2caa0793ee4900c2ffb348734ba1c8f9" +checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c" dependencies = [ "futures-channel", "futures-core", @@ -498,6 +507,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" + [[package]] name = "instant" version = "0.1.12" @@ -507,11 +522,22 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +dependencies = [ + "hermit-abi 0.3.3", + "libc", + "windows-sys", +] + [[package]] name = "js-sys" -version = "0.3.60" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -543,15 +569,21 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.134" +version = "0.2.148" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" +checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" + +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "lock_api" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" dependencies = [ "autocfg", "scopeguard", @@ -559,25 +591,24 @@ dependencies = [ [[package]] name = "log" -version = "0.4.17" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" dependencies = [ - "cfg-if", "value-bag", ] [[package]] name = "memchr" -version = "2.5.0" +version = "2.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" [[package]] name = "memoffset" -version = "0.6.5" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ "autocfg", ] @@ -615,34 +646,34 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" dependencies = [ "autocfg", ] [[package]] name = "num_cpus" -version = "1.13.1" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.3", "libc", ] [[package]] name = "once_cell" -version = "1.15.0" +version = "1.18.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "parking" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" +checksum = "14f2252c834a40ed9bb5422029649578e63aa341ac401f74e719dd1afda8394e" [[package]] name = "parking_lot" @@ -657,9 +688,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" dependencies = [ "cfg-if", "instant", @@ -671,9 +702,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.9" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" [[package]] name = "pin-utils" @@ -683,16 +714,18 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "polling" -version = "2.3.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "899b00b9c8ab553c743b3e11e87c5c7d423b2a2de229ba95b24a756344748011" +checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" dependencies = [ "autocfg", + "bitflags", "cfg-if", + "concurrent-queue", "libc", "log", - "wepoll-ffi", - "winapi 0.3.9", + "pin-project-lite", + "windows-sys", ] [[package]] @@ -704,7 +737,7 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn", + "syn 1.0.109", "version_check", ] @@ -721,44 +754,40 @@ 
dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.46" +version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" +checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.21" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.5.3" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" +checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" dependencies = [ - "autocfg", - "crossbeam-deque", "either", "rayon-core", ] [[package]] name = "rayon-core" -version = "1.9.3" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" +checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" dependencies = [ - "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "num_cpus", ] [[package]] @@ -793,7 +822,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn", + "syn 1.0.109", ] [[package]] @@ -822,23 +851,37 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.37.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustversion" -version = "1.0.9" +version = "1.0.14" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" [[package]] name = "scopeguard" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" -version = "1.0.14" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" +checksum = "ad977052201c6de01a8ef2aa3378c4bd23217a056337d1d6da40468d267a4fb0" [[package]] name = "signal-hook" @@ -853,33 +896,33 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" dependencies = [ "libc", ] [[package]] name = "slab" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ "autocfg", ] [[package]] name = "smallvec" -version = "1.9.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" [[package]] name = "socket2" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" dependencies = [ "libc", "winapi 0.3.9", @@ -912,14 +955,25 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 1.0.109", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", ] [[package]] name = "syn" -version = "1.0.101" +version = "2.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" +checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" dependencies = [ "proc-macro2", "quote", @@ -946,31 +1000,27 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.4" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-segmentation" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" [[package]] name = "unicode-width" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" [[package]] name = "value-bag" -version = "1.0.0-alpha.9" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55" -dependencies = [ - "ctor", - "version_check", -] +checksum = "d92ccd67fb88503048c01b59152a04effd0782d035a83a6d256ce6085f08f4a3" [[package]] name = "vec_map" @@ -992,9 +1042,9 @@ checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" [[package]] name = "wasm-bindgen" -version = "0.2.83" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -1002,24 +1052,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.83" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.37", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.33" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" dependencies = [ "cfg-if", "js-sys", @@ -1029,9 +1079,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.83" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1039,42 +1089,33 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.83" +version = 
"0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.37", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.83" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "web-sys" -version = "0.3.60" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" dependencies = [ "js-sys", "wasm-bindgen", ] -[[package]] -name = "wepoll-ffi" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d743fdedc5c64377b5fc2bc036b01c7fd642205a0d96356034ae3404d49eb7fb" -dependencies = [ - "cc", -] - [[package]] name = "winapi" version = "0.2.8" @@ -1108,3 +1149,69 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + 
"windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" From 5a1be9399ce17eb23817e1c7cd858432e8c53c97 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 22:40:04 +0100 Subject: [PATCH 64/89] Bump Julia lockfiles and compat to 1.9 --- src/julia/JuliaStream.jl/AMDGPU/Manifest.toml | 558 ++++++------- src/julia/JuliaStream.jl/AMDGPU/Project.toml | 2 +- src/julia/JuliaStream.jl/CUDA/Manifest.toml | 559 +++++++++---- 
src/julia/JuliaStream.jl/CUDA/Project.toml | 2 +- .../KernelAbstractions/Manifest.toml | 716 ++++++++++------ .../KernelAbstractions/Project.toml | 2 +- src/julia/JuliaStream.jl/Manifest.toml | 778 +++++++++++------- src/julia/JuliaStream.jl/Project.toml | 2 +- .../JuliaStream.jl/Threaded/Manifest.toml | 20 +- .../JuliaStream.jl/Threaded/Project.toml | 2 +- src/julia/JuliaStream.jl/oneAPI/Manifest.toml | 448 ++++++---- src/julia/JuliaStream.jl/oneAPI/Project.toml | 2 +- src/julia/JuliaStream.jl/src/Stream.jl | 8 +- src/julia/JuliaStream.jl/update_all.sh | 5 +- 14 files changed, 1904 insertions(+), 1200 deletions(-) diff --git a/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml b/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml index 170213cb..9415ddca 100644 --- a/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml +++ b/src/julia/JuliaStream.jl/AMDGPU/Manifest.toml @@ -1,415 +1,423 @@ # This file is machine-generated - editing it directly is not advised -[[AMDGPU]] -deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "Statistics", "hsa_rocr_jll"] -git-tree-sha1 = "d59f1cf3f90ae6cf6626e8a21f337850cb3792f7" +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "05982ec0602af8ada9509107382dd6c8b21db9b9" + +[[deps.AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLD_jll", "LLVM", "LLVM_jll", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Preferences", "Printf", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "95437cf4c0ad651ca8463475de8af6a6935e23bd" uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" -version = "0.2.17" +version = "0.6.1" -[[AbstractFFTs]] +[[deps.AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +git-tree-sha1 = 
"d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" +version = "1.5.0" -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + + [deps.AbstractFFTs.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +version = "3.6.2" +weakdeps = ["StaticArrays"] + + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" -[[ArgParse]] +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[BinaryProvider]] -deps = ["Libdl", "Logging", "SHA"] -git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.10" +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" -[[Bzip2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" -uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" -version = "1.0.8+0" +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = 
"fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" -[[ConstructionBase]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" -uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" -version = "1.3.0" +[[deps.CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" -[[Dates]] +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +[[deps.DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.9.3" -[[Elfutils_jll]] -deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] -git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" -uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" -version = "0.182.0+0" +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +version = "0.1.10" -[[Future]] -deps = ["Random"] -uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8" uuid = 
"0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "9.0.0" -[[GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" -uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.12.9" +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" -[[HIP_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] -git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" -uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" -version = "4.0.0+1" +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.24.5" -[[InteractiveUtils]] +[[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.2.2" + +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.5.0" + +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.8" -[[LLVM]] + [deps.KernelAbstractions.extensions] + 
EnzymeExt = "EnzymeCore" + + [deps.KernelAbstractions.weakdeps] + EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" + +[[deps.LLD_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109" +version = "14.0.6+3" + +[[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" +git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +version = "6.2.1" -[[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +version = "0.0.25+0" + +[[deps.LLVM_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +git-tree-sha1 = "c5131b433876973cf29a2d9ec426cc099567e68c" +uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c" +version = "14.0.6+4" -[[LibCURL]] +[[deps.LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] +[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[Libgcrypt_jll]] -deps = 
["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] -git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" -uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" -version = "1.8.7+0" - -[[Libglvnd_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] -git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" -uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" -version = "1.3.0+3" - -[[Libgpg_error_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" -uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" -version = "1.42.0+0" - -[[Libiconv_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" -uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" -version = "1.16.1+1" - -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[Logging]] +[[deps.LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.26" + + [deps.LogExpFunctions.extensions] + LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" + + [deps.LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[MacroTools]] +[[deps.MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf" +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.9" 
+version = "0.5.11" -[[Markdown]] +[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NUMA_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" -uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" -version = "2.0.13+1" - -[[NetworkOptions]] +[[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OpenLibm_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +version = "0.8.1+0" + +[[deps.OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", 
"Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" -[[Preferences]] +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.2.0" + +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" -[[Printf]] +[[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[ROCmCompilerSupport_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] -git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" -uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" -version = "4.0.0+1" - -[[ROCmDeviceLibs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" -uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" -version = "4.0.0+0" - -[[ROCmOpenCLRuntime_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] -git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" -uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" -version = "4.0.0+1" - -[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[Requires]] +[[deps.Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[deps.Requires]] deps = ["UUIDs"] -git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" +git-tree-sha1 = 
"838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.2.0" +version = "1.3.0" -[[SHA]] +[[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[deps.Scratch]] +deps = ["Dates"] +git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.2.0" -[[Setfield]] -deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] -git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" -uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" -version = "0.7.1" +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[Sockets]] +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[Statistics]] +[[deps.SpecialFunctions]] +deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] +git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "2.3.1" + + [deps.SpecialFunctions.extensions] + SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" + + [deps.SpecialFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.6.4" +weakdeps = ["Statistics"] + + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" + +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = 
"10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" + +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" -[[TOML]] +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" -[[Tar]] +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" -[[UUIDs]] +[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[XML2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" -uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.9.12+0" - -[[XSLT_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] -git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" -uuid = "aed1982a-8fda-507f-9586-7b0439959a61" -version = "1.1.34+0" - -[[XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" -uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.2.5+2" - -[[Xorg_libX11_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", 
"Xorg_xtrans_jll"] -git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" -uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" -version = "1.6.9+4" - -[[Xorg_libXau_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" -uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" -version = "1.0.9+4" - -[[Xorg_libXdmcp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" -uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" -version = "1.1.3+4" - -[[Xorg_libXext_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] -git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" -uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" -version = "1.3.4+4" - -[[Xorg_libpthread_stubs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" -uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" -version = "0.1.0+3" - -[[Xorg_libxcb_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] -git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" -uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" -version = "1.13.0+3" - -[[Xorg_xorgproto_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" -uuid = "c4d99508-4286-5418-9131-c86396af500b" -version = "2019.2.0+2" - -[[Xorg_xtrans_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" -uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" -version = "1.4.0+3" - -[[Zlib_jll]] +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e" +uuid = 
"d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.3" + +[[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" -[[argp_standalone_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" -uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" -version = "1.3.1+0" - -[[fts_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" -uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" -version = "1.2.7+1" - -[[hsa_rocr_jll]] -deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] -git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" -uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" -version = "4.0.0+0" - -[[hsakmt_roct_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] -git-tree-sha1 = "ea54f6be23c6d25613a0872ec23dc5a0b77b4a00" -uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" -version = "4.2.0+0" - -[[nghttp2_jll]] +[[deps.libLLVM_jll]] deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a" +version = "14.0.6+3" -[[obstack_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" -uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" -version = "1.2.2+0" +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" + +[[deps.nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" -[[p7zip_jll]] +[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/src/julia/JuliaStream.jl/AMDGPU/Project.toml b/src/julia/JuliaStream.jl/AMDGPU/Project.toml index 5ab8447e..66596dfc 100644 --- 
a/src/julia/JuliaStream.jl/AMDGPU/Project.toml +++ b/src/julia/JuliaStream.jl/AMDGPU/Project.toml @@ -4,4 +4,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/CUDA/Manifest.toml b/src/julia/JuliaStream.jl/CUDA/Manifest.toml index 92af4d1a..cf7c0e90 100644 --- a/src/julia/JuliaStream.jl/CUDA/Manifest.toml +++ b/src/julia/JuliaStream.jl/CUDA/Manifest.toml @@ -1,332 +1,555 @@ # This file is machine-generated - editing it directly is not advised -[[AbstractFFTs]] +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "6909ef39c97ad6037791040bed70b7aa111e1f64" + +[[deps.AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" +version = "1.5.0" -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + + [deps.AbstractFFTs.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +version = "3.6.2" +weakdeps = ["StaticArrays"] -[[ArgParse]] + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" + +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[BFloat16s]] 
+[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + +[[deps.BFloat16s]] deps = ["LinearAlgebra", "Printf", "Random", "Test"] -git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072" +git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -version = "0.2.0" +version = "0.4.2" -[[Base64]] +[[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" -[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "1f8ebf85abb7d1eff965730e592794a27c1350d8" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "Statistics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "f062a48c26ae027f70c44f48f244862aec47bf99" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.6.0" - -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.11.2" - -[[ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] 
-git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" -version = "0.1.2" - -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582" +version = "5.0.0" + + [deps.CUDA.extensions] + SpecialFunctionsExt = "SpecialFunctions" + + [deps.CUDA.weakdeps] + SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" + +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "35a37bb72b35964f2895c12c687ae263b4ac170c" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.6.0+3" + +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "bcc4a23cbbd99c8535a5318455dcf0f2546ec536" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" +version = "0.2.2" + +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "bfe5a693a11522d58392f742243f2b50dc27afd6" +uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.9.2+0" + +[[deps.ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.11.4" + +[[deps.Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] +git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.12.10" + +[[deps.Compat]] +deps = ["UUIDs"] +git-tree-sha1 = "e460f044ca8b99be31d35fe54fc33a5c33dd8ed7" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.41.0" +version = "4.9.0" +weakdeps = ["Dates", "LinearAlgebra"] + + [deps.Compat.extensions] + CompatLinearAlgebraExt = "LinearAlgebra" 
-[[CompilerSupportLibraries_jll]] +[[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" - -[[Dates]] +version = "1.0.5+0" + +[[deps.Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + +[[deps.DataAPI]] +git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.15.0" + +[[deps.DataFrames]] +deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.6.1" + +[[deps.DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.15" + +[[deps.DataValueInterfaces]] +git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" +uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" +version = "1.0.0" + +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.10" -[[DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = 
"b19534d1895d702889b219c382a6e18010797f0b" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +[[deps.FixedPointNumbers]] +deps = ["Statistics"] +git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.8.4" -[[ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +[[deps.Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "9.0.0" + +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" -[[GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "2cac236070c2c4b36de54ae9146b55ee2c34ac7a" +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.13.10" +version = "0.24.5" + +[[deps.InlineStrings]] +deps = ["Parsers"] +git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.0" -[[InteractiveUtils]] +[[deps.InteractiveUtils]] deps = ["Markdown"] 
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.2" +[[deps.InvertedIndices]] +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +version = "1.3.0" -[[IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" -uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" +[[deps.IteratorInterfaceExtensions]] +git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" +uuid = "82899510-4779-5014-852e-03e436cf321d" +version = "1.0.0" -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.5.0" + +[[deps.JuliaNVTXCallbacks_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f" +uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e" +version = "0.2.1+0" + +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.8" -[[LLVM]] + [deps.KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" + + [deps.KernelAbstractions.weakdeps] + EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" + +[[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" +git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +version = "6.2.1" 
-[[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +version = "0.0.25+0" + +[[deps.LaTeXStrings]] +git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996" +uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +version = "1.3.0" -[[LazyArtifacts]] +[[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" -[[LibCURL]] +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] +[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1" -uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.6" - -[[Logging]] +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[Markdown]] +[[deps.MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" +uuid = 
"1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.11" + +[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" +[[deps.Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "1.1.0" -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +[[deps.NVTX]] +deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] +git-tree-sha1 = "8bc9ce4233be3c63f8dcd78ccaf1b63a9c0baa34" +uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f" +version = "0.3.3" -[[OpenLibm_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +[[deps.NVTX_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b" +uuid = "e98f9f5b-d649-5603-91fd-7774390e6439" +version = "3.1.0+2" + +[[deps.NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" -[[OpenSpecFun_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.5+0" +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = 
["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Parsers]] +deps = ["Dates", "PrecompileTools", "UUIDs"] +git-tree-sha1 = "716e24b21538abc91f6205fd1d8363f39b442851" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "2.7.2" + +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" + +[[deps.PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "1.4.3" + +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.2.0" -[[Preferences]] +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" + +[[deps.PrettyTables]] +deps = ["Crayons", "LaTeXStrings", "Markdown", "Printf", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "ee094908d720185ddbdc58dbe0c1cbe35453ec7a" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "2.2.7" -[[Printf]] +[[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = 
"9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[Random123]] -deps = ["Libdl", "Random", "RandomNumbers"] -git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.4.2" +version = "1.6.1" -[[RandomNumbers]] +[[deps.RandomNumbers]] deps = ["Random", "Requires"] git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" -[[Reexport]] +[[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" -[[Requires]] +[[deps.Requires]] deps = ["UUIDs"] -git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.2.0" +version = "1.3.0" -[[SHA]] +[[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[deps.Scratch]] +deps = ["Dates"] +git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.2.0" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" +[[deps.SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.4.0" -[[Sockets]] +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SortingAlgorithms]] +deps = ["DataStructures"] +git-tree-sha1 = "c60ec5c62180f27efea3ba2908480f8055e17cee" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "1.1.1" + +[[deps.SparseArrays]] +deps 
= ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "e08890d19787ec25029113e88c34ec20cac1c91e" -uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.0.0" +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.6.4" +weakdeps = ["Statistics"] -[[Statistics]] + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" + +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" + +[[deps.StringManipulation]] +deps = ["PrecompileTools"] +git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.3.4" -[[TOML]] +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" + +[[deps.TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.1" + +[[deps.Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"] +git-tree-sha1 = "a1f34829d5ac0ef499f6d84428bd6b4c71f02ead" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "1.11.0" -[[Tar]] +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" 
+version = "1.10.0" -[[Test]] +[[deps.Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" -[[UUIDs]] +[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[Zlib_jll]] +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.3" + +[[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" -[[nghttp2_jll]] +[[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" -[[p7zip_jll]] +[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/src/julia/JuliaStream.jl/CUDA/Project.toml b/src/julia/JuliaStream.jl/CUDA/Project.toml index e50582e0..22cdf069 100644 --- a/src/julia/JuliaStream.jl/CUDA/Project.toml +++ 
b/src/julia/JuliaStream.jl/CUDA/Project.toml @@ -4,4 +4,4 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml b/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml index 91093a72..a5f50535 100644 --- a/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml +++ b/src/julia/JuliaStream.jl/KernelAbstractions/Manifest.toml @@ -1,557 +1,735 @@ # This file is machine-generated - editing it directly is not advised -[[AMDGPU]] -deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "Statistics", "hsa_rocr_jll"] -git-tree-sha1 = "d59f1cf3f90ae6cf6626e8a21f337850cb3792f7" +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "d273a081dfaa413b3d1144a4c6d874ffbde3e0d7" + +[[deps.AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "HIP_jll", "LLD_jll", "LLVM", "LLVM_jll", "Libdl", "LinearAlgebra", "MacroTools", "MsgPack", "ObjectFile", "Pkg", "Preferences", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "SpecialFunctions", "Statistics", "TimespanLogging", "hsa_rocr_jll", "rocBLAS_jll", "rocRAND_jll", "rocSPARSE_jll"] +git-tree-sha1 = "06f51480c4fbd88edae71c7e60fd9a7362a579f2" uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" -version = "0.2.17" +version = "0.4.8" -[[AbstractFFTs]] +[[deps.AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" +version = "1.5.0" -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" + [deps.AbstractFFTs.extensions] + 
AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + + [deps.AbstractFFTs.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +version = "3.6.2" +weakdeps = ["StaticArrays"] + + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" -[[ArgParse]] +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[BFloat16s]] -deps = ["LinearAlgebra", "Test"] -git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" version = "0.1.0" -[[Base64]] +[[deps.BFloat16s]] +deps = ["LinearAlgebra", "Printf", "Random", "Test"] +git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.4.2" + +[[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinaryProvider]] +[[deps.BinaryProvider]] deps = ["Libdl", "Logging", "SHA"] git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" version = "0.5.10" -[[Bzip2_jll]] +[[deps.Bzip2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" version = "1.0.8+0" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] 
+git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" -[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "335b3d2373733919b4972a51215a6840c7a33828" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions"] +git-tree-sha1 = "edff14c60784c8f7191a62a23b15a421185bc8a8" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.4.2" +version = "4.0.1" -[[CUDAKernels]] -deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +[[deps.CUDAKernels]] +deps = ["Adapt", "CUDA", "KernelAbstractions", "StaticArrays", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "1680366a69e9c95744ef23a239e6cfe61cf2e1ca" uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" -version = "0.3.0" +version = "0.4.7" + +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "75d7896d1ec079ef10d3aee8f3668c11354c03a1" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.2.0+0" -[[Cassette]] -git-tree-sha1 = "6ce3cd755d4130d43bab24ea5181e77b89b51839" -uuid = "7057c7e9-c182-5462-911a-8362d720325c" -version = "0.3.9" - -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47" 
-uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.11.2" - -[[ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] -git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "d6b227a1cfa63ae89cb969157c6789e36b7c9624" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" version = "0.1.2" -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.41.0" +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "ed00f777d2454c45f5f49634ed0a589da07ee0b0" +uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.2.4+1" -[[CompilerSupportLibraries_jll]] +[[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" -[[ConstructionBase]] +[[deps.ConstructionBase]] deps = ["LinearAlgebra"] -git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +git-tree-sha1 = "c53fc348ca4d40d7b371e71fd52251839080cbc9" uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" -version = "1.3.0" +version = "1.5.4" + + [deps.ConstructionBase.extensions] + ConstructionBaseIntervalSetsExt = "IntervalSets" + ConstructionBaseStaticArraysExt = "StaticArrays" -[[Dates]] + [deps.ConstructionBase.weakdeps] + IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" + +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = 
"8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[Distributed]] +[[deps.Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" -[[DocStringExtensions]] +[[deps.DocStringExtensions]] deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" +version = "0.9.3" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[Elfutils_jll]] +[[deps.Elfutils_jll]] deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] -git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +git-tree-sha1 = "6880e234507b4b4eaabccb80c2316458d608f1c7" uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" -version = "0.182.0+0" +version = "0.182.0+1" -[[ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +version = "0.1.10" + +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" -[[Future]] +[[deps.Future]] deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "8.8.1" + +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = 
"2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" -[[GPUCompiler]] +[[deps.GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.12.9" +version = "0.17.3" -[[HIP_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] -git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +[[deps.HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "hsakmt_roct_jll", "rocminfo_jll"] +git-tree-sha1 = "6b91ab9bea10197163cb19ee57e52a1ebe0b28dc" uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" -version = "4.0.0+1" +version = "5.4.4+0" -[[InteractiveUtils]] +[[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.2" - -[[IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" +version = "0.2.2" -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.5.0" -[[KernelAbstractions]] -deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] -git-tree-sha1 = 
"cb7d8b805413025a5bc866fc036b426223ffc059" +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "cf9cae1c4c1ff83f6c02cfaf01698f05448e8325" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -version = "0.7.2" +version = "0.8.6" -[[LLVM]] +[[deps.LLD_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109" +version = "14.0.6+3" + +[[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" +git-tree-sha1 = "f044a2796a9e18e0531b9b3072b0019a61f264bc" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +version = "4.17.1" -[[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "070e4b5b65827f82c16ae0916376cb47377aa1b5" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +version = "0.0.18+0" + +[[deps.LLVM_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +git-tree-sha1 = "c5131b433876973cf29a2d9ec426cc099567e68c" +uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c" +version = "14.0.6+4" -[[LazyArtifacts]] +[[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" -[[LibCURL]] +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = 
"76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] +[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[Libgcrypt_jll]] +[[deps.Libgcrypt_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" version = "1.8.7+0" -[[Libglvnd_jll]] +[[deps.Libglvnd_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] -git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" +git-tree-sha1 = "6f73d1dd803986947b2c750138528a999a6c7733" uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" -version = "1.3.0+3" +version = "1.6.0+0" -[[Libgpg_error_jll]] +[[deps.Libgpg_error_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" version = "1.42.0+0" -[[Libiconv_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +[[deps.Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175" uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" -version = "1.16.1+1" +version = "1.17.0+0" -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1" +[[deps.LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa" uuid = 
"2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.6" +version = "0.3.26" -[[Logging]] + [deps.LogExpFunctions.extensions] + LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" + + [deps.LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[MacroTools]] +[[deps.MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf" +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.9" +version = "0.5.11" -[[Markdown]] +[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NUMA_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +[[deps.MsgPack]] +deps = ["Serialization"] +git-tree-sha1 = "fc8c15ca848b902015bd4a745d350f02cf791c2a" +uuid = "99f44e22-a591-53d1-9472-aa23ef4bd671" +version = "1.2.0" + +[[deps.NUMA_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "3da12251003f08e819c907c645879c362206f5b4" uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" -version = "2.0.13+1" +version = "2.0.14+0" -[[NetworkOptions]] +[[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[deps.ObjectFile]] +deps = ["Reexport", "StructIO"] +git-tree-sha1 = "55ce61d43409b1fb0279d1781bf3b0f22c83ab3b" +uuid = 
"d8793406-e978-5875-9003-1fc021f44a92" +version = "0.3.7" -[[OpenLibm_jll]] +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" + +[[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +version = "0.8.1+0" -[[OpenSpecFun_jll]] +[[deps.OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.5+0" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" -[[Preferences]] +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" -[[Printf]] +[[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.Profile]] +deps = ["Printf"] +uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" + +[[deps.REPL]] deps = 
["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[ROCKernels]] -deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "5e13faac6e566cb30c6620ad0be967a747121aeb" +[[deps.ROCKernels]] +deps = ["AMDGPU", "Adapt", "KernelAbstractions", "LLVM", "StaticArrays", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4d4973642639c249ccf8f50392f7f04ee3fcca22" uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" -version = "0.2.2" +version = "0.3.5" -[[ROCmCompilerSupport_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] -git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +[[deps.ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "7a3f25087b24d33b89f2e32cccd26af39275d14d" uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" -version = "4.0.0+1" +version = "5.4.4+0" -[[ROCmDeviceLibs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +[[deps.ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"] +git-tree-sha1 = "45d5a53be418b740fe740714c8100650aebba041" uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" -version = "4.0.0+0" +version = "5.4.4+0" -[[ROCmOpenCLRuntime_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] -git-tree-sha1 = "f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +[[deps.ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "f7cbafcda3eec208831f22ae7816f34a90ce8e0f" uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" -version = "4.0.0+1" +version = "5.4.4+0" 
-[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[Random123]] -deps = ["Libdl", "Random", "RandomNumbers"] -git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.4.2" +version = "1.6.1" -[[RandomNumbers]] +[[deps.RandomNumbers]] deps = ["Random", "Requires"] git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" -[[Reexport]] +[[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" -[[Requires]] +[[deps.Requires]] deps = ["UUIDs"] -git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.2.0" +version = "1.3.0" -[[SHA]] +[[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" -[[Serialization]] +[[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[Setfield]] -deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] -git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +[[deps.Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "StaticArraysCore"] +git-tree-sha1 = "e2cc6d8c88613c05e1defb55170bf5ff211fbeac" uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" -version = "0.7.1" - -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" +version = "1.1.1" -[[Sockets]] +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = 
"2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "f0bccf98e16759818ffc5d97ac3ebf87eb950150" +[[deps.SpecialFunctions]] +deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] +git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.8.1" +version = "2.3.1" + + [deps.SpecialFunctions.extensions] + SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" -[[StaticArrays]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "3c76dde64d03699e074ac02eb2e8ba8254d428da" + [deps.SpecialFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.2.13" +version = "1.6.4" +weakdeps = ["Statistics"] + + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" -[[Statistics]] +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" + +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" + +[[deps.StructIO]] +deps = ["Test"] +git-tree-sha1 = "010dc73c7146869c042b49adcdb6bf528c12e859" +uuid = "53d494c1-5632-5724-8f4c-31dff12d585f" +version = "0.3.0" -[[TOML]] +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" -[[Tar]] +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" -[[Test]] +[[deps.Test]] 
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" + +[[deps.TimespanLogging]] +deps = ["Distributed", "Profile"] +git-tree-sha1 = "51be7dd35b0c8a5a613dc7af272d587ea6943d24" +uuid = "a526e669-04d3-4846-9525-c66122c55f63" +version = "0.1.0" -[[UUIDs]] +[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[XML2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "ead6292c02aab389cb29fe64cc9375765ab1e219" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.1" + +[[deps.XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"] +git-tree-sha1 = "04a51d15436a572301b5abbb9d099713327e9fc4" uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.9.12+0" +version = "2.10.4+0" -[[XSLT_jll]] +[[deps.XSLT_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] git-tree-sha1 = 
"91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +[[deps.XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "cf2c7de82431ca6f39250d2fc4aacd0daa1675c0" uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.2.5+2" +version = "5.4.4+0" -[[Xorg_libX11_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] -git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +[[deps.Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "afead5aba5aa507ad5a3bf01f58f82c8d1403495" uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" -version = "1.6.9+4" +version = "1.8.6+0" -[[Xorg_libXau_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +[[deps.Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "6035850dcc70518ca32f012e46015b9beeda49d8" uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" -version = "1.0.9+4" +version = "1.0.11+0" -[[Xorg_libXdmcp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +[[deps.Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "34d526d318358a859d7de23da945578e8e8727b7" uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" -version = "1.1.3+4" +version = "1.1.4+0" -[[Xorg_libXext_jll]] +[[deps.Xorg_libXext_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" version = "1.3.4+4" -[[Xorg_libpthread_stubs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" 
+[[deps.Xorg_libpciaccess_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "79a09b8c1d3a2659937503788ce11173ba29681b" +uuid = "a65dc6b1-eb27-53a1-bb3e-dea574b5389e" +version = "0.16.0+1" + +[[deps.Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "8fdda4c692503d44d04a0603d9ac0982054635f9" uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" -version = "0.1.0+3" +version = "0.1.1+0" -[[Xorg_libxcb_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] -git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +[[deps.Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "b4bfde5d5b652e22b9c790ad00af08b6d042b97d" uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" -version = "1.13.0+3" +version = "1.15.0+0" -[[Xorg_xorgproto_jll]] +[[deps.Xorg_xorgproto_jll]] deps = ["Libdl", "Pkg"] git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" uuid = "c4d99508-4286-5418-9131-c86396af500b" version = "2019.2.0+2" -[[Xorg_xtrans_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +[[deps.Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "e92a1a012a10506618f10b7047e478403a046c77" uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" -version = "1.4.0+3" +version = "1.5.0+0" -[[Zlib_jll]] +[[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" -[[argp_standalone_jll]] +[[deps.argp_standalone_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" version = "1.3.1+0" -[[fts_jll]] +[[deps.fts_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" 
+git-tree-sha1 = "aa21810b841ae26d2fc7f780cb1596b4170a4c49" uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" -version = "1.2.7+1" +version = "1.2.8+0" -[[hsa_rocr_jll]] -deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] -git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +[[deps.hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "ROCmDeviceLibs_jll", "XML2_jll", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "0458f0ff5d72a270fbab764d354dc35d90b28ba9" uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" -version = "4.0.0+0" +version = "5.4.4+0" -[[hsakmt_roct_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] -git-tree-sha1 = "ea54f6be23c6d25613a0872ec23dc5a0b77b4a00" +[[deps.hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "libdrm_jll"] +git-tree-sha1 = "49db943b2bf868b1fa2866b93faf4d2222fa28ae" uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" -version = "4.2.0+0" +version = "5.4.4+0" + +[[deps.libLLVM_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a" +version = "14.0.6+3" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" + +[[deps.libdrm_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libpciaccess_jll"] +git-tree-sha1 = "89b30a68162c12118311b77e57b20c8fa2685496" +uuid = "8e53e030-5e6c-5a89-a30b-be5b7263a166" +version = "2.4.110+0" + +[[deps.msgpack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "dcbef55311e8e3d0a15dbe7dd86900c501ca2359" +uuid = "43dd8cde-e9ee-5d59-924a-18d3f2773c4d" +version = "3.0.1+0" -[[nghttp2_jll]] +[[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" -[[obstack_jll]] +[[deps.obstack_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = 
"1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" version = "1.2.2+0" -[[p7zip_jll]] +[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" + +[[deps.rocBLAS_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "msgpack_jll", "rocminfo_jll"] +git-tree-sha1 = "92d224a9e10a9ad04195d943a2b1bcbdafcaf06a" +uuid = "1ef8cab2-a151-54b4-a57f-5fbb4046a4ab" +version = "5.2.3+2" + +[[deps.rocPRIM_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocminfo_jll"] +git-tree-sha1 = "7a100de0bae8363cbd33fa429d37be45a0247d2c" +uuid = "52935e6f-76c5-5ebb-b227-36676f75be9c" +version = "5.2.3+1" + +[[deps.rocRAND_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocminfo_jll"] +git-tree-sha1 = "58a35917ddb4d79f7a0c2f6d438a210d2f398e85" +uuid = "a6151927-a32b-54c0-bc8c-bbd7b3f1a996" +version = "5.2.3+1" + +[[deps.rocSPARSE_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocPRIM_jll", "rocminfo_jll"] +git-tree-sha1 = "67bc29d47ab636ef1471e48d7f730c03a0edfcf8" +uuid = "8c6ce2ba-659c-5ec7-ba4c-37596cf1f22a" +version = "5.2.3+1" + +[[deps.rocminfo_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "840acd2135e7bd025870d063e99ff70d05c0de46" +uuid = "5a766526-3cf8-5128-8c31-4f7b7ad60f0e" +version = "5.4.4+0" diff --git a/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml b/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml index 71715ff8..a328acd7 100644 --- 
a/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml +++ b/src/julia/JuliaStream.jl/KernelAbstractions/Project.toml @@ -8,4 +8,4 @@ Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/Manifest.toml b/src/julia/JuliaStream.jl/Manifest.toml index 927a3998..cf65e8d0 100644 --- a/src/julia/JuliaStream.jl/Manifest.toml +++ b/src/julia/JuliaStream.jl/Manifest.toml @@ -1,605 +1,789 @@ # This file is machine-generated - editing it directly is not advised -[[AMDGPU]] -deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "HIP_jll", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "Statistics", "hsa_rocr_jll"] -git-tree-sha1 = "d59f1cf3f90ae6cf6626e8a21f337850cb3792f7" +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "d5cae1000e576b2ee3d194306272f6931085d077" + +[[deps.AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "HIP_jll", "LLD_jll", "LLVM", "LLVM_jll", "Libdl", "LinearAlgebra", "MacroTools", "MsgPack", "ObjectFile", "Pkg", "Preferences", "Printf", "ROCmDeviceLibs_jll", "Random", "Requires", "Setfield", "SpecialFunctions", "Statistics", "TimespanLogging", "hsa_rocr_jll", "rocBLAS_jll", "rocRAND_jll", "rocSPARSE_jll"] +git-tree-sha1 = "06f51480c4fbd88edae71c7e60fd9a7362a579f2" uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" -version = "0.2.17" +version = "0.4.8" -[[AbstractFFTs]] +[[deps.AbstractFFTs]] deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" +version = "1.5.0" -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" + [deps.AbstractFFTs.extensions] + 
AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + + [deps.AbstractFFTs.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +version = "3.6.2" +weakdeps = ["StaticArrays"] + + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" -[[ArgParse]] +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[BFloat16s]] -deps = ["LinearAlgebra", "Test"] -git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" version = "0.1.0" -[[Base64]] +[[deps.BFloat16s]] +deps = ["LinearAlgebra", "Printf", "Random", "Test"] +git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.4.2" + +[[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinaryProvider]] +[[deps.BinaryProvider]] deps = ["Libdl", "Logging", "SHA"] git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" version = "0.5.10" -[[Bzip2_jll]] +[[deps.Bzip2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" version = "1.0.8+0" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] 
+git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" -[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "335b3d2373733919b4972a51215a6840c7a33828" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions"] +git-tree-sha1 = "edff14c60784c8f7191a62a23b15a421185bc8a8" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.4.2" +version = "4.0.1" -[[CUDAKernels]] -deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +[[deps.CUDAKernels]] +deps = ["Adapt", "CUDA", "KernelAbstractions", "StaticArrays", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "1680366a69e9c95744ef23a239e6cfe61cf2e1ca" uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" -version = "0.3.0" +version = "0.4.7" + +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "75d7896d1ec079ef10d3aee8f3668c11354c03a1" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.2.0+0" -[[Cassette]] -git-tree-sha1 = "6ce3cd755d4130d43bab24ea5181e77b89b51839" -uuid = "7057c7e9-c182-5462-911a-8362d720325c" -version = "0.3.9" - -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47" 
-uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.11.2" - -[[ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] -git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "d6b227a1cfa63ae89cb969157c6789e36b7c9624" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" version = "0.1.2" -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.41.0" +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "ed00f777d2454c45f5f49634ed0a589da07ee0b0" +uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.2.4+1" -[[CompilerSupportLibraries_jll]] +[[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" -[[ConstructionBase]] +[[deps.ConstructionBase]] deps = ["LinearAlgebra"] -git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +git-tree-sha1 = "c53fc348ca4d40d7b371e71fd52251839080cbc9" uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" -version = "1.3.0" +version = "1.5.4" + + [deps.ConstructionBase.extensions] + ConstructionBaseIntervalSetsExt = "IntervalSets" + ConstructionBaseStaticArraysExt = "StaticArrays" + + [deps.ConstructionBase.weakdeps] + IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -[[Dates]] +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = 
"8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[Distributed]] +[[deps.Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" -[[DocStringExtensions]] +[[deps.DocStringExtensions]] deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" +version = "0.9.3" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[Elfutils_jll]] +[[deps.Elfutils_jll]] deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] -git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +git-tree-sha1 = "6880e234507b4b4eaabccb80c2316458d608f1c7" uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" -version = "0.182.0+0" +version = "0.182.0+1" -[[ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +version = "0.1.10" + +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" -[[Future]] +[[deps.Future]] deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "8.8.1" -[[GPUCompiler]] +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = 
"2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" + +[[deps.GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5" +git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.12.9" +version = "0.17.3" -[[HIP_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] -git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" +[[deps.HIP_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "hsakmt_roct_jll", "rocminfo_jll"] +git-tree-sha1 = "6b91ab9bea10197163cb19ee57e52a1ebe0b28dc" uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" -version = "4.0.0+1" +version = "5.4.4+0" -[[InteractiveUtils]] +[[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.2" - -[[IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" +version = "0.2.2" -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.5.0" -[[KernelAbstractions]] -deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] -git-tree-sha1 = 
"cb7d8b805413025a5bc866fc036b426223ffc059" +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "cf9cae1c4c1ff83f6c02cfaf01698f05448e8325" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -version = "0.7.2" +version = "0.8.6" -[[LLVM]] +[[deps.LLD_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +uuid = "d55e3150-da41-5e91-b323-ecfd1eec6109" +version = "14.0.6+3" + +[[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" +git-tree-sha1 = "f044a2796a9e18e0531b9b3072b0019a61f264bc" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +version = "4.17.1" -[[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "070e4b5b65827f82c16ae0916376cb47377aa1b5" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +version = "0.0.18+0" + +[[deps.LLVM_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "TOML", "Zlib_jll", "libLLVM_jll"] +git-tree-sha1 = "c5131b433876973cf29a2d9ec426cc099567e68c" +uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c" +version = "14.0.6+4" -[[LazyArtifacts]] +[[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" -[[LibCURL]] +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = 
"76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] +[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[Libgcrypt_jll]] +[[deps.Libgcrypt_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" version = "1.8.7+0" -[[Libglvnd_jll]] +[[deps.Libglvnd_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] -git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf" +git-tree-sha1 = "6f73d1dd803986947b2c750138528a999a6c7733" uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" -version = "1.3.0+3" +version = "1.6.0+0" -[[Libgpg_error_jll]] +[[deps.Libgpg_error_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" version = "1.42.0+0" -[[Libiconv_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778" +[[deps.Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175" uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" -version = "1.16.1+1" +version = "1.17.0+0" -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1" +[[deps.LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa" uuid = 
"2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.6" +version = "0.3.26" + + [deps.LogExpFunctions.extensions] + LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" -[[Logging]] + [deps.LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[MacroTools]] +[[deps.MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf" +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.9" +version = "0.5.11" -[[Markdown]] +[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NEO_jll]] +[[deps.MsgPack]] +deps = ["Serialization"] +git-tree-sha1 = "fc8c15ca848b902015bd4a745d350f02cf791c2a" +uuid = "99f44e22-a591-53d1-9472-aa23ef4bd671" +version = "1.2.0" + +[[deps.NEO_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] -git-tree-sha1 = "15deea2649d70f1bbaedf0aa87c9fa20fb21f22c" +git-tree-sha1 = "48337227b88be34125e1b4f5402789694a184f5a" uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" -version = "21.44.21506+0" +version = "22.53.25242+0" -[[NUMA_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +[[deps.NUMA_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = 
"3da12251003f08e819c907c645879c362206f5b4" uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" -version = "2.0.13+1" +version = "2.0.14+0" -[[NetworkOptions]] +[[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" -[[OpenLibm_jll]] +[[deps.ObjectFile]] +deps = ["Reexport", "StructIO"] +git-tree-sha1 = "55ce61d43409b1fb0279d1781bf3b0f22c83ab3b" +uuid = "d8793406-e978-5875-9003-1fc021f44a92" +version = "0.3.7" + +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" + +[[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +version = "0.8.1+0" -[[OpenSpecFun_jll]] +[[deps.OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.5+0" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" -[[Preferences]] +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = 
"00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" -[[Printf]] +[[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.Profile]] +deps = ["Printf"] +uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" + +[[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[ROCKernels]] -deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "5e13faac6e566cb30c6620ad0be967a747121aeb" +[[deps.ROCKernels]] +deps = ["AMDGPU", "Adapt", "KernelAbstractions", "LLVM", "StaticArrays", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4d4973642639c249ccf8f50392f7f04ee3fcca22" uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" -version = "0.2.2" +version = "0.3.5" -[[ROCmCompilerSupport_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] -git-tree-sha1 = "56ddcfb5d8b60c9f8c1bc619886f8d363fd1926d" +[[deps.ROCmCompilerSupport_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "ROCmDeviceLibs_jll", "hsa_rocr_jll"] +git-tree-sha1 = "7a3f25087b24d33b89f2e32cccd26af39275d14d" uuid = "8fbdd1d2-db62-5cd0-981e-905da1486e17" -version = "4.0.0+1" +version = "5.4.4+0" -[[ROCmDeviceLibs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "d764f0f28b5af89aa004871a6a38e5d061f77257" +[[deps.ROCmDeviceLibs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"] +git-tree-sha1 = "45d5a53be418b740fe740714c8100650aebba041" uuid = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" -version = "4.0.0+0" +version = "5.4.4+0" -[[ROCmOpenCLRuntime_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll"] -git-tree-sha1 = 
"f9e3e2cb40a7990535efa7da9b9dd0e0b458a973" +[[deps.ROCmOpenCLRuntime_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "Xorg_libX11_jll", "Xorg_xorgproto_jll", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "f7cbafcda3eec208831f22ae7816f34a90ce8e0f" uuid = "10ae2a08-2eea-53f8-8c20-eec175020e9f" -version = "4.0.0+1" +version = "5.4.4+0" -[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[Random123]] -deps = ["Libdl", "Random", "RandomNumbers"] -git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.4.2" +version = "1.6.1" -[[RandomNumbers]] +[[deps.RandomNumbers]] deps = ["Random", "Requires"] git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" -[[Reexport]] +[[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" -[[Requires]] +[[deps.Requires]] deps = ["UUIDs"] -git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.2.0" +version = "1.3.0" -[[SHA]] +[[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" -[[SPIRV_LLVM_Translator_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" -uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" -version = "11.0.0+2" +[[deps.SPIRV_LLVM_Translator_unified_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"] +git-tree-sha1 = "2f9c006df258116f90874e47207229c83d06c845" +uuid = "85f0d8ed-5b39-5caa-b1ae-7472de402361" 
+version = "0.2.0+0" -[[SPIRV_Tools_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +[[deps.SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "c5ab754aa7d71ea015783a9884a25e196860707c" uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" -version = "2021.2.0+0" +version = "2023.2.0+0" -[[Serialization]] +[[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[Setfield]] -deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] -git-tree-sha1 = "fca29e68c5062722b5b4435594c3d1ba557072a3" +[[deps.Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "StaticArraysCore"] +git-tree-sha1 = "e2cc6d8c88613c05e1defb55170bf5ff211fbeac" uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" -version = "0.7.1" +version = "1.1.1" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[Sockets]] +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "f0bccf98e16759818ffc5d97ac3ebf87eb950150" +[[deps.SpecialFunctions]] +deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] +git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.8.1" +version = "2.3.1" + + [deps.SpecialFunctions.extensions] + SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" -[[StaticArrays]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "3c76dde64d03699e074ac02eb2e8ba8254d428da" + [deps.SpecialFunctions.weakdeps] + ChainRulesCore = 
"d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.2.13" +version = "1.6.4" +weakdeps = ["Statistics"] + + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" -[[Statistics]] +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" -[[TOML]] +[[deps.StructIO]] +deps = ["Test"] +git-tree-sha1 = "010dc73c7146869c042b49adcdb6bf528c12e859" +uuid = "53d494c1-5632-5724-8f4c-31dff12d585f" +version = "0.3.0" + +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" -[[Tar]] +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" -[[Test]] +[[deps.Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" -[[UUIDs]] +[[deps.TimespanLogging]] +deps = ["Distributed", "Profile"] +git-tree-sha1 = "51be7dd35b0c8a5a613dc7af272d587ea6943d24" +uuid = "a526e669-04d3-4846-9525-c66122c55f63" +version = "0.1.0" + 
+[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[XML2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "ead6292c02aab389cb29fe64cc9375765ab1e219" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.1" + +[[deps.XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"] +git-tree-sha1 = "04a51d15436a572301b5abbb9d099713327e9fc4" uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.9.12+0" +version = "2.10.4+0" -[[XSLT_jll]] +[[deps.XSLT_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" version = "1.1.34+0" -[[XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "a921669cd9a45c23031fd4eb904f5cc3d20de415" +[[deps.XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "cf2c7de82431ca6f39250d2fc4aacd0daa1675c0" uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.2.5+2" +version = "5.4.4+0" -[[Xorg_libX11_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] -git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +[[deps.Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = 
"afead5aba5aa507ad5a3bf01f58f82c8d1403495" uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" -version = "1.6.9+4" +version = "1.8.6+0" -[[Xorg_libXau_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +[[deps.Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "6035850dcc70518ca32f012e46015b9beeda49d8" uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" -version = "1.0.9+4" +version = "1.0.11+0" -[[Xorg_libXdmcp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +[[deps.Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "34d526d318358a859d7de23da945578e8e8727b7" uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" -version = "1.1.3+4" +version = "1.1.4+0" -[[Xorg_libXext_jll]] +[[deps.Xorg_libXext_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" version = "1.3.4+4" -[[Xorg_libpthread_stubs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +[[deps.Xorg_libpciaccess_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "79a09b8c1d3a2659937503788ce11173ba29681b" +uuid = "a65dc6b1-eb27-53a1-bb3e-dea574b5389e" +version = "0.16.0+1" + +[[deps.Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "8fdda4c692503d44d04a0603d9ac0982054635f9" uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" -version = "0.1.0+3" +version = "0.1.1+0" -[[Xorg_libxcb_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] -git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +[[deps.Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", 
"Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "b4bfde5d5b652e22b9c790ad00af08b6d042b97d" uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" -version = "1.13.0+3" +version = "1.15.0+0" -[[Xorg_xorgproto_jll]] +[[deps.Xorg_xorgproto_jll]] deps = ["Libdl", "Pkg"] git-tree-sha1 = "9a9eb8ce756fe0bca01b4be16da770e18d264972" uuid = "c4d99508-4286-5418-9131-c86396af500b" version = "2019.2.0+2" -[[Xorg_xtrans_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +[[deps.Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "e92a1a012a10506618f10b7047e478403a046c77" uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" -version = "1.4.0+3" +version = "1.5.0+0" -[[Zlib_jll]] +[[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" -[[argp_standalone_jll]] +[[deps.argp_standalone_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" version = "1.3.1+0" -[[fts_jll]] +[[deps.fts_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +git-tree-sha1 = "aa21810b841ae26d2fc7f780cb1596b4170a4c49" uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" -version = "1.2.7+1" +version = "1.2.8+0" -[[gmmlib_jll]] +[[deps.gmmlib_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd" +git-tree-sha1 = "228b09be83d88cc5d2236ef7b516d988d2639dfc" uuid = "09858cae-167c-5acb-9302-fddc6874d481" -version = "21.2.1+0" +version = "22.3.0+0" -[[hsa_rocr_jll]] -deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] -git-tree-sha1 = "df8d73efec8b1e53ad527d208f5343c0368f0fcd" +[[deps.hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "ROCmDeviceLibs_jll", "XML2_jll", 
"Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "0458f0ff5d72a270fbab764d354dc35d90b28ba9" uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" -version = "4.0.0+0" +version = "5.4.4+0" -[[hsakmt_roct_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] -git-tree-sha1 = "ea54f6be23c6d25613a0872ec23dc5a0b77b4a00" +[[deps.hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "libdrm_jll"] +git-tree-sha1 = "49db943b2bf868b1fa2866b93faf4d2222fa28ae" uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" -version = "4.2.0+0" +version = "5.4.4+0" -[[libigc_jll]] +[[deps.libLLVM_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8f36deef-c2a5-5394-99ed-8e07531fb29a" +version = "14.0.6+3" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" + +[[deps.libdrm_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libpciaccess_jll"] +git-tree-sha1 = "89b30a68162c12118311b77e57b20c8fa2685496" +uuid = "8e53e030-5e6c-5a89-a30b-be5b7263a166" +version = "2.4.110+0" + +[[deps.libigc_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4f7a6c63ee113ee6da9a6afd06c77eb44998b1f3" +git-tree-sha1 = "d577d44c9e92244cf60fbc183cb5506860916647" uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" -version = "1.0.8744+0" +version = "1.0.12812+0" -[[nghttp2_jll]] +[[deps.msgpack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "dcbef55311e8e3d0a15dbe7dd86900c501ca2359" +uuid = "43dd8cde-e9ee-5d59-924a-18d3f2773c4d" +version = "3.0.1+0" + +[[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" -[[obstack_jll]] +[[deps.obstack_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" version = "1.2.2+0" -[[oneAPI]] -deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", 
"GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"] -git-tree-sha1 = "efabcff2a259b0f1b10505db99aa18fc2de181ce" +[[deps.oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_unified_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll", "oneAPI_Support_jll"] +git-tree-sha1 = "1e562c5fc737870053e62c6001d742545000ee24" uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" -version = "0.2.1" +version = "1.0.2" -[[oneAPI_Level_Zero_Headers_jll]] +[[deps.oneAPI_Level_Zero_Headers_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "e106a6eed53928cd1864f544562ea991b5f11464" +git-tree-sha1 = "412efcf5d55c65d3352c3915cffec1e53955570f" uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" -version = "1.2.43+0" +version = "1.6.3+0" -[[oneAPI_Level_Zero_Loader_jll]] +[[deps.oneAPI_Level_Zero_Loader_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] -git-tree-sha1 = "0f0fd4a92c4785454e4929c2e4db22c3d03d6889" +git-tree-sha1 = "87980483b19f0a00c8d62e8b6682acac1894c638" uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" -version = "1.5.0+0" +version = "1.11.0+0" + +[[deps.oneAPI_Support_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "39a73e1fcd9a33eeadfd69f9027e9c62d3c58219" +uuid = "b049733a-a71d-5ed3-8eba-7d323ac00b36" +version = "0.2.2+0" -[[p7zip_jll]] +[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" + +[[deps.rocBLAS_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "msgpack_jll", 
"rocminfo_jll"] +git-tree-sha1 = "92d224a9e10a9ad04195d943a2b1bcbdafcaf06a" +uuid = "1ef8cab2-a151-54b4-a57f-5fbb4046a4ab" +version = "5.2.3+2" + +[[deps.rocPRIM_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocminfo_jll"] +git-tree-sha1 = "7a100de0bae8363cbd33fa429d37be45a0247d2c" +uuid = "52935e6f-76c5-5ebb-b227-36676f75be9c" +version = "5.2.3+1" + +[[deps.rocRAND_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocminfo_jll"] +git-tree-sha1 = "58a35917ddb4d79f7a0c2f6d438a210d2f398e85" +uuid = "a6151927-a32b-54c0-bc8c-bbd7b3f1a996" +version = "5.2.3+1" + +[[deps.rocSPARSE_jll]] +deps = ["Artifacts", "HIP_jll", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll", "rocPRIM_jll", "rocminfo_jll"] +git-tree-sha1 = "67bc29d47ab636ef1471e48d7f730c03a0edfcf8" +uuid = "8c6ce2ba-659c-5ec7-ba4c-37596cf1f22a" +version = "5.2.3+1" + +[[deps.rocminfo_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "840acd2135e7bd025870d063e99ff70d05c0de46" +uuid = "5a766526-3cf8-5128-8c31-4f7b7ad60f0e" +version = "5.4.4+0" diff --git a/src/julia/JuliaStream.jl/Project.toml b/src/julia/JuliaStream.jl/Project.toml index f8095e0d..76c92024 100644 --- a/src/julia/JuliaStream.jl/Project.toml +++ b/src/julia/JuliaStream.jl/Project.toml @@ -16,4 +16,4 @@ ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/Threaded/Manifest.toml b/src/julia/JuliaStream.jl/Threaded/Manifest.toml index dc0737e0..5445f326 100644 --- a/src/julia/JuliaStream.jl/Threaded/Manifest.toml +++ b/src/julia/JuliaStream.jl/Threaded/Manifest.toml @@ -1,31 
+1,35 @@ # This file is machine-generated - editing it directly is not advised -[[ArgParse]] +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "fbff310f722a52622a273a48a8a6b3b64f06b029" + +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[Logging]] +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" diff --git a/src/julia/JuliaStream.jl/Threaded/Project.toml b/src/julia/JuliaStream.jl/Threaded/Project.toml index b65bdf57..367e0ef3 100644 --- a/src/julia/JuliaStream.jl/Threaded/Project.toml +++ b/src/julia/JuliaStream.jl/Threaded/Project.toml @@ -3,4 +3,4 @@ ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/oneAPI/Manifest.toml b/src/julia/JuliaStream.jl/oneAPI/Manifest.toml index 649ea533..ed47c3aa 100644 --- a/src/julia/JuliaStream.jl/oneAPI/Manifest.toml +++ b/src/julia/JuliaStream.jl/oneAPI/Manifest.toml @@ -1,335 +1,441 @@ # This file is machine-generated - editing it directly is not advised -[[Adapt]] -deps = ["LinearAlgebra"] 
-git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "01f328e925b86927b3f24c30aee6ecdce5bd28cc" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.1" +version = "3.6.2" +weakdeps = ["StaticArrays"] + + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" -[[ArgParse]] +[[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" -[[ArgTools]] +[[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" -[[Artifacts]] +[[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[Base64]] +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + +[[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" - -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.11.2" - -[[ChangesOfVariables]] -deps = ["ChainRulesCore", "LinearAlgebra", "Test"] -git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1" -uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" -version = "0.1.2" - -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] 
-git-tree-sha1 = "44c37b4636bc54afac5c574d2d02b625349d6582" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.41.0" - -[[CompilerSupportLibraries_jll]] +version = "0.4.2" + +[[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" -[[Dates]] +[[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[DocStringExtensions]] +[[deps.DocStringExtensions]] deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" +version = "0.9.3" -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" -[[ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" +[[deps.ExprTools]] +git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" +version = "0.1.10" + +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" -[[GPUArrays]] -deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "7772508f17f1d482fe0df72cabc5b55bec06bbe0" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.1.2" +version = "8.8.1" -[[GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = 
"2cac236070c2c4b36de54ae9146b55ee2c34ac7a" +[[deps.GPUArraysCore]] +deps = ["Adapt"] +git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" +uuid = "46192b85-c4d5-4398-a991-12ede77f4527" +version = "0.1.5" + +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.13.10" +version = "0.21.4" -[[InteractiveUtils]] +[[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "a7254c0acd8e62f1ac75ad24d5db43f5f19f3c65" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.2" - -[[IrrationalConstants]] -git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.1.1" +version = "0.2.2" -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.5.0" + +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.8" + + [deps.KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" -[[LLVM]] + [deps.KernelAbstractions.weakdeps] + EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" + +[[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "7cc22e69995e2329cc047a879395b2b74647ab5f" 
+git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.0" +version = "6.2.1" -[[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c5fc4bef251ecd37685bea1c4068a9cfa41e8b9a" +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+0" +version = "0.0.25+0" -[[LibCURL]] +[[deps.LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" -[[LibCURL_jll]] +[[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" -[[LibGit2]] +[[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" -[[LibSSH2_jll]] +[[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" -[[Libdl]] +[[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" -[[LinearAlgebra]] -deps = ["Libdl"] +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -[[LogExpFunctions]] -deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1" +[[deps.LogExpFunctions]] +deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.6" +version = "0.3.26" -[[Logging]] + [deps.LogExpFunctions.extensions] + 
LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" + + [deps.LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + +[[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" -[[Markdown]] +[[deps.MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.11" + +[[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" -[[MbedTLS_jll]] +[[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] +[[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" -[[NEO_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] -git-tree-sha1 = "15deea2649d70f1bbaedf0aa87c9fa20fb21f22c" +[[deps.NEO_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "9846d87fd254cdaa1879dff93999e1bc32ed2658" uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" -version = "21.44.21506+0" +version = "23.17.26241+0" -[[NetworkOptions]] +[[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" -[[OpenLibm_jll]] +[[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" +version = "0.8.1+0" -[[OpenSpecFun_jll]] +[[deps.OpenSpecFun_jll]] 
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.5+0" -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.2" -[[Parameters]] +[[deps.Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.3" -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" -[[Preferences]] +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.2.0" + +[[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" +version = "1.4.1" -[[Printf]] +[[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" -[[REPL]] +[[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" -[[Random]] -deps = ["Serialization"] +[[deps.Random]] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[SHA]] +[[deps.Reexport]] +git-tree-sha1 = 
"45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[deps.Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.3.0" + +[[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" -[[SPIRV_LLVM_Translator_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" -uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" -version = "11.0.0+2" +[[deps.SPIRV_LLVM_Translator_unified_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "fe95f28a96975bd1d473e9273873b36402b79a54" +uuid = "85f0d8ed-5b39-5caa-b1ae-7472de402361" +version = "0.3.0+0" -[[SPIRV_Tools_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c0324b7e07bc4649f755bfe7e00f7c6ed6aa353f" +[[deps.SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "c5ab754aa7d71ea015783a9884a25e196860707c" uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" -version = "2021.2.0+0" +version = "2023.2.0+0" -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[deps.Scratch]] +deps = ["Dates"] +git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.2.0" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[Sockets]] +[[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -[[SpecialFunctions]] -deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", 
"OpenSpecFun_jll"] -git-tree-sha1 = "f0bccf98e16759818ffc5d97ac3ebf87eb950150" +[[deps.SpecialFunctions]] +deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] +git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.8.1" +version = "2.3.1" + + [deps.SpecialFunctions.extensions] + SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" + + [deps.SpecialFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + +[[deps.StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore"] +git-tree-sha1 = "d5fb407ec3179063214bc6277712928ba78459e2" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.6.4" +weakdeps = ["Statistics"] -[[Statistics]] + [deps.StaticArrays.extensions] + StaticArraysStatisticsExt = "Statistics" + +[[deps.StaticArraysCore]] +git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.2" + +[[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" -[[TOML]] +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" -[[Tar]] +[[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" -[[Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[TextWrap]] +[[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" -[[TimerOutputs]] +[[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] -git-tree-sha1 = "7cb456f358e8f9d102a8b25e8dfedf58fa5689bc" +git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" uuid = 
"a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.13" +version = "0.5.23" -[[UUIDs]] +[[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[[UnPack]] +[[deps.UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" -[[Unicode]] +[[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" -[[Zlib_jll]] +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.3" + +[[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" -[[gmmlib_jll]] +[[deps.gmmlib_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "0d5e5461d21b14853b4c332045c57d2601c403bd" +git-tree-sha1 = "228b09be83d88cc5d2236ef7b516d988d2639dfc" uuid = "09858cae-167c-5acb-9302-fddc6874d481" -version = "21.2.1+0" +version = "22.3.0+0" -[[libigc_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4f7a6c63ee113ee6da9a6afd06c77eb44998b1f3" +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" + +[[deps.libigc_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "7c0b5fa2ff90d96af106fd4a67ff6923cd3f9cb9" uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" -version = "1.0.8744+0" +version = "1.0.13822+0" -[[nghttp2_jll]] +[[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" -[[oneAPI]] -deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", 
"SpecialFunctions", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll"] -git-tree-sha1 = "efabcff2a259b0f1b10505db99aa18fc2de181ce" +[[deps.oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LinearAlgebra", "NEO_jll", "Preferences", "Printf", "Random", "SPIRV_LLVM_Translator_unified_jll", "SPIRV_Tools_jll", "SpecialFunctions", "UnsafeAtomicsLLVM", "oneAPI_Level_Zero_Headers_jll", "oneAPI_Level_Zero_Loader_jll", "oneAPI_Support_jll"] +git-tree-sha1 = "9e6a675faf3ea27d08018c9bd0a03596003ff5cf" uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" -version = "0.2.1" +version = "1.3.0" -[[oneAPI_Level_Zero_Headers_jll]] +[[deps.oneAPI_Level_Zero_Headers_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "e106a6eed53928cd1864f544562ea991b5f11464" +git-tree-sha1 = "412efcf5d55c65d3352c3915cffec1e53955570f" uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" -version = "1.2.43+0" +version = "1.6.3+0" -[[oneAPI_Level_Zero_Loader_jll]] +[[deps.oneAPI_Level_Zero_Loader_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] -git-tree-sha1 = "0f0fd4a92c4785454e4929c2e4db22c3d03d6889" +git-tree-sha1 = "87980483b19f0a00c8d62e8b6682acac1894c638" uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" -version = "1.5.0+0" +version = "1.11.0+0" + +[[deps.oneAPI_Support_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "39a73e1fcd9a33eeadfd69f9027e9c62d3c58219" +uuid = "b049733a-a71d-5ed3-8eba-7d323ac00b36" +version = "0.2.2+0" -[[p7zip_jll]] +[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/src/julia/JuliaStream.jl/oneAPI/Project.toml b/src/julia/JuliaStream.jl/oneAPI/Project.toml index 9f89f826..2a1b49d9 100644 --- a/src/julia/JuliaStream.jl/oneAPI/Project.toml +++ b/src/julia/JuliaStream.jl/oneAPI/Project.toml @@ -4,4 +4,4 @@ Parameters = 
"d96e819e-fc66-5662-9728-84c9c7592b0a" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [compat] -julia = "1.6" +julia = "1.9" diff --git a/src/julia/JuliaStream.jl/src/Stream.jl b/src/julia/JuliaStream.jl/src/Stream.jl index 755fe2b9..42030f86 100644 --- a/src/julia/JuliaStream.jl/src/Stream.jl +++ b/src/julia/JuliaStream.jl/src/Stream.jl @@ -220,10 +220,10 @@ function main() end function mk_row(xs::Vector{Float64}, name::String, total_bytes::Int) - tail = Base.rest(xs) - min = Iterators.minimum(tail) - max = Iterators.maximum(tail) - avg = Iterators.sum(tail) / Iterators.length(tail) + tail = Iterators.rest(xs) + min = Base.minimum(tail) + max = Base.maximum(tail) + avg = Base.sum(tail) / Base.length(tail) mbps = mega_scale * total_bytes / min if config.csv return [ diff --git a/src/julia/JuliaStream.jl/update_all.sh b/src/julia/JuliaStream.jl/update_all.sh index ad6c2ee6..648b4812 100755 --- a/src/julia/JuliaStream.jl/update_all.sh +++ b/src/julia/JuliaStream.jl/update_all.sh @@ -3,5 +3,6 @@ for BACKEND in "." 
"AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions" do - julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();' -done \ No newline at end of file + echo "Updating subproject $BACKEND" + julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();' +done From f47d27980f3b3f5889d7130f7aa769c9a0db8f1e Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 22:44:47 +0100 Subject: [PATCH 65/89] Increase CI root reserve for ROCm and tmp spills --- .github/workflows/main.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index b96e8801..d4a57e8a 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -75,7 +75,7 @@ jobs: - name: Maximize build space uses: easimon/maximize-build-space@v8 with: - root-reserve-mb: 512 + root-reserve-mb: 2048 swap-size-mb: 1024 remove-android: 'true' remove-codeql: 'true' From 717c40cb3d5c24e59a28dbe573c7fd38aa937b40 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 23:04:53 +0100 Subject: [PATCH 66/89] Increase CI root reserve for ROCm and tmp spills 2 --- .github/workflows/main.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index d4a57e8a..4167c499 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -75,7 +75,7 @@ jobs: - name: Maximize build space uses: easimon/maximize-build-space@v8 with: - root-reserve-mb: 2048 + root-reserve-mb: 1536 swap-size-mb: 1024 remove-android: 'true' remove-codeql: 'true' From 5c9cb660ec0d00a6800f86e5b53d23434d10c630 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 23:15:58 +0100 Subject: [PATCH 67/89] Increase CI root reserve for ROCm and tmp spills 3 --- src/ci-prepare-bionic.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ci-prepare-bionic.sh 
b/src/ci-prepare-bionic.sh index 6a1a9595..f5c1a704 100755 --- a/src/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -260,7 +260,11 @@ setup_clang_gcc() { } setup_rocm() { - sudo apt-get install -y rocm-dev rocthrust-dev + if [ "$SETUP" = true ]; then + sudo apt-get install -y rocm-dev rocthrust-dev + else + echo "Skipping apt setup for ROCm" + fi export_var ROCM_PATH "/opt/rocm" export_var PATH "$ROCM_PATH/bin:$PATH" # ROCm needs this for many of their libraries' CMake build to work export_var HIP_CXX "$ROCM_PATH/bin/hipcc" From 177416229c8ac60835ee641bfe7a6f91329e4028 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 23:23:40 +0100 Subject: [PATCH 68/89] Increase CI root reserve for ROCm and tmp spills 4 --- .github/workflows/main.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 4167c499..7a9755cd 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -75,8 +75,8 @@ jobs: - name: Maximize build space uses: easimon/maximize-build-space@v8 with: - root-reserve-mb: 1536 - swap-size-mb: 1024 + root-reserve-mb: 2048 + swap-size-mb: 512 remove-android: 'true' remove-codeql: 'true' From e88043a5aff95cfdca4a0c0c310d20779cdb8e28 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Sep 2023 23:50:56 +0100 Subject: [PATCH 69/89] Increase CI root reserve for ROCm and tmp spills 5 --- .github/workflows/main.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 7a9755cd..1dc24421 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -75,7 +75,7 @@ jobs: - name: Maximize build space uses: easimon/maximize-build-space@v8 with: - root-reserve-mb: 2048 + root-reserve-mb: 4096 swap-size-mb: 512 remove-android: 'true' remove-codeql: 'true' From 42de93076c303ecb14e4d51e8acb6417088d29b1 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Mon, 25 Sep 2023 
00:10:12 +0100 Subject: [PATCH 70/89] Increase CI root reserve for ROCm and tmp spills 6 --- .github/workflows/main.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 1dc24421..95f84e90 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -75,7 +75,7 @@ jobs: - name: Maximize build space uses: easimon/maximize-build-space@v8 with: - root-reserve-mb: 4096 + root-reserve-mb: 8192 swap-size-mb: 512 remove-android: 'true' remove-codeql: 'true' From 29b03be57270cc7742dd7e9618e33e38f1e50fe0 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Mon, 25 Sep 2023 01:12:08 +0100 Subject: [PATCH 71/89] Update readme and changelog --- CHANGELOG.md | 2 ++ README.md | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bf53a3d..c862837d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,8 @@ All notable changes to this project will be documented in this file. - Updates to the HIP kernels and API usage. - Number of thread-blocks in CUDA dot kernel implementation changed to 1024. - Fix compatibility of `sycl2020` (now `sycl2020-acc`) with hipSYCL. +- Bumped Julia compat to 1.9 +- Bumped Rust to 1.74.0-nightly (13e6f24b9 2023-09-23) ## [v4.0] - 2021-12-22 diff --git a/README.md b/README.md index 539262c4..9c185b6a 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ BabelStream is currently implemented in the following parallel programming model - C++ Parallel STL - Kokkos - RAJA -- SYCL and SYCL 2020 +- SYCL and SYCL2020 (USM and accessors) - TBB - Thrust (via CUDA or HIP) @@ -165,7 +165,7 @@ The `MODEL` variant selects one implementation of BabelStream to build. 
Currently available models are: ``` -omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020-acc;sycl2020-usm;acc;raja;tbb;thrust ``` ### GNU Make From bd6bb09b5d6fdf493917d60213f6db17df2ee497 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Mon, 25 Sep 2023 01:39:23 +0100 Subject: [PATCH 72/89] Fix MEM flag for CUDA, resolves #163 --- src/cuda/CUDAStream.cu | 8 +++++++- src/cuda/CUDAStream.h | 8 +------- src/cuda/model.cmake | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index e1abe343..573d24e9 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -42,7 +42,13 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) // Print out device information std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - +#if defined(MANAGED) + std::cout << "Memory: MANAGED" << std::endl; +#elif defined(PAGEFAULT) + std::cout << "Memory: PAGEFAULT" << std::endl; +#else + std::cout << "Memory: DEFAULT" << std::endl; +#endif array_size = ARRAY_SIZE; // Allocate the host array for partial sums for dot kernels diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index bb3f8665..5ff4f9b5 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -13,13 +13,7 @@ #include "Stream.h" -#if defined(PAGEFAULT) - #define IMPLEMENTATION_STRING "CUDA - Page Fault" -#elif defined(MANAGED) - #define IMPLEMENTATION_STRING "CUDA - Managed Memory" -#else - #define IMPLEMENTATION_STRING "CUDA" -#endif +#define IMPLEMENTATION_STRING "CUDA" #define TBSIZE 1024 #define DOT_NUM_BLOCKS 1024 diff --git a/src/cuda/model.cmake b/src/cuda/model.cmake index 8c6b5686..6202b2d3 100644 --- a/src/cuda/model.cmake +++ b/src/cuda/model.cmake @@ -29,7 +29,7 @@ macro(setup) endif() enable_language(CUDA) - register_definitions(MEM=${MEM}) 
+ register_definitions(${MEM}) # add -forward-unknown-to-host-compiler for compatibility reasons set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) From 369785c96a551693222b476a52be42ebb27933ef Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Mon, 25 Sep 2023 01:41:06 +0100 Subject: [PATCH 73/89] Add HIP managed memory support, resolves #162 --- CHANGELOG.md | 1 + src/ci-test-compile.sh | 2 ++ src/hip/HIPStream.cpp | 43 ++++++++++++++++++++++++++++++++++++++---- src/hip/model.cmake | 7 +++++++ 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c862837d..605d3273 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. ### Added - Ability to build Kokkos and RAJA versions against existing packages. - Thrust managed memory. +- HIP managed memory. - New implementation using SYCL2020 USM (sycl2020-acc) and renamed original `sycl2020` to `sycl2020-acc`. 
### Changed diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 610c3f07..a67303c3 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -289,6 +289,8 @@ build_hip() { local name="hip_build" run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=MANAGED" + run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=PAGEFAULT" run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM" } diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 0db84851..56c46ed3 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -45,11 +45,22 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) // Print out device information std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; +#if defined(MANAGED) + std::cout << "Memory: MANAGED" << std::endl; +#elif defined(PAGEFAULT) + std::cout << "Memory: PAGEFAULT" << std::endl; +#else + std::cout << "Memory: DEFAULT" << std::endl; +#endif array_size = ARRAY_SIZE; // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); + size_t array_bytes = sizeof(T); + array_bytes *= ARRAY_SIZE; + size_t total_bytes = array_bytes * 3; + // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires // synchronization (e.g. 
hipDeviceSynchronize) for the result to be available on the host @@ -63,13 +74,26 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - // Create device buffers - hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + // Create device buffers +#if defined(MANAGED) + hipMallocManaged(&d_a, array_bytes); + check_error(); + hipMallocManaged(&d_b, array_bytes); + check_error(); + hipMallocManaged(&d_c, array_bytes); + check_error(); +#elif defined(PAGEFAULT) + d_a = (T*)malloc(array_bytes); + d_b = (T*)malloc(array_bytes); + d_c = (T*)malloc(array_bytes); +#else + hipMalloc(&d_a, array_bytes); check_error(); - hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + hipMalloc(&d_b, array_bytes); check_error(); - hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + hipMalloc(&d_c, array_bytes); check_error(); +#endif } @@ -109,13 +133,24 @@ void HIPStream::init_arrays(T initA, T initB, T initC) template void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { + // Copy device memory to host +#if defined(PAGEFAULT) || defined(MANAGED) + hipDeviceSynchronize(); + for (int i = 0; i < array_size; i++) + { + a[i] = d_a[i]; + b[i] = d_b[i]; + c[i] = d_c[i]; + } +#else hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); check_error(); hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); check_error(); hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); check_error(); +#endif } template diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 78150c4b..a63efec1 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,6 +2,13 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") +register_flag_optional(MEM "Device memory mode: + DEFAULT - allocate host and device memory pointers. + MANAGED - use HIP Managed Memory. 
+ PAGEFAULT - shared memory, only host pointers allocated." + "DEFAULT") + macro(setup) # nothing to do here as hipcc does everything correctly, what a surprise! + register_definitions(${MEM}) endmacro() \ No newline at end of file From 2e3ebeecabc33b8c091927a6923d6d6fcef96ebd Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 1 Oct 2023 21:48:27 +0100 Subject: [PATCH 74/89] Don't set CMAKE_BUILD_TYPE unconditionally --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 879e4633..e89ada4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,8 +31,6 @@ endmacro() # the final executable name set(EXE_NAME babelstream) -# select default build type -set(CMAKE_BUILD_TYPE "Release") # for chrono and some basic CXX features, models can overwrite this if required set(CMAKE_CXX_STANDARD 11) From 92fed7082b7cec8199a267575c2b069106dba4d5 Mon Sep 17 00:00:00 2001 From: Troels Henriksen Date: Tue, 3 Oct 2023 14:57:20 +0200 Subject: [PATCH 75/89] Add Futhark implementation (#146) * Add Futhark. --- .github/workflows/main.yaml | 13 ++- CMakeLists.txt | 3 +- README.md | 3 +- src/futhark/FutharkStream.cpp | 212 ++++++++++++++++++++++++++++++++++ src/futhark/FutharkStream.h | 60 ++++++++++ src/futhark/babelstream.fut | 62 ++++++++++ src/futhark/model.cmake | 55 +++++++++ src/main.cpp | 6 + 8 files changed, 411 insertions(+), 3 deletions(-) create mode 100644 src/futhark/FutharkStream.cpp create mode 100644 src/futhark/FutharkStream.h create mode 100644 src/futhark/babelstream.fut create mode 100644 src/futhark/model.cmake diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 95f84e90..2e542016 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -225,4 +225,15 @@ jobs: run: ./src/ci-test-compile.sh ./build dpcpp all ${{ env.CMAKE_3_24_BIN }} - name: Test compile hipsycl @ CMake 3.24 if: ${{ ! 
cancelled() }} - run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }} \ No newline at end of file + run: ./src/ci-test-compile.sh ./build hipsycl all ${{ env.CMAKE_3_24_BIN }} + + test-futhark: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Prepare Futhark compiler + uses: diku-dk/install-futhark@HEAD + with: + version: 'latest' + - run: cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=multicore + - run: cmake --build build diff --git a/CMakeLists.txt b/CMakeLists.txt index e89ada4c..7551dc3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") cmake_policy(SET CMP0135 NEW) endif () -project(BabelStream VERSION 4.0 LANGUAGES CXX) +project(BabelStream VERSION 4.0 LANGUAGES CXX C) # uncomment for debugging build issues: #set(CMAKE_VERBOSE_MAKEFILE ON) @@ -162,6 +162,7 @@ register_model(acc ACC ACCStream.cpp) register_model(raja USE_RAJA RAJAStream.cpp) register_model(tbb TBB TBBStream.cpp) register_model(thrust THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust +register_model(futhark FUTHARK FutharkStream.cpp) set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") diff --git a/README.md b/README.md index 9c185b6a..487f8e90 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ BabelStream is currently implemented in the following parallel programming model - SYCL and SYCL2020 (USM and accessors) - TBB - Thrust (via CUDA or HIP) +- Futhark This project also contains implementations in alternative languages with different build systems: * Julia - [JuliaStream.jl](./src/julia/JuliaStream.jl) @@ -101,7 +102,7 @@ The source for each model's implementations are located in `./src/`. 
Currently available models are: ``` -omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +omp;ocl;std-data;std-indices;std-ranges;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust;futhark ``` #### Overriding default flags diff --git a/src/futhark/FutharkStream.cpp b/src/futhark/FutharkStream.cpp new file mode 100644 index 00000000..ebd3633b --- /dev/null +++ b/src/futhark/FutharkStream.cpp @@ -0,0 +1,212 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// Copyright (c) 2022 Troels Henriksen +// University of Copenhagen +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include // For aligned_alloc +#include +#include "FutharkStream.h" + +template +FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) +{ + this->array_size = ARRAY_SIZE; + this->cfg = futhark_context_config_new(); + this->device = "#" + std::to_string(device); +#if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl) + futhark_context_config_set_device(cfg, this->device.c_str()); +#endif + this->ctx = futhark_context_new(cfg); + this->a = NULL; + this->b = NULL; + this->c = NULL; +} + +template <> +FutharkStream::~FutharkStream() +{ + if (this->a) { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->a); + } + if (this->b) { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b); + } + if (this->c) { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c); + } + futhark_context_free(this->ctx); + futhark_context_config_free(this->cfg); +} + +template <> +FutharkStream::~FutharkStream() +{ + if (this->a) { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a); + } + if (this->b) { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b); + } + if (this->c) { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c); + } + futhark_context_free(this->ctx); + futhark_context_config_free(this->cfg); +} + +template <> +void 
FutharkStream::init_arrays(float initA, float initB, float initC) { + int array_size = this->array_size; + float *a = new float[array_size]; + float *b = new float[array_size]; + float *c = new float[array_size]; + for (int i = 0; i < array_size; i++) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + this->a = (futhark_f32_1d*)futhark_new_f32_1d(this->ctx, a, array_size); + this->b = (futhark_f32_1d*)futhark_new_f32_1d(this->ctx, b, array_size); + this->c = (futhark_f32_1d*)futhark_new_f32_1d(this->ctx, c, array_size); + futhark_context_sync(this->ctx); + delete[] a; + delete[] b; + delete[] c; +} + +template <> +void FutharkStream::init_arrays(double initA, double initB, double initC) { + int array_size = this->array_size; + double *a = new double[array_size]; + double *b = new double[array_size]; + double *c = new double[array_size]; + for (int i = 0; i < array_size; i++) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + this->a = (futhark_f64_1d*)futhark_new_f64_1d(this->ctx, a, array_size); + this->b = (futhark_f64_1d*)futhark_new_f64_1d(this->ctx, b, array_size); + this->c = (futhark_f64_1d*)futhark_new_f64_1d(this->ctx, c, array_size); + futhark_context_sync(this->ctx); + delete[] a; + delete[] b; + delete[] c; +} + +template <> +void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { + futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data()); + futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data()); + futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data()); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { + futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data()); + futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data()); + futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data()); + futhark_context_sync(this->ctx); +} + +template <> 
+void FutharkStream::copy() { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c); + futhark_entry_f32_copy(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::copy() { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c); + futhark_entry_f64_copy(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::mul() { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->b); + futhark_entry_f32_mul(this->ctx, (futhark_f32_1d**)&this->b, (futhark_f32_1d*)this->c); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::mul() { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->b); + futhark_entry_f64_mul(this->ctx, (futhark_f64_1d**)&this->b, (futhark_f64_1d*)this->c); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::add() { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c); + futhark_entry_f32_add(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::add() { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c); + futhark_entry_f64_add(this->ctx, (futhark_f64_1d**)&this->c, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::triad() { + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c); + futhark_entry_f32_triad(this->ctx, (futhark_f32_1d**)&this->c, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b); + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::triad() { + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->a); + futhark_entry_f64_triad(this->ctx, (futhark_f64_1d**)&this->a, (futhark_f64_1d*)this->b, (futhark_f64_1d*)this->c); + futhark_context_sync(this->ctx); +} + +template <> +void 
FutharkStream::nstream() { + futhark_f32_1d* d; + futhark_entry_f32_triad(this->ctx, &d, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b); + futhark_free_f32_1d(this->ctx, (futhark_f32_1d*)this->c); + this->c = d; + futhark_context_sync(this->ctx); +} + +template <> +void FutharkStream::nstream() { + futhark_f64_1d* d; + futhark_entry_f64_triad(this->ctx, &d, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b); + futhark_free_f64_1d(this->ctx, (futhark_f64_1d*)this->c); + this->c = d; + futhark_context_sync(this->ctx); +} + +template <> +float FutharkStream::dot() { + float res; + futhark_entry_f32_dot(this->ctx, &res, (futhark_f32_1d*)this->a, (futhark_f32_1d*)this->b); + futhark_context_sync(this->ctx); + return res; +} + +template <> +double FutharkStream::dot() { + double res; + futhark_entry_f64_dot(this->ctx, &res, (futhark_f64_1d*)this->a, (futhark_f64_1d*)this->b); + futhark_context_sync(this->ctx); + return res; +} + +void listDevices(void) +{ + std::cout << "Device selection not supported." 
<< std::endl; +} + +template class FutharkStream; +template class FutharkStream; diff --git a/src/futhark/FutharkStream.h b/src/futhark/FutharkStream.h new file mode 100644 index 00000000..6290e79a --- /dev/null +++ b/src/futhark/FutharkStream.h @@ -0,0 +1,60 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// Copyright (c) 2022 Troels Henriksen +// University of Copenhagen +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" +#include "babelstream.h" + +#if defined(FUTHARK_BACKEND_c) +#define IMPLEMENTATION_STRING "Futhark (sequential)" +#elif defined(FUTHARK_BACKEND_multicore) +#define IMPLEMENTATION_STRING "Futhark (parallel CPU)" +#elif defined(FUTHARK_BACKEND_opencl) +#define IMPLEMENTATION_STRING "Futhark (OpencL)" +#elif defined(FUTHARK_BACKEND_cuda) +#define IMPLEMENTATION_STRING "Futhark (CUDA)" +#else +#define IMPLEMENTATION_STRING "Futhark (unknown backend)" +#endif + +template +class FutharkStream : public Stream +{ +protected: + // Size of arrays + int array_size; + // For device selection. 
+ std::string device; + + // Futhark stuff + struct futhark_context_config *cfg; + struct futhark_context *ctx; + + // Device side arrays + void* a; + void* b; + void* c; + +public: + FutharkStream(const int, int); + ~FutharkStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; +}; diff --git a/src/futhark/babelstream.fut b/src/futhark/babelstream.fut new file mode 100644 index 00000000..d513a60e --- /dev/null +++ b/src/futhark/babelstream.fut @@ -0,0 +1,62 @@ +module type kernels = { + type t + val copy [n] : [n]t -> *[n]t + val mul [n] : t -> [n]t -> [n]t + val add [n] : [n]t -> [n]t -> [n]t + val triad [n] : t -> [n]t -> [n]t -> [n]t + val dot [n] : [n]t -> [n]t -> t + -- Uniqueness allows nstream to mutate the 'a' array. 
+ val nstream [n] : t -> *[n]t -> [n]t -> [n]t -> [n]t +} + +module kernels (P: real) : kernels with t = P.t = { + type t = P.t + def copy = copy + def mul scalar c = map (P.*scalar) c + def add = map2 (P.+) + def triad scalar b c = map2 (P.+) b (map (P.* scalar) c) + def dot a b = reduce (P.+) (P.i32 0) (map2 (P.*) a b) + def nstream scalar a b c = map2 (P.+) a (map2 (P.+) b (map (P.*scalar) c)) +} + +module f32_kernels = kernels f32 +def f32_start_scalar : f32 = 0.4 +entry f32_copy = f32_kernels.copy +entry f32_mul = f32_kernels.mul f32_start_scalar +entry f32_add = f32_kernels.add +entry f32_triad = f32_kernels.triad f32_start_scalar +entry f32_nstream = f32_kernels.nstream f32_start_scalar +entry f32_dot = f32_kernels.dot + +module f64_kernels = kernels f64 +def f64_start_scalar : f64 = 0.4 +entry f64_copy = f64_kernels.copy +entry f64_mul = f64_kernels.mul f64_start_scalar +entry f64_add = f64_kernels.add +entry f64_triad = f64_kernels.triad f64_start_scalar +entry f64_nstream = f64_kernels.nstream f64_start_scalar +entry f64_dot = f64_kernels.dot + +-- == +-- entry: f32_copy f32_mul +-- random input { [33554432]f32 } + +-- == +-- entry: f32_add f32_dot f32_triad +-- random input { [33554432]f32 [33554432]f32 } + +-- == +-- entry: f32_nstream +-- random input { [33554432]f32 [33554432]f32 [33554432]f32 } + +-- == +-- entry: f64_copy f64_mul +-- random input { [33554432]f64 } + +-- == +-- entry: f64_add f64_dot f64_triad +-- random input { [33554432]f64 [33554432]f64 } + +-- == +-- entry: f64_nstream +-- random input { [33554432]f64 [33554432]f64 [33554432]f64 } diff --git a/src/futhark/model.cmake b/src/futhark/model.cmake new file mode 100644 index 00000000..edd21fa6 --- /dev/null +++ b/src/futhark/model.cmake @@ -0,0 +1,55 @@ +# Use +# +# cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=foo -DFUTHARK_COMPILER=foo/bar/bin/futhark +# +# to use the Futhark backend, where 'foo' must be one of 'multicore', +# 'c', 'opencl', or 'cuda'. Defaults to 'multicore'. 
+# +# Use -DFUTHARK_COMPILER to set the path to the Futhark compiler +# binary. Defaults to 'futhark' on the PATH. + +register_flag_optional(FUTHARK_BACKEND + "Use a specific Futhark backend, possible options are: + - c + - multicore + - opencl + - cuda" + "multicore") + +register_flag_optional(FUTHARK_COMPILER + "Absolute path to the Futhark compiler, defaults to the futhark compiler on PATH" + "futhark") + +macro(setup) + add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/babelstream.c + ${CMAKE_CURRENT_BINARY_DIR}/babelstream.h + COMMAND ${FUTHARK_COMPILER} ${FUTHARK_BACKEND} + --library src/futhark/babelstream.fut + -o ${CMAKE_CURRENT_BINARY_DIR}/babelstream + DEPENDS src/futhark/babelstream.fut + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + VERBATIM + ) + if (${FUTHARK_BACKEND} STREQUAL "c") + # Nothing to do. + elseif (${FUTHARK_BACKEND} STREQUAL "multicore") + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + register_link_library(Threads::Threads) + elseif (${FUTHARK_BACKEND} STREQUAL "opencl") + find_package(OpenCL REQUIRED) + register_link_library(OpenCL::OpenCL) + elseif (${FUTHARK_BACKEND} STREQUAL "cuda") + find_package(CUDA REQUIRED) + register_link_library("nvrtc" "cuda" "cudart") + else () + message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}") + endif() +endmacro() + +macro(setup_target) + target_sources(${EXE_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/babelstream.c") + include_directories("${CMAKE_CURRENT_BINARY_DIR}") +endmacro() diff --git a/src/main.cpp b/src/main.cpp index d7208da8..d946d775 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -49,6 +49,8 @@ #include "SYCLStream2020.h" #elif defined(OMP) #include "OMPStream.h" +#elif defined(FUTHARK) +#include "FutharkStream.h" #endif // Default size of 2^25 @@ -298,6 +300,10 @@ void run() // Use the OpenMP implementation stream = new OMPStream(ARRAY_SIZE, deviceIndex); +#elif defined(FUTHARK) + // Use the Futhark implementation + stream = new 
FutharkStream(ARRAY_SIZE, deviceIndex); + #endif stream->init_arrays(startA, startB, startC); From 9954b7d38cd85d20927428425a9840a72a56c3e4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 6 Oct 2023 17:56:42 +0100 Subject: [PATCH 76/89] Set CUDA dot kernel to use number of blocks relative to device property This aligns with the approach implemented in other models (SYCL 1.2.1 and HIP) Cherry-picks the CUDA updates from lmeadows in #122 --- src/cuda/CUDAStream.cu | 26 ++++++++++++++++---------- src/cuda/CUDAStream.h | 3 ++- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index 573d24e9..c4a5e616 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -51,16 +51,22 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) #endif array_size = ARRAY_SIZE; + + // Query device for sensible dot kernel block count + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_index); + check_error(); + dot_num_blocks = props.multiProcessorCount * 4; + // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + sums = (T*)malloc(sizeof(T) * dot_num_blocks); size_t array_bytes = sizeof(T); array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 3; + size_t total_bytes = array_bytes * 4; + std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl; // Check buffers fit on the device - cudaDeviceProp props; - cudaGetDeviceProperties(&props, 0); if (props.totalGlobalMem < total_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); @@ -72,13 +78,13 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) check_error(); cudaMallocManaged(&d_c, array_bytes); check_error(); - cudaMallocManaged(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T)); check_error(); #elif defined(PAGEFAULT) d_a = 
(T*)malloc(array_bytes); d_b = (T*)malloc(array_bytes); d_c = (T*)malloc(array_bytes); - d_sum = (T*)malloc(sizeof(T)*DOT_NUM_BLOCKS); + d_sum = (T*)malloc(sizeof(T)*dot_num_blocks); #else cudaMalloc(&d_a, array_bytes); check_error(); @@ -86,7 +92,7 @@ CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) check_error(); cudaMalloc(&d_c, array_bytes); check_error(); - cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + cudaMalloc(&d_sum, dot_num_blocks*sizeof(T)); check_error(); #endif } @@ -267,19 +273,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) template T CUDAStream::dot() { - dot_kernel<<>>(d_a, d_b, d_sum, array_size); + dot_kernel<<>>(d_a, d_b, d_sum, array_size); check_error(); #if defined(MANAGED) || defined(PAGEFAULT) cudaDeviceSynchronize(); check_error(); #else - cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost); check_error(); #endif T sum = 0.0; - for (int i = 0; i < DOT_NUM_BLOCKS; i++) + for (int i = 0; i < dot_num_blocks; i++) { #if defined(MANAGED) || defined(PAGEFAULT) sum += d_sum[i]; diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index 5ff4f9b5..d16511fe 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -16,7 +16,6 @@ #define IMPLEMENTATION_STRING "CUDA" #define TBSIZE 1024 -#define DOT_NUM_BLOCKS 1024 template class CUDAStream : public Stream @@ -34,6 +33,8 @@ class CUDAStream : public Stream T *d_c; T *d_sum; + // Number of blocks for dot kernel + int dot_num_blocks; public: From e347d2ff6c7a6af3b20a76df969130685c6ad40a Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 09:41:18 +0100 Subject: [PATCH 77/89] Aggregate initialise numeric types, resolves #134 --- src/acc/ACCStream.cpp | 2 +- src/hip/HIPStream.cpp | 4 ++-- src/kokkos/KokkosStream.cpp | 2 +- src/main.cpp | 10 +++++----- src/ocl/OCLStream.cpp | 2 +- src/omp/OMPStream.cpp | 2 +- 
src/raja/RAJAStream.cpp | 2 +- src/std-data/STDDataStream.cpp | 2 +- src/std-indices/STDIndicesStream.cpp | 2 +- src/std-ranges/STDRangesStream.cpp | 2 +- src/sycl/SYCLStream.cpp | 4 ++-- 11 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/acc/ACCStream.cpp b/src/acc/ACCStream.cpp index 1e38c8b3..48b9f2de 100644 --- a/src/acc/ACCStream.cpp +++ b/src/acc/ACCStream.cpp @@ -149,7 +149,7 @@ void ACCStream::nstream() template T ACCStream::dot() { - T sum = 0.0; + T sum{}; int array_size = this->array_size; T * restrict a = this->a; diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 56c46ed3..aa64dbd3 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -244,7 +244,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) const size_t local_i = threadIdx.x; size_t i = blockDim.x * blockIdx.x + local_i; - tb_sum[local_i] = 0.0; + tb_sum[local_i]{}; for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; @@ -269,7 +269,7 @@ T HIPStream::dot() hipDeviceSynchronize(); check_error(); - T sum = 0.0; + T sum{}; for (int i = 0; i < dot_num_blocks; i++) sum += sums[i]; diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index 158e109d..66b96622 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -140,7 +140,7 @@ T KokkosStream::dot() Kokkos::View a(*d_a); Kokkos::View b(*d_b); - T sum = 0.0; + T sum{}; Kokkos::parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, T &tmp) { diff --git a/src/main.cpp b/src/main.cpp index d946d775..54a3ed97 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -309,7 +309,7 @@ void run() stream->init_arrays(startA, startB, startC); // Result of the Dot kernel, if used. 
- T sum = 0.0; + T sum{}; std::vector> timings; @@ -467,7 +467,7 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector T goldA = startA; T goldB = startB; T goldC = startC; - T goldSum = 0.0; + T goldSum{}; const T scalar = startScalar; @@ -493,11 +493,11 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldSum = goldA * goldB * ARRAY_SIZE; // Calculate the average error - long double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + std::fabs(val - goldA); }); + long double errA = std::accumulate(a.begin(), a.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldA); }); errA /= a.size(); - long double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + std::fabs(val - goldB); }); + long double errB = std::accumulate(b.begin(), b.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldB); }); errB /= b.size(); - long double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + std::fabs(val - goldC); }); + long double errC = std::accumulate(c.begin(), c.end(), T{}, [&](double sum, const T val){ return sum + std::fabs(val - goldC); }); errC /= c.size(); long double errSum = std::fabs((sum - goldSum)/goldSum); diff --git a/src/ocl/OCLStream.cpp b/src/ocl/OCLStream.cpp index be88ba9b..26b525a8 100644 --- a/src/ocl/OCLStream.cpp +++ b/src/ocl/OCLStream.cpp @@ -260,7 +260,7 @@ T OCLStream::dot() ); cl::copy(queue, d_sum, sums.begin(), sums.end()); - T sum = 0.0; + T sum{}; for (T val : sums) sum += val; diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp index 0cd80358..774f61bf 100644 --- a/src/omp/OMPStream.cpp +++ b/src/omp/OMPStream.cpp @@ -220,7 +220,7 @@ void OMPStream::nstream() template T OMPStream::dot() { - T sum = 0.0; + T sum{}; #ifdef OMP_TARGET_GPU int array_size = this->array_size; diff --git a/src/raja/RAJAStream.cpp 
b/src/raja/RAJAStream.cpp index d271ea4f..6a99999d 100644 --- a/src/raja/RAJAStream.cpp +++ b/src/raja/RAJAStream.cpp @@ -131,7 +131,7 @@ T RAJAStream::dot() T* RAJA_RESTRICT a = d_a; T* RAJA_RESTRICT b = d_b; - RAJA::ReduceSum sum(0.0); + RAJA::ReduceSum sum(T{}); forall(range, [=] RAJA_DEVICE (RAJA::Index_type index) { diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index e426835d..a234d617 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -94,7 +94,7 @@ template T STDDataStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0); + return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); } void listDevices(void) diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 1cf1cccb..fc9f3806 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -105,7 +105,7 @@ template T STDIndicesStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, 0.0); + return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); } void listDevices(void) diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index d4976918..b29d0c42 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -135,7 +135,7 @@ T STDRangesStream::dot() return std::transform_reduce( exe_policy, - a, a + array_size, b, 0.0); + a, a + array_size, b, T{}); } void listDevices(void) diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp index 00c043f9..512517b6 100644 --- a/src/sycl/SYCLStream.cpp +++ b/src/sycl/SYCLStream.cpp @@ -191,7 +191,7 @@ T SYCLStream::dot() size_t li = item.get_local_id(0); size_t global_size = item.get_global_range()[0]; - wg_sum[li] = 0.0; + wg_sum[li] = {}; for (; i < N; i += global_size) wg_sum[li] += ka[i] * kb[i]; @@ -208,7 
+208,7 @@ T SYCLStream::dot() }); }); - T sum = 0.0; + T sum{}; auto h_sum = d_sum->template get_access(); for (int i = 0; i < dot_num_groups; i++) { From ffae3ba83fbcee4d8dacc80d56a4e330a3b08e61 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 09:45:16 +0100 Subject: [PATCH 78/89] Fix CMAKE_CUDA_FLAGS, resolves #166 --- src/cuda/model.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cuda/model.cmake b/src/cuda/model.cmake index 6202b2d3..7c1b0d6e 100644 --- a/src/cuda/model.cmake +++ b/src/cuda/model.cmake @@ -32,7 +32,8 @@ macro(setup) register_definitions(${MEM}) # add -forward-unknown-to-host-compiler for compatibility reasons - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # appended later From 5f3741e4044e814a58ad1aa0989eae431c73645c Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 10:23:31 +0100 Subject: [PATCH 79/89] Add init/read timing for C++ models --- src/main.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/main.cpp b/src/main.cpp index d946d775..13059494 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -306,7 +306,9 @@ void run() #endif + auto init1 = std::chrono::high_resolution_clock::now(); stream->init_arrays(startA, startB, startC); + auto init2 = std::chrono::high_resolution_clock::now(); // Result of the Dot kernel, if used. 
T sum = 0.0; @@ -333,7 +335,54 @@ void run() std::vector c(ARRAY_SIZE); + auto read1 = std::chrono::high_resolution_clock::now(); stream->read_arrays(a, b, c); + auto read2 = std::chrono::high_resolution_clock::now(); + + auto initElapsedS = std::chrono::duration_cast>(init2 - init1).count(); + auto readElapsedS = std::chrono::duration_cast>(read2 - read1).count(); + auto initBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / initElapsedS; + auto readBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / readElapsedS; + + if (output_as_csv) + { + std::cout + << "phase" << csv_separator + << "n_elements" << csv_separator + << "sizeof" << csv_separator + << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator + << "runtime" << std::endl; + std::cout + << "Init" << csv_separator + << ARRAY_SIZE << csv_separator + << sizeof(T) << csv_separator + << initBWps << csv_separator + << initElapsedS << std::endl; + std::cout + << "Read" << csv_separator + << ARRAY_SIZE << csv_separator + << sizeof(T) << csv_separator + << readBWps << csv_separator + << readElapsedS << std::endl; + } + else + { + std::cout << "Init: " + << std::setw(7) + << initElapsedS + << " s (=" + << initBWps + << (mibibytes ? " MiBytes/sec" : " MBytes/sec") + << ")" << std::endl; + std::cout << "Read: " + << std::setw(7) + << readElapsedS + << " s (=" + << readBWps + << (mibibytes ?
" MiBytes/sec" : " MBytes/sec") + << ")" << std::endl; + } + check_solution(num_times, a, b, c, sum); // Display timing results From f2f7f3a3de77b0d2026c5f81ef2980096f83ae91 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 11:12:08 +0100 Subject: [PATCH 80/89] Fix bad dot group initialiser in HIP and CUDA --- src/cuda/CUDAStream.cu | 2 +- src/hip/HIPStream.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index c4a5e616..75a8f3c0 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -253,7 +253,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) int i = blockDim.x * blockIdx.x + threadIdx.x; const size_t local_i = threadIdx.x; - tb_sum[local_i] = 0.0; + tb_sum[local_i] = {}; for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index aa64dbd3..ed4ef779 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -244,7 +244,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) const size_t local_i = threadIdx.x; size_t i = blockDim.x * blockIdx.x + local_i; - tb_sum[local_i]{}; + tb_sum[local_i] = {}; for (; i < array_size; i += blockDim.x*gridDim.x) tb_sum[local_i] += a[i] * b[i]; From 512a6fac0c43ca964d203a5f1cd0809a21219518 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 11:16:46 +0100 Subject: [PATCH 81/89] Add init/read timing for Rust --- src/rust/rust-stream/rustfmt.toml | 2 +- src/rust/rust-stream/src/lib.rs | 47 +++++++++++++++++-- src/rust/rust-stream/src/stream.rs | 12 +++++ .../rust-stream/tests/integration_test.rs | 8 ++-- 4 files changed, 60 insertions(+), 9 deletions(-) diff --git a/src/rust/rust-stream/rustfmt.toml b/src/rust/rust-stream/rustfmt.toml index aa2f0e9a..66b62356 100644 --- a/src/rust/rust-stream/rustfmt.toml +++ b/src/rust/rust-stream/rustfmt.toml @@ -54,7 +54,7 @@ 
use_field_init_shorthand = false force_explicit_abi = true condense_wildcard_suffixes = false color = "Auto" -required_version = "1.4.38" +required_version = "1.6.0" unstable_features = false disable_all_formatting = false skip_children = false diff --git a/src/rust/rust-stream/src/lib.rs b/src/rust/rust-stream/src/lib.rs index 3ac72c31..41ac0c28 100644 --- a/src/rust/rust-stream/src/lib.rs +++ b/src/rust/rust-stream/src/lib.rs @@ -174,7 +174,7 @@ where StreamData: RustStream { ); } - stream.init_arrays(); + let init = stream.run_init_arrays(); let tabulate = |xs: &Vec, name: &str, t_size: usize| -> Vec<(&str, String)> { let tail = &xs[1..]; // tail only @@ -235,10 +235,47 @@ where StreamData: RustStream { }; }; + let show_setup = |init: Duration, read: Duration| { + let setup = vec![ + ("Init", init.as_secs_f64(), 3 * array_bytes), + ("Read", read.as_secs_f64(), 3 * array_bytes), + ]; + if option.csv { + tabulate_all( + setup + .iter() + .map(|(name, elapsed, t_size)| { + vec![ + ("phase", name.to_string()), + ("n_elements", option.arraysize.to_string()), + ("sizeof", t_size.to_string()), + ( + if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" }, + (mega_scale * (*t_size as f64) / elapsed).to_string(), + ), + ("runtime", elapsed.to_string()), + ] + }) + .collect::>(), + ); + } else { + for (name, elapsed, t_size) in setup { + println!( + "{}: {:.5} s (={:.5} {})", + name, + elapsed, + mega_scale * (t_size as f64) / elapsed, + if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" } + ); + } + } + }; + let solutions_correct = match benchmark { Benchmark::All => { let (results, sum) = stream.run_all(option.numtimes); - stream.read_arrays(); + let read = stream.run_read_arrays(); + show_setup(init, read); let correct = check_solution(benchmark, option.numtimes, &stream, Some(sum)); tabulate_all(vec![ tabulate(&results.copy, "Copy", 2 * array_bytes), @@ -251,14 +288,16 @@ where StreamData: RustStream { } Benchmark::NStream => { let results = 
stream.run_nstream(option.numtimes); - stream.read_arrays(); + let read = stream.run_read_arrays(); + show_setup(init, read); let correct = check_solution(benchmark, option.numtimes, &stream, None); tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]); correct } Benchmark::Triad => { let results = stream.run_triad(option.numtimes); - stream.read_arrays(); + let read = stream.run_read_arrays(); + show_setup(init, read); let correct = check_solution(benchmark, option.numtimes, &stream, None); let total_bytes = 3 * array_bytes * option.numtimes; let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64()); diff --git a/src/rust/rust-stream/src/stream.rs b/src/rust/rust-stream/src/stream.rs index 560c6f1e..86de56b2 100644 --- a/src/rust/rust-stream/src/stream.rs +++ b/src/rust/rust-stream/src/stream.rs @@ -132,6 +132,18 @@ pub trait RustStream { fn nstream(&mut self); fn dot(&mut self) -> T; + fn run_init_arrays(&mut self) -> Duration { + timed(|| { + self.init_arrays(); + }) + } + + fn run_read_arrays(&mut self) -> Duration { + timed(|| { + self.read_arrays(); + }) + } + fn run_all(&mut self, n: usize) -> (AllTiming>, T) { let mut timings: AllTiming> = AllTiming { copy: vec![Duration::default(); n], diff --git a/src/rust/rust-stream/tests/integration_test.rs b/src/rust/rust-stream/tests/integration_test.rs index 8031a794..01705465 100644 --- a/src/rust/rust-stream/tests/integration_test.rs +++ b/src/rust/rust-stream/tests/integration_test.rs @@ -2,10 +2,10 @@ use rstest::rstest; #[rstest] fn test_main( - #[values(0, 1, 2, 3, 4)] device: usize, // - #[values("", "--pin")] pin: &str, // - #[values("", "--malloc")] malloc: &str, // - #[values("", "--init")] init: &str, // + #[values(0, 1, 2, 3, 4)] device: usize, // + #[values("", "--pin")] pin: &str, // + #[values("", "--malloc")] malloc: &str, // + #[values("", "--init")] init: &str, // #[values("", "--triad-only", "--nstream-only")] option: &str, // ) { let line = format!( From 
971d1e8ac72b6d4fb76874d04a6a4873983f41e1 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 12:10:08 +0100 Subject: [PATCH 82/89] Add init/read timing for Scala --- src/scala/scala-stream/.bsp/sbt.json | 1 - src/scala/scala-stream/.gitignore | 1 + src/scala/scala-stream/.scalafmt.conf | 2 +- src/scala/scala-stream/build.sbt | 13 ++-- .../scala-stream/project/build.properties | 2 +- src/scala/scala-stream/project/plugins.sbt | 6 +- .../main/scala/scalastream/ScalaStream.scala | 63 ++++++++++++++----- 7 files changed, 63 insertions(+), 25 deletions(-) delete mode 100644 src/scala/scala-stream/.bsp/sbt.json diff --git a/src/scala/scala-stream/.bsp/sbt.json b/src/scala/scala-stream/.bsp/sbt.json deleted file mode 100644 index 2e1edb1e..00000000 --- a/src/scala/scala-stream/.bsp/sbt.json +++ /dev/null @@ -1 +0,0 @@ -{"name":"sbt","version":"1.5.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64/bin/java","-Xms100m","-Xmx100m","-classpath","/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar"]} \ No newline at end of file diff --git a/src/scala/scala-stream/.gitignore b/src/scala/scala-stream/.gitignore index 2f7896d1..ee5cda21 100644 --- a/src/scala/scala-stream/.gitignore +++ b/src/scala/scala-stream/.gitignore @@ -1 +1,2 @@ target/ +.bsp/ diff --git a/src/scala/scala-stream/.scalafmt.conf b/src/scala/scala-stream/.scalafmt.conf index 8c7d0c8e..5d87df36 100644 --- a/src/scala/scala-stream/.scalafmt.conf +++ b/src/scala/scala-stream/.scalafmt.conf @@ -1,4 +1,4 @@ -version = "3.0.0-RC2" +version = "3.7.14" runner.dialect = scala3 style = defaultWithAlign diff --git a/src/scala/scala-stream/build.sbt b/src/scala/scala-stream/build.sbt index 49164f63..b13fda3e 100644 --- 
a/src/scala/scala-stream/build.sbt +++ b/src/scala/scala-stream/build.sbt @@ -3,7 +3,7 @@ lazy val mainCls = Some("scalastream.App") lazy val root = (project in file(".")) .enablePlugins(NativeImagePlugin) .settings( - scalaVersion := "3.0.0", + scalaVersion := "3.3.1", version := "4.0", organization := "uk.ac.bristol.uob-hpc", organizationName := "University of Bristol", @@ -11,6 +11,11 @@ lazy val root = (project in file(".")) assembly / mainClass := mainCls, scalacOptions ~= filterConsoleScalacOptions, assembly / assemblyJarName := "scala-stream.jar", + assembly / assemblyMergeStrategy := { + case PathList("module-info.class") => MergeStrategy.discard + case PathList("META-INF", "versions", xs @ _, "module-info.class") => MergeStrategy.discard + case x => (ThisBuild / assemblyMergeStrategy).value(x) + }, nativeImageOptions := Seq( "--no-fallback", "-H:ReflectionConfigurationFiles=../../reflect-config.json" @@ -22,8 +27,8 @@ lazy val root = (project in file(".")) // Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part ("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13), // par also uses lazy val at some point, so it doesn't work in nativeImage - "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.3", - "net.openhft" % "affinity" % "3.21ea1", - "org.slf4j" % "slf4j-simple" % "1.7.30" // for affinity + "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4", + "net.openhft" % "affinity" % "3.23.2", + "org.slf4j" % "slf4j-simple" % "2.0.5" // for affinity ) ) diff --git a/src/scala/scala-stream/project/build.properties b/src/scala/scala-stream/project/build.properties index 19479ba4..875b706a 100644 --- a/src/scala/scala-stream/project/build.properties +++ b/src/scala/scala-stream/project/build.properties @@ -1 +1 @@ -sbt.version=1.5.2 +sbt.version=1.9.2 diff --git a/src/scala/scala-stream/project/plugins.sbt 
b/src/scala/scala-stream/project/plugins.sbt index 2c82902e..35a00f05 100644 --- a/src/scala/scala-stream/project/plugins.sbt +++ b/src/scala/scala-stream/project/plugins.sbt @@ -1,6 +1,6 @@ addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3") -addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17") +addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.20") addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0") -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3") addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27") -addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3") diff --git a/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala b/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala index 9c011a6d..888ba7c0 100644 --- a/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala +++ b/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala @@ -14,6 +14,7 @@ transparent trait ScalaStream[@specialized(Float, Double) A]: def config: Config[A] def initArrays(): Unit + def readArrays(): Unit = () def copy(): Unit def mul(): Unit def add(): Unit @@ -27,6 +28,8 @@ transparent trait ScalaStream[@specialized(Float, Double) A]: val end = System.nanoTime() FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r + inline def runInitArrays(): FiniteDuration = timed(initArrays())._1 + inline def runReadArrays(): FiniteDuration = timed(readArrays())._1 inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) = val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero) @@ -62,7 +65,6 @@ transparent trait ScalaStream[@specialized(Float, Double) A]: def data(): Data[A] - trait Fractional[@specialized(Double, Float) A]: def toFractional(f: Float): A def 
toFractional(f: Double): A @@ -77,13 +79,13 @@ trait Fractional[@specialized(Double, Float) A]: extension (x: Int) inline def fractional = toFractional(x.toFloat) extension (x: Long) inline def fractional = toFractional(x.toDouble) extension (x: A) - inline def +(y: A) = add(x, y) - inline def -(y: A) = sub(x, y) - inline def *(y: A) = mul(x, y) - inline def /(y: A) = div(x, y) - inline def >(y: A) = compare(x, y) > 0 - inline def <(y: A) = compare(x, y) < 0 - inline def abs_ = abs(x) + inline def +(y: A) = add(x, y) + inline def -(y: A) = sub(x, y) + inline def *(y: A) = mul(x, y) + inline def /(y: A) = div(x, y) + inline def >(y: A) = compare(x, y) > 0 + inline def <(y: A) = compare(x, y) < 0 + inline def abs_ = abs(x) end Fractional given FloatFractional: Fractional[Float] with @@ -204,7 +206,7 @@ object App: validateXs("c", vec.c, goldC) dotSum.foreach { sum => - val goldSum = (goldA * goldB) * (config.options.arraysize).fractional + val goldSum = (goldA * goldB) * config.options.arraysize.fractional val error = ((sum - goldSum) / goldSum).abs_ if error > 1.fractional / 100000000.fractional then Console.err.println( @@ -238,10 +240,10 @@ object App: ) println(s"Running ${config.benchmark match { - case Benchmark.All => "kernels" - case Benchmark.Triad => "triad" - case Benchmark.NStream => "nstream" - }} ${opt.numtimes} times") + case Benchmark.All => "kernels" + case Benchmark.Triad => "triad" + case Benchmark.NStream => "nstream" + }} ${opt.numtimes} times") if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}") @@ -288,11 +290,38 @@ object App: println(header.map(_._1.padTo(padding, ' ')).mkString(sep)) println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n")) + def showInit(init: FiniteDuration, read: FiniteDuration): Unit = { + val setup = + Vector(("Init", init.seconds, 3 * arrayBytes), ("Read", read.seconds, 3 * arrayBytes)) + if opt.csv then + tabulate( + setup.map((name, elapsed, totalBytes) 
=> + Vector( + "phase" -> name, + "n_elements" -> opt.arraysize.toString, + "sizeof" -> arrayBytes.toString, + s"max_m${if opt.mibibytes then "i" else ""}bytes_per_sec" -> + (megaScale * totalBytes.toDouble / elapsed).toString, + "runtime" -> elapsed.toString + ) + ): _* + ) + else + for (name, elapsed, totalBytes) <- setup do + println( + f"$name: $elapsed%.5f s (=${megaScale * totalBytes.toDouble / elapsed}%.5f M${ + if opt.mibibytes then "i" else "" + }Bytes/sec)" + ) + } + val stream = mkStream(config) - stream.initArrays() + val init = stream.runInitArrays() config.benchmark match case Benchmark.All => val (results, sum) = stream.runAll(opt.numtimes) + val read = stream.runReadArrays() + showInit(init, read) validate(stream.data(), config, Some(sum)) tabulate( mkRow(results.copy, "Copy", 2 * arrayBytes), @@ -303,10 +332,14 @@ object App: ) case Benchmark.NStream => val result = stream.runNStream(opt.numtimes) + val read = stream.runReadArrays() + showInit(init, read) validate(stream.data(), config) tabulate(mkRow(result, "Nstream", 4 * arrayBytes)) case Benchmark.Triad => - val results = stream.runTriad(opt.numtimes) + val results = stream.runTriad(opt.numtimes) + val read = stream.runReadArrays() + showInit(init, read) val totalBytes = 3 * arrayBytes * opt.numtimes val bandwidth = megaScale * (totalBytes / results.seconds) println(f"Runtime (seconds): ${results.seconds}%.5f") From 3de019c156a803a6b5abd1b5865828a80965087c Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 13:50:58 +0100 Subject: [PATCH 83/89] Add init/read timing for Java Upgrade to TornadoVM 0.15 API --- src/java/java-stream/pom.xml | 8 +- .../src/main/java/javastream/JavaStream.java | 14 ++- .../src/main/java/javastream/Main.java | 93 ++++++++++++++----- .../javastream/aparapi/AparapiStreams.java | 2 +- .../javastream/jdk/GenericPlainStream.java | 2 +- .../java/javastream/jdk/GenericStream.java | 2 +- .../jdk/SpecialisedDoubleStream.java | 2 +- .../jdk/SpecialisedFloatStream.java | 
2 +- .../jdk/SpecialisedPlainDoubleStream.java | 2 +- .../jdk/SpecialisedPlainFloatStream.java | 2 +- .../tornadovm/GenericTornadoVMStream.java | 34 +++---- .../tornadovm/SpecialisedDouble.java | 52 +++++++++-- .../tornadovm/SpecialisedFloat.java | 52 +++++++++-- .../tornadovm/TornadoVMStreams.java | 26 ++++-- 14 files changed, 210 insertions(+), 83 deletions(-) diff --git a/src/java/java-stream/pom.xml b/src/java/java-stream/pom.xml index d28a3d5f..78d26b31 100644 --- a/src/java/java-stream/pom.xml +++ b/src/java/java-stream/pom.xml @@ -12,7 +12,7 @@ UTF-8 UTF-8 - 5.7.2 + 5.9.2 @@ -27,19 +27,19 @@ com.beust jcommander - 1.81 + 1.82 tornado tornado-api - 0.9 + 0.15.1 com.aparapi aparapi - 2.0.0 + 3.0.0 diff --git a/src/java/java-stream/src/main/java/javastream/JavaStream.java b/src/java/java-stream/src/main/java/javastream/JavaStream.java index 7ab96cb5..4fdb229b 100644 --- a/src/java/java-stream/src/main/java/javastream/JavaStream.java +++ b/src/java/java-stream/src/main/java/javastream/JavaStream.java @@ -56,7 +56,7 @@ protected JavaStream(Config config) { protected abstract T dot(); - protected abstract Data data(); + protected abstract Data readArrays(); public static class EnumeratedStream extends JavaStream { @@ -113,8 +113,8 @@ public T dot() { } @Override - public Data data() { - return actual.data(); + public Data readArrays() { + return actual.readArrays(); } } @@ -140,6 +140,14 @@ private static Duration timed(Runnable f) { return Duration.ofNanos(end - start); } + final Duration runInitArrays() { + return timed(this::initArrays); + } + + final SimpleImmutableEntry> runReadArrays() { + return timed(this::readArrays); + } + final SimpleImmutableEntry, T> runAll(int times) { Timings timings = new Timings<>(); T lastSum = null; diff --git a/src/java/java-stream/src/main/java/javastream/Main.java b/src/java/java-stream/src/main/java/javastream/Main.java index 24421281..3732a242 100644 --- a/src/java/java-stream/src/main/java/javastream/Main.java +++ 
b/src/java/java-stream/src/main/java/javastream/Main.java @@ -128,6 +128,40 @@ static final class Implementation { } } + @SuppressWarnings("unchecked") + static void showInit( + int totalBytes, double megaScale, Options opt, Duration init, Duration read) { + List> setup = + Arrays.asList( + new SimpleImmutableEntry<>("Init", durationToSeconds(init)), + new SimpleImmutableEntry<>("Read", durationToSeconds(read))); + if (opt.csv) { + tabulateCsv( + true, + setup.stream() + .map( + x -> + Arrays.asList( + new SimpleImmutableEntry<>("function", x.getKey()), + new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""), + new SimpleImmutableEntry<>("sizeof", totalBytes + ""), + new SimpleImmutableEntry<>( + "max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec", + ((megaScale * (double) totalBytes / x.getValue())) + ""), + new SimpleImmutableEntry<>("runtime", x.getValue() + ""))) + .toArray(List[]::new)); + } else { + for (Entry e : setup) { + System.out.printf( + "%s: %.5f s (%.5f M%sBytes/sec)%n", + e.getKey(), + e.getValue(), + megaScale * (double) totalBytes / e.getValue(), + opt.mibibytes ? 
"i" : ""); + } + } + } + static boolean run( String name, Config config, Function, JavaStream> mkStream) { @@ -183,35 +217,46 @@ static boolean run( JavaStream stream = mkStream.apply(config); - stream.initArrays(); - + Duration init = stream.runInitArrays(); final boolean ok; switch (config.benchmark) { case ALL: - Entry, T> results = stream.runAll(opt.numtimes); - ok = checkSolutions(stream.data(), config, Optional.of(results.getValue())); - Timings timings = results.getKey(); - tabulateCsv( - opt.csv, - mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt), - mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt), - mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt), - mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt), - mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt)); - break; + { + Entry, T> results = stream.runAll(opt.numtimes); + SimpleImmutableEntry> read = stream.runReadArrays(); + showInit(totalBytes, megaScale, opt, init, read.getKey()); + ok = checkSolutions(read.getValue(), config, Optional.of(results.getValue())); + Timings timings = results.getKey(); + tabulateCsv( + opt.csv, + mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt), + mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt), + mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt), + mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt), + mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt)); + break; + } case NSTREAM: - List nstreamResults = stream.runNStream(opt.numtimes); - ok = checkSolutions(stream.data(), config, Optional.empty()); - tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt)); - break; + { + List nstreamResults = stream.runNStream(opt.numtimes); + SimpleImmutableEntry> read = stream.runReadArrays(); + showInit(totalBytes, megaScale, opt, init, read.getKey()); + ok = checkSolutions(read.getValue(), config, Optional.empty()); + 
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt)); + break; + } case TRIAD: - Duration triadResult = stream.runTriad(opt.numtimes); - ok = checkSolutions(stream.data(), config, Optional.empty()); - int triadTotalBytes = 3 * arrayBytes * opt.numtimes; - double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult)); - System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult)); - System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth); - break; + { + Duration triadResult = stream.runTriad(opt.numtimes); + SimpleImmutableEntry> read = stream.runReadArrays(); + showInit(totalBytes, megaScale, opt, init, read.getKey()); + ok = checkSolutions(read.getValue(), config, Optional.empty()); + int triadTotalBytes = 3 * arrayBytes * opt.numtimes; + double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult)); + System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult)); + System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth); + break; + } default: throw new AssertionError(); } diff --git a/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java b/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java index ab2de528..052c807d 100644 --- a/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java +++ b/src/java/java-stream/src/main/java/javastream/aparapi/AparapiStreams.java @@ -122,7 +122,7 @@ public T dot() { } @Override - public Data data() { + public Data readArrays() { return kernels.syncAndDispose(); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java b/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java index 7f210fa8..8075603c 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java @@ -86,7 +86,7 @@ public T 
dot() { } @Override - public Data data() { + public Data readArrays() { return new Data<>(a, b, c); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java b/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java index 1e65b8f9..3cacf3ac 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java @@ -80,7 +80,7 @@ public T dot() { } @Override - public Data data() { + public Data readArrays() { return new Data<>(a, b, c); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java index 26406a62..1b54bc3a 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java @@ -78,7 +78,7 @@ public Double dot() { } @Override - public Data data() { + public Data readArrays() { return new Data<>(boxed(a), boxed(b), boxed(c)); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java index 6c414c16..4d8c137a 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java @@ -78,7 +78,7 @@ public Float dot() { } @Override - public Data data() { + public Data readArrays() { return new Data<>(boxed(a), boxed(b), boxed(c)); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java index afda2ef8..c4f38d0e 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java +++ 
b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java @@ -78,7 +78,7 @@ public Double dot() { } @Override - public Data data() { + public Data readArrays() { return new Data<>(boxed(a), boxed(b), boxed(c)); } } diff --git a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java index 9ccee53e..5178ed27 100644 --- a/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java +++ b/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java @@ -78,7 +78,7 @@ public Float dot() { } @Override - public Data data() { + public Data readArrays() { return new Data<>(boxed(a), boxed(b), boxed(c)); } } diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java b/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java index d936df60..a65c32ab 100644 --- a/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java @@ -4,8 +4,8 @@ import java.util.stream.Collectors; import javastream.JavaStream; import javastream.Main.Config; -import uk.ac.manchester.tornado.api.TaskSchedule; -import uk.ac.manchester.tornado.api.TornadoRuntimeCI; +import uk.ac.manchester.tornado.api.TornadoExecutionPlan; +import uk.ac.manchester.tornado.api.TornadoRuntimeInterface; import uk.ac.manchester.tornado.api.common.TornadoDevice; import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; @@ -13,18 +13,18 @@ abstract class GenericTornadoVMStream extends JavaStream { protected final TornadoDevice device; - protected TaskSchedule copyTask; - protected TaskSchedule mulTask; - protected TaskSchedule addTask; - protected TaskSchedule triadTask; - protected TaskSchedule nstreamTask; - protected TaskSchedule dotTask; + protected 
TornadoExecutionPlan copyTask; + protected TornadoExecutionPlan mulTask; + protected TornadoExecutionPlan addTask; + protected TornadoExecutionPlan triadTask; + protected TornadoExecutionPlan nstreamTask; + protected TornadoExecutionPlan dotTask; GenericTornadoVMStream(Config config) { super(config); try { - TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime(); + TornadoRuntimeInterface runtime = TornadoRuntime.getTornadoRuntime(); List devices = TornadoVMStreams.enumerateDevices(runtime); device = devices.get(config.options.device); @@ -42,10 +42,6 @@ abstract class GenericTornadoVMStream extends JavaStream { } } - protected static TaskSchedule mkSchedule() { - return new TaskSchedule(""); - } - @Override public List listDevices() { return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream() @@ -55,12 +51,12 @@ public List listDevices() { @Override public void initArrays() { - this.copyTask.warmup(); - this.mulTask.warmup(); - this.addTask.warmup(); - this.triadTask.warmup(); - this.nstreamTask.warmup(); - this.dotTask.warmup(); + this.copyTask.withWarmUp(); + this.mulTask.withWarmUp(); + this.addTask.withWarmUp(); + this.triadTask.withWarmUp(); + this.nstreamTask.withWarmUp(); + this.dotTask.withWarmUp(); } @Override diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java index 7712e317..c10153e3 100644 --- a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedDouble.java @@ -2,8 +2,11 @@ import java.util.Arrays; import javastream.Main.Config; +import uk.ac.manchester.tornado.api.TaskGraph; +import uk.ac.manchester.tornado.api.TornadoExecutionPlan; import uk.ac.manchester.tornado.api.annotations.Parallel; import uk.ac.manchester.tornado.api.annotations.Reduce; +import 
uk.ac.manchester.tornado.api.enums.DataTransferMode; final class SpecialisedDouble extends GenericTornadoVMStream { @@ -49,7 +52,7 @@ private static void dot_( private final double[] a, b, c; private final double[] dotSum; - @SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"}) + @SuppressWarnings({"DuplicatedCode"}) SpecialisedDouble(Config config) { super(config); final int size = config.options.arraysize; @@ -58,12 +61,43 @@ private static void dot_( b = new double[size]; c = new double[size]; dotSum = new double[1]; - this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c); - this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar); - this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c); - this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar); - this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar); - this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum); + this.copyTask = + new TornadoExecutionPlan( + new TaskGraph("copy") + .task("copy", SpecialisedDouble::copy, size, a, c) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c) + .snapshot()); + this.mulTask = + new TornadoExecutionPlan( + new TaskGraph("mul") + .task("mul", SpecialisedDouble::mul, size, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c) + .snapshot()); + this.addTask = + new TornadoExecutionPlan( + new TaskGraph("add") + .task("add", SpecialisedDouble::add, size, a, b, c) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.triadTask = + new TornadoExecutionPlan( + new TaskGraph("triad") + .task("triad", SpecialisedDouble::triad, size, a, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.nstreamTask = + new TornadoExecutionPlan( + new TaskGraph("nstream") + .task("nstream", 
SpecialisedDouble::nstream, size, a, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.dotTask = + new TornadoExecutionPlan( + new TaskGraph("dot") + .task("dot", SpecialisedDouble::dot_, a, b, dotSum) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b) + .transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum}) + .snapshot()); } @Override @@ -72,7 +106,7 @@ public void initArrays() { Arrays.fill(a, config.initA); Arrays.fill(b, config.initB); Arrays.fill(c, config.initC); - TornadoVMStreams.xferToDevice(device, a, b, c); + TornadoVMStreams.allocAndXferToDevice(device, a, b, c); } @Override @@ -81,7 +115,7 @@ protected Double getSum() { } @Override - public Data data() { + public Data readArrays() { TornadoVMStreams.xferFromDevice(device, a, b, c); return new Data<>(boxed(a), boxed(b), boxed(c)); } diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java index e61cfe9e..0f3fffa7 100644 --- a/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/SpecialisedFloat.java @@ -2,8 +2,11 @@ import java.util.Arrays; import javastream.Main.Config; +import uk.ac.manchester.tornado.api.TaskGraph; +import uk.ac.manchester.tornado.api.TornadoExecutionPlan; import uk.ac.manchester.tornado.api.annotations.Parallel; import uk.ac.manchester.tornado.api.annotations.Reduce; +import uk.ac.manchester.tornado.api.enums.DataTransferMode; final class SpecialisedFloat extends GenericTornadoVMStream { @@ -49,7 +52,7 @@ private static void dot_( private final float[] a, b, c; private final float[] dotSum; - @SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"}) + @SuppressWarnings({"DuplicatedCode"}) SpecialisedFloat(Config config) { super(config); final int size = config.options.arraysize; @@ -58,12 
+61,43 @@ private static void dot_( b = new float[size]; c = new float[size]; dotSum = new float[1]; - this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c); - this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar); - this.addTask = mkSchedule().task("", SpecialisedFloat::add, size, a, b, c); - this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar); - this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar); - this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum); + this.copyTask = + new TornadoExecutionPlan( + new TaskGraph("copy") + .task("copy", SpecialisedFloat::copy, size, a, c) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c) + .snapshot()); + this.mulTask = + new TornadoExecutionPlan( + new TaskGraph("mul") + .task("mul", SpecialisedFloat::mul, size, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c) + .snapshot()); + this.addTask = + new TornadoExecutionPlan( + new TaskGraph("add") + .task("add", SpecialisedFloat::add, size, a, b, c) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.triadTask = + new TornadoExecutionPlan( + new TaskGraph("triad") + .task("triad", SpecialisedFloat::triad, size, a, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.nstreamTask = + new TornadoExecutionPlan( + new TaskGraph("nstream") + .task("nstream", SpecialisedFloat::nstream, size, a, b, c, scalar) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c) + .snapshot()); + this.dotTask = + new TornadoExecutionPlan( + new TaskGraph("dot") + .task("dot", SpecialisedFloat::dot_, a, b, dotSum) + .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b) + .transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum}) + .snapshot()); } @Override @@ -72,7 +106,7 @@ public void initArrays() { 
Arrays.fill(a, config.initA); Arrays.fill(b, config.initB); Arrays.fill(c, config.initC); - TornadoVMStreams.xferToDevice(device, a, b, c); + TornadoVMStreams.allocAndXferToDevice(device, a, b, c); } @Override @@ -81,7 +115,7 @@ protected Float getSum() { } @Override - public Data data() { + public Data readArrays() { TornadoVMStreams.xferFromDevice(device, a, b, c); return new Data<>(boxed(a), boxed(b), boxed(c)); } diff --git a/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java b/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java index 68eecadc..a43c7c8d 100644 --- a/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java +++ b/src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java @@ -1,36 +1,46 @@ package javastream.tornadovm; +import java.util.Arrays; import java.util.List; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; import javastream.JavaStream; import javastream.Main.Config; -import uk.ac.manchester.tornado.api.TornadoRuntimeCI; +import uk.ac.manchester.tornado.api.TornadoRuntimeInterface; +import uk.ac.manchester.tornado.api.common.Event; import uk.ac.manchester.tornado.api.common.TornadoDevice; -import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState; +import uk.ac.manchester.tornado.api.memory.TornadoDeviceObjectState; +import uk.ac.manchester.tornado.api.memory.TornadoGlobalObjectState; import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; public final class TornadoVMStreams { private TornadoVMStreams() {} - static void xferToDevice(TornadoDevice device, Object... xs) { + static void allocAndXferToDevice(TornadoDevice device, Object... 
xs) { for (Object x : xs) { TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); + device.allocateObjects( + new Object[] {x}, 0, new TornadoDeviceObjectState[] {state.getDeviceState(device)}); List writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0); if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn()); } } static void xferFromDevice(TornadoDevice device, Object... xs) { - for (Object x : xs) { - TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); - device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn(); - } + Arrays.stream(xs) + .map( + x -> { + TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); + return device.resolveEvent( + device.streamOut(x, 0, state.getDeviceState(device), null)); + }) + .collect(Collectors.toList()) + .forEach(Event::waitOn); } - static List enumerateDevices(TornadoRuntimeCI runtime) { + static List enumerateDevices(TornadoRuntimeInterface runtime) { return IntStream.range(0, runtime.getNumDrivers()) .mapToObj(runtime::getDriver) .flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice)) From e7774c13728844257594f19649e449f0dee779d4 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 13:58:34 +0100 Subject: [PATCH 84/89] Update changelog for timing and version bump updates --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 605d3273..deba9842 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ All notable changes to this project will be documented in this file. - Thrust managed memory. - HIP managed memory. - New implementation using SYCL2020 USM (sycl2020-acc) and renamed original `sycl2020` to `sycl2020-acc`. 
+- Data initialisation and read-back timing for all models, including Java, Scala, Julia, and Rust +- Add support for the latest Aparapi (3.0.0) and TornadoVM (0.15.x) for Java ### Changed - RAJA CUDA CMake build issues resolved. @@ -17,6 +19,7 @@ All notable changes to this project will be documented in this file. - Number of thread-blocks in CUDA dot kernel implementation changed to 1024. - Fix compatibility of `sycl2020` (now `sycl2020-acc`) with hipSYCL. - Bumped Julia compat to 1.9 +- Bumped Scala to 3.3.1 - Bumped Rust to 1.74.0-nightly (13e6f24b9 2023-09-23) From 3cb01e76a80fb5162a39e27e70b1bdbbce5591a4 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 14:59:26 +0100 Subject: [PATCH 85/89] Add init/read timing for Julia --- src/julia/JuliaStream.jl/src/Stream.jl | 69 +++++++++++++++++++------- 1 file changed, 50 insertions(+), 19 deletions(-) diff --git a/src/julia/JuliaStream.jl/src/Stream.jl b/src/julia/JuliaStream.jl/src/Stream.jl index 42030f86..226d44b7 100644 --- a/src/julia/JuliaStream.jl/src/Stream.jl +++ b/src/julia/JuliaStream.jl/src/Stream.jl @@ -20,6 +20,18 @@ end @enum Benchmark All Triad Nstream + +function run_init_arrays!(data::StreamData{T,C}, context, init::Tuple{T,T,T})::Float64 where {T,C} + return @elapsed init_arrays!(data, context, init) +end + +function run_read_data(data::StreamData{T,C}, context)::Tuple{Float64,VectorData{T}} where {T,C} + elapsed = @elapsed begin + result = read_data(data, context) + end + return (elapsed, result) +end + function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C} timings = Timings(times) lastSum::T = 0 @@ -39,11 +51,7 @@ function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where { end end -function run_nstream!( - data::StreamData{T,C}, - context, - times::Int, -)::Vector{Float64} where {T,C} +function run_nstream!(data::StreamData{T,C}, context, times::Int)::Vector{Float64} where {T,C} timings::Vector{Float64} = zeros(times) for i = 
1:times @inbounds timings[i] = @elapsed nstream!(data, context) @@ -93,9 +101,7 @@ function check_solutions( error = abs((dot - gold_sum) / gold_sum) failed = error > 1.0e-8 if failed - println( - "Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum", - ) + println("Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum") end !failed end : true @@ -166,7 +172,7 @@ function main() parse_options(config) if config.list - for (i, (_,repr, impl)) in enumerate(devices()) + for (i, (_, repr, impl)) in enumerate(devices()) println("[$i] ($impl) $repr") end exit(0) @@ -175,9 +181,7 @@ function main() ds = devices() # TODO implement substring device match if config.device < 1 || config.device > length(ds) - error( - "Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed", - ) + error("Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed") else device = ds[config.device] end @@ -257,16 +261,42 @@ function main() end end + function show_init(init::Float64, read::Float64) + setup = [("Init", init, 3 * array_bytes), ("Read", read, 3 * array_bytes)] + if config.csv + tabulate( + map( + x -> [ + ("phase", x[1]), + ("n_elements", config.arraysize), + ("sizeof", x[3]), + ("max_m$(config.mibibytes ? "i" : "")bytes_per_sec", mega_scale * total_bytes / x[2]), + ("runtime", x[2]), + ], + setup, + )..., + ) + else + for (name, elapsed, total_bytes) in setup + println( + "$name: $(round(elapsed; digits=5)) s (=$(round(( mega_scale * total_bytes) / elapsed; digits = 5)) M$(config.mibibytes ? 
"i" : "")Bytes/sec)", + ) + end + end + end + init::Tuple{type,type,type} = DefaultInit scalar::type = DefaultScalar GC.enable(false) (data, context) = make_stream(config.arraysize, scalar, device, config.csv) - init_arrays!(data, context, init) + tInit = run_init_arrays!(data, context, init) if benchmark == All (timings, sum) = run_all!(data, context, config.numtimes) - valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum) + (tRead, result) = run_read_data(data, context) + show_init(tInit, tRead) + valid = check_solutions(result, config.numtimes, init, benchmark, sum) tabulate( mk_row(timings.copy, "Copy", 2 * array_bytes), mk_row(timings.mul, "Mul", 2 * array_bytes), @@ -276,13 +306,15 @@ function main() ) elseif benchmark == Nstream timings = run_nstream!(data, context, config.numtimes) - valid = - check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) + (tRead, result) = run_read_data(data, context) + show_init(tInit, tRead) + valid = check_solutions(result, config.numtimes, init, benchmark, nothing) tabulate(mk_row(timings, "Nstream", 4 * array_bytes)) elseif benchmark == Triad elapsed = run_triad!(data, context, config.numtimes) - valid = - check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing) + (tRead, result) = run_read_data(data, context) + show_init(tInit, tRead) + valid = check_solutions(result, config.numtimes, init, benchmark, nothing) total_bytes = 3 * array_bytes * config.numtimes bandwidth = mega_scale * (total_bytes / elapsed) println("Runtime (seconds): $(round(elapsed; digits=5))") @@ -290,7 +322,6 @@ function main() else error("Bad benchmark $(benchmark)") end - GC.enable(true) if !valid From 01ef17e8b40276022acf54f98bbdcd26d96b5f7b Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 15:14:28 +0100 Subject: [PATCH 86/89] Bump version to 5.0, resolves #167 --- CMakeLists.txt | 2 +- src/fortran/Makefile | 2 +- src/java/java-stream/pom.xml | 2 
+- src/java/java-stream/src/main/java/javastream/Main.java | 2 +- src/julia/JuliaStream.jl/src/Stream.jl | 2 +- src/main.cpp | 2 +- src/rust/rust-stream/Cargo.toml | 2 +- src/scala/scala-stream/build.sbt | 2 +- .../scala-stream/src/main/scala/scalastream/ScalaStream.scala | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7551dc3e..27736b6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") cmake_policy(SET CMP0135 NEW) endif () -project(BabelStream VERSION 4.0 LANGUAGES CXX C) +project(BabelStream VERSION 5.0 LANGUAGES CXX C) # uncomment for debugging build issues: #set(CMAKE_VERBOSE_MAKEFILE ON) diff --git a/src/fortran/Makefile b/src/fortran/Makefile index 18685d46..adadcffb 100644 --- a/src/fortran/Makefile +++ b/src/fortran/Makefile @@ -18,7 +18,7 @@ else COMPILER=gcc endif -FCFLAGS += -DVERSION_STRING="4.0" +FCFLAGS += -DVERSION_STRING="5.0" #FCFLAGS += -DUSE_INT32 ifeq ($(IMPLEMENTATION),DoConcurrent) diff --git a/src/java/java-stream/pom.xml b/src/java/java-stream/pom.xml index 78d26b31..8cf229fa 100644 --- a/src/java/java-stream/pom.xml +++ b/src/java/java-stream/pom.xml @@ -7,7 +7,7 @@ java-stream javastream - 4.0 + 5.0 UTF-8 diff --git a/src/java/java-stream/src/main/java/javastream/Main.java b/src/java/java-stream/src/main/java/javastream/Main.java index 3732a242..ecd94993 100644 --- a/src/java/java-stream/src/main/java/javastream/Main.java +++ b/src/java/java-stream/src/main/java/javastream/Main.java @@ -382,7 +382,7 @@ private static void tabulateCsv(boolean csv, List>... 
rows } } - private static final String VERSION = "4.0"; + private static final String VERSION = "5.0"; private static final float START_SCALAR = 0.4f; private static final float START_A = 0.1f; diff --git a/src/julia/JuliaStream.jl/src/Stream.jl b/src/julia/JuliaStream.jl/src/Stream.jl index 226d44b7..97ba9432 100644 --- a/src/julia/JuliaStream.jl/src/Stream.jl +++ b/src/julia/JuliaStream.jl/src/Stream.jl @@ -164,7 +164,7 @@ end const DefaultInit = (0.1, 0.2, 0.0) const DefaultScalar = 0.4 -const Version = "4.0" +const Version = "5.0" function main() diff --git a/src/main.cpp b/src/main.cpp index 639f0c3f..abfc14e4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ #include #include -#define VERSION_STRING "4.0" +#define VERSION_STRING "5.0" #include "Stream.h" diff --git a/src/rust/rust-stream/Cargo.toml b/src/rust/rust-stream/Cargo.toml index d93a84f5..24785184 100644 --- a/src/rust/rust-stream/Cargo.toml +++ b/src/rust/rust-stream/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rust-stream" -version = "4.0.0" +version = "5.0.0" authors = ["Wei-Chen Lin "] edition = "2018" diff --git a/src/scala/scala-stream/build.sbt b/src/scala/scala-stream/build.sbt index b13fda3e..2513b539 100644 --- a/src/scala/scala-stream/build.sbt +++ b/src/scala/scala-stream/build.sbt @@ -4,7 +4,7 @@ lazy val root = (project in file(".")) .enablePlugins(NativeImagePlugin) .settings( scalaVersion := "3.3.1", - version := "4.0", + version := "5.0", organization := "uk.ac.bristol.uob-hpc", organizationName := "University of Bristol", Compile / mainClass := mainCls, diff --git a/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala b/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala index 888ba7c0..8f247b62 100644 --- a/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala +++ b/src/scala/scala-stream/src/main/scala/scalastream/ScalaStream.scala @@ -110,7 +110,7 @@ given DoubleFractional: Fractional[Double] with object App: - final val 
Version: String = "4.0" + final val Version: String = "5.0" case class Config[@specialized(Double, Float) A]( options: Options, From 165db1749ce95a2217163bce9c180fdc07cff91a Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 7 Oct 2023 15:26:52 +0100 Subject: [PATCH 87/89] Update changelog for v5 --- CHANGELOG.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index deba9842..eeccd69c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,21 +7,26 @@ All notable changes to this project will be documented in this file. - Thrust managed memory. - HIP managed memory. - New implementation using SYCL2020 USM (sycl2020-acc) and renamed original `sycl2020` to `sycl2020-acc`. +- New implementation in [Futhark](https://futhark-lang.org/) - Data initialisation and read-back timing for all models, including Java, Scala, Julia, and Rust -- Add support for the latest Aparapi (3.0.0) and TornadoVM (0.15.x) for Java +- Add support for the latest Aparapi (3.0.0) and TornadoVM (0.15.x) for Java +- JuliaStream.jl published to registry (pending #113) ### Changed +- Fix std-data/std-indices compatibility with oneDPL, NVHPC, and AdaptiveCpp (a.k.a. hipSYCL). - RAJA CUDA CMake build issues resolved. +- Kokkos build updates (CXX version upgraded to C++17). - Fix CUDA memory limit check. +- Fix CUDA CMake options for `-DMEM` and `-DCMAKE_CUDA_FLAGS`. - Use long double for `check_solution` in case of large problem size. - OneAPI DPCPP compiler is deprecated in favour of ICPX, so added new build option to SYCL 2020 version. - Updates to the HIP kernels and API usage. - Number of thread-blocks in CUDA dot kernel implementation changed to 1024. -- Fix compatibility of `sycl2020` (now `sycl2020-acc`) with hipSYCL. +- Fix compatibility of `sycl2020` (now `sycl2020-acc`) with AdaptiveCpp. 
- Bumped Julia compat to 1.9 - Bumped Scala to 3.3.1 - Bumped Rust to 1.74.0-nightly (13e6f24b9 2023-09-23) - +- Upgrade CI to Ubuntu 22.04 ## [v4.0] - 2021-12-22 From f3aaca06ddc285dea15d54ce2ac866dd5bcb7575 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 9 Oct 2023 11:16:14 +0100 Subject: [PATCH 88/89] Add Read and Init timings for Fortran (not csv) --- src/fortran/main.F90 | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/fortran/main.F90 b/src/fortran/main.F90 index d86e8d4a..153be936 100644 --- a/src/fortran/main.F90 +++ b/src/fortran/main.F90 @@ -496,6 +496,7 @@ program BabelStream real(kind=REAL64), allocatable :: timings(:,:) real(kind=REAL64), allocatable :: h_A(:), h_B(:), h_C(:) real(kind=REAL64) :: summ + real(kind=REAL64) :: init_tic, init_toc, read_tic, read_toc call parseArguments() @@ -541,9 +542,17 @@ program BabelStream call alloc(array_size) + init_tic = get_wtime() call init_arrays(startA, startB, startC) + init_toc = get_wtime() summ = 0.0d0 + if (.not.csv) then + write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Init: ',init_toc-init_tic, 's (=', & + (3.0d0 * element_size * array_size * scaling) / (init_toc-init_tic), TRIM(label), 'ytes/sec)' + end if + + timings = -1.0d0 if (selection.eq.1) then call run_all(timings, summ) @@ -559,7 +568,15 @@ program BabelStream stop 1 endif + read_tic = get_wtime() call read_arrays(h_A, h_B, h_C) + read_toc = get_wtime() + + if (.not.csv) then + write(*,'(a6,f9.1,a4,f9.1,a3,a9)') 'Read: ',read_toc-read_tic, 's (=', & + (3.0d0 * element_size * array_size * scaling) / (read_toc-read_tic), TRIM(label), 'ytes/sec)' + end if + call check_solution(h_A, h_B, h_C, summ) block From 773814f0f274647d7bbd6836032b3477c5b170c6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 Oct 2023 11:11:19 +0100 Subject: [PATCH 89/89] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index eeccd69c..76e868bb 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file. - Thrust managed memory. - HIP managed memory. - New implementation using SYCL2020 USM (sycl2020-acc) and renamed original `sycl2020` to `sycl2020-acc`. +- New implementation in Fortran - New implementation in [Futhark](https://futhark-lang.org/) - Data initialisation and read-back timing for all models, including Java, Scala, Julia, and Rust - Add support for the latest Aparapi (3.0.0) and TornadoVM (0.15.x) for Java