Update Thrust to CCCL #206

Open · wants to merge 3 commits into base: develop
74 changes: 48 additions & 26 deletions src/thrust/ThrustStream.cu
@@ -1,5 +1,5 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
// Copyright (c) 2020 Tom Deakin, 2024 Bernhard Manfred Gruber
// University of Bristol HPC, NVIDIA
//
// For full license terms please see the LICENSE file distributed with this
// source code
@@ -10,6 +10,25 @@
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>

#if defined(MANAGED)
#include <thrust/universal_vector.h>
#else
#include <thrust/device_vector.h>
#endif

template <class T>
using vector =
#if defined(MANAGED)
thrust::universal_vector<T>;
#else
thrust::device_vector<T>;
#endif

template <class T>
struct ThrustStream<T>::Impl{
vector<T> a, b, c;
};

static inline void synchronise()
{
// rocThrust doesn't synchronise between thrust calls
@@ -20,7 +39,7 @@ static inline void synchronise()

template <class T>
ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
: array_size{array_size}, a(array_size), b(array_size), c(array_size) {
: array_size{array_size}, impl(new Impl{vector<T>(array_size), vector<T>(array_size), vector<T>(array_size)}) {
std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
@@ -50,27 +69,30 @@ ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)

}

template <class T>
ThrustStream<T>::~ThrustStream() = default;

template <class T>
void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
{
thrust::fill(a.begin(), a.end(), initA);
thrust::fill(b.begin(), b.end(), initB);
thrust::fill(c.begin(), c.end(), initC);
thrust::fill(impl->a.begin(), impl->a.end(), initA);
thrust::fill(impl->b.begin(), impl->b.end(), initB);
thrust::fill(impl->c.begin(), impl->c.end(), initC);
synchronise();
}

template <class T>
void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
thrust::copy(a.begin(), a.end(), h_a.begin());
thrust::copy(b.begin(), b.end(), h_b.begin());
thrust::copy(c.begin(), c.end(), h_c.begin());
thrust::copy(impl->a.begin(), impl->a.end(), h_a.begin());
thrust::copy(impl->b.begin(), impl->b.end(), h_b.begin());
thrust::copy(impl->c.begin(), impl->c.end(), h_c.begin());
}

template <class T>
void ThrustStream<T>::copy()
{
thrust::copy(a.begin(), a.end(),c.begin());
thrust::copy(impl->a.begin(), impl->a.end(),impl->c.begin());
synchronise();
}

@@ -79,9 +101,9 @@ void ThrustStream<T>::mul()
{
const T scalar = startScalar;
thrust::transform(
c.begin(),
c.end(),
b.begin(),
impl->c.begin(),
impl->c.end(),
impl->b.begin(),
[=] __device__ __host__ (const T &ci){
return ci * scalar;
}
@@ -93,9 +115,9 @@ template <class T>
void ThrustStream<T>::add()
{
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())),
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())),
c.begin(),
thrust::make_zip_iterator(impl->a.begin(), impl->b.begin()),
thrust::make_zip_iterator(impl->a.end(), impl->b.end()),
impl->c.begin(),
thrust::make_zip_function(
[] __device__ __host__ (const T& ai, const T& bi){
return ai + bi;
@@ -109,9 +131,9 @@ void ThrustStream<T>::triad()
{
const T scalar = startScalar;
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())),
thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())),
a.begin(),
thrust::make_zip_iterator(impl->b.begin(), impl->c.begin()),
thrust::make_zip_iterator(impl->b.end(), impl->c.end()),
impl->a.begin(),
thrust::make_zip_function(
[=] __device__ __host__ (const T& bi, const T& ci){
return bi + scalar * ci;
@@ -125,9 +147,9 @@ void ThrustStream<T>::nstream()
{
const T scalar = startScalar;
thrust::transform(
thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())),
thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())),
a.begin(),
thrust::make_zip_iterator(impl->a.begin(), impl->b.begin(), impl->c.begin()),
thrust::make_zip_iterator(impl->a.end(), impl->b.end(), impl->c.end()),
impl->a.begin(),
thrust::make_zip_function(
[=] __device__ __host__ (const T& ai, const T& bi, const T& ci){
return ai + bi + scalar * ci;
@@ -139,7 +161,7 @@ void ThrustStream<T>::nstream()
template <class T>
T ThrustStream<T>::dot()
{
return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
return thrust::inner_product(impl->a.begin(), impl->a.end(), impl->b.begin(), T{});
}

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
@@ -155,7 +177,7 @@ T ThrustStream<T>::dot()
# error Unsupported compiler for Thrust
#endif

void check_error(void)
void check_error()
{
IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
if (err != IMPL_FN__(Success))
@@ -165,7 +187,7 @@ void check_error(void)
}
}

void listDevices(void)
void listDevices()
{
// Get number of devices
int count;
@@ -213,7 +235,7 @@ std::string getDeviceDriver(const int device)

#else

void listDevices(void)
void listDevices()
{
std::cout << "0: CPU" << std::endl;
}
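A note on the transform kernels in this diff: dropping thrust::make_tuple works because newer Thrust versions (including those shipped as part of CCCL) provide a variadic thrust::make_zip_iterator overload that accepts the iterators directly. A minimal self-contained sketch of the same zip_iterator/zip_function pattern, assuming a sufficiently recent Thrust and nvcc with --expt-extended-lambda (the flag model.cmake below already passes):

// zip_demo.cu -- illustrative only, not part of the PR
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/zip_function.h>

int main()
{
  thrust::device_vector<float> a(1024, 1.f), b(1024, 2.f), c(1024);
  thrust::transform(
      thrust::make_zip_iterator(a.begin(), b.begin()), // no make_tuple needed
      thrust::make_zip_iterator(a.end(), b.end()),
      c.begin(),
      // zip_function unpacks the zipped tuple into separate arguments
      thrust::make_zip_function(
          [] __device__ __host__ (float ai, float bi) { return ai + bi; }));
  return 0; // every element of c now holds 3.f
}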
27 changes: 7 additions & 20 deletions src/thrust/ThrustStream.h
@@ -1,5 +1,5 @@
// Copyright (c) 2020 Tom Deakin
// University of Bristol HPC
// Copyright (c) 2020 Tom Deakin, 2024 Bernhard Manfred Gruber
// University of Bristol HPC, NVIDIA
//
// For full license terms please see the LICENSE file distributed with this
// source code
@@ -8,11 +8,7 @@

#include <iostream>
#include <vector>
#if defined(MANAGED)
#include <thrust/universal_vector.h>
#else
#include <thrust/device_vector.h>
#endif
#include <memory>

#include "Stream.h"

@@ -22,22 +18,13 @@ template <class T>
class ThrustStream : public Stream<T>
{
protected:
// Size of arrays
struct Impl;
std::unique_ptr<Impl> impl; // avoid thrust vectors leaking into non-CUDA translation units
intptr_t array_size;

#if defined(MANAGED)
thrust::universtal_vector<T> a;
thrust::universtal_vector<T> b;
thrust::universtal_vector<T> c;
#else
thrust::device_vector<T> a;
thrust::device_vector<T> b;
thrust::device_vector<T> c;
#endif

public:
ThrustStream(const intptr_t, int);
~ThrustStream() = default;
ThrustStream(intptr_t array_size, int device);
~ThrustStream();

virtual void copy() override;
virtual void add() override;
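The header change above is the pimpl idiom: ThrustStream now holds only a std::unique_ptr to a forward-declared Impl, so the Thrust vector types no longer leak into translation units that merely include ThrustStream.h. One consequence, visible in the .cu diff, is that the destructor can no longer be = default in the header: std::unique_ptr must see the complete Impl type to delete it, so the destructor is declared in the header and defaulted in the .cu file. A minimal sketch of the idiom; Widget/Impl are hypothetical names, not from the PR:

// widget.h -- users of Widget need no heavy headers
#include <memory>

class Widget
{
  struct Impl;                 // forward declaration only
  std::unique_ptr<Impl> impl;  // storage details hidden in the .cpp/.cu file
public:
  Widget();
  ~Widget();                   // declared here, defined where Impl is complete
};

// widget.cpp -- the only translation unit that needs the heavy dependency
struct Widget::Impl
{
  // e.g. thrust::device_vector<float> data;
  int placeholder;
};

Widget::Widget() : impl(new Impl{}) {}
Widget::~Widget() = default;   // unique_ptr's deleter now sees a complete Impl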
61 changes: 34 additions & 27 deletions src/thrust/model.cmake
@@ -1,13 +1,13 @@

register_flag_optional(THRUST_IMPL
"Which Thrust implementation to use, supported options include:
- CUDA (via https://github.com/NVIDIA/thrust)
- CUDA (via https://github.com/NVIDIA/thrust or https://github.com/NVIDIA/CCCL)
- ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)
"
"CUDA")

register_flag_optional(SDK_DIR
"Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)"
"Path to the installation prefix for CCCL or Thrust (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/24.5/cuda/12.4/lib64/cmake` for NVHPC, or `/usr/local/cuda-12.5/lib64/cmake` for nvcc, or `/usr/local/cuda-11.4/include` for older nvcc, or `/opt/rocm` for ROCm)"
"")

register_flag_optional(BACKEND
@@ -18,7 +18,7 @@ register_flag_optional(BACKEND
"
"CUDA")

register_flag_optional(MANAGED "Enabled managed memory mode."
register_flag_optional(MANAGED "Enabled managed memory mode."
"OFF")

register_flag_optional(CMAKE_CUDA_COMPILER
@@ -34,6 +34,9 @@ register_flag_optional(CUDA_EXTRA_FLAGS
"[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
"")

option(FETCH_CCCL "Fetch (download) the CCCL library. This uses CMake's FetchContent feature.
Specify version by setting FETCH_CCCL_VERSION" OFF)
set(FETCH_CCCL_VERSION "v2.4.0" CACHE STRING "Specify version of CCCL to use if FETCH_CCCL is ON")

macro(setup)
set(CMAKE_CXX_STANDARD 14)
@@ -42,44 +45,48 @@ macro(setup)
endif ()

if (${THRUST_IMPL} STREQUAL "CUDA")

# see CUDA.cmake, we're only adding a few Thrust related libraries here

if (POLICY CMP0104)
cmake_policy(SET CMP0104 NEW)
endif ()

set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
# add -forward-unknown-to-host-compiler for compatibility reasons
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS})
enable_language(CUDA)
# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
# appended later
# CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG appended later
wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})

message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")


# XXX NVHPC <= 21.9 has cub-config in `Linux_x86_64/21.9/cuda/11.4/include/cub/cmake`
# XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/`
# same thing for thrust
if (SDK_DIR)
# CMake tries several subdirectories below SDK_DIR, see documentation:
# https://cmake.org/cmake/help/latest/command/find_package.html#config-mode-search-procedure
list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR})
find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
else ()
find_package(CUB REQUIRED CONFIG)
find_package(Thrust REQUIRED CONFIG)
endif ()

message(STATUS "Using Thrust backend: ${BACKEND}")

# this creates the interface that we can link to
thrust_create_target(Thrust${BACKEND}
HOST CPP
DEVICE ${BACKEND})
Review comment on lines -78 to -80:

These lines switch between the different Thrust backends, and this feature is lost in the new version. By default, CCCL uses the CUDA backend for Thrust, but it looks like BabelStream may want to use TBB or OMP.

To keep this functionality, force-set the cache variable CCCL_THRUST_DEVICE_SYSTEM to the desired backend before any find_package/CPM calls.

Force setting that variable tells the CCCL CMake package to use the desired device backend for the CCCL::Thrust target that gets linked into CCCL::CCCL. See this for details. This is a convenience for the common case where only one backend is used per configure. If multiple Thrust targets with different backends are needed in the same build, lmk, there's a different approach that can be used.

Also, using paths based on CMAKE_SOURCE_DIR is fragile. If BabelStream is included in another project (via add_subdirectory, CPM, etc.), this variable will point to the top-level project's source dir, not BabelStream's. Use the project-specific variable BabelStream_SOURCE_DIR instead.

The include call should wrap its argument in quotes; this is best practice for all paths in CMake to support spaces, etc.

Finally, rather than manually checking find_package and falling back to CPMAddPackage, it looks like CPMFindPackage will do this for us:

The function CPMFindPackage will try to find a local dependency via CMake's find_package and fallback to CPMAddPackage, if the dependency is not found.

All together, you're looking at something like:

set(MIN_CCCL_VERSION 2.4.0)
# Tell CCCL's package to configure Thrust using the desired backend:
set(CCCL_THRUST_DEVICE_SYSTEM ${BACKEND} CACHE STRING "" FORCE)

# CPMFindPackage will:
# 1. Attempt to locate a local installation of CCCL.
#    Set CCCL_DIR to the directory containing `cccl-config.cmake` to force a
#    specific installation to be used.
# 2. If no local package is found, the requested version will be downloaded
#    from GitHub and configured using CPMAddPackage.
include("${BabelStream_SOURCE_DIR}/src/thrust/CPM.cmake")
CPMFindPackage(
  NAME CCCL
  GITHUB_REPOSITORY nvidia/cccl
  GIT_TAG v${MIN_CCCL_VERSION}
)
register_link_library(CCCL::CCCL)

Author reply:

Thank you for the review! I now set CCCL_THRUST_DEVICE_SYSTEM before starting to find and fetch CCCL.

As for handling CPM, I have now decided to revert to FetchContent, just to keep the CMake setup simpler.

Let me know if anything is still missing, thx!


register_link_library(Thrust${BACKEND})
set(CCCL_THRUST_DEVICE_SYSTEM ${BACKEND} CACHE STRING "" FORCE)

# fetch CCCL if user wants to
if (FETCH_CCCL)
FetchContent_Declare(
CCCL
GIT_REPOSITORY https://github.com/nvidia/cccl.git
GIT_TAG "${FETCH_CCCL_VERSION}"
)
FetchContent_MakeAvailable(CCCL)
register_link_library(CCCL::CCCL)
else()
# try to find CCCL locally
find_package(CCCL CONFIG)
if (CCCL_FOUND)
register_link_library(CCCL::CCCL)
else()
# backup: find legacy projects separately
message(WARNING "No CCCL found on your system. Trying Thrust and CUB legacy targets.")
find_package(CUB REQUIRED CONFIG)
find_package(Thrust REQUIRED CONFIG)
thrust_create_target(Thrust${BACKEND} HOST CPP DEVICE ${BACKEND})
register_link_library(Thrust${BACKEND})
endif()
endif()
elseif (${THRUST_IMPL} STREQUAL "ROCM")
if (SDK_DIR)
find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)
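For context on the FETCH_CCCL branch added to model.cmake above: FetchContent is CMake's built-in mechanism for downloading a dependency at configure time and adding it to the build (FetchContent_MakeAvailable requires CMake 3.14 or newer). A standalone sketch of the same pattern; the project and file names are hypothetical, while the repository URL, tag, and CCCL::CCCL target match what this PR uses:

# CMakeLists.txt -- illustrative demo project, not part of the PR
cmake_minimum_required(VERSION 3.14)
project(fetch_cccl_demo CXX)

include(FetchContent)

# Declare where CCCL comes from; nothing is downloaded yet.
FetchContent_Declare(
  CCCL
  GIT_REPOSITORY https://github.com/nvidia/cccl.git
  GIT_TAG        v2.4.0
)

# Download on first configure (if not already populated) and
# add_subdirectory() the dependency, making CCCL::CCCL available.
FetchContent_MakeAvailable(CCCL)

add_executable(demo main.cpp)
target_link_libraries(demo PRIVATE CCCL::CCCL)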