Skip to content

Commit

Permalink
Internalize cuda/detail/core/util.h
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Jan 23, 2025
1 parent 3d738ad commit 56f56d1
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 186 deletions.
34 changes: 6 additions & 28 deletions thrust/thrust/system/cuda/detail/core/agent_launcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ namespace cuda_cub
{
namespace core
{

namespace detail
{
# ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES
# define THRUST_DETAIL_KERNEL_ATTRIBUTES CCCL_DETAIL_KERNEL_ATTRIBUTES
# endif
Expand Down Expand Up @@ -97,7 +98,7 @@ THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*, Args... args)
template <class Agent>
struct AgentLauncher : Agent
{
core::AgentPlan plan;
AgentPlan plan;
size_t count;
cudaStream_t stream;
char const* name;
Expand Down Expand Up @@ -168,34 +169,10 @@ struct AgentLauncher : Agent
assert(plan.grid_size > 0);
}

# if 0
// Disabled legacy plan-selection code, kept for reference only (never compiled).
THRUST_RUNTIME_FUNCTION
AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0)
{
// in separable compilation mode, we have no choice
// but to call kernel to get agent_plan
// otherwise the risk is something may fail
// if user mix & match ptx versions in a separably compiled function
// http://nvbugs/1772071
// XXX maybe this is too strong a requirement; consider relaxing it in
// the future
# ifdef __CUDACC_RDC__
return core::get_agent_plan<Agent>(s, d_ptr);
# else
return get_agent_plan<Agent>(core::get_ptx_version());
# endif
}
// Fallback plan selection using the lowest supported SM architecture.
THRUST_RUNTIME_FUNCTION
AgentPlan static get_plan_default()
{
return get_agent_plan<Agent>(sm_arch<0>::type::ver);
}
# endif

THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan(cudaStream_t, void* d_ptr = 0)
{
THRUST_UNUSED_VAR(d_ptr);
return get_agent_plan<Agent>(core::get_ptx_version());
return get_agent_plan<Agent>(get_ptx_version());
}

THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan()
Expand Down Expand Up @@ -227,7 +204,7 @@ struct AgentLauncher : Agent
{
# if THRUST_DEBUG_SYNC_FLAG
cuda_optional<int> occ = max_sm_occupancy(k);
const int ptx_version = core::get_ptx_version();
const int ptx_version = get_ptx_version();
if (count > 0)
{
_CubLog(
Expand Down Expand Up @@ -305,6 +282,7 @@ struct AgentLauncher : Agent
}
};

} // namespace detail
} // namespace core
} // namespace cuda_cub

Expand Down
53 changes: 4 additions & 49 deletions thrust/thrust/system/cuda/detail/core/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ namespace core
# endif
#endif

namespace detail
{
/// Typelist - a container of types
template <typename...>
struct typelist;
Expand Down Expand Up @@ -486,22 +488,9 @@ THRUST_RUNTIME_FUNCTION inline size_t get_max_shared_memory_per_block()
return static_cast<size_t>(i32value);
}

// Per-block "virtual" shared-memory requirement: returns the full request when
// it exceeds the device's per-block shared-memory limit (i.e. the request
// cannot be satisfied by real shared memory), and 0 when the request fits.
THRUST_RUNTIME_FUNCTION inline size_t virtual_shmem_size(size_t shmem_per_block)
{
  const size_t hw_limit = core::get_max_shared_memory_per_block();
  return (shmem_per_block > hw_limit) ? shmem_per_block : 0;
}

THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks)
{
size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
size_t max_shmem_per_block = get_max_shared_memory_per_block();
if (shmem_per_block > max_shmem_per_block)
{
return shmem_per_block * num_blocks;
Expand Down Expand Up @@ -573,22 +562,6 @@ struct BlockLoad
get_arch<PtxPlan>::type::ver>;
};

// BlockStore
// -----------
// a helper metaprogram that returns the type of a block store
// (cub::BlockStore) specialized with the tuning parameters of PtxPlan
template <class PtxPlan, class It, class T = typename iterator_traits<It>::value_type>
struct BlockStore
{
using type =
cub::BlockStore<T,
PtxPlan::BLOCK_THREADS,
PtxPlan::ITEMS_PER_THREAD,
PtxPlan::STORE_ALGORITHM,
1,
1,
get_arch<PtxPlan>::type::ver>;
};

// cuda_optional
// --------------
// used for function that return cudaError_t along with the result
Expand Down Expand Up @@ -683,16 +656,6 @@ THRUST_RUNTIME_FUNCTION inline int get_ptx_version()
return ptx_version;
}

// Synchronizes the given CUDA stream; thin wrapper forwarding the
// cudaError_t result of cub::SyncStream.
THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream)
{
return cub::SyncStream(stream);
}

// Block-wide barrier: every thread in the thread block waits here until all
// threads have arrived (wraps __syncthreads()).
inline void _CCCL_DEVICE sync_threadblock()
{
__syncthreads();
}

// Deprecated [Since 2.8]
#define CUDA_CUB_RET_IF_FAIL(e) \
{ \
Expand Down Expand Up @@ -783,11 +746,6 @@ struct uninitialized_array
}
};

// Rounds n up to the nearest multiple of align (ceiling division, then
// re-scaling). Precondition: align > 0; align need not be a power of two.
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t align_to(size_t n, size_t align)
{
  const size_t num_chunks = (n + align - 1) / align;
  return num_chunks * align;
}

namespace host
{
inline cuda_optional<size_t> get_max_shared_memory_per_block()
Expand Down Expand Up @@ -817,11 +775,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage(
return cub::AliasTemporaries(storage_ptr, storage_size, allocations, allocation_sizes);
}

} // namespace detail
} // namespace core
using core::sm30;
using core::sm35;
using core::sm52;
using core::sm60;
} // namespace cuda_cub

THRUST_NAMESPACE_END
18 changes: 9 additions & 9 deletions thrust/thrust/system/cuda/detail/extrema.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
OutputIt output_it,
cudaStream_t stream)
{
using core::AgentLauncher;
using core::AgentPlan;
using core::cuda_optional;
using core::get_agent_plan;
using core::detail::AgentLauncher;
using core::detail::AgentPlan;
using core::detail::cuda_optional;
using core::detail::get_agent_plan;

using UnsignedSize = typename detail::make_unsigned_special<Size>::type;

Expand All @@ -202,7 +202,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(

if (num_items <= reduce_plan.items_per_tile)
{
size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1);

// small, single tile size
if (d_temp_storage == nullptr)
Expand All @@ -219,7 +219,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
else
{
// regular size
cuda_optional<int> sm_count = core::get_sm_count();
cuda_optional<int> sm_count = core::detail::get_sm_count();
CUDA_CUB_RET_IF_FAIL(sm_count.status());

// reduction will not use more cta counts than requested
Expand All @@ -243,7 +243,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
// we will launch at most "max_blocks" blocks in a grid
// so preallocate virtual shared memory storage for this if required
//
size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks);

// Temporary storage allocation requirements
void* allocations[3] = {nullptr, nullptr, nullptr};
Expand Down Expand Up @@ -329,14 +329,14 @@ extrema(execution_policy<Derived>& policy, InputIt first, Size num_items, Binary
void* allocations[2] = {nullptr, nullptr};

size_t storage_size = 0;
status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage");

// Allocate temporary storage.
thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
void* ptr = static_cast<void*>(tmp.data().get());

status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage");

T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
Expand Down
36 changes: 17 additions & 19 deletions thrust/thrust/system/cuda/detail/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ struct ReduceAgent
using tuning = Tuning<Arch, T>;

using Vector = typename cub::CubVector<T, PtxPlan::VECTOR_LOAD_LENGTH>;
using LoadIt = typename core::LoadIterator<PtxPlan, InputIt>::type;
using LoadIt = typename core::detail::LoadIterator<PtxPlan, InputIt>::type;
using BlockReduce = cub::BlockReduce<T, PtxPlan::BLOCK_THREADS, PtxPlan::BLOCK_ALGORITHM, 1, 1, Arch::ver>;

using VectorLoadIt = cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, Vector, Size>;
Expand All @@ -187,15 +187,15 @@ struct ReduceAgent
// Other algorithms, e.g. merge, may not need additional information,
// and may use AgentPlan directly, instead of defining their own Plan type.
//
struct Plan : core::AgentPlan
struct Plan : core::detail::AgentPlan
{
cub::GridMappingStrategy grid_mapping;

THRUST_RUNTIME_FUNCTION Plan() {}

template <class P>
THRUST_RUNTIME_FUNCTION Plan(P)
: core::AgentPlan(P())
: core::detail::AgentPlan(P())
, grid_mapping(P::GRID_MAPPING)
{}
};
Expand Down Expand Up @@ -242,7 +242,7 @@ struct ReduceAgent
THRUST_DEVICE_FUNCTION impl(TempStorage& storage_, InputIt input_it_, ReductionOp reduction_op_)
: storage(storage_)
, input_it(input_it_)
, load_it(core::make_load_iterator(ptx_plan(), input_it))
, load_it(core::detail::make_load_iterator(ptx_plan(), input_it))
, reduction_op(reduction_op_)
{}

Expand Down Expand Up @@ -440,8 +440,6 @@ struct ReduceAgent
THRUST_DEVICE_FUNCTION T
consume_tiles_impl(Size num_items, cub::GridQueue<UnsignedSize> queue, CAN_VECTORIZE can_vectorize)
{
using core::sync_threadblock;

// We give each thread block at least one tile of input.
T thread_aggregate;
Size block_offset = blockIdx.x * ITEMS_PER_TILE;
Expand All @@ -466,7 +464,7 @@ struct ReduceAgent
storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base;
}

sync_threadblock();
__syncthreads();

// Grab tile offset and check if we're done with full tiles
block_offset = storage.dequeue_offset;
Expand All @@ -477,15 +475,15 @@ struct ReduceAgent
consume_tile<false>(
thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize);

sync_threadblock();
__syncthreads();

// Dequeue a tile of items
if (threadIdx.x == 0)
{
storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base;
}

sync_threadblock();
__syncthreads();

// Grab tile offset and check if we're done with full tiles
block_offset = storage.dequeue_offset;
Expand Down Expand Up @@ -598,7 +596,7 @@ struct DrainAgent
template <class Arch>
struct PtxPlan : PtxPolicy<1>
{};
using ptx_plan = core::specialize_plan<PtxPlan>;
using ptx_plan = core::detail::specialize_plan<PtxPlan>;

//---------------------------------------------------------------------
// Agent entry point
Expand All @@ -621,10 +619,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
OutputIt output_it,
cudaStream_t stream)
{
using core::AgentLauncher;
using core::AgentPlan;
using core::cuda_optional;
using core::get_agent_plan;
using core::detail::AgentLauncher;
using core::detail::AgentPlan;
using core::detail::cuda_optional;
using core::detail::get_agent_plan;

using UnsignedSize = typename detail::make_unsigned_special<Size>::type;

Expand All @@ -641,7 +639,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(

if (num_items <= reduce_plan.items_per_tile)
{
size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1);

// small, single tile size
if (d_temp_storage == nullptr)
Expand All @@ -658,7 +656,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
else
{
// regular size
cuda_optional<int> sm_count = core::get_sm_count();
cuda_optional<int> sm_count = core::detail::get_sm_count();
CUDA_CUB_RET_IF_FAIL(sm_count.status());

// reduction will not use more cta counts than requested
Expand All @@ -682,7 +680,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
// we will launch at most "max_blocks" blocks in a grid
// so preallocate virtual shared memory storage for this if required
//
size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks);

// Temporary storage allocation requirements
void* allocations[3] = {nullptr, nullptr, nullptr};
Expand Down Expand Up @@ -767,14 +765,14 @@ reduce(execution_policy<Derived>& policy, InputIt first, Size num_items, T init,
void* allocations[2] = {nullptr, nullptr};

size_t storage_size = 0;
status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage");

// Allocate temporary storage.
thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
void* ptr = static_cast<void*>(tmp.data().get());

status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");

T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
Expand Down
Loading

0 comments on commit 56f56d1

Please sign in to comment.