Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Internalize cuda/detail/core/* #3505

Merged
merged 1 commit into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ template <typename Policy,
bool ReadLeft>
struct AgentDifference
{
using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, InputIteratorT>::type;
using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, InputIteratorT>::type;

using BlockLoad = typename cub::BlockLoadType<Policy, LoadIt>::type;
using BlockStore = typename cub::BlockStoreType<Policy, OutputIteratorT, OutputT>::type;
Expand Down Expand Up @@ -119,7 +119,7 @@ struct AgentDifference
OffsetT num_items)
: temp_storage(temp_storage.Alias())
, input_it(input_it)
, load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it))
, load_it(THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(Policy(), input_it))
, first_tile_previous(first_tile_previous)
, result(result)
, difference_op(difference_op)
Expand Down
8 changes: 4 additions & 4 deletions cub/cub/agent/agent_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ struct agent_t
using key_type = typename ::cuda::std::iterator_traits<KeysIt1>::value_type;
using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type;

using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type;
using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type;
using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type;
using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type;
using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt1>::type;
using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt2>::type;
using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt1>::type;
using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt2>::type;

using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
Expand Down
15 changes: 9 additions & 6 deletions cub/cub/agent/agent_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ struct AgentBlockSort

using BlockMergeSortT = BlockMergeSort<KeyT, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, ValueT>;

using KeysLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyInputIteratorT>::type;
using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueInputIteratorT>::type;
using KeysLoadIt =
typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyInputIteratorT>::type;
using ItemsLoadIt =
typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueInputIteratorT>::type;

using BlockLoadKeys = typename cub::BlockLoadType<Policy, KeysLoadIt>::type;
using BlockLoadItems = typename cub::BlockLoadType<Policy, ItemsLoadIt>::type;
Expand Down Expand Up @@ -438,10 +440,11 @@ struct AgentMerge
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyIteratorT>::type;
using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueIteratorT>::type;
using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyT*>::type;
using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueT*>::type;
using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyIteratorT>::type;
using ItemsLoadPingIt =
typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueIteratorT>::type;
using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyT*>::type;
using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueT*>::type;

using KeysOutputPongIt = KeyIteratorT;
using ItemsOutputPongIt = ValueIteratorT;
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_sub_warp_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@ public:

using WarpMergeSortT = WarpMergeSort<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::WARP_THREADS, ValueT>;

using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const KeyT*>::type;
using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const ValueT*>::type;
using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const KeyT*>::type;
using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const ValueT*>::type;

using WarpLoadKeysT = cub::WarpLoad<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::LOAD_ALGORITHM, PolicyT::WARP_THREADS>;
using WarpLoadItemsT =
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ __launch_bounds__(
CompareOp>::type;
using MergePolicy = typename MergeAgent::policy;

using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator;
using THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator;
using vsmem_helper_t = vsmem_helper_impl<MergeAgent>;
__shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage;
auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
Expand Down
19 changes: 10 additions & 9 deletions cub/cub/device/dispatch/kernels/merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@

THRUST_NAMESPACE_BEGIN

namespace cuda_cub::core
namespace cuda_cub::core::detail
{
// We must forward declare here because make_load_iterator.h pulls in non NVRTC compilable code
template <class PtxPlan, class It>
typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it);
} // namespace cuda_cub::core
typename detail::LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
make_load_iterator(PtxPlan const&, It it);
} // namespace cuda_cub::core::detail

THRUST_NAMESPACE_END

Expand Down Expand Up @@ -174,8 +175,8 @@ __launch_bounds__(
AgentBlockSortT agent(
ping,
temp_storage,
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in),
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_in),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_in),
keys_count,
keys_out,
items_out,
Expand Down Expand Up @@ -280,10 +281,10 @@ __launch_bounds__(
AgentMergeT agent(
ping,
temp_storage,
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping),
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping),
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong),
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_ping),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_ping),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_pong),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_pong),
keys_count,
keys_pong,
items_pong,
Expand Down
46 changes: 12 additions & 34 deletions thrust/thrust/system/cuda/detail/core/agent_launcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ namespace cuda_cub
{
namespace core
{

namespace detail
{
# ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES
# define THRUST_DETAIL_KERNEL_ATTRIBUTES CCCL_DETAIL_KERNEL_ATTRIBUTES
# endif
Expand Down Expand Up @@ -97,7 +98,7 @@ THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*, Args... args)
template <class Agent>
struct AgentLauncher : Agent
{
core::AgentPlan plan;
AgentPlan plan;
size_t count;
cudaStream_t stream;
char const* name;
Expand All @@ -121,7 +122,7 @@ struct AgentLauncher : Agent
, name(name_)
, grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile))
, vshmem(nullptr)
, has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, shmem_size(has_shmem ? plan.shared_memory_size : 0)
{
assert(count > 0);
Expand All @@ -136,7 +137,7 @@ struct AgentLauncher : Agent
, name(name_)
, grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile))
, vshmem(vshmem)
, has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, shmem_size(has_shmem ? plan.shared_memory_size : 0)
{
assert(count > 0);
Expand All @@ -149,7 +150,7 @@ struct AgentLauncher : Agent
, name(name_)
, grid(plan.grid_size)
, vshmem(nullptr)
, has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, shmem_size(has_shmem ? plan.shared_memory_size : 0)
{
assert(plan.grid_size > 0);
Expand All @@ -162,43 +163,19 @@ struct AgentLauncher : Agent
, name(name_)
, grid(plan.grid_size)
, vshmem(vshmem)
, has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, shmem_size(has_shmem ? plan.shared_memory_size : 0)
{
assert(plan.grid_size > 0);
}

# if 0
THRUST_RUNTIME_FUNCTION
AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0)
{
// in separable compilation mode, we have no choice
// but to call kernel to get agent_plan
// otherwise the risk is something may fail
// if user mix & match ptx versions in a separably compiled function
// http://nvbugs/1772071
// XXX may be it is too string of a requirements, consider relaxing it in
// the future
# ifdef __CUDACC_RDC__
return core::get_agent_plan<Agent>(s, d_ptr);
# else
return get_agent_plan<Agent>(core::get_ptx_version());
# endif
}
THRUST_RUNTIME_FUNCTION
AgentPlan static get_plan_default()
{
return get_agent_plan<Agent>(sm_arch<0>::type::ver);
}
# endif

THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan(cudaStream_t, void* d_ptr = 0)
THRUST_RUNTIME_FUNCTION typename get_plan<Agent>::type static get_plan(cudaStream_t, void* d_ptr = 0)
{
THRUST_UNUSED_VAR(d_ptr);
return get_agent_plan<Agent>(core::get_ptx_version());
return get_agent_plan<Agent>(get_ptx_version());
}

THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan()
THRUST_RUNTIME_FUNCTION typename detail::get_plan<Agent>::type static get_plan()
{
return get_agent_plan<Agent>(lowest_supported_sm_arch::ver);
}
Expand Down Expand Up @@ -227,7 +204,7 @@ struct AgentLauncher : Agent
{
# if THRUST_DEBUG_SYNC_FLAG
cuda_optional<int> occ = max_sm_occupancy(k);
const int ptx_version = core::get_ptx_version();
const int ptx_version = get_ptx_version();
if (count > 0)
{
_CubLog(
Expand Down Expand Up @@ -305,6 +282,7 @@ struct AgentLauncher : Agent
}
};

} // namespace detail
} // namespace core
} // namespace cuda_cub

Expand Down
4 changes: 2 additions & 2 deletions thrust/thrust/system/cuda/detail/core/load_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

THRUST_NAMESPACE_BEGIN

namespace cuda_cub::core
namespace cuda_cub::core::detail
{

// LoadIterator
Expand All @@ -52,6 +52,6 @@ struct LoadIterator
cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, value_type, size_type>,
It>;
}; // struct Iterator
} // namespace cuda_cub::core
} // namespace cuda_cub::core::detail

THRUST_NAMESPACE_END
4 changes: 2 additions & 2 deletions thrust/thrust/system/cuda/detail/core/make_load_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

THRUST_NAMESPACE_BEGIN

namespace cuda_cub::core
namespace cuda_cub::core::detail
{
template <class PtxPlan, class It>
typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
Expand All @@ -55,6 +55,6 @@ typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_loa
return make_load_iterator_impl<PtxPlan>(it, typename is_contiguous_iterator<It>::type());
}

} // namespace cuda_cub::core
} // namespace cuda_cub::core::detail

THRUST_NAMESPACE_END
51 changes: 4 additions & 47 deletions thrust/thrust/system/cuda/detail/core/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ namespace core
# endif
#endif

namespace detail
{
/// Typelist - a container of types
template <typename...>
struct typelist;
Expand Down Expand Up @@ -458,22 +460,9 @@ THRUST_RUNTIME_FUNCTION inline size_t get_max_shared_memory_per_block()
return static_cast<size_t>(i32value);
}

THRUST_RUNTIME_FUNCTION inline size_t virtual_shmem_size(size_t shmem_per_block)
{
size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
if (shmem_per_block > max_shmem_per_block)
{
return shmem_per_block;
}
else
{
return 0;
}
}

THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks)
{
size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
size_t max_shmem_per_block = get_max_shared_memory_per_block();
if (shmem_per_block > max_shmem_per_block)
{
return shmem_per_block * num_blocks;
Expand Down Expand Up @@ -509,22 +498,6 @@ struct BlockLoad
get_arch<PtxPlan>::type::ver>;
};

// BlockStore
// -----------
// a helper metaprogram that returns type of a block loader
template <class PtxPlan, class It, class T = typename iterator_traits<It>::value_type>
struct BlockStore
{
using type =
cub::BlockStore<T,
PtxPlan::BLOCK_THREADS,
PtxPlan::ITEMS_PER_THREAD,
PtxPlan::STORE_ALGORITHM,
1,
1,
get_arch<PtxPlan>::type::ver>;
};

// cuda_optional
// --------------
// used for function that return cudaError_t along with the result
Expand Down Expand Up @@ -619,16 +592,6 @@ THRUST_RUNTIME_FUNCTION inline int get_ptx_version()
return ptx_version;
}

THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream)
{
return cub::SyncStream(stream);
}

inline void _CCCL_DEVICE sync_threadblock()
{
__syncthreads();
}

// Deprecated [Since 2.8]
#define CUDA_CUB_RET_IF_FAIL(e) \
{ \
Expand Down Expand Up @@ -719,11 +682,6 @@ struct uninitialized_array
}
};

_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t align_to(size_t n, size_t align)
{
return ((n + align - 1) / align) * align;
}

namespace host
{
inline cuda_optional<size_t> get_max_shared_memory_per_block()
Expand Down Expand Up @@ -753,9 +711,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage(
return cub::AliasTemporaries(storage_ptr, storage_size, allocations, allocation_sizes);
}

} // namespace detail
} // namespace core
using core::sm52;
using core::sm60;
} // namespace cuda_cub

THRUST_NAMESPACE_END
Loading
Loading