diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh index c19cb90079a..8617c78193b 100644 --- a/cub/cub/agent/agent_adjacent_difference.cuh +++ b/cub/cub/agent/agent_adjacent_difference.cuh @@ -79,7 +79,7 @@ template struct AgentDifference { - using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using BlockLoad = typename cub::BlockLoadType::type; using BlockStore = typename cub::BlockStoreType::type; @@ -119,7 +119,7 @@ struct AgentDifference OffsetT num_items) : temp_storage(temp_storage.Alias()) , input_it(input_it) - , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it)) + , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(Policy(), input_it)) , first_tile_previous(first_tile_previous) , result(result) , difference_op(difference_op) diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh index 9ae14c3e42e..5c7d5322456 100644 --- a/cub/cub/agent/agent_merge.cuh +++ b/cub/cub/agent/agent_merge.cuh @@ -64,10 +64,10 @@ struct agent_t using key_type = typename ::cuda::std::iterator_traits::value_type; using item_type = typename ::cuda::std::iterator_traits::value_type; - using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using block_load_keys1 = typename BlockLoadType::type; using block_load_keys2 = typename BlockLoadType::type; diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index bf4984f7256..1ec952187a7 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -91,8 +91,10 @@ struct AgentBlockSort using BlockMergeSortT = BlockMergeSort; - using KeysLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using KeysLoadIt = + typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadIt = + typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using BlockLoadKeys = typename cub::BlockLoadType::type; using BlockLoadItems = typename cub::BlockLoadType::type; @@ -438,10 +440,11 @@ struct AgentMerge //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- - using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadPingIt = + typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using KeysOutputPongIt = KeyIteratorT; using ItemsOutputPongIt = ValueIteratorT; diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh index b10f1cda3ea..9f98ac42e3b 100644 --- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh @@ -183,8 +183,8 @@ public: using WarpMergeSortT = WarpMergeSort; - using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; - using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; + using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; + using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator::type; using WarpLoadKeysT = cub::WarpLoad; using WarpLoadItemsT = diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index b3d0c8ab2ca..c4df61fd29a 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -138,7 +138,7 @@ __launch_bounds__( CompareOp>::type; using MergePolicy = typename MergeAgent::policy; - using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator; + using THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator; using vsmem_helper_t = vsmem_helper_impl; __shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage; auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage); diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh index 1065313c20d..8401744b226 100644 --- a/cub/cub/device/dispatch/kernels/merge_sort.cuh +++ b/cub/cub/device/dispatch/kernels/merge_sort.cuh @@ -19,12 +19,13 @@ THRUST_NAMESPACE_BEGIN -namespace cuda_cub::core +namespace cuda_cub::core::detail { // We must forward declare here because make_load_iterator.h pulls in non NVRTC compilable code template -typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it); -} // namespace cuda_cub::core +typename detail::LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE +make_load_iterator(PtxPlan const&, It it); +} // namespace cuda_cub::core::detail THRUST_NAMESPACE_END @@ -174,8 +175,8 @@ __launch_bounds__( AgentBlockSortT agent( ping, temp_storage, - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_in), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_in), keys_count, keys_out, items_out, @@ -280,10 +281,10 @@ __launch_bounds__( AgentMergeT agent( ping, temp_storage, - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong), - THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_ping), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_ping), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_pong), + THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_pong), keys_count, keys_pong, items_pong, diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index fb7c1ef22d6..d9baeb47593 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -62,7 +62,8 @@ namespace cuda_cub { namespace core { - +namespace detail +{ # ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES # define THRUST_DETAIL_KERNEL_ATTRIBUTES CCCL_DETAIL_KERNEL_ATTRIBUTES # endif @@ -97,7 +98,7 @@ THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*, Args... args) template struct AgentLauncher : Agent { - core::AgentPlan plan; + AgentPlan plan; size_t count; cudaStream_t stream; char const* name; @@ -121,7 +122,7 @@ struct AgentLauncher : Agent , name(name_) , grid(static_cast((count + plan.items_per_tile - 1) / plan.items_per_tile)) , vshmem(nullptr) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(count > 0); @@ -136,7 +137,7 @@ struct AgentLauncher : Agent , name(name_) , grid(static_cast((count + plan.items_per_tile - 1) / plan.items_per_tile)) , vshmem(vshmem) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(count > 0); @@ -149,7 +150,7 @@ struct AgentLauncher : Agent , name(name_) , grid(plan.grid_size) , vshmem(nullptr) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(plan.grid_size > 0); @@ -162,43 +163,19 @@ struct AgentLauncher : Agent , name(name_) , grid(plan.grid_size) , vshmem(vshmem) - , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) + , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size) , shmem_size(has_shmem ? plan.shared_memory_size : 0) { assert(plan.grid_size > 0); } -# if 0 - THRUST_RUNTIME_FUNCTION - AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0) - { - // in separable compilation mode, we have no choice - // but to call kernel to get agent_plan - // otherwise the risk is something may fail - // if user mix & match ptx versions in a separably compiled function - // http://nvbugs/1772071 - // XXX may be it is too string of a requirements, consider relaxing it in - // the future -# ifdef __CUDACC_RDC__ - return core::get_agent_plan(s, d_ptr); -# else - return get_agent_plan(core::get_ptx_version()); -# endif - } - THRUST_RUNTIME_FUNCTION - AgentPlan static get_plan_default() - { - return get_agent_plan(sm_arch<0>::type::ver); - } -# endif - - THRUST_RUNTIME_FUNCTION typename core::get_plan::type static get_plan(cudaStream_t, void* d_ptr = 0) + THRUST_RUNTIME_FUNCTION typename get_plan::type static get_plan(cudaStream_t, void* d_ptr = 0) { THRUST_UNUSED_VAR(d_ptr); - return get_agent_plan(core::get_ptx_version()); + return get_agent_plan(get_ptx_version()); } - THRUST_RUNTIME_FUNCTION typename core::get_plan::type static get_plan() + THRUST_RUNTIME_FUNCTION typename detail::get_plan::type static get_plan() { return get_agent_plan(lowest_supported_sm_arch::ver); } @@ -227,7 +204,7 @@ struct AgentLauncher : Agent { # if THRUST_DEBUG_SYNC_FLAG cuda_optional occ = max_sm_occupancy(k); - const int ptx_version = core::get_ptx_version(); + const int ptx_version = get_ptx_version(); if (count > 0) { _CubLog( @@ -305,6 +282,7 @@ struct AgentLauncher : Agent } }; +} // namespace detail } // namespace core } // namespace cuda_cub diff --git a/thrust/thrust/system/cuda/detail/core/load_iterator.h b/thrust/thrust/system/cuda/detail/core/load_iterator.h index 07c5eba0eaa..6f2c118b151 100644 --- a/thrust/thrust/system/cuda/detail/core/load_iterator.h +++ b/thrust/thrust/system/cuda/detail/core/load_iterator.h @@ -34,7 +34,7 @@ THRUST_NAMESPACE_BEGIN -namespace cuda_cub::core +namespace cuda_cub::core::detail { // LoadIterator @@ -52,6 +52,6 @@ struct LoadIterator cub::CacheModifiedInputIterator, It>; }; // struct Iterator -} // namespace cuda_cub::core +} // namespace cuda_cub::core::detail THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h index 28c65c813ea..9497ccacca9 100644 --- a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h +++ b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h @@ -33,7 +33,7 @@ THRUST_NAMESPACE_BEGIN -namespace cuda_cub::core +namespace cuda_cub::core::detail { template typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE @@ -55,6 +55,6 @@ typename LoadIterator::type _CCCL_DEVICE _CCCL_FORCEINLINE make_loa return make_load_iterator_impl(it, typename is_contiguous_iterator::type()); } -} // namespace cuda_cub::core +} // namespace cuda_cub::core::detail THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index 94a7e750aeb..b3bdcf1f086 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -78,6 +78,8 @@ namespace core # endif #endif +namespace detail +{ /// Typelist - a container of types template struct typelist; @@ -458,22 +460,9 @@ THRUST_RUNTIME_FUNCTION inline size_t get_max_shared_memory_per_block() return static_cast(i32value); } -THRUST_RUNTIME_FUNCTION inline size_t virtual_shmem_size(size_t shmem_per_block) -{ - size_t max_shmem_per_block = core::get_max_shared_memory_per_block(); - if (shmem_per_block > max_shmem_per_block) - { - return shmem_per_block; - } - else - { - return 0; - } -} - THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks) { - size_t max_shmem_per_block = core::get_max_shared_memory_per_block(); + size_t max_shmem_per_block = get_max_shared_memory_per_block(); if (shmem_per_block > max_shmem_per_block) { return shmem_per_block * num_blocks; @@ -509,22 +498,6 @@ struct BlockLoad get_arch::type::ver>; }; -// BlockStore -// ----------- -// a helper metaprogram that returns type of a block loader -template ::value_type> -struct BlockStore -{ - using type = - cub::BlockStore::type::ver>; -}; - // cuda_optional // -------------- // used for function that return cudaError_t along with the result @@ -619,16 +592,6 @@ THRUST_RUNTIME_FUNCTION inline int get_ptx_version() return ptx_version; } -THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream) -{ - return cub::SyncStream(stream); -} - -inline void _CCCL_DEVICE sync_threadblock() -{ - __syncthreads(); -} - // Deprecated [Since 2.8] #define CUDA_CUB_RET_IF_FAIL(e) \ { \ @@ -719,11 +682,6 @@ struct uninitialized_array } }; -_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t align_to(size_t n, size_t align) -{ - return ((n + align - 1) / align) * align; -} - namespace host { inline cuda_optional get_max_shared_memory_per_block() @@ -753,9 +711,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage( return cub::AliasTemporaries(storage_ptr, storage_size, allocations, allocation_sizes); } +} // namespace detail } // namespace core -using core::sm52; -using core::sm60; } // namespace cuda_cub THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h index 617eb8bbc79..b2124323424 100644 --- a/thrust/thrust/system/cuda/detail/extrema.h +++ b/thrust/thrust/system/cuda/detail/extrema.h @@ -184,10 +184,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( OutputIt output_it, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; - using core::cuda_optional; - using core::get_agent_plan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; + using core::detail::cuda_optional; + using core::detail::get_agent_plan; using UnsignedSize = typename detail::make_unsigned_special::type; @@ -204,7 +204,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( if (num_items <= reduce_plan.items_per_tile) { - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1); // small, single tile size if (d_temp_storage == nullptr) @@ -221,7 +221,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( else { // regular size - cuda_optional sm_count = core::get_sm_count(); + cuda_optional sm_count = core::detail::get_sm_count(); CUDA_CUB_RET_IF_FAIL(sm_count.status()); // reduction will not use more cta counts than requested @@ -245,7 +245,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( // we will launch at most "max_blocks" blocks in a grid // so preallocate virtual shared memory storage for this if required // - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks); // Temporary storage allocation requirements void* allocations[3] = {nullptr, nullptr, nullptr}; @@ -331,14 +331,14 @@ extrema(execution_policy& policy, InputIt first, Size num_items, Binary void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage"); T* d_result = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h index 3787ab62367..61ec2086adf 100644 --- a/thrust/thrust/system/cuda/detail/reduce.h +++ b/thrust/thrust/system/cuda/detail/reduce.h @@ -109,7 +109,7 @@ template struct Tuning; template -struct Tuning +struct Tuning { enum { @@ -155,7 +155,7 @@ struct ReduceAgent using tuning = Tuning; using Vector = typename cub::CubVector; - using LoadIt = typename core::LoadIterator::type; + using LoadIt = typename core::detail::LoadIterator::type; using BlockReduce = cub::BlockReduce; using VectorLoadIt = cub::CacheModifiedInputIterator; @@ -175,7 +175,7 @@ struct ReduceAgent // Other algorithms, e.g. merge, may not need additional information, // and may use AgentPlan directly, instead of defining their own Plan type. // - struct Plan : core::AgentPlan + struct Plan : core::detail::AgentPlan { cub::GridMappingStrategy grid_mapping; @@ -183,7 +183,7 @@ struct ReduceAgent template THRUST_RUNTIME_FUNCTION Plan(P) - : core::AgentPlan(P()) + : core::detail::AgentPlan(P()) , grid_mapping(P::GRID_MAPPING) {} }; @@ -192,7 +192,7 @@ struct ReduceAgent // ptx_plan type *must* only be used from device code // Its use from host code will result in *undefined behaviour* // - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using TempStorage = typename ptx_plan::TempStorage; using Vector = typename ptx_plan::Vector; @@ -230,7 +230,7 @@ struct ReduceAgent THRUST_DEVICE_FUNCTION impl(TempStorage& storage_, InputIt input_it_, ReductionOp reduction_op_) : storage(storage_) , input_it(input_it_) - , load_it(core::make_load_iterator(ptx_plan(), input_it)) + , load_it(core::detail::make_load_iterator(ptx_plan(), input_it)) , reduction_op(reduction_op_) {} @@ -428,8 +428,6 @@ struct ReduceAgent THRUST_DEVICE_FUNCTION T consume_tiles_impl(Size num_items, cub::GridQueue queue, CAN_VECTORIZE can_vectorize) { - using core::sync_threadblock; - // We give each thread block at least one tile of input. T thread_aggregate; Size block_offset = blockIdx.x * ITEMS_PER_TILE; @@ -454,7 +452,7 @@ struct ReduceAgent storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base; } - sync_threadblock(); + __syncthreads(); // Grab tile offset and check if we're done with full tiles block_offset = storage.dequeue_offset; @@ -465,7 +463,7 @@ struct ReduceAgent consume_tile( thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize); - sync_threadblock(); + __syncthreads(); // Dequeue a tile of items if (threadIdx.x == 0) @@ -473,7 +471,7 @@ struct ReduceAgent storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base; } - sync_threadblock(); + __syncthreads(); // Grab tile offset and check if we're done with full tiles block_offset = storage.dequeue_offset; @@ -586,7 +584,7 @@ struct DrainAgent template struct PtxPlan : PtxPolicy<1> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -609,10 +607,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( OutputIt output_it, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; - using core::cuda_optional; - using core::get_agent_plan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; + using core::detail::cuda_optional; + using core::detail::get_agent_plan; using UnsignedSize = typename detail::make_unsigned_special::type; @@ -629,7 +627,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( if (num_items <= reduce_plan.items_per_tile) { - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1); // small, single tile size if (d_temp_storage == nullptr) @@ -646,7 +644,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( else { // regular size - cuda_optional sm_count = core::get_sm_count(); + cuda_optional sm_count = core::detail::get_sm_count(); CUDA_CUB_RET_IF_FAIL(sm_count.status()); // reduction will not use more cta counts than requested @@ -670,7 +668,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( // we will launch at most "max_blocks" blocks in a grid // so preallocate virtual shared memory storage for this if required // - size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks); + size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks); // Temporary storage allocation requirements void* allocations[3] = {nullptr, nullptr, nullptr}; @@ -755,14 +753,14 @@ reduce(execution_policy& policy, InputIt first, Size num_items, T init, void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage"); T* d_result = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h index ae1f0ffab96..8c1db436085 100644 --- a/thrust/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h @@ -115,7 +115,7 @@ template struct Tuning; template -struct Tuning +struct Tuning { enum { @@ -163,11 +163,11 @@ struct ReduceByKeyAgent { using tuning = Tuning; - using KeysLoadIt = typename core::LoadIterator::type; - using ValuesLoadIt = typename core::LoadIterator::type; + using KeysLoadIt = typename core::detail::LoadIterator::type; + using ValuesLoadIt = typename core::detail::LoadIterator::type; - using BlockLoadKeys = typename core::BlockLoad::type; - using BlockLoadValues = typename core::BlockLoad::type; + using BlockLoadKeys = typename core::detail::BlockLoad::type; + using BlockLoadValues = typename core::detail::BlockLoad::type; using BlockDiscontinuityKeys = cub::BlockDiscontinuity; @@ -188,11 +188,11 @@ struct ReduceByKeyAgent typename BlockLoadKeys::TempStorage load_keys; typename BlockLoadValues::TempStorage load_values; - core::uninitialized_array raw_exchange; + core::detail::uninitialized_array raw_exchange; }; // union TempStorage }; // struct PtxPlan - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using KeysLoadIt = typename ptx_plan::KeysLoadIt; using ValuesLoadIt = typename ptx_plan::ValuesLoadIt; @@ -360,9 +360,7 @@ struct ReduceByKeyAgent size_type num_tile_segments, size_type num_tile_segments_prefix) { - using core::sync_threadblock; - - sync_threadblock(); + __syncthreads(); // Compact and scatter keys # pragma unroll @@ -375,7 +373,7 @@ struct ReduceByKeyAgent } } - sync_threadblock(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) { @@ -445,8 +443,6 @@ struct ReduceByKeyAgent template THRUST_DEVICE_FUNCTION void consume_first_tile(Size num_remaining, Size tile_offset, ScanTileState& tile_state) { - using core::sync_threadblock; - key_type keys[ITEMS_PER_THREAD]; // Tile keys key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor) value_type values[ITEMS_PER_THREAD]; // Tile values @@ -468,7 +464,7 @@ struct ReduceByKeyAgent BlockLoadKeys(storage.load_keys).Load(keys_load_it + tile_offset, keys); } - sync_threadblock(); + __syncthreads(); // Load values (last tile repeats final element) if (IS_LAST_TILE) @@ -481,7 +477,7 @@ struct ReduceByKeyAgent BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values); } - sync_threadblock(); + __syncthreads(); // Set head segment_flags. // First tile sets the first flag for the first item @@ -540,8 +536,6 @@ struct ReduceByKeyAgent THRUST_DEVICE_FUNCTION void consume_subsequent_tile(Size num_remaining, int tile_idx, Size tile_offset, ScanTileState& tile_state) { - using core::sync_threadblock; - key_type keys[ITEMS_PER_THREAD]; // Tile keys key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor) value_type values[ITEMS_PER_THREAD]; // Tile values @@ -563,7 +557,7 @@ struct ReduceByKeyAgent key_type tile_pred_key = (threadIdx.x == 0) ? key_type(keys_load_it[tile_offset - 1]) : key_type(); - sync_threadblock(); + __syncthreads(); // Load values (last tile repeats final element) if (IS_LAST_TILE) @@ -576,7 +570,7 @@ struct ReduceByKeyAgent BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values); } - sync_threadblock(); + __syncthreads(); // Set head segment_flags BlockDiscontinuityKeys(storage.scan_storage.discontinuity) @@ -635,8 +629,8 @@ struct ReduceByKeyAgent int /*num_tiles*/, ScanTileState& tile_state) : storage(storage_) - , keys_load_it(core::make_load_iterator(ptx_plan(), keys_input_it_)) - , values_load_it(core::make_load_iterator(ptx_plan(), values_input_it_)) + , keys_load_it(core::detail::make_load_iterator(ptx_plan(), keys_input_it_)) + , values_load_it(core::detail::make_load_iterator(ptx_plan(), values_input_it_)) , keys_output_it(keys_output_it_) , values_output_it(values_output_it_) , num_runs_output_it(num_runs_output_it_) @@ -703,7 +697,7 @@ struct InitAgent template struct PtxPlan : PtxPolicy<128> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -740,8 +734,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step( Size num_items, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; cudaError_t status = cudaSuccess; if (num_items == 0) @@ -762,7 +756,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step( int tile_size = reduce_by_key_plan.items_per_tile; Size num_tiles = ::cuda::ceil_div(num_items, tile_size); - size_t vshmem_size = core::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles); + size_t vshmem_size = core::detail::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles); size_t allocation_sizes[2] = {9, vshmem_size}; status = ScanTileState::AllocationSize(static_cast(num_tiles), allocation_sizes[0]); @@ -848,14 +842,14 @@ THRUST_RUNTIME_FUNCTION pair reduce_by_key_dispatc void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage"); Size* d_num_runs_out = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h index 7a267080bf8..b336f8e55fa 100644 --- a/thrust/thrust/system/cuda/detail/set_operations.h +++ b/thrust/thrust/system/cuda/detail/set_operations.h @@ -222,7 +222,7 @@ struct Tuning; namespace mpl = thrust::detail::mpl::math; template -struct Tuning +struct Tuning { enum { @@ -243,7 +243,7 @@ struct Tuning }; // tuning sm52 template -struct Tuning +struct Tuning { enum { @@ -290,15 +290,15 @@ struct SetOpAgent { using tuning = Tuning; - using KeysLoadIt1 = typename core::LoadIterator::type; - using KeysLoadIt2 = typename core::LoadIterator::type; - using ValuesLoadIt1 = typename core::LoadIterator::type; - using ValuesLoadIt2 = typename core::LoadIterator::type; + using KeysLoadIt1 = typename core::detail::LoadIterator::type; + using KeysLoadIt2 = typename core::detail::LoadIterator::type; + using ValuesLoadIt1 = typename core::detail::LoadIterator::type; + using ValuesLoadIt2 = typename core::detail::LoadIterator::type; - using BlockLoadKeys1 = typename core::BlockLoad::type; - using BlockLoadKeys2 = typename core::BlockLoad::type; - using BlockLoadValues1 = typename core::BlockLoad::type; - using BlockLoadValues2 = typename core::BlockLoad::type; + using BlockLoadKeys1 = typename core::detail::BlockLoad::type; + using BlockLoadKeys2 = typename core::detail::BlockLoad::type; + using BlockLoadValues1 = typename core::detail::BlockLoad::type; + using BlockLoadValues2 = typename core::detail::BlockLoad::type; using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState, Arch::ver>; @@ -316,7 +316,7 @@ struct SetOpAgent struct LoadStorage { - core::uninitialized_array offset; + core::detail::uninitialized_array offset; union { // FIXME These don't appear to be used anywhere? @@ -328,15 +328,15 @@ struct SetOpAgent // Allocate extra shmem than truly necessary // This will permit to avoid range checks in // serial set operations, e.g. serial_set_difference - core::uninitialized_array keys_shared; + core::detail::uninitialized_array keys_shared; - core::uninitialized_array values_shared; + core::detail::uninitialized_array values_shared; }; // anon union } load_storage; // struct LoadStorage }; // union TempStorage }; // struct PtxPlan - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using KeysLoadIt1 = typename ptx_plan::KeysLoadIt1; using KeysLoadIt2 = typename ptx_plan::KeysLoadIt2; @@ -441,8 +441,6 @@ struct SetOpAgent Size tile_output_prefix, int tile_output_count) { - using core::sync_threadblock; - int local_scatter_idx = thread_output_prefix - tile_output_prefix; # pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) @@ -452,7 +450,7 @@ struct SetOpAgent shared[local_scatter_idx++] = input[ITEM]; } } - sync_threadblock(); + __syncthreads(); for (int item = threadIdx.x; item < tile_output_count; item += BLOCK_THREADS) { @@ -483,8 +481,7 @@ struct SetOpAgent template void THRUST_DEVICE_FUNCTION consume_tile(Size tile_idx) { - using core::sync_threadblock; - using core::uninitialized_array; + using core::detail::uninitialized_array; pair partition_beg = partitions[tile_idx + 0]; pair partition_end = partitions[tile_idx + 1]; @@ -506,7 +503,7 @@ struct SetOpAgent reg_to_shared(&storage.load_storage.keys_shared[0], keys_loc); - sync_threadblock(); + __syncthreads(); int diag_loc = min(ITEMS_PER_THREAD * threadIdx.x, num_keys1 + num_keys2); @@ -529,7 +526,7 @@ struct SetOpAgent int dst = threadIdx.x == 0 ? BLOCK_THREADS - 1 : threadIdx.x - 1; storage.load_storage.offset[dst] = value; - core::sync_threadblock(); + __syncthreads(); pair partition1_loc = thrust::make_pair( storage.load_storage.offset[threadIdx.x] >> 16, storage.load_storage.offset[threadIdx.x] & 0xFFFF); @@ -554,7 +551,7 @@ struct SetOpAgent indices, compare_op, set_op); - sync_threadblock(); + __syncthreads(); # if 0 if (ITEMS_PER_THREAD*threadIdx.x >= num_keys1 + num_keys2) active_mask = 0; @@ -588,7 +585,7 @@ struct SetOpAgent tile_output_prefix = prefix_cb.GetExclusivePrefix(); } - sync_threadblock(); + __syncthreads(); // scatter results // @@ -605,11 +602,11 @@ struct SetOpAgent value_type values_loc[ITEMS_PER_THREAD]; gmem_to_reg(values_loc, values1_in + keys1_beg, values2_in + keys2_beg, num_keys1, num_keys2); - sync_threadblock(); + __syncthreads(); reg_to_shared(&storage.load_storage.values_shared[0], values_loc); - sync_threadblock(); + __syncthreads(); // gather items from shared mem // @@ -622,7 +619,7 @@ struct SetOpAgent } } - sync_threadblock(); + __syncthreads(); scatter(values_out, values_loc, @@ -660,10 +657,10 @@ struct SetOpAgent std::size_t* output_count_) : storage(storage_) , tile_state(tile_state_) - , keys1_in(core::make_load_iterator(ptx_plan(), keys1_)) - , keys2_in(core::make_load_iterator(ptx_plan(), keys2_)) - , values1_in(core::make_load_iterator(ptx_plan(), values1_)) - , values2_in(core::make_load_iterator(ptx_plan(), values2_)) + , keys1_in(core::detail::make_load_iterator(ptx_plan(), keys1_)) + , keys2_in(core::detail::make_load_iterator(ptx_plan(), keys2_)) + , values1_in(core::detail::make_load_iterator(ptx_plan(), values1_)) + , values2_in(core::detail::make_load_iterator(ptx_plan(), values2_)) , keys1_count(keys1_count_) , keys2_count(keys2_count_) , keys_out(keys_out_) @@ -733,7 +730,7 @@ struct PartitionAgent struct PtxPlan : PtxPolicy<256> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -767,7 +764,7 @@ struct InitAgent struct PtxPlan : PtxPolicy<128> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -1058,8 +1055,8 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( cudaError_t status = cudaSuccess; - using core::AgentLauncher; - using core::AgentPlan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; using set_op_agent = AgentLauncher< SetOpAgent>; @@ -1080,13 +1077,13 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step( status = ScanTileState::AllocationSize(static_cast(num_tiles), tile_agent_storage); CUDA_CUB_RET_IF_FAIL(status); - size_t vshmem_storage = core::vshmem_size(set_op_plan.shared_memory_size, num_tiles); + size_t vshmem_storage = core::detail::vshmem_size(set_op_plan.shared_memory_size, num_tiles); size_t partition_agent_storage = (num_tiles + 1) * sizeof(Size) * 2; void* allocations[3] = {nullptr, nullptr, nullptr}; size_t allocation_sizes[3] = {tile_agent_storage, partition_agent_storage, vshmem_storage}; - status = core::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes); CUDA_CUB_RET_IF_FAIL(status); if (d_temp_storage == nullptr) @@ -1192,14 +1189,14 @@ THRUST_RUNTIME_FUNCTION pair set_operations( size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "set_operations failed on 1st alias_storage"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage"); std::size_t* d_output_count = thrust::detail::aligned_reinterpret_cast(allocations[0]); diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h index 2c3ef85202d..7ad67fd4e0c 100644 --- a/thrust/thrust/system/cuda/detail/sort.h +++ b/thrust/thrust/system/cuda/detail/sort.h @@ -58,6 +58,8 @@ # include # include +# include + # include # if defined(_CCCL_HAS_NVFP16) @@ -277,8 +279,8 @@ THRUST_RUNTIME_FUNCTION void radix_sort(execution_policy& policy, Key* dispatch::doit(nullptr, temp_storage_bytes, keys_buffer, items_buffer, keys_count, stream); cuda_cub::throw_on_error(status, "radix_sort: failed on 1st step"); - size_t keys_temp_storage = core::align_to(sizeof(Key) * keys_count, 128); - size_t items_temp_storage = core::align_to(sizeof(Item) * items_count, 128); + size_t keys_temp_storage = ::cuda::round_up(sizeof(Key) * keys_count, 128); + size_t items_temp_storage = ::cuda::round_up(sizeof(Item) * items_count, 128); size_t storage_size = keys_temp_storage + items_temp_storage + temp_storage_bytes; diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h index ac94017758b..1d39b161866 100644 --- a/thrust/thrust/system/cuda/detail/unique.h +++ b/thrust/thrust/system/cuda/detail/unique.h @@ -123,7 +123,7 @@ struct items_per_thread }; template -struct Tuning +struct Tuning { const static int INPUT_SIZE = sizeof(T); enum @@ -149,16 +149,16 @@ struct UniqueAgent { using tuning = Tuning; - using ItemsLoadIt = typename core::LoadIterator::type; + using ItemsLoadIt = typename core::detail::LoadIterator::type; - using BlockLoadItems = typename core::BlockLoad::type; + using BlockLoadItems = typename core::detail::BlockLoad::type; using BlockDiscontinuityItems = cub::BlockDiscontinuity; using TilePrefixCallback = cub::TilePrefixCallbackOp, ScanTileState, Arch::ver>; using BlockScan = cub::BlockScan; - using shared_items_t = core::uninitialized_array; + using shared_items_t = core::detail::uninitialized_array; union TempStorage { @@ -175,7 +175,7 @@ struct UniqueAgent }; // union TempStorage }; // struct PtxPlan - using ptx_plan = typename core::specialize_plan_msvc10_war::type::type; + using ptx_plan = typename core::detail::specialize_plan_msvc10_war::type::type; using ItemsLoadIt = typename ptx_plan::ItemsLoadIt; using BlockLoadItems = typename ptx_plan::BlockLoadItems; @@ -224,8 +224,6 @@ struct UniqueAgent Size num_selections_prefix, Size /*num_selections*/) { - using core::sync_threadblock; - # pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { @@ -236,14 +234,14 @@ struct UniqueAgent } } - sync_threadblock(); + __syncthreads(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { items_out[num_selections_prefix + item] = get_shared()[item]; } - sync_threadblock(); + __syncthreads(); } //--------------------------------------------------------------------- @@ -253,8 +251,7 @@ struct UniqueAgent template Size THRUST_DEVICE_FUNCTION consume_tile_impl(int num_tile_items, int tile_idx, Size tile_base) { - using core::sync_threadblock; - using core::uninitialized_array; + using core::detail::uninitialized_array; item_type items_loc[ITEMS_PER_THREAD]; Size selection_flags[ITEMS_PER_THREAD]; @@ -270,7 +267,7 @@ struct UniqueAgent BlockLoadItems(temp_storage.load_items).Load(items_in + tile_base, items_loc); } - sync_threadblock(); + __syncthreads(); if (IS_FIRST_TILE) { @@ -294,7 +291,7 @@ struct UniqueAgent } } - sync_threadblock(); + __syncthreads(); Size num_tile_selections = 0; Size num_selections = 0; @@ -337,7 +334,7 @@ struct UniqueAgent } } - sync_threadblock(); + __syncthreads(); scatter(items_loc, selection_flags, @@ -420,7 +417,7 @@ struct UniqueAgent impl(storage, tile_state, - core::make_load_iterator(ptx_plan(), items_in), + core::detail::make_load_iterator(ptx_plan(), items_in), items_out, binary_pred, num_items, @@ -435,7 +432,7 @@ struct InitAgent template struct PtxPlan : PtxPolicy<128> {}; - using ptx_plan = core::specialize_plan; + using ptx_plan = core::detail::specialize_plan; //--------------------------------------------------------------------- // Agent entry point @@ -463,9 +460,9 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step( Size num_items, cudaStream_t stream) { - using core::AgentLauncher; - using core::AgentPlan; - using core::get_agent_plan; + using core::detail::AgentLauncher; + using core::detail::AgentPlan; + using core::detail::get_agent_plan; using unique_agent = AgentLauncher>; @@ -473,14 +470,14 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step( using init_agent = AgentLauncher>; - using core::get_plan; + using core::detail::get_plan; typename get_plan::type init_plan = init_agent::get_plan(); typename get_plan::type unique_plan = unique_agent::get_plan(stream); int tile_size = unique_plan.items_per_tile; size_t num_tiles = ::cuda::ceil_div(num_items, tile_size); - size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size, num_tiles); + size_t vshmem_size = core::detail::vshmem_size(unique_plan.shared_memory_size, num_tiles); cudaError_t status = cudaSuccess; size_t allocation_sizes[2] = {0, vshmem_size}; @@ -550,14 +547,14 @@ THRUST_RUNTIME_FUNCTION ItemsOutputIt unique( void* allocations[2] = {nullptr, nullptr}; size_t storage_size = 0; - status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "unique: failed on 1st step"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); void* ptr = static_cast(tmp.data().get()); - status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); + status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); cuda_cub::throw_on_error(status, "unique: failed on 2nd step"); size_type* d_num_selected_out = thrust::detail::aligned_reinterpret_cast(allocations[0]);