diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh
index c19cb90079a..8617c78193b 100644
--- a/cub/cub/agent/agent_adjacent_difference.cuh
+++ b/cub/cub/agent/agent_adjacent_difference.cuh
@@ -79,7 +79,7 @@ template <typename Policy,
           bool ReadLeft>
 struct AgentDifference
 {
-  using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, InputIteratorT>::type;
+  using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, InputIteratorT>::type;
 
   using BlockLoad  = typename cub::BlockLoadType<Policy, LoadIt>::type;
   using BlockStore = typename cub::BlockStoreType<Policy, OutputIteratorT, OutputT>::type;
@@ -119,7 +119,7 @@ struct AgentDifference
     OffsetT num_items)
       : temp_storage(temp_storage.Alias())
       , input_it(input_it)
-      , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it))
+      , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(Policy(), input_it))
       , first_tile_previous(first_tile_previous)
       , result(result)
       , difference_op(difference_op)
diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh
index 9ae14c3e42e..5c7d5322456 100644
--- a/cub/cub/agent/agent_merge.cuh
+++ b/cub/cub/agent/agent_merge.cuh
@@ -64,10 +64,10 @@ struct agent_t
   using key_type  = typename ::cuda::std::iterator_traits<KeysIt1>::value_type;
   using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type;
 
-  using keys_load_it1  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type;
-  using keys_load_it2  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type;
-  using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type;
-  using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type;
+  using keys_load_it1  = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt1>::type;
+  using keys_load_it2  = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt2>::type;
+  using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt1>::type;
+  using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt2>::type;
 
   using block_load_keys1  = typename BlockLoadType<Policy, keys_load_it1>::type;
   using block_load_keys2  = typename BlockLoadType<Policy, keys_load_it2>::type;
diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh
index bf4984f7256..1ec952187a7 100644
--- a/cub/cub/agent/agent_merge_sort.cuh
+++ b/cub/cub/agent/agent_merge_sort.cuh
@@ -91,8 +91,10 @@ struct AgentBlockSort
 
   using BlockMergeSortT = BlockMergeSort<KeyT, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, ValueT>;
 
-  using KeysLoadIt  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyInputIteratorT>::type;
-  using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueInputIteratorT>::type;
+  using KeysLoadIt =
+    typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyInputIteratorT>::type;
+  using ItemsLoadIt =
+    typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueInputIteratorT>::type;
 
   using BlockLoadKeys  = typename cub::BlockLoadType<Policy, KeysLoadIt>::type;
   using BlockLoadItems = typename cub::BlockLoadType<Policy, ItemsLoadIt>::type;
@@ -438,10 +440,11 @@ struct AgentMerge
   //---------------------------------------------------------------------
   // Types and constants
   //---------------------------------------------------------------------
-  using KeysLoadPingIt  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyIteratorT>::type;
-  using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueIteratorT>::type;
-  using KeysLoadPongIt  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyT*>::type;
-  using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueT*>::type;
+  using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyIteratorT>::type;
+  using ItemsLoadPingIt =
+    typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueIteratorT>::type;
+  using KeysLoadPongIt  = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyT*>::type;
+  using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueT*>::type;
 
   using KeysOutputPongIt  = KeyIteratorT;
   using ItemsOutputPongIt = ValueIteratorT;
diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh
index b10f1cda3ea..9f98ac42e3b 100644
--- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh
+++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh
@@ -183,8 +183,8 @@ public:
 
   using WarpMergeSortT = WarpMergeSort<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::WARP_THREADS, ValueT>;
 
-  using KeysLoadItT  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const KeyT*>::type;
-  using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const ValueT*>::type;
+  using KeysLoadItT  = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const KeyT*>::type;
+  using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const ValueT*>::type;
 
   using WarpLoadKeysT = cub::WarpLoad<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::LOAD_ALGORITHM, PolicyT::WARP_THREADS>;
   using WarpLoadItemsT =
diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh
index b3d0c8ab2ca..c4df61fd29a 100644
--- a/cub/cub/device/dispatch/dispatch_merge.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge.cuh
@@ -138,7 +138,7 @@ __launch_bounds__(
     CompareOp>::type;
   using MergePolicy = typename MergeAgent::policy;
 
-  using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator;
+  using THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator;
   using vsmem_helper_t = vsmem_helper_impl<MergeAgent>;
   __shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage;
   auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh
index 1065313c20d..8401744b226 100644
--- a/cub/cub/device/dispatch/kernels/merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/merge_sort.cuh
@@ -19,12 +19,13 @@
 
 THRUST_NAMESPACE_BEGIN
 
-namespace cuda_cub::core
+namespace cuda_cub::core::detail
 {
 // We must forward declare here because make_load_iterator.h pulls in non NVRTC compilable code
 template <class PtxPlan, class It>
-typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it);
-} // namespace cuda_cub::core
+typename detail::LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
+make_load_iterator(PtxPlan const&, It it);
+} // namespace cuda_cub::core::detail
 
 THRUST_NAMESPACE_END
 
@@ -174,8 +175,8 @@ __launch_bounds__(
   AgentBlockSortT agent(
     ping,
     temp_storage,
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_in),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_in),
     keys_count,
     keys_out,
     items_out,
@@ -280,10 +281,10 @@ __launch_bounds__(
   AgentMergeT agent(
     ping,
     temp_storage,
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_ping),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_ping),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_pong),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_pong),
     keys_count,
     keys_pong,
     items_pong,
diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
index fb7c1ef22d6..d9baeb47593 100644
--- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h
+++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
@@ -62,7 +62,8 @@ namespace cuda_cub
 {
 namespace core
 {
-
+namespace detail
+{
 #  ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES
 #    define THRUST_DETAIL_KERNEL_ATTRIBUTES CCCL_DETAIL_KERNEL_ATTRIBUTES
 #  endif
@@ -97,7 +98,7 @@ THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*, Args... args)
 template <class Agent>
 struct AgentLauncher : Agent
 {
-  core::AgentPlan plan;
+  AgentPlan plan;
   size_t count;
   cudaStream_t stream;
   char const* name;
@@ -121,7 +122,7 @@ struct AgentLauncher : Agent
       , name(name_)
       , grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile))
       , vshmem(nullptr)
-      , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
+      , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
       , shmem_size(has_shmem ? plan.shared_memory_size : 0)
   {
     assert(count > 0);
@@ -136,7 +137,7 @@ struct AgentLauncher : Agent
       , name(name_)
       , grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile))
       , vshmem(vshmem)
-      , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
+      , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
       , shmem_size(has_shmem ? plan.shared_memory_size : 0)
   {
     assert(count > 0);
@@ -149,7 +150,7 @@ struct AgentLauncher : Agent
       , name(name_)
       , grid(plan.grid_size)
       , vshmem(nullptr)
-      , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
+      , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
       , shmem_size(has_shmem ? plan.shared_memory_size : 0)
   {
     assert(plan.grid_size > 0);
@@ -162,43 +163,19 @@ struct AgentLauncher : Agent
       , name(name_)
       , grid(plan.grid_size)
       , vshmem(vshmem)
-      , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
+      , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
       , shmem_size(has_shmem ? plan.shared_memory_size : 0)
   {
     assert(plan.grid_size > 0);
   }
 
-#  if 0
-    THRUST_RUNTIME_FUNCTION
-    AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0)
-    {
-      // in separable compilation mode, we have no choice
-      // but to call kernel to get agent_plan
-      // otherwise the risk is something may fail
-      // if user mix & match ptx versions in a separably compiled function
-      // http://nvbugs/1772071
-      // XXX may be it is too string of a requirements, consider relaxing it in
-      // the future
-#    ifdef __CUDACC_RDC__
-      return core::get_agent_plan<Agent>(s, d_ptr);
-#    else
-      return get_agent_plan<Agent>(core::get_ptx_version());
-#    endif
-    }
-    THRUST_RUNTIME_FUNCTION
-    AgentPlan static get_plan_default()
-    {
-      return get_agent_plan<Agent>(sm_arch<0>::type::ver);
-    }
-#  endif
-
-  THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan(cudaStream_t, void* d_ptr = 0)
+  THRUST_RUNTIME_FUNCTION typename get_plan<Agent>::type static get_plan(cudaStream_t, void* d_ptr = 0)
   {
     THRUST_UNUSED_VAR(d_ptr);
-    return get_agent_plan<Agent>(core::get_ptx_version());
+    return get_agent_plan<Agent>(get_ptx_version());
   }
 
-  THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan()
+  THRUST_RUNTIME_FUNCTION typename detail::get_plan<Agent>::type static get_plan()
   {
     return get_agent_plan<Agent>(lowest_supported_sm_arch::ver);
   }
@@ -227,7 +204,7 @@ struct AgentLauncher : Agent
   {
 #  if THRUST_DEBUG_SYNC_FLAG
     cuda_optional<int> occ = max_sm_occupancy(k);
-    const int ptx_version  = core::get_ptx_version();
+    const int ptx_version  = get_ptx_version();
     if (count > 0)
     {
       _CubLog(
@@ -305,6 +282,7 @@ struct AgentLauncher : Agent
   }
 };
 
+} // namespace detail
 } // namespace core
 } // namespace cuda_cub
 
diff --git a/thrust/thrust/system/cuda/detail/core/load_iterator.h b/thrust/thrust/system/cuda/detail/core/load_iterator.h
index 07c5eba0eaa..6f2c118b151 100644
--- a/thrust/thrust/system/cuda/detail/core/load_iterator.h
+++ b/thrust/thrust/system/cuda/detail/core/load_iterator.h
@@ -34,7 +34,7 @@
 
 THRUST_NAMESPACE_BEGIN
 
-namespace cuda_cub::core
+namespace cuda_cub::core::detail
 {
 
 // LoadIterator
@@ -52,6 +52,6 @@ struct LoadIterator
                                cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, value_type, size_type>,
                                It>;
 }; // struct Iterator
-} // namespace cuda_cub::core
+} // namespace cuda_cub::core::detail
 
 THRUST_NAMESPACE_END
diff --git a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h
index 28c65c813ea..9497ccacca9 100644
--- a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h
+++ b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h
@@ -33,7 +33,7 @@
 
 THRUST_NAMESPACE_BEGIN
 
-namespace cuda_cub::core
+namespace cuda_cub::core::detail
 {
 template <class PtxPlan, class It>
 typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
@@ -55,6 +55,6 @@ typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_loa
   return make_load_iterator_impl<PtxPlan>(it, typename is_contiguous_iterator<It>::type());
 }
 
-} // namespace cuda_cub::core
+} // namespace cuda_cub::core::detail
 
 THRUST_NAMESPACE_END
diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h
index 94a7e750aeb..b3bdcf1f086 100644
--- a/thrust/thrust/system/cuda/detail/core/util.h
+++ b/thrust/thrust/system/cuda/detail/core/util.h
@@ -78,6 +78,8 @@ namespace core
 #  endif
 #endif
 
+namespace detail
+{
 /// Typelist - a container of types
 template <typename...>
 struct typelist;
@@ -458,22 +460,9 @@ THRUST_RUNTIME_FUNCTION inline size_t get_max_shared_memory_per_block()
   return static_cast<size_t>(i32value);
 }
 
-THRUST_RUNTIME_FUNCTION inline size_t virtual_shmem_size(size_t shmem_per_block)
-{
-  size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
-  if (shmem_per_block > max_shmem_per_block)
-  {
-    return shmem_per_block;
-  }
-  else
-  {
-    return 0;
-  }
-}
-
 THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks)
 {
-  size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
+  size_t max_shmem_per_block = get_max_shared_memory_per_block();
   if (shmem_per_block > max_shmem_per_block)
   {
     return shmem_per_block * num_blocks;
@@ -509,22 +498,6 @@ struct BlockLoad
                    get_arch<PtxPlan>::type::ver>;
 };
 
-// BlockStore
-// -----------
-// a helper metaprogram that returns type of a block loader
-template <class PtxPlan, class It, class T = typename iterator_traits<It>::value_type>
-struct BlockStore
-{
-  using type =
-    cub::BlockStore<T,
-                    PtxPlan::BLOCK_THREADS,
-                    PtxPlan::ITEMS_PER_THREAD,
-                    PtxPlan::STORE_ALGORITHM,
-                    1,
-                    1,
-                    get_arch<PtxPlan>::type::ver>;
-};
-
 // cuda_optional
 // --------------
 // used for function that return cudaError_t along with the result
@@ -619,16 +592,6 @@ THRUST_RUNTIME_FUNCTION inline int get_ptx_version()
   return ptx_version;
 }
 
-THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream)
-{
-  return cub::SyncStream(stream);
-}
-
-inline void _CCCL_DEVICE sync_threadblock()
-{
-  __syncthreads();
-}
-
 // Deprecated [Since 2.8]
 #define CUDA_CUB_RET_IF_FAIL(e)                \
   {                                            \
@@ -719,11 +682,6 @@ struct uninitialized_array
   }
 };
 
-_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t align_to(size_t n, size_t align)
-{
-  return ((n + align - 1) / align) * align;
-}
-
 namespace host
 {
 inline cuda_optional<size_t> get_max_shared_memory_per_block()
@@ -753,9 +711,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage(
   return cub::AliasTemporaries(storage_ptr, storage_size, allocations, allocation_sizes);
 }
 
+} // namespace detail
 } // namespace core
-using core::sm52;
-using core::sm60;
 } // namespace cuda_cub
 
 THRUST_NAMESPACE_END
diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h
index 617eb8bbc79..b2124323424 100644
--- a/thrust/thrust/system/cuda/detail/extrema.h
+++ b/thrust/thrust/system/cuda/detail/extrema.h
@@ -184,10 +184,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   OutputIt output_it,
   cudaStream_t stream)
 {
-  using core::AgentLauncher;
-  using core::AgentPlan;
-  using core::cuda_optional;
-  using core::get_agent_plan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
+  using core::detail::cuda_optional;
+  using core::detail::get_agent_plan;
 
   using UnsignedSize = typename detail::make_unsigned_special<Size>::type;
 
@@ -204,7 +204,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
 
   if (num_items <= reduce_plan.items_per_tile)
   {
-    size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
+    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1);
 
     // small, single tile size
     if (d_temp_storage == nullptr)
@@ -221,7 +221,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   else
   {
     // regular size
-    cuda_optional<int> sm_count = core::get_sm_count();
+    cuda_optional<int> sm_count = core::detail::get_sm_count();
     CUDA_CUB_RET_IF_FAIL(sm_count.status());
 
     // reduction will not use more cta counts than requested
@@ -245,7 +245,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
     // we will launch at most "max_blocks" blocks in a grid
     // so preallocate virtual shared memory storage for this if required
     //
-    size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
+    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
 
     // Temporary storage allocation requirements
     void* allocations[3]       = {nullptr, nullptr, nullptr};
@@ -331,14 +331,14 @@ extrema(execution_policy<Derived>& policy, InputIt first, Size num_items, Binary
   void* allocations[2]       = {nullptr, nullptr};
 
   size_t storage_size = 0;
-  status              = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status              = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage");
 
   T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h
index 3787ab62367..61ec2086adf 100644
--- a/thrust/thrust/system/cuda/detail/reduce.h
+++ b/thrust/thrust/system/cuda/detail/reduce.h
@@ -109,7 +109,7 @@ template <class, class>
 struct Tuning;
 
 template <class T>
-struct Tuning<sm52, T>
+struct Tuning<core::detail::sm52, T>
 {
   enum
   {
@@ -155,7 +155,7 @@ struct ReduceAgent
     using tuning = Tuning<Arch, T>;
 
     using Vector      = typename cub::CubVector<T, PtxPlan::VECTOR_LOAD_LENGTH>;
-    using LoadIt      = typename core::LoadIterator<PtxPlan, InputIt>::type;
+    using LoadIt      = typename core::detail::LoadIterator<PtxPlan, InputIt>::type;
     using BlockReduce = cub::BlockReduce<T, PtxPlan::BLOCK_THREADS, PtxPlan::BLOCK_ALGORITHM, 1, 1, Arch::ver>;
 
     using VectorLoadIt = cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, Vector, Size>;
@@ -175,7 +175,7 @@ struct ReduceAgent
   // Other algorithms, e.g. merge, may not need additional information,
   // and may use AgentPlan directly, instead of defining their own Plan type.
   //
-  struct Plan : core::AgentPlan
+  struct Plan : core::detail::AgentPlan
   {
     cub::GridMappingStrategy grid_mapping;
 
@@ -183,7 +183,7 @@ struct ReduceAgent
 
     template <class P>
     THRUST_RUNTIME_FUNCTION Plan(P)
-        : core::AgentPlan(P())
+        : core::detail::AgentPlan(P())
         , grid_mapping(P::GRID_MAPPING)
     {}
   };
@@ -192,7 +192,7 @@ struct ReduceAgent
   // ptx_plan type *must* only be used from device code
   // Its use from host code will result in *undefined behaviour*
   //
-  using ptx_plan = typename core::specialize_plan_msvc10_war<PtxPlan>::type::type;
+  using ptx_plan = typename core::detail::specialize_plan_msvc10_war<PtxPlan>::type::type;
 
   using TempStorage  = typename ptx_plan::TempStorage;
   using Vector       = typename ptx_plan::Vector;
@@ -230,7 +230,7 @@ struct ReduceAgent
     THRUST_DEVICE_FUNCTION impl(TempStorage& storage_, InputIt input_it_, ReductionOp reduction_op_)
         : storage(storage_)
         , input_it(input_it_)
-        , load_it(core::make_load_iterator(ptx_plan(), input_it))
+        , load_it(core::detail::make_load_iterator(ptx_plan(), input_it))
         , reduction_op(reduction_op_)
     {}
 
@@ -428,8 +428,6 @@ struct ReduceAgent
     THRUST_DEVICE_FUNCTION T
     consume_tiles_impl(Size num_items, cub::GridQueue<UnsignedSize> queue, CAN_VECTORIZE can_vectorize)
     {
-      using core::sync_threadblock;
-
       // We give each thread block at least one tile of input.
       T thread_aggregate;
       Size block_offset    = blockIdx.x * ITEMS_PER_TILE;
@@ -454,7 +452,7 @@ struct ReduceAgent
           storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base;
         }
 
-        sync_threadblock();
+        __syncthreads();
 
         // Grab tile offset and check if we're done with full tiles
         block_offset = storage.dequeue_offset;
@@ -465,7 +463,7 @@ struct ReduceAgent
           consume_tile<false>(
             thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize);
 
-          sync_threadblock();
+          __syncthreads();
 
           // Dequeue a tile of items
           if (threadIdx.x == 0)
@@ -473,7 +471,7 @@ struct ReduceAgent
             storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base;
           }
 
-          sync_threadblock();
+          __syncthreads();
 
           // Grab tile offset and check if we're done with full tiles
           block_offset = storage.dequeue_offset;
@@ -586,7 +584,7 @@ struct DrainAgent
   template <class Arch>
   struct PtxPlan : PtxPolicy<1>
   {};
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -609,10 +607,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   OutputIt output_it,
   cudaStream_t stream)
 {
-  using core::AgentLauncher;
-  using core::AgentPlan;
-  using core::cuda_optional;
-  using core::get_agent_plan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
+  using core::detail::cuda_optional;
+  using core::detail::get_agent_plan;
 
   using UnsignedSize = typename detail::make_unsigned_special<Size>::type;
 
@@ -629,7 +627,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
 
   if (num_items <= reduce_plan.items_per_tile)
   {
-    size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
+    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1);
 
     // small, single tile size
     if (d_temp_storage == nullptr)
@@ -646,7 +644,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   else
   {
     // regular size
-    cuda_optional<int> sm_count = core::get_sm_count();
+    cuda_optional<int> sm_count = core::detail::get_sm_count();
     CUDA_CUB_RET_IF_FAIL(sm_count.status());
 
     // reduction will not use more cta counts than requested
@@ -670,7 +668,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
     // we will launch at most "max_blocks" blocks in a grid
     // so preallocate virtual shared memory storage for this if required
     //
-    size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
+    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
 
     // Temporary storage allocation requirements
     void* allocations[3]       = {nullptr, nullptr, nullptr};
@@ -755,14 +753,14 @@ reduce(execution_policy<Derived>& policy, InputIt first, Size num_items, T init,
   void* allocations[2]       = {nullptr, nullptr};
 
   size_t storage_size = 0;
-  status              = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status              = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
 
   T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h
index ae1f0ffab96..8c1db436085 100644
--- a/thrust/thrust/system/cuda/detail/reduce_by_key.h
+++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h
@@ -115,7 +115,7 @@ template <class Arch, class Key, class Value>
 struct Tuning;
 
 template <class Key, class Value>
-struct Tuning<sm52, Key, Value>
+struct Tuning<core::detail::sm52, Key, Value>
 {
   enum
   {
@@ -163,11 +163,11 @@ struct ReduceByKeyAgent
   {
     using tuning = Tuning<Arch, key_type, value_type>;
 
-    using KeysLoadIt   = typename core::LoadIterator<PtxPlan, KeysInputIt>::type;
-    using ValuesLoadIt = typename core::LoadIterator<PtxPlan, ValuesInputIt>::type;
+    using KeysLoadIt   = typename core::detail::LoadIterator<PtxPlan, KeysInputIt>::type;
+    using ValuesLoadIt = typename core::detail::LoadIterator<PtxPlan, ValuesInputIt>::type;
 
-    using BlockLoadKeys   = typename core::BlockLoad<PtxPlan, KeysLoadIt>::type;
-    using BlockLoadValues = typename core::BlockLoad<PtxPlan, ValuesLoadIt>::type;
+    using BlockLoadKeys   = typename core::detail::BlockLoad<PtxPlan, KeysLoadIt>::type;
+    using BlockLoadValues = typename core::detail::BlockLoad<PtxPlan, ValuesLoadIt>::type;
 
     using BlockDiscontinuityKeys = cub::BlockDiscontinuity<key_type, PtxPlan::BLOCK_THREADS, 1, 1, Arch::ver>;
 
@@ -188,11 +188,11 @@ struct ReduceByKeyAgent
       typename BlockLoadKeys::TempStorage load_keys;
       typename BlockLoadValues::TempStorage load_values;
 
-      core::uninitialized_array<key_value_pair_t, PtxPlan::ITEMS_PER_TILE + 1> raw_exchange;
+      core::detail::uninitialized_array<key_value_pair_t, PtxPlan::ITEMS_PER_TILE + 1> raw_exchange;
     }; // union TempStorage
   }; // struct PtxPlan
 
-  using ptx_plan = typename core::specialize_plan_msvc10_war<PtxPlan>::type::type;
+  using ptx_plan = typename core::detail::specialize_plan_msvc10_war<PtxPlan>::type::type;
 
   using KeysLoadIt             = typename ptx_plan::KeysLoadIt;
   using ValuesLoadIt           = typename ptx_plan::ValuesLoadIt;
@@ -360,9 +360,7 @@ struct ReduceByKeyAgent
       size_type num_tile_segments,
       size_type num_tile_segments_prefix)
     {
-      using core::sync_threadblock;
-
-      sync_threadblock();
+      __syncthreads();
 
       // Compact and scatter keys
 #  pragma unroll
@@ -375,7 +373,7 @@ struct ReduceByKeyAgent
         }
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
       {
@@ -445,8 +443,6 @@ struct ReduceByKeyAgent
     template <bool IS_LAST_TILE>
     THRUST_DEVICE_FUNCTION void consume_first_tile(Size num_remaining, Size tile_offset, ScanTileState& tile_state)
     {
-      using core::sync_threadblock;
-
       key_type keys[ITEMS_PER_THREAD]; // Tile keys
       key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor)
       value_type values[ITEMS_PER_THREAD]; // Tile values
@@ -468,7 +464,7 @@ struct ReduceByKeyAgent
         BlockLoadKeys(storage.load_keys).Load(keys_load_it + tile_offset, keys);
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       // Load values (last tile repeats final element)
       if (IS_LAST_TILE)
@@ -481,7 +477,7 @@ struct ReduceByKeyAgent
         BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values);
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       // Set head segment_flags.
       // First tile sets the first flag for the first item
@@ -540,8 +536,6 @@ struct ReduceByKeyAgent
     THRUST_DEVICE_FUNCTION void
     consume_subsequent_tile(Size num_remaining, int tile_idx, Size tile_offset, ScanTileState& tile_state)
     {
-      using core::sync_threadblock;
-
       key_type keys[ITEMS_PER_THREAD]; // Tile keys
       key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor)
       value_type values[ITEMS_PER_THREAD]; // Tile values
@@ -563,7 +557,7 @@ struct ReduceByKeyAgent
 
       key_type tile_pred_key = (threadIdx.x == 0) ? key_type(keys_load_it[tile_offset - 1]) : key_type();
 
-      sync_threadblock();
+      __syncthreads();
 
       // Load values (last tile repeats final element)
       if (IS_LAST_TILE)
@@ -576,7 +570,7 @@ struct ReduceByKeyAgent
         BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values);
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       // Set head segment_flags
       BlockDiscontinuityKeys(storage.scan_storage.discontinuity)
@@ -635,8 +629,8 @@ struct ReduceByKeyAgent
       int /*num_tiles*/,
       ScanTileState& tile_state)
         : storage(storage_)
-        , keys_load_it(core::make_load_iterator(ptx_plan(), keys_input_it_))
-        , values_load_it(core::make_load_iterator(ptx_plan(), values_input_it_))
+        , keys_load_it(core::detail::make_load_iterator(ptx_plan(), keys_input_it_))
+        , values_load_it(core::detail::make_load_iterator(ptx_plan(), values_input_it_))
         , keys_output_it(keys_output_it_)
         , values_output_it(values_output_it_)
         , num_runs_output_it(num_runs_output_it_)
@@ -703,7 +697,7 @@ struct InitAgent
   template <class Arch>
   struct PtxPlan : PtxPolicy<128>
   {};
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -740,8 +734,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step(
   Size num_items,
   cudaStream_t stream)
 {
-  using core::AgentLauncher;
-  using core::AgentPlan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
 
   cudaError_t status = cudaSuccess;
   if (num_items == 0)
@@ -762,7 +756,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step(
   int tile_size  = reduce_by_key_plan.items_per_tile;
   Size num_tiles = ::cuda::ceil_div(num_items, tile_size);
 
-  size_t vshmem_size = core::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles);
+  size_t vshmem_size = core::detail::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles);
 
   size_t allocation_sizes[2] = {9, vshmem_size};
   status                     = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
@@ -848,14 +842,14 @@ THRUST_RUNTIME_FUNCTION pair<KeysOutputIt, ValuesOutputIt> reduce_by_key_dispatc
   void* allocations[2]       = {nullptr, nullptr};
 
   size_t storage_size = 0;
-  status              = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status              = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
 
   Size* d_num_runs_out = thrust::detail::aligned_reinterpret_cast<Size*>(allocations[0]);
diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h
index 7a267080bf8..b336f8e55fa 100644
--- a/thrust/thrust/system/cuda/detail/set_operations.h
+++ b/thrust/thrust/system/cuda/detail/set_operations.h
@@ -222,7 +222,7 @@ struct Tuning;
 namespace mpl = thrust::detail::mpl::math;
 
 template <class T, class U>
-struct Tuning<sm52, T, U>
+struct Tuning<core::detail::sm52, T, U>
 {
   enum
   {
@@ -243,7 +243,7 @@ struct Tuning<sm52, T, U>
 }; // tuning sm52
 
 template <class T, class U>
-struct Tuning<sm60, T, U>
+struct Tuning<core::detail::sm60, T, U>
 {
   enum
   {
@@ -290,15 +290,15 @@ struct SetOpAgent
   {
     using tuning = Tuning<Arch, key_type, value_type>;
 
-    using KeysLoadIt1   = typename core::LoadIterator<PtxPlan, KeysIt1>::type;
-    using KeysLoadIt2   = typename core::LoadIterator<PtxPlan, KeysIt2>::type;
-    using ValuesLoadIt1 = typename core::LoadIterator<PtxPlan, ValuesIt1>::type;
-    using ValuesLoadIt2 = typename core::LoadIterator<PtxPlan, ValuesIt2>::type;
+    using KeysLoadIt1   = typename core::detail::LoadIterator<PtxPlan, KeysIt1>::type;
+    using KeysLoadIt2   = typename core::detail::LoadIterator<PtxPlan, KeysIt2>::type;
+    using ValuesLoadIt1 = typename core::detail::LoadIterator<PtxPlan, ValuesIt1>::type;
+    using ValuesLoadIt2 = typename core::detail::LoadIterator<PtxPlan, ValuesIt2>::type;
 
-    using BlockLoadKeys1   = typename core::BlockLoad<PtxPlan, KeysLoadIt1>::type;
-    using BlockLoadKeys2   = typename core::BlockLoad<PtxPlan, KeysLoadIt2>::type;
-    using BlockLoadValues1 = typename core::BlockLoad<PtxPlan, ValuesLoadIt1>::type;
-    using BlockLoadValues2 = typename core::BlockLoad<PtxPlan, ValuesLoadIt2>::type;
+    using BlockLoadKeys1   = typename core::detail::BlockLoad<PtxPlan, KeysLoadIt1>::type;
+    using BlockLoadKeys2   = typename core::detail::BlockLoad<PtxPlan, KeysLoadIt2>::type;
+    using BlockLoadValues1 = typename core::detail::BlockLoad<PtxPlan, ValuesLoadIt1>::type;
+    using BlockLoadValues2 = typename core::detail::BlockLoad<PtxPlan, ValuesLoadIt2>::type;
 
     using TilePrefixCallback = cub::TilePrefixCallbackOp<Size, ::cuda::std::plus<>, ScanTileState, Arch::ver>;
 
@@ -316,7 +316,7 @@ struct SetOpAgent
 
       struct LoadStorage
       {
-        core::uninitialized_array<int, PtxPlan::BLOCK_THREADS> offset;
+        core::detail::uninitialized_array<int, PtxPlan::BLOCK_THREADS> offset;
         union
         {
           // FIXME These don't appear to be used anywhere?
@@ -328,15 +328,15 @@ struct SetOpAgent
           // Allocate extra shmem than truly necessary
           // This will permit to avoid range checks in
           // serial set operations, e.g. serial_set_difference
-          core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS> keys_shared;
+          core::detail::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS> keys_shared;
 
-          core::uninitialized_array<value_type, PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS> values_shared;
+          core::detail::uninitialized_array<value_type, PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS> values_shared;
         }; // anon union
       } load_storage; // struct LoadStorage
     }; // union TempStorage
   }; // struct PtxPlan
 
-  using ptx_plan = typename core::specialize_plan_msvc10_war<PtxPlan>::type::type;
+  using ptx_plan = typename core::detail::specialize_plan_msvc10_war<PtxPlan>::type::type;
 
   using KeysLoadIt1   = typename ptx_plan::KeysLoadIt1;
   using KeysLoadIt2   = typename ptx_plan::KeysLoadIt2;
@@ -441,8 +441,6 @@ struct SetOpAgent
       Size tile_output_prefix,
       int tile_output_count)
     {
-      using core::sync_threadblock;
-
       int local_scatter_idx = thread_output_prefix - tile_output_prefix;
 #  pragma unroll
       for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
@@ -452,7 +450,7 @@ struct SetOpAgent
           shared[local_scatter_idx++] = input[ITEM];
         }
       }
-      sync_threadblock();
+      __syncthreads();
 
       for (int item = threadIdx.x; item < tile_output_count; item += BLOCK_THREADS)
       {
@@ -483,8 +481,7 @@ struct SetOpAgent
     template <bool IS_LAST_TILE>
     void THRUST_DEVICE_FUNCTION consume_tile(Size tile_idx)
     {
-      using core::sync_threadblock;
-      using core::uninitialized_array;
+      using core::detail::uninitialized_array;
 
       pair<Size, Size> partition_beg = partitions[tile_idx + 0];
       pair<Size, Size> partition_end = partitions[tile_idx + 1];
@@ -506,7 +503,7 @@ struct SetOpAgent
 
       reg_to_shared(&storage.load_storage.keys_shared[0], keys_loc);
 
-      sync_threadblock();
+      __syncthreads();
 
       int diag_loc = min<int>(ITEMS_PER_THREAD * threadIdx.x, num_keys1 + num_keys2);
 
@@ -529,7 +526,7 @@ struct SetOpAgent
       int dst                          = threadIdx.x == 0 ? BLOCK_THREADS - 1 : threadIdx.x - 1;
       storage.load_storage.offset[dst] = value;
 
-      core::sync_threadblock();
+      __syncthreads();
 
       pair<int, int> partition1_loc = thrust::make_pair(
         storage.load_storage.offset[threadIdx.x] >> 16, storage.load_storage.offset[threadIdx.x] & 0xFFFF);
@@ -554,7 +551,7 @@ struct SetOpAgent
         indices,
         compare_op,
         set_op);
-      sync_threadblock();
+      __syncthreads();
 #  if 0
         if (ITEMS_PER_THREAD*threadIdx.x >= num_keys1 + num_keys2)
           active_mask = 0;
@@ -588,7 +585,7 @@ struct SetOpAgent
         tile_output_prefix = prefix_cb.GetExclusivePrefix();
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       // scatter results
       //
@@ -605,11 +602,11 @@ struct SetOpAgent
         value_type values_loc[ITEMS_PER_THREAD];
         gmem_to_reg<!IS_LAST_TILE>(values_loc, values1_in + keys1_beg, values2_in + keys2_beg, num_keys1, num_keys2);
 
-        sync_threadblock();
+        __syncthreads();
 
         reg_to_shared(&storage.load_storage.values_shared[0], values_loc);
 
-        sync_threadblock();
+        __syncthreads();
 
         // gather items from shared mem
         //
@@ -622,7 +619,7 @@ struct SetOpAgent
           }
         }
 
-        sync_threadblock();
+        __syncthreads();
 
         scatter(values_out,
                 values_loc,
@@ -660,10 +657,10 @@ struct SetOpAgent
          std::size_t* output_count_)
         : storage(storage_)
         , tile_state(tile_state_)
-        , keys1_in(core::make_load_iterator(ptx_plan(), keys1_))
-        , keys2_in(core::make_load_iterator(ptx_plan(), keys2_))
-        , values1_in(core::make_load_iterator(ptx_plan(), values1_))
-        , values2_in(core::make_load_iterator(ptx_plan(), values2_))
+        , keys1_in(core::detail::make_load_iterator(ptx_plan(), keys1_))
+        , keys2_in(core::detail::make_load_iterator(ptx_plan(), keys2_))
+        , values1_in(core::detail::make_load_iterator(ptx_plan(), values1_))
+        , values2_in(core::detail::make_load_iterator(ptx_plan(), values2_))
         , keys1_count(keys1_count_)
         , keys2_count(keys2_count_)
         , keys_out(keys_out_)
@@ -733,7 +730,7 @@ struct PartitionAgent
   struct PtxPlan : PtxPolicy<256>
   {};
 
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -767,7 +764,7 @@ struct InitAgent
   struct PtxPlan : PtxPolicy<128>
   {};
 
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -1058,8 +1055,8 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
 
   cudaError_t status = cudaSuccess;
 
-  using core::AgentLauncher;
-  using core::AgentPlan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
 
   using set_op_agent = AgentLauncher<
     SetOpAgent<KeysIt1, KeysIt2, ValuesIt1, ValuesIt2, KeysOutputIt, ValuesOutputIt, Size, CompareOp, SetOp, HAS_VALUES>>;
@@ -1080,13 +1077,13 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), tile_agent_storage);
   CUDA_CUB_RET_IF_FAIL(status);
 
-  size_t vshmem_storage          = core::vshmem_size(set_op_plan.shared_memory_size, num_tiles);
+  size_t vshmem_storage          = core::detail::vshmem_size(set_op_plan.shared_memory_size, num_tiles);
   size_t partition_agent_storage = (num_tiles + 1) * sizeof(Size) * 2;
 
   void* allocations[3]       = {nullptr, nullptr, nullptr};
   size_t allocation_sizes[3] = {tile_agent_storage, partition_agent_storage, vshmem_storage};
 
-  status = core::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes);
   CUDA_CUB_RET_IF_FAIL(status);
 
   if (d_temp_storage == nullptr)
@@ -1192,14 +1189,14 @@ THRUST_RUNTIME_FUNCTION pair<KeysOutputIt, ValuesOutputIt> set_operations(
 
   size_t storage_size = 0;
 
-  status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "set_operations failed on 1st alias_storage");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage");
 
   std::size_t* d_output_count = thrust::detail::aligned_reinterpret_cast<std::size_t*>(allocations[0]);
diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h
index 2c3ef85202d..7ad67fd4e0c 100644
--- a/thrust/thrust/system/cuda/detail/sort.h
+++ b/thrust/thrust/system/cuda/detail/sort.h
@@ -58,6 +58,8 @@
 #  include <thrust/system/cuda/detail/util.h>
 #  include <thrust/type_traits/is_contiguous_iterator.h>
 
+#  include <cuda/cmath>
+
 #  include <cstdint>
 
 #  if defined(_CCCL_HAS_NVFP16)
@@ -277,8 +279,8 @@ THRUST_RUNTIME_FUNCTION void radix_sort(execution_policy<Derived>& policy, Key*
     dispatch<SORT_ITEMS, CompareOp>::doit(nullptr, temp_storage_bytes, keys_buffer, items_buffer, keys_count, stream);
   cuda_cub::throw_on_error(status, "radix_sort: failed on 1st step");
 
-  size_t keys_temp_storage  = core::align_to(sizeof(Key) * keys_count, 128);
-  size_t items_temp_storage = core::align_to(sizeof(Item) * items_count, 128);
+  size_t keys_temp_storage  = ::cuda::round_up(sizeof(Key) * keys_count, 128);
+  size_t items_temp_storage = ::cuda::round_up(sizeof(Item) * items_count, 128);
 
   size_t storage_size = keys_temp_storage + items_temp_storage + temp_storage_bytes;
 
diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h
index ac94017758b..1d39b161866 100644
--- a/thrust/thrust/system/cuda/detail/unique.h
+++ b/thrust/thrust/system/cuda/detail/unique.h
@@ -123,7 +123,7 @@ struct items_per_thread
 };
 
 template <class T>
-struct Tuning<sm52, T>
+struct Tuning<core::detail::sm52, T>
 {
   const static int INPUT_SIZE = sizeof(T);
   enum
@@ -149,16 +149,16 @@ struct UniqueAgent
   {
     using tuning = Tuning<Arch, item_type>;
 
-    using ItemsLoadIt = typename core::LoadIterator<PtxPlan, ItemsIt>::type;
+    using ItemsLoadIt = typename core::detail::LoadIterator<PtxPlan, ItemsIt>::type;
 
-    using BlockLoadItems = typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type;
+    using BlockLoadItems = typename core::detail::BlockLoad<PtxPlan, ItemsLoadIt>::type;
 
     using BlockDiscontinuityItems = cub::BlockDiscontinuity<item_type, PtxPlan::BLOCK_THREADS, 1, 1, Arch::ver>;
 
     using TilePrefixCallback = cub::TilePrefixCallbackOp<Size, ::cuda::std::plus<>, ScanTileState, Arch::ver>;
     using BlockScan          = cub::BlockScan<Size, PtxPlan::BLOCK_THREADS, PtxPlan::SCAN_ALGORITHM, 1, 1, Arch::ver>;
 
-    using shared_items_t = core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE>;
+    using shared_items_t = core::detail::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE>;
 
     union TempStorage
     {
@@ -175,7 +175,7 @@ struct UniqueAgent
     }; // union TempStorage
   }; // struct PtxPlan
 
-  using ptx_plan = typename core::specialize_plan_msvc10_war<PtxPlan>::type::type;
+  using ptx_plan = typename core::detail::specialize_plan_msvc10_war<PtxPlan>::type::type;
 
   using ItemsLoadIt             = typename ptx_plan::ItemsLoadIt;
   using BlockLoadItems          = typename ptx_plan::BlockLoadItems;
@@ -224,8 +224,6 @@ struct UniqueAgent
       Size num_selections_prefix,
       Size /*num_selections*/)
     {
-      using core::sync_threadblock;
-
 #  pragma unroll
       for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
       {
@@ -236,14 +234,14 @@ struct UniqueAgent
         }
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
       {
         items_out[num_selections_prefix + item] = get_shared()[item];
       }
 
-      sync_threadblock();
+      __syncthreads();
     }
 
     //---------------------------------------------------------------------
@@ -253,8 +251,7 @@ struct UniqueAgent
     template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
     Size THRUST_DEVICE_FUNCTION consume_tile_impl(int num_tile_items, int tile_idx, Size tile_base)
     {
-      using core::sync_threadblock;
-      using core::uninitialized_array;
+      using core::detail::uninitialized_array;
 
       item_type items_loc[ITEMS_PER_THREAD];
       Size selection_flags[ITEMS_PER_THREAD];
@@ -270,7 +267,7 @@ struct UniqueAgent
         BlockLoadItems(temp_storage.load_items).Load(items_in + tile_base, items_loc);
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       if (IS_FIRST_TILE)
       {
@@ -294,7 +291,7 @@ struct UniqueAgent
         }
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       Size num_tile_selections   = 0;
       Size num_selections        = 0;
@@ -337,7 +334,7 @@ struct UniqueAgent
         }
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       scatter(items_loc,
               selection_flags,
@@ -420,7 +417,7 @@ struct UniqueAgent
 
     impl(storage,
          tile_state,
-         core::make_load_iterator(ptx_plan(), items_in),
+         core::detail::make_load_iterator(ptx_plan(), items_in),
          items_out,
          binary_pred,
          num_items,
@@ -435,7 +432,7 @@ struct InitAgent
   template <class Arch>
   struct PtxPlan : PtxPolicy<128>
   {};
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -463,9 +460,9 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   Size num_items,
   cudaStream_t stream)
 {
-  using core::AgentLauncher;
-  using core::AgentPlan;
-  using core::get_agent_plan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
+  using core::detail::get_agent_plan;
 
   using unique_agent = AgentLauncher<UniqueAgent<ItemsInputIt, ItemsOutputIt, BinaryPred, Size, NumSelectedOutIt>>;
 
@@ -473,14 +470,14 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
 
   using init_agent = AgentLauncher<InitAgent<ScanTileState, NumSelectedOutIt, Size>>;
 
-  using core::get_plan;
+  using core::detail::get_plan;
   typename get_plan<init_agent>::type init_plan     = init_agent::get_plan();
   typename get_plan<unique_agent>::type unique_plan = unique_agent::get_plan(stream);
 
   int tile_size    = unique_plan.items_per_tile;
   size_t num_tiles = ::cuda::ceil_div(num_items, tile_size);
 
-  size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size, num_tiles);
+  size_t vshmem_size = core::detail::vshmem_size(unique_plan.shared_memory_size, num_tiles);
 
   cudaError_t status         = cudaSuccess;
   size_t allocation_sizes[2] = {0, vshmem_size};
@@ -550,14 +547,14 @@ THRUST_RUNTIME_FUNCTION ItemsOutputIt unique(
   void* allocations[2]       = {nullptr, nullptr};
 
   size_t storage_size = 0;
-  status              = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status              = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "unique: failed on 1st step");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "unique: failed on 2nd step");
 
   size_type* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);