From 8d05f7abeb2e83055d69f7c1bf65d2e7cb14f646 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 10 Jan 2025 22:26:04 +0100
Subject: [PATCH] Refactor CUB's util_debug

---
 cub/cub/detail/device_synchronize.cuh         | 54 -----------
 .../dispatch/dispatch_adjacent_difference.cuh |  8 +-
 .../device/dispatch/dispatch_batch_memcpy.cuh |  6 +-
 cub/cub/device/dispatch/dispatch_for.cuh      |  4 +-
 .../dispatch/dispatch_for_each_in_extents.cuh |  4 +-
 .../device/dispatch/dispatch_histogram.cuh    |  8 +-
 .../device/dispatch/dispatch_radix_sort.cuh   | 20 ++--
 cub/cub/device/dispatch/dispatch_reduce.cuh   | 16 ++--
 .../dispatch/dispatch_reduce_by_key.cuh       |  8 +-
 cub/cub/device/dispatch/dispatch_rle.cuh      |  8 +-
 cub/cub/device/dispatch/dispatch_scan.cuh     |  8 +-
 .../device/dispatch/dispatch_scan_by_key.cuh  |  8 +-
 .../dispatch/dispatch_segmented_sort.cuh      | 12 +--
 .../device/dispatch/dispatch_select_if.cuh    |  4 +-
 .../device/dispatch/dispatch_spmv_orig.cuh    | 20 ++--
 .../dispatch/dispatch_three_way_partition.cuh |  8 +-
 .../device/dispatch/dispatch_transform.cuh    | 14 +--
 .../dispatch/dispatch_unique_by_key.cuh       |  8 +-
 cub/cub/util_allocator.cuh                    | 18 ++--
 cub/cub/util_debug.cuh                        | 93 +++----------------
 cub/cub/util_device.cuh                       | 59 +++---------
 cub/test/CMakeLists.txt                       |  2 +-
 .../system/cuda/detail/core/agent_launcher.h  |  2 -
 thrust/thrust/system/cuda/detail/util.h       |  1 -
 24 files changed, 115 insertions(+), 278 deletions(-)
 delete mode 100644 cub/cub/detail/device_synchronize.cuh

diff --git a/cub/cub/detail/device_synchronize.cuh b/cub/cub/detail/device_synchronize.cuh
deleted file mode 100644
index 1d71c6ebc0d..00000000000
--- a/cub/cub/detail/device_synchronize.cuh
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- *  Copyright 2021 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <cub/config.cuh>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include <cuda_runtime_api.h>
-
-#include <cub/detail/detect_cuda_runtime.cuh>
-
-#include <nv/target>
-
-CUB_NAMESPACE_BEGIN
-
-namespace detail
-{
-
-/**
- * Call `cudaDeviceSynchronize()` using the proper API for the current CUB and
- * CUDA configuration.
- */
-_CCCL_EXEC_CHECK_DISABLE
-CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
-{
-  cudaError_t result = cudaErrorNotSupported;
-  NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), ());
-  return result;
-}
-
-} // namespace detail
-
-CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index a8c733ef309..3e8184e4c2f 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -189,13 +189,13 @@ struct DispatchAdjacentDifference
         constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS;
         const int init_grid_size      = ::cuda::ceil_div(num_tiles, init_block_size);
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("Invoking DeviceAdjacentDifferenceInitKernel"
                 "<<<%d, %d, 0, %lld>>>()\n",
                 init_grid_size,
                 init_block_size,
                 reinterpret_cast<long long>(stream));
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
         THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream)
           .doit(DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT, InputIteratorT, InputT, OffsetT>,
@@ -219,13 +219,13 @@ struct DispatchAdjacentDifference
         }
       }
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel"
               "<<<%d, %d, 0, %lld>>>()\n",
               num_tiles,
               AdjacentDifferencePolicyT::BLOCK_THREADS,
               reinterpret_cast<long long>(stream));
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
         num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream)
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index 287f702b095..68726772b54 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -536,7 +536,7 @@ struct DispatchBatchMemcpy
       return error;
     }
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog("Invoking "
             "InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n",
             static_cast<int>(init_grid_size),
@@ -564,7 +564,7 @@ struct DispatchBatchMemcpy
       return error;
     }
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog("Invoking "
             "BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
             static_cast<int>(batch_memcpy_grid_size),
@@ -603,7 +603,7 @@ struct DispatchBatchMemcpy
       return error;
     }
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog("Invoking "
             "MultiBlockBatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
             static_cast<int>(batch_memcpy_blev_grid_size),
diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 7ba478e3c00..453259b1896 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -101,7 +101,7 @@ struct dispatch_t
     const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
     const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), "
             "%d items per thread\n",
             static_cast<int>(num_tiles),
@@ -144,7 +144,7 @@ struct dispatch_t
     const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
     const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), "
             "%d items per thread\n",
             static_cast<int>(num_tiles),
diff --git a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
index 6e346316d48..4375fac98dd 100644
--- a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
+++ b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
@@ -117,7 +117,7 @@ public:
     constexpr unsigned items_per_thread = ActivePolicyT::for_policy_t::items_per_thread;
     unsigned num_cta                    = ::cuda::ceil_div(_size, block_threads * items_per_thread);
 
-#  ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#  ifdef CUB_DEBUG_LOG
     _CubLog("Invoking detail::for_each_in_extents::static_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n",
             num_cta,
             block_threads,
@@ -155,7 +155,7 @@ public:
     _CUB_RETURN_IF_ERROR(status)
     unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread);
 
-#  ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#  ifdef CUB_DEBUG_LOG
     _CubLog("Invoking detail::for_each_in_extents::dynamic_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n",
             num_cta,
             block_threads,
diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
index 900f758cdfb..abcafe9d538 100644
--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -434,12 +434,12 @@ struct dispatch_histogram
         (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
 
 // Log DeviceHistogramInitKernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
               histogram_init_grid_dims,
               histogram_init_block_threads,
               (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke histogram_init_kernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
@@ -453,7 +453,7 @@ struct dispatch_histogram
       }
 
 // Log histogram_sweep_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels "
               "per thread, %d SM occupancy\n",
               sweep_grid_dims.x,
@@ -463,7 +463,7 @@ struct dispatch_histogram
               (long long) stream,
               pixels_per_thread,
               histogram_sweep_sm_occupancy);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke histogram_sweep_kernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream)
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index c533afa1243..7c738fbdbb2 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -965,7 +965,7 @@ struct DispatchRadixSort
       }
 
 // Log single_tile_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit "
               "%d, bit_grain %d\n",
               1,
@@ -1036,7 +1036,7 @@ struct DispatchRadixSort
       int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
 
 // Log upsweep_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, "
               "bit_grain %d\n",
               pass_config.even_share.grid_size,
@@ -1078,7 +1078,7 @@ struct DispatchRadixSort
       }
 
 // Log scan_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
               1,
               pass_config.scan_config.block_threads,
@@ -1105,7 +1105,7 @@ struct DispatchRadixSort
       }
 
 // Log downsweep_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
               pass_config.even_share.grid_size,
               pass_config.downsweep_config.block_threads,
@@ -1305,7 +1305,7 @@ struct DispatchRadixSort
       }
 
 // log histogram_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, "
               "%d SM occupancy, bit_grain %d\n",
               histo_blocks_per_sm * num_sms,
@@ -1335,7 +1335,7 @@ struct DispatchRadixSort
       constexpr int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS;
 
 // log exclusive_sum_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n",
               num_passes,
               SCAN_BLOCK_THREADS,
@@ -1383,7 +1383,7 @@ struct DispatchRadixSort
           }
 
 // log onesweep_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
           _CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, "
                   "current bit %d, bit_grain %d, portion %d/%d\n",
                   num_blocks,
@@ -1672,7 +1672,7 @@ struct DispatchRadixSort
     }
 
 // Copy keys
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long) num_items, (long long) stream);
 #endif
     cudaError_t error = cudaSuccess;
@@ -1694,7 +1694,7 @@ struct DispatchRadixSort
     // Copy values if necessary
     if (!KEYS_ONLY)
     {
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking async copy of %lld values on stream %lld\n", (long long) num_items, (long long) stream);
 #endif
       error = CubDebug(cudaMemcpyAsync(
@@ -2001,7 +2001,7 @@ struct DispatchSegmentedRadixSort
       int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
 
 // Log kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), "
               "%lld items per thread, %lld SM occupancy, "
               "current bit %d, bit_grain %d\n",
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index 0cca1e1a982..c967cf09f6a 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -378,13 +378,13 @@ struct DispatchReduce
       }
 
 // Log single_reduce_sweep_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
               "%d items per thread\n",
               policy.SingleTile().BlockThreads(),
               (long long) stream,
               policy.SingleTile().ItemsPerThread());
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke single_reduce_sweep_kernel
       launcher_factory(1, policy.SingleTile().BlockThreads(), 0, stream)
@@ -490,7 +490,7 @@ struct DispatchReduce
       int reduce_grid_size = even_share.grid_size;
 
 // Log device_reduce_sweep_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items "
               "per thread, %d SM occupancy\n",
               (unsigned long) reduce_grid_size,
@@ -498,7 +498,7 @@ struct DispatchReduce
               (long long) stream,
               active_policy.Reduce().ItemsPerThread(),
               reduce_config.sm_occupancy);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke DeviceReduceKernel
       launcher_factory(reduce_grid_size, active_policy.Reduce().BlockThreads(), 0, stream)
@@ -519,13 +519,13 @@ struct DispatchReduce
       }
 
 // Log single_reduce_sweep_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
               "%d items per thread\n",
               active_policy.SingleTile().BlockThreads(),
               (long long) stream,
               active_policy.SingleTile().ItemsPerThread());
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke DeviceReduceSingleTileKernel
       launcher_factory(1, active_policy.SingleTile().BlockThreads(), 0, stream)
@@ -881,7 +881,7 @@ struct DispatchSegmentedReduce
       }
 
 // Log device_reduce_sweep_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), "
               "%d items per thread, %d SM occupancy\n",
               num_segments,
@@ -889,7 +889,7 @@ struct DispatchSegmentedReduce
               (long long) stream,
               ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
               segmented_reduce_config.sm_occupancy);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke DeviceReduceKernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 804371588f3..3c0fd4b424e 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -341,9 +341,9 @@ struct DispatchReduceByKey
       // Log init_kernel configuration
       int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS));
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke init_kernel to initialize tile descriptors
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
@@ -391,7 +391,7 @@ struct DispatchReduceByKey
       for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
       {
 // Log reduce_by_key_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d "
                 "items per thread, %d SM occupancy\n",
                 start_tile,
@@ -400,7 +400,7 @@ struct DispatchReduceByKey
                 (long long) stream,
                 items_per_thread,
                 reduce_by_key_sm_occupancy);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
         // Invoke reduce_by_key_kernel
         THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh
index b1542462a58..1dea6c217ad 100644
--- a/cub/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -349,12 +349,12 @@ struct DeviceRleDispatch
       // Log device_scan_init_kernel configuration
       int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS));
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n",
               init_grid_size,
               INIT_KERNEL_THREADS,
               (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
@@ -405,7 +405,7 @@ struct DeviceRleDispatch
       scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
 
 // Log device_rle_sweep_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per "
               "thread, %d SM occupancy\n",
               scan_grid_size.x,
@@ -415,7 +415,7 @@ struct DeviceRleDispatch
               (long long) stream,
               items_per_thread,
               device_rle_kernel_sm_occupancy);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke device_rle_sweep_kernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 4db31cf6989..f686c20c2fa 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -260,9 +260,9 @@ struct DispatchScan
       // Log init_kernel configuration
       int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS);
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke init_kernel to initialize tile descriptors
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
@@ -305,7 +305,7 @@ struct DispatchScan
       for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
       {
 // Log scan_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items "
                 "per thread, %d SM occupancy\n",
                 start_tile,
@@ -314,7 +314,7 @@ struct DispatchScan
                 (long long) stream,
                 Policy::ITEMS_PER_THREAD,
                 scan_sm_occupancy);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
         // Invoke scan_kernel
         THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index c88656dff48..e8676383e5f 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -406,9 +406,9 @@ struct DispatchScanByKey
 
       // Log init_kernel configuration
       int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS);
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke init_kernel to initialize tile descriptors
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
@@ -441,7 +441,7 @@ struct DispatchScanByKey
       for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
       {
 // Log scan_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items "
                 "per thread\n",
                 start_tile,
@@ -449,7 +449,7 @@ struct DispatchScanByKey
                 Policy::BLOCK_THREADS,
                 (long long) stream,
                 Policy::ITEMS_PER_THREAD);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
         // Invoke scan_kernel
         THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
index 81381e5dad8..05d405b2bf8 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -593,13 +593,13 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont
     // One CTA per segment
     const local_segment_index_t blocks_in_grid = large_segments;
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog("Invoking "
             "DeviceSegmentedSortKernelLarge<<<%d, %d, 0, %lld>>>()\n",
             static_cast<int>(blocks_in_grid),
             LargeSegmentPolicyT::BLOCK_THREADS,
             (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
     THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
       blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream)
@@ -643,13 +643,13 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont
 
   if (small_and_medium_blocks_in_grid)
   {
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog("Invoking "
             "DeviceSegmentedSortKernelSmall<<<%d, %d, 0, %lld>>>()\n",
             static_cast<int>(small_and_medium_blocks_in_grid),
             SmallAndMediumPolicyT::BLOCK_THREADS,
             (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
     THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
       small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream)
@@ -1368,7 +1368,7 @@ private:
     constexpr auto threads_in_block = static_cast<unsigned int>(LargeSegmentPolicyT::BLOCK_THREADS);
 
 // Log kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog("Invoking DeviceSegmentedSortFallbackKernel<<<%d, %d, "
             "0, %lld>>>(), %d items per thread, bit_grain %d\n",
             blocks_in_grid,
@@ -1376,7 +1376,7 @@ private:
             (long long) stream,
             LargeSegmentPolicyT::ITEMS_PER_THREAD,
             LargeSegmentPolicyT::RADIX_BITS);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
     // Invoke fallback kernel
     THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream)
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index c41dfb389eb..19d161a9c7b 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -660,7 +660,7 @@ struct DispatchSelectIf
         // Log scan_init_kernel configuration
         int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS));
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n",
                 init_grid_size,
                 INIT_KERNEL_THREADS,
@@ -693,7 +693,7 @@ struct DispatchSelectIf
         }
 
 // Log select_if_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         {
           // Get SM occupancy for select_if_kernel
           int range_select_sm_occupancy;
diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
index d659afa0ce9..24ef2845dee 100644
--- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -636,12 +636,12 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
         constexpr int threads_in_block = EMPTY_MATRIX_KERNEL_THREADS;
         const int blocks_in_grid       = ::cuda::ceil_div(spmv_params.num_rows, threads_in_block);
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("Invoking spmv_empty_matrix_kernel<<<%d, %d, 0, %lld>>>()\n",
                 blocks_in_grid,
                 threads_in_block,
                 (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
         error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream)
                   .doit(spmv_empty_matrix_kernel, spmv_params);
 
@@ -673,12 +673,12 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
         int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
         int degen_col_kernel_grid_size  = ::cuda::ceil_div(spmv_params.num_rows, degen_col_kernel_block_size);
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
                 degen_col_kernel_grid_size,
                 degen_col_kernel_block_size,
                 (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
         // Invoke spmv_search_kernel
         THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
@@ -800,12 +800,12 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
 // Use separate search kernel if we have enough spmv tiles to saturate the device
 
 // Log spmv_search_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
                 search_grid_size,
                 search_block_size,
                 (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
         // Invoke spmv_search_kernel
         THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream)
@@ -826,7 +826,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
       }
 
 // Log spmv_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
               spmv_grid_size.x,
               spmv_grid_size.y,
@@ -835,7 +835,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
               (long long) stream,
               spmv_config.items_per_thread,
               spmv_sm_occupancy);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke spmv_kernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream)
@@ -864,7 +864,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
       if (num_merge_tiles > 1)
       {
 // Log segment_fixup_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                 segment_fixup_grid_size.x,
                 segment_fixup_grid_size.y,
@@ -873,7 +873,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv
                 (long long) stream,
                 segment_fixup_config.items_per_thread,
                 segment_fixup_sm_occupancy);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
         // Invoke segment_fixup_kernel
         THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
index 2d5566d76a3..5ca53eb1d5c 100644
--- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
@@ -387,12 +387,12 @@ struct DispatchThreeWayPartitionIf
       // Log three_way_partition_init_kernel configuration
       int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS));
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking three_way_partition_init_kernel<<<%d, %d, 0, %lld>>>()\n",
               init_grid_size,
               INIT_KERNEL_THREADS,
               reinterpret_cast<long long>(stream));
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke three_way_partition_init_kernel to initialize tile descriptors
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
@@ -420,7 +420,7 @@ struct DispatchThreeWayPartitionIf
       }
 
 // Log select_if_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       {
         // Get SM occupancy for select_if_kernel
         int range_select_sm_occupancy;
@@ -440,7 +440,7 @@ struct DispatchThreeWayPartitionIf
                 items_per_thread,
                 range_select_sm_occupancy);
       }
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke select_if_kernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream)
diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh
index fa4fa80d0ef..fb4b693c9da 100644
--- a/cub/cub/device/dispatch/dispatch_transform.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform.cuh
@@ -688,10 +688,8 @@ struct dispatch_t<RequiresStableAddress,
         const int smem_size = bulk_copy_smem_for_tile_size<RandomAccessIteratorsIn...>(tile_size);
         if (smem_size > *max_smem)
         {
-#  ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS
           // assert should be prevented by smem check in policy
-          assert(last_counts.elem_per_thread > 0 && "min_items_per_thread exceeds available shared memory");
-#  endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS
+          _CCCL_ASSERT_HOST(last_counts.elem_per_thread > 0, "min_items_per_thread exceeds available shared memory");
           return last_counts;
         }
 
@@ -729,12 +727,10 @@ struct dispatch_t<RequiresStableAddress,
     {
       return config.error;
     }
-#  ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS
-    assert(config->elem_per_thread > 0);
-    assert(config->tile_size > 0);
-    assert(config->tile_size % bulk_copy_alignment == 0);
-    assert((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0)); // logical xor
-#  endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS
+    _CCCL_ASSERT_HOST(config->elem_per_thread > 0, "");
+    _CCCL_ASSERT_HOST(config->tile_size > 0, "");
+    _CCCL_ASSERT_HOST(config->tile_size % bulk_copy_alignment == 0, "");
+    _CCCL_ASSERT_HOST((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0), ""); // logical xor
 
     const auto grid_dim = static_cast<unsigned int>(::cuda::ceil_div(num_items, Offset{config->tile_size}));
     return ::cuda::std::make_tuple(
diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
index e07084fe24a..69e4c263b50 100644
--- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
@@ -396,9 +396,9 @@ struct DispatchUniqueByKey
       num_tiles          = CUB_MAX(1, num_tiles);
       int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS);
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke init_kernel to initialize tile descriptors
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
@@ -439,7 +439,7 @@ struct DispatchUniqueByKey
       scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
 
 // Log select_if_kernel configuration
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       {
         // Get SM occupancy for unique_by_key_kernel
         int scan_sm_occupancy;
@@ -461,7 +461,7 @@ struct DispatchUniqueByKey
                 items_per_thread,
                 scan_sm_occupancy);
       }
-#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
+#endif // CUB_DEBUG_LOG
 
       // Invoke select_if_kernel
       error =
diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh
index a5ce583a1cc..524217c70ea 100644
--- a/cub/cub/util_allocator.cuh
+++ b/cub/cub/util_allocator.cuh
@@ -416,7 +416,7 @@ struct CachingDeviceAllocator
     // Lock
     mutex.lock();
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     _CubLog(
       "Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes_);
 #endif
@@ -527,7 +527,7 @@ struct CachingDeviceAllocator
           cached_bytes[device].free -= search_key.bytes;
           cached_bytes[device].live += search_key.bytes;
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
           _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with "
                   "stream %lld).\n",
                   device,
@@ -572,7 +572,7 @@ struct CachingDeviceAllocator
       if (error == cudaErrorMemoryAllocation)
       {
         // The allocation attempt failed: free all cached blocks on device and retry
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
                 device,
                 (long long) search_key.bytes,
@@ -611,7 +611,7 @@ struct CachingDeviceAllocator
           // Reduce balance and erase entry
           cached_bytes[device].free -= block_itr->bytes;
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
           _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks "
                   "(%lld bytes) outstanding.\n",
                   device,
@@ -656,7 +656,7 @@ struct CachingDeviceAllocator
       cached_bytes[device].live += search_key.bytes;
       mutex.unlock();
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
               device,
               search_key.d_ptr,
@@ -678,7 +678,7 @@ struct CachingDeviceAllocator
     // Copy device pointer to output parameter
     *d_ptr = search_key.d_ptr;
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
     if (debug)
     {
       _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
@@ -761,7 +761,7 @@ struct CachingDeviceAllocator
         cached_blocks.insert(search_key);
         cached_bytes[device].free += search_key.bytes;
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
         _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld "
                 "bytes), %lld live blocks outstanding. (%lld bytes)\n",
                 device,
@@ -819,7 +819,7 @@ struct CachingDeviceAllocator
         return error;
       }
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld "
               "bytes), %lld live blocks (%lld bytes) outstanding.\n",
               device,
@@ -914,7 +914,7 @@ struct CachingDeviceAllocator
       cached_bytes[current_device].free -= block_bytes;
       cached_blocks.erase(begin);
 
-#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+#ifdef CUB_DEBUG_LOG
       _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
               "bytes) outstanding.\n",
               current_device,
diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh
index 275c915e8f2..099408897ad 100644
--- a/cub/cub/util_debug.cuh
+++ b/cub/cub/util_debug.cuh
@@ -66,22 +66,6 @@
  */
 #  define CUB_DEBUG_SYNC
 
-/**
- * @def CUB_DEBUG_HOST_ASSERTIONS
- *
- * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition
- * assertions.
- */
-#  define CUB_DEBUG_HOST_ASSERTIONS
-
-/**
- * @def CUB_DEBUG_DEVICE_ASSERTIONS
- *
- * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side
- * precondition assertions.
- */
-#  define CUB_DEBUG_DEVICE_ASSERTIONS
-
 /**
  * @def CUB_DEBUG_ALL
  *
@@ -94,80 +78,29 @@
 
 #endif // _CCCL_DOXYGEN_INVOKED
 
-// `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only:
-
-#define CUB_DETAIL_DEBUG_LEVEL_NONE                 0
-#define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY 1
-#define CUB_DETAIL_DEBUG_LEVEL_LOG                  2
-#define CUB_DETAIL_DEBUG_LEVEL_SYNC                 3
-#define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS      4
-#define CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS    5
-#define CUB_DETAIL_DEBUG_LEVEL_ALL                  1000
-
-// `CUB_DEBUG_*`: User interfaces:
-
-// Extra logging, no syncs
-#ifdef CUB_DEBUG_LOG
-#  define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_LOG
-#endif
-
-// Logging + syncs
+// CUB_DEBUG_SYNC also enables CUB_DEBUG_LOG
 #ifdef CUB_DEBUG_SYNC
-#  define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_SYNC
-#endif
-
-// Logging + syncs + host assertions
-#ifdef CUB_DEBUG_HOST_ASSERTIONS
-#  define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS
-#endif
-
-// Logging + syncs + host assertions + device assertions
-#ifdef CUB_DEBUG_DEVICE_ASSERTIONS
-#  define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS
-#endif
-
-// All
-#ifdef CUB_DEBUG_ALL
-#  define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL
-#endif
-
-// Default case, no extra debugging:
-#ifndef CUB_DETAIL_DEBUG_LEVEL
-#  ifdef NDEBUG
-#    define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_NONE
-#  else
-#    define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY
+#  ifndef CUB_DEBUG_LOG
+#    define CUB_DEBUG_LOG
 #  endif
 #endif
 
-/*
- * `CUB_DETAIL_DEBUG_ENABLE_*`:
- * Internal implementation details, used for testing enabled debug features:
- */
-
-#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_LOG
-#  define CUB_DETAIL_DEBUG_ENABLE_LOG
-#endif
-
-#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_SYNC
-#  define CUB_DETAIL_DEBUG_ENABLE_SYNC
-#endif
-
-#if (CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS) \
-  || (CUB_DETAIL_DEBUG_LEVEL == CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY)
-#  define CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS
-#endif
-
-#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS
-#  define CUB_DETAIL_DEBUG_ENABLE_DEVICE_ASSERTIONS
-#endif
+// CUB_DEBUG_ALL = CUB_DEBUG_LOG + CUB_DEBUG_SYNC
+#ifdef CUB_DEBUG_ALL
+#  ifndef CUB_DEBUG_LOG
+#    define CUB_DEBUG_LOG
+#  endif // CUB_DEBUG_LOG
+#  ifndef CUB_DEBUG_SYNC
+#    define CUB_DEBUG_SYNC
+#  endif // CUB_DEBUG_SYNC
+#endif // CUB_DEBUG_ALL
 
 /// CUB error reporting macro (prints error messages to stderr)
 #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR)
 #  define CUB_STDERR
 #endif
 
-#if defined(CUB_STDERR) || defined(CUB_DETAIL_DEBUG_ENABLE_LOG)
+#if defined(CUB_STDERR) || defined(CUB_DEBUG_LOG)
 #  include <cstdio>
 #endif
 
diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
index b9e4f5c25e6..cf65e954945 100644
--- a/cub/cub/util_device.cuh
+++ b/cub/cub/util_device.cuh
@@ -47,7 +47,6 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cub/detail/device_synchronize.cuh> // IWYU pragma: export
 #include <cub/util_debug.cuh>
 #include <cub/util_type.cuh>
 // for backward compatibility
@@ -437,62 +436,28 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device =
   return result;
 }
 
-/**
- * Synchronize the specified \p stream.
- */
+//! Synchronize the specified \p stream when called in host code. Otherwise, does nothing.
 CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream)
 {
-  cudaError_t result = cudaErrorNotSupported;
-
-  NV_IF_TARGET(NV_IS_HOST,
-               (result = CubDebug(cudaStreamSynchronize(stream));),
-               ((void) stream; result = CubDebug(cub::detail::device_synchronize());));
-
-  return result;
+  NV_IF_TARGET(
+    NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), ((void) stream; return cudaErrorNotSupported;));
 }
 
 namespace detail
 {
-
-/**
- * Same as SyncStream, but intended for use with the debug_synchronous flags
- * in device algorithms. This should not be used if synchronization is required
- * for correctness.
- *
- * If `debug_synchronous` is false, this function will immediately return
- * cudaSuccess. If true, one of the following will occur:
- *
- * If synchronization is supported by the current compilation target and
- * settings, the sync is performed and the sync result is returned.
- *
- * If syncs are not supported then no sync is performed, but a message is logged
- * via _CubLog and cudaSuccess is returned.
- */
+//! If CUB_DEBUG_SYNC is defined and this function is called from host code, a sync is performed and the
+//! sync result is returned. Otherwise, does nothing.
 CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream)
 {
-#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC
-
+#ifndef CUB_DEBUG_SYNC
   (void) stream;
   return cudaSuccess;
-
-#else // CUB_DETAIL_DEBUG_ENABLE_SYNC:
-
-#  define CUB_TMP_SYNC_AVAILABLE         \
-    _CubLog("%s\n", "Synchronizing..."); \
-    return SyncStream(stream)
-
-#  define CUB_TMP_DEVICE_SYNC_UNAVAILABLE                                        \
-    (void) stream;                                                               \
-    _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \
-            "device-side sync requires <sm_90, RDC, and CDPv1");                 \
-    return cudaSuccess
-
-  NV_IF_TARGET(NV_IS_HOST, (CUB_TMP_SYNC_AVAILABLE;), (CUB_TMP_DEVICE_SYNC_UNAVAILABLE;));
-
-#  undef CUB_TMP_DEVICE_SYNC_UNAVAILABLE
-#  undef CUB_TMP_SYNC_AVAILABLE
-
-#endif // CUB_DETAIL_DEBUG_ENABLE_SYNC
+#else // CUB_DEBUG_SYNC
+  NV_IF_TARGET(
+    NV_IS_HOST,
+    (_CubLog("Synchronizing...\n"); return SyncStream(stream)),
+    ((void) stream; _CubLog("WARNING: Skipping CUB debug synchronization in device code"); return cudaSuccess;));
+#endif // CUB_DEBUG_SYNC
 }
 
 /** \brief Gets whether the current device supports unified addressing */
diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt
index c86d24754de..5a093526edd 100644
--- a/cub/test/CMakeLists.txt
+++ b/cub/test/CMakeLists.txt
@@ -260,7 +260,7 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id)
     )
     cub_clone_target_properties(${test_target} ${cub_target})
     target_include_directories(${test_target} PRIVATE "${CUB_SOURCE_DIR}/test")
-    target_compile_definitions(${test_target} PRIVATE CUB_DETAIL_DEBUG_ENABLE_SYNC)
+    target_compile_definitions(${test_target} PRIVATE CUB_DEBUG_SYNC)
 
     if ("${test_target}" MATCHES "nvtx_in_usercode")
       target_link_libraries(${test_target} PRIVATE nvtx3-cpp)
diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
index 218ee4ab771..fb7c1ef22d6 100644
--- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h
+++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
@@ -36,8 +36,6 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cub/detail/device_synchronize.cuh>
-
 #if _CCCL_HAS_CUDA_COMPILER
 #  include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 #  include <thrust/system/cuda/detail/core/util.h>
diff --git a/thrust/thrust/system/cuda/detail/util.h b/thrust/thrust/system/cuda/detail/util.h
index 49e84b8025b..15937c2c59c 100644
--- a/thrust/thrust/system/cuda/detail/util.h
+++ b/thrust/thrust/system/cuda/detail/util.h
@@ -38,7 +38,6 @@
 
 #include <cub/config.cuh>
 
-#include <cub/detail/device_synchronize.cuh>
 #include <cub/util_device.cuh>
 
 #include <thrust/iterator/iterator_traits.h>