From 8d05f7abeb2e83055d69f7c1bf65d2e7cb14f646 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 10 Jan 2025 22:26:04 +0100 Subject: [PATCH] Refactor CUB's util_debug --- cub/cub/detail/device_synchronize.cuh | 54 ----------- .../dispatch/dispatch_adjacent_difference.cuh | 8 +- .../device/dispatch/dispatch_batch_memcpy.cuh | 6 +- cub/cub/device/dispatch/dispatch_for.cuh | 4 +- .../dispatch/dispatch_for_each_in_extents.cuh | 4 +- .../device/dispatch/dispatch_histogram.cuh | 8 +- .../device/dispatch/dispatch_radix_sort.cuh | 20 ++-- cub/cub/device/dispatch/dispatch_reduce.cuh | 16 ++-- .../dispatch/dispatch_reduce_by_key.cuh | 8 +- cub/cub/device/dispatch/dispatch_rle.cuh | 8 +- cub/cub/device/dispatch/dispatch_scan.cuh | 8 +- .../device/dispatch/dispatch_scan_by_key.cuh | 8 +- .../dispatch/dispatch_segmented_sort.cuh | 12 +-- .../device/dispatch/dispatch_select_if.cuh | 4 +- .../device/dispatch/dispatch_spmv_orig.cuh | 20 ++-- .../dispatch/dispatch_three_way_partition.cuh | 8 +- .../device/dispatch/dispatch_transform.cuh | 14 +-- .../dispatch/dispatch_unique_by_key.cuh | 8 +- cub/cub/util_allocator.cuh | 18 ++-- cub/cub/util_debug.cuh | 93 +++---------------- cub/cub/util_device.cuh | 59 +++--------- cub/test/CMakeLists.txt | 2 +- .../system/cuda/detail/core/agent_launcher.h | 2 - thrust/thrust/system/cuda/detail/util.h | 1 - 24 files changed, 115 insertions(+), 278 deletions(-) delete mode 100644 cub/cub/detail/device_synchronize.cuh diff --git a/cub/cub/detail/device_synchronize.cuh b/cub/cub/detail/device_synchronize.cuh deleted file mode 100644 index 1d71c6ebc0d..00000000000 --- a/cub/cub/detail/device_synchronize.cuh +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2021 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include - -#include - -#include - -CUB_NAMESPACE_BEGIN - -namespace detail -{ - -/** - * Call `cudaDeviceSynchronize()` using the proper API for the current CUB and - * CUDA configuration. - */ -_CCCL_EXEC_CHECK_DISABLE -CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() -{ - cudaError_t result = cudaErrorNotSupported; - NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), ()); - return result; -} - -} // namespace detail - -CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index a8c733ef309..3e8184e4c2f 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -189,13 +189,13 @@ struct DispatchAdjacentDifference constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS; const int init_grid_size = ::cuda::ceil_div(num_tiles, init_block_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceAdjacentDifferenceInitKernel" "<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_block_size, reinterpret_cast(stream)); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream) .doit(DeviceAdjacentDifferenceInitKernel, @@ -219,13 +219,13 @@ struct DispatchAdjacentDifference } } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel" "<<<%d, %d, 0, %lld>>>()\n", num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, reinterpret_cast(stream)); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 287f702b095..68726772b54 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -536,7 +536,7 @@ struct DispatchBatchMemcpy return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(init_grid_size), @@ -564,7 +564,7 @@ struct DispatchBatchMemcpy return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(batch_memcpy_grid_size), @@ -603,7 +603,7 @@ struct DispatchBatchMemcpy return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "MultiBlockBatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(batch_memcpy_blev_grid_size), diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh index 7ba478e3c00..453259b1896 100644 --- a/cub/cub/device/dispatch/dispatch_for.cuh +++ b/cub/cub/device/dispatch/dispatch_for.cuh @@ -101,7 +101,7 @@ struct dispatch_t const auto tile_size = static_cast(block_threads * items_per_thread); const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread\n", static_cast(num_tiles), @@ -144,7 +144,7 @@ struct dispatch_t const auto tile_size = static_cast(block_threads * items_per_thread); const auto num_tiles = ::cuda::ceil_div(num_items, tile_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread\n", static_cast(num_tiles), diff --git a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh index 6e346316d48..4375fac98dd 100644 --- a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh +++ b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh @@ -117,7 +117,7 @@ public: constexpr unsigned items_per_thread = ActivePolicyT::for_policy_t::items_per_thread; unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread); -# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +# ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each_in_extents::static_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n", num_cta, block_threads, @@ -155,7 +155,7 @@ public: _CUB_RETURN_IF_ERROR(status) unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread); -# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +# ifdef CUB_DEBUG_LOG _CubLog("Invoking detail::for_each_in_extents::dynamic_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n", num_cta, block_threads, diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 900f758cdfb..abcafe9d538 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -434,12 +434,12 @@ struct dispatch_histogram (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; // Log DeviceHistogramInitKernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke histogram_init_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -453,7 +453,7 @@ struct dispatch_histogram } // Log histogram_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels " "per thread, %d SM occupancy\n", sweep_grid_dims.x, @@ -463,7 +463,7 @@ struct dispatch_histogram (long long) stream, pixels_per_thread, histogram_sweep_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke histogram_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index c533afa1243..7c738fbdbb2 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -965,7 +965,7 @@ struct DispatchRadixSort } // Log single_tile_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit " "%d, bit_grain %d\n", 1, @@ -1036,7 +1036,7 @@ struct DispatchRadixSort int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log upsweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, " "bit_grain %d\n", pass_config.even_share.grid_size, @@ -1078,7 +1078,7 @@ struct DispatchRadixSort } // Log scan_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", 1, pass_config.scan_config.block_threads, @@ -1105,7 +1105,7 @@ struct DispatchRadixSort } // Log downsweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, @@ -1305,7 +1305,7 @@ struct DispatchRadixSort } // log histogram_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, " "%d SM occupancy, bit_grain %d\n", histo_blocks_per_sm * num_sms, @@ -1335,7 +1335,7 @@ struct DispatchRadixSort constexpr int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; // log exclusive_sum_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n", num_passes, SCAN_BLOCK_THREADS, @@ -1383,7 +1383,7 @@ struct DispatchRadixSort } // log onesweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, " "current bit %d, bit_grain %d, portion %d/%d\n", num_blocks, @@ -1672,7 +1672,7 @@ struct DispatchRadixSort } // Copy keys -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long) num_items, (long long) stream); #endif cudaError_t error = cudaSuccess; @@ -1694,7 +1694,7 @@ struct DispatchRadixSort // Copy values if necessary if (!KEYS_ONLY) { -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking async copy of %lld values on stream %lld\n", (long long) num_items, (long long) stream); #endif error = CubDebug(cudaMemcpyAsync( @@ -2001,7 +2001,7 @@ struct DispatchSegmentedRadixSort int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), " "%lld items per thread, %lld SM occupancy, " "current bit %d, bit_grain %d\n", diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 0cca1e1a982..c967cf09f6a 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -378,13 +378,13 @@ struct DispatchReduce } // Log single_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", policy.SingleTile().BlockThreads(), (long long) stream, policy.SingleTile().ItemsPerThread()); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke single_reduce_sweep_kernel launcher_factory(1, policy.SingleTile().BlockThreads(), 0, stream) @@ -490,7 +490,7 @@ struct DispatchReduce int reduce_grid_size = even_share.grid_size; // Log device_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", (unsigned long) reduce_grid_size, @@ -498,7 +498,7 @@ struct DispatchReduce (long long) stream, active_policy.Reduce().ItemsPerThread(), reduce_config.sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke DeviceReduceKernel launcher_factory(reduce_grid_size, active_policy.Reduce().BlockThreads(), 0, stream) @@ -519,13 +519,13 @@ struct DispatchReduce } // Log single_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", active_policy.SingleTile().BlockThreads(), (long long) stream, active_policy.SingleTile().ItemsPerThread()); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke DeviceReduceSingleTileKernel launcher_factory(1, active_policy.SingleTile().BlockThreads(), 0, stream) @@ -881,7 +881,7 @@ struct DispatchSegmentedReduce } // Log device_reduce_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread, %d SM occupancy\n", num_segments, @@ -889,7 +889,7 @@ struct DispatchSegmentedReduce (long long) stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 804371588f3..3c0fd4b424e 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -341,9 +341,9 @@ struct DispatchReduceByKey // Log init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -391,7 +391,7 @@ struct DispatchReduceByKey for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log reduce_by_key_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d " "items per thread, %d SM occupancy\n", start_tile, @@ -400,7 +400,7 @@ struct DispatchReduceByKey (long long) stream, items_per_thread, reduce_by_key_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke reduce_by_key_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index b1542462a58..1dea6c217ad 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -349,12 +349,12 @@ struct DeviceRleDispatch // Log device_scan_init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -405,7 +405,7 @@ struct DeviceRleDispatch scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log device_rle_sweep_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per " "thread, %d SM occupancy\n", scan_grid_size.x, @@ -415,7 +415,7 @@ struct DeviceRleDispatch (long long) stream, items_per_thread, device_rle_kernel_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke device_rle_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 4db31cf6989..f686c20c2fa 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -260,9 +260,9 @@ struct DispatchScan // Log init_kernel configuration int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -305,7 +305,7 @@ struct DispatchScan for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", start_tile, @@ -314,7 +314,7 @@ struct DispatchScan (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index c88656dff48..e8676383e5f 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -406,9 +406,9 @@ struct DispatchScanByKey // Log init_kernel configuration int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -441,7 +441,7 @@ struct DispatchScanByKey for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread\n", start_tile, @@ -449,7 +449,7 @@ struct DispatchScanByKey Policy::BLOCK_THREADS, (long long) stream, Policy::ITEMS_PER_THREAD); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 81381e5dad8..05d405b2bf8 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -593,13 +593,13 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont // One CTA per segment const local_segment_index_t blocks_in_grid = large_segments; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelLarge<<<%d, %d, 0, %lld>>>()\n", static_cast(blocks_in_grid), LargeSegmentPolicyT::BLOCK_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream) @@ -643,13 +643,13 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont if (small_and_medium_blocks_in_grid) { -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelSmall<<<%d, %d, 0, %lld>>>()\n", static_cast(small_and_medium_blocks_in_grid), SmallAndMediumPolicyT::BLOCK_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream) @@ -1368,7 +1368,7 @@ private: constexpr auto threads_in_block = static_cast(LargeSegmentPolicyT::BLOCK_THREADS); // Log kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking DeviceSegmentedSortFallbackKernel<<<%d, %d, " "0, %lld>>>(), %d items per thread, bit_grain %d\n", blocks_in_grid, @@ -1376,7 +1376,7 @@ private: (long long) stream, LargeSegmentPolicyT::ITEMS_PER_THREAD, LargeSegmentPolicyT::RADIX_BITS); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke fallback kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index c41dfb389eb..19d161a9c7b 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -660,7 +660,7 @@ struct DispatchSelectIf // Log scan_init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, @@ -693,7 +693,7 @@ struct DispatchSelectIf } // Log select_if_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index d659afa0ce9..24ef2845dee 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -636,12 +636,12 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv constexpr int threads_in_block = EMPTY_MATRIX_KERNEL_THREADS; const int blocks_in_grid = ::cuda::ceil_div(spmv_params.num_rows, threads_in_block); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_empty_matrix_kernel<<<%d, %d, 0, %lld>>>()\n", blocks_in_grid, threads_in_block, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(spmv_empty_matrix_kernel, spmv_params); @@ -673,12 +673,12 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv int degen_col_kernel_block_size = INIT_KERNEL_THREADS; int degen_col_kernel_grid_size = ::cuda::ceil_div(spmv_params.num_rows, degen_col_kernel_block_size); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -800,12 +800,12 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv // Use separate search kernel if we have enough spmv tiles to saturate the device // Log spmv_search_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", search_grid_size, search_block_size, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream) @@ -826,7 +826,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv } // Log spmv_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", spmv_grid_size.x, spmv_grid_size.y, @@ -835,7 +835,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke spmv_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream) @@ -864,7 +864,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv if (num_merge_tiles > 1) { // Log segment_fixup_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", segment_fixup_grid_size.x, segment_fixup_grid_size.y, @@ -873,7 +873,7 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") DispatchSpmv (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke segment_fixup_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 2d5566d76a3..5ca53eb1d5c 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -387,12 +387,12 @@ struct DispatchThreeWayPartitionIf // Log three_way_partition_init_kernel configuration int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS)); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking three_way_partition_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, reinterpret_cast(stream)); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke three_way_partition_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -420,7 +420,7 @@ struct DispatchThreeWayPartitionIf } // Log select_if_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; @@ -440,7 +440,7 @@ struct DispatchThreeWayPartitionIf items_per_thread, range_select_sm_occupancy); } -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream) diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index fa4fa80d0ef..fb4b693c9da 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -688,10 +688,8 @@ struct dispatch_t(tile_size); if (smem_size > *max_smem) { -# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS // assert should be prevented by smem check in policy - assert(last_counts.elem_per_thread > 0 && "min_items_per_thread exceeds available shared memory"); -# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + _CCCL_ASSERT_HOST(last_counts.elem_per_thread > 0, "min_items_per_thread exceeds available shared memory"); return last_counts; } @@ -729,12 +727,10 @@ struct dispatch_telem_per_thread > 0); - assert(config->tile_size > 0); - assert(config->tile_size % bulk_copy_alignment == 0); - assert((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0)); // logical xor -# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + _CCCL_ASSERT_HOST(config->elem_per_thread > 0, ""); + _CCCL_ASSERT_HOST(config->tile_size > 0, ""); + _CCCL_ASSERT_HOST(config->tile_size % bulk_copy_alignment == 0, ""); + _CCCL_ASSERT_HOST((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0), ""); // logical xor const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{config->tile_size})); return ::cuda::std::make_tuple( diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index e07084fe24a..69e4c263b50 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -396,9 +396,9 @@ struct DispatchUniqueByKey num_tiles = CUB_MAX(1, num_tiles); int init_grid_size = ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) @@ -439,7 +439,7 @@ struct DispatchUniqueByKey scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log select_if_kernel configuration -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG { // Get SM occupancy for unique_by_key_kernel int scan_sm_occupancy; @@ -461,7 +461,7 @@ struct DispatchUniqueByKey items_per_thread, scan_sm_occupancy); } -#endif // CUB_DETAIL_DEBUG_ENABLE_LOG +#endif // CUB_DEBUG_LOG // Invoke select_if_kernel error = diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index a5ce583a1cc..524217c70ea 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -416,7 +416,7 @@ struct CachingDeviceAllocator // Lock mutex.lock(); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog( "Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes_); #endif @@ -527,7 +527,7 @@ struct CachingDeviceAllocator cached_bytes[device].free -= search_key.bytes; cached_bytes[device].live += search_key.bytes; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with " "stream %lld).\n", device, @@ -572,7 +572,7 @@ struct CachingDeviceAllocator if (error == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", device, (long long) search_key.bytes, @@ -611,7 +611,7 @@ struct CachingDeviceAllocator // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " "(%lld bytes) outstanding.\n", device, @@ -656,7 +656,7 @@ struct CachingDeviceAllocator cached_bytes[device].live += search_key.bytes; mutex.unlock(); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", device, search_key.d_ptr, @@ -678,7 +678,7 @@ struct CachingDeviceAllocator // Copy device pointer to output parameter *d_ptr = search_key.d_ptr; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG if (debug) { _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", @@ -761,7 +761,7 @@ struct CachingDeviceAllocator cached_blocks.insert(search_key); cached_bytes[device].free += search_key.bytes; -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld " "bytes), %lld live blocks outstanding. (%lld bytes)\n", device, @@ -819,7 +819,7 @@ struct CachingDeviceAllocator return error; } -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld " "bytes), %lld live blocks (%lld bytes) outstanding.\n", device, @@ -914,7 +914,7 @@ struct CachingDeviceAllocator cached_bytes[current_device].free -= block_bytes; cached_blocks.erase(begin); -#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +#ifdef CUB_DEBUG_LOG _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", current_device, diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index 275c915e8f2..099408897ad 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -66,22 +66,6 @@ */ # define CUB_DEBUG_SYNC -/** - * @def CUB_DEBUG_HOST_ASSERTIONS - * - * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition - * assertions. - */ -# define CUB_DEBUG_HOST_ASSERTIONS - -/** - * @def CUB_DEBUG_DEVICE_ASSERTIONS - * - * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side - * precondition assertions. - */ -# define CUB_DEBUG_DEVICE_ASSERTIONS - /** * @def CUB_DEBUG_ALL * @@ -94,80 +78,29 @@ #endif // _CCCL_DOXYGEN_INVOKED -// `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only: - -#define CUB_DETAIL_DEBUG_LEVEL_NONE 0 -#define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY 1 -#define CUB_DETAIL_DEBUG_LEVEL_LOG 2 -#define CUB_DETAIL_DEBUG_LEVEL_SYNC 3 -#define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS 4 -#define CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS 5 -#define CUB_DETAIL_DEBUG_LEVEL_ALL 1000 - -// `CUB_DEBUG_*`: User interfaces: - -// Extra logging, no syncs -#ifdef CUB_DEBUG_LOG -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_LOG -#endif - -// Logging + syncs +// CUB_DEBUG_SYNC also enables CUB_DEBUG_LOG #ifdef CUB_DEBUG_SYNC -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_SYNC -#endif - -// Logging + syncs + host assertions -#ifdef CUB_DEBUG_HOST_ASSERTIONS -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS -#endif - -// Logging + syncs + host assertions + device assertions -#ifdef CUB_DEBUG_DEVICE_ASSERTIONS -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS -#endif - -// All -#ifdef CUB_DEBUG_ALL -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL -#endif - -// Default case, no extra debugging: -#ifndef CUB_DETAIL_DEBUG_LEVEL -# ifdef NDEBUG -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_NONE -# else -# define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY +# ifndef CUB_DEBUG_LOG +# define CUB_DEBUG_LOG # endif #endif -/* - * `CUB_DETAIL_DEBUG_ENABLE_*`: - * Internal implementation details, used for testing enabled debug features: - */ - -#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_LOG -# define CUB_DETAIL_DEBUG_ENABLE_LOG -#endif - -#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_SYNC -# define CUB_DETAIL_DEBUG_ENABLE_SYNC -#endif - -#if (CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS) \ - || (CUB_DETAIL_DEBUG_LEVEL == CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY) -# define CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS -#endif - -#if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS -# define CUB_DETAIL_DEBUG_ENABLE_DEVICE_ASSERTIONS -#endif +// CUB_DEBUG_ALL = CUB_DEBUG_LOG + CUB_DEBUG_SYNC +#ifdef CUB_DEBUG_ALL +# ifndef CUB_DEBUG_LOG +# define CUB_DEBUG_LOG +# endif // CUB_DEBUG_LOG +# ifndef CUB_DEBUG_SYNC +# define CUB_DEBUG_SYNC +# endif // CUB_DEBUG_SYNC +#endif // CUB_DEBUG_ALL /// CUB error reporting macro (prints error messages to stderr) #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) # define CUB_STDERR #endif -#if defined(CUB_STDERR) || defined(CUB_DETAIL_DEBUG_ENABLE_LOG) +#if defined(CUB_STDERR) || defined(CUB_DEBUG_LOG) # include #endif diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index b9e4f5c25e6..cf65e954945 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -47,7 +47,6 @@ # pragma system_header #endif // no system header -#include // IWYU pragma: export #include #include // for backward compatibility @@ -437,62 +436,28 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = return result; } -/** - * Synchronize the specified \p stream. - */ +//! Synchronize the specified \p stream when called in host code. Otherwise, does nothing. CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { - cudaError_t result = cudaErrorNotSupported; - - NV_IF_TARGET(NV_IS_HOST, - (result = CubDebug(cudaStreamSynchronize(stream));), - ((void) stream; result = CubDebug(cub::detail::device_synchronize());)); - - return result; + NV_IF_TARGET( + NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), ((void) stream; return cudaErrorNotSupported;)); } namespace detail { - -/** - * Same as SyncStream, but intended for use with the debug_synchronous flags - * in device algorithms. This should not be used if synchronization is required - * for correctness. - * - * If `debug_synchronous` is false, this function will immediately return - * cudaSuccess. If true, one of the following will occur: - * - * If synchronization is supported by the current compilation target and - * settings, the sync is performed and the sync result is returned. - * - * If syncs are not supported then no sync is performed, but a message is logged - * via _CubLog and cudaSuccess is returned. - */ +//! If CUB_DEBUG_SYNC is defined and this function is called from host code, a sync is performed and the +//! sync result is returned. Otherwise, does nothing. CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) { -#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC - +#ifndef CUB_DEBUG_SYNC (void) stream; return cudaSuccess; - -#else // CUB_DETAIL_DEBUG_ENABLE_SYNC: - -# define CUB_TMP_SYNC_AVAILABLE \ - _CubLog("%s\n", "Synchronizing..."); \ - return SyncStream(stream) - -# define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \ - (void) stream; \ - _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \ - "device-side sync requires - #if _CCCL_HAS_CUDA_COMPILER # include # include diff --git a/thrust/thrust/system/cuda/detail/util.h b/thrust/thrust/system/cuda/detail/util.h index 49e84b8025b..15937c2c59c 100644 --- a/thrust/thrust/system/cuda/detail/util.h +++ b/thrust/thrust/system/cuda/detail/util.h @@ -38,7 +38,6 @@ #include -#include #include #include