Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor CUB's util_debug #3345

Merged
merged 1 commit into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 0 additions & 54 deletions cub/cub/detail/device_synchronize.cuh

This file was deleted.

8 changes: 4 additions & 4 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,13 @@ struct DispatchAdjacentDifference
constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS;
const int init_grid_size = ::cuda::ceil_div(num_tiles, init_block_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceAdjacentDifferenceInitKernel"
"<<<%d, %d, 0, %lld>>>()\n",
init_grid_size,
init_block_size,
reinterpret_cast<long long>(stream));
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream)
.doit(DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT, InputIteratorT, InputT, OffsetT>,
Expand All @@ -220,13 +220,13 @@ struct DispatchAdjacentDifference
}
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel"
"<<<%d, %d, 0, %lld>>>()\n",
num_tiles,
AdjacentDifferencePolicyT::BLOCK_THREADS,
reinterpret_cast<long long>(stream));
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream)
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ struct DispatchBatchMemcpy
return error;
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking "
"InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n",
static_cast<int>(init_grid_size),
Expand Down Expand Up @@ -567,7 +567,7 @@ struct DispatchBatchMemcpy
return error;
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking "
"BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
static_cast<int>(batch_memcpy_grid_size),
Expand Down Expand Up @@ -606,7 +606,7 @@ struct DispatchBatchMemcpy
return error;
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking "
"MultiBlockBatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
static_cast<int>(batch_memcpy_blev_grid_size),
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ struct dispatch_t
const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), "
"%d items per thread\n",
static_cast<int>(num_tiles),
Expand Down Expand Up @@ -144,7 +144,7 @@ struct dispatch_t
const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), "
"%d items per thread\n",
static_cast<int>(num_tiles),
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ public:
constexpr unsigned items_per_thread = ActivePolicyT::for_policy_t::items_per_thread;
unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread);

# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
# ifdef CUB_DEBUG_LOG
_CubLog("Invoking detail::for_each_in_extents::static_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n",
num_cta,
block_threads,
Expand Down Expand Up @@ -155,7 +155,7 @@ public:
_CUB_RETURN_IF_ERROR(status)
unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread);

# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
# ifdef CUB_DEBUG_LOG
_CubLog("Invoking detail::for_each_in_extents::dynamic_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n",
num_cta,
block_threads,
Expand Down
8 changes: 4 additions & 4 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -434,12 +434,12 @@ struct dispatch_histogram
(max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;

// Log DeviceHistogramInitKernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
histogram_init_grid_dims,
histogram_init_block_threads,
(long long) stream);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke histogram_init_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
Expand All @@ -453,7 +453,7 @@ struct dispatch_histogram
}

// Log histogram_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels "
"per thread, %d SM occupancy\n",
sweep_grid_dims.x,
Expand All @@ -463,7 +463,7 @@ struct dispatch_histogram
(long long) stream,
pixels_per_thread,
histogram_sweep_sm_occupancy);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke histogram_sweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream)
Expand Down
20 changes: 10 additions & 10 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -968,7 +968,7 @@ struct DispatchRadixSort
}

// Log single_tile_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit "
"%d, bit_grain %d\n",
1,
Expand Down Expand Up @@ -1039,7 +1039,7 @@ struct DispatchRadixSort
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

// Log upsweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, "
"bit_grain %d\n",
pass_config.even_share.grid_size,
Expand Down Expand Up @@ -1081,7 +1081,7 @@ struct DispatchRadixSort
}

// Log scan_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
1,
pass_config.scan_config.block_threads,
Expand All @@ -1108,7 +1108,7 @@ struct DispatchRadixSort
}

// Log downsweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
pass_config.even_share.grid_size,
pass_config.downsweep_config.block_threads,
Expand Down Expand Up @@ -1308,7 +1308,7 @@ struct DispatchRadixSort
}

// log histogram_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, "
"%d SM occupancy, bit_grain %d\n",
histo_blocks_per_sm * num_sms,
Expand Down Expand Up @@ -1338,7 +1338,7 @@ struct DispatchRadixSort
constexpr int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS;

// log exclusive_sum_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n",
num_passes,
SCAN_BLOCK_THREADS,
Expand Down Expand Up @@ -1386,7 +1386,7 @@ struct DispatchRadixSort
}

// log onesweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, "
"current bit %d, bit_grain %d, portion %d/%d\n",
num_blocks,
Expand Down Expand Up @@ -1675,7 +1675,7 @@ struct DispatchRadixSort
}

// Copy keys
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long) num_items, (long long) stream);
#endif
cudaError_t error = cudaSuccess;
Expand All @@ -1697,7 +1697,7 @@ struct DispatchRadixSort
// Copy values if necessary
if (!KEYS_ONLY)
{
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking async copy of %lld values on stream %lld\n", (long long) num_items, (long long) stream);
#endif
error = CubDebug(cudaMemcpyAsync(
Expand Down Expand Up @@ -2004,7 +2004,7 @@ struct DispatchSegmentedRadixSort
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

// Log kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), "
"%lld items per thread, %lld SM occupancy, "
"current bit %d, bit_grain %d\n",
Expand Down
16 changes: 8 additions & 8 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -378,13 +378,13 @@ struct DispatchReduce
}

// Log single_reduce_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
"%d items per thread\n",
policy.SingleTile().BlockThreads(),
(long long) stream,
policy.SingleTile().ItemsPerThread());
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke single_reduce_sweep_kernel
launcher_factory(1, policy.SingleTile().BlockThreads(), 0, stream)
Expand Down Expand Up @@ -490,15 +490,15 @@ struct DispatchReduce
int reduce_grid_size = even_share.grid_size;

// Log device_reduce_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items "
"per thread, %d SM occupancy\n",
(unsigned long) reduce_grid_size,
active_policy.Reduce().BlockThreads(),
(long long) stream,
active_policy.Reduce().ItemsPerThread(),
reduce_config.sm_occupancy);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceKernel
launcher_factory(reduce_grid_size, active_policy.Reduce().BlockThreads(), 0, stream)
Expand All @@ -519,13 +519,13 @@ struct DispatchReduce
}

// Log single_reduce_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
"%d items per thread\n",
active_policy.SingleTile().BlockThreads(),
(long long) stream,
active_policy.SingleTile().ItemsPerThread());
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceSingleTileKernel
launcher_factory(1, active_policy.SingleTile().BlockThreads(), 0, stream)
Expand Down Expand Up @@ -881,15 +881,15 @@ struct DispatchSegmentedReduce
}

// Log device_reduce_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), "
"%d items per thread, %d SM occupancy\n",
num_segments,
ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
(long long) stream,
ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
segmented_reduce_config.sm_occupancy);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceKernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
Expand Down
8 changes: 4 additions & 4 deletions cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -341,9 +341,9 @@ struct DispatchReduceByKey
// Log init_kernel configuration
int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS));

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke init_kernel to initialize tile descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
Expand Down Expand Up @@ -391,7 +391,7 @@ struct DispatchReduceByKey
for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
{
// Log reduce_by_key_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d "
"items per thread, %d SM occupancy\n",
start_tile,
Expand All @@ -400,7 +400,7 @@ struct DispatchReduceByKey
(long long) stream,
items_per_thread,
reduce_by_key_sm_occupancy);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke reduce_by_key_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
Expand Down
Loading
Loading