Skip to content

Commit

Permalink
Refactor CUB's util_debug
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Jan 15, 2025
1 parent 048b2bd commit 8d05f7a
Show file tree
Hide file tree
Showing 24 changed files with 115 additions and 278 deletions.
54 changes: 0 additions & 54 deletions cub/cub/detail/device_synchronize.cuh

This file was deleted.

8 changes: 4 additions & 4 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,13 @@ struct DispatchAdjacentDifference
constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS;
const int init_grid_size = ::cuda::ceil_div(num_tiles, init_block_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceAdjacentDifferenceInitKernel"
"<<<%d, %d, 0, %lld>>>()\n",
init_grid_size,
init_block_size,
reinterpret_cast<long long>(stream));
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream)
.doit(DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT, InputIteratorT, InputT, OffsetT>,
Expand All @@ -219,13 +219,13 @@ struct DispatchAdjacentDifference
}
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel"
"<<<%d, %d, 0, %lld>>>()\n",
num_tiles,
AdjacentDifferencePolicyT::BLOCK_THREADS,
reinterpret_cast<long long>(stream));
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream)
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,7 @@ struct DispatchBatchMemcpy
return error;
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking "
"InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n",
static_cast<int>(init_grid_size),
Expand Down Expand Up @@ -564,7 +564,7 @@ struct DispatchBatchMemcpy
return error;
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking "
"BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
static_cast<int>(batch_memcpy_grid_size),
Expand Down Expand Up @@ -603,7 +603,7 @@ struct DispatchBatchMemcpy
return error;
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking "
"MultiBlockBatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
static_cast<int>(batch_memcpy_blev_grid_size),
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ struct dispatch_t
const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), "
"%d items per thread\n",
static_cast<int>(num_tiles),
Expand Down Expand Up @@ -144,7 +144,7 @@ struct dispatch_t
const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), "
"%d items per thread\n",
static_cast<int>(num_tiles),
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ public:
constexpr unsigned items_per_thread = ActivePolicyT::for_policy_t::items_per_thread;
unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread);

# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
# ifdef CUB_DEBUG_LOG
_CubLog("Invoking detail::for_each_in_extents::static_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n",
num_cta,
block_threads,
Expand Down Expand Up @@ -155,7 +155,7 @@ public:
_CUB_RETURN_IF_ERROR(status)
unsigned num_cta = ::cuda::ceil_div(_size, block_threads * items_per_thread);

# ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
# ifdef CUB_DEBUG_LOG
_CubLog("Invoking detail::for_each_in_extents::dynamic_kernel<<<%u, %u, 0, %p>>>(), items_per_thread: %u\n",
num_cta,
block_threads,
Expand Down
8 changes: 4 additions & 4 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -434,12 +434,12 @@ struct dispatch_histogram
(max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;

// Log DeviceHistogramInitKernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
histogram_init_grid_dims,
histogram_init_block_threads,
(long long) stream);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke histogram_init_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
Expand All @@ -453,7 +453,7 @@ struct dispatch_histogram
}

// Log histogram_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels "
"per thread, %d SM occupancy\n",
sweep_grid_dims.x,
Expand All @@ -463,7 +463,7 @@ struct dispatch_histogram
(long long) stream,
pixels_per_thread,
histogram_sweep_sm_occupancy);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke histogram_sweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream)
Expand Down
20 changes: 10 additions & 10 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -965,7 +965,7 @@ struct DispatchRadixSort
}

// Log single_tile_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit "
"%d, bit_grain %d\n",
1,
Expand Down Expand Up @@ -1036,7 +1036,7 @@ struct DispatchRadixSort
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

// Log upsweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, "
"bit_grain %d\n",
pass_config.even_share.grid_size,
Expand Down Expand Up @@ -1078,7 +1078,7 @@ struct DispatchRadixSort
}

// Log scan_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
1,
pass_config.scan_config.block_threads,
Expand All @@ -1105,7 +1105,7 @@ struct DispatchRadixSort
}

// Log downsweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
pass_config.even_share.grid_size,
pass_config.downsweep_config.block_threads,
Expand Down Expand Up @@ -1305,7 +1305,7 @@ struct DispatchRadixSort
}

// log histogram_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, "
"%d SM occupancy, bit_grain %d\n",
histo_blocks_per_sm * num_sms,
Expand Down Expand Up @@ -1335,7 +1335,7 @@ struct DispatchRadixSort
constexpr int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS;

// log exclusive_sum_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n",
num_passes,
SCAN_BLOCK_THREADS,
Expand Down Expand Up @@ -1383,7 +1383,7 @@ struct DispatchRadixSort
}

// log onesweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, "
"current bit %d, bit_grain %d, portion %d/%d\n",
num_blocks,
Expand Down Expand Up @@ -1672,7 +1672,7 @@ struct DispatchRadixSort
}

// Copy keys
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long) num_items, (long long) stream);
#endif
cudaError_t error = cudaSuccess;
Expand All @@ -1694,7 +1694,7 @@ struct DispatchRadixSort
// Copy values if necessary
if (!KEYS_ONLY)
{
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking async copy of %lld values on stream %lld\n", (long long) num_items, (long long) stream);
#endif
error = CubDebug(cudaMemcpyAsync(
Expand Down Expand Up @@ -2001,7 +2001,7 @@ struct DispatchSegmentedRadixSort
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

// Log kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), "
"%lld items per thread, %lld SM occupancy, "
"current bit %d, bit_grain %d\n",
Expand Down
16 changes: 8 additions & 8 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -378,13 +378,13 @@ struct DispatchReduce
}

// Log single_reduce_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
"%d items per thread\n",
policy.SingleTile().BlockThreads(),
(long long) stream,
policy.SingleTile().ItemsPerThread());
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke single_reduce_sweep_kernel
launcher_factory(1, policy.SingleTile().BlockThreads(), 0, stream)
Expand Down Expand Up @@ -490,15 +490,15 @@ struct DispatchReduce
int reduce_grid_size = even_share.grid_size;

// Log device_reduce_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceKernel<<<%lu, %d, 0, %lld>>>(), %d items "
"per thread, %d SM occupancy\n",
(unsigned long) reduce_grid_size,
active_policy.Reduce().BlockThreads(),
(long long) stream,
active_policy.Reduce().ItemsPerThread(),
reduce_config.sm_occupancy);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceKernel
launcher_factory(reduce_grid_size, active_policy.Reduce().BlockThreads(), 0, stream)
Expand All @@ -519,13 +519,13 @@ struct DispatchReduce
}

// Log single_reduce_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
"%d items per thread\n",
active_policy.SingleTile().BlockThreads(),
(long long) stream,
active_policy.SingleTile().ItemsPerThread());
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceSingleTileKernel
launcher_factory(1, active_policy.SingleTile().BlockThreads(), 0, stream)
Expand Down Expand Up @@ -881,15 +881,15 @@ struct DispatchSegmentedReduce
}

// Log device_reduce_sweep_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), "
"%d items per thread, %d SM occupancy\n",
num_segments,
ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
(long long) stream,
ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
segmented_reduce_config.sm_occupancy);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceKernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
Expand Down
8 changes: 4 additions & 4 deletions cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -341,9 +341,9 @@ struct DispatchReduceByKey
// Log init_kernel configuration
int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS));

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke init_kernel to initialize tile descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
Expand Down Expand Up @@ -391,7 +391,7 @@ struct DispatchReduceByKey
for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
{
// Log reduce_by_key_kernel configuration
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
#ifdef CUB_DEBUG_LOG
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d "
"items per thread, %d SM occupancy\n",
start_tile,
Expand All @@ -400,7 +400,7 @@ struct DispatchReduceByKey
(long long) stream,
items_per_thread,
reduce_by_key_sm_occupancy);
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG
#endif // CUB_DEBUG_LOG

// Invoke reduce_by_key_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
Expand Down
Loading

0 comments on commit 8d05f7a

Please sign in to comment.