Deprecate the CUB iterators that also exist in Thrust (#3304)
We keep the CUB iterators inside the CUB implementation headers because some of those headers are exposed to NVRTC, which cannot yet compile Thrust iterators.
bernhardmgruber authored Jan 27, 2025
1 parent 7699882 commit abfb7b4
Showing 26 changed files with 116 additions and 35 deletions.
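For callers, the migration is a drop-in swap of each deprecated CUB fancy iterator for its Thrust counterpart, as the benchmark and test changes below show. The following is a minimal host-side sketch of that swap (not code from this commit; error checking is omitted), using thrust::counting_iterator where cub::CountingInputIterator would have been used before:

#include <cub/device/device_reduce.cuh>

#include <thrust/iterator/constant_iterator.h> // replaces cub::ConstantInputIterator
#include <thrust/iterator/counting_iterator.h> // replaces cub::CountingInputIterator

#include <cuda_runtime.h>

#include <cstdio>

int main()
{
  constexpr int num_items = 1000;

  // Before this change: cub::CountingInputIterator<int> d_in(0);
  thrust::counting_iterator<int> d_in(0);

  int* d_out = nullptr;
  cudaMalloc(&d_out, sizeof(int));

  // The usual two-phase CUB call: query temporary storage, allocate it, then reduce.
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

  int h_out = 0;
  cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("sum = %d (expected %d)\n", h_out, num_items * (num_items - 1) / 2);

  cudaFree(d_temp_storage);
  cudaFree(d_out);
  return 0;
}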
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/run_length_encode/encode.cu
@@ -27,6 +27,8 @@

#include <cub/device/device_run_length_encode.cuh>

+ #include <thrust/iterator/constant_iterator.h>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

@@ -74,7 +76,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT>)
using offset_t = OffsetT;
using keys_input_it_t = const T*;
using unique_output_it_t = T*;
- using vals_input_it_t = cub::ConstantInputIterator<offset_t, OffsetT>;
+ using vals_input_it_t = thrust::constant_iterator<offset_t, OffsetT>;
using aggregate_output_it_t = offset_t*;
using num_runs_output_iterator_t = offset_t*;
using equality_op_t = ::cuda::std::equal_to<>;
1 change: 0 additions & 1 deletion cub/cub/agent/agent_reduce_by_key.cuh
@@ -49,7 +49,6 @@
#include <cub/block/block_scan.cuh>
#include <cub/block/block_store.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
- #include <cub/iterator/constant_input_iterator.cuh>

#include <cuda/std/type_traits>

1 change: 0 additions & 1 deletion cub/cub/agent/agent_rle.cuh
@@ -52,7 +52,6 @@
#include <cub/block/block_store.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
- #include <cub/iterator/constant_input_iterator.cuh>

#include <cuda/ptx>
#include <cuda/std/type_traits>
1 change: 0 additions & 1 deletion cub/cub/agent/agent_segment_fixup.cuh
@@ -50,7 +50,6 @@
#include <cub/block/block_scan.cuh>
#include <cub/block/block_store.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
- #include <cub/iterator/constant_input_iterator.cuh>

#include <cuda/std/type_traits>

6 changes: 6 additions & 0 deletions cub/cub/agent/agent_spmv_orig.cuh
@@ -386,7 +386,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
__syncthreads();

// Search for the thread's starting coordinate within the merge tile
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
+ _CCCL_SUPPRESS_DEPRECATED_POP
CoordinateT thread_start_coord;

MergePathSearch(
@@ -567,7 +569,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
__syncthreads();

// Search for the thread's starting coordinate within the merge tile
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
+ _CCCL_SUPPRESS_DEPRECATED_POP
CoordinateT thread_start_coord;

MergePathSearch(
@@ -701,7 +705,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
// Search our starting coordinates
OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
CoordinateT tile_coord;
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
CountingInputIterator<OffsetT> nonzero_indices(0);
+ _CCCL_SUPPRESS_DEPRECATED_POP

// Search the merge path
MergePathSearch(
5 changes: 5 additions & 0 deletions cub/cub/device/device_run_length_encode.cuh
@@ -48,6 +48,7 @@
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/device/dispatch/dispatch_rle.cuh>
#include <cub/device/dispatch/tuning/tuning_run_length_encode.cuh>
+ #include <cub/iterator/constant_input_iterator.cuh>

#include <iterator>

@@ -199,14 +200,17 @@ struct DeviceRunLengthEncode
using length_t = cub::detail::non_void_value_t<LengthsOutputIteratorT, offset_t>;

// Generator type for providing 1s values for run-length reduction
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;
+ _CCCL_SUPPRESS_DEPRECATED_POP

using accum_t = ::cuda::std::__accumulator_t<reduction_op, length_t, length_t>;

using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;

using policy_t = detail::rle::encode::policy_hub<accum_t, key_t>;

+ _CCCL_SUPPRESS_DEPRECATED_PUSH
return DispatchReduceByKey<
InputIteratorT,
UniqueOutputIteratorT,
@@ -228,6 +232,7 @@
reduction_op(),
num_items,
stream);
+ _CCCL_SUPPRESS_DEPRECATED_POP
}

//! @rst
2 changes: 2 additions & 0 deletions cub/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -156,7 +156,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel(
{
OffsetT diagonal = (tile_idx * TILE_ITEMS);
CoordinateT tile_coordinate;
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
CountingInputIterator<OffsetT> nonzero_indices(0);
+ _CCCL_SUPPRESS_DEPRECATED_POP

// Search the merge path
MergePathSearch(
19 changes: 14 additions & 5 deletions cub/cub/device/dispatch/dispatch_streaming_reduce.cuh
@@ -13,21 +13,20 @@
# pragma system_header
#endif // no system header

- _CCCL_SUPPRESS_DEPRECATED_PUSH
- #include <cuda/std/functional>
- _CCCL_SUPPRESS_DEPRECATED_POP

#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/iterator/constant_input_iterator.cuh>

#include <thrust/iterator/iterator_adaptor.h>
#include <thrust/iterator/tabulate_output_iterator.h>

+ #include <cuda/std/functional>
#include <cuda/std/type_traits>

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document

+ // suppress deprecation warnings for ConstantInputIterator
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
CUB_NAMESPACE_BEGIN

namespace detail::reduce
@@ -190,6 +189,12 @@ template <typename InputIteratorT,
detail::reduce::policy_hub<KeyValuePair<PerPartitionOffsetT, InitT>, PerPartitionOffsetT, ReductionOpT>>
struct dispatch_streaming_arg_reduce_t
{
+ # if _CCCL_COMPILER(NVHPC)
+ // NVHPC fails to suppress a deprecation when the alias is inside the function below, so we put it here and span a
+ // deprecation suppression region across the entire file as well
+ using constant_offset_it_t = ConstantInputIterator<GlobalOffsetT>;
+ # endif // _CCCL_COMPILER(NVHPC)

// Internal dispatch routine for computing a device-wide argument extremum, like `ArgMin` and `ArgMax`
//
// @param[in] d_temp_storage
@@ -229,7 +234,11 @@ struct dispatch_streaming_arg_reduce_t
cudaStream_t stream)
{
// Constant iterator to provide the offset of the current partition for the user-provided input iterator
+ # if !_CCCL_COMPILER(NVHPC)
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
using constant_offset_it_t = ConstantInputIterator<GlobalOffsetT>;
+ _CCCL_SUPPRESS_DEPRECATED_POP
+ # endif

// Wrapped input iterator to produce index-value tuples, i.e., <PerPartitionOffsetT, InputT>-tuples
// We make sure to offset the user-provided input iterator by the current partition's offset
@@ -373,7 +382,7 @@ struct dispatch_streaming_arg_reduce_t
};

} // namespace detail::reduce

+ _CCCL_SUPPRESS_DEPRECATED_POP
CUB_NAMESPACE_END

#endif // !_CCCL_DOXYGEN_INVOKED
10 changes: 9 additions & 1 deletion cub/cub/iterator/constant_input_iterator.cuh
@@ -87,7 +87,13 @@ CUB_NAMESPACE_BEGIN
* The difference type of this iterator (Default: @p ptrdiff_t)
*/
template <typename ValueType, typename OffsetT = ptrdiff_t>
- class ConstantInputIterator
+ class
+ #ifndef __CUDA_ARCH__
+ // Avoid generating a deprecation warning from length_encode.compute_xx.cpp1.ii, which is compiled by cicc for which
+ // we cannot suppress the warning
+ CCCL_DEPRECATED_BECAUSE("Use thrust::constant_iterator instead")
+ #endif
+ ConstantInputIterator
{
public:
// Required iterator traits
@@ -216,11 +222,13 @@
}

/// ostream operator
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "[" << itr.val << "," << itr.offset << "]";
return os;
}
+ _CCCL_SUPPRESS_DEPRECATED_POP
};

CUB_NAMESPACE_END
4 changes: 3 additions & 1 deletion cub/cub/iterator/counting_input_iterator.cuh
@@ -90,7 +90,7 @@ CUB_NAMESPACE_BEGIN
* The difference type of this iterator (Default: @p ptrdiff_t)
*/
template <typename ValueType, typename OffsetT = ptrdiff_t>
- class CountingInputIterator
+ class CCCL_DEPRECATED_BECAUSE("Use thrust::counting_iterator instead") CountingInputIterator
{
public:
// Required iterator traits
@@ -218,11 +218,13 @@

/// ostream operator
#if !_CCCL_COMPILER(NVRTC)
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "[" << itr.val << "]";
return os;
}
+ _CCCL_SUPPRESS_DEPRECATED_POP
#endif // !_CCCL_COMPILER(NVRTC)
};

4 changes: 3 additions & 1 deletion cub/cub/iterator/discard_output_iterator.cuh
@@ -54,7 +54,7 @@ CUB_NAMESPACE_BEGIN
* @brief A discard iterator
*/
template <typename OffsetT = ptrdiff_t>
- class DiscardOutputIterator
+ class CCCL_DEPRECATED_BECAUSE("Use thrust::discard_iterator instead") DiscardOutputIterator
{
public:
// Required iterator traits
@@ -191,11 +191,13 @@
}

/// ostream operator
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "[" << itr.offset << "]";
return os;
}
+ _CCCL_SUPPRESS_DEPRECATED_POP
};

CUB_NAMESPACE_END
4 changes: 3 additions & 1 deletion cub/cub/iterator/transform_input_iterator.cuh
@@ -110,7 +110,7 @@ CUB_NAMESPACE_BEGIN
* The difference type of this iterator (Default: @p ptrdiff_t)
*/
template <typename ValueType, typename ConversionOp, typename InputIteratorT, typename OffsetT = ptrdiff_t>
- class TransformInputIterator
+ class CCCL_DEPRECATED_BECAUSE("Use thrust::transform_iterator instead") TransformInputIterator
{
public:
// Required iterator traits
@@ -233,10 +233,12 @@
}

/// ostream operator
+ _CCCL_SUPPRESS_DEPRECATED_PUSH
friend std::ostream& operator<<(std::ostream& os, const self_type& /* itr */)
{
return os;
}
+ _CCCL_SUPPRESS_DEPRECATED_POP
};

CUB_NAMESPACE_END
9 changes: 5 additions & 4 deletions cub/test/catch2_test_block_run_length_decode.cu
@@ -29,10 +29,11 @@
#include <cub/block/block_run_length_decode.cuh>
#include <cub/block/block_store.cuh>
#include <cub/device/device_scan.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include <cub/iterator/transform_input_iterator.cuh>
#include <cub/util_allocator.cuh>

+ #include <thrust/iterator/counting_iterator.h>

#include <cuda/std/type_traits>

#include <c2h/catch2_test_helper.h>
@@ -332,11 +333,11 @@ void TestAlgorithmSpecialisation()

using RunItemT = float;
using RunLengthT = uint32_t;
- using ItemItT = cub::CountingInputIterator<RunItemT>;
- using RunLengthsItT = cub::TransformInputIterator<RunLengthT, ModOp, cub::CountingInputIterator<RunLengthT>>;
+ using ItemItT = thrust::counting_iterator<RunItemT>;
+ using RunLengthsItT = thrust::transform_iterator<ModOp, thrust::counting_iterator<RunLengthT>>;

ItemItT d_unique_items(1000U);
- RunLengthsItT d_run_lengths(cub::CountingInputIterator<RunLengthT>(0), ModOp{});
+ RunLengthsItT d_run_lengths(thrust::counting_iterator<RunLengthT>(0), ModOp{});

constexpr uint32_t num_runs = 10000;
constexpr uint32_t num_blocks = (num_runs + (RUNS_PER_BLOCK - 1U)) / RUNS_PER_BLOCK;
1 change: 0 additions & 1 deletion cub/test/catch2_test_block_store.cu
@@ -27,7 +27,6 @@

#include <cub/block/block_store.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>
- #include <cub/iterator/discard_output_iterator.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_arch.cuh>

4 changes: 2 additions & 2 deletions cub/test/catch2_test_device_for.cu
@@ -29,11 +29,11 @@
// above header needs to be included first

#include <cub/device/device_for.cuh>
- #include <cub/iterator/counting_input_iterator.cuh>

#include <thrust/count.h>
#include <thrust/detail/raw_pointer_cast.h>
#include <thrust/equal.h>
+ #include <thrust/iterator/counting_iterator.h>
#include <thrust/sequence.h>

#include "catch2_test_launch_helper.h"
@@ -260,7 +260,7 @@ C2H_TEST("Device for each works with counting iterator", "[for][device]")
max_items,
}));

- const auto it = cub::CountingInputIterator<int>{0};
+ const auto it = thrust::counting_iterator<int>{0};
c2h::device_vector<int> counts(num_items);
device_for_each(it, it + num_items, incrementer_t{thrust::raw_pointer_cast(counts.data())});

4 changes: 2 additions & 2 deletions cub/test/catch2_test_device_for_copy.cu
@@ -29,10 +29,10 @@
// above header needs to be included first

#include <cub/device/device_for.cuh>
- #include <cub/iterator/counting_input_iterator.cuh>

#include <thrust/count.h>
#include <thrust/detail/raw_pointer_cast.h>
+ #include <thrust/iterator/counting_iterator.h>
#include <thrust/sequence.h>

#include "catch2_test_launch_helper.h"
@@ -200,7 +200,7 @@ C2H_TEST("Device for each works with counting iterator", "[for][device]")
max_items,
}));

- const auto it = cub::CountingInputIterator<int>{0};
+ const auto it = thrust::counting_iterator<int>{0};
c2h::device_vector<int> counts(num_items);
device_for_each_copy(it, it + num_items, incrementer_t{thrust::raw_pointer_cast(counts.data())});

7 changes: 4 additions & 3 deletions cub/test/catch2_test_device_histogram.cu
@@ -27,7 +27,8 @@
******************************************************************************/

#include <cub/device/device_histogram.cuh>
- #include <cub/iterator/counting_input_iterator.cuh>

+ #include <thrust/iterator/counting_iterator.h>

#include <cuda/std/__algorithm_>
#include <cuda/std/array>
@@ -495,7 +496,7 @@ C2H_TEST("DeviceHistogram::HistogramEven sample iterator", "[histogram_even][dev
const auto lower_level = caller_vector<int>{0, -10, cs::numeric_limits<int>::lowest()};
const auto upper_level = caller_vector<int>{total_values, 10, cs::numeric_limits<int>::max()};

- auto sample_iterator = cub::CountingInputIterator<sample_t>(0);
+ auto sample_iterator = thrust::counting_iterator<sample_t>(0);

// Channel #0: 0, 4, 8, 12
// Channel #1: 1, 5, 9, 13
@@ -584,7 +585,7 @@ C2H_TEST_LIST("DeviceHistogram::HistogramEven bin computation does not overflow"
constexpr sample_t lower_level = 0;
constexpr sample_t upper_level = cs::numeric_limits<sample_t>::max();
constexpr auto num_samples = 1000;
- auto d_samples = cub::CountingInputIterator<sample_t>{0UL};
+ auto d_samples = thrust::counting_iterator<sample_t>{0UL};
auto d_histo_out = c2h::device_vector<counter_t>(1024);
const auto num_bins = GENERATE(1, 2);

4 changes: 4 additions & 0 deletions cub/test/catch2_test_device_reduce.cu
@@ -39,6 +39,10 @@
#include <c2h/custom_type.h>
#include <c2h/extended_types.h>

+ // need to suppress deprecation warnings for ConstantInputIterator in the cudafe1.stub.c file, so there is no matching
+ // _CCCL_SUPPRESS_DEPRECATED_POP at the end of this file
+ _CCCL_SUPPRESS_DEPRECATED_PUSH

DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Reduce, device_reduce);
DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Sum, device_sum);
DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Min, device_min);
(diffs for the remaining changed files were not loaded)
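The library-side edits above all follow the same pattern: the iterator class template itself carries the deprecation attribute (CCCL_DEPRECATED_BECAUSE), and the few internal uses that have to remain are wrapped in _CCCL_SUPPRESS_DEPRECATED_PUSH/_CCCL_SUPPRESS_DEPRECATED_POP. Below is a stand-alone sketch of that pattern; it is not CCCL code: a plain [[deprecated]] attribute and GCC/Clang pragmas stand in for the CCCL-internal macros, and the class name is hypothetical.

#include <cstdio>

// Stand-in for a CUB iterator that now carries a deprecation attribute.
template <typename T>
class [[deprecated("Use thrust::constant_iterator instead")]] ConstantInputIteratorSketch
{
public:
  explicit ConstantInputIteratorSketch(T value)
      : value_(value)
  {}

  T operator*() const
  {
    return value_;
  }

private:
  T value_;
};

int main()
{
  // Naming the deprecated type normally emits -Wdeprecated-declarations; call sites that
  // must keep using it silence the warning locally, which is what the diffs above do with
  // _CCCL_SUPPRESS_DEPRECATED_PUSH / _CCCL_SUPPRESS_DEPRECATED_POP.
#if defined(__GNUC__) || defined(__clang__)
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
  ConstantInputIteratorSketch<int> ones(1);
#if defined(__GNUC__) || defined(__clang__)
#  pragma GCC diagnostic pop
#endif
  std::printf("%d\n", *ones);
  return 0;
}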
